diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 815687fa17..61a535fe91 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @kolchfa-aws @Naarcha-AWS @vagimeli @AMoo-Miki @natebower @dlvenable @stephen-crawford @epugh +* @kolchfa-aws @Naarcha-AWS @AMoo-Miki @natebower @dlvenable @epugh diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE/issue_template.md index 0985529940..b3e03c0ffe 100644 --- a/.github/ISSUE_TEMPLATE/issue_template.md +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -15,7 +15,6 @@ assignees: '' **Tell us about your request.** Provide a summary of the request. -***Version:** List the OpenSearch version to which this issue applies, e.g. 2.14, 2.12--2.14, or all. +**Version:** List the OpenSearch version to which this issue applies, e.g. 2.14, 2.12--2.14, or all. **What other resources are available?** Provide links to related issues, POCs, steps for testing, etc. - diff --git a/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt b/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt index e33ac09744..bf4c8a0a29 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt @@ -85,6 +85,7 @@ Python PyTorch Querqy Query Workbench +RankLib RCF Summarize RPM Package Manager Ruby @@ -97,4 +98,5 @@ TorchScript Tribuo VisBuilder Winlogbeat -Zstandard \ No newline at end of file +XGBoost +Zstandard diff --git a/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt b/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt index d0d1c308eb..0a1d2c557c 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt @@ -77,13 +77,16 @@ Levenshtein [Mm]ultivalued [Mm]ultiword [Nn]amespace -[Oo]versamples? +[Oo]ffline [Oo]nboarding +[Oo]versamples? pebibyte p\d{2} [Pp]erformant [Pp]laintext [Pp]luggable +[Pp]reaggregate(s|d)? +[Pp]recompute(s|d)? [Pp]reconfigure [Pp]refetch [Pp]refilter @@ -105,6 +108,7 @@ p\d{2} [Rr]eprovision(ed|ing)? [Rr]erank(er|ed|ing)? [Rr]epo +[Rr]escor(e|ed|ing)? 
[Rr]ewriter [Rr]ollout [Rr]ollup diff --git a/.github/workflows/jekyll-spec-insert.yml b/.github/workflows/jekyll-spec-insert.yml new file mode 100644 index 0000000000..cefd477be2 --- /dev/null +++ b/.github/workflows/jekyll-spec-insert.yml @@ -0,0 +1,20 @@ +name: Lint and Test Jekyll Spec Insert +on: + push: + paths: + - 'spec-insert/**' + pull_request: + paths: + - 'spec-insert/**' +jobs: + lint-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: { ruby-version: 3.3.0 } + - run: bundle install + - working-directory: spec-insert + run: | + bundle exec rubocop + bundle exec rspec diff --git a/.github/workflows/pr_checklist.yml b/.github/workflows/pr_checklist.yml index b56174793e..e34d0cecb2 100644 --- a/.github/workflows/pr_checklist.yml +++ b/.github/workflows/pr_checklist.yml @@ -29,7 +29,7 @@ jobs: with: script: | let assignee = context.payload.pull_request.user.login; - const prOwners = ['Naarcha-AWS', 'kolchfa-aws', 'vagimeli', 'natebower']; + const prOwners = ['Naarcha-AWS', 'kolchfa-aws', 'natebower']; if (!prOwners.includes(assignee)) { assignee = 'kolchfa-aws' @@ -40,4 +40,4 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, assignees: [assignee] - }); \ No newline at end of file + }); diff --git a/.github/workflows/update-api-components.yml b/.github/workflows/update-api-components.yml new file mode 100644 index 0000000000..42cc1d2827 --- /dev/null +++ b/.github/workflows/update-api-components.yml @@ -0,0 +1,52 @@ +name: Update API Components +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * 0" # Every Sunday at midnight GMT +jobs: + update-api-components: + if: ${{ github.repository == 'opensearch-project/documentation-website' }} + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - run: git config --global pull.rebase true + + - uses: ruby/setup-ruby@v1 + with: { ruby-version: 3.3.0 } + + - run: bundle install + + - name: Download spec and insert into documentation + run: bundle exec jekyll spec-insert + + - name: Get current date + id: date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV + + - name: GitHub App token + id: github_app_token + uses: tibdex/github-app-token@v2.1.0 + with: + app_id: ${{ secrets.APP_ID }} + private_key: ${{ secrets.APP_PRIVATE_KEY }} + + - name: Create pull request + uses: peter-evans/create-pull-request@v6 + with: + token: ${{ steps.github_app_token.outputs.token }} + commit-message: "Updated API components to reflect the latest OpenSearch API spec (${{ env.date }})" + title: "[AUTOCUT] Update API components to reflect the latest OpenSearch API spec (${{ env.date }})" + body: | + Update API components to reflect the latest [OpenSearch API spec](https://github.com/opensearch-project/opensearch-api-specification/releases/download/main-latest/opensearch-openapi.yaml). 
+ Date: ${{ env.date }} + branch: update-api-components-${{ env.date }} + base: main + signoff: true + labels: autocut \ No newline at end of file diff --git a/.gitignore b/.gitignore index da3cf9d144..92f01c5fca 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ Gemfile.lock *.iml .jekyll-cache .project +vendor/bundle diff --git a/.ruby-version b/.ruby-version deleted file mode 100644 index 4772543317..0000000000 --- a/.ruby-version +++ /dev/null @@ -1 +0,0 @@ -3.3.2 diff --git a/API_STYLE_GUIDE.md b/API_STYLE_GUIDE.md index a6e0551f17..a058bbe7c2 100644 --- a/API_STYLE_GUIDE.md +++ b/API_STYLE_GUIDE.md @@ -103,7 +103,7 @@ For GET and DELETE APIs: Introduce what you can do with the optional parameters. Parameter | Data type | Description :--- | :--- | :--- -### Request fields +### Request body fields For PUT and POST APIs: Introduce what the request fields are allowed to provide in the body of the request. @@ -189,7 +189,7 @@ The `POST _reindex` request returns the following response fields: } ``` -### Response fields +### Response body fields For PUT and POST APIs: Define all allowable response fields that can be returned in the body of the response. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7afa9d7596..f6d9b87ee8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -78,11 +78,13 @@ Follow these steps to set up your local copy of the repository: 1. Navigate to your cloned repository. +##### Building by using locally installed packages + 1. Install [Ruby](https://www.ruby-lang.org/en/) if you don't already have it. We recommend [RVM](https://rvm.io/), but you can use any method you prefer: ``` curl -sSL https://get.rvm.io | bash -s stable - rvm install 3.2.4 + rvm install 3.3.2 ruby -v ``` @@ -98,6 +100,14 @@ Follow these steps to set up your local copy of the repository: bundle install ``` +##### Building by using containerization + +Assuming you have `docker-compose` installed, run the following command: + + ``` + docker compose -f docker-compose.dev.yml up + ``` + #### Troubleshooting Try the following troubleshooting steps if you encounter an error when trying to build the documentation website: @@ -148,6 +158,23 @@ To ensure that our documentation adheres to the [OpenSearch Project Style Guidel Optionally, you can install the [Vale VSCode](https://github.com/chrischinchilla/vale-vscode) extension, which integrates Vale with Visual Studio Code. By default, only _errors_ and _warnings_ are underlined. To change the minimum alert level to include _suggestions_, go to **Vale VSCode** > **Extension Settings** and select **suggestion** in the **Vale > Vale CLI: Min Alert Level** dropdown list. +## Troubleshooting + +This section provides information about potential solutions for known issues. + +### Installing Ruby on an Apple silicon machine + +If you're having trouble installing Ruby with `rvm` on an Apple silicon machine, it could be because of an OpenSSL version misalignment. To fix this issue, use the following command, replacing `` with your [desired version](https://github.com/ruby/openssl/blob/master/README.md): + +``` +# Assumes Brew is installed +curl -sSL https://get.rvm.io | bash -s stable +rvm install 3.2.4 --with-openssl-dir=$(brew --prefix openssl@) +ruby -v +``` + ## Getting help For help with the contribution process, reach out to one of the [points of contact](README.md#points-of-contact). 
+ + diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md new file mode 100644 index 0000000000..9b0ec1c79d --- /dev/null +++ b/DEVELOPER_GUIDE.md @@ -0,0 +1,135 @@ +# Developer guide +- [Introduction](#introduction) +- [Starting the Jekyll server locally](#starting-the-jekyll-server-locally) +- [Using the spec-insert Jekyll plugin](#using-the-spec-insert-jekyll-plugin) + - [Ignoring files and folders](#ignoring-files-and-folders) +- [CI/CD](#cicd) +- [Spec insert components](#spec-insert-components) + - [Query parameters](#query-parameters) + - [Path parameters](#path-parameters) + - [Paths and HTTP methods](#paths-and-http-methods) + +## Introduction + +The `.md` documents in this repository are rendered into HTML pages using [Jekyll](https://jekyllrb.com/). These HTML pages are hosted on [opensearch.org](https://opensearch.org/docs/latest/). + +## Starting the Jekyll server locally +You can run the Jekyll server locally to view the rendered HTML pages using the following steps: + +1. Install [Ruby](https://www.ruby-lang.org/en/documentation/installation/) 3.1.0 or later for your operating system. +2. Install the required gems by running `bundle install`. +3. Run `bundle exec jekyll serve` to start the Jekyll server locally (this can take several minutes to complete). +4. Open your browser and navigate to `http://localhost:4000` to view the rendered HTML pages. + +## Using the `spec-insert` Jekyll plugin +The `spec-insert` Jekyll plugin is used to insert API components into Markdown files. The plugin downloads the [latest OpenSearch specification](https://github.com/opensearch-project/opensearch-api-specification) and renders the API components from the spec. This aims to reduce the manual effort required to keep the documentation up to date. + +To use this plugin, make sure that you have installed Ruby 3.1.0 or later and the required gems by running `bundle install`. + +Edit your Markdown file and insert the following snippet where you want render an API component: + +```markdown + + +This is where the API component will be inserted. +Everything between the `spec_insert_start` and `spec_insert_end` tags will be overwritten. + + +``` + +Then run the following Jekyll command to render the API components: +```shell +bundle exec jekyll spec-insert +``` + +If you are working on multiple Markdown files and do not want to keep running the `jekyll spec-insert` command, you can add the `--watch` (or `-W`) flag to the command to watch for changes in the Markdown files and automatically render the API components: + +```shell +bundle exec jekyll spec-insert --watch +``` + +Depending on the text editor you are using, you may need to manually reload the file from disk to see the changes applied by the plugin if the editor does not automatically reload the file periodically. + +The plugin will pull the newest OpenSearch API spec from its [repository](https://github.com/opensearch-project/opensearch-api-specification) if the spec file does not exist locally or if it is older than 24 hours. To tell the plugin to always pull the newest spec, you can add the `--refresh-spec` (or `-R`) flag to the command: + +```shell +bundle exec jekyll spec-insert --refresh-spec +``` + +### Ignoring files and folders +The `spec-insert` plugin ignores all files and folders listed in the [./_config.yml#exclude](./_config.yml) list, which is also the list of files and folders that Jekyll ignores. 
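For reference, a minimal marker block for inserting the query parameters of the `search` API might look like the following sketch. The HTML-comment form of the `spec_insert_start`/`spec_insert_end` markers and the argument layout shown here are assumptions based on the tag and argument names described in this guide:

```markdown
<!-- spec_insert_start
api: search
component: query_parameters
-->
<!-- spec_insert_end -->
```

Running `bundle exec jekyll spec-insert` then overwrites everything between the two markers with the rendered component.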
+ +## CI/CD +The `spec-insert` plugin is run as part of the CI/CD pipeline to ensure that the API components are up to date in the documentation. This is performed through the [update-api-components.yml](.github/workflows/update-api-components.yml) GitHub Actions workflow, which creates a pull request containing the updated API components every Sunday. + +## Spec insert components +All spec insert components accept the following arguments: +- `api` (String; required): The name of the API to render the component from. This is equivalent to the `x-operation-group` field in the OpenSearch OpenAPI Spec. +- `component` (String; required): The name of the component to render, such as `query_parameters`, `path_parameters`, or `paths_and_http_methods`. +- `omit_header` (Boolean; Default is `false`): If set to `true`, the markdown header of the component will not be rendered. + +### Paths and HTTP methods +To insert paths and HTTP methods for the `search` API, use the following snippet: +```markdown + + +``` + +### Path parameters + +To insert a path parameters table of the `indices.create` API, use the following snippet. Use the `x-operation-group` field from OpenSearch OpenAPI Spec for the `api` value: + +```markdown + + +``` +This table accepts the same arguments as the query parameters table except the `include_global` argument. + +### Query parameters +To insert the API query parameters table of the `cat.indices` API, use the following snippet: +```markdown + + +``` + +This will insert the query parameters of the `cat.indices` API into the `.md` file with three default columns: `Parameter`, `Type`, and `Description`. You can customize the query parameters table by adding the `columns` argument which accepts a comma-separated list of column names. The available column names are: + +- `Parameter` +- `Type` +- `Description` +- `Required` +- `Default` + +_When `Required`/`Default` is not chosen, the information will be written in the `Description` column._ + +You can also customize this component with the following settings: + +- `include_global` (Boolean; default is `false`): Includes global query parameters in the table. +- `include_deprecated` (Boolean; default is `true`): Includes deprecated parameters in the table. +- `pretty` (Boolean; default is `false`): Renders the table in the pretty format instead of the compact format. + +The following snippet inserts the specified columns into the query parameters table: + +```markdown + + +``` diff --git a/Gemfile b/Gemfile index 7825dcd02b..fee04f3c48 100644 --- a/Gemfile +++ b/Gemfile @@ -1,4 +1,9 @@ -source "http://rubygems.org" +# frozen_string_literal: true + +source 'https://rubygems.org' + +# Manually add csv gem since Ruby 3.4.0 no longer includes it +gem 'csv', '~> 3.0' # Hello! This is where you manage which Jekyll version is used to run. # When you want to use a different version, change it below, save the @@ -8,12 +13,12 @@ source "http://rubygems.org" # # This will help ensure the proper Jekyll version is running. # Happy Jekylling! -gem "jekyll", "~> 4.3.2" +gem 'jekyll', '~> 4.3.2' # This is the default theme for new Jekyll sites. You may change this to anything you like. -gem "just-the-docs", "~> 0.3.3" -gem "jekyll-remote-theme", "~> 0.4" -gem "jekyll-redirect-from", "~> 0.16" +gem 'jekyll-redirect-from', '~> 0.16' +gem 'jekyll-remote-theme', '~> 0.4' +gem 'just-the-docs', '~> 0.3.3' # If you want to use GitHub Pages, remove the "gem "jekyll"" above and # uncomment the line below. To upgrade, run `bundle update github-pages`. 
@@ -22,21 +27,31 @@ gem "jekyll-redirect-from", "~> 0.16" # If you have any plugins, put them here! group :jekyll_plugins do - gem "jekyll-last-modified-at" - gem "jekyll-sitemap" + gem 'jekyll-last-modified-at' + gem 'jekyll-sitemap' + gem 'jekyll-spec-insert', :path => './spec-insert' end # Windows does not include zoneinfo files, so bundle the tzinfo-data gem -gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby] +gem 'tzinfo-data', platforms: %i[mingw mswin x64_mingw jruby] # Performance-booster for watching directories on Windows -gem "wdm", "~> 0.1.0" if Gem.win_platform? +gem 'wdm', '~> 0.1.0' if Gem.win_platform? # Installs webrick dependency for building locally -gem "webrick", "~> 1.7" - +gem 'webrick', '~> 1.7' # Link checker -gem "typhoeus" -gem "ruby-link-checker" -gem "ruby-enum" +gem 'ruby-enum' +gem 'ruby-link-checker' +gem 'typhoeus' + +# Spec Insert +gem 'activesupport', '~> 7' +gem 'mustache', '~> 1' + +group :development, :test do + gem 'rspec' + gem 'rubocop', '~> 1.44', require: false + gem 'rubocop-rake', require: false +end diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 55b908e027..b06d367e21 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -9,14 +9,14 @@ This document lists the maintainers in this repo. See [opensearch-project/.githu | Fanit Kolchina | [kolchfa-aws](https://github.com/kolchfa-aws) | Amazon | | Nate Archer | [Naarcha-AWS](https://github.com/Naarcha-AWS) | Amazon | | Nathan Bower | [natebower](https://github.com/natebower) | Amazon | -| Melissa Vagi | [vagimeli](https://github.com/vagimeli) | Amazon | | Miki Barahmand | [AMoo-Miki](https://github.com/AMoo-Miki) | Amazon | | David Venable | [dlvenable](https://github.com/dlvenable) | Amazon | -| Stephen Crawford | [stephen-crawford](https://github.com/stephen-crawford) | Amazon | | Eric Pugh | [epugh](https://github.com/epugh) | OpenSource Connections | ## Emeritus -| Maintainer | GitHub ID | Affiliation | -| ---------------- | ----------------------------------------------- | ----------- | -| Heather Halter | [hdhalter](https://github.com/hdhalter) | Amazon | +| Maintainer | GitHub ID | Affiliation | +| ---------------- | ------------------------------------------------------- | ----------- | +| Heather Halter | [hdhalter](https://github.com/hdhalter) | Amazon | +| Melissa Vagi | [vagimeli](https://github.com/vagimeli) | Amazon | +| Stephen Crawford | [stephen-crawford](https://github.com/stephen-crawford) | Amazon | \ No newline at end of file diff --git a/README.md b/README.md index 66beb1948c..807e106309 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ # About the OpenSearch documentation repo The `documentation-website` repository contains the user documentation for OpenSearch. You can find the rendered documentation at [opensearch.org/docs](https://opensearch.org/docs). +The markdown files in this repository are rendered into HTML pages using [Jekyll](https://jekyllrb.com/). Check the [DEVELOPER_GUIDE](DEVELOPER_GUIDE.md) for more information about how to use Jekyll for this repository. ## Contributing @@ -23,7 +24,6 @@ If you encounter problems or have questions when contributing to the documentati - [kolchfa-aws](https://github.com/kolchfa-aws) - [Naarcha-AWS](https://github.com/Naarcha-AWS) -- [vagimeli](https://github.com/vagimeli) ## Code of conduct diff --git a/TERMS.md b/TERMS.md index 7de56f9275..146f3c1049 100644 --- a/TERMS.md +++ b/TERMS.md @@ -588,6 +588,8 @@ Use % in headlines, quotations, and tables or in technical copy. 
An agent and REST API that allows you to query numerous performance metrics for your cluster, including aggregations of those metrics, independent of the Java Virtual Machine (JVM). +**performant** + **plaintext, plain text** Use *plaintext* only to refer to nonencrypted or decrypted text in content about encryption. Use *plain text* to refer to ASCII files. @@ -602,6 +604,10 @@ Tools inside of OpenSearch that can be customized to enhance OpenSearch's functi **pop-up** +**preaggregate** + +**precompute** + **premise, premises** With reference to property and buildings, always form as plural. diff --git a/_about/version-history.md b/_about/version-history.md index fd635aff5b..bfbd8e9f55 100644 --- a/_about/version-history.md +++ b/_about/version-history.md @@ -9,6 +9,9 @@ permalink: /version-history/ OpenSearch version | Release highlights | Release date :--- | :--- | :--- +[2.18.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.18.0.md) | Adds a redesigned home page, updated Discover interface, and collaborative workspaces to OpenSearch Dashboards. Includes improvements to ML inference processor and query grouping. Introduces reranking by field and paginated CAT APIs. Includes experimental OpenSearch Dashboards Assistant capabilities. For a full list of release highlights, see the Release Notes. | 05 November 2024 +[2.17.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.17.1.md) | Includes bug fixes for ML Commons, anomaly detection, k-NN, and security analytics. Adds various infrastructure and maintenance updates. For a full list of release highlights, see the Release Notes. | 1 October 2024 +[2.17.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.17.0.md) | Includes disk-optimized vector search, binary quantization, and byte vector encoding in k-NN. Adds asynchronous batch ingestion for ML tasks. Provides search and query performance enhancements and a new custom trace source in trace analytics. Includes application-based configuration templates. For a full list of release highlights, see the Release Notes. | 17 September 2024 [2.16.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.16.0.md) | Includes built-in byte vector quantization and binary vector support in k-NN. Adds new sort, split, and ML inference search processors for search pipelines. Provides application-based configuration templates and additional plugins to integrate multiple data sources in OpenSearch Dashboards. Includes an experimental Batch Predict ML Commons API. For a full list of release highlights, see the Release Notes. | 06 August 2024 [2.15.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.15.0.md) | Includes parallel ingestion processing, SIMD support for exact search, and the ability to disable doc values for the k-NN field. Adds wildcard and derived field types. Improves performance for single-cardinality aggregations, rolling upgrades to remote-backed clusters, and more metrics for top N queries. For a full list of release highlights, see the Release Notes. 
| 25 June 2024 [2.14.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.14.0.md) | Includes performance improvements to hybrid search and date histogram queries with multi-range traversal, ML model integration within the Ingest API, semantic cache for LangChain applications, low-level vector query interface for neural sparse queries, and improved k-NN search filtering. Provides an experimental tiered cache feature. For a full list of release highlights, see the Release Notes. | 14 May 2024 diff --git a/_aggregations/bucket/auto-interval-date-histogram.md b/_aggregations/bucket/auto-interval-date-histogram.md new file mode 100644 index 0000000000..b7a95a3b89 --- /dev/null +++ b/_aggregations/bucket/auto-interval-date-histogram.md @@ -0,0 +1,377 @@ +--- +layout: default +title: Auto-interval date histogram +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 12 +--- + +# Auto-interval date histogram + +Similar to the [date histogram aggregation]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-histogram/), in which you must specify an interval, the `auto_date_histogram` is a multi-bucket aggregation that automatically creates date histogram buckets based on the number of buckets you provide and the time range of your data. The actual number of buckets returned is always less than or equal to the number of buckets you specify. This aggregation is particularly useful when you are working with time-series data and want to visualize or analyze data over different time intervals without manually specifying the interval size. + +## Intervals + +The bucket interval is chosen based on the collected data to ensure that the number of returned buckets is less than or equal to the requested number. + +The following table lists the possible returned intervals for each time unit. + +| Unit | Intervals | +| :--- | :---| +| Seconds| Multiples of 1, 5, 10, and 30 | +| Minutes| Multiples of 1, 5, 10, and 30 | +| Hours | Multiples of 1, 3, and 12 | +| Days | Multiples of 1 and 7 | +| Months | Multiples of 1 and 3 | +| Years | Multiples of 1, 5, 10, 20, 50, and 100 | + +If an aggregation returns too many buckets (for example, daily buckets), OpenSearch will automatically reduce the number of buckets to ensure a manageable result. Instead of returning the exact number of requested daily buckets, it will reduce them by a factor of about 1/7. For example, if you ask for 70 buckets but the data contains too many daily intervals, OpenSearch might return only 10 buckets, grouping the data into larger intervals (such as weeks) to avoid an overwhelming number of results. This helps optimize the aggregation and prevent excessive detail when too much data is available. + +## Example + +In the following example, you'll search an index containing blog posts. 
+ +First, create a mapping for this index and specify the `date_posted` field as the `date` type: + +```json +PUT blogs +{ + "mappings" : { + "properties" : { + "date_posted" : { + "type" : "date", + "format" : "yyyy-MM-dd" + } + } + } +} +``` +{% include copy-curl.html %} + +Next, index the following documents into the `blogs` index: + +```json +PUT blogs/_doc/1 +{ + "name": "Semantic search in OpenSearch", + "date_posted": "2022-04-17" +} +``` +{% include copy-curl.html %} + +```json +PUT blogs/_doc/2 +{ + "name": "Sparse search in OpenSearch", + "date_posted": "2022-05-02" +} +``` +{% include copy-curl.html %} + +```json +PUT blogs/_doc/3 +{ + "name": "Distributed tracing with Data Prepper", + "date_posted": "2022-04-25" +} +``` +{% include copy-curl.html %} + +```json +PUT blogs/_doc/4 +{ + "name": "Observability in OpenSearch", + "date_posted": "2023-03-23" +} + +``` +{% include copy-curl.html %} + +To use the `auto_date_histogram` aggregation, specify the field containing the date or timestamp values. For example, to aggregate blog posts by `date_posted` into two buckets, send the following request: + +```json +GET /blogs/_search +{ + "size": 0, + "aggs": { + "histogram": { + "auto_date_histogram": { + "field": "date_posted", + "buckets": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +The response shows that the blog posts were aggregated into two buckets. The interval was automatically set to 1 year, with all three 2022 blog posts collected in one bucket and the 2023 blog post in another: + +```json +{ + "took": 20, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "histogram": { + "buckets": [ + { + "key_as_string": "2022-01-01", + "key": 1640995200000, + "doc_count": 3 + }, + { + "key_as_string": "2023-01-01", + "key": 1672531200000, + "doc_count": 1 + } + ], + "interval": "1y" + } + } +} +``` + +## Returned buckets + +Each bucket contains the following information: + +```json +{ + "key_as_string": "2023-01-01", + "key": 1672531200000, + "doc_count": 1 +} +``` + +In OpenSearch, dates are internally stored as 64-bit integers representing timestamps in milliseconds since the epoch. In the aggregation response, each bucket `key` is returned as such a timestamp. The `key_as_string` value shows the same timestamp but formatted as a date string based on the [`format`](#date-format) parameter. The `doc_count` field contains the number of documents in the bucket. + +## Parameters + +Auto-interval date histogram aggregations accept the following parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`field` | String | The field on which to aggregate. The field must contain the date or timestamp values. Either `field` or `script` is required. +`buckets` | Integer | The desired number of buckets. The returned number of buckets is less than or equal to the desired number. Optional. Default is `10`. +`minimum_interval` | String | The minimum interval to be used. Specifying a minimum interval can make the aggregation process more efficient. Valid values are `year`, `month`, `day`, `hour`, `minute`, and `second`. Optional. +`time_zone` | String | Specifies to use a time zone other than the default (UTC) for bucketing and rounding. 
You can specify the `time_zone` parameter as a [UTC offset](https://en.wikipedia.org/wiki/UTC_offset), such as `-04:00`, or an [IANA time zone ID](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones), such as `America/New_York`. Optional. Default is `UTC`. For more information, see [Time zone](#time-zone). +`format` | String | The format for returning dates representing bucket keys. Optional. Default is the format specified in the field mapping. For more information, see [Date format](#date-format). +`script` | String | A document-level or value-level script for aggregating values into buckets. Either `field` or `script` is required. +`missing` | String | Specifies how to handle documents in which the field value is missing. By default, such documents are ignored. If you specify a date value in the `missing` parameter, all documents in which the field value is missing are collected into the bucket with the specified date. + +## Date format + +If you don't specify the `format` parameter, the format defined in the field mapping is used (as seen in the preceding response). To modify the format, specify the `format` parameter: + +```json +GET /blogs/_search +{ + "size": 0, + "aggs": { + "histogram": { + "auto_date_histogram": { + "field": "date_posted", + "format": "yyyy-MM-dd HH:mm:ss" + } + } + } +} +``` +{% include copy-curl.html %} + +The `key_as_string` field is now returned in the specified format: + +```json +{ + "key_as_string": "2023-01-01 00:00:00", + "key": 1672531200000, + "doc_count": 1 +} +``` + +Alternatively, you can specify one of the built-in date [formats]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/#formats): + +```json +GET /blogs/_search +{ + "size": 0, + "aggs": { + "histogram": { + "auto_date_histogram": { + "field": "date_posted", + "format": "basic_date_time_no_millis" + } + } + } +} +``` +{% include copy-curl.html %} + +The `key_as_string` field is now returned in the specified format: + +```json +{ + "key_as_string": "20230101T000000Z", + "key": 1672531200000, + "doc_count": 1 +} +``` + +## Time zone + +By default, dates are stored and processed in UTC. The `time_zone` parameter allows you to specify a different time zone for bucketing. You can specify the `time_zone` parameter as a [UTC offset](https://en.wikipedia.org/wiki/UTC_offset), such as `-04:00`, or an [IANA time zone ID](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones), such as `America/New_York`. 
+ +As an example, index the following documents into an index: + +```json +PUT blogs1/_doc/1 +{ + "name": "Semantic search in OpenSearch", + "date_posted": "2022-04-17T01:00:00.000Z" +} +``` +{% include copy-curl.html %} + +```json +PUT blogs1/_doc/2 +{ + "name": "Sparse search in OpenSearch", + "date_posted": "2022-04-17T04:00:00.000Z" +} +``` +{% include copy-curl.html %} + +First, run an aggregation without specifying a time zone: + +```json +GET /blogs1/_search +{ + "size": 0, + "aggs": { + "histogram": { + "auto_date_histogram": { + "field": "date_posted", + "buckets": 2, + "format": "yyyy-MM-dd HH:mm:ss" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains two 3-hour buckets, starting at midnight UTC on April 17, 2022: + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "histogram": { + "buckets": [ + { + "key_as_string": "2022-04-17 01:00:00", + "key": 1650157200000, + "doc_count": 1 + }, + { + "key_as_string": "2022-04-17 04:00:00", + "key": 1650168000000, + "doc_count": 1 + } + ], + "interval": "3h" + } + } +} +``` + +Now, specify a `time_zone` of `-02:00`: + +```json +GET /blogs1/_search +{ + "size": 0, + "aggs": { + "histogram": { + "auto_date_histogram": { + "field": "date_posted", + "buckets": 2, + "format": "yyyy-MM-dd HH:mm:ss", + "time_zone": "-02:00" + } + } + } +} +``` + +The response contains two buckets in which the start time is shifted by 2 hours and starts at 23:00 on April 16, 2022: + +```json +{ + "took": 17, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "histogram": { + "buckets": [ + { + "key_as_string": "2022-04-16 23:00:00", + "key": 1650157200000, + "doc_count": 1 + }, + { + "key_as_string": "2022-04-17 02:00:00", + "key": 1650168000000, + "doc_count": 1 + } + ], + "interval": "3h" + } + } +} +``` + +When using time zones with daylight saving time (DST) changes, the sizes of buckets that are near the transition may differ slightly from the sizes of neighboring buckets. +{: .note} diff --git a/_aggregations/bucket/children.md b/_aggregations/bucket/children.md new file mode 100644 index 0000000000..1f493c4620 --- /dev/null +++ b/_aggregations/bucket/children.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Children +parent: Bucket aggregations +grand_parent: Aggregations +nav_order: 15 +--- + +# Children + +The `children` aggregation connects parent documents with their related child documents. This allows you to analyze relationships between different types of data in a single query, rather than needing to run multiple queries and combine the results manually. + +--- + +## Example index, sample data, and children aggregation query + +For example, if you have a parent-child relationship between authors, posts, and comments, you can analyze the relationships between the different data types (`authors`, `posts`, and `comments`) in a single query. + +The `authors` aggregation groups the documents by the `author.keyword` field. This allows you to see the number of documents associates with each author. + +In each author group, the `children` aggregation retrieves the associated posts. This gives you a breakdown of the posts written by each author. 
+ +In the `posts` aggregation, another `children` aggregation fetches the comments associated with each post. This provides you a way to see the comments for each individual post. + +In the `comments` aggregation, the `value_count` aggregation counts the number of comments on each post. This allows you to gauge the engagement level for each post by seeing the number of comments it has received. + +#### Example index + +```json +PUT /blog-sample +{ + "mappings": { + "properties": { + "type": { "type": "keyword" }, + "name": { "type": "keyword" }, + "title": { "type": "text" }, + "content": { "type": "text" }, + "author": { "type": "keyword" }, + "post_id": { "type": "keyword" }, + "join_field": { + "type": "join", + "relations": { + "author": "post", + "post": "comment" + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Sample documents + +```json +POST /blog-sample/_doc/1?routing=1 +{ + "type": "author", + "name": "John Doe", + "join_field": "author" +} + +POST /blog-sample/_doc/2?routing=1 +{ + "type": "post", + "title": "Introduction to OpenSearch", + "content": "OpenSearch is a powerful search and analytics engine...", + "author": "John Doe", + "join_field": { + "name": "post", + "parent": "1" + } +} + +POST /blog-sample/_doc/3?routing=1 +{ + "type": "comment", + "content": "Great article! Very informative.", + "join_field": { + "name": "comment", + "parent": "2" + } +} + +POST /blog-sample/_doc/4?routing=1 +{ + "type": "comment", + "content": "Thanks for the clear explanation.", + "join_field": { + "name": "comment", + "parent": "2" + } +} +``` +{% include copy-curl.html %} + +#### Example children aggregation query + +```json +GET /blog-sample/_search +{ + "size": 0, + "aggs": { + "authors": { + "terms": { + "field": "name.keyword" + }, + "aggs": { + "posts": { + "children": { + "type": "post" + }, + "aggs": { + "post_titles": { + "terms": { + "field": "title.keyword" + }, + "aggs": { + "comments": { + "children": { + "type": "comment" + }, + "aggs": { + "comment_count": { + "value_count": { + "field": "_id" + } + } + } + } + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +The response should appear similar to the following example: + +```json +{ + "took": 30, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "authors": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [] + } + } +} +``` diff --git a/_aggregations/bucket/index.md b/_aggregations/bucket/index.md index 1658c06ea5..e1a02b890d 100644 --- a/_aggregations/bucket/index.md +++ b/_aggregations/bucket/index.md @@ -22,6 +22,7 @@ You can use bucket aggregations to implement faceted navigation (usually placed OpenSearch supports the following bucket aggregations: - [Adjacency matrix]({{site.url}}{{site.baseurl}}/aggregations/bucket/adjacency-matrix/) +- [Children]({{site.url}}{{site.baseurl}}/aggregations/bucket/children) - [Date histogram]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-histogram/) - [Date range]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-range/) - [Diversified sampler]({{site.url}}{{site.baseurl}}/aggregations/bucket/diversified-sampler/) diff --git a/_aggregations/bucket/nested.md b/_aggregations/bucket/nested.md index 89c44c6457..affda8e437 100644 --- a/_aggregations/bucket/nested.md +++ b/_aggregations/bucket/nested.md @@ -96,8 +96,8 
@@ GET logs/_search "aggregations" : { "pages" : { "doc_count" : 2, - "min_price" : { - "value" : 200.0 + "min_load_time" : { + "value" : 200 } } } diff --git a/_analyzers/character-filters/html-character-filter.md b/_analyzers/character-filters/html-character-filter.md new file mode 100644 index 0000000000..ef55930bdf --- /dev/null +++ b/_analyzers/character-filters/html-character-filter.md @@ -0,0 +1,124 @@ +--- +layout: default +title: HTML strip +parent: Character filters +nav_order: 100 +--- + +# HTML strip character filter + +The `html_strip` character filter removes HTML tags, such as `
div>`, `<p>`, and `<a>`, from the input text and renders plain text. The filter can be configured to preserve certain tags or decode specific HTML entities, such as `&nbsp;`, into spaces. + +## Example: HTML analyzer + +```json +GET /_analyze +{ + "tokenizer": "keyword", + "char_filter": [ + "html_strip" + ], + "text": "<p>Commonly used calculus symbols include &alpha;, &beta; and &theta; </p>
" +} +``` +{% include copy-curl.html %} + +Using the HTML analyzer, you can convert the HTML character entity references into their corresponding symbols. The processed text would read as follows: + +``` +Commonly used calculus symbols include α, β and θ +``` + +## Example: Custom analyzer with lowercase filter + +The following example query creates a custom analyzer that strips HTML tags and converts the plain text to lowercase by using the `html_strip` analyzer and `lowercase` filter: + +```json +PUT /html_strip_and_lowercase_analyzer +{ + "settings": { + "analysis": { + "char_filter": { + "html_filter": { + "type": "html_strip" + } + }, + "analyzer": { + "html_strip_analyzer": { + "type": "custom", + "char_filter": ["html_filter"], + "tokenizer": "standard", + "filter": ["lowercase"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Testing `html_strip_and_lowercase_analyzer` + +You can run the following request to test the analyzer: + +```json +GET /html_strip_and_lowercase_analyzer/_analyze +{ + "analyzer": "html_strip_analyzer", + "text": "

<p>Welcome to OpenSearch!</p>
" +} +``` +{% include copy-curl.html %} + +In the response, the HTML tags have been removed and the plain text has been converted to lowercase: + +``` +welcome to opensearch! +``` + +## Example: Custom analyzer that preserves HTML tags + +The following example request creates a custom analyzer that preserves HTML tags: + +```json +PUT /html_strip_preserve_analyzer +{ + "settings": { + "analysis": { + "char_filter": { + "html_filter": { + "type": "html_strip", + "escaped_tags": ["b", "i"] + } + }, + "analyzer": { + "html_strip_analyzer": { + "type": "custom", + "char_filter": ["html_filter"], + "tokenizer": "keyword" + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Testing `html_strip_preserve_analyzer` + +You can run the following request to test the analyzer: + +```json +GET /html_strip_preserve_analyzer/_analyze +{ + "analyzer": "html_strip_analyzer", + "text": "

<p>This is a <b>bold</b> and <i>italic</i> text.</p>
" +} +``` +{% include copy-curl.html %} + +In the response, the `italic` and `bold` tags have been retained, as specified in the custom analyzer request: + +``` +This is a bold and italic text. +``` diff --git a/_analyzers/character-filters/index.md b/_analyzers/character-filters/index.md new file mode 100644 index 0000000000..0e2ce01b8c --- /dev/null +++ b/_analyzers/character-filters/index.md @@ -0,0 +1,19 @@ +--- +layout: default +title: Character filters +nav_order: 90 +has_children: true +has_toc: false +--- + +# Character filters + +Character filters process text before tokenization to prepare it for further analysis. + +Unlike token filters, which operate on tokens (words or terms), character filters process the raw input text before tokenization. They are especially useful for cleaning or transforming structured text containing unwanted characters, such as HTML tags or special symbols. Character filters help to strip or replace these elements so that text is properly formatted for analysis. + +Use cases for character filters include: + +- **HTML stripping:** Removes HTML tags from content so that only the plain text is indexed. +- **Pattern replacement:** Replaces or removes unwanted characters or patterns in text, for example, converting hyphens to spaces. +- **Custom mappings:** Substitutes specific characters or sequences with other values, for example, to convert currency symbols into their textual equivalents. diff --git a/_analyzers/custom-analyzer.md b/_analyzers/custom-analyzer.md new file mode 100644 index 0000000000..b808268f66 --- /dev/null +++ b/_analyzers/custom-analyzer.md @@ -0,0 +1,312 @@ +--- +layout: default +title: Creating a custom analyzer +nav_order: 90 +parent: Analyzers +--- + +# Creating a custom analyzer + +To create a custom analyzer, specify a combination of the following components: + +- Character filters (zero or more) + +- Tokenizer (one) + +- Token filters (zero or more) + +## Configuration + +The following parameters can be used to configure a custom analyzer. + +| Parameter | Required/Optional | Description | +|:--- | :--- | :--- | +| `type` | Optional | The analyzer type. Default is `custom`. You can also specify a prebuilt analyzer using this parameter. | +| `tokenizer` | Required | A tokenizer to be included in the analyzer. | +| `char_filter` | Optional | A list of character filters to be included in the analyzer. | +| `filter` | Optional | A list of token filters to be included in the analyzer. | +| `position_increment_gap` | Optional | The extra spacing applied between values when indexing text fields that have multiple values. For more information, see [Position increment gap](#position-increment-gap). Default is `100`. | + +## Examples + +The following examples demonstrate various custom analyzer configurations. + +### Custom analyzer with a character filter for HTML stripping + +The following example analyzer removes HTML tags from text before tokenization: + +```json +PUT simple_html_strip_analyzer_index +{ + "settings": { + "analysis": { + "analyzer": { + "html_strip_analyzer": { + "type": "custom", + "char_filter": ["html_strip"], + "tokenizer": "whitespace", + "filter": ["lowercase"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET simple_html_strip_analyzer_index/_analyze +{ + "analyzer": "html_strip_analyzer", + "text": "

<p>OpenSearch is <strong>awesome</strong>!</p>
" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 3, + "end_offset": 13, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 14, + "end_offset": 16, + "type": "word", + "position": 1 + }, + { + "token": "awesome!", + "start_offset": 25, + "end_offset": 42, + "type": "word", + "position": 2 + } + ] +} +``` + +### Custom analyzer with a mapping character filter for synonym replacement + +The following example analyzer replaces specific characters and patterns before applying the synonym filter: + +```json +PUT mapping_analyzer_index +{ + "settings": { + "analysis": { + "analyzer": { + "synonym_mapping_analyzer": { + "type": "custom", + "char_filter": ["underscore_to_space"], + "tokenizer": "standard", + "filter": ["lowercase", "stop", "synonym_filter"] + } + }, + "char_filter": { + "underscore_to_space": { + "type": "mapping", + "mappings": ["_ => ' '"] + } + }, + "filter": { + "synonym_filter": { + "type": "synonym", + "synonyms": [ + "quick, fast, speedy", + "big, large, huge" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET mapping_analyzer_index/_analyze +{ + "analyzer": "synonym_mapping_analyzer", + "text": "The slow_green_turtle is very large" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "slow","start_offset": 4,"end_offset": 8,"type": "","position": 1}, + {"token": "green","start_offset": 9,"end_offset": 14,"type": "","position": 2}, + {"token": "turtle","start_offset": 15,"end_offset": 21,"type": "","position": 3}, + {"token": "very","start_offset": 25,"end_offset": 29,"type": "","position": 5}, + {"token": "large","start_offset": 30,"end_offset": 35,"type": "","position": 6}, + {"token": "big","start_offset": 30,"end_offset": 35,"type": "SYNONYM","position": 6}, + {"token": "huge","start_offset": 30,"end_offset": 35,"type": "SYNONYM","position": 6} + ] +} +``` + +### Custom analyzer with a custom pattern-based character filter for number normalization + +The following example analyzer normalizes phone numbers by removing dashes and spaces and applies edge n-grams to the normalized text to support partial matches: + +```json +PUT advanced_pattern_replace_analyzer_index +{ + "settings": { + "analysis": { + "analyzer": { + "phone_number_analyzer": { + "type": "custom", + "char_filter": ["phone_normalization"], + "tokenizer": "standard", + "filter": ["lowercase", "edge_ngram"] + } + }, + "char_filter": { + "phone_normalization": { + "type": "pattern_replace", + "pattern": "[-\\s]", + "replacement": "" + } + }, + "filter": { + "edge_ngram": { + "type": "edge_ngram", + "min_gram": 3, + "max_gram": 10 + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET advanced_pattern_replace_analyzer_index/_analyze +{ + "analyzer": "phone_number_analyzer", + "text": "123-456 7890" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "123","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "1234","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "12345","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "123456","start_offset": 0,"end_offset": 
12,"type": "","position": 0}, + {"token": "1234567","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "12345678","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "123456789","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "1234567890","start_offset": 0,"end_offset": 12,"type": "","position": 0} + ] +} +``` + +## Position increment gap + +The `position_increment_gap` parameter sets a positional gap between terms when indexing multi-valued fields, such as arrays. This gap ensures that phrase queries don't match terms across separate values unless explicitly allowed. For example, a default gap of 100 specifies that terms in different array entries are 100 positions apart, preventing unintended matches in phrase searches. You can adjust this value or set it to `0` in order to allow phrases to span across array values. + +The following example demonstrates the effect of `position_increment_gap` using a `match_phrase` query. + +1. Index a document in a `test-index`: + + ```json + PUT test-index/_doc/1 + { + "names": [ "Slow green", "turtle swims"] + } + ``` + {% include copy-curl.html %} + +1. Query the document using a `match_phrase` query: + + ```json + GET test-index/_search + { + "query": { + "match_phrase": { + "names": { + "query": "green turtle" + } + } + } + } + ``` + {% include copy-curl.html %} + + The response returns no hits because the distance between the terms `green` and `turtle` is `100` (the default `position_increment_gap`). + +1. Now query the document using a `match_phrase` query with a `slop` parameter that is higher than the `position_increment_gap`: + + ```json + GET test-index/_search + { + "query": { + "match_phrase": { + "names": { + "query": "green turtle", + "slop": 101 + } + } + } + } + ``` + {% include copy-curl.html %} + + The response contains the matching document: + + ```json + { + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.010358453, + "hits": [ + { + "_index": "test-index", + "_id": "1", + "_score": 0.010358453, + "_source": { + "names": [ + "Slow green", + "turtle swims" + ] + } + } + ] + } + } + ``` diff --git a/_analyzers/index-analyzers.md b/_analyzers/index-analyzers.md index 72332758d0..3c40755502 100644 --- a/_analyzers/index-analyzers.md +++ b/_analyzers/index-analyzers.md @@ -2,6 +2,7 @@ layout: default title: Index analyzers nav_order: 20 +parent: Analyzers --- # Index analyzers diff --git a/_analyzers/index.md b/_analyzers/index.md index 95f97ec8ce..1dc38b2cd4 100644 --- a/_analyzers/index.md +++ b/_analyzers/index.md @@ -45,24 +45,13 @@ An analyzer must contain exactly one tokenizer and may contain zero or more char There is also a special type of analyzer called a ***normalizer***. A normalizer is similar to an analyzer except that it does not contain a tokenizer and can only include specific types of character filters and token filters. These filters can perform only character-level operations, such as character or pattern replacement, and cannot perform operations on the token as a whole. This means that replacing a token with a synonym or stemming is not supported. See [Normalizers]({{site.url}}{{site.baseurl}}/analyzers/normalizers/) for further details. -## Built-in analyzers +## Supported analyzers -The following table lists the built-in analyzers that OpenSearch provides. 
The last column of the table contains the result of applying the analyzer to the string `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`. - -Analyzer | Analysis performed | Analyzer output -:--- | :--- | :--- -**Standard** (default) | - Parses strings into tokens at word boundaries
<br> - Removes most punctuation <br> - Converts tokens to lowercase | [`it’s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] -**Simple** | - Parses strings into tokens on any non-letter character <br> - Removes non-letter characters <br> - Converts tokens to lowercase | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `to`, `opensearch`] -**Whitespace** | - Parses strings into tokens on white space | [`It’s`, `fun`, `to`, `contribute`, `a`,`brand-new`, `PR`, `or`, `2`, `to`, `OpenSearch!`] -**Stop** | - Parses strings into tokens on any non-letter character <br> - Removes non-letter characters <br> - Removes stop words <br> - Converts tokens to lowercase | [`s`, `fun`, `contribute`, `brand`, `new`, `pr`, `opensearch`] -**Keyword** (no-op) | - Outputs the entire string unchanged | [`It’s fun to contribute a brand-new PR or 2 to OpenSearch!`] -**Pattern** | - Parses strings into tokens using regular expressions <br> - Supports converting strings to lowercase <br> - Supports removing stop words | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] -[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] -**Fingerprint** | - Parses strings on any non-letter character <br> - Normalizes characters by converting them to ASCII <br> - Converts tokens to lowercase <br> - Sorts, deduplicates, and concatenates tokens into a single token <br> - Supports removing stop words | [`2 a brand contribute fun it's new opensearch or pr to`] <br>
Note that the apostrophe was converted to its ASCII counterpart. +For a list of supported analyzers, see [Analyzers]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/index/). ## Custom analyzers -If needed, you can combine tokenizers, token filters, and character filters to create a custom analyzer. +If needed, you can combine tokenizers, token filters, and character filters to create a custom analyzer. For more information, see [Creating a custom analyzer]({{site.url}}{{site.baseurl}}/analyzers/custom-analyzer/). ## Text analysis at indexing time and query time @@ -170,6 +159,31 @@ The response provides information about the analyzers for each field: } ``` +## Normalizers + +Tokenization divides text into individual terms, but it does not address variations in token forms. Normalization resolves these issues by converting tokens into a standard format. This ensures that similar terms are matched appropriately, even if they are not identical. + +### Normalization techniques + +The following normalization techniques can help address variations in token forms: + +1. **Case normalization**: Converts all tokens to lowercase to ensure case-insensitive matching. For example, "Hello" is normalized to "hello". + +2. **Stemming**: Reduces words to their root form. For instance, "cars" is stemmed to "car" and "running" is normalized to "run". + +3. **Synonym handling:** Treats synonyms as equivalent. For example, "jogging" and "running" can be indexed under a common term, such as "run". + +### Normalization + +A search for `Hello` will match documents containing `hello` because of case normalization. + +A search for `cars` will also match documents containing `car` because of stemming. + +A query for `running` can retrieve documents containing `jogging` using synonym handling. + +Normalization ensures that searches are not limited to exact term matches, allowing for more relevant results. For instance, a search for `Cars running` can be normalized to match `car run`. + ## Next steps -- Learn more about specifying [index analyzers]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) and [search analyzers]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/). \ No newline at end of file +- Learn more about specifying [index analyzers]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) and [search analyzers]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/). +- See the list of [supported analyzers]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/index/). \ No newline at end of file diff --git a/_analyzers/language-analyzers.md b/_analyzers/language-analyzers.md deleted file mode 100644 index f5a2f18cb3..0000000000 --- a/_analyzers/language-analyzers.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -layout: default -title: Language analyzers -nav_order: 10 -redirect_from: - - /query-dsl/analyzers/language-analyzers/ ---- - -# Language analyzer - -OpenSearch supports the following language values with the `analyzer` option: -`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `turkish`, and `thai`. - -To use the analyzer when you map an index, specify the value within your query. 
For example, to map your index with the French language analyzer, specify the `french` value for the analyzer field: - -```json - "analyzer": "french" -``` - -#### Example request - -The following query specifies the `french` language analyzer for the index `my-index`: - -```json -PUT my-index -{ - "mappings": { - "properties": { - "text": { - "type": "text", - "fields": { - "french": { - "type": "text", - "analyzer": "french" - } - } - } - } - } -} -``` - - \ No newline at end of file diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md new file mode 100644 index 0000000000..e61c684cbb --- /dev/null +++ b/_analyzers/language-analyzers/arabic.md @@ -0,0 +1,182 @@ +--- +layout: default +title: Arabic +parent: Language analyzers +grand_parent: Analyzers +nav_order: 10 +--- + +# Arabic analyzer + +The built-in `arabic` analyzer can be applied to a text field using the following command: + +```json +PUT /arabic-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "arabic" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_arabic +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_arabic_analyzer":{ + "type":"arabic", + "stem_exclusion":["تكنولوجيا","سلطة "] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Arabic analyzer internals + +The `arabic` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - stop (Arabic) + - normalization (Arabic) + - keyword + - stemmer (Arabic) + +## Custom Arabic analyzer + +You can create a custom Arabic analyzer using the following command: + +```json +PUT /arabic-index +{ + "settings": { + "analysis": { + "filter": { + "arabic_stop": { + "type": "stop", + "stopwords": "_arabic_" + }, + "arabic_stemmer": { + "type": "stemmer", + "language": "arabic" + }, + "arabic_normalization": { + "type": "arabic_normalization" + }, + "decimal_digit": { + "type": "decimal_digit" + }, + "arabic_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "arabic_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "arabic_normalization", + "decimal_digit", + "arabic_stop", + "arabic_keywords", + "arabic_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "arabic_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /arabic-index/_analyze +{ + "field": "content", + "text": "الطلاب يدرسون في الجامعات العربية. أرقامهم ١٢٣٤٥٦." 
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "طلاب",
+      "start_offset": 0,
+      "end_offset": 6,
+      "type": "",
+      "position": 0
+    },
+    {
+      "token": "يدرس",
+      "start_offset": 7,
+      "end_offset": 13,
+      "type": "",
+      "position": 1
+    },
+    {
+      "token": "جامع",
+      "start_offset": 17,
+      "end_offset": 25,
+      "type": "",
+      "position": 3
+    },
+    {
+      "token": "عرب",
+      "start_offset": 26,
+      "end_offset": 33,
+      "type": "",
+      "position": 4
+    },
+    {
+      "token": "ارقامهم",
+      "start_offset": 35,
+      "end_offset": 42,
+      "type": "",
+      "position": 5
+    },
+    {
+      "token": "123456",
+      "start_offset": 43,
+      "end_offset": 49,
+      "type": "",
+      "position": 6
+    }
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md
new file mode 100644
index 0000000000..9bd0549c80
--- /dev/null
+++ b/_analyzers/language-analyzers/armenian.md
@@ -0,0 +1,137 @@
+---
+layout: default
+title: Armenian
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 20
+---
+
+# Armenian analyzer
+
+The built-in `armenian` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /armenian-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "armenian"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_armenian_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_armenian_analyzer": {
+          "type": "armenian",
+          "stem_exclusion": ["բարև", "խաղաղություն"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Armenian analyzer internals
+
+The `armenian` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Armenian)
+  - keyword
+  - stemmer (Armenian)
+
+## Custom Armenian analyzer
+
+You can create a custom Armenian analyzer using the following command:
+
+```json
+PUT /armenian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "armenian_stop": {
+          "type": "stop",
+          "stopwords": "_armenian_"
+        },
+        "armenian_stemmer": {
+          "type": "stemmer",
+          "language": "armenian"
+        },
+        "armenian_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "armenian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "armenian_stop",
+            "armenian_keywords",
+            "armenian_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "armenian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+GET index_with_stem_exclusion_armenian_analyzer/_analyze
+{
+  "analyzer": "stem_exclusion_armenian_analyzer",
+  "text": "բարև բոլորին, մենք խաղաղություն ենք ուզում և նոր օր ենք սկսել"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "բարև","start_offset": 0,"end_offset": 4,"type": "","position": 0},
+    {"token": "բոլոր","start_offset": 5,"end_offset": 12,"type": "","position": 1},
+    {"token": "խաղաղություն","start_offset": 19,"end_offset": 31,"type": "","position": 3},
+    {"token": "ուզ","start_offset": 36,"end_offset": 42,"type": "","position": 5},
+    {"token": "նոր","start_offset": 45,"end_offset": 48,"type": "","position": 7},
{"token": "օր","start_offset": 49,"end_offset": 51,"type": "","position": 8}, + {"token": "սկսել","start_offset": 56,"end_offset": 61,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md new file mode 100644 index 0000000000..e73510cc66 --- /dev/null +++ b/_analyzers/language-analyzers/basque.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Basque +parent: Language analyzers +grand_parent: Analyzers +nav_order: 30 +--- + +# Basque analyzer + +The built-in `basque` analyzer can be applied to a text field using the following command: + +```json +PUT /basque-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "basque" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_basque_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_basque_analyzer": { + "type": "basque", + "stem_exclusion": ["autoritate", "baldintza"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Basque analyzer internals + +The `basque` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Basque) + - keyword + - stemmer (Basque) + +## Custom Basque analyzer + +You can create a custom Basque analyzer using the following command: + +```json +PUT /basque-index +{ + "settings": { + "analysis": { + "filter": { + "basque_stop": { + "type": "stop", + "stopwords": "_basque_" + }, + "basque_stemmer": { + "type": "stemmer", + "language": "basque" + }, + "basque_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "basque_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "basque_stop", + "basque_keywords", + "basque_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "basque_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /basque-index/_analyze +{ + "field": "content", + "text": "Ikasleek euskal unibertsitateetan ikasten dute. Haien zenbakiak 123456 dira." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "ikasle","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "euskal","start_offset": 9,"end_offset": 15,"type": "","position": 1}, + {"token": "unibertsi","start_offset": 16,"end_offset": 33,"type": "","position": 2}, + {"token": "ikas","start_offset": 34,"end_offset": 41,"type": "","position": 3}, + {"token": "haien","start_offset": 48,"end_offset": 53,"type": "","position": 5}, + {"token": "zenba","start_offset": 54,"end_offset": 63,"type": "","position": 6}, + {"token": "123456","start_offset": 64,"end_offset": 70,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md new file mode 100644 index 0000000000..af913a01ef --- /dev/null +++ b/_analyzers/language-analyzers/bengali.md @@ -0,0 +1,142 @@ +--- +layout: default +title: Bengali +parent: Language analyzers +grand_parent: Analyzers +nav_order: 40 +--- + +# Bengali analyzer + +The built-in `bengali` analyzer can be applied to a text field using the following command: + +```json +PUT /bengali-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bengali" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_bengali_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_bengali_analyzer": { + "type": "bengali", + "stem_exclusion": ["কর্তৃপক্ষ", "অনুমোদন"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Bengali analyzer internals + +The `bengali` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - indic_normalization + - normalization (Bengali) + - stop (Bengali) + - keyword + - stemmer (Bengali) + +## Custom Bengali analyzer + +You can create a custom Bengali analyzer using the following command: + +```json +PUT /bengali-index +{ + "settings": { + "analysis": { + "filter": { + "bengali_stop": { + "type": "stop", + "stopwords": "_bengali_" + }, + "bengali_stemmer": { + "type": "stemmer", + "language": "bengali" + }, + "bengali_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "bengali_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "indic_normalization", + "bengali_normalization", + "bengali_stop", + "bengali_keywords", + "bengali_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bengali_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /bengali-index/_analyze +{ + "field": "content", + "text": "ছাত্ররা বিশ্ববিদ্যালয়ে পড়াশোনা করে। তাদের নম্বরগুলি ১২৩৪৫৬।" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "ছাত্র","start_offset": 0,"end_offset": 7,"type": "","position": 0}, + {"token": "বিসসবিদালয়","start_offset": 8,"end_offset": 23,"type": "","position": 1}, + {"token": "পরাসোন","start_offset": 24,"end_offset": 32,"type": "","position": 2}, + {"token": "তা","start_offset": 38,"end_offset": 43,"type": "","position": 4}, + {"token": 
"নমমর","start_offset": 44,"end_offset": 53,"type": "","position": 5}, + {"token": "123456","start_offset": 54,"end_offset": 60,"type": "","position": 6} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md new file mode 100644 index 0000000000..67db2b92bc --- /dev/null +++ b/_analyzers/language-analyzers/brazilian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Brazilian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 50 +--- + +# Brazilian analyzer + +The built-in `brazilian` analyzer can be applied to a text field using the following command: + +```json +PUT /brazilian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "brazilian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_brazilian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_brazilian_analyzer": { + "type": "brazilian", + "stem_exclusion": ["autoridade", "aprovação"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Brazilian analyzer internals + +The `brazilian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Brazilian) + - keyword + - stemmer (Brazilian) + +## Custom Brazilian analyzer + +You can create a custom Brazilian analyzer using the following command: + +```json +PUT /brazilian-index +{ + "settings": { + "analysis": { + "filter": { + "brazilian_stop": { + "type": "stop", + "stopwords": "_brazilian_" + }, + "brazilian_stemmer": { + "type": "stemmer", + "language": "brazilian" + }, + "brazilian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "brazilian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "brazilian_stop", + "brazilian_keywords", + "brazilian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "brazilian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /brazilian-index/_analyze +{ + "field": "content", + "text": "Estudantes estudam em universidades brasileiras. Seus números são 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estudant","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "estud","start_offset": 11,"end_offset": 18,"type": "","position": 1}, + {"token": "univers","start_offset": 22,"end_offset": 35,"type": "","position": 3}, + {"token": "brasileir","start_offset": 36,"end_offset": 47,"type": "","position": 4}, + {"token": "numer","start_offset": 54,"end_offset": 61,"type": "","position": 6}, + {"token": "sao","start_offset": 62,"end_offset": 65,"type": "","position": 7}, + {"token": "123456","start_offset": 66,"end_offset": 72,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/bulgarian.md b/_analyzers/language-analyzers/bulgarian.md new file mode 100644 index 0000000000..42d5794e18 --- /dev/null +++ b/_analyzers/language-analyzers/bulgarian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Bulgarian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 60 +--- + +# Bulgarian analyzer + +The built-in `bulgarian` analyzer can be applied to a text field using the following command: + +```json +PUT /bulgarian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bulgarian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_bulgarian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_bulgarian_analyzer": { + "type": "bulgarian", + "stem_exclusion": ["авторитет", "одобрение"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Bulgarian analyzer internals + +The `bulgarian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Bulgarian) + - keyword + - stemmer (Bulgarian) + +## Custom Bulgarian analyzer + +You can create a custom Bulgarian analyzer using the following command: + +```json +PUT /bulgarian-index +{ + "settings": { + "analysis": { + "filter": { + "bulgarian_stop": { + "type": "stop", + "stopwords": "_bulgarian_" + }, + "bulgarian_stemmer": { + "type": "stemmer", + "language": "bulgarian" + }, + "bulgarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "bulgarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "bulgarian_stop", + "bulgarian_keywords", + "bulgarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bulgarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /bulgarian-index/_analyze +{ + "field": "content", + "text": "Студентите учат в българските университети. Техните номера са 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "студент","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "учат","start_offset": 11,"end_offset": 15,"type": "","position": 1}, + {"token": "българск","start_offset": 18,"end_offset": 29,"type": "","position": 3}, + {"token": "университят","start_offset": 30,"end_offset": 42,"type": "","position": 4}, + {"token": "техн","start_offset": 44,"end_offset": 51,"type": "","position": 5}, + {"token": "номер","start_offset": 52,"end_offset": 58,"type": "","position": 6}, + {"token": "123456","start_offset": 62,"end_offset": 68,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/catalan.md b/_analyzers/language-analyzers/catalan.md new file mode 100644 index 0000000000..89762da094 --- /dev/null +++ b/_analyzers/language-analyzers/catalan.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Catalan +parent: Language analyzers +grand_parent: Analyzers +nav_order: 70 +--- + +# Catalan analyzer + +The built-in `catalan` analyzer can be applied to a text field using the following command: + +```json +PUT /catalan-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "catalan" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_catalan_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_catalan_analyzer": { + "type": "catalan", + "stem_exclusion": ["autoritat", "aprovació"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Catalan analyzer internals + +The `catalan` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (Catalan) + - lowercase + - stop (Catalan) + - keyword + - stemmer (Catalan) + +## Custom Catalan analyzer + +You can create a custom Catalan analyzer using the following command: + +```json +PUT /catalan-index +{ + "settings": { + "analysis": { + "filter": { + "catalan_stop": { + "type": "stop", + "stopwords": "_catalan_" + }, + "catalan_elision": { + "type": "elision", + "articles": [ "d", "l", "m", "n", "s", "t"], + "articles_case": true + }, + "catalan_stemmer": { + "type": "stemmer", + "language": "catalan" + }, + "catalan_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "catalan_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "catalan_elision", + "lowercase", + "catalan_stop", + "catalan_keywords", + "catalan_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "catalan_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /catalan-index/_analyze +{ + "field": "content", + "text": "Els estudiants estudien a les universitats catalanes. Els seus números són 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estud","start_offset": 4,"end_offset": 14,"type": "","position": 1}, + {"token": "estud","start_offset": 15,"end_offset": 23,"type": "","position": 2}, + {"token": "univer","start_offset": 30,"end_offset": 42,"type": "","position": 5}, + {"token": "catalan","start_offset": 43,"end_offset": 52,"type": "","position": 6}, + {"token": "numer","start_offset": 63,"end_offset": 70,"type": "","position": 9}, + {"token": "123456","start_offset": 75,"end_offset": 81,"type": "","position": 11} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/cjk.md b/_analyzers/language-analyzers/cjk.md new file mode 100644 index 0000000000..aed7e6da22 --- /dev/null +++ b/_analyzers/language-analyzers/cjk.md @@ -0,0 +1,142 @@ +--- +layout: default +title: CJK +parent: Language analyzers +grand_parent: Analyzers +nav_order: 80 +--- + +# CJK analyzer + +The built-in `cjk` analyzer can be applied to a text field using the following command: + +```json +PUT /cjk-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "cjk" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_cjk_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_cjk_analyzer": { + "type": "cjk", + "stem_exclusion": ["example", "words"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## CJK analyzer internals + +The `cjk` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - cjk_width + - lowercase + - cjk_bigram + - stop (similar to English) + +## Custom CJK analyzer + +You can create a custom CJK analyzer using the following command: + +```json +PUT /cjk-index +{ + "settings": { + "analysis": { + "filter": { + "english_stop": { + "type": "stop", + "stopwords": [ + "a", "and", "are", "as", "at", "be", "but", "by", "for", + "if", "in", "into", "is", "it", "no", "not", "of", "on", + "or", "s", "such", "t", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", + "with", "www" + ] + } + }, + "analyzer": { + "cjk_custom_analyzer": { + "tokenizer": "standard", + "filter": [ + "cjk_width", + "lowercase", + "cjk_bigram", + "english_stop" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "cjk_custom_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /cjk-index/_analyze +{ + "field": "content", + "text": "学生们在中国、日本和韩国的大学学习。123456" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "学生","start_offset": 0,"end_offset": 2,"type": "","position": 0}, + {"token": "生们","start_offset": 1,"end_offset": 3,"type": "","position": 1}, + {"token": "们在","start_offset": 2,"end_offset": 4,"type": "","position": 2}, + {"token": "在中","start_offset": 3,"end_offset": 5,"type": "","position": 3}, + {"token": "中国","start_offset": 4,"end_offset": 6,"type": "","position": 4}, + {"token": "日本","start_offset": 7,"end_offset": 9,"type": "","position": 5}, + {"token": "本和","start_offset": 8,"end_offset": 10,"type": "","position": 6}, + {"token": "和韩","start_offset": 9,"end_offset": 
11,"type": "","position": 7}, + {"token": "韩国","start_offset": 10,"end_offset": 12,"type": "","position": 8}, + {"token": "国的","start_offset": 11,"end_offset": 13,"type": "","position": 9}, + {"token": "的大","start_offset": 12,"end_offset": 14,"type": "","position": 10}, + {"token": "大学","start_offset": 13,"end_offset": 15,"type": "","position": 11}, + {"token": "学学","start_offset": 14,"end_offset": 16,"type": "","position": 12}, + {"token": "学习","start_offset": 15,"end_offset": 17,"type": "","position": 13}, + {"token": "123456","start_offset": 18,"end_offset": 24,"type": "","position": 14} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/czech.md b/_analyzers/language-analyzers/czech.md new file mode 100644 index 0000000000..c1778cd0f4 --- /dev/null +++ b/_analyzers/language-analyzers/czech.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Czech +parent: Language analyzers +grand_parent: Analyzers +nav_order: 90 +--- + +# Czech analyzer + +The built-in `czech` analyzer can be applied to a text field using the following command: + +```json +PUT /czech-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "czech" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_czech_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_czech_analyzer": { + "type": "czech", + "stem_exclusion": ["autorita", "schválení"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Czech analyzer internals + +The `czech` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Czech) + - keyword + - stemmer (Czech) + +## Custom Czech analyzer + +You can create a custom Czech analyzer using the following command: + +```json +PUT /czech-index +{ + "settings": { + "analysis": { + "filter": { + "czech_stop": { + "type": "stop", + "stopwords": "_czech_" + }, + "czech_stemmer": { + "type": "stemmer", + "language": "czech" + }, + "czech_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "czech_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "czech_stop", + "czech_keywords", + "czech_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "czech_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /czech-index/_analyze +{ + "field": "content", + "text": "Studenti studují na českých univerzitách. Jejich čísla jsou 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "studuj", + "start_offset": 9, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "česk", + "start_offset": 20, + "end_offset": 27, + "type": "", + "position": 3 + }, + { + "token": "univerzit", + "start_offset": 28, + "end_offset": 40, + "type": "", + "position": 4 + }, + { + "token": "čísl", + "start_offset": 49, + "end_offset": 54, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 60, + "end_offset": 66, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/danish.md b/_analyzers/language-analyzers/danish.md new file mode 100644 index 0000000000..b5ee1b0e97 --- /dev/null +++ b/_analyzers/language-analyzers/danish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Danish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 100 +--- + +# Danish analyzer + +The built-in `danish` analyzer can be applied to a text field using the following command: + +```json +PUT /danish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "danish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_danish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_danish_analyzer": { + "type": "danish", + "stem_exclusion": ["autoritet", "godkendelse"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Danish analyzer internals + +The `danish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Danish) + - keyword + - stemmer (Danish) + +## Custom Danish analyzer + +You can create a custom Danish analyzer using the following command: + +```json +PUT /danish-index +{ + "settings": { + "analysis": { + "filter": { + "danish_stop": { + "type": "stop", + "stopwords": "_danish_" + }, + "danish_stemmer": { + "type": "stemmer", + "language": "danish" + }, + "danish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "danish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "danish_stop", + "danish_keywords", + "danish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "danish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /danish-index/_analyze +{ + "field": "content", + "text": "Studerende studerer på de danske universiteter. Deres numre er 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stud", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "stud", + "start_offset": 11, + "end_offset": 19, + "type": "", + "position": 1 + }, + { + "token": "dansk", + "start_offset": 26, + "end_offset": 32, + "type": "", + "position": 4 + }, + { + "token": "universitet", + "start_offset": 33, + "end_offset": 46, + "type": "", + "position": 5 + }, + { + "token": "numr", + "start_offset": 54, + "end_offset": 59, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 63, + "end_offset": 69, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/dutch.md b/_analyzers/language-analyzers/dutch.md new file mode 100644 index 0000000000..0259707d78 --- /dev/null +++ b/_analyzers/language-analyzers/dutch.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Dutch +parent: Language analyzers +grand_parent: Analyzers +nav_order: 110 +--- + +# Dutch analyzer + +The built-in `dutch` analyzer can be applied to a text field using the following command: + +```json +PUT /dutch-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "dutch" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_dutch_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_dutch_analyzer": { + "type": "dutch", + "stem_exclusion": ["autoriteit", "goedkeuring"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Dutch analyzer internals + +The `dutch` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Dutch) + - keyword + - stemmer_override + - stemmer (Dutch) + +## Custom Dutch analyzer + +You can create a custom Dutch analyzer using the following command: + +```json +PUT /dutch-index +{ + "settings": { + "analysis": { + "filter": { + "dutch_stop": { + "type": "stop", + "stopwords": "_dutch_" + }, + "dutch_stemmer": { + "type": "stemmer", + "language": "dutch" + }, + "dutch_keywords": { + "type": "keyword_marker", + "keywords": [] + }, + "dutch_override": { + "type": "stemmer_override", + "rules": [ + "fiets=>fiets", + "bromfiets=>bromfiets", + "ei=>eier", + "kind=>kinder" + ] + } + }, + "analyzer": { + "dutch_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "dutch_stop", + "dutch_keywords", + "dutch_override", + "dutch_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "dutch_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /dutch-index/_analyze +{ + "field": "content", + "text": "De studenten studeren in Nederland en bezoeken Amsterdam. Hun nummers zijn 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 3,"end_offset": 12,"type": "","position": 1}, + {"token": "studer","start_offset": 13,"end_offset": 21,"type": "","position": 2}, + {"token": "nederland","start_offset": 25,"end_offset": 34,"type": "","position": 4}, + {"token": "bezoek","start_offset": 38,"end_offset": 46,"type": "","position": 6}, + {"token": "amsterdam","start_offset": 47,"end_offset": 56,"type": "","position": 7}, + {"token": "nummer","start_offset": 62,"end_offset": 69,"type": "","position": 9}, + {"token": "123456","start_offset": 75,"end_offset": 81,"type": "","position": 11} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/english.md b/_analyzers/language-analyzers/english.md new file mode 100644 index 0000000000..2d0b600312 --- /dev/null +++ b/_analyzers/language-analyzers/english.md @@ -0,0 +1,143 @@ +--- +layout: default +title: English +parent: Language analyzers +grand_parent: Analyzers +nav_order: 120 +--- + +# English analyzer + +The built-in `english` analyzer can be applied to a text field using the following command: + +```json +PUT /english-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "english" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer": { + "type": "english", + "stem_exclusion": ["authority", "authorization"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## English analyzer internals + +The `english` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - stemmer (possessive_english) + - lowercase + - stop (English) + - keyword + - stemmer (English) + +## Custom English analyzer + +You can create a custom English analyzer using the following command: + +```json +PUT /english-index +{ + "settings": { + "analysis": { + "filter": { + "english_stop": { + "type": "stop", + "stopwords": "_english_" + }, + "english_stemmer": { + "type": "stemmer", + "language": "english" + }, + "english_keywords": { + "type": "keyword_marker", + "keywords": [] + }, + "english_possessive_stemmer": { + "type": "stemmer", + "language": "possessive_english" + } + }, + "analyzer": { + "english_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "english_possessive_stemmer", + "lowercase", + "english_stop", + "english_keywords", + "english_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "english_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /english-index/_analyze +{ + "field": "content", + "text": "The students study in the USA and work at NASA. Their numbers are 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 4,"end_offset": 12,"type": "","position": 1}, + {"token": "studi","start_offset": 13,"end_offset": 18,"type": "","position": 2}, + {"token": "usa","start_offset": 26,"end_offset": 29,"type": "","position": 5}, + {"token": "work","start_offset": 34,"end_offset": 38,"type": "","position": 7}, + {"token": "nasa","start_offset": 42,"end_offset": 46,"type": "","position": 9}, + {"token": "number","start_offset": 54,"end_offset": 61,"type": "","position": 11}, + {"token": "123456","start_offset": 66,"end_offset": 72,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/estonian.md b/_analyzers/language-analyzers/estonian.md new file mode 100644 index 0000000000..a4cb664f18 --- /dev/null +++ b/_analyzers/language-analyzers/estonian.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Estonian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 130 +--- + +# Estonian analyzer + +The built-in `estonian` analyzer can be applied to a text field using the following command: + +```json +PUT /estonian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "estonian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_estonian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_estonian_analyzer": { + "type": "estonian", + "stem_exclusion": ["autoriteet", "kinnitus"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Estonian analyzer internals + +The `estonian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Estonian) + - keyword + - stemmer (Estonian) + +## Custom Estonian analyzer + +You can create a custom Estonian analyzer using the following command: + +```json +PUT /estonian-index +{ + "settings": { + "analysis": { + "filter": { + "estonian_stop": { + "type": "stop", + "stopwords": "_estonian_" + }, + "estonian_stemmer": { + "type": "stemmer", + "language": "estonian" + }, + "estonian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "estonian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "estonian_stop", + "estonian_keywords", + "estonian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "estonian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /estonian-index/_analyze +{ + "field": "content", + "text": "Õpilased õpivad Tallinnas ja Eesti ülikoolides. Nende numbrid on 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "õpilase","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "õpi","start_offset": 9,"end_offset": 15,"type": "","position": 1}, + {"token": "tallinna","start_offset": 16,"end_offset": 25,"type": "","position": 2}, + {"token": "eesti","start_offset": 29,"end_offset": 34,"type": "","position": 4}, + {"token": "ülikooli","start_offset": 35,"end_offset": 46,"type": "","position": 5}, + {"token": "nende","start_offset": 48,"end_offset": 53,"type": "","position": 6}, + {"token": "numbri","start_offset": 54,"end_offset": 61,"type": "","position": 7}, + {"token": "on","start_offset": 62,"end_offset": 64,"type": "","position": 8}, + {"token": "123456","start_offset": 65,"end_offset": 71,"type": "","position": 9} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/finnish.md b/_analyzers/language-analyzers/finnish.md new file mode 100644 index 0000000000..6f559650d2 --- /dev/null +++ b/_analyzers/language-analyzers/finnish.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Finnish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 140 +--- + +# Finnish analyzer + +The built-in `finnish` analyzer can be applied to a text field using the following command: + +```json +PUT /finnish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "finnish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_finnish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_finnish_analyzer": { + "type": "finnish", + "stem_exclusion": ["valta", "hyväksyntä"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Finnish analyzer internals + +The `finnish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Finnish) + - keyword + - stemmer (Finnish) + +## Custom Finnish analyzer + +You can create a custom Finnish analyzer using the following command: + +```json +PUT /finnish-index +{ + "settings": { + "analysis": { + "filter": { + "finnish_stop": { + "type": "stop", + "stopwords": "_finnish_" + }, + "finnish_stemmer": { + "type": "stemmer", + "language": "finnish" + }, + "finnish_keywords": { + "type": "keyword_marker", + "keywords": ["Helsinki", "Suomi"] + } + }, + "analyzer": { + "finnish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "finnish_stop", + "finnish_keywords", + "finnish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "finnish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /finnish-index/_analyze +{ + "field": "content", + "text": "Opiskelijat opiskelevat Helsingissä ja Suomen yliopistoissa. Heidän numeronsa ovat 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "opiskelij","start_offset": 0,"end_offset": 11,"type": "","position": 0}, + {"token": "opiskelev","start_offset": 12,"end_offset": 23,"type": "","position": 1}, + {"token": "helsing","start_offset": 24,"end_offset": 35,"type": "","position": 2}, + {"token": "suome","start_offset": 39,"end_offset": 45,"type": "","position": 4}, + {"token": "yliopisto","start_offset": 46,"end_offset": 59,"type": "","position": 5}, + {"token": "numero","start_offset": 68,"end_offset": 77,"type": "","position": 7}, + {"token": "123456","start_offset": 83,"end_offset": 89,"type": "","position": 9} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/french.md b/_analyzers/language-analyzers/french.md new file mode 100644 index 0000000000..64e7ab5415 --- /dev/null +++ b/_analyzers/language-analyzers/french.md @@ -0,0 +1,148 @@ +--- +layout: default +title: French +parent: Language analyzers +grand_parent: Analyzers +nav_order: 150 +--- + +# French analyzer + +The built-in `french` analyzer can be applied to a text field using the following command: + +```json +PUT /french-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "french" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_french_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_french_analyzer": { + "type": "french", + "stem_exclusion": ["autorité", "acceptation"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## French analyzer internals + +The `french` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (French) + - lowercase + - stop (French) + - keyword + - stemmer (French) + +## Custom French analyzer + +You can create a custom French analyzer using the following command: + +```json +PUT /french-index +{ + "settings": { + "analysis": { + "filter": { + "french_stop": { + "type": "stop", + "stopwords": "_french_" + }, + "french_elision": { + "type": "elision", + "articles_case": true, + "articles": [ + "l", "m", "t", "qu", "n", "s", + "j", "d", "c", "jusqu", "quoiqu", + "lorsqu", "puisqu" + ] + }, + "french_stemmer": { + "type": "stemmer", + "language": "light_french" + }, + "french_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "french_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "french_elision", + "lowercase", + "french_stop", + "french_keywords", + "french_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "french_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /french-index/_analyze +{ + "field": "content", + "text": "Les étudiants étudient à Paris et dans les universités françaises. Leurs numéros sont 123456." 
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "etudiant","start_offset": 4,"end_offset": 13,"type": "","position": 1},
+    {"token": "etudient","start_offset": 14,"end_offset": 22,"type": "","position": 2},
+    {"token": "pari","start_offset": 25,"end_offset": 30,"type": "","position": 4},
+    {"token": "universit","start_offset": 43,"end_offset": 54,"type": "","position": 8},
+    {"token": "francais","start_offset": 55,"end_offset": 65,"type": "","position": 9},
+    {"token": "numero","start_offset": 73,"end_offset": 80,"type": "","position": 11},
+    {"token": "123456","start_offset": 86,"end_offset": 92,"type": "","position": 13}
+  ]
+}
```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/galician.md b/_analyzers/language-analyzers/galician.md
new file mode 100644
index 0000000000..00338b23a7
--- /dev/null
+++ b/_analyzers/language-analyzers/galician.md
@@ -0,0 +1,138 @@
+---
+layout: default
+title: Galician
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 160
+---
+
+# Galician analyzer
+
+The built-in `galician` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /galician-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "galician"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_galician_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_galician_analyzer": {
+          "type": "galician",
+          "stem_exclusion": ["autoridade", "aceptación"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Galician analyzer internals
+
+The `galician` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Galician)
+  - keyword
+  - stemmer (Galician)
+
+## Custom Galician analyzer
+
+You can create a custom Galician analyzer using the following command:
+
+```json
+PUT /galician-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "galician_stop": {
+          "type": "stop",
+          "stopwords": "_galician_"
+        },
+        "galician_stemmer": {
+          "type": "stemmer",
+          "language": "galician"
+        },
+        "galician_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "galician_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "galician_stop",
+            "galician_keywords",
+            "galician_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "galician_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /galician-index/_analyze
+{
+  "field": "content",
+  "text": "Os estudantes estudan en Santiago e nas universidades galegas. Os seus números son 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estud","start_offset": 3,"end_offset": 13,"type": "","position": 1}, + {"token": "estud","start_offset": 14,"end_offset": 21,"type": "","position": 2}, + {"token": "santiag","start_offset": 25,"end_offset": 33,"type": "","position": 4}, + {"token": "univers","start_offset": 40,"end_offset": 53,"type": "","position": 7}, + {"token": "galeg","start_offset": 54,"end_offset": 61,"type": "","position": 8}, + {"token": "numer","start_offset": 71,"end_offset": 78,"type": "","position": 11}, + {"token": "son","start_offset": 79,"end_offset": 82,"type": "","position": 12}, + {"token": "123456","start_offset": 83,"end_offset": 89,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/german.md b/_analyzers/language-analyzers/german.md new file mode 100644 index 0000000000..4071ef5378 --- /dev/null +++ b/_analyzers/language-analyzers/german.md @@ -0,0 +1,174 @@ +--- +layout: default +title: German +parent: Language analyzers +grand_parent: Analyzers +nav_order: 170 +--- + +# German analyzer + +The built-in `german` analyzer can be applied to a text field using the following command: + +```json +PUT /german-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "german" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_german_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_german_analyzer": { + "type": "german", + "stem_exclusion": ["Autorität", "Genehmigung"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## German analyzer internals + +The `german` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (German) + - keyword + - normalization (German) + - stemmer (German) + +## Custom German analyzer + +You can create a custom German analyzer using the following command: + +```json +PUT /german-index +{ + "settings": { + "analysis": { + "filter": { + "german_stop": { + "type": "stop", + "stopwords": "_german_" + }, + "german_stemmer": { + "type": "stemmer", + "language": "light_german" + }, + "german_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "german_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "german_stop", + "german_keywords", + "german_normalization", + "german_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "german_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /german-index/_analyze +{ + "field": "content", + "text": "Die Studenten studieren an den deutschen Universitäten. Ihre Nummern sind 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 4, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "studi", + "start_offset": 14, + "end_offset": 23, + "type": "", + "position": 2 + }, + { + "token": "deutsch", + "start_offset": 31, + "end_offset": 40, + "type": "", + "position": 5 + }, + { + "token": "universitat", + "start_offset": 41, + "end_offset": 54, + "type": "", + "position": 6 + }, + { + "token": "numm", + "start_offset": 61, + "end_offset": 68, + "type": "", + "position": 8 + }, + { + "token": "123456", + "start_offset": 74, + "end_offset": 80, + "type": "", + "position": 10 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/greek.md b/_analyzers/language-analyzers/greek.md new file mode 100644 index 0000000000..2446b1e2d6 --- /dev/null +++ b/_analyzers/language-analyzers/greek.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Greek +parent: Language analyzers +grand_parent: Analyzers +nav_order: 180 +--- + +# Greek analyzer + +The built-in `greek` analyzer can be applied to a text field using the following command: + +```json +PUT /greek-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "greek" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_greek_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_greek_analyzer": { + "type": "greek", + "stem_exclusion": ["αρχή", "έγκριση"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Greek analyzer internals + +The `greek` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Greek) + - keyword + - stemmer (Greek) + +## Custom Greek analyzer + +You can create a custom Greek analyzer using the following command: + +```json +PUT /greek-index +{ + "settings": { + "analysis": { + "filter": { + "greek_stop": { + "type": "stop", + "stopwords": "_greek_" + }, + "greek_stemmer": { + "type": "stemmer", + "language": "greek" + }, + "greek_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "greek_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "greek_stop", + "greek_keywords", + "greek_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "greek_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /greek-index/_analyze +{ + "field": "content", + "text": "Οι φοιτητές σπουδάζουν στα ελληνικά πανεπιστήμια. Οι αριθμοί τους είναι 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "φοιτητές","start_offset": 3,"end_offset": 11,"type": "","position": 1}, + {"token": "σπουδάζ","start_offset": 12,"end_offset": 22,"type": "","position": 2}, + {"token": "στα","start_offset": 23,"end_offset": 26,"type": "","position": 3}, + {"token": "ελληνικά","start_offset": 27,"end_offset": 35,"type": "","position": 4}, + {"token": "πανεπιστήμ","start_offset": 36,"end_offset": 48,"type": "","position": 5}, + {"token": "αριθμοί","start_offset": 53,"end_offset": 60,"type": "","position": 7}, + {"token": "τους","start_offset": 61,"end_offset": 65,"type": "","position": 8}, + {"token": "είνα","start_offset": 66,"end_offset": 71,"type": "","position": 9}, + {"token": "123456","start_offset": 72,"end_offset": 78,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/hindi.md b/_analyzers/language-analyzers/hindi.md new file mode 100644 index 0000000000..93f2eea319 --- /dev/null +++ b/_analyzers/language-analyzers/hindi.md @@ -0,0 +1,178 @@ +--- +layout: default +title: Hindi +parent: Language analyzers +grand_parent: Analyzers +nav_order: 190 +--- + +# Hindi analyzer + +The built-in `hindi` analyzer can be applied to a text field using the following command: + +```json +PUT /hindi-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hindi" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_hindi_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_hindi_analyzer": { + "type": "hindi", + "stem_exclusion": ["अधिकार", "अनुमोदन"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Hindi analyzer internals + +The `hindi` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - keyword + - normalization (indic) + - normalization (Hindi) + - stop (Hindi) + - stemmer (Hindi) + +## Custom Hindi analyzer + +You can create a custom Hindi analyzer using the following command: + +```json +PUT /hindi-index +{ + "settings": { + "analysis": { + "filter": { + "hindi_stop": { + "type": "stop", + "stopwords": "_hindi_" + }, + "hindi_stemmer": { + "type": "stemmer", + "language": "hindi" + }, + "hindi_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hindi_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "hindi_keywords", + "indic_normalization", + "hindi_normalization", + "hindi_stop", + "hindi_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hindi_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /hindi-index/_analyze +{ + "field": "content", + "text": "छात्र भारतीय विश्वविद्यालयों में पढ़ते हैं। उनके नंबर १२३४५६ हैं।" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "छातर", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "भारतिय", + "start_offset": 6, + "end_offset": 12, + "type": "", + "position": 1 + }, + { + "token": "विशवविदयालय", + 
"start_offset": 13, + "end_offset": 28, + "type": "", + "position": 2 + }, + { + "token": "पढ", + "start_offset": 33, + "end_offset": 38, + "type": "", + "position": 4 + }, + { + "token": "नंबर", + "start_offset": 49, + "end_offset": 53, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 54, + "end_offset": 60, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/hungarian.md b/_analyzers/language-analyzers/hungarian.md new file mode 100644 index 0000000000..d115c5d29c --- /dev/null +++ b/_analyzers/language-analyzers/hungarian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Hungarian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 200 +--- + +# Hungarian analyzer + +The built-in `hungarian` analyzer can be applied to a text field using the following command: + +```json +PUT /hungarian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_hungarian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_hungarian_analyzer": { + "type": "hungarian", + "stem_exclusion": ["hatalom", "jóváhagyás"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Hungarian analyzer internals + +The `hungarian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Hungarian) + - keyword + - stemmer (Hungarian) + +## Custom Hungarian analyzer + +You can create a custom Hungarian analyzer using the following command: + +```json +PUT /hungarian-index +{ + "settings": { + "analysis": { + "filter": { + "hungarian_stop": { + "type": "stop", + "stopwords": "_hungarian_" + }, + "hungarian_stemmer": { + "type": "stemmer", + "language": "hungarian" + }, + "hungarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hungarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "hungarian_stop", + "hungarian_keywords", + "hungarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /hungarian-index/_analyze +{ + "field": "content", + "text": "A diákok a magyar egyetemeken tanulnak. A számaik 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "diák", + "start_offset": 2, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "magyar", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "egyetem", + "start_offset": 18, + "end_offset": 29, + "type": "", + "position": 4 + }, + { + "token": "tanul", + "start_offset": 30, + "end_offset": 38, + "type": "", + "position": 5 + }, + { + "token": "szám", + "start_offset": 42, + "end_offset": 49, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md new file mode 100644 index 0000000000..89a4a42254 --- /dev/null +++ b/_analyzers/language-analyzers/index.md @@ -0,0 +1,135 @@ +--- +layout: default +title: Language analyzers +nav_order: 100 +parent: Analyzers +has_children: true +has_toc: true +redirect_from: + - /query-dsl/analyzers/language-analyzers/ + - /analyzers/language-analyzers/ +--- + +# Language analyzers + +OpenSearch supports the following language analyzers: +`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `thai`, and `turkish`. + +To use an analyzer when you map an index, specify the value in your query. For example, to map your index with the French language analyzer, specify the `french` value in the analyzer field: + +```json + "analyzer": "french" +``` + +#### Example request + +The following query specifies an index `my-index` with the `content` field configured as multi-field, and a sub-field named `french` is configured with the `french` language analyzer: + +```json +PUT my-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "fields": { + "french": { + "type": "text", + "analyzer": "french" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The default `french` analyzer can also be configured for the entire index using the following query: + +```json +PUT my-index +{ + "settings": { + "analysis": { + "analyzer": { + "default": { + "type": "french" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text" + }, + "title": { + "type": "text" + }, + "description": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can apply stem exclusion to any language analyzer by providing a list of lowercase words that should be excluded from stemming. Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring that they are not stemmed. + +## Stem exclusion example + +Use the following request to configure `stem_exclusion`: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer":{ + "type":"english", + "stem_exclusion": ["manager", "management"] + } + } + } + } +} +``` +{% include copy-curl.html %} + + +## Stem exclusion with custom analyzers + +All language analyzers consist of tokenizers and token filters specific to a particular language. 
If you want to implement a custom version of the language analyzer with stem exclusion, you need to configure the `keyword_marker` token filter and list the words excluded from stemming in the `keywords` parameter: + +```json +PUT index_with_keyword_marker_analyzer +{ + "settings": { + "analysis": { + "filter": { + "protected_keywords_filter": { + "type": "keyword_marker", + "keywords": ["Apple", "OpenSearch"] + } + }, + "analyzer": { + "custom_english_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "protected_keywords_filter", + "english_stemmer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_analyzers/language-analyzers/indonesian.md b/_analyzers/language-analyzers/indonesian.md new file mode 100644 index 0000000000..5c3d430b3a --- /dev/null +++ b/_analyzers/language-analyzers/indonesian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Indonesian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 210 +--- + +# Indonesian analyzer + +The built-in `indonesian` analyzer can be applied to a text field using the following command: + +```json +PUT /indonesian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "indonesian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_indonesian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_indonesian_analyzer": { + "type": "indonesian", + "stem_exclusion": ["otoritas", "persetujuan"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Indonesian analyzer internals + +The `indonesian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Indonesian) + - keyword + - stemmer (Indonesian) + +## Custom Indonesian analyzer + +You can create a custom Indonesian analyzer using the following command: + +```json +PUT /hungarian-index +{ + "settings": { + "analysis": { + "filter": { + "hungarian_stop": { + "type": "stop", + "stopwords": "_hungarian_" + }, + "hungarian_stemmer": { + "type": "stemmer", + "language": "hungarian" + }, + "hungarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hungarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "hungarian_stop", + "hungarian_keywords", + "hungarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /indonesian-index/_analyze +{ + "field": "content", + "text": "Mahasiswa belajar di universitas Indonesia. Nomor mereka adalah 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "mahasiswa", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "ajar", + "start_offset": 10, + "end_offset": 17, + "type": "", + "position": 1 + }, + { + "token": "universitas", + "start_offset": 21, + "end_offset": 32, + "type": "", + "position": 3 + }, + { + "token": "indonesia", + "start_offset": 33, + "end_offset": 42, + "type": "", + "position": 4 + }, + { + "token": "nomor", + "start_offset": 44, + "end_offset": 49, + "type": "", + "position": 5 + }, + { + "token": "123456", + "start_offset": 64, + "end_offset": 70, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/irish.md b/_analyzers/language-analyzers/irish.md new file mode 100644 index 0000000000..3e1535d134 --- /dev/null +++ b/_analyzers/language-analyzers/irish.md @@ -0,0 +1,157 @@ +--- +layout: default +title: Irish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 210 +--- + +# Irish analyzer + +The built-in `irish` analyzer can be applied to a text field using the following command: + +```json +PUT /irish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "irish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_irish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_irish_analyzer": { + "type": "irish", + "stem_exclusion": ["údarás", "faomhadh"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Irish analyzer internals + +The `irish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - hyphenation (Irish) + - elision (Irish) + - lowercase (Irish) + - stop (Irish) + - keyword + - stemmer (Irish) + +## Custom Irish analyzer + +You can create a custom Irish analyzer using the following command: + +```json +PUT /irish-index +{ + "settings": { + "analysis": { + "filter": { + "irish_stop": { + "type": "stop", + "stopwords": "_irish_" + }, + "irish_elision": { + "type": "elision", + "articles": [ "d", "m", "b" ], + "articles_case": true + }, + "irish_hyphenation": { + "type": "stop", + "stopwords": [ "h", "n", "t" ], + "ignore_case": true + }, + "irish_lowercase": { + "type": "lowercase", + "language": "irish" + }, + "irish_stemmer": { + "type": "stemmer", + "language": "irish" + }, + "irish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "irish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "irish_hyphenation", + "irish_elision", + "irish_lowercase", + "irish_stop", + "irish_keywords", + "irish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "irish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /irish-index/_analyze +{ + "field": "content", + "text": "Tá mic léinn ag staidéar in ollscoileanna na hÉireann. Is iad a gcuid uimhreacha ná 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "tá","start_offset": 0,"end_offset": 2,"type": "","position": 0}, + {"token": "mic","start_offset": 3,"end_offset": 6,"type": "","position": 1}, + {"token": "léinn","start_offset": 7,"end_offset": 12,"type": "","position": 2}, + {"token": "staidéar","start_offset": 16,"end_offset": 24,"type": "","position": 4}, + {"token": "ollscoileanna","start_offset": 28,"end_offset": 41,"type": "","position": 6}, + {"token": "héireann","start_offset": 45,"end_offset": 53,"type": "","position": 8}, + {"token": "cuid","start_offset": 64,"end_offset": 69,"type": "","position": 12}, + {"token": "uimhreacha","start_offset": 70,"end_offset": 80,"type": "","position": 13}, + {"token": "123456","start_offset": 84,"end_offset": 90,"type": "","position": 15} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/italian.md b/_analyzers/language-analyzers/italian.md new file mode 100644 index 0000000000..190056d63c --- /dev/null +++ b/_analyzers/language-analyzers/italian.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Italian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 220 +--- + +# Italian analyzer + +The built-in `italian` analyzer can be applied to a text field using the following command: + +```json +PUT /italian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_italian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_italian_analyzer": { + "type": "italian", + "stem_exclusion": ["autorità", "approvazione"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Italian analyzer internals + +The `italian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (Italian) + - lowercase + - stop (Italian) + - keyword + - stemmer (Italian) + +## Custom Italian analyzer + +You can create a custom Italian analyzer using the following command: + +```json +PUT /italian-index +{ + "settings": { + "analysis": { + "filter": { + "italian_stop": { + "type": "stop", + "stopwords": "_italian_" + }, + "italian_elision": { + "type": "elision", + "articles": [ + "c", "l", "all", "dall", "dell", + "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", + "sugl", "un", "m", "t", "s", "v", "d" + ], + "articles_case": true + }, + "italian_stemmer": { + "type": "stemmer", + "language": "light_italian" + }, + "italian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "italian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "italian_elision", + "lowercase", + "italian_stop", + "italian_keywords", + "italian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /italian-index/_analyze +{ + "field": "content", + "text": "Gli studenti studiano nelle università italiane. I loro numeri sono 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 4,"end_offset": 12,"type": "","position": 1}, + {"token": "studian","start_offset": 13,"end_offset": 21,"type": "","position": 2}, + {"token": "universit","start_offset": 28,"end_offset": 38,"type": "","position": 4}, + {"token": "italian","start_offset": 39,"end_offset": 47,"type": "","position": 5}, + {"token": "numer","start_offset": 56,"end_offset": 62,"type": "","position": 8}, + {"token": "123456","start_offset": 68,"end_offset": 74,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/latvian.md b/_analyzers/language-analyzers/latvian.md new file mode 100644 index 0000000000..2301759763 --- /dev/null +++ b/_analyzers/language-analyzers/latvian.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Latvian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 230 +--- + +# Latvian analyzer + +The built-in `latvian` analyzer can be applied to a text field using the following command: + +```json +PUT /latvian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "latvian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_latvian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_latvian_analyzer": { + "type": "latvian", + "stem_exclusion": ["autoritāte", "apstiprinājums"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Latvian analyzer internals + +The `latvian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Latvian) + - keyword + - stemmer (Latvian) + +## Custom Latvian analyzer + +You can create a custom Latvian analyzer using the following command: + +```json +PUT /italian-index +{ + "settings": { + "analysis": { + "filter": { + "italian_stop": { + "type": "stop", + "stopwords": "_italian_" + }, + "italian_elision": { + "type": "elision", + "articles": [ + "c", "l", "all", "dall", "dell", + "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", + "sugl", "un", "m", "t", "s", "v", "d" + ], + "articles_case": true + }, + "italian_stemmer": { + "type": "stemmer", + "language": "light_italian" + }, + "italian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "italian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "italian_elision", + "lowercase", + "italian_stop", + "italian_keywords", + "italian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /latvian-index/_analyze +{ + "field": "content", + "text": "Studenti mācās Latvijas universitātēs. Viņu numuri ir 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "māc","start_offset": 9,"end_offset": 14,"type": "","position": 1}, + {"token": "latvij","start_offset": 15,"end_offset": 23,"type": "","position": 2}, + {"token": "universitāt","start_offset": 24,"end_offset": 37,"type": "","position": 3}, + {"token": "vin","start_offset": 39,"end_offset": 43,"type": "","position": 4}, + {"token": "numur","start_offset": 44,"end_offset": 50,"type": "","position": 5}, + {"token": "123456","start_offset": 54,"end_offset": 60,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/lithuanian.md b/_analyzers/language-analyzers/lithuanian.md new file mode 100644 index 0000000000..ca5966c54e --- /dev/null +++ b/_analyzers/language-analyzers/lithuanian.md @@ -0,0 +1,136 @@ +--- +layout: default +title: Lithuanian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 230 +--- + +# Lithuanian analyzer + +The built-in `lithuanian` analyzer can be applied to a text field using the following command: + +```json +PUT /lithuanian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "lithuanian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_lithuanian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_lithuanian_analyzer": { + "type": "lithuanian", + "stem_exclusion": ["autoritetas", "patvirtinimas"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Lithuanian analyzer internals + +The `lithuanian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Lithuanian) + - keyword + - stemmer (Lithuanian) + +## Custom Lithuanian analyzer + +You can create a custom Lithuanian analyzer using the following command: + +```json +PUT /lithuanian-index +{ + "settings": { + "analysis": { + "filter": { + "lithuanian_stop": { + "type": "stop", + "stopwords": "_lithuanian_" + }, + "lithuanian_stemmer": { + "type": "stemmer", + "language": "lithuanian" + }, + "lithuanian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "lithuanian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "lithuanian_stop", + "lithuanian_keywords", + "lithuanian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "lithuanian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /lithuanian-index/_analyze +{ + "field": "content", + "text": "Studentai mokosi Lietuvos universitetuose. Jų numeriai yra 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 9,"type": "","position": 0}, + {"token": "mok","start_offset": 10,"end_offset": 16,"type": "","position": 1}, + {"token": "lietuv","start_offset": 17,"end_offset": 25,"type": "","position": 2}, + {"token": "universitet","start_offset": 26,"end_offset": 41,"type": "","position": 3}, + {"token": "num","start_offset": 46,"end_offset": 54,"type": "","position": 5}, + {"token": "123456","start_offset": 59,"end_offset": 65,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/norwegian.md b/_analyzers/language-analyzers/norwegian.md new file mode 100644 index 0000000000..cfb04eebf3 --- /dev/null +++ b/_analyzers/language-analyzers/norwegian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Norwegian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 240 +--- + +# Norwegian analyzer + +The built-in `norwegian` analyzer can be applied to a text field using the following command: + +```json +PUT /norwegian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "norwegian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_norwegian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_norwegian_analyzer": { + "type": "norwegian", + "stem_exclusion": ["autoritet", "godkjenning"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Norwegian analyzer internals + +The `norwegian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Norwegian) + - keyword + - stemmer (Norwegian) + +## Custom Norwegian analyzer + +You can create a custom Norwegian analyzer using the following command: + +```json +PUT /norwegian-index +{ + "settings": { + "analysis": { + "filter": { + "norwegian_stop": { + "type": "stop", + "stopwords": "_norwegian_" + }, + "norwegian_stemmer": { + "type": "stemmer", + "language": "norwegian" + }, + "norwegian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "norwegian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "norwegian_stop", + "norwegian_keywords", + "norwegian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "norwegian_analyzer" + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /norwegian-index/_analyze +{ + "field": "content", + "text": "Studentene studerer ved norske universiteter. Deres nummer er 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "studer","start_offset": 11,"end_offset": 19,"type": "","position": 1}, + {"token": "norsk","start_offset": 24,"end_offset": 30,"type": "","position": 3}, + {"token": "universitet","start_offset": 31,"end_offset": 44,"type": "","position": 4}, + {"token": "numm","start_offset": 52,"end_offset": 58,"type": "","position": 6}, + {"token": "123456","start_offset": 62,"end_offset": 68,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/persian.md b/_analyzers/language-analyzers/persian.md new file mode 100644 index 0000000000..40b38656fd --- /dev/null +++ b/_analyzers/language-analyzers/persian.md @@ -0,0 +1,144 @@ +--- +layout: default +title: Persian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 250 +--- + +# Persian analyzer + +The built-in `persian` analyzer can be applied to a text field using the following command: + +```json +PUT /persian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "persian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_persian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_persian_analyzer": { + "type": "persian", + "stem_exclusion": ["حکومت", "تأیید"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Persian analyzer internals + +The `persian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Char filter: `mapping` + +- Token filters: + - lowercase + - decimal_digit + - normalization (Arabic) + - normalization (Persian) + - keyword + - stemmer (Norwegian) + +## Custom Persian analyzer + +You can create a custom Persian analyzer using the following command: + +```json +PUT /persian-index +{ + "settings": { + "analysis": { + "filter": { + "persian_stop": { + "type": "stop", + "stopwords": "_persian_" + }, + "persian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "char_filter": { + "null_width_replace_with_space": { + "type": "mapping", + "mappings": [ "\\u200C=>\\u0020"] + } + }, + "analyzer": { + "persian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "char_filter": [ "null_width_replace_with_space" ], + "filter": [ + "lowercase", + "decimal_digit", + "arabic_normalization", + "persian_normalization", + "persian_stop" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "persian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /persian-index/_analyze +{ + "field": "content", + "text": "دانشجویان در دانشگاه‌های ایرانی تحصیل می‌کنند. شماره‌های آن‌ها ۱۲۳۴۵۶ است." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "دانشجويان","start_offset": 0,"end_offset": 9,"type": "","position": 0}, + {"token": "دانشگاه","start_offset": 13,"end_offset": 20,"type": "","position": 2}, + {"token": "ايراني","start_offset": 25,"end_offset": 31,"type": "","position": 4}, + {"token": "تحصيل","start_offset": 32,"end_offset": 37,"type": "","position": 5}, + {"token": "شماره","start_offset": 47,"end_offset": 52,"type": "","position": 8}, + {"token": "123456","start_offset": 63,"end_offset": 69,"type": "","position": 12} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/portuguese.md b/_analyzers/language-analyzers/portuguese.md new file mode 100644 index 0000000000..166ffa0010 --- /dev/null +++ b/_analyzers/language-analyzers/portuguese.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Portuguese +parent: Language analyzers +grand_parent: Analyzers +nav_order: 260 +--- + +# Portuguese analyzer + +The built-in `portuguese` analyzer can be applied to a text field using the following command: + +```json +PUT /portuguese-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "portuguese" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_portuguese_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_portuguese_analyzer": { + "type": "portuguese", + "stem_exclusion": ["autoridade", "aprovação"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Portuguese analyzer internals + +The `portuguese` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Portuguese) + - keyword + - stemmer (Portuguese) + +## Custom Portuguese analyzer + +You can create a custom Portuguese analyzer using the following command: + +```json +PUT /portuguese-index +{ + "settings": { + "analysis": { + "filter": { + "portuguese_stop": { + "type": "stop", + "stopwords": "_portuguese_" + }, + "portuguese_stemmer": { + "type": "stemmer", + "language": "light_portuguese" + }, + "portuguese_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "portuguese_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "portuguese_stop", + "portuguese_keywords", + "portuguese_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "portuguese_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /portuguese-index/_analyze +{ + "field": "content", + "text": "Os estudantes estudam nas universidades brasileiras. Seus números são 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "estudant", + "start_offset": 3, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "estudam", + "start_offset": 14, + "end_offset": 21, + "type": "", + "position": 2 + }, + { + "token": "universidad", + "start_offset": 26, + "end_offset": 39, + "type": "", + "position": 4 + }, + { + "token": "brasileir", + "start_offset": 40, + "end_offset": 51, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 58, + "end_offset": 65, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 70, + "end_offset": 76, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/romanian.md b/_analyzers/language-analyzers/romanian.md new file mode 100644 index 0000000000..cad0953385 --- /dev/null +++ b/_analyzers/language-analyzers/romanian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Romanian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 270 +--- + +# Romanian analyzer + +The built-in `romanian` analyzer can be applied to a text field using the following command: + +```json +PUT /romanian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "romanian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_romanian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_romanian_analyzer": { + "type": "romanian", + "stem_exclusion": ["autoritate", "aprobat"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Romanian analyzer internals + +The `romanian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Romanian) + - keyword + - stemmer (Romanian) + +## Custom Romanian analyzer + +You can create a custom Romanian analyzer using the following command: + +```json +PUT /romanian-index +{ + "settings": { + "analysis": { + "filter": { + "romanian_stop": { + "type": "stop", + "stopwords": "_romanian_" + }, + "romanian_stemmer": { + "type": "stemmer", + "language": "romanian" + }, + "romanian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "romanian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "romanian_stop", + "romanian_keywords", + "romanian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "romanian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /romanian-index/_analyze +{ + "field": "content", + "text": "Studenții învață la universitățile din România. Numerele lor sunt 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "studenț", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "învaț", + "start_offset": 10, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "universităț", + "start_offset": 20, + "end_offset": 34, + "type": "", + "position": 3 + }, + { + "token": "român", + "start_offset": 39, + "end_offset": 46, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 48, + "end_offset": 56, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 66, + "end_offset": 72, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/russian.md b/_analyzers/language-analyzers/russian.md new file mode 100644 index 0000000000..bd57ba0b27 --- /dev/null +++ b/_analyzers/language-analyzers/russian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Russian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 280 +--- + +# Russian analyzer + +The built-in `russian` analyzer can be applied to a text field using the following command: + +```json +PUT /russian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "russian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_russian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_russian_analyzer": { + "type": "russian", + "stem_exclusion": ["авторитет", "одобрение"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Russian analyzer internals + +The `russian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Russian) + - keyword + - stemmer (Russian) + +## Custom Russian analyzer + +You can create a custom Russian analyzer using the following command: + +```json +PUT /russian-index +{ + "settings": { + "analysis": { + "filter": { + "russian_stop": { + "type": "stop", + "stopwords": "_russian_" + }, + "russian_stemmer": { + "type": "stemmer", + "language": "russian" + }, + "russian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "russian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "russian_stop", + "russian_keywords", + "russian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "russian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /russian-index/_analyze +{ + "field": "content", + "text": "Студенты учатся в университетах России. Их номера 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "студент", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "учат", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "университет", + "start_offset": 18, + "end_offset": 31, + "type": "", + "position": 3 + }, + { + "token": "росс", + "start_offset": 32, + "end_offset": 38, + "type": "", + "position": 4 + }, + { + "token": "номер", + "start_offset": 43, + "end_offset": 49, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 7 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/sorani.md b/_analyzers/language-analyzers/sorani.md new file mode 100644 index 0000000000..f71d43c481 --- /dev/null +++ b/_analyzers/language-analyzers/sorani.md @@ -0,0 +1,168 @@ +--- +layout: default +title: Sorani +parent: Language analyzers +grand_parent: Analyzers +nav_order: 290 +--- + +# Sorani analyzer + +The built-in `sorani` analyzer can be applied to a text field using the following command: + +```json +PUT /sorani-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "sorani" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_sorani_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_sorani_analyzer": { + "type": "sorani", + "stem_exclusion": ["مؤسسه", "اجازه"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Sorani analyzer internals + +The `sorani` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - normalization (Sorani) + - lowercase + - decimal_digit + - stop (Sorani) + - keyword + - stemmer (Sorani) + +## Custom Sorani analyzer + +You can create a custom Sorani analyzer using the following command: + +```json +PUT /sorani-index +{ + "settings": { + "analysis": { + "filter": { + "sorani_stop": { + "type": "stop", + "stopwords": "_sorani_" + }, + "sorani_stemmer": { + "type": "stemmer", + "language": "sorani" + }, + "sorani_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "sorani_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "sorani_stop", + "sorani_keywords", + "sorani_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "sorani_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /sorani-index/_analyze +{ + "field": "content", + "text": "خوێندنی فەرمی لە هەولێرەوە. ژمارەکان ١٢٣٤٥٦." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "خوێندن", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "فەرم", + "start_offset": 8, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "هەولێر", + "start_offset": 17, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "ژمار", + "start_offset": 28, + "end_offset": 36, + "type": "", + "position": 4 + }, + { + "token": "123456", + "start_offset": 37, + "end_offset": 43, + "type": "", + "position": 5 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/spanish.md b/_analyzers/language-analyzers/spanish.md new file mode 100644 index 0000000000..8a0d8fad3c --- /dev/null +++ b/_analyzers/language-analyzers/spanish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Spanish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 300 +--- + +# Spanish analyzer + +The built-in `spanish` analyzer can be applied to a text field using the following command: + +```json +PUT /spanish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "spanish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_spanish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_spanish_analyzer": { + "type": "spanish", + "stem_exclusion": ["autoridad", "aprobación"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Spanish analyzer internals + +The `spanish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Spanish) + - keyword + - stemmer (Spanish) + +## Custom Spanish analyzer + +You can create a custom Spanish analyzer using the following command: + +```json +PUT /spanish-index +{ + "settings": { + "analysis": { + "filter": { + "spanish_stop": { + "type": "stop", + "stopwords": "_spanish_" + }, + "spanish_stemmer": { + "type": "stemmer", + "language": "light_spanish" + }, + "spanish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "spanish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "spanish_stop", + "spanish_keywords", + "spanish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "spanish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /spanish-index/_analyze +{ + "field": "content", + "text": "Los estudiantes estudian en universidades españolas. Sus números son 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "estudiant", + "start_offset": 4, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "estudian", + "start_offset": 16, + "end_offset": 24, + "type": "", + "position": 2 + }, + { + "token": "universidad", + "start_offset": 28, + "end_offset": 41, + "type": "", + "position": 4 + }, + { + "token": "español", + "start_offset": 42, + "end_offset": 51, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 57, + "end_offset": 64, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 69, + "end_offset": 75, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/swedish.md b/_analyzers/language-analyzers/swedish.md new file mode 100644 index 0000000000..9da595f12e --- /dev/null +++ b/_analyzers/language-analyzers/swedish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Swedish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 310 +--- + +# Swedish analyzer + +The built-in `swedish` analyzer can be applied to a text field using the following command: + +```json +PUT /swedish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "swedish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_swedish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_swedish_analyzer": { + "type": "swedish", + "stem_exclusion": ["myndighet", "godkännande"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Swedish analyzer internals + +The `swedish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Swedish) + - keyword + - stemmer (Swedish) + +## Custom Swedish analyzer + +You can create a custom Swedish analyzer using the following command: + +```json +PUT /swedish-index +{ + "settings": { + "analysis": { + "filter": { + "swedish_stop": { + "type": "stop", + "stopwords": "_swedish_" + }, + "swedish_stemmer": { + "type": "stemmer", + "language": "swedish" + }, + "swedish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "swedish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "swedish_stop", + "swedish_keywords", + "swedish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "swedish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /swedish-index/_analyze +{ + "field": "content", + "text": "Studenter studerar vid svenska universitet. Deras nummer är 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "studer", + "start_offset": 10, + "end_offset": 18, + "type": "", + "position": 1 + }, + { + "token": "svensk", + "start_offset": 23, + "end_offset": 30, + "type": "", + "position": 3 + }, + { + "token": "universitet", + "start_offset": 31, + "end_offset": 42, + "type": "", + "position": 4 + }, + { + "token": "numm", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 60, + "end_offset": 66, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/thai.md b/_analyzers/language-analyzers/thai.md new file mode 100644 index 0000000000..e4daa1f0be --- /dev/null +++ b/_analyzers/language-analyzers/thai.md @@ -0,0 +1,132 @@ +--- +layout: default +title: Thai +parent: Language analyzers +grand_parent: Analyzers +nav_order: 320 +--- + +# Thai analyzer + +The built-in `thai` analyzer can be applied to a text field using the following command: + +```json +PUT /thai-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "thai" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_thai_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_thai_analyzer": { + "type": "thai", + "stem_exclusion": ["อำนาจ", "การอนุมัติ"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Thai analyzer internals + +The `thai` analyzer is built using the following components: + +- Tokenizer: `thai` + +- Token filters: + - lowercase + - decimal_digit + - stop (Thai) + - keyword + +## Custom Thai analyzer + +You can create a custom Thai analyzer using the following command: + +```json +PUT /thai-index +{ + "settings": { + "analysis": { + "filter": { + "thai_stop": { + "type": "stop", + "stopwords": "_thai_" + }, + "thai_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "thai_analyzer": { + "tokenizer": "thai", + "filter": [ + "lowercase", + "decimal_digit", + "thai_stop", + "thai_keywords" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "thai_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /thai-index/_analyze +{ + "field": "content", + "text": "นักเรียนกำลังศึกษาอยู่ที่มหาวิทยาลัยไทย หมายเลข 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "นักเรียน","start_offset": 0,"end_offset": 8,"type": "word","position": 0}, + {"token": "กำลัง","start_offset": 8,"end_offset": 13,"type": "word","position": 1}, + {"token": "ศึกษา","start_offset": 13,"end_offset": 18,"type": "word","position": 2}, + {"token": "มหาวิทยาลัย","start_offset": 25,"end_offset": 36,"type": "word","position": 5}, + {"token": "ไทย","start_offset": 36,"end_offset": 39,"type": "word","position": 6}, + {"token": "หมายเลข","start_offset": 40,"end_offset": 47,"type": "word","position": 7}, + {"token": "123456","start_offset": 48,"end_offset": 54,"type": "word","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/turkish.md b/_analyzers/language-analyzers/turkish.md new file mode 100644 index 0000000000..fb36c5413c --- /dev/null +++ b/_analyzers/language-analyzers/turkish.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Turkish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 330 +--- + +# Turkish analyzer + +The built-in `turkish` analyzer can be applied to a text field using the following command: + +```json +PUT /turkish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "turkish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_turkish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_turkish_analyzer": { + "type": "turkish", + "stem_exclusion": ["otorite", "onay"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Turkish analyzer internals + +The `turkish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - apostrophe + - lowercase (Turkish) + - stop (Turkish) + - keyword + - stemmer (Turkish) + +## Custom Turkish analyzer + +You can create a custom Turkish analyzer using the following command: + +```json +PUT /turkish-index +{ + "settings": { + "analysis": { + "filter": { + "turkish_stop": { + "type": "stop", + "stopwords": "_turkish_" + }, + "turkish_stemmer": { + "type": "stemmer", + "language": "turkish" + }, + "turkish_lowercase": { + "type": "lowercase", + "language": "turkish" + }, + "turkish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "turkish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "apostrophe", + "turkish_lowercase", + "turkish_stop", + "turkish_keywords", + "turkish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "turkish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /turkish-index/_analyze +{ + "field": "content", + "text": "Öğrenciler Türk üniversitelerinde öğrenim görüyor. Numara 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "öğrenci","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "türk","start_offset": 11,"end_offset": 15,"type": "","position": 1}, + {"token": "üniversite","start_offset": 16,"end_offset": 33,"type": "","position": 2}, + {"token": "öğre","start_offset": 34,"end_offset": 41,"type": "","position": 3}, + {"token": "görüyor","start_offset": 42,"end_offset": 49,"type": "","position": 4}, + {"token": "numar","start_offset": 51,"end_offset": 57,"type": "","position": 5}, + {"token": "123456","start_offset": 58,"end_offset": 64,"type": "","position": 6} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/normalizers.md b/_analyzers/normalizers.md index b89659f814..52841d2571 100644 --- a/_analyzers/normalizers.md +++ b/_analyzers/normalizers.md @@ -1,7 +1,7 @@ --- layout: default title: Normalizers -nav_order: 100 +nav_order: 110 --- # Normalizers diff --git a/_analyzers/search-analyzers.md b/_analyzers/search-analyzers.md index b47e739d28..52159edb70 100644 --- a/_analyzers/search-analyzers.md +++ b/_analyzers/search-analyzers.md @@ -2,6 +2,7 @@ layout: default title: Search analyzers nav_order: 30 +parent: Analyzers --- # Search analyzers @@ -42,7 +43,7 @@ GET shakespeare/_search ``` {% include copy-curl.html %} -Valid values for [built-in analyzers]({{site.url}}{{site.baseurl}}/analyzers/index#built-in-analyzers) are `standard`, `simple`, `whitespace`, `stop`, `keyword`, `pattern`, `fingerprint`, or any supported [language analyzer]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/). +For more information about supported analyzers, see [Analyzers]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/index/). ## Specifying a search analyzer for a field diff --git a/_analyzers/supported-analyzers/index.md b/_analyzers/supported-analyzers/index.md new file mode 100644 index 0000000000..43e41b8d6a --- /dev/null +++ b/_analyzers/supported-analyzers/index.md @@ -0,0 +1,41 @@ +--- +layout: default +title: Analyzers +nav_order: 40 +has_children: true +has_toc: false +redirect_from: + - /analyzers/supported-analyzers/index/ +--- + +# Analyzers + +The following sections list all analyzers that OpenSearch supports. + +## Built-in analyzers + +The following table lists the built-in analyzers that OpenSearch provides. The last column of the table contains the result of applying the analyzer to the string `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`. + +Analyzer | Analysis performed | Analyzer output +:--- | :--- | :--- +**Standard** (default) | - Parses strings into tokens at word boundaries
- Removes most punctuation<br>- Converts tokens to lowercase | [`it’s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`]
+**Simple** | - Parses strings into tokens on any non-letter character<br>- Removes non-letter characters<br>- Converts tokens to lowercase | [`it`, `s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `pr`, `or`, `to`, `opensearch`]
+**Whitespace** | - Parses strings into tokens on white space | [`It’s`, `fun`, `to`, `contribute`, `a`, `brand-new`, `PR`, `or`, `2`, `to`, `OpenSearch!`]
+**Stop** | - Parses strings into tokens on any non-letter character<br>- Removes non-letter characters<br>- Removes stop words<br>- Converts tokens to lowercase | [`s`, `fun`, `contribute`, `brand`, `new`, `pr`, `opensearch`]
+**Keyword** (no-op) | - Outputs the entire string unchanged | [`It’s fun to contribute a brand-new PR or 2 to OpenSearch!`]
+**Pattern** | - Parses strings into tokens using regular expressions<br>- Supports converting strings to lowercase<br>- Supports removing stop words | [`it`, `s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`]
+[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`]
+**Fingerprint** | - Parses strings on any non-letter character<br>- Normalizes characters by converting them to ASCII<br>- Converts tokens to lowercase<br>- Sorts, deduplicates, and concatenates tokens into a single token<br>- Supports removing stop words | [`2 a brand contribute fun it's new opensearch or pr to`]
Note that the apostrophe was converted to its ASCII counterpart. + +## Language analyzers + +OpenSearch supports multiple language analyzers. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index). + +## Additional analyzers + +The following table lists the additional analyzers that OpenSearch supports. + +| Analyzer | Analysis performed | +|:---------------|:---------------------------------------------------------------------------------------------------------| +| `phone` | An [index analyzer]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) for parsing phone numbers. | +| `phone-search` | A [search analyzer]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/) for parsing phone numbers. | diff --git a/_analyzers/supported-analyzers/phone-analyzers.md b/_analyzers/supported-analyzers/phone-analyzers.md new file mode 100644 index 0000000000..f24b7cf328 --- /dev/null +++ b/_analyzers/supported-analyzers/phone-analyzers.md @@ -0,0 +1,128 @@ +--- +layout: default +title: Phone number +parent: Analyzers +nav_order: 140 +--- + +# Phone number analyzers + +The `analysis-phonenumber` plugin provides analyzers and tokenizers for parsing phone numbers. A dedicated analyzer is required because parsing phone numbers is a non-trivial task (even though it might seem trivial at first glance). For common misconceptions regarding phone number parsing, see [Falsehoods programmers believe about phone numbers](https://github.com/google/libphonenumber/blob/master/FALSEHOODS.md). + + +OpenSearch supports the following phone number analyzers: + +* [`phone`](#the-phone-analyzer): An [index analyzer]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) to use at indexing time. +* [`phone-search`](#the-phone-search-analyzer): A [search analyzer]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/) to use at search time. + +Internally, the plugin uses the [`libphonenumber`](https://github.com/google/libphonenumber) library and follows its parsing rules. + +The phone number analyzers are not meant to find phone numbers in larger texts. Instead, you should use them on fields that only contain phone numbers. +{: .note} + +## Installing the plugin + +Before you can use the phone number analyzers, you must install the `analysis-phonenumber` plugin by running the following command: + +```sh +./bin/opensearch-plugin install analysis-phonenumber +``` + +## Specifying a default region + +You can optionally specify a default region for parsing phone numbers by providing the `phone-region` parameter within the analyzer. Valid phone regions are represented by ISO 3166 country codes. For more information, see [List of ISO 3166 country codes](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes). + +When tokenizing phone numbers containing the international calling prefix `+`, the default region is irrelevant. However, for phone numbers that use a national prefix for international numbers (for example, `001` instead of `+1` to dial Northern America from most European countries), the region needs to be provided. You can also properly index local phone numbers with no international prefix by specifying the region. 
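As a minimal sketch, an analyzer definition that sets a default region has the following shape (the `phone-us` analyzer name and the `US` region code are illustrative only; any ISO 3166 country code can be used). The next section walks through a complete index example for Switzerland.

```json
PUT /phone-example-us
{
  "settings": {
    "analysis": {
      "analyzer": {
        "phone-us": {
          "type": "phone",
          "phone-region": "US"
        }
      }
    }
  }
}
```
{% include copy-curl.html %}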
+ +## Example + +The following request creates an index containing one field that ingests phone numbers for Switzerland (region code `CH`): + +```json +PUT /example-phone +{ + "settings": { + "analysis": { + "analyzer": { + "phone-ch": { + "type": "phone", + "phone-region": "CH" + }, + "phone-search-ch": { + "type": "phone-search", + "phone-region": "CH" + } + } + } + }, + "mappings": { + "properties": { + "phone_number": { + "type": "text", + "analyzer": "phone-ch", + "search_analyzer": "phone-search-ch" + } + } + } +} +``` +{% include copy-curl.html %} + +## The phone analyzer + +The `phone` analyzer generates n-grams based on the given phone number. A (fictional) Swiss phone number containing an international calling prefix can be parsed with or without the Swiss-specific phone region. Thus, the following two requests will produce the same result: + +```json +GET /example-phone/_analyze +{ + "analyzer" : "phone-ch", + "text" : "+41 60 555 12 34" +} +``` +{% include copy-curl.html %} + +```json +GET /example-phone/_analyze +{ + "analyzer" : "phone", + "text" : "+41 60 555 12 34" +} +``` +{% include copy-curl.html %} + +The response contains the generated n-grams: + +```json +["+41 60 555 12 34", "6055512", "41605551", "416055512", "6055", "41605551234", ...] +``` + +However, if you specify the phone number without the international calling prefix `+` (either by using `0041` or omitting +the international calling prefix altogether), then only the analyzer configured with the correct phone region can parse the number: + +```json +GET /example-phone/_analyze +{ + "analyzer" : "phone-ch", + "text" : "060 555 12 34" +} +``` +{% include copy-curl.html %} + +## The phone-search analyzer + +In contrast, the `phone-search` analyzer does not create n-grams and only issues some basic tokens. For example, send the following request and specify the `phone-search` analyzer: + +```json +GET /example-phone/_analyze +{ + "analyzer" : "phone-search", + "text" : "+41 60 555 12 34" +} +``` +{% include copy-curl.html %} + +The response contains the following tokens: + +```json +["+41 60 555 12 34", "41 60 555 12 34", "41605551234", "605551234", "41"] +``` diff --git a/_analyzers/token-filters/cjk-bigram.md b/_analyzers/token-filters/cjk-bigram.md new file mode 100644 index 0000000000..ab21549c47 --- /dev/null +++ b/_analyzers/token-filters/cjk-bigram.md @@ -0,0 +1,160 @@ +--- +layout: default +title: CJK bigram +parent: Token filters +nav_order: 30 +--- + +# CJK bigram token filter + +The `cjk_bigram` token filter is designed specifically for processing East Asian languages, such as Chinese, Japanese, and Korean (CJK), which typically don't use spaces to separate words. A bigram is a sequence of two adjacent elements in a string of tokens, which can be characters or words. For CJK languages, bigrams help approximate word boundaries and capture significant character pairs that can convey meaning. + + +## Parameters + +The `cjk_bigram` token filter can be configured with two parameters: `ignore_scripts`and `output_unigrams`. + +### `ignore_scripts` + +The `cjk-bigram` token filter ignores all non-CJK scripts (writing systems like Latin or Cyrillic) and tokenizes only CJK text into bigrams. Use this option to specify CJK scripts to be ignored. This option takes the following valid values: + +- `han`: The `han` script processes Han characters. [Han characters](https://simple.wikipedia.org/wiki/Chinese_characters) are logograms used in the written languages of China, Japan, and Korea. 
The filter can help with text processing tasks like tokenizing, normalizing, or stemming text written in Chinese, Japanese kanji, or Korean Hanja. + +- `hangul`: The `hangul` script processes Hangul characters, which are unique to the Korean language and do not exist in other East Asian scripts. + +- `hiragana`: The `hiragana` script processes hiragana, one of the two syllabaries used in the Japanese writing system. + Hiragana is typically used for native Japanese words, grammatical elements, and certain forms of punctuation. + +- `katakana`: The `katakana` script processes katakana, the other Japanese syllabary. + Katakana is mainly used for foreign loanwords, onomatopoeia, scientific names, and certain Japanese words. + + +### `output_unigrams` + +This option, when set to `true`, outputs both unigrams (single characters) and bigrams. Default is `false`. + +## Example + +The following example request creates a new index named `devanagari_example_index` and defines an analyzer with the `cjk_bigram_filter` filter and `ignored_scripts` parameter set to `katakana`: + +```json +PUT /cjk_bigram_example +{ + "settings": { + "analysis": { + "analyzer": { + "cjk_bigrams_no_katakana": { + "tokenizer": "standard", + "filter": [ "cjk_bigrams_no_katakana_filter" ] + } + }, + "filter": { + "cjk_bigrams_no_katakana_filter": { + "type": "cjk_bigram", + "ignored_scripts": [ + "katakana" + ], + "output_unigrams": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /cjk_bigram_example/_analyze +{ + "analyzer": "cjk_bigrams_no_katakana", + "text": "東京タワーに行く" +} +``` +{% include copy-curl.html %} + +Sample text: "東京タワーに行く" + + 東京 (Kanji for "Tokyo") + タワー (Katakana for "Tower") + に行く (Hiragana and Kanji for "go to") + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "東", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "東京", + "start_offset": 0, + "end_offset": 2, + "type": "", + "position": 0, + "positionLength": 2 + }, + { + "token": "京", + "start_offset": 1, + "end_offset": 2, + "type": "", + "position": 1 + }, + { + "token": "タワー", + "start_offset": 2, + "end_offset": 5, + "type": "", + "position": 2 + }, + { + "token": "に", + "start_offset": 5, + "end_offset": 6, + "type": "", + "position": 3 + }, + { + "token": "に行", + "start_offset": 5, + "end_offset": 7, + "type": "", + "position": 3, + "positionLength": 2 + }, + { + "token": "行", + "start_offset": 6, + "end_offset": 7, + "type": "", + "position": 4 + }, + { + "token": "行く", + "start_offset": 6, + "end_offset": 8, + "type": "", + "position": 4, + "positionLength": 2 + }, + { + "token": "く", + "start_offset": 7, + "end_offset": 8, + "type": "", + "position": 5 + } + ] +} +``` + + diff --git a/_analyzers/token-filters/cjk-width.md b/_analyzers/token-filters/cjk-width.md new file mode 100644 index 0000000000..4960729cd1 --- /dev/null +++ b/_analyzers/token-filters/cjk-width.md @@ -0,0 +1,96 @@ +--- +layout: default +title: CJK width +parent: Token filters +nav_order: 40 +--- + +# CJK width token filter + +The `cjk_width` token filter normalizes Chinese, Japanese, and Korean (CJK) tokens by converting full-width ASCII characters to their standard (half-width) ASCII equivalents and half-width katakana characters to their full-width equivalents. 
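In practice, `cjk_width` is usually placed early in a CJK analysis chain so that width differences are resolved before other filters, such as [`cjk_bigram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/cjk-bigram/), form bigrams. The following request is a minimal sketch of one such chain (the index and analyzer names are placeholders, and this particular filter combination is one common arrangement rather than a requirement):

```json
PUT /cjk_chain_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "cjk_chain_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["cjk_width", "lowercase", "cjk_bigram"]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

The following sections describe exactly which characters the `cjk_width` filter normalizes.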
+ +### Converting full-width ASCII characters + +In CJK texts, ASCII characters (such as letters and numbers) can appear in full-width form, occupying the space of two half-width characters. Full-width ASCII characters are typically used in East Asian typography for alignment with the width of CJK characters. However, for the purposes of indexing and searching, these full-width characters need to be normalized to their standard (half-width) ASCII equivalents. + +The following example illustrates ASCII character normalization: + +``` + Full-Width: ABCDE 12345 + Normalized (half-width): ABCDE 12345 +``` + +### Converting half-width katakana characters + +The `cjk_width` token filter converts half-width katakana characters to their full-width counterparts, which are the standard form used in Japanese text. This normalization, illustrated in the following example, is important for consistency in text processing and searching: + + +``` + Half-Width katakana: カタカナ + Normalized (full-width) katakana: カタカナ +``` + +## Example + +The following example request creates a new index named `cjk_width_example_index` and defines an analyzer with the `cjk_width` filter: + +```json +PUT /cjk_width_example_index +{ + "settings": { + "analysis": { + "analyzer": { + "cjk_width_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["cjk_width"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /cjk_width_example_index/_analyze +{ + "analyzer": "cjk_width_analyzer", + "text": "Tokyo 2024 カタカナ" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Tokyo", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "2024", + "start_offset": 6, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "カタカナ", + "start_offset": 11, + "end_offset": 15, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/classic.md b/_analyzers/token-filters/classic.md new file mode 100644 index 0000000000..34db74a824 --- /dev/null +++ b/_analyzers/token-filters/classic.md @@ -0,0 +1,93 @@ +--- +layout: default +title: Classic +parent: Token filters +nav_order: 50 +--- + +# Classic token filter + +The primary function of the classic token filter is to work alongside the classic tokenizer. It processes tokens by applying the following common transformations, which aid in text analysis and search: + - Removal of possessive endings such as *'s*. For example, *John's* becomes *John*. + - Removal of periods from acronyms. For example, *D.A.R.P.A.* becomes *DARPA*. + + +## Example + +The following example request creates a new index named `custom_classic_filter` and configures an analyzer with the `classic` filter: + +```json +PUT /custom_classic_filter +{ + "settings": { + "analysis": { + "analyzer": { + "custom_classic": { + "type": "custom", + "tokenizer": "classic", + "filter": ["classic"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /custom_classic_filter/_analyze +{ + "analyzer": "custom_classic", + "text": "John's co-operate was excellent." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "John", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "co", + "start_offset": 7, + "end_offset": 9, + "type": "", + "position": 1 + }, + { + "token": "operate", + "start_offset": 10, + "end_offset": 17, + "type": "", + "position": 2 + }, + { + "token": "was", + "start_offset": 18, + "end_offset": 21, + "type": "", + "position": 3 + }, + { + "token": "excellent", + "start_offset": 22, + "end_offset": 31, + "type": "", + "position": 4 + } + ] +} +``` + diff --git a/_analyzers/token-filters/common_gram.md b/_analyzers/token-filters/common_gram.md new file mode 100644 index 0000000000..58f5bbe149 --- /dev/null +++ b/_analyzers/token-filters/common_gram.md @@ -0,0 +1,94 @@ +--- +layout: default +title: Common grams +parent: Token filters +nav_order: 60 +--- + +# Common grams token filter + +The `common_grams` token filter improves search relevance by keeping commonly occurring phrases (common grams) in the text. This is useful when dealing with languages or datasets in which certain word combinations frequently occur as a unit and can impact search relevance if treated as separate tokens. If any common words are present in the input string, this token filter generates both their unigrams and bigrams. + +Using this token filter improves search relevance by keeping common phrases intact. This can help in matching queries more accurately, particularly for frequent word combinations. It also improves search precision by reducing the number of irrelevant matches. + +When using this filter, you must carefully select and maintain the `common_words` list. +{: .warning} + +## Parameters + +The `common_grams` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`common_words` | Required | List of strings | A list of words that should be treated as words that commonly appear together. These words will be used to generate common grams. If the `common_words` parameter is an empty list, the `common_grams` token filter becomes a no-op filter, meaning that it doesn't modify the input tokens at all. +`ignore_case` | Optional | Boolean | Indicates whether the filter should ignore case differences when matching common words. Default is `false`. +`query_mode` | Optional | Boolean | When set to `true`, the following rules are applied:
- Unigrams that are generated from `common_words` are not included in the output.
- Bigrams in which a non-common word is followed by a common word are retained in the output.
- Unigrams of non-common words are excluded if they are immediately followed by a common word.
- If a non-common word appears at the end of the text and is preceded by a common word, its unigram is not included in the output. + + +## Example + +The following example request creates a new index named `my_common_grams_index` and configures an analyzer with the `common_grams` filter: + +```json +PUT /my_common_grams_index +{ + "settings": { + "analysis": { + "filter": { + "my_common_grams_filter": { + "type": "common_grams", + "common_words": ["a", "in", "for"], + "ignore_case": true, + "query_mode": true + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_common_grams_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_common_grams_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "A quick black cat jumps over the lazy dog in the park" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "a_quick","start_offset": 0,"end_offset": 7,"type": "gram","position": 0}, + {"token": "quick","start_offset": 2,"end_offset": 7,"type": "","position": 1}, + {"token": "black","start_offset": 8,"end_offset": 13,"type": "","position": 2}, + {"token": "cat","start_offset": 14,"end_offset": 17,"type": "","position": 3}, + {"token": "jumps","start_offset": 18,"end_offset": 23,"type": "","position": 4}, + {"token": "over","start_offset": 24,"end_offset": 28,"type": "","position": 5}, + {"token": "the","start_offset": 29,"end_offset": 32,"type": "","position": 6}, + {"token": "lazy","start_offset": 33,"end_offset": 37,"type": "","position": 7}, + {"token": "dog_in","start_offset": 38,"end_offset": 44,"type": "gram","position": 8}, + {"token": "in_the","start_offset": 42,"end_offset": 48,"type": "gram","position": 9}, + {"token": "the","start_offset": 45,"end_offset": 48,"type": "","position": 10}, + {"token": "park","start_offset": 49,"end_offset": 53,"type": "","position": 11} + ] +} +``` + diff --git a/_analyzers/token-filters/condition.md b/_analyzers/token-filters/condition.md new file mode 100644 index 0000000000..5e87c2cbbf --- /dev/null +++ b/_analyzers/token-filters/condition.md @@ -0,0 +1,135 @@ +--- +layout: default +title: Condition +parent: Token filters +nav_order: 70 +--- + +# Condition token filter + +The `condition` token filter is a special type of filter that allows you to apply other token filters conditionally based on certain criteria. This provides more control over when certain token filters should be applied during text analysis. +Multiple filters can be configured and only applied when they meet the conditions you define. +This token filter can be very useful for language-specific processing and handling of special characters. + + +## Parameters + +There are two parameters that must be configured in order to use the `condition` token filter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`filter` | Required | Array | Specifies which token filters should be applied to the tokens when the specified condition (defined by the `script` parameter) is met. +`script` | Required | Object | Configures an [inline script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/exec-script/) that defines the condition that needs to be met in order for the filters specified in the `filter` parameter to be applied (only inline scripts are accepted). 
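The script is not limited to substring checks: it can test any token property exposed to the inline script context. As a minimal sketch (the index, analyzer, and filter names are placeholders, and the length-based condition assumes the same `token` object used by the script in the example that follows), the following request applies the `lowercase` filter only to tokens shorter than five characters:

```json
PUT /my_conditional_length_index
{
  "settings": {
    "analysis": {
      "filter": {
        "short_token_lowercase": {
          "type": "condition",
          "filter": ["lowercase"],
          "script": {
            "source": "token.getTerm().length() < 5"
          }
        }
      },
      "analyzer": {
        "short_lowercase_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["short_token_lowercase"]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}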
+ + +## Example + +The following example request creates a new index named `my_conditional_index` and configures an analyzer with a `condition` filter. This filter applies a `lowercase` filter to any tokens that contain the character sequence "um": + +```json +PUT /my_conditional_index +{ + "settings": { + "analysis": { + "filter": { + "my_conditional_filter": { + "type": "condition", + "filter": ["lowercase"], + "script": { + "source": "token.getTerm().toString().contains('um')" + } + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "my_conditional_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_conditional_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "THE BLACK CAT JUMPS OVER A LAZY DOG" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "THE", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "BLACK", + "start_offset": 4, + "end_offset": 9, + "type": "", + "position": 1 + }, + { + "token": "CAT", + "start_offset": 10, + "end_offset": 13, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 14, + "end_offset": 19, + "type": "", + "position": 3 + }, + { + "token": "OVER", + "start_offset": 20, + "end_offset": 24, + "type": "", + "position": 4 + }, + { + "token": "A", + "start_offset": 25, + "end_offset": 26, + "type": "", + "position": 5 + }, + { + "token": "LAZY", + "start_offset": 27, + "end_offset": 31, + "type": "", + "position": 6 + }, + { + "token": "DOG", + "start_offset": 32, + "end_offset": 35, + "type": "", + "position": 7 + } + ] +} +``` + diff --git a/_analyzers/token-filters/decimal-digit.md b/_analyzers/token-filters/decimal-digit.md new file mode 100644 index 0000000000..002375f7e5 --- /dev/null +++ b/_analyzers/token-filters/decimal-digit.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Decimal digit +parent: Token filters +nav_order: 80 +--- + +# Decimal digit token filter + +The `decimal_digit` token filter is used to normalize decimal digit characters (0--9) into their ASCII equivalents in various scripts. This is useful when you want to ensure that all digits are treated uniformly in text analysis, regardless of the script in which they are written. 
+ + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `decimal_digit` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_decimal_digit_filter": { + "type": "decimal_digit" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["my_decimal_digit_filter"] + } + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "123 ١٢٣ १२३" +} +``` +{% include copy-curl.html %} + +`text` breakdown: + + - "123" (ASCII digits) + - "١٢٣" (Arabic-Indic digits) + - "१२३" (Devanagari digits) + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "123", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "123", + "start_offset": 4, + "end_offset": 7, + "type": "", + "position": 1 + }, + { + "token": "123", + "start_offset": 8, + "end_offset": 11, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/delimited-payload.md b/_analyzers/token-filters/delimited-payload.md new file mode 100644 index 0000000000..f17bb1b1ce --- /dev/null +++ b/_analyzers/token-filters/delimited-payload.md @@ -0,0 +1,211 @@ +--- +layout: default +title: Delimited payload +parent: Token filters +nav_order: 90 +--- + +# Delimited payload token filter + +The `delimited_payload` token filter is used to parse tokens containing payloads during the analysis process. For example, the string `red|1.5 fast|2.0 car|1.0` is parsed into the tokens `red` (with a payload of `1.5`), `fast` (with a payload of `2.0`), and `car` (with a payload of `1.0`). This is particularly useful when your tokens include additional associated data (like weights, scores, or other numeric values) that you can use for scoring or custom query logic. The filter can handle different types of payloads, including integers, floats, and strings, and attach payloads (extra metadata) to tokens. + +When analyzing text, the `delimited_payload` token filter parses each token, extracts the payload, and attaches it to the token. This payload can later be used in queries to influence scoring, boosting, or other custom behaviors. + +Payloads are stored as Base64-encoded strings. By default, payloads are not returned in the query response along with the tokens. To return the payloads, you must configure additional parameters. For more information, see [Example with a stored payload]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-payload/#example-without-a-stored-payload). + +## Parameters + +The `delimited_payload` token filter has two parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`encoding` | Optional | String | Specifies the data type of the payload attached to the tokens. This determines how the payload data is interpreted during analysis and querying.
Valid values are:

- `float`: The payload is interpreted as a 32-bit floating-point number using IEEE 754 format (for example, `2.5` in `car|2.5`).
- `identity`: The payload is interpreted as a sequence of characters (for example, in `user|admin`, `admin` is interpreted as a string).
- `int`: The payload is interpreted as a 32-bit integer (for example, `1` in `priority|1`).
Default is `float`. +`delimiter` | Optional | String | Specifies the character that separates the token from its payload in the input text. Default is the pipe character (`|`). + +## Example without a stored payload + +The following example request creates a new index named `my_index` and configures an analyzer with a `delimited_payload` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_payload_filter": { + "type": "delimited_payload", + "delimiter": "|", + "encoding": "float" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "whitespace", + "filter": ["my_payload_filter"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "red|1.5 fast|2.0 car|1.0" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "red", + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "fast", + "start_offset": 8, + "end_offset": 16, + "type": "word", + "position": 1 + }, + { + "token": "car", + "start_offset": 17, + "end_offset": 24, + "type": "word", + "position": 2 + } + ] +} +``` + +## Example with a stored payload + +To configure the payload to be returned in the response, create an index that stores term vectors and set `term_vector` to `with_positions_payloads` or `with_positions_offsets_payloads` in the index mappings. For example, the following index is configured to store term vectors: + +```json +PUT /visible_payloads +{ + "mappings": { + "properties": { + "text": { + "type": "text", + "term_vector": "with_positions_payloads", + "analyzer": "custom_analyzer" + } + } + }, + "settings": { + "analysis": { + "filter": { + "my_payload_filter": { + "type": "delimited_payload", + "delimiter": "|", + "encoding": "float" + } + }, + "analyzer": { + "custom_analyzer": { + "tokenizer": "whitespace", + "filter": [ "my_payload_filter" ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +You can index a document into this index using the following request: + +```json +PUT /visible_payloads/_doc/1 +{ + "text": "red|1.5 fast|2.0 car|1.0" +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /visible_payloads/_termvectors/1 +{ + "fields": ["text"] +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens, which include payloads: + +```json +{ + "_index": "visible_payloads", + "_id": "1", + "_version": 1, + "found": true, + "took": 3, + "term_vectors": { + "text": { + "field_statistics": { + "sum_doc_freq": 3, + "doc_count": 1, + "sum_ttf": 3 + }, + "terms": { + "brown": { + "term_freq": 1, + "tokens": [ + { + "position": 1, + "start_offset": 10, + "end_offset": 19, + "payload": "QEAAAA==" + } + ] + }, + "fox": { + "term_freq": 1, + "tokens": [ + { + "position": 2, + "start_offset": 20, + "end_offset": 27, + "payload": "P8AAAA==" + } + ] + }, + "quick": { + "term_freq": 1, + "tokens": [ + { + "position": 0, + "start_offset": 0, + "end_offset": 9, + "payload": "QCAAAA==" + } + ] + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_analyzers/token-filters/dictionary-decompounder.md b/_analyzers/token-filters/dictionary-decompounder.md new file mode 100644 index 0000000000..ced6fd6fbc --- /dev/null +++ 
b/_analyzers/token-filters/dictionary-decompounder.md @@ -0,0 +1,101 @@ +--- +layout: default +title: Dictionary decompounder +parent: Token filters +nav_order: 110 +--- + +# Dictionary decompounder token filter + +The `dictionary_decompounder` token filter is used to split compound words into their constituent parts based on a predefined dictionary. This filter is particularly useful for languages like German, Dutch, or Finnish, in which compound words are common, so breaking them down can improve search relevance. The `dictionary_decompounder` token filter determines whether each token (word) can be split into smaller tokens based on a list of known words. If the token can be split into known words, the filter generates the subtokens for the token. + +## Parameters + +The `dictionary_decompounder` token filter has the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`word_list` | Required unless `word_list_path` is configured | Array of strings | The dictionary of words that the filter uses to split compound words. +`word_list_path` | Required unless `word_list` is configured | String | A file path to a text file containing the dictionary words. Accepts either an absolute path or a path relative to the `config` directory. The dictionary file must be UTF-8 encoded, and each word must be listed on a separate line. +`min_word_size` | Optional | Integer | The minimum length of the entire compound word that will be considered for splitting. If a compound word is shorter than this value, it is not split. Default is `5`. +`min_subword_size` | Optional | Integer | The minimum length for any subword. If a subword is shorter than this value, it is not included in the output. Default is `2`. +`max_subword_size` | Optional | Integer | The maximum length for any subword. If a subword is longer than this value, it is not included in the output. Default is `15`. +`only_longest_match` | Optional | Boolean | If set to `true`, only the longest matching subword will be returned. Default is `false`. 
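If the dictionary is large, it is often more practical to keep it in a file and reference it using `word_list_path`. The following request is a minimal sketch of that approach (the index, filter, and file names are placeholders; as described in the preceding table, the file must be UTF-8 encoded and contain one word per line):

```json
PUT /decompound_file_example
{
  "settings": {
    "analysis": {
      "filter": {
        "file_dictionary_decompounder": {
          "type": "dictionary_decompounder",
          "word_list_path": "analysis/decompound_words.txt",
          "min_subword_size": 3
        }
      },
      "analyzer": {
        "my_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "file_dictionary_decompounder"]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}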
+ +## Example + +The following example request creates a new index named `decompound_example` and configures an analyzer with the `dictionary_decompounder` filter: + +```json +PUT /decompound_example +{ + "settings": { + "analysis": { + "filter": { + "my_dictionary_decompounder": { + "type": "dictionary_decompounder", + "word_list": ["slow", "green", "turtle"] + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "my_dictionary_decompounder"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /decompound_example/_analyze +{ + "analyzer": "my_analyzer", + "text": "slowgreenturtleswim" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slowgreenturtleswim", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + }, + { + "token": "green", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/edge-ngram.md b/_analyzers/token-filters/edge-ngram.md new file mode 100644 index 0000000000..be3eaf6fab --- /dev/null +++ b/_analyzers/token-filters/edge-ngram.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Edge n-gram +parent: Token filters +nav_order: 120 +--- +# Edge n-gram token filter +The `edge_ngram` token filter is very similar to the `ngram` token filter, where a particular string is split into substrings of different lengths. The `edge_ngram` token filter, however, generates n-grams (substrings) only from the beginning (edge) of a token. It's particularly useful in scenarios like autocomplete or prefix matching, where you want to match the beginning of words or phrases as the user types them. + +## Parameters + +The `edge_ngram` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_gram` | Optional | Integer | The minimum length of the n-grams that will be generated. Default is `1`. +`max_gram` | Optional | Integer | The maximum length of the n-grams that will be generated. Default is `1` for the `edge_ngram` filter and `2` for custom token filters. Avoid setting this parameter to a low value. If the value is set too low, only very short n-grams will be generated and the search term will not be found. For example, if `max_gram` is set to `3` and you index the word "banana", the longest generated token will be "ban". If the user searches for "banana", no matches will be returned. You can use the `truncate` token filter as a search analyzer to mitigate this risk. +`preserve_original` | Optional | Boolean | Includes the original token in the output. Default is `false` . 
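As noted in the `max_gram` description above, one way to keep longer search terms from missing short n-grams is to truncate the query input to the same maximum length at search time. The following request is a minimal sketch of that pairing (the index, field, analyzer, and filter names are placeholders; it assumes the `truncate` token filter with its `length` parameter). The basic example in the next section shows the `edge_ngram` filter on its own:

```json
PUT /edge_ngram_truncate_example
{
  "settings": {
    "analysis": {
      "filter": {
        "my_edge_ngram": {
          "type": "edge_ngram",
          "min_gram": 3,
          "max_gram": 4
        },
        "my_truncate": {
          "type": "truncate",
          "length": 4
        }
      },
      "analyzer": {
        "index_time_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "my_edge_ngram"]
        },
        "search_time_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "my_truncate"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "index_time_analyzer",
        "search_analyzer": "search_time_analyzer"
      }
    }
  }
}
```
{% include copy-curl.html %}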
+ +## Example + +The following example request creates a new index named `edge_ngram_example` and configures an analyzer with the `edge_ngram` filter: + +```json +PUT /edge_ngram_example +{ + "settings": { + "analysis": { + "filter": { + "my_edge_ngram": { + "type": "edge_ngram", + "min_gram": 3, + "max_gram": 4 + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "my_edge_ngram"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /edge_ngram_example/_analyze +{ + "analyzer": "my_analyzer", + "text": "slow green turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slo", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "gre", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "gree", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "tur", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + }, + { + "token": "turt", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/elision.md b/_analyzers/token-filters/elision.md new file mode 100644 index 0000000000..abc6dba658 --- /dev/null +++ b/_analyzers/token-filters/elision.md @@ -0,0 +1,124 @@ +--- +layout: default +title: Elision +parent: Token filters +nav_order: 130 +--- + +# Elision token filter + +The `elision` token filter is used to remove elided characters from words in certain languages. Elision typically occurs in languages such as French, in which words are often contracted and combined with the following word, typically by omitting a vowel and replacing it with an apostrophe. + +The `elision` token filter is already preconfigured in the following [language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/): `catalan`, `french`, `irish`, and `italian`. +{: .note} + +## Parameters + +The custom `elision` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`articles` | Required if `articles_path` is not configured | Array of strings | Defines which articles or short words should be removed when they appear as part of an elision. +`articles_path` | Required if `articles` is not configured | String | Specifies the path to a custom list of articles that should be removed during the analysis process. +`articles_case` | Optional | Boolean | Specifies whether the filter is case sensitive when matching elisions. Default is `false`. + +## Example + +The default set of French elisions is `l'`, `m'`, `t'`, `qu'`, `n'`, `s'`, `j'`, `d'`, `c'`, `jusqu'`, `quoiqu'`, `lorsqu'`, and `puisqu'`. You can update this by configuring the `french_elision` token filter. 
The following example request creates a new index named `french_texts` and configures an analyzer with a `french_elision` filter: + +```json +PUT /french_texts +{ + "settings": { + "analysis": { + "filter": { + "french_elision": { + "type": "elision", + "articles": [ "l", "t", "m", "d", "n", "s", "j" ] + } + }, + "analyzer": { + "french_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "french_elision"] + } + } + } + }, + "mappings": { + "properties": { + "text": { + "type": "text", + "analyzer": "french_analyzer" + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /french_texts/_analyze +{ + "analyzer": "french_analyzer", + "text": "L'étudiant aime l'école et le travail." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "étudiant", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "aime", + "start_offset": 11, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "école", + "start_offset": 16, + "end_offset": 23, + "type": "", + "position": 2 + }, + { + "token": "et", + "start_offset": 24, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "le", + "start_offset": 27, + "end_offset": 29, + "type": "", + "position": 4 + }, + { + "token": "travail", + "start_offset": 30, + "end_offset": 37, + "type": "", + "position": 5 + } + ] +} +``` diff --git a/_analyzers/token-filters/fingerprint.md b/_analyzers/token-filters/fingerprint.md new file mode 100644 index 0000000000..75c6615459 --- /dev/null +++ b/_analyzers/token-filters/fingerprint.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Fingerprint +parent: Token filters +nav_order: 140 +--- + +# Fingerprint token filter + +The `fingerprint` token filter is used to standardize and deduplicate text. This is particularly useful when consistency in text processing is crucial. The `fingerprint` token filter achieves this by processing text using the following steps: + +1. **Lowercasing**: Converts all text to lowercase. +2. **Splitting**: Breaks the text into tokens. +3. **Sorting**: Arranges the tokens in alphabetical order. +4. **Removing duplicates**: Eliminates repeated tokens. +5. **Joining tokens**: Combines the tokens into a single string, typically joined by a space or another specified separator. + +## Parameters + +The `fingerprint` token filter can be configured with the following two parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_output_size` | Optional | Integer | Limits the length of the generated fingerprint string. If the concatenated string exceeds the `max_output_size`, the filter will not produce any output, resulting in an empty token. Default is `255`. +`separator` | Optional | String | Defines the character(s) used to join the tokens into a single string after they have been sorted and deduplicated. Default is space (`" "`). 
+ +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `fingerprint` token filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_fingerprint": { + "type": "fingerprint", + "max_output_size": 200, + "separator": "-" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_fingerprint" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "OpenSearch is a powerful search engine that scales easily" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "a-easily-engine-is-opensearch-powerful-scales-search-that", + "start_offset": 0, + "end_offset": 57, + "type": "fingerprint", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/flatten-graph.md b/_analyzers/token-filters/flatten-graph.md new file mode 100644 index 0000000000..8d51c57400 --- /dev/null +++ b/_analyzers/token-filters/flatten-graph.md @@ -0,0 +1,109 @@ +--- +layout: default +title: Flatten graph +parent: Token filters +nav_order: 150 +--- + +# Flatten graph token filter + +The `flatten_graph` token filter is used to handle complex token relationships that occur when multiple tokens are generated at the same position in a graph structure. Some token filters, like `synonym_graph` and `word_delimiter_graph`, generate multi-position tokens---tokens that overlap or span multiple positions. These token graphs are useful for search queries but are not directly supported during indexing. The `flatten_graph` token filter resolves multi-position tokens into a linear sequence of tokens. Flattening the graph ensures compatibility with the indexing process. + +Token graph flattening is a lossy process. Whenever possible, avoid using the `flatten_graph` filter. Instead, apply graph token filters exclusively in search analyzers, removing the need for the `flatten_graph` filter. 
+{: .important} + +## Example + +The following example request creates a new index named `test_index` and configures an analyzer with a `flatten_graph` filter: + +```json +PUT /test_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_index_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "my_custom_filter", + "flatten_graph" + ] + } + }, + "filter": { + "my_custom_filter": { + "type": "word_delimiter_graph", + "catenate_all": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /test_index/_analyze +{ + "analyzer": "my_index_analyzer", + "text": "OpenSearch helped many employers" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0, + "positionLength": 2 + }, + { + "token": "Open", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "Search", + "start_offset": 4, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "helped", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + }, + { + "token": "many", + "start_offset": 18, + "end_offset": 22, + "type": "", + "position": 3 + }, + { + "token": "employers", + "start_offset": 23, + "end_offset": 32, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/hunspell.md b/_analyzers/token-filters/hunspell.md new file mode 100644 index 0000000000..6720ba74de --- /dev/null +++ b/_analyzers/token-filters/hunspell.md @@ -0,0 +1,108 @@ +--- +layout: default +title: Hunspell +parent: Token filters +nav_order: 160 +--- + +# Hunspell token filter + +The `hunspell` token filter is used for stemming and morphological analysis of words in a specific language. This filter applies Hunspell dictionaries, which are widely used in spell checkers. It works by breaking down words into their root forms (stemming). + +The Hunspell dictionary files are automatically loaded at startup from the `/hunspell/` directory. For example, the `en_GB` locale must have at least one `.aff` file and one or more `.dic` files in the `/hunspell/en_GB/` directory. + +You can download these files from [LibreOffice dictionaries](https://github.com/LibreOffice/dictionaries). + +## Parameters + +The `hunspell` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`language/lang/locale` | At least one of the three is required | String | Specifies the language for the Hunspell dictionary. +`dedup` | Optional | Boolean | Determines whether to remove multiple duplicate stemming terms for the same token. Default is `true`. +`dictionary` | Optional | Array of strings | Configures the dictionary files to be used for the Hunspell dictionary. Default is all files in the `/hunspell/` directory. +`longest_only` | Optional | Boolean | Specifies whether only the longest stemmed version of the token should be returned. Default is `false`. 
+ +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `hunspell` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_hunspell_filter": { + "type": "hunspell", + "lang": "en_GB", + "dedup": true, + "longest_only": true + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_hunspell_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "the turtle moves slowly" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 4, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "move", + "start_offset": 11, + "end_offset": 16, + "type": "", + "position": 2 + }, + { + "token": "slow", + "start_offset": 17, + "end_offset": 23, + "type": "", + "position": 3 + } + ] +} +``` diff --git a/_analyzers/token-filters/hyphenation-decompounder.md b/_analyzers/token-filters/hyphenation-decompounder.md new file mode 100644 index 0000000000..6e53d4dfd5 --- /dev/null +++ b/_analyzers/token-filters/hyphenation-decompounder.md @@ -0,0 +1,102 @@ +--- +layout: default +title: Hyphenation decompounder +parent: Token filters +nav_order: 170 +--- + +# Hyphenation decompounder token filter + +The `hyphenation_decompounder` token filter is used to break down compound words into their constituent parts. This filter is particularly useful for languages like German, Dutch, and Swedish, in which compound words are common. The filter uses hyphenation patterns (typically defined in .xml files) to identify the possible locations within a compound word where it can be split into components. These components are then checked against a provided dictionary. If there is a match, those components are treated as valid tokens. For more information about hyphenation pattern files, see [FOP XML Hyphenation Patterns](https://offo.sourceforge.net/#FOP+XML+Hyphenation+Patterns). + +## Parameters + +The `hyphenation_decompounder` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`hyphenation_patterns_path` | Required | String | The path (relative to the `config` directory or absolute) to the hyphenation patterns file, which contains the language-specific rules for word splitting. The file is typically in XML format. Sample files can be downloaded from the [OFFO SourceForge project](https://sourceforge.net/projects/offo/). +`word_list` | Required if `word_list_path` is not set | Array of strings | A list of words used to validate the components generated by the hyphenation patterns. +`word_list_path` | Required if `word_list` is not set | String | The path (relative to the `config` directory or absolute) to a list of subwords. +`max_subword_size` | Optional | Integer | The maximum subword length. If the generated subword exceeds this length, it will not be added to the generated tokens. Default is `15`. +`min_subword_size` | Optional | Integer | The minimum subword length. 
If the generated subword is shorter than the specified length, it will not be added to the generated tokens. Default is `2`. +`min_word_size` | Optional | Integer | The minimum word character length. Word tokens shorter than this length are excluded from decomposition into subwords. Default is `5`. +`only_longest_match` | Optional | Boolean | Only includes the longest subword in the generated tokens. Default is `false`. + +## Example + +The following example request creates a new index named `test_index` and configures an analyzer with a `hyphenation_decompounder` filter: + +```json +PUT /test_index +{ + "settings": { + "analysis": { + "filter": { + "my_hyphenation_decompounder": { + "type": "hyphenation_decompounder", + "hyphenation_patterns_path": "analysis/hyphenation_patterns.xml", + "word_list": ["notebook", "note", "book"], + "min_subword_size": 3, + "min_word_size": 5, + "only_longest_match": false + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_hyphenation_decompounder" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /test_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "notebook" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "notebook", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "note", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "book", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/index.md b/_analyzers/token-filters/index.md index a9b621d5ab..b06489c805 100644 --- a/_analyzers/token-filters/index.md +++ b/_analyzers/token-filters/index.md @@ -4,6 +4,8 @@ title: Token filters nav_order: 70 has_children: true has_toc: false +redirect_from: + - /analyzers/token-filters/index/ --- # Token filters @@ -15,51 +17,51 @@ The following table lists all token filters that OpenSearch supports. Token filter | Underlying Lucene token filter| Description [`apostrophe`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/apostrophe/) | [ApostropheFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/ApostropheFilter.html) | In each token containing an apostrophe, the `apostrophe` token filter removes the apostrophe itself and all characters following it. [`asciifolding`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/asciifolding/) | [ASCIIFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html) | Converts alphabetic, numeric, and symbolic characters. -`cjk_bigram` | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens. -`cjk_width` | [CJKWidthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html) | Normalizes Chinese, Japanese, and Korean (CJK) tokens according to the following rules:
- Folds full-width ASCII character variants into the equivalent basic Latin characters.
- Folds half-width Katakana character variants into the equivalent Kana characters. -`classic` | [ClassicFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/classic/ClassicFilter.html) | Performs optional post-processing on the tokens generated by the classic tokenizer. Removes possessives (`'s`) and removes `.` from acronyms. -`common_grams` | [CommonGramsFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html) | Generates bigrams for a list of frequently occurring terms. The output contains both single terms and bigrams. -`conditional` | [ConditionalTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.html) | Applies an ordered list of token filters to tokens that match the conditions provided in a script. -`decimal_digit` | [DecimalDigitFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). -`delimited_payload` | [DelimitedPayloadTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html) | Separates a token stream into tokens with corresponding payloads, based on a provided delimiter. A token consists of all characters before the delimiter, and a payload consists of all characters after the delimiter. For example, if the delimiter is `|`, then for the string `foo|bar`, `foo` is the token and `bar` is the payload. +[`cjk_bigram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/cjk-bigram/) | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens. +[`cjk_width`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/cjk-width/) | [CJKWidthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html) | Normalizes Chinese, Japanese, and Korean (CJK) tokens according to the following rules:
- Folds full-width ASCII character variants into their equivalent basic Latin characters.
- Folds half-width katakana character variants into their equivalent kana characters. +[`classic`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/classic) | [ClassicFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/classic/ClassicFilter.html) | Performs optional post-processing on the tokens generated by the classic tokenizer. Removes possessives (`'s`) and removes `.` from acronyms. +[`common_grams`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/common_gram/) | [CommonGramsFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html) | Generates bigrams for a list of frequently occurring terms. The output contains both single terms and bigrams. +[`conditional`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/condition/) | [ConditionalTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.html) | Applies an ordered list of token filters to tokens that match the conditions provided in a script. +[`decimal_digit`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/decimal-digit/) | [DecimalDigitFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). +[`delimited_payload`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-payload/) | [DelimitedPayloadTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html) | Separates a token stream into tokens with corresponding payloads, based on a provided delimiter. A token consists of all characters preceding the delimiter, and a payload consists of all characters following the delimiter. For example, if the delimiter is `|`, then for the string `foo|bar`, `foo` is the token and `bar` is the payload. [`delimited_term_freq`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-term-frequency/) | [DelimitedTermFrequencyTokenFilter](https://lucene.apache.org/core/9_7_0/analysis/common/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.html) | Separates a token stream into tokens with corresponding term frequencies, based on a provided delimiter. A token consists of all characters before the delimiter, and a term frequency is the integer after the delimiter. For example, if the delimiter is `|`, then for the string `foo|5`, `foo` is the token and `5` is the term frequency. -`dictionary_decompounder` | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. -`edge_ngram` | [EdgeNGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. -`elision` | [ElisionFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). 
-`fingerprint` | [FingerprintFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. -`flatten_graph` | [FlattenGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. -`hunspell` | [HunspellStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell supports a word having multiple stems, this filter can emit multiple tokens for each consumed token. Requires you to configure one or more language-specific Hunspell dictionaries. -`hyphenation_decompounder` | [HyphenationCompoundWordTokenFilter](https://lucene.apache.org/core/9_8_0/analysis/common/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.html) | Uses XML-based hyphenation patterns to find potential subwords in compound words and checks the subwords against the specified word list. The token output contains only the subwords found in the word list. -`keep_types` | [TypeTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. -`keep_word` | [KeepWordFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. -`keyword_marker` | [KeywordMarkerFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. -`keyword_repeat` | [KeywordRepeatFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. -`kstem` | [KStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides kstem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. -`kuromoji_completion` | [JapaneseCompletionFilter](https://lucene.apache.org/core/9_10_0/analysis/kuromoji/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.html) | Adds Japanese romanized terms to the token stream (in addition to the original tokens). Usually used to support autocomplete on Japanese search terms. Note that the filter has a `mode` parameter, which should be set to `index` when used in an index analyzer and `query` when used in a search analyzer. Requires the `analysis-kuromoji` plugin. For information about installing the plugin, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins). -`length` | [LengthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens whose lengths are shorter or longer than the length range specified by `min` and `max`. 
-`limit` | [LimitTokenCountFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. A common use case is to limit the size of document field values based on token count. -`lowercase` | [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) is for the English language. You can set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). -`min_hash` | [MinHashFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. -`multiplexer` | N/A | Emits multiple tokens at the same position. Runs each token through each of the specified filter lists separately and outputs the results as separate tokens. -`ngram` | [NGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. -Normalization | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization` : [ScandinavianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. -`pattern_capture` | N/A | Generates a token for every capture group in the provided regular expression. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). -`pattern_replace` | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). -`phonetic` | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. -`porter_stem` | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. -`predicate_token_filter` | N/A | Removes tokens that don’t match the specified predicate script. Supports inline Painless scripts only. -`remove_duplicates` | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. -`reverse` | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. -`shingle` | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but apply to words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. -`snowball` | N/A | Stems words using a [Snowball-generated stemmer](https://snowballstem.org/). You can use the `snowball` token filter with the following languages in the `language` field: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Estonian`, `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Irish`, `Italian`, `Kp`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, `Turkish`. 
-`stemmer` | N/A | Provides algorithmic stemming for the following languages in the `language` field: `arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `dutch_kp`, `english`, `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english`, `estonian`, `finnish`, `light_finnish`, `french`, `light_french`, `minimal_french`, `galician`, `minimal_galician`, `german`, `german2`, `light_german`, `minimal_german`, `greek`, `hindi`, `hungarian`, `light_hungarian`, `indonesian`, `irish`, `italian`, `light_italian`, `latvian`, `Lithuanian`, `norwegian`, `light_norwegian`, `minimal_norwegian`, `light_nynorsk`, `minimal_nynorsk`, `portuguese`, `light_portuguese`, `minimal_portuguese`, `portuguese_rslp`, `romanian`, `russian`, `light_russian`, `sorani`, `spanish`, `light_spanish`, `swedish`, `light_swedish`, `turkish`. -`stemmer_override` | N/A | Overrides stemming algorithms by applying a custom mapping so that the provided terms are not stemmed. -`stop` | [StopFilter](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/StopFilter.html) | Removes stop words from a token stream. -`synonym` | N/A | Supplies a synonym list for the analysis process. The synonym list is provided using a configuration file. -`synonym_graph` | N/A | Supplies a synonym list, including multiword synonyms, for the analysis process. -`trim` | [TrimFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing white space from each token in a stream. -`truncate` | [TruncateTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens whose length exceeds the specified character limit. -`unique` | N/A | Ensures each token is unique by removing duplicate tokens from a stream. -`uppercase` | [UpperCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to uppercase. -`word_delimiter` | [WordDelimiterFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. -`word_delimiter_graph` | [WordDelimiterGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. Assigns multi-position tokens a `positionLength` attribute. +[`dictionary_decompounder`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/dictionary-decompounder/) | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. +[`edge_ngram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/edge-ngram/) | [EdgeNGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. 
+[`elision`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/elision/) | [ElisionFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). +[`fingerprint`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/fingerprint/) | [FingerprintFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. +[`flatten_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/flatten-graph/) | [FlattenGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. +[`hunspell`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/hunspell/) | [HunspellStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell allows a word to have multiple stems, this filter can emit multiple tokens for each consumed token. Requires the configuration of one or more language-specific Hunspell dictionaries. +[`hyphenation_decompounder`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/hyphenation-decompounder/) | [HyphenationCompoundWordTokenFilter](https://lucene.apache.org/core/9_8_0/analysis/common/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.html) | Uses XML-based hyphenation patterns to find potential subwords in compound words and checks the subwords against the specified word list. The token output contains only the subwords found in the word list. +[`keep_types`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keep-types/) | [TypeTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. +[`keep_words`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keep-words/) | [KeepWordFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. +[`keyword_marker`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keyword-marker/) | [KeywordMarkerFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. +[`keyword_repeat`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keyword-repeat/) | [KeywordRepeatFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. +[`kstem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/kstem/) | [KStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides KStem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. 
+[`kuromoji_completion`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/kuromoji-completion/) | [JapaneseCompletionFilter](https://lucene.apache.org/core/9_10_0/analysis/kuromoji/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.html) | Adds Japanese romanized terms to a token stream (in addition to the original tokens). Usually used to support autocomplete of Japanese search terms. Note that the filter has a `mode` parameter that should be set to `index` when used in an index analyzer and `query` when used in a search analyzer. Requires the `analysis-kuromoji` plugin. For information about installing the plugin, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins). +[`length`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/length/) | [LengthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens that are shorter or longer than the length range specified by `min` and `max`. +[`limit`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/limit/) | [LimitTokenCountFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. For example, document field value sizes can be limited based on the token count. +[`lowercase`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/lowercase/) | [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) processes the English language. To process other languages, set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). +[`min_hash`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/min-hash/) | [MinHashFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. +[`multiplexer`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/multiplexer/) | N/A | Emits multiple tokens at the same position. Runs each token through each of the specified filter lists separately and outputs the results as separate tokens. +[`ngram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/ngram/) | [NGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. +[Normalization]({{site.url}}{{site.baseurl}}/analyzers/token-filters/normalization/) | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization`: [ScandinavianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. +[`pattern_capture`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/pattern-capture/) | N/A | Generates a token for every capture group in the provided regular expression. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +[`pattern_replace`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/pattern-replace/) | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +[`phonetic`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/phonetic/) | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. +[`porter_stem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/porter-stem/) | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. +[`predicate_token_filter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/predicate-token-filter/) | N/A | Removes tokens that do not match the specified predicate script. Supports only inline Painless scripts. +[`remove_duplicates`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/remove-duplicates/) | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. +[`reverse`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/reverse/) | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. +[`shingle`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/shingle/) | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but are generated using words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. +[`snowball`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/snowball/) | N/A | Stems words using a [Snowball-generated stemmer](https://snowballstem.org/). The `snowball` token filter supports using the following languages in the `language` field: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Estonian`, `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Irish`, `Italian`, `Kp`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, `Turkish`. 
+[`stemmer`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stemmer/) | N/A | Provides algorithmic stemming for the following languages used in the `language` field: `arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `dutch_kp`, `english`, `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english`, `estonian`, `finnish`, `light_finnish`, `french`, `light_french`, `minimal_french`, `galician`, `minimal_galician`, `german`, `german2`, `light_german`, `minimal_german`, `greek`, `hindi`, `hungarian`, `light_hungarian`, `indonesian`, `irish`, `italian`, `light_italian`, `latvian`, `Lithuanian`, `norwegian`, `light_norwegian`, `minimal_norwegian`, `light_nynorsk`, `minimal_nynorsk`, `portuguese`, `light_portuguese`, `minimal_portuguese`, `portuguese_rslp`, `romanian`, `russian`, `light_russian`, `sorani`, `spanish`, `light_spanish`, `swedish`, `light_swedish`, `turkish`. +[`stemmer_override`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stemmer-override/) | N/A | Overrides stemming algorithms by applying a custom mapping so that the provided terms are not stemmed. +[`stop`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stop/) | [StopFilter](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/StopFilter.html) | Removes stop words from a token stream. +[`synonym`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym/) | N/A | Supplies a synonym list for the analysis process. The synonym list is provided using a configuration file. +[`synonym_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym-graph/) | N/A | Supplies a synonym list, including multiword synonyms, for the analysis process. +[`trim`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/trim/) | [TrimFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing white space characters from each token in a stream. +[`truncate`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/truncate/) | [TruncateTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens with lengths exceeding the specified character limit. +[`unique`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/unique/) | N/A | Ensures that each token is unique by removing duplicate tokens from a stream. +[`uppercase`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/uppercase/) | [UpperCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/UpperCaseFilter.html) | Converts tokens to uppercase. +[`word_delimiter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter/) | [WordDelimiterFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. +[`word_delimiter_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter-graph/) | [WordDelimiterGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. Assigns a `positionLength` attribute to multi-position tokens.
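Most of the filters in the preceding table are wired up the same way: define a named filter instance under `analysis.filter` in the index settings and then reference it from a custom analyzer. The following request is a minimal sketch of that pattern, combining the `lowercase` and `edge_ngram` filters from the table; the index, analyzer, and filter names are illustrative only.

```json
PUT /autocomplete_sketch_index
{
  "settings": {
    "analysis": {
      "filter": {
        "my_edge_ngram": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 5
        }
      },
      "analyzer": {
        "my_autocomplete_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "my_edge_ngram"]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

You can check the output of any analyzer built this way using the `_analyze` API, as shown on the individual token filter pages that follow.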
diff --git a/_analyzers/token-filters/keep-types.md b/_analyzers/token-filters/keep-types.md new file mode 100644 index 0000000000..59e617f567 --- /dev/null +++ b/_analyzers/token-filters/keep-types.md @@ -0,0 +1,115 @@ +--- +layout: default +title: Keep types +parent: Token filters +nav_order: 180 +--- + +# Keep types token filter + +The `keep_types` token filter is used during text analysis to control which token types are kept or discarded. Different tokenizers produce different token types, for example, `<ALPHANUM>`, `<NUM>`, or `<EMAIL>`. + +The `keyword`, `simple_pattern`, and `simple_pattern_split` tokenizers do not support the `keep_types` token filter because these tokenizers do not support token type attributes. +{: .note} + +## Parameters + +The `keep_types` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`types` | Required | List of strings | The list of token types to be kept or discarded (determined by the `mode`). +`mode` | Optional | String | Whether to `include` or `exclude` the token types specified in `types`. Default is `include`. + + +## Example + +The following example request creates a new index named `test_index` and configures an analyzer with a `keep_types` filter: + +```json +PUT /test_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "keep_types_filter"] + } + }, + "filter": { + "keep_types_filter": { + "type": "keep_types", + "types": ["<ALPHANUM>"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /test_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Hello 2 world! This is an example." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "hello", + "start_offset": 0, + "end_offset": 5, + "type": "<ALPHANUM>", + "position": 0 + }, + { + "token": "world", + "start_offset": 8, + "end_offset": 13, + "type": "<ALPHANUM>", + "position": 2 + }, + { + "token": "this", + "start_offset": 15, + "end_offset": 19, + "type": "<ALPHANUM>", + "position": 3 + }, + { + "token": "is", + "start_offset": 20, + "end_offset": 22, + "type": "<ALPHANUM>", + "position": 4 + }, + { + "token": "an", + "start_offset": 23, + "end_offset": 25, + "type": "<ALPHANUM>", + "position": 5 + }, + { + "token": "example", + "start_offset": 26, + "end_offset": 33, + "type": "<ALPHANUM>", + "position": 6 + } + ] +} +``` diff --git a/_analyzers/token-filters/keep-words.md b/_analyzers/token-filters/keep-words.md new file mode 100644 index 0000000000..4a6b199e5c --- /dev/null +++ b/_analyzers/token-filters/keep-words.md @@ -0,0 +1,92 @@ +--- +layout: default +title: Keep words +parent: Token filters +nav_order: 190 +--- + +# Keep words token filter + +The `keep_words` token filter is designed to keep only certain words during the analysis process. This filter is useful if you have a large body of text but are only interested in certain keywords or terms. + +## Parameters + +The `keep_words` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`keep_words` | Required if `keep_words_path` is not configured | List of strings | The list of words to keep.
+`keep_words_path` | Required if `keep_words` is not configured | String | The path to the file containing the list of words to keep. +`keep_words_case` | Optional | Boolean | Whether to lowercase all words during comparison. Default is `false`. + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keep_words` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_keep_word": { + "tokenizer": "standard", + "filter": [ "keep_words_filter" ] + } + }, + "filter": { + "keep_words_filter": { + "type": "keep", + "keep_words": ["example", "world", "opensearch"], + "keep_words_case": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_keep_word", + "text": "Hello, world! This is an OpenSearch example." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "world", + "start_offset": 7, + "end_offset": 12, + "type": "", + "position": 1 + }, + { + "token": "OpenSearch", + "start_offset": 25, + "end_offset": 35, + "type": "", + "position": 5 + }, + { + "token": "example", + "start_offset": 36, + "end_offset": 43, + "type": "", + "position": 6 + } + ] +} +``` diff --git a/_analyzers/token-filters/keyword-marker.md b/_analyzers/token-filters/keyword-marker.md new file mode 100644 index 0000000000..0ec2cb96f5 --- /dev/null +++ b/_analyzers/token-filters/keyword-marker.md @@ -0,0 +1,127 @@ +--- +layout: default +title: Keyword marker +parent: Token filters +nav_order: 200 +--- + +# Keyword marker token filter + +The `keyword_marker` token filter is used to prevent certain tokens from being altered by stemmers or other filters. The `keyword_marker` token filter does this by marking the specified tokens as `keywords`, which prevents any stemming or other processing. This ensures that specific words remain in their original form. + +## Parameters + +The `keyword_marker` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`ignore_case` | Optional | Boolean | Whether to ignore the letter case when matching keywords. Default is `false`. +`keywords` | Required if either `keywords_path` or `keywords_pattern` is not set | List of strings | The list of tokens to mark as keywords. +`keywords_path` | Required if either `keywords` or `keywords_pattern` is not set | String | The path (relative to the `config` directory or absolute) to the list of keywords. +`keywords_pattern` | Required if either `keywords` or `keywords_path` is not set | String | A [regular expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) used for matching tokens to be marked as keywords. + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keyword_marker` filter. 
The filter marks the word `example` as a keyword: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "keyword_marker_filter", "stemmer"] + } + }, + "filter": { + "keyword_marker_filter": { + "type": "keyword_marker", + "keywords": ["example"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Favorite example" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens. Note that while the word `favorite` was stemmed, the word `example` was not stemmed because it was marked as a keyword: + +```json +{ + "tokens": [ + { + "token": "favorit", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "example", + "start_offset": 9, + "end_offset": 16, + "type": "", + "position": 1 + } + ] +} +``` + +You can further examine the impact of the `keyword_marker` token filter by adding the following parameters to the `_analyze` query: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "This is an OpenSearch example demonstrating keyword marker.", + "explain": true, + "attributes": "keyword" +} +``` +{% include copy-curl.html %} + +This will produce additional details in the response similar to the following: + +```json +{ + "name": "porter_stem", + "tokens": [ + ... + { + "token": "example", + "start_offset": 22, + "end_offset": 29, + "type": "", + "position": 4, + "keyword": true + }, + { + "token": "demonstr", + "start_offset": 30, + "end_offset": 43, + "type": "", + "position": 5, + "keyword": false + }, + ... + ] +} +``` diff --git a/_analyzers/token-filters/keyword-repeat.md b/_analyzers/token-filters/keyword-repeat.md new file mode 100644 index 0000000000..5ba15a037c --- /dev/null +++ b/_analyzers/token-filters/keyword-repeat.md @@ -0,0 +1,160 @@ +--- +layout: default +title: Keyword repeat +parent: Token filters +nav_order: 210 +--- + +# Keyword repeat token filter + +The `keyword_repeat` token filter emits the keyword version of a token into a token stream. This filter is typically used when you want to retain both the original token and its modified version after further token transformations, such as stemming or synonym expansion. The duplicated tokens allow the original, unchanged version of the token to remain in the final analysis alongside the modified versions. + +The `keyword_repeat` token filter should be placed before stemming filters. Stemming is not applied to every token, thus you may have duplicate tokens in the same position after stemming. To remove duplicate tokens, use the `remove_duplicates` token filter after the stemmer. 
+{: .note} + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keyword_repeat` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_kstem": { + "type": "kstem" + }, + "my_lowercase": { + "type": "lowercase" + } + }, + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "my_lowercase", + "keyword_repeat", + "my_kstem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "Stopped quickly" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stopped", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "stop", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "quickly", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "quick", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + } + ] +} +``` + +You can further examine the impact of the `keyword_repeat` token filter by adding the following parameters to the `_analyze` query: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "Stopped quickly", + "explain": true, + "attributes": "keyword" +} +``` +{% include copy-curl.html %} + +The response includes detailed information, such as tokenization, filtering, and the application of specific token filters: + +```json +{ + "detail": { + "custom_analyzer": true, + "charfilters": [], + "tokenizer": { + "name": "standard", + "tokens": [ + {"token": "OpenSearch","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "helped","start_offset": 11,"end_offset": 17,"type": "","position": 1}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2}, + {"token": "employers","start_offset": 23,"end_offset": 32,"type": "","position": 3} + ] + }, + "tokenfilters": [ + { + "name": "lowercase", + "tokens": [ + {"token": "opensearch","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "helped","start_offset": 11,"end_offset": 17,"type": "","position": 1}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2}, + {"token": "employers","start_offset": 23,"end_offset": 32,"type": "","position": 3} + ] + }, + { + "name": "keyword_marker_filter", + "tokens": [ + {"token": "opensearch","start_offset": 0,"end_offset": 10,"type": "","position": 0,"keyword": true}, + {"token": "helped","start_offset": 11,"end_offset": 17,"type": "","position": 1,"keyword": false}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2,"keyword": false}, + {"token": "employers","start_offset": 23,"end_offset": 32,"type": "","position": 3,"keyword": false} + ] + }, + { + "name": "kstem_filter", + "tokens": [ + {"token": "opensearch","start_offset": 0,"end_offset": 10,"type": "","position": 0,"keyword": true}, + {"token": "help","start_offset": 11,"end_offset": 17,"type": "","position": 1,"keyword": false}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2,"keyword": false}, + {"token": "employer","start_offset": 23,"end_offset": 32,"type": "","position": 3,"keyword": false} + ] + } + ] + } +} +``` \ No newline at 
end of file diff --git a/_analyzers/token-filters/kstem.md b/_analyzers/token-filters/kstem.md new file mode 100644 index 0000000000..d13fd2c675 --- /dev/null +++ b/_analyzers/token-filters/kstem.md @@ -0,0 +1,92 @@ +--- +layout: default +title: KStem +parent: Token filters +nav_order: 220 +--- + +# KStem token filter + +The `kstem` token filter is a stemming filter used to reduce words to their root forms. The filter is a lightweight algorithmic stemmer designed for the English language that performs the following stemming operations: + +- Reduces plurals to their singular form. +- Converts different verb tenses to their base form. +- Removes common derivational endings, such as "-ing" or "-ed". + +The `kstem` token filter is equivalent to the a `stemmer` filter configured with a `light_english` language. It provides a more conservative stemming compared to other stemming filters like `porter_stem`. + +The `kstem` token filter is based on the Lucene KStemFilter. For more information, see the [Lucene documentation](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html). + +## Example + +The following example request creates a new index named `my_kstem_index` and configures an analyzer with a `kstem` filter: + +```json +PUT /my_kstem_index +{ + "settings": { + "analysis": { + "filter": { + "kstem_filter": { + "type": "kstem" + } + }, + "analyzer": { + "my_kstem_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "kstem_filter" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_kstem_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_kstem_index/_analyze +{ + "analyzer": "my_kstem_analyzer", + "text": "stops stopped" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stop", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "stop", + "start_offset": 6, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/kuromoji-completion.md b/_analyzers/token-filters/kuromoji-completion.md new file mode 100644 index 0000000000..24833e92e1 --- /dev/null +++ b/_analyzers/token-filters/kuromoji-completion.md @@ -0,0 +1,127 @@ +--- +layout: default +title: Kuromoji completion +parent: Token filters +nav_order: 230 +--- + +# Kuromoji completion token filter + +The `kuromoji_completion` token filter is used to stem Katakana words in Japanese, which are often used to represent foreign words or loanwords. This filter is especially useful for autocompletion or suggest queries, in which partial matches on Katakana words can be expanded to include their full forms. + +To use this token filter, you must first install the `analysis-kuromoji` plugin on all nodes by running `bin/opensearch-plugin install analysis-kuromoji` and then restart the cluster. For more information about installing additional plugins, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/index/). 
+ +## Example + +The following example request creates a new index named `kuromoji_sample` and configures an analyzer with a `kuromoji_completion` filter: + +```json +PUT kuromoji_sample +{ + "settings": { + "index": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "kuromoji_tokenizer", + "filter": [ + "my_katakana_stemmer" + ] + } + }, + "filter": { + "my_katakana_stemmer": { + "type": "kuromoji_completion" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer with text that translates to "use a computer": + +```json +POST /kuromoji_sample/_analyze +{ + "analyzer": "my_analyzer", + "text": "コンピューターを使う" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "コンピューター", // The original Katakana word "computer". + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "konpyuーtaー", // Romanized version (Romaji) of "コンピューター". + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "konnpyuーtaー", // Another possible romanized version of "コンピューター" (with a slight variation in the spelling). + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "を", // A Japanese particle, "wo" or "o" + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "wo", // Romanized form of the particle "を" (often pronounced as "o"). + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "o", // Another version of the romanization. + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "使う", // The verb "use" in Kanji. + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + }, + { + "token": "tukau", // Romanized version of "使う" + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + }, + { + "token": "tsukau", // Another romanized version of "使う", where "tsu" is more phonetically correct + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/length.md b/_analyzers/token-filters/length.md new file mode 100644 index 0000000000..f6c5dcc706 --- /dev/null +++ b/_analyzers/token-filters/length.md @@ -0,0 +1,91 @@ +--- +layout: default +title: Length +parent: Token filters +nav_order: 240 +--- + +# Length token filter + +The `length` token filter is used to remove tokens that don't meet specified length criteria (minimum and maximum values) from the token stream. + +## Parameters + +The `length` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min` | Optional | Integer | The minimum token length. Default is `0`. +`max` | Optional | Integer | The maximum token length. Default is `Integer.MAX_VALUE` (`2147483647`). 
+ + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `length` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "only_keep_4_to_10_characters": { + "tokenizer": "whitespace", + "filter": [ "length_4_to_10" ] + } + }, + "filter": { + "length_4_to_10": { + "type": "length", + "min": 4, + "max": 10 + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "only_keep_4_to_10_characters", + "text": "OpenSearch is a great tool!" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "great", + "start_offset": 16, + "end_offset": 21, + "type": "word", + "position": 3 + }, + { + "token": "tool!", + "start_offset": 22, + "end_offset": 27, + "type": "word", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/limit.md b/_analyzers/token-filters/limit.md new file mode 100644 index 0000000000..a849f5f06b --- /dev/null +++ b/_analyzers/token-filters/limit.md @@ -0,0 +1,89 @@ +--- +layout: default +title: Limit +parent: Token filters +nav_order: 250 +--- + +# Limit token filter + +The `limit` token filter is used to limit the number of tokens passed through the analysis chain. + +## Parameters + +The `limit` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_count` | Optional | Integer | The maximum number of tokens to be generated. Default is `1`. +`consume_all_tokens` | Optional | Boolean | (Expert-level setting) Uses all tokens from the tokenizer, even if the result exceeds `max_token_count`. When this parameter is set, the output still only contains the number of tokens specified by `max_token_count`. However, all tokens generated by the tokenizer are processed. Default is `false`. + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `limit` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "three_token_limit": { + "tokenizer": "standard", + "filter": [ "custom_token_limit" ] + } + }, + "filter": { + "custom_token_limit": { + "type": "limit", + "max_token_count": 3 + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "three_token_limit", + "text": "OpenSearch is a powerful and flexible search engine." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 14, + "end_offset": 15, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/lowercase.md b/_analyzers/token-filters/lowercase.md new file mode 100644 index 0000000000..89f0f219fa --- /dev/null +++ b/_analyzers/token-filters/lowercase.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Lowercase +parent: Token filters +nav_order: 260 +--- + +# Lowercase token filter + +The `lowercase` token filter is used to convert all characters in the token stream to lowercase, making searches case insensitive. + +## Parameters + +The `lowercase` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Description +:--- | :--- | :--- + `language` | Optional | Specifies a language-specific token filter. Valid values are:
- [`greek`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)
- [`irish`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)
- [`turkish`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html).
Default is the [Lucene LowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/LowerCaseFilter.html). + +## Example + +The following example request creates a new index named `custom_lowercase_example`. It configures an analyzer with a `lowercase` filter and specifies `greek` as the `language`: + +```json +PUT /custom_lowercase_example +{ + "settings": { + "analysis": { + "analyzer": { + "greek_lowercase_example": { + "type": "custom", + "tokenizer": "standard", + "filter": ["greek_lowercase"] + } + }, + "filter": { + "greek_lowercase": { + "type": "lowercase", + "language": "greek" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /custom_lowercase_example/_analyze +{ + "analyzer": "greek_lowercase_example", + "text": "Αθήνα ΕΛΛΑΔΑ" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "αθηνα", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "ελλαδα", + "start_offset": 6, + "end_offset": 12, + "type": "", + "position": 1 + } + ] +} +``` diff --git a/_analyzers/token-filters/min-hash.md b/_analyzers/token-filters/min-hash.md new file mode 100644 index 0000000000..e4f1a8da91 --- /dev/null +++ b/_analyzers/token-filters/min-hash.md @@ -0,0 +1,138 @@ +--- +layout: default +title: Min hash +parent: Token filters +nav_order: 270 +--- + +# Min hash token filter + +The `min_hash` token filter is used to generate hashes for tokens based on a [MinHash](https://en.wikipedia.org/wiki/MinHash) approximation algorithm, which is useful for detecting similarity between documents. The `min_hash` token filter generates hashes for a set of tokens (typically from an analyzed field). + +## Parameters + +The `min_hash` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`hash_count` | Optional | Integer | The number of hash values to generate for each token. Increasing this value generally improves the accuracy of similarity estimation but increases the computational cost. Default is `1`. +`bucket_count` | Optional | Integer | The number of hash buckets to use. This affects the granularity of the hashing. A larger number of buckets provides finer granularity and reduces hash collisions but requires more memory. Default is `512`. +`hash_set_size` | Optional | Integer | The number of hashes to retain in each bucket. This can influence the hashing quality. Larger set sizes may lead to better similarity detection but consume more memory. Default is `1`. +`with_rotation` | Optional | Boolean | When set to `true`, the filter populates empty buckets with the value from the first non-empty bucket found to its circular right, provided that the `hash_set_size` is `1`. If the `bucket_count` argument exceeds `1`, this setting automatically defaults to `true`; otherwise, it defaults to `false`. 
+ +## Example + +The following example request creates a new index named `minhash_index` and configures an analyzer with a `min_hash` filter: + +```json +PUT /minhash_index +{ + "settings": { + "analysis": { + "filter": { + "minhash_filter": { + "type": "min_hash", + "hash_count": 3, + "bucket_count": 512, + "hash_set_size": 1, + "with_rotation": false + } + }, + "analyzer": { + "minhash_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "minhash_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /minhash_index/_analyze +{ + "analyzer": "minhash_analyzer", + "text": "OpenSearch is very powerful." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens (the tokens are not human readable because they represent hashes): + +```json +{ + "tokens" : [ + { + "token" : "\u0000\u0000㳠锯ੲ걌䐩䉵", + "start_offset" : 0, + "end_offset" : 27, + "type" : "MIN_HASH", + "position" : 0 + }, + { + "token" : "\u0000\u0000㳠锯ੲ걌䐩䉵", + "start_offset" : 0, + "end_offset" : 27, + "type" : "MIN_HASH", + "position" : 0 + }, + ... +``` + +In order to demonstrate the usefulness of the `min_hash` token filter, you can use the following Python script to compare the two strings using the previously created analyzer: + +```python +from opensearchpy import OpenSearch +from requests.auth import HTTPBasicAuth + +# Initialize the OpenSearch client with authentication +host = 'https://localhost:9200' # Update if using a different host/port +auth = ('admin', 'admin') # Username and password + +# Create the OpenSearch client with SSL verification turned off +client = OpenSearch( + hosts=[host], + http_auth=auth, + use_ssl=True, + verify_certs=False, # Disable SSL certificate validation + ssl_show_warn=False # Suppress SSL warnings in the output +) + +# Analyzes text and returns the minhash tokens +def analyze_text(index, text): + response = client.indices.analyze( + index=index, + body={ + "analyzer": "minhash_analyzer", + "text": text + } + ) + return [token['token'] for token in response['tokens']] + +# Analyze two similar texts +tokens_1 = analyze_text('minhash_index', 'OpenSearch is a powerful search engine.') +tokens_2 = analyze_text('minhash_index', 'OpenSearch is a very powerful search engine.') + +# Calculate Jaccard similarity +set_1 = set(tokens_1) +set_2 = set(tokens_2) +shared_tokens = set_1.intersection(set_2) +jaccard_similarity = len(shared_tokens) / len(set_1.union(set_2)) + +print(f"Jaccard Similarity: {jaccard_similarity}") +``` + +The response should contain the Jaccard similarity score: + +```yaml +Jaccard Similarity: 0.8571428571428571 +``` \ No newline at end of file diff --git a/_analyzers/token-filters/multiplexer.md b/_analyzers/token-filters/multiplexer.md new file mode 100644 index 0000000000..21597b7fc1 --- /dev/null +++ b/_analyzers/token-filters/multiplexer.md @@ -0,0 +1,165 @@ +--- +layout: default +title: Multiplexer +parent: Token filters +nav_order: 280 +--- + +# Multiplexer token filter + +The `multiplexer` token filter allows you to create multiple versions of the same token by applying different filters. This is useful when you want to analyze the same token in multiple ways. For example, you may want to analyze a token using different stemming, synonyms, or n-gram filters and use all of the generated tokens together. 
This token filter works by duplicating the token stream and applying different filters to each copy. + +The `multiplexer` token filter removes duplicate tokens from the token stream. +{: .important} + +The `multiplexer` token filter does not support multiword `synonym` or `synonym_graph` token filters or `shingle` token filters because they need to analyze not only the current token but also upcoming tokens in order to determine how to transform the input correctly. +{: .important} + +## Parameters + +The `multiplexer` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`filters` | Optional | List of strings | A comma-separated list of token filters to apply to each copy of the token stream. Default is an empty list. +`preserve_original` | Optional | Boolean | Whether to keep the original token as one of the outputs. Default is `true`. + +## Example + +The following example request creates a new index named `multiplexer_index` and configures an analyzer with a `multiplexer` filter: + +```json +PUT /multiplexer_index +{ + "settings": { + "analysis": { + "filter": { + "english_stemmer": { + "type": "stemmer", + "name": "english" + }, + "synonym_filter": { + "type": "synonym", + "synonyms": [ + "quick,fast" + ] + }, + "multiplexer_filter": { + "type": "multiplexer", + "filters": ["english_stemmer", "synonym_filter"], + "preserve_original": true + } + }, + "analyzer": { + "multiplexer_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "multiplexer_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /multiplexer_index/_analyze +{ + "analyzer": "multiplexer_analyzer", + "text": "The slow turtle hides from the quick dog" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 4, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "turtle", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 2 + }, + { + "token": "turtl", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 2 + }, + { + "token": "hides", + "start_offset": 16, + "end_offset": 21, + "type": "", + "position": 3 + }, + { + "token": "hide", + "start_offset": 16, + "end_offset": 21, + "type": "", + "position": 3 + }, + { + "token": "from", + "start_offset": 22, + "end_offset": 26, + "type": "", + "position": 4 + }, + { + "token": "the", + "start_offset": 27, + "end_offset": 30, + "type": "", + "position": 5 + }, + { + "token": "quick", + "start_offset": 31, + "end_offset": 36, + "type": "", + "position": 6 + }, + { + "token": "fast", + "start_offset": 31, + "end_offset": 36, + "type": "SYNONYM", + "position": 6 + }, + { + "token": "dog", + "start_offset": 37, + "end_offset": 40, + "type": "", + "position": 7 + } + ] +} +``` diff --git a/_analyzers/token-filters/ngram.md b/_analyzers/token-filters/ngram.md new file mode 100644 index 0000000000..c029eac26e --- /dev/null +++ b/_analyzers/token-filters/ngram.md @@ -0,0 +1,137 @@ +--- +layout: default +title: N-gram +parent: Token filters +nav_order: 290 +--- + +# N-gram token filter + +The `ngram` token filter is a powerful tool used to break down text into smaller components, known as _n-grams_, 
which can improve partial matching and fuzzy search capabilities. It works by splitting a token into smaller substrings of defined lengths. These filters are commonly used in search applications to support autocomplete, partial matches, and typo-tolerant search. For more information, see [Autocomplete functionality]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/autocomplete/) and [Did-you-mean]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/did-you-mean/). + +## Parameters + +The `ngram` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_gram` | Optional | Integer | The minimum length of the n-grams. Default is `1`. +`max_gram` | Optional | Integer | The maximum length of the n-grams. Default is `2`. +`preserve_original` | Optional | Boolean | Whether to keep the original token as one of the outputs. Default is `false`. + +## Example + +The following example request creates a new index named `ngram_example_index` and configures an analyzer with an `ngram` filter: + +```json +PUT /ngram_example_index +{ + "settings": { + "analysis": { + "filter": { + "ngram_filter": { + "type": "ngram", + "min_gram": 2, + "max_gram": 3 + } + }, + "analyzer": { + "ngram_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "ngram_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /ngram_example_index/_analyze +{ + "analyzer": "ngram_analyzer", + "text": "Search" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "se", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "sea", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ea", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ear", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ar", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "arc", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "rc", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "rch", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ch", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/normalization.md b/_analyzers/token-filters/normalization.md new file mode 100644 index 0000000000..1be08e65c2 --- /dev/null +++ b/_analyzers/token-filters/normalization.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Normalization +parent: Token filters +nav_order: 300 +--- + +# Normalization token filter + +The `normalization` token filter is designed to adjust and simplify text in a way that reduces variations, particularly variations in special characters. It is primarily used to handle variations in writing by standardizing characters in specific languages. 
+ +The following `normalization` token filters are available: + +- [arabic_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html) +- [german_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html) +- [hindi_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/hi/HindiNormalizer.html) +- [indic_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/in/IndicNormalizer.html) +- [sorani_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html) +- [persian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html) +- [scandinavian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html) +- [scandinavian_folding](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html) +- [serbian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) + + +## Example + +The following example request creates a new index named `german_normalizer_example` and configures an analyzer with a `german_normalization` filter: + +```json +PUT /german_normalizer_example +{ + "settings": { + "analysis": { + "filter": { + "german_normalizer": { + "type": "german_normalization" + } + }, + "analyzer": { + "german_normalizer_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "german_normalizer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /german_normalizer_example/_analyze +{ + "text": "Straße München", + "analyzer": "german_normalizer_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "strasse", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "munchen", + "start_offset": 7, + "end_offset": 14, + "type": "", + "position": 1 + } + ] +} +``` diff --git a/_analyzers/token-filters/pattern-capture.md b/_analyzers/token-filters/pattern-capture.md new file mode 100644 index 0000000000..cff36b583d --- /dev/null +++ b/_analyzers/token-filters/pattern-capture.md @@ -0,0 +1,97 @@ +--- +layout: default +title: Pattern capture +parent: Token filters +nav_order: 310 +--- + +# Pattern capture token filter + +The `pattern_capture` token filter is a powerful filter that uses regular expressions to capture and extract parts of text according to specific patterns. This filter can be useful when you want to extract particular parts of tokens, such as email domains, hashtags, or numbers, and reuse them for further analysis or indexing. + +## Parameters + +The `pattern_capture` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`patterns` | Required | Array of strings | An array of regular expressions used to capture parts of text. +`preserve_original` | Required | Boolean| Whether to keep the original token in the output. Default is `true`. 
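If you want to index only the captured parts and not the original token, you can set `preserve_original` to `false`. The following is a minimal sketch of such a configuration (the index name, filter name, and pattern are illustrative):

```json
PUT /domain_only_index
{
  "settings": {
    "analysis": {
      "filter": {
        "domain_capture": {
          "type": "pattern_capture",
          "preserve_original": false,
          "patterns": [
            "@(.+)$"
          ]
        }
      },
      "analyzer": {
        "domain_only_analyzer": {
          "tokenizer": "uax_url_email",
          "filter": [
            "domain_capture",
            "lowercase"
          ]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

With this configuration, an email token such as `john.doe@example.com` should produce only the captured domain, `example.com`.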
+ + +## Example + +The following example request creates a new index named `email_index` and configures an analyzer with a `pattern_capture` filter to extract the local part and domain name from an email address: + +```json +PUT /email_index +{ + "settings": { + "analysis": { + "filter": { + "email_pattern_capture": { + "type": "pattern_capture", + "preserve_original": true, + "patterns": [ + "^([^@]+)", + "@(.+)$" + ] + } + }, + "analyzer": { + "email_analyzer": { + "tokenizer": "uax_url_email", + "filter": [ + "email_pattern_capture", + "lowercase" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /email_index/_analyze +{ + "text": "john.doe@example.com", + "analyzer": "email_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "john.doe@example.com", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + }, + { + "token": "john.doe", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + }, + { + "token": "example.com", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/pattern-replace.md b/_analyzers/token-filters/pattern-replace.md new file mode 100644 index 0000000000..73ef7fa7d8 --- /dev/null +++ b/_analyzers/token-filters/pattern-replace.md @@ -0,0 +1,116 @@ +--- +layout: default +title: Pattern replace +parent: Token filters +nav_order: 320 +--- + +# Pattern replace token filter + +The `pattern_replace` token filter allows you to modify tokens using regular expressions. This filter replaces patterns in tokens with the specified values, giving you flexibility in transforming or normalizing tokens before indexing them. It's particularly useful when you need to clean or standardize text during analysis. + +## Parameters + +The `pattern_replace` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Required | String | A regular expression pattern that matches the text that needs to be replaced. +`all` | Optional | Boolean | Whether to replace all pattern matches. If `false`, only the first match is replaced. Default is `true`. +`replacement` | Optional | String | A string with which to replace the matched pattern. Default is an empty string. 
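If you want to replace only the first match within each token, you can set `all` to `false`. The following is a minimal sketch of such a configuration (the index name and filter name are illustrative):

```json
PUT /first_match_index
{
  "settings": {
    "analysis": {
      "filter": {
        "first_number_filter": {
          "type": "pattern_replace",
          "pattern": "\\d+",
          "replacement": "[NUM]",
          "all": false
        }
      },
      "analyzer": {
        "first_number_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "first_number_filter"
          ]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

With this configuration, a token such as `a1b2` should become `a[NUM]b2` rather than `a[NUM]b[NUM]`.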
+ + +## Example + +The following example request creates a new index named `text_index` and configures an analyzer with a `pattern_replace` filter to replace tokens containing digits with the string `[NUM]`: + +```json +PUT /text_index +{ + "settings": { + "analysis": { + "filter": { + "number_replace_filter": { + "type": "pattern_replace", + "pattern": "\\d+", + "replacement": "[NUM]" + } + }, + "analyzer": { + "number_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "number_replace_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /text_index/_analyze +{ + "text": "Visit us at 98765 Example St.", + "analyzer": "number_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "visit", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "us", + "start_offset": 6, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "at", + "start_offset": 9, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "[NUM]", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "example", + "start_offset": 18, + "end_offset": 25, + "type": "", + "position": 4 + }, + { + "token": "st", + "start_offset": 26, + "end_offset": 28, + "type": "", + "position": 5 + } + ] +} +``` diff --git a/_analyzers/token-filters/phonetic.md b/_analyzers/token-filters/phonetic.md new file mode 100644 index 0000000000..7fe380851f --- /dev/null +++ b/_analyzers/token-filters/phonetic.md @@ -0,0 +1,98 @@ +--- +layout: default +title: Phonetic +parent: Token filters +nav_order: 330 +--- + +# Phonetic token filter + +The `phonetic` token filter transforms tokens into their phonetic representations, enabling more flexible matching of words that sound similar but are spelled differently. This is particularly useful for searching names, brands, or other entities that users might spell differently but pronounce similarly. + +The `phonetic` token filter is not included in OpenSearch distributions by default. To use this token filter, you must first install the `analysis-phonetic` plugin as follows and then restart OpenSearch: + +```bash +./bin/opensearch-plugin install analysis-phonetic +``` +{% include copy.html %} + +For more information about installing plugins, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). +{: .note} + +## Parameters + +The `phonetic` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`encoder` | Optional | String | Specifies the phonetic algorithm to use.

Valid values are:
- `metaphone` (default)
- `double_metaphone`
- `soundex`
- `refined_soundex`
- `caverphone1`
- `caverphone2`
- `cologne`
- `nysiis`
- `koelnerphonetik`
- `haasephonetik`
- `beider_morse`
- `daitch_mokotoff ` +`replace` | Optional | Boolean | Whether to replace the original token. If `false`, the original token is included in the output along with the phonetic encoding. Default is `true`. + + +## Example + +The following example request creates a new index named `names_index` and configures an analyzer with a `phonetic` filter: + +```json +PUT /names_index +{ + "settings": { + "analysis": { + "filter": { + "my_phonetic_filter": { + "type": "phonetic", + "encoder": "double_metaphone", + "replace": true + } + }, + "analyzer": { + "phonetic_analyzer": { + "tokenizer": "standard", + "filter": [ + "my_phonetic_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated for the names `Stephen` and `Steven` using the analyzer: + +```json +POST /names_index/_analyze +{ + "text": "Stephen", + "analyzer": "phonetic_analyzer" +} +``` +{% include copy-curl.html %} + +```json +POST /names_index/_analyze +{ + "text": "Steven", + "analyzer": "phonetic_analyzer" +} +``` +{% include copy-curl.html %} + +In both cases, the response contains the same generated token: + +```json +{ + "tokens": [ + { + "token": "STFN", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/porter-stem.md b/_analyzers/token-filters/porter-stem.md new file mode 100644 index 0000000000..fa2f4208a7 --- /dev/null +++ b/_analyzers/token-filters/porter-stem.md @@ -0,0 +1,83 @@ +--- +layout: default +title: Porter stem +parent: Token filters +nav_order: 340 +--- + +# Porter stem token filter + +The `porter_stem` token filter reduces words to their base (or _stem_) form and removes common suffixes from words, which helps in matching similar words by their root. For example, the word `running` is stemmed to `run`. This token filter is primarily used for the English language and provides stemming based on the [Porter stemming algorithm](https://snowballstem.org/algorithms/porter/stemmer.html). 
+ + +## Example + +The following example request creates a new index named `my_stem_index` and configures an analyzer with a `porter_stem` filter: + +```json +PUT /my_stem_index +{ + "settings": { + "analysis": { + "filter": { + "my_porter_stem": { + "type": "porter_stem" + } + }, + "analyzer": { + "porter_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_porter_stem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_stem_index/_analyze +{ + "text": "running runners ran", + "analyzer": "porter_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "runner", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "ran", + "start_offset": 16, + "end_offset": 19, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/predicate-token-filter.md b/_analyzers/token-filters/predicate-token-filter.md new file mode 100644 index 0000000000..24729f0224 --- /dev/null +++ b/_analyzers/token-filters/predicate-token-filter.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Predicate token filter +parent: Token filters +nav_order: 340 +--- + +# Predicate token filter + +The `predicate_token_filter` evaluates whether tokens should be kept or discarded, depending on the conditions defined in a custom script. The tokens are evaluated in the analysis predicate context. This filter supports only inline Painless scripts. + +## Parameters + +The `predicate_token_filter` has one required parameter: `script`. This parameter provides a condition that is used to evaluate whether the token should be kept. + +## Example + +The following example request creates a new index named `predicate_index` and configures an analyzer with a `predicate_token_filter`. 
The filter specifies to only output tokens if they are longer than 7 characters: + +```json +PUT /predicate_index +{ + "settings": { + "analysis": { + "filter": { + "my_predicate_filter": { + "type": "predicate_token_filter", + "script": { + "source": "token.term.length() > 7" + } + } + }, + "analyzer": { + "predicate_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_predicate_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /predicate_index/_analyze +{ + "text": "The OpenSearch community is growing rapidly", + "analyzer": "predicate_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 4, + "end_offset": 14, + "type": "", + "position": 1 + }, + { + "token": "community", + "start_offset": 15, + "end_offset": 24, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/remove-duplicates.md b/_analyzers/token-filters/remove-duplicates.md new file mode 100644 index 0000000000..b0a589884a --- /dev/null +++ b/_analyzers/token-filters/remove-duplicates.md @@ -0,0 +1,152 @@ +--- +layout: default +title: Remove duplicates +parent: Token filters +nav_order: 350 +--- + +# Remove duplicates token filter + +The `remove_duplicates` token filter is used to remove duplicate tokens that are generated in the same position during analysis. + +## Example + +The following example request creates an index with a `keyword_repeat` token filter. The filter adds a `keyword` version of each token in the same position as the token itself and then uses a `kstem` to create a stemmed version of the token: + +```json +PUT /example-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "keyword_repeat", + "kstem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to analyze the string `Slower turtle`: + +```json +GET /example-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Slower turtle" +} +``` +{% include copy-curl.html %} + +The response contains the token `turtle` twice in the same position: + +```json +{ + "tokens": [ + { + "token": "slower", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` + +The duplicate token can be removed by adding a `remove_duplicates` token filter to the index settings: + +```json +PUT /index-remove-duplicate +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "keyword_repeat", + "kstem", + "remove_duplicates" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /index-remove-duplicate/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Slower turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slower", 
+ "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/reverse.md b/_analyzers/token-filters/reverse.md new file mode 100644 index 0000000000..dc48f07e77 --- /dev/null +++ b/_analyzers/token-filters/reverse.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Reverse +parent: Token filters +nav_order: 360 +--- + +# Reverse token filter + +The `reverse` token filter reverses the order of the characters in each token, making suffix information accessible at the beginning of the reversed tokens during analysis. + +This is useful for suffix-based searches: + +The `reverse` token filter is useful when you need to perform suffix-based searches, such as in the following scenarios: + +- **Suffix matching**: Searching for words based on their suffixes, such as identifying words with a specific ending (for example, `-tion` or `-ing`). +- **File extension searches**: Searching for files by their extensions, such as `.txt` or `.jpg`. +- **Custom sorting or ranking**: By reversing tokens, you can implement unique sorting or ranking logic based on suffixes. +- **Autocomplete for suffixes**: Implementing autocomplete suggestions that use suffixes rather than prefixes. + + +## Example + +The following example request creates a new index named `my-reverse-index` and configures an analyzer with a `reverse` filter: + +```json +PUT /my-reverse-index +{ + "settings": { + "analysis": { + "filter": { + "reverse_filter": { + "type": "reverse" + } + }, + "analyzer": { + "my_reverse_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "reverse_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-reverse-index/_analyze +{ + "analyzer": "my_reverse_analyzer", + "text": "hello world" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "olleh", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "dlrow", + "start_offset": 6, + "end_offset": 11, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/shingle.md b/_analyzers/token-filters/shingle.md new file mode 100644 index 0000000000..ea961bf3e0 --- /dev/null +++ b/_analyzers/token-filters/shingle.md @@ -0,0 +1,120 @@ +--- +layout: default +title: Shingle +parent: Token filters +nav_order: 370 +--- + +# Shingle token filter + +The `shingle` token filter is used to generate word n-grams, or _shingles_, from input text. For example, for the string `slow green turtle`, the `shingle` filter creates the following one- and two-word shingles: `slow`, `slow green`, `green`, `green turtle`, and `turtle`. + +This token filter is often used in conjunction with other filters to enhance search accuracy by indexing phrases rather than individual tokens. For more information, see [Phrase suggester]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/did-you-mean/#phrase-suggester). + +## Parameters + +The `shingle` token filter can be configured with the following parameters. 
+ +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_shingle_size` | Optional | Integer | The minimum number of tokens to concatenate. Default is `2`. +`max_shingle_size` | Optional | Integer | The maximum number of tokens to concatenate. Default is `2`. +`output_unigrams` | Optional | Boolean | Whether to include unigrams (individual tokens) as output. Default is `true`. +`output_unigrams_if_no_shingles` | Optional | Boolean | Whether to output unigrams if no shingles are generated. Default is `false`. +`token_separator` | Optional | String | A separator used to concatenate tokens into a shingle. Default is a space (`" "`). +`filler_token` | Optional | String | A token inserted into empty positions or gaps between tokens. Default is an underscore (`_`). + +If `output_unigrams` and `output_unigrams_if_no_shingles` are both set to `true`, `output_unigrams_if_no_shingles` is ignored. +{: .note} + +## Example + +The following example request creates a new index named `my-shingle-index` and configures an analyzer with a `shingle` filter: + +```json +PUT /my-shingle-index +{ + "settings": { + "analysis": { + "filter": { + "my_shingle_filter": { + "type": "shingle", + "min_shingle_size": 2, + "max_shingle_size": 2, + "output_unigrams": true + } + }, + "analyzer": { + "my_shingle_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_shingle_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-shingle-index/_analyze +{ + "analyzer": "my_shingle_analyzer", + "text": "slow green turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slow", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "slow green", + "start_offset": 0, + "end_offset": 10, + "type": "shingle", + "position": 0, + "positionLength": 2 + }, + { + "token": "green", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "green turtle", + "start_offset": 5, + "end_offset": 17, + "type": "shingle", + "position": 1, + "positionLength": 2 + }, + { + "token": "turtle", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/snowball.md b/_analyzers/token-filters/snowball.md new file mode 100644 index 0000000000..149486e727 --- /dev/null +++ b/_analyzers/token-filters/snowball.md @@ -0,0 +1,108 @@ +--- +layout: default +title: Snowball +parent: Token filters +nav_order: 380 +--- + +# Snowball token filter + +The `snowball` token filter is a stemming filter based on the [Snowball](https://snowballstem.org/) algorithm. It supports many languages and is more efficient and accurate than the Porter stemming algorithm. 
+ +## Parameters + +The `snowball` token filter can be configured with a `language` parameter that accepts the following values: + +- `Arabic` +- `Armenian` +- `Basque` +- `Catalan` +- `Danish` +- `Dutch` +- `English` (default) +- `Estonian` +- `Finnish` +- `French` +- `German` +- `German2` +- `Hungarian` +- `Italian` +- `Irish` +- `Kp` +- `Lithuanian` +- `Lovins` +- `Norwegian` +- `Porter` +- `Portuguese` +- `Romanian` +- `Russian` +- `Spanish` +- `Swedish` +- `Turkish` + +## Example + +The following example request creates a new index named `my-snowball-index` and configures an analyzer with a `snowball` filter: + +```json +PUT /my-snowball-index +{ + "settings": { + "analysis": { + "filter": { + "my_snowball_filter": { + "type": "snowball", + "language": "English" + } + }, + "analyzer": { + "my_snowball_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_snowball_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-snowball-index/_analyze +{ + "analyzer": "my_snowball_analyzer", + "text": "running runners" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "runner", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stemmer-override.md b/_analyzers/token-filters/stemmer-override.md new file mode 100644 index 0000000000..c06f673714 --- /dev/null +++ b/_analyzers/token-filters/stemmer-override.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Stemmer override +parent: Token filters +nav_order: 400 +--- + +# Stemmer override token filter + +The `stemmer_override` token filter allows you to define custom stemming rules that override the behavior of default stemmers like Porter or Snowball. This can be useful when you want to apply specific stemming behavior to certain words that might not be modified correctly by the standard stemming algorithms. + +## Parameters + +The `stemmer_override` token filter must be configured with exactly one of the following parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`rules` | String | Defines the override rules directly in the settings. +`rules_path` | String | Specifies the path to the file containing custom rules (mappings). The path can be either an absolute path or a path relative to the config directory. 
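If you prefer to store the rules in a file, reference it using `rules_path` instead of `rules`. The following is a minimal sketch that assumes a file named `analysis/stemmer_override_rules.txt` exists in the OpenSearch config directory on every node and contains one rule per line, for example, `running, runner => run` (the file name and index name are illustrative):

```json
PUT /stemmer-override-file-index
{
  "settings": {
    "analysis": {
      "filter": {
        "file_stemmer_override_filter": {
          "type": "stemmer_override",
          "rules_path": "analysis/stemmer_override_rules.txt"
        }
      },
      "analyzer": {
        "file_override_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "file_stemmer_override_filter"
          ]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

If the referenced file cannot be read, index creation is expected to fail.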
+ +## Example + +The following example request creates a new index named `my-index` and configures an analyzer with a `stemmer_override` filter: + +```json +PUT /my-index +{ + "settings": { + "analysis": { + "filter": { + "my_stemmer_override_filter": { + "type": "stemmer_override", + "rules": [ + "running, runner => run", + "bought => buy", + "best => good" + ] + } + }, + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_stemmer_override_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "I am a runner and bought the best shoes" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "am", + "start_offset": 2, + "end_offset": 4, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 5, + "end_offset": 6, + "type": "", + "position": 2 + }, + { + "token": "run", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 3 + }, + { + "token": "and", + "start_offset": 14, + "end_offset": 17, + "type": "", + "position": 4 + }, + { + "token": "buy", + "start_offset": 18, + "end_offset": 24, + "type": "", + "position": 5 + }, + { + "token": "the", + "start_offset": 25, + "end_offset": 28, + "type": "", + "position": 6 + }, + { + "token": "good", + "start_offset": 29, + "end_offset": 33, + "type": "", + "position": 7 + }, + { + "token": "shoes", + "start_offset": 34, + "end_offset": 39, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stemmer.md b/_analyzers/token-filters/stemmer.md new file mode 100644 index 0000000000..dd1344fcbc --- /dev/null +++ b/_analyzers/token-filters/stemmer.md @@ -0,0 +1,118 @@ +--- +layout: default +title: Stemmer +parent: Token filters +nav_order: 390 +--- + +# Stemmer token filter + +The `stemmer` token filter reduces words to their root or base form (also known as their _stem_). 
+ +## Parameters + +The `stemmer` token filter can be configured with a `language` parameter that accepts the following values: + +- Arabic: `arabic` +- Armenian: `armenian` +- Basque: `basque` +- Bengali: `bengali` +- Brazilian Portuguese: `brazilian` +- Bulgarian: `bulgarian` +- Catalan: `catalan` +- Czech: `czech` +- Danish: `danish` +- Dutch: `dutch, dutch_kp` +- English: `english` (default), `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english` +- Estonian: `estonian` +- Finnish: `finnish`, `light_finnish` +- French: `light_french`, `french`, `minimal_french` +- Galician: `galician`, `minimal_galician` (plural step only) +- German: `light_german`, `german`, `german2`, `minimal_german` +- Greek: `greek` +- Hindi: `hindi` +- Hungarian: `hungarian, light_hungarian` +- Indonesian: `indonesian` +- Irish: `irish` +- Italian: `light_italian, italian` +- Kurdish (Sorani): `sorani` +- Latvian: `latvian` +- Lithuanian: `lithuanian` +- Norwegian (Bokmål): `norwegian`, `light_norwegian`, `minimal_norwegian` +- Norwegian (Nynorsk): `light_nynorsk`, `minimal_nynorsk` +- Portuguese: `light_portuguese`, `minimal_portuguese`, `portuguese`, `portuguese_rslp` +- Romanian: `romanian` +- Russian: `russian`, `light_russian` +- Spanish: `light_spanish`, `spanish` +- Swedish: `swedish`, `light_swedish` +- Turkish: `turkish` + +You can also use the `name` parameter as an alias for the `language` parameter. If both are set, the `name` parameter is ignored. +{: .note} + +## Example + +The following example request creates a new index named `my-stemmer-index` and configures an analyzer with a `stemmer` filter: + +```json +PUT /my-stemmer-index +{ + "settings": { + "analysis": { + "filter": { + "my_english_stemmer": { + "type": "stemmer", + "language": "english" + } + }, + "analyzer": { + "my_stemmer_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_english_stemmer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-stemmer-index/_analyze +{ + "analyzer": "my_stemmer_analyzer", + "text": "running runs" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "run", + "start_offset": 8, + "end_offset": 12, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stop.md b/_analyzers/token-filters/stop.md new file mode 100644 index 0000000000..8f3e01b72d --- /dev/null +++ b/_analyzers/token-filters/stop.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Stop +parent: Token filters +nav_order: 410 +--- + +# Stop token filter + +The `stop` token filter is used to remove common words (also known as _stopwords_) from a token stream during analysis. Stopwords are typically articles and prepositions, such as `a` or `for`. These words are not significantly meaningful in search queries and are often excluded to improve search efficiency and relevance. + +The default list of English stopwords includes the following words: `a`, `an`, `and`, `are`, `as`, `at`, `be`, `but`, `by`, `for`, `if`, `in`, `into`, `is`, `it`, `no`, `not`, `of`, `on`, `or`, `such`, `that`, `the`, `their`, `then`, `there`, `these`, `they`, `this`, `to`, `was`, `will`, and `with`. 
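In addition to a predefined language list, the filter accepts a custom list of stopwords (see the parameters below). The following is a minimal sketch of such a configuration, in which only `a`, `an`, and `the` are removed (the index name, filter name, and word list are illustrative):

```json
PUT /my-custom-stopword-index
{
  "settings": {
    "analysis": {
      "filter": {
        "my_custom_stop_filter": {
          "type": "stop",
          "stopwords": [
            "a",
            "an",
            "the"
          ]
        }
      },
      "analyzer": {
        "my_custom_stop_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "my_custom_stop_filter"
          ]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}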
+ +## Parameters + +The `stop` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`stopwords` | Optional | String or list of strings | Specifies either a custom array of stopwords or a language for which to fetch the predefined Lucene stopword list:

- [`_arabic_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt)
- [`_armenian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/hy/stopwords.txt)
- [`_basque_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/eu/stopwords.txt)
- [`_bengali_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt)
- [`_brazilian_` (Brazilian Portuguese)](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/br/stopwords.txt)
- [`_bulgarian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt)
- [`_catalan_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ca/stopwords.txt)
- [`_cjk_` (Chinese, Japanese, and Korean)](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/cjk/stopwords.txt)
- [`_czech_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/cz/stopwords.txt)
- [`_danish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/danish_stop.txt)
- [`_dutch_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/dutch_stop.txt)
- [`_english_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L48) (Default)
- [`_estonian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/et/stopwords.txt)
- [`_finnish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/finnish_stop.txt)
- [`_french_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/french_stop.txt)
- [`_galician_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/gl/stopwords.txt)
- [`_german_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/german_stop.txt)
- [`_greek_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/el/stopwords.txt)
- [`_hindi_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt)
- [`_hungarian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/hungarian_stop.txt)
- [`_indonesian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt)
- [`_irish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ga/stopwords.txt)
- [`_italian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/italian_stop.txt)
- [`_latvian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt)
- [`_lithuanian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/lt/stopwords.txt)
- [`_norwegian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/norwegian_stop.txt)
- [`_persian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt)
- [`_portuguese_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/portuguese_stop.txt)
- [`_romanian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt)
- [`_russian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/russian_stop.txt)
- [`_sorani_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ckb/stopwords.txt)
- [`_spanish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/spanish_stop.txt)
- [`_swedish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/swedish_stop.txt)
- [`_thai_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt)
- [`_turkish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt) +`stopwords_path` | Optional | String | Specifies the file path (absolute or relative to the config directory) of the file containing custom stopwords. +`ignore_case` | Optional | Boolean | If `true`, stopwords will be matched regardless of their case. Default is `false`. +`remove_trailing` | Optional | Boolean | If `true`, trailing stopwords will be removed during analysis. Default is `true`. + +## Example + +The following example request creates a new index named `my-stopword-index` and configures an analyzer with a `stop` filter that uses the predefined stopword list for the English language: + +```json +PUT /my-stopword-index +{ + "settings": { + "analysis": { + "filter": { + "my_stop_filter": { + "type": "stop", + "stopwords": "_english_" + } + }, + "analyzer": { + "my_stop_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_stop_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-stopword-index/_analyze +{ + "analyzer": "my_stop_analyzer", + "text": "A quick dog jumps over the turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "quick", + "start_offset": 2, + "end_offset": 7, + "type": "", + "position": 1 + }, + { + "token": "dog", + "start_offset": 8, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "over", + "start_offset": 18, + "end_offset": 22, + "type": "", + "position": 4 + }, + { + "token": "turtle", + "start_offset": 27, + "end_offset": 33, + "type": "", + "position": 6 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/synonym-graph.md b/_analyzers/token-filters/synonym-graph.md new file mode 100644 index 0000000000..75c7c79151 --- /dev/null +++ b/_analyzers/token-filters/synonym-graph.md @@ -0,0 +1,180 @@ +--- +layout: default +title: Synonym graph +parent: Token filters +nav_order: 420 +--- + +# Synonym graph token filter + +The `synonym_graph` token filter is a more advanced version of the `synonym` token filter. It supports multiword synonyms and processes synonyms across multiple tokens, making it ideal for phrases or scenarios in which relationships between tokens are important. + +## Parameters + +The `synonym_graph` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`synonyms` | Either `synonyms` or `synonyms_path` must be specified | String | A list of synonym rules defined directly in the configuration. +`synonyms_path` | Either `synonyms` or `synonyms_path` must be specified | String | The file path to a file containing synonym rules (either an absolute path or a path relative to the config directory). +`lenient` | Optional | Boolean | Whether to ignore exceptions when loading the rule configurations. Default is `false`. +`format` | Optional | String | Specifies the format used to determine how OpenSearch defines and interprets synonyms. Valid values are:
- `solr`
- [`wordnet`](https://wordnet.princeton.edu/).
Default is `solr`. +`expand` | Optional | Boolean | Whether to expand equivalent synonym rules. Default is `true`.

For example:
If `synonyms` are defined as `"quick, fast"` and `expand` is set to `true`, then the synonym rules are configured as follows:
- `quick => quick`
- `quick => fast`
- `fast => quick`
- `fast => fast`

If `expand` is set to `false`, the synonym rules are configured as follows:
- `quick => quick`
- `fast => quick` + +## Example: Solr format + +The following example request creates a new index named `my-index` and configures an analyzer with a `synonym_graph` filter. The filter is configured with the default `solr` rule format: + +```json +PUT /my-index +{ + "settings": { + "analysis": { + "filter": { + "my_synonym_graph_filter": { + "type": "synonym_graph", + "synonyms": [ + "sports car, race car", + "fast car, speedy vehicle", + "luxury car, premium vehicle", + "electric car, EV" + ] + } + }, + "analyzer": { + "my_synonym_graph_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_graph_filter" + ] + } + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-car-index/_analyze +{ + "analyzer": "my_synonym_graph_analyzer", + "text": "I just bought a sports car and it is a fast car." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "i","start_offset": 0,"end_offset": 1,"type": "","position": 0}, + {"token": "just","start_offset": 2,"end_offset": 6,"type": "","position": 1}, + {"token": "bought","start_offset": 7,"end_offset": 13,"type": "","position": 2}, + {"token": "a","start_offset": 14,"end_offset": 15,"type": "","position": 3}, + {"token": "race","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4}, + {"token": "sports","start_offset": 16,"end_offset": 22,"type": "","position": 4,"positionLength": 2}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 5,"positionLength": 2}, + {"token": "car","start_offset": 23,"end_offset": 26,"type": "","position": 6}, + {"token": "and","start_offset": 27,"end_offset": 30,"type": "","position": 7}, + {"token": "it","start_offset": 31,"end_offset": 33,"type": "","position": 8}, + {"token": "is","start_offset": 34,"end_offset": 36,"type": "","position": 9}, + {"token": "a","start_offset": 37,"end_offset": 38,"type": "","position": 10}, + {"token": "speedy","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 11}, + {"token": "fast","start_offset": 39,"end_offset": 43,"type": "","position": 11,"positionLength": 2}, + {"token": "vehicle","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 12,"positionLength": 2}, + {"token": "car","start_offset": 44,"end_offset": 47,"type": "","position": 13} + ] +} +``` + +## Example: WordNet format + +The following example request creates a new index named `my-wordnet-index` and configures an analyzer with a `synonym_graph` filter. The filter is configured with the [`wordnet`](https://wordnet.princeton.edu/) rule format: + +```json +PUT /my-wordnet-index +{ + "settings": { + "analysis": { + "filter": { + "my_synonym_graph_filter": { + "type": "synonym_graph", + "format": "wordnet", + "synonyms": [ + "s(100000001, 1, 'sports car', n, 1, 0).", + "s(100000001, 2, 'race car', n, 1, 0).", + "s(100000001, 3, 'fast car', n, 1, 0).", + "s(100000001, 4, 'speedy vehicle', n, 1, 0)." 
+ ] + } + }, + "analyzer": { + "my_synonym_graph_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_graph_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-wordnet-index/_analyze +{ + "analyzer": "my_synonym_graph_analyzer", + "text": "I just bought a sports car and it is a fast car." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "i","start_offset": 0,"end_offset": 1,"type": "","position": 0}, + {"token": "just","start_offset": 2,"end_offset": 6,"type": "","position": 1}, + {"token": "bought","start_offset": 7,"end_offset": 13,"type": "","position": 2}, + {"token": "a","start_offset": 14,"end_offset": 15,"type": "","position": 3}, + {"token": "race","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4}, + {"token": "fast","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4,"positionLength": 2}, + {"token": "speedy","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4,"positionLength": 3}, + {"token": "sports","start_offset": 16,"end_offset": 22,"type": "","position": 4,"positionLength": 4}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 5,"positionLength": 4}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 6,"positionLength": 3}, + {"token": "vehicle","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 7,"positionLength": 2}, + {"token": "car","start_offset": 23,"end_offset": 26,"type": "","position": 8}, + {"token": "and","start_offset": 27,"end_offset": 30,"type": "","position": 9}, + {"token": "it","start_offset": 31,"end_offset": 33,"type": "","position": 10}, + {"token": "is","start_offset": 34,"end_offset": 36,"type": "","position": 11}, + {"token": "a","start_offset": 37,"end_offset": 38,"type": "","position": 12}, + {"token": "sports","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13}, + {"token": "race","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13,"positionLength": 2}, + {"token": "speedy","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13,"positionLength": 3}, + {"token": "fast","start_offset": 39,"end_offset": 43,"type": "","position": 13,"positionLength": 4}, + {"token": "car","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 14,"positionLength": 4}, + {"token": "car","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 15,"positionLength": 3}, + {"token": "vehicle","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 16,"positionLength": 2}, + {"token": "car","start_offset": 44,"end_offset": 47,"type": "","position": 17} + ] +} +``` diff --git a/_analyzers/token-filters/synonym.md b/_analyzers/token-filters/synonym.md new file mode 100644 index 0000000000..296d5cd5db --- /dev/null +++ b/_analyzers/token-filters/synonym.md @@ -0,0 +1,277 @@ +--- +layout: default +title: Synonym +parent: Token filters +nav_order: 415 +--- + +# Synonym token filter + +The `synonym` token filter allows you to map multiple terms to a single term or create equivalence groups between words, improving search flexibility. + +## Parameters + +The `synonym` token filter can be configured with the following parameters. 
+ +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`synonyms` | Either `synonyms` or `synonyms_path` must be specified | String | A list of synonym rules defined directly in the configuration. +`synonyms_path` | Either `synonyms` or `synonyms_path` must be specified | String | The file path to a file containing synonym rules (either an absolute path or a path relative to the config directory). +`lenient` | Optional | Boolean | Whether to ignore exceptions when loading the rule configurations. Default is `false`. +`format` | Optional | String | Specifies the format used to determine how OpenSearch defines and interprets synonyms. Valid values are:
- `solr`
- [`wordnet`](https://wordnet.princeton.edu/).
Default is `solr`. +`expand` | Optional | Boolean | Whether to expand equivalent synonym rules. Default is `true`.

For example:
If `synonyms` are defined as `"quick, fast"` and `expand` is set to `true`, then the synonym rules are configured as follows:
- `quick => quick`
- `quick => fast`
- `fast => quick`
- `fast => fast`

If `expand` is set to `false`, the synonym rules are configured as follows:
- `quick => quick`
- `fast => quick` + +## Example: Solr format + +The following example request creates a new index named `my-synonym-index` and configures an analyzer with a `synonym` filter. The filter is configured with the default `solr` rule format: + +```json +PUT /my-synonym-index +{ + "settings": { + "analysis": { + "filter": { + "my_synonym_filter": { + "type": "synonym", + "synonyms": [ + "car, automobile", + "quick, fast, speedy", + "laptop => computer" + ] + } + }, + "analyzer": { + "my_synonym_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-synonym-index/_analyze +{ + "analyzer": "my_synonym_analyzer", + "text": "The quick dog jumps into the car with a laptop" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "quick", + "start_offset": 4, + "end_offset": 9, + "type": "", + "position": 1 + }, + { + "token": "fast", + "start_offset": 4, + "end_offset": 9, + "type": "SYNONYM", + "position": 1 + }, + { + "token": "speedy", + "start_offset": 4, + "end_offset": 9, + "type": "SYNONYM", + "position": 1 + }, + { + "token": "dog", + "start_offset": 10, + "end_offset": 13, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 14, + "end_offset": 19, + "type": "", + "position": 3 + }, + { + "token": "into", + "start_offset": 20, + "end_offset": 24, + "type": "", + "position": 4 + }, + { + "token": "the", + "start_offset": 25, + "end_offset": 28, + "type": "", + "position": 5 + }, + { + "token": "car", + "start_offset": 29, + "end_offset": 32, + "type": "", + "position": 6 + }, + { + "token": "automobile", + "start_offset": 29, + "end_offset": 32, + "type": "SYNONYM", + "position": 6 + }, + { + "token": "with", + "start_offset": 33, + "end_offset": 37, + "type": "", + "position": 7 + }, + { + "token": "a", + "start_offset": 38, + "end_offset": 39, + "type": "", + "position": 8 + }, + { + "token": "computer", + "start_offset": 40, + "end_offset": 46, + "type": "SYNONYM", + "position": 9 + } + ] +} +``` + +## Example: WordNet format + +The following example request creates a new index named `my-wordnet-index` and configures an analyzer with a `synonym` filter. The filter is configured with the [`wordnet`](https://wordnet.princeton.edu/) rule format: + +```json +PUT /my-wordnet-index +{ + "settings": { + "analysis": { + "filter": { + "my_wordnet_synonym_filter": { + "type": "synonym", + "format": "wordnet", + "synonyms": [ + "s(100000001,1,'fast',v,1,0).", + "s(100000001,2,'quick',v,1,0).", + "s(100000001,3,'swift',v,1,0)." 
+ ] + } + }, + "analyzer": { + "my_wordnet_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_wordnet_synonym_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-wordnet-index/_analyze +{ + "analyzer": "my_wordnet_analyzer", + "text": "I have a fast car" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "have", + "start_offset": 2, + "end_offset": 6, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 7, + "end_offset": 8, + "type": "", + "position": 2 + }, + { + "token": "fast", + "start_offset": 9, + "end_offset": 13, + "type": "", + "position": 3 + }, + { + "token": "quick", + "start_offset": 9, + "end_offset": 13, + "type": "SYNONYM", + "position": 3 + }, + { + "token": "swift", + "start_offset": 9, + "end_offset": 13, + "type": "SYNONYM", + "position": 3 + }, + { + "token": "car", + "start_offset": 14, + "end_offset": 17, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/trim.md b/_analyzers/token-filters/trim.md new file mode 100644 index 0000000000..cdfebed52f --- /dev/null +++ b/_analyzers/token-filters/trim.md @@ -0,0 +1,93 @@ +--- +layout: default +title: Trim +parent: Token filters +nav_order: 430 +--- + +# Trim token filter + +The `trim` token filter removes leading and trailing white space characters from tokens. + +Many popular tokenizers, such as `standard`, `keyword`, and `whitespace` tokenizers, automatically strip leading and trailing white space characters during tokenization. When using these tokenizers, there is no need to configure an additional `trim` token filter. 
+{: .note} + + +## Example + +The following example request creates a new index named `my_pattern_trim_index` and configures an analyzer with a `trim` filter and a `pattern` tokenizer, which does not remove leading and trailing white space characters: + +```json +PUT /my_pattern_trim_index +{ + "settings": { + "analysis": { + "filter": { + "my_trim_filter": { + "type": "trim" + } + }, + "tokenizer": { + "my_pattern_tokenizer": { + "type": "pattern", + "pattern": "," + } + }, + "analyzer": { + "my_pattern_trim_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_tokenizer", + "filter": [ + "lowercase", + "my_trim_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_pattern_trim_index/_analyze +{ + "analyzer": "my_pattern_trim_analyzer", + "text": " OpenSearch , is , powerful " +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 12, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 13, + "end_offset": 18, + "type": "word", + "position": 1 + }, + { + "token": "powerful", + "start_offset": 19, + "end_offset": 32, + "type": "word", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/truncate.md b/_analyzers/token-filters/truncate.md new file mode 100644 index 0000000000..16d1452901 --- /dev/null +++ b/_analyzers/token-filters/truncate.md @@ -0,0 +1,107 @@ +--- +layout: default +title: Truncate +parent: Token filters +nav_order: 440 +--- + +# Truncate token filter + +The `truncate` token filter is used to shorten tokens exceeding a specified length. It trims tokens to a maximum number of characters, ensuring that tokens exceeding this limit are truncated. + +## Parameters + +The `truncate` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`length` | Optional | Integer | Specifies the maximum length of the generated token. Default is `10`. 
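Because the default `length` is `10`, a longer token such as `internationalization` is cut to its first 10 characters (`internatio`), while shorter tokens pass through unchanged. As a quick sketch, you can test this using the `_analyze` API with the built-in `truncate` filter (the sample text is illustrative, and this assumes the prebuilt filter is available under that name):

```json
GET /_analyze
{
  "tokenizer": "standard",
  "filter": [ "truncate" ],
  "text": "internationalization"
}
```
{% include copy-curl.html %}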
+ +## Example + +The following example request creates a new index named `truncate_example` and configures an analyzer with a `truncate` filter: + +```json +PUT /truncate_example +{ + "settings": { + "analysis": { + "filter": { + "truncate_filter": { + "type": "truncate", + "length": 5 + } + }, + "analyzer": { + "truncate_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "truncate_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /truncate_example/_analyze +{ + "analyzer": "truncate_analyzer", + "text": "OpenSearch is powerful and scalable" +} + +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opens", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "power", + "start_offset": 14, + "end_offset": 22, + "type": "", + "position": 2 + }, + { + "token": "and", + "start_offset": 23, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "scala", + "start_offset": 27, + "end_offset": 35, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/unique.md b/_analyzers/token-filters/unique.md new file mode 100644 index 0000000000..c4dfcbab16 --- /dev/null +++ b/_analyzers/token-filters/unique.md @@ -0,0 +1,106 @@ +--- +layout: default +title: Unique +parent: Token filters +nav_order: 450 +--- + +# Unique token filter + +The `unique` token filter ensures that only unique tokens are kept during the analysis process, removing duplicate tokens that appear within a single field or text block. + +## Parameters + +The `unique` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`only_on_same_position` | Optional | Boolean | If `true`, the token filter acts as a `remove_duplicates` token filter and only removes tokens that are in the same position. Default is `false`. 
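+
+The main example below uses the default setting. To see the effect of `only_on_same_position`, you can also test the filter directly in the Analyze API using a transient analyzer. The following request is a minimal sketch (the sample text and filter settings are illustrative):
+
+```json
+GET /_analyze
+{
+  "tokenizer": "standard",
+  "filter": [
+    "lowercase",
+    { "type": "unique", "only_on_same_position": true }
+  ],
+  "text": "OpenSearch OpenSearch is powerful powerful"
+}
+```
+{% include copy-curl.html %}
+
+Because the repeated tokens occupy different positions, they are all retained when `only_on_same_position` is `true`. With the default setting of `false`, duplicates are removed across the entire token stream, as shown in the following example.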
+ +## Example + +The following example request creates a new index named `unique_example` and configures an analyzer with a `unique` filter: + +```json +PUT /unique_example +{ + "settings": { + "analysis": { + "filter": { + "unique_filter": { + "type": "unique", + "only_on_same_position": false + } + }, + "analyzer": { + "unique_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "unique_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /unique_example/_analyze +{ + "analyzer": "unique_analyzer", + "text": "OpenSearch OpenSearch is powerful powerful and scalable" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 22, + "end_offset": 24, + "type": "", + "position": 1 + }, + { + "token": "powerful", + "start_offset": 25, + "end_offset": 33, + "type": "", + "position": 2 + }, + { + "token": "and", + "start_offset": 43, + "end_offset": 46, + "type": "", + "position": 3 + }, + { + "token": "scalable", + "start_offset": 47, + "end_offset": 55, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/uppercase.md b/_analyzers/token-filters/uppercase.md new file mode 100644 index 0000000000..5026892400 --- /dev/null +++ b/_analyzers/token-filters/uppercase.md @@ -0,0 +1,83 @@ +--- +layout: default +title: Uppercase +parent: Token filters +nav_order: 460 +--- + +# Uppercase token filter + +The `uppercase` token filter is used to convert all tokens (words) to uppercase during analysis. + +## Example + +The following example request creates a new index named `uppercase_example` and configures an analyzer with an `uppercase` filter: + +```json +PUT /uppercase_example +{ + "settings": { + "analysis": { + "filter": { + "uppercase_filter": { + "type": "uppercase" + } + }, + "analyzer": { + "uppercase_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "uppercase_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /uppercase_example/_analyze +{ + "analyzer": "uppercase_analyzer", + "text": "OpenSearch is powerful" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OPENSEARCH", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "IS", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "POWERFUL", + "start_offset": 14, + "end_offset": 22, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/word-delimiter-graph.md b/_analyzers/token-filters/word-delimiter-graph.md new file mode 100644 index 0000000000..ac734bebeb --- /dev/null +++ b/_analyzers/token-filters/word-delimiter-graph.md @@ -0,0 +1,164 @@ +--- +layout: default +title: Word delimiter graph +parent: Token filters +nav_order: 480 +--- + +# Word delimiter graph token filter + +The `word_delimiter_graph` token filter is used to split tokens at predefined characters and also offers optional token normalization based on customizable rules. 
+ +The `word_delimiter_graph` filter is used to remove punctuation from complex identifiers like part numbers or product IDs. In such cases, it is best used with the `keyword` tokenizer. For hyphenated words, use the `synonym_graph` token filter instead of the `word_delimiter_graph` filter because users frequently search for these terms both with and without hyphens. +{: .note} + +By default, the filter applies the following rules. + +| Description | Input | Output | +|:---|:---|:---| +| Treats non-alphanumeric characters as delimiters. | `ultra-fast` | `ultra`, `fast` | +| Removes delimiters at the beginning or end of tokens. | `Z99++'Decoder'`| `Z99`, `Decoder` | +| Splits tokens when there is a transition between uppercase and lowercase letters. | `OpenSearch` | `Open`, `Search` | +| Splits tokens when there is a transition between letters and numbers. | `T1000` | `T`, `1000` | +| Removes the possessive ('s) from the end of tokens. | `John's` | `John` | + +It's important **not** to use tokenizers that strip punctuation, like the `standard` tokenizer, with this filter. Doing so may prevent proper token splitting and interfere with options like `catenate_all` or `preserve_original`. We recommend using this filter with a `keyword` or `whitespace` tokenizer. +{: .important} + +## Parameters + +You can configure the `word_delimiter_graph` token filter using the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`adjust_offsets` | Optional | Boolean | Determines whether the token offsets should be recalculated for split or concatenated tokens. When `true`, the filter adjusts the token offsets to accurately represent the token's position within the token stream. This adjustment ensures that the token's location in the text aligns with its modified form after processing, which is particularly useful for applications like highlighting or phrase queries. When `false`, the offsets remain unchanged, which may result in misalignment when the processed tokens are mapped back to their positions in the original text. If your analyzer uses filters like `trim` that change the token lengths without changing their offsets, we recommend setting this parameter to `false`. Default is `true`. +`catenate_all` | Optional | Boolean | Produces concatenated tokens from a sequence of alphanumeric parts. For example, `"quick-fast-200"` becomes `[ quickfast200, quick, fast, 200 ]`. Default is `false`. +`catenate_numbers` | Optional | Boolean | Concatenates numerical sequences. For example, `"10-20-30"` becomes `[ 102030, 10, 20, 30 ]`. Default is `false`. +`catenate_words` | Optional | Boolean | Concatenates alphabetic words. For example, `"high-speed-level"` becomes `[ highspeedlevel, high, speed, level ]`. Default is `false`. +`generate_number_parts` | Optional | Boolean | If `true`, numeric tokens (tokens consisting of numbers only) are included in the output. Default is `true`. +`generate_word_parts` | Optional | Boolean | If `true`, alphabetical tokens (tokens consisting of alphabetic characters only) are included in the output. Default is `true`. +`ignore_keywords` | Optional | Boolean | Whether to process tokens marked as keywords. Default is `false`. +`preserve_original` | Optional | Boolean | Keeps the original token (which may include non-alphanumeric delimiters) alongside the generated tokens in the output. For example, `"auto-drive-300"` becomes `[ auto-drive-300, auto, drive, 300 ]`. 
If `true`, the filter generates multi-position tokens, which are not supported for indexing. In that case, either avoid using this filter in an index analyzer or apply the `flatten_graph` filter after it. Default is `false`.
+`protected_words` | Optional | Array of strings | Specifies tokens that should not be split.
+`protected_words_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing a list of tokens that the filter should not split, with one token per line.
+`split_on_case_change` | Optional | Boolean | Splits tokens where consecutive letters have different cases (one is lowercase and the other is uppercase). For example, `"OpenSearch"` becomes `[ Open, Search ]`. Default is `true`.
+`split_on_numerics` | Optional | Boolean | Splits tokens where letters and numbers appear consecutively. For example, `"v8engine"` becomes `[ v, 8, engine ]`. Default is `true`.
+`stem_english_possessive` | Optional | Boolean | Removes English possessive endings, such as `'s`. Default is `true`.
+`type_table` | Optional | Array of strings | A custom map that specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For example, to treat a hyphen (`-`) as an alphanumeric character, specify `["- => ALPHA"]` so that words are not split at hyphens. Valid types are:
- `ALPHA`: alphabetical
- `ALPHANUM`: alphanumeric
- `DIGIT`: numeric
- `LOWER`: lowercase alphabetical
- `SUBWORD_DELIM`: non-alphanumeric delimiter
- `UPPER`: uppercase alphabetical +`type_table_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing a custom character map. The map specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For valid types, see `type_table`. + +## Example + +The following example request creates a new index named `my-custom-index` and configures an analyzer with a `word_delimiter_graph` filter: + +```json +PUT /my-custom-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "tokenizer": "keyword", + "filter": [ "custom_word_delimiter_filter" ] + } + }, + "filter": { + "custom_word_delimiter_filter": { + "type": "word_delimiter_graph", + "split_on_case_change": true, + "split_on_numerics": true, + "stem_english_possessive": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-custom-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "FastCar's Model2023" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Fast", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "Car", + "start_offset": 4, + "end_offset": 7, + "type": "word", + "position": 1 + }, + { + "token": "Model", + "start_offset": 10, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "2023", + "start_offset": 15, + "end_offset": 19, + "type": "word", + "position": 3 + } + ] +} +``` + + +## Differences between the word_delimiter_graph and word_delimiter filters + + +Both the `word_delimiter_graph` and `word_delimiter` token filters generate tokens spanning multiple positions when any of the following parameters are set to `true`: + +- `catenate_all` +- `catenate_numbers` +- `catenate_words` +- `preserve_original` + +To illustrate the differences between these filters, consider the input text `Pro-XT500`. + + +### word_delimiter_graph + + +The `word_delimiter_graph` filter assigns a `positionLength` attribute to multi-position tokens, indicating how many positions a token spans. This ensures that the filter always generates valid token graphs, making it suitable for use in advanced token graph scenarios. Although token graphs with multi-position tokens are not supported for indexing, they can still be useful in search scenarios. For example, queries like `match_phrase` can use these graphs to generate multiple subqueries from a single input string. For the example input text, the `word_delimiter_graph` filter generates the following tokens: + +- `Pro` (position 1) +- `XT500` (position 2) +- `ProXT500` (position 1, `positionLength`: 2) + +The `positionLength` attribute the production of a valid graph to be used in advanced queries. + + +### word_delimiter + + +In contrast, the `word_delimiter` filter does not assign a `positionLength` attribute to multi-position tokens, leading to invalid graphs when these tokens are present. For the example input text, the `word_delimiter` filter generates the following tokens: + +- `Pro` (position 1) +- `XT500` (position 2) +- `ProXT500` (position 1, no `positionLength`) + +The lack of a `positionLength` attribute results in a token graph that is invalid for token streams containing multi-position tokens. 
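+
+To inspect the `positionLength` attribute yourself, you can pass a transient analyzer to the Analyze API with `explain` enabled. The following request is a minimal sketch (the sample text and filter settings are illustrative and chosen so that `XT500` remains a single part):
+
+```json
+GET /_analyze
+{
+  "tokenizer": "whitespace",
+  "filter": [
+    {
+      "type": "word_delimiter_graph",
+      "catenate_all": true,
+      "split_on_numerics": false
+    }
+  ],
+  "text": "Pro-XT500",
+  "explain": true,
+  "attributes": [ "positionLength" ]
+}
+```
+{% include copy-curl.html %}
+
+In the `explain` output, the concatenated token `ProXT500` reports a `positionLength` of `2`, while `Pro` and `XT500` each span a single position.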
\ No newline at end of file diff --git a/_analyzers/token-filters/word-delimiter.md b/_analyzers/token-filters/word-delimiter.md new file mode 100644 index 0000000000..d820fae2a0 --- /dev/null +++ b/_analyzers/token-filters/word-delimiter.md @@ -0,0 +1,128 @@ +--- +layout: default +title: Word delimiter +parent: Token filters +nav_order: 470 +--- + +# Word delimiter token filter + +The `word_delimiter` token filter is used to split tokens at predefined characters and also offers optional token normalization based on customizable rules. + +We recommend using the `word_delimiter_graph` filter instead of the `word_delimiter` filter whenever possible because the `word_delimiter` filter sometimes produces invalid token graphs. For more information about the differences between the two filters, see [Differences between the `word_delimiter_graph` and `word_delimiter` filters]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter-graph/#differences-between-the-word_delimiter_graph-and-word_delimiter-filters). +{: .important} + +The `word_delimiter` filter is used to remove punctuation from complex identifiers like part numbers or product IDs. In such cases, it is best used with the `keyword` tokenizer. For hyphenated words, use the `synonym_graph` token filter instead of the `word_delimiter` filter because users frequently search for these terms both with and without hyphens. +{: .note} + +By default, the filter applies the following rules. + +| Description | Input | Output | +|:---|:---|:---| +| Treats non-alphanumeric characters as delimiters. | `ultra-fast` | `ultra`, `fast` | +| Removes delimiters at the beginning or end of tokens. | `Z99++'Decoder'`| `Z99`, `Decoder` | +| Splits tokens when there is a transition between uppercase and lowercase letters. | `OpenSearch` | `Open`, `Search` | +| Splits tokens when there is a transition between letters and numbers. | `T1000` | `T`, `1000` | +| Removes the possessive ('s) from the end of tokens. | `John's` | `John` | + +It's important **not** to use tokenizers that strip punctuation, like the `standard` tokenizer, with this filter. Doing so may prevent proper token splitting and interfere with options like `catenate_all` or `preserve_original`. We recommend using this filter with a `keyword` or `whitespace` tokenizer. +{: .important} + +## Parameters + +You can configure the `word_delimiter` token filter using the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`catenate_all` | Optional | Boolean | Produces concatenated tokens from a sequence of alphanumeric parts. For example, `"quick-fast-200"` becomes `[ quickfast200, quick, fast, 200 ]`. Default is `false`. +`catenate_numbers` | Optional | Boolean | Concatenates numerical sequences. For example, `"10-20-30"` becomes `[ 102030, 10, 20, 30 ]`. Default is `false`. +`catenate_words` | Optional | Boolean | Concatenates alphabetic words. For example, `"high-speed-level"` becomes `[ highspeedlevel, high, speed, level ]`. Default is `false`. +`generate_number_parts` | Optional | Boolean | If `true`, numeric tokens (tokens consisting of numbers only) are included in the output. Default is `true`. +`generate_word_parts` | Optional | Boolean | If `true`, alphabetical tokens (tokens consisting of alphabetic characters only) are included in the output. Default is `true`. +`preserve_original` | Optional | Boolean | Keeps the original token (which may include non-alphanumeric delimiters) alongside the generated tokens in the output. 
For example, `"auto-drive-300"` becomes `[ auto-drive-300, auto, drive, 300 ]`. If `true`, the filter generates multi-position tokens not supported by indexing, so do not use this filter in an index analyzer or use the `flatten_graph` filter after this filter. Default is `false`. +`protected_words` | Optional | Array of strings | Specifies tokens that should not be split. +`protected_words_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing tokens that should not be separated by new lines. +`split_on_case_change` | Optional | Boolean | Splits tokens where consecutive letters have different cases (one is lowercase and the other is uppercase). For example, `"OpenSearch"` becomes `[ Open, Search ]`. Default is `true`. +`split_on_numerics` | Optional | Boolean | Splits tokens where there are consecutive letters and numbers. For example `"v8engine"` will become `[ v, 8, engine ]`. Default is `true`. +`stem_english_possessive` | Optional | Boolean | Removes English possessive endings, such as `'s`. Default is `true`. +`type_table` | Optional | Array of strings | A custom map that specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For example, to treat a hyphen (`-`) as an alphanumeric character, specify `["- => ALPHA"]` so that words are not split at hyphens. Valid types are:
- `ALPHA`: alphabetical
- `ALPHANUM`: alphanumeric
- `DIGIT`: numeric
- `LOWER`: lowercase alphabetical
- `SUBWORD_DELIM`: non-alphanumeric delimiter
- `UPPER`: uppercase alphabetical +`type_table_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing a custom character map. The map specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For valid types, see `type_table`. + +## Example + +The following example request creates a new index named `my-custom-index` and configures an analyzer with a `word_delimiter` filter: + +```json +PUT /my-custom-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "tokenizer": "keyword", + "filter": [ "custom_word_delimiter_filter" ] + } + }, + "filter": { + "custom_word_delimiter_filter": { + "type": "word_delimiter", + "split_on_case_change": true, + "split_on_numerics": true, + "stem_english_possessive": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-custom-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "FastCar's Model2023" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Fast", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "Car", + "start_offset": 4, + "end_offset": 7, + "type": "word", + "position": 1 + }, + { + "token": "Model", + "start_offset": 10, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "2023", + "start_offset": 15, + "end_offset": 19, + "type": "word", + "position": 3 + } + ] +} +``` diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md index 1abc5ee7ff..1f9e49c855 100644 --- a/_analyzers/tokenizers/index.md +++ b/_analyzers/tokenizers/index.md @@ -4,6 +4,8 @@ title: Tokenizers nav_order: 60 has_children: true has_toc: false +redirect_from: + - /analyzers/tokenizers/index/ --- # Tokenizers diff --git a/_api-reference/analyze-apis.md b/_api-reference/analyze-apis.md index ac8e9e249f..5a63f665d9 100644 --- a/_api-reference/analyze-apis.md +++ b/_api-reference/analyze-apis.md @@ -22,7 +22,7 @@ If you use the Security plugin, you must have the `manage index` privilege. If y ## Path and HTTP methods -``` +```json GET /_analyze GET /{index}/_analyze POST /_analyze @@ -81,7 +81,7 @@ text | String or Array of Strings | Text to analyze. If you provide an array of [Set a token limit](#set-a-token-limit) -#### Analyze array of text strings +### Analyze array of text strings When you pass an array of strings to the `text` field, it is analyzed as a multi-value field. @@ -145,7 +145,7 @@ The previous request returns the following fields: } ```` -#### Apply a built-in analyzer +### Apply a built-in analyzer If you omit the `index` path parameter, you can apply any of the built-in analyzers to the text string. @@ -190,7 +190,7 @@ The previous request returns the following fields: } ```` -#### Apply a custom analyzer +### Apply a custom analyzer You can create your own analyzer and specify it in an analyze request. @@ -244,7 +244,7 @@ The previous request returns the following fields: } ```` -#### Apply a custom transient analyzer +### Apply a custom transient analyzer You can build a custom transient analyzer from tokenizers, token filters, or character filters. Use the `filter` parameter to specify token filters. 
@@ -373,7 +373,7 @@ The previous request returns the following fields: } ```` -#### Specify an index +### Specify an index You can analyze text using an index's default analyzer, or you can specify a different analyzer. @@ -446,7 +446,7 @@ The previous request returns the following fields: } ```` -#### Derive the analyzer from an index field +### Derive the analyzer from an index field You can pass text and a field in the index. The API looks up the field's analyzer and uses it to analyze the text. @@ -493,7 +493,7 @@ The previous request returns the following fields: } ```` -#### Specify a normalizer +### Specify a normalizer Instead of using a keyword field, you can use the normalizer associated with the index. A normalizer causes the analysis change to produce a single token. @@ -557,7 +557,7 @@ The previous request returns the following fields: } ```` -#### Get token details +### Get token details You can obtain additional details for all tokens by setting the `explain` attribute to `true`. @@ -640,7 +640,7 @@ The previous request returns the following fields: } ```` -#### Set a token limit +### Set a token limit You can set a limit to the number of tokens generated. Setting a lower value reduces a node's memory usage. The default value is 10000. @@ -659,7 +659,7 @@ PUT /books2 The preceding request is an index API rather than an analyze API. See [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings) for additional details. {: .note} -### Response fields +## Response body fields The text analysis endpoints return the following response fields. diff --git a/_api-reference/cat/cat-aliases.md b/_api-reference/cat/cat-aliases.md index 2d5c5c300a..950d497351 100644 --- a/_api-reference/cat/cat-aliases.md +++ b/_api-reference/cat/cat-aliases.md @@ -18,17 +18,13 @@ The CAT aliases operation lists the mapping of aliases to indexes, plus routing ## Path and HTTP methods -``` +```json GET _cat/aliases/ GET _cat/aliases ``` +{% include copy-curl.html %} - -## URL parameters - -All CAT aliases URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- diff --git a/_api-reference/cat/cat-allocation.md b/_api-reference/cat/cat-allocation.md index 085a755dc1..a57c861a4b 100644 --- a/_api-reference/cat/cat-allocation.md +++ b/_api-reference/cat/cat-allocation.md @@ -17,16 +17,12 @@ The CAT allocation operation lists the allocation of disk space for indexes and ## Path and HTTP methods -``` +```json GET _cat/allocation?v GET _cat/allocation/ ``` -## URL parameters - -All CAT allocation URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- diff --git a/_api-reference/cat/cat-cluster_manager.md b/_api-reference/cat/cat-cluster_manager.md index d81e334009..1b75074e12 100644 --- a/_api-reference/cat/cat-cluster_manager.md +++ b/_api-reference/cat/cat-cluster_manager.md @@ -17,15 +17,11 @@ The CAT cluster manager operation lists information that helps identify the elec ## Path and HTTP methods -``` +```json GET _cat/cluster_manager ``` -## URL parameters - -All CAT cluster manager URL parameters are optional. 
- -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- diff --git a/_api-reference/cat/cat-count.md b/_api-reference/cat/cat-count.md index 8d0b4fbad2..94a422d061 100644 --- a/_api-reference/cat/cat-count.md +++ b/_api-reference/cat/cat-count.md @@ -18,15 +18,11 @@ The CAT count operation lists the number of documents in your cluster. ## Path and HTTP methods -``` +```json GET _cat/count?v GET _cat/count/?v ``` -## URL parameters - -All CAT count URL parameters are optional. You can specify any of the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index). - ## Example requests ```json diff --git a/_api-reference/cat/cat-field-data.md b/_api-reference/cat/cat-field-data.md index 05c720b952..3012bbbfe9 100644 --- a/_api-reference/cat/cat-field-data.md +++ b/_api-reference/cat/cat-field-data.md @@ -17,16 +17,12 @@ The CAT Field Data operation lists the memory size used by each field per node. ## Path and HTTP methods -``` +```json GET _cat/fielddata?v GET _cat/fielddata/?v ``` -## URL parameters - -All CAT fielddata URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameter: +## Query parameters Parameter | Type | Description :--- | :--- | :--- diff --git a/_api-reference/cat/cat-health.md b/_api-reference/cat/cat-health.md index 1c400916ad..0e4b784693 100644 --- a/_api-reference/cat/cat-health.md +++ b/_api-reference/cat/cat-health.md @@ -18,14 +18,11 @@ The CAT health operation lists the status of the cluster, how long the cluster h ## Path and HTTP methods -``` +```json GET _cat/health?v ``` -{% include copy-curl.html %} -## URL parameters - -All CAT health URL parameters are optional. +## Query parameters Parameter | Type | Description :--- | :--- | :--- @@ -39,6 +36,7 @@ The following example request give cluster health information for the past 5 day ```json GET _cat/health?v&time=5d ``` +{% include copy-curl.html %} ## Example response diff --git a/_api-reference/cat/cat-indices.md b/_api-reference/cat/cat-indices.md index 16c57e5791..4bbdde573c 100644 --- a/_api-reference/cat/cat-indices.md +++ b/_api-reference/cat/cat-indices.md @@ -1,6 +1,6 @@ --- layout: default -title: CAT indices operation +title: CAT indices parent: CAT API nav_order: 25 has_children: false @@ -17,16 +17,12 @@ The CAT indices operation lists information related to indexes, that is, how muc ## Path and HTTP methods -``` +```json GET _cat/indices/ GET _cat/indices ``` -## URL parameters - -All URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index/), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- @@ -40,14 +36,14 @@ expand_wildcards | Enum | Expands wildcard expressions to concrete indexes. Comb ## Example requests -``` +```json GET _cat/indices?v ``` {% include copy-curl.html %} To limit the information to a specific index, add the index name after your query. 
-``` +```json GET _cat/indices/?v ``` {% include copy-curl.html %} @@ -66,3 +62,7 @@ GET _cat/indices/index1,index2,index3 health | status | index | uuid | pri | rep | docs.count | docs.deleted | store.size | pri.store.size green | open | movies | UZbpfERBQ1-3GSH2bnM3sg | 1 | 1 | 1 | 0 | 7.7kb | 3.8kb ``` + +## Limiting the response size + +To limit the number of indexes returned, configure the `cat.indices.response.limit.number_of_indices` setting. For more information, see [Cluster-level CAT response limit settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/cluster-settings/#cluster-level-cat-response-limit-settings). \ No newline at end of file diff --git a/_api-reference/cat/cat-nodeattrs.md b/_api-reference/cat/cat-nodeattrs.md index b09e164698..62471f3960 100644 --- a/_api-reference/cat/cat-nodeattrs.md +++ b/_api-reference/cat/cat-nodeattrs.md @@ -17,15 +17,11 @@ The CAT nodeattrs operation lists the attributes of custom nodes. ## Path and HTTP methods -``` +```json GET _cat/nodeattrs ``` -## URL parameters - -All CAT nodeattrs URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- @@ -36,7 +32,7 @@ cluster_manager_timeout | Time | The amount of time to wait for a connection to The following example request returns attributes about custom nodes: -``` +```json GET _cat/nodeattrs?v ``` {% include copy-curl.html %} diff --git a/_api-reference/cat/cat-nodes.md b/_api-reference/cat/cat-nodes.md index 5e7238a0d0..d20393a251 100644 --- a/_api-reference/cat/cat-nodes.md +++ b/_api-reference/cat/cat-nodes.md @@ -19,11 +19,11 @@ A few important node metrics are `pid`, `name`, `cluster_manager`, `ip`, `port`, ## Path and HTTP methods -``` +```json GET _cat/nodes ``` -## URL parameters +## Query parameters All CAT nodes URL parameters are optional. @@ -41,7 +41,7 @@ include_unloaded_segments | Boolean | Whether to include information from segmen The following example request lists node level information: -``` +```json GET _cat/nodes?v ``` {% include copy-curl.html %} diff --git a/_api-reference/cat/cat-pending-tasks.md b/_api-reference/cat/cat-pending-tasks.md index ea224670ac..b047dd5d62 100644 --- a/_api-reference/cat/cat-pending-tasks.md +++ b/_api-reference/cat/cat-pending-tasks.md @@ -18,15 +18,11 @@ The CAT pending tasks operation lists the progress of all pending tasks, includi ## Path and HTTP methods -``` +```json GET _cat/pending_tasks ``` -## URL parameters - -All CAT nodes URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- @@ -38,7 +34,7 @@ time | Time | Specify the units for time. For example, `5d` or `7h`. 
For more in The following example request lists the progress of all pending node tasks: -``` +```json GET _cat/pending_tasks?v ``` {% include copy-curl.html %} diff --git a/_api-reference/cat/cat-plugins.md b/_api-reference/cat/cat-plugins.md index 358eb70fbf..45866e8ebd 100644 --- a/_api-reference/cat/cat-plugins.md +++ b/_api-reference/cat/cat-plugins.md @@ -18,15 +18,15 @@ The CAT plugins operation lists the names, components, and versions of the insta ## Path and HTTP methods -``` +```json GET _cat/plugins ``` -## URL parameters +## Query parameters -All CAT plugins URL parameters are optional. +All parameters are optional. -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +In addition to the [common parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- @@ -37,7 +37,7 @@ cluster_manager_timeout | Time | The amount of time to wait for a connection to The following example request lists all installed plugins: -``` +```json GET _cat/plugins?v ``` {% include copy-curl.html %} diff --git a/_api-reference/cat/cat-recovery.md b/_api-reference/cat/cat-recovery.md index 8f251a94e0..fc29e14ac6 100644 --- a/_api-reference/cat/cat-recovery.md +++ b/_api-reference/cat/cat-recovery.md @@ -18,15 +18,11 @@ The CAT recovery operation lists all completed and ongoing index and shard recov ## Path and HTTP methods -``` +```json GET _cat/recovery ``` -## URL parameters - -All CAT recovery URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- @@ -37,14 +33,14 @@ time | Time | Specify the units for time. For example, `5d` or `7h`. For more in ## Example requests -``` +```json GET _cat/recovery?v ``` {% include copy-curl.html %} To see only the recoveries of a specific index, add the index name after your query. -``` +```json GET _cat/recovery/?v ``` {% include copy-curl.html %} diff --git a/_api-reference/cat/cat-repositories.md b/_api-reference/cat/cat-repositories.md index f0fc4bb622..c197ee5c6c 100644 --- a/_api-reference/cat/cat-repositories.md +++ b/_api-reference/cat/cat-repositories.md @@ -17,15 +17,12 @@ The CAT repositories operation lists all snapshot repositories for a cluster. ## Path and HTTP methods -``` +```json GET _cat/repositories ``` -## URL parameters - -All CAT repositories URL parameters are optional. +## Query parameters -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- @@ -36,7 +33,7 @@ cluster_manager_timeout | Time | The amount of time to wait for a connection to The following example request lists all snapshot repositories in the cluster: -``` +```json GET _cat/repositories?v ``` {% include copy-curl.html %} diff --git a/_api-reference/cat/cat-segment-replication.md b/_api-reference/cat/cat-segment-replication.md index 5900b97a7c..e943d0a451 100644 --- a/_api-reference/cat/cat-segment-replication.md +++ b/_api-reference/cat/cat-segment-replication.md @@ -24,16 +24,12 @@ GET /_cat/segment_replication/ ## Path parameters -The following table lists the available optional path parameter. 
- Parameter | Type | Description :--- | :--- | :--- `index` | String | The name of the index, or a comma-separated list or wildcard expression of index names used to filter results. If this parameter is not provided, the response contains information about all indexes in the cluster. ## Query parameters -The CAT segment replication API operation supports the following optional query parameters. - Parameter | Data type | Description :--- |:-----------| :--- `active_only` | Boolean | If `true`, the response only includes active segment replications. Defaults to `false`. diff --git a/_api-reference/cat/cat-segments.md b/_api-reference/cat/cat-segments.md index cd9eda38be..76696d3886 100644 --- a/_api-reference/cat/cat-segments.md +++ b/_api-reference/cat/cat-segments.md @@ -18,15 +18,11 @@ The cat segments operation lists Lucene segment-level information for each index ## Path and HTTP methods -``` +```json GET _cat/segments ``` -## URL parameters - -All CAT segments URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- @@ -35,21 +31,21 @@ cluster_manager_timeout | Time | The amount of time to wait for a connection to ## Example requests -``` +```json GET _cat/segments?v ``` {% include copy-curl.html %} To see only the information about segments of a specific index, add the index name after your query. -``` +```json GET _cat/segments/?v ``` {% include copy-curl.html %} If you want to get information for more than one index, separate the indexes with commas: -``` +```json GET _cat/segments/index1,index2,index3 ``` {% include copy-curl.html %} @@ -61,3 +57,7 @@ index | shard | prirep | ip | segment | generation | docs.count | docs.deleted | movies | 0 | p | 172.18.0.4 | _0 | 0 | 1 | 0 | 3.5kb | 1364 | true | true | 8.7.0 | true movies | 0 | r | 172.18.0.3 | _0 | 0 | 1 | 0 | 3.5kb | 1364 | true | true | 8.7.0 | true ``` + +## Limiting the response size + +To limit the number of indexes returned, configure the `cat.segments.response.limit.number_of_indices` setting. For more information, see [Cluster-level CAT response limit settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/cluster-settings/#cluster-level-cat-response-limit-settings). \ No newline at end of file diff --git a/_api-reference/cat/cat-shards.md b/_api-reference/cat/cat-shards.md index 56817936a6..c9677cb0ed 100644 --- a/_api-reference/cat/cat-shards.md +++ b/_api-reference/cat/cat-shards.md @@ -18,15 +18,15 @@ The CAT shards operation lists the state of all primary and replica shards and h ## Path and HTTP methods -``` +```json GET _cat/shards ``` -## URL parameters +## Query parameters -All cat shards URL parameters are optional. +All parameters are optional. 
-In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +In addition to the [common parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- @@ -66,3 +66,7 @@ index | shard | prirep | state | docs | store | ip | | node plugins | 0 | p | STARTED | 0 | 208b | 172.18.0.4 | odfe-node1 plugins | 0 | r | STARTED | 0 | 208b | 172.18.0.3 | odfe-node2 ``` + +## Limiting the response size + +To limit the number of shards returned, configure the `cat.shards.response.limit.number_of_shards` setting. For more information, see [Cluster-level CAT response limit settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/cluster-settings/#cluster-level-cat-response-limit-settings). \ No newline at end of file diff --git a/_api-reference/cat/cat-snapshots.md b/_api-reference/cat/cat-snapshots.md index 2e1bd514bf..71c3b3f75d 100644 --- a/_api-reference/cat/cat-snapshots.md +++ b/_api-reference/cat/cat-snapshots.md @@ -18,15 +18,11 @@ The CAT snapshots operation lists all snapshots for a repository. ## Path and HTTP methods -``` +```json GET _cat/snapshots ``` -## URL parameters - -All CAT snapshots URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameter: +## Query parameters Parameter | Type | Description :--- | :--- | :--- diff --git a/_api-reference/cat/cat-tasks.md b/_api-reference/cat/cat-tasks.md index 7a71b592e7..5419d5c647 100644 --- a/_api-reference/cat/cat-tasks.md +++ b/_api-reference/cat/cat-tasks.md @@ -17,15 +17,11 @@ The CAT tasks operation lists the progress of all tasks currently running on you ## Path and HTTP methods -``` +```json GET _cat/tasks ``` -## URL parameters - -All CAT tasks URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- diff --git a/_api-reference/cat/cat-templates.md b/_api-reference/cat/cat-templates.md index ba47ae711d..90b7d43fc7 100644 --- a/_api-reference/cat/cat-templates.md +++ b/_api-reference/cat/cat-templates.md @@ -18,16 +18,11 @@ The CAT templates operation lists the names, patterns, order numbers, and versio ## Path and HTTP methods -``` +```json GET _cat/templates ``` -{% include copy-curl.html %} - -## URL parameters -All CAT templates URL parameters are optional. 
- -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- @@ -38,14 +33,14 @@ cluster_manager_timeout | Time | The amount of time to wait for a connection to The following example request returns information about all templates: -``` +```json GET _cat/templates?v ``` {% include copy-curl.html %} If you want to get information for a specific template or pattern: -``` +```json GET _cat/templates/ ``` {% include copy-curl.html %} diff --git a/_api-reference/cat/cat-thread-pool.md b/_api-reference/cat/cat-thread-pool.md index de24052175..3171ae830e 100644 --- a/_api-reference/cat/cat-thread-pool.md +++ b/_api-reference/cat/cat-thread-pool.md @@ -17,15 +17,11 @@ The CAT thread pool operation lists the active, queued, and rejected threads of ## Path and HTTP methods -``` +```json GET _cat/thread_pool ``` -## URL parameters - -All CAT thread pool URL parameters are optional. - -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +## Query parameters Parameter | Type | Description :--- | :--- | :--- @@ -36,21 +32,21 @@ cluster_manager_timeout | Time | The amount of time to wait for a connection to The following example request gives information about thread pools on all nodes: -``` +```json GET _cat/thread_pool?v ``` {% include copy-curl.html %} If you want to get information for more than one thread pool, separate the thread pool names with commas: -``` +```json GET _cat/thread_pool/thread_pool_name_1,thread_pool_name_2,thread_pool_name_3 ``` {% include copy-curl.html %} If you want to limit the information to a specific thread pool, add the thread pool name after your query: -``` +```json GET _cat/thread_pool/?v ``` {% include copy-curl.html %} diff --git a/_api-reference/cluster-api/cluster-allocation.md b/_api-reference/cluster-api/cluster-allocation.md index 4ec6e27f2b..2f8bd9799c 100644 --- a/_api-reference/cluster-api/cluster-allocation.md +++ b/_api-reference/cluster-api/cluster-allocation.md @@ -17,18 +17,16 @@ The most basic cluster allocation explain request finds an unassigned shard and If you add some options, you can instead get information on a specific shard, including why OpenSearch assigned it to its current node. - ## Path and HTTP methods -``` +```json GET _cluster/allocation/explain POST _cluster/allocation/explain ``` +## Query parameters -## URL parameters - -All cluster allocation explain parameters are optional. +All parameters are optional. Parameter | Type | Description :--- | :--- | :--- @@ -36,7 +34,7 @@ include_yes_decisions | Boolean | OpenSearch makes a series of yes or no decisio include_disk_info | Boolean | Whether to include information about disk usage in the response. Default is `false`. -## Request body +## Request body fields All cluster allocation explain fields are optional. 
diff --git a/_api-reference/cluster-api/cluster-awareness.md b/_api-reference/cluster-api/cluster-awareness.md index 18259b9a98..8c162f214c 100644 --- a/_api-reference/cluster-api/cluster-awareness.md +++ b/_api-reference/cluster-api/cluster-awareness.md @@ -17,7 +17,7 @@ To control the distribution of search or HTTP traffic, you can use the weights p ## Path and HTTP methods -``` +```json PUT /_cluster/routing/awareness//weights GET /_cluster/routing/awareness//weights?local GET /_cluster/routing/awareness//weights @@ -29,7 +29,7 @@ Parameter | Type | Description :--- | :--- | :--- attribute | String | The name of the awareness attribute, usually `zone`. The attribute name must match the values listed in the request body when assigning weights to zones. -## Request body parameters +## Request body fields Parameter | Type | Description :--- | :--- | :--- @@ -51,11 +51,12 @@ In the following example request body, `zone_1` and `zone_2` receive 50 requests } ``` -## Example: Weighted round robin search +## Example requests + +### Weighted round robin search The following example request creates a round robin shard allocation for search traffic by using an undefined ratio: -#### Request ```json PUT /_cluster/routing/awareness/zone/weights @@ -71,27 +72,37 @@ PUT /_cluster/routing/awareness/zone/weights ``` {% include copy-curl.html %} -#### Response -``` -{ - "acknowledged": true -} -``` +### Getting weights for all zones + +The following example request gets weights for all zones. +```json +GET /_cluster/routing/awareness/zone/weights +``` +{% include copy-curl.html %} -## Example: Getting weights for all zones -The following example request gets weights for all zones. +### Deleting weights -#### Request +You can remove your weight ratio for each zone using the `DELETE` method: ```json -GET /_cluster/routing/awareness/zone/weights +DELETE /_cluster/routing/awareness/zone/weights ``` {% include copy-curl.html %} -#### Response +## Example responses + +OpenSearch typically responds with the following when successfully allocating shards: + +```json +{ + "acknowledged": true +} +``` + +### Getting weights for all zone OpenSearch responds with the weight of each zone: @@ -106,26 +117,7 @@ OpenSearch responds with the weight of each zone: }, "_version":1 } -``` - -## Example: Deleting weights - -You can remove your weight ratio for each zone using the `DELETE` method. -#### Request - -```json -DELETE /_cluster/routing/awareness/zone/weights -``` -{% include copy-curl.html %} - -#### Response - -```json -{ - "_version":1 -} -``` ## Next steps diff --git a/_api-reference/cluster-api/cluster-decommission.md b/_api-reference/cluster-api/cluster-decommission.md index c707e5390a..1cf17c5b6b 100644 --- a/_api-reference/cluster-api/cluster-decommission.md +++ b/_api-reference/cluster-api/cluster-decommission.md @@ -18,27 +18,27 @@ The cluster decommission operation adds support decommissioning based on awarene For more information about allocation awareness, see [Shard allocation awareness]({{site.url}}{{site.baseurl}}//opensearch/cluster/#shard-allocation-awareness). 
-## HTTP and Path methods +## Path and HTTP methods -``` +```json PUT /_cluster/decommission/awareness/{awareness_attribute_name}/{awareness_attribute_value} GET /_cluster/decommission/awareness/{awareness_attribute_name}/_status DELETE /_cluster/decommission/awareness ``` -## URL parameters +## Path parameters Parameter | Type | Description :--- | :--- | :--- awareness_attribute_name | String | The name of awareness attribute, usually `zone`. awareness_attribute_value | String | The value of the awareness attribute. For example, if you have shards allocated in two different zones, you can give each zone a value of `zone-a` or `zoneb`. The cluster decommission operation decommissions the zone listed in the method. +## Example requests -## Example: Decommissioning and recommissioning a zone +### Decommissioning and recommissioning a zone You can use the following example requests to decommission and recommission a zone: -#### Request The following example request decommissions `zone-a`: @@ -54,27 +54,29 @@ DELETE /_cluster/decommission/awareness ``` {% include copy-curl.html %} -#### Example response +### Getting zone decommission status +The following example requests returns the decommission status of all zones. ```json -{ - "acknowledged": true -} +GET /_cluster/decommission/awareness/zone/_status ``` +{% include copy-curl.html %} -## Example: Getting zone decommission status +#### Example responses -The following example requests returns the decommission status of all zones. - -#### Request +The following example response shows a successful zone decommission: ```json -GET /_cluster/decommission/awareness/zone/_status +{ + "acknowledged": true +} ``` -{% include copy-curl.html %} -#### Example response +### Getting zone decommission status + +The following example response returns the decommission status of all zones: + ```json { diff --git a/_api-reference/cluster-api/cluster-health.md b/_api-reference/cluster-api/cluster-health.md index 0fd5662d91..df8f3f24e3 100644 --- a/_api-reference/cluster-api/cluster-health.md +++ b/_api-reference/cluster-api/cluster-health.md @@ -20,7 +20,7 @@ To get the status of a specific index, provide the index name. ## Path and HTTP methods -``` +```json GET _cluster/health GET _cluster/health/ ``` @@ -98,7 +98,7 @@ The response contains cluster health information: } ``` -## Response fields +## Response body fields The following table lists all response fields. diff --git a/_api-reference/cluster-api/cluster-settings.md b/_api-reference/cluster-api/cluster-settings.md index ec682ecdbd..1e0977c56a 100644 --- a/_api-reference/cluster-api/cluster-settings.md +++ b/_api-reference/cluster-api/cluster-settings.md @@ -16,14 +16,14 @@ The cluster settings operation lets you check the current settings for your clus ## Path and HTTP methods -``` +```json GET _cluster/settings PUT _cluster/settings ``` ## Path parameters -All cluster setting parameters are optional. +All parameters are optional. Parameter | Data type | Description :--- | :--- | :--- @@ -32,9 +32,9 @@ include_defaults (GET only) | Boolean | Whether to include default settings as p cluster_manager_timeout | Time unit | The amount of time to wait for a response from the cluster manager node. Default is `30 seconds`. timeout (PUT only) | Time unit | The amount of time to wait for a response from the cluster. Default is `30 seconds`. -## Request fields +## Request body fields -The GET operation has no request body options. All cluster setting field parameters are optional. 
+The GET operation has no request body fields. All cluster setting field parameters are optional. Not all cluster settings can be updated using the cluster settings API. You will receive the error message `"setting [cluster.some.setting], not dynamically updateable"` when trying to configure these settings through the API. {: .note } diff --git a/_api-reference/cluster-api/cluster-stats.md b/_api-reference/cluster-api/cluster-stats.md index fb0ade2c6b..0a1b348a67 100644 --- a/_api-reference/cluster-api/cluster-stats.md +++ b/_api-reference/cluster-api/cluster-stats.md @@ -21,19 +21,68 @@ The cluster stats API operation returns statistics about your cluster. ```json GET _cluster/stats GET _cluster/stats/nodes/ +GET _cluster/stats//nodes/ +GET _cluster/stats///nodes/ ``` -## URL parameters +## Path parameters -All cluster stats parameters are optional. +All parameters are optional. Parameter | Type | Description :--- | :--- | :--- <node-filters> | List | A comma-separated list of [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters) that OpenSearch uses to filter results. +metric | String | A comma-separated list of [metric groups](#metric-groups), for example, `jvm,fs`. Default is all metric groups. +index_metric | String | A comma-separated list of [index metric groups](#index-metric-groups), for example, `docs,store`. Default is all index metrics. - Although the `master` node is now called `cluster_manager` for version 2.0, we retained the `master` field for backwards compatibility. If you have a node that has either a `master` role or a `cluster_manager` role, the `count` increases for both fields by 1. To see an example node count increase, see the Response sample. - {: .note } +Although the term `master` was deprecated in favor of `cluster_manager` subsequent to OpenSearch 2.0, the `master` field was retained for backward compatibility. If you have a node that has either a `master` role or a `cluster_manager` role, the `count` increases for both fields by 1. For an example node count increase, see the [example response](#example-response). +{: .note } + +### Metric groups + +The following table lists all available metric groups. + +Metric | Description +:--- |:---- +`indices` | Statistics about indexes in the cluster. +`os` | Statistics about the host OS, including load and memory. +`process` | Statistics about processes, including open file descriptors and CPU usage. +`jvm` | Statistics about the JVM, including heap usage and threads. +`fs` | Statistics about file system usage. +`plugins` | Statistics about OpenSearch plugins integrated with the nodes. +`network_types` | A list of the transport and HTTP networks connected to the nodes. +`discovery_type` | The method used by the nodes to find other nodes in the cluster. +`packaging_types` | Information about each node's OpenSearch distribution. +`ingest` | Statistics about ingest pipelines. + +### Index metric groups + +To filter the information returned for the `indices` metric, you can use specific `index_metric` values. 
These values are only supported when using the following query types: + +```json +GET _cluster/stats/_all//_nodes/ +GET _cluster/stats/indices//_nodes/ +``` + +The following index metrics are supported: + +- `shards` +- `docs` +- `store` +- `fielddata` +- `query_cache` +- `completion` +- `segments` +- `mappings` +- `analysis` + +For example, the following query requests statistics for `docs` and `search`: + +```json +GET _cluster/stats/indices/docs,segments/_nodes/_all +``` +{% include copy-curl.html %} ## Example request @@ -491,32 +540,32 @@ GET _cluster/stats/nodes/_cluster_manager Field | Description :--- | :--- -nodes | How many nodes returned in the response. -cluster_name | The cluster's name. -cluster_uuid | The cluster's uuid. -timestamp | The Unix epoch time of when the cluster was last refreshed. -status | The cluster's health status. -indices | Statistics about the indexes in the cluster. -indices.count | How many indexes are in the cluster. -indices.shards | Information about the cluster's shards. -indices.docs | How many documents are still in the cluster and how many documents are deleted. -indices.store | Information about the cluster's storage. -indices.fielddata | Information about the cluster's field data -indices.query_cache | Data about the cluster's query cache. -indices.completion | How many bytes in memory are used to complete operations. -indices.segments | Information about the cluster's segments, which are small Lucene indexes. -indices.mappings | Mappings within the cluster. -indices.analysis | Information about analyzers used in the cluster. -nodes | Statistics about the nodes in the cluster. -nodes.count | How many nodes were returned from the request. -nodes.versions | OpenSearch's version number. -nodes.os | Information about the operating systems used in the nodes. -nodes.process | The processes the returned nodes use. -nodes.jvm | Statistics about the Java Virtual Machines in use. -nodes.fs | The nodes' file storage. -nodes.plugins | The OpenSearch plugins integrated within the nodes. -nodes.network_types | The transport and HTTP networks within the nodes. -nodes.discovery_type | The method the nodes use to find other nodes within the cluster. -nodes.packaging_types | Information about the nodes' OpenSearch distribution. -nodes.ingest | Information about the nodes' ingest pipelines/nodes, if there are any. -total_time_spent | The total amount of download and upload time spent across all shards in the cluster when downloading or uploading from the remote store. +`nodes` | The number of nodes returned in the response. +`cluster_name` | The cluster's name. +`cluster_uuid` | The cluster's UUID. +`timestamp` | The Unix epoch time indicating when the cluster was last refreshed. +`status` | The cluster's health status. +`indices` | Statistics about the indexes in the cluster. +`indices.count` | The number of indexes in the cluster. +`indices.shards` | Information about the cluster's shards. +`indices.docs` | The number of documents remaining in the cluster and the number of documents that were deleted. +`indices.store` | Information about the cluster's storage. +`indices.fielddata` | Information about the cluster's field data. +`indices.query_cache` | Data about the cluster's query cache. +`indices.completion` | The number of bytes in memory that were used to complete operations. +`indices.segments` | Information about the cluster's segments, which are small Lucene indexes. +`indices.mappings` | Information about mappings in the cluster. 
+`indices.analysis` | Information about analyzers used in the cluster. +`nodes` | Statistics about the nodes in the cluster. +`nodes.count` | The number of nodes returned by the request. +`nodes.versions` | The OpenSearch version number for each node. +`nodes.os` | Information about the operating systems used by the nodes. +`nodes.process` | A list of processes used by each node. +`nodes.jvm` | Statistics about the JVMs in use. +`nodes.fs` | Information about the nodes' file storage. +`nodes.plugins` | A list of the OpenSearch plugins integrated with the nodes. +`nodes.network_types` | A list of the transport and HTTP networks connected to the nodes. +`nodes.discovery_type` | A list of methods used by the nodes to find other nodes in the cluster. +`nodes.packaging_types` | Information about each node's OpenSearch distribution. +`nodes.ingest` | Information about the nodes' ingest pipelines, if there are any. +`total_time_spent` | The total amount of download and upload time spent across all shards in the cluster when downloading or uploading from the remote store. diff --git a/_api-reference/common-parameters.md b/_api-reference/common-parameters.md index 5b536ad992..ac3efbf4bf 100644 --- a/_api-reference/common-parameters.md +++ b/_api-reference/common-parameters.md @@ -123,4 +123,17 @@ Kilometers | `km` or `kilometers` Meters | `m` or `meters` Centimeters | `cm` or `centimeters` Millimeters | `mm` or `millimeters` -Nautical miles | `NM`, `nmi`, or `nauticalmiles` \ No newline at end of file +Nautical miles | `NM`, `nmi`, or `nauticalmiles` + +## `X-Opaque-Id` header + +You can specify an opaque identifier for any request using the `X-Opaque-Id` header. This identifier is used to track tasks, deduplicate deprecation warnings in server-side logs, and differentiate between the callers sending requests to your OpenSearch cluster. Do not specify a unique value per request. + +#### Example request + +The following example adds an opaque ID to a request: + +```json +curl -H "X-Opaque-Id: my-curl-client-1" -XGET localhost:9200/_tasks +``` +{% include copy.html %} diff --git a/_api-reference/count.md b/_api-reference/count.md index 2ac336eeb0..048dad9609 100644 --- a/_api-reference/count.md +++ b/_api-reference/count.md @@ -13,7 +13,35 @@ redirect_from: The count API gives you quick access to the number of documents that match a query. You can also use it to check the document count of an index, data stream, or cluster. -## Example + +## Path and HTTP methods + +```json +GET /_count/ +POST /_count/ +``` + + +## Query parameters + +All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +`allow_no_indices` | Boolean | If false, the request returns an error if any wildcard expression or index alias targets any closed or missing indexes. Default is `false`. +`analyzer` | String | The analyzer to use in the query string. +`analyze_wildcard` | Boolean | Specifies whether to analyze wildcard and prefix queries. Default is `false`. +`default_operator` | String | Indicates whether the default operator for a string query should be `AND` or `OR`. Default is `OR`. +`df` | String | The default field in case a field prefix is not provided in the query string. +`expand_wildcards` | String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values.
Valid values are `all` (match any index), `open` (match open, non-hidden indexes), `closed` (match closed, non-hidden indexes), `hidden` (match hidden indexes), and `none` (deny wildcard expressions). Default is `open`. +`ignore_unavailable` | Boolean | Specifies whether to include missing or closed indexes in the response. Default is `false`. +`lenient` | Boolean | Specifies whether OpenSearch should accept requests if queries have format errors (for example, querying a text field for an integer). Default is `false`. +`min_score` | Float | Include only documents with a minimum `_score` value in the result. +`routing` | String | Value used to route the operation to a specific shard. +`preference` | String | Specifies which shard or node OpenSearch should perform the count operation on. +`terminate_after` | Integer | The maximum number of documents OpenSearch should process before terminating the request. + +## Example requests To see the number of documents that match a query: @@ -64,35 +92,7 @@ GET _count Alternatively, you could use the [cat indexes]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-indices/) and [cat count]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-count/) APIs to see the number of documents per index or data stream. {: .note } - -## Path and HTTP methods - -``` -GET /_count/ -POST /_count/ -``` - - -## URL parameters - -All count parameters are optional. - -Parameter | Type | Description -:--- | :--- | :--- -`allow_no_indices` | Boolean | If false, the request returns an error if any wildcard expression or index alias targets any closed or missing indexes. Default is `false`. -`analyzer` | String | The analyzer to use in the query string. -`analyze_wildcard` | Boolean | Specifies whether to analyze wildcard and prefix queries. Default is `false`. -`default_operator` | String | Indicates whether the default operator for a string query should be `AND` or `OR`. Default is `OR`. -`df` | String | The default field in case a field prefix is not provided in the query string. -`expand_wildcards` | String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values. Valid values are `all` (match any index), `open` (match open, non-hidden indexes), `closed` (match closed, non-hidden indexes), `hidden` (match hidden indexes), and `none` (deny wildcard expressions). Default is `open`. -`ignore_unavailable` | Boolean | Specifies whether to include missing or closed indexes in the response. Default is `false`. -`lenient` | Boolean | Specifies whether OpenSearch should accept requests if queries have format errors (for example, querying a text field for an integer). Default is `false`. -`min_score` | Float | Include only documents with a minimum `_score` value in the result. -`routing` | String | Value used to route the operation to a specific shard. -`preference` | String | Specifies which shard or node OpenSearch should perform the count operation on. -`terminate_after` | Integer | The maximum number of documents OpenSearch should process before terminating the request. 
- -## Response +## Example response ```json { diff --git a/_api-reference/document-apis/bulk-streaming.md b/_api-reference/document-apis/bulk-streaming.md new file mode 100644 index 0000000000..c127eab527 --- /dev/null +++ b/_api-reference/document-apis/bulk-streaming.md @@ -0,0 +1,81 @@ +--- +layout: default +title: Streaming bulk +parent: Document APIs +nav_order: 25 +redirect_from: + - /opensearch/rest-api/document-apis/bulk/streaming/ +--- + +# Streaming bulk +**Introduced 2.17.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/9065). +{: .warning} + +The streaming bulk operation lets you add, update, or delete multiple documents by streaming the request and getting the results as a streaming response. In comparison to the traditional [Bulk API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/), streaming ingestion eliminates the need to estimate the batch size (which is affected by the cluster operational state at any given time) and naturally applies backpressure between many clients and the cluster. Streaming works over HTTP/2 or HTTP/1.1 (using chunked transfer encoding), depending on the capabilities of the clients and the cluster. + +The default HTTP transport method does not support streaming. You must install the [`transport-reactor-netty4`]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/network-settings/#selecting-the-transport) HTTP transport plugin and use it as the default HTTP transport layer. Both the `transport-reactor-netty4` plugin and the Streaming Bulk API are experimental. +{: .note} + +## Path and HTTP methods + +```json +POST _bulk/stream +POST /_bulk/stream +``` + +If you specify the index in the path, then you don't need to include it in the [request body chunks]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/#request-body). + +OpenSearch also accepts PUT requests to the `_bulk/stream` path, but we highly recommend using POST. The accepted usage of PUT---adding or replacing a single resource on a given path---doesn't make sense for streaming bulk requests. +{: .note } + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +Parameter | Data type | Description +:--- | :--- | :--- +`pipeline` | String | The pipeline ID for preprocessing documents. +`refresh` | Enum | Whether to refresh the affected shards after performing the indexing operations. Default is `false`. `true` causes the changes to appear in search results immediately but degrades cluster performance. `wait_for` waits for a refresh. Requests take longer to return, but cluster performance isn't degraded. +`require_alias` | Boolean | Set to `true` to require that all actions target an index alias rather than an index. Default is `false`. +`routing` | String | Routes the request to the specified shard. +`timeout` | Time | How long to wait for the request to return. Default is `1m`. +`type` | String | (Deprecated) The default document type for documents that don't specify a type. Default is `_doc`. We highly recommend ignoring this parameter and using the `_doc` type for all indexes. +`wait_for_active_shards` | String | Specifies the number of active shards that must be available before OpenSearch processes the bulk request.
Default is `1` (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have 2 replicas distributed across 2 additional nodes in order for the request to succeed. +`batch_interval` | Time | Specifies how long bulk operations should be accumulated into a batch before sending the batch to data nodes. +`batch_size` | Integer | Specifies how many bulk operations should be accumulated into a batch before sending the batch to data nodes. Default is `1`. +{% comment %}_source | List | asdf +`_source_excludes` | List | asdf +`_source_includes` | List | asdf{% endcomment %} + +## Request body fields + +The Streaming Bulk API request body is fully compatible with the [Bulk API request body]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/#request-body), where each bulk operation (create/index/update/delete) is sent as a separate chunk. + +## Example request + +```json +curl -X POST "http://localhost:9200/_bulk/stream" -H "Transfer-Encoding: chunked" -H "Content-Type: application/json" -d' +{ "delete": { "_index": "movies", "_id": "tt2229499" } } +{ "index": { "_index": "movies", "_id": "tt1979320" } } +{ "title": "Rush", "year": 2013 } +{ "create": { "_index": "movies", "_id": "tt1392214" } } +{ "title": "Prisoners", "year": 2013 } +{ "update": { "_index": "movies", "_id": "tt0816711" } } +{ "doc" : { "title": "World War Z" } } +' +``` +{% include copy.html %} + +## Example response + +Depending on the batch settings, each streamed response chunk may report the results of one or more (batched) bulk operations. For example, for the preceding request with no batching (default), the streaming response may appear as follows: + +```json +{"took": 11, "errors": false, "items": [ { "index": {"_index": "movies", "_id": "tt1979320", "_version": 1, "result": "created", "_shards": { "total": 2, "successful": 1, "failed": 0 }, "_seq_no": 1, "_primary_term": 1, "status": 201 } } ] } +{"took": 2, "errors": true, "items": [ { "create": { "_index": "movies", "_id": "tt1392214", "status": 409, "error": { "type": "version_conflict_engine_exception", "reason": "[tt1392214]: version conflict, document already exists (current version [1])", "index": "movies", "shard": "0", "index_uuid": "yhizhusbSWmP0G7OJnmcLg" } } } ] } +{"took": 4, "errors": true, "items": [ { "update": { "_index": "movies", "_id": "tt0816711", "status": 404, "error": { "type": "document_missing_exception", "reason": "[_doc][tt0816711]: document missing", "index": "movies", "shard": "0", "index_uuid": "yhizhusbSWmP0G7OJnmcLg" } } } ] } +``` diff --git a/_api-reference/document-apis/bulk.md b/_api-reference/document-apis/bulk.md index 0475aa573d..2a2a5ef8a2 100644 --- a/_api-reference/document-apis/bulk.md +++ b/_api-reference/document-apis/bulk.md @@ -17,25 +17,11 @@ The bulk operation lets you add, update, or delete multiple documents in a singl Beginning in OpenSearch 2.9, when indexing documents using the bulk operation, the document `_id` must be 512 bytes or less in size.
{: .note} -## Example - -```json -POST _bulk -{ "delete": { "_index": "movies", "_id": "tt2229499" } } -{ "index": { "_index": "movies", "_id": "tt1979320" } } -{ "title": "Rush", "year": 2013 } -{ "create": { "_index": "movies", "_id": "tt1392214" } } -{ "title": "Prisoners", "year": 2013 } -{ "update": { "_index": "movies", "_id": "tt0816711" } } -{ "doc" : { "title": "World War Z" } } - -``` -{% include copy-curl.html %} ## Path and HTTP methods -``` +```json POST _bulk POST /_bulk ``` @@ -46,23 +32,23 @@ OpenSearch also accepts PUT requests to the `_bulk` path, but we highly recommen {: .note } -## URL parameters +## Query parameters -All bulk URL parameters are optional. +All parameters are optional. Parameter | Type | Description :--- | :--- | :--- pipeline | String | The pipeline ID for preprocessing documents. -refresh | Enum | Whether to refresh the affected shards after performing the indexing operations. Default is `false`. `true` makes the changes show up in search results immediately, but hurts cluster performance. `wait_for` waits for a refresh. Requests take longer to return, but cluster performance doesn't suffer. +refresh | Enum | Whether to refresh the affected shards after performing the indexing operations. Default is `false`. `true` causes the changes to appear in search results immediately but degrades cluster performance. `wait_for` waits for a refresh. Requests take longer to return, but cluster performance isn't degraded. require_alias | Boolean | Set to `true` to require that all actions target an index alias rather than an index. Default is `false`. routing | String | Routes the request to the specified shard. -timeout | Time | How long to wait for the request to return. Default `1m`. -type | String | (Deprecated) The default document type for documents that don't specify a type. Default is `_doc`. We highly recommend ignoring this parameter and using a type of `_doc` for all indexes. -wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the bulk request. Default is 1 (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the request to succeed. +timeout | Time | How long to wait for the request to return. Default is `1m`. +type | String | (Deprecated) The default document type for documents that don't specify a type. Default is `_doc`. We highly recommend ignoring this parameter and using the `_doc` type for all indexes. +wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the bulk request. Default is `1` (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have 2 replicas distributed across 2 additional nodes in order for the request to succeed. batch_size | Integer | **(Deprecated)** Specifies the number of documents to be batched and sent to an ingest pipeline to be processed together. Default is `2147483647` (documents are ingested by an ingest pipeline all at once). If the bulk request doesn't explicitly specify an ingest pipeline or the index doesn't have a default ingest pipeline, then this parameter is ignored. Only documents with `create`, `index`, or `update` actions can be grouped into batches.
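As an illustration of the `refresh` parameter described in the preceding table, the following minimal sketch (the document shown is hypothetical) indexes a single document and waits for the next refresh before the request returns:

```json
POST _bulk?refresh=wait_for
{ "index": { "_index": "movies", "_id": "tt0468569" } }
{ "title": "The Dark Knight", "year": 2008 }
```
{% include copy-curl.html %}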
{% comment %}_source | List | asdf -_source_excludes | list | asdf -_source_includes | list | asdf{% endcomment %} +_source_excludes | List | asdf +_source_includes | List | asdf{% endcomment %} ## Request body @@ -81,7 +67,7 @@ The optional JSON document doesn't need to be minified---spaces are fine---but i All actions support the same metadata: `_index`, `_id`, and `_require_alias`. If you don't provide an ID, OpenSearch generates one automatically, which can make it challenging to update the document at a later time. -- Create +### Create Creates a document if it doesn't already exist and returns an error otherwise. The next line must include a JSON document: @@ -90,49 +76,64 @@ All actions support the same metadata: `_index`, `_id`, and `_require_alias`. If { "title": "Prisoners", "year": 2013 } ``` -- Delete +### Delete - This action deletes a document if it exists. If the document doesn't exist, OpenSearch doesn't return an error but instead returns `not_found` under `result`. Delete actions don't require documents on the next line: +This action deletes a document if it exists. If the document doesn't exist, OpenSearch doesn't return an error but instead returns `not_found` under `result`. Delete actions don't require documents on the next line: - ```json - { "delete": { "_index": "movies", "_id": "tt2229499" } } - ``` +```json +{ "delete": { "_index": "movies", "_id": "tt2229499" } } +``` -- Index +### Index - Index actions create a document if it doesn't yet exist and replace the document if it already exists. The next line must include a JSON document: +Index actions create a document if it doesn't yet exist and replace the document if it already exists. The next line must include a JSON document: - ```json - { "index": { "_index": "movies", "_id": "tt1979320" } } - { "title": "Rush", "year": 2013} - ``` +```json +{ "index": { "_index": "movies", "_id": "tt1979320" } } +{ "title": "Rush", "year": 2013} +``` -- Update +### Update - By default, this action updates existing documents and returns an error if the document doesn't exist. The next line must include a full or partial JSON document, depending on how much of the document you want to update: +By default, this action updates existing documents and returns an error if the document doesn't exist. The next line must include a full or partial JSON document, depending on how much of the document you want to update: - ```json - { "update": { "_index": "movies", "_id": "tt0816711" } } - { "doc" : { "title": "World War Z" } } - ``` +```json +{ "update": { "_index": "movies", "_id": "tt0816711" } } +{ "doc" : { "title": "World War Z" } } +``` - To upsert a document, specify `doc_as_upsert` as `true`. If a document exists, it is updated; if it does not exist, a new document is indexed with the parameters specified in the `doc` field: +### Upsert - - Upsert - ```json - { "update": { "_index": "movies", "_id": "tt0816711" } } - { "doc" : { "title": "World War Z" }, "doc_as_upsert": true } - ``` +To upsert a document, specify `doc_as_upsert` as `true`. 
If a document exists, it is updated; if it does not exist, a new document is indexed with the parameters specified in the `doc` field: + +```json +{ "update": { "_index": "movies", "_id": "tt0816711" } } +{ "doc" : { "title": "World War Z" }, "doc_as_upsert": true } +``` - You can specify a script for more complex document updates by defining the script with the `source` or `id` from a document: +### Script +You can specify a script for more complex document updates by defining the script with the `source` or `id` from a document: +```json +{ "update": { "_index": "movies", "_id": "tt0816711" } } +{ "script" : { "source": "ctx._source.title = \"World War Z\"" } } +``` - - Script - ```json - { "update": { "_index": "movies", "_id": "tt0816711" } } - { "script" : { "source": "ctx._source.title = \"World War Z\"" } } - ``` +## Example request + +```json +POST _bulk +{ "delete": { "_index": "movies", "_id": "tt2229499" } } +{ "index": { "_index": "movies", "_id": "tt1979320" } } +{ "title": "Rush", "year": 2013 } +{ "create": { "_index": "movies", "_id": "tt1392214" } } +{ "title": "Prisoners", "year": 2013 } +{ "update": { "_index": "movies", "_id": "tt0816711" } } +{ "doc" : { "title": "World War Z" } } + +``` +{% include copy-curl.html %} ## Example response diff --git a/_api-reference/document-apis/delete-by-query.md b/_api-reference/document-apis/delete-by-query.md index 64da909aad..a55617e145 100644 --- a/_api-reference/document-apis/delete-by-query.md +++ b/_api-reference/document-apis/delete-by-query.md @@ -13,33 +13,24 @@ redirect_from: You can include a query as part of your delete request so OpenSearch deletes all documents that match that query. -## Example +## Path and HTTP methods ```json -POST sample-index1/_delete_by_query -{ - "query": { - "match": { - "movie-length": "124" - } - } -} +POST /_delete_by_query ``` -{% include copy-curl.html %} -## Path and HTTP methods +## Path parameters -``` -POST /_delete_by_query -``` +Parameter | Type | Description +:--- | :--- | :--- | :--- +<index> | String | Name or list of the data streams, indexes, or aliases to delete from. Supports wildcards. If left blank, OpenSearch searches all indexes. -## URL parameters +## Query parameters -All URL parameters are optional. +All parameters are optional. Parameter | Type | Description :--- | :--- | :--- | :--- -<index> | String | Name or list of the data streams, indexes, or aliases to delete from. Supports wildcards. If left blank, OpenSearch searches all indexes. allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. analyzer | String | The analyzer to use in the query string. analyze_wildcard | Boolean | Specifies whether to analyze wildcard and prefix queries. Default is `false`. @@ -74,7 +65,7 @@ wait_for_active_shards | String | The number of shards that must be active befor wait_for_completion | Boolean | Setting this parameter to false indicates to OpenSearch it should not wait for completion and perform this request asynchronously. Asynchronous requests run in the background, and you can use the [Tasks]({{site.url}}{{site.baseurl}}/api-reference/tasks) API to monitor progress. -## Request body +## Request body fields To search your index for specific documents, you must include a [query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index) in the request body that OpenSearch uses to match documents. 
If you don't use a query, OpenSearch treats your delete request as a simple [delete document operation]({{site.url}}{{site.baseurl}}/api-reference/document-apis/delete-document). @@ -88,6 +79,21 @@ To search your index for specific documents, you must include a [query]({{site.u } ``` +## Example request + +```json +POST sample-index1/_delete_by_query +{ + "query": { + "match": { + "movie-length": "124" + } + } +} +``` +{% include copy-curl.html %} + + ## Example response ```json { diff --git a/_api-reference/document-apis/delete-document.md b/_api-reference/document-apis/delete-document.md index ece99a28ca..85ce4bd79b 100644 --- a/_api-reference/document-apis/delete-document.md +++ b/_api-reference/document-apis/delete-document.md @@ -13,25 +13,23 @@ redirect_from: If you no longer need a document in your index, you can use the delete document API operation to delete it. -## Example - -``` -DELETE /sample-index1/_doc/1 -``` -{% include copy-curl.html %} - ## Path and HTTP methods -``` +```json DELETE //_doc/<_id> ``` -## URL parameters +## Path parameters Parameter | Type | Description | Required :--- | :--- | :--- | :--- <index> | String | The index to delete from. | Yes <_id> | String | The ID of the document to delete. | Yes + +## Query parameters + +Parameter | Type | Description | Required +:--- | :--- | :--- | :--- if_seq_no | Integer | Only perform the delete operation if the document's version number matches the specified number. | No if_primary_term | Integer | Only perform the delete operation if the document has the specified primary term. | No refresh | Enum | If true, OpenSearch refreshes shards to make the delete operation available to search results. Valid options are `true`, `false`, and `wait_for`, which tells OpenSearch to wait for a refresh before executing the operation. Default is `false`. | No @@ -41,6 +39,13 @@ version | Integer | The version of the document to delete, which must match the version_type | Enum | Retrieves a specifically typed document. Available options are `external` (retrieve the document if the specified version number is greater than the document's current version) and `external_gte` (retrieve the document if the specified version number is greater than or equal to the document's current version). For example, to delete version 3 of a document, use `/_doc/1?version=3&version_type=external`. | No wait_for_active_shards | String | The number of active shards that must be available before OpenSearch processes the delete request. Default is 1 (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the operation to succeed. | No +## Example request + +```json +DELETE /sample-index1/_doc/1 +``` +{% include copy-curl.html %} + ## Example response ```json diff --git a/_api-reference/document-apis/get-documents.md b/_api-reference/document-apis/get-documents.md index 232e9083c7..1a6bc73a12 100644 --- a/_api-reference/document-apis/get-documents.md +++ b/_api-reference/document-apis/get-documents.md @@ -30,6 +30,13 @@ GET /_source/<_id> HEAD /_source/<_id> ``` +## Path parameters + +Parameter | Type | Description | Required +:--- | :--- | :--- | :--- +<index> | String | The index to retrieve the document from. | Yes +<_id> | String | The ID of the document to retrieve. | Yes + ## Query parameters All query parameters are optional. 
@@ -151,5 +158,5 @@ _seq_no | The sequence number assigned when the document is indexed. primary_term | The primary term assigned when the document is indexed. found | Whether the document exists. _routing | The shard that the document is routed to. If the document is not routed to a particular shard, this field is omitted. -_source | Contains the document's data if `found` is true. If `_source` is set to false or `stored_fields` is set to true in the URL parameters, this field is omitted. +_source | Contains the document's data if `found` is true. If `_source` is set to false or `stored_fields` is set to true in the parameters, this field is omitted. _fields | Contains the document's data that's stored in the index. Only returned if both `stored_fields` and `found` are true. diff --git a/_api-reference/document-apis/index-document.md b/_api-reference/document-apis/index-document.md index a506e2d9d8..d195a0662e 100644 --- a/_api-reference/document-apis/index-document.md +++ b/_api-reference/document-apis/index-document.md @@ -13,19 +13,10 @@ redirect_from: You can use the `Index document` operation to add a single document to your index. -## Example - -```json -PUT sample-index/_doc/1 -{ - "Description": "To be or not to be, that is the question." -} -``` -{% include copy-curl.html %} ## Path and HTTP methods -``` +```json PUT /_doc/<_id> POST /_doc @@ -49,45 +40,20 @@ To test the Document APIs, add a document by following these steps: 3. In the **Management** section, choose **Dev Tools**. 4. Enter a command, and then select the green triangle play button to send the request. The following are some example commands. -### Create a sample-index -```json -PUT /sample-index -``` -{% include copy-curl.html %} - -### Example PUT request - -```json -PUT /sample_index/_doc/1 -{ - "name": "Example", - "price": 29.99, - "description": "To be or not to be, that is the question" -} -``` -{% include copy-curl.html %} - -### Example POST request -```json -POST /sample_index/_doc -{ - "name": "Another Example", - "price": 19.99, - "description": "We are such stuff as dreams are made on" -} +## Path parameters -``` -{% include copy-curl.html %} +Parameter | Type | Description | Required +:--- | :--- | :--- | :--- +<index> | String | Name of the index. | Yes +<id> | String | A unique identifier to attach to the document. To automatically generate an ID, use `POST /doc` in your request instead of PUT. | No -## URL parameters +## Query parameters -In your request, you must specify the index you want to add your document to. If the index doesn't already exist, OpenSearch automatically creates the index and adds in your document. All other URL parameters are optional. +In your request, you must specify the index you want to add your document to. If the index doesn't already exist, OpenSearch automatically creates the index and adds in your document. All other parameters are optional. Parameter | Type | Description | Required :--- | :--- | :--- | :--- -<index> | String | Name of the index. | Yes -<_id> | String | A unique identifier to attach to the document. To automatically generate an ID, use `POST /doc` in your request instead of PUT. | No if_seq_no | Integer | Only perform the index operation if the document has the specified sequence number. | No if_primary_term | Integer | Only perform the index operation if the document has the specified primary term.| No op_type | Enum | Specifies the type of operation to complete with the document. 
Valid values are `create` (index a document only if it doesn't exist) and `index`. If a document ID is included in the request, then the default is `index`. Otherwise, the default is `create`. | No @@ -100,17 +66,38 @@ version_type | Enum | Assigns a specific type to the document. Valid options are wait_for_active_shards | String | The number of active shards that must be available before OpenSearch processes the request. Default is 1 (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the operation to succeed. | No require_alias | Boolean | Specifies whether the target index must be an index alias. Default is `false`. | No -## Request body +## Example requests + +The following example requests create a sample index document for an index named `sample_index`: + + +### Example PUT request + +```json +PUT /sample_index/_doc/1 +{ + "name": "Example", + "price": 29.99, + "description": "To be or not to be, that is the question" +} +``` +{% include copy-curl.html %} -Your request body must contain the information you want to index. +### Example POST request ```json +POST /sample_index/_doc { - "Description": "This is just a sample document" + "name": "Another Example", + "price": 19.99, + "description": "We are such stuff as dreams are made on" } + ``` +{% include copy-curl.html %} ## Example response + ```json { "_index": "sample-index", diff --git a/_api-reference/document-apis/multi-get.md b/_api-reference/document-apis/multi-get.md index b267b8f3ac..acd69a7b7e 100644 --- a/_api-reference/document-apis/multi-get.md +++ b/_api-reference/document-apis/multi-get.md @@ -15,20 +15,25 @@ The multi-get operation allows you to run multiple GET operations in one request ## Path and HTTP methods -``` +```json GET _mget GET /_mget POST _mget POST /_mget ``` -## URL parameters - -All multi-get URL parameters are optional. +## Path parameters Parameter | Type | Description :--- | :--- | :--- | :--- <index> | String | Name of the index to retrieve documents from. + +## Query parameters + +All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- | :--- preference | String | Specifies the nodes or shards OpenSearch should execute the multi-get operation on. Default is `random`. realtime | Boolean | Specifies whether the operation should run in realtime. If false, the operation waits for the index to refresh to analyze the source to retrieve data, which makes the operation near-realtime. Default is `true`. refresh | Boolean | If true, OpenSearch refreshes shards to make the multi-get operation available to search results. Valid options are `true`, `false`, and `wait_for`, which tells OpenSearch to wait for a refresh before executing the operation. Default is `false`. @@ -147,5 +152,5 @@ _seq_no | The sequence number assigned when the document is indexed. primary_term | The primary term assigned when the document is indexed. found | Whether the document exists. _routing | The shard that the document is routed to. If the document is not routed to a particular shard, this field is omitted. -_source | Contains the document's data if `found` is true. If `_source` is set to false or `stored_fields` is set to true in the URL parameters, this field is omitted. +_source | Contains the document's data if `found` is true. If `_source` is set to false or `stored_fields` is set to true in the parameters, this field is omitted. 
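For context, a multi-get request supplies the documents to retrieve in a `docs` array in the request body. The following minimal sketch (the index names and document IDs are hypothetical) retrieves two documents from two different indexes:

```json
GET _mget
{
  "docs": [
    { "_index": "sample-index1", "_id": "1" },
    { "_index": "sample-index2", "_id": "1" }
  ]
}
```
{% include copy-curl.html %}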
_fields | Contains the document's data that's stored in the index. Only returned if both `stored_fields` and `found` are true. diff --git a/_api-reference/document-apis/reindex.md b/_api-reference/document-apis/reindex.md index 8ac1c48be4..65df81777e 100644 --- a/_api-reference/document-apis/reindex.md +++ b/_api-reference/document-apis/reindex.md @@ -14,30 +14,16 @@ redirect_from: The reindex document API operation lets you copy all or a subset of your data from a source index into a destination index. -## Example - -```json -POST /_reindex -{ - "source":{ - "index":"my-source-index" - }, - "dest":{ - "index":"my-destination-index" - } -} -``` -{% include copy-curl.html %} ## Path and HTTP methods -``` +```json POST /_reindex ``` -## URL parameters +## Query parameters -All URL parameters are optional. +All parameters are optional. Parameter | Type | Description :--- | :--- | :--- @@ -81,6 +67,21 @@ pipeline | Which ingest pipeline to utilize during the reindex. script | A script that OpenSearch uses to apply transformations to the data during the reindex operation. lang | The scripting language. Valid options are `painless`, `expression`, `mustache`, and `java`. +## Example request + +```json +POST /_reindex +{ + "source":{ + "index":"my-source-index" + }, + "dest":{ + "index":"my-destination-index" + } +} +``` +{% include copy-curl.html %} + ## Example response ```json { diff --git a/_api-reference/document-apis/update-by-query.md b/_api-reference/document-apis/update-by-query.md index 09b3bd599f..64df8c901b 100644 --- a/_api-reference/document-apis/update-by-query.md +++ b/_api-reference/document-apis/update-by-query.md @@ -13,40 +13,25 @@ redirect_from: You can include a query and a script as part of your update request so OpenSearch can run the script to update all of the documents that match the query. -## Example + +## Path and HTTP methods ```json -POST test-index1/_update_by_query -{ - "query": { - "term": { - "oldValue": 10 - } - }, - "script" : { - "source": "ctx._source.oldValue += params.newValue", - "lang": "painless", - "params" : { - "newValue" : 20 - } - } -} +POST , /_update_by_query ``` -{% include copy-curl.html %} -## Path and HTTP methods +## Path parameters -``` -POST , /_update_by_query -``` +Parameter | Type | Description +:--- | :--- | :--- | :--- +<index> | String | Comma-separated list of indexes to update. To update all indexes, use * or omit this parameter. -## URL parameters +## Query parameters -All URL parameters are optional. +All parameters are optional. Parameter | Type | Description :--- | :--- | :--- | :--- -<index> | String | Comma-separated list of indexes to update. To update all indexes, use * or omit this parameter. allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. analyzer | String | Analyzer to use in the query string. analyze_wildcard | Boolean | Whether the update operation should include wildcard and prefix queries in the analysis. Default is `false`. @@ -81,7 +66,7 @@ version | Boolean | Whether to include the document version as a match. wait_for_active_shards | String | The number of shards that must be active before OpenSearch executes the operation. Valid values are `all` or any integer up to the total number of shards in the index. Default is 1, which is the primary shard. wait_for_completion | boolean | When set to `false`, the response body includes a task ID and OpenSearch executes the operation asynchronously. 
The task ID can be used to check the status of the task or to cancel the task. Default is set to `true`. -## Request body +## Request body options To update your indexes and documents by query, you must include a [query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index) and a script in the request body that OpenSearch can run to update your documents. If you don't specify a query, then every document in the index gets updated. @@ -102,6 +87,27 @@ To update your indexes and documents by query, you must include a [query]({{site } ``` +## Example requests + +```json +POST test-index1/_update_by_query +{ + "query": { + "term": { + "oldValue": 10 + } + }, + "script" : { + "source": "ctx._source.oldValue += params.newValue", + "lang": "painless", + "params" : { + "newValue" : 20 + } + } +} +``` +{% include copy-curl.html %} + ## Example response ```json { diff --git a/_api-reference/document-apis/update-document.md b/_api-reference/document-apis/update-document.md index 3f951b5adf..ff17940cdb 100644 --- a/_api-reference/document-apis/update-document.md +++ b/_api-reference/document-apis/update-document.md @@ -11,45 +11,26 @@ redirect_from: **Introduced 1.0** {: .label .label-purple } -If you need to update a document's fields in your index, you can use the update document API operation. You can do so by specifying the new data you want to be in your index or by including a script in your request body, which OpenSearch runs to update the document. By default, the update operation only updates a document that exists in the index. If a document does not exist, the API returns an error. To _upsert_ a document (update the document that exists or index a new one), use the [upsert](#upsert) operation. +If you need to update a document's fields in your index, you can use the update document API operation. You can do so by specifying the new data you want to be in your index or by including a script in your request body, which OpenSearch runs to update the document. By default, the update operation only updates a document that exists in the index. If a document does not exist, the API returns an error. To _upsert_ a document (update the document that exists or index a new one), use the [upsert](#using-the-upsert-operation) operation. -## Example - -```json -POST /sample-index1/_update/1 -{ - "doc": { - "first_name" : "Bruce", - "last_name" : "Wayne" - } -} -``` -{% include copy-curl.html %} - -## Script example - -```json -POST /test-index1/_update/1 -{ - "script" : { - "source": "ctx._source.secret_identity = \"Batman\"" - } -} -``` -{% include copy-curl.html %} ## Path and HTTP methods -``` +```json POST //_update/<_id> ``` -## URL parameters +## Path parameters Parameter | Type | Description | Required :--- | :--- | :--- | :--- <index> | String | Name of the index. | Yes <_id> | String | The ID of the document to update. | Yes + +## Query parameters + +Parameter | Type | Description | Required +:--- | :--- | :--- | :--- if_seq_no | Integer | Only perform the update operation if the document has the specified sequence number. | No if_primary_term | Integer | Perform the update operation if the document has the specified primary term. | No lang | String | Language of the script. Default is `painless`. | No @@ -63,7 +44,7 @@ _source_includes | List | A comma-separated list of source fields to include in timeout | Time | How long to wait for a response from the cluster. 
| No wait_for_active_shards | String | The number of active shards that must be available before OpenSearch processes the update request. Default is 1 (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the operation to succeed. | No -## Request body +## Request body fields Your request body must contain the information with which you want to update your document. If you only want to replace certain fields in your document, your request body must include a `doc` object containing the fields that you want to update: @@ -90,7 +71,34 @@ You can also use a script to tell OpenSearch how to update your document: } ``` -## Upsert +## Example requests + +### Update a document + +```json +POST /sample-index1/_update/1 +{ + "doc": { + "first_name" : "Bruce", + "last_name" : "Wayne" + } +} +``` +{% include copy-curl.html %} + +### Update a document with a script + +```json +POST /test-index1/_update/1 +{ + "script" : { + "source": "ctx._source.secret_identity = \"Batman\"" + } +} +``` +{% include copy-curl.html %} + +### Using the upsert operation Upsert is an operation that conditionally either updates an existing document or inserts a new one based on information in the object. @@ -109,6 +117,7 @@ POST /sample-index1/_update/1 } } ``` +{% include copy-curl.html %} Consider an index that contains the following document: @@ -123,6 +132,7 @@ Consider an index that contains the following document: } } ``` +{% include copy-curl.html %} After the upsert operation, the document's `first_name` and `last_name` fields are updated: @@ -137,6 +147,7 @@ After the upsert operation, the document's `first_name` and `last_name` fields a } } ``` +{% include copy-curl.html %} If the document does not exist in the index, a new document is indexed with the fields specified in the `upsert` object: @@ -151,6 +162,7 @@ If the document does not exist in the index, a new document is indexed with the } } ``` +{% include copy-curl.html %} You can also add `doc_as_upsert` to the request and set it to `true` to use the information in the `doc` field for performing the upsert operation: @@ -165,6 +177,7 @@ POST /sample-index1/_update/1 "doc_as_upsert": true } ``` +{% include copy-curl.html %} Consider an index that contains the following document: @@ -179,6 +192,7 @@ Consider an index that contains the following document: } } ``` +{% include copy-curl.html %} After the upsert operation, the document's `first_name` and `last_name` fields are updated and an `age` field is added. If the document does not exist in the index, a new document is indexed with the fields specified in the `upsert` object. In both cases, the document is as follows: @@ -194,8 +208,10 @@ After the upsert operation, the document's `first_name` and `last_name` fields a } } ``` +{% include copy-curl.html %} ## Example response + ```json { "_index": "sample-index1", diff --git a/_api-reference/explain.md b/_api-reference/explain.md index 8c2b757945..0591c5bb52 100644 --- a/_api-reference/explain.md +++ b/_api-reference/explain.md @@ -18,7 +18,40 @@ The explain API is an expensive operation in terms of both resources and time. O {: .warning } -## Example +## Path and HTTP methods + +```json +GET /_explain/ +POST /_explain/ +``` + +## Path parameters + +Parameter | Type | Description | Required +:--- | :--- | :--- | :--- +`` | String | Name of the index. You can only specify a single index. 
| Yes +`` | String | A unique identifier to attach to the document. | Yes + +## Query parameters + +You must specify the index and document ID. All other parameters are optional. + +Parameter | Type | Description | Required +:--- | :--- | :--- | :--- +`analyzer` | String | The analyzer to use in the query string. | No +`analyze_wildcard` | Boolean | Specifies whether to analyze wildcard and prefix queries. Default is `false`. | No +`default_operator` | String | Indicates whether the default operator for a string query should be AND or OR. Default is OR. | No +`df` | String | The default field in case a field prefix is not provided in the query string. | No +`lenient` | Boolean | Specifies whether OpenSearch should ignore format-based query failures (for example, querying a text field for an integer). Default is `false`. | No +`preference` | String | Specifies a preference of which shard to retrieve results from. Available options are `_local`, which tells the operation to retrieve results from a locally allocated shard replica, and a custom string value assigned to a specific shard replica. By default, OpenSearch executes the explain operation on random shards. | No +`q` | String | Query in the Lucene query string syntax. | No +`stored_fields` | Boolean | If true, the operation retrieves document fields stored in the index rather than the document’s `_source`. Default is `false`. | No +`routing` | String | Value used to route the operation to a specific shard. | No +`_source` | String | Whether to include the `_source` field in the response body. Default is `true`. | No +`_source_excludes` | String | A comma-separated list of source fields to exclude in the query response. | No +`_source_includes` | String | A comma-separated list of source fields to include in the query response. | No + +## Example requests To see the explain output for all results, set the `explain` flag to `true` either in the URL or in the body of the request: @@ -48,35 +81,7 @@ POST opensearch_dashboards_sample_data_ecommerce/_explain/EVz1Q3sBgg5eWQP6RSte ``` {% include copy-curl.html %} -## Path and HTTP methods - -``` -GET /_explain/ -POST /_explain/ -``` - -## URL parameters - -You must specify the index and document ID. All other URL parameters are optional. - -Parameter | Type | Description | Required -:--- | :--- | :--- | :--- -`` | String | Name of the index. You can only specify a single index. | Yes -`<_id>` | String | A unique identifier to attach to the document. | Yes -`analyzer` | String | The analyzer to use in the query string. | No -`analyze_wildcard` | Boolean | Specifies whether to analyze wildcard and prefix queries. Default is `false`. | No -`default_operator` | String | Indicates whether the default operator for a string query should be AND or OR. Default is OR. | No -`df` | String | The default field in case a field prefix is not provided in the query string. | No -`lenient` | Boolean | Specifies whether OpenSearch should ignore format-based query failures (for example, querying a text field for an integer). Default is `false`. | No -`preference` | String | Specifies a preference of which shard to retrieve results from. Available options are `_local`, which tells the operation to retrieve results from a locally allocated shard replica, and a custom string value assigned to a specific shard replica. By default, OpenSearch executes the explain operation on random shards. | No -`q` | String | Query in the Lucene query string syntax. 
| No -`stored_fields` | Boolean | If true, the operation retrieves document fields stored in the index rather than the document’s `_source`. Default is `false`. | No -`routing` | String | Value used to route the operation to a specific shard. | No -`_source` | String | Whether to include the `_source` field in the response body. Default is `true`. | No -`_source_excludes` | String | A comma-separated list of source fields to exclude in the query response. | No -`_source_includes` | String | A comma-separated list of source fields to include in the query response. | No - -## Response +## Example response ```json { diff --git a/_api-reference/index-apis/alias.md b/_api-reference/index-apis/alias.md index ebd7bdedfd..c3ddf76911 100644 --- a/_api-reference/index-apis/alias.md +++ b/_api-reference/index-apis/alias.md @@ -15,45 +15,22 @@ redirect_from: An alias is a virtual pointer that you can use to reference one or more indexes. Creating and updating aliases are atomic operations, so you can reindex your data and point an alias at it without any downtime. -## Example - -```json -POST _aliases -{ - "actions": [ - { - "add": { - "index": "movies", - "alias": "movies-alias1" - } - }, - { - "remove": { - "index": "old-index", - "alias": "old-index-alias" - } - } - ] -} -``` -{% include copy-curl.html %} - ## Path and HTTP methods -``` +```json POST _aliases ``` -## URL parameters +## Query parameters -All alias parameters are optional. +All parameters are optional. Parameter | Data Type | Description :--- | :--- | :--- cluster_manager_timeout | Time | The amount of time to wait for a response from the cluster manager node. Default is `30s`. timeout | Time | The amount of time to wait for a response from the cluster. Default is `30s`. -## Request body +## Request body fields In your request body, you need to specify what action to take, the alias name, and the index you want to associate with the alias. Other fields are optional. @@ -75,6 +52,29 @@ routing | String | Used to assign a custom value to a shard for specific operati index_routing | String | Assigns a custom value to a shard only for index operations. | No search_routing | String | Assigns a custom value to a shard only for search operations. | No +## Example request + +```json +POST _aliases +{ + "actions": [ + { + "add": { + "index": "movies", + "alias": "movies-alias1" + } + }, + { + "remove": { + "index": "old-index", + "alias": "old-index-alias" + } + } + ] +} +``` +{% include copy-curl.html %} + ## Example response ```json diff --git a/_api-reference/index-apis/clear-index-cache.md b/_api-reference/index-apis/clear-index-cache.md index 9bf873301d..6227a29960 100644 --- a/_api-reference/index-apis/clear-index-cache.md +++ b/_api-reference/index-apis/clear-index-cache.md @@ -15,6 +15,12 @@ The clear cache API operation clears the caches of one or more indexes. For data If you use the Security plugin, you must have the `manage index` privileges. 
{: .note} +## Path and HTTP methods + +```json +POST //_cache/clear +``` + ## Path parameters | Parameter | Data type | Description | @@ -117,7 +123,7 @@ The `POST /books,hockey/_cache/clear` request returns the following fields: } ``` -## Response fields +## Response body fields The `POST /books,hockey/_cache/clear` request returns the following response fields: diff --git a/_api-reference/index-apis/clone.md b/_api-reference/index-apis/clone.md index c1496cbaf8..36592a28b5 100644 --- a/_api-reference/index-apis/clone.md +++ b/_api-reference/index-apis/clone.md @@ -13,27 +13,9 @@ redirect_from: The clone index API operation clones all data in an existing read-only index into a new index. The new index cannot already exist. -## Example - -```json -PUT /sample-index1/_clone/cloned-index1 -{ - "settings": { - "index": { - "number_of_shards": 2, - "number_of_replicas": 1 - } - }, - "aliases": { - "sample-alias1": {} - } -} -``` -{% include copy-curl.html %} - ## Path and HTTP methods -``` +```json POST //_clone/ PUT //_clone/ ``` @@ -48,14 +30,19 @@ OpenSearch indexes have the following naming restrictions: `:`, `"`, `*`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, or `<` -## URL parameters - -Your request must include the source and target indexes. All other clone index parameters are optional. +## Path parameter Parameter | Type | Description :--- | :--- | :--- <source-index> | String | The source index to clone. <target-index> | String | The index to create and add cloned data to. + +## Query parameters + +Your request must include the source and target indexes. All other clone index parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- wait_for_active_shards | String | The number of active shards that must be available before OpenSearch processes the request. Default is 1 (only the primary shard). Set to all or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the operation to succeed. cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. timeout | Time | How long to wait for the request to return. Default is `30s`. diff --git a/_api-reference/index-apis/close-index.md b/_api-reference/index-apis/close-index.md index 865d17d90a..ecad7d18cc 100644 --- a/_api-reference/index-apis/close-index.md +++ b/_api-reference/index-apis/close-index.md @@ -13,26 +13,25 @@ redirect_from: The close index API operation closes an index. Once an index is closed, you cannot add data to it or search for any data within the index. -#### Example + +## Path and HTTP methods ```json -POST /sample-index/_close +POST //_close ``` -{% include copy-curl.html %} -## Path and HTTP methods +## Path parameters -``` -POST //_close -``` +Parameter | Type | Description +:--- | :--- | :--- +<index> | String | The index to close. Can be a comma-separated list of multiple index names. Use `_all` or * to close all indexes. -## URL parameters +## Query parameters All parameters are optional. Parameter | Type | Description :--- | :--- | :--- -<index-name> | String | The index to close. Can be a comma-separated list of multiple index names. Use `_all` or * to close all indexes. allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indexes. Default is `true`. expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. 
Available values are all (match all indexes), open (match open indexes), closed (match closed indexes), hidden (match hidden indexes), and none (do not accept wildcard expressions). Default is `open`. ignore_unavailable | Boolean | If true, OpenSearch does not search for missing or closed indexes. Default is `false`. @@ -40,6 +39,13 @@ wait_for_active_shards | String | Specifies the number of active shards that mus cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. timeout | Time | How long to wait for a response from the cluster. Default is `30s`. +## Example requests + +```json +POST /sample-index/_close +``` +{% include copy-curl.html %} + ## Example response ```json diff --git a/_api-reference/index-apis/component-template.md b/_api-reference/index-apis/component-template.md index bafdfa95c7..fa73e64c94 100644 --- a/_api-reference/index-apis/component-template.md +++ b/_api-reference/index-apis/component-template.md @@ -40,7 +40,7 @@ Parameter | Data type | Description `cluster_manager_timeout` | Time | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. `timeout` | Time | The amount of time for the operation to wait for a response. Default is `30s`. -## Request fields +## Request body fields The following options can be used in the request body to customize the index template. diff --git a/_api-reference/index-apis/create-index-template.md b/_api-reference/index-apis/create-index-template.md index 2a92e3f4c4..c2f4228c8e 100644 --- a/_api-reference/index-apis/create-index-template.md +++ b/_api-reference/index-apis/create-index-template.md @@ -31,7 +31,7 @@ Parameter | Data type | Description `create` | Boolean | When true, the API cannot replace or update any existing index templates. Default is `false`. `cluster_manager_timeout` | Time | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. -## Request body options +## Request body fields The following options can be used in the request body to customize the index template. @@ -45,7 +45,7 @@ Parameter | Type | Description `priority` | Integer | A number that determines which index templates take precedence during the creation of a new index or data stream. OpenSearch chooses the template with the highest priority. When no priority is given, the template is assigned a `0`, signifying the lowest priority. Optional. `template` | Object | The template that includes the `aliases`, `mappings`, or `settings` for the index. For more information, see [#template]. Optional. `version` | Integer | The version number used to manage index templates. Version numbers are not automatically set by OpenSearch. Optional. - +`context` | Object | (Experimental) The `context` parameter provides use-case-specific predefined templates that can be applied to an index. Among all settings and mappings declared for a template, context templates hold the highest priority. For more information, see [index-context]({{site.url}}{{site.baseurl}}/im-plugin/index-context/). ### Template diff --git a/_api-reference/index-apis/create-index.md b/_api-reference/index-apis/create-index.md index 2f4c1041bc..f10450bb28 100644 --- a/_api-reference/index-apis/create-index.md +++ b/_api-reference/index-apis/create-index.md @@ -18,8 +18,8 @@ When creating an index, you can specify its mappings, settings, and aliases. 
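For orientation, the following minimal sketch (the index name, settings, mapping, and alias are hypothetical) creates an index and specifies its settings, mappings, and aliases in a single request:

```json
PUT /sample-index1
{
  "settings": {
    "index": {
      "number_of_shards": 2,
      "number_of_replicas": 1
    }
  },
  "mappings": {
    "properties": {
      "age": { "type": "integer" }
    }
  },
  "aliases": {
    "sample-alias1": {}
  }
}
```
{% include copy-curl.html %}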
## Path and HTTP methods -``` -PUT +```json +PUT ``` ## Index naming restrictions @@ -50,7 +50,7 @@ timeout | Time | How long to wait for the request to return. Default is `30s`. ## Request body -As part of your request, you can optionally specify [index settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/), [mappings]({{site.url}}{{site.baseurl}}/field-types/index/), and [aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias/) for your newly created index. +As part of your request, you can optionally specify [index settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/), [mappings]({{site.url}}{{site.baseurl}}/field-types/index/), [aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias/), and [index context]({{site.url}}{{site.baseurl}}/opensearch/index-context/). ## Example request diff --git a/_api-reference/index-apis/dangling-index.md b/_api-reference/index-apis/dangling-index.md index 9d40687f9f..f44a9dc4d4 100644 --- a/_api-reference/index-apis/dangling-index.md +++ b/_api-reference/index-apis/dangling-index.md @@ -15,21 +15,24 @@ After a node joins a cluster, dangling indexes occur if any shards exist in the List dangling indexes: -``` +```json GET /_dangling ``` +{% include copy-curl.html %} Import a dangling index: -``` +```json POST /_dangling/ ``` +{% include copy-curl.html %} Delete a dangling index: -``` +```json DELETE /_dangling/ ``` +{% include copy-curl.html %} ## Path parameters @@ -49,31 +52,29 @@ accept_data_loss | Boolean | Must be set to `true` for an `import` or `delete` b timeout | Time units | The amount of time to wait for a response. If no response is received in the defined time period, an error is returned. Default is `30` seconds. cluster_manager_timeout | Time units | The amount of time to wait for a connection to the cluster manager. If no response is received in the defined time period, an error is returned. Default is `30` seconds. -## Examples - -The following are example requests and a example response. +## Example requests -#### Sample list +### Sample list ````bash GET /_dangling ```` {% include copy-curl.html %} -#### Sample import +### Sample import ````bash POST /_dangling/msdjernajxAT23RT-BupMB?accept_data_loss=true ```` {% include copy-curl.html %} -#### Sample delete +### Sample delete ````bash DELETE /_dangling/msdjernajxAT23RT-BupMB?accept_data_loss=true ```` -#### Example response body +## Example response ````json { diff --git a/_api-reference/index-apis/delete-index.md b/_api-reference/index-apis/delete-index.md index ad00eb7eca..af0bd292fc 100644 --- a/_api-reference/index-apis/delete-index.md +++ b/_api-reference/index-apis/delete-index.md @@ -13,19 +13,13 @@ redirect_from: If you no longer need an index, you can use the delete index API operation to delete it. -## Example +## Path and HTTP methods ```json -DELETE /sample-index -``` -{% include copy-curl.html %} - -## Path and HTTP methods -``` DELETE / ``` -## URL parameters +## Query parameters All parameters are optional. @@ -37,6 +31,13 @@ ignore_unavailable | Boolean | If true, OpenSearch does not include missing or c cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. timeout | Time | How long to wait for the response to return. Default is `30s`. 
+## Example request + +```json +DELETE /sample-index +``` +{% include copy-curl.html %} + ## Example response ```json diff --git a/_api-reference/index-apis/exists.md b/_api-reference/index-apis/exists.md index 351e2f2088..fb1a4d79c6 100644 --- a/_api-reference/index-apis/exists.md +++ b/_api-reference/index-apis/exists.md @@ -13,20 +13,14 @@ redirect_from: The index exists API operation returns whether or not an index already exists. -## Example - -```json -HEAD /sample-index -``` -{% include copy-curl.html %} ## Path and HTTP methods -``` +```json HEAD / ``` -## URL parameters +## Query parameters All parameters are optional. @@ -40,6 +34,13 @@ ignore_unavailable | Boolean | If true, OpenSearch does not search for missing o local | Boolean | Whether to return information from only the local node instead of from the cluster manager node. Default is `false`. +## Example request + +```json +HEAD /sample-index +``` +{% include copy-curl.html %} + ## Example response The index exists API operation returns only one of two possible response codes: `200` -- the index exists, and `404` -- the index does not exist. diff --git a/_api-reference/index-apis/flush.md b/_api-reference/index-apis/flush.md index fb97e43900..e464a42cad 100644 --- a/_api-reference/index-apis/flush.md +++ b/_api-reference/index-apis/flush.md @@ -18,7 +18,7 @@ OpenSearch automatically performs flushes in the background based on conditions The Flush API supports the following paths: -``` +```json GET /_flush POST /_flush GET /{index}/_flush @@ -35,7 +35,7 @@ The following table lists the available path parameters. All path parameters are ## Query parameters -The Flush API supports the following query parameters. +All parameters are optional. | Parameter | Data type | Description | | :--- | :--- | :--- | @@ -45,21 +45,26 @@ The Flush API supports the following query parameters. | `ignore_unavailable` | Boolean | When `true`, OpenSearch ignores missing or closed indexes. If `false`, OpenSearch returns an error if the force merge operation encounters missing or closed indexes. Default is `false`. | | `wait_if_ongoing` | Boolean | When `true`, the Flush API does not run while another flush request is active. When `false`, OpenSearch returns an error if another flush request is active. Default is `true`. | -## Example request: Flush a specific index +## Example requests + +### Flush a specific index The following example flushes an index named `shakespeare`: -``` +```json POST /shakespeare/_flush ``` +{% include copy-curl.html %} -## Example request: Flush all indexes + +### Flush all indexes The following example flushes all indexes in a cluster: -``` +```json POST /_flush ``` +{% include copy-curl.html %} ## Example response diff --git a/_api-reference/index-apis/force-merge.md b/_api-reference/index-apis/force-merge.md index ce7501ebe3..8316c72937 100644 --- a/_api-reference/index-apis/force-merge.md +++ b/_api-reference/index-apis/force-merge.md @@ -11,6 +11,13 @@ nav_order: 37 The force merge API operation forces a merge on the shards of one or more indexes. For a data stream, the API forces a merge on the shards of the stream's backing index. +## Path and HTTP methods + +```json +POST /_forcemerge +POST //_forcemerge/ +``` + ## The merge operation In OpenSearch, a shard is a Lucene index, which consists of _segments_ (or segment files). Segments store the indexed data. Periodically, smaller segments are merged into larger ones and the larger segments become immutable. 
Merging reduces the overall number of segments on each shard and frees up disk space. @@ -45,12 +52,6 @@ When you force merge multiple indexes, the merge operation is executed on each s It can be useful to force merge data streams in order to manage a data stream's backing indexes, especially after a rollover operation. Time-based indexes receive indexing requests only during a specified time period. Once that time period has elapsed and the index receives no more write requests, you can force merge segments of all index shards into one segment. Searches on single-segment shards are more efficient because they use simpler data structures. -## Path and HTTP methods - -```json -POST /_forcemerge -POST //_forcemerge/ -``` ## Path parameters @@ -135,7 +136,7 @@ POST /.testindex-logs/_forcemerge?primary_only=true } ``` -## Response fields +## Response body fields The following table lists all response fields. diff --git a/_api-reference/index-apis/get-index.md b/_api-reference/index-apis/get-index.md index e2d2d85c65..78fe7bcd94 100644 --- a/_api-reference/index-apis/get-index.md +++ b/_api-reference/index-apis/get-index.md @@ -13,20 +13,24 @@ redirect_from: You can use the get index API operation to return information about an index. -## Example + +## Path and HTTP methods ```json -GET /sample-index +GET / ``` -{% include copy-curl.html %} -## Path and HTTP methods +## Path parameters -``` -GET / -``` +## Path parameters -## URL parameters +The following table lists the available path parameters. All path parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `` | String | A comma-separated list of indexes, data streams, or index aliases to which the operation is applied. Supports wildcard expressions (`*`). Use `_all` or `*` to specify all indexes and data streams in a cluster. | + +## Query parameters All parameters are optional. @@ -40,6 +44,12 @@ ignore_unavailable | Boolean | If true, OpenSearch does not include missing or c local | Boolean | Whether to return information from only the local node instead of from the cluster manager node. Default is `false`. cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. +## Example request + +```json +GET /sample-index +``` +{% include copy-curl.html %} ## Example response ```json diff --git a/_api-reference/index-apis/get-settings.md b/_api-reference/index-apis/get-settings.md index 94cb4a7c6c..4eebb43272 100644 --- a/_api-reference/index-apis/get-settings.md +++ b/_api-reference/index-apis/get-settings.md @@ -14,29 +14,28 @@ redirect_from: The get settings API operation returns all the settings in your index. -## Example - -```json -GET /sample-index1/_settings -``` -{% include copy-curl.html %} ## Path and HTTP methods -``` +```json GET /_settings GET //_settings GET //_settings/ ``` -## URL parameters - -All get settings parameters are optional. +## Path parameters Parameter | Data type | Description :--- | :--- | :--- <target-index> | String | The index to get settings from. Can be a comma-separated list to get settings from multiple indexes, or use `_all` to return settings from all indexes within the cluster. <setting> | String | Filter to return specific settings. + +## Query parameters + +All get settings parameters are optional. + +Parameter | Data type | Description +:--- | :--- | :--- allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. 
expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are `all` (match all indexes), `open` (match open indexes), `closed` (match closed indexes), `hidden` (match hidden indexes), and `none` (do not accept wildcard expressions), which must be used with `open`, `closed`, or both. Default is `open`. flat_settings | Boolean | Whether to return settings in the flat form, which can improve readability, especially for heavily nested settings. For example, the flat form of “index”: { “creation_date”: “123456789” } is “index.creation_date”: “123456789”. @@ -45,6 +44,13 @@ ignore_unavailable | Boolean | If true, OpenSearch does not include missing or c local | Boolean | Whether to return information from the local node only instead of the cluster manager node. Default is `false`. cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. +## Example request + +```json +GET /sample-index1/_settings +``` +{% include copy-curl.html %} + ## Example response ```json diff --git a/_api-reference/index-apis/open-index.md b/_api-reference/index-apis/open-index.md index 0d8ef62282..3011507697 100644 --- a/_api-reference/index-apis/open-index.md +++ b/_api-reference/index-apis/open-index.md @@ -13,26 +13,25 @@ redirect_from: The open index API operation opens a closed index, letting you add or search for data within the index. -## Example + +## Path and HTTP methods ```json -POST /sample-index/_open +POST //_open ``` -{% include copy-curl.html %} -## Path and HTTP methods +## Path parameters -``` -POST //_open -``` +Parameter | Type | Description +:--- | :--- | :--- +<index> | String | The index to open. Can be a comma-separated list of multiple index names. Use `_all` or * to open all indexes. -## URL parameters +## Query parameters All parameters are optional. Parameter | Type | Description :--- | :--- | :--- -<index-name> | String | The index to open. Can be a comma-separated list of multiple index names. Use `_all` or * to open all indexes. allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indexes. Default is `true`. expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are all (match all indexes), open (match open indexes), closed (match closed indexes), hidden (match hidden indexes), and none (do not accept wildcard expressions). Default is `open`. ignore_unavailable | Boolean | If true, OpenSearch does not search for missing or closed indexes. Default is `false`. @@ -42,6 +41,13 @@ timeout | Time | How long to wait for a response from the cluster. Default is `3 wait_for_completion | Boolean | When set to `false`, the request returns immediately instead of after the operation is finished. To monitor the operation status, use the [Tasks API]({{site.url}}{{site.baseurl}}/api-reference/tasks/) with the task ID returned by the request. Default is `true`. task_execution_timeout | Time | The explicit task execution timeout. Only useful when wait_for_completion is set to `false`. Default is `1h`. 
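+For example, to open an index asynchronously and monitor its progress with the [Tasks API]({{site.url}}{{site.baseurl}}/api-reference/tasks/), you can set `wait_for_completion` to `false`. The following request is a hypothetical illustration using the `sample-index` name from the example request below:
+
+```json
+POST /sample-index/_open?wait_for_completion=false
+```
+{% include copy-curl.html %}
+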
+## Example request + +```json +POST /sample-index/_open +``` +{% include copy-curl.html %} + ## Example response ```json diff --git a/_api-reference/index-apis/put-mapping.md b/_api-reference/index-apis/put-mapping.md index f7d9321d33..26bfbae0d9 100644 --- a/_api-reference/index-apis/put-mapping.md +++ b/_api-reference/index-apis/put-mapping.md @@ -17,17 +17,45 @@ If you want to create or add mappings and fields to an index, you can use the pu You can't use this operation to update mappings that already map to existing data in the index. You must first create a new index with your desired mappings, and then use the [reindex API operation]({{site.url}}{{site.baseurl}}/opensearch/reindex-data) to map all the documents from your old index to the new index. If you don't want any downtime while you re-index your indexes, you can use [aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias). +## Path and HTTP methods -## Required path parameter +```json +PUT //_mapping +PUT /,/_mapping +``` + + +## Path parameters The only required path parameter is the index with which to associate the mapping. If you don't specify an index, you will get an error. You can specify a single index, or multiple indexes separated by a comma as follows: -``` +```json PUT //_mapping PUT /,/_mapping ``` -## Required request body field +## Query parameters + +Optionally, you can add query parameters to make a more specific request. For example, to skip any missing or closed indexes in the response, you can add the `ignore_unavailable` query parameter to your request as follows: + +```json +PUT /sample-index/_mapping?ignore_unavailable +``` + +The following table defines the put mapping query parameters: + +Parameter | Data type | Description +:--- | :--- | :--- +allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. +expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are `all` (match all indexes), `open` (match open indexes), `closed` (match closed indexes), `hidden` (match hidden indexes), and `none` (do not accept wildcard expressions), which must be used with `open`, `closed`, or both. Default is `open`. +ignore_unavailable | Boolean | If true, OpenSearch does not include missing or closed indexes in the response. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. +timeout | Time | How long to wait for the response to return. Default is `30s`. +write_index_only | Boolean | Whether OpenSearch should apply mapping updates only to the write index. + +## Request body fields + +### properties The request body must contain `properties`, which has all of the mappings that you want to create or update. @@ -44,8 +72,6 @@ The request body must contain `properties`, which has all of the mappings that y } ``` -## Optional request body fields - ### dynamic You can make the document structure match the structure of the index mapping by setting the `dynamic` request body field to `strict`, as seen in the following example: @@ -61,26 +87,8 @@ You can make the document structure match the structure of the index mapping by } ``` -## Optional query parameters - -Optionally, you can add query parameters to make a more specific request. 
For example, to skip any missing or closed indexes in the response, you can add the `ignore_unavailable` query parameter to your request as follows: - -```json -PUT /sample-index/_mapping?ignore_unavailable -``` - -The following table defines the put mapping query parameters: - -Parameter | Data type | Description -:--- | :--- | :--- -allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. -expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are `all` (match all indexes), `open` (match open indexes), `closed` (match closed indexes), `hidden` (match hidden indexes), and `none` (do not accept wildcard expressions), which must be used with `open`, `closed`, or both. Default is `open`. -ignore_unavailable | Boolean | If true, OpenSearch does not include missing or closed indexes in the response. -cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. -timeout | Time | How long to wait for the response to return. Default is `30s`. -write_index_only | Boolean | Whether OpenSearch should apply mapping updates only to the write index. -#### Sample Request +## Example request The following request creates a new mapping for the `sample-index` index: @@ -100,7 +108,7 @@ PUT /sample-index/_mapping ``` {% include copy-curl.html %} -#### Sample Response +## Example response Upon success, the response returns `"acknowledged": true`. diff --git a/_api-reference/index-apis/recover.md b/_api-reference/index-apis/recover.md index dc2df1e5a2..41f071cf6c 100644 --- a/_api-reference/index-apis/recover.md +++ b/_api-reference/index-apis/recover.md @@ -28,14 +28,14 @@ The Recovery API reports solely on completed recoveries for shard copies present ```json GET /_recovery -GET //recovery/ +GET //_recovery/ ``` ## Path parameters Parameter | Data type | Description :--- | :--- -`index-name` | String | A comma-separated list of indexes, data streams, or index aliases to which the operation is applied. Supports wildcard expressions (`*`). Use `_all` or `*` to specify all indexes and data streams in a cluster. | +`index` | String | A comma-separated list of indexes, data streams, or index aliases to which the operation is applied. Supports wildcard expressions (`*`). Use `_all` or `*` to specify all indexes and data streams in a cluster. | ## Query parameters @@ -48,24 +48,6 @@ Parameter | Data type | Description `detailed` | Boolean | When `true`, provides detailed information about shard recoveries. Default is `false`. `index` | String | A comma-separated list or wildcard expression of index names used to limit the request. -## Response fields - -The API responds with the following information about the recovery shard. - -Parameter | Data type | Description -:--- | :--- | :--- -`id` | Integer | The ID of the shard. -`type` | String | The recovery source for the shard. Returned values include:
- `EMPTY_STORE`: An empty store. Indicates a new primary shard or the forced allocation of an empty primary shard using the Cluster Reroute API.
- `EXISTING_STORE`: The store of an existing primary shard. Indicates that the recovery is related to node startup or the allocation of an existing primary shard.
- `LOCAL_SHARDS`: Shards belonging to another index on the same node. Indicates that the recovery is related to a clone, shrink, or split operation.
- `PEER`: A primary shard on another node. Indicates that the recovery is related to shard replication.
- `SNAPSHOT`: A snapshot. Indicates that the recovery is related to a snapshot restore operation. -`STAGE` | String | The recovery stage. Returned values can include:
- `INIT`: Recovery has not started.
- `INDEX`: Reading index metadata and copying bytes from the source to the destination.
- `VERIFY_INDEX`: Verifying the integrity of the index.
- `TRANSLOG`: Replaying the transaction log.
- `FINALIZE`: Cleanup.
- `DONE`: Complete. -`primary` | Boolean | When `true`, the shard is a primary shard. -`start_time` | String | The timestamp indicating when the recovery started. -`stop_time` | String | The timestamp indicating when the recovery completed. -`total_time_in_millis` | String | The total amount of time taken to recover a shard, in milliseconds. -`source` | Object | The recovery source. This can include a description of the repository (if the recovery is from a snapshot) or a description of the source node. -`target` | Object | The destination node. -`index` | Object | Statistics about the physical index recovery. -`translog` | Object | Statistics about the translog recovery. - `start` | Object | Statistics about the amount of time taken to open and start the index. ## Example requests @@ -289,3 +271,22 @@ The following response returns detailed recovery information about an index name } } ``` + +## Response body fields + +The API responds with the following information about the recovery shard. + +Parameter | Data type | Description +:--- | :--- | :--- +`id` | Integer | The ID of the shard. +`type` | String | The recovery source for the shard. Returned values include:
- `EMPTY_STORE`: An empty store. Indicates a new primary shard or the forced allocation of an empty primary shard using the Cluster Reroute API.
- `EXISTING_STORE`: The store of an existing primary shard. Indicates that the recovery is related to node startup or the allocation of an existing primary shard.
- `LOCAL_SHARDS`: Shards belonging to another index on the same node. Indicates that the recovery is related to a clone, shrink, or split operation.
- `PEER`: A primary shard on another node. Indicates that the recovery is related to shard replication.
- `SNAPSHOT`: A snapshot. Indicates that the recovery is related to a snapshot restore operation. +`STAGE` | String | The recovery stage. Returned values can include:
- `INIT`: Recovery has not started.
- `INDEX`: Reading index metadata and copying bytes from the source to the destination.
- `VERIFY_INDEX`: Verifying the integrity of the index.
- `TRANSLOG`: Replaying the transaction log.
- `FINALIZE`: Cleanup.
- `DONE`: Complete. +`primary` | Boolean | When `true`, the shard is a primary shard. +`start_time` | String | The timestamp indicating when the recovery started. +`stop_time` | String | The timestamp indicating when the recovery completed. +`total_time_in_millis` | String | The total amount of time taken to recover a shard, in milliseconds. +`source` | Object | The recovery source. This can include a description of the repository (if the recovery is from a snapshot) or a description of the source node. +`target` | Object | The destination node. +`index` | Object | Statistics about the physical index recovery. +`translog` | Object | Statistics about the translog recovery. + `start` | Object | Statistics about the amount of time taken to open and start the index. diff --git a/_api-reference/index-apis/refresh.md b/_api-reference/index-apis/refresh.md index 4d75060087..917ca5d9a9 100644 --- a/_api-reference/index-apis/refresh.md +++ b/_api-reference/index-apis/refresh.md @@ -48,22 +48,24 @@ The following table lists the available query parameters. All query parameters a | `expand_wildcards` | String | The type of index that the wildcard patterns can match. If the request targets data streams, this argument determines whether the wildcard expressions match any hidden data streams. Supports comma-separated values, such as `open,hidden`. Valid values are `all`, `open`, `closed`, `hidden`, and `none`. +## Example requests -#### Example: Refresh several data streams or indexes +### Refresh several data streams or indexes The following example request refreshes two indexes named `my-index-A` and `my-index-B`: -``` +```json POST /my-index-A,my-index-B/_refresh ``` {% include copy-curl.html %} -#### Example: Refresh all data streams and indexes in a cluster +### Refresh all data streams and indexes in a cluster The following request refreshes all data streams and indexes in a cluster: -``` +```json POST /_refresh ``` +{% include copy-curl.html %} diff --git a/_api-reference/index-apis/rollover.md b/_api-reference/index-apis/rollover.md index 722dfe196c..db30a5d7bf 100644 --- a/_api-reference/index-apis/rollover.md +++ b/_api-reference/index-apis/rollover.md @@ -61,9 +61,9 @@ Parameter | Type | Description `timeout` | Time | The amount of time to wait for a response. Default is `30s`. `wait_for_active_shards` | String | The number of active shards that must be available before OpenSearch processes the request. Default is `1` (only the primary shard). You can also set to `all` or a positive integer. Values greater than `1` require replicas. For example, if you specify a value of `3`, then the index must have two replicas distributed across two additional nodes in order for the operation to succeed. -## Request body +## Request body fields -The following request body parameters are supported. +The following request body fields are supported. ### `alias` diff --git a/_api-reference/index-apis/segment.md b/_api-reference/index-apis/segment.md index 0ecee63e77..b9625d3b34 100644 --- a/_api-reference/index-apis/segment.md +++ b/_api-reference/index-apis/segment.md @@ -15,7 +15,7 @@ The Segment API provides details about the Lucene segments within index shards a ## Path and HTTP methods ```json -GET //_segments +GET //_segments GET /_segments ``` @@ -29,7 +29,7 @@ Parameter | Data type | Description ## Query parameters -The Segment API supports the following optional query parameters. +All query parameters are optional. 
Parameter | Data type | Description :--- | :--- | :--- @@ -38,21 +38,6 @@ Parameter | Data type | Description `ignore_unavailable` | Boolean | When `true`, OpenSearch ignores missing or closed indexes. If `false`, OpenSearch returns an error if the force merge operation encounters missing or closed indexes. Default is `false`. `verbose` | Boolean | When `true`, provides information about Lucene's memory usage. Default is `false`. -## Response body fields - -Parameter | Data type | Description - :--- | :--- | :--- -`` | String | The name of the segment used to create internal file names in the shard directory. -`generation` | Integer | The generation number, such as `0`, incremented for each written segment and used to name the segment. -`num_docs` | Integer | The number of documents, obtained from Lucene. Nested documents are counted separately from their parents. Deleted documents, as well as recently indexed documents that are not yet assigned to a segment, are excluded. -`deleted_docs` | Integer | The number of deleted documents, obtained from Lucene, which may not match the actual number of delete operations performed. Recently deleted documents that are not yet assigned to a segment are excluded. Deleted documents are automatically merged when appropriate. OpenSearch will occasionally delete extra documents in order to track recent shard operations. -`size_in_bytes` | Integer | The amount of disk space used by the segment, for example, `50kb`. -`memory_in_bytes` | Integer | The amount of segment data, measured in bytes, that is kept in memory to facilitate efficient search operations, such as `1264`. A value of `-1` indicates that OpenSearch was unable to compute this number. -`committed` | Boolean | When `true`, the segments are synced to disk. Segments synced to disk can survive a hard reboot. If `false`, then uncommitted segment data is stored in the transaction log as well so that changes can be replayed at the next startup. -`search` | Boolean | When `true`, segment search is enabled. When `false`, the segment may have already been written to disk and require a refresh in order to be searchable. -`version` | String | The Lucene version used to write the segment. -`compound` | Boolean | When `true`, indicates that Lucene merged all segment files into one file in order to save any file descriptions. -`attributes` | Object | Shows if high compression was enabled. ## Example requests @@ -119,3 +104,19 @@ GET /_segments } ``` +## Response body fields + +Parameter | Data type | Description + :--- | :--- | :--- +`` | String | The name of the segment used to create internal file names in the shard directory. +`generation` | Integer | The generation number, such as `0`, incremented for each written segment and used to name the segment. +`num_docs` | Integer | The number of documents, obtained from Lucene. Nested documents are counted separately from their parents. Deleted documents, as well as recently indexed documents that are not yet assigned to a segment, are excluded. +`deleted_docs` | Integer | The number of deleted documents, obtained from Lucene, which may not match the actual number of delete operations performed. Recently deleted documents that are not yet assigned to a segment are excluded. Deleted documents are automatically merged when appropriate. OpenSearch will occasionally delete extra documents in order to track recent shard operations. +`size_in_bytes` | Integer | The amount of disk space used by the segment, for example, `50kb`. 
+`memory_in_bytes` | Integer | The amount of segment data, measured in bytes, that is kept in memory to facilitate efficient search operations, such as `1264`. A value of `-1` indicates that OpenSearch was unable to compute this number. +`committed` | Boolean | When `true`, the segments are synced to disk. Segments synced to disk can survive a hard reboot. If `false`, then uncommitted segment data is stored in the transaction log as well so that changes can be replayed at the next startup. +`search` | Boolean | When `true`, segment search is enabled. When `false`, the segment may have already been written to disk and require a refresh in order to be searchable. +`version` | String | The Lucene version used to write the segment. +`compound` | Boolean | When `true`, indicates that Lucene merged all segment files into one file in order to save any file descriptions. +`attributes` | Object | Shows if high compression was enabled. + diff --git a/_api-reference/index-apis/shrink-index.md b/_api-reference/index-apis/shrink-index.md index 17b7c4dff6..e3e1c67155 100644 --- a/_api-reference/index-apis/shrink-index.md +++ b/_api-reference/index-apis/shrink-index.md @@ -13,25 +13,10 @@ redirect_from: The shrink index API operation moves all of your data in an existing index into a new index with fewer primary shards. -## Example - -```json -POST /my-old-index/_shrink/my-new-index -{ - "settings": { - "index.number_of_replicas": 4, - "index.number_of_shards": 3 - }, - "aliases":{ - "new-index-alias": {} - } -} -``` -{% include copy-curl.html %} ## Path and HTTP methods -``` +```json POST //_shrink/ PUT //_shrink/ ``` @@ -44,27 +29,32 @@ When creating new indexes with this operation, remember that OpenSearch indexes `:`, `"`, `*`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, or `<` -## URL parameters - -The shrink index API operation requires you to specify both the source index and the target index. All other parameters are optional. +## Path parameters Parameter | Type | description :--- | :--- | :--- <index-name> | String | The index to shrink. <target-index> | String | The target index to shrink the source index into. + +## Query parameters + +The shrink index API operation requires you to specify both the source index and the target index. All other parameters are optional. + +Parameter | Type | description +:--- | :--- | :--- wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the request. Default is 1 (only the primary shard). Set to all or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the request to succeed. cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. timeout | Time | How long to wait for the request to return a response. Default is `30s`. wait_for_completion | Boolean | When set to `false`, the request returns immediately instead of after the operation is finished. To monitor the operation status, use the [Tasks API]({{site.url}}{{site.baseurl}}/api-reference/tasks/) with the task ID returned by the request. Default is `true`. task_execution_timeout | Time | The explicit task execution timeout. Only useful when wait_for_completion is set to `false`. Default is `1h`. -## Request body +## Request body fields You can use the request body to configure some index settings for the target index. All fields are optional. 
Field | Type | Description :--- | :--- | :--- -alias | Object | Sets an alias for the target index. Can have the fields `filter`, `index_routing`, `is_hidden`, `is_write_index`, `routing`, or `search_routing`. See [Index Aliases]({{site.url}}{{site.baseurl}}/api-reference/alias/#request-body). +alias | Object | Sets an alias for the target index. Can have the fields `filter`, `index_routing`, `is_hidden`, `is_write_index`, `routing`, or `search_routing`. See [Index Aliases]({{site.url}}{{site.baseurl}}/api-reference/alias/#request-body-fields). settings | Object | Index settings you can apply to your target index. See [Index Settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/). [max_shard_size](#the-max_shard_size-parameter) | Bytes | Specifies the maximum size of a primary shard in the target index. Because `max_shard_size` conflicts with the `index.number_of_shards` setting, you cannot set both of them at the same time. @@ -84,4 +74,20 @@ The minimum number of primary shards for the target index is 1. ## Index codec considerations -For index codec considerations, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#splits-and-shrinks). \ No newline at end of file +For index codec considerations, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#splits-and-shrinks). + +## Example request + +```json +POST /my-old-index/_shrink/my-new-index +{ + "settings": { + "index.number_of_replicas": 4, + "index.number_of_shards": 3 + }, + "aliases":{ + "new-index-alias": {} + } +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_api-reference/index-apis/split.md b/_api-reference/index-apis/split.md index ad13bffbba..b3db4c3340 100644 --- a/_api-reference/index-apis/split.md +++ b/_api-reference/index-apis/split.md @@ -33,7 +33,7 @@ PUT /sample-index1/_split/split-index1 ## Path and HTTP methods -``` +```json POST //_split/ PUT //_split/ ``` @@ -48,7 +48,14 @@ OpenSearch indexes have the following naming restrictions: `:`, `"`, `*`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, or `<` -## URL parameters +## Path parameters + +Parameter | Type | Description +:--- | :--- | :--- +<source-index> | String | The source index to split. +<target-index> | String | The index to create. + +## Query parameters Your request must include the source and target indexes. All split index parameters are optional. diff --git a/_api-reference/index-apis/stats.md b/_api-reference/index-apis/stats.md index 7310298594..728fe7751f 100644 --- a/_api-reference/index-apis/stats.md +++ b/_api-reference/index-apis/stats.md @@ -825,6 +825,6 @@ By default, the returned statistics are aggregated in the `primaries` and `total ``` -## Response fields +## Response body fields For information about response fields, see [Nodes Stats API response fields]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-stats/#indices). diff --git a/_api-reference/index-apis/update-alias.md b/_api-reference/index-apis/update-alias.md index f32d34025e..c069703bf3 100644 --- a/_api-reference/index-apis/update-alias.md +++ b/_api-reference/index-apis/update-alias.md @@ -10,7 +10,7 @@ nav_order: 5 **Introduced 1.0** {: .label .label-purple } -The Create or Update Alias API adds a data stream or index to an alias or updates the settings for an existing alias. For more alias API operations, see [Index aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias/). +The Create or Update Alias API adds one or more indexes to an alias or updates the settings for an existing alias. 
For more alias API operations, see [Index aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias/). The Create or Update Alias API is distinct from the [Alias API]({{site.url}}{{site.baseurl}}/opensearch/rest-api/alias/), which supports the addition and removal of aliases and the removal of alias indexes. In contrast, the following API only supports adding or updating an alias without updating the index itself. Each API also uses different request body parameters. {: .note} @@ -35,7 +35,7 @@ PUT /_alias | Parameter | Type | Description | :--- | :--- | :--- -| `target` | String | A comma-delimited list of data streams and indexes. Wildcard expressions (`*`) are supported. To target all data streams and indexes in a cluster, use `_all` or `*`. Optional. | +| `target` | String | A comma-delimited list of indexes. Wildcard expressions (`*`) are supported. To target all indexes in a cluster, use `_all` or `*`. Optional. | | `alias-name` | String | The alias name to be created or updated. Optional. | ## Query parameters @@ -53,7 +53,7 @@ In the request body, you can specify the index name, the alias name, and the set Field | Type | Description :--- | :--- | :--- | :--- -`index` | String | A comma-delimited list of data streams or indexes that you want to associate with the alias. If this field is set, it will override the index name specified in the URL path. +`index` | String | A comma-delimited list of indexes that you want to associate with the alias. If this field is set, it will override the index name specified in the URL path. `alias` | String | The name of the alias. If this field is set, it will override the alias name specified in the URL path. `is_write_index` | Boolean | Specifies whether the index should be a write index. An alias can only have one write index at a time. If a write request is submitted to an alias that links to multiple indexes, then OpenSearch runs the request only on the write index. `routing` | String | Assigns a custom value to a shard for specific operations. diff --git a/_api-reference/index-apis/update-settings.md b/_api-reference/index-apis/update-settings.md index 9fc9f01f85..3afaaa10d3 100644 --- a/_api-reference/index-apis/update-settings.md +++ b/_api-reference/index-apis/update-settings.md @@ -11,28 +11,22 @@ redirect_from: **Introduced 1.0** {: .label .label-purple } -You can use the update settings API operation to update index-level settings. You can change dynamic index settings at any time, but static settings cannot be changed after index creation. For more information about static and dynamic index settings, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). +You can use the update settings API operation to update index-level settings. You can change dynamic index settings at any time, but static settings cannot be changed after index creation. For more information about static and dynamic index settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). Aside from the static and dynamic index settings, you can also update individual plugins' settings. To get the full list of updatable settings, run `GET /_settings?include_defaults=true`. 
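+For reference, the following request illustrates the call mentioned in the preceding sentence, scoped here to the `sample-index1` index used in the example request below (omit the index name to list the settings for the whole cluster):
+
+```json
+GET /sample-index1/_settings?include_defaults=true
+```
+{% include copy-curl.html %}
+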
-## Example + +## Path and HTTP methods ```json -PUT /sample-index1/_settings -{ - "index.plugins.index_state_management.rollover_skip": true, - "index": { - "number_of_replicas": 4 - } -} +PUT //_settings ``` -{% include copy-curl.html %} -## Path and HTTP methods +## Path parameters -``` -PUT //_settings -``` +Parameter | Type | Description +:--- | :--- | :--- +<index> | String | The index to update. Can be a comma-separated list of multiple index names. Use `_all` or `*` to specify all indexes. ## Query parameters @@ -59,7 +53,20 @@ The request body must all of the index settings that you want to update. } ``` -## Response +## Example request + +```json +PUT /sample-index1/_settings +{ + "index.plugins.index_state_management.rollover_skip": true, + "index": { + "number_of_replicas": 4 + } +} +``` +{% include copy-curl.html %} + +## Example response ```json { diff --git a/_api-reference/list/index.md b/_api-reference/list/index.md new file mode 100644 index 0000000000..b3f694157c --- /dev/null +++ b/_api-reference/list/index.md @@ -0,0 +1,175 @@ +--- +layout: default +title: List API +nav_order: 45 +has_children: true +--- + +# List APIs +**Introduced 2.18** +{: .label .label-purple } + +The List API retrieves statistics about indexes and shards in a paginated format. This streamlines the task of processing responses that include many indexes. + +The List API supports two operations: + +- [List indices]({{site.url}}{{site.baseurl}}/api-reference/list/list-indices/) +- [List shards]({{site.url}}{{site.baseurl}}/api-reference/list/list-shards/) + +## Shared query parameters + +All List API operations support the following optional query parameters. + +Parameter | Description +:--- | :--- | +`v` | Provides verbose output by adding headers to the columns. It also adds some formatting to help align each of the columns. All examples in this section include the `v` parameter. +`help` | Lists the default and other available headers for a given operation. +`h` | Limits the output to specific headers. +`format` | The format in which to return the result. Valid values are `json`, `yaml`, `cbor`, and `smile`. +`s` | Sorts the output by the specified columns. + +## Examples + +The following examples show how to use the optional query parameters to customize all List API responses. + + +### Get verbose output + +To query indexes and their statistics with a verbose output that includes all column headings in the response, use the `v` query parameter, as shown in the following example. 
+ +#### Request + +```json +GET _list/indices?v +``` +{% include copy-curl.html %} + +#### Response + +```json +health status index uuid pri rep docs.count docs.deleted +green open .kibana_1 - - - - +yellow open sample-index-1 - - - - +next_token null +``` + + +### Get all available headers + +To see all the available headers, use the `help` parameter with the following syntax: + +```json +GET _list/?help +``` +{% include copy-curl.html %} + +#### Request + +The following example list indices operation returns all the available headers: + +```json +GET _list/indices?help +``` +{% include copy-curl.html %} + +#### Response + +The following example displays the indexes and their health status in a table: + +```json +health | h | current health status +status | s | open/close status +index | i,idx | index name +uuid | id,uuid | index uuid +pri | p,shards.primary,shardsPrimary | number of primary shards +rep | r,shards.replica,shardsReplica | number of replica shards +docs.count | dc,docsCount | available docs +``` + +### Get a subset of headers + +To limit the output to a subset of headers, use the `h` parameter with the following syntax: + +```json +GET _list/?h=,&v +``` +{% include copy-curl.html %} + +For any operation, you can determine which headers are available by using the `help` parameter and then using the `h` parameter to limit the output to only a subset of headers. + +#### Request + +The following example limits the indexes in the response to only the index name and health status headers: + +```json +GET _list/indices?h=health,index +``` +{% include copy-curl.html %} + +### Response + +```json +green .kibana_1 +yellow sample-index-1 +next_token null +``` + + +### Sort by a header + +To sort the output on a single page by a header, use the `s` parameter with the following syntax: + +```json +GET _list/?s=, +``` +{% include copy-curl.html %} + +#### Request + +The following example request sorts indexes by index name: + +```json +GET _list/indices?s=h,i +``` +{% include copy-curl.html %} + +#### Response + +```json +green sample-index-2 +yellow sample-index-1 +next_token null +``` + +### Retrieve data in JSON format + +By default, List APIs return data in a `text/plain` format. Other supported formats are [YAML](https://yaml.org/), [CBOR](https://cbor.io/), and [Smile](https://github.com/FasterXML/smile-format-specification). + + +To retrieve data in the JSON format, use the `format=json` parameter with the following syntax. + +If you use the Security plugin, ensure you have the appropriate permissions. 
+{: .note } + +#### Request + +```json +GET _list/?format=json +``` +{% include copy-curl.html %} + +#### Request + +```json +GET _list/indices?format=json +``` +{% include copy-curl.html %} + +### Response + +The response contains data in JSON format: + +```json +{"next_token":null,"indices":[{"health":"green","status":"-","index":".kibana_1","uuid":"-","pri":"-","rep":"-","docs.count":"-","docs.deleted":"-","store.size":"-","pri.store.size":"-"},{"health":"yellow","status":"-","index":"sample-index-1","uuid":"-","pri":"-","rep":"-","docs.count":"-","docs.deleted":"-","store.size":"-","pri.store.size":"-"}]} +``` + diff --git a/_api-reference/list/list-indices.md b/_api-reference/list/list-indices.md new file mode 100644 index 0000000000..618413d35c --- /dev/null +++ b/_api-reference/list/list-indices.md @@ -0,0 +1,83 @@ +--- +layout: default +title: List indices +parent: List API +nav_order: 25 +has_children: false +--- + +# List indices +**Introduced 2.18** +{: .label .label-purple } + +The list indices operation provides the following index information in a paginated format: + +- The amount of disk space used by the index. +- The number of shards contained in the index. +- The index's health status. + +## Path and HTTP methods + +```json +GET _list/indices +GET _list/indices/ +``` + +## Query parameters + +Parameter | Type | Description +:--- | :--- | :--- +`bytes` | Byte size | Specifies the units for the byte size, for example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`health` | String | Limits indexes based on their health status. Supported values are `green`, `yellow`, and `red`. +`include_unloaded_segments` | Boolean | Whether to include information from segments not loaded into memory. Default is `false`. +`cluster_manager_timeout` | Time | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. +`pri` | Boolean | Whether to return information only from the primary shards. Default is `false`. +`time` | Time | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`expand_wildcards` | Enum | Expands wildcard expressions to concrete indexes. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. +`next_token` | String | Fetches the next page of indexes. When `null`, only provides the first page of indexes. Default is `null`. +`size` | Integer | The maximum number of indexes to be displayed on a single page. The number of indexes on a single page of the response is not always equal to the specified `size`. Default is `500`. Minimum is `1` and maximum value is `5000`. +`sort` | String | The order in which the indexes are displayed. If `desc`, then the most recently created indexes are displayed first. If `asc`, then the oldest indexes are displayed first. Default is `asc`. + +When using the `next_token` path parameter, use the token produced by the response to see the next page of indexes. After the API returns `null`, all indexes contained in the API have been returned. 
+{: .tip } + + +## Example requests + +To get information for all the indexes, use the following query and keep specifying the `next_token` as received from response until its `null`: + +```json +GET _list/indices/?v&next_token=token +``` + + +To limit the information to a specific index, add the index name after your query, as shown in the following example: + +```json +GET _list/indices/?v +``` +{% include copy-curl.html %} + +To get information about more than one index, separate the indexes with commas, as shown in the following example: + +```json +GET _list/indices/index1,index2,index3?v&next_token=token +``` +{% include copy-curl.html %} + + +## Example response + +**Plain text format** + +```json +health | status | index | uuid | pri | rep | docs.count | docs.deleted | store.size | pri.store.size +green | open | movies | UZbpfERBQ1-3GSH2bnM3sg | 1 | 1 | 1 | 0 | 7.7kb | 3.8kb +next_token MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw== +``` + +**JSON format** + +```json +{"next_token":"MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw==","indices":[{"health":"green","status":"open","index":"movies","uuid":"UZbpfERBQ1-3GSH2bnM3sg","pri":"1","rep":"1","docs.count":"1","docs.deleted":"0","store.size":"7.7kb","pri.store.size":"3.8kb"}]} +``` diff --git a/_api-reference/list/list-shards.md b/_api-reference/list/list-shards.md new file mode 100644 index 0000000000..7111aeb0f2 --- /dev/null +++ b/_api-reference/list/list-shards.md @@ -0,0 +1,78 @@ +--- +layout: default +title: List shards +parent: List API +nav_order: 20 +--- + +# List shards +**Introduced 2.18** +{: .label .label-purple } + +The list shards operation outputs, in a paginated format, the state of all primary and replica shards and how they are distributed. + +## Path and HTTP methods + +```json +GET _list/shards +GET _list/shards/ +``` + +## Query parameters + +All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +`bytes` | Byte size | Specifies the byte size units, for example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`local` | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. +`cluster_manager_timeout` | Time | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. +`cancel_after_time_interval` | Time | The amount of time after which the shard request is canceled. Default is `-1` (no timeout). +`time` | Time | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`next_token` | String | Fetches the next page of indexes. When `null`, only provides the first page of indexes. Default is `null`. +`size` | Integer | The maximum number of indexes to be displayed on a single page. The number of indexes on a single page of the response is not always equal to the specified `size`. Default and minimum value is `2000`. Maximum value is `20000`. +`sort` | String | The order in which the indexes are displayed. If `desc`, then the most recently created indexes are displayed first. If `asc`, then the oldest indexes are displayed first. Default is `asc`. + +When using the `next_token` path parameter, use the token produced by the response to see the next page of indexes. After the API returns `null`, all indexes contained in the API have been returned. 
+{: .tip } + +## Example requests + +To get information for all the indexes and shards, use the following query and keep specifying the `next_token` as received from response until its `null`: + +```json +GET _list/shards/?v&next_token=token +``` + +To limit the information to a specific index, add the index name after your query, as shown in the following example and keep specifying the `next_token` as received from response until its `null`: + +```json +GET _list/shards/?v&next_token=token +``` +{% include copy-curl.html %} + +If you want to get information for more than one index, separate the indexes with commas, as shown in the following example: + +```json +GET _list/shards/index1,index2,index3?v&next_token=token +``` +{% include copy-curl.html %} + +## Example response + +**Plain text format** + +```json +index | shard | prirep | state | docs | store | ip | | node +plugins | 0 | p | STARTED | 0 | 208b | 172.18.0.4 | odfe-node1 +plugins | 0 | r | STARTED | 0 | 208b | 172.18.0.3 | odfe-node2 +.... +.... +next_token MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw== +``` + +**JSON format** + +```json +{"next_token":"MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw==","shards":[{"index":"plugins","shard":"0","prirep":"p","state":"STARTED","docs":"0","store":"208B","ip":"172.18.0.4","node":"odfe-node1"},{"index":"plugins","shard":"0","prirep":"r","state":"STARTED","docs":"0","store":"208B","ip":"172.18.0.3","node":"odfe-node2"}]} +``` diff --git a/_api-reference/msearch-template.md b/_api-reference/msearch-template.md index 316cc134ff..fdebf5bed1 100644 --- a/_api-reference/msearch-template.md +++ b/_api-reference/msearch-template.md @@ -15,33 +15,17 @@ The Multi-search Template API runs multiple search template requests in a single The Multi-search Template API uses the following paths: -``` +```json GET /_msearch/template POST /_msearch/template GET /{index}/_msearch/template POST /{index}/_msearch/template ``` -## Request body -The multi-search template request body follows this pattern, similar to the [Multi-search API]({{site.url}}{{site.baseurl}}/api-reference/multi-search/) pattern: +## Query parameters and metadata options -``` -Metadata\n -Query\n -Metadata\n -Query\n - -``` - -- Metadata lines include options, such as which indexes to search and the type of search. -- Query lines use [query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/). - -Like the [bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) operation, the JSON doesn't need to be minified---spaces are fine---but it does need to be on a single line. OpenSearch uses newline characters to parse multi-search requests and requires that the request body end with a newline character. - -## URL parameters and metadata options - -All multi-search template URL parameters are optional. Some can also be applied per search as part of each metadata line. +All parameters are optional. Some can also be applied per search as part of each metadata line. Parameter | Type | Description | Supported in metadata :--- | :--- | :--- | :--- @@ -57,9 +41,9 @@ rest_total_hits_as_int | String | Specifies whether the `hits.total` property is search_type | String | Affects the relevance score. Valid options are `query_then_fetch` and `dfs_query_then_fetch`. `query_then_fetch` scores documents using term and document frequencies for a single shard (faster, less accurate), whereas `dfs_query_then_fetch` uses term and document frequencies across all shards (slower, more accurate). 
Default is `query_then_fetch`. | Yes typed_keys | Boolean | Specifies whether to prefix aggregation names with their internal types in the response. Default is `false`. | No -## Metadata-only options +### Metadata-only options -Some options can't be applied as URL parameters to the entire request. Instead, you can apply them per search as part of each metadata line. All are optional. +Some options can't be applied as parameters to the entire request. Instead, you can apply them per search as part of each metadata line. All are optional. Option | Type | Description :--- | :--- | :--- @@ -68,11 +52,26 @@ preference | String | Specifies the nodes or shards on which you want to perform request_cache | Boolean | Specifies whether to cache results, which can improve latency for repeated searches. Default is to use the `index.requests.cache.enable` setting for the index (which defaults to `true` for new indexes). routing | String | Comma-separated custom routing values, for example, `"routing": "value1,value2,value3"`. -## Example +## Request body + +The multi-search template request body follows this pattern, similar to the [Multi-search API]({{site.url}}{{site.baseurl}}/api-reference/multi-search/) pattern: -The following example `msearch/template` API request runs queries against a single index using multiple templates named `line_search_template` and `play_search_template`: +``` +Metadata\n +Query\n +Metadata\n +Query\n -### Request +``` + +- Metadata lines include options, such as which indexes to search and the type of search. +- Query lines use [query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/). + +Like the [bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) operation, the JSON doesn't need to be minified---spaces are fine---but it does need to be on a single line. OpenSearch uses newline characters to parse multi-search requests and requires that the request body end with a newline character. + +## Example request + +The following example `msearch/template` API request runs queries against a single index using multiple templates named `line_search_template` and `play_search_template`: ```json GET _msearch/template @@ -83,7 +82,7 @@ GET _msearch/template ``` {% include copy-curl.html %} -### Response +## Example response OpenSearch returns an array with the results of each search in the same order as in the multi-search template request: @@ -107,7 +106,8 @@ OpenSearch returns an array with the results of each search in the same order as }, "max_score": null, "hits": [] - } + }, + "status": 200 }, { "took": 3, @@ -125,7 +125,8 @@ OpenSearch returns an array with the results of each search in the same order as }, "max_score": null, "hits": [] - } + }, + "status": 200 } ] } diff --git a/_api-reference/multi-search.md b/_api-reference/multi-search.md index ff04b2d075..d8ac41ecec 100644 --- a/_api-reference/multi-search.md +++ b/_api-reference/multi-search.md @@ -13,19 +13,6 @@ redirect_from: As the name suggests, the multi-search operation lets you bundle multiple search requests into a single request. OpenSearch then executes the searches in parallel, so you get back the response more quickly compared to sending one request per search. OpenSearch executes each search independently, so the failure of one doesn't affect the others. 
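+Because each search runs independently, a request in which one search targets a nonexistent index still returns results for the remaining searches; only the failing item contains an error. The following sketch uses the hypothetical index name `missing-index` together with the sample logs index from the example request later in this section:
+
+```json
+GET _msearch
+{ "index": "missing-index" }
+{ "query": { "match_all": {} } }
+{ "index": "opensearch_dashboards_sample_data_logs" }
+{ "query": { "match_all": {} } }
+
+```
+{% include copy-curl.html %}
+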
-## Example - -```json -GET _msearch -{ "index": "opensearch_dashboards_sample_data_logs"} -{ "query": { "match_all": {} }, "from": 0, "size": 10} -{ "index": "opensearch_dashboards_sample_data_ecommerce", "search_type": "dfs_query_then_fetch"} -{ "query": { "match_all": {} } } - -``` -{% include copy-curl.html %} - - ## Path and HTTP methods The Multi-search API uses the following paths: @@ -37,27 +24,10 @@ POST _msearch POST /_msearch ``` -## Request body - -The multi-search request body follows this pattern: - -``` -Metadata\n -Query\n -Metadata\n -Query\n - -``` - -- Metadata lines include options, such as which indexes to search and the type of search. -- Query lines use the [query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/). - -Just like the [bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) operation, the JSON doesn't need to be minified---spaces are fine---but it does need to be on a single line. OpenSearch uses newline characters to parse multi-search requests and requires that the request body end with a newline character. - -## URL parameters and metadata options +## Query parameters and metadata options -All multi-search URL parameters are optional. Some can also be applied per-search as part of each metadata line. +All parameters are optional. Some can also be applied per-search as part of each metadata line. Parameter | Type | Description | Supported in metadata line :--- | :--- | :--- @@ -80,7 +50,7 @@ From the REST API specification: A threshold that enforces a pre-filter round tr ## Metadata-only options -Some options can't be applied as URL parameters to the entire request. Instead, you can apply them per-search as part of each metadata line. All are optional. +Some options can't be applied as parameters to the entire request. Instead, you can apply them per-search as part of each metadata line. All are optional. Option | Type | Description :--- | :--- | :--- @@ -89,12 +59,28 @@ preference | String | The nodes or shards that you'd like to perform the search. request_cache | Boolean | Whether to cache results, which can improve latency for repeat searches. Default is to use the `index.requests.cache.enable` setting for the index (which defaults to `true` for new indexes). routing | String | Comma-separated custom routing values, for example, `"routing": "value1,value2,value3"`. +## Request body + +The multi-search request body follows this pattern: + +``` +Metadata\n +Query\n +Metadata\n +Query\n + +``` + +- Metadata lines include options, such as which indexes to search and the type of search. +- Query lines use the [query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/). + +Just like the [bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) operation, the JSON doesn't need to be minified---spaces are fine---but it does need to be on a single line. OpenSearch uses newline characters to parse multi-search requests and requires that the request body end with a newline character. + -## Example +## Example request The following example `msearch` API request runs queries against multiple indexes: -### Request ```json GET _msearch @@ -107,7 +93,7 @@ GET _msearch {% include copy-curl.html %} -### Response +## Example response OpenSearch returns an array with the results of each search in the same order as the multi-search request. 
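The metadata-only options listed above are set per search on the corresponding metadata line. The following is a minimal sketch of that usage; the index name is taken from the existing example, but the option values shown are illustrative assumptions:

```json
GET _msearch
{ "index": "opensearch_dashboards_sample_data_logs", "request_cache": true, "preference": "_local" }
{ "query": { "match_all": {} }, "size": 5 }

```
{% include copy-curl.html %}

As with every multi-search request, the body must end with a newline character.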
diff --git a/_api-reference/nodes-apis/nodes-hot-threads.md b/_api-reference/nodes-apis/nodes-hot-threads.md index f5e014dd6d..5339903d1e 100644 --- a/_api-reference/nodes-apis/nodes-hot-threads.md +++ b/_api-reference/nodes-apis/nodes-hot-threads.md @@ -11,12 +11,6 @@ nav_order: 30 The nodes hot threads endpoint provides information about busy JVM threads for selected cluster nodes. It provides a unique view of the of activity each node. -#### Example - -```json -GET /_nodes/hot_threads -``` -{% include copy-curl.html %} ## Path and HTTP methods diff --git a/_api-reference/nodes-apis/nodes-info.md b/_api-reference/nodes-apis/nodes-info.md index a8953505ff..a8767cafce 100644 --- a/_api-reference/nodes-apis/nodes-info.md +++ b/_api-reference/nodes-apis/nodes-info.md @@ -18,25 +18,10 @@ The nodes info API represents mostly static information about your cluster's nod - Thread pools settings - Installed plugins -## Example - -To get information about all nodes in a cluster, use the following query: - -```json -GET /_nodes -``` -{% include copy-curl.html %} - -To get thread pool information about the cluster manager node only, use the following query: - -```json -GET /_nodes/master:true/thread_pool -``` -{% include copy-curl.html %} ## Path and HTTP methods -```bash +```json GET /_nodes GET /_nodes/ GET /_nodes/ @@ -88,6 +73,13 @@ GET /_nodes/cluster_manager:true/process,transport ``` {% include copy-curl.html %} +To get thread pool information about the cluster manager node only, use the following query: + +```json +GET /_nodes/master:true/thread_pool +``` +{% include copy-curl.html %} + ## Example response The response contains the metric groups specified in the `` request parameter (in this case, `process` and `transport`): @@ -136,7 +128,7 @@ The response contains the metric groups specified in the `` request par } ``` -## Response fields +## Response body fields The response contains the basic node identification and build info for every node matching the `` request parameter. The following table lists the response fields. diff --git a/_api-reference/nodes-apis/nodes-reload-secure.md b/_api-reference/nodes-apis/nodes-reload-secure.md index b4be66ddd4..1ed0eabde4 100644 --- a/_api-reference/nodes-apis/nodes-reload-secure.md +++ b/_api-reference/nodes-apis/nodes-reload-secure.md @@ -13,7 +13,7 @@ The nodes reload secure settings endpoint allows you to change secure settings o ## Path and HTTP methods -``` +```json POST _nodes/reload_secure_settings POST _nodes//reload_secure_settings ``` @@ -26,7 +26,7 @@ Parameter | Type | Description :--- | :--- | :--- nodeId | String | A comma-separated list of nodeIds used to filter results. Supports [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters). Defaults to `_all`. -## Request fields +## Request body fields The request may include an optional object containing the password for the OpenSearch keystore. diff --git a/_api-reference/nodes-apis/nodes-stats.md b/_api-reference/nodes-apis/nodes-stats.md index 479cd8e732..604b89969b 100644 --- a/_api-reference/nodes-apis/nodes-stats.md +++ b/_api-reference/nodes-apis/nodes-stats.md @@ -788,7 +788,7 @@ Select the arrow to view the example response. ``` -## Response fields +## Response body fields The following table lists all response fields. 
diff --git a/_api-reference/nodes-apis/nodes-usage.md b/_api-reference/nodes-apis/nodes-usage.md index 355b7f8ff2..1101b2989a 100644 --- a/_api-reference/nodes-apis/nodes-usage.md +++ b/_api-reference/nodes-apis/nodes-usage.md @@ -13,7 +13,7 @@ The nodes usage endpoint returns low-level information about REST action usage o ## Path and HTTP methods -``` +```json GET _nodes/usage GET _nodes//usage GET _nodes/usage/ diff --git a/_api-reference/profile.md b/_api-reference/profile.md index 4f8c69db9c..e54ffabe15 100644 --- a/_api-reference/profile.md +++ b/_api-reference/profile.md @@ -18,6 +18,18 @@ The Profile API provides timing information about the execution of individual co The Profile API is a resource-consuming operation that adds overhead to search operations. {: .warning} +## Path and HTTP methods + +```json +GET /testindex/_search +{ + "profile": true, + "query" : { + "match" : { "title" : "wind" } + } +} +``` + ## Concurrent segment search Starting in OpenSearch 2.12, [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/) allows each shard-level request to search segments in parallel during the query phase. The Profile API response contains several additional fields with statistics about _slices_. @@ -26,7 +38,9 @@ A slice is the unit of work that can be executed by a thread. Each query can be In general, the max/min/avg slice time captures statistics across all slices for a timing type. For example, when profiling aggregations, the `max_slice_time_in_nanos` field in the `aggregations` section shows the maximum time consumed by the aggregation operation and its children across all slices. -## Example request: Non-concurrent search +## Example requests + +### Non-concurrent search To use the Profile API, include the `profile` parameter set to `true` in the search request sent to the `_search` endpoint: @@ -70,7 +84,54 @@ The response contains an additional `time` field with human-readable units, for The Profile API response is verbose, so if you're running the request through the `curl` command, include the `?pretty` query parameter to make the response easier to understand. {: .tip} -#### Example response +### Aggregations + +To profile aggregations, send an aggregation request and provide the `profile` parameter set to `true`. + +#### Global aggregation + +```json +GET /opensearch_dashboards_sample_data_ecommerce/_search +{ + "profile": "true", + "size": 0, + "query": { + "match": { "manufacturer": "Elitelligence" } + }, + "aggs": { + "all_products": { + "global": {}, + "aggs": { + "avg_price": { "avg": { "field": "taxful_total_price" } } + } + }, + "elitelligence_products": { "avg": { "field": "taxful_total_price" } } + } +} +``` +{% include copy-curl.html %} + +#### Non-global aggregation + +```json +GET /opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "avg_taxful_total_price": { + "avg": { + "field": "taxful_total_price" + } + } + } +} +``` +{% include copy-curl.html %} + + +## Example response + +### Non-concurrent search The response contains profiling information: @@ -221,7 +282,7 @@ The response contains profiling information: ``` -#### Example response: Concurrent segment search +### Concurrent segment search The following is an example response for a concurrent segment search with three segment slices: @@ -435,7 +496,7 @@ The following is an example response for a concurrent segment search with three ``` -## Response fields +## Response body fields The response includes the following fields. 
@@ -522,34 +583,9 @@ Reason | Description `aggregation` | A collector for aggregations that is run against the specified query scope. OpenSearch uses a single `aggregation` collector to collect documents for all aggregations. `global_aggregation` | A collector that is run against the global query scope. Global scope is different from a specified query scope, so in order to collect the entire dataset, a `match_all` query must be run. -## Aggregations - -To profile aggregations, send an aggregation request and provide the `profile` parameter set to `true`. - -#### Example request: Global aggregation - -```json -GET /opensearch_dashboards_sample_data_ecommerce/_search -{ - "profile": "true", - "size": 0, - "query": { - "match": { "manufacturer": "Elitelligence" } - }, - "aggs": { - "all_products": { - "global": {}, - "aggs": { - "avg_price": { "avg": { "field": "taxful_total_price" } } - } - }, - "elitelligence_products": { "avg": { "field": "taxful_total_price" } } - } -} -``` -{% include copy-curl.html %} +### Aggregation responses -#### Example response: Global aggregation +#### Response: Global aggregation The response contains profiling information: @@ -804,24 +840,7 @@ The response contains profiling information: ``` -#### Example request: Non-global aggregation - -```json -GET /opensearch_dashboards_sample_data_ecommerce/_search -{ - "size": 0, - "aggs": { - "avg_taxful_total_price": { - "avg": { - "field": "taxful_total_price" - } - } - } -} -``` -{% include copy-curl.html %} - -#### Example response: Non-global aggregation +#### Response: Non-global aggregation The response contains profiling information: @@ -966,13 +985,13 @@ The response contains profiling information: ``` -### Response fields +#### Response body fields The `aggregations` array contains aggregation objects with the following fields. Field | Data type | Description :--- | :--- | :--- -`type` | String | The aggregator type. In the [non-global aggregation example response](#example-response-non-global-aggregation), the aggregator type is `AvgAggregator`. [Global aggregation example response](#example-request-global-aggregation) contains a `GlobalAggregator` with an `AvgAggregator` child. +`type` | String | The aggregator type. In the [non-global aggregation example response](#response-non-global-aggregation), the aggregator type is `AvgAggregator`. [Global aggregation example response](#response-global-aggregation) contains a `GlobalAggregator` with an `AvgAggregator` child. `description` | String | Contains a Lucene explanation of the aggregation. Helps differentiate aggregations with the same type. `time_in_nanos` | Long | The total elapsed time for this aggregation, in nanoseconds. For concurrent segment search, `time_in_nanos` is the total amount of time across all slices (the difference between the last completed slice execution end time and the first slice execution start time). [`breakdown`](#the-breakdown-object-1) | Object | Contains timing statistics about low-level Lucene execution. @@ -982,7 +1001,7 @@ Field | Data type | Description `min_slice_time_in_nanos` |Long |The minimum amount of time taken by any slice to run an aggregation, in nanoseconds. This field is included only if you enable concurrent segment search. `avg_slice_time_in_nanos` |Long |The average amount of time taken by any slice to run an aggregation, in nanoseconds. This field is included only if you enable concurrent segment search. 
-### The `breakdown` object +#### The `breakdown` object The `breakdown` object represents the timing statistics about low-level Lucene execution, broken down by method. Each field in the `breakdown` object represents an internal Lucene method executed within the aggregation. Timings are listed in wall-clock nanoseconds and are not normalized. The `breakdown` timings are inclusive of all child times. The `breakdown` object is comprised of the following fields. All fields contain integer values. diff --git a/_api-reference/rank-eval.md b/_api-reference/rank-eval.md index 881ff3a22b..61c80be592 100644 --- a/_api-reference/rank-eval.md +++ b/_api-reference/rank-eval.md @@ -12,7 +12,7 @@ The [rank]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/rank/) ## Path and HTTP methods -``` +```json GET /_rank_eval POST /_rank_eval ``` @@ -28,7 +28,7 @@ allow_no_indices | Boolean | Defaults to `true`. When set to `false` the respons expand_wildcards | String | Expand wildcard expressions for indexes that are `open`, `closed`, `hidden`, `none`, or `all`. search_type | String | Set search type to either `query_then_fetch` or `dfs_query_then_fetch`. -## Request fields +## Request body fields The request body must contain at least one parameter. diff --git a/_api-reference/remote-info.md b/_api-reference/remote-info.md index ac2971f294..25e032a9d5 100644 --- a/_api-reference/remote-info.md +++ b/_api-reference/remote-info.md @@ -17,12 +17,11 @@ The response is more comprehensive and useful than a call to `_cluster/settings` ## Path and HTTP methods -``` +```json GET _remote/info ``` -{% include copy-curl.html %} -## Response +## Example Response ```json { diff --git a/_api-reference/render-template.md b/_api-reference/render-template.md index 409fde5e4a..db2caa9cef 100644 --- a/_api-reference/render-template.md +++ b/_api-reference/render-template.md @@ -10,7 +10,7 @@ The Render Template API renders a [search template]({{site.url}}{{site.baseurl}} ## Paths and HTTP methods -``` +```json GET /_render/template POST /_render/template GET /_render/template/ @@ -25,7 +25,7 @@ The Render Template API supports the following optional path parameter. | :--- | :--- | :--- | | `id` | String | The ID of the search template to render. | -## Request options +## Request body fields The following options are supported in the request body of the Render Template API. diff --git a/_api-reference/script-apis/create-stored-script.md b/_api-reference/script-apis/create-stored-script.md index 0a915cd836..d28c504a51 100644 --- a/_api-reference/script-apis/create-stored-script.md +++ b/_api-reference/script-apis/create-stored-script.md @@ -34,7 +34,7 @@ All parameters are optional. | cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager. Defaults to 30 seconds. | | timeout | Time | The period of time to wait for a response. If a response is not received before the timeout value, the request fails and returns an error. Defaults to 30 seconds.| -## Request fields +## Request body fields | Field | Data type | Description | :--- | :--- | :--- @@ -49,7 +49,7 @@ All parameters are optional. 
## Example request -The sample uses an index called `books` with the following documents: +The following example requests uses an index called `books` with the following documents: ````json {"index":{"_id":1}} @@ -60,6 +60,8 @@ The sample uses an index called `books` with the following documents: {"name":"book3","author":"Gilroy","ratings":[2,1,5]} ```` +### Creating a painless script + The following request creates the Painless script `my-first-script`. It sums the ratings for each book and displays the sum in the output. ````json @@ -96,10 +98,16 @@ curl -XPUT "http://opensearch:9200/_scripts/my-first-script" -H 'Content-Type: a {% include copy.html %} -The following request creates the Painless script `my-first-script`, which sums the ratings for each book and displays the sum in the output: +See [Execute Painless stored script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/exec-stored-script/) for information about running the script. + +### Creating or updating a stored script with parameters + +The Painless script supports `params` to pass variables to the script. + +The following request creates the Painless script `multiplier-script`. The request sums the ratings for each book, multiplies the summed value by the `multiplier` parameter, and displays the result in the output: ````json -PUT _scripts/my-first-script +PUT _scripts/multiplier-script { "script": { "lang": "painless", @@ -108,15 +116,13 @@ PUT _scripts/my-first-script for (int i = 0; i < doc['ratings'].length; ++i) { total += doc['ratings'][i]; } - return total; + return total * params['multiplier']; """ } } ```` {% include copy-curl.html %} -See [Execute Painless stored script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/exec-stored-script/) for information about running the script. - ## Example response The `PUT _scripts/my-first-script` request returns the following field: @@ -130,43 +136,5 @@ The `PUT _scripts/my-first-script` request returns the following field: To determine whether the script was successfully created, use the [Get stored script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/get-stored-script/) API, passing the script name as the `script` path parameter. {: .note} -### Response fields - -| Field | Data type | Description | -:--- | :--- | :--- -| acknowledged | Boolean | Whether the request was received. | - -## Creating or updating a stored script with parameters - -The Painless script supports `params` to pass variables to the script. - -#### Example - -The following request creates the Painless script `multiplier-script`. 
The request sums the ratings for each book, multiplies the summed value by the `multiplier` parameter, and displays the result in the output: - -````json -PUT _scripts/multiplier-script -{ - "script": { - "lang": "painless", - "source": """ - int total = 0; - for (int i = 0; i < doc['ratings'].length; ++i) { - total += doc['ratings'][i]; - } - return total * params['multiplier']; - """ - } -} -```` -{% include copy-curl.html %} - -### Example response -The `PUT _scripts/multiplier-script` request returns the following field: -````json -{ - "acknowledged" : true -} -```` \ No newline at end of file diff --git a/_api-reference/script-apis/delete-script.md b/_api-reference/script-apis/delete-script.md index fe9c272acc..22c2a3f394 100644 --- a/_api-reference/script-apis/delete-script.md +++ b/_api-reference/script-apis/delete-script.md @@ -9,7 +9,13 @@ nav_order: 4 **Introduced 1.0** {: .label .label-purple } -Deletes a stored script +Deletes a stored script. + +## Path and HTTP methods + +```json +DELETE _scripts/my-script +``` ## Path parameters @@ -47,7 +53,7 @@ The `DELETE _scripts/my-first-script` request returns the following field: To determine whether the stored script was successfully deleted, use the [Get stored script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/get-stored-script/) API, passing the script name as the `script` path parameter. -## Response fields +## Response body fields The request returns the following response fields: diff --git a/_api-reference/script-apis/exec-script.md b/_api-reference/script-apis/exec-script.md index b6476be980..cd31ad92f4 100644 --- a/_api-reference/script-apis/exec-script.md +++ b/_api-reference/script-apis/exec-script.md @@ -18,7 +18,7 @@ GET /_scripts/painless/_execute POST /_scripts/painless/_execute ``` -## Request fields +## Request body fields | Field | Description | :--- | :--- @@ -54,7 +54,7 @@ The response contains the average of two script parameters: } ``` -## Response fields +## Response body fields | Field | Description | :--- | :--- diff --git a/_api-reference/script-apis/exec-stored-script.md b/_api-reference/script-apis/exec-stored-script.md index a7de3b5274..31102c23dd 100644 --- a/_api-reference/script-apis/exec-stored-script.md +++ b/_api-reference/script-apis/exec-stored-script.md @@ -13,7 +13,22 @@ Runs a stored script written in the Painless language. OpenSearch provides several ways to run a script; the following sections show how to run a script by passing script information in the request body of a `GET /_search` request. -## Request fields +## Path and HTTP methods + +```json +GET books/_search +{ + "script_fields": { + "total_ratings": { + "script": { + "id": "my-first-script" + } + } + } +} +``` + +## Request field options | Field | Data type | Description | :--- | :--- | :--- @@ -104,7 +119,7 @@ The `GET books/_search` request returns the following fields: } ```` -## Response fields +## Response body fields | Field | Data type | Description | :--- | :--- | :--- diff --git a/_api-reference/script-apis/get-script-contexts.md b/_api-reference/script-apis/get-script-contexts.md index 85421128a1..fa45ce95ad 100644 --- a/_api-reference/script-apis/get-script-contexts.md +++ b/_api-reference/script-apis/get-script-contexts.md @@ -13,9 +13,9 @@ Retrieves all contexts for stored scripts. 
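## Path and HTTP methods

The following listing is a sketch based on the endpoint used in the example below; `GET _script_context` is assumed to be the only variant:

```json
GET _script_context
```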
## Example request -````json +```json GET _script_context -```` +``` {% include copy-curl.html %} ## Example response @@ -547,7 +547,7 @@ The `GET _script_context` request returns the following fields: } ```` -## Response fields +## Response body fields The `GET _script_context` request returns the following response fields: diff --git a/_api-reference/script-apis/get-script-language.md b/_api-reference/script-apis/get-script-language.md index 76414d52ea..a7bb7e7b51 100644 --- a/_api-reference/script-apis/get-script-language.md +++ b/_api-reference/script-apis/get-script-language.md @@ -89,7 +89,7 @@ The `GET _script_language` request returns the available contexts for each langu } ``` -## Response fields +## Response body fields The request contains the following response fields. diff --git a/_api-reference/script-apis/get-stored-script.md b/_api-reference/script-apis/get-stored-script.md index d7987974d3..341bfc046e 100644 --- a/_api-reference/script-apis/get-stored-script.md +++ b/_api-reference/script-apis/get-stored-script.md @@ -11,6 +11,12 @@ nav_order: 3 Retrieves a stored script. +## Path and HTTP methods + +```json +GET _scripts/my-first-script +``` + ## Path parameters | Parameter | Data type | Description | @@ -53,7 +59,7 @@ The `GET _scripts/my-first-script` request returns the following fields: } ```` -## Response fields +## Response body fields The `GET _scripts/my-first-script` request returns the following response fields: diff --git a/_api-reference/scroll.md b/_api-reference/scroll.md index b940c90d86..770697d4e4 100644 --- a/_api-reference/scroll.md +++ b/_api-reference/scroll.md @@ -17,7 +17,32 @@ To use the `scroll` operation, add a `scroll` parameter to the request header wi Because search contexts consume a lot of memory, we suggest you don't use the `scroll` operation for frequent user queries. Instead, use the `sort` parameter with the `search_after` parameter to scroll responses for user queries. {: .note } -## Example +## Path and HTTP methods + +```json +GET _search/scroll +POST _search/scroll +GET _search/scroll/ +POST _search/scroll/ +``` + +## Path parameters + +Parameter | Type | Description +:--- | :--- | :--- +scroll_id | String | The scroll ID for the search. + +## Query parameters + +All scroll parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +scroll | Time | Specifies the amount of time the search context is maintained. +scroll_id | String | The scroll ID for the search. +rest_total_hits_as_int | Boolean | Whether the `hits.total` property is returned as an integer (`true`) or an object (`false`). Default is `false`. + +## Example requests To set the number of results that you want returned for each batch, use the `size` parameter: @@ -84,28 +109,6 @@ DELETE _search/scroll/_all The `scroll` operation corresponds to a specific timestamp. It doesn't consider documents added after that timestamp as potential results. - -## Path and HTTP methods - -``` -GET _search/scroll -POST _search/scroll -``` -``` -GET _search/scroll/ -POST _search/scroll/ -``` - -## URL parameters - -All scroll parameters are optional. - -Parameter | Type | Description -:--- | :--- | :--- -scroll | Time | Specifies the amount of time the search context is maintained. -scroll_id | String | The scroll ID for the search. -rest_total_hits_as_int | Boolean | Whether the `hits.total` property is returned as an integer (`true`) or an object (`false`). Default is `false`. 
- ## Example response ```json diff --git a/_api-reference/search.md b/_api-reference/search.md index 777f48354e..df6992912e 100644 --- a/_api-reference/search.md +++ b/_api-reference/search.md @@ -12,33 +12,19 @@ redirect_from: The Search API operation lets you execute a search request to search your cluster for data. -## Example - -```json -GET /movies/_search -{ - "query": { - "match": { - "text_entry": "I am the night" - } - } -} -``` -{% include copy-curl.html %} - ## Path and HTTP Methods -``` -GET //_search +```json +GET //_search GET /_search -POST //_search +POST //_search POST /_search ``` -## URL Parameters +## Query parameters -All URL parameters are optional. +All parameters are optional. Parameter | Type | Description :--- | :--- | :--- @@ -124,7 +110,22 @@ terminate_after | Integer | The maximum number of documents OpenSearch should pr timeout | Time | How long to wait for a response. Default is no timeout. version | Boolean | Whether to include the document version in the response. -## Response body +## Example request + +```json +GET /movies/_search +{ + "query": { + "match": { + "text_entry": "I am the night" + } + } +} +``` +{% include copy-curl.html %} + + +## Response body fields ```json { diff --git a/_api-reference/snapshots/cleanup-snapshot-repository.md b/_api-reference/snapshots/cleanup-snapshot-repository.md new file mode 100644 index 0000000000..18e8f35f81 --- /dev/null +++ b/_api-reference/snapshots/cleanup-snapshot-repository.md @@ -0,0 +1,63 @@ +--- +layout: default +title: Cleanup Snapshot Repository +parent: Snapshot APIs +nav_order: 11 +--- + +# Cleanup Snapshot Repository +Introduced 1.0 +{: .label .label-purple } + +The Cleanup Snapshot Repository API clears a snapshot repository of data no longer referenced by any existing snapshot. + +## Path and HTTP methods + +```json +POST /_snapshot//_cleanup +``` + + +## Path parameters + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `repository` | String | The name of the snapshot repository. | + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `cluster_manager_timeout` | Time | The amount of time to wait for a response from the cluster manager node. Formerly called `master_timeout`. Optional. Default is 30 seconds. | +| `timeout` | Time | The amount of time to wait for the operation to complete. Optional.| + +## Example request + +The following request removes all stale data from the repository `my_backup`: + +```json +POST /_snapshot/my_backup/_cleanup +``` +{% include copy-curl.html %} + + +## Example response + +```json +{ + "results":{ + "deleted_bytes":40, + "deleted_blobs":8 + } +} +``` + +## Response body fields + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `deleted_bytes` | Integer | The number of bytes made available in the snapshot after data deletion. | +| `deleted_blobs` | Integer | The number of binary large objects (BLOBs) cleared from the repository by the request. 
| + diff --git a/_api-reference/snapshots/create-repository.md b/_api-reference/snapshots/create-repository.md index 34e2ea8376..40a35973e8 100644 --- a/_api-reference/snapshots/create-repository.md +++ b/_api-reference/snapshots/create-repository.md @@ -21,9 +21,9 @@ For instructions on creating a repository, see [Register repository]({{site.url} ## Path and HTTP methods -``` -POST /_snapshot/my-first-repo/ -PUT /_snapshot/my-first-repo/ +```json +POST /_snapshot// +PUT /_snapshot// ``` ## Path parameters diff --git a/_api-reference/snapshots/create-snapshot.md b/_api-reference/snapshots/create-snapshot.md index b35d1a1d0c..45e5a28b55 100644 --- a/_api-reference/snapshots/create-snapshot.md +++ b/_api-reference/snapshots/create-snapshot.md @@ -26,7 +26,7 @@ POST /_snapshot// Parameter | Data type | Description :--- | :--- | :--- -repository | String | Repostory name to contain the snapshot. | +repository | String | Repository name to store the snapshot. | snapshot | String | Name of Snapshot to create. | ## Query parameters @@ -35,7 +35,7 @@ Parameter | Data type | Description :--- | :--- | :--- wait_for_completion | Boolean | Whether to wait for snapshot creation to complete before continuing. If you include this parameter, the snapshot definition is returned after completion. | -## Request fields +## Request body fields The request body is optional. @@ -48,7 +48,7 @@ Field | Data type | Description ## Example requests -##### Request without a body +### Request without a body The following request creates a snapshot called `my-first-snapshot` in an S3 repository called `my-s3-repository`. A request body is not included because it is optional. @@ -57,7 +57,7 @@ POST _snapshot/my-s3-repository/my-first-snapshot ``` {% include copy-curl.html %} -##### Request with a body +### Request with a body You can also add a request body to include or exclude certain indices or specify other settings: @@ -87,7 +87,7 @@ Upon success, the response content depends on whether you include the `wait_for_ To verify that the snapshot was created, use the [Get snapshot]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot) API, passing the snapshot name as the `snapshot` path parameter. {: .note} -##### `wait_for_completion` included +### `wait_for_completion` included The snapshot definition is returned. @@ -125,7 +125,7 @@ The snapshot definition is returned. } ``` -#### Response fields +## Response body fields | Field | Data type | Description | | :--- | :--- | :--- | diff --git a/_api-reference/snapshots/delete-snapshot-repository.md b/_api-reference/snapshots/delete-snapshot-repository.md index 1fadc21207..2649c3c90d 100644 --- a/_api-reference/snapshots/delete-snapshot-repository.md +++ b/_api-reference/snapshots/delete-snapshot-repository.md @@ -9,11 +9,17 @@ nav_order: 3 **Introduced 1.0** {: .label .label-purple } - Deletes a snapshot repository configuration. +Deletes a snapshot repository configuration. - A repository in OpenSearch is simply a configuration that maps a repository name to a type (file system or s3 repository) along with other information depending on the type. The configuration is backed by a file system location or an s3 bucket. When you invoke the API, the physical file system or s3 bucket itself is not deleted. Only the configuration is deleted. +A repository in OpenSearch is simply a configuration that maps a repository name to a type (file system or s3 repository) along with other information depending on the type. 
The configuration is backed by a file system location or an s3 bucket. When you invoke the API, the physical file system or s3 bucket itself is not deleted. Only the configuration is deleted. - To learn more about repositories, see [Register or update snapshot repository]({{site.url}}{{site.baseurl}}/api-reference/snapshots/create-repository). +To learn more about repositories, see [Register or update snapshot repository]({{site.url}}{{site.baseurl}}/api-reference/snapshots/create-repository). + +## Path and HTTP methods + +```json +DELETE _snapshot/ +``` ## Path parameters diff --git a/_api-reference/snapshots/delete-snapshot.md b/_api-reference/snapshots/delete-snapshot.md index d231adf74a..faed3b92d0 100644 --- a/_api-reference/snapshots/delete-snapshot.md +++ b/_api-reference/snapshots/delete-snapshot.md @@ -17,11 +17,17 @@ Deletes a snapshot from a repository. * To view a list of your snapshots, see [cat snapshots]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-snapshots). +## Path and HTTP method + +```json +DELETE _snapshot// +``` + ## Path parameters Parameter | Data type | Description :--- | :--- | :--- -repository | String | Repostory that contains the snapshot. | +repository | String | Repository that contains the snapshot. | snapshot | String | Snapshot to delete. | ## Example request diff --git a/_api-reference/snapshots/get-snapshot-repository.md b/_api-reference/snapshots/get-snapshot-repository.md index 6617106059..1098cd544a 100644 --- a/_api-reference/snapshots/get-snapshot-repository.md +++ b/_api-reference/snapshots/get-snapshot-repository.md @@ -16,6 +16,12 @@ To learn more about repositories, see [Register repository]({{site.url}}{{site.b You can also get details about a snapshot during and after snapshot creation. See [Get snapshot status]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot-status/). {: .note} +## Path and HTTP methods + +```json +GET /_snapshot/ +``` + ## Path parameters | Parameter | Data type | Description | @@ -54,7 +60,7 @@ Upon success, the response returns repositry information. This sample is for an } ```` -## Response fields +## Response body fields | Field | Data type | Description | | :--- | :--- | :--- | diff --git a/_api-reference/snapshots/get-snapshot-status.md b/_api-reference/snapshots/get-snapshot-status.md index c7f919bcb3..8675c23886 100644 --- a/_api-reference/snapshots/get-snapshot-status.md +++ b/_api-reference/snapshots/get-snapshot-status.md @@ -16,14 +16,21 @@ To learn about snapshot creation, see [Create snapshot]({{site.url}}{{site.baseu If you use the Security plugin, you must have the `monitor_snapshot`, `create_snapshot`, or `manage cluster` privileges. {: .note} +## Path and HTTP methods + +```json +GET _snapshot///_status +``` + ## Path parameters Path parameters are optional. | Parameter | Data type | Description | :--- | :--- | :--- -| repository | String | Repository containing the snapshot. | -| snapshot | String | Snapshot to return. | +| repository | String | The repository containing the snapshot. | +| snapshot | List | The snapshot(s) to return. | +| index | List | The indexes to include in the response. | Three request variants provide flexibility: @@ -31,16 +38,23 @@ Three request variants provide flexibility: * `GET _snapshot//_status` returns all currently running snapshots in the specified repository. This is the preferred variant. 
-* `GET _snapshot///_status` returns detailed status information for a specific snapshot in the specified repository, regardless of whether it's currently running or not. +* `GET _snapshot///_status` returns detailed status information for a specific snapshot(s) in the specified repository, regardless of whether it's currently running. + +* `GET /_snapshot////_status` returns detailed status information only for the specified indexes in a specific snapshot in the specified repository. Note that this endpoint works only for indexes belonging to a specific snapshot. + +Snapshot API calls only work if the total number of shards across the requested resources, such as snapshots and indexes created from snapshots, is smaller than the limit specified by the following cluster setting: + +- `snapshot.max_shards_allowed_in_status_api`(Dynamic, integer): The maximum number of shards that can be included in the Snapshot Status API response. Default value is `200000`. Not applicable for [shallow snapshots v2]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability##shallow-snapshot-v2), where the total number and sizes of files are returned as 0. + -Using the API to return state for other than currently running snapshots can be very costly for (1) machine machine resources and (2) processing time if running in the cloud. For each snapshot, each request causes file reads from all a snapshot's shards. +Using the API to return the state of snapshots that are not currently running can be very costly in terms of both machine resources and processing time when querying data in the cloud. For each snapshot, each request causes a file read of all of the snapshot's shards. {: .warning} -## Request fields +## Request body fields | Field | Data type | Description | :--- | :--- | :--- -| ignore_unavailable | Boolean | How to handles requests for unavailable snapshots. If `false`, the request returns an error for unavailable snapshots. If `true`, the request ignores unavailable snapshots, such as those that are corrupted or temporarily cannot be returned. Defaults to `false`.| +| ignore_unavailable | Boolean | How to handle requests for unavailable snapshots and indexes. If `false`, the request returns an error for unavailable snapshots and indexes. If `true`, the request ignores unavailable snapshots and indexes, such as those that are corrupted or temporarily cannot be returned. Default is `false`.| ## Example request @@ -369,31 +383,31 @@ The `GET _snapshot/my-opensearch-repo/my-first-snapshot/_status` request returns } ```` -## Response fields +## Response body fields | Field | Data type | Description | :--- | :--- | :--- | repository | String | Name of repository that contains the snapshot. | | snapshot | String | Snapshot name. | -| uuid | String | Snapshot Universally unique identifier (UUID). | +| uuid | String | A snapshot's universally unique identifier (UUID). | | state | String | Snapshot's current status. See [Snapshot states](#snapshot-states). | | include_global_state | Boolean | Whether the current cluster state is included in the snapshot. | | shards_stats | Object | Snapshot's shard counts. See [Shard stats](#shard-stats). | -| stats | Object | Details of files included in the snapshot. `file_count`: number of files. `size_in_bytes`: total of all fie sizes. See [Snapshot file stats](#snapshot-file-stats). | +| stats | Object | Information about files included in the snapshot. `file_count`: number of files. 
`size_in_bytes`: total size of all files. See [Snapshot file stats](#snapshot-file-stats). | | index | list of Objects | List of objects that contain information about the indices in the snapshot. See [Index objects](#index-objects).| -##### Snapshot states +### Snapshot states | State | Description | :--- | :--- | -| FAILED | The snapshot terminated in an error and no data was stored. | +| FAILED | The snapshot terminated in an error and no data was stored. | | IN_PROGRESS | The snapshot is currently running. | | PARTIAL | The global cluster state was stored, but data from at least one shard was not stored. The `failures` property of the [Create snapshot]({{site.url}}{{site.baseurl}}/api-reference/snapshots/create-snapshot) response contains additional details. | | SUCCESS | The snapshot finished and all shards were stored successfully. | -##### Shard stats +### Shard stats -All property values are Integers. +All property values are integers. | Property | Description | :--- | :--- | @@ -404,7 +418,7 @@ All property values are Integers. | failed | Number of shards that failed to be included in the snapshot. | | total | Total number of shards included in the snapshot. | -##### Snapshot file stats +### Snapshot file stats | Property | Type | Description | :--- | :--- | :--- | @@ -414,10 +428,10 @@ All property values are Integers. | start_time_in_millis | Long | Time (in milliseconds) when snapshot creation began. | | time_in_millis | Long | Total time (in milliseconds) that the snapshot took to complete. | -##### Index objects +### Index objects | Property | Type | Description | :--- | :--- | :--- | | shards_stats | Object | See [Shard stats](#shard-stats). | | stats | Object | See [Snapshot file stats](#snapshot-file-stats). | -| shards | list of Objects | List of objects containing information about the shards that include the snapshot. OpenSearch returns the following properties about the shards.

**stage**: Current state of shards in the snapshot. Shard states are:

* DONE: Number of shards in the snapshot that were successfully stored in the repository.

* FAILURE: Number of shards in the snapshot that were not successfully stored in the repository.

* FINALIZE: Number of shards in the snapshot that are in the finalizing stage of being stored in the repository.

* INIT: Number of shards in the snapshot that are in the initializing stage of being stored in the repository.

* STARTED: Number of shards in the snapshot that are in the started stage of being stored in the repository.

**stats**: See [Snapshot file stats](#snapshot-file-stats).

**total**: Total number and size of files referenced by the snapshot.

**start_time_in_millis**: Time (in milliseconds) when snapshot creation began.

**time_in_millis**: Total time (in milliseconds) that the snapshot took to complete. | +| shards | List of objects | Contains information about the shards included in the snapshot. OpenSearch returns the following properties about the shard:

**stage**: The current state of shards in the snapshot. Shard states are:

* DONE: The number of shards in the snapshot that were successfully stored in the repository.

* FAILURE: The number of shards in the snapshot that were not successfully stored in the repository.

* FINALIZE: The number of shards in the snapshot that are in the finalizing stage of being stored in the repository.

* INIT: The number of shards in the snapshot that are in the initializing stage of being stored in the repository.

* STARTED: The number of shards in the snapshot that are in the started stage of being stored in the repository.

**stats**: See [Snapshot file stats](#snapshot-file-stats).

**total**: The total number and sizes of files referenced by the snapshot.

**start_time_in_millis**: The time (in milliseconds) when snapshot creation began.

**time_in_millis**: The total amount of time (in milliseconds) that the snapshot took to complete. | diff --git a/_api-reference/snapshots/get-snapshot.md b/_api-reference/snapshots/get-snapshot.md index ac55c0370f..148f9e8ff2 100644 --- a/_api-reference/snapshots/get-snapshot.md +++ b/_api-reference/snapshots/get-snapshot.md @@ -11,6 +11,12 @@ nav_order: 6 Retrieves information about a snapshot. +## Path and HTTP methods + +```json +GET _snapshot/// +``` + ## Path parameters | Parameter | Data type | Description | @@ -73,7 +79,7 @@ Upon success, the response returns snapshot information: ] } ```` -## Response fields +## Response body fields | Field | Data type | Description | | :--- | :--- | :--- | diff --git a/_api-reference/snapshots/restore-snapshot.md b/_api-reference/snapshots/restore-snapshot.md index cdb9948c28..b22c371134 100644 --- a/_api-reference/snapshots/restore-snapshot.md +++ b/_api-reference/snapshots/restore-snapshot.md @@ -19,11 +19,17 @@ Restores a snapshot of a cluster or specified data streams and indices. If open indexes with the same name that you want to restore already exist in the cluster, you must close, delete, or rename the indexes. See [Example request](#example-request) for information about renaming an index. See [Close index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/close-index) for information about closing an index. {: .note} +## Path and HTTP methods + +```json +GET _snapshot/// +``` + ## Path parameters | Parameter | Data type | Description | :--- | :--- | :--- -repository | String | Repository containing the snapshot to restore. | +| repository | String | Repository containing the snapshot to restore. | | snapshot | String | Snapshot to restore. | ## Query parameters @@ -32,7 +38,7 @@ Parameter | Data type | Description :--- | :--- | :--- wait_for_completion | Boolean | Whether to wait for snapshot restoration to complete before continuing. | -### Request fields +## Request body fields All request body parameters are optional. @@ -45,8 +51,10 @@ All request body parameters are optional. | index_settings | String | A comma-delimited list of settings to add or change in all restored indices. Use this parameter to override index settings during snapshot restoration. For data streams, these index settings are applied to the restored backing indices. | | indices | String | A comma-delimited list of data streams and indices to restore from the snapshot. Multi-index syntax is supported. By default, a restore operation includes all data streams and indices in the snapshot. If this argument is provided, the restore operation only includes the data streams and indices that you specify. | | partial | Boolean | How the restore operation will behave if indices in the snapshot do not have all primary shards available. If `false`, the entire restore operation fails if any indices in the snapshot do not have all primary shards available.

If `true`, allows the restoration of a partial snapshot of indices with unavailable shards. Only shards that were successfully included in the snapshot are restored. All missing shards are recreated as empty. By default, the entire restore operation fails if one or more indices included in the snapshot do not have all primary shards available. To change this behavior, set `partial` to `true`. Defaults to `false`. | -| rename_pattern | String | The pattern to apply to restored data streams and indices. Data streams and indices matching the rename pattern will be renamed according to `rename_replacement`.

The rename pattern is applied as defined by the regular expression that supports referencing the original text.

The request fails if two or more data streams or indices are renamed into the same name. If you rename a restored data stream, its backing indices are also renamed. For example, if you rename the logs data stream to `recovered-logs`, the backing index `.ds-logs-1` is renamed to `.ds-recovered-logs-1`.

If you rename a restored stream, ensure an index template matches the new stream name. If there are no matching index template names, the stream cannot roll over and new backing indices are not created.| -| rename_replacement | String | The rename replacement string. See `rename_pattern` for more information.| +| rename_pattern | String | The pattern to apply to the restored data streams and indexes. Data streams and indexes matching the rename pattern will be renamed according to the `rename_replacement` setting.

The rename pattern is applied as a regular expression that supports referencing the original text through capture groups.

The request fails if two or more data streams or indexes are renamed to the same name. If you rename a restored data stream, its backing indexes are also renamed. For example, if you rename the `logs` data stream to `recovered-logs`, the backing index `.ds-logs-1` is renamed to `.ds-recovered-logs-1`.

If you rename a restored stream, ensure an index template matches the new stream name. If there are no matching index template names, the stream cannot roll over, and new backing indexes are not created.| +| rename_replacement | String | The rename replacement string.| +| rename_alias_pattern | String | The pattern to apply to the restored aliases. Aliases matching the rename pattern will be renamed according to the `rename_alias_replacement` setting.

The rename pattern is applied as a regular expression that supports referencing the original text through capture groups.

If two or more aliases are renamed to the same name, these aliases will be merged into one.| +| rename_alias_replacement | String | The rename replacement string for aliases.| | source_remote_store_repository | String | The name of the remote store repository of the source index being restored. If not provided, the Snapshot Restore API will use the repository that was registered when the snapshot was created. | wait_for_completion | Boolean | Whether to return a response after the restore operation has completed. If `false`, the request returns a response when the restore operation initializes. If `true`, the request returns a response when the restore operation completes. Defaults to `false`. | @@ -92,7 +100,7 @@ Upon success, the response returns the following JSON object: ```` Except for the snapshot name, all properties are empty or `0`. This is because any changes made to the volume after the snapshot was generated are lost. However, if you invoke the [Get snapshot]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot) API to examine the snapshot, a fully populated snapshot object is returned. -## Response fields +## Response body fields | Field | Data type | Description | | :--- | :--- | :--- | @@ -117,4 +125,4 @@ If open indices in a snapshot already exist in a cluster, and you don't delete, }, "status" : 500 } -```` \ No newline at end of file +```` diff --git a/_api-reference/snapshots/verify-snapshot-repository.md b/_api-reference/snapshots/verify-snapshot-repository.md index e5e6337196..67a006e709 100644 --- a/_api-reference/snapshots/verify-snapshot-repository.md +++ b/_api-reference/snapshots/verify-snapshot-repository.md @@ -17,6 +17,12 @@ If verification is successful, the verify snapshot repository API returns a list If you use the Security plugin, you must have the `manage cluster` privilege. {: .note} +## Path and HTTP methods + +```json +GET _snapshot// +``` + ## Path parameters Path parameters are optional. @@ -70,7 +76,7 @@ In the preceding sample, one node is connected to the snapshot repository. If mo } ```` -## Response fields +## Response body fields | Field | Data type | Description | :--- | :--- | :--- diff --git a/_api-reference/tasks.md b/_api-reference/tasks.md index e4ca0b6049..477e720d22 100644 --- a/_api-reference/tasks.md +++ b/_api-reference/tasks.md @@ -12,9 +12,9 @@ redirect_from: A task is any operation you run in a cluster. For example, searching your data collection of books for a title or author name is a task. When you run OpenSearch, a task is automatically created to monitor your cluster's health and performance. For more information about all of the tasks currently executing in your cluster, you can use the `tasks` API operation. -The following request returns information about all of your tasks: +## Path and HTTP methods -``` +```json GET _tasks ``` {% include copy-curl.html %} @@ -28,6 +28,7 @@ GET _tasks/ Note that if a task finishes running, it won't be returned as part of your request. For an example of a task that takes a little longer to finish, you can run the [`_reindex`]({{site.url}}{{site.baseurl}}/opensearch/reindex-data) API operation on a larger document, and then run `tasks`. +## Query parameters You can also use the following parameters with your query. @@ -42,7 +43,6 @@ Parameter | Data type | Description | `timeout` | Time | An explicit operation timeout. (Default: 30 seconds) `cluster_manager_timeout` | Time | The time to wait for a connection to the primary node. 
(Default: 30 seconds) -For example, this request returns tasks currently running on a node named `opensearch-node1`: ## Example requests diff --git a/_automating-configurations/api/create-workflow.md b/_automating-configurations/api/create-workflow.md index 770c1a1a13..ad9552c3ef 100644 --- a/_automating-configurations/api/create-workflow.md +++ b/_automating-configurations/api/create-workflow.md @@ -16,7 +16,7 @@ Creating a workflow adds the content of a workflow template to the flow framewor To obtain the validation template for workflow steps, call the [Get Workflow Steps API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-steps/). -You can include placeholder expressions in the value of workflow step fields. For example, you can specify a credential field in a template as `openAI_key: '${{ openai_key }}'`. The expression will be substituted with the user-provided value during provisioning, using the format {% raw %}`${{ }}`{% endraw %}. You can pass the actual key as a parameter by using the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) or by using this API with the `provision` parameter set to `true`. +You can include placeholder expressions in the value of workflow step fields. For example, you can specify a credential field in a template as {% raw %}`openAI_key: '${{ openai_key }}'`{% endraw %}. The expression will be substituted with the user-provided value during provisioning, using the format {% raw %}`${{ }}`{% endraw %}. You can pass the actual key as a parameter by using the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) or by using this API with the `provision` parameter set to `true`. Once a workflow is created, provide its `workflow_id` to other APIs. @@ -105,7 +105,7 @@ The following table lists the available query parameters. All query parameters a | `use_case` | String | The name of the [workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/#supported-workflow-templates) to use when creating the workflow. | | User-provided substitution expressions | String | Parameters matching substitution expressions in the template. Only allowed if `provision` is set to `true`. Optional. If `provision` is set to `false`, you can pass these parameters in the [Provision Workflow API query parameters]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/#query-parameters). | -## Request fields +## Request body fields The following table lists the available request fields. diff --git a/_automating-configurations/api/provision-workflow.md b/_automating-configurations/api/provision-workflow.md index 62c4954ee9..cb1fe42789 100644 --- a/_automating-configurations/api/provision-workflow.md +++ b/_automating-configurations/api/provision-workflow.md @@ -30,7 +30,7 @@ The following table lists the available path parameters. ## Query parameters -If you have included a substitution expression in the template, you may pass it as a query parameter or as a string value of a request body field. For example, if you specified a credential field in a template as `openAI_key: '${{ openai_key }}'`, then you can include the `openai_key` parameter as a query parameter or body field so it can be substituted during provisioning. 
For example, the following request provides a query parameter: +If you have included a substitution expression in the template, you may pass it as a query parameter or as a string value of a request body field. For example, if you specified a credential field in a template as {% raw %}`openAI_key: '${{ openai_key }}'`{% endraw %}, then you can include the `openai_key` parameter as a query parameter or body field so it can be substituted during provisioning. For example, the following request provides a query parameter: ```json POST /_plugins/_flow_framework/workflow//_provision?= @@ -47,14 +47,14 @@ POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision ``` {% include copy-curl.html %} -The following request substitutes the expression `${{ openai_key }}` with the value "12345" using a query parameter: +The following request substitutes the expression {% raw %}`${{ openai_key }}`{% endraw %} with the value "12345" using a query parameter: ```json POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision?openai_key=12345 ``` {% include copy-curl.html %} -The following request substitutes the expression `${{ openai_key }}` with the value "12345" using the request body: +The following request substitutes the expression {% raw %}`${{ openai_key }}`{% endraw %} with the value "12345" using the request body: ```json POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision diff --git a/_automating-configurations/workflow-steps.md b/_automating-configurations/workflow-steps.md index 43685a957a..0f61874be6 100644 --- a/_automating-configurations/workflow-steps.md +++ b/_automating-configurations/workflow-steps.md @@ -75,9 +75,11 @@ You can include the following additional fields in the `user_inputs` field if th You can include the following additional fields in the `previous_node_inputs` field when indicated. -|Field |Data type |Description | -|--- |--- |--- | -|`model_id` |String |The `model_id` is used as an input for several steps. As a special case for the Register Agent step type, if an `llm.model_id` field is not present in the `user_inputs` and not present in `previous_node_inputs`, the `model_id` field from the previous node may be used as a backup for the model ID. | +| Field |Data type | Description | +|-----------------|--- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `model_id` |String | The `model_id` is used as an input for several steps. As a special case for the `register_agent` step type, if an `llm.model_id` field is not present in the `user_inputs` and not present in `previous_node_inputs`, then the `model_id` field from the previous node may be used as a backup for the model ID. The `model_id` will also be included in the `parameters` input of the `create_tool` step for the `MLModelTool`. | +| `agent_id` |String | The `agent_id` is used as an input for several steps. The `agent_id` will also be included in the `parameters` input of the `create_tool` step for the `AgentTool`. | +| `connector_id` |String | The `connector_id` is used as an input for several steps. 
The `connector_id` will also be included in the `parameters` input of the `create_tool` step for the `ConnectorTool`. | ## Example workflow steps diff --git a/_benchmark/glossary.md b/_benchmark/glossary.md new file mode 100644 index 0000000000..a1d2335b8c --- /dev/null +++ b/_benchmark/glossary.md @@ -0,0 +1,21 @@ +--- +layout: default +title: Glossary +nav_order: 100 +--- + +# OpenSearch Benchmark glossary + +The following terms are commonly used in OpenSearch Benchmark: + +- **Corpora**: A collection of documents. +- **Latency**: If `target-throughput` is disabled (has no value or a value of `0`), then latency is equal to service time. If `target-throughput` is enabled (has a value of 1 or greater), then latency is equal to the service time plus the amount of time the request waits in the queue before being sent. +- **Metric keys**: The metrics stored by OpenSearch Benchmark, based on the configuration in the [metrics record]({{site.url}}{{site.baseurl}}/benchmark/metrics/metric-records/). +- **Operations**: In workloads, a list of API operations performed by a workload. +- **Pipeline**: A series of steps occurring both before and after running a workload that determines benchmark results. +- **Schedule**: A list of two or more operations performed in the order they appear when a workload is run. +- **Service time**: The amount of time taken for `opensearch-py`, the primary client for OpenSearch Benchmark, to send a request and receive a response from the OpenSearch cluster. It includes the amount of time taken for the server to process a request as well as for network latency, load balancer overhead, and deserialization/serialization. +- **Summary report**: A report generated at the end of a test based on the metric keys defined in the workload. +- **Test**: A single invocation of the OpenSearch Benchmark binary. +- **Throughput**: The number of operations completed in a given period of time. +- **Workload**: A collection of one or more benchmarking tests that use a specific document corpus to perform a benchmark against a cluster. The document corpus contains any indexes, data files, or operations invoked when the workload runs. \ No newline at end of file diff --git a/_benchmark/quickstart.md b/_benchmark/quickstart.md index a6bcd59819..9ab6d25c77 100644 --- a/_benchmark/quickstart.md +++ b/_benchmark/quickstart.md @@ -116,7 +116,7 @@ You can now run your first benchmark. The following benchmark uses the [percolat Benchmarks are run using the [`execute-test`]({{site.url}}{{site.baseurl}}/benchmark/commands/execute-test/) command with the following command flags: -For additional `execute_test` command flags, see the [execute-test]({{site.url}}{{site.baseurl}}/benchmark/commands/execute-test/) reference. Some commonly used options are `--workload-params`, `--exclude-tasks`, and `--include-tasks`. +For additional `execute-test` command flags, see the [execute-test]({{site.url}}{{site.baseurl}}/benchmark/commands/execute-test/) reference. Some commonly used options are `--workload-params`, `--exclude-tasks`, and `--include-tasks`. {: .tip} * `--pipeline=benchmark-only` : Informs OSB that users wants to provide their own OpenSearch cluster. @@ -136,7 +136,7 @@ opensearch-benchmark execute-test --pipeline=benchmark-only --workload=percolato ``` {% include copy.html %} -When the `execute_test` command runs, all tasks and operations in the `percolator` workload run sequentially. +When the `execute-test` command runs, all tasks and operations in the `percolator` workload run sequentially.
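+If you only want to benchmark part of the workload, you can add the task filters mentioned above to the same command. The following is a minimal sketch; the host, credentials, and task names are placeholders and must match your cluster and the tasks defined in the workload:
+
+```bash
+# Run only the listed tasks from the percolator workload (task names are examples)
+opensearch-benchmark execute-test \
+  --pipeline=benchmark-only \
+  --workload=percolator \
+  --target-hosts=https://localhost:9200 \
+  --client-options=basic_auth_user:admin,basic_auth_password:admin \
+  --include-tasks="index-append,default"
+```
+{% include copy.html %}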
### Validating the test diff --git a/_benchmark/reference/commands/aggregate.md b/_benchmark/reference/commands/aggregate.md new file mode 100644 index 0000000000..a891bf3edf --- /dev/null +++ b/_benchmark/reference/commands/aggregate.md @@ -0,0 +1,98 @@ +--- +layout: default +title: aggregate +nav_order: 85 +parent: Command reference +grand_parent: OpenSearch Benchmark Reference +redirect_from: + - /benchmark/commands/aggregate/ +--- + +# aggregate + +The `aggregate` command combines multiple test executions into a single aggregated result, providing a more streamlined way to conduct and analyze multiple test runs. There are two methods of aggregation: + +- [Auto-aggregation](#auto-aggregation) +- [Manual aggregation](#manual-aggregation) + +## Auto-aggregation + +The auto-aggregation method runs multiple iterations of benchmark tests and automatically aggregates the results, all within a single command. You can use the flags outlined in this section with the `execute` command. + +### Usage + +The following example runs the `geonames` workload twice and aggregates the results: + +```bash +opensearch-benchmark execute --test-iterations=2 --aggregate=true --workload=geonames --target-hosts=127.0.0.1:9200 +``` +{% include copy-curl.html %} + +### Auto-aggregation flags + +The following new flags can be used to customize the auto-aggregation method: + +- `--test-iterations`: Specifies the number of times to run the workload (default is `1`). +- `--aggregate`: Determines whether to aggregate the results of multiple test executions (default is `true`). +- `--sleep-timer`: Specifies the number of seconds to sleep before starting the next test execution (default is `5`). +- `--cancel-on-error`: When set, stops executing tests if an error occurs in one of the test iterations (default is `false`). + +## Manual aggregation + +You can use the `aggregate` command to manually aggregate results from multiple test executions. + +### Usage + +To aggregate multiple test executions manually, specify the `test_execution_ids` you would like to aggregate, as shown in the following example: + +```bash +opensearch-benchmark aggregate --test-executions=,,... +``` +{% include copy-curl.html %} + +### Response + +OpenSearch Benchmark responds with the following: + +``` + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + +Aggregate test execution ID: aggregate_results_geonames_9aafcfb8-d3b7-4583-864e-4598b5886c4f + +------------------------------- +[INFO] SUCCESS (took 1 seconds) +------------------------------- +``` + +The results will be aggregated into one test execution and stored under the ID shown in the output. + +### Additional options +- `--test-execution-id`: Define a unique ID for the aggregated test execution. +- `--results-file`: Write the aggregated results to the provided file. +- `--workload-repository`: Define the repository from which OpenSearch Benchmark will load workloads (default is `default`).
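+The options above can be combined into a single command. The following is a minimal sketch; the test execution IDs, aggregate ID, and results file path are placeholders:
+
+```bash
+# Aggregate two completed test executions under a custom ID and write the report to a file
+opensearch-benchmark aggregate \
+  --test-executions=test_execution_id_1,test_execution_id_2 \
+  --test-execution-id=my-aggregated-run \
+  --results-file=/tmp/aggregated-results.md
+```
+{% include copy.html %}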
+ +## Aggregated results + +Aggregated results include the following information: + +- **Relative Standard Deviation (RSD)**: For each metric, an additional `mean_rsd` value shows the spread of results across test executions. +- **Overall min/max values**: Instead of averaging minimum and maximum values, the aggregated results include `overall_min` and `overall_max`, which reflect the true minimum/maximum across all test runs. +- **Storage**: Aggregated test results are stored in a separate `aggregated_results` folder alongside the `test_executions` folder. + +The following example shows aggregated results: + +```json + "throughput": { + "overall_min": 29056.890292903263, + "mean": 50115.8603858536, + "median": 50099.54349684457, + "overall_max": 72255.15946248993, + "unit": "docs/s", + "mean_rsd": 59.426059705973664 + }, +``` diff --git a/_benchmark/reference/commands/command-flags.md b/_benchmark/reference/commands/command-flags.md index 6520f80803..96948e45c7 100644 --- a/_benchmark/reference/commands/command-flags.md +++ b/_benchmark/reference/commands/command-flags.md @@ -328,3 +328,33 @@ Sets what fraction of randomized query values can be repeated. Takes values betw Sets how many distinct repeatable pair values are generated for each operation when randomization is used. Default is `5000`. This setting does not work when `--randomization-enabled` is not used. + + +## test-iterations + + +Specifies the number of times to run the workload. Default is `1`. + + +## aggregate + + +Determines whether OpenSearch Benchmark should aggregate the results of multiple test executions. + +When set to `true`, OpenSearch Benchmark will combine the results from all iterations into a single aggregated report. When set to `false`, results from each iteration will be reported separately. + +Default is `true`. + + +## sleep-timer + + +Specifies the number of seconds to sleep before starting the next test execution. Default is `5`. + + + +## cancel-on-error + + +When set, this flag instructs OpenSearch Benchmark to stop executing tests if an error occurs in one of the test iterations. Default is `false` (not set). + diff --git a/_benchmark/reference/workloads/corpora.md b/_benchmark/reference/workloads/corpora.md index 0e8d408e9a..f59e2b8a6a 100644 --- a/_benchmark/reference/workloads/corpora.md +++ b/_benchmark/reference/workloads/corpora.md @@ -49,7 +49,7 @@ Each entry in the `documents` array consists of the following options. Parameter | Required | Type | Description :--- | :--- | :--- | :--- `source-file` | Yes | String | The file name containing the corresponding documents for the workload. When using OpenSearch Benchmark locally, documents are contained in a JSON file. When providing a `base_url`, use a compressed file format: `.zip`, `.bz2`, `.gz`, `.tar`, `.tar.gz`, `.tgz`, or `.tar.bz2`. The compressed file must have one JSON file containing the name. -`document-count` | Yes | Integer | The number of documents in the `source-file`, which determines which client indexes correlate to which parts of the document corpus. Each N client receives an Nth of the document corpus. When using a source that contains a document with a parent-child relationship, specify the number of parent documents. +`document-count` | Yes | Integer | The number of documents in the `source-file`, which determines which client indexes correlate to which parts of the document corpus. Each N client receives an Nth of the document corpus.
When using a source that contains a document with a parent/child relationship, specify the number of parent documents. `base-url` | No | String | An http(s), Amazon Simple Storage Service (Amazon S3), or Google Cloud Storage URL that points to the root path where OpenSearch Benchmark can obtain the corresponding source file. `source-format` | No | String | Defines the format OpenSearch Benchmark uses to interpret the data file specified in `source-file`. Only `bulk` is supported. `compressed-bytes` | No | Integer | The size, in bytes, of the compressed source file, indicating how much data OpenSearch Benchmark downloads. diff --git a/_benchmark/user-guide/configuring-benchmark.md b/_benchmark/user-guide/install-and-configure/configuring-benchmark.md similarity index 98% rename from _benchmark/user-guide/configuring-benchmark.md rename to _benchmark/user-guide/install-and-configure/configuring-benchmark.md index 2be467d587..59ac13a83c 100644 --- a/_benchmark/user-guide/configuring-benchmark.md +++ b/_benchmark/user-guide/install-and-configure/configuring-benchmark.md @@ -1,10 +1,12 @@ --- layout: default -title: Configuring OpenSearch Benchmark +title: Configuring nav_order: 7 -parent: User guide +grand_parent: User guide +parent: Install and configure redirect_from: - /benchmark/configuring-benchmark/ + - /benchmark/user-guide/configuring-benchmark/ --- # Configuring OpenSearch Benchmark diff --git a/_benchmark/user-guide/install-and-configure/index.md b/_benchmark/user-guide/install-and-configure/index.md new file mode 100644 index 0000000000..c0a48278ad --- /dev/null +++ b/_benchmark/user-guide/install-and-configure/index.md @@ -0,0 +1,12 @@ +--- +layout: default +title: Install and configure +nav_order: 5 +parent: User guide +has_children: true +--- + +# Installing and configuring OpenSearch Benchmark + +This section details how to install and configure OpenSearch Benchmark. + diff --git a/_benchmark/user-guide/installing-benchmark.md b/_benchmark/user-guide/install-and-configure/installing-benchmark.md similarity index 98% rename from _benchmark/user-guide/installing-benchmark.md rename to _benchmark/user-guide/install-and-configure/installing-benchmark.md index 8383cfb2f9..1dd30f9180 100644 --- a/_benchmark/user-guide/installing-benchmark.md +++ b/_benchmark/user-guide/install-and-configure/installing-benchmark.md @@ -1,10 +1,12 @@ --- layout: default -title: Installing OpenSearch Benchmark +title: Installing nav_order: 5 -parent: User guide +grand_parent: User guide +parent: Install and configure redirect_from: - /benchmark/installing-benchmark/ + - /benchmark/user-guide/installing-benchmark/ --- # Installing OpenSearch Benchmark diff --git a/_benchmark/user-guide/distributed-load.md b/_benchmark/user-guide/optimizing-benchmarks/distributed-load.md similarity index 96% rename from _benchmark/user-guide/distributed-load.md rename to _benchmark/user-guide/optimizing-benchmarks/distributed-load.md index 60fc98500f..9729fe4362 100644 --- a/_benchmark/user-guide/distributed-load.md +++ b/_benchmark/user-guide/optimizing-benchmarks/distributed-load.md @@ -2,12 +2,12 @@ layout: default title: Running distributed loads nav_order: 15 -parent: User guide +parent: Optimizing benchmarks +grand_parent: User guide --- # Running distributed loads - OpenSearch Benchmark loads always run on the same machine on which the benchmark was started. However, you can use multiple load drivers to generate additional benchmark testing loads, particularly for large clusters on multiple machines. 
This tutorial describes how to distribute benchmark loads across multiple machines in a single cluster. ## System architecture @@ -64,7 +64,7 @@ With OpenSearch Benchmark running on all three nodes and the worker nodes set to On **Node 1**, run a benchmark test with the `worker-ips` set to the IP addresses for your worker nodes, as shown in the following example: ``` -opensearch-benchmark execute_test --pipeline=benchmark-only --workload=eventdata --worker-ips=198.52.100.0,198.53.100.0 --target-hosts= --client-options= --kill-running-processes +opensearch-benchmark execute-test --pipeline=benchmark-only --workload=eventdata --worker-ips=198.52.100.0,198.53.100.0 --target-hosts= --client-options= --kill-running-processes ``` After the test completes, the logs generated by the test appear on your worker nodes. diff --git a/_benchmark/user-guide/optimizing-benchmarks/index.md b/_benchmark/user-guide/optimizing-benchmarks/index.md new file mode 100644 index 0000000000..0ea6c1978e --- /dev/null +++ b/_benchmark/user-guide/optimizing-benchmarks/index.md @@ -0,0 +1,11 @@ +--- +layout: default +title: Optimizing benchmarks +nav_order: 25 +parent: User guide +has_children: true +--- + +# Optimizing benchmarks + +This section details different ways you can optimize the benchmark tools for your cluster. \ No newline at end of file diff --git a/_benchmark/user-guide/target-throughput.md b/_benchmark/user-guide/optimizing-benchmarks/target-throughput.md similarity index 79% rename from _benchmark/user-guide/target-throughput.md rename to _benchmark/user-guide/optimizing-benchmarks/target-throughput.md index 63832de595..b6c55f96c5 100644 --- a/_benchmark/user-guide/target-throughput.md +++ b/_benchmark/user-guide/optimizing-benchmarks/target-throughput.md @@ -2,7 +2,10 @@ layout: default title: Target throughput nav_order: 150 -parent: User guide +parent: Optimizing benchmarks +grand_parent: User guide +redirect_from: + - /benchmark/user-guide/target-throughput/ --- # Target throughput @@ -16,13 +19,15 @@ OpenSearch Benchmark has two testing modes, both of which are related to through ## Benchmarking mode -When you do not specify a `target-throughput`, OpenSearch Benchmark latency tests are performed in *benchmarking mode*. In this mode, the OpenSearch client sends requests to the OpenSearch cluster as fast as possible. After the cluster receives a response from the previous request, OpenSearch Benchmark immediately sends the next request to the OpenSearch client. In this testing mode, latency is identical to service time. +When `target-throughput` is set to `0`, OpenSearch Benchmark latency tests are performed in *benchmarking mode*. In this mode, the OpenSearch client sends requests to the OpenSearch cluster as fast as possible. After the cluster receives a response from the previous request, OpenSearch Benchmark immediately sends the next request to the OpenSearch client. In this testing mode, latency is identical to service time. + +OpenSearch Benchmark issues one request at a time per a single client. The number of clients is set by the `search-clients` setting in the workload parameters. ## Throughput-throttled mode -**Throughput** measures the rate at which OpenSearch Benchmark issues requests, assuming that responses will be returned instantaneously. However, users can set a `target-throughput`, which is a common workload parameter that can be set for each test and is measured in operations per second. 
+If the `target-throughput` is not set to `0`, then OpenSearch Benchmark issues the next request in accordance with the `target-throughput`, assuming that responses are returned instantaneously. -OpenSearch Benchmark issues one request at a time for a single-client thread, which is specified as `search-clients` in the workload parameters. If `target-throughput` is set to `0`, then OpenSearch Benchmark issues a request immediately after it receives the response from the previous request. If the `target-throughput` is not set to `0`, then OpenSearch Benchmark issues the next request in accordance with the `target-throughput`, assuming that responses are returned instantaneously. +**Throughput** measures the rate at which OpenSearch Benchmark issues requests, assuming that responses are returned instantaneously. To configure the request rate, you can set the `target-throughput` workload parameter to the desired number of operations per second for each test. When you want to simulate the type of traffic you might encounter when deploying a production cluster, set the `target-throughput` in your benchmark test to match the number of requests you estimate that the production cluster might receive. The following examples show how the `target-throughput` setting affects the latency measurement. diff --git a/_benchmark/user-guide/telemetry.md deleted file mode 100644 index d4c40c790a..0000000000 --- a/_benchmark/user-guide/telemetry.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -layout: default -title: Enabling telemetry devices -nav_order: 30 -parent: User guide ---- - -Telemetry results will not appear in the summary report. To visualize telemetry results, ingest the data into OpenSearch and visualize the data in OpenSearch Dashboards. \ No newline at end of file diff --git a/_benchmark/user-guide/understanding-results/index.md b/_benchmark/user-guide/understanding-results/index.md new file mode 100644 index 0000000000..2122aa0e2e --- /dev/null +++ b/_benchmark/user-guide/understanding-results/index.md @@ -0,0 +1,12 @@ +--- +layout: default +title: Understanding results +nav_order: 20 +parent: User guide +has_children: true +--- + +After [running a workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/working-with-workloads/running-workloads/), OpenSearch Benchmark produces a series of metrics. The following pages detail: + +- [How metrics are reported]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-results/summary-reports/) +- [How to visualize metrics]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-results/telemetry/) \ No newline at end of file diff --git a/_benchmark/user-guide/understanding-results.md b/_benchmark/user-guide/understanding-results/summary-reports.md similarity index 98% rename from _benchmark/user-guide/understanding-results.md rename to _benchmark/user-guide/understanding-results/summary-reports.md index 5b8935a8c7..28578c8c89 100644 --- a/_benchmark/user-guide/understanding-results.md +++ b/_benchmark/user-guide/understanding-results/summary-reports.md @@ -1,10 +1,14 @@ --- layout: default -title: Understanding benchmark results +title: Summary reports nav_order: 22 -parent: User guide +grand_parent: User guide +parent: Understanding results +redirect_from: + - /benchmark/user-guide/understanding-results/ --- +# Understanding the summary report At the end of each test run, OpenSearch Benchmark creates a summary of test result metrics like service time, throughput, latency, and more.
These metrics provide insights into how the selected workload performed on a benchmarked OpenSearch cluster. diff --git a/_benchmark/user-guide/understanding-results/telemetry.md b/_benchmark/user-guide/understanding-results/telemetry.md new file mode 100644 index 0000000000..3548dd4456 --- /dev/null +++ b/_benchmark/user-guide/understanding-results/telemetry.md @@ -0,0 +1,21 @@ +--- +layout: default +title: Enabling telemetry devices +nav_order: 30 +grand_parent: User guide +parent: Understanding results +redirect_from: + - /benchmark/user-guide/telemetry +--- + +# Enabling telemetry devices + +Telemetry results will not appear in the summary report. To visualize telemetry results, ingest the data into OpenSearch and visualize the data in OpenSearch Dashboards. + +To view a list of the available telemetry devices, use the command `opensearch-benchmark list telemetry`. After you've selected a [supported telemetry device]({{site.url}}{{site.baseurl}}/benchmark/reference/telemetry/), you can activate the device when running a test with the `--telemetry` command flag. For example, if you want to use the `jfr` device with the `geonames` workload, enter the following command: + +```bash +opensearch-benchmark execute-test --workload=geonames --telemetry=jfr +``` +{% include copy-curl.html %} + diff --git a/_benchmark/user-guide/understanding-workloads/anatomy-of-a-workload.md b/_benchmark/user-guide/understanding-workloads/anatomy-of-a-workload.md index 3bf339e4d5..f8e1d90d32 100644 --- a/_benchmark/user-guide/understanding-workloads/anatomy-of-a-workload.md +++ b/_benchmark/user-guide/understanding-workloads/anatomy-of-a-workload.md @@ -98,7 +98,7 @@ To create an index, specify its `name`. To add definitions to your index, use th The `corpora` element requires the name of the index containing the document corpus, for example, `movies`, and a list of parameters that define the document corpora. This list includes the following parameters: - `source-file`: The file name that contains the workload's corresponding documents. When using OpenSearch Benchmark locally, documents are contained in a JSON file. When providing a `base_url`, use a compressed file format: `.zip`, `.bz2`, `.zst`, `.gz`, `.tar`, `.tar.gz`, `.tgz`, or `.tar.bz2`. The compressed file must include one JSON file containing the name. -- `document-count`: The number of documents in the `source-file`, which determines which client indexes correlate to which parts of the document corpus. Each N client is assigned an Nth of the document corpus to ingest into the test cluster. When using a source that contains a document with a parent-child relationship, specify the number of parent documents. +- `document-count`: The number of documents in the `source-file`, which determines which client indexes correlate to which parts of the document corpus. Each N client is assigned an Nth of the document corpus to ingest into the test cluster. When using a source that contains a document with a parent/child relationship, specify the number of parent documents. - `uncompressed-bytes`: The size, in bytes, of the source file after decompression, indicating how much disk space the decompressed source file needs. - `compressed-bytes`: The size, in bytes, of the source file before decompression. This can help you assess the amount of time needed for the cluster to ingest documents.
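+As a quick illustration of how these parameters fit together, the following is a hypothetical `corpora` entry for a `movies` index; the file name and all counts are placeholder values:
+
+```json
+"corpora": [
+  {
+    "name": "movies",
+    "documents": [
+      {
+        "source-file": "movies-documents.json",
+        "document-count": 11658903,
+        "compressed-bytes": 1544307234,
+        "uncompressed-bytes": 23803283123
+      }
+    ]
+  }
+]
+```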
diff --git a/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md b/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md index 6016caee0a..ae973a7c62 100644 --- a/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md +++ b/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md @@ -22,8 +22,8 @@ Consider the following criteria when deciding which workload would work best for ## General search clusters -For benchmarking clusters built for general search use cases, start with the `[nyc_taxis]`(https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/nyc_taxis) workload. This workload contains data about the rides taken in yellow taxis in New York City in 2015. +For benchmarking clusters built for general search use cases, start with the [nyc_taxis](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/nyc_taxis) workload. This workload contains data about the rides taken in yellow taxis in New York City in 2015. ## Log data -For benchmarking clusters built for indexing and search with log data, use the [`http_logs`](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/http_logs) workload. This workload contains data about the 1998 World Cup. \ No newline at end of file +For benchmarking clusters built for indexing and search with log data, use the [http_logs](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/http_logs) workload. This workload contains data about the 1998 World Cup. diff --git a/_benchmark/user-guide/understanding-workloads/index.md b/_benchmark/user-guide/understanding-workloads/index.md index 844b565185..6e6d2aa9c1 100644 --- a/_benchmark/user-guide/understanding-workloads/index.md +++ b/_benchmark/user-guide/understanding-workloads/index.md @@ -1,7 +1,7 @@ --- layout: default title: Understanding workloads -nav_order: 7 +nav_order: 10 parent: User guide has_children: true --- diff --git a/_benchmark/user-guide/contributing-workloads.md b/_benchmark/user-guide/working-with-workloads/contributing-workloads.md similarity index 97% rename from _benchmark/user-guide/contributing-workloads.md rename to _benchmark/user-guide/working-with-workloads/contributing-workloads.md index e60f60eaed..74524f36cb 100644 --- a/_benchmark/user-guide/contributing-workloads.md +++ b/_benchmark/user-guide/working-with-workloads/contributing-workloads.md @@ -2,7 +2,10 @@ layout: default title: Sharing custom workloads nav_order: 11 -parent: User guide +grand_parent: User guide +parent: Working with workloads +redirect_from: + - /benchmark/user-guide/contributing-workloads/ --- # Sharing custom workloads diff --git a/_benchmark/user-guide/creating-custom-workloads.md b/_benchmark/user-guide/working-with-workloads/creating-custom-workloads.md similarity index 99% rename from _benchmark/user-guide/creating-custom-workloads.md rename to _benchmark/user-guide/working-with-workloads/creating-custom-workloads.md index ee0dca1ce9..a239c94249 100644 --- a/_benchmark/user-guide/creating-custom-workloads.md +++ b/_benchmark/user-guide/working-with-workloads/creating-custom-workloads.md @@ -2,7 +2,8 @@ layout: default title: Creating custom workloads nav_order: 10 -parent: User guide +grand_parent: User guide +parent: Working with workloads redirect_from: - /benchmark/user-guide/creating-custom-workloads/ - /benchmark/creating-custom-workloads/ @@ -263,7 +264,7 @@ opensearch-benchmark list workloads --workload-path= Use the `opensearch-benchmark 
execute-test` command to invoke your new workload and run a benchmark test against your OpenSearch cluster, as shown in the following example. Replace `--workload-path` with the path to your custom workload, `--target-host` with the `host:port` pairs for your cluster, and `--client-options` with any authorization options required to access the cluster. ``` -opensearch-benchmark execute_test \ +opensearch-benchmark execute-test \ --pipeline="benchmark-only" \ --workload-path="" \ --target-host="" \ @@ -289,7 +290,7 @@ head -n 1000 -documents.json > -documents-1k.json Then, run `opensearch-benchmark execute-test` with the option `--test-mode`. Test mode runs a quick version of the workload test. ``` -opensearch-benchmark execute_test \ +opensearch-benchmark execute-test \ --pipeline="benchmark-only" \ --workload-path="" \ --target-host="" \ diff --git a/_benchmark/user-guide/finetine-workloads.md b/_benchmark/user-guide/working-with-workloads/finetune-workloads.md similarity index 97% rename from _benchmark/user-guide/finetine-workloads.md rename to _benchmark/user-guide/working-with-workloads/finetune-workloads.md index 4fc0a284db..d150247ad8 100644 --- a/_benchmark/user-guide/finetine-workloads.md +++ b/_benchmark/user-guide/working-with-workloads/finetune-workloads.md @@ -2,7 +2,10 @@ layout: default title: Fine-tuning custom workloads nav_order: 12 -parent: User guide +grand_parent: User guide +parent: Working with workloads +redirect_from: + - /benchmark/user-guide/finetine-workloads/ --- # Fine-tuning custom workloads diff --git a/_benchmark/user-guide/working-with-workloads/index.md b/_benchmark/user-guide/working-with-workloads/index.md new file mode 100644 index 0000000000..a6acb86b4b --- /dev/null +++ b/_benchmark/user-guide/working-with-workloads/index.md @@ -0,0 +1,16 @@ +--- +layout: default +title: Working with workloads +nav_order: 15 +parent: User guide +has_children: true +--- + +# Working with workloads + +Once you [understand workloads]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/index/) and have [chosen a workload]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/choosing-a-workload/) to run your benchmarks with, you can begin working with workloads. + +- [Running workloads]({{site.url}}{{site.baseurl}}/benchmark/user-guide/working-with-workloads/running-workloads/): Learn how to run an OpenSearch Benchmark workload. +- [Creating custom workloads]({{site.url}}{{site.baseurl}}/benchmark/user-guide/working-with-workloads/creating-custom-workloads/): Create a custom workload with your own datasets. +- [Fine-tuning workloads]({{site.url}}{{site.baseurl}}/benchmark/user-guide/working-with-workloads/finetune-workloads/): Fine-tune your custom workload according to the needs of your cluster. +- [Contributing workloads]({{site.url}}{{site.baseurl}}/benchmark/user-guide/working-with-workloads/contributing-workloads/): Contribute your custom workload for the OpenSearch community to use. 
\ No newline at end of file diff --git a/_benchmark/user-guide/running-workloads.md b/_benchmark/user-guide/working-with-workloads/running-workloads.md similarity index 99% rename from _benchmark/user-guide/running-workloads.md rename to _benchmark/user-guide/working-with-workloads/running-workloads.md index 36108eb9c8..534d61f6b9 100644 --- a/_benchmark/user-guide/running-workloads.md +++ b/_benchmark/user-guide/working-with-workloads/running-workloads.md @@ -2,7 +2,10 @@ layout: default title: Running a workload nav_order: 9 -parent: User guide +grand_parent: User guide +parent: Working with workloads +redirect_from: + - /benchmark/user-guide/running-workloads/ --- # Running a workload diff --git a/_clients/index.md b/_clients/index.md index fc8c23d912..a15f0539d2 100644 --- a/_clients/index.md +++ b/_clients/index.md @@ -53,8 +53,8 @@ To view the compatibility matrix for a specific client, see the `COMPATIBILITY.m Client | Recommended version :--- | :--- -[Elasticsearch Java low-level REST client](https://search.maven.org/artifact/org.elasticsearch.client/elasticsearch-rest-client/7.13.4/jar) | 7.13.4 -[Elasticsearch Java high-level REST client](https://search.maven.org/artifact/org.elasticsearch.client/elasticsearch-rest-high-level-client/7.13.4/jar) | 7.13.4 +[Elasticsearch Java low-level REST client](https://central.sonatype.com/artifact/org.elasticsearch.client/elasticsearch-rest-client/7.13.4) | 7.13.4 +[Elasticsearch Java high-level REST client](https://central.sonatype.com/artifact/org.elasticsearch.client/elasticsearch-rest-high-level-client/7.13.4) | 7.13.4 [Elasticsearch Python client](https://pypi.org/project/elasticsearch/7.13.4/) | 7.13.4 [Elasticsearch Node.js client](https://www.npmjs.com/package/@elastic/elasticsearch/v/7.13.0) | 7.13.0 [Elasticsearch Ruby client](https://rubygems.org/gems/elasticsearch/versions/7.13.0) | 7.13.0 diff --git a/_config.yml b/_config.yml index 8a43e2f61a..3c6f737cc8 100644 --- a/_config.yml +++ b/_config.yml @@ -5,10 +5,10 @@ baseurl: "/docs/latest" # the subpath of your site, e.g. /blog url: "https://opensearch.org" # the base hostname & protocol for your site, e.g. 
http://example.com permalink: /:path/ -opensearch_version: '2.16.0' -opensearch_dashboards_version: '2.16.0' -opensearch_major_minor_version: '2.16' -lucene_version: '9_11_1' +opensearch_version: '2.18.0' +opensearch_dashboards_version: '2.18.0' +opensearch_major_minor_version: '2.18' +lucene_version: '9_12_0' # Build settings markdown: kramdown @@ -31,9 +31,6 @@ collections: install-and-configure: permalink: /:collection/:path/ output: true - upgrade-to: - permalink: /:collection/:path/ - output: true im-plugin: permalink: /:collection/:path/ output: true @@ -94,6 +91,9 @@ collections: data-prepper: permalink: /:collection/:path/ output: true + migration-assistant: + permalink: /:collection/:path/ + output: true tools: permalink: /:collection/:path/ output: true @@ -121,6 +121,9 @@ collections: getting-started: permalink: /:collection/:path/ output: true + workspace: + permalink: /:collection/:path/ + output: true opensearch_collection: # Define the collections used in the theme @@ -134,11 +137,6 @@ opensearch_collection: install-and-configure: name: Install and upgrade nav_fold: true - upgrade-to: - name: Migrate to OpenSearch - # nav_exclude: true - nav_fold: true - # search_exclude: true im-plugin: name: Managing Indexes nav_fold: true @@ -210,6 +208,12 @@ clients_collection: name: Clients nav_fold: true +migration_assistant_collection: + collections: + migration-assistant: + name: Migration Assistant + nav_fold: true + benchmark_collection: collections: benchmark: @@ -249,6 +253,12 @@ defaults: values: section: "benchmark" section-name: "Benchmark" + - + scope: + path: "_migration-assistant" + values: + section: "migration-assistant" + section-name: "Migration Assistant" # Enable or disable the site search # By default, just-the-docs enables its JSON file-based search. We also have an OpenSearch-driven search functionality. @@ -308,6 +318,7 @@ plugins: - jekyll-remote-theme - jekyll-redirect-from - jekyll-sitemap + - jekyll-spec-insert # This format has to conform to RFC822 last-modified-at: @@ -317,6 +328,8 @@ last-modified-at: # The following items will not be processed, by default. Create a custom list # to override the default setting. 
exclude: + - README.md + - DEVELOPER_GUIDE.md - Gemfile - Gemfile.lock - node_modules @@ -324,6 +337,12 @@ exclude: - vendor/cache/ - vendor/gems/ - vendor/ruby/ - - README.md - - .idea - - templates + - templates/ + - .sass-cache/ + - .jekyll-cache/ + - .idea/ + - .github/ + - .bundle/ + - _site/ + - spec-insert + - release-notes \ No newline at end of file diff --git a/_dashboards/csp/csp-dynamic-configuration.md b/_dashboards/csp/csp-dynamic-configuration.md index abe80a60c7..794323472b 100644 --- a/_dashboards/csp/csp-dynamic-configuration.md +++ b/_dashboards/csp/csp-dynamic-configuration.md @@ -1,11 +1,11 @@ --- layout: default -title: Configuring CSP rules for `frame-ancestors` +title: Configuring CSP rules for frame ancestors nav_order: 140 has_children: false --- -# Configuring CSP rules for `frame-ancestors` +# Configuring CSP rules for frame ancestors Introduced 2.13 {: .label .label-purple } diff --git a/_dashboards/dashboards-assistant/alert-insight.md b/_dashboards/dashboards-assistant/alert-insight.md new file mode 100644 index 0000000000..603e5aba44 --- /dev/null +++ b/_dashboards/dashboards-assistant/alert-insight.md @@ -0,0 +1,321 @@ +--- +layout: default +title: Alert insights +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Alert insights + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion in the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant alert insights feature helps generate alert summaries and provides log patterns based on the logs that triggered the alert. + +## Configuring alert insights + +To configure alert insights, use the following steps. + +### Prerequisite + +Before using alert insights, you must have the `alerting` and `alerting-dashboards` plugins installed on your cluster. By default, these plugins are installed as part of standard OpenSearch distributions. For more information, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). + +### Step 1: Enable alert insights + +To enable alert insights, configure the following `opensearch_dashboards.yml` setting: + +```yaml +assistant.alertInsight.enabled: true +``` +{% include copy.html %} + +### Step 2: Create the agents + +To orchestrate alert insights, you'll need to create the necessary [agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). Create a workflow template for creating all necessary agents by sending the following request: +
+ + Request + + {: .text-delta} + +```json +POST /_plugins/_flow_framework/workflow?provision=true +{ + "name": "Alert Summary Agent", + "description": "Create Alert Summary Agent using Claude on BedRock", + "use_case": "REGISTER_AGENT", + "version": { + "template": "1.0.0", + "compatibility": ["2.17.0", "3.0.0"] + }, + "workflows": { + "provision": { + "user_params": {}, + "nodes": [ + { + "id": "create_claude_connector", + "type": "create_connector", + "previous_node_inputs": {}, + "user_inputs": { + "version": "1", + "name": "Claude instant runtime Connector", + "protocol": "aws_sigv4", + "description": "The connector to BedRock service for Claude model", + "actions": [ + { + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "method": "POST", + "request_body": "{\"prompt\":\"\\n\\nHuman: ${parameters.prompt}\\n\\nAssistant:\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", + "action_type": "predict", + "url": "https://bedrock-runtime.us-west-2.amazonaws.com/model/anthropic.claude-instant-v1/invoke" + } + ], + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "parameters": { + "region": "us-west-2", + "endpoint": "bedrock-runtime.us-west-2.amazonaws.com", + "content_type": "application/json", + "auth": "Sig_V4", + "max_tokens_to_sample": "8000", + "service_name": "bedrock", + "temperature": "0.0001", + "response_filter": "$.completion", + "anthropic_version": "bedrock-2023-05-31" + } + } + }, + { + "id": "register_claude_model", + "type": "register_remote_model", + "previous_node_inputs": { + "create_claude_connector": "connector_id" + }, + "user_inputs": { + "description": "Claude model", + "deploy": true, + "name": "claude-instant" + } + }, + { + "id": "create_alert_summary_ml_model_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "parameters": { + "prompt": "You are an OpenSearch Alert Assistant to help summarize the alerts.\n Here is the detail of alert: ${parameters.context};\n The question is: ${parameters.question}." + }, + "name": "MLModelTool", + "type": "MLModelTool" + } + }, + { + "id": "create_alert_summary_agent", + "type": "register_agent", + "previous_node_inputs": { + "create_alert_summary_ml_model_tool": "tools" + }, + "user_inputs": { + "parameters": {}, + "type": "flow", + "name": "Alert Summary Agent", + "description": "this is an alert summary agent" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +
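+Provisioning runs asynchronously, so the agents may take a moment to appear. To confirm that provisioning succeeded and to retrieve the IDs of the created resources, you can check the workflow status; the workflow ID shown here is a placeholder:
+
+```json
+GET /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_status
+```
+{% include copy-curl.html %}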
+ +For sample agent templates, see [Flow Framework sample templates](https://github.com/opensearch-project/flow-framework/tree/2.x/sample-templates). Note the agent ID; you'll use it in the following step. + +For this example, use the templates to create the following agents: +- An alert insights agent, see [flow template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/create-knowledge-base-alert-agent.json) +- Two summary agents: + - A basic alert summary agent, see [flow template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/alert-summary-agent-claude-tested.json) + - An agent for an alert summary that includes log patterns, see [flow template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/alert-summary-log-pattern-agent.json) + + These agents require different prompts. The prompt for the log patterns summary must include a placeholder `${parameters.topNLogPatternData}` and additional instructions to guide the LLM on using this information effectively. Note that log patterns are available only for query monitors created using OpenSearch Dashboards. + +### Step 3: Create the root agents + +Next, create [root agents]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-tutorial/#root_agent) for agents created in the previous step. + +Create a root agent for the alert summary agent: + +```json +POST /.plugins-ml-config/_doc/os_summary +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +Create a root agent for the alert summary with log patterns agent: + +```json +POST /.plugins-ml-config/_doc/os_summary_with_log_pattern +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +Create a root agent for the alert insights agent: + +```json +POST /.plugins-ml-config/_doc/os_insight +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +The created `os_insight` agent provides alert insights related to OpenSearch cluster metrics. For insights about alerts unrelated to OpenSearch cluster metrics, you need to register an agent with [this template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/create-knowledge-base-alert-agent.json) and change the agent name to `KB_For_Alert_Insight`. +{: .note} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agents + +You can verify that the agents were created successfully by calling the agents with an example payload. 
+ +To test the alert summary agent, send the following request: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "question": "Please summarize this alert, do not use any tool.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + } +} +``` +{% include copy-curl.html %} + +To test the alert summary with log patterns agent, send the following request: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "question": "Please summarize this alert, do not use any tool.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n 
{\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + "topNLogPatternData": "[[539,["[Sun Dec 04 07:12:44 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 06:19:18 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:59:47 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:11:22 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:31:12 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 05:04:04 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 20:24:49 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 06:16:23 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 
20:47:17 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:30:43 2005] [error] mod_jk child workerEnv in error state 6","[Mon Dec 05 06:35:27 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:07:30 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 16:32:56 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 8"],"[ :: ] [] _ "],[32,["[Sun Dec 04 14:29:00 2005] [error] [client 4.245.93.87] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 08:54:17 2005] [error] [client 147.31.138.75] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:34:57 2005] [error] [client 61.138.216.82] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 07:45:45 2005] [error] [client 63.13.186.196] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 10:53:30 2005] [error] [client 218.76.139.20] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:48:48 2005] [error] [client 67.166.248.235] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:18:36 2005] [error] [client 67.154.58.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 01:30:32 2005] [error] [client 211.62.201.48] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 16:45:04 2005] [error] [client 216.216.185.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 17:31:39 2005] [error] [client 218.75.106.250] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:00:56 2005] [error] [client 68.228.3.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:14:09 2005] [error] [client 61.220.139.68] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 09:35:12 2005] [error] [client 207.203.80.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:28:44 2005] [error] [client 198.232.168.9] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 16:24:05 2005] [error] [client 58.225.62.140] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:53:43 2005] [error] [client 218.39.132.175] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 12:33:13 2005] [error] [client 208.51.151.210] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:59:01 2005] [error] [client 24.83.37.136] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 11:42:43 2005] [error] [client 216.127.124.16] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 05:15:09 2005] [error] [client 222.166.160.184] Directory index forbidden by rule: /var/www/html/"],"[ :: ] [] [ ...] 
: ////"],[12,["[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:16 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2"],"[ :: ] [] _ -"]]" + } +} +``` +{% include copy-curl.html %} + +To test the alert insights agent, send the following request: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "question": "Please provide your insight on this alerts.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL 
{\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + "summary": + } +} +``` +{% include copy-curl.html %} + +## Generating an alert summary + +You can generate an alert summary by calling the `/api/assistant/summary` API endpoint. To generate an alert summary, the fields `index`, `dsl`, and `topNLogPatternData` are optional. If all three fields are provided, the agent will provide a summary with log pattern analysis; otherwise, it will provide a general summary: + +```json +POST /api/assistant/summary +{ + "summaryType": "alerts", + "question": "Please summarize this alert, do not use any tool.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} 
\n", + "index": "loghub-apache-new", + "dsl": "{\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}", + "topNLogPatternData": "[[539,["[Sun Dec 04 07:12:44 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 06:19:18 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:59:47 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:11:22 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:31:12 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 05:04:04 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 20:24:49 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 06:16:23 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 20:47:17 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:30:43 2005] [error] mod_jk child workerEnv in error state 6","[Mon Dec 05 06:35:27 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:07:30 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 16:32:56 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 8"],"[ :: ] [] _ "],[32,["[Sun Dec 04 14:29:00 2005] [error] [client 4.245.93.87] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 08:54:17 2005] [error] [client 147.31.138.75] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:34:57 2005] [error] [client 61.138.216.82] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 07:45:45 2005] [error] [client 63.13.186.196] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 10:53:30 2005] [error] [client 218.76.139.20] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:48:48 2005] [error] [client 67.166.248.235] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:18:36 2005] [error] [client 67.154.58.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 01:30:32 2005] [error] [client 211.62.201.48] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 16:45:04 2005] [error] [client 216.216.185.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 17:31:39 2005] [error] [client 218.75.106.250] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:00:56 2005] [error] [client 68.228.3.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:14:09 2005] [error] [client 61.220.139.68] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 09:35:12 2005] [error] [client 207.203.80.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:28:44 2005] [error] [client 198.232.168.9] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 16:24:05 2005] [error] [client 58.225.62.140] 
Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:53:43 2005] [error] [client 218.39.132.175] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 12:33:13 2005] [error] [client 208.51.151.210] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:59:01 2005] [error] [client 24.83.37.136] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 11:42:43 2005] [error] [client 216.127.124.16] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 05:15:09 2005] [error] [client 222.166.160.184] Directory index forbidden by rule: /var/www/html/"],"[ :: ] [] [ ...] : ////"],[12,["[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:16 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2"],"[ :: ] [] _ -"]]" +} +``` +{% include copy-curl.html %} + +The following table describes the Assistant Summary API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`summaryType` | Required | Specifies the type of application calling this API. Use `alerts` for alert insights. +`question` | Required | Specifies the user's question regarding alert insights. Default is `Please summarize this alert, do not use any tool.` +`context` | Required | Provides context for the alert, including the alert monitor definition, active alerts, and trigger values. +`index` | Optional | The index that the alert monitors. If this parameter is not provided, log pattern analysis is not returned. +`dsl` | Optional | The DSL query for alert monitoring. If this parameter is not provided, log pattern analysis is not returned. +`topNLogPatternData` | Optional | Log patterns for the alert trigger data. If this parameter is not provided, log pattern analysis is not returned. + +## Generating alert insights + +You can generate alert insights by calling the `/api/assistant/insight` API endpoint. 
To generate alert insights, all of the following parameters are required: + +```json +POST /api/assistant/insight +{ + "summaryType": "alerts", + "insightType": "user_insight" + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + "question": "Please provide your insight on this alerts.", + "summary": +} +``` +{% include copy-curl.html %} + +The following table describes the Assistant Insight API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`summaryType` | Required | Specifies the type of application calling this API. Use `alerts` for alert insights. +`insightType` | Required | Defines the alert type. Use `os_insight` for cluster metrics alerts and `user_insight` for other alert types. +`question` | Required | Specifies the user's question regarding alert insights. 
Default is `Please provide your insight on this alerts.` +`context` | Required | Provides context for the alert, including the alert monitor definition, active alerts, and trigger values. +`summary` | Required | The result returned by the alert summary agent. + + +## Viewing alert insights in OpenSearch Dashboards + +Before viewing alert insights, you must configure alerts in OpenSearch Dashboards. For more information, see [Alerting]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/index/). + +To view alert insights in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Plugins > Alerting**. All alerts are displayed. + +1. Hover over the alerts for your desired monitor. If you configured alert insights, you will see a sparkle icon ({::nomarkdown}sparkle icon{:/}) next to the alerts in the **Alerts** column, as shown in the following image. + + Alerting page with sparkle icon + +1. Select the alerts label or the sparkle icon. You will see the generated summary, as shown in the following image. + + Alert summary + +1. Select the information icon ({::nomarkdown}info icon{:/}) to view alert insights. You will see the generated alert insights, as shown in the following image. + + Alert insights \ No newline at end of file diff --git a/_dashboards/dashboards-assistant/data-summary.md b/_dashboards/dashboards-assistant/data-summary.md new file mode 100644 index 0000000000..e90e184e07 --- /dev/null +++ b/_dashboards/dashboards-assistant/data-summary.md @@ -0,0 +1,294 @@ +--- +layout: default +title: Data summary +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Data summary + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant data summary feature uses large language models (LLMs) to help you generate summaries for data stored in OpenSearch indexes. This tool provides an efficient way to gain insights from large datasets, making it easier to understand and act on the information contained in your OpenSearch indexes. + +## Configuration + +To configure the data summary feature, use the following steps. + +### Prerequisite + +Before using the data summary feature, enable query enhancements in OpenSearch Dashboards as follows: + +1. On the top menu bar, go to **Management > Dashboards Management**. +1. In the left navigation pane, select **Advanced settings**. +1. On the settings page, toggle **Enable query enhancements** to **On**. + +### Step 1: Enable the data summary feature + +To enable the data summary feature, configure the following `opensearch_dashboards.yml` setting: + +```yaml +queryEnhancements.queryAssist.summary.enabled: true +``` +{% include copy.html %} + +### Step 2: Create a data summary agent + +To orchestrate data summarization, create a data summary [agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). To create an agent, send a `POST /_plugins/_flow_framework/workflow?provision=true` request and provide the agent template as a payload: + +
+ + Request + + {: .text-delta} + +```json +POST /_plugins/_flow_framework/workflow?provision=true +{ + "name": "Query Assist Agent", + "description": "Create a Query Assist Agent using Claude on BedRock", + "use_case": "REGISTER_AGENT", + "version": { + "template": "1.0.0", + "compatibility": ["2.13.0", "3.0.0"] + }, + "workflows": { + "provision": { + "user_params": {}, + "nodes": [ + { + "id": "create_claude_connector", + "type": "create_connector", + "previous_node_inputs": {}, + "user_inputs": { + "version": "1", + "name": "Claude instant runtime Connector", + "protocol": "aws_sigv4", + "description": "The connector to BedRock service for Claude model", + "actions": [ + { + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "method": "POST", + "request_body": "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", + "action_type": "predict", + "url": "https://bedrock-runtime.us-west-2.amazonaws.com/model/anthropic.claude-instant-v1/invoke" + } + ], + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "parameters": { + "region": "us-west-2", + "endpoint": "bedrock-runtime.us-west-2.amazonaws.com", + "content_type": "application/json", + "auth": "Sig_V4", + "max_tokens_to_sample": "8000", + "service_name": "bedrock", + "temperature": "0.0001", + "response_filter": "$.completion", + "anthropic_version": "bedrock-2023-05-31" + } + } + }, + { + "id": "register_claude_model", + "type": "register_remote_model", + "previous_node_inputs": { + "create_claude_connector": "connector_id" + }, + "user_inputs": { + "description": "Claude model", + "deploy": true, + "name": "claude-instant", + "guardrails": { + "type": "local_regex", + "input_guardrail": { + "stop_words": [ + { + "index_name": "words0", + "source_fields": ["title"] + } + ], + "regex": ["regex1", "regex2"] + }, + "output_guardrail": { + "stop_words": [ + { + "index_name": "words0", + "source_fields": ["title"] + } + ], + "regex": ["regex1", "regex2"] + } + } + } + }, + { + "id": "TransferQuestionToPPLAndExecuteTool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "type": "PPLTool", + "name": "TransferQuestionToPPLAndExecuteTool", + "description": "Use this tool to transfer natural language to generate PPL and execute PPL to query inside. Use this tool after you know the index name, otherwise, call IndexRoutingTool first. The input parameters are: {index:IndexName, question:UserQuestion}", + "parameters": { + "response_filter": "$.completion", + "execute": false + }, + "include_output_in_agent_response": true + } + }, + { + "id": "summarize_success_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "type": "MLModelTool", + "Name": "SummarizeSuccessTool", + "description": "Use this tool to summarize a PPL success response in query assist", + "parameters": { + "prompt": "\n\nHuman: You will be given a search response, summarize it as a concise paragraph while considering the following:\nUser's question on index '${parameters.index}': ${parameters.question}\nPPL (Piped Processing Language) query used: ${parameters.query}\n\nGive some documents to support your point.\nNote that the output could be truncated, summarize what you see. 
Don't mention about total items returned and don't mention about the fact that output is truncated if you see 'Output is too long, truncated' in the response.\n\nSkip the introduction; go straight into the summarization.\n\nUse the following pieces of context to answer the users question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n${parameters.response}\n\nAssistant:", + "response_filter": "$.completion" + } + } + }, + { + "id": "summarize_error_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "type": "MLModelTool", + "name": "SummarizeErrorTool", + "description": "Use this tool to summarize a PPL error response in query assist", + "include_output_in_agent_response": true, + "parameters": { + "prompt": "\n\nHuman: You will be given an API response with errors, summarize it as a concise paragraph. Do not try to answer the user's question.\nIf the error cannot be fixed, eg. no such field or function not supported, then give suggestions to rephrase the question.\nIt is imperative that you must not give suggestions on how to fix the error or alternative PPL query, or answers to the question.\n\nConsider the following:\nUser's question on index '${parameters.index}': ${parameters.question}\nPPL (Piped Processing Language) query used: ${parameters.query}\n\nSkip the introduction; go straight into the summarization.\n\nUse the following pieces of context to answer the users question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n${parameters.response}\n\nAssistant:", + "response_filter": "$.completion" + } + } + }, + { + "id": "suggestions_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "type": "MLModelTool", + "name": "SuggestionsTool", + "description": "Use this tool to generate possible questions for an index in query assist", + "include_output_in_agent_response": true, + "parameters": { + "prompt": "\n\nHuman: OpenSearch index: ${parameters.index}\n\nRecommend 2 or 3 possible questions on this index given the fields below. Only give the questions, do not give descriptions of questions and do not give PPL queries.\n\nThe format for a field is\n```\n- field_name: field_type (sample field value)\n```\n\nFields:\n${parameters.fields}\n\nPut each question in a tag.\n\nAssistant:", + "response_filter": "$.completion" + } + } + }, + { + "id": "ppl_agent", + "type": "register_agent", + "previous_node_inputs": { + "TransferQuestionToPPLAndExecuteTool": "tools" + }, + "user_inputs": { + "parameters": {}, + "app_type": "query_assist", + "name": "PPL agent", + "description": "this is the PPL agent", + "type": "flow" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +
+ +For sample agent templates, see [Flow Framework sample templates](https://github.com/opensearch-project/flow-framework/tree/2.x/sample-templates). Note the agent ID; you'll use it in the following step. + +### Step 3: Create a root agent + +Next, create a [root agent]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-tutorial/#root_agent) for the data summary agent created in the previous step: + +```json +POST /.plugins-ml-config/_doc/os_data2summary +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agent + +You can verify that the data summary agent was created successfully by calling the agent with an example payload: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "sample_data":"'[{\"_index\":\"90943e30-9a47-11e8-b64d-95841ca0b247\",\"_source\":{\"referer\":\"http://twitter.com/success/gemini-9a\",\"request\":\"/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"agent\":\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\",\"extension\":\"deb\",\"memory\":null,\"ip\":\"239.67.210.53\",\"index\":\"opensearch_dashboards_sample_data_logs\",\"message\":\"239.67.210.53 - - [2018-08-30T15:29:01.686Z] \\\"GET /beats/metricbeat/metricbeat-6.3.2-amd64.deb HTTP/1.1\\\" 404 2633 \\\"-\\\" \\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\\\"\",\"url\":\"https://artifacts.opensearch.org/downloads/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"tags\":\"success\",\"geo\":{\"srcdest\":\"CN:PL\",\"src\":\"CN\",\"coordinates\":{\"lat\":44.91167028,\"lon\":-108.4455092},\"dest\":\"PL\"},\"utc_time\":\"2024-09-05 15:29:01.686\",\"bytes\":2633,\"machine\":{\"os\":\"win xp\",\"ram\":21474836480},\"response\":\"404\",\"clientip\":\"239.67.210.53\",\"host\":\"artifacts.opensearch.org\",\"event\":{\"dataset\":\"sample_web_logs\"},\"phpmemory\":null,\"timestamp\":\"2024-09-05 15:29:01.686\"}}]'", + "sample_count":1, + "total_count":383, + "question":"Are there any errors in my logs?", + "ppl":"source=opensearch_dashboards_sample_data_logs| where QUERY_STRING(['response'], '4* OR 5*')"} +} +``` +{% include copy-curl.html %} + +## Generating a data summary + +You can generate a data summary by calling the `/api/assistant/data2summary` API endpoint. 
The `sample_count`, `total_count`, `question`, and `ppl` parameters are optional: + +```json +POST /api/assistant/data2summary +{ + "sample_data":"'[{\"_index\":\"90943e30-9a47-11e8-b64d-95841ca0b247\",\"_source\":{\"referer\":\"http://twitter.com/success/gemini-9a\",\"request\":\"/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"agent\":\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\",\"extension\":\"deb\",\"memory\":null,\"ip\":\"239.67.210.53\",\"index\":\"opensearch_dashboards_sample_data_logs\",\"message\":\"239.67.210.53 - - [2018-08-30T15:29:01.686Z] \\\"GET /beats/metricbeat/metricbeat-6.3.2-amd64.deb HTTP/1.1\\\" 404 2633 \\\"-\\\" \\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\\\"\",\"url\":\"https://artifacts.opensearch.org/downloads/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"tags\":\"success\",\"geo\":{\"srcdest\":\"CN:PL\",\"src\":\"CN\",\"coordinates\":{\"lat\":44.91167028,\"lon\":-108.4455092},\"dest\":\"PL\"},\"utc_time\":\"2024-09-05 15:29:01.686\",\"bytes\":2633,\"machine\":{\"os\":\"win xp\",\"ram\":21474836480},\"response\":\"404\",\"clientip\":\"239.67.210.53\",\"host\":\"artifacts.opensearch.org\",\"event\":{\"dataset\":\"sample_web_logs\"},\"phpmemory\":null,\"timestamp\":\"2024-09-05 15:29:01.686\"}}]'", + "sample_count":1, + "total_count":383, + "question":"Are there any errors in my logs?", + "ppl":"source=opensearch_dashboards_sample_data_logs| where QUERY_STRING(['response'], '4* OR 5*')" +} +``` +{% include copy-curl.html %} + +The following table describes the Assistant Data Summary API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`sample_data` | Required | A sample of data returned by the specified query and used as input for summarization. +`question` | Optional | The user's natural language question about the data, which guides the summary generation. +`ppl` | Optional | The Piped Processing Language (PPL) query used to retrieve data; in query assistance, this is generated by the LLM using the user's natural language question. +`sample_count` | Optional | The number of entries included in `sample_data`. +`total_count` | Optional | The total number of entries in the full query result set. + +## Viewing data summaries in OpenSearch Dashboards + +To view data summaries in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Dashboards > Discover**. + +1. From the query language dropdown list, select **PPL**. You will see the generated data summary after the query text, as shown in the following image.
+ + data summary diff --git a/_dashboards/dashboards-assistant/index.md b/_dashboards/dashboards-assistant/index.md index bf2d754be8..615a53b75f 100644 --- a/_dashboards/dashboards-assistant/index.md +++ b/_dashboards/dashboards-assistant/index.md @@ -2,7 +2,7 @@ layout: default title: OpenSearch Assistant for OpenSearch Dashboards nav_order: 3 -has_children: false +has_children: true has_toc: false --- @@ -22,7 +22,7 @@ To enable **OpenSearch Assistant** in OpenSearch Dashboards, locate your copy of ```yaml assistant.chat.enabled: true ``` -{% include copy-curl.html %} +{% include copy.html %} Then configure the root `agent_id` through the following API: @@ -131,8 +131,17 @@ assistant.next.enabled: true ``` {% include copy-curl.html %} +## Additional Dashboards Assistant capabilities + +For information about additional Dashboards Assistant capabilities, see the following pages: + +- [Generating alert insights]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/alert-insight/) +- [Generating data summaries]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/data-summary/) +- [Generating anomaly detector suggestions]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/suggest-anomaly-detector/) +- [Generating visualizations from text]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/text-to-visualization/) + ## Related articles - [Getting started guide for OpenSearch Assistant in OpenSearch Dashboards](https://github.com/opensearch-project/dashboards-assistant/blob/main/GETTING_STARTED_GUIDE.md) - [OpenSearch Assistant configuration through the REST API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/opensearch-assistant/) -- [Build your own chatbot]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/build-chatbot/) \ No newline at end of file +- [Build your own chatbot]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/build-chatbot/) diff --git a/_dashboards/dashboards-assistant/suggest-anomaly-detector.md b/_dashboards/dashboards-assistant/suggest-anomaly-detector.md new file mode 100644 index 0000000000..8f4aac80fd --- /dev/null +++ b/_dashboards/dashboards-assistant/suggest-anomaly-detector.md @@ -0,0 +1,89 @@ +--- +layout: default +title: Anomaly detector suggestions +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Anomaly detector suggestions + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant can use a large language model (LLM) to suggest the creation of an anomaly detector. The LLM analyzes data patterns in your OpenSearch indexes and recommends configuration settings for the anomaly detector, making it easier to identify unusual activity or trends in your data. + +## Configuration + +To configure anomaly detector suggestions, use the following steps. + +### Prerequisite + +Before using anomaly detector suggestions, enable query enhancements in OpenSearch Dashboards as follows: + +1. On the top menu bar, go to **Management > Dashboards Management**. +1. In the left navigation pane, select **Advanced settings**. +1. On the settings page, toggle **Enable query enhancements** to **On**. 
+ +### Step 1: Enable anomaly detector suggestions + +To enable anomaly detector suggestions, configure the following `opensearch_dashboards.yml` setting: + +```yaml +assistant.smartAnomalyDetector.enabled: true +``` +{% include copy.html %} + +### Step 2: Create an anomaly detector suggestion agent + +To orchestrate anomaly detector suggestions, create an anomaly detector suggestion [agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). To create an agent, send a `POST /_plugins/_flow_framework/workflow?provision=true` request and provide the agent template as a payload. For more information, see [Configuring OpenSearch Assistant]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/index/#configuring-opensearch-assistant). + +For sample agent templates, see [Flow Framework sample templates](https://github.com/opensearch-project/flow-framework/tree/2.x/sample-templates). Note the agent ID; you'll use it in the following step. + +### Step 3: Configure the agent + +Next, configure the anomaly detector suggestion agent created in the previous step: + +```json +POST /.plugins-ml-config/_doc/os_suggest_ad +{ + "type": "suggest_anomaly_detector_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agent + +You can verify that the agent was created successfully by calling the agent with an example payload: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "index":"sample_weblogs_test" + } +} +``` +{% include copy-curl.html %} + +## Viewing anomaly detector suggestions in OpenSearch Dashboards + +To view anomaly detector suggestions in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Dashboards > Discover**. + +1. From the index pattern dropdown list, select an index pattern. + +1. Select the **AI assistant** dropdown list and then select **Suggest anomaly detector**, as shown in the following image. + + Click the Suggest anomaly detector action + +1. Wait for the LLM to populate the **Suggest anomaly detector** fields that will be used to create an anomaly detector for the index pattern. Then select the **Create detector** button to create an anomaly detector, as shown in the following image. + + Suggested anomaly detector diff --git a/_dashboards/dashboards-assistant/text-to-visualization.md b/_dashboards/dashboards-assistant/text-to-visualization.md new file mode 100644 index 0000000000..a30ca6d1f8 --- /dev/null +++ b/_dashboards/dashboards-assistant/text-to-visualization.md @@ -0,0 +1,286 @@ +--- +layout: default +title: Text to visualization +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Text to visualization + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant can create visualizations using natural language instructions. 
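+
+For example, a question such as "show average bytes over time" (a hypothetical prompt) might yield a Vega-Lite specification similar to the following sketch. The exact output depends on the LLM; the generated specification omits the `data` key, which the plugin is expected to populate with the query results, and the field names (`avg_bytes`, `span(order_date,3h)`) are borrowed from the test payload shown later on this page:
+
+```json
+{
+  "title": "Average bytes over time",
+  "description": "Average bytes aggregated into 3-hour windows",
+  "mark": "line",
+  "encoding": {
+    "x": { "field": "span(order_date,3h)", "type": "temporal", "axis": { "title": "Order date" } },
+    "y": { "field": "avg_bytes", "type": "quantitative", "axis": { "title": "Average bytes" } }
+  }
+}
+```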
+ +## Configuration + +To configure text to visualization, use the following steps. + +### Step 1: Enable text to visualization + +To enable text to visualization, configure the following `opensearch_dashboards.yml` setting: + +```yaml +assistant.text2viz.enabled: true +``` +{% include copy.html %} + +### Step 2: Create the agents + +To orchestrate text to visualization, you'll need to create the necessary [agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). Create a workflow template that registers all of the required text-to-visualization agents by sending the following request: + +<details markdown="block">
+ + Request + + {: .text-delta} + +```json +POST /_plugins/_flow_framework/workflow +{ + "name": "Text to visualization agents", + "description": "This template is to create all Agents required for text to visualization", + "use_case": "REGISTER_AGENTS", + "version": { + "template": "1.0.0", + "compatibility": [ + "2.18.0", + "3.0.0" + ] + }, + "workflows": { + "provision": { + "user_params": {}, + "nodes": [ + { + "id": "create_claude_connector", + "type": "create_connector", + "previous_node_inputs": {}, + "user_inputs": { + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "parameters": { + "endpoint": "bedrock-runtime.us-east-1.amazonaws.com", + "content_type": "application/json", + "auth": "Sig_V4", + "max_tokens_to_sample": "8000", + "service_name": "bedrock", + "temperature": "0.0000", + "response_filter": "$.content[0].text", + "region": "us-east-1", + "anthropic_version": "bedrock-2023-05-31" + }, + "version": "1", + "name": "Claude haiku runtime Connector", + "protocol": "aws_sigv4", + "description": "The connector to BedRock service for claude model", + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-3-haiku-20240307-v1:0/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required" + }, + "request_body": "{\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"${parameters.prompt}\"}]}],\"anthropic_version\":\"${parameters.anthropic_version}\",\"max_tokens\":${parameters.max_tokens_to_sample}}" + } + ] + } + }, + { + "id": "register_claude_model", + "type": "register_remote_model", + "previous_node_inputs": { + "create_claude_connector": "connector_id" + }, + "user_inputs": { + "name": "claude-haiku", + "description": "Claude model", + "deploy": true + } + }, + { + "id": "create_t2vega_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "parameters": { + "prompt": "You're an expert at creating vega-lite visualization. No matter what the user asks, you should reply with a valid vega-lite specification in json.\nYour task is to generate Vega-Lite specification in json based on the given sample data, the schema of the data, the PPL query to get the data and the user's input.\nLet's start from dimension and metric/date. Now I have a question, I already transfer it to PPL and query my Opensearch cluster. \nThen I get data. For the PPL, it will do aggregation like \"stats AVG(field_1) as avg, COUNT(field_2) by field_3, field_4, field_5\". \nIn this aggregation, the metric is [avg, COUNT(field_2)] , and then we judge the type of field_3,4,5. If only field_5 is type related to date, the dimension is [field_3, field_4], and date is [field_5]\nFor example, stats SUM(bytes) by span(timestamp, 1w), machine.os, response, then SUM(bytes) is metric and span(timestamp, 1w) is date, while machine.os, response are dimensions.\nNotice: Some fields like 'span()....' will be the date, but not metric and dimension. \nAnd one field will only count once in dimension count. You should always pick field name from schema\nTo summarize,\nA dimension is a categorical variable that is used to group, segment, or categorize data. It is typically a qualitative attribute that provides context for metrics and is used to slice and dice data to see how different categories perform in relation to each other.\nThe dimension is not date related fields. 
The dimension and date are very closed. The only difference is date is related to datetime, while dimension is not.\nA metric is a quantitative measure used to quantify or calculate some aspect of the data. Metrics are numerical and typically represent aggregated values like sums, averages, counts, or other statistical calculations.\n\nIf a ppl doesn't have aggregation using 'stats', then each field in output is dimension.\nOtherwise, if a ppl use aggregation using 'stats' but doesn't group by using 'by', then each field in output is metric.\n\nThen for each given PPL, you could give the metric and dimension and date. One field will in only one of the metric, dimension or date.\n\nThen according to the metric number and dimension number of PPL result, you should first format the entrance code by metric_number, dimension_number, and date_number. For example, if metric_number = 1, dimension_number = 2, date_number=1, then the entrance code is 121.\nI define several use case categories here according to the entrance code.\nFor each category, I will define the entrance condition (number of metric and dimension)\nI will also give some defined attribute of generated vega-lite. Please refer to it to generate vega-lite.\n\nType 1:\nEntrance code: <1, 1, 0>\nDefined Attributes:\n {\n \"title\": \"\",\n \"description\": \"<description>\",\n \"mark\": \"bar\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<metric name>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<dimension name>\",\n \"type\": \"nominal\"\n }\n },\n }\n\nType 2:\nEntrance code: <1, 2, 0>\nDefined Attributes:\n{\n \"mark\": \"bar\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n },\n \"color\": {\n \"field\": \"<dimension 2>\",\n \"type\": \"nominal\"\n }\n }\n }\n\n\nType 3\nEntrance code: <3, 1, 0>\nDefined Attributes:\n{\n \"mark\": \"point\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<metric 2>\",\n \"type\": \"quantitative\"\n },\n \"size\": {\n \"field\": \"<metric 3>\",\n \"type\": \"quantitative\"\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n}\n\nType 4\nEntrance code: <2, 1, 0>\nDefined Attributes:\n{\n \"mark\": \"point\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<mtric 1>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<metric 2>\",\n \"type\": \"quantitative\"\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n}\n\nType 5:\nEntrance code: <2, 1, 1>\nDefined Attributes:\n{\n \"layer\": [\n {\n \"mark\": \"bar\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 1 name>\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n },\n {\n \"mark\": {\n \"type\": \"line\",\n \"color\": \"red\"\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"<metric 2>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 2 name>\",\n \"orient\": \"right\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n }\n ],\n \"resolve\": {\n \"scale\": {\n \"y\": \"independent\"\n }\n }\n }\n\nType 6:\nEntrance code: <2, 0, 1>\nDefined Attributes:\n{\n \"title\": 
\"<title>\",\n \"description\": \"<description>\",\n \"layer\": [\n {\n \"mark\": \"area\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 1 name>\"\n }\n }\n }\n },\n {\n \"mark\": {\n \"type\": \"line\",\n \"color\": \"black\"\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"date\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"metric 2\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 2 name>\",\n \"orient\": \"right\"\n }\n }\n }\n }\n ],\n \"resolve\": {\n \"scale\": {\n \"y\": \"independent\"\n }\n }\n }\n \nType 7:\nEntrance code: <1, 0, 1>\nDefined Attributes:\n{\n \"title\": \"<title>\",\n \"description\": \"<description>\",\n \"mark\": \"line\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\",\n \"axis\": {\n \"title\": \"<date name>\"\n }\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric name>\"\n }\n }\n }\n }\n\nType 8:\nEntrance code: <1, 1, 1>\nDefined Attributes:\n{\n \"title\": \"<title>\",\n \"description\": \"<description>\",\n \"mark\": \"line\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\",\n \"axis\": {\n \"title\": \"<date name>\"\n }\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric name>\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\",\n \"legend\": {\n \"title\": \"<dimension name>\"\n }\n }\n }\n }\n\nType 9:\nEntrance code: <1, 2, 1>\nDefined Attributes:\n{\n \"title\": \"<title>\",\n \"description\": \"<description>\",\n \"mark\": \"line\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\",\n \"axis\": {\n \"title\": \"<date name>\"\n }\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 1>\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\",\n \"legend\": {\n \"title\": \"<dimension 1>\"\n }\n },\n \"facet\": {\n \"field\": \"<dimension 2>\",\n \"type\": \"nominal\",\n \"columns\": 2\n }\n }\n }\n\nType 10:\nEntrance code: all other code\nAll others type.\nUse a table to show the result\n\n\nBesides, here are some requirements:\n1. Do not contain the key called 'data' in vega-lite specification.\n2. If mark.type = point and shape.field is a field of the data, the definition of the shape should be inside the root \"encoding\" object, NOT in the \"mark\" object, for example, {\"encoding\": {\"shape\": {\"field\": \"field_name\"}}}\n3. Please also generate title and description\n\nThe sample data in json format:\n${parameters.sampleData}\n\nThis is the schema of the data:\n${parameters.dataSchema}\n\nThe user used this PPL query to get the data: ${parameters.ppl}\n\nThe user's question is: ${parameters.input_question}\n\nNotice: Some fields like 'span()....' will be the date, but not metric and dimension. \nAnd one field will only count once in dimension count. You should always pick field name from schema.\n And when you code is <2, 1, 0>, it belongs type 4.\n And when you code is <1, 2, 0>, it belongs type 9.\n\n\nNow please reply a valid vega-lite specification in json based on above instructions.\nPlease return the number of dimension, metric and date. Then choose the type. 
\nPlease also return the type.\nFinally return the vega-lite specification according to the type.\nPlease make sure all the key in the schema matches the word I given. \nYour answer format should be:\nNumber of metrics:[list the metric name here, Don't use duplicate name] <number of metrics {a}> \nNumber of dimensions:[list the dimension name here] <number of dimension {b}> \nNumber of dates:[list the date name here] <number of dates {c}> \nThen format the entrance code by: <Number of metrics, Number of dimensions, Number of dates>\nType and its entrance code: <type number>: <its entrance code>\nThen apply the vega-lite requirements of the type.\n<vega-lite> {here is the vega-lite json} </vega-lite>\n\nAnd don't use 'transformer' in your vega-lite and wrap your vega-lite json in <vega-lite> </vega-lite> tags\n" + }, + "name": "Text2Vega", + "type": "MLModelTool" + } + }, + { + "id": "create_instruction_based_t2vega_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "parameters": { + "prompt": "You're an expert at creating vega-lite visualization. No matter what the user asks, you should reply with a valid vega-lite specification in json.\nYour task is to generate Vega-Lite specification in json based on the given sample data, the schema of the data, the PPL query to get the data and the user's input.\n\nBesides, here are some requirements:\n1. Do not contain the key called 'data' in vega-lite specification.\n2. If mark.type = point and shape.field is a field of the data, the definition of the shape should be inside the root \"encoding\" object, NOT in the \"mark\" object, for example, {\"encoding\": {\"shape\": {\"field\": \"field_name\"}}}\n3. Please also generate title and description\n\nThe sample data in json format:\n${parameters.sampleData}\n\nThis is the schema of the data:\n${parameters.dataSchema}\n\nThe user used this PPL query to get the data: ${parameters.ppl}\n\nThe user's input question is: ${parameters.input_question}\nThe user's instruction on the visualization is: ${parameters.input_instruction}\n\nNow please reply a valid vega-lite specification in json based on above instructions.\nPlease only contain vega-lite in your response.\n" + }, + "name": "Text2Vega", + "type": "MLModelTool" + } + }, + { + "id": "t2vega_agent", + "type": "register_agent", + "previous_node_inputs": { + "create_t2vega_tool": "tools" + }, + "user_inputs": { + "parameters": {}, + "type": "flow", + "name": "t2vega agent", + "description": "this is the t2vega agent that has a set of rules to generate the visualizations" + } + }, + { + "id": "t2vega_instruction_based_agent", + "type": "register_agent", + "previous_node_inputs": { + "create_instruction_based_t2vega_tool": "tools" + }, + "user_inputs": { + "parameters": {}, + "type": "flow", + "name": "t2vega instruction based agent", + "description": "this is the t2vega agent that supports instructions" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +</details> + +Use the workflow ID returned in the response to provision the resources: + +```json +POST /_plugins/_flow_framework/workflow/<workflow_id>/_provision +``` +{% include copy-curl.html %} + +To view the status of the workflow and all created resources, send the following request: + +```json +GET /_plugins/_flow_framework/workflow/<workflow_id>/_status +``` +{% include copy-curl.html %} + +### Step 3: Configure the root agent + +Next, configure a root agent for text to visualization: + +```json +POST 
/.plugins-ml-config/_doc/os_text2vega +{ + "type": "os_chat_root_agent", + "configuration": { + "agent_id": "<ROOT_AGENT_ID>" + } +} +``` +{% include copy-curl.html %} + +Configure the agent to receive user instructions for creating visualizations: + +```json +POST /.plugins-ml-config/_doc/os_text2vega_with_instructions +{ + "type": "os_chat_root_agent", + "configuration": { + "agent_id": "<ROOT_AGENT_ID>" + } +} +``` +{% include copy-curl.html %} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agent + +You can verify that the agent was created successfully by calling the agent with an example payload: + +```json +POST /_plugins/_ml/agents/<ROOT_AGENT_ID>/_execute +{ + "parameters": { + "input_question": "find unique visitors and average bytes every 3 hours", + "input_instruction": "display with different layers, use independent scale for different layers, display unique visitors with light blue bar chart", + "ppl": "source=opensearch_dashboards_sample_data_ecommerce| stats DISTINCT_COUNT(user) as unique_visitors, AVG(taxful_total_price) as avg_bytes by span(order_date, 3h)", + "sampleData": """[{\"unique_visitors\":15,\"avg_bytes\":90.98684210526316,\"span(order_date,3h)\":\"2024-04-25 00:00:00\"},{\"unique_visitors\":14,\"avg_bytes\":72.72083333333333,\"span(order_date,3h)\":\"2024-04-25 03:00:00\"}]""", + "dataSchema": """[{\"name\":\"unique_visitors\",\"type\":\"integer\"},{\"name\":\"avg_bytes\",\"type\":\"double\"},{\"name\":\"span(order_date,3h)\",\"type\":\"timestamp\"}]""" + } +} +``` +{% include copy-curl.html %} + +## Generating a visualization from text + +You can generate a visualization from text by calling the `/api/assistant/text2vega` API endpoint. The `input_instruction` parameter is optional: + +```json +POST /api/assistant/text2vega +{ + "input_instruction": "<input_instruction>", + "input_question": "<input_question>", + "ppl": "<ppl_query>", + "dataSchema": "<data_schema_of_ppl_response>", + "sampleData": "<sample_data_of_ppl_response>" +} +``` +{% include copy-curl.html %} + +The following table describes the Text to Visualization API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`input_question` | Required | The user's original question used to generate the corresponding Piped Processing Language (PPL) query. +`ppl` | Required | The generated PPL query that retrieves the data required for the visualization. +`dataSchema` | Required | Describes the structure and types of the data fields in the visualization output, based on the PPL response. +`sampleData` | Required | Provides sample entries from the data that will populate the visualization. +`input_instruction` | Optional | Specifies the styling instructions, such as colors, for the visualization. + +## Generating visualizations from text in OpenSearch Dashboards + +To generate visualizations from text in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Dashboards > Visualize** and then select **Create visualization**. + +1. In the **New Visualization** dialog, select **Natural language**, as shown in the following image. 
+ + <img width="800px" src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-start.png" alt="Create a visualization by selecting natural language"> + +1. From the data sources dropdown list, select a data source, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-select-data-source.png" alt="Create a visualization by selecting natural language"> + +1. In the text box on the upper right, enter a question using natural language. A new visualization is generated, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-ask-question.png" alt="Create a visualization by selecting natural language"> + +1. To modify the generated visualization, select **Edit visual**. In the **Edit visual** dialog, enter the desired modifications and then select **Apply**, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-edit-visual.png" alt="Create a visualization by selecting natural language"> + + The visualization is updated, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-edit-visual-response.png" alt="Create a visualization by selecting natural language"> + + + diff --git a/_dashboards/management/accelerate-external-data.md b/_dashboards/management/accelerate-external-data.md index 6d1fa030e4..00eb8671ec 100644 --- a/_dashboards/management/accelerate-external-data.md +++ b/_dashboards/management/accelerate-external-data.md @@ -1,10 +1,8 @@ --- layout: default title: Optimize query performance using OpenSearch indexing -parent: Connecting Amazon S3 to OpenSearch -grand_parent: Data sources -nav_order: 15 -has_children: false +parent: Data sources +nav_order: 17 --- # Optimize query performance using OpenSearch indexing @@ -14,35 +12,171 @@ Introduced 2.11 Query performance can be slow when using external data sources for reasons such as network latency, data transformation, and data volume. You can optimize your query performance by using OpenSearch indexes, such as a skipping index or a covering index. -A _skipping index_ uses skip acceleration methods, such as partition, minimum and maximum values, and value sets, to ingest and create compact aggregate data structures. This makes them an economical option for direct querying scenarios. +- A _skipping index_ uses skip acceleration methods, such as partition, minimum and maximum values, and value sets, to ingest and create compact aggregate data structures. This makes them an economical option for direct querying scenarios. For more information, see [Skipping indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#skipping-indexes). +- A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. For more information, see [Covering indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#covering-indexes). +- A _materialized view_ enhances query performance by storing precomputed and aggregated data from the source data. For more information, see [Materialized views](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#materialized-views). -A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. 
See the [Flint Index Reference Manual](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md) for comprehensive guidance on this feature's indexing process. +For comprehensive guidance on each indexing process, see the [Flint Index Reference Manual](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md). ## Data sources use case: Accelerate performance -To get started with the **Accelerate performance** use case available in **Data sources**, follow these steps: +To get started with accelerating query performance, perform the following steps: -1. Go to **OpenSearch Dashboards** > **Query Workbench** and select your Amazon S3 data source from the **Data sources** dropdown menu in the upper-left corner. -2. From the left-side navigation menu, select a database. -3. View the results in the table and confirm that you have the desired data. +1. Go to **OpenSearch Dashboards** > **Query Workbench** and select your data source from the **Data sources** dropdown menu. +2. From the navigation menu, select a database. +3. View the results in the table and confirm that you have the correct data. 4. Create an OpenSearch index by following these steps: - 1. Select the **Accelerate data** button. A pop-up window appears. - 2. Enter your details in **Select data fields**. In the **Database** field, select the desired acceleration index: **Skipping index** or **Covering index**. A _skipping index_ uses skip acceleration methods, such as partition, min/max, and value sets, to ingest data using compact aggregate data structures. This makes them an economical option for direct querying scenarios. A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. -5. Under **Index settings**, enter the information for your acceleration index. For information about naming, select **Help**. Note that an Amazon S3 table can only have one skipping index at a time. + 1. Select **Accelerate data**. A pop-up window appears. + 2. Enter your database and table details under **Select data fields**. +5. For **Acceleration type**, select the type of acceleration according to your use case. Then, enter the information for your acceleration type. For more information, see the following sections: + - [Skipping indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#skipping-indexes) + - [Covering indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#covering-indexes) + - [Materialized views](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#materialized-views) + +## Skipping indexes + +A _skipping index_ uses skip acceleration methods, such as partition, min/max, and value sets, to ingest data using compact aggregate data structures. This makes them an economical option for direct querying scenarios. + +With a skipping index, you can index only the metadata of the data stored in Amazon S3. When you query a table with a skipping index, the query planner references the index and rewrites the query to efficiently locate the data, instead of scanning all partitions and files. This allows the skipping index to quickly narrow down the specific location of the stored data. ### Define skipping index settings -1. Under **Skipping index definition**, select the **Add fields** button to define the skipping index acceleration method and choose the fields you want to add. -2. 
Select the **Copy Query to Editor** button to apply your skipping index settings. -3. View the skipping index query details in the table pane and then select the **Run** button. Your index is added to the left-side navigation menu containing the list of your databases. +1. Under **Skipping index definition**, select **Generate** to automatically generate a skipping index. Alternatively, to manually choose the fields you want to add, select **Add fields**. Choose from the following types: + - `Partition`: Uses data partition details to locate data. This type is best for partitioning-based columns such as year, month, day, or hour. + - `MinMax`: Uses the lower and upper bounds of the indexed column to locate data. This type is best for numeric columns. + - `ValueSet`: Uses a unique value set to locate data. This type is best for columns with low to moderate cardinality that require exact matching. + - `BloomFilter`: Uses the Bloom filter algorithm to locate data. This type is best for columns with high cardinality that do not require exact matching. +2. Select **Create acceleration** to apply your skipping index settings. +3. View the skipping index query details and then select **Run**. OpenSearch adds your index to the left navigation pane. + +Alternatively, you can manually create a skipping index using Query Workbench. Select your data source from the dropdown and run a query like the following: + +```sql +CREATE SKIPPING INDEX +ON datasourcename.gluedatabasename.vpclogstable( + `srcaddr` BLOOM_FILTER, + `dstaddr` BLOOM_FILTER, + `day` PARTITION, + `account_id` BLOOM_FILTER + ) WITH ( +index_settings = '{"number_of_shards":5,"number_of_replicas":1}', +auto_refresh = true, +checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint' +) +``` + +## Covering indexes + +A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. + +With a covering index, you can ingest data from specified columns in a table. This is the most performant of the three indexing types. Because OpenSearch ingests all data from your desired columns, you get better performance and can perform advanced analytics. + +OpenSearch creates a new index from the covering index data. You can use this new index to create visualizations, or for anomaly detection and geospatial capabilities. You can manage the covering index with Index State Management. For more information, see [Index State Management](https://opensearch.org/docs/latest/im-plugin/ism/index/). ### Define covering index settings -1. Under **Index settings**, enter a valid index name. Note that each Amazon S3 table can have multiple covering indexes. -2. Once you have added the index name, define the covering index fields by selecting `(add fields here)` under **Covering index definition**. -3. Select the **Copy Query to Editor** button to apply your covering index settings. -4. View the covering index query details in the table pane and then select the **Run** button. Your index is added to the left-side navigation menu containing the list of your databases. +1. For **Index name**, enter a valid index name. Note that each table can have multiple covering indexes. +2. Choose a **Refresh type**. By default, OpenSearch automatically refreshes the index. Otherwise, you must manually trigger a refresh using a `REFRESH` statement. +3. Enter a **Checkpoint location**, which is a path for refresh job checkpoints.
The location must be a path in an HDFS-compatible file system. +4. Define the covering index fields by selecting **(add fields here)** under **Covering index definition**. +5. Select **Create acceleration** to apply your covering index settings. +6. View the covering index query details and then select **Run**. OpenSearch adds your index to the left navigation pane. + +Alternatively, you can manually create a covering index on your table using Query Workbench. Select your data source from the dropdown and run a query like the following: + +```sql +CREATE INDEX vpc_covering_index +ON datasourcename.gluedatabasename.vpclogstable (version, account_id, interface_id, +srcaddr, dstaddr, srcport, dstport, protocol, packets, +bytes, start, action, log_status STRING, +`aws-account-id`, `aws-service`, `aws-region`, year, +month, day, hour ) +WITH ( + auto_refresh = true, + refresh_interval = '15 minute', + checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint' +) +``` + +## Materialized views + +With _materialized views_, you can use complex queries, such as aggregations, to power Dashboards visualizations. Materialized views ingest a small amount of your data, depending on the query, into OpenSearch. OpenSearch then forms an index from the ingested data that you can use for visualizations. You can manage the materialized view index with Index State Management. For more information, see [Index State Management](https://opensearch.org/docs/latest/im-plugin/ism/index/). + +### Define materialized view settings + +1. For **Index name**, enter a valid index name. Note that each table can have multiple materialized views. +2. Choose a **Refresh type**. By default, OpenSearch automatically refreshes the index. Otherwise, you must manually trigger a refresh using a `REFRESH` statement. +3. Enter a **Checkpoint location**, which is a path for refresh job checkpoints. The location must be a path in an HDFS-compatible file system. +4. Enter a **Watermark delay**, which defines how late data can come and still be processed, such as 1 minute or 10 seconds. +5. Define the materialized view fields under **Materialized view definition**. +6. Select **Create acceleration** to apply your materialized view index settings. +7. View the materialized view query details and then select **Run**. OpenSearch adds your index to the left navigation pane. + +Alternatively, you can manually create a materialized view index on your table using Query Workbench.
Select your data source from the dropdown and run a query like the following: + +```sql +CREATE MATERIALIZED VIEW {table_name}__week_live_mview AS + SELECT + cloud.account_uid AS `aws.vpc.cloud_account_uid`, + cloud.region AS `aws.vpc.cloud_region`, + cloud.zone AS `aws.vpc.cloud_zone`, + cloud.provider AS `aws.vpc.cloud_provider`, + + CAST(IFNULL(src_endpoint.port, 0) AS LONG) AS `aws.vpc.srcport`, + CAST(IFNULL(src_endpoint.svc_name, 'Unknown') AS STRING) AS `aws.vpc.pkt-src-aws-service`, + CAST(IFNULL(src_endpoint.ip, '0.0.0.0') AS STRING) AS `aws.vpc.srcaddr`, + CAST(IFNULL(src_endpoint.interface_uid, 'Unknown') AS STRING) AS `aws.vpc.src-interface_uid`, + CAST(IFNULL(src_endpoint.vpc_uid, 'Unknown') AS STRING) AS `aws.vpc.src-vpc_uid`, + CAST(IFNULL(src_endpoint.instance_uid, 'Unknown') AS STRING) AS `aws.vpc.src-instance_uid`, + CAST(IFNULL(src_endpoint.subnet_uid, 'Unknown') AS STRING) AS `aws.vpc.src-subnet_uid`, + + CAST(IFNULL(dst_endpoint.port, 0) AS LONG) AS `aws.vpc.dstport`, + CAST(IFNULL(dst_endpoint.svc_name, 'Unknown') AS STRING) AS `aws.vpc.pkt-dst-aws-service`, + CAST(IFNULL(dst_endpoint.ip, '0.0.0.0') AS STRING) AS `aws.vpc.dstaddr`, + CAST(IFNULL(dst_endpoint.interface_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-interface_uid`, + CAST(IFNULL(dst_endpoint.vpc_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-vpc_uid`, + CAST(IFNULL(dst_endpoint.instance_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-instance_uid`, + CAST(IFNULL(dst_endpoint.subnet_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-subnet_uid`, + CASE + WHEN regexp(dst_endpoint.ip, '(10\\..*)|(192\\.168\\..*)|(172\\.1[6-9]\\..*)|(172\\.2[0-9]\\..*)|(172\\.3[0-1]\\.*)') + THEN 'ingress' + ELSE 'egress' + END AS `aws.vpc.flow-direction`, + + CAST(IFNULL(connection_info['protocol_num'], 0) AS INT) AS `aws.vpc.connection.protocol_num`, + CAST(IFNULL(connection_info['tcp_flags'], '0') AS STRING) AS `aws.vpc.connection.tcp_flags`, + CAST(IFNULL(connection_info['protocol_ver'], '0') AS STRING) AS `aws.vpc.connection.protocol_ver`, + CAST(IFNULL(connection_info['boundary'], 'Unknown') AS STRING) AS `aws.vpc.connection.boundary`, + CAST(IFNULL(connection_info['direction'], 'Unknown') AS STRING) AS `aws.vpc.connection.direction`, + + CAST(IFNULL(traffic.packets, 0) AS LONG) AS `aws.vpc.packets`, + CAST(IFNULL(traffic.bytes, 0) AS LONG) AS `aws.vpc.bytes`, + + CAST(FROM_UNIXTIME(time / 1000) AS TIMESTAMP) AS `@timestamp`, + CAST(FROM_UNIXTIME(start_time / 1000) AS TIMESTAMP) AS `start_time`, + CAST(FROM_UNIXTIME(start_time / 1000) AS TIMESTAMP) AS `interval_start_time`, + CAST(FROM_UNIXTIME(end_time / 1000) AS TIMESTAMP) AS `end_time`, + status_code AS `aws.vpc.status_code`, + + severity AS `aws.vpc.severity`, + class_name AS `aws.vpc.class_name`, + category_name AS `aws.vpc.category_name`, + activity_name AS `aws.vpc.activity_name`, + disposition AS `aws.vpc.disposition`, + type_name AS `aws.vpc.type_name`, + + region AS `aws.vpc.region`, + accountid AS `aws.vpc.account-id` + FROM + datasourcename.gluedatabasename.vpclogstable +WITH ( + auto_refresh = true, + refresh_interval = '15 Minute', + checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint', + watermark_delay = '1 Minute', +) +``` ## Limitations -This feature is still under development, so there are some limitations. For real-time updates, refer to the [developer documentation on GitHub](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#limitations). +This feature is still under development, so there are some limitations. 
For real-time updates, see the [developer documentation on GitHub](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#limitations). diff --git a/_dashboards/management/acl.md b/_dashboards/management/acl.md new file mode 100644 index 0000000000..bd57b72419 --- /dev/null +++ b/_dashboards/management/acl.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Access control lists for saved objects +parent: Dashboards Management +nav_order: 50 +--- + +# Access control lists for saved objects +Introduced 2.18 +{: .label .label-purple } + +You can use access control lists (ACLs) to manage permissions for your saved objects, providing authorization (AuthZ) capabilities without requiring backend plugin integration. + +## Understanding ACL types + +ACLs are applied at two levels: + +1. **Workspace ACL:** Workspace objects inherit permissions from their parent workspace. See [Workspace ACL]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl) for more information. +2. **Objects ACL:** Each individual object can have its own ACL policy. All operations on these objects must pass ACL policy validation. + +## Enabling the ACL feature + +The ACL feature must be enabled before you can define any access controls. Enable it by: + +1. Opening your `opensearch_dashboards.yml` file. +2. Enabling permissions with `savedObjects.permission.enabled: true`. + +## Defining ACL permissions + +ACL permissions are defined using the following schema: + +```json +{ + "permissions": { + "<permission_type_1>": { + "users": ["<principal_1>", "<principal_2>"], + "groups": ["<principal_3>", "<principal_4>"] + } + } +} +``` +{% include copy-curl.html %} + +### Granting permissions to authenticated users + +The wildcard character (`*`) grants permissions to all authenticated users. In the following example, the ACL grants workspace management permissions to the `finance_manager` group and dashboard creation permissions to the `finance_analyst` group: + +```json +{ + "permissions": { + "write": { + "groups": ["finance_manager"] + }, + "library_write": { + "groups": ["finance_analyst"] + } + } +} +``` +{% include copy-curl.html %} + +### Configuring mixed-level permissions + +To allow one user, `user-1` for example, to modify an object while giving read-only access to others, you can configure the ACL policy as follows: + +```json +{ + "permissions": { + "read": { + "users": ["*"] + }, + "write": { + "users": ["user-1"] + }, + } +} +``` +{% include copy-curl.html %} diff --git a/_dashboards/management/management-index.md b/_dashboards/management/management-index.md index 7edc4d06c2..01796180e5 100644 --- a/_dashboards/management/management-index.md +++ b/_dashboards/management/management-index.md @@ -9,16 +9,14 @@ has_children: true Introduced 2.10 {: .label .label-purple } -**Dashboards Management** serves as the command center for customizing OpenSearch Dashboards to your needs. A view of the interface is shown in the following image. +**Dashboards Management** is the central hub for managing and customizing OpenSearch data directly within OpenSearch Dashboards. -<img src="{{site.url}}{{site.baseurl}}/images/dashboards/dashboards-management-ui.png" alt="Dashboards Management interface" width="700"/> - -{::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/icons/alert-icon.png" class="inline-icon" alt="alert icon"/>{:/} **Note**<br>OpenSearch and OpenSearch Dashboards privileges govern access to individual features. If you do not have the appropriate access, consult your administrator. 
-{: .note} +OpenSearch and OpenSearch Dashboards permissions govern access to individual features. If you do not have the appropriate access permissions, consult your administrator. +{: .warning} ## Applications -The following applications are available in **Dashboards Management**: +You can access the following applications in **Dashboards Management**: - **[Index Patterns]({{site.url}}{{site.baseurl}}/dashboards/management/index-patterns/):** To access OpenSearch data, you need to create an index pattern so that you can select the data you want to use and define the properties of the fields. The Index Pattern tool gives you the ability to create an index pattern from within the UI. Index patterns point to one or more indexes, data streams, or index aliases. - **[Data Sources]({{site.url}}{{site.baseurl}}/dashboards/management/multi-data-sources/):** The Data Sources tool is used to configure and manage the data sources that OpenSearch uses to collect and analyze data. You can use the tool to specify the source configuration in your copy of the [OpenSearch Dashboards configuration file]({{site.url}}{{site.baseurl}}https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/config/opensearch_dashboards.yml). diff --git a/_dashboards/visualize/maps-stats-api.md b/_dashboards/visualize/maps-stats-api.md index f81c7e6ac4..7c974a33d9 100644 --- a/_dashboards/visualize/maps-stats-api.md +++ b/_dashboards/visualize/maps-stats-api.md @@ -93,7 +93,7 @@ The following is the response for the preceding request: } ``` -## Response fields +## Response body fields The response contains statistics for the following layer types: diff --git a/_dashboards/visualize/viz-index.md b/_dashboards/visualize/viz-index.md index 75407a6ba5..4bde79d2cc 100644 --- a/_dashboards/visualize/viz-index.md +++ b/_dashboards/visualize/viz-index.md @@ -81,7 +81,7 @@ Region maps show patterns and trends across geographic locations. A region map i ### Markdown -Markdown is a the markup language used in Dashboards to provide context to your data visualizations. Using Markdown, you can display information and instructions along with the visualization. +Markdown is the markup language used in Dashboards to provide context to your data visualizations. Using Markdown, you can display information and instructions along with the visualization. <img src="{{site.url}}{{site.baseurl}}/images/dashboards/markdown.png" width="600" height="600" alt="Example coordinate map in OpenSearch Dashboards"> diff --git a/_dashboards/workspace/apis.md b/_dashboards/workspace/apis.md new file mode 100644 index 0000000000..683488e423 --- /dev/null +++ b/_dashboards/workspace/apis.md @@ -0,0 +1,386 @@ +--- +layout: default +title: Workspaces APIs +parent: Workspace for OpenSearch Dashboards +nav_order: 10 +--- + +# Workspaces APIs +Introduced 2.18 +{: .label .label-purple } + +The Workspaces API provides a set of endpoints for managing workspaces in OpenSearch Dashboards. + +## List Workspaces API + +You can use the following endpoint to retrieve a list of workspaces: + +```json +POST <osd host>:<port>/api/workspaces/_list +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `search` | String | Optional | A query string used to filter workspaces with simple query syntax, for example, `simple_query_string`. | +| `searchFields` | Array | Optional | Specifies which fields to perform the search query against. 
| +| `sortField` | String | Optional | The field name to use for sorting results. | +| `sortOrder` | String | Optional | Specifies ascending or descending sort order. | +| `perPage` | Number | Optional | The number of workspace results per page. | +| `page` | Number | Optional | The number of pages of results to retrieve. | +| `permissionModes` | Array | Optional | A list of permissions to filter by. | + +#### Example request + +```json +POST /api/workspaces/_list +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "page": 1, + "per_page": 20, + "total": 3, + "workspaces": [ + { + "name": "test1", + "features": [ + "use-case-all" + ], + "id": "hWNZls" + }, + { + "name": "test2", + "features": [ + "use-case-observability" + ], + "id": "SnkOPt" + } + ] + } +} +``` +{% include copy-curl.html %} + +## Get Workspaces API + +You can use the following endpoint to retrieve a single workspace: + +```json +GET <osd host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. All path parameters are required. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be retrieved. | + +#### Example request + +```json +GET /api/workspaces/SnkOPt +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "name": "test2", + "features": ["use-case-all"], + "id": "SnkOPt" + } +} +``` +{% include copy-curl.html %} + +## Create Workspaces API + +You can use the following endpoint to create a workspace: + +```json +POST <osd host>:<port>/api/workspaces +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `attributes` | Object | Required | Defines the workspace attributes. | +| `permissions` | Object | Optional | Specifies the permissions for the workspace. | + +#### Example request + +```json +POST api/workspaces +{ + "attributes": { + "name": "test4", + "description": "test4" + } +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "id": "eHVoCJ" + } +} +``` +{% include copy-curl.html %} + +## Update Workspaces API + +You can use the following endpoint to update the attributes and permissions for a workspace: + +```json +PUT <osd host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be retrieved. | +| `attributes` | Object | Required | Defines the workspace attributes. | +| `permissions` | Object | Optional | Specifies the permissions for the workspace. 
| + +#### Example request + +```json +PUT api/workspaces/eHVoCJ +{ + "attributes": { + "name": "test4", + "description": "test update" + } +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": true +} +``` +{% include copy-curl.html %} + +## Delete Workspaces API + +You can use the following endpoint to delete a workspace: + +```json +DELETE <osd host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. All path parameters are required. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be retrieved. | + +#### Example request + +```json +DELETE api/workspaces/eHVoCJ +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": true +} +``` +{% include copy-curl.html %} + +## Duplicate Saved Objects Workspaces API + +You can use the following endpoint to copy saved objects between workspaces: + +```json +POST <osd host>:<port>/api/workspaces/_duplicate_saved_objects +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `objects` | Array | Required | Specifies the saved objects to be duplicated. | +| `targetWorkspace` | String | Required | Identifies the destination workspace for copying. | +| `includeReferencesDeep` | Boolean | Optional | Determines whether to copy all referenced objects to the target workspace. Default is `true`. | + +The following table lists the attributes of the object in the `objects` parameter. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | Defines the saved object classification, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_duplicate_saved_objects +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "successCount": 1, + "success": true, + "successResults": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d", + "meta": { + "title": "test*", + "icon": "indexPatternApp" + }, + "destinationId": "f4b724fd-9647-4bbf-bf59-610b43a62c75" + } + ] +} +``` +{% include copy-curl.html %} + +## Associate Saved Objects Workspaces API + +You can use the following endpoint to associate saved objects with a workspace: + +```json +POST <osd host>:<port>/api/workspaces/_associate +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `workspaceId` | String | Required | Identifies the target workspace for object association. | +| `savedObjects` | Array | Required | Specifies the list of saved objects to be copied. | + +The following table lists the attributes of the object in the `objects` parameter. 
+ +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | Defines the saved object classification, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_associate +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": [ + { + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d", + } + ] +} +``` +{% include copy-curl.html %} + +## Dissociate Saved Objects Workspaces API + +You can use the following endpoint to dissociate saved objects from a workspace: + +```json +POST <osd host>:<port>/api/workspaces/_dissociate +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `workspaceId` | String | Required | The target workspace with which to associate the objects. | +| `savedObjects` | Array | Required | A list of saved objects to copy. | + +The following table lists the attributes of the `savedObjects` parameter. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | The type of the saved object, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_dissociate +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": [ + { + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d", + } + ] +} +``` +{% include copy-curl.html %} diff --git a/_dashboards/workspace/create-workspace.md b/_dashboards/workspace/create-workspace.md new file mode 100644 index 0000000000..34ba65bb54 --- /dev/null +++ b/_dashboards/workspace/create-workspace.md @@ -0,0 +1,52 @@ +--- +layout: default +title: Create a workspace +parent: Workspace for OpenSearch Dashboards +nav_order: 1 +--- + +# Create a workspace +Introduced 2.18 +{: .label .label-purple } + +Before getting started with this tutorial, you must enable the workspace feature flag. See [Enabling the ACL feature]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace/#enabling-the-workspace-feature) for more information. + +When the saved objects permission is enabled, only users with admin status can create workspaces. See [Configuring the dashboard admin]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information. + +To create a workspace, follow these steps: + +1. Open OpenSearch Dashboards. +2. From the main page, choose the appropriate card for your use case, for example, **Observability**, **Security Analytics**, **Search**, **Essentials**, or **Analytics**. Alternatively, you can select the **Create workspace** button and choose the appropriate use case from the dropdown menu. +3. Enter the required information in the **Workspace details** window. + - **Workspace name** is required. 
Valid characters are `a-z`, `A-Z`, `0-9`, parentheses (`()`), brackets (`[]`), underscore (`_`), hyphen (`-`), and spaces. Choose a unique workspace name within the character limit (40 characters). The **Create workspace** button is disabled when the workspace name already exists or exceeds the character limit, and an error message appears. + - **Use case and features** is required. Choose the use case that best fits your needs. If you are using Amazon OpenSearch Serverless and have enabled the [multiple data sources]({{site.url}}{{site.baseurl}}/dashboards/management/data-sources/) feature, **Essentials** is automatically assigned. +4. (Optional) Select the color picker to customize the color of your workspace icon. +5. (Optional) Add a workspace description of up to 200 characters. This option is disabled when the description exceeds the character limit. +6. Save your workspace. + - The **Create workspace** button becomes active once you enter the information for all required fields. You become the workspace owner automatically. The system redirects you to either the collaborators page if the saved objects permission is enabled or the overview page if the saved objects permission is disabled. See [Configuring dashboard admin]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information about permissions. + +To set up permissions, see [Workspace access control lists]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/) for more information. + +## Associating data sources with a workspace + +The **Associate data source** option is only visible when the multiple data sources feature is enabled. Before creating your workspace, you must connect it with at least one data source. If you have not set up your data sources, see [Data sources]({{site.url}}{{site.baseurl}}/dashboards/management/data-sources/). Once your sources are connected, you can link them to your new workspace. +{: .warning} + +### Associating OpenSearch data sources + +To associate OpenSearch data sources, follow these steps: + +1. Select the **Associate OpenSearch Data Sources** button to open the selection modal. +2. View the available data sources in the modal: + - Standard OpenSearch sources appear as single entries. + - Sources with direct query connections show a +N indicator. +3. Select the appropriate data source name(s). +4. Select the **Associate data sources** button to complete the association. + +### Associating direct query sources + +To associate direct query sources, follow these steps: + +1. Select the **Associate direct query data sources** button to open the selection modal. The modal displays only sources with direct query connections. +2. Select a data source to automatically expand its direct query connections. +3. Select the **Associate data sources** button to complete the association. diff --git a/_dashboards/workspace/index.md b/_dashboards/workspace/index.md new file mode 100644 index 0000000000..f0f572a4a5 --- /dev/null +++ b/_dashboards/workspace/index.md @@ -0,0 +1,27 @@ +--- +layout: default +title: Getting started with workspaces +parent: Workspace for OpenSearch Dashboards +nav_order: 0 +--- + +# Getting started with workspaces +Introduced 2.18 +{: .label .label-purple } + +OpenSearch Dashboards 2.18 introduces an enhanced home page that provides a comprehensive view of all your workspaces. + +The new home page includes the following features: + +1. 
A **Create workspace** button for [OpenSearch Dashboards admins]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) to navigate to the [create workspace]({{site.url}}{{site.baseurl}}/dashboards/workspace/create-workspace) page. +2. Workspace access time information and a link to the workspace overview page. +3. A use case information icon that displays information about the workspace's purpose. +4. A **View all workspaces** button that navigates to the [workspace management]({{site.url}}{{site.baseurl}}/dashboards/workspace/manage-workspace/#navigating-the-workspaces-list) page. +5. Links to the latest OpenSearch documentation through the **Learn more from documentation** button and to [OpenSearch Playground](https://playground.opensearch.org/app/home#/) through the **Explore live demo environment at playground.opensearch.org** button. + +The navigation logic ensures a seamless user experience by directing you to the appropriate page based on your workspace access level: + +- If you have a default workspace configured, you are directed to the workspace overview page. +- If you have only one workspace, you are directed to the overview page of that workspace. +- If you have multiple workspaces, you are directed to the new home page. +- If you have no workspaces, you are directed to the new home page. diff --git a/_dashboards/workspace/manage-workspace.md b/_dashboards/workspace/manage-workspace.md new file mode 100644 index 0000000000..45733d75be --- /dev/null +++ b/_dashboards/workspace/manage-workspace.md @@ -0,0 +1,118 @@ +--- +layout: default +title: Manage workspaces +parent: Workspace for OpenSearch Dashboards +nav_order: 2 +--- + +# Manage workspaces +Introduced 2.18 +{: .label .label-purple } + +You can access and modify the workspace details, including name, description, use case, and icon color, on the **Workspace details** page. + +To access and modify your workspace details, follow these steps: + +1. Open OpenSearch Dashboards and navigate to **My Workspaces**. +2. Choose the desired workspace and then select the **Edit** button to make changes. +3. Select the **Save** button to confirm changes or the **Discard changes** button to cancel modifications. + +## Workspace update permissions + +The following permissions apply when changing workspaces: + +1. **Without the Security plugin:** All users can edit and update the workspace. +2. **With the Security plugin installed and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All users can edit and update workspaces. +3. **With the Security plugin and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only the [workspace owner]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#defining-workspace-collaborators) and the [workspace admins]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) can edit and update workspaces. + +## Workspace update restrictions + +When updating workspace use cases, the following rules apply. + +Original use case | Target use case | +:---: | :---: +Analytics | Cannot be changed to any other use case +Search | Analytics +Security analytics | Analytics +Observability | Analytics +Essentials | Analytics<br> Search<br> Security Analytics<br> Observability + +## Workspace control panel + +The **Workspace details** page features the following buttons in the upper-right corner: + +1.
**Delete** ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dashboards/trash-can-icon.png" class="inline-icon" alt="trash can icon"/>{:/} icon) + - **Without the Security plugin installed:** All users can delete the workspace. + - **With the Security plugins installed and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All users can delete the workspace. + - **With the Security plugin installed and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only the admin can delete the workspace. +2. **Set as default workspace:** Sets the current workspace as the default login destination. +3. **Workspace overview:** Opens the **Overview** page in a new tab. + +## Adding assets to the workspace + +Access the **Sample data** in the navigation menu on the left. Select the appropriate dataset to install it in your cluster and OpenSearch Dashboards. + +## Copying assets between workspaces + +Data sources and configuration copying are not supported. +{: .warning} + +The assets page provides the following methods for copying assets across workspaces: + +1. **Copy all assets to...:** Copies all assets in the table. +2. **Copy to...:** Moves selected assets from the table. +3. **Copy to...:** Copies a single asset from the table. + +After selecting a copy option, choose the target workspace from the dropdown menu. The **Copy related assets** checkbox allows you to transfer associated assets. + +Upon selecting the **Copy** button, a side panel appears showing successful and failed asset transfers. Asset copy destinations depend on the following security configurations: + +1. **Without the Security plugin:** All workspaces are accessible. +2. **With the Security plugin and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All workspaces are accessible. +3. **With the Security plugin and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only workspaces for which the user has read and write or admin permissions are accessible. + +## Associating data sources + +On the data source management page, you can access a comprehensive list of associated OpenSearch connections, monitor direct query connections relevant to your current workspace, and establish new data source associations as needed. + +### Managing OpenSearch connections + +The OpenSearch connections tab displays all associated connections for the current workspace. Follow these steps to manage your connections: + +1. Access a comprehensive list of associated OpenSearch connections on the connections tab. +2. Use the **Remove association** button to unlink connections as needed. +3. Add new data sources by selecting the **OpenSearch data sources** button and subsequent modal. +4. Select from unassociated OpenSearch connections to expand your workspace's capabilities. + +### Adding direct query connections + +The **Direct query connections** tab displays a list of all direct query connections associated with your current workspace. To add more direct query connections to your workspace, select the **Direct query data sources** button. A modal window opens. + +The association modal displays a list of OpenSearch connections that contain direct query connections and have not yet been associated with your current workspace. 
When you associate an OpenSearch connection with your current workspace, all direct query connections within that OpenSearch connection are automatically associated as well. + +## Deleting your workspace + +Workspace deletion is restricted to admins. If you do not see a {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dashboards/trash-can-icon.png" class="inline-icon" alt="trash can icon"/>{:/} icon, check your permissions. See [Configuring dashboard administrators]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information. +{: .warning} + +Deleting a workspace permanently erases all its assets (except data sources) and the workspace itself. This action cannot be reversed. + +To delete a workspace, follow these steps: + +1. From the **Workspace details** page, select the {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dashboards/trash-can-icon.png" class="inline-icon" alt="trash can icon"/>{:/} icon in the upper-right corner to delete the current workspace. +2. Alternatively, from the workspace list page, select the {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/ellipsis-icon.png" class="inline-icon" alt="ellipsis icon"/>{:/} icon and select **Delete**. Optionally, select multiple workspaces for bulk deletion. + +## Navigating the workspaces list + +The workspaces list page serves as your central hub for workspace management, displaying all workspaces for which you have access permissions. Key features include the following: + +- Search: Quickly find a workspace by name. +- Filter: Sort workspaces by use case. +- At a glance: View each workspace's name, use case, description, last update time, and associated data sources. + +Each workspace entry includes an **Actions** column with the following functional buttons. These tools streamline your workspace management, allowing for efficient organization and customization of your OpenSearch Dashboards environment: + +1. Copy ID: One-click copying of the workspace ID. +2. Edit: Direct access to the workspace's detailed configuration page. +3. Set as default: Easily set any workspace as your default workspace. +4. Delete: Remove workspaces as needed (may require admin privileges). diff --git a/_dashboards/workspace/workspace-acl.md b/_dashboards/workspace/workspace-acl.md new file mode 100644 index 0000000000..16b2cc8628 --- /dev/null +++ b/_dashboards/workspace/workspace-acl.md @@ -0,0 +1,153 @@ +--- +layout: default +title: Workspace access control lists +parent: Workspace for OpenSearch Dashboards +nav_order: 3 +--- + +# Workspace access control lists +Introduced 2.18 +{: .label .label-purple } + +Workspace access control lists (ACLs) manage authorization for saved objects `AuthZ(Authorization)` while enabling [Security in OpenSearch]({{site.url}}{{site.baseurl}}/security/) for `AuthN(Authentication)`. + +## Personas + +**Workspace** use cases involve the following key personas: + +* **Dashboard admin:** Has full access to all OpenSearch Dashboards functions and data. +* **Workspace administrator (also called _owner_):** Has full control over a specific workspace, including its configuration and saved objects. When a workspace is created, its creator is automatically assigned the role of workspace owner. +* **Workspace content producer:** Can view, create, and update saved objects within the workspace. +* **Workspace viewer:** Has read-only access to saved objects in the workspace. 
+ + Roles are workspace-specific, allowing users to assume different roles across workspaces. + {: .note} + +## Enabling permission control + +See [Enabling the ACL feature]({{site.url}}{{site.baseurl}}/dashboards/management/acl#enabling-the-acl-feature) for instructions. + +## Configuring dashboard administrators + +To grant full access to all workspaces and objects in OpenSearch Dashboards, configure the admin permissions. Edit the `opensearch_dashboards.yml` file to define the admin by user ID and backend role, as shown in the following configuration: + +```yaml +opensearchDashboards.dashboardAdmin.users: ["UserID"] +opensearchDashboards.dashboardAdmin.groups: ["BackendRole"] +savedObjects.permission.enabled: true +``` +{% include copy.html %} + +By default, the configuration is set to `[]`, meaning that no users are designated as admins. If the Security plugin is not installed and `savedObjects.permission.enabled: false`, all users are granted admin permissions. + +### Configuring global admin access + +Set all users as admins with this wildcard setting: + +```yaml +opensearchDashboards.dashboardAdmin.users: ["*"] +``` +{% include copy-curl.html %} + +### Configuring admin access for a single user + +Configure admin access for a single user by specifying the user's ID: + +```yaml +opensearchDashboards.dashboardAdmin.users: ["admin-user-id"] +``` +{% include copy-curl.html %} + +### Configuring admin access by backend role + +Configure admin access for a backend role by specifying the role name: + +```yaml +opensearchDashboards.dashboardAdmin.groups: ["admin-role"] +``` +{% include copy-curl.html %} + +### Admin-restricted operations + +Admin-restricted operations include the following: + +- Workspace creation +- Workspace deletion +- Data source connections +- Disconnecting data sources from workspaces + +## Defining workspace collaborators + +Access to collaborator management is limited to admins. The **Collaborators** feature is only available when permission control is enabled. For instructions on activating permission control, see [Enabling permission control](#enabling-permission-control). The access levels include the following: + +- **Read only:** Grants permission to view the workspace and its assets. +- **Read and write:** Allows viewing and editing of assets within the workspace. +- **Admin:** Provides full access, including viewing and editing of assets within the workspace and updating workspace metadata, such as name, description, data sources, and collaborators. + +From the **Collaborators** page, you can search by collaborator ID and filter results by collaborator type and access level. + +### Adding collaborators + +Workspace creators are granted the **Admin** access level as a collaborator. To add more collaborators, select the **Add collaborators** button, which displays a dropdown menu. Choose **Add Users** or **Add Groups** to access the corresponding modal for adding new collaborators. + +#### Adding users + +To add users, follow these steps: + +1. Select the **Add Users** button to open the modal. The modal displays one empty `User ID` field by default. +2. Choose an access level: **Read only**, **Read and write**, or **Admin**. +3. Choose **Add another User** to add multiple users. Do not use duplicate or existing `User ID` fields to avoid errors. +4. Resolve any errors before finalizing. Successfully added users appear in the collaborators table. + +#### Adding groups + +To add groups, follow these steps: + +1. Select the **Add Groups** button to open the modal.
The modal displays one empty `Group ID` field by default. +2. Choose an access level: **Read only**, **Read and write**, or **Admin**. +3. Use **Add another group** to add multiple groups. Do not use duplicate or existing `Group ID` fields to avoid errors. +4. Resolve any errors before finalizing. Successfully added groups appear in the collaborators table. + +### Modifying access levels + +You can modify collaborators' access levels after adding them to the collaborators table if you have the required permissions. Collaborators can be assigned any access level. However, if all **Admin** collaborators are changed to lower access levels, then only admins can manage workspace collaboration. + +#### Modifying individual access levels + +To modify a single collaborator's access level, follow these steps: + +1. Select the action icon on the right of the table row. +2. Select **Change access level** from the dropdown menu. +3. Choose the desired access level from the list. +4. Confirm the change in the modal that appears and select **Confirm**. The collaborator's access level is updated in the table upon confirmation. + +#### Modifying access levels in batch + +To change access levels for several collaborators simultaneously, follow these steps: + +1. Select the desired collaborator rows in the table. +2. Select the **Actions** button that appears. +3. Select **Change access level** from the dropdown menu. +4. Select the new access level from the list provided. +5. Review and confirm the changes in the modal that appears. The access levels for all selected collaborators are updated in the table upon confirmation. + +### Deleting collaborators + +After adding collaborators to the table, you have the option to delete them. Be cautious when removing admin collaborators because deleting all of them restricts workspace collaborator management to admins only. A confirmation modal is displayed before finalizing this action. + +#### Deleting individual collaborators + +To delete an individual collaborator, follow these steps: + +1. Select the {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/ellipsis-icon.png" class="inline-icon" alt="ellipsis icon"/>{:/} icon on the right of the table row to display a dropdown menu. +2. Select **Delete collaborator** from the dropdown menu. A confirmation modal appears to verify your action. +3. Select **Confirm** in the modal to remove the collaborator from the table. + +#### Deleting collaborators in batch + +To remove several collaborators simultaneously, follow these steps: + +1. Select the rows containing the collaborators you want to remove from the table. A "Delete x collaborators" button appears. +2. Select the **Delete x collaborators** button. +3. Review the confirmation modal that appears. +4. Select **Confirm** to remove all selected collaborators from the table. diff --git a/_dashboards/workspace/workspace.md b/_dashboards/workspace/workspace.md new file mode 100644 index 0000000000..0938c48891 --- /dev/null +++ b/_dashboards/workspace/workspace.md @@ -0,0 +1,118 @@ +--- +layout: default +title: Workspace for OpenSearch Dashboards +nav_order: 110 +has_children: true +--- + +# Workspace for OpenSearch Dashboards +Introduced 2.18 +{: .label .label-purple } + +The Workspace feature in OpenSearch Dashboards enables you to tailor your environment with use-case-specific configurations. For example, you can create dedicated workspaces for observability scenarios, allowing you to focus on relevant functionalities.
Additionally, the Workspace feature enables organization of visual assets, such as dashboards and visualizations, within a workspace with isolated storage. + +## Workspace data model + +The Workspace data model is defined by the following structure: + +```typescript +interface Workspace { + id: string; + name: string; + description?: string; + features?: string[]; + color: string; + uiSettings: Record<string, unknown>; +} +``` +{% include copy-curl.html %} + +The Workspace data model is composed of the following key attributes: + +- `id`: String type; unique ID for each workspace. +- `name`: String type; designates the name of the workspace. +- `description`: Optional string type; provides contextual information for the workspace. +- `features`: Optional array of strings; contains use case IDs linked to the workspace. + +--- + +#### Example Workspace object + +The following object shows a typical Workspace configuration: + +```typescript +{ + id: "M5NqCu", + name: "Analytics team", + description: "Analytics team workspace", + features: ["use-case-analytics"], +} +``` +{% include copy-curl.html %} + +The configuration creates the `Analytics team` workspace using the `use-case-analytics` feature set. Use cases map to specific feature groups, limiting functionality to the defined set within each workspace. + +The following are predefined use case options: + +- `use-case-observability` +- `use-case-security-analytics` +- `use-case-search` +- `use-case-essentials` +- `use-case-all` + +--- + +## Associating saved objects with workspaces + +Saved objects in OpenSearch Dashboards, such as dashboards, visualizations, and index patterns, can be associated with specific workspaces, improving organization and accessibility as the volume of objects grows. + +The `workspaces` attribute, an array of strings, is added to saved objects to be linked with one or more workspaces. As a result, saved objects such as dashboards and visualizations are only accessible within their designated workspaces. + +The following saved object shows a dashboard object associated with the workspace `M5NqCu`: + +```typescript +{ + type: "dashboard", + id: "da123f20-6680-11ee-93fa-df944ec23359", + workspaces: ["M5NqCu"] +} +``` +{% include copy-curl.html %} + +Saved objects support association with multiple workspaces, facilitating cross-team collaboration and resource sharing. This feature is useful when an object is relevant to multiple teams, projects, or use cases. + +The following example shows a data source object linked to multiple workspaces: + +```typescript +{ + type: "data-source", + id: "da123f20-6680-11ee-93fa-df944ec23359", + workspaces: ["M5NqCu", "<TeamA-workspace-id>", "<Analytics-workspace-id>"] +} +``` +{% include copy-curl.html %} + +## Non-workspace saved objects + +Not all saved objects in OpenSearch Dashboards are associated with a workspace. Some objects operate independently of the workspace framework. These objects lack the `workspaces` attribute and serve system-wide functions. For example, the global user interface settings object manages configurations affecting the entire OpenSearch Dashboards interface in order to maintain consistent functionality across all workspaces. + +This dual approach allows OpenSearch Dashboards to balance granular, context-specific customization with overall system consistency.
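For contrast, the following sketch shows the general shape of a global object that is not scoped to any workspace. This is a hypothetical illustration only: the `config` type, the version-style ID, and the comment are assumed example values rather than values defined by this documentation.

```typescript
{
  // No `workspaces` attribute: this object is global and applies to every workspace.
  type: "config",
  id: "2.18.0"
}
```

Because global objects like this are shared across the entire OpenSearch Dashboards instance, changes to them affect all workspaces.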
+ +## Enabling the Workspace feature + +In your `opensearch_dashboards.yml` file, set the following option: + +```yaml +workspace.enabled: true +uiSettings: + overrides: + "home:useNewHomePage": true +``` +{% include copy-curl.html %} + +If your cluster has the Security plugin installed, then multi-tenancy must be disabled to avoid conflicts with similar workspaces: + +```yaml +opensearch_security.multitenancy.enabled: false +``` +{% include copy-curl.html %} diff --git a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md index 9628bb6caf..ba574bdf7d 100644 --- a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md +++ b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md @@ -53,6 +53,7 @@ You can configure `random_cut_forest` mode with the following options. | `sample_size` | `256` | 100--2500 | The sample size used in the ML algorithm. | | `time_decay` | `0.1` | 0--1.0 | The time decay value used in the ML algorithm. Used as the mathematical expression `timeDecay` divided by `SampleSize` in the ML algorithm. | | `type` | `metrics` | N/A | The type of data sent to the algorithm. | +| `output_after` | 32 | N/A | Specifies the number of events to process before outputting any detected anomalies. | | `version` | `1.0` | N/A | The algorithm version number. | ## Usage diff --git a/_data-prepper/pipelines/configuration/processors/aws-lambda.md b/_data-prepper/pipelines/configuration/processors/aws-lambda.md new file mode 100644 index 0000000000..bd167996a1 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/aws-lambda.md @@ -0,0 +1,94 @@ +--- +layout: default +title: aws_lambda +parent: Processors +grand_parent: Pipelines +nav_order: 10 +--- + +# aws_lambda integration for Data Prepper + +The [AWS Lambda](https://aws.amazon.com/lambda/) integration allows developers to use serverless computing capabilities within their Data Prepper pipelines for flexible event processing and data routing. + +## AWS Lambda processor configuration + +The `aws_lambda` processor enables invocation of an AWS Lambda function within your Data Prepper pipeline in order to process events. It supports both synchronous and asynchronous invocations based on your use case. + +## Configuration fields + +You can configure the processor using the following configuration options. + +Field | Type | Required | Description +-------------------- | ------- | -------- | ---------------------------------------------------------------------------- +`function_name` | String | Required | The name of the AWS Lambda function to invoke. +`invocation_type` | String | Required | Specifies the invocation type, either `request-response` or `event`. Default is `request-response`. +`aws.region` | String | Required | The AWS Region in which the Lambda function is located. +`aws.sts_role_arn` | String | Optional | The Amazon Resource Name (ARN) of the role to assume before invoking the Lambda function. +`max_retries` | Integer | Optional | The maximum number of retries for failed invocations. Default is `3`. +`batch` | Object | Optional | The batch settings for the Lambda invocations. Default is `key_name = "events"`. Default threshold is `event_count=100`, `maximum_size="5mb"`, and `event_collect_timeout = 10s`. +`lambda_when` | String | Optional | A conditional expression that determines when to invoke the Lambda processor. 
+`response_codec` | Object | Optional | A codec configuration for parsing Lambda responses. Default is `json`. +`tags_on_match_failure` | List | Optional | A list of tags to add to events when Lambda matching fails or encounters an unexpected error. +`sdk_timeout` | Duration| Optional | Configures the SDK's client connection timeout period. Default is `60s`. +`response_events_match` | Boolean | Optional | Specifies how Data Prepper interprets and processes Lambda function responses. Default is `false`. + +#### Example configuration + +``` +processors: + - aws_lambda: + function_name: "my-lambda-function" + invocation_type: "request-response" + response_events_match: false + aws: + region: "us-east-1" + sts_role_arn: "arn:aws:iam::123456789012:role/my-lambda-role" + max_retries: 3 + batch: + key_name: "events" + threshold: + event_count: 100 + maximum_size: "5mb" + event_collect_timeout: PT10S + lambda_when: "event['status'] == 'process'" + +``` +{% include copy-curl.html %} + +## Usage + +The processor supports the following invocation types: + +- `request-response`: The processor waits for Lambda function completion before proceeding. +- `event`: The function is triggered asynchronously without waiting for a response. +- `batch`: When enabled, events are aggregated and sent in bulk to optimize Lambda invocations. Batch thresholds control the event count, size limit, and timeout. +- `codec`: JSON is used for both request and response codecs. Lambda must return JSON array outputs. +- `tags_on_match_failure`: Custom tags can be applied to events when Lambda processing fails or encounters unexpected issues. + +## Behavior + +When configured for batching, the AWS Lambda processor groups multiple events into a single request. This grouping is governed by batch thresholds, which can be based on the event count, size limit, or timeout. The processor then sends the entire batch to the Lambda function as a single payload. + +## Lambda response handling + +The `response_events_match` setting defines how Data Prepper handles the relationship between batch events sent to Lambda and the response received: + +- `true`: Lambda returns a JSON array with results for each batched event. Data Prepper maps this array back to its corresponding original event, ensuring that each event in the batch gets the corresponding part of the response from the array. +- `false`: Lambda returns one or more events for the entire batch. Response events are not correlated with the original events. Original event metadata is not preserved in the response events. For example, when `response_events_match` is set to `true`, the Lambda function is expected to return the same number of response events as the number of original requests, maintaining the original order. + +## Limitations + +Note the following limitations: + +- Payload limitation: 6 MB payload limit +- Response codec: JSON-only codec support + +## Integration testing + +Integration tests for this plugin are executed separately from the main Data Prepper build process. 
Use the following Gradle command to run these tests: + +``` +./gradlew :data-prepper-plugins:aws-lambda:integrationTest -Dtests.processor.lambda.region="us-east-1" -Dtests.processor.lambda.functionName="lambda_test_function" -Dtests.processor.lambda.sts_role_arn="arn:aws:iam::123456789012:role/dataprepper-role +``` + +{% include copy-curl.html %} diff --git a/_data-prepper/pipelines/configuration/processors/copy-values.md b/_data-prepper/pipelines/configuration/processors/copy-values.md index f654e6f027..20d6a4a2c7 100644 --- a/_data-prepper/pipelines/configuration/processors/copy-values.md +++ b/_data-prepper/pipelines/configuration/processors/copy-values.md @@ -14,44 +14,126 @@ The `copy_values` processor copies values within an event and is a [mutate event You can configure the `copy_values` processor with the following options. -| Option | Required | Description | -:--- | :--- | :--- -| `entries` | Yes | A list of entries to be copied in an event. | -| `from_key` | Yes | The key of the entry to be copied. | -| `to_key` | Yes | The key of the new entry to be added. | -| `overwrite_if_to_key_exists` | No | When set to `true`, the existing value is overwritten if `key` already exists in the event. The default value is `false`. | +| Option | Required | Type | Description | +:--- | :--- | :--- | :--- +| `entries` | Yes | [entry](#entry) | A list of entries to be copied in an event. See [entry](#entry) for more information. | +| `from_list` | No | String | The key for the list of objects to be copied. | +| `to_list` | No | String | The key for the new list to be added. | +| `overwrite_if_to_list_exists` | No | Boolean | When set to `true`, the existing value is overwritten if the `key` specified by `to_list` already exists in the event. Default is `false`. | + +## entry + +For each entry, you can configure the following options. + +| Option | Required | Type | Description | +:--- | :--- | :--- | :--- +| `from_key` | Yes | String | The key for the entry to be copied. | +| `to_key` | Yes | String | The key for the new entry to be added. | +| `overwrite_if_to_key_exists` | No | Boolean | When set to `true`, the existing value is overwritten if the `key` already exists in the event. Default is `false`. | + ## Usage -To get started, create the following `pipeline.yaml` file: +The following examples show you how to use the `copy_values` processor. + +### Example: Copy values and skip existing fields + +The following example shows you how to configure the processor to copy values and skip existing fields: + +```yaml +... + processor: + - copy_values: + entries: + - from_key: "message1" + to_key: "message2" + - from_key: "message1" + to_key: "message3" +... +``` +{% include copy.html %} + +When the input event contains the following data: + +```json +{"message1": "hello", "message2": "bye"} +``` + +The processor copies "message1" to "message3" but not to "message2" because "message2" already exists. The processed event contains the following data: + +```json +{"message1": "hello", "message2": "bye", "message3": "hello"} +``` + +### Example: Copy values with overwrites + +The following example shows you how to configure the processor to copy values: ```yaml -pipeline: - source: - ... - .... +... processor: - copy_values: entries: - - from_key: "message" - to_key: "newMessage" - overwrite_if_to_key_exists: true - sink: + - from_key: "message1" + to_key: "message2" + overwrite_if_to_key_exists: true + - from_key: "message1" + to_key: "message3" +... 
``` {% include copy.html %} -Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +When the input event contains the following data: + +```json +{"message1": "hello", "message2": "bye"} +``` -For example, before you run the `copy_values` processor, if the `logs_json.log` file contains the following event record: +The processor copies "message1" to both "message2" and "message3", overwriting the existing value in "message2". The processed event contains the following data: ```json -{"message": "hello"} +{"message1": "hello", "message2": "hello", "message3": "hello"} ``` -When you run this processor, it parses the message into the following output: +### Example: Selectively copy values between two lists of objects + +The following example shows you how to configure the processor to copy values between lists: + +```yaml +... + processor: + - copy_values: + from_list: mylist + to_list: newlist + entries: + - from_key: name + to_key: fruit_name +... +``` +{% include copy.html %} + +When the input event contains the following data: ```json -{"message": "hello", "newMessage": "hello"} +{ + "mylist": [ + {"name": "apple", "color": "red"}, + {"name": "orange", "color": "orange"} + ] +} ``` -If `newMessage` already exists, its existing value is overwritten with `value`. +The processed event contains a `newlist` with selectively copied fields: + +```json +{ + "newlist": [ + {"fruit_name": "apple"}, + {"fruit_name": "orange"} + ], + "mylist": [ + {"name": "apple", "color": "red"}, + {"name": "orange", "color": "orange"} + ] +} +``` diff --git a/_data-prepper/pipelines/configuration/sinks/aws-lambda.md b/_data-prepper/pipelines/configuration/sinks/aws-lambda.md new file mode 100644 index 0000000000..d8c00bdb16 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sinks/aws-lambda.md @@ -0,0 +1,73 @@ +--- +layout: default +title: aws_lambda +parent: Sinks +grand_parent: Pipelines +nav_order: 10 +--- + +---------------------------------------------------------------------------------------- +# `aws_lambda` sink for Data Prepper + +This page explains how to configure and use [AWS Lambda](https://aws.amazon.com/lambda/) with Data Prepper, enabling Lambda functions to serve as both processors and sinks. + +## `aws_lambda` sink + +Configure the Lambda sink using the following parameters. + +Field | Type | Required | Description +--------------------| ------- | -------- | ---------------------------------------------------------------------------- +`function_name` | String | Yes | The name of the AWS Lambda function to invoke. +`invocation_type` | String | No | Specifies the invocation type. Default is `event`. +`aws.region` | String | Yes | The AWS Region in which the Lambda function is located. +`aws.sts_role_arn` | String | No | The Amazon Resource Name (ARN) of the role to assume before invoking the Lambda function. +`max_retries` | Integer | No | The maximum number of retries if the invocation fails. Default is `3`. +`batch` | Object | No | Optional batch settings for Lambda invocations. Default is `key_name = events`. Default threshold is `event_count=100`, `maximum_size="5mb"`, and `event_collect_timeout = 10s`. +`lambda_when` | String | No | A conditional expression that determines when to invoke the Lambda sink. 
+`dlq` | Object | No | The dead-letter queue (DLQ) configuration for failed invocations.
+
+#### Example configuration
+
+```
+sink:
+  - aws_lambda:
+      function_name: "my-lambda-sink"
+      invocation_type: "event"
+      aws:
+        region: "us-west-2"
+        sts_role_arn: "arn:aws:iam::123456789012:role/my-lambda-sink-role"
+      max_retries: 5
+      batch:
+        key_name: "events"
+        threshold:
+          event_count: 50
+          maximum_size: "3mb"
+          event_collect_timeout: PT5S
+      lambda_when: "event['type'] == 'log'"
+      dlq:
+        region: "us-east-1"
+        sts_role_arn: "arn:aws:iam::123456789012:role/my-sqs-role"
+        bucket: "<<your-dlq-bucket-name>>"
+```
+{% include copy-curl.html %}
+
+## Usage
+
+The sink supports the following invocation types and options:
+
+- `event` (Default): Executes functions asynchronously without waiting for responses.
+- `request-response` (Sink only): Executes functions synchronously, though responses are not processed.
+- `batch`: Automatically groups events based on configured thresholds.
+- `dlq`: Supports the DLQ configuration for failed invocations after retry attempts.
+
+Data Prepper assumes the AWS Identity and Access Management (IAM) role specified by `aws.sts_role_arn` in order to invoke the Lambda function securely and respects Lambda's concurrency limits during event processing. For more information, see the [AWS Lambda documentation](https://docs.aws.amazon.com/lambda).
+{: .note}
+
+## Developer guide
+
+Integration tests must be executed separately from the main Data Prepper build. Run them using the following command:
+
+```
+./gradlew :data-prepper-plugins:aws-lambda:integrationTest -Dtests.sink.lambda.region="us-east-1" -Dtests.sink.lambda.functionName="lambda_test_function" -Dtests.sink.lambda.sts_role_arn="arn:aws:iam::123456789012:role/dataprepper-role"
+```
+{% include copy-curl.html %}
diff --git a/_data-prepper/pipelines/configuration/sources/documentdb.md b/_data-prepper/pipelines/configuration/sources/documentdb.md
index c453b60a39..d3dd31edcb 100644
--- a/_data-prepper/pipelines/configuration/sources/documentdb.md
+++ b/_data-prepper/pipelines/configuration/sources/documentdb.md
@@ -3,7 +3,7 @@ layout: default
title: documentdb
parent: Sources
grand_parent: Pipelines
-nav_order: 2
+nav_order: 10
---

# documentdb
diff --git a/_data-prepper/pipelines/configuration/sources/dynamo-db.md b/_data-prepper/pipelines/configuration/sources/dynamo-db.md
index e465f45044..c5a7c8d188 100644
--- a/_data-prepper/pipelines/configuration/sources/dynamo-db.md
+++ b/_data-prepper/pipelines/configuration/sources/dynamo-db.md
@@ -3,7 +3,7 @@ layout: default
title: dynamodb
parent: Sources
grand_parent: Pipelines
-nav_order: 3
+nav_order: 20
---

# dynamodb
diff --git a/_data-prepper/pipelines/configuration/sources/http.md b/_data-prepper/pipelines/configuration/sources/http.md
index 06933edc1c..574f49e289 100644
--- a/_data-prepper/pipelines/configuration/sources/http.md
+++ b/_data-prepper/pipelines/configuration/sources/http.md
@@ -3,14 +3,14 @@ layout: default
title: http
parent: Sources
grand_parent: Pipelines
-nav_order: 5
+nav_order: 30
redirect_from:
 - /data-prepper/pipelines/configuration/sources/http-source/
---

# http

-The `http` plugin accepts HTTP requests from clients. Currently, `http` only supports the JSON UTF-8 codec for incoming requests, such as `[{"key1": "value1"}, {"key2": "value2"}]`. The following table describes options you can use to configure the `http` source.
+The `http` plugin accepts HTTP requests from clients. The following table describes options you can use to configure the `http` source.
Option | Required | Type | Description :--- | :--- | :--- | :--- @@ -36,6 +36,19 @@ aws_region | Conditionally | String | AWS region used by ACM or Amazon S3. Requi Content will be added to this section.---> +## Ingestion + +Clients should send HTTP `POST` requests to the endpoint `/log/ingest`. + +The `http` protocol only supports the JSON UTF-8 codec for incoming requests, for example, `[{"key1": "value1"}, {"key2": "value2"}]`. + +#### Example: Ingest data with cURL + +The following cURL command can be used to ingest data: + +`curl "http://localhost:2021/log/ingest" --data '[{"key1": "value1"}, {"key2": "value2"}]'` +{% include copy-curl.html %} + ## Metrics The `http` source includes the following metrics. diff --git a/_data-prepper/pipelines/configuration/sources/kafka.md b/_data-prepper/pipelines/configuration/sources/kafka.md index e8452a93c3..ecd7c7eaa0 100644 --- a/_data-prepper/pipelines/configuration/sources/kafka.md +++ b/_data-prepper/pipelines/configuration/sources/kafka.md @@ -3,7 +3,7 @@ layout: default title: kafka parent: Sources grand_parent: Pipelines -nav_order: 6 +nav_order: 40 --- # kafka diff --git a/_data-prepper/pipelines/configuration/sources/kinesis.md b/_data-prepper/pipelines/configuration/sources/kinesis.md new file mode 100644 index 0000000000..659ed2afe1 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/kinesis.md @@ -0,0 +1,168 @@ +--- +layout: default +title: kinesis +parent: Sources +grand_parent: Pipelines +nav_order: 45 +--- + +# kinesis + +You can use the Data Prepper `kinesis` source to ingest records from one or more [Amazon Kinesis Data Streams](https://aws.amazon.com/kinesis/data-streams/). + +## Usage + +The following example pipeline specifies Kinesis as a source. The pipeline ingests data from multiple Kinesis data streams named `stream1` and `stream2` and sets the `initial_position` to indicate the starting point for reading the stream records: + +```yaml +version: "2" +kinesis-pipeline: + source: + kinesis: + streams: + - stream_name: "stream1" + initial_position: "LATEST" + - stream_name: "stream2" + initial_position: "LATEST" + aws: + region: "us-west-2" + sts_role_arn: "arn:aws:iam::123456789012:role/my-iam-role" +``` + +## Configuration options + +The `kinesis` source supports the following configuration options. + +Option | Required | Type | Description +:--- |:---------|:---------| :--- +`aws` | Yes | AWS | Specifies the AWS configuration. See [`aws`](#aws). +`acknowledgments` | No | Boolean | When set to `true`, enables the `kinesis` source to receive [end-to-end acknowledgments]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines#end-to-end-acknowledgments) when events are received by OpenSearch sinks. +`streams` | Yes | List | Configures a list of multiple Kinesis data streams that the `kinesis` source uses to read records. You can configure up to four streams. See [Streams](#streams). +`codec` | Yes | Codec | Specifies the [codec](#codec) to apply. +`buffer_timeout` | No | Duration | Sets the amount of time allowed for writing events to the Data Prepper buffer before timeout occurs. Any events that the source cannot write to the buffer during the specified amount of time are discarded. Default is `1s`. +`records_to_accumulate` | No | Integer | Determines the number of messages that accumulate before being written to the buffer. Default is `100`. +`consumer_strategy` | No | String | Selects the consumer strategy to use for ingesting Kinesis data streams. 
The default is `fan-out`, but `polling` can also be used. If `polling` is enabled, additional configuration is required.
+`polling` | No | polling | See [polling](#polling).
+
+### Streams
+
+You can use the following options in the `streams` array.
+
+Option | Required | Type | Description
+:--- |:---------| :--- | :---
+`stream_name` | Yes | String | Defines the name of each Kinesis data stream.
+`initial_position` | No | String | Sets the `initial_position` to determine at what point the `kinesis` source starts reading stream records. Use `LATEST` to start from the most recent record or `EARLIEST` to start from the beginning of the stream. Default is `LATEST`.
+`checkpoint_interval` | No | Duration | Configure the `checkpoint_interval` to periodically checkpoint Kinesis data streams and avoid duplication of record processing. Default is `PT2M`.
+`compression` | No | String | Specifies the compression format. To decompress records added by a [CloudWatch Logs Subscription Filter](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/SubscriptionFilters.html) to Kinesis, use the `gzip` compression format.
+
+## codec
+
+The `codec` determines how the `kinesis` source parses each Kinesis stream record. For increased and more efficient performance, you can use [codec combinations]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/codec-processor-combinations/) with certain processors.
+
+### `json` codec
+
+The `json` codec parses each Kinesis stream record as a JSON array and creates a Data Prepper event for each object in the array. It can be used to parse nested CloudWatch events into individual log entries.
+
+The `json` codec supports the following configuration options.
+
+Option | Required | Type | Description
+:--- | :--- |:--------| :---
+`key_name` | No | String | The name of the input field from which to extract the JSON array and create Data Prepper events.
+`include_keys` | No | List | The list of input fields to be extracted and added as additional fields in the Data Prepper event.
+`include_keys_metadata` | No | List | The list of input fields to be extracted and added to the Data Prepper event metadata object.
+
+### `newline` codec
+
+The `newline` codec parses each Kinesis stream record as a single log event, making it ideal for processing single-line records. It also works well with the [`parse_json` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/parse-json/) to parse each line.
+
+You can use the following options to configure the `newline` codec.
+
+Option | Required | Type | Description
+:--- | :--- |:--------| :---
+`skip_lines` | No | Integer | Sets the number of lines to skip before creating events. You can use this configuration to skip common header rows. Default is `0`.
+`header_destination` | No | String | Defines a key value to assign to the header line of the stream event. If this option is specified, then each event will contain a `header_destination` field.
+
+### polling
+
+When the `consumer_strategy` is set to `polling`, the `kinesis` source uses a polling-based approach to read records from the Kinesis data streams instead of the default `fan-out` approach. You can use the following options to configure the `polling` strategy.
+
+Option | Required | Type | Description
+:--- | :--- |:--------| :---
+`max_polling_records` | No | Integer | Sets the number of records to fetch from Kinesis during a single call.
+`idle_time_between_reads` | No | Duration | Defines the amount of idle time between calls.
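
To show how these options fit together, the following is a minimal sketch of a `kinesis` source that uses the `polling` consumer strategy. It only uses options from the tables above; the stream name, Region, role ARN, and polling values are placeholders rather than recommendations:

```yaml
version: "2"
kinesis-polling-pipeline:
  source:
    kinesis:
      streams:
        # Placeholder stream; add more entries (up to four) as needed.
        - stream_name: "stream1"
          initial_position: "EARLIEST"
      # Parse each stream record as a single log line.
      codec:
        newline:
      # Switch from the default fan-out strategy to polling.
      consumer_strategy: "polling"
      polling:
        max_polling_records: 1000
        idle_time_between_reads: "250ms"
      aws:
        region: "us-east-1"
        sts_role_arn: "arn:aws:iam::123456789012:role/my-iam-role"
```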
+ +### aws + +You can use the following options in the `aws` configuration. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`region` | No | String | Sets the AWS Region to use for credentials. Defaults to the [standard SDK behavior for determining the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). +`sts_role_arn` | No | String | Defines the AWS Security Token Service (AWS STS) role to assume for requests to Amazon Kinesis Data Streams and Amazon DynamoDB. Defaults to `null`, which uses the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). +`aws_sts_header_overrides` | No | Map | Defines a map of header overrides that the AWS Identity and Access Management (IAM) role assumes for the sink plugin. + +## Exposed metadata attributes + +The `kinesis` source adds the following metadata to each processed event. You can access the metadata attributes using the [expression syntax `getMetadata` function]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/get-metadata/). + +- `stream_name`: Contains the name of the Kinesis data stream from which the event was obtained. + +## Permissions + +The following minimum permissions are required in order to run `kinesis` as a source: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "kinesis:DescribeStream", + "kinesis:DescribeStreamConsumer", + "kinesis:DescribeStreamSummary", + "kinesis:GetRecords", + "kinesis:GetShardIterator", + "kinesis:ListShards", + "kinesis:ListStreams", + "kinesis:ListStreamConsumers", + "kinesis:RegisterStreamConsumer", + "kinesis:SubscribeToShard" + ], + "Resource": [ + "arn:aws:kinesis:us-east-1:{account-id}:stream/stream1", + "arn:aws:kinesis:us-east-1:{account-id}:stream/stream2" + ] + }, + { + "Sid": "allowCreateTable", + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:PutItem", + "dynamodb:DescribeTable", + "dynamodb:DeleteItem", + "dynamodb:GetItem", + "dynamodb:Scan", + "dynamodb:UpdateItem", + "dynamodb:Query" + ], + "Resource": [ + "arn:aws:dynamodb:us-east-1:{account-id}:table/kinesis-pipeline" + ] + } + ] +} +``` + +The `kinesis` source uses a DynamoDB table for ingestion coordination among multiple workers, so you need DynamoDB permissions. + +## Metrics + +The `kinesis` source includes the following metrics. + +### Counters + +* `recordsProcessed`: Counts the number of processed stream records. +* `recordProcessingErrors`: Counts the number of stream record processing errors. +* `acknowledgementSetSuccesses`: Counts the number of processed stream records that were successfully added to the sink. +* `acknowledgementSetFailures`: Counts the number of processed stream records that failed to be added to the sink. 
diff --git a/_data-prepper/pipelines/configuration/sources/opensearch.md b/_data-prepper/pipelines/configuration/sources/opensearch.md
index a7ba965729..1ee2237575 100644
--- a/_data-prepper/pipelines/configuration/sources/opensearch.md
+++ b/_data-prepper/pipelines/configuration/sources/opensearch.md
@@ -3,7 +3,7 @@ layout: default
title: opensearch
parent: Sources
grand_parent: Pipelines
-nav_order: 30
+nav_order: 50
---

# opensearch
diff --git a/_data-prepper/pipelines/configuration/sources/otel-logs-source.md b/_data-prepper/pipelines/configuration/sources/otel-logs-source.md
index 068369efaf..38095d7d7f 100644
--- a/_data-prepper/pipelines/configuration/sources/otel-logs-source.md
+++ b/_data-prepper/pipelines/configuration/sources/otel-logs-source.md
@@ -3,7 +3,7 @@ layout: default
title: otel_logs_source
parent: Sources
grand_parent: Pipelines
-nav_order: 25
+nav_order: 60
---

# otel_logs_source
diff --git a/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md b/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md
index bea74a96d3..0e8d377828 100644
--- a/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md
+++ b/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md
@@ -3,7 +3,7 @@ layout: default
title: otel_metrics_source
parent: Sources
grand_parent: Pipelines
-nav_order: 10
+nav_order: 70
---

# otel_metrics_source
diff --git a/_data-prepper/pipelines/configuration/sources/otel-trace-source.md b/_data-prepper/pipelines/configuration/sources/otel-trace-source.md
index 1be7864c33..de45a5de63 100644
--- a/_data-prepper/pipelines/configuration/sources/otel-trace-source.md
+++ b/_data-prepper/pipelines/configuration/sources/otel-trace-source.md
@@ -3,7 +3,7 @@ layout: default
title: otel_trace_source
parent: Sources
grand_parent: Pipelines
-nav_order: 15
+nav_order: 80
redirect_from:
 - /data-prepper/pipelines/configuration/sources/otel-trace/
---
diff --git a/_data-prepper/pipelines/configuration/sources/pipeline.md b/_data-prepper/pipelines/configuration/sources/pipeline.md
new file mode 100644
index 0000000000..6ba025bd18
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/sources/pipeline.md
@@ -0,0 +1,31 @@
+---
+layout: default
+title: pipeline
+parent: Sources
+grand_parent: Pipelines
+nav_order: 90
+---
+
+# pipeline
+
+Use the `pipeline` source to read from another pipeline.
+
+## Configuration options
+
+The `pipeline` source supports the following configuration options.
+
+| Option | Required | Type | Description |
+|:-------|:---------|:-------|:---------------------------------------|
+| `name` | Yes | String | The name of the pipeline to read from. |
+
+## Usage
+
+The following example configures a pipeline named `sample-pipeline` that uses the `pipeline` source to read from a pipeline named `movies`:
+
+```yaml
+sample-pipeline:
+  source:
+    - pipeline:
+        name: "movies"
+```
+{% include copy.html %}
diff --git a/_data-prepper/pipelines/configuration/sources/s3.md b/_data-prepper/pipelines/configuration/sources/s3.md
index 5a7d9986e5..7ca27ee500 100644
--- a/_data-prepper/pipelines/configuration/sources/s3.md
+++ b/_data-prepper/pipelines/configuration/sources/s3.md
@@ -3,7 +3,7 @@ layout: default
title: s3 source
parent: Sources
grand_parent: Pipelines
-nav_order: 20
+nav_order: 100
---

# s3 source
@@ -104,7 +104,7 @@ Option | Required | Type | Description
`s3_select` | No | [s3_select](#s3_select) | The Amazon S3 Select configuration.
`scan` | No | [scan](#scan) | The S3 scan configuration.
`delete_s3_objects_on_read` | No | Boolean | When `true`, the S3 scan attempts to delete S3 objects after all events from the S3 object are successfully acknowledged by all sinks. `acknowledgments` should be enabled when deleting S3 objects. Default is `false`. -`workers` | No | Integer | Configures the number of worker threads that the source uses to read data from S3. Leave this value as the default unless your S3 objects are less than 1 MB in size. Performance may decrease for larger S3 objects. This setting affects SQS-based sources and S3-Scan sources. Default is `1`. +`workers` | No | Integer | Configures the number of worker threads (1--10) that the source uses to read data from S3. Leave this value as the default unless your S3 objects are less than 1 MB in size. Performance may decrease for larger S3 objects. This setting affects SQS-based sources and S3-Scan sources. Default is `1`. diff --git a/_data-prepper/pipelines/configuration/sources/sources.md b/_data-prepper/pipelines/configuration/sources/sources.md index 811b161e16..682f215517 100644 --- a/_data-prepper/pipelines/configuration/sources/sources.md +++ b/_data-prepper/pipelines/configuration/sources/sources.md @@ -3,7 +3,7 @@ layout: default title: Sources parent: Pipelines has_children: true -nav_order: 20 +nav_order: 110 --- # Sources diff --git a/_data-prepper/pipelines/expression-syntax.md b/_data-prepper/pipelines/expression-syntax.md index 383b54c19b..07f68ee58e 100644 --- a/_data-prepper/pipelines/expression-syntax.md +++ b/_data-prepper/pipelines/expression-syntax.md @@ -30,6 +30,9 @@ The following table lists the supported operators. Operators are listed in order |----------------------|-------------------------------------------------------|---------------| | `()` | Priority expression | Left to right | | `not`<br> `+`<br> `-`| Unary logical NOT<br>Unary positive<br>Unary negative | Right to left | +| `*`, `/` | Multiplication and division operators | Left to right | +| `+`, `-` | Addition and subtraction operators | Left to right | +| `+` | String concatenation operator | Left to right | | `<`, `<=`, `>`, `>=` | Relational operators | Left to right | | `==`, `!=` | Equality operators | Left to right | | `and`, `or` | Conditional expression | Left to right | @@ -78,7 +81,6 @@ Conditional expressions allow you to combine multiple expressions or values usin <Any> or <Any> not <Any> ``` -{% include copy-curl.html %} The following are some example conditional expressions: @@ -91,9 +93,64 @@ not /status_code in {200, 202} ``` {% include copy-curl.html %} +### Arithmetic expressions + +Arithmetic expressions enable basic mathematical operations like addition, subtraction, multiplication, and division. These expressions can be combined with conditional expressions to create more complex conditional statements. The available arithmetic operators are +, -, *, and /. The syntax for using the arithmetic operators is as follows: + +``` +<Any> + <Any> +<Any> - <Any> +<Any> * <Any> +<Any> / <Any> +``` + +The following are example arithmetic expressions: + +``` +/value + length(/message) +/bytes / 1024 +/value1 - /value2 +/TimeInSeconds * 1000 +``` +{% include copy-curl.html %} + +The following are some example arithmetic expressions used in conditional expressions : + +``` +/value + length(/message) > 200 +/bytes / 1024 < 10 +/value1 - /value2 != /value3 + /value4 +``` +{% include copy-curl.html %} + +### String concatenation expressions + +String concatenation expressions enable you to combine strings to create new strings. 
These concatenated strings can also be used within conditional expressions. The syntax for using string concatenation is as follows: + +``` +<String Variable or String Literal> + <String Variable or String Literal> +``` + +The following are example string concatenation expressions: + +``` +/name + "suffix" +"prefix" + /name +"time of " + /timeInMs + " ms" +``` +{% include copy-curl.html %} + +The following are example string concatenation expressions that can be used in conditional expressions: + +``` +/service + ".com" == /url +"www." + /service != /url +``` +{% include copy-curl.html %} + ### Reserved symbols -Reserved symbols are symbols that are not currently used in the expression syntax but are reserved for possible future functionality or extensions. Reserved symbols include `^`, `*`, `/`, `%`, `+`, `-`, `xor`, `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `++`, `--`, and `${<text>}`. +Certain symbols, such as ^, %, xor, =, +=, -=, *=, /=, %=, ++, --, and ${<text>}, are reserved for future functionality or extensions. Reserved symbols include `^`, `%`, `xor`, `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `++`, `--`, and `${<text>}`. ## Syntax components @@ -170,6 +227,9 @@ White space is optional around relational operators, regex equality operators, e | `()` | Priority expression | Yes | `/a==(/b==200)`<br>`/a in ({200})` | `/status in({200})` | | `in`, `not in` | Set operators | Yes | `/a in {200}`<br>`/a not in {400}` | `/a in{200, 202}`<br>`/a not in{400}` | | `<`, `<=`, `>`, `>=` | Relational operators | No | `/status < 300`<br>`/status>=300` | | +| `+` | String concatenation operator | No | `/status_code + /message + "suffix"` +| `+`, `-` | Arithmetic addition and subtraction operators | No | `/status_code + length(/message) - 2` +| `*`, `/` | Multiplication and division operators | No | `/status_code * length(/message) / 3` | `=~`, `!~` | Regex equality operators | No | `/msg =~ "^\w*$"`<br>`/msg=~"^\w*$"` | | | `==`, `!=` | Equality operators | No | `/status == 200`<br>`/status_code==200` | | | `and`, `or`, `not` | Conditional operators | Yes | `/a<300 and /b>200` | `/b<300and/b>200` | diff --git a/_data/top_nav.yml b/_data/top_nav.yml index 51d8138680..6552d90359 100644 --- a/_data/top_nav.yml +++ b/_data/top_nav.yml @@ -63,6 +63,8 @@ items: url: /docs/latest/clients/ - label: Benchmark url: /docs/latest/benchmark/ + - label: Migration Assistant + url: /docs/latest/migration-assistant/ - label: Platform url: /platform/index.html children: diff --git a/_data/versions.json b/_data/versions.json index 4f7e55c21b..6fe1c59e6b 100644 --- a/_data/versions.json +++ b/_data/versions.json @@ -1,10 +1,12 @@ { - "current": "2.16", + "current": "2.18", "all": [ - "2.16", + "2.18", "1.3" ], "archived": [ + "2.17", + "2.16", "2.15", "2.14", "2.13", @@ -25,7 +27,7 @@ "1.1", "1.0" ], - "latest": "2.16" + "latest": "2.18" } diff --git a/_field-types/mapping-parameters/analyzer.md b/_field-types/mapping-parameters/analyzer.md new file mode 100644 index 0000000000..32b26da1e0 --- /dev/null +++ b/_field-types/mapping-parameters/analyzer.md @@ -0,0 +1,90 @@ +--- +layout: default +title: Analyzer +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 5 +has_children: false +has_toc: false +--- + +# Analyzer + +The `analyzer` mapping parameter is used to define the text analysis process that applies to a text field during both index and search operations. + +The key functions of the `analyzer` mapping parameter are: + +1. 
**Tokenization:** The analyzer determines how the text is broken down into individual tokens (words, numbers) that can be indexed and searched. Each generated token must not exceed 32,766 bytes in order to avoid indexing failures. + +2. **Normalization:** The analyzer can apply various normalization techniques, such as converting text to lowercase, removing stop words, and stemming/lemmatizing words. + +3. **Consistency:** By defining the same analyzer for both index and search operations, you ensure that the text analysis process is consistent, which helps improve the relevance of search results. + +4. **Customization:** OpenSearch allows you to define custom analyzers by specifying the tokenizer, character filters, and token filters to be used. This gives you fine-grained control over the text analysis process. + +For information about specific analyzer parameters, such as `analyzer`, `search_analyzer`, or `search_quote_analyzer`, see [Search analyzers]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/). +{: .note} + +------------ + +## Example + +The following example configuration defines a custom analyzer called `my_custom_analyzer`: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_stop_filter", + "my_stemmer" + ] + } + }, + "filter": { + "my_stop_filter": { + "type": "stop", + "stopwords": ["the", "a", "and", "or"] + }, + "my_stemmer": { + "type": "stemmer", + "language": "english" + } + } + } + }, + "mappings": { + "properties": { + "my_text_field": { + "type": "text", + "analyzer": "my_custom_analyzer", + "search_analyzer": "standard", + "search_quote_analyzer": "my_custom_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +In this example, the `my_custom_analyzer` uses the standard tokenizer, converts all tokens to lowercase, applies a custom stop word filter, and applies an English stemmer. + +You can then map a text field so that it uses this custom analyzer for both index and search operations: + +```json +"mappings": { + "properties": { + "my_text_field": { + "type": "text", + "analyzer": "my_custom_analyzer" + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/mapping-parameters/boost.md b/_field-types/mapping-parameters/boost.md new file mode 100644 index 0000000000..f1648a861d --- /dev/null +++ b/_field-types/mapping-parameters/boost.md @@ -0,0 +1,50 @@ +--- +layout: default +title: Boost +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 10 +has_children: false +has_toc: false +--- + +# Boost + +The `boost` mapping parameter is used to increase or decrease the relevance score of a field during search queries. It allows you to apply more or less weight to specific fields when calculating the overall relevance score of a document. + +The `boost` parameter is applied as a multiplier to the score of a field. For example, if a field has a `boost` value of `2`, then the score contribution of that field is doubled. Conversely, a `boost` value of `0.5` would halve the score contribution of that field. 
+ +----------- + +## Example + +The following is an example of how you can use the `boost` parameter in an OpenSearch mapping: + +```json +PUT my-index1 +{ + "mappings": { + "properties": { + "title": { + "type": "text", + "boost": 2 + }, + "description": { + "type": "text", + "boost": 1 + }, + "tags": { + "type": "keyword", + "boost": 1.5 + } + } + } +} +``` +{% include copy-curl.html %} + +In this example, the `title` field has a boost of `2`, which means that it contributes twice as much to the overall relevance score than the description field (which has a boost of `1`). The `tags` field has a boost of `1.5`, so it contributes one and a half times more than the description field. + +The `boost` parameter is particularly useful when you want to apply more weight to certain fields. For example, you might want to boost the `title` field more than the `description` field because the title may be a better indicator of the document's relevance. + +The `boost` parameter is a multiplicative factor---not an additive one. This means that a field with a higher boost value will have a disproportionately large effect on the overall relevance score as compared to fields with lower boost values. When using the `boost` parameter, it is recommended that you start with small values (1.5 or 2) and test the effect on your search results. Overly high boost values can skew the relevance scores and lead to unexpected or undesirable search results. diff --git a/_field-types/mapping-parameters/coerce.md b/_field-types/mapping-parameters/coerce.md new file mode 100644 index 0000000000..3cf844897a --- /dev/null +++ b/_field-types/mapping-parameters/coerce.md @@ -0,0 +1,100 @@ +--- +layout: default +title: Coerce +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 15 +has_children: false +has_toc: false +--- + +# Coerce + +The `coerce` mapping parameter controls how values are converted to the expected field data type during indexing. This parameter lets you verify that your data is formatted and indexed properly, following the expected field types. This improves the accuracy of your search results. + +--- + +## Examples + +The following examples demonstrate how to use the `coerce` mapping parameter. + +#### Indexing a document with `coerce` enabled + +```json +PUT products +{ + "mappings": { + "properties": { + "price": { + "type": "integer", + "coerce": true + } + } + } +} + +PUT products/_doc/1 +{ + "name": "Product A", + "price": "19.99" +} +``` +{% include copy-curl.html %} + +In this example, the `price` field is defined as an `integer` type with `coerce` set to `true`. When indexing the document, the string value `19.99` is coerced to the integer `19`. + +#### Indexing a document with `coerce` disabled + +```json +PUT orders +{ + "mappings": { + "properties": { + "quantity": { + "type": "integer", + "coerce": false + } + } + } +} + +PUT orders/_doc/1 +{ + "item": "Widget", + "quantity": "10" +} +``` +{% include copy-curl.html %} + +In this example, the `quantity` field is defined as an `integer` type with `coerce` set to `false`. When indexing the document, the string value `10` is not coerced, and the document is rejected because of the type mismatch. 
+ +#### Setting the index-level coercion setting + +```json +PUT inventory +{ + "settings": { + "index.mapping.coerce": false + }, + "mappings": { + "properties": { + "stock_count": { + "type": "integer", + "coerce": true + }, + "sku": { + "type": "keyword" + } + } + } +} + +PUT inventory/_doc/1 +{ + "sku": "ABC123", + "stock_count": "50" +} +``` +{% include copy-curl.html %} + +In this example, the index-level `index.mapping.coerce` setting is set to `false`, which disables coercion for the index. However, the `stock_count` field overrides this setting and enables coercion for this specific field. diff --git a/_field-types/mapping-parameters/copy-to.md b/_field-types/mapping-parameters/copy-to.md new file mode 100644 index 0000000000..b029f814b5 --- /dev/null +++ b/_field-types/mapping-parameters/copy-to.md @@ -0,0 +1,109 @@ +--- +layout: default +title: Copy_to +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 20 +has_children: false +has_toc: false +--- + +# Copy_to + +The `copy_to` parameter allows you to copy the values of multiple fields into a single field. This parameter can be useful if you often search across multiple fields because it allows you to search the group field instead. + +Only the field value is copied and not the terms resulting from the analysis process. The original `_source` field remains unmodified, and the same value can be copied to multiple fields using the `copy_to` parameter. However, recursive copying through intermediary fields is not supported; instead, use `copy_to` directly from the originating field to multiple target fields. + +--- + +## Examples + +The following example uses the `copy_to` parameter to search for products by their name and description and copy those values into a single field: + +```json +PUT my-products-index +{ + "mappings": { + "properties": { + "name": { + "type": "text", + "copy_to": "product_info" + }, + "description": { + "type": "text", + "copy_to": "product_info" + }, + "product_info": { + "type": "text" + }, + "price": { + "type": "float" + } + } + } +} + +PUT my-products-index/_doc/1 +{ + "name": "Wireless Headphones", + "description": "High-quality wireless headphones with noise cancellation", + "price": 99.99 +} + +PUT my-products-index/_doc/2 +{ + "name": "Bluetooth Speaker", + "description": "Portable Bluetooth speaker with long battery life", + "price": 49.99 +} +``` +{% include copy-curl.html %} + +In this example, the values from the `name` and `description` fields are copied into the `product_info` field. 
You can now search for products by querying the `product_info` field, as follows: + +```json +GET my-products-index/_search +{ + "query": { + "match": { + "product_info": "wireless headphones" + } + } +} +``` +{% include copy-curl.html %} + +## Response + +```json +{ + "took": 20, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.9061546, + "hits": [ + { + "_index": "my-products-index", + "_id": "1", + "_score": 1.9061546, + "_source": { + "name": "Wireless Headphones", + "description": "High-quality wireless headphones with noise cancellation", + "price": 99.99 + } + } + ] + } +} +``` + diff --git a/_field-types/mapping-parameters/doc-values.md b/_field-types/mapping-parameters/doc-values.md new file mode 100644 index 0000000000..9ea9364c0b --- /dev/null +++ b/_field-types/mapping-parameters/doc-values.md @@ -0,0 +1,46 @@ +--- +layout: default +title: doc_values +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 25 +has_children: false +has_toc: false +--- + +# doc_values + +By default, OpenSearch indexes most fields for search purposes. The `doc_values ` parameter enables document-to-term lookups for operations such as sorting, aggregations, and scripting. + +The `doc_values` parameter accepts the following options. + +Option | Description +:--- | :--- +`true` | Enables `doc_values` for the field. Default is `true`. +`false` | Disables `doc_values` for the field. + +The `doc_values` parameter is not supported for use in text fields. + +--- + +## Example: Creating an index with `doc_values` enabled and disabled + +The following example request creates an index with `doc_values` enabled for one field and disabled for another: + +```json +PUT my-index-001 +{ + "mappings": { + "properties": { + "status_code": { + "type": "keyword" + }, + "session_id": { + "type": "keyword", + "doc_values": false + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/dynamic.md b/_field-types/mapping-parameters/dynamic.md similarity index 96% rename from _field-types/dynamic.md rename to _field-types/mapping-parameters/dynamic.md index 59f59bfe3d..2d48e98082 100644 --- a/_field-types/dynamic.md +++ b/_field-types/mapping-parameters/dynamic.md @@ -1,18 +1,22 @@ --- layout: default -title: Dynamic parameter -nav_order: 10 +title: Dynamic +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 30 +has_children: false +has_toc: false redirect_from: - /opensearch/dynamic/ --- -# Dynamic parameter +# Dynamic The `dynamic` parameter specifies whether newly detected fields can be added dynamically to a mapping. It accepts the parameters listed in the following table. Parameter | Description :--- | :--- -`true` | Specfies that new fields can be added dynamically to the mapping. Default is `true`. +`true` | Specifies that new fields can be added dynamically to the mapping. Default is `true`. `false` | Specifies that new fields cannot be added dynamically to the mapping. If a new field is detected, then it is not indexed or searchable but can be retrieved from the `_source` field. `strict` | Throws an exception. The indexing operation fails when new fields are detected. `strict_allow_templates` | Adds new fields if they match predefined dynamic templates in the mapping. 
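
To make the table above concrete, the following is a minimal, hypothetical sketch that creates an index with `dynamic` set to `strict`; the index and field names are illustrative only:

```json
PUT my-strict-index
{
  "mappings": {
    "dynamic": "strict",
    "properties": {
      "title": {
        "type": "text"
      }
    }
  }
}
```
{% include copy-curl.html %}

With this mapping, indexing a document that contains any field other than `title` fails with a mapping exception, whereas setting `dynamic` to `false` would store the extra field in `_source` without indexing it.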
diff --git a/_field-types/mapping-parameters/enabled.md b/_field-types/mapping-parameters/enabled.md new file mode 100644 index 0000000000..2695ff7529 --- /dev/null +++ b/_field-types/mapping-parameters/enabled.md @@ -0,0 +1,47 @@ +--- +layout: default +title: enabled +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 40 +has_children: false +has_toc: false +--- + +# Enabled + +The `enabled` parameter allows you to control whether OpenSearch parses the contents of a field. This parameter can be applied to the top-level mapping definition and to object fields. + +The `enabled` parameter accepts the following values. + +Parameter | Description +:--- | :--- +`true` | The field is parsed and indexed. Default is `true`. +`false` | The field is not parsed or indexed but is still retrievable from the `_source` field. When `enabled` is set to `false`, OpenSearch stores the field's value in the `_source` field but does not index or parse its contents. This can be useful for fields that you want to store but do not need to search, sort, or aggregate on. + +--- + +## Example: Using the `enabled` parameter + +In the following example request, the `session_data` field is disabled. OpenSearch stores its contents in the `_source` field but does not index or parse them: + +```json +PUT my-index-002 +{ + "mappings": { + "properties": { + "user_id": { + "type": "keyword" + }, + "last_updated": { + "type": "date" + }, + "session_data": { + "type": "object", + "enabled": false + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/mapping-parameters/index.md b/_field-types/mapping-parameters/index.md new file mode 100644 index 0000000000..ca5586bb8f --- /dev/null +++ b/_field-types/mapping-parameters/index.md @@ -0,0 +1,28 @@ +--- +layout: default +title: Mapping parameters +nav_order: 75 +has_children: true +has_toc: false +--- + +# Mapping parameters + +Mapping parameters are used to configure the behavior of index fields. For parameter use cases, see a mapping parameter's respective page. + +The following table lists OpenSearch mapping parameters. + +Parameter | Description +:--- | :--- +`analyzer` | Specifies the analyzer used to analyze string fields. Default is the `standard` analyzer, which is a general-purpose analyzer that splits text on white space and punctuation, converts to lowercase, and removes stop words. Allowed values are `standard`, `simple`, and `whitespace`. +`boost` | Specifies a field-level boost factor applied at query time. Allows you to increase or decrease the relevance score of a specific field during search queries. Default boost value is `1.0`, which means no boost is applied. Allowed values are any positive floating-point number. +`coerce` | Controls how values are converted to the expected field data type during indexing. Default value is `true`, which means that OpenSearch tries to coerce the value to the expected value type. Allowed values are `true` or `false`. +`copy_to` | Copies the value of a field to another field. There is no default value for this parameter. Optional. +`doc_values` | Specifies whether a field should be stored on disk to make sorting and aggregation faster. Default value is `true`, which means that the doc values are enabled. Allowed values are a single field name or a list of field names. +`dynamic` | Determines whether new fields should be added dynamically. Default value is `true`, which means that new fields can be added dynamically. Allowed values are `true`, `false`, or `strict`. 
+`enabled` | Specifies whether the field is enabled or disabled. Default value is `true`, which means that the field is enabled. Allowed values are `true` or `false`. +`format` | Specifies the date format for date fields. There is no default value for this parameter. Allowed values are any valid date format string, such as `yyyy-MM-dd` or `epoch_millis`. +`ignore_above` | Skips indexing values that exceed the specified length. Default value is `2147483647`, which means that there is no limit on the field value length. Allowed values are any positive integer. +`ignore_malformed` | Specifies whether malformed values should be ignored. Default value is `false`, which means that malformed values are not ignored. Allowed values are `true` or `false`. +`index` | Specifies whether a field should be indexed. Default value is `true`, which means that the field is indexed. Allowed values are `true`, `false`, or `not_analyzed`. +`index_options` | Specifies what information should be stored in an index for scoring purposes. Default value is `docs`, which means that only the document numbers are stored in the index. Allowed values are `docs`, `freqs`, `positions`, or `offsets`. \ No newline at end of file diff --git a/_field-types/supported-field-types/alias.md b/_field-types/supported-field-types/alias.md index 29cc58885c..f1f6ae9ac8 100644 --- a/_field-types/supported-field-types/alias.md +++ b/_field-types/supported-field-types/alias.md @@ -10,6 +10,8 @@ redirect_from: --- # Alias field type +**Introduced 1.0** +{: .label .label-purple } An alias field type creates another name for an existing field. You can use aliases in the[search](#using-aliases-in-search-api-operations) and [field capabilities](#using-aliases-in-field-capabilities-api-operations) API operations, with some [exceptions](#exceptions). To set up an [alias](#alias-field), you need to specify the [original field](#original-field) name in the `path` parameter. diff --git a/_field-types/supported-field-types/binary.md b/_field-types/supported-field-types/binary.md index 99d468c1dc..bb257bf7ec 100644 --- a/_field-types/supported-field-types/binary.md +++ b/_field-types/supported-field-types/binary.md @@ -10,6 +10,8 @@ redirect_from: --- # Binary field type +**Introduced 1.0** +{: .label .label-purple } A binary field type contains a binary value in [Base64](https://en.wikipedia.org/wiki/Base64) encoding that is not searchable. diff --git a/_field-types/supported-field-types/boolean.md b/_field-types/supported-field-types/boolean.md index 8233a45ad5..82cfdecf47 100644 --- a/_field-types/supported-field-types/boolean.md +++ b/_field-types/supported-field-types/boolean.md @@ -10,6 +10,8 @@ redirect_from: --- # Boolean field type +**Introduced 1.0** +{: .label .label-purple } A Boolean field type takes `true` or `false` values, or `"true"` or `"false"` strings. You can also pass an empty string (`""`) in place of a `false` value. diff --git a/_field-types/supported-field-types/completion.md b/_field-types/supported-field-types/completion.md index 85c803baa1..e6e392fb6d 100644 --- a/_field-types/supported-field-types/completion.md +++ b/_field-types/supported-field-types/completion.md @@ -11,6 +11,8 @@ redirect_from: --- # Completion field type +**Introduced 1.0** +{: .label .label-purple } A completion field type provides autocomplete functionality through a completion suggester. The completion suggester is a prefix suggester, so it matches the beginning of text only. 
A completion suggester creates an in-memory data structure, which provides faster lookups but leads to increased memory usage. You need to upload a list of all possible completions into the index before using this feature. diff --git a/_field-types/supported-field-types/constant-keyword.md b/_field-types/supported-field-types/constant-keyword.md index bf1e4afc70..4f9261f1a1 100644 --- a/_field-types/supported-field-types/constant-keyword.md +++ b/_field-types/supported-field-types/constant-keyword.md @@ -8,6 +8,8 @@ grand_parent: Supported field types --- # Constant keyword field type +**Introduced 2.14** +{: .label .label-purple } A constant keyword field uses the same value for all documents in the index. diff --git a/_field-types/supported-field-types/date-nanos.md b/_field-types/supported-field-types/date-nanos.md index 12399a69d4..eb569265fc 100644 --- a/_field-types/supported-field-types/date-nanos.md +++ b/_field-types/supported-field-types/date-nanos.md @@ -8,6 +8,8 @@ grand_parent: Supported field types --- # Date nanoseconds field type +**Introduced 1.0** +{: .label .label-purple } The `date_nanos` field type is similar to the [`date`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/) field type in that it holds a date. However, `date` stores the date in millisecond resolution, while `date_nanos` stores the date in nanosecond resolution. Dates are stored as `long` values that correspond to nanoseconds since the epoch. Therefore, the range of supported dates is approximately 1970--2262. diff --git a/_field-types/supported-field-types/date.md b/_field-types/supported-field-types/date.md index fb008d1512..8ca986219b 100644 --- a/_field-types/supported-field-types/date.md +++ b/_field-types/supported-field-types/date.md @@ -11,6 +11,8 @@ redirect_from: --- # Date field type +**Introduced 1.0** +{: .label .label-purple } A date in OpenSearch can be represented as one of the following: diff --git a/_field-types/supported-field-types/derived.md b/_field-types/supported-field-types/derived.md index d989c3e4a4..cb358f47f9 100644 --- a/_field-types/supported-field-types/derived.md +++ b/_field-types/supported-field-types/derived.md @@ -28,11 +28,11 @@ Despite the potential performance impact of query-time computations, the flexibi Currently, derived fields have the following limitations: -- **Aggregation, scoring, and sorting**: Not yet supported. +- **Scoring and sorting**: Not yet supported. +- **Aggregations**: Starting with OpenSearch 2.17, derived fields support most aggregation types. The following aggregations are not supported: geographic (geodistance, geohash grid, geohex grid, geotile grid, geobounds, geocentroid), significant terms, significant text, and scripted metric. - **Dashboard support**: These fields are not displayed in the list of available fields in OpenSearch Dashboards. However, you can still use them for filtering if you know the derived field name. - **Chained derived fields**: One derived field cannot be used to define another derived field. - **Join field type**: Derived fields are not supported for the [join field type]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/join/). -- **Concurrent segment search**: Derived fields are not supported for [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/). We are planning to address these limitations in future versions. 
@@ -541,6 +541,80 @@ The response specifies highlighting in the `url` field: ``` </details> +## Aggregations + +Starting with OpenSearch 2.17, derived fields support most aggregation types. + +Geographic, significant terms, significant text, and scripted metric aggregations are not supported. +{: .note} + +For example, the following request creates a simple `terms` aggregation on the `method` derived field: + +```json +POST /logs/_search +{ + "size": 0, + "aggs": { + "methods": { + "terms": { + "field": "method" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the following buckets: + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +{ + "took" : 12, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 5, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "methods" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "GET", + "doc_count" : 2 + }, + { + "key" : "POST", + "doc_count" : 2 + }, + { + "key" : "DELETE", + "doc_count" : 1 + } + ] + } + } +} +``` +</details> + ## Performance Derived fields are not indexed but are computed dynamically by retrieving values from the `_source` field or doc values. Thus, they run more slowly. To improve performance, try the following: diff --git a/_field-types/supported-field-types/flat-object.md b/_field-types/supported-field-types/flat-object.md index 933c385ac5..65d7c6dc8e 100644 --- a/_field-types/supported-field-types/flat-object.md +++ b/_field-types/supported-field-types/flat-object.md @@ -10,6 +10,8 @@ redirect_from: --- # Flat object field type +**Introduced 2.7** +{: .label .label-purple } In OpenSearch, you don't have to specify a mapping before indexing documents. If you don't specify a mapping, OpenSearch uses [dynamic mapping]({{site.url}}{{site.baseurl}}/field-types/index#dynamic-mapping) to map every field and its subfields in the document automatically. When you ingest documents such as logs, you may not know every field's subfield name and type in advance. In this case, dynamically mapping all new subfields can quickly lead to a "mapping explosion," where the growing number of fields may degrade the performance of your cluster. @@ -54,7 +56,8 @@ The flat object field type supports the following queries: - [Multi-match]({{site.url}}{{site.baseurl}}/query-dsl/full-text/multi-match/) - [Query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) - [Simple query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/simple-query-string/) -- [Exists]({{site.url}}{{site.baseurl}}/query-dsl/term/exists/) +- [Exists]({{site.url}}{{site.baseurl}}/query-dsl/term/exists/) +- [Wildcard]({{site.url}}{{site.baseurl}}/query-dsl/term/wildcard/) ## Limitations @@ -241,4 +244,4 @@ PUT /test-index/ ``` {% include copy-curl.html %} -Because `issue.number` is not part of the flat object, you can use it to aggregate and sort documents. \ No newline at end of file +Because `issue.number` is not part of the flat object, you can use it to aggregate and sort documents. 
diff --git a/_field-types/supported-field-types/geo-point.md b/_field-types/supported-field-types/geo-point.md index 0912dc618d..96586d044f 100644 --- a/_field-types/supported-field-types/geo-point.md +++ b/_field-types/supported-field-types/geo-point.md @@ -11,6 +11,8 @@ redirect_from: --- # Geopoint field type +**Introduced 1.0** +{: .label .label-purple } A geopoint field type contains a geographic point specified by latitude and longitude. diff --git a/_field-types/supported-field-types/geo-shape.md b/_field-types/supported-field-types/geo-shape.md index b7b06a0d04..ee98bfca03 100644 --- a/_field-types/supported-field-types/geo-shape.md +++ b/_field-types/supported-field-types/geo-shape.md @@ -11,6 +11,8 @@ redirect_from: --- # Geoshape field type +**Introduced 1.0** +{: .label .label-purple } A geoshape field type contains a geographic shape, such as a polygon or a collection of geographic points. To index a geoshape, OpenSearch tesselates the shape into a triangular mesh and stores each triangle in a BKD tree. This provides a 10<sup>-7</sup>decimal degree of precision, which represents near-perfect spatial resolution. Performance of this process is mostly impacted by the number of vertices in a polygon you are indexing. diff --git a/_field-types/supported-field-types/index.md b/_field-types/supported-field-types/index.md index 7c7b7375f9..5540e73f4e 100644 --- a/_field-types/supported-field-types/index.md +++ b/_field-types/supported-field-types/index.md @@ -22,7 +22,7 @@ Boolean | [`boolean`]({{site.url}}{{site.baseurl}}/field-types/supported-field-t [Date]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/dates/)| [`date`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/): A date stored in milliseconds. <br> [`date_nanos`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date-nanos/): A date stored in nanoseconds. IP | [`ip`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/ip/): An IP address in IPv4 or IPv6 format. [Range]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/range/) | A range of values (`integer_range`, `long_range`, `double_range`, `float_range`, `date_range`, `ip_range`). -[Object]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object-fields/)| [`object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object/): A JSON object. <br>[`nested`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/): Used when objects in an array need to be indexed independently as separate documents.<br>[`flat_object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/flat-object/): A JSON object treated as a string.<br>[`join`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/): Establishes a parent-child relationship between documents in the same index. +[Object]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object-fields/)| [`object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object/): A JSON object. <br>[`nested`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/): Used when objects in an array need to be indexed independently as separate documents.<br>[`flat_object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/flat-object/): A JSON object treated as a string.<br>[`join`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/): Establishes a parent/child relationship between documents in the same index. 
[String]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/string/)|[`keyword`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/keyword/): Contains a string that is not analyzed.<br> [`text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/): Contains a string that is analyzed.<br> [`match_only_text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/match-only-text/): A space-optimized version of a `text` field.<br>[`token_count`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/token-count/): Stores the number of analyzed tokens in a string. <br>[`wildcard`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/wildcard/): A variation of `keyword` with efficient substring and regular expression matching. [Autocomplete]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/autocomplete/) |[`completion`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/completion/): Provides autocomplete functionality through a completion suggester.<br> [`search_as_you_type`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/search-as-you-type/): Provides search-as-you-type functionality using both prefix and infix completion. [Geographic]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geographic/)| [`geo_point`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-point/): A geographic point.<br>[`geo_shape`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-shape/): A geographic shape. @@ -30,6 +30,7 @@ IP | [`ip`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/ip/): k-NN vector | [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/): Allows indexing a k-NN vector into OpenSearch and performing different kinds of k-NN search. Percolator | [`percolator`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/percolator/): Specifies to treat this field as a query. Derived | [`derived`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/derived/): Creates new fields dynamically by executing scripts on existing fields. +Star-tree | [`star_tree`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/star-tree/): Precomputes aggregations and stores them in a [star-tree index](https://docs.pinot.apache.org/basics/indexing/star-tree-index), accelerating the performance of aggregation queries. ## Arrays diff --git a/_field-types/supported-field-types/ip.md b/_field-types/supported-field-types/ip.md index cb2a5569c8..99b41e45cd 100644 --- a/_field-types/supported-field-types/ip.md +++ b/_field-types/supported-field-types/ip.md @@ -10,6 +10,8 @@ redirect_from: --- # IP address field type +**Introduced 1.0** +{: .label .label-purple } An ip field type contains an IP address in IPv4 or IPv6 format. diff --git a/_field-types/supported-field-types/join.md b/_field-types/supported-field-types/join.md index c83705f4c3..fd808a65e7 100644 --- a/_field-types/supported-field-types/join.md +++ b/_field-types/supported-field-types/join.md @@ -11,12 +11,14 @@ redirect_from: --- # Join field type +**Introduced 1.0** +{: .label .label-purple } A join field type establishes a parent/child relationship between documents in the same index. 
## Example -Create a mapping to establish a parent-child relationship between products and their brands: +Create a mapping to establish a parent/child relationship between products and their brands: ```json PUT testindex1 @@ -59,7 +61,7 @@ PUT testindex1/_doc/1 ``` {% include copy-curl.html %} -When indexing child documents, you have to specify the `routing` query parameter because parent and child documents in the same relation have to be indexed on the same shard. Each child document refers to its parent's ID in the `parent` field. +When indexing child documents, you need to specify the `routing` query parameter because parent and child documents in the same parent/child hierarchy must be indexed on the same shard. For more information, see [Routing]({{site.url}}{{site.baseurl}}/field-types/metadata-fields/routing/). Each child document refers to its parent's ID in the `parent` field. Index two child documents, one for each parent: @@ -325,3 +327,8 @@ PUT testindex1 - Multiple parents are not supported. - You can add a child document to an existing document only if the existing document is already marked as a parent. - You can add a new relation to an existing join field. + +## Next steps + +- Learn about [joining queries]({{site.url}}{{site.baseurl}}/query-dsl/joining/) on join fields. +- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). \ No newline at end of file diff --git a/_field-types/supported-field-types/keyword.md b/_field-types/supported-field-types/keyword.md index eea6cc664b..ca9c8085f6 100644 --- a/_field-types/supported-field-types/keyword.md +++ b/_field-types/supported-field-types/keyword.md @@ -11,6 +11,8 @@ redirect_from: --- # Keyword field type +**Introduced 1.0** +{: .label .label-purple } A keyword field type contains a string that is not analyzed. It allows only exact, case-sensitive matches. diff --git a/_field-types/supported-field-types/knn-vector.md b/_field-types/supported-field-types/knn-vector.md index a2a7137733..da784aeefe 100644 --- a/_field-types/supported-field-types/knn-vector.md +++ b/_field-types/supported-field-types/knn-vector.md @@ -8,6 +8,8 @@ has_math: true --- # k-NN vector field type +**Introduced 1.0** +{: .label .label-purple } The [k-NN plugin]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/) introduces a custom data type, the `knn_vector`, that allows users to ingest their k-NN vectors into an OpenSearch index and perform different kinds of k-NN search. The `knn_vector` field is highly configurable and can serve many different k-NN workloads. In general, a `knn_vector` field can be built either by providing a method definition or specifying a model id. @@ -20,8 +22,7 @@ PUT test-index { "settings": { "index": { - "knn": true, - "knn.algo_param.ef_search": 100 + "knn": true } }, "mappings": { @@ -29,14 +30,10 @@ PUT test-index "my_vector": { "type": "knn_vector", "dimension": 3, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", - "engine": "lucene", - "parameters": { - "ef_construction": 128, - "m": 24 - } + "engine": "faiss" } } } @@ -45,6 +42,92 @@ PUT test-index ``` {% include copy-curl.html %} +## Vector workload modes + +Vector search involves trade-offs between low-latency and low-cost search. Specify the `mode` mapping parameter of the `knn_vector` type to indicate which search mode you want to prioritize. The `mode` dictates the default values for k-NN parameters. 
You can further fine-tune your index by overriding the default parameter values in the k-NN field mapping.
+
+The following modes are currently supported.
+
+| Mode | Default engine | Description |
+|:---|:---|:---|
+| `in_memory` (Default) | `nmslib` | Prioritizes low-latency search. This mode uses the `nmslib` engine without any quantization applied. It is configured with the default parameter values for vector search in OpenSearch. |
+| `on_disk` | `faiss` | Prioritizes low-cost vector search while maintaining strong recall. By default, the `on_disk` mode uses quantization and rescoring to execute a two-pass approach to retrieve the top neighbors. The `on_disk` mode supports only `float` vector types. |
+
+To create a k-NN index that uses the `on_disk` mode for low-cost search, send the following request:
+
+```json
+PUT test-index
+{
+  "settings": {
+    "index": {
+      "knn": true
+    }
+  },
+  "mappings": {
+    "properties": {
+      "my_vector": {
+        "type": "knn_vector",
+        "dimension": 3,
+        "space_type": "l2",
+        "mode": "on_disk"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Compression levels
+
+The `compression_level` mapping parameter selects a quantization encoder that reduces vector memory consumption by the given factor. The following table lists the available `compression_level` values.
+
+| Compression level | Supported engines |
+|:------------------|:-------------------------------|
+| `1x` | `faiss`, `lucene`, and `nmslib` |
+| `2x` | `faiss` |
+| `4x` | `lucene` |
+| `8x` | `faiss` |
+| `16x` | `faiss` |
+| `32x` | `faiss` |
+
+For example, if a `compression_level` of `32x` is passed for a `float32` index of 768-dimensional vectors, the per-vector memory is reduced from `4 * 768 = 3072` bytes to `3072 / 32 = 96` bytes. Internally, binary quantization (which maps a `float` to a `bit`) may be used to achieve this compression.
+
+If you set the `compression_level` parameter, then you cannot specify an `encoder` in the `method` mapping. Compression levels greater than `1x` are only supported for `float` vector types.
+{: .note}
+
+The following table lists the default `compression_level` values for the available workload modes.
+
+| Mode | Default compression level |
+|:------------------|:-------------------------------|
+| `in_memory` | `1x` |
+| `on_disk` | `32x` |
+
+
+To create a vector field with a `compression_level` of `16x`, specify the `compression_level` parameter in the mappings. This parameter overrides the default compression level for the `on_disk` mode from `32x` to `16x`, producing higher recall and accuracy at the expense of a larger memory footprint:
+
+```json
+PUT test-index
+{
+  "settings": {
+    "index": {
+      "knn": true
+    }
+  },
+  "mappings": {
+    "properties": {
+      "my_vector": {
+        "type": "knn_vector",
+        "dimension": 3,
+        "space_type": "l2",
+        "mode": "on_disk",
+        "compression_level": "16x"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
 ## Method definitions
 
 [Method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions) are used when the underlying [approximate k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/) algorithm does not require training. For example, the following `knn_vector` field specifies that *nmslib*'s implementation of *hnsw* should be used for approximate k-NN search. During indexing, *nmslib* will build the corresponding *hnsw* segment files.
@@ -53,13 +136,13 @@ PUT test-index "my_vector": { "type": "knn_vector", "dimension": 4, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "nmslib", "parameters": { - "ef_construction": 128, - "m": 24 + "ef_construction": 100, + "m": 16 } } } @@ -71,6 +154,7 @@ Model IDs are used when the underlying Approximate k-NN algorithm requires a tra model contains the information needed to initialize the native library segment files. ```json +"my_vector": { "type": "knn_vector", "model_id": "my-model" } @@ -78,28 +162,36 @@ model contains the information needed to initialize the native library segment f However, if you intend to use Painless scripting or a k-NN score script, you only need to pass the dimension. ```json +"my_vector": { "type": "knn_vector", "dimension": 128 } ``` -## Lucene byte vector +## Byte vectors -By default, k-NN vectors are `float` vectors, where each dimension is 4 bytes. If you want to save storage space, you can use `byte` vectors with the `lucene` engine. In a `byte` vector, each dimension is a signed 8-bit integer in the [-128, 127] range. +By default, k-NN vectors are `float` vectors, in which each dimension is 4 bytes. If you want to save storage space, you can use `byte` vectors with the `faiss` or `lucene` engine. In a `byte` vector, each dimension is a signed 8-bit integer in the [-128, 127] range. -Byte vectors are supported only for the `lucene` engine. They are not supported for the `nmslib` and `faiss` engines. +Byte vectors are supported only for the `lucene` and `faiss` engines. They are not supported for the `nmslib` engine. {: .note} -In [k-NN benchmarking tests](https://github.com/opensearch-project/k-NN/tree/main/benchmarks/perf-tool), the use of `byte` rather than `float` vectors resulted in a significant reduction in storage and memory usage as well as improved indexing throughput and reduced query latency. Additionally, precision on recall was not greatly affected (note that recall can depend on various factors, such as the [quantization technique](#quantization-techniques) and data distribution). +In [k-NN benchmarking tests](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/vectorsearch), the use of `byte` rather than `float` vectors resulted in a significant reduction in storage and memory usage as well as improved indexing throughput and reduced query latency. Additionally, precision on recall was not greatly affected (note that recall can depend on various factors, such as the [quantization technique](#quantization-techniques) and data distribution). When using `byte` vectors, expect some loss of precision in the recall compared to using `float` vectors. Byte vectors are useful in large-scale applications and use cases that prioritize a reduced memory footprint in exchange for a minimal loss of recall. {: .important} - + +When using `byte` vectors with the `faiss` engine, we recommend using [SIMD optimization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#simd-optimization-for-the-faiss-engine), which helps to significantly reduce search latencies and improve indexing throughput. +{: .important} + Introduced in k-NN plugin version 2.9, the optional `data_type` parameter defines the data type of a vector. The default value of this parameter is `float`. 
To use a `byte` vector, set the `data_type` parameter to `byte` when creating mappings for an index: - ```json +### Example: HNSW + +The following example creates a byte vector index with the `lucene` engine and `hnsw` algorithm: + +```json PUT test-index { "settings": { @@ -114,13 +206,13 @@ PUT test-index "type": "knn_vector", "dimension": 3, "data_type": "byte", + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "lucene", "parameters": { - "ef_construction": 128, - "m": 24 + "ef_construction": 100, + "m": 16 } } } @@ -130,7 +222,7 @@ PUT test-index ``` {% include copy-curl.html %} -Then ingest documents as usual. Make sure each dimension in the vector is in the supported [-128, 127] range: +After creating the index, ingest documents as usual. Make sure each dimension in the vector is in the supported [-128, 127] range: ```json PUT test-index/_doc/1 @@ -166,9 +258,160 @@ GET test-index/_search ``` {% include copy-curl.html %} +### Example: IVF + +The `ivf` method requires a training step that creates and trains the model used to initialize the native library index during segment creation. For more information, see [Building a k-NN index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model). + +First, create an index that will contain byte vector training data. Specify the `faiss` engine and `ivf` algorithm and make sure that the `dimension` matches the dimension of the model you want to create: + +```json +PUT train-index +{ + "mappings": { + "properties": { + "train-field": { + "type": "knn_vector", + "dimension": 4, + "data_type": "byte" + } + } + } +} +``` +{% include copy-curl.html %} + +First, ingest training data containing byte vectors into the training index: + +```json +PUT _bulk +{ "index": { "_index": "train-index", "_id": "1" } } +{ "train-field": [127, 100, 0, -120] } +{ "index": { "_index": "train-index", "_id": "2" } } +{ "train-field": [2, -128, -10, 50] } +{ "index": { "_index": "train-index", "_id": "3" } } +{ "train-field": [13, -100, 5, 126] } +{ "index": { "_index": "train-index", "_id": "4" } } +{ "train-field": [5, 100, -6, -125] } +``` +{% include copy-curl.html %} + +Then, create and train the model named `byte-vector-model`. The model will be trained using the training data from the `train-field` in the `train-index`. Specify the `byte` data type: + +```json +POST _plugins/_knn/models/byte-vector-model/_train +{ + "training_index": "train-index", + "training_field": "train-field", + "dimension": 4, + "description": "model with byte data", + "data_type": "byte", + "method": { + "name": "ivf", + "engine": "faiss", + "space_type": "l2", + "parameters": { + "nlist": 1, + "nprobes": 1 + } + } +} +``` +{% include copy-curl.html %} + +To check the model training status, call the Get Model API: + +```json +GET _plugins/_knn/models/byte-vector-model?filter_path=state +``` +{% include copy-curl.html %} + +Once the training is complete, the `state` changes to `created`. 
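+
+Because the request uses `filter_path=state`, the response contains only the model state. After training succeeds, it might look similar to the following illustrative sketch:
+
+```json
+{
+  "state": "created"
+}
+```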
+ +Next, create an index that will initialize its native library indexes using the trained model: + +```json +PUT test-byte-ivf +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector": { + "type": "knn_vector", + "model_id": "byte-vector-model" + } + } + } +} +``` +{% include copy-curl.html %} + +Ingest the data containing the byte vectors that you want to search into the created index: + +```json +PUT _bulk?refresh=true +{"index": {"_index": "test-byte-ivf", "_id": "1"}} +{"my_vector": [7, 10, 15, -120]} +{"index": {"_index": "test-byte-ivf", "_id": "2"}} +{"my_vector": [10, -100, 120, -108]} +{"index": {"_index": "test-byte-ivf", "_id": "3"}} +{"my_vector": [1, -2, 5, -50]} +{"index": {"_index": "test-byte-ivf", "_id": "4"}} +{"my_vector": [9, -7, 45, -78]} +{"index": {"_index": "test-byte-ivf", "_id": "5"}} +{"my_vector": [80, -70, 127, -128]} +``` +{% include copy-curl.html %} + +Finally, search the data. Be sure to provide a byte vector in the k-NN vector field: + +```json +GET test-byte-ivf/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector": { + "vector": [100, -120, 50, -45], + "k": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +### Memory estimation + +In the best-case scenario, byte vectors require 25% of the memory required by 32-bit vectors. + +#### HNSW memory estimation + +The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. + +As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The memory requirement can be estimated as follows: + +```r +1.1 * (256 + 8 * 16) * 1,000,000 ~= 0.39 GB +``` + +#### IVF memory estimation + +The memory required for IVF is estimated to be `1.1 * ((dimension * num_vectors) + (4 * nlist * dimension))` bytes/vector, where `nlist` is the number of buckets to partition vectors into. + +As an example, assume that you have 1 million vectors with a dimension of 256 and an `nlist` of 128. The memory requirement can be estimated as follows: + +```r +1.1 * ((256 * 1,000,000) + (4 * 128 * 256)) ~= 0.27 GB +``` + + ### Quantization techniques -If your vectors are of the type `float`, you need to first convert them to the `byte` type before ingesting the documents. This conversion is accomplished by _quantizing the dataset_---reducing the precision of its vectors. There are many quantization techniques, such as scalar quantization or product quantization (PQ), which is used in the Faiss engine. The choice of quantization technique depends on the type of data you're using and can affect the accuracy of recall values. The following sections describe the scalar quantization algorithms that were used to quantize the [k-NN benchmarking test](https://github.com/opensearch-project/k-NN/tree/main/benchmarks/perf-tool) data for the [L2](#scalar-quantization-for-the-l2-space-type) and [cosine similarity](#scalar-quantization-for-the-cosine-similarity-space-type) space types. The provided pseudocode is for illustration purposes only. +If your vectors are of the type `float`, you need to first convert them to the `byte` type before ingesting the documents. This conversion is accomplished by _quantizing the dataset_---reducing the precision of its vectors. 
There are many quantization techniques, such as scalar quantization or product quantization (PQ), which is used in the Faiss engine. The choice of quantization technique depends on the type of data you're using and can affect the accuracy of recall values. The following sections describe the scalar quantization algorithms that were used to quantize the [k-NN benchmarking test](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/vectorsearch) data for the [L2](#scalar-quantization-for-the-l2-space-type) and [cosine similarity](#scalar-quantization-for-the-cosine-similarity-space-type) space types. The provided pseudocode is for illustration purposes only. #### Scalar quantization for the L2 space type @@ -267,7 +510,7 @@ return Byte(bval) ``` {% include copy.html %} -## Binary k-NN vectors +## Binary vectors You can reduce memory costs by a factor of 32 by switching from float to binary vectors. Using binary vector indexes can lower operational costs while maintaining high recall performance, making large-scale deployment more economical and efficient. @@ -305,14 +548,10 @@ PUT /test-binary-hnsw "type": "knn_vector", "dimension": 8, "data_type": "binary", + "space_type": "hamming", "method": { "name": "hnsw", - "space_type": "hamming", - "engine": "faiss", - "parameters": { - "ef_construction": 128, - "m": 24 - } + "engine": "faiss" } } } @@ -535,12 +774,12 @@ POST _plugins/_knn/models/test-binary-model/_train "dimension": 8, "description": "model with binary data", "data_type": "binary", + "space_type": "hamming", "method": { "name": "ivf", "engine": "faiss", - "space_type": "hamming", "parameters": { - "nlist": 1, + "nlist": 16, "nprobes": 1 } } diff --git a/_field-types/supported-field-types/match-only-text.md b/_field-types/supported-field-types/match-only-text.md index fd2c6b5850..534275bd3a 100644 --- a/_field-types/supported-field-types/match-only-text.md +++ b/_field-types/supported-field-types/match-only-text.md @@ -8,6 +8,8 @@ grand_parent: Supported field types --- # Match-only text field type +**Introduced 2.12** +{: .label .label-purple } A `match_only_text` field is a variant of a `text` field designed for full-text search when scoring and positional information of terms within a document are not critical. diff --git a/_field-types/supported-field-types/nested.md b/_field-types/supported-field-types/nested.md index 90d09177d1..4db270c1dc 100644 --- a/_field-types/supported-field-types/nested.md +++ b/_field-types/supported-field-types/nested.md @@ -11,6 +11,8 @@ redirect_from: --- # Nested field type +**Introduced 1.0** +{: .label .label-purple } A nested field type is a special type of [object field type]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/object/). @@ -312,3 +314,8 @@ Parameter | Description `include_in_parent` | A Boolean value that specifies whether all fields in the child nested object should also be added to the parent document in flattened form. Default is `false`. `include_in_root` | A Boolean value that specifies whether all fields in the child nested object should also be added to the root document in flattened form. Default is `false`. `properties` | Fields of this object, which can be of any supported type. New properties can be dynamically added to this object if `dynamic` is set to `true`. + +## Next steps + +- Learn about [joining queries]({{site.url}}{{site.baseurl}}/query-dsl/joining/) on nested fields. 
+- Learn about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). \ No newline at end of file diff --git a/_field-types/supported-field-types/object-fields.md b/_field-types/supported-field-types/object-fields.md index 429c5b94c7..e683e70f0d 100644 --- a/_field-types/supported-field-types/object-fields.md +++ b/_field-types/supported-field-types/object-fields.md @@ -19,5 +19,5 @@ Field data type | Description [`object`]({{site.url}}{{site.baseurl}}/field-types/object/) | A JSON object. [`nested`]({{site.url}}{{site.baseurl}}/field-types/nested/) | Used when objects in an array need to be indexed independently as separate documents. [`flat_object`]({{site.url}}{{site.baseurl}}/field-types/flat-object/) | A JSON object treated as a string. -[`join`]({{site.url}}{{site.baseurl}}/field-types/join/) | Establishes a parent-child relationship between documents in the same index. +[`join`]({{site.url}}{{site.baseurl}}/field-types/join/) | Establishes a parent/child relationship between documents in the same index. diff --git a/_field-types/supported-field-types/object.md b/_field-types/supported-field-types/object.md index db539a9608..ee50f5af9d 100644 --- a/_field-types/supported-field-types/object.md +++ b/_field-types/supported-field-types/object.md @@ -11,6 +11,8 @@ redirect_from: --- # Object field type +**Introduced 1.0** +{: .label .label-purple } An object field type contains a JSON object (a set of name/value pairs). A value in a JSON object may be another JSON object. It is not necessary to specify `object` as the type when mapping object fields because `object` is the default type. diff --git a/_field-types/supported-field-types/percolator.md b/_field-types/supported-field-types/percolator.md index 92325b6127..2b067cf595 100644 --- a/_field-types/supported-field-types/percolator.md +++ b/_field-types/supported-field-types/percolator.md @@ -10,6 +10,8 @@ redirect_from: --- # Percolator field type +**Introduced 1.0** +{: .label .label-purple } A percolator field type specifies to treat this field as a query. Any JSON object field can be marked as a percolator field. Normally, documents are indexed and searches are run against them. When you use a percolator field, you store a search, and later the percolate query matches documents to that search. diff --git a/_field-types/supported-field-types/range.md b/_field-types/supported-field-types/range.md index 22ae1d619e..1001bae584 100644 --- a/_field-types/supported-field-types/range.md +++ b/_field-types/supported-field-types/range.md @@ -10,6 +10,8 @@ redirect_from: --- # Range field types +**Introduced 1.0** +{: .label .label-purple } The following table lists all range field types that OpenSearch supports. diff --git a/_field-types/supported-field-types/rank.md b/_field-types/supported-field-types/rank.md index a4ec0fac4c..f57c540cf5 100644 --- a/_field-types/supported-field-types/rank.md +++ b/_field-types/supported-field-types/rank.md @@ -10,6 +10,8 @@ redirect_from: --- # Rank field types +**Introduced 1.0** +{: .label .label-purple } The following table lists all rank field types that OpenSearch supports. 
diff --git a/_field-types/supported-field-types/search-as-you-type.md b/_field-types/supported-field-types/search-as-you-type.md index b9141e6b8e..55774d432a 100644 --- a/_field-types/supported-field-types/search-as-you-type.md +++ b/_field-types/supported-field-types/search-as-you-type.md @@ -11,6 +11,8 @@ redirect_from: --- # Search-as-you-type field type +**Introduced 1.0** +{: .label .label-purple } A search-as-you-type field type provides search-as-you-type functionality using both prefix and infix completion. diff --git a/_field-types/supported-field-types/star-tree.md b/_field-types/supported-field-types/star-tree.md new file mode 100644 index 0000000000..2bfccb6632 --- /dev/null +++ b/_field-types/supported-field-types/star-tree.md @@ -0,0 +1,184 @@ +--- +layout: default +title: Star-tree +nav_order: 61 +parent: Supported field types +--- + +# Star-tree field type + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +A star-tree index precomputes aggregations, accelerating the performance of aggregation queries. +If a star-tree index is configured as part of an index mapping, the star-tree index is created and maintained as data is ingested in real time. + +OpenSearch will automatically use the star-tree index to optimize aggregations if the queried fields are part of star-tree index dimension fields and the aggregations are on star-tree index metric fields. No changes are required in the query syntax or the request parameters. + +For more information, see [Star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/). + +## Prerequisites + +To use a star-tree index, follow the instructions in [Enabling a star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index#enabling-a-star-tree-index). + +## Examples + +The following examples show how to use a star-tree index. + +### Star-tree index mappings + +Define star-tree index mappings in the `composite` section in `mappings`. + +The following example API request creates a corresponding star-tree index named`request_aggs`. To compute metric aggregations for `request_size` and `latency` fields with queries on `port` and `status` fields, configure the following mappings: + +```json +PUT logs +{ + "settings": { + "index.number_of_shards": 1, + "index.number_of_replicas": 0, + "index.composite_index": true + }, + "mappings": { + "composite": { + "request_aggs": { + "type": "star_tree", + "config": { + "max_leaf_docs": 10000, + "skip_star_node_creation_for_dimensions": [ + "port" + ], + "ordered_dimensions": [ + { + "name": "status" + }, + { + "name": "port" + } + ], + "metrics": [ + { + "name": "request_size", + "stats": [ + "sum", + "value_count", + "min", + "max" + ] + }, + { + "name": "latency", + "stats": [ + "sum", + "value_count", + "min", + "max" + ] + } + ] + } + } + }, + "properties": { + "status": { + "type": "integer" + }, + "port": { + "type": "integer" + }, + "request_size": { + "type": "integer" + }, + "latency": { + "type": "scaled_float", + "scaling_factor": 10 + } + } + } +} +``` + +## Star-tree index configuration options + +You can customize your star-tree implementation using the following `config` options in the `mappings` section. These options cannot be modified without reindexing. 
+
+| Parameter | Description |
+| :--- | :--- |
+| `ordered_dimensions` | A [list of fields](#ordered-dimensions) based on which metrics will be aggregated in a star-tree index. Required. |
+| `metrics` | A [list of metric](#metrics) fields required for performing aggregations. Required. |
+| `max_leaf_docs` | The maximum number of star-tree documents that a leaf node can point to. After the maximum number of documents is reached, child nodes will be created based on the unique value of the next field in the `ordered_dimensions` (if any). Default is `10000`. A lower value will use more storage but result in faster query performance. Conversely, a higher value will use less storage but result in slower query performance. For more information, see [Star-tree indexing structure]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/#star-tree-index-structure). |
+| `skip_star_node_creation_for_dimensions` | A list of dimensions for which a star-tree index will skip star node creation. When `true`, this reduces storage size at the expense of query performance. Default is `false`. For more information about star nodes, see [Star-tree indexing structure]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/#star-tree-index-structure). |
+
+
+### Ordered dimensions
+
+The `ordered_dimensions` parameter contains fields based on which metrics will be aggregated in a star-tree index. The star-tree index will be selected for querying only if all the fields in the query are part of the `ordered_dimensions`.
+
+When using the `ordered_dimensions` parameter, follow these best practices:
+
+- The order of dimensions matters. Define the dimensions in order from the highest cardinality to the lowest cardinality for efficient storage and query pruning.
+- Avoid using high-cardinality fields as dimensions. High-cardinality fields adversely affect storage space, indexing throughput, and query performance.
+- Currently, fields supported by the `ordered_dimensions` parameter are all [numeric field types](https://opensearch.org/docs/latest/field-types/supported-field-types/numeric/), with the exception of `unsigned_long`. For more information, see [GitHub issue #15231](https://github.com/opensearch-project/OpenSearch/issues/15231).
+- Support for other field types, such as `keyword` and `ip`, will be added in future versions. For more information, see [GitHub issue #16232](https://github.com/opensearch-project/OpenSearch/issues/16232).
+- A minimum of `2` and a maximum of `10` dimensions are supported per star-tree index.
+
+The `ordered_dimensions` parameter supports the following property.
+
+| Parameter | Required/Optional | Description |
+| :--- | :--- | :--- |
+| `name` | Required | The name of the field. The field name should be present in the `properties` section as part of the index `mapping`. Ensure that the `doc_values` setting is `enabled` for any associated fields. |
+
+
+### Metrics
+
+Configure any metric fields on which you need to perform aggregations. `Metrics` are required as part of a star-tree index configuration.
+
+When using `metrics`, follow these best practices:
+
+- Currently, fields supported by `metrics` are all [numeric field types](https://opensearch.org/docs/latest/field-types/supported-field-types/numeric/), with the exception of `unsigned_long`. For more information, see [GitHub issue #15231](https://github.com/opensearch-project/OpenSearch/issues/15231).
+- Supported metric aggregations include `Min`, `Max`, `Sum`, `Avg`, and `Value_count`.
+  - `Avg` is a derived metric based on `Sum` and `Value_count`; it is computed at query time rather than indexed. The remaining base metrics are indexed.
+- A maximum of `100` base metrics is supported per star-tree index.
+
+If `Min`, `Max`, `Sum`, and `Value_count` are defined as `metrics` for each field, then up to 25 such fields can be configured, as shown in the following example:
+
+```json
+{
+  "metrics": [
+    {
+      "name": "field1",
+      "stats": [
+        "sum",
+        "value_count",
+        "min",
+        "max"
+      ]
+    },
+    ...,
+    {
+      "name": "field25",
+      "stats": [
+        "sum",
+        "value_count",
+        "min",
+        "max"
+      ]
+    }
+  ]
+}
+```
+
+
+#### Properties
+
+The `metrics` parameter supports the following properties.
+
+| Parameter | Required/Optional | Description |
+| :--- | :--- | :--- |
+| `name` | Required | The name of the field. The field name should be present in the `properties` section as part of the index `mapping`. Ensure that the `doc_values` setting is `enabled` for any associated fields. |
+| `stats` | Optional | A list of metric aggregations computed for each field. You can choose from `Min`, `Max`, `Sum`, `Avg`, and `Value_count`.<br/>Default is `Sum` and `Value_count`.<br/>`Avg` is a derived metric statistic that will automatically be supported in queries if `Sum` and `Value_count` are present as part of metric `stats`. |
+
+
+## Supported queries and aggregations
+
+For more information about supported queries and aggregations, see [Supported queries and aggregations for a star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/#supported-queries-and-aggregations).
+
diff --git a/_field-types/supported-field-types/text.md b/_field-types/supported-field-types/text.md
index 16350c0cb3..b06bec2187 100644
--- a/_field-types/supported-field-types/text.md
+++ b/_field-types/supported-field-types/text.md
@@ -11,6 +11,8 @@ redirect_from:
 ---
 
 # Text field type
+**Introduced 1.0**
+{: .label .label-purple }
 
 A `text` field type contains a string that is analyzed. It is used for full-text search because it allows partial matches. Searches for multiple terms can match some but not all of them. Depending on the analyzer, results can be case insensitive, stemmed, have stopwords removed, have synonyms applied, and so on.
 
diff --git a/_field-types/supported-field-types/token-count.md b/_field-types/supported-field-types/token-count.md
index 6c3445e6a7..11eeff7854 100644
--- a/_field-types/supported-field-types/token-count.md
+++ b/_field-types/supported-field-types/token-count.md
@@ -11,6 +11,8 @@ redirect_from:
 ---
 
 # Token count field type
+**Introduced 1.0**
+{: .label .label-purple }
 
 A token count field type stores the number of analyzed tokens in a string.
 
diff --git a/_field-types/supported-field-types/unsigned-long.md b/_field-types/supported-field-types/unsigned-long.md
index dde8d25dee..4c38cb3090 100644
--- a/_field-types/supported-field-types/unsigned-long.md
+++ b/_field-types/supported-field-types/unsigned-long.md
@@ -8,6 +8,8 @@ has_children: false
 ---
 
 # Unsigned long field type
+**Introduced 2.8**
+{: .label .label-purple }
 
 The `unsigned_long` field type is a numeric field type that represents an unsigned 64-bit integer with a minimum value of 0 and a maximum value of 2<sup>64</sup> − 1.
In the following example, `counter` is mapped as an `unsigned_long` field: diff --git a/_field-types/supported-field-types/wildcard.md b/_field-types/supported-field-types/wildcard.md index c438f35c62..0f8c176135 100644 --- a/_field-types/supported-field-types/wildcard.md +++ b/_field-types/supported-field-types/wildcard.md @@ -8,6 +8,8 @@ grand_parent: Supported field types --- # Wildcard field type +**Introduced 2.15** +{: .label .label-purple } A `wildcard` field is a variant of a `keyword` field designed for arbitrary substring and regular expression matching. diff --git a/_field-types/supported-field-types/xy-point.md b/_field-types/supported-field-types/xy-point.md index 57b6f64758..0d066b9f09 100644 --- a/_field-types/supported-field-types/xy-point.md +++ b/_field-types/supported-field-types/xy-point.md @@ -11,6 +11,8 @@ redirect_from: --- # xy point field type +**Introduced 2.4** +{: .label .label-purple } An xy point field type contains a point in a two-dimensional Cartesian coordinate system, specified by x and y coordinates. It is based on the Lucene [XYPoint](https://lucene.apache.org/core/9_3_0/core/org/apache/lucene/geo/XYPoint.html) field type. The xy point field type is similar to the [geopoint]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point/) field type, but does not have the range limitations of geopoint. The coordinates of an xy point are single-precision floating-point values. For information about the range and precision of floating-point values, see [Numeric field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/). diff --git a/_field-types/supported-field-types/xy-shape.md b/_field-types/supported-field-types/xy-shape.md index f1c7191240..9dcbafceae 100644 --- a/_field-types/supported-field-types/xy-shape.md +++ b/_field-types/supported-field-types/xy-shape.md @@ -11,6 +11,8 @@ redirect_from: --- # xy shape field type +**Introduced 2.4** +{: .label .label-purple } An xy shape field type contains a shape, such as a polygon or a collection of xy points. It is based on the Lucene [XYShape](https://lucene.apache.org/core/9_3_0/core/org/apache/lucene/document/XYShape.html) field type. To index an xy shape, OpenSearch tessellates the shape into a triangular mesh and stores each triangle in a BKD tree (a set of balanced k-dimensional trees). This provides a 10<sup>-7</sup>decimal degree of precision, which represents near-perfect spatial resolution. diff --git a/_getting-started/communicate.md b/_getting-started/communicate.md index 3472270c30..773558fb21 100644 --- a/_getting-started/communicate.md +++ b/_getting-started/communicate.md @@ -28,7 +28,7 @@ curl -X GET "http://localhost:9200/_cluster/health" If you're using the Security plugin, provide the username and password in the request: ```bash -curl -X GET "http://localhost:9200/_cluster/health" -ku admin:<custom-admin-password> +curl -X GET "https://localhost:9200/_cluster/health" -ku admin:<custom-admin-password> ``` {% include copy.html %} @@ -317,4 +317,4 @@ Once a field is created, you cannot change its type. Changing a field type requi ## Next steps -- See [Ingest data into OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/) to learn about ingestion options. \ No newline at end of file +- See [Ingest data into OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/) to learn about ingestion options. 
diff --git a/_getting-started/ingest-data.md b/_getting-started/ingest-data.md index 73cf1502f7..866a88e68a 100644 --- a/_getting-started/ingest-data.md +++ b/_getting-started/ingest-data.md @@ -50,32 +50,32 @@ Use the following steps to create a sample index and define field mappings for t ``` {% include copy.html %} -1. Download [ecommerce.json](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json). This file contains the index data formatted so that it can be ingested by the Bulk API: +1. Download [ecommerce.ndjson](https://github.com/opensearch-project/documentation-website/blob/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.ndjson). This file contains the index data formatted so that it can be ingested by the Bulk API: To use cURL, send the following request: ```bash - curl -O https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json + curl -O https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.ndjson ``` {% include copy.html %} To use wget, send the following request: ``` - wget https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.json + wget https://raw.githubusercontent.com/opensearch-project/documentation-website/{{site.opensearch_major_minor_version}}/assets/examples/ecommerce.ndjson ``` {% include copy.html %} 1. Define the field mappings provided in the mapping file: ```bash - curl -H "Content-Type: application/x-ndjson" -X PUT "https://localhost:9200/ecommerce" -ku admin:<custom-admin-password> --data-binary "@ecommerce-field_mappings.json" + curl -H "Content-Type: application/json" -X PUT "https://localhost:9200/ecommerce" -ku admin:<custom-admin-password> --data-binary "@ecommerce-field_mappings.json" ``` {% include copy.html %} 1. Upload the documents using the Bulk API: ```bash - curl -H "Content-Type: application/x-ndjson" -X PUT "https://localhost:9200/ecommerce/_bulk" -ku admin:<custom-admin-password> --data-binary "@ecommerce.json" + curl -H "Content-Type: application/x-ndjson" -X PUT "https://localhost:9200/ecommerce/_bulk" -ku admin:<custom-admin-password> --data-binary "@ecommerce.ndjson" ``` {% include copy.html %} diff --git a/_getting-started/search-data.md b/_getting-started/search-data.md index c6970e7e7b..8e4169fbae 100644 --- a/_getting-started/search-data.md +++ b/_getting-started/search-data.md @@ -104,7 +104,7 @@ OpenSearch returns the matching documents: } ``` -## Response fields +## Response body fields The preceding response contains the following fields. diff --git a/_im-plugin/index-context.md b/_im-plugin/index-context.md new file mode 100644 index 0000000000..be0dbd527d --- /dev/null +++ b/_im-plugin/index-context.md @@ -0,0 +1,175 @@ +--- +layout: default +title: Index context +nav_order: 14 +redirect_from: + - /opensearch/index-context/ +--- + +# Index context + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +Index context declares the use case for an index. 
Using the context information, OpenSearch applies a predetermined set of settings and mappings, which provides the following benefits: + +- Optimized performance +- Settings tuned to your specific use case +- Accurate mappings and aliases based on [OpenSearch Integrations]({{site.url}}{{site.baseurl}}/integrations/) + +The settings and metadata configuration that are applied using component templates are automatically loaded when your cluster starts. Component templates that start with `@abc_template@` or Application-Based Configuration (ABC) templates can only be used through a `context` object declaration, in order to prevent configuration issues. +{: .warning} + + +## Installation + +To install the index context feature: + +1. Install the `opensearch-system-templates` plugin on all nodes in your cluster using one of the [installation methods]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#install). + +2. Set the feature flag `opensearch.experimental.feature.application_templates.enabled` to `true`. For more information about enabling and disabling feature flags, see [Enabling experimental features]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/experimental/). + +3. Set the `cluster.application_templates.enabled` setting to `true`. For instructions on how to configure OpenSearch, see [configuring settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings). + +## Using the `context` setting + +Use the `context` setting with the Index API to add use-case-specific context. + +### Considerations + +Consider the following when using the `context` parameter during index creation: + +1. If you use the `context` parameter to create an index, you cannot include any settings declared in the index context during index creation or dynamic settings updates. +2. The index context becomes permanent when set on an index or index template. + +When you adhere to these limitations, suggested configurations or mappings are uniformly applied on indexed data within the specified context. + +### Examples + +The following examples show how to use index context. + + +#### Create an index + +The following example request creates an index in which to store metric data by declaring a `metrics` mapping as the context: + +```json +PUT /my-metrics-index +{ + "context": { + "name": "metrics" + } +} +``` +{% include copy-curl.html %} + +After creation, the context is added to the index and the corresponding settings are applied: + + +**GET request** + +```json +GET /my-metrics-index +``` +{% include copy-curl.html %} + + +**Response** + +```json +{ + "my-metrics-index": { + "aliases": {}, + "mappings": {}, + "settings": { + "index": { + "codec": "zstd_no_dict", + "refresh_interval": "60s", + "number_of_shards": "1", + "provided_name": "my-metrics-index", + "merge": { + "policy": "log_byte_size" + }, + "context": { + "created_version": "1", + "current_version": "1" + }, + ... + } + }, + "context": { + "name": "metrics", + "version": "_latest" + } + } +} +``` + + +#### Create an index template + +You can also use the `context` parameter when creating an index template. 
The following example request creates an index template with the context information as `logs`:
+
+```json
+PUT _index_template/my-logs
+{
+  "context": {
+    "name": "logs",
+    "version": "1"
+  },
+  "index_patterns": [
+    "my-logs-*"
+  ]
+}
+```
+{% include copy-curl.html %}
+
+All indexes created using this index template will receive the metadata provided by the associated component template. The following request and response show how `context` is added to the template:
+
+**Get index template**
+
+```json
+GET _index_template/my-logs
+```
+{% include copy-curl.html %}
+
+**Response**
+
+```json
+{
+  "index_templates": [
+    {
+      "name": "my-logs",
+      "index_template": {
+        "index_patterns": [
+          "my-logs-*"
+        ],
+        "context": {
+          "name": "logs",
+          "version": "1"
+        }
+      }
+    }
+  ]
+}
+```
+
+If any of the settings, mappings, or aliases declared directly in your template conflict with those in the backing component template for the context, the backing component template takes priority during index creation.
+
+
+## Available context templates
+
+The following templates are available to be used through the `context` parameter as of OpenSearch 2.17:
+
+- `logs`
+- `metrics`
+- `nginx-logs`
+- `amazon-cloudtrail-logs`
+- `amazon-elb-logs`
+- `amazon-s3-logs`
+- `apache-web-logs`
+- `k8s-logs`
+
+For more information about these templates, see the [OpenSearch system templates repository](https://github.com/opensearch-project/opensearch-system-templates/tree/main/src/main/resources/org/opensearch/system/applicationtemplates/v1).
+
+To view the current version of these templates on your cluster, use `GET /_component_template`.
diff --git a/_im-plugin/ism/policies.md b/_im-plugin/ism/policies.md
index e6262e883b..27c37e67ea 100644
--- a/_im-plugin/ism/policies.md
+++ b/_im-plugin/ism/policies.md
@@ -539,7 +539,7 @@ GET _plugins/_rollup/jobs/<rollup_id>/_explain
 }
 ````
 
-#### Request fields
+#### Request body fields
 
 Request fields are required when creating an ISM policy. You can reference the [Index rollups API]({{site.url}}{{site.baseurl}}/im-plugin/index-rollups/rollup-api/#create-or-update-an-index-rollup-job) page for request field options.
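+
+For orientation, the following is a minimal sketch of where those rollup request fields appear inside an ISM policy's `rollup` action. The policy name, target index, interval, and field names are placeholders; refer to the Index rollups API page linked above for the authoritative list of fields:
+
+```json
+PUT _plugins/_ism/policies/example_rollup_policy
+{
+  "policy": {
+    "description": "Example policy that rolls up an index",
+    "default_state": "rollup",
+    "states": [
+      {
+        "name": "rollup",
+        "actions": [
+          {
+            "rollup": {
+              "ism_rollup": {
+                "description": "Example rollup job",
+                "target_index": "example-rollup-target",
+                "page_size": 200,
+                "dimensions": [
+                  {
+                    "date_histogram": {
+                      "source_field": "order_date",
+                      "fixed_interval": "60m"
+                    }
+                  }
+                ],
+                "metrics": [
+                  {
+                    "source_field": "total_quantity",
+                    "metrics": [
+                      { "avg": {} },
+                      { "max": {} }
+                    ]
+                  }
+                ]
+              }
+            }
+          }
+        ],
+        "transitions": []
+      }
+    ]
+  }
+}
+```
+{% include copy-curl.html %}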
diff --git a/_includes/cards.html b/_includes/cards.html index 6d958e61a5..5ab37b8c27 100644 --- a/_includes/cards.html +++ b/_includes/cards.html @@ -30,8 +30,14 @@ <p class="description">Measure performance metrics for your OpenSearch cluster</p> <p class="last-link">Documentation →</p> </div> + + <div class="card"> + <a href="{{site.url}}/docs/latest/migration-assistant/" class='card-link'></a> + <p class="heading">Migration Assistant</p> + <p class="description">Migrate to OpenSearch from other platforms</p> + <p class="last-link">Documentation →</p> + </div> </div> </div> - diff --git a/_includes/header.html b/_includes/header.html index 20d82c451e..32d5b14774 100644 --- a/_includes/header.html +++ b/_includes/header.html @@ -82,7 +82,7 @@ {% endif %} <div role="banner" id="top"> <div class="navigation-container"> - <a class="navigation-container--logo" href="{{ '/' | relative_url }}"> + <a class="navigation-container--logo" href="https://opensearch.org/"> OpenSearch <svg width="200" height="39" viewBox="0 0 200 39" fill="none" xmlns="http://www.w3.org/2000/svg"> <g clip-path="url(#clip0_723_1352)"> diff --git a/_ingest-pipelines/processors/append.md b/_ingest-pipelines/processors/append.md index 8101cf97c9..fe222e647f 100644 --- a/_ingest-pipelines/processors/append.md +++ b/_ingest-pipelines/processors/append.md @@ -43,6 +43,7 @@ Parameter | Required/Optional | Description | `description` | Optional | A brief description of the processor. | `if` | Optional | A condition for running the processor. | `ignore_failure` | Optional | Specifies whether the processor continues execution even if it encounters errors. If set to `true`, failures are ignored. Default is `false`. | +`allow_duplicates` | Optional | Specifies whether to append the values already contained in the field. If `true`, duplicate values are appended. Otherwise, they are skipped. | `on_failure` | Optional | A list of processors to run if the processor fails. | `tag` | Optional | An identifier tag for the processor. Useful for debugging in order to distinguish between processors of the same type. | diff --git a/_ingest-pipelines/processors/text-chunking.md b/_ingest-pipelines/processors/text-chunking.md index 97229d2aaa..0141ba1564 100644 --- a/_ingest-pipelines/processors/text-chunking.md +++ b/_ingest-pipelines/processors/text-chunking.md @@ -31,16 +31,20 @@ The following is the syntax for the `text_chunking` processor: The following table lists the required and optional parameters for the `text_chunking` processor. -| Parameter | Data type | Required/Optional | Description | -|:---|:---|:---|:---| -| `field_map` | Object | Required | Contains key-value pairs that specify the mapping of a text field to the output field. | -| `field_map.<input_field>` | String | Required | The name of the field from which to obtain text for generating chunked passages. | -| `field_map.<output_field>` | String | Required | The name of the field in which to store the chunked results. | -| `algorithm` | Object | Required | Contains at most one key-value pair that specifies the chunking algorithm and parameters. | -| `algorithm.<name>` | String | Optional | The name of the chunking algorithm. Valid values are [`fixed_token_length`](#fixed-token-length-algorithm) or [`delimiter`](#delimiter-algorithm). Default is `fixed_token_length`. | -| `algorithm.<parameters>` | Object | Optional | The parameters for the chunking algorithm. By default, contains the default parameters of the `fixed_token_length` algorithm. 
| -| `description` | String | Optional | A brief description of the processor. | -| `tag` | String | Optional | An identifier tag for the processor. Useful when debugging in order to distinguish between processors of the same type. | +| Parameter | Data type | Required/Optional | Description | +|:----------------------------|:----------|:---|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `field_map` | Object | Required | Contains key-value pairs that specify the mapping of a text field to the output field. | +| `field_map.<input_field>` | String | Required | The name of the field from which to obtain text for generating chunked passages. | +| `field_map.<output_field>` | String | Required | The name of the field in which to store the chunked results. | +| `algorithm` | Object | Required | Contains at most one key-value pair that specifies the chunking algorithm and parameters. | +| `algorithm.<name>` | String | Optional | The name of the chunking algorithm. Valid values are [`fixed_token_length`](#fixed-token-length-algorithm) or [`delimiter`](#delimiter-algorithm). Default is `fixed_token_length`. | +| `algorithm.<parameters>` | Object | Optional | The parameters for the chunking algorithm. By default, contains the default parameters of the `fixed_token_length` algorithm. | +| `ignore_missing` | Boolean | Optional | If `true`, empty fields are excluded from the output. If `false`, the output will contain an empty list for every empty field. Default is `false`. | +| `description` | String | Optional | A brief description of the processor. | +| `tag` | String | Optional | An identifier tag for the processor. Useful when debugging in order to distinguish between processors of the same type. | + +To perform chunking on nested fields, specify `input_field` and `output_field` values as JSON objects. Dot paths of nested fields are not supported. For example, use `"field_map": { "foo": { "bar": "bar_chunk"} }` instead of `"field_map": { "foo.bar": "foo.bar_chunk"}`. +{: .note} ### Fixed token length algorithm diff --git a/_install-and-configure/additional-plugins/index.md b/_install-and-configure/additional-plugins/index.md index 87d0662442..afc17cd8b2 100644 --- a/_install-and-configure/additional-plugins/index.md +++ b/_install-and-configure/additional-plugins/index.md @@ -9,29 +9,30 @@ nav_order: 10 There are many more plugins available in addition to those provided by the standard distribution of OpenSearch. These additional plugins have been built by OpenSearch developers or members of the OpenSearch community. While it isn't possible to provide an exhaustive list (because many plugins are not maintained in an OpenSearch GitHub repository), the following plugins, available in the [OpenSearch/plugins](https://github.com/opensearch-project/OpenSearch/tree/main/plugins) directory on GitHub, are some of the plugins that can be installed using one of the installation options, for example, using the command `bin/opensearch-plugin install <plugin-name>`. 
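+
+After installing a plugin, you can confirm that it is loaded by calling the CAT Plugins API. The following request is an illustrative example; the output columns vary by OpenSearch version:
+
+```json
+GET _cat/plugins?v
+```
+{% include copy-curl.html %}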
-| Plugin name | Earliest available version | -| :--- | :--- | -| analysis-icu | 1.0.0 | -| analysis-kuromoji | 1.0.0 | -| analysis-nori | 1.0.0 | -| analysis-phonetic | 1.0.0 | -| analysis-smartcn | 1.0.0 | -| analysis-stempel | 1.0.0 | -| analysis-ukrainian | 1.0.0 | -| discovery-azure-classic | 1.0.0 | -| discovery-ec2 | 1.0.0 | -| discovery-gce | 1.0.0 | -| [`ingest-attachment`]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/ingest-attachment-plugin/) | 1.0.0 | -| mapper-annotated-text | 1.0.0 | -| mapper-murmur3 | 1.0.0 | -| [`mapper-size`]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/mapper-size-plugin/) | 1.0.0 | -| query-insights | 2.12.0 | -| repository-azure | 1.0.0 | -| repository-gcs | 1.0.0 | -| repository-hdfs | 1.0.0 | -| repository-s3 | 1.0.0 | -| store-smb | 1.0.0 | -| transport-nio | 1.0.0 | +| Plugin name | Earliest available version | +|:-----------------------------------------------------------------------------------------------------------------------|:---------------------------| +| analysis-icu | 1.0.0 | +| analysis-kuromoji | 1.0.0 | +| analysis-nori | 1.0.0 | +| [`analysis-phonenumber`]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/phone-analyzers/) | 2.18.0 | +| analysis-phonetic | 1.0.0 | +| analysis-smartcn | 1.0.0 | +| analysis-stempel | 1.0.0 | +| analysis-ukrainian | 1.0.0 | +| discovery-azure-classic | 1.0.0 | +| discovery-ec2 | 1.0.0 | +| discovery-gce | 1.0.0 | +| [`ingest-attachment`]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/ingest-attachment-plugin/) | 1.0.0 | +| mapper-annotated-text | 1.0.0 | +| mapper-murmur3 | 1.0.0 | +| [`mapper-size`]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/mapper-size-plugin/) | 1.0.0 | +| query-insights | 2.12.0 | +| repository-azure | 1.0.0 | +| repository-gcs | 1.0.0 | +| repository-hdfs | 1.0.0 | +| repository-s3 | 1.0.0 | +| store-smb | 1.0.0 | +| transport-nio | 1.0.0 | ## Related articles diff --git a/_install-and-configure/configuring-opensearch/cluster-settings.md b/_install-and-configure/configuring-opensearch/cluster-settings.md index 9af0f5c5b1..65804a5de9 100644 --- a/_install-and-configure/configuring-opensearch/cluster-settings.md +++ b/_install-and-configure/configuring-opensearch/cluster-settings.md @@ -13,9 +13,9 @@ To learn more about static and dynamic settings, see [Configuring OpenSearch]({{ ## Cluster-level routing and allocation settings -OpenSearch supports the following cluster-level routing and shard allocation settings. All settings in this list are dynamic: +OpenSearch supports the following cluster-level routing and shard allocation settings: -- `cluster.routing.allocation.enable` (String): Enables or disables allocation for specific kinds of shards. +- `cluster.routing.allocation.enable` (Dynamic, string): Enables or disables allocation for specific kinds of shards. Valid values are: - `all` – Allows shard allocation for all types of shards. @@ -25,17 +25,17 @@ OpenSearch supports the following cluster-level routing and shard allocation set Default is `all`. -- `cluster.routing.allocation.node_concurrent_incoming_recoveries` (Integer): Configures how many concurrent incoming shard recoveries are allowed to happen on a node. Default is `2`. +- `cluster.routing.allocation.node_concurrent_incoming_recoveries` (Dynamic, integer): Configures how many concurrent incoming shard recoveries are allowed to occur on a node. Default is `2`. 
-- `cluster.routing.allocation.node_concurrent_outgoing_recoveries` (Integer): Configures how many concurrent outgoing shard recoveries are allowed to happen on a node. Default is `2`. +- `cluster.routing.allocation.node_concurrent_outgoing_recoveries` (Dynamic, integer): Configures how many concurrent outgoing shard recoveries are allowed to occur on a node. Default is `2`. -- `cluster.routing.allocation.node_concurrent_recoveries` (String): Used to set `cluster.routing.allocation.node_concurrent_incoming_recoveries` and `cluster.routing.allocation.node_concurrent_outgoing_recoveries` to the same value. +- `cluster.routing.allocation.node_concurrent_recoveries` (Dynamic, string): Used to set `cluster.routing.allocation.node_concurrent_incoming_recoveries` and `cluster.routing.allocation.node_concurrent_outgoing_recoveries` to the same value. -- `cluster.routing.allocation.node_initial_primaries_recoveries` (Integer): Sets the number of recoveries for unassigned primaries after a node restart. Default is `4`. +- `cluster.routing.allocation.node_initial_primaries_recoveries` (Dynamic, integer): Sets the number of recoveries for unassigned primaries after a node restart. Default is `4`. -- `cluster.routing.allocation.same_shard.host` (Boolean): When set to `true`, multiple copies of a shard are prevented from being allocated to distinct nodes on the same host. Default is `false`. +- `cluster.routing.allocation.same_shard.host` (Dynamic, Boolean): When set to `true`, multiple copies of a shard are prevented from being allocated to distinct nodes on the same host. Default is `false`. -- `cluster.routing.rebalance.enable` (String): Enables or disables rebalancing for specific kinds of shards. +- `cluster.routing.rebalance.enable` (Dynamic, string): Enables or disables rebalancing for specific kinds of shards. Valid values are: - `all` – Allows shard balancing for all types of shards. @@ -45,7 +45,7 @@ OpenSearch supports the following cluster-level routing and shard allocation set Default is `all`. -- `cluster.routing.allocation.allow_rebalance` (String): Specifies when shard rebalancing is allowed. +- `cluster.routing.allocation.allow_rebalance` (Dynamic, string): Specifies when shard rebalancing is allowed. Valid values are: - `always` – Always allow rebalancing. @@ -54,35 +54,35 @@ OpenSearch supports the following cluster-level routing and shard allocation set Default is `indices_all_active`. -- `cluster.routing.allocation.cluster_concurrent_rebalance` (Integer): Allows you to control how many concurrent shard rebalances are allowed across a cluster. Default is `2`. +- `cluster.routing.allocation.cluster_concurrent_rebalance` (Dynamic, integer): Allows you to control how many concurrent shard rebalances are allowed across a cluster. Default is `2`. -- `cluster.routing.allocation.balance.shard` (Floating point): Defines the weight factor for the total number of shards allocated per node. Default is `0.45`. +- `cluster.routing.allocation.balance.shard` (Dynamic, floating point): Defines the weight factor for the total number of shards allocated per node. Default is `0.45`. -- `cluster.routing.allocation.balance.index` (Floating point): Defines the weight factor for the number of shards per index allocated on a node. Default is `0.55`. +- `cluster.routing.allocation.balance.index` (Dynamic, floating point): Defines the weight factor for the number of shards per index allocated on a node. Default is `0.55`. 
-- `cluster.routing.allocation.balance.threshold` (Floating point): The minimum optimization value of operations that should be performed. Default is `1.0`. +- `cluster.routing.allocation.balance.threshold` (Dynamic, floating point): The minimum optimization value of operations that should be performed. Default is `1.0`. -- `cluster.routing.allocation.balance.prefer_primary` (Boolean): When set to `true`, OpenSearch attempts to evenly distribute the primary shards between the cluster nodes. Enabling this setting does not always guarantee an equal number of primary shards on each node, especially in the event of failover. Changing this setting to `false` after it was set to `true` does not invoke redistribution of primary shards. Default is `false`. +- `cluster.routing.allocation.balance.prefer_primary` (Dynamic, Boolean): When set to `true`, OpenSearch attempts to evenly distribute the primary shards between the cluster nodes. Enabling this setting does not always guarantee an equal number of primary shards on each node, especially in the event of a failover. Changing this setting to `false` after it was set to `true` does not invoke redistribution of primary shards. Default is `false`. -- `cluster.routing.allocation.rebalance.primary.enable` (Boolean): When set to `true`, OpenSearch attempts to rebalance the primary shards between the cluster nodes. When enabled, the cluster tries to maintain the number of primary shards on each node, with the maximum buffer defined by the `cluster.routing.allocation.rebalance.primary.buffer` setting. Changing this setting to `false` after it was set to `true` does not invoke the redistribution of primary shards. Default is `false`. +- `cluster.routing.allocation.rebalance.primary.enable` (Dynamic, Boolean): When set to `true`, OpenSearch attempts to rebalance the primary shards between the cluster nodes. When enabled, the cluster tries to maintain the number of primary shards on each node, with the maximum buffer defined by the `cluster.routing.allocation.rebalance.primary.buffer` setting. Changing this setting to `false` after it was set to `true` does not invoke the redistribution of primary shards. Default is `false`. -- `cluster.routing.allocation.rebalance.primary.buffer` (Floating point): Defines the maximum allowed buffer of primary shards between nodes when `cluster.routing.allocation.rebalance.primary.enable` is enabled. Default is `0.1`. +- `cluster.routing.allocation.rebalance.primary.buffer` (Dynamic, floating point): Defines the maximum allowed buffer of primary shards between nodes when `cluster.routing.allocation.rebalance.primary.enable` is enabled. Default is `0.1`. -- `cluster.routing.allocation.disk.threshold_enabled` (Boolean): When set to `false`, disables the disk allocation decider. This will also remove any existing `index.blocks.read_only_allow_delete index blocks` when disabled. Default is `true`. +- `cluster.routing.allocation.disk.threshold_enabled` (Dynamic, Boolean): When set to `false`, disables the disk allocation decider. This will also remove any existing `index.blocks.read_only_allow_delete index blocks` when disabled. Default is `true`. -- `cluster.routing.allocation.disk.watermark.low` (String): Controls the low watermark for disk usage. When set to a percentage, OpenSearch will not allocate shards to nodes with that percentage of disk used. This can also be entered as ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. 
This setting does not affect the primary shards of newly created indexes, but will prevent their replicas from being allocated. Default is `85%`. +- `cluster.routing.allocation.disk.watermark.low` (Dynamic, string): Controls the low watermark for disk usage. When set to a percentage, OpenSearch will not allocate shards to nodes with that percentage of disk usage. This can also be entered as a ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. This setting does not affect the primary shards of newly created indexes but will prevent their replicas from being allocated. Default is `85%`. -- `cluster.routing.allocation.disk.watermark.high` (String): Controls the high watermark. OpenSearch will attempt to relocate shards away from a node whose disk usage is above the percentage defined. This can also be entered as a ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. This setting affects the allocation of all shards. Default is `90%`. +- `cluster.routing.allocation.disk.watermark.high` (Dynamic, string): Controls the high watermark. OpenSearch will attempt to relocate shards away from a node whose disk usage is above the defined percentage. This can also be entered as a ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. This setting affects the allocation of all shards. Default is `90%`. -- `cluster.routing.allocation.disk.watermark.flood_stage` (String): Controls the flood stage watermark. This is a last resort to prevent nodes from running out of disk space. OpenSearch enforces a read-only index block (`index.blocks.read_only_allow_delete`) on every index that has one or more shards allocated on the node and that has at least one disk exceeding the flood stage. The index block is released once the disk utilization falls below the high watermark. This can also be entered as a ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. Default is `95%`. +- `cluster.routing.allocation.disk.watermark.flood_stage` (Dynamic, string): Controls the flood stage watermark. This is a last resort to prevent nodes from running out of disk space. OpenSearch enforces a read-only index block (`index.blocks.read_only_allow_delete`) on every index that has one or more shards allocated on the node and at least one disk exceeding the flood stage. The index block is released once the disk utilization falls below the high watermark. This can also be entered as a ratio value, like `0.85`. Finally, this can also be set to a byte value, like `400mb`. Default is `95%`. -- `cluster.info.update.interval` (Time unit): Sets how often OpenSearch should check disk usage for each node in the cluster. Default is `30s`. +- `cluster.info.update.interval` (Dynamic, time unit): Sets how often OpenSearch should check disk usage for each node in the cluster. Default is `30s`. -- `cluster.routing.allocation.include.<attribute>` (Enum): Allocates shards to a node whose `attribute` has at least one of the included comma-separated values. +- `cluster.routing.allocation.include.<attribute>` (Dynamic, enum): Allocates shards to a node whose `attribute` contains at least one of the included comma-separated values. -- `cluster.routing.allocation.require.<attribute>` (Enum): Only allocates shards to a node whose `attribute` has all of the included comma-separated values. 
+- `cluster.routing.allocation.require.<attribute>` (Dynamic, enum): Only allocates shards to a node whose `attribute` contains all of the included comma-separated values. -- `cluster.routing.allocation.exclude.<attribute>` (Enum): Does not allocate shards to a node whose `attribute` has any of the included comma-separated values. The cluster allocation settings support the following built-in attributes. +- `cluster.routing.allocation.exclude.<attribute>` (Dynamic, enum): Does not allocate shards to a node whose `attribute` contains any of the included comma-separated values. The cluster allocation settings support the following built-in attributes. Valid values are: - `_name` – Match nodes by node name. @@ -93,15 +93,17 @@ OpenSearch supports the following cluster-level routing and shard allocation set - `_id` – Match nodes by node ID. - `_tier` – Match nodes by data tier role. -- `cluster.routing.allocation.shard_movement_strategy` (Enum): Determines the order in which shards are relocated from outgoing to incoming nodes. +- `cluster.routing.allocation.shard_movement_strategy` (Dynamic, enum): Determines the order in which shards are relocated from outgoing to incoming nodes. This setting supports the following strategies: - `PRIMARY_FIRST` – Primary shards are relocated first, before replica shards. This prioritization may help prevent a cluster's health status from going red if the relocating nodes fail during the process. - `REPLICA_FIRST` – Replica shards are relocated first, before primary shards. This prioritization may help prevent a cluster's health status from going red when carrying out shard relocation in a mixed-version, segment-replication-enabled OpenSearch cluster. In this situation, primary shards relocated to OpenSearch nodes of a newer version could try to copy segment files to replica shards on an older version of OpenSearch, which would result in shard failure. Relocating replica shards first may help to avoid this in multi-version clusters. - `NO_PREFERENCE` – The default behavior in which the order of shard relocation has no importance. -- `cluster.allocator.gateway.batch_size` (Integer): Limits the number of shards sent to data nodes in one batch for fetching any unassigned shard metadata. Default is `2000`. -- `cluster.allocator.existing_shards_allocator.batch_enabled` (Boolean): Enables batch allocation of unassigned shards that already exist on the disk as opposed to allocating one shard at a time. This reduces memory and transport overhead by fetching any unassigned shard metadata in a batch call. Default is `false`. +- `cluster.allocator.gateway.batch_size` (Dynamic, integer): Limits the number of shards sent to data nodes in a single batch to fetch any unassigned shard metadata. Default is `2000`. + +- `cluster.allocator.existing_shards_allocator.batch_enabled` (Static, Boolean): Enables batch allocation of unassigned shards that already exist on the disk, as opposed to allocating one shard at a time. This reduces memory and transport overhead by fetching any unassigned shard metadata in a batch call. Default is `false`. + ## Cluster-level shard, block, and task settings OpenSearch supports the following cluster-level shard, block, and task settings: @@ -149,3 +151,19 @@ OpenSearch supports the following cluster-level coordination settings. All setti - `cluster.fault_detection.leader_check.timeout` (Time unit): The amount of time a node waits for a response from the elected cluster manager during a leader check before deeming the check a failure. 
Valid values are from `1ms` to `60s`, inclusive. Default is `10s`. Changing this setting to a value other than the default can result in an unstable cluster. - `cluster.fault_detection.follower_check.timeout` (Time unit): The amount of time the elected cluster manager waits for a response during a follower check before deeming the check a failure. Valid values are from `1ms` to `60s`, inclusive. Default is `10s`. Changing this setting to a value other than the default can result in an unstable cluster. + +- `cluster.fault_detection.follower_check.interval` (Time unit): The amount of time that the elected cluster manager waits between sending follower checks to other nodes in the cluster. Valid values are `100ms` and higher. Default is `1000ms`. Changing this setting to a value other than the default can result in an unstable cluster. + +- `cluster.follower_lag.timeout` (Time unit): The amount of time that the elected cluster manager waits to receive acknowledgements for cluster state updates from lagging nodes. Default is `90s`. If a node does not successfully apply the cluster state update within this period of time, it is considered to have failed and is removed from the cluster. + +- `cluster.publish.timeout` (Time unit): The amount of time that the cluster manager waits for each cluster state update to be completely published to all nodes, unless `discovery.type` is set to `single-node`. Default is `30s`. + +## Cluster-level CAT response limit settings + +OpenSearch supports the following cluster-level CAT API response limit settings, all of which are dynamic: + +- `cat.indices.response.limit.number_of_indices` (Integer): Sets a limit on the number of indexes returned by the [CAT Indices API]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-indices/). The default value is `-1` (no limit). If the number of indexes in the response exceeds this limit, the API returns a `429` error. To avoid this, you can specify an index pattern filter in your query (for example, `_cat/indices/<index-pattern>`). + +- `cat.shards.response.limit.number_of_shards` (Integer): Sets a limit on the number of shards returned by the [CAT Shards API]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-shards/). The default value is `-1` (no limit). If the number of shards in the response exceeds this limit, the API returns a `429` error. To avoid this, you can specify an index pattern filter in your query (for example, `_cat/shards/<index-pattern>`). + +- `cat.segments.response.limit.number_of_indices` (Integer): Sets a limit on the number of indexes returned by the [CAT Segments API]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-segments/). The default value is `-1` (no limit). If the number of indexes in the response exceeds this limit, the API returns a `429` error. To avoid this, you can specify an index pattern filter in your query (for example, `_cat/segments/<index-pattern>`). diff --git a/_install-and-configure/configuring-opensearch/index-settings.md b/_install-and-configure/configuring-opensearch/index-settings.md index bd9b9651aa..378fb8fbff 100644 --- a/_install-and-configure/configuring-opensearch/index-settings.md +++ b/_install-and-configure/configuring-opensearch/index-settings.md @@ -79,6 +79,18 @@ OpenSearch supports the following dynamic cluster-level index settings: - `cluster.snapshot.shard.path.prefix` (String): Controls the fixed path prefix for snapshot shard-level blobs. 
This setting only applies when the repository `shard_path_type` setting is either `HASHED_PREFIX` or `HASHED_INFIX`. Default is an empty string, `""`. +- `cluster.default_number_of_replicas` (Integer): Controls the default number of replicas for indexes in the cluster. The index-level `index.number_of_replicas` setting defaults to this value if not configured. Default is `1`. + +- `cluster.thread_pool.<fixed-threadpool>.size` (Integer): Controls the sizes of both the fixed and resizable queue thread pools. Overrides the defaults provided in `opensearch.yml`. + +- `cluster.thread_pool.<scaling-threadpool>.max` (Integer): Sets the maximum size of the scaling thread pool. Overrides the default provided in `opensearch.yml`. + +- `cluster.thread_pool.<scaling-threadpool>.core` (Integer): Specifies the core size of the scaling thread pool. Overrides the default provided in `opensearch.yml`. + + +Before tuning thread pool settings dynamically, note that these are expert-level settings that can potentially destabilize your cluster. Modifying thread pool settings applies the same thread pool size to all nodes, so it's not recommended for clusters with different hardware for the same roles. Similarly, avoid tuning thread pools shared by both data nodes and cluster manager nodes. After making these changes, we recommend monitoring your cluster to ensure that it remains stable and performs as expected. +{: .warning} + ## Index-level index settings You can specify index settings at index creation. There are two types of index settings: @@ -185,7 +197,7 @@ For more information about updating settings, including supported query paramete OpenSearch supports the following dynamic index-level index settings: -- `index.number_of_replicas` (Integer): The number of replica shards each primary shard should have. For example, if you have 4 primary shards and set `index.number_of_replicas` to 3, the index has 12 replica shards. Default is 1. +- `index.number_of_replicas` (Integer): The number of replica shards each primary shard should have. For example, if you have 4 primary shards and set `index.number_of_replicas` to 3, the index has 12 replica shards. If not set, defaults to `cluster.default_number_of_replicas` (which is `1` by default). - `index.auto_expand_replicas` (String): Whether the cluster should automatically add replica shards based on the number of data nodes. Specify a lower bound and upper limit (for example, 0--9) or `all` for the upper limit. For example, if you have 5 data nodes and set `index.auto_expand_replicas` to 0--3, then the cluster does not automatically add another replica shard. However, if you set this value to `0-all` and add 2 more nodes for a total of 7, the cluster will expand to now have 6 replica shards. Default is disabled. diff --git a/_install-and-configure/install-opensearch/index.md b/_install-and-configure/install-opensearch/index.md index d0c6e242cd..94c259667a 100644 --- a/_install-and-configure/install-opensearch/index.md +++ b/_install-and-configure/install-opensearch/index.md @@ -31,7 +31,7 @@ OpenSearch Version | Compatible Java Versions | Bundled Java Version 1.0--1.2.x | 11, 15 | 15.0.1+9 1.3.x | 8, 11, 14 | 11.0.24+8 2.0.0--2.11.x | 11, 17 | 17.0.2+8 -2.12.0+ | 11, 17, 21 | 21.0.4+7 +2.12.0+ | 11, 17, 21 | 21.0.5+11 To use a different Java installation, set the `OPENSEARCH_JAVA_HOME` or `JAVA_HOME` environment variable to the Java install location. 
For example: ```bash diff --git a/_install-and-configure/plugins.md b/_install-and-configure/plugins.md index 3a5d6a1834..e96b29e822 100644 --- a/_install-and-configure/plugins.md +++ b/_install-and-configure/plugins.md @@ -181,7 +181,7 @@ Continue with installation? [y/N]y ### Install a plugin using Maven coordinates -The `opensearch-plugin install` tool also allows you to specify Maven coordinates for available artifacts and versions hosted on [Maven Central](https://search.maven.org/search?q=org.opensearch.plugin). The tool parses the Maven coordinates you provide and constructs a URL. As a result, the host must be able to connect directly to the Maven Central site. The plugin installation fails if you pass coordinates to a proxy or local repository. +The `opensearch-plugin install` tool also allows you to specify Maven coordinates for available artifacts and versions hosted on [Maven Central](https://central.sonatype.com/namespace/org.opensearch.plugin). The tool parses the Maven coordinates you provide and constructs a URL. As a result, the host must be able to connect directly to the Maven Central site. The plugin installation fails if you pass coordinates to a proxy or local repository. #### Usage ```bash diff --git a/_install-and-configure/upgrade-opensearch/appendix/rolling-upgrade-lab.md b/_install-and-configure/upgrade-opensearch/appendix/rolling-upgrade-lab.md index 924900dbc8..c467601f1c 100644 --- a/_install-and-configure/upgrade-opensearch/appendix/rolling-upgrade-lab.md +++ b/_install-and-configure/upgrade-opensearch/appendix/rolling-upgrade-lab.md @@ -169,12 +169,12 @@ This section can be broken down into two parts: {% include copy.html %} 1. Next, download the bulk data that you will ingest into this index: ```bash - wget https://raw.githubusercontent.com/opensearch-project/documentation-website/main/assets/examples/ecommerce.json + wget https://raw.githubusercontent.com/opensearch-project/documentation-website/main/assets/examples/ecommerce.ndjson ``` {% include copy.html %} 1. Use the [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/) API to create an index using the mappings defined in `ecommerce-field_mappings.json`: ```bash - curl -H "Content-Type: application/x-ndjson" \ + curl -H "Content-Type: application/json" \ -X PUT "https://localhost:9201/ecommerce?pretty" \ --data-binary "@ecommerce-field_mappings.json" \ -ku admin:<custom-admin-password> @@ -188,11 +188,11 @@ This section can be broken down into two parts: "index" : "ecommerce" } ``` -1. Use the [Bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) API to add data to the new ecommerce index from `ecommerce.json`: +1. 
Use the [Bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) API to add data to the new ecommerce index from `ecommerce.ndjson`: ```bash curl -H "Content-Type: application/x-ndjson" \ -X PUT "https://localhost:9201/ecommerce/_bulk?pretty" \ - --data-binary "@ecommerce.json" \ + --data-binary "@ecommerce.ndjson" \ -ku admin:<custom-admin-password> ``` {% include copy.html %} diff --git a/_layouts/default.html b/_layouts/default.html index d4d40d8cc4..7f2bf0a2a8 100755 --- a/_layouts/default.html +++ b/_layouts/default.html @@ -87,6 +87,8 @@ {% assign section = site.clients_collection.collections %} {% elsif page.section == "benchmark" %} {% assign section = site.benchmark_collection.collections %} + {% elsif page.section == "migration-assistant" %} + {% assign section = site.migration_assistant_collection.collections %} {% endif %} {% if section %} diff --git a/_layouts/search_layout.html b/_layouts/search_layout.html index 47b8f25d1c..a2702573ae 100644 --- a/_layouts/search_layout.html +++ b/_layouts/search_layout.html @@ -20,55 +20,70 @@ <div class="main-content-wrap-home"> <div id="main-content" class="main-content" role="main"> - <head> - <meta charset="UTF-8"> - <meta name="viewport" content="width=device-width, initial-scale=1.0"> - <title>Results Page Head from layout - + + + + Results Page Head from layout + + + -
+ + {% include footer.html %} @@ -97,10 +112,7 @@

element.value).join(','); const urlPath = window.location.pathname; const versionMatch = urlPath.match(/(\d+\.\d+)/); const docsVersion = versionMatch ? versionMatch[1] : "latest"; @@ -139,11 +151,12 @@

{ + categoryBlog.addEventListener('change', () => { + updateAllCheckbox(); + triggerSearch(searchInput.value.trim()); + }); + categoryEvent.addEventListener('change', () => { updateAllCheckbox(); triggerSearch(searchInput.value.trim()); }); diff --git a/_migration-assistant/deploying-migration-assistant/configuration-options.md b/_migration-assistant/deploying-migration-assistant/configuration-options.md new file mode 100644 index 0000000000..7097d7e90e --- /dev/null +++ b/_migration-assistant/deploying-migration-assistant/configuration-options.md @@ -0,0 +1,175 @@ +--- +layout: default +title: Configuration options +nav_order: 15 +parent: Deploying Migration Assistant +--- + +# Configuration options + +This page outlines the configuration options for three key migrations scenarios: + +1. **Metadata migration** +2. **Backfill migration with `Reindex-from-Snapshot` (RFS)** +3. **Live capture migration with Capture and Replay (C&R)** + +Each of these migrations depends on either a snapshot or a capture proxy. The following example `cdk.context.json` configurations are used by AWS Cloud Development Kit (AWS CDK) to deploy and configure Migration Assistant for OpenSearch, shown as separate blocks for each migration type. If you are performing a migration applicable to multiple scenarios, these options can be combined. + + +For a complete list of configuration options, see [opensearch-migrations-options.md](https://github.com/opensearch-project/opensearch-migrations/blob/main/deployment/cdk/opensearch-service-migration/options.md). If you need a configuration option that is not found on this page, create an issue in the [OpenSearch Migrations repository](https://github.com/opensearch-project/opensearch-migrations/issues). +{: .tip } + +Options for the source cluster endpoint, target cluster endpoint, and existing virtual private cloud (VPC) should be configured in order for the migration tools to function effectively. + +## Shared configuration options + +Each migration configuration shares the following options. + + +| Name | Example | Description | +| :--- | :--- | :--- | +| `sourceClusterEndpoint` | `"https://source-cluster.elb.us-east-1.endpoint.com"` | The endpoint for the source cluster. | +| `targetClusterEndpoint` | `"https://vpc-demo-opensearch-cluster-cv6hggdb66ybpk4kxssqt6zdhu.us-west-2.es.amazonaws.com:443"` | The endpoint for the target cluster. Required if using an existing target cluster for the migration instead of creating a new one. | +| `vpcId` | `"vpc-123456789abcdefgh"` | The ID of the existing VPC in which the migration resources will be stored. The VPC must have at least two private subnets that span two Availability Zones. | + + +## Backfill migration using RFS + +The following CDK performs a backfill migrations using RFS: + +```json +{ + "backfill-migration": { + "stage": "dev", + "vpcId": , + "sourceCluster": { + "endpoint": , + "version": "ES 7.10", + "auth": {"type": "none"} + }, + "targetCluster": { + "endpoint": , + "auth": { + "type": "basic", + "username": , + "passwordFromSecretArn": + } + }, + "reindexFromSnapshotServiceEnabled": true, + "reindexFromSnapshotExtraArgs": "", + "artifactBucketRemovalPolicy": "DESTROY" + } +} +``` +{% include copy.html %} + +Performing an RFS backfill migration requires an existing snapshot. + + +The RFS configuration uses the following options. All options are optional. 
+ +| Name | Example | Description | +| :--- | :--- | :--- | +| `reindexFromSnapshotServiceEnabled` | `true` | Enables deployment and configuration of the RFS ECS service. | +| `reindexFromSnapshotExtraArgs` | `"--target-aws-region us-east-1 --target-aws-service-signing-name es"` | Extra arguments for the Document Migration command, with space separation. See [RFS Extra Arguments](https://github.com/opensearch-project/opensearch-migrations/blob/main/DocumentsFromSnapshotMigration/README.md#arguments) for more information. You can pass `--no-insecure` to remove the `--insecure` flag. | + +To view all available arguments for `reindexFromSnapshotExtraArgs`, see [Snapshot migrations README](https://github.com/opensearch-project/opensearch-migrations/blob/main/DocumentsFromSnapshotMigration/README.md#arguments). At a minimum, no extra arguments may be needed. + +## Live capture migration with C&R + +The following sample CDK performs a live capture migration with C&R: + +```json +{ + "live-capture-migration": { + "stage": "dev", + "vpcId": , + "sourceCluster": { + "endpoint": , + "version": "ES 7.10", + "auth": {"type": "none"} + }, + "targetCluster": { + "endpoint": , + "auth": { + "type": "basic", + "username": , + "passwordFromSecretArn": + } + }, + "captureProxyServiceEnabled": true, + "captureProxyExtraArgs": "", + "trafficReplayerServiceEnabled": true, + "trafficReplayerExtraArgs": "", + "artifactBucketRemovalPolicy": "DESTROY" + } +} +``` +{% include copy.html %} + +Performing a live capture migration requires that a Capture Proxy be configured to capture incoming traffic and send it to the target cluster using the Traffic Replayer service. For arguments available in `captureProxyExtraArgs`, refer to the `@Parameter` fields [here](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/src/main/java/org/opensearch/migrations/trafficcapture/proxyserver/CaptureProxy.java). For `trafficReplayerExtraArgs`, refer to the `@Parameter` fields [here](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficReplayer/src/main/java/org/opensearch/migrations/replay/TrafficReplayer.java). At a minimum, no extra arguments may be needed. + + +| Name | Example | Description | +| :--- | :--- | :--- | +| `captureProxyServiceEnabled` | `true` | Enables the Capture Proxy service deployment using an AWS CloudFormation stack. | +| `captureProxyExtraArgs` | `"--suppressCaptureForHeaderMatch user-agent .*elastic-java/7.17.0.*"` | Extra arguments for the Capture Proxy command, including options specified by the [Capture Proxy](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/src/main/java/org/opensearch/migrations/trafficcapture/proxyserver/CaptureProxy.java). | +| `trafficReplayerServiceEnabled` | `true` | Enables the Traffic Replayer service deployment using a CloudFormation stack. | +| `trafficReplayerExtraArgs` | `"--sigv4-auth-header-service-region es,us-east-1 --speedup-factor 5"` | Extra arguments for the Traffic Replayer command, including options for auth headers and other parameters specified by the [Traffic Replayer](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficReplayer/src/main/java/org/opensearch/migrations/replay/TrafficReplayer.java). 
| + + +For arguments available in `captureProxyExtraArgs`, see the `@Parameter` fields in [`CaptureProxy.java`](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/src/main/java/org/opensearch/migrations/trafficcapture/proxyserver/CaptureProxy.java). For `trafficReplayerExtraArgs`, see the `@Parameter` fields in [TrafficReplayer.java](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficReplayer/src/main/java/org/opensearch/migrations/replay/TrafficReplayer.java). + + +## Cluster authentication options + +Both the source and target cluster can use no authentication, authentication limited to VPC, basic authentication with a username and password, or AWS Signature Version 4 scoped to a user or role. + +### No authentication + +```json + "sourceCluster": { + "endpoint": , + "version": "ES 7.10", + "auth": {"type": "none"} + } +``` +{% include copy.html %} + +### Basic authentication + +```json + "sourceCluster": { + "endpoint": , + "version": "ES 7.10", + "auth": { + "type": "basic", + "username": , + "passwordFromSecretArn": + } + } +``` +{% include copy.html %} + +### Signature Version 4 authentication + +```json + "sourceCluster": { + "endpoint": , + "version": "ES 7.10", + "auth": { + "type": "sigv4", + "region": "us-east-1", + "serviceSigningName": "es" + } + } +``` +{% include copy.html %} + +The `serviceSigningName` can be `es` for an Elasticsearch or OpenSearch domain, or `aoss` for an OpenSearch Serverless collection. + +All of these authentication options apply to both source and target clusters. + +## Network configuration + +The migration tooling expects the source cluster, target cluster, and migration resources to exist in the same VPC. If this is not the case, manual networking setup outside of this documentation is likely required. diff --git a/_migration-assistant/deploying-migration-assistant/iam-and-security-groups-for-existing-clusters.md b/_migration-assistant/deploying-migration-assistant/iam-and-security-groups-for-existing-clusters.md new file mode 100644 index 0000000000..331b99e1fa --- /dev/null +++ b/_migration-assistant/deploying-migration-assistant/iam-and-security-groups-for-existing-clusters.md @@ -0,0 +1,70 @@ +--- +layout: default +title: IAM and security groups for existing clusters +nav_order: 20 +parent: Deploying Migration Assistant +--- + +# IAM and security groups for existing clusters + +This page outlines security scenarios for using the migration tools with existing clusters, including any necessary configuration changes to ensure proper communication between them. + +## Importing an Amazon OpenSearch Service or Amazon OpenSearch Serverless target cluster + +Use the following scenarios for Amazon OpenSearch Service or Amazon OpenSearch Serverless target clusters. + +### OpenSearch Service + +For an OpenSearch Domain, two main configurations are typically required to ensure proper functioning of the migration solution: + +1. **Security Group Configuration** + + The domain should have a security group that allows communication from the applicable migration services (Traffic Replayer, Migration Console, `Reindex-from-Snapshot`). The CDK automatically creates an `osClusterAccessSG` security group, which is applied to the migration services. The user should then add this security group to their existing domain to allow access. + +2. **Access Policy Configuration** should be one of the following: + - An open access policy that allows all access. 
+ - Configured to allow at least the AWS Identity and Access Management (IAM) task roles for the applicable migration services (Traffic Replayer, Migration Console, `Reindex-from-Snapshot`) to access the domain. + +### OpenSearch Serverless + +For an OpenSearch Serverless Collection, you will need to configure both network and data access policies: + +1. **Network Policy Configuration**: + The Collection should have a network policy that uses the `VPC` access type. This requires creating a VPC endpoint on the VPC used for the solution. The VPC endpoint should be configured for the private subnets of the VPC and should attach the `osClusterAccessSG` security group. + +2. **Data Access Policy Configuration**: + The data access policy should grant permission to perform all [index operations](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-data-access.html#serverless-data-supported-permissions) (`aoss:*`) for all indexes in the Collection. The IAM task roles of the applicable Migration services (Traffic Replayer, migration console, `Reindex-from-Snapshot`) should be used as the principals for this data access policy. + +## Capture Proxy on Coordinator Nodes of Source Cluster + +Although the CDK does not automatically set up the Capture Proxy on source cluster nodes (except in the demo solution), the Capture Proxy instances must communicate with the resources deployed by the CDK, such as Kafka. This section outlines the necessary steps to set up communication. + +Before [setting up Capture Proxy instances](https://github.com/opensearch-project/opensearch-migrations/tree/main/TrafficCapture/trafficCaptureProxyServer#installing-capture-proxy-on-coordinator-nodes) on the source cluster, ensure the following configurations are in place: + +1. **Security Group Configuration**: + The coordinator nodes should add the `trafficStreamSourceSG` security group to allow sending captured traffic to Kafka. + +2. **IAM Policy Configuration**: + The IAM role used by the coordinator nodes should have permissions to publish captured traffic to Kafka. You can add the following template policy through the AWS Console (IAM Role → Add permissions → Create inline policy → JSON view): + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": "kafka-cluster:Connect", + "Resource": "arn:aws:kafka:::cluster/migration-msk-cluster-/*", + "Effect": "Allow" + }, + { + "Action": [ + "kafka-cluster:CreateTopic", + "kafka-cluster:DescribeTopic", + "kafka-cluster:WriteData" + ], + "Resource": "arn:aws:kafka:::topic/migration-msk-cluster-/*", + "Effect": "Allow" + } + ] +} +``` diff --git a/_migration-assistant/deploying-migration-assistant/index.md b/_migration-assistant/deploying-migration-assistant/index.md new file mode 100644 index 0000000000..1c559a81b1 --- /dev/null +++ b/_migration-assistant/deploying-migration-assistant/index.md @@ -0,0 +1,13 @@ +--- +layout: default +title: Deploying Migration Assistant +nav_order: 15 +has_children: true +permalink: /deploying-migration-assistant/ +redirect-from: + - /deploying-migration-assistant/index/ +--- + +# Deploying Migration Assistant + +This section provides information about the available options for deploying Migration Assistant. 
diff --git a/_migration-assistant/getting-started-data-migration.md b/_migration-assistant/getting-started-data-migration.md
new file mode 100644
index 0000000000..4110f29edf
--- /dev/null
+++ b/_migration-assistant/getting-started-data-migration.md
@@ -0,0 +1,353 @@
+---
+layout: default
+title: Getting started with data migration
+nav_order: 10
+redirect_from:
+  - /upgrade-to/upgrade-to/
+  - /upgrade-to/snapshot-migrate/
+---
+
+# Getting started with data migration
+
+This quickstart outlines how to deploy Migration Assistant for OpenSearch and migrate existing data using `Reindex-from-Snapshot` (RFS). It uses AWS for illustrative purposes. However, the steps can be modified for use with other cloud providers.
+
+## Prerequisites and assumptions
+
+Before using this quickstart, make sure you fulfill the following prerequisites:
+
+* Verify that your migration path [is supported]({{site.url}}{{site.baseurl}}/migration-assistant/is-migration-assistant-right-for-you/#migration-paths). Note that we test with the exact versions specified, but you should be able to migrate data on alternative minor versions as long as the major version is supported.
+* The source cluster must be deployed, with the Amazon Simple Storage Service (Amazon S3) plugin installed.
+* The target cluster must be deployed.
+
+The steps in this guide assume the following:
+
+* A snapshot will be taken and stored in Amazon S3. The following assumptions are made about this snapshot:
+    * The `_source` flag is enabled on all indexes to be migrated.
+    * The snapshot includes the global cluster state (`include_global_state` is `true`).
+    * Shard sizes of up to approximately 80 GB are supported. Larger shards cannot be migrated. If this presents challenges for your migration, contact the [migration team](https://opensearch.slack.com/archives/C054JQ6UJFK).
+* Migration Assistant will be installed in the same AWS Region and have access to both the source snapshot and target cluster.
+
+---
+
+## Step 1: Install Bootstrap on an Amazon EC2 instance (~10 minutes)
+
+To begin your migration, use the following steps to install a `bootstrap` box on an Amazon Elastic Compute Cloud (Amazon EC2) instance. The instance uses AWS CloudFormation to create and manage the stack.
+
+1. Log in to the target AWS account in which you want to deploy Migration Assistant.
+2. From the browser where you are logged in to your target AWS account, right-click [here](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/new?templateURL=https://solutions-reference.s3.amazonaws.com/migration-assistant-for-amazon-opensearch-service/latest/migration-assistant-for-amazon-opensearch-service.template&redirectId=SolutionWeb) to load the CloudFormation template in a new browser tab.
+3. Follow the CloudFormation stack wizard:
+    * **Stack Name:** `MigrationBootstrap`
+    * **Stage Name:** `dev`
+    * Choose **Next** after each step > **Acknowledge** > **Submit**.
+4. Verify that the Bootstrap stack exists and is set to `CREATE_COMPLETE`. This process takes around 10 minutes to complete.
+
+---
+
+## Step 2: Set up Bootstrap instance access (~5 minutes)
+
+Use the following steps to set up Bootstrap instance access:
+
+1. After deployment, find the EC2 instance ID for the `bootstrap-dev-instance`.
+2.
Create an AWS Identity and Access Management (IAM) policy using the following snippet, replacing ``, ``, ``, and `` with your information: + + ```json + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "ssm:StartSession", + "Resource": [ + "arn:aws:ec2:::instance/", + "arn:aws:ssm:::document/BootstrapShellDoc--" + ] + } + ] + } + ``` + {% include copy.html %} + +3. Name the policy, for example, `SSM-OSMigrationBootstrapAccess`, and then create the policy by selecting **Create policy**. + +--- + +## Step 3: Log in to Bootstrap and building Migration Assistant (~15 minutes) + +Next, log in to Bootstrap and build Migration Assistant using the following steps. + +### Prerequisites + +To use these steps, make sure you fulfill the following prerequisites: + +* The AWS Command Line Interface (AWS CLI) and AWS Session Manager plugin are installed on your instance. +* The AWS credentials are configured (`aws configure`) for your instance. + +### Steps + +1. Load AWS credentials into your terminal. +2. Log in to the instance using the following command, replacing `` and `` with your instance ID and Region: + + ```bash + aws ssm start-session --document-name BootstrapShellDoc-- --target --region [--profile ] + ``` + {% include copy.html %} + +3. Once logged in, run the following command from the shell of the Bootstrap instance in the `/opensearch-migrations` directory: + + ```bash + ./initBootstrap.sh && cd deployment/cdk/opensearch-service-migration + ``` + {% include copy.html %} + +4. After a successful build, note the path for infrastructure deployment, which will be used in the next step. + +--- + +## Step 4: Configure and deploy RFS (~20 minutes) + +Use the following steps to configure and deploy RFS: + +1. Add the target cluster password to AWS Secrets Manager as an unstructured string. Be sure to copy the secret Amazon Resource Name (ARN) for use during deployment. +2. From the same shell as the Bootstrap instance, modify the `cdk.context.json` file located in the `/opensearch-migrations/deployment/cdk/opensearch-service-migration` directory: + + ```json + { + "migration-assistant": { + "vpcId": "", + "targetCluster": { + "endpoint": "", + "auth": { + "type": "basic", + "username": "", + "passwordFromSecretArn": "" + } + }, + "sourceCluster": { + "endpoint": "", + "auth": { + "type": "basic", + "username": "", + "passwordFromSecretArn": "" + } + }, + "reindexFromSnapshotExtraArgs": "", + "stage": "dev", + "otelCollectorEnabled": true, + "migrationConsoleServiceEnabled": true, + "reindexFromSnapshotServiceEnabled": true, + "migrationAssistanceEnabled": true + } + } + ``` + {% include copy.html %} + + The source and target cluster authorization can be configured to have no authorization, `basic` with a username and password, or `sigv4`. + +3. Bootstrap the account with the following command: + + ```bash + cdk bootstrap --c contextId=migration-assistant --require-approval never + ``` + {% include copy.html %} + +4. Deploy the stacks: + + ```bash + cdk deploy "*" --c contextId=migration-assistant --require-approval never --concurrency 5 + ``` + {% include copy.html %} + +5. Verify that all CloudFormation stacks were installed successfully. + +### RFS parameters + +If you're creating a snapshot using migration tooling, these parameters are automatically configured. 
If you're using an existing snapshot, modify the `reindexFromSnapshotExtraArgs` setting with the following values: + + ```bash + --s3-repo-uri s3:/// --s3-region --snapshot-name + ``` + +You will also need to give the `migrationconsole` and `reindexFromSnapshot` TaskRoles permissions to the S3 bucket. + +--- + +## Step 5: Deploy Migration Assistant + +To deploy Migration Assistant, use the following steps: + +1. Bootstrap the account: + + ```bash + cdk bootstrap --c contextId=migration-assistant --require-approval never --concurrency 5 + ``` + {% include copy.html %} + +2. Deploy the stacks when `cdk.context.json` is fully configured: + + ```bash + cdk deploy "*" --c contextId=migration-assistant --require-approval never --concurrency 3 + ``` + {% include copy.html %} + +These commands deploy the following stacks: + +* Migration Assistant network stack +* `Reindex-from-snapshot` stack +* Migration console stack + +--- + +## Step 6: Access the migration console + +Run the following command to access the migration console: + +```bash +./accessContainer.sh migration-console dev +``` +{% include copy.html %} + + +`accessContainer.sh` is located in `/opensearch-migrations/deployment/cdk/opensearch-service-migration/` on the Bootstrap instance. To learn more, see [Accessing the migration console]. +`{: .note} + +--- + +## Step 7: Verify the connection to the source and target clusters + +To verify the connection to the clusters, run the following command: + +```bash +console clusters connection-check +``` +{% include copy.html %} + +You should receive the following output: + +```bash +* **Source Cluster:** Successfully connected! +* **Target Cluster:** Successfully connected! +``` + +To learn more about migration console commands, see [Migration commands]. + +--- + +## Step 8: Create a snapshot + +Run the following command to initiate snapshot creation from the source cluster: + +```bash +console snapshot create [...] +``` +{% include copy.html %} + +To check the snapshot creation status, run the following command: + +```bash +console snapshot status [...] +``` +{% include copy.html %} + +To learn more information about the snapshot, run the following command: + +```bash +console snapshot status --deep-check [...] +``` +{% include copy.html %} + +Wait for snapshot creation to complete before moving to step 9. + +To learn more about snapshot creation, see [Snapshot Creation]. + +--- + +## Step 9: Migrate metadata + +Run the following command to migrate metadata: + +```bash +console metadata migrate [...] +``` +{% include copy.html %} + +For more information, see [Migrating metadata]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/). + +--- + +## Step 10: Migrate documents with RFS + +You can now use RFS to migrate documents from your original cluster: + +1. To start the migration from RFS, start a `backfill` using the following command: + + ```bash + console backfill start + ``` + {% include copy.html %} + +2. _(Optional)_ To speed up the migration, increase the number of documents processed at a simultaneously by using the following command: + + ```bash + console backfill scale + ``` + {% include copy.html %} + +3. To check the status of the documentation backfill, use the following command: + + ```bash + console backfill status + ``` + {% include copy.html %} + +4. 
If you need to stop the backfill process, use the following command: + + ```bash + console backfill stop + ``` + {% include copy.html %} + +For more information, see [Backfill]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/). + +--- + +## Step 11: Backfill monitoring + +Use the following command for detailed monitoring of the backfill process: + +```bash +console backfill status --deep-check +``` +{% include copy.html %} + +You should receive the following output: + +```json +BackfillStatus.RUNNING +Running=9 +Pending=1 +Desired=10 +Shards total: 62 +Shards completed: 46 +Shards incomplete: 16 +Shards in progress: 11 +Shards unclaimed: 5 +``` + +Logs and metrics are available in Amazon CloudWatch in the `OpenSearchMigrations` log group. + +--- + +## Step 12: Verify that all documents were migrated + +Use the following query in CloudWatch Logs Insights to identify failed documents: + +```bash +fields @message +| filter @message like "Bulk request succeeded, but some operations failed." +| sort @timestamp desc +| limit 10000 +``` +{% include copy.html %} + +If any failed documents are identified, you can index the failed documents directly as opposed to using RFS. + diff --git a/_migration-assistant/index.md b/_migration-assistant/index.md new file mode 100644 index 0000000000..f024fdb69c --- /dev/null +++ b/_migration-assistant/index.md @@ -0,0 +1,75 @@ +--- +layout: default +title: Migration Assistant for OpenSearch +nav_order: 1 +has_children: false +nav_exclude: true +has_toc: false +permalink: /migration-assistant/ +redirect_from: + - /migration-assistant/index/ + - /upgrade-to/index/ + - /upgrade-to/ +--- + +# Migration Assistant for OpenSearch + +Migration Assistant for OpenSearch aids you in successfully performing an end-to-end, zero-downtime migration to OpenSearch from other search providers. It helps with the following scenarios: + +- **Metadata migration**: Migrating cluster metadata, such as index settings, aliases, and templates. +- **Backfill migration**: Migrating existing or historical data from a source to a target cluster. +- **Live traffic migration**: Replicating live ongoing traffic from a source to a target cluster. +- **Comparative tooling**: Comparing the performance and behaviors of an existing cluster with a prospective new one. + +This user guide focuses on conducting a comprehensive migration involving both existing and live data with zero downtime and the option to back out of a migration. + +It's crucial to note that migration strategies are not universally applicable. This guide provides a detailed methodology, based on certain assumptions detailed throughout, emphasizing the importance of robust engineering practices to ensure a successful migration. +{: .tip } + +## Key components + +The following are the key components of Migration Assistant. + +### Elasticsearch/OpenSearch source + +Your source cluster in this solution operates on Elasticsearch or OpenSearch, hosted on EC2 instances or similar computing environments. A proxy is set up to interact with this source cluster, either positioned in front of or directly on the coordinating nodes of the cluster. + +### Migration management console + +A console that provides a migration-specific CLI and offers a variety of tools to streamline the migration process. Everything necessary for completing a migration, other than cleaning up the migration resources, can be done via this Console. + +### Traffic capture proxy + +This component is designed for HTTP RESTful traffic. 
It forwards traffic to the source cluster and also splits and channels this traffic to a stream processing service for later playback. + +### Traffic Replayer + +Acting as a traffic simulation tool, the Traffic Replayer replays recorded request traffic to a target cluster, mirroring source traffic patterns. It links original requests and their responses to those directed at the target cluster, facilitating comparative analysis. + +### Metadata migration tool + +The Metadata migration tool integrated into the Migration CLI can be used independently to migrate cluster metadata, including index mappings, index configuration settings, templates, component templates, and aliases. + +### reindex-from-snapshot + +`Reindex-from-Snapshot` (RFS) reindexes data from an existing snapshot. Workers on Amazon Elastic Container Service (Amazon ECS) coordinate the migration of documents from an existing snapshot, reindexing the documents in parallel to a target cluster. + +### Target cluster + +The destination cluster for migration or comparison in an A/B test. + +## Architecture overview + +The Migration Assistant architecture is based on the use of an AWS Cloud infrastructure, but most tools are designed to be cloud independent. A local containerized version of this solution is also available. + +The design deployed in AWS is as follows: + +![Migration architecture overview]({{site.url}}{{site.baseurl}}/images/migrations/migrations-architecture-overview.png) + +1. Client traffic is directed to the existing cluster. +2. An Application Load Balancer with capture proxies relays traffic to a source while replicating data to Amazon Managed Streaming for Apache Kafka (Amazon MSK). +3. Using the migration console, you can initiate metadata migration to establish indexes, templates, component templates, and aliases on the target cluster. +4. With continuous traffic capture in place, you can use a `reindex-from-snapshot` process to capture data from your current index. +4. Once `reindex-from-snapshot` is complete, captured traffic is replayed from Amazon MSK to the target cluster by the traffic replayer. +5. Performance and behavior of traffic sent to the source and target clusters are compared by reviewing logs and metrics. +6. After confirming that the target cluster's functionality meets expectations, clients are redirected to the new target. diff --git a/_migration-assistant/is-migration-assistant-right-for-you.md b/_migration-assistant/is-migration-assistant-right-for-you.md new file mode 100644 index 0000000000..073c2b6cd7 --- /dev/null +++ b/_migration-assistant/is-migration-assistant-right-for-you.md @@ -0,0 +1,58 @@ +--- +layout: default +title: Is Migration Assistant right for you? +nav_order: 5 +--- + +# Is Migration Assistant right for you? + +Before deciding if Migration Assistant is right for you, it's important to assess your specific needs and understand the available tools for performing an upgrade or migration. + +Migration Assistant addresses gaps found in other migration solutions, but in some cases, alternative tools may be a better fit. + +For instance, if you need to upgrade more than one major version, such as moving from Elasticsearch 6.8 to OpenSearch 2.15, Migration Assistant allows you to do this in a single hop. In contrast, other options like rolling upgrades or snapshot restore would require multiple steps because they cannot handle major version jumps without reindexing your data. 
Additionally, if you need to capture live traffic and perform a zero-downtime migration, Migration Assistant would be the right choice. + +There are also tools available for migrating cluster configuration, templates, and aliases, which can be helpful depending on the complexity of your setup. These tools streamline the migration process by preserving critical settings and custom configurations. + +## Migration paths + +| **Source Version** | **Target Version** | +|-----------------------------|----------------------------------| +| Elasticsearch 6.8 | OpenSearch 1.3 | +| Elasticsearch 6.8 | OpenSearch 2.15 | +| Elasticsearch 7.10.2 | OpenSearch 1.3 | +| Elasticsearch 7.10.2 | OpenSearch 2.15 | +| Elasticsearch 7.17 | OpenSearch 1.3 | +| Elasticsearch 7.17 | OpenSearch 2.15 | +| OpenSearch 1.3 | OpenSearch 2.15 | + + +{: .note} + +### Supported source and target platforms + +* Self-managed (hosted by cloud provider or on-premises) +* AWS OpenSearch + +The tooling is designed to work with other cloud provider platforms, but it is not officially tested with these other platforms. If you would like to add support, please contact one of the maintainers on [GitHub](https://github.com/opensearch-project/opensearch-migrations/blob/main/MAINTAINERS.md). + +### Future migration paths + +To see the OpenSearch migrations roadmap, go to [OpenSearch Migrations - Roadmap](https://github.com/orgs/opensearch-project/projects/229/views/1). + +## Supported components + +Before starting a migration, consider the scope of the components involved. The table below outlines the components that should be considered for migration, indicates their support by the Migration Assistant, and provides comments and recommendations. + +| Component | Supported | Recommendations | +| :--- |:--- | :--- | +| **Documents** | Yes | Migrate existing data with `reindex-from-snapshot` (RFS) and live traffic with Capture and Replay. | +| **Index settings** | Yes | Migrate with the metadata migration tool. | +| **Index mappings** | Yes | Migrate with the metadata migration tool. | +| **Index templates** | Yes | Migrate with the metadata migration tool. | +| **Component templates** | Yes | Migrate with the metadata migration tool. | +| **Aliases** | Yes | Migrate with the metadata migration tool. | +| **Index State Management (ISM) policies** | Expected in 2025 | Manually migrate using an API. | +| **Elasticsearch Kibana dashboards** | Expected in 2025 | This tool is only needed when used to migrate Elasticsearch Kibana Dashboards to OpenSearch Dashboards. To start, export JSON files from Kibana and import them into OpenSearch Dashboards; before importing, use the [`dashboardsSanitizer`](https://github.com/opensearch-project/opensearch-migrations/tree/main/dashboardsSanitizer) tool on X-Pack visualizations like Canvas and Lens in Kibana Dashboards, as they may require recreation for compatibility with OpenSearch. | +| **Security constructs** | No | Configure roles and permissions based on cloud provider recommendations. For example, if using AWS, leverage AWS Identity and Access Management (IAM) for enhanced security management. | +| **Plugins** | No | Check plugin compatibility; some Elasticsearch plugins may not have direct equivalents in OpenSearch. 
| diff --git a/_migration-assistant/migration-console/accessing-the-migration-console.md b/_migration-assistant/migration-console/accessing-the-migration-console.md new file mode 100644 index 0000000000..ea66f5c04c --- /dev/null +++ b/_migration-assistant/migration-console/accessing-the-migration-console.md @@ -0,0 +1,35 @@ +--- +layout: default +title: Accessing the migration console +nav_order: 35 +parent: Migration console +--- + +# Accessing the migration console + +The Bootstrap box deployed through Migration Assistant contains a script that simplifies access to the migration console through that instance. + +To access the migration console, use the following commands: + +```shell +export STAGE=dev +export AWS_REGION=us-west-2 +/opensearch-migrations/deployment/cdk/opensearch-service-migration/accessContainer.sh migration-console ${STAGE} ${AWS_REGION} +``` +{% include copy.html %} + +When opening the console a message will appear above the command prompt, `Welcome to the Migration Assistant Console`. + +On a machine with the [AWS Command Line Interface (AWS CLI)](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) and the [AWS Session Manager plugin](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html), you can directly connect to the migration console. Ensure that you've run `aws configure` with credentials that have access to the environment. + +Use the following commands: + +```shell +export STAGE=dev +export SERVICE_NAME=migration-console +export TASK_ARN=$(aws ecs list-tasks --cluster migration-${STAGE}-ecs-cluster --family "migration-${STAGE}-${SERVICE_NAME}" | jq --raw-output '.taskArns[0]') +aws ecs execute-command --cluster "migration-${STAGE}-ecs-cluster" --task "${TASK_ARN}" --container "${SERVICE_NAME}" --interactive --command "/bin/bash" +``` +{% include copy.html %} + +Typically, `STAGE` is equivalent to a standard `dev` environment, but this may vary based on what the user specified during deployment. \ No newline at end of file diff --git a/_migration-assistant/migration-console/index.md b/_migration-assistant/migration-console/index.md new file mode 100644 index 0000000000..3e08e72c5c --- /dev/null +++ b/_migration-assistant/migration-console/index.md @@ -0,0 +1,16 @@ +--- +layout: default +title: Migration console +nav_order: 30 +has_children: true +permalink: /migration-console/ +redirect_from: + - /migration-console/index/ +--- + +# Migration console + +The Migrations Assistant deployment includes an Amazon Elastic Container Service (Amazon ECS) task that hosts tools that run different phases of the migration and check the progress or results of the migration. This ECS task is called the **migration console**. The migration console is a command line interface used to interact with the deployed components of the solution. + +This section provides information about how to access the migration console and what commands are supported. 
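+As a quick orientation, the following is a minimal sketch of a console session that walks through the major phases of a migration. The individual commands are described in the command reference, and the order shown here assumes a typical snapshot-then-metadata-then-backfill workflow. + +```shell +# Verify connectivity to the source and target clusters +console clusters connection-check + +# Create a snapshot of the source cluster, then migrate metadata from it +console snapshot create +console metadata migrate + +# Backfill existing documents and replay captured traffic +console backfill start +console replay start +``` +{% include copy.html %}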
+ diff --git a/_migration-assistant/migration-console/migration-console-commands-references.md b/_migration-assistant/migration-console/migration-console-commands-references.md new file mode 100644 index 0000000000..21d793b3f3 --- /dev/null +++ b/_migration-assistant/migration-console/migration-console-commands-references.md @@ -0,0 +1,131 @@ +--- +layout: default +title: Command reference +nav_order: 40 +parent: Migration console +--- + +# Migration console command reference + +Migration console commands follow this syntax: `console [component] [action]`. The components include `clusters`, `backfill`, `snapshot`, `metadata`, and `replay`. The console is configured with a registry of the deployed services and the source and target cluster, generated from the `cdk.context.json` values. + +## Commonly used commands + +The exact commands you use will depend on your use case and goals. The following are commonly used commands along with a brief description of what they do. + +### Check connection + +Reports whether both the source and target clusters can be reached and provides their versions. + +```sh +console clusters connection-check +``` +{% include copy.html %} + +### Run `cat-indices` + +Runs the `cat-indices` API on the cluster. + +```sh +console clusters cat-indices +``` +{% include copy.html %} + +### Create a snapshot + +Creates a snapshot of the source cluster and stores it in a preconfigured Amazon Simple Storage Service (Amazon S3) bucket. + +```sh +console snapshot create +``` +{% include copy.html %} + +### Check snapshot status + +Runs a detailed check on the snapshot creation status, including estimated completion time: + +```sh +console snapshot status --deep-check +``` +{% include copy.html %} + +### Evaluate metadata + +Performs a dry run of metadata migration, showing which indexes, templates, and other objects will be migrated to the target cluster. + +```sh +console metadata evaluate +``` +{% include copy.html %} + +### Migrate metadata + +Migrates the metadata from the source cluster to the target cluster. + +```sh +console metadata migrate +``` +{% include copy.html %} + +### Start a backfill + +If `Reindex-From-Snapshot` (RFS) is enabled, this command starts an instance of the service to begin moving documents to the target cluster. Similar `scale UNITS` and `stop` commands change the number of active instances for RFS: + +```sh +console backfill start +``` +{% include copy.html %} + +### Check backfill status + +Gets the current status of the backfill migration, including the number of operating instances and the progress of the shards. + + +### Start Traffic Replayer + +If Traffic Replayer is enabled, this command starts an instance of Traffic Replayer to begin replaying traffic against the target cluster. The `stop` command stops all active instances. + +```sh +console replay start +``` +{% include copy.html %} + +### Read logs + +Reads any logs that exist when running Traffic Replayer. Use tab completion on the path to fill in the available `NODE_IDs` and, if applicable, log file names. The tuple logs roll over at a certain size threshold, so there may be many files named with timestamps. The `jq` command pretty-prints each line of the tuple output before writing it to file.
+ +```sh +console tuples show --in /shared-logs-output/traffic-replayer-default/[NODE_ID]/tuples/console.log | jq > readable_tuples.json +``` +{% include copy.html %} + +## Help option + +All commands and options can be explored within the tool itself by using the `--help` option, either for the entire `console` application or for individual components (for example, `console backfill --help`). For example: + +```bash +$ console --help +Usage: console [OPTIONS] COMMAND [ARGS]... + +Options: + --config-file TEXT Path to config file + --json + -v, --verbose Verbosity level. Default is warn, -v is info, -vv is + debug. + --help Show this message and exit. + +Commands: + backfill Commands related to controlling the configured backfill... + clusters Commands to interact with source and target clusters + completion Generate shell completion script and instructions for setup. + kafka All actions related to Kafka operations + metadata Commands related to migrating metadata to the target cluster. + metrics Commands related to checking metrics emitted by the capture... + replay Commands related to controlling the replayer. + snapshot Commands to create and check status of snapshots of the... + tuples All commands related to tuples. +``` diff --git a/_migration-assistant/migration-phases/assessing-your-cluster-for-migration.md b/_migration-assistant/migration-phases/assessing-your-cluster-for-migration.md new file mode 100644 index 0000000000..5ded49eb59 --- /dev/null +++ b/_migration-assistant/migration-phases/assessing-your-cluster-for-migration.md @@ -0,0 +1,48 @@ +--- +layout: default +title: Assessing your cluster for migration +nav_order: 60 +parent: Migration phases +--- + +# Assessing your cluster for migration + + +The goal of the Migration Assistant is to streamline the process of migrating from one location or version of Elasticsearch/OpenSearch to another. However, completing a migration sometimes requires resolving client compatibility issues before they can communicate directly with the target cluster. + +## Understanding breaking changes + +Before performing any upgrade or migration, you should review any documentation of breaking changes. Even if the cluster is migrated there might be changes required for clients to connect to the new cluster + +## Upgrade and breaking changes guides + +For migrations paths between Elasticsearch 6.8 and OpenSearch 2.x users should be familiar with documentation in the links below that apply to their specific case: + +* [Upgrading Amazon Service Domains](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/version-migration.html). + +* [Changes from Elasticsearch to OpenSearch fork](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/rename.html). + +* [OpenSearch Breaking Changes](https://opensearch.org/docs/latest/breaking-changes/). + +The next step is to set up a proper test bed to verify that your applications will work as expected on the target version. + +## Impact of data transformations + +Any time you apply a transformation to your data, such as: + +- Changing index names +- Modifying field names or field mappings +- Splitting indices with type mappings + +These changes might need to be reflected in your client configurations. For example, if your clients are reliant on specific index or field names, you must ensure that their queries are updated accordingly. + +We recommend running production-like queries against the target cluster before switching over actual production traffic. 
This helps verify that the client can: + +- Communicate with the target cluster +- Locate the necessary indices and fields +- Retrieve the expected results + +For complex migrations involving multiple transformations or breaking changes, we highly recommend performing a trial migration with representative, non-production data (e.g., in a staging environment) to fully test client compatibility with the target cluster. + + + diff --git a/_migration-assistant/migration-phases/backfill.md b/_migration-assistant/migration-phases/backfill.md new file mode 100644 index 0000000000..d2ff7cd873 --- /dev/null +++ b/_migration-assistant/migration-phases/backfill.md @@ -0,0 +1,181 @@ +--- +layout: default +title: Backfill +nav_order: 90 +parent: Migration phases +--- + +# Backfill + +After the [metadata]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/) for your cluster has been migrated, you can use capture proxy data replication and snapshots to backfill your data into the next cluster. + +## Capture proxy data replication + +If you're interested in capturing live traffic during your migration, Migration Assistant includes an Application Load Balancer for routing traffic to the capture proxy and the target cluster. Upstream client traffic must be routed through the capture proxy in order to replay the requests later. Before using the capture proxy, remember the following: + +* The layer upstream from the Application Load Balancer is compatible with the certificate on the Application Load Balancer listener, whether it's for clients or a Network Load Balancer. The `albAcmCertArn` in the `cdk.context.json` may need to be provided to ensure that clients trust the Application Load Balancer certificate. +* If a Network Load Balancer is used directly upstream of the Application Load Balancer, it must use a TLS listener. +* Upstream resources and security groups must allow network access to the Migration Assistant Application Load Balancer. + +To set up the capture proxy, go to the AWS Management Console and navigate to **EC2 > Load Balancers > Migration Assistant Application Load Balancer**. Copy the Application Load Balancer URL. With the URL copied, you can use one of the following options. + + +### If you are using **Network Load Balancer → Application Load Balancer → Cluster** + +1. Ensure that ingress is provided directly to the Application Load Balancer for the capture proxy. +2. Create a target group for the Migration Assistant Application Load Balancer on port `9200`, and set the health check to `HTTPS`. +3. Associate this target group with your existing Network Load Balancer on a new listener for testing. +4. Verify that the health check is successful, and perform smoke testing with some clients through the new listener port. +5. Once you are ready to migrate all clients, detach the Migration Assistant Application Load Balancer target group from the testing Network Load Balancer listener and modify the existing Network Load Balancer listener to direct traffic to this target group. +6. Now client requests will be routed through the proxy (once they establish a new connection). Verify the application metrics. + +### If you are using **Network Load Balancer → Cluster** + +If you do not want to modify application logic, add an Application Load Balancer in front of your cluster and follow the **Network Load Balancer → Application Load Balancer → Cluster** steps. Otherwise: + +1. 
Create a target group for the Application Load Balancer on port `9200` and set the health check to `HTTPS`. +2. Associate this target group with your existing Network Load Balancer on a new listener. +3. Verify that the health check is successful, and perform smoke testing with some clients through the new listener port. +4. Once you are ready to migrate all clients, deploy a change so that clients connect to the new listener. + + +### If you are **not using a Network Load Balancer** + +If you're only using backfill as your migration technique, make a client/DNS change to route clients to the Migration Assistant Application Load Balancer on port `9200`. + + +### Kafka connection + +After you have routed clients according to your use case, verify that HTTP requests are being captured as records in Kafka using the following steps: + +In the migration console, run the following command: + + ```bash + console kafka describe-topic-records + ``` + {% include copy.html %} + + Note the records in the logging topic. + +After a short period, execute the same command again and compare the increased number of records against the expected HTTP requests. + + +## Creating a snapshot + +Create a snapshot for your backfill using the following command: + +```bash +console snapshot create +``` +{% include copy.html %} + +To check the progress of your snapshot, use the following command: + +```bash +console snapshot status --deep-check +``` +{% include copy.html %} + +Depending on the size of the data in the source cluster and the bandwidth allocated for snapshots, the process can take some time. Adjust the maximum rate at which the source cluster's nodes create the snapshot using the `--max-snapshot-rate-mb-per-node` option. Increasing the snapshot rate will consume more node resources, which may affect the cluster's ability to handle normal traffic. + +## Backfilling documents to the target cluster + +From the snapshot you created of your source cluster, you can begin backfilling documents into the target cluster. Once you have started this process, a fleet of workers will spin up to read the snapshot and reindex documents into the target cluster. This fleet of workers can be scaled to increase the speed at which documents are reindexed into the target cluster. + +### Checking the starting state of the clusters + +You can check the indexes and document counts of the source and target clusters by running the `cat-indices` command. This can be used to monitor the difference between the source and target for any migration scenario.
Check the indexes of both clusters using the following command: + +```shell +console clusters cat-indices +``` +{% include copy.html %} + +You should receive the following response: + +```shell +SOURCE CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open my-index WJPVdHNyQ1KMKol84Cy72Q 1 0 8 0 44.7kb 44.7kb + +TARGET CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open .opendistro_security N3uy88FGT9eAO7FTbLqqqA 1 0 10 0 78.3kb 78.3kb +``` + +### Starting the backfill + +Use the following command to start the backfill and deploy the workers: + +```shell +console backfill start +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```shell +BackfillStatus.RUNNING +Running=1 +Pending=0 +Desired=1 +Shards total: 48 +Shards completed: 48 +Shards incomplete: 0 +Shards in progress: 0 +Shards unclaimed: 0 +``` + +The status will be `Running` even if all the shards have been migrated. + +### Scaling up the fleet + +To speed up the transfer, you can scale the number of workers. It may take a few minutes for these additional workers to come online. The following command will update the worker fleet to a size of 5: + +```shell +console backfill scale 5 +``` +{% include copy.html %} + +We recommend slowly scaling up the fleet while monitoring the health metrics of the target cluster to avoid over-saturating it. [Amazon OpenSearch Service domains](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/monitoring.html) provide a number of metrics and logs that can provide this insight. + +### Stopping the migration + +Backfill requires manually stopping the fleet. Once all the data has been migrated, you can shut down the fleet and all its workers using the following command: + +```shell +console backfill stop +``` +{% include copy.html %} + +### Amazon CloudWatch metrics and dashboard + +Migration Assistant creates an Amazon CloudWatch dashboard that you can use to visualize the health and performance of the backfill process. It combines the metrics for the backfill workers and, for those migrating to Amazon OpenSearch Service, the target cluster. + +You can find the backfill dashboard in the CloudWatch console based on the AWS Region in which you have deployed Migration Assistant. The metric graphs for your target cluster will be blank until you select the OpenSearch domain you're migrating to from the dropdown menu at the top of the dashboard. + +## Validating the backfill + +After the backfill is complete and the workers have stopped, examine the contents of your cluster using the [Refresh API](https://opensearch.org/docs/latest/api-reference/index-apis/refresh/) and the [Flush API](https://opensearch.org/docs/latest/api-reference/index-apis/flush/).
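+If you prefer to call these APIs directly rather than through the console CLI, a minimal sketch looks like the following. The endpoint and credentials shown here are placeholders; replace them with the values for your target cluster. + +```shell +# Refresh and then flush all indexes on the target cluster before validating document counts +curl -X POST "https://<target-cluster-endpoint>/_refresh" -u "<username>:<password>" +curl -X POST "https://<target-cluster-endpoint>/_flush" -u "<username>:<password>" +``` +{% include copy.html %}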
The following example uses the console CLI with the Refresh API to check the backfill status: + +```shell +console clusters cat-indices --refresh +``` +{% include copy.html %} + +This will display the number of documents in each of the indexes in the target cluster, as shown in the following example response: + +```shell +SOURCE CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open my-index -DqPQDrATw25hhe5Ss34bQ 1 0 3 0 12.7kb 12.7kb + +TARGET CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open .opensearch-observability 8HOComzdSlSWCwqWIOGRbQ 1 1 0 0 416b 208b +green open .plugins-ml-config 9tld-PCJToSUsMiyDhlyhQ 5 1 1 0 9.5kb 4.7kb +green open my-index bGfGtYoeSU6U6p8leR5NAQ 1 0 3 0 5.5kb 5.5kb +green open .migrations_working_state lopd47ReQ9OEhw4ZuJGZOg 1 1 2 0 18.6kb 6.4kb +green open .kibana_1 +``` + +You can run additional queries against the target cluster to mimic your production workflow and closely examine the results. diff --git a/_migration-assistant/migration-phases/index.md b/_migration-assistant/migration-phases/index.md new file mode 100644 index 0000000000..c3c6c14b07 --- /dev/null +++ b/_migration-assistant/migration-phases/index.md @@ -0,0 +1,16 @@ +--- +layout: default +title: Migration phases +nav_order: 50 +has_children: true +permalink: /migration-phases/ +redirect_from: + - /migration-phases/index/ +--- + +This page details how to conduct a migration with Migration Assistant. It encompasses a variety of scenarios including: + +- [**Metadata migration**]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/): Migrating cluster metadata, such as index settings, aliases, and templates. +- [**Backfill migration**]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/): Migrating existing or historical data from a source to a target cluster. +- **Live traffic migration**: Replicating live ongoing traffic from a source to a target cluster. + diff --git a/_migration-assistant/migration-phases/migrating-metadata.md b/_migration-assistant/migration-phases/migrating-metadata.md new file mode 100644 index 0000000000..249a2ca4d0 --- /dev/null +++ b/_migration-assistant/migration-phases/migrating-metadata.md @@ -0,0 +1,247 @@ +--- +layout: default +title: Migrating metadata +nav_order: 85 +parent: Migration phases +--- + +# Migrating metadata + +Metadata migration involves creating a snapshot of your cluster and then migrating the metadata from the snapshot using the migration console. + +This tool gathers information from a source cluster through a snapshot or through HTTP requests against the source cluster. These snapshots are fully compatible with the backfill process for `Reindex-From-Snapshot` (RFS) scenarios. + +After collecting information on the source cluster, comparisons are made against the target cluster. If running a migration, any metadata items that do not already exist will be created on the target cluster. + +## Creating the snapshot + +Creating a snapshot of the source cluster captures all the metadata and documents to be migrated to a new target cluster. 
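+If you want to confirm which snapshot repositories are registered on the source cluster before creating the snapshot, you can query the source cluster directly. The following is a minimal sketch that assumes direct HTTP access to the source endpoint; the endpoint shown is a placeholder. + +```shell +# List the snapshot repositories registered on the source cluster +curl -X GET "http://<source-cluster-endpoint>:9200/_snapshot?pretty" +``` +{% include copy.html %}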
+ +Create the initial snapshot of the source cluster using the following command: + +```shell +console snapshot create +``` +{% include copy.html %} + +To check the progress of the snapshot in real time, use the following command: + +```shell +console snapshot status --deep-check +``` +{% include copy.html %} + +You should receive the following response when the snapshot is created: + +```shell +SUCCESS +Snapshot is SUCCESS. +Percent completed: 100.00% +Data GiB done: 29.211/29.211 +Total shards: 40 +Successful shards: 40 +Failed shards: 0 +Start time: 2024-07-22 18:21:42 +Duration: 0h 13m 4s +Anticipated duration remaining: 0h 0m 0s +Throughput: 38.13 MiB/sec +``` + +### Managing slow snapshot speeds + +Depending on the size of the data in the source cluster and the bandwidth allocated for snapshots, the process can take some time. Adjust the maximum rate at which the source cluster's nodes create the snapshot using the `--max-snapshot-rate-mb-per-node` option. Increasing the snapshot rate will consume more node resources, which may affect the cluster's ability to handle normal traffic. + +## Command arguments + +For the following commands, to identify all valid arguments, please run with `--help`. + +```shell +console metadata evaluate --help +``` +{% include copy.html %} + +```shell +console metadata migrate --help +``` +{% include copy.html %} + +Based on the migration console deployment options, a number of commands will be pre-populated. To view them, run console with verbosity: + +```shell +console -v metadata migrate --help +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```shell +(.venv) bash-5.2# console -v metadata migrate --help +INFO:console_link.cli:Logging set to INFO +. +. +. +INFO:console_link.models.metadata:Migrating metadata with command: /root/metadataMigration/bin/MetadataMigration --otel-collector-endpoint http://otel-collector:4317 migrate --snapshot-name snapshot_2023_01_01 --target-host https://opensearchtarget:9200 --min-replicas 0 --file-system-repo-path /snapshot/test-console --target-username admin --target-password ******** --target-insecure --help +. +. +. +``` + + +## Using the `evaluate` command + +By scanning the contents of the source cluster, applying filtering, and applying modifications a list of all items that will be migrated will be created. Any items not seen in this output will not be migrated onto the target cluster if the migrate command was to be run. This is a safety check before making modifications on the target cluster. + +```shell +console metadata evaluate [...] +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```bash +Starting Metadata Evaluation +Clusters: + Source: + Remote Cluster: OpenSearch 1.3.16 ConnectionContext(uri=http://localhost:33039, protocol=HTTP, insecure=false, compressionSupported=false) + + Target: + Remote Cluster: OpenSearch 2.14.0 ConnectionContext(uri=http://localhost:33037, protocol=HTTP, insecure=false, compressionSupported=false) + + +Migration Candidates: + Index Templates: + simple_index_template + + Component Templates: + simple_component_template + + Indexes: + blog_2023, movies_2023 + + Aliases: + alias1, movies-alias + + +Results: + 0 issue(s) detected +``` + + +## Using the migrate command + +Running through the same data as the evaluate command all of the migrated items will be applied onto the target cluster. If re-run multiple times items that were previously migrated will not be recreated. 
If any items do need to be re-migrated, delete them from the target cluster and then rerun the `evaluate` and `migrate` commands to ensure that the desired changes are made. + +```shell +console metadata migrate [...] +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```shell +Starting Metadata Migration + +Clusters: + Source: + Snapshot: OpenSearch 1.3.16 FileSystemRepo(repoRootDir=/tmp/junit10626813752669559861) + + Target: + Remote Cluster: OpenSearch 2.14.0 ConnectionContext(uri=http://localhost:33042, protocol=HTTP, insecure=false, compressionSupported=false) + + +Migrated Items: + Index Templates: + simple_index_template + + Component Templates: + simple_component_template + + Indexes: + blog_2023, movies_2023 + + Aliases: + alias1, movies-alias + + +Results: + 0 issue(s) detected +``` + + +## Metadata verification process + +Before moving on to additional migration steps, we recommend confirming the details of your cluster. Depending on your configuration, this could include checking the sharding strategy or making sure index mappings are correctly defined by ingesting a test document. + +## Troubleshooting + +Use these instructions to help troubleshoot the following issues. + +### Accessing detailed logs + +Metadata migration creates a detailed log file that includes low-level tracing information for troubleshooting. For each execution of the program, a log file is created inside a shared volume on the migration console named `shared-logs-output`. The following command lists all log files, one for each run of the command: + +```shell +ls -al /shared-logs-output/migration-console-default/*/metadata/ +``` +{% include copy.html %} + +To inspect a file within the console, use command line tools such as `cat`, `tail`, and `grep`. Looking for warnings, errors, and exceptions in this log file can help you understand the source of failures or, at the very least, provide useful details when creating issues in this project. + +```shell +tail /shared-logs-output/migration-console-default/*/metadata/*.log +``` +{% include copy.html %} + +### Warnings and errors + +When encountering `WARN` or `ERROR` elements in the response, they will be accompanied by a short message, such as `WARN - my_index already exists`. More information can be found in the detailed logs associated with the warning or error. + +### OpenSearch running in compatibility mode + +You might encounter an error indicating that an ES 7.10.2 cluster cannot be updated. This can occur when compatibility mode has been enabled on an OpenSearch cluster; disable it to continue. For more information, see [Enable compatibility mode](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/rename.html#rename-upgrade). + + +### Breaking change compatibility + +Metadata migration requires modifying data from the source version to the target version in order to recreate items. Sometimes these features are no longer supported and have been removed from the target version. Sometimes these features are not available in the target version, which is especially true when downgrading. While this tool is meant to make this process easier, it is not exhaustive in its support. If you encounter a compatibility issue or an important feature gap for your migration, [search the issues and comment on an existing issue](https://github.com/opensearch-project/opensearch-migrations/issues) or [create a new](https://github.com/opensearch-project/opensearch-migrations/issues/new/choose) issue if one cannot be found.
+ +#### Deprecation of mapping types + +The mapping types feature available in Elasticsearch 6.8 was discontinued in Elasticsearch 7.0+, which has created complexity when migrating to newer versions of Elasticsearch and OpenSearch ([learn more](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/removal-of-types.html) ↗). + +Because metadata migration supports migrating from ES 6.8 to the latest versions of OpenSearch, this scenario is handled by removing the mapping types and restructuring the template or index properties. Note that, at the time of this writing, multiple mapping types are not supported ([tracking task](https://opensearch.atlassian.net/browse/MIGRATIONS-1778) ↗). + + +**Example starting state with mapping type foo (ES 6):** + +```json +{ + "mappings": [ + { + "foo": { + "properties": { + "field1": { "type": "text" }, + "field2": { "type": "keyword" } + } + } + } + ] +} +``` +{% include copy.html %} + +**Example ending state with foo removed (ES 7):** + +```json +{ + "mappings": { + "properties": { + "field1": { "type": "text" }, + "field2": { "type": "keyword" } + } + } +} +``` +{% include copy.html %} + +For additional technical details, [view the mapping type removal source code](https://github.com/opensearch-project/opensearch-migrations/blob/main/transformation/src/main/java/org/opensearch/migrations/transformation/rules/IndexMappingTypeRemoval.java). diff --git a/_migration-assistant/migration-phases/removing-migration-infrastructure.md b/_migration-assistant/migration-phases/removing-migration-infrastructure.md new file mode 100644 index 0000000000..656a8e1998 --- /dev/null +++ b/_migration-assistant/migration-phases/removing-migration-infrastructure.md @@ -0,0 +1,21 @@ +--- +layout: default +title: Removing migration infrastructure +nav_order: 120 +parent: Migration phases +--- + +# Removing migration infrastructure + +After a migration is complete, all resources should be removed except for the target cluster and, optionally, your CloudWatch logs and Traffic Replayer logs. + +To remove all of the CDK stacks created during a deployment, run a command similar to the following within the CDK directory: + +```bash +cdk destroy "*" --c contextId= +``` +{% include copy.html %} + +Follow the instructions on the command line to remove the deployed resources from the AWS account. + +The AWS Management Console can also be used to remove Migration Assistant resources and confirm that they are no longer in the account. \ No newline at end of file diff --git a/_migration-assistant/migration-phases/switching-traffic-from-the-source-cluster.md b/_migration-assistant/migration-phases/switching-traffic-from-the-source-cluster.md new file mode 100644 index 0000000000..c43580eef9 --- /dev/null +++ b/_migration-assistant/migration-phases/switching-traffic-from-the-source-cluster.md @@ -0,0 +1,52 @@ +--- +layout: default +title: Switching traffic from the source cluster +nav_order: 110 +parent: Migration phases +--- + +# Switching traffic from the source cluster + +After the source and target clusters are synchronized, traffic needs to be switched to the target cluster so that the source cluster can be taken offline. + +## Assumptions + +This page assumes that the following has occurred before making the switch: + +- All client traffic is being routed through a switchover listener in the [MigrationAssistant Application Load Balancer]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/).
+- Client traffic has been verified as compatible with the target cluster. +- The target cluster is in a good state to accept client traffic. +- The target proxy service is deployed. + +## Switching traffic + +Use the following steps to switch traffic from the source cluster to the target cluster: + +1. In the AWS Management Console, navigate to **ECS** > **Migration Assistant Cluster**. Note the desired count of the capture proxy, which should be greater than 1. + +2. Update the **ECS Service** of the target proxy to be at least as large as the traffic capture proxy. Wait for tasks to start up, and verify that all targets are healthy in the target proxy service's **Load balancer health** section. + +3. Navigate to **EC2** > **Load Balancers** > **Migration Assistant ALB**. + +4. Navigate to **ALB Metrics** and examine any useful information, specifically looking at **Active Connection Count** and **New Connection Count**. Note any large discrepancies, which can indicate reused connections affecting traffic switchover. + +5. Navigate to **Capture Proxy Target Group** (`ALBSourceProxy--TG`) > **Monitoring**. + +6. Examine the **Metrics Requests**, **Target (2XX, 3XX, 4XX, 5XX)**, and **Target Response Time** metrics. Verify that this appears as expected and includes all traffic expected to be included in the switchover. Note details that could help identify anomalies during the switchover, including the expected response time and response code rate. + +7. Navigate back to **ALB Metrics** and choose **Target Proxy Target Group** (`ALBTargetProxy--TG`). Verify that all expected targets are healthy and that none are in a draining state. + +8. Navigate back to **ALB Metrics** and to the **Listener** on port `9200`. + +9. Choose the **Default rule** and **Edit**. + +10. Modify the weights of the targets to switch the desired traffic to the target proxy. To perform a full switchover, modify the **Target Proxy** weight to `1` and the **Source Proxy** weight to `0`. + +11. Choose **Save Changes**. + +12. Navigate to both **SourceProxy** and **TargetProxy TG Monitoring** metrics and verify that traffic is switching over as expected. If connections are being reused by clients, perform any necessary actions to terminate them. Monitor these metrics until **SourceProxy TG** shows 0 requests when all clients have switched over. + + +## Fallback + +If you need to fall back to the source cluster at any point during the switchover, revert the **Default rule** so that the Application Load Balancer routes to the **SourceProxy Target Group**. \ No newline at end of file diff --git a/_migration-assistant/migration-phases/using-traffic-replayer.md b/_migration-assistant/migration-phases/using-traffic-replayer.md new file mode 100644 index 0000000000..5b7af3c3f7 --- /dev/null +++ b/_migration-assistant/migration-phases/using-traffic-replayer.md @@ -0,0 +1,309 @@ +--- +layout: default +title: Using Traffic Replayer +nav_order: 100 +parent: Migration phases +--- + +# Using Traffic Replayer + +This guide covers how to use Traffic Replayer to replay captured traffic from a source cluster to a target cluster during the migration process. Traffic Replayer allows you to verify that the target cluster can handle requests in the same way as the source cluster and catch up to real-time traffic for a smooth migration. + +## When to run Traffic Replayer + +After deploying Migration Assistant, Traffic Replayer does not run by default. 
It should be started only after all metadata and documents have been migrated to ensure that recent changes to the source cluster are properly reflected in the target cluster. + +For example, if a document was deleted after a snapshot was taken, starting Traffic Replayer before the document migration is complete may cause the deletion request to execute before the document is added to the target. Running Traffic Replayer after all other migration processes ensures that the target cluster will be consistent with the source cluster. + +## Configuration options + +[Traffic Replayer settings]({{site.url}}{{site.baseurl}}/migration-assistant/deploying-migration-assistant/configuration-options/) are configured during the deployment of Migration Assistant. Make sure to set the authentication mode for Traffic Replayer so that it can properly communicate with the target cluster. + +## Using Traffic Replayer + +To manage Traffic Replayer, use the `console replay` command. The following examples show the available commands. + +### Start Traffic Replayer + +The following command starts Traffic Replayer with the options specified at deployment: + +```bash +console replay start +``` + +When starting Traffic Replayer, you should receive an output similar to the following: + +```bash +root@ip-10-0-2-66:~# console replay start +Replayer started successfully. +Service migration-dev-traffic-replayer-default set to 1 desired count. Currently 0 running and 0 pending. +``` + +### Check the status of Traffic Replayer + +Use the following command to show the status of Traffic Replayer: + +```bash +console replay status +``` + +Replay will return one of the following statuses: + +- `Running` shows how many container instances are actively running. +- `Pending` indicates how many instances are being provisioned. +- `Desired` shows the total number of instances that should be running. + +You should receive an output similar to the following: + +```bash +root@ip-10-0-2-66:~# console replay status +(, 'Running=0\nPending=0\nDesired=0') +``` + +### Stop Traffic Replayer + +The following command stops Traffic Replayer: + +```bash +console replay stop +``` + +You should receive an output similar to the following: + +```bash +root@ip-10-0-2-66:~# console replay stop +Replayer stopped successfully. +Service migration-dev-traffic-replayer-default set to 0 desired count. Currently 0 running and 0 pending. +``` + + + +### Delivery guarantees + +Traffic Replayer retrieves traffic from Kafka and updates its commit cursor after sending requests to the target cluster. This provides an "at least once" delivery guarantee; however, success isn't always guaranteed. Therefore, you should monitor metrics and tuple outputs or perform external validation to ensure that the target cluster is functioning as expected. + +## Time scaling + +Traffic Replayer sends requests in the same order that they were received from each connection to the source. However, relative timing between different connections is not guaranteed. For example: + +- **Scenario**: Two connections exist: one sends a PUT request every minute, and the other sends a GET request every second. +- **Behavior**: Traffic Replayer will maintain the sequence within each connection, but the relative timing between the connections (PUTs and GETs) is not preserved. + +Assume that a source cluster responds to requests (GETs and PUTs) within 100 ms: + +- With a **speedup factor of 1**, the target will experience the same request rates and idle periods as the source.
+- With a **speedup factor of 2**, requests will be sent twice as fast, with GETs sent every 500 ms and PUTs every 30 seconds. +- With a **speedup factor of 10**, requests will be sent 10x faster, and as long as the target responds quickly, Traffic Replayer can maintain the pace. + +If the target cannot respond fast enough, Traffic Replayer will wait for the previous request to complete before sending the next one. This may cause delays and affect global relative ordering. + +## Transformations + +During migrations, some requests may need to be transformed between versions. For example, Elasticsearch previously supported multiple type mappings in indexes, but this is no longer the case in OpenSearch. Clients may need to be adjusted accordingly by splitting documents into multiple indexes or transforming request data. + +Traffic Replayer automatically rewrites host and authentication headers, but for more complex transformations, custom transformation rules can be specified using the `--transformer-config` option. For more information, see the [Traffic Replayer README](https://github.com/opensearch-project/opensearch-migrations/blob/c3d25958a44ec2e7505892b4ea30e5fbfad4c71b/TrafficCapture/trafficReplayer/README.md#transformations). + +### Example transformation + +Suppose that a source request contains a `tagToExcise` element that needs to be removed and its children promoted and that the URI path includes `extraThingToRemove`, which should also be removed. The following Jolt script handles this transformation: + +```json +[{ "JsonJoltTransformerProvider": +[ + { + "script": { + "operation": "shift", + "spec": { + "payload": { + "inlinedJsonBody": { + "top": { + "tagToExcise": { + "*": "payload.inlinedJsonBody.top.&" + }, + "*": "payload.inlinedJsonBody.top.&" + }, + "*": "payload.inlinedJsonBody.&" + }, + "*": "payload.&" + }, + "*": "&" + } + } + }, + { + "script": { + "operation": "modify-overwrite-beta", + "spec": { + "URI": "=split('/extraThingToRemove',@(1,&))" + } + } + }, + { + "script": { + "operation": "modify-overwrite-beta", + "spec": { + "URI": "=join('',@(1,&))" + } + } + } +] +}] +``` + +The resulting request sent to the target will appear similar to the following: + +```bash +PUT /oldStyleIndex/moreStuff HTTP/1.0 +host: testhostname + +{"top":{"properties":{"field1":{"type":"text"},"field2":{"type":"keyword"}}}} +``` +{% include copy.html %} + +You can pass Base64-encoded transformation scripts using `--transformer-config-base64`. + +## Result logs + +HTTP transactions from the source capture and those resent to the target cluster are logged in files located at `/shared-logs-output/traffic-replayer-default/*/tuples/tuples.log`. The `/shared-logs-output` directory is shared across containers, including the migration console. You can access these files from the migration console using the same path. Previous runs are also available in a `gzipped` format. + +Each log entry is a newline-delimited JSON object, containing information about the source and target requests/responses along with other transaction details, such as response times. + +These logs contain the contents of all requests, including authorization headers and the contents of all HTTP messages. Ensure that access to the migration environment is restricted, as these logs serve as a source of truth for determining what happened in both the source and target clusters. Response times for the source refer to the amount of time between the proxy sending the end of a request and receiving the response. 
While response times for the target are recorded in the same manner, keep in mind that the locations of the capture proxy, Traffic Replayer, and target may differ and that these logs do not account for the client's location. +{: .note} + + +### Example log entry + +The following example log entry shows a `/_cat/indices?v` request sent to both the source and target clusters: + +```json +{ + "sourceRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "capture-proxy:9200", + "Authorization": "Basic YWRtaW46YWRtaW4=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "sourceResponse": { + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 59, + "content-type": "text/plain; charset=UTF-8", + "content-length": "214", + "body": "aGVhbHRoIHN0YXR1cyBpbmRleCAgICAgICB..." + }, + "targetRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "opensearchtarget", + "Authorization": "Basic YWRtaW46bXlTdHJvbmdQYXNzd29yZDEyMyE=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "targetResponses": [{ + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 721, + "content-type": "text/plain; charset=UTF-8", + "content-length": "484", + "body": "aGVhbHRoIHN0YXR1cyBpbmRleCAgICAgICB..." + }], + "connectionId": "0242acfffe13000a-0000000a-00000005-1eb087a9beb83f3e-a32794b4.0", + "numRequests": 1, + "numErrors": 0 +} +``` +{% include copy.html %} + + +### Decoding log content + +The contents of HTTP message bodies are Base64 encoded in order to handle various types of traffic, including compressed data. To view the logs in a more human-readable format, use the console library `tuples show`. Running the script as follows will produce a `readable-tuples.log` in the home directory: + +```shell +console tuples show --in /shared-logs-output/traffic-replayer-default/d3a4b31e1af4/tuples/tuples.log > readable-tuples.log +``` + +The `readable-tuples.log` should appear similar to the following: + +```json +{ + "sourceRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "capture-proxy:9200", + "Authorization": "Basic YWRtaW46YWRtaW4=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "sourceResponse": { + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 59, + "content-type": "text/plain; charset=UTF-8", + "content-length": "214", + "body": "health status index uuid ..." + }, + "targetRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "opensearchtarget", + "Authorization": "Basic YWRtaW46bXlTdHJvbmdQYXNzd29yZDEyMyE=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "targetResponses": [{ + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 721, + "content-type": "text/plain; charset=UTF-8", + "content-length": "484", + "body": "health status index uuid ..." + }], + "connectionId": "0242acfffe13000a-0000000a-00000005-1eb087a9beb83f3e-a32794b4.0", + "numRequests": 1, + "numErrors": 0 +} +``` + + +## Metrics + +Traffic Replayer emits various OpenTelemetry metrics to Amazon CloudWatch, and traces are sent through AWS X-Ray. 
The following are some useful metrics that can help evaluate cluster performance. + +### `sourceStatusCode` + +This metric tracks the HTTP status codes for both the source and target clusters, with dimensions for the HTTP verb, such as `GET` or `POST`, and the status code families (200--299). These dimensions can help quickly identify discrepancies between the source and target, such as when `DELETE 200s` becomes `4xx` or `GET 4xx` errors turn into `5xx` errors. + +### `lagBetweenSourceAndTargetRequests` + +This metric shows the delay between requests hitting the source and target clusters. With a speedup factor greater than 1 and a target cluster that can handle requests efficiently, this value should decrease as the replay progresses, indicating a reduction in replay lag. + +### Additional metrics + +The following metrics are also reported: + +- **Throughput**: `bytesWrittenToTarget` and `bytesReadFromTarget` indicate the throughput to and from the cluster. +- **Retries**: `numRetriedRequests` tracks the number of requests retried due to status code mismatches between the source and target. +- **Event counts**: Various `(*)Count` metrics track the number of completed events. +- **Durations**: `(*)Duration` metrics measure the duration of each step in the process. +- **Exceptions**: `(*)ExceptionCount` shows the number of exceptions encountered during each processing phase. + + +## CloudWatch considerations + +Metrics pushed to CloudWatch may experience a visibility lag of around 5 minutes. CloudWatch also retains higher-resolution data for a shorter period than lower-resolution data. For more information, see [Amazon CloudWatch concepts](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_concepts.html). \ No newline at end of file diff --git a/_migration-assistant/migration-phases/verifying-migration-tools.md b/_migration-assistant/migration-phases/verifying-migration-tools.md new file mode 100644 index 0000000000..77df2b4280 --- /dev/null +++ b/_migration-assistant/migration-phases/verifying-migration-tools.md @@ -0,0 +1,205 @@ +--- +layout: default +title: Verifying migration tools +nav_order: 70 +parent: Migration phases +--- + +# Verifying migration tools + +Before using the Migration Assistant, take the following steps to verify that your cluster is ready for migration. + +## Verifying snapshot creation + +Verify that a snapshot can be created of your source cluster and used for metadata and backfill scenarios. + +### Installing the Elasticsearch S3 Repository plugin + +The snapshot needs to be stored in a location that Migration Assistant can access. This guide uses Amazon Simple Storage Service (Amazon S3). By default, Migration Assistant creates an S3 bucket for storage. Therefore, it is necessary to install the [Elasticsearch S3 repository plugin](https://www.elastic.co/guide/en/elasticsearch/plugins/7.10/repository-s3.html) on your source nodes (https://www.elastic.co/guide/en/elasticsearch/plugins/7.10/repository-s3.html). + +Additionally, make sure that the plugin has been configured with AWS credentials that allow it to read and write to Amazon S3. If your Elasticsearch cluster is running on Amazon Elastic Compute Cloud (Amazon EC2) or Amazon Elastic Container Service (Amazon ECS) instances with an AWS Identity and Access Management (IAM) execution role, include the necessary S3 permissions. Alternatively, you can store the credentials in the [Elasticsearch keystore](https://www.elastic.co/guide/en/elasticsearch/plugins/7.10/repository-s3-client.html). 
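+If you choose the keystore approach, a minimal sketch of the required commands is shown below. These commands assume a standard Elasticsearch installation layout; run them on each source node, and note that the client name (`default`) is the plugin's default and may differ in your configuration. + +```shell +# Add S3 credentials to the Elasticsearch keystore on each node +bin/elasticsearch-keystore add s3.client.default.access_key +bin/elasticsearch-keystore add s3.client.default.secret_key + +# Reload secure settings so that the repository-s3 plugin picks up the new credentials +curl -X POST "http://localhost:9200/_nodes/reload_secure_settings" +``` +{% include copy.html %}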
+ +### Verifying the S3 repository plugin configuration + +You can verify that the S3 repository plugin is configured correctly by creating a test snapshot. + +Create an S3 bucket for the snapshot using the following AWS Command Line Interface (AWS CLI) command: + +```shell +aws s3api create-bucket --bucket --region +``` +{% include copy.html %} + +Register a new S3 snapshot repository on your source cluster using the following cURL command: + +```shell +curl -X PUT "http://:9200/_snapshot/test_s3_repository" -H "Content-Type: application/json" -d '{ + "type": "s3", + "settings": { + "bucket": "", + "region": "" + } +}' +``` +{% include copy.html %} + +Next, create a test snapshot that captures only the cluster's metadata: + +```shell +curl -X PUT "http://:9200/_snapshot/test_s3_repository/test_snapshot_1" -H "Content-Type: application/json" -d '{ + "indices": "", + "ignore_unavailable": true, + "include_global_state": true +}' +``` +{% include copy.html %} + +Check the AWS Management Console to confirm that your bucket contains the snapshot. + +### Removing test snapshots after verification + +To remove the resources created during verification, you can use the following deletion commands: + +**Test snapshot** + +```shell +curl -X DELETE "http://:9200/_snapshot/test_s3_repository/test_snapshot_1?pretty" +``` +{% include copy.html %} + +**Test snapshot repository** + +```shell +curl -X DELETE "http://:9200/_snapshot/test_s3_repository?pretty" +``` +{% include copy.html %} + +**S3 bucket** + +```shell +aws s3 rm s3:// --recursive +aws s3api delete-bucket --bucket --region +``` +{% include copy.html %} + +### Troubleshooting + +Use this guidance to troubleshoot any of the following snapshot verification issues. + +#### Access denied error (403) + +If you encounter an error like `AccessDenied (Service: Amazon S3; Status Code: 403)`, verify the following: + +- The IAM role assigned to your Elasticsearch cluster has the necessary S3 permissions. +- The bucket name and AWS Region provided in the snapshot configuration match the actual S3 bucket you created. + +#### Older versions of Elasticsearch + +Older versions of the Elasticsearch S3 repository plugin may have trouble reading IAM role credentials embedded in Amazon EC2 and Amazon ECS instances. This is because the copy of the AWS SDK shipped with them is too old to read the new standard way of retrieving those credentials, as shown in [the Instance Metadata Service v2 (IMDSv2) specification](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html). This can result in snapshot creation failures, with an error message similar to the following: + +```json +{"error":{"root_cause":[{"type":"repository_verification_exception","reason":"[migration_assistant_repo] path [rfs-snapshot-repo] is not accessible on master node"}],"type":"repository_verification_exception","reason":"[migration_assistant_repo] path [rfs-snapshot-repo] is not accessible on master node","caused_by":{"type":"i_o_exception","reason":"Unable to upload object [rfs-snapshot-repo/tests-s8TvZ3CcRoO8bvyXcyV2Yg/master.dat] using a single upload","caused_by":{"type":"amazon_service_exception","reason":"Unauthorized (Service: null; Status Code: 401; Error Code: null; Request ID: null)"}}},"status":500} +``` + +If you encounter this issue, you can resolve it by temporarily enabling IMDSv1 on the instances in your source cluster for the duration of the snapshot. There is a toggle for this available in the AWS Management Console as well as in the AWS CLI. 
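+One way to check whether your source instances are enforcing IMDSv2, which is the likely cause of this error, is to describe their metadata options. The following sketch assumes that the AWS CLI is configured with access to the account and uses a placeholder instance ID. + +```shell +# Show the metadata options for a source cluster instance; "HttpTokens": "required" means IMDSv2 is enforced +aws ec2 describe-instances --instance-ids <instance-id> --query "Reservations[].Instances[].MetadataOptions" +``` +{% include copy.html %}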
Switching this toggle will turn on the older access model and enable the Elasticsearch S3 repository plugin to work as normal. For more information about IMDSv1, see [Modify instance metadata options for existing instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-IMDS-existing-instances.html). + +## Switching over client traffic + +The Migration Assistant Application Load Balancer is deployed with a listener that shifts traffic between the source and target clusters through proxy services. The Application Load Balancer should start in **Source Passthrough** mode. + +### Verifying that the traffic switchover is complete + +Use the following steps to verify that the traffic switchover is complete: + +1. In the AWS Management Console, navigate to **EC2 > Load Balancers**. +2. Select the **MigrationAssistant ALB**. +3. Examine the listener on port `9200` and verify that 100% of the traffic is directed to the **Source Proxy**. +4. Navigate to the **Migration ECS Cluster** in the AWS Management Console. +5. Select the **Target Proxy Service**. +6. Verify that the desired count for the service is running: + * If the desired count is not met, update the service to increase it to at least 1 and wait for the service to start. +7. On the **Health and Metrics** tab under **Load balancer health**, verify that all targets are reporting as healthy: + * This confirms that the Application Load Balancer can connect to the target cluster through the target proxy. +8. (Reset) Update the desired count for the **Target Proxy Service** back to its original value in Amazon ECS. + +### Fixing unidentified traffic patterns + +When switching over traffic to the target cluster, you might encounter unidentified traffic patterns. To help identify the cause of these patterns, use the following steps: +* Verify that the target cluster allows traffic ingress from the **Target Proxy Security Group**. +* Navigate to **Target Proxy ECS Tasks** to investigate any failing tasks. +Set the **Filter desired status** to **Any desired status** to view all tasks, then navigate to the logs for any stopped tasks. + + +## Verifying replication + +Use the following steps to verify that replication is working once the traffic capture proxy is deployed: + + +1. Navigate to the **Migration ECS Cluster** in the AWS Management Console. +2. Navigate to **Capture Proxy Service**. +3. Verify that the capture proxy is running with the desired proxy count. If it is not, update the service to increase it to at least 1 and wait for startup. +4. Under **Health and Metrics** > **Load balancer health**, verify that all targets are healthy. This means that the Application Load Balancer is able to connect to the source cluster through the capture proxy. +5. Navigate to the **Migration Console Terminal**. +6. Run `console kafka describe-topic-records`. Wait 30 seconds for another Application Load Balancer health check. +7. Run `console kafka describe-topic-records` again and verify that the number of RECORDS increased between runs. +8. Run `console replay start` to start Traffic Replayer. +9. Run `tail -f /shared-logs-output/traffic-replayer-default/*/tuples/tuples.log | jq '.targetResponses[]."Status-Code"'` to confirm that the Kafka requests were sent to the target and that it responded as expected. If the responses don't appear: + * Check that the migration console can access the target cluster by running `./catIndices.sh`, which should show the indexes in the source and target. 
+ * Confirm that messages are still being recorded to Kafka. + * Check for errors in the Traffic Replayer logs (`/migration/STAGE/default/traffic-replayer-default`) using CloudWatch. +10. (Reset) Update the desired count for the **Capture Proxy Service** back to its original value in Amazon ECS. + +### Troubleshooting + +Use this guidance to troubleshoot any of the following replication verification issues. + +### Health check responses with 401/403 status code + +If the source cluster is configured to require authentication, the capture proxy will not be able to verify replication beyond receiving a 401/403 status code for Application Load Balancer health checks. For more information, see [Failure Modes](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/README.md#failure-modes). + +### Traffic does not reach the source cluster + +Verify that the source cluster allows traffic ingress from the Capture Proxy Security Group. + +Look for failing tasks by navigating to **Traffic Capture Proxy ECS**. Change **Filter desired status** to **Any desired status** in order to see all tasks and navigate to the logs for stopped tasks. + + +## Resetting before migration + +After all verifications are complete, reset all resources before using Migration Assistant for an actual migration. + +The following steps outline how to reset resources with Migration Assistant before executing the actual migration. At this point all verifications are expected to have been completed. These steps can be performed after [Accessing the Migration Console]({{site.url}}{{site.baseurl}}/migration-assistant/migration-console/accessing-the-migration-console/). + +### Traffic Replayer + +To stop running Traffic Replayer, use the following command: + +```bash +console replay stop +``` +{% include copy.html %} + +### Kafka + +To clear all captured traffic from the Kafka topic, you can run the following command. + +This command will result in the loss of any traffic data captured by the capture proxy up to this point and thus should be used with caution. +{: .warning} + +```bash +console kafka delete-topic +``` +{% include copy.html %} + +### Target cluster + +To clear non-system indexes from the target cluster that may have been created as a result of testing, you can run the following command: + +This command will result in the loss of all data in the target cluster and should be used with caution. +{: .warning} + +```bash +console clusters clear-indices --cluster target +``` +{% include copy.html %} + diff --git a/_ml-commons-plugin/api/agent-apis/execute-agent.md b/_ml-commons-plugin/api/agent-apis/execute-agent.md index 27d50bced0..2af4fc2c8e 100644 --- a/_ml-commons-plugin/api/agent-apis/execute-agent.md +++ b/_ml-commons-plugin/api/agent-apis/execute-agent.md @@ -18,7 +18,7 @@ When an agent is executed, it runs the tools with which it is configured. POST /_plugins/_ml/agents//_execute ``` -## Request fields +## Request body fields The following table lists the available request fields. 
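As a quick illustration of the request shape, an agent execution call might look like the following sketch; the agent ID and the `question` parameter are placeholders and depend on how the agent was registered:

```json
POST /_plugins/_ml/agents/<agent_id>/_execute
{
  "parameters": {
    "question": "What is the population of Seattle?"
  }
}
```
{% include copy-curl.html %}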
diff --git a/_ml-commons-plugin/api/agent-apis/get-agent.md b/_ml-commons-plugin/api/agent-apis/get-agent.md index 6190406649..7a03e85221 100644 --- a/_ml-commons-plugin/api/agent-apis/get-agent.md +++ b/_ml-commons-plugin/api/agent-apis/get-agent.md @@ -77,6 +77,6 @@ Assistant:""" } ``` -## Response fields +## Response body fields -For response field descriptions, see [Register Agent API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent#request-fields). \ No newline at end of file +For response field descriptions, see [Register Agent API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent#request-body-fields). \ No newline at end of file diff --git a/_ml-commons-plugin/api/agent-apis/register-agent.md b/_ml-commons-plugin/api/agent-apis/register-agent.md index eeea2af715..0057b44427 100644 --- a/_ml-commons-plugin/api/agent-apis/register-agent.md +++ b/_ml-commons-plugin/api/agent-apis/register-agent.md @@ -27,7 +27,7 @@ POST /_plugins/_ml/agents/_register ``` {% include copy-curl.html %} -## Request fields +## Request body fields The following table lists the available request fields. diff --git a/_ml-commons-plugin/api/agent-apis/search-agent.md b/_ml-commons-plugin/api/agent-apis/search-agent.md index 3d950cde8f..63b1d07eed 100644 --- a/_ml-commons-plugin/api/agent-apis/search-agent.md +++ b/_ml-commons-plugin/api/agent-apis/search-agent.md @@ -134,6 +134,6 @@ Assistant:""" } ``` -## Response fields +## Response body fields -For response field descriptions, see [Register Agent API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent#request-fields). +For response field descriptions, see [Register Agent API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent#request-body-fields). diff --git a/_ml-commons-plugin/api/async-batch-ingest.md b/_ml-commons-plugin/api/async-batch-ingest.md new file mode 100644 index 0000000000..493f192d0f --- /dev/null +++ b/_ml-commons-plugin/api/async-batch-ingest.md @@ -0,0 +1,97 @@ +--- +layout: default +title: Asynchronous batch ingestion +parent: ML Commons APIs +has_children: false +has_toc: false +nav_order: 35 +--- + +# Asynchronous batch ingestion +**Introduced 2.17** +{: .label .label-purple } + +Use the Asynchronous Batch Ingestion API to ingest data into your OpenSearch cluster from your files on remote file servers, such as Amazon Simple Storage Service (Amazon S3) or OpenAI. For detailed configuration steps, see [Asynchronous batch ingestion]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/async-batch-ingestion/). + +## Path and HTTP methods + +```json +POST /_plugins/_ml/_batch_ingestion +``` + +#### Request body fields + +The following table lists the available request fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- +`index_name`| String | Required | The index name. +`field_map` | Object | Required | Maps fields from the source file to specific fields in an OpenSearch index for ingestion. +`ingest_fields` | Array | Optional | Lists fields from the source file that should be ingested directly into the OpenSearch index without any additional mapping. +`credential` | Object | Required | Contains the authentication information for accessing external data sources, such as Amazon S3 or OpenAI. +`data_source` | Object | Required | Specifies the type and location of the external file(s) from which the data is ingested. 
+`data_source.type` | String | Required | Specifies the type of the external data source. Valid values are `s3` and `openAI`. +`data_source.source` | Array | Required | Specifies one or more file locations from which the data is ingested. For `s3`, specify the file path to the Amazon S3 bucket (for example, `["s3://offlinebatch/output/sagemaker_batch.json.out"]`). For `openAI`, specify the file IDs for input or output files (for example, `["file-", "file-", "file-"]`). + +## Example request: Ingesting a single file + +```json +POST /_plugins/_ml/_batch_ingestion +{ + "index_name": "my-nlp-index", + "field_map": { + "chapter": "$.content[0]", + "title": "$.content[1]", + "chapter_embedding": "$.SageMakerOutput[0]", + "title_embedding": "$.SageMakerOutput[1]", + "_id": "$.id" + }, + "ingest_fields": ["$.id"], + "credential": { + "region": "us-east-1", + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "data_source": { + "type": "s3", + "source": ["s3://offlinebatch/output/sagemaker_batch.json.out"] + } +} +``` +{% include copy-curl.html %} + +## Example request: Ingesting multiple files + +```json +POST /_plugins/_ml/_batch_ingestion +{ + "index_name": "my-nlp-index-openai", + "field_map": { + "question": "source[1].$.body.input[0]", + "answer": "source[1].$.body.input[1]", + "question_embedding":"source[0].$.response.body.data[0].embedding", + "answer_embedding":"source[0].$.response.body.data[1].embedding", + "_id": ["source[0].$.custom_id", "source[1].$.custom_id"] + }, + "ingest_fields": ["source[2].$.custom_field1", "source[2].$.custom_field2"], + "credential": { + "openAI_key": "" + }, + "data_source": { + "type": "openAI", + "source": ["file-", "file-", "file-"] + } +} +``` +{% include copy-curl.html %} + +## Example response + +```json +{ + "task_id": "cbsPlpEBMHcagzGbOQOx", + "task_type": "BATCH_INGEST", + "status": "CREATED" +} +``` diff --git a/_ml-commons-plugin/api/connector-apis/create-connector.md b/_ml-commons-plugin/api/connector-apis/create-connector.md index 4225a24053..b99306bb8a 100644 --- a/_ml-commons-plugin/api/connector-apis/create-connector.md +++ b/_ml-commons-plugin/api/connector-apis/create-connector.md @@ -16,7 +16,7 @@ Creates a standalone connector. For more information, see [Connectors]({{site.ur POST /_plugins/_ml/connectors/_create ``` -## Request fields +## Request body fields For a list of request fields, see [Blueprint configuration parameters]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints#configuration-parameters). diff --git a/_ml-commons-plugin/api/connector-apis/update-connector.md b/_ml-commons-plugin/api/connector-apis/update-connector.md index 625d58bb62..4b62652da8 100644 --- a/_ml-commons-plugin/api/connector-apis/update-connector.md +++ b/_ml-commons-plugin/api/connector-apis/update-connector.md @@ -15,7 +15,7 @@ Use this API to update a standalone connector based on the `model_ID`. To update Before updating a standalone connector, you must undeploy all models that use the connector. For information about undeploying a model, see [Undeploy Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/undeploy-model/). {: .note} -Using this API, you can update the connector fields listed in the [Request fields](#request-fields) section and add optional fields to your connector. You cannot delete fields from a connector using this API. +Using this API, you can update the connector fields listed in the [Request fields](#request-body-fields) section and add optional fields to your connector. 
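For illustration, a minimal update request might look like the following sketch; the connector ID and the updated value are placeholders:

```json
PUT /_plugins/_ml/connectors/<connector_id>
{
  "description": "Updated connector description"
}
```
{% include copy-curl.html %}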
You cannot delete fields from a connector using this API. For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). @@ -25,7 +25,7 @@ For information about user access for this API, see [Model access control consid PUT /_plugins/_ml/connectors/ ``` -## Request fields +## Request body fields The following table lists the updatable fields. For more information about all connector fields, see [Blueprint configuration parameters]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints#configuration-parameters). diff --git a/_ml-commons-plugin/api/controller-apis/create-controller.md b/_ml-commons-plugin/api/controller-apis/create-controller.md index 91a6be4387..9fe9306575 100644 --- a/_ml-commons-plugin/api/controller-apis/create-controller.md +++ b/_ml-commons-plugin/api/controller-apis/create-controller.md @@ -35,7 +35,7 @@ Parameter | Data type | Description :--- | :--- | :--- `model_id` | String | The model ID of the model for which you want to set rate limits. Required. -## Request fields +## Request body fields The following table lists the available request fields. diff --git a/_ml-commons-plugin/api/controller-apis/get-controller.md b/_ml-commons-plugin/api/controller-apis/get-controller.md index b30fe15679..48e6f16549 100644 --- a/_ml-commons-plugin/api/controller-apis/get-controller.md +++ b/_ml-commons-plugin/api/controller-apis/get-controller.md @@ -69,9 +69,9 @@ If there is no controller defined for the model, OpenSearch returns an error: } ``` -## Response fields +## Response body fields -For response field descriptions, see [Create Controller API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/controller-apis/create-controller#request-fields). +For response field descriptions, see [Create Controller API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/controller-apis/create-controller#request-body-fields). ## Required permissions diff --git a/_ml-commons-plugin/api/execute-algorithm.md b/_ml-commons-plugin/api/execute-algorithm.md index 7b06cfefe8..6acd926444 100644 --- a/_ml-commons-plugin/api/execute-algorithm.md +++ b/_ml-commons-plugin/api/execute-algorithm.md @@ -2,7 +2,7 @@ layout: default title: Execute algorithm parent: ML Commons APIs -nav_order: 30 +nav_order: 37 --- # Execute algorithm diff --git a/_ml-commons-plugin/api/memory-apis/create-memory.md b/_ml-commons-plugin/api/memory-apis/create-memory.md index c7dac1783d..58ba34b2a4 100644 --- a/_ml-commons-plugin/api/memory-apis/create-memory.md +++ b/_ml-commons-plugin/api/memory-apis/create-memory.md @@ -34,7 +34,7 @@ Parameter | Data type | Description :--- | :--- | :--- `memory_id` | String | The ID of the memory to be updated. Required for the PUT method. -## Request fields +## Request body fields The following table lists the available request fields. diff --git a/_ml-commons-plugin/api/memory-apis/create-message.md b/_ml-commons-plugin/api/memory-apis/create-message.md index 345f411ccd..78ec0ade34 100644 --- a/_ml-commons-plugin/api/memory-apis/create-message.md +++ b/_ml-commons-plugin/api/memory-apis/create-message.md @@ -38,7 +38,7 @@ Parameter | Data type | Description `memory_id` | String | The ID of the memory to which to add the message. Required for the POST method. `message_id` | String | The ID of the message to be updated. Required for the PUT method. 
-## Request fields +## Request body fields The following table lists the available request fields. diff --git a/_ml-commons-plugin/api/memory-apis/get-memory.md b/_ml-commons-plugin/api/memory-apis/get-memory.md index 63ab548c00..7f62445072 100644 --- a/_ml-commons-plugin/api/memory-apis/get-memory.md +++ b/_ml-commons-plugin/api/memory-apis/get-memory.md @@ -120,7 +120,7 @@ GET /_plugins/_ml/memory?max_results=2&next_token=1 } ``` -## Response fields +## Response body fields The following table lists the available response fields. diff --git a/_ml-commons-plugin/api/memory-apis/get-message-traces.md b/_ml-commons-plugin/api/memory-apis/get-message-traces.md index 300adfc11d..1b0e9b1902 100644 --- a/_ml-commons-plugin/api/memory-apis/get-message-traces.md +++ b/_ml-commons-plugin/api/memory-apis/get-message-traces.md @@ -137,6 +137,6 @@ green open .opendistro-job-scheduler-lock XjgmXAVKQ4e8Y-ac54VBzg 1 } ``` -## Response fields +## Response body fields -For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-fields). \ No newline at end of file +For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-body-fields). \ No newline at end of file diff --git a/_ml-commons-plugin/api/memory-apis/get-message.md b/_ml-commons-plugin/api/memory-apis/get-message.md index 2f4cfc949f..36baa84bf4 100644 --- a/_ml-commons-plugin/api/memory-apis/get-message.md +++ b/_ml-commons-plugin/api/memory-apis/get-message.md @@ -62,7 +62,7 @@ GET /_plugins/_ml/memory/message/0m8ya40BfUsSoeNTj-pU } ``` -For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-fields). +For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-body-fields). ## Get all messages within a memory @@ -133,7 +133,7 @@ POST /_plugins/_ml/message/_search } ``` -## Response fields +## Response body fields -For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-fields). +For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-body-fields). diff --git a/_ml-commons-plugin/api/memory-apis/search-memory.md b/_ml-commons-plugin/api/memory-apis/search-memory.md index fc8dd3e1d9..4fa022bfaf 100644 --- a/_ml-commons-plugin/api/memory-apis/search-memory.md +++ b/_ml-commons-plugin/api/memory-apis/search-memory.md @@ -120,7 +120,7 @@ POST /_plugins/_ml/memory/_search } ``` -## Response fields +## Response body fields The following table lists all response fields. diff --git a/_ml-commons-plugin/api/memory-apis/search-message.md b/_ml-commons-plugin/api/memory-apis/search-message.md index a88ccfbb41..22602c21f2 100644 --- a/_ml-commons-plugin/api/memory-apis/search-message.md +++ b/_ml-commons-plugin/api/memory-apis/search-message.md @@ -89,6 +89,6 @@ GET /_plugins/_ml/memory/gW8Aa40BfUsSoeNTvOKI/_search } ``` -## Response fields +## Response body fields -For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-fields). 
\ No newline at end of file +For information about response fields, see [Create Message request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/create-message#request-body-fields). \ No newline at end of file diff --git a/_ml-commons-plugin/api/model-apis/batch-predict.md b/_ml-commons-plugin/api/model-apis/batch-predict.md index b32fbb108d..c1dc7348fe 100644 --- a/_ml-commons-plugin/api/model-apis/batch-predict.md +++ b/_ml-commons-plugin/api/model-apis/batch-predict.md @@ -31,7 +31,13 @@ POST /_plugins/_ml/models//_batch_predict ## Prerequisites -Before using the Batch Predict API, you need to create a connector to the externally hosted model. For example, to create a connector to an OpenAI `text-embedding-ada-002` model, send the following request: +Before using the Batch Predict API, you need to create a connector to the externally hosted model. For each action, specify the `action_type` parameter that describes the action: + +- `batch_predict`: Runs the batch predict operation. +- `batch_predict_status`: Checks the batch predict operation status. +- `cancel_batch_predict`: Cancels the batch predict operation. + +For example, to create a connector to an OpenAI `text-embedding-ada-002` model, send the following request. The `cancel_batch_predict` action is optional and supports canceling the batch job running on OpenAI: ```json POST /_plugins/_ml/connectors/_create @@ -68,6 +74,22 @@ POST /_plugins/_ml/connectors/_create "Authorization": "Bearer ${credential.openAI_key}" }, "request_body": "{ \"input_file_id\": \"${parameters.input_file_id}\", \"endpoint\": \"${parameters.endpoint}\", \"completion_window\": \"24h\" }" + }, + { + "action_type": "batch_predict_status", + "method": "GET", + "url": "https://api.openai.com/v1/batches/${parameters.id}", + "headers": { + "Authorization": "Bearer ${credential.openAI_key}" + } + }, + { + "action_type": "cancel_batch_predict", + "method": "POST", + "url": "https://api.openai.com/v1/batches/${parameters.id}/cancel", + "headers": { + "Authorization": "Bearer ${credential.openAI_key}" + } } ] } @@ -123,45 +145,87 @@ POST /_plugins/_ml/models/lyjxwZABNrAVdFa9zrcZ/_batch_predict #### Example response +The response contains the task ID for the batch predict operation: + ```json { - "inference_results": [ - { - "output": [ - { - "name": "response", - "dataAsMap": { - "id": "batch_", - "object": "batch", - "endpoint": "/v1/embeddings", - "errors": null, - "input_file_id": "file-", - "completion_window": "24h", - "status": "validating", - "output_file_id": null, - "error_file_id": null, - "created_at": 1722037257, - "in_progress_at": null, - "expires_at": 1722123657, - "finalizing_at": null, - "completed_at": null, - "failed_at": null, - "expired_at": null, - "cancelling_at": null, - "cancelled_at": null, - "request_counts": { - "total": 0, - "completed": 0, - "failed": 0 - }, - "metadata": null - } - } - ], - "status_code": 200 - } - ] + "task_id": "KYZSv5EBqL2d0mFvs80C", + "status": "CREATED" } ``` -For the definition of each field in the result, see [OpenAI Batch API](https://platform.openai.com/docs/guides/batch). Once the batch inference is complete, you can download the output by calling the [OpenAI Files API](https://platform.openai.com/docs/api-reference/files) and providing the file name specified in the `id` field of the response. \ No newline at end of file +To check the status of the batch predict job, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). 
You can find the job details in the `remote_job` field in the task. Once the prediction is complete, the task `state` changes to `COMPLETED`. + +#### Example request + +```json +GET /_plugins/_ml/tasks/KYZSv5EBqL2d0mFvs80C +``` +{% include copy-curl.html %} + +#### Example response + +The response contains the batch predict operation details in the `remote_job` field: + +```json +{ + "model_id": "JYZRv5EBqL2d0mFvKs1E", + "task_type": "BATCH_PREDICTION", + "function_name": "REMOTE", + "state": "RUNNING", + "input_type": "REMOTE", + "worker_node": [ + "Ee5OCIq0RAy05hqQsNI1rg" + ], + "create_time": 1725491751455, + "last_update_time": 1725491751455, + "is_async": false, + "remote_job": { + "cancelled_at": null, + "metadata": null, + "request_counts": { + "total": 3, + "completed": 3, + "failed": 0 + }, + "input_file_id": "file-XXXXXXXXXXXX", + "output_file_id": "file-XXXXXXXXXXXXX", + "error_file_id": null, + "created_at": 1725491753, + "in_progress_at": 1725491753, + "expired_at": null, + "finalizing_at": 1725491757, + "completed_at": null, + "endpoint": "/v1/embeddings", + "expires_at": 1725578153, + "cancelling_at": null, + "completion_window": "24h", + "id": "batch_XXXXXXXXXXXXXXX", + "failed_at": null, + "errors": null, + "object": "batch", + "status": "in_progress" + } +} +``` + +For the definition of each field in the result, see [OpenAI Batch API](https://platform.openai.com/docs/guides/batch). Once the batch inference is complete, you can download the output by calling the [OpenAI Files API](https://platform.openai.com/docs/api-reference/files) and providing the file name specified in the `id` field of the response. + +### Canceling a batch predict job + +You can also cancel the batch predict operation running on the remote platform using the task ID returned by the batch predict request. To add this capability, set the `action_type` to `cancel_batch_predict` in the connector configuration when creating the connector. + +#### Example request + +```json +POST /_plugins/_ml/tasks/KYZSv5EBqL2d0mFvs80C/_cancel_batch +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "status": "OK" +} +``` diff --git a/_ml-commons-plugin/api/model-apis/register-model.md b/_ml-commons-plugin/api/model-apis/register-model.md index 7d8f6d8cc6..63537d0443 100644 --- a/_ml-commons-plugin/api/model-apis/register-model.md +++ b/_ml-commons-plugin/api/model-apis/register-model.md @@ -46,7 +46,7 @@ OpenSearch provides several pretrained models. For more information, see [OpenSe To register a pretrained text embedding model, the only required parameters are `name`, `version`, and `model_format`. -#### Request fields +#### Request body fields The following table lists the available request fields. @@ -75,7 +75,7 @@ POST /_plugins/_ml/models/_register To register a pretrained sparse encoding model, you must set the function name to `SPARSE_ENCODING` or `SPARSE_TOKENIZE`. -#### Request fields +#### Request body fields The following table lists the available request fields. @@ -107,7 +107,7 @@ POST /_plugins/_ml/models/_register To use a custom model locally within the OpenSearch cluster, you need to provide a URL and a config object for that model. For more information, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). -### Request fields +### Request body fields The following table lists the available request fields. 
@@ -170,7 +170,7 @@ POST /_plugins/_ml/models/_register To register a model hosted on a third-party platform, you can either first create a standalone connector and provide the ID of that connector or specify an internal connector for the model. For more information, see [Creating connectors for third-party ML platforms]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). -### Request fields +### Request body fields The following table lists the available request fields. diff --git a/_ml-commons-plugin/api/model-apis/update-model.md b/_ml-commons-plugin/api/model-apis/update-model.md index 048b5b76e5..083f2cb448 100644 --- a/_ml-commons-plugin/api/model-apis/update-model.md +++ b/_ml-commons-plugin/api/model-apis/update-model.md @@ -20,13 +20,13 @@ For information about user access for this API, see [Model access control consid PUT /_plugins/_ml/models/ ``` -## Request fields +## Request body fields The following table lists the updatable fields. Not all request fields are applicable to all models. To determine whether the field is applicable to your model type, see [Register Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/). Field | Data type | Description :--- | :--- | :--- -`connector` | Object | Contains specifications for a connector for a model hosted on a third-party platform. For more information, see [Creating a connector for a specific model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/#creating-a-connector-for-a-specific-model). For information about the updatable fields within a connector, see [Update Connector API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/update-connector/#request-fields). +`connector` | Object | Contains specifications for a connector for a model hosted on a third-party platform. For more information, see [Creating a connector for a specific model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/#creating-a-connector-for-a-specific-model). For information about the updatable fields within a connector, see [Update Connector API request fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/connector-apis/update-connector/#request-body-fields). `connector_id` | Optional | The connector ID of a standalone connector for a model hosted on a third-party platform. For more information, see [Standalone connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/#creating-a-standalone-connector). To update a standalone connector, you must undeploy the model, update the connector, and then redeploy the model. `description` | String | The model description. `is_enabled`| Boolean | Specifies whether the model is enabled. Disabling the model makes it unavailable for Predict API requests, regardless of the model's deployment status. Default is `true`. diff --git a/_ml-commons-plugin/api/model-group-apis/register-model-group.md b/_ml-commons-plugin/api/model-group-apis/register-model-group.md index 312513ff3f..d224a3356b 100644 --- a/_ml-commons-plugin/api/model-group-apis/register-model-group.md +++ b/_ml-commons-plugin/api/model-group-apis/register-model-group.md @@ -21,7 +21,7 @@ For more information, see [Model access control]({{site.url}}{{site.baseurl}}/ml POST /_plugins/_ml/model_groups/_register ``` -## Request fields +## Request body fields The following table lists the available request fields. 
@@ -54,7 +54,7 @@ POST /_plugins/_ml/model_groups/_register } ``` -## Response fields +## Response body fields The following table lists the available response fields. diff --git a/_ml-commons-plugin/api/model-group-apis/update-model-group.md b/_ml-commons-plugin/api/model-group-apis/update-model-group.md index 5aa5239794..d299094771 100644 --- a/_ml-commons-plugin/api/model-group-apis/update-model-group.md +++ b/_ml-commons-plugin/api/model-group-apis/update-model-group.md @@ -24,9 +24,9 @@ For more information, see [Model access control]({{site.url}}{{site.baseurl}}/ml PUT /_plugins/_ml/model_groups/ ``` -## Request fields +## Request body fields -Refer to [Request fields](#request-fields) for request field descriptions. +Refer to [Request fields](#request-body-fields) for request field descriptions. #### Example request diff --git a/_ml-commons-plugin/api/profile.md b/_ml-commons-plugin/api/profile.md index e8f65bb16c..8337f23e6e 100644 --- a/_ml-commons-plugin/api/profile.md +++ b/_ml-commons-plugin/api/profile.md @@ -41,7 +41,7 @@ Parameter | Data type | Description `model_id` | String | Returns runtime data for a specific model. You can provide multiple model IDs as comma-separated values to retrieve multiple model profiles. `task_id`| String | Returns runtime data for a specific task. You can provide multiple task IDs as comma-separated values to retrieve multiple task profiles. -### Request fields +### Request body fields All profile body request fields are optional. diff --git a/_ml-commons-plugin/custom-local-models.md b/_ml-commons-plugin/custom-local-models.md index c2866938f6..09c3105f8d 100644 --- a/_ml-commons-plugin/custom-local-models.md +++ b/_ml-commons-plugin/custom-local-models.md @@ -65,14 +65,10 @@ To ensure that this basic local setup works, specify the following cluster setti PUT _cluster/settings { "persistent": { - "plugins": { - "ml_commons": { - "allow_registering_model_via_url": "true", - "only_run_on_ml_node": "false", - "model_access_control_enabled": "true", - "native_memory_threshold": "99" - } - } + "plugins.ml_commons.allow_registering_model_via_url": "true", + "plugins.ml_commons.only_run_on_ml_node": "false", + "plugins.ml_commons.model_access_control_enabled": "true", + "plugins.ml_commons.native_memory_threshold": "99" } } ``` diff --git a/_ml-commons-plugin/pretrained-models.md b/_ml-commons-plugin/pretrained-models.md index 1b0c726c33..552e3e607e 100644 --- a/_ml-commons-plugin/pretrained-models.md +++ b/_ml-commons-plugin/pretrained-models.md @@ -88,13 +88,9 @@ This example uses a simple setup with no dedicated ML nodes and allows running a PUT _cluster/settings { "persistent": { - "plugins": { - "ml_commons": { - "only_run_on_ml_node": "false", - "model_access_control_enabled": "true", - "native_memory_threshold": "99" - } - } + "plugins.ml_commons.only_run_on_ml_node": "false", + "plugins.ml_commons.model_access_control_enabled": "true", + "plugins.ml_commons.native_memory_threshold": "99" } } ``` diff --git a/_ml-commons-plugin/remote-models/async-batch-ingestion.md b/_ml-commons-plugin/remote-models/async-batch-ingestion.md new file mode 100644 index 0000000000..a09c028477 --- /dev/null +++ b/_ml-commons-plugin/remote-models/async-batch-ingestion.md @@ -0,0 +1,190 @@ +--- +layout: default +title: Asynchronous batch ingestion +nav_order: 90 +parent: Connecting to externally hosted models +grand_parent: Integrating ML models +--- + + +# Asynchronous batch ingestion +**Introduced 2.17** +{: .label .label-purple } + +[Batch 
ingestion]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/batch-ingestion/) configures an ingest pipeline, which processes documents one by one. For each document, batch ingestion calls an externally hosted model to generate text embeddings from the document text and then ingests the document, including text and embeddings, into an OpenSearch index. + +An alternative to this real-time process, _asynchronous_ batch ingestion, ingests both documents and their embeddings generated outside of OpenSearch and stored on a remote file server, such as Amazon Simple Storage Service (Amazon S3) or OpenAI. Asynchronous ingestion returns a task ID and runs asynchronously to ingest data offline into your k-NN cluster for neural search. You can use asynchronous batch ingestion together with the [Batch Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/batch-predict/) to perform inference asynchronously. The batch predict operation takes an input file containing documents and calls an externally hosted model to generate embeddings for those documents in an output file. You can then use asynchronous batch ingestion to ingest both the input file containing documents and the output file containing their embeddings into an OpenSearch index. + +As of OpenSearch 2.17, the Asynchronous Batch Ingestion API is supported by Amazon SageMaker, Amazon Bedrock, and OpenAI. +{: .note} + +## Prerequisites + +Before using asynchronous batch ingestion, you must generate text embeddings using a model of your choice and store the output on a file server, such as Amazon S3. For example, you can store the output of a Batch API call to an Amazon SageMaker text embedding model in a file with the Amazon S3 output path `s3://offlinebatch/output/sagemaker_batch.json.out`. The output is in JSONL format, with each line representing a text embedding result. The file contents have the following format: + +``` +{"SageMakerOutput":[[-0.017166402,0.055771016,...],[-0.06422759,-0.004301484,...],"content":["this is chapter 1","harry potter"],"id":1} +{"SageMakerOutput":[[-0.017455402,0.023771016,...],[-0.02322759,-0.009101284,...],"content":["this is chapter 2","draco malfoy"],"id":1} +... +``` + +## Ingesting data from a single file + +First, create a k-NN index into which you'll ingest the data. The fields in the k-NN index represent the structure of the data in the source file. + +In this example, the source file holds documents containing titles and chapters, along with their corresponding embeddings. 
Thus, you'll create a k-NN index with the fields `id`, `chapter_embedding`, `chapter`, `title_embedding`, and `title`: + +```json +PUT /my-nlp-index +{ + "settings": { + "index.knn": true + }, + "mappings": { + "properties": { + "id": { + "type": "text" + }, + "chapter_embedding": { + "type": "knn_vector", + "dimension": 384, + "method": { + "engine": "nmslib", + "space_type": "cosinesimil", + "name": "hnsw", + "parameters": { + "ef_construction": 512, + "m": 16 + } + } + }, + "chapter": { + "type": "text" + }, + "title_embedding": { + "type": "knn_vector", + "dimension": 384, + "method": { + "engine": "nmslib", + "space_type": "cosinesimil", + "name": "hnsw", + "parameters": { + "ef_construction": 512, + "m": 16 + } + } + }, + "title": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +When using an S3 file as the source for asynchronous batch ingestion, you must map the fields in the source file to fields in the index in order to indicate into which index each piece of data is ingested. If no JSON path is provided for a field, that field will be set to `null` in the k-NN index. + +In the `field_map`, indicate the location of the data for each field in the source file. You can also specify fields to be ingested directly into your index without making any changes to the source file by adding their JSON paths to the `ingest_fields` array. For example, in the following asynchronous batch ingestion request, the element with the JSON path `$.id` from the source file is ingested directly into the `id` field of your index. To ingest this data from the Amazon S3 file, send the following request to your OpenSearch endpoint: + +```json +POST /_plugins/_ml/_batch_ingestion +{ + "index_name": "my-nlp-index", + "field_map": { + "chapter": "$.content[0]", + "title": "$.content[1]", + "chapter_embedding": "$.SageMakerOutput[0]", + "title_embedding": "$.SageMakerOutput[1]", + "_id": "$.id" + }, + "ingest_fields": ["$.id"], + "credential": { + "region": "us-east-1", + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "data_source": { + "type": "s3", + "source": ["s3://offlinebatch/output/sagemaker_batch.json.out"] + } +} +``` +{% include copy-curl.html %} + +The response contains a task ID for the ingestion task: + +```json +{ + "task_id": "cbsPlpEBMHcagzGbOQOx", + "task_type": "BATCH_INGEST", + "status": "CREATED" +} +``` + +To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). Once ingestion is complete, the task `state` changes to `COMPLETED`. + + +## Ingesting data from multiple files + +You can also ingest data from multiple files by specifying the file locations in the `source`. The following example ingests data from three OpenAI files. + +The OpenAI Batch API input file is formatted as follows: + +``` +{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of life?", "The food was delicious and the waiter..."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of work?", "The travel was fantastic and the view..."]}} +{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of friend?", "The old friend was far away and the time..."]}} +... 
+``` + +The OpenAI Batch API output file is formatted as follows: + +``` +{"id": "batch_req_ITKQn29igorXCAGp6wzYs5IS", "custom_id": "request-1", "response": {"status_code": 200, "request_id": "10845755592510080d13054c3776aef4", "body": {"object": "list", "data": [{"object": "embedding", "index": 0, "embedding": [0.0044326545, ... ...]}, {"object": "embedding", "index": 1, "embedding": [0.002297497, ... ... ]}], "model": "text-embedding-ada-002", "usage": {"prompt_tokens": 15, "total_tokens": 15}}}, "error": null} +... +``` + +If you have run the Batch API in OpenAI for text embedding and want to ingest the model input and output files along with some metadata into your index, send the following asynchronous ingestion request. Make sure to use `source[file-index]` to identify the file's location in the source array in the request body. For example, `source[0]` refers to the first file in the `data_source.source` array. + +The following request ingests seven fields into your index: Five are specified in the `field_map` section and two are specified in `ingest_fields`. The format follows the pattern `sourcefile.jsonPath`, indicating the JSON path for each file. In the field_map, `$.body.input[0]` is used as the JSON path to ingest data into the `question` field from the second file in the `source` array. The `ingest_fields` array lists all elements from the `source` files that will be ingested directly into your index: + +```json +POST /_plugins/_ml/_batch_ingestion +{ + "index_name": "my-nlp-index-openai", + "field_map": { + "question": "source[1].$.body.input[0]", + "answer": "source[1].$.body.input[1]", + "question_embedding":"source[0].$.response.body.data[0].embedding", + "answer_embedding":"source[0].$.response.body.data[1].embedding", + "_id": ["source[0].$.custom_id", "source[1].$.custom_id"] + }, + "ingest_fields": ["source[2].$.custom_field1", "source[2].$.custom_field2"], + "credential": { + "openAI_key": "" + }, + "data_source": { + "type": "openAI", + "source": ["file-", "file-", "file-"] + } +} +``` +{% include copy-curl.html %} + +In the request, make sure to define the `_id` field in the `field_map`. This is necessary in order to map each data entry from the three separate files. + +The response contains a task ID for the ingestion task: + +```json +{ + "task_id": "cbsPlpEBMHcagzGbOQOx", + "task_type": "BATCH_INGEST", + "status": "CREATED" +} +``` + +To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). Once ingestion is complete, the task `state` changes to `COMPLETED`. + +For request field descriptions, see [Asynchronous Batch Ingestion API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/async-batch-ingest/). 
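For example, to check the status of the ingestion task shown in the preceding example response, you can call the Tasks API with the returned task ID:

```json
GET /_plugins/_ml/tasks/cbsPlpEBMHcagzGbOQOx
```
{% include copy-curl.html %}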
\ No newline at end of file diff --git a/_ml-commons-plugin/remote-models/connectors.md b/_ml-commons-plugin/remote-models/connectors.md index 3ec6c73b07..788f1b003d 100644 --- a/_ml-commons-plugin/remote-models/connectors.md +++ b/_ml-commons-plugin/remote-models/connectors.md @@ -294,7 +294,7 @@ In some cases, you may need to update credentials, like `access_key`, that you u ```json PUT /_plugins/_ml/models/ { - "connector": { + "connectors": { "credential": { "openAI_key": "YOUR NEW OPENAI KEY" } diff --git a/_ml-commons-plugin/tutorials/generate-embeddings.md b/_ml-commons-plugin/tutorials/generate-embeddings.md index 92b62b9fe8..c236424eb8 100644 --- a/_ml-commons-plugin/tutorials/generate-embeddings.md +++ b/_ml-commons-plugin/tutorials/generate-embeddings.md @@ -107,7 +107,7 @@ PUT _ingest/pipeline/bedrock_embedding_pipeline { "set": { "field": "title_tmp", - "value": "{{_ingest._value.title}}" + "value": {% raw %}"{{_ingest._value.title}}"{% endraw %} } }, { @@ -333,4 +333,4 @@ POST _bulk { "index" : { "_index" : "my_books" } } { "books" : [{"title": "third book", "description": "This is third book"}, {"description": "This is fourth book"}] } ``` -{% include copy-curl.html %} \ No newline at end of file +{% include copy-curl.html %} diff --git a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md b/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md index 7061d3cb5a..c4cc27f660 100644 --- a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md +++ b/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md @@ -7,7 +7,7 @@ nav_order: 10 # Semantic search using byte-quantized vectors -This tutorial illustrates how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#lucene-byte-vector). +This tutorial shows you how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors). The Cohere Embed v3 model supports several `embedding_types`. For this tutorial, you'll use the `INT8` type to encode byte-quantized vectors. diff --git a/_monitoring-your-cluster/pa/index.md b/_monitoring-your-cluster/pa/index.md index bb4f9c6c30..156e985e8b 100644 --- a/_monitoring-your-cluster/pa/index.md +++ b/_monitoring-your-cluster/pa/index.md @@ -60,7 +60,7 @@ private-key-file-path = specify_path The Performance Analyzer plugin is included in the installations for [Docker]({{site.url}}{{site.baseurl}}/opensearch/install/docker/) and [tarball]({{site.url}}{{site.baseurl}}/opensearch/install/tar/), but you can also install the plugin manually. -To install the Performance Analyzer plugin manually, download the plugin from [Maven](https://search.maven.org/search?q=org.opensearch.plugin) and install it using the standard [plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) process. Performance Analyzer runs on each node in a cluster. +To install the Performance Analyzer plugin manually, download the plugin from [Maven](https://central.sonatype.com/namespace/org.opensearch.plugin) and install it using the standard [plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) process. 
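For example, on a tarball installation you might run a command similar to the following from the OpenSearch home directory; the plugin identifier shown assumes the standard name of the official Performance Analyzer plugin:

```bash
./bin/opensearch-plugin install opensearch-performance-analyzer
```
{% include copy.html %}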
Performance Analyzer runs on each node in a cluster. To start the Performance Analyzer root cause analysis (RCA) agent on a tarball installation, run the following command: diff --git a/_monitoring-your-cluster/pa/rca/shard-hotspot.md b/_monitoring-your-cluster/pa/rca/shard-hotspot.md index 18a6804410..889658716a 100644 --- a/_monitoring-your-cluster/pa/rca/shard-hotspot.md +++ b/_monitoring-your-cluster/pa/rca/shard-hotspot.md @@ -78,7 +78,7 @@ The response contains a list of unhealthy shards: }] ``` -## Response fields +## Response body fields The following table lists the response fields. diff --git a/_observing-your-data/ad/dashboards-anomaly-detection.md b/_observing-your-data/ad/dashboards-anomaly-detection.md index 679237094a..ad6fa5950b 100644 --- a/_observing-your-data/ad/dashboards-anomaly-detection.md +++ b/_observing-your-data/ad/dashboards-anomaly-detection.md @@ -18,12 +18,12 @@ You can connect data visualizations to OpenSearch datasets and then create, run, Before getting started, you must have: - Installed OpenSearch and OpenSearch Dashboards version 2.9 or later. See [Installing OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/). -- Installed the Anomaly Detection plugin version 2.9 or later. See [Installing OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins). +- Installed the Anomaly Detection plugin version 2.9 or later. See [Installing OpenSearch plugins]/({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). - Installed the Anomaly Detection Dashboards plugin version 2.9 or later. See [Managing OpenSearch Dashboards plugins]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/plugins/) to get started. ## General requirements for anomaly detection visualizations -Anomaly detection visualizations are displayed as time-series charts that give you a snapshot of when anomalies have occurred from different anomaly detectors you have configured for the visualization. You can display up to 10 metrics on your chart, and each series can be shown as a line on the chart. Note that only real-time anomalies will be visible on the chart. For more information on real-time and historical anomaly detection, see [Anomaly detection, Step 3: Set up detector jobs]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-3-set-up-detector-jobs). +Anomaly detection visualizations are displayed as time-series charts that give you a snapshot of when anomalies have occurred from different anomaly detectors you have configured for the visualization. You can display up to 10 metrics on your chart, and each series can be shown as a line on the chart. Note that only real-time anomalies will be visible on the chart. For more information about real-time and historical anomaly detection, see [Anomaly detection, Step 3: Set up detector jobs]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-3-setting-up-detector-jobs). Keep in mind the following requirements when setting up or creating anomaly detection visualizations. The visualization: diff --git a/_observing-your-data/ad/index.md b/_observing-your-data/ad/index.md index 5dfa1b8f1a..657c3c90cb 100644 --- a/_observing-your-data/ad/index.md +++ b/_observing-your-data/ad/index.md @@ -10,30 +10,42 @@ redirect_from: # Anomaly detection -An anomaly in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. 
For example, for IT infrastructure data, an anomaly in the memory usage metric might help you uncover early signs of a system failure. +An _anomaly_ in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. For example, for IT infrastructure data, an anomaly in the memory usage metric can help identify early signs of a system failure. -It can be challenging to discover anomalies using conventional methods such as creating visualizations and dashboards. You could configure an alert based on a static threshold, but this requires prior domain knowledge and isn't adaptive to data that exhibits organic growth or seasonal behavior. +Conventional techniques like visualizations and dashboards can make it difficult to uncover anomalies. Configuring alerts based on static thresholds is possible, but this approach requires prior domain knowledge and may not adapt to data with organic growth or seasonal trends. -Anomaly detection automatically detects anomalies in your OpenSearch data in near real-time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an `anomaly grade` and `confidence score` value for each incoming data point. These values are used to differentiate an anomaly from normal variations. For more information about how RCF works, see [Random Cut Forests](https://www.semanticscholar.org/paper/Robust-Random-Cut-Forest-Based-Anomaly-Detection-on-Guha-Mishra/ecb365ef9b67cd5540cc4c53035a6a7bd88678f9). +Anomaly detection automatically detects anomalies in your OpenSearch data in near real time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an _anomaly grade_ and _confidence score_ value for each incoming data point. These values are used to differentiate an anomaly from normal variations. For more information about how RCF works, see [Robust Random Cut Forest Based Anomaly Detection on Streams](https://www.semanticscholar.org/paper/Robust-Random-Cut-Forest-Based-Anomaly-Detection-on-Guha-Mishra/ecb365ef9b67cd5540cc4c53035a6a7bd88678f9). You can pair the Anomaly Detection plugin with the [Alerting plugin]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/) to notify you as soon as an anomaly is detected. +{: .note} -To get started, choose **Anomaly Detection** in OpenSearch Dashboards. -To first test with sample streaming data, you can try out one of the preconfigured detectors with one of the sample datasets. +## Getting started with anomaly detection in OpenSearch Dashboards + +To get started, go to **OpenSearch Dashboards** > **OpenSearch Plugins** > **Anomaly Detection**. ## Step 1: Define a detector -A detector is an individual anomaly detection task. You can define multiple detectors, and all the detectors can run simultaneously, with each analyzing data from different sources. +A _detector_ is an individual anomaly detection task. You can define multiple detectors, and all detectors can run simultaneously, with each analyzing data from different sources. You can define a detector by following these steps: + +1. On the **Anomaly detection** page, select the **Create detector** button. +2. On the **Define detector** page, enter the required information in the **Detector details** pane. +3. In the **Select data** pane, specify the data source by choosing a source from the **Index** dropdown menu. 
You can choose an index, index patterns, or an alias. +4. (Optional) Filter the data source by selecting **Add data filter** and then entering the conditions for **Field**, **Operator**, and **Value**. Alternatively, you can choose **Use query DSL** and add your JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL). +#### Example: Filtering data using query DSL + +The following example query retrieves documents in which the `urlPath.keyword` field matches any of the specified values: +======= 1. Choose **Create detector**. 1. Add in the detector details. - Enter a name and brief description. Make sure the name is unique and descriptive enough to help you to identify the purpose of the detector. 1. Specify the data source. - - For **Data source**, choose the index you want to use as the data source. You can optionally use index patterns to choose multiple indexes. + - For **Data source**, choose one or more indexes to use as the data source. Alternatively, you can use an alias or index pattern to choose multiple indexes. + - Detectors can use remote indexes. You can access them using the `cluster-name:index-name` pattern. See [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/) for more information. Alternatively, you can select clusters and indexes in OpenSearch Dashboards 2.17 or later. To learn about configuring remote indexes with the Security plugin enabled, see [Selecting remote indexes with fine-grained access control]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/#selecting-remote-indexes-with-fine-grained-access-control) in the [Anomaly detection security](observing-your-data/ad/security/) documentation. - (Optional) For **Data filter**, filter the index you chose as the data source. From the **Data filter** menu, choose **Add data filter**, and then design your filter query by selecting **Field**, **Operator**, and **Value**, or choose **Use query DSL** and add your own JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL). -#### Example filter using query DSL -The query is designed to retrieve documents in which the `urlPath.keyword` field matches one of the following specified values: +To create a cross-cluster detector in OpenSearch Dashboards, the following [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) are required: `indices:data/read/field_caps`, `indices:admin/resolve/index`, and `cluster:monitor/remote/info`. +{: .note} - /domain/{id}/short - /sub_dir/{id}/short @@ -62,40 +74,38 @@ The query is designed to retrieve documents in which the `urlPath.keyword` field } } ``` + {% include copy-curl.html %} -1. Specify a timestamp. - - Select the **Timestamp field** in your index. -1. Define operation settings. - - For **Operation settings**, define the **Detector interval**, which is the time interval at which the detector collects data. - - The detector aggregates the data in this interval, then feeds the aggregated result into the anomaly detection model. - The shorter you set this interval, the fewer data points the detector aggregates. - The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process needs a certain number of aggregated data points from contiguous intervals. 
- - - We recommend setting the detector interval based on your actual data. If it's too long it might delay the results, and if it's too short it might miss some data. It also won't have a sufficient number of consecutive data points for the shingle process. +5. In the **Timestamp** pane, select a field from the **Timestamp field** dropdown menu. - - (Optional) To add extra processing time for data collection, specify a **Window delay** value. +6. In the **Operation settings** pane, define the **Detector interval**, which is the interval at which the detector collects data. + - The detector aggregates the data at this interval and then feeds the aggregated result into the anomaly detection model. The shorter the interval, the fewer data points the detector aggregates. The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process requires a certain number of aggregated data points from contiguous intervals. + - You should set the detector interval based on your actual data. If the detector interval is too long, then it might delay the results. If the detector interval is too short, then it might miss some data. The detector interval also will not have a sufficient number of consecutive data points for the shingle process. + - (Optional) To add extra processing time for data collection, specify a **Window delay** value. - This value tells the detector that the data is not ingested into OpenSearch in real time but with a certain delay. Set the window delay to shift the detector interval to account for this delay. - - For example, say the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time. -1. Specify custom results index. - - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. To enable this, select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, like `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored. + - For example, the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time. + - To avoid missing any data, set the **Window delay** to the upper limit of the expected ingestion delay. This ensures that the detector captures all data during its interval, reducing the risk of missing relevant information. 
While a longer window delay helps capture all data, too long of a window delay can hinder real-time anomaly detection because the detector will look further back in time. Find a balance to maintain both data accuracy and timely detection. - You can use the dash “-” sign to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the "financial" department at a granular level for the "us" area. +7. Specify a custom results index. + - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. Select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, such as `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored. + + You can use `-` to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the `financial` department at a granular level for the `us` group. {: .note } - When the Security plugin (fine-grained access control) is enabled, the default results index becomes a system index and is no longer accessible through the standard Index or Search APIs. To access its content, you must use the Anomaly Detection RESTful API or the dashboard. As a result, you cannot build customized dashboards using the default results index if the Security plugin is enabled. However, you can create a custom results index in order to build customized dashboards. - If the custom index you specify does not exist, the Anomaly Detection plugin will create it when you create the detector and start your real-time or historical analysis. - If the custom index already exists, the plugin will verify that the index mapping matches the required structure for anomaly results. In this case, ensure that the custom index has a valid mapping as defined in the [`anomaly-results.json`](https://github.com/opensearch-project/anomaly-detection/blob/main/src/main/resources/mappings/anomaly-results.json) file. - - To use the custom results index option, you need the following permissions: - - `indices:admin/create` - The Anomaly Detection plugin requires the ability to create and roll over the custom index. - - `indices:admin/aliases` - The Anomaly Detection plugin requires access to create and manage an alias for the custom index. - - `indices:data/write/index` - You need the `write` permission for the Anomaly Detection plugin to write results into the custom index for a single-entity detector. - - `indices:data/read/search` - You need the `search` permission because the Anomaly Detection plugin needs to search custom results indexes to show results on the Anomaly Detection UI. - - `indices:data/write/delete` - Because the detector might generate a large number of anomaly results, you need the `delete` permission to delete old data and save disk space. 
- - `indices:data/write/bulk*` - You need the `bulk*` permission because the Anomaly Detection plugin uses the bulk API to write results into the custom index. - - Managing the custom results index: - - The anomaly detection dashboard queries all detectors’ results from all custom results indexes. Having too many custom results indexes might impact the performance of the Anomaly Detection plugin. - - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to rollover old results indexes. You can also manually delete or archive any old results indexes. We recommend reusing a custom results index for multiple detectors. - - The Anomaly Detection plugin also provides lifecycle management for custom indexes. It rolls an alias over to a new index when the custom results index meets any of the conditions in the following table. + - To use the custom results index option, you must have the following permissions: + - `indices:admin/create` -- The `create` permission is required in order to create and roll over the custom index. + - `indices:admin/aliases` -- The `aliases` permission is required in order to create and manage an alias for the custom index. + - `indices:data/write/index` -- The `write` permission is required in order to write results into the custom index for a single-entity detector. + - `indices:data/read/search` -- The `search` permission is required in order to search custom results indexes to show results on the Anomaly Detection interface. + - `indices:data/write/delete` -- The detector may generate many anomaly results. The `delete` permission is required in order to delete old data and save disk space. + - `indices:data/write/bulk*` -- The `bulk*` permission is required because the plugin uses the Bulk API to write results into the custom index. + - When managing the custom results index, consider the following: + - The anomaly detection dashboard queries all detector results from all custom results indexes. Having too many custom results indexes can impact the plugin's performance. + - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to roll over old results indexes. You can also manually delete or archive any old results indexes. Reusing a custom results index for multiple detectors is recommended. + - The plugin provides lifecycle management for custom indexes. It rolls over an alias to a new index when the custom results index meets any of the conditions in the following table. Parameter | Description | Type | Unit | Example | Required :--- | :--- |:--- |:--- |:--- |:--- @@ -103,43 +113,52 @@ The query is designed to retrieve documents in which the `urlPath.keyword` field `result_index_min_age` | The minimum index age required for rollover, calculated from its creation time to the current time. | `integer` |`day` | `7` | No `result_index_ttl` | The minimum age required to permanently delete rolled-over indexes. | `integer` | `day` | `60` | No -1. Choose **Next**. +8. Choose **Next**. After you define the detector, the next step is to configure the model. ## Step 2: Configure the model -#### Add features to your detector +1. Add features to your detector. -A feature is the field in your index that you want to check for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly. 
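When you choose the **Custom expression** option described later in this step, the JSON you enter is a standard OpenSearch aggregation body. The following is a minimal sketch that defines a feature as the maximum of a hypothetical numeric field named `processing_bytes`:

```json
{
  "max_processing_bytes": {
    "max": {
      "field": "processing_bytes"
    }
  }
}
```

The top-level key (`max_processing_bytes` in this sketch) names the aggregation, and the detector uses the aggregation's result for each interval as the feature value.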
+A _feature_ is any field in your index that you want to analyze for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly. For example, if you choose `min()`, the detector focuses on finding anomalies based on the minimum values of your feature. If you choose `average()`, the detector finds anomalies based on the average values of your feature. -A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely for multi-feature models to identify smaller anomalies as compared to a single-feature model. Adding more features might negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data might further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is 5. You can adjust this limit with the `plugins.anomaly_detection.max_anomaly_features` setting. -{: .note } +A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely that multi-feature models will identify smaller anomalies as compared to a single-feature model. Adding more features can negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data can further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is `5`. You can adjust this limit using the `plugins.anomaly_detection.max_anomaly_features` setting. +{: .note} + +### Configuring a model based on an aggregation method To configure an anomaly detection model based on an aggregation method, follow these steps: -1. On the **Configure Model** page, enter the **Feature name** and check **Enable feature**. -1. For **Find anomalies based on**, select **Field Value**. -1. For **aggregation method**, select either **average()**, **count()**, **sum()**, **min()**, or **max()**. -1. For **Field**, select from the available options. +1. On the **Detectors** page, select the desired detector from the list. +2. On the detector's details page, select the **Actions** button to activate the dropdown menu and then select **Edit model configuration**. +3. On the **Edit model configuration** page, select the **Add another feature** button. +4. Enter a name in the **Feature name** field and select the **Enable feature** checkbox. +5. Select **Field value** from the dropdown menu under **Find anomalies based on**. +6. Select the desired aggregation from the dropdown menu under **Aggregation method**. +7. Select the desired field from the options listed in the dropdown menu under **Field**. +8. Select the **Save changes** button. + +### Configuring a model based on a JSON aggregation query To configure an anomaly detection model based on a JSON aggregation query, follow these steps: -1. On the **Configure Model** page, enter the **Feature name** and check **Enable feature**. -1. For **Find anomalies based on**, select **Custom expression**. You will see the JSON editor window open up. -1. 
Enter your JSON aggregation query in the editor. -For acceptable JSON query syntax, see [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) -{: .note } +1. On the **Edit model configuration** page, select the **Add another feature** button. +2. Enter a name in the **Feature name** field and select the **Enable feature** checkbox. +3. Select **Custom expression** from the dropdown menu under **Find anomalies based on**. The JSON editor window will open. +4. Enter your JSON aggregation query in the editor. +5. Select the **Save changes** button. -#### (Optional) Set category fields for high cardinality +For acceptable JSON query syntax, see [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/). +{: .note} -You can categorize anomalies based on a keyword or IP field type. +### Setting categorical fields for high cardinality -The category field categorizes or slices the source time series with a dimension like IP addresses, product IDs, country codes, and so on. This helps to see a granular view of anomalies within each entity of the category field to isolate and debug issues. +You can categorize anomalies based on a keyword or IP field type. You can enable the **Categorical fields** option to categorize, or "slice," the source time series using a dimension, such as an IP address, a product ID, or a country code. This gives you a granular view of anomalies within each entity of the category field to help isolate and debug issues. -To set a category field, choose **Enable a category field** and select a field. You can’t change the category fields after you create the detector. +To set a category field, choose **Enable categorical fields** and select a field. You cannot change the category fields after you create the detector. Only a certain number of unique entities are supported in the category field. Use the following equation to calculate the recommended total number of entities supported in a cluster: @@ -147,7 +166,7 @@ Only a certain number of unique entities are supported in the category field. Us (data nodes * heap size * anomaly detection maximum memory percentage) / (entity model size of a detector) ``` -To get the entity model size of a detector, use the [profile detector API]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api/#profile-detector). You can adjust the maximum memory percentage with the `plugins.anomaly_detection.model_max_size_percent` setting. +To get the detector's entity model size, use the [Profile Detector API]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api/#profile-detector). You can adjust the maximum memory percentage using the `plugins.anomaly_detection.model_max_size_percent` setting. Consider a cluster with 3 data nodes, each with 8 GB of JVM heap size and the default 10% memory allocation. With an entity model size of 1 MB, the following formula calculates the estimated number of unique entities: @@ -155,81 +174,109 @@ Consider a cluster with 3 data nodes, each with 8 GB of JVM heap size and the de (8096 MB * 0.1 / 1 MB ) * 3 = 2429 ``` -If the actual total number of unique entities is higher than the number that you calculate (in this case, 2,429), the anomaly detector will attempt to model the extra entities. The detector prioritizes entities that occur more often and are more recent. +If the actual total number of unique entities is higher than the number that you calculate (in this case, 2,429), then the anomaly detector attempts to model the extra entities. 
The detector prioritizes both entities that occur more often and are more recent. -This formula serves as a starting point. Make sure to test it with a representative workload. You can find more information in the [Improving Anomaly Detection: One million entities in one minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/) blog post. +This formula serves as a starting point. Make sure to test it with a representative workload. See the OpenSearch blog post [Improving Anomaly Detection: One million entities in one minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/) for more information. {: .note } -#### (Advanced settings) Set a shingle size +### Setting a shingle size -Set the number of aggregation intervals from your data stream to consider in a detection window. It’s best to choose this value based on your actual data to see which one leads to the best results for your use case. +In the **Advanced settings** pane, you can set the number of data stream aggregation intervals to include in the detection window. Choose this value based on your actual data to find the optimal setting for your use case. To set the shingle size, select **Show** in the **Advanced settings** pane. Enter the desired size in the **intervals** field. -The anomaly detector expects the shingle size to be in the range of 1 and 60. The default shingle size is 8. We recommend that you don't choose 1 unless you have two or more features. Smaller values might increase [recall](https://en.wikipedia.org/wiki/Precision_and_recall) but also false positives. Larger values might be useful for ignoring noise in a signal. +The anomaly detector requires the shingle size to be between 1 and 128. The default is `8`. Use `1` only if you have at least two features. Values of less than `8` may increase [recall](https://en.wikipedia.org/wiki/Precision_and_recall) but also may increase false positives. Values greater than `8` may be useful for ignoring noise in a signal. -#### Preview sample anomalies +### Setting an imputation option -Preview sample anomalies and adjust the feature settings if needed. -For sample previews, the Anomaly Detection plugin selects a small number of data samples---for example, one data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. It loads this sample dataset into the detector. The detector uses this sample dataset to generate a sample preview of anomaly results. +In the **Advanced settings** pane, you can set the imputation option. This allows you to manage missing data in your streams. The options include the following: -Examine the sample preview and use it to fine-tune your feature configurations (for example, enable or disable features) to get more accurate results. +- **Ignore Missing Data (Default):** The system continues without considering missing data points, keeping the existing data flow. +- **Fill with Custom Values:** Specify a custom value for each feature to replace missing data points, allowing for targeted imputation tailored to your data. +- **Fill with Zeros:** Replace missing values with zeros. This is ideal when the absence of data indicates a significant event, such as a drop to zero in event counts. +- **Use Previous Values:** Fill gaps with the last observed value to maintain continuity in your time-series data. This method treats missing data as non-anomalous, carrying forward the previous trend. -1. Choose **Preview sample anomalies**. 
- - If you don't see any sample anomaly result, check the detector interval and make sure you have more than 400 data points for some entities during the preview date range. -1. Choose **Next**. +Using these options can improve recall in anomaly detection. For instance, if you are monitoring for drops in event counts, including both partial and complete drops, then filling missing values with zeros helps detect significant data absences, improving detection recall. + +Be cautious when imputing extensively missing data, as excessive gaps can compromise model accuracy. Quality input is critical---poor data quality leads to poor model performance. The confidence score also decreases when imputations occur. You can check whether a feature value has been imputed using the `feature_imputed` field in the anomaly results index. See [Anomaly result mapping]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/result-mapping/) for more information. +{: note} + +### Suppressing anomalies with threshold-based rules + +In the **Advanced settings** pane, you can suppress anomalies by setting rules that define acceptable differences between the expected and actual values, either as an absolute value or a relative percentage. This helps reduce false anomalies caused by minor fluctuations, allowing you to focus on significant deviations. + +Suppose you want to detect substantial changes in log volume while ignoring small variations that are not meaningful. Without customized settings, the system might generate false alerts for minor changes, making it difficult to identify true anomalies. By setting suppression rules, you can ignore minor deviations and focus on real anomalous patterns. + +To suppress anomalies for deviations of less than 30% from the expected value, you can set the following rules: -## Step 3: Set up detector jobs +``` +Ignore anomalies for feature logVolume when the actual value is no more than 30% above the expected value. +Ignore anomalies for feature logVolume when the actual value is no more than 30% below the expected value. +``` + +Ensure that a feature, for example, `logVolume`, is properly defined in your model. Suppression rules are tied to specific features. +{: .note} + +If you expect that the log volume should differ by at least 10,000 from the expected value before being considered an anomaly, you can set absolute thresholds: -To start a real-time detector to find anomalies in your data in near real-time, check **Start real-time detector automatically (recommended)**. +``` +Ignore anomalies for feature logVolume when the actual value is no more than 10000 above the expected value. +Ignore anomalies for feature logVolume when the actual value is no more than 10000 below the expected value. +``` -Alternatively, if you want to perform historical analysis and find patterns in long historical data windows (weeks or months), check **Run historical analysis detection** and select a date range (at least 128 detection intervals). +If no custom suppression rules are set, then the system defaults to a filter that ignores anomalies with deviations of less than 20% from the expected value for each enabled feature. -Analyzing historical data helps you get familiar with the Anomaly Detection plugin. You can also evaluate the performance of a detector with historical data to further fine-tune it. +### Previewing sample anomalies -We recommend experimenting with historical analysis with different feature sets and checking the precision before moving on to real-time detectors. 
+You can preview anomalies based on sample feature input and adjust the feature settings as needed. The Anomaly Detection plugin selects a small number of data samples---for example, 1 data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. The sample dataset is loaded into the detector, which then uses the sample dataset to generate a preview of the anomalies. + +1. Choose **Preview sample anomalies**. + - If sample anomaly results are not displayed, check the detector interval to verify that 400 or more data points are set for the entities during the preview date range. +2. Select the **Next** button. -## Step 4: Review and create +## Step 3: Setting up detector jobs -Review your detector settings and model configurations to make sure that they're valid and then select **Create detector**. +To start a detector to find anomalies in your data in near real time, select **Start real-time detector automatically (recommended)**. -![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/review_ad.png) +Alternatively, if you want to perform historical analysis and find patterns in longer historical data windows (weeks or months), select the **Run historical analysis detection** box and select a date range of at least 128 detection intervals. -If you see any validation errors, edit the settings to fix the errors and then return back to this page. +Analyzing historical data can help to familiarize you with the Anomaly Detection plugin. For example, you can evaluate the performance of a detector against historical data in order to fine-tune it. + +You can experiment with historical analysis by using different feature sets and checking the precision before using real-time detectors. + +## Step 4: Reviewing detector settings + +Review your detector settings and model configurations to confirm that they are valid and then select **Create detector**. + +If a validation error occurs, edit the settings to correct the error and return to the detector page. {: .note } -## Step 5: Observe the results +## Step 5: Observing the results -Choose the **Real-time results** or **Historical analysis** tab. For real-time results, you need to wait for some time to see the anomaly results. If the detector interval is 10 minutes, the detector might take more than an hour to start, because its waiting for sufficient data to generate anomalies. +Choose either the **Real-time results** or **Historical analysis** tab. For real-time results, it will take some time to display the anomaly results. For example, if the detector interval is 10 minutes, then the detector may take an hour to initiate because it is waiting for sufficient data to be able to generate anomalies. -A shorter interval means the model passes the shingle process more quickly and starts to generate the anomaly results sooner. -Use the [profile detector]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api#profile-detector) operation to make sure you have sufficient data points. +A shorter interval results in the model passing the shingle process more quickly and generating anomaly results sooner. You can use the [profile detector]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api#profile-detector) operation to ensure that you have enough data points. -If you see the detector pending in "initialization" for longer than a day, aggregate your existing data using the detector interval to check for any missing data points. 
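As a minimal sketch of that check, the following profile request, assuming a placeholder detector ID, returns the detector's initialization progress:

```json
GET _plugins/_anomaly_detection/detectors/<detectorId>/_profile/init_progress
```
{% include copy-curl.html %}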
If you find a lot of missing data points from the aggregated data, consider increasing the detector interval. +If the detector is pending in "initialization" for longer than 1 day, aggregate your existing data and use the detector interval to check for any missing data points. If you find many missing data points, consider increasing the detector interval. -Choose and drag over the anomaly line chart to zoom in and see a more detailed view of an anomaly. +Click and drag over the anomaly line chart to zoom in and see a detailed view of an anomaly. {: .note } -Analyze anomalies with the following visualizations: +You can analyze anomalies using the following visualizations: -- **Live anomalies** (for real-time results) displays live anomaly results for the last 60 intervals. For example, if the interval is 10, it shows results for the last 600 minutes. The chart refreshes every 30 seconds. -- **Anomaly overview** (for real-time results) / **Anomaly history** (for historical analysis in the **Historical analysis** tab) plots the anomaly grade with the corresponding measure of confidence. This pane includes: +- **Live anomalies** (for real-time results) displays live anomaly results for the last 60 intervals. For example, if the interval is `10`, it shows results for the last 600 minutes. The chart refreshes every 30 seconds. +- **Anomaly overview** (for real-time results) or **Anomaly history** (for historical analysis on the **Historical analysis** tab) plot the anomaly grade with the corresponding measure of confidence. The pane includes: - The number of anomaly occurrences based on the given data-time range. - - The **Average anomaly grade**, a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of 0 represents “not an anomaly,” and a non-zero value represents the relative severity of the anomaly. + - The **Average anomaly grade**, a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of `0` represents "not an anomaly," and a non-zero value represents the relative severity of the anomaly. - **Confidence** estimate of the probability that the reported anomaly grade matches the expected anomaly grade. Confidence increases as the model observes more data and learns the data behavior and trends. Note that confidence is distinct from model accuracy. - **Last anomaly occurrence** is the time at which the last anomaly occurred. -Underneath **Anomaly overview**/**Anomaly history** are: +Underneath **Anomaly overview** or **Anomaly history** are: - **Feature breakdown** plots the features based on the aggregation method. You can vary the date-time range of the detector. Selecting a point on the feature line chart shows the **Feature output**, the number of times a field appears in your index, and the **Expected value**, a predicted value for the feature output. Where there is no anomaly, the output and expected values are equal. - ![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/feature-contribution-ad.png) - - **Anomaly occurrences** shows the `Start time`, `End time`, `Data confidence`, and `Anomaly grade` for each detected anomaly. Selecting a point on the anomaly line chart shows **Feature Contribution**, the percentage of a feature that contributes to the anomaly -![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/feature-contribution-ad.png) - - If you set the category field, you see an additional **Heat map** chart. The heat map correlates results for anomalous entities. 
This chart is empty until you select an anomalous entity. You also see the anomaly and feature line chart for the time period of the anomaly (`anomaly_grade` > 0). @@ -249,7 +296,7 @@ To see all the configuration settings for a detector, choose the **Detector conf 1. To make any changes to the detector configuration, or fine tune the time interval to minimize any false positives, go to the **Detector configuration** section and choose **Edit**. - You need to stop real-time and historical analysis to change its configuration. Confirm that you want to stop the detector and proceed. -1. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**. +2. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**. ## Step 8: Manage your detectors diff --git a/_observing-your-data/ad/result-mapping.md b/_observing-your-data/ad/result-mapping.md index 7e1482a013..967b185684 100644 --- a/_observing-your-data/ad/result-mapping.md +++ b/_observing-your-data/ad/result-mapping.md @@ -9,9 +9,7 @@ redirect_from: # Anomaly result mapping -If you enabled custom result index, the anomaly detection plugin stores the results in your own index. - -If the anomaly detector doesn’t detect an anomaly, the result has the following format: +When you select the **Enable custom result index** box on the **Custom result index** pane, the Anomaly Detection plugin will save the results to an index of your choosing. When the anomaly detector does not detect an anomaly, the result format is as follows: ```json { @@ -61,6 +59,7 @@ If the anomaly detector doesn’t detect an anomaly, the result has the followin "threshold": 1.2368549346675202 } ``` +{% include copy-curl.html %} ## Response body fields @@ -80,7 +79,83 @@ Field | Description `model_id` | A unique ID that identifies a model. If a detector is a single-stream detector (with no category field), it has only one model. If a detector is a high-cardinality detector (with one or more category fields), it might have multiple models, one for each entity. `threshold` | One of the criteria for a detector to classify a data point as an anomaly is that its `anomaly_score` must surpass a dynamic threshold. This field records the current threshold. -If an anomaly detector detects an anomaly, the result has the following format: +When the imputation option is enabled, the anomaly results include a `feature_imputed` array showing which features were modified due to missing data. If no features were imputed, then this is excluded. 
+ +In the following example anomaly result output, the `processing_bytes_max` feature was imputed, as shown by the `imputed: true` status: + +```json +{ + "detector_id": "kzcZ43wBgEQAbjDnhzGF", + "schema_version": 5, + "data_start_time": 1635898161367, + "data_end_time": 1635898221367, + "feature_data": [ + { + "feature_id": "processing_bytes_max", + "feature_name": "processing bytes max", + "data": 2322 + }, + { + "feature_id": "processing_bytes_avg", + "feature_name": "processing bytes avg", + "data": 1718.6666666666667 + }, + { + "feature_id": "processing_bytes_min", + "feature_name": "processing bytes min", + "data": 1375 + }, + { + "feature_id": "processing_bytes_sum", + "feature_name": "processing bytes sum", + "data": 5156 + }, + { + "feature_id": "processing_time_max", + "feature_name": "processing time max", + "data": 31198 + } + ], + "execution_start_time": 1635898231577, + "execution_end_time": 1635898231622, + "anomaly_score": 1.8124904404395776, + "anomaly_grade": 0, + "confidence": 0.9802940756605277, + "entity": [ + { + "name": "process_name", + "value": "process_3" + } + ], + "model_id": "kzcZ43wBgEQAbjDnhzGF_entity_process_3", + "threshold": 1.2368549346675202, + "feature_imputed": [ + { + "feature_id": "processing_bytes_max", + "imputed": true + }, + { + "feature_id": "processing_bytes_avg", + "imputed": false + }, + { + "feature_id": "processing_bytes_min", + "imputed": false + }, + { + "feature_id": "processing_bytes_sum", + "imputed": false + }, + { + "feature_id": "processing_time_max", + "imputed": false + } + ] +} +``` +{% include copy-curl.html %} + +When an anomaly is detected, the result is provided in the following format: ```json { @@ -179,24 +254,23 @@ If an anomaly detector detects an anomaly, the result has the following format: "execution_start_time": 1635898427803 } ``` +{% include copy-curl.html %} -You can see the following additional fields: +Note that the result includes the following additional field. Field | Description :--- | :--- `relevant_attribution` | Represents the contribution of each input variable. The sum of the attributions is normalized to 1. `expected_values` | The expected value for each feature. -At times, the detector might detect an anomaly late. -Let's say the detector sees a random mix of the triples {1, 2, 3} and {2, 4, 5} that correspond to `slow weeks` and `busy weeks`, respectively. For example 1, 2, 3, 1, 2, 3, 2, 4, 5, 1, 2, 3, 2, 4, 5, ... and so on. -If the detector comes across a pattern {2, 2, X} and it's yet to see X, the detector infers that the pattern is anomalous, but it can't determine at this point which of the 2's is the cause. If X = 3, then the detector knows it's the first 2 in that unfinished triple, and if X = 5, then it's the second 2. If it's the first 2, then the detector detects the anomaly late. +The detector may be late in detecting an anomaly. For example: The detector observes a sequence of data that alternates between "slow weeks" (represented by the triples {1, 2, 3}) and "busy weeks" (represented by the triples {2, 4, 5}). If the detector comes across a pattern {2, 2, X}, where it has not yet seen the value that X will take, then the detector infers that the pattern is anomalous. However, it cannot determine which 2 is the cause. If X = 3, then the first 2 is the anomaly. If X = 5, then the second 2 is the anomaly. If it is the first 2, then the detector will be late in detecting the anomaly. 
-If a detector detects an anomaly late, the result has the following additional fields: +When a detector is late in detecting an anomaly, the result includes the following additional fields. Field | Description :--- | :--- -`past_values` | The actual input that triggered an anomaly. If `past_values` is null, the attributions or expected values are from the current input. If `past_values` is not null, the attributions or expected values are from a past input (for example, the previous two steps of the data [1,2,3]). -`approx_anomaly_start_time` | The approximate time of the actual input that triggers an anomaly. This field helps you understand when a detector flags an anomaly. Both single-stream and high-cardinality detectors don't query previous anomaly results because these queries are expensive operations. The cost is especially high for high-cardinality detectors that might have a lot of entities. If the data is not continuous, the accuracy of this field is low and the actual time that the detector detects an anomaly can be earlier. +`past_values` | The actual input that triggered an anomaly. If `past_values` is `null`, then the attributions or expected values are from the current input. If `past_values` is not `null`, then the attributions or expected values are from a past input (for example, the previous two steps of the data [1,2,3]). +`approx_anomaly_start_time` | The approximate time of the actual input that triggered an anomaly. This field helps you understand the time at which a detector flags an anomaly. Both single-stream and high-cardinality detectors do not query previous anomaly results because these queries are costly operations. The cost is especially high for high-cardinality detectors that may have many entities. If the data is not continuous, then the accuracy of this field is low and the actual time at which the detector detects an anomaly can be earlier. ```json { @@ -319,3 +393,4 @@ Field | Description "approx_anomaly_start_time": 1635883620000 } ``` +{% include copy-curl.html %} diff --git a/_observing-your-data/ad/security.md b/_observing-your-data/ad/security.md index 8eeaa3df41..e4816cec46 100644 --- a/_observing-your-data/ad/security.md +++ b/_observing-your-data/ad/security.md @@ -23,6 +23,11 @@ As an admin user, you can use the Security plugin to assign specific permissions The Security plugin has two built-in roles that cover most anomaly detection use cases: `anomaly_full_access` and `anomaly_read_access`. For descriptions of each, see [Predefined roles]({{site.url}}{{site.baseurl}}/security/access-control/users-roles#predefined-roles). +If you use OpenSearch Dashboards to create your anomaly detectors, you may experience access issues even with `anomaly_full_access`. This issue has been resolved in OpenSearch 2.17, but for earlier versions, the following additional permissions need to be added: + +- `indices:data/read/search` -- You need this permission because the Anomaly Detection plugin needs to search the data source in order to validate whether there is enough data to train the model. +- `indices:admin/mappings/fields/get` and `indices:admin/mappings/fields/get*` -- You need these permissions to validate whether the given data source has a valid timestamp field and categorical field (in the case of creating a high-cardinality detector). + If these roles don't meet your needs, mix and match individual anomaly detection [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) to suit your use case. 
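As a sketch of mixing and matching permissions, the following hypothetical request creates a role that grants the validation-related index permissions listed above on a hypothetical data source index pattern (`network-requests*`). The role name and pattern are placeholders; map the role to the relevant users in the same way as the role mapping examples later in this document:

```json
PUT _plugins/_security/api/roles/ad_dashboards_validation
{
  "index_permissions": [
    {
      "index_patterns": ["network-requests*"],
      "allowed_actions": [
        "indices:data/read/search",
        "indices:admin/mappings/fields/get",
        "indices:admin/mappings/fields/get*"
      ]
    }
  ]
}
```
{% include copy-curl.html %}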
Each action corresponds to an operation in the REST API. For example, the `cluster:admin/opensearch/ad/detector/delete` permission lets you delete detectors. ### A note on alerts and fine-grained access control @@ -31,6 +36,42 @@ When a trigger generates an alert, the detector and monitor configurations, the To reduce the chances of unintended users viewing metadata that could describe an index, we recommend that administrators enable role-based access control and keep these kinds of design elements in mind when assigning permissions to the intended group of users. See [Limit access by backend role](#advanced-limit-access-by-backend-role) for details. +### Selecting remote indexes with fine-grained access control + +To use a remote index as a data source for a detector, see the setup steps in [Authentication flow]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/#authentication-flow) in [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/). You must use a role that exists in both the remote and local clusters. The remote cluster must map the chosen role to the same username as in the local cluster. + +--- + +#### Example: Create a new user on the local cluster + +1. Create a new user on the local cluster to use for detector creation: + +``` +curl -XPUT -k -u 'admin:' 'https://localhost:9200/_plugins/_security/api/internalusers/anomalyuser' -H 'Content-Type: application/json' -d '{"password":"password"}' +``` +{% include copy-curl.html %} + +2. Map the new user to the `anomaly_full_access` role: + +``` +curl -XPUT -k -u 'admin:' -H 'Content-Type: application/json' 'https://localhost:9200/_plugins/_security/api/rolesmapping/anomaly_full_access' -d '{"users" : ["anomalyuser"]}' +``` +{% include copy-curl.html %} + +3. On the remote cluster, create the same user and map `anomaly_full_access` to that role: + +``` +curl -XPUT -k -u 'admin:' 'https://localhost:9250/_plugins/_security/api/internalusers/anomalyuser' -H 'Content-Type: application/json' -d '{"password":"password"}' +curl -XPUT -k -u 'admin:' -H 'Content-Type: application/json' 'https://localhost:9250/_plugins/_security/api/rolesmapping/anomaly_full_access' -d '{"users" : ["anomalyuser"]}' +``` +{% include copy-curl.html %} + +--- + +### Custom results index + +To use a custom results index, you need additional permissions not included in the default roles provided by the OpenSearch Security plugin. To add these permissions, see [Step 1: Define a detector]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-1-define-a-detector) in the [Anomaly detection]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/) documentation. + ## (Advanced) Limit access by backend role Use backend roles to configure fine-grained access to individual detectors based on roles. For example, users of different departments in an organization can view detectors owned by their own department. diff --git a/_observing-your-data/alerting/api.md b/_observing-your-data/alerting/api.md index d25323084a..ea52da552d 100644 --- a/_observing-your-data/alerting/api.md +++ b/_observing-your-data/alerting/api.md @@ -743,7 +743,7 @@ If you run a document-level query while the index is getting reindexed, the API ## Update monitor -When updating a monitor, you can optionally include `seq_no` and `primary_term` as URL parameters. If these numbers do not match the existing monitor or the monitor does not exist, the Alerting plugin throws an error. 
OpenSearch increments the version number and the sequence number automatically (see the example response). +When updating a monitor, you can optionally include `seq_no` and `primary_term` as parameters. If these numbers do not match the existing monitor or the monitor does not exist, the Alerting plugin throws an error. OpenSearch increments the version number and the sequence number automatically (see the example response). #### Example request ```json @@ -1609,7 +1609,7 @@ POST _plugins/_alerting/destinations ## Update destination -When updating a destination, you can optionally include `seq_no` and `primary_term` as URL parameters. If these numbers do not match the existing destination or the destination doesn't exist, the Alerting plugin throws an error. OpenSearch increments the version number and the sequence number automatically (see the example response). +When updating a destination, you can optionally include `seq_no` and `primary_term` as parameters. If these numbers do not match the existing destination or the destination doesn't exist, the Alerting plugin throws an error. OpenSearch increments the version number and the sequence number automatically (see the example response). #### Example request ```json @@ -1857,7 +1857,7 @@ POST _plugins/_alerting/destinations/email_accounts ## Update email account -When updating an email account, you can optionally include `seq_no` and `primary_term` as URL parameters. If these numbers don't match the existing email account or the email account doesn't exist, the Alerting plugin throws an error. OpenSearch increments the version number and the sequence number automatically (see the example response). +When updating an email account, you can optionally include `seq_no` and `primary_term` as parameters. If these numbers don't match the existing email account or the email account doesn't exist, the Alerting plugin throws an error. OpenSearch increments the version number and the sequence number automatically (see the example response). #### Example request ```json @@ -2117,7 +2117,7 @@ POST _plugins/_alerting/destinations/email_groups ## Update email group -When updating an email group, you can optionally include `seq_no` and `primary_term` as URL parameters. If these numbers don't match the existing email group or the email group doesn't exist, the Alerting plugin throws an error. OpenSearch increments the version number and the sequence number automatically (see the example response). +When updating an email group, you can optionally include `seq_no` and `primary_term` as parameters. If these numbers don't match the existing email group or the email group doesn't exist, the Alerting plugin throws an error. OpenSearch increments the version number and the sequence number automatically (see the example response). 
#### Example request ```json diff --git a/_observing-your-data/alerting/composite-monitors.md b/_observing-your-data/alerting/composite-monitors.md index eb26dcc15e..7ba182a865 100644 --- a/_observing-your-data/alerting/composite-monitors.md +++ b/_observing-your-data/alerting/composite-monitors.md @@ -74,7 +74,7 @@ POST _plugins/_alerting/workflows ``` {% include copy-curl.html %} -#### Request fields +#### Request body fields | Field | Type | Description | | :--- | :--- | :--- | @@ -526,7 +526,7 @@ GET /_plugins/_alerting/workflows/alerts?workflowIds=&getAssociate } ``` -#### Request fields +#### Request body fields | Field | Type | Description | | :--- | :--- | :--- | @@ -546,7 +546,7 @@ POST _plugins/_alerting/workflows//_acknowledge/alerts ``` {% include copy-curl.html %} -#### Request fields +#### Request body fields | Field | Type | Description | | :--- | :--- | :--- | diff --git a/_observing-your-data/query-insights/grouping-top-n-queries.md b/_observing-your-data/query-insights/grouping-top-n-queries.md index 28cbcbb8e5..d4c900d7e7 100644 --- a/_observing-your-data/query-insights/grouping-top-n-queries.md +++ b/_observing-your-data/query-insights/grouping-top-n-queries.md @@ -45,6 +45,96 @@ bool When queries share the same query structure, they are grouped together, ensuring that all similar queries belong to the same group. +## Configuring the query structure + +The preceding example query shows a simplified query structure. By default, the query structure also includes field names and field data types. + +For example, consider an index `index1` with the following field mapping: + +```json +"mappings": { + "properties": { + "field1": { + "type": "keyword" + }, + "field2": { + "type": "text" + }, + "field3": { + "type": "text" + }, + "field4": { + "type": "long" + } + } +} +``` + +If you run the following query on this index: + +```json +{ + "query": { + "bool": { + "must": [ + { + "term": { + "field1": "example_value" + } + } + ], + "filter": [ + { + "match": { + "field2": "search_text" + } + }, + { + "range": { + "field4": { + "gte": 1, + "lte": 100 + } + } + } + ], + "should": [ + { + "regexp": { + "field3": ".*" + } + } + ] + } + } +} +``` + +Then the query has the following corresponding query structure: + +```c +bool [] + must: + term [field1, keyword] + filter: + match [field2, text] + range [field4, long] + should: + regexp [field3, text] +``` + +To exclude field names and field data types from the query structure, configure the following settings: + +```json +PUT _cluster/settings +{ + "persistent" : { + "search.insights.top_queries.grouping.attributes.field_name" : false, + "search.insights.top_queries.grouping.attributes.field_type" : false + } +} +``` +{% include copy-curl.html %} ## Aggregate metrics per group @@ -307,7 +397,7 @@ The response contains the top N query groups: -## Response fields +## Response body fields The response includes the following fields. @@ -323,7 +413,7 @@ Field | Data type | Description `top_queries.task_resource_usages` | Array of objects | The resource usage breakdown for the various tasks belonging to the first query in the query group. `top_queries.indices` | Array | The indexes searched by the first query in the query group. `top_queries.labels` | Object | Used to label the top query. -`top_queries.search_type` | String | The search request execution type (`query_then_fetch` or `dfs_query_then_fetch`). 
For more information, see the `search_type` parameter in the [Search API documentation]({{site.url}}{{site.baseurl}}/api-reference/search/#url-parameters). +`top_queries.search_type` | String | The search request execution type (`query_then_fetch` or `dfs_query_then_fetch`). For more information, see the `search_type` parameter in the [Search API documentation]({{site.url}}{{site.baseurl}}/api-reference/search/#query-parameters). `top_queries.measurements` | Object | The aggregate measurements for the query group. `top_queries.measurements.latency` | Object | The aggregate latency measurements for the query group. `top_queries.measurements.latency.number` | Integer | The total latency for the query group. diff --git a/_observing-your-data/query-insights/health.md b/_observing-your-data/query-insights/health.md new file mode 100644 index 0000000000..02324d8237 --- /dev/null +++ b/_observing-your-data/query-insights/health.md @@ -0,0 +1,119 @@ +--- +layout: default +title: Query Insights plugin health +parent: Query insights +nav_order: 50 +--- + +# Query Insights plugin health + +The Query Insights plugin provides an [API](#health-stats-api) and [metrics](#opentelemetry-error-metrics-counters) for monitoring its health and performance, enabling proactive identification of issues that may affect query processing or system resources. + +## Health Stats API +**Introduced 2.18** +{: .label .label-purple } + +The Health Stats API provides health metrics for each node running the Query Insights plugin. These metrics allow for an in-depth view of resource usage and the health of the query processing components. + +### Path and HTTP methods + +```json +GET _insights/health_stats +``` + +### Example request + +```json +GET _insights/health_stats +``` +{% include copy-curl.html %} + +### Example response + +The response includes a set of health-related fields for each node: + +```json +{ + "AqegbPL0Tv2XWvZV4PTS8Q": { + "ThreadPoolInfo": { + "query_insights_executor": { + "type": "scaling", + "core": 1, + "max": 5, + "keep_alive": "5m", + "queue_size": 2 + } + }, + "QueryRecordsQueueSize": 2, + "TopQueriesHealthStats": { + "latency": { + "TopQueriesHeapSize": 5, + "QueryGroupCount_Total": 0, + "QueryGroupCount_MaxHeap": 0 + }, + "memory": { + "TopQueriesHeapSize": 5, + "QueryGroupCount_Total": 0, + "QueryGroupCount_MaxHeap": 0 + }, + "cpu": { + "TopQueriesHeapSize": 5, + "QueryGroupCount_Total": 0, + "QueryGroupCount_MaxHeap": 0 + } + } + } +} +``` + +### Response fields + +The following table lists all response body fields. + +Field | Data type | Description +:--- |:---| :--- +`ThreadPoolInfo` | Object | Information about the Query Insights thread pool, including type, core count, max threads, and queue size. See [The ThreadPoolInfo object](#the-threadpoolinfo-object). +`QueryRecordsQueueSize` | Integer | The size of the queue that buffers incoming search queries before processing. A high value may suggest increased load or slower processing. +`TopQueriesHealthStats` | Object | Performance metrics for each top query service that provide information about memory allocation (heap size) and query grouping. See [The TopQueriesHealthStats object](#the-topquerieshealthstats-object). + +### The ThreadPoolInfo object + +The `ThreadPoolInfo` object contains the following detailed configuration and performance data for the thread pool dedicated to the Query Insights plugin.
+ +Field | Data type | Description +:--- |:---| :--- +`type`| String | The thread pool type (for example, `scaling`). +`core`| Integer | The minimum number of threads in the thread pool. +`max`| Integer | The maximum number of threads in the thread pool. +`keep_alive`| Time unit | The amount of time that idle threads are retained. +`queue_size`| Integer | The maximum number of tasks in the queue. + +### The TopQueriesHealthStats object + +The `TopQueriesHealthStats` object provides breakdowns for latency, memory, and CPU usage and contains the following information. + +Field | Data type | Description +:--- |:---| :--- +`TopQueriesHeapSize`| Integer | The heap memory allocation for the query group. +`QueryGroupCount_Total`| Integer | The total number of processed query groups. +`QueryGroupCount_MaxHeap`| Integer | The size of the max heap that stores all query groups in memory. + +## OpenTelemetry error metrics counters + +The Query Insights plugin integrates with OpenTelemetry to provide real-time error metrics counters. These counters help to identify specific operational failures in the plugin and improve reliability. Each metric provides targeted insights into potential error sources in the plugin workflow, allowing for more focused debugging and maintenance. + +To collect these metrics, you must configure and collect query metrics. For more information, see [Query metrics]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-metrics/). + +The following table lists all available metrics. + +Field | Description +:--- | :--- +`LOCAL_INDEX_READER_PARSING_EXCEPTIONS` | The number of errors that occur when parsing data using the LocalIndexReader. +`LOCAL_INDEX_EXPORTER_BULK_FAILURES` | The number of failures that occur when ingesting Query Insights plugin data into local indexes. +`LOCAL_INDEX_EXPORTER_EXCEPTIONS` | The number of exceptions that occur in the Query Insights plugin LocalIndexExporter. +`INVALID_EXPORTER_TYPE_FAILURES` | The number of invalid exporter type failures. +`INVALID_INDEX_PATTERN_EXCEPTIONS` | The number of invalid index pattern exceptions. +`DATA_INGEST_EXCEPTIONS` | The number of exceptions that occur when ingesting data into the Query Insights plugin. +`QUERY_CATEGORIZE_EXCEPTIONS` | The number of exceptions that occur when categorizing the queries. +`EXPORTER_FAIL_TO_CLOSE_EXCEPTION` | The number of failures that occur when closing the exporter. \ No newline at end of file diff --git a/_observing-your-data/query-insights/index.md b/_observing-your-data/query-insights/index.md index ef3a65bfcd..4bba866c05 100644 --- a/_observing-your-data/query-insights/index.md +++ b/_observing-your-data/query-insights/index.md @@ -4,6 +4,8 @@ title: Query insights nav_order: 40 has_children: true has_toc: false +redirect_from: + - /query-insights/ --- # Query insights @@ -40,3 +42,7 @@ You can obtain the following information using Query Insights: - [Top n queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/) - [Grouping top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/grouping-top-n-queries/) - [Query metrics]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-metrics/) + +## Query Insights plugin health + +For information about monitoring the health of the Query Insights plugin, see [Query Insights plugin health]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/health/). 
\ No newline at end of file diff --git a/_observing-your-data/query-insights/top-n-queries.md b/_observing-your-data/query-insights/top-n-queries.md index b63d670926..a5bf42e694 100644 --- a/_observing-your-data/query-insights/top-n-queries.md +++ b/_observing-your-data/query-insights/top-n-queries.md @@ -96,6 +96,17 @@ GET /_insights/top_queries?type=memory ``` {% include copy-curl.html %} + +To specify a time range for querying top N results, use the `from` and `to` parameters in ISO8601 format: `YYYY-MM-DD'T'HH:mm:ss.SSSZ`. +For example, to retrieve the top N queries from August 25, 2024, at 15:00 UTC to August 30, 2024, at 17:00 UTC, send the following request: + +```json +GET /_insights/top_queries?from=2024-08-25T15:00:00.000Z&to=2024-08-30T17:00:00.000Z +``` +{% include copy-curl.html %} + +If you have a [local index exporter enabled](#configuring-a-local-index-exporter), historical queries stored in local OpenSearch indexes will also be included in the specified time range. + If your query returns no results, ensure that top N query monitoring is enabled for the target metric type and that search requests were made within the current [time window](#configuring-the-window-size). {: .important} diff --git a/_observing-your-data/trace/distributed-tracing.md b/_observing-your-data/trace/distributed-tracing.md index 4fb464f67c..773b4dd34a 100644 --- a/_observing-your-data/trace/distributed-tracing.md +++ b/_observing-your-data/trace/distributed-tracing.md @@ -1,6 +1,6 @@ --- layout: default -title: Distrbuted tracing +title: Distributed tracing parent: Trace Analytics nav_order: 65 --- diff --git a/_query-dsl/full-text/match-bool-prefix.md b/_query-dsl/full-text/match-bool-prefix.md index 3964dc5ee8..6905d49989 100644 --- a/_query-dsl/full-text/match-bool-prefix.md +++ b/_query-dsl/full-text/match-bool-prefix.md @@ -216,7 +216,7 @@ The `` accepts the following parameters. All parameters except `query` ar Parameter | Data type | Description :--- | :--- | :--- `query` | String | The text, number, Boolean value, or date to use for search. Required. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `fuzziness` | `AUTO`, `0`, or a positive integer | The number of character edits (insert, delete, substitute) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. `fuzzy_rewrite` | String | Determines how OpenSearch rewrites the query. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. 
If the `fuzziness` parameter is not `0`, the query uses a `fuzzy_rewrite` method of `top_terms_blended_freqs_${max_expansions}` by default. Default is `constant_score`. `fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to `true` (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n"). If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases. diff --git a/_query-dsl/full-text/match-phrase.md b/_query-dsl/full-text/match-phrase.md index 747c4814d9..3f36465790 100644 --- a/_query-dsl/full-text/match-phrase.md +++ b/_query-dsl/full-text/match-phrase.md @@ -268,6 +268,6 @@ The `` accepts the following parameters. All parameters except `query` ar Parameter | Data type | Description :--- | :--- | :--- `query` | String | The query string to use for search. Required. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `slop` | `0` (default) or a positive integer | Controls the degree to which words in a query can be misordered and still be considered a match. From the [Lucene documentation](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/PhraseQuery.html#getSlop--): "The number of other words permitted between words in query phrase. For example, to switch the order of two words requires two moves (the first move places the words atop one another), so to permit reorderings of phrases, the slop must be at least two. A value of zero requires an exact match." `zero_terms_query` | String | In some cases, the analyzer removes all terms from a query string. For example, the `stop` analyzer removes all terms from the string `an but this`. In those cases, `zero_terms_query` specifies whether to match no documents (`none`) or all documents (`all`). Valid values are `none` and `all`. Default is `none`. \ No newline at end of file diff --git a/_query-dsl/full-text/match.md b/_query-dsl/full-text/match.md index 056ef76890..5ece14e127 100644 --- a/_query-dsl/full-text/match.md +++ b/_query-dsl/full-text/match.md @@ -451,7 +451,7 @@ Parameter | Data type | Description :--- | :--- | :--- `query` | String | The query string to use for search. Required. `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create a [match phrase query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) automatically for multi-term synonyms. 
For example, if you specify `ba,batting average` as synonyms and search for `ba`, OpenSearch searches for `ba OR "batting average"` (if this option is `true`) or `ba OR (batting AND average)` (if this option is `false`). Default is `true`. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. `enable_position_increments` | Boolean | When `true`, resulting queries are aware of position increments. This setting is useful when the removal of stop words leaves an unwanted "gap" between terms. Default is `true`. `fuzziness` | String | The number of character edits (insertions, deletions, substitutions, or transpositions) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. Valid values are non-negative integers or `AUTO`. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. diff --git a/_query-dsl/full-text/multi-match.md b/_query-dsl/full-text/multi-match.md index ab1496fdd3..a3995df714 100644 --- a/_query-dsl/full-text/multi-match.md +++ b/_query-dsl/full-text/multi-match.md @@ -900,9 +900,9 @@ Parameter | Data type | Description :--- | :--- | :--- `query` | String | The query string to use for search. Required. `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create a [match phrase query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) automatically for multi-term synonyms. For example, if you specify `ba,batting average` as synonyms and search for `ba`, OpenSearch searches for `ba OR "batting average"` (if this option is `true`) or `ba OR (batting AND average)` (if this option is `false`). Default is `true`. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. 
For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. -`fields` | Array of strings | The list of fields in which to search. If you don't provide the `fields` parameter, `multi_match` query searches the fields specified in the `index.query. Default_field` setting, which defaults to `*`. +`fields` | Array of strings | The list of fields in which to search. If you don't provide the `fields` parameter, `multi_match` query searches the fields specified in the `index.query.default_field` setting, which defaults to `*`. `fuzziness` | String | The number of character edits (insert, delete, substitute) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. Valid values are non-negative integers or `AUTO`. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. Not supported for `phrase`, `phrase_prefix`, and `cross_fields` queries. `fuzzy_rewrite` | String | Determines how OpenSearch rewrites the query. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. If the `fuzziness` parameter is not `0`, the query uses a `fuzzy_rewrite` method of `top_terms_blended_freqs_${max_expansions}` by default. Default is `constant_score`. `fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to `true` (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n"). If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases. diff --git a/_query-dsl/full-text/query-string.md b/_query-dsl/full-text/query-string.md index 47180e3f6d..7b3343155d 100644 --- a/_query-dsl/full-text/query-string.md +++ b/_query-dsl/full-text/query-string.md @@ -623,7 +623,7 @@ Parameter | Data type | Description `query` | String | The text that may contain expressions in the [query string syntax](#query-string-syntax) to use for search. Required. `allow_leading_wildcard` | Boolean | Specifies whether `*` and `?` are allowed as first characters of a search term. Default is `true`. `analyze_wildcard` | Boolean | Specifies whether OpenSearch should attempt to analyze wildcard terms. Default is `false`. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. 
If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create a [match phrase query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) automatically for multi-term synonyms. For example, if you specify `ba, batting average` as synonyms and search for `ba`, OpenSearch searches for `ba OR "batting average"` (if this option is `true`) or `ba OR (batting AND average)` (if this option is `false`). Default is `true`. `boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. `default_field` | String | The field in which to search if the field is not specified in the query string. Supports wildcards. Defaults to the value specified in the `index.query. Default_field` index setting. By default, the `index.query. Default_field` is `*`, which means extract all fields eligible for term query and filter the metadata fields. The extracted fields are combined into a query if the `prefix` is not specified. Eligible fields do not include nested documents. Searching all eligible fields could be a resource-intensive operation. The `indices.query.bool.max_clause_count` search setting defines the maximum value for the product of the number of fields and the number of terms that can be queried at one time. The default value for `indices.query.bool.max_clause_count` is 1,024. diff --git a/_query-dsl/full-text/simple-query-string.md b/_query-dsl/full-text/simple-query-string.md index 58780cfdb4..1624efdaa7 100644 --- a/_query-dsl/full-text/simple-query-string.md +++ b/_query-dsl/full-text/simple-query-string.md @@ -355,14 +355,14 @@ Parameter | Data type | Description :--- | :--- | :--- `query`| String | The text that may contain expressions in the [simple query string syntax](#simple-query-string-syntax) to use for search. Required. `analyze_wildcard` | Boolean | Specifies whether OpenSearch should attempt to analyze wildcard terms. Default is `false`. -`analyzer` | String | The analyzer used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The analyzer used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create [match_phrase queries]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match/) automatically for multi-term synonyms. Default is `true`. `default_operator`| String | If the query string contains multiple search terms, whether all terms need to match (`AND`) or only one term needs to match (`OR`) for a document to be considered a match. 
Valid values are:
- `OR`: The string `to be` is interpreted as `to OR be`
- `AND`: The string `to be` is interpreted as `to AND be`
Default is `OR`. -`fields` | String array | The list of fields to search (for example, `"fields": ["title^4", "description"]`). Supports wildcards. If unspecified, defaults to the `index.query. Default_field` setting, which defaults to `["*"]`. The maximum number of fields that can be searched at the same time is defined by `indices.query.bool.max_clause_count`, which is 1,024 by default. +`fields` | String array | The list of fields to search (for example, `"fields": ["title^4", "description"]`). Supports wildcards. If unspecified, defaults to the `index.query.default_field` setting, which defaults to `["*"]`. The maximum number of fields that can be searched at the same time is defined by `indices.query.bool.max_clause_count`, which is 1,024 by default. `flags` | String | A `|`-delimited string of [flags]({{site.baseurl}}/query-dsl/full-text/simple-query-string/) to enable (for example, `AND|OR|NOT`). Default is `ALL`. You can explicitly set the value for `default_field`. For example, to return all titles, set it to `"default_field": "title"`. `fuzzy_max_expansions` | Positive integer | The maximum number of terms to which the query can expand. Fuzzy queries “expand to” a number of matching terms that are within the distance specified in `fuzziness`. Then OpenSearch tries to match those terms. Default is `50`. `fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to `true` (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n"). If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases. `fuzzy_prefix_length`| Integer | The number of beginning characters left unchanged for fuzzy matching. Default is 0. `lenient` | Boolean | Setting `lenient` to `true` ignores data type mismatches between the query and the document field. For example, a query string of `"8.2"` could match a field of type `float`. Default is `false`. `minimum_should_match` | Positive or negative integer, positive or negative percentage, combination | If the query string contains multiple search terms and you use the `or` operator, the number of terms that need to match for the document to be considered a match. For example, if `minimum_should_match` is 2, `wind often rising` does not match `The Wind Rises.` If `minimum_should_match` is `1`, it matches. For details, see [Minimum should match]({{site.url}}{{site.baseurl}}/query-dsl/minimum-should-match/). -`quote_field_suffix` | String | This option supports searching for exact matches (surrounded with quotation marks) using a different analysis method than non-exact matches use. For example, if `quote_field_suffix` is `.exact` and you search for `\"lightly\"` in the `title` field, OpenSearch searches for the word `lightly` in the `title.exact` field. This second field might use a different type (for example, `keyword` rather than `text`) or a different analyzer. \ No newline at end of file +`quote_field_suffix` | String | This option supports searching for exact matches (surrounded with quotation marks) using a different analysis method than non-exact matches use. 
For example, if `quote_field_suffix` is `.exact` and you search for `\"lightly\"` in the `title` field, OpenSearch searches for the word `lightly` in the `title.exact` field. This second field might use a different type (for example, `keyword` rather than `text`) or a different analyzer. diff --git a/_query-dsl/geo-and-xy/geo-bounding-box.md b/_query-dsl/geo-and-xy/geo-bounding-box.md index 1112a4278e..66fcc224d6 100644 --- a/_query-dsl/geo-and-xy/geo-bounding-box.md +++ b/_query-dsl/geo-and-xy/geo-bounding-box.md @@ -173,11 +173,11 @@ GET testindex1/_search ``` {% include copy-curl.html %} -## Request fields +## Parameters -Geo-bounding box queries accept the following fields. +Geo-bounding box queries accept the following parameters. -Field | Data type | Description +Parameter | Data type | Description :--- | :--- | :--- `_name` | String | The name of the filter. Optional. `validation_method` | String | The validation method. Valid values are `IGNORE_MALFORMED` (accept geopoints with invalid coordinates), `COERCE` (try to coerce coordinates to valid values), and `STRICT` (return an error when coordinates are invalid). Default is `STRICT`. diff --git a/_query-dsl/geo-and-xy/geodistance.md b/_query-dsl/geo-and-xy/geodistance.md index b272cad81e..3eef58bc69 100644 --- a/_query-dsl/geo-and-xy/geodistance.md +++ b/_query-dsl/geo-and-xy/geodistance.md @@ -103,11 +103,11 @@ The response contains the matching document: } ``` -## Request fields +## Parameters -Geodistance queries accept the following fields. +Geodistance queries accept the following parameters. -Field | Data type | Description +Parameter | Data type | Description :--- | :--- | :--- `_name` | String | The name of the filter. Optional. `distance` | String | The distance within which to match the points. This distance is the radius of a circle centered at the specified point. For supported distance units, see [Distance units]({{site.url}}{{site.baseurl}}/api-reference/common-parameters/#distance-units). Required. diff --git a/_query-dsl/geo-and-xy/geopolygon.md b/_query-dsl/geo-and-xy/geopolygon.md index 980a0c5a63..810e48f2b7 100644 --- a/_query-dsl/geo-and-xy/geopolygon.md +++ b/_query-dsl/geo-and-xy/geopolygon.md @@ -161,11 +161,11 @@ However, if you specify the vertices in the following order: The response returns no results. -## Request fields +## Parameters -Geopolygon queries accept the following fields. +Geopolygon queries accept the following parameters. -Field | Data type | Description +Parameter | Data type | Description :--- | :--- | :--- `_name` | String | The name of the filter. Optional. `validation_method` | String | The validation method. Valid values are `IGNORE_MALFORMED` (accept geopoints with invalid coordinates), `COERCE` (try to coerce coordinates to valid values), and `STRICT` (return an error when coordinates are invalid). Optional. Default is `STRICT`. diff --git a/_query-dsl/geo-and-xy/geoshape.md b/_query-dsl/geo-and-xy/geoshape.md index 8acc691c3a..5b144b06d6 100644 --- a/_query-dsl/geo-and-xy/geoshape.md +++ b/_query-dsl/geo-and-xy/geoshape.md @@ -721,10 +721,10 @@ The response returns document 1: Note that when you indexed the geopoints, you specified their coordinates in `"latitude, longitude"` format. When you search for matching documents, the coordinate array is in `[longitude, latitude]` format. Thus, document 1 is returned in the results but document 2 is not. -## Request fields +## Parameters -Geoshape queries accept the following fields. +Geoshape queries accept the following parameters. 
-Field | Data type | Description +Parameter | Data type | Description :--- | :--- | :--- `ignore_unmapped` | Boolean | Specifies whether to ignore an unmapped field. If set to `true`, then the query does not return any documents that contain an unmapped field. If set to `false`, then an exception is thrown when the field is unmapped. Optional. Default is `false`. \ No newline at end of file diff --git a/_query-dsl/geo-and-xy/index.md b/_query-dsl/geo-and-xy/index.md index ee51e1e523..9bcf6a9462 100644 --- a/_query-dsl/geo-and-xy/index.md +++ b/_query-dsl/geo-and-xy/index.md @@ -30,7 +30,7 @@ OpenSearch provides the following geographic query types: - [**Geo-bounding box queries**]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/geo-and-xy/geo-bounding-box/): Return documents with geopoint field values that are within a bounding box. - [**Geodistance queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geodistance/): Return documents with geopoints that are within a specified distance from the provided geopoint. -- [**Geopolygon queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geodistance/): Return documents containing geopoints that are within a polygon. +- [**Geopolygon queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geopolygon/): Return documents containing geopoints that are within a polygon. - [**Geoshape queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geoshape/): Return documents that contain: - Geoshapes and geopoints that have one of four spatial relations to the provided shape: `INTERSECTS`, `DISJOINT`, `WITHIN`, or `CONTAINS`. - - Geopoints that intersect the provided shape. \ No newline at end of file + - Geopoints that intersect the provided shape. diff --git a/_query-dsl/joining/has-child.md b/_query-dsl/joining/has-child.md new file mode 100644 index 0000000000..c7da5bf7a9 --- /dev/null +++ b/_query-dsl/joining/has-child.md @@ -0,0 +1,398 @@ +--- +layout: default +title: Has child +parent: Joining queries +nav_order: 10 +--- + +# Has child query + +The `has_child` query returns parent documents whose child documents match a specific query. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type. + +The `has_child` query is slower than other queries because of the join operation it performs. Performance decreases as the number of matching child documents pointing to different parent documents increases. Each `has_child` query in your search may significantly impact query performance. If you prioritize speed, avoid using this query or limit its usage as much as possible. +{: .warning} + +## Example + +Before you can run a `has_child` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format: + +```json +PUT /example_index +{ + "mappings": { + "properties": { + "relationship_field": { + "type": "join", + "relations": { + "parent_doc": "child_doc" + } + } + } + } +} +``` +{% include copy-curl.html %} + +In this example, you'll configure an index that contains documents representing products and their brands. 
+ +First, create the index and establish the parent/child relationship between `brand` and `product`: + +```json +PUT testindex1 +{ + "mappings": { + "properties": { + "product_to_brand": { + "type": "join", + "relations": { + "brand": "product" + } + } + } + } +} +``` +{% include copy-curl.html %} + +Index two parent (brand) documents: + +```json +PUT testindex1/_doc/1 +{ + "name": "Luxury brand", + "product_to_brand" : "brand" +} +``` +{% include copy-curl.html %} + +```json +PUT testindex1/_doc/2 +{ + "name": "Economy brand", + "product_to_brand" : "brand" +} +``` +{% include copy-curl.html %} + +Index three child (product) documents: + +```json +PUT testindex1/_doc/3?routing=1 +{ + "name": "Mechanical watch", + "sales_count": 150, + "product_to_brand": { + "name": "product", + "parent": "1" + } +} +``` +{% include copy-curl.html %} + +```json +PUT testindex1/_doc/4?routing=2 +{ + "name": "Electronic watch", + "sales_count": 300, + "product_to_brand": { + "name": "product", + "parent": "2" + } +} +``` +{% include copy-curl.html %} + +```json +PUT testindex1/_doc/5?routing=2 +{ + "name": "Digital watch", + "sales_count": 100, + "product_to_brand": { + "name": "product", + "parent": "2" + } +} +``` +{% include copy-curl.html %} + +To search for the parent of a child, use a `has_child` query. The following query returns parent documents (brands) that make watches: + +```json +GET testindex1/_search +{ + "query" : { + "has_child": { + "type":"product", + "query": { + "match" : { + "name": "watch" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response returns both brands: + +```json +{ + "took": 15, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "testindex1", + "_id": "1", + "_score": 1, + "_source": { + "name": "Luxury brand", + "product_to_brand": "brand" + } + }, + { + "_index": "testindex1", + "_id": "2", + "_score": 1, + "_source": { + "name": "Economy brand", + "product_to_brand": "brand" + } + } + ] + } +} +``` + +## Retrieving inner hits + +To return child documents that matched the query, provide the `inner_hits` parameter: + +```json +GET testindex1/_search +{ + "query" : { + "has_child": { + "type":"product", + "query": { + "match" : { + "name": "watch" + } + }, + "inner_hits": {} + } + } +} +``` +{% include copy-curl.html %} + +The response contains child documents in the `inner_hits` field: + +```json +{ + "took": 52, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "testindex1", + "_id": "1", + "_score": 1, + "_source": { + "name": "Luxury brand", + "product_to_brand": "brand" + }, + "inner_hits": { + "product": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.53899646, + "hits": [ + { + "_index": "testindex1", + "_id": "3", + "_score": 0.53899646, + "_routing": "1", + "_source": { + "name": "Mechanical watch", + "sales_count": 150, + "product_to_brand": { + "name": "product", + "parent": "1" + } + } + } + ] + } + } + } + }, + { + "_index": "testindex1", + "_id": "2", + "_score": 1, + "_source": { + "name": "Economy brand", + "product_to_brand": "brand" + }, + "inner_hits": { + "product": { + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.53899646, + "hits": [ + { + 
"_index": "testindex1", + "_id": "4", + "_score": 0.53899646, + "_routing": "2", + "_source": { + "name": "Electronic watch", + "sales_count": 300, + "product_to_brand": { + "name": "product", + "parent": "2" + } + } + }, + { + "_index": "testindex1", + "_id": "5", + "_score": 0.53899646, + "_routing": "2", + "_source": { + "name": "Digital watch", + "sales_count": 100, + "product_to_brand": { + "name": "product", + "parent": "2" + } + } + } + ] + } + } + } + } + ] + } +} +``` + +For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). + +## Parameters + +The following table lists all top-level parameters supported by `has_child` queries. + +| Parameter | Required/Optional | Description | +|:---|:---|:---| +| `type` | Required | Specifies the name of the child relationship as defined in the `join` field mapping. | +| `query` | Required | The query to run on child documents. If a child document matches the query, the parent document is returned. | +| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `type` field. Default is `false`. | +| `max_children` | Optional | The maximum number of matching child documents for a parent document. If exceeded, the parent document is excluded from the search results. | +| `min_children` | Optional | The minimum number of matching child documents required for a parent document to be included in the results. If not met, the parent is excluded. Default is `1`.| +| `score_mode` | Optional | Defines how scores of matching child documents influence the parent document's score. Valid values are:
- `none`: Ignores the relevance scores of child documents and assigns a score of `0` to the parent document.
- `avg`: Uses the average relevance score of all matching child documents.
- `max`: Assigns the highest relevance score from the matching child documents to the parent.
- `min`: Assigns the lowest relevance score from the matching child documents to the parent.
- `sum`: Sums the relevance scores of all matching child documents.
Default is `none`. | +| `inner_hits` | Optional | If provided, returns the underlying hits (child documents) that matched the query. | + + +## Sorting limitations + +The `has_child` query does not support [sorting results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/) using standard sorting options. If you need to sort parent documents by fields in their child documents, you can use a [`function_score` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/function-score/) and sort by the parent document's score. + +In the preceding example, you can sort parent documents (brands) based on the `sales_count` of their child products. This query multiplies the score by the `sales_count` field of the child documents and assigns the highest relevance score from the matching child documents to the parent: + +```json +GET testindex1/_search +{ + "query": { + "has_child": { + "type": "product", + "query": { + "function_score": { + "script_score": { + "script": "_score * doc['sales_count'].value" + } + } + }, + "score_mode": "max" + } + } +} +``` +{% include copy-curl.html %} + +The response contains the brands sorted by the highest child `sales_count`: + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 300, + "hits": [ + { + "_index": "testindex1", + "_id": "2", + "_score": 300, + "_source": { + "name": "Economy brand", + "product_to_brand": "brand" + } + }, + { + "_index": "testindex1", + "_id": "1", + "_score": 150, + "_source": { + "name": "Luxury brand", + "product_to_brand": "brand" + } + } + ] + } +} +``` + +## Next steps + +- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). \ No newline at end of file diff --git a/_query-dsl/joining/has-parent.md b/_query-dsl/joining/has-parent.md new file mode 100644 index 0000000000..6b293ffff2 --- /dev/null +++ b/_query-dsl/joining/has-parent.md @@ -0,0 +1,358 @@ +--- +layout: default +title: Has parent +parent: Joining queries +nav_order: 20 +--- + +# Has parent query + +The `has_parent` query returns child documents whose parent documents match a specific query. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type. + +The `has_parent` query is slower than other queries because of the join operation it performs. Performance decreases as the number of matching parent documents increases. Each `has_parent` query in your search may significantly impact query performance. If you prioritize speed, avoid using this query or limit its usage as much as possible. +{: .warning} + +## Example + +Before you can run a `has_parent` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format: + +```json +PUT /example_index +{ + "mappings": { + "properties": { + "relationship_field": { + "type": "join", + "relations": { + "parent_doc": "child_doc" + } + } + } + } +} +``` +{% include copy-curl.html %} + +For this example, first configure an index that contains documents representing products and their brands as described in the [`has_child` query example]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/). 
+ +To search for the child of a parent, use a `has_parent` query. The following query returns child documents (products) made by the brand matching the query `economy`: + +```json +GET testindex1/_search +{ + "query" : { + "has_parent": { + "parent_type":"brand", + "query": { + "match" : { + "name": "economy" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response returns all products made by the brand: + +```json +{ + "took": 11, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "testindex1", + "_id": "4", + "_score": 1, + "_routing": "2", + "_source": { + "name": "Electronic watch", + "sales_count": 300, + "product_to_brand": { + "name": "product", + "parent": "2" + } + } + }, + { + "_index": "testindex1", + "_id": "5", + "_score": 1, + "_routing": "2", + "_source": { + "name": "Digital watch", + "sales_count": 100, + "product_to_brand": { + "name": "product", + "parent": "2" + } + } + } + ] + } +} +``` + +## Retrieving inner hits + +To return parent documents that matched the query, provide the `inner_hits` parameter: + +```json +GET testindex1/_search +{ + "query" : { + "has_parent": { + "parent_type":"brand", + "query": { + "match" : { + "name": "economy" + } + }, + "inner_hits": {} + } + } +} +``` +{% include copy-curl.html %} + +The response contains parent documents in the `inner_hits` field: + +```json +{ + "took": 11, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "testindex1", + "_id": "4", + "_score": 1, + "_routing": "2", + "_source": { + "name": "Electronic watch", + "sales_count": 300, + "product_to_brand": { + "name": "product", + "parent": "2" + } + }, + "inner_hits": { + "brand": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.3862942, + "hits": [ + { + "_index": "testindex1", + "_id": "2", + "_score": 1.3862942, + "_source": { + "name": "Economy brand", + "product_to_brand": "brand" + } + } + ] + } + } + } + }, + { + "_index": "testindex1", + "_id": "5", + "_score": 1, + "_routing": "2", + "_source": { + "name": "Digital watch", + "sales_count": 100, + "product_to_brand": { + "name": "product", + "parent": "2" + } + }, + "inner_hits": { + "brand": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.3862942, + "hits": [ + { + "_index": "testindex1", + "_id": "2", + "_score": 1.3862942, + "_source": { + "name": "Economy brand", + "product_to_brand": "brand" + } + } + ] + } + } + } + } + ] + } +} +``` + +For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). + +## Parameters + +The following table lists all top-level parameters supported by `has_parent` queries. + +| Parameter | Required/Optional | Description | +|:---|:---|:---| +| `parent_type` | Required | Specifies the name of the parent relationship as defined in the `join` field mapping. | +| `query` | Required | The query to run on parent documents. If a parent document matches the query, the child document is returned. | +| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `parent_type` fields and not return documents instead of throwing an error. 
You can provide this parameter when querying multiple indexes, some of which may not contain the `parent_type` field. Default is `false`. | +| `score` | Optional | Indicates whether the relevance score of a matching parent document is aggregated into its child documents. If `false`, then the relevance score of the parent document is ignored, and each child document is assigned a relevance score equal to the query's `boost`, which defaults to `1`. If `true`, then the relevance score of the matching parent document is aggregated into the relevance scores of its child documents. Default is `false`. | +| `inner_hits` | Optional | If provided, returns the underlying hits (parent documents) that matched the query. | + + +## Sorting limitations + +The `has_parent` query does not support [sorting results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/) using standard sorting options. If you need to sort child documents by fields in their parent documents, you can use a [`function_score` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/function-score/) and sort by the child document's score. + +For the preceding example, first add a `customer_satisfaction` field by which you'll sort the child documents belonging to the parent (brand) documents: + +```json +PUT testindex1/_doc/1 +{ + "name": "Luxury watch brand", + "product_to_brand" : "brand", + "customer_satisfaction": 4.5 +} +``` +{% include copy-curl.html %} + +```json +PUT testindex1/_doc/2 +{ + "name": "Economy watch brand", + "product_to_brand" : "brand", + "customer_satisfaction": 3.9 +} +``` +{% include copy-curl.html %} + +Now you can sort child documents (products) based on the `customer_satisfaction` field of their parent brands. This query multiplies the score by the `customer_satisfaction` field of the parent documents: + +```json +GET testindex1/_search +{ + "query": { + "has_parent": { + "parent_type": "brand", + "score": true, + "query": { + "function_score": { + "script_score": { + "script": "_score * doc['customer_satisfaction'].value" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the products, sorted by the highest parent `customer_satisfaction`: + +```json +{ + "took": 11, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 4.5, + "hits": [ + { + "_index": "testindex1", + "_id": "3", + "_score": 4.5, + "_routing": "1", + "_source": { + "name": "Mechanical watch", + "sales_count": 150, + "product_to_brand": { + "name": "product", + "parent": "1" + } + } + }, + { + "_index": "testindex1", + "_id": "4", + "_score": 3.9, + "_routing": "2", + "_source": { + "name": "Electronic watch", + "sales_count": 300, + "product_to_brand": { + "name": "product", + "parent": "2" + } + } + }, + { + "_index": "testindex1", + "_id": "5", + "_score": 3.9, + "_routing": "2", + "_source": { + "name": "Digital watch", + "sales_count": 100, + "product_to_brand": { + "name": "product", + "parent": "2" + } + } + } + ] + } +} +``` + +## Next steps + +- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). 
\ No newline at end of file diff --git a/_query-dsl/joining/index.md b/_query-dsl/joining/index.md index 20f48c0b16..f0a0060640 100644 --- a/_query-dsl/joining/index.md +++ b/_query-dsl/joining/index.md @@ -3,16 +3,22 @@ layout: default title: Joining queries has_children: true nav_order: 55 +has_toc: false +redirect_from: + - /query-dsl/joining/ --- # Joining queries OpenSearch is a distributed system in which data is spread across multiple nodes. Thus, running a SQL-like JOIN operation in OpenSearch is resource intensive. As an alternative, OpenSearch provides the following queries that perform join operations and are optimized for scaling across multiple nodes: -- `nested` queries: Act as wrappers for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents. -- `has_child` queries: Search for parent documents whose child documents match the query. -- `has_parent` queries: Search for child documents whose parent documents match the query. -- `parent_id` queries: A [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) field type establishes a parent/child relationship between documents in the same index. `parent_id` queries search for child documents that are joined to a specific parent document. + +- Queries for searching [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields: + - `nested` queries: Act as wrappers for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents. +- Queries for searching documents connected by a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type, which establishes a parent/child relationship between documents in the same index: + - [`has_child`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/) queries: Search for parent documents whose child documents match the query. + - [`has_parent`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-parent/) queries: Search for child documents whose parent documents match the query. + - [`parent_id`]({{site.url}}{{site.baseurl}}/query-dsl/joining/parent-id/) queries: Search for child documents that are joined to a specific parent document. If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `false`, then joining queries are not executed. {: .important} \ No newline at end of file diff --git a/_query-dsl/joining/nested.md b/_query-dsl/joining/nested.md new file mode 100644 index 0000000000..431a40ed1a --- /dev/null +++ b/_query-dsl/joining/nested.md @@ -0,0 +1,347 @@ +--- +layout: default +title: Nested +parent: Joining queries +nav_order: 30 +--- + +# Nested query + +The `nested` query acts as a wrapper for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents. If an object matches the search, the `nested` query returns the parent document at the root level. + +## Example + +Before you can run a `nested` query, your index must contain a [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) field. 
+ +To configure an example index containing nested fields, send the following request: + +```json +PUT /testindex +{ + "mappings": { + "properties": { + "patient": { + "type": "nested", + "properties": { + "name": { + "type": "text" + }, + "age": { + "type": "integer" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Next, index a document into the example index: + +```json +PUT /testindex/_doc/1 +{ + "patient": { + "name": "John Doe", + "age": 56 + } +} +``` +{% include copy-curl.html %} + +To search the nested `patient` field, wrap your query in a `nested` query and provide the `path` to the nested field: + +```json +GET /testindex/_search +{ + "query": { + "nested": { + "path": "patient", + "query": { + "match": { + "patient.name": "John" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query returns the matching document: + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_score": 0.2876821, + "_source": { + "patient": { + "name": "John Doe", + "age": 56 + } + } + } + ] + } +} +``` + +## Retrieving inner hits + +To return inner hits that matched the query, provide the `inner_hits` parameter: + +```json +GET /testindex/_search +{ + "query": { + "nested": { + "path": "patient", + "query": { + "match": { + "patient.name": "John" + } + }, + "inner_hits": {} + } + } +} +``` +{% include copy-curl.html %} + +The response contains the additional `inner_hits` field. The `_nested` field identifies the specific inner object from which the inner hit originated. It contains the nested hit and the offset relative to its position in the `_source`. Because of sorting and scoring, the position of the hit objects in `inner_hits` often differs from their original location in the nested object. + +By default, the `_source` of the hit objects within `inner_hits` is returned relative to the `_nested` field. In this example, the `_source` within `inner_hits` contains the `name` and `age` fields as opposed to the top-level `_source`, which contains the whole `patient` object: + +```json +{ + "took": 38, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_score": 0.2876821, + "_source": { + "patient": { + "name": "John Doe", + "age": 56 + } + }, + "inner_hits": { + "patient": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_nested": { + "field": "patient", + "offset": 0 + }, + "_score": 0.2876821, + "_source": { + "name": "John Doe", + "age": 56 + } + } + ] + } + } + } + } + ] + } +} +``` + +You can disable returning `_source` by configuring the `_source` field in the mappings. For more information, see [Source]({{site.url}}{{site.baseurl}}/field-types/metadata-fields/source/). +{: .tip} + +For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). + +## Multi-level nested queries + +You can search documents that have nested objects inside other nested objects using multi-level nested queries. 
In this example, you'll query multiple layers of nested fields by specifying a nested query for each level of the hierarchy. + +First, create an index with multi-level nested fields: + +```json +PUT /patients +{ + "mappings": { + "properties": { + "patient": { + "type": "nested", + "properties": { + "name": { + "type": "text" + }, + "contacts": { + "type": "nested", + "properties": { + "name": { + "type": "text" + }, + "relationship": { + "type": "text" + }, + "phone": { + "type": "keyword" + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Next, index a document into the example index: + +```json +PUT /patients/_doc/1 +{ + "patient": { + "name": "John Doe", + "contacts": [ + { + "name": "Jane Doe", + "relationship": "mother", + "phone": "5551111" + }, + { + "name": "Joe Doe", + "relationship": "father", + "phone": "5552222" + } + ] + } +} +``` +{% include copy-curl.html %} + +To search the nested `patient` field, use a multi-level `nested` query. The following query searches for patients whose contact information includes a person named `Jane` with a relationship of `mother`: + +```json +GET /patients/_search +{ + "query": { + "nested": { + "path": "patient", + "query": { + "nested": { + "path": "patient.contacts", + "query": { + "bool": { + "must": [ + { "match": { "patient.contacts.relationship": "mother" } }, + { "match": { "patient.contacts.name": "Jane" } } + ] + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query returns the patient who has a contact entry matching these details: + +```json +{ + "took": 14, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.3862942, + "hits": [ + { + "_index": "patients", + "_id": "1", + "_score": 1.3862942, + "_source": { + "patient": { + "name": "John Doe", + "contacts": [ + { + "name": "Jane Doe", + "relationship": "mother", + "phone": "5551111" + }, + { + "name": "Joe Doe", + "relationship": "father", + "phone": "5552222" + } + ] + } + } + } + ] + } +} +``` + +## Parameters + +The following table lists all top-level parameters supported by `nested` queries. + +| Parameter | Required/Optional | Description | +|:---|:---|:---| +| `path` | Required | Specifies the path to the nested object that you want to search. | +| `query` | Required | The query to run on the nested objects within the specified `path`. If a nested object matches the query, the root parent document is returned. You can search nested fields using dot notation, such as `nested_object.subfield`. Multi-level nesting is supported and automatically detected. Thus, an inner `nested` query within another nested query automatically matches the correct nesting level, instead of the root. | +| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `path` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `path` field. Default is `false`. | +| `score_mode` | Optional | Defines how scores of matching inner documents influence the parent document's score. Valid values are:
- `avg`: Uses the average relevance score of all matching inner documents.
- `max`: Assigns the highest relevance score from the matching inner documents to the parent.
- `min`: Assigns the lowest relevance score from the matching inner documents to the parent.
- `sum`: Sums the relevance scores of all matching inner documents.
- `none`: Ignores the relevance scores of inner documents and assigns a score of `0` to the parent document.
Default is `avg`. | +| `inner_hits` | Optional | If provided, returns the underlying hits that matched the query. | + +## Next steps + +- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). \ No newline at end of file diff --git a/_query-dsl/joining/parent-id.md b/_query-dsl/joining/parent-id.md new file mode 100644 index 0000000000..cbf86a796e --- /dev/null +++ b/_query-dsl/joining/parent-id.md @@ -0,0 +1,96 @@ +--- +layout: default +title: Parent ID +parent: Joining queries +nav_order: 40 +--- + +# Parent ID query + +The `parent_id` query returns child documents whose parent document has the specified ID. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type. + +## Example + +Before you can run a `parent_id` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format: + +```json +PUT /example_index +{ + "mappings": { + "properties": { + "relationship_field": { + "type": "join", + "relations": { + "parent_doc": "child_doc" + } + } + } + } +} +``` +{% include copy-curl.html %} + +For this example, first configure an index that contains documents representing products and their brands as described in the [`has_child` query example]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/). + +To search for child documents of a specific parent document, use a `parent_id` query. The following query returns child documents (products) whose parent document has the ID `1`: + +```json +GET testindex1/_search +{ + "query": { + "parent_id": { + "type": "product", + "id": "1" + } + } +} +``` +{% include copy-curl.html %} + +The response returns the child product: + +```json +{ + "took": 57, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.87546873, + "hits": [ + { + "_index": "testindex1", + "_id": "3", + "_score": 0.87546873, + "_routing": "1", + "_source": { + "name": "Mechanical watch", + "sales_count": 150, + "product_to_brand": { + "name": "product", + "parent": "1" + } + } + } + ] + } +} +``` + +## Parameters + +The following table lists all top-level parameters supported by `parent_id` queries. + +| Parameter | Required/Optional | Description | +|:---|:---|:---| +| `type` | Required | Specifies the name of the child relationship as defined in the `join` field mapping. | +| `id` | Required | The ID of the parent document. The query returns child documents associated with this parent document. | +| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `type` field. Default is `false`. | \ No newline at end of file diff --git a/_query-dsl/specialized/neural-sparse.md b/_query-dsl/specialized/neural-sparse.md index 8de3eaf693..904d340b13 100644 --- a/_query-dsl/specialized/neural-sparse.md +++ b/_query-dsl/specialized/neural-sparse.md @@ -11,7 +11,7 @@ Introduced 2.11 Use the `neural_sparse` query for vector field search in [neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). 
The query can use either raw text or sparse vector tokens. -## Request fields +## Request body fields Include the following request fields in the `neural_sparse` query: ### Example: Query by raw text diff --git a/_query-dsl/specialized/neural.md b/_query-dsl/specialized/neural.md index 14b930cdb6..ae9e1f2ea4 100644 --- a/_query-dsl/specialized/neural.md +++ b/_query-dsl/specialized/neural.md @@ -9,7 +9,7 @@ nav_order: 50 Use the `neural` query for vector field search in [neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/). -## Request fields +## Request body fields Include the following request fields in the `neural` query: @@ -35,6 +35,8 @@ Field | Data type | Required/Optional | Description `min_score` | Float | Optional | The minimum score threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). `max_distance` | Float | Optional | The maximum distance threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). `filter` | Object | Optional | A query that can be used to reduce the number of documents considered. For more information about filter usage, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). **Important**: Filter can only be used with the `faiss` or `lucene` engines. +`method_parameters` | Object | Optional | Parameters passed to the k-NN index during search. See [Additional query parameters]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#additional-query-parameters). +`rescore` | Object | Optional | Parameters for configuring rescoring functionality for k-NN indexes built using quantization. See [Rescoring]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision). #### Example request diff --git a/_query-dsl/term/terms.md b/_query-dsl/term/terms.md index 42c74c0436..2de0b71bd6 100644 --- a/_query-dsl/term/terms.md +++ b/_query-dsl/term/terms.md @@ -39,6 +39,7 @@ Parameter | Data type | Description :--- | :--- | :--- `` | String | The field in which to search. A document is returned in the results only if its field value exactly matches at least one term, with the correct spacing and capitalization. `boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. +`value_type` | String | Specifies the types of values used for filtering. Valid values are `default` and `bitmap`. If omitted, the value defaults to `default`. ## Terms lookup @@ -182,7 +183,7 @@ PUT classes/_doc/102 To search for students enrolled in `CS102`, use the dot path notation to specify the full path to the field in the `path` parameter: ```json -ET students/_search +GET students/_search { "query": { "terms": { @@ -250,3 +251,136 @@ Parameter | Data type | Description `path` | String | The name of the field from which to fetch field values. Specify nested fields using dot path notation. Required. `routing` | String | Custom routing value of the document from which to fetch field values. Optional. 
Required if a custom routing value was provided when the document was indexed. `boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. + +## Bitmap filtering +**Introduced 2.17** +{: .label .label-purple } + +The `terms` query can filter for multiple terms simultaneously. However, when the number of terms in the input filter increases to a large value (around 10,000), the resulting network and memory overhead can become significant, making the query inefficient. In such cases, consider encoding your large terms filter using a [roaring bitmap](https://github.com/RoaringBitmap/RoaringBitmap) for more efficient filtering. + +The following example assumes that you have two indexes: a `products` index, which contains all the products sold by a company, and a `customers` index, which stores filters representing customers who own specific products. + +First, create a `products` index and map `product_id` as a `keyword`: + +```json +PUT /products +{ + "mappings": { + "properties": { + "product_id": { "type": "keyword" } + } + } +} +``` +{% include copy-curl.html %} + +Next, index three documents that correspond to products: + +```json +PUT students/_doc/1 +{ + "name": "Product 1", + "product_id" : "111" +} +``` +{% include copy-curl.html %} + +```json +PUT students/_doc/2 +{ + "name": "Product 2", + "product_id" : "222" +} +``` +{% include copy-curl.html %} + +```json +PUT students/_doc/3 +{ + "name": "Product 3", + "product_id" : "333" +} +``` +{% include copy-curl.html %} + +To store customer bitmap filters, you'll create a `customer_filter` [binary field](https://opensearch.org/docs/latest/field-types/supported-field-types/binary/) in the `customers` index. Specify `store` as `true` to store the field: + +```json +PUT /customers +{ + "mappings": { + "properties": { + "customer_filter": { + "type": "binary", + "store": true + } + } + } +} +``` +{% include copy-curl.html %} + +For each customer, you need to generate a bitmap that represents the product IDs of the products the customer owns. This bitmap effectively encodes the filter criteria for that customer. In this example, you'll create a `terms` filter for a customer whose ID is `customer123` and who owns products `111`, `222`, and `333`. + +To encode a `terms` filter for the customer, first create a roaring bitmap for the filter. This example creates a bitmap using the [PyRoaringBitMap] library, so first run `pip install pyroaring` to install the library. Then serialize the bitmap and encode it using a [Base64](https://en.wikipedia.org/wiki/Base64) encoding scheme: + +```py +from pyroaring import BitMap +import base64 + +# Create a bitmap, serialize it into a byte string, and encode into Base64 +bm = BitMap([111, 222, 333]) # product ids owned by a customer +encoded = base64.b64encode(BitMap.serialize(bm)) + +# Convert the Base64-encoded bytes to a string for storage or transmission +encoded_bm_str = encoded.decode('utf-8') + +# Print the encoded bitmap +print(f"Encoded Bitmap: {encoded_bm_str}") +``` +{% include copy.html %} + +Next, index the customer filter into the `customers` index. The document ID for the filter is the same as the ID for the corresponding customer (in this example, `customer123`). 
The `customer_filter` field contains the bitmap you generated for this customer: + +```json +POST customers/_doc/customer123 +{ + "customer_filter": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==" +} +``` +{% include copy-curl.html %} + +Now you can run a `terms` query on the `products` index to look up a specific customer in the `customers` index. Because you're looking up a stored field instead of `_source`, set `store` to `true`. In the `value_type` field, specify the data type of the `terms` input as `bitmap`: + +```json +POST /products/_search +{ + "query": { + "terms": { + "product_id": { + "index": "customers", + "id": "customer123", + "path": "customer_filter", + "store": true + }, + "value_type": "bitmap" + } + } +} +``` +{% include copy-curl.html %} + +You can also directly pass the bitmap to the `terms` query. In this example, the `product_id` field contains the customer filter bitmap for the customer whose ID is `customer123`: + +```json +POST /products/_search +{ + "query": { + "terms": { + "product_id": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==", + "value_type": "bitmap" + } + } +} +``` +{% include copy-curl.html %} diff --git a/_reporting/report-dashboard-index.md b/_reporting/report-dashboard-index.md index 0df87a965c..5e6d07b802 100644 --- a/_reporting/report-dashboard-index.md +++ b/_reporting/report-dashboard-index.md @@ -11,7 +11,7 @@ redirect_from: You can use OpenSearch Dashboards to create PNG, PDF, and CSV reports. To create reports, you must have the correct permissions. For a summary of the predefined roles and the permissions they grant, see the [Security plugin]({{site.url}}{{site.baseurl}}/security/access-control/users-roles#predefined-roles). -CSV reports have a non-configurable 10,000 row limit. They have no explicit size limit (for example, MB), but extremely large documents could cause report generation to fail with an out of memory error from the V8 JavaScript engine. +CSV reports have a non-configurable 10,000-row limit in OpenSearch version 2.16 and earlier. As of version 2.17, this limit can be configured when setting up a report. While reports have no explicit size limit (for example, MB), extremely large documents could cause report generation to fail with an out-of-memory error from the V8 JavaScript engine. 
{: .tip } ## Generating reports diff --git a/_sass/color_schemes/opensearch.scss b/_sass/color_schemes/opensearch.scss index 7a683e3bcb..754c697037 100644 --- a/_sass/color_schemes/opensearch.scss +++ b/_sass/color_schemes/opensearch.scss @@ -76,12 +76,6 @@ $btn-primary-color: $purple-300; $base-button-color: $grey-lt-300; $table-border-color: $grey-lt-300; -// $border-color: $grey-dk-200; -// $search-result-preview-color: $grey-dk-000; -// $search-background-color: $grey-dk-250; -// $table-background-color: $grey-dk-250; -// $feedback-color: darken($sidebar-color, 3%); - $content-width: 740px; $max-content-width: 1080px; $top-button-margin: 48px; diff --git a/_sass/custom/custom.scss b/_sass/custom/custom.scss index 3a9dcc5e6d..b3ee3c3775 100755 --- a/_sass/custom/custom.scss +++ b/_sass/custom/custom.scss @@ -1039,14 +1039,25 @@ body { display: flex; align-items: flex-start; justify-content: center; - gap: 20px; - margin: 0 auto; + gap: 0; + border-top: 1px solid #eeebee; + flex-direction: column; + @include mq(md) { + flex-direction: row; + gap: 20px + } } .search-page--sidebar { - flex: 1; - max-width: 200px; - flex: 0 0 200px; + max-width: 100%; + order: 2; + margin-top: 1rem; + color: $grey-dk-300; + @include mq(md) { + flex: 1; + max-width: 200px; + margin-top: 3rem; + } } .search-page--sidebar--category-filter--checkbox-child { @@ -1054,52 +1065,96 @@ body { } .search-page--results { - flex: 3; display: flex; flex-direction: column; align-items: center; - max-width: 60%; + width: 100%; + max-width: 100%; + order: 3; + @include mq(md) { + flex: 3; + max-width: 60%; + } } -.search-page--results--input { - width: 100%; +.search-page--results--wrapper { position: relative; + display: flex; + width: 100%; + background-color: white; + margin: 0 auto 2rem; + max-width: 800px; } .search-page--results--input-box { width: 100%; - padding: 10px; - margin-bottom: 20px; - border: 1px solid #ccc; + padding: 10px 40px 10px 10px; + border: 1px solid $grey-lt-300; border-radius: 4px; + color: $grey-dk-300; } .search-page--results--input-icon { position: absolute; - top: 35%; - right: 10px; - transform: translateY(-50%); + right: 12px; + align-self: center; pointer-events: none; - color: #333; + color: $grey-dk-000; } -.search-page--results--diplay { +.search-page--results--display { width: 100%; position: relative; flex-flow: column nowrap; + margin-top: 1rem; + @media (max-width: $content-width) { + margin-top: 0.5rem; + } } -.search-page--results--diplay--header { +.search-page--results--display--header { text-align: center; - margin-bottom: 20px; background-color: transparent; + color: $grey-dk-300; + margin-bottom: 1rem; + margin-top: 1.5rem; + padding-bottom: 1rem; + border-bottom: 1px solid $blue-dk-100; + font-size: 20px; + @include mq(md) { + font-size: 1.5rem; + } } -.search-page--results--diplay--container--item { - margin-bottom: 1%; +.search-page--results--display--container--item { + margin-bottom: 2rem; display: block; } +.search-page--results--no-results { + padding: 1rem; + display: block; + font-size: 1rem; + font-weight: normal; +} + +.search-page--results--display--container--item--link { + font-family: "Open Sans Condensed", Impact, "Franklin Gothic Bold", sans-serif; + font-size: 1.2rem; + font-weight: bold; + display: block; + text-decoration: underline; + text-underline-offset: 5px; + text-decoration-color: $grey-lt-300; + &:hover { + text-decoration-color: $blue-100; + } +} + +.category-checkbox { + margin-right: 4px; +} + @mixin body-text($color: #000) { color: $color; 
font-family: 'Open Sans'; diff --git a/_search-plugins/caching/request-cache.md b/_search-plugins/caching/request-cache.md index 124152300b..49d5e8cd82 100644 --- a/_search-plugins/caching/request-cache.md +++ b/_search-plugins/caching/request-cache.md @@ -28,6 +28,7 @@ Setting | Data type | Default | Level | Static/Dynamic | Description `indices.cache.cleanup_interval` | Time unit | `1m` (1 minute) | Cluster | Static | Schedules a recurring background task that cleans up expired entries from the cache at the specified interval. `indices.requests.cache.size` | Percentage | `1%` | Cluster | Static | The cache size as a percentage of the heap size (for example, to use 1% of the heap, specify `1%`). `index.requests.cache.enable` | Boolean | `true` | Index | Dynamic | Enables or disables the request cache. +`indices.requests.cache.maximum_cacheable_size` | Integer | `0` | Cluster | Dynamic | Sets the maximum `size` of queries to be added to the request cache. ### Example diff --git a/_search-plugins/caching/tiered-cache.md b/_search-plugins/caching/tiered-cache.md index 22b1138be8..1b793c8465 100644 --- a/_search-plugins/caching/tiered-cache.md +++ b/_search-plugins/caching/tiered-cache.md @@ -44,7 +44,7 @@ In OpenSearch 2.14, a request cache can be used in a tiered cache. To begin, con To use the OpenSearch-provided tiered spillover cache implementation, set the cache store name to `tiered_spillover`, as shown in the following example: ```yaml -indices.request.cache.store.name: tiered_spillover +indices.requests.cache.store.name: tiered_spillover ``` {% include copy.html %} @@ -53,8 +53,8 @@ indices.request.cache.store.name: tiered_spillover Set the on-heap and disk store tiers to `opensearch_onheap` and `ehcache_disk`, as shown in the following example: ```yaml -indices.request.cache.tiered_spillover.onheap.store.name: opensearch_onheap -indices.request.cache.tiered_spillover.disk.store.name: ehcache_disk +indices.requests.cache.tiered_spillover.onheap.store.name: opensearch_onheap +indices.requests.cache.tiered_spillover.disk.store.name: ehcache_disk ``` The `opensearch_onheap` setting uses the built-in on-heap cache available in OpenSearch. @@ -68,19 +68,19 @@ The following table lists the cache store settings for the `opensearch_onheap` s Setting | Data type | Default | Description :--- | :--- | :--- | :--- -`indices.request.cache.opensearch_onheap.size` | Percentage | 1% of the heap size | The size of the on-heap cache. Optional. -`indices.request.cache.opensearch_onheap.expire` | Time unit | `MAX_VALUE` (disabled) | Specifies a time-to-live (TTL) for the cached results. Optional. +`indices.requests.cache.opensearch_onheap.size` | Percentage | 1% of the heap size | The size of the on-heap cache. Optional. +`indices.requests.cache.opensearch_onheap.expire` | Time unit | `MAX_VALUE` (disabled) | Specifies a time-to-live (TTL) for the cached results. Optional. The following table lists the disk cache store settings for the `ehcache_disk` store. Setting | Data type | Default | Description :--- | :--- | :--- | :--- -`indices.request.cache.ehcache_disk.max_size_in_bytes` | Long | `1073741824` (1 GB) | Defines the size of the disk cache. Optional. -`indices.request.cache.ehcache_disk.storage.path` | String | `""` | Defines the storage path for the disk cache. Required. -`indices.request.cache.ehcache_disk.expire_after_access` | Time unit | `MAX_VALUE` (disabled) | Specifies a TTL for the cached results. Optional. 
-`indices.request.cache.ehcache_disk.alias` | String | `ehcacheDiskCache#INDICES_REQUEST_CACHE` | Specifies an alias for the disk cache. Optional. -`indices.request.cache.ehcache_disk.segments` | Integer | `16` | Defines the number of segments into which the disk cache is separated. Used for concurrency. Optional. -`indices.request.cache.ehcache_disk.concurrency` | Integer | `1` | Defines the number of distinct write queues created for the disk store, where a group of segments shares a write queue. Optional. +`indices.requests.cache.ehcache_disk.max_size_in_bytes` | Long | `1073741824` (1 GB) | Defines the size of the disk cache. Optional. +`indices.requests.cache.ehcache_disk.storage.path` | String | `{data.paths}/nodes/{node.id}/request_cache` | Defines the storage path for the disk cache. Optional. +`indices.requests.cache.ehcache_disk.expire_after_access` | Time unit | `MAX_VALUE` (disabled) | Specifies a TTL for the cached results. Optional. +`indices.requests.cache.ehcache_disk.alias` | String | `ehcacheDiskCache#INDICES_REQUEST_CACHE` | Specifies an alias for the disk cache. Optional. +`indices.requests.cache.ehcache_disk.segments` | Integer | `16` | Defines the number of segments into which the disk cache is separated. Used for concurrency. Optional. +`indices.requests.cache.ehcache_disk.concurrency` | Integer | `1` | Defines the number of distinct write queues created for the disk store, where a group of segments shares a write queue. Optional. ### Additional settings for the `tiered_spillover` store @@ -88,8 +88,11 @@ The following table lists additional settings for the `tiered_spillover` store s Setting | Data type | Default | Description :--- | :--- | :--- | :--- -`indices.request.cache.tiered_spillover.disk.store.policies.took_time.threshold` | Time unit | `10ms` | A policy used to determine whether to cache a query into a disk cache based on its took time. This is a dynamic setting. Optional. -`indices.request.cache.tiered_spillover.disk.store.enabled` | Boolean | `True` | Enables or disables the disk cache dynamically within a tiered spillover cache. Note: After disabling a disk cache, entries are not removed automatically and requires the cache to be manually cleared. Optional. +`indices.requests.cache.tiered_spillover.disk.store.policies.took_time.threshold` | Time unit | `10ms` | A policy used to determine whether to cache a query into a disk cache based on its took time. This is a dynamic setting. Optional. +`indices.requests.cache.tiered_spillover.disk.store.enabled` | Boolean | `True` | Enables or disables the disk cache dynamically within a tiered spillover cache. Note: After disabling a disk cache, entries are not removed automatically and requires the cache to be manually cleared. Optional. +`indices.requests.cache.tiered_spillover.onheap.store.size` | Percentage | 1% of the heap size | Defines the size of the on-heap cache within tiered cache. Optional. +`indices.requests.cache.tiered_spillover.disk.store.size` | Long | `1073741824` (1 GB) | Defines the size of the disk cache within tiered cache. Optional. +`indices.requests.cache.tiered_spillover.segments` | Integer | `2 ^ (ceil(log2(CPU_CORES * 1.5)))` | This determines the number of segments in the tiered cache, with each segment secured by a re-entrant read/write lock. These locks enable multiple concurrent readers without contention, while the segmentation allows multiple writers to operate simultaneously, resulting in higher write throughput. Optional. 
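For example, the two cache tiers and the took-time policy can be tuned together. The following `opensearch.yml` snippet is an illustrative sketch only; the values shown are examples, not recommendations:

```yaml
indices.requests.cache.tiered_spillover.onheap.store.size: 2%
indices.requests.cache.tiered_spillover.disk.store.size: 2147483648
indices.requests.cache.tiered_spillover.disk.store.policies.took_time.threshold: 5ms
```
{% include copy.html %}

Because the took-time threshold is a dynamic setting, it can also be updated at runtime using the Cluster Settings API.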
### Delete stale entries settings diff --git a/_search-plugins/collapse-search.md b/_search-plugins/collapse-search.md new file mode 100644 index 0000000000..ec7e57515a --- /dev/null +++ b/_search-plugins/collapse-search.md @@ -0,0 +1,231 @@ +--- +layout: default +title: Collapse search results +nav_order: 3 +--- + +# Collapse search results + +The `collapse` parameter groups search results by a particular field value. This returns only the top document within each group, which helps reduce redundancy by eliminating duplicates. + +The `collapse` parameter requires the field being collapsed to be of either a `keyword` or a `numeric` type. + +--- + +## Collapsing search results + +To populate an index with data, define the index mappings and an `item` field indexed as a `keyword`. The following example request shows you how to define index mappings, populate an index, and then search it. + +#### Define index mappings + +```json +PUT /bakery-items +{ + "mappings": { + "properties": { + "item": { + "type": "keyword" + }, + "category": { + "type": "keyword" + }, + "price": { + "type": "float" + }, + "baked_date": { + "type": "date" + } + } + } +} +``` + +#### Populate an index + +```json +POST /bakery-items/_bulk +{ "index": {} } +{ "item": "Chocolate Cake", "category": "cakes", "price": 15, "baked_date": "2023-07-01T00:00:00Z" } +{ "index": {} } +{ "item": "Chocolate Cake", "category": "cakes", "price": 18, "baked_date": "2023-07-04T00:00:00Z" } +{ "index": {} } +{ "item": "Vanilla Cake", "category": "cakes", "price": 12, "baked_date": "2023-07-02T00:00:00Z" } +``` + +#### Search the index, returning all results + +```json +GET /bakery-items/_search +{ + "query": { + "match": { + "category": "cakes" + } + }, + "sort": ["price"] +} +``` + +This query returns the uncollapsed search results, showing all documents, including both entries for "Chocolate Cake". + +#### Search the index and collapse the results + +To group search results by the `item` field and sort them by `price`, you can use the following query: + +**Collapsed `item` field search results** + +```json +GET /bakery-items/_search +{ + "query": { + "match": { + "category": "cakes" + } + }, + "collapse": { + "field": "item" + }, + "sort": ["price"] +} +``` + +**Response** + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": null, + "hits": [ + { + "_index": "bakery-items", + "_id": "mISga5EB2HLDXHkv9kAr", + "_score": null, + "_source": { + "item": "Vanilla Cake", + "category": "cakes", + "price": 12, + "baked_date": "2023-07-02T00:00:00Z", + "baker": "Baker A" + }, + "fields": { + "item": [ + "Vanilla Cake" + ] + }, + "sort": [ + 12 + ] + }, + { + "_index": "bakery-items", + "_id": "loSga5EB2HLDXHkv9kAr", + "_score": null, + "_source": { + "item": "Chocolate Cake", + "category": "cakes", + "price": 15, + "baked_date": "2023-07-01T00:00:00Z", + "baker": "Baker A" + }, + "fields": { + "item": [ + "Chocolate Cake" + ] + }, + "sort": [ + 15 + ] + } + ] + } +} +``` + +The collapsed search results will show only one "Chocolate Cake" entry, demonstrating how the `collapse` parameter reduces redundancy. + +The `collapse` parameter affects only the top search results and does not change any aggregation results. The total number of hits shown in the response reflects all matching documents before the parameter is applied, including duplicates. 
However, the response doesn't indicate the exact number of unique groups formed by the operation. + +--- + +## Expanding collapsed results + +You can expand each collapsed top hit with the `inner_hits` property. + +The following example request applies `inner_hits` to retrieve the lowest-priced and most recent item, for each type of cake: + +```json +GET /bakery-items/_search +{ + "query": { + "match": { + "category": "cakes" + } + }, + "collapse": { + "field": "item", + "inner_hits": [ + { + "name": "cheapest_items", + "size": 1, + "sort": ["price"] + }, + { + "name": "newest_items", + "size": 1, + "sort": [{ "baked_date": "desc" }] + } + ] + }, + "sort": ["price"] +} + +``` + +### Multiple inner hits for each collapsed hit + +To obtain several groups of inner hits for each collapsed result, you can set different criteria for each group. For example, lets request the three most recent items for every bakery item: + +```json +GET /bakery-items/_search +{ + "query": { + "match": { + "category": "cakes" + } + }, + "collapse": { + "field": "item", + "inner_hits": [ + { + "name": "cheapest_items", + "size": 1, + "sort": ["price"] + }, + { + "name": "newest_items", + "size": 3, + "sort": [{ "baked_date": "desc" }] + } + ] + }, + "sort": ["price"] +} + + +``` +This query searches for documents in the `cakes` category and groups the search results by the `item_name` field. For each `item_name`, it retrieves the top three lowest-priced items and the top three most recent items, sorted by `baked_date` in descending order. + +You can expand the groups by sending an additional query for each inner hit request corresponding to each collapsed hit in the response. This can significantly slow down the process if there are too many groups or inner hit requests. The `max_concurrent_group_searches` request parameter can be used to control the maximum number of concurrent searches allowed in this phase. The default is based on the number of data nodes and the default search thread pool size. + diff --git a/_search-plugins/concurrent-segment-search.md b/_search-plugins/concurrent-segment-search.md index 80614e2fff..6675faf1f9 100644 --- a/_search-plugins/concurrent-segment-search.md +++ b/_search-plugins/concurrent-segment-search.md @@ -175,7 +175,7 @@ The following sections provide additional considerations for concurrent segment ### The `terminate_after` search parameter -The [`terminate_after` search parameter]({{site.url}}{{site.baseurl}}/api-reference/search/#url-parameters) is used to terminate a search request once a specified number of documents has been collected. If you include the `terminate_after` parameter in a request, concurrent segment search is disabled and the request is run in a non-concurrent manner. +The [`terminate_after` search parameter]({{site.url}}{{site.baseurl}}/api-reference/search/#query-parameters) is used to terminate a search request once a specified number of documents has been collected. If you include the `terminate_after` parameter in a request, concurrent segment search is disabled and the request is run in a non-concurrent manner. Typically, queries are used with smaller `terminate_after` values and thus complete quickly because the search is performed on a reduced dataset. Therefore, concurrent search may not further improve performance in this case. Moreover, when `terminate_after` is used with other search request parameters, such as `track_total_hits` or `size`, it adds complexity and changes the expected query behavior. 
Falling back to a non-concurrent path for search requests that include `terminate_after` ensures consistent results between concurrent and non-concurrent requests. diff --git a/_search-plugins/cross-cluster-search.md b/_search-plugins/cross-cluster-search.md index 7d3ff72efb..48a5e3cfbe 100644 --- a/_search-plugins/cross-cluster-search.md +++ b/_search-plugins/cross-cluster-search.md @@ -38,7 +38,7 @@ To query indexes on remote clusters, users must have `READ` or `SEARCH` permissi indices:admin/shards/search_shards ``` -For more information about the `ccs_minimize_roundtrips` parameter, see the list of [URL Parameters]({{site.url}}{{site.baseurl}}/api-reference/search/#url-parameters) for the Search API. +For more information about the `ccs_minimize_roundtrips` parameter, see the list of [parameters]({{site.url}}{{site.baseurl}}/api-reference/search/#query-parameters) for the Search API. #### Example roles.yml configuration diff --git a/_search-plugins/filter-search.md b/_search-plugins/filter-search.md new file mode 100644 index 0000000000..f8625e0ac0 --- /dev/null +++ b/_search-plugins/filter-search.md @@ -0,0 +1,151 @@ +--- +layout: default +title: Filter search results +nav_order: 36 +--- + +# Filter search results + +You can filter searches using different methods, each suited to specific scenarios. You can apply filters at the query level, using `boolean` query clauses and `post_filter` and `aggregation` level filters, as follows: + +- **Query-level filtering:** Apply `boolean` query filter clauses to filter search hits and aggregations, such as to narrow results to specific categories or brands. +- **Post-filter filtering:** Use `post_filter` to refine search hits based on user selections while preserving all aggregation options. +- **Aggregation-level filtering:** Adjust specific aggregations based on selected filters without impacting other aggregations. + +## Query-level filtering with Boolean queries + +Use a `boolean` query with a filter clause to apply filters to both search hits and aggregations. For example, if a shopper searches for `smartphones` from `BrandA`, a Boolean query can restrict results to only those smartphones from `BrandA`. The following steps guide you through query-level filtering. + +1. Create an index `electronics` and provide the mapping using the following request: + +```json +PUT /electronics +{ + "mappings": { + "properties": { + "brand": { "type": "keyword" }, + "category": { "type": "keyword" }, + "price": { "type": "float" }, + "features": { "type": "keyword" } + } + } +} +``` +{% include copy-curl.html %} + +2. Add documents to the `electronics` index using the following request: + +```json +PUT /electronics/_doc/1?refresh +{ + "brand": "BrandA", + "category": "Smartphone", + "price": 699.99, + "features": ["5G", "Dual Camera"] +} +PUT /electronics/_doc/2?refresh +{ + "brand": "BrandA", + "category": "Laptop", + "price": 1199.99, + "features": ["Touchscreen", "16GB RAM"] +} +PUT /electronics/_doc/3?refresh +{ + "brand": "BrandB", + "category": "Smartphone", + "price": 799.99, + "features": ["5G", "Triple Camera"] +} +``` +{% include copy-curl.html %} + +3. 
Apply a `boolean` filter query to display only `smartphones` from `BrandA` using the following request: + +```json +GET /electronics/_search +{ + "query": { + "bool": { + "filter": [ + { "term": { "brand": "BrandA" }}, + { "term": { "category": "Smartphone" }} + ] + } + } +} +``` +{% include copy-curl.html %} + +## Narrowing results using `post-filter` while preserving aggregation visibility + +Use `post_filter` to limit search hits while preserving all aggregation options. For example, if a shopper selects `BrandA`, results are filtered to show only `BrandA` products while maintaining the visibility of all brand options in the aggregations, as shown in the following example request: + +```json +GET /electronics/_search +{ + "query": { + "bool": { + "filter": { "term": { "category": "Smartphone" }} + } + }, + "aggs": { + "brands": { + "terms": { "field": "brand" } + } + }, + "post_filter": { + "term": { "brand": "BrandA" } + } +} +``` +{% include copy-curl.html %} + +The result should show `BrandA` smartphones in the search hits and all brands in the aggregations. + +## Refining aggregations with aggregation-level filtering + +You can use aggregation-level filtering to apply filters to specific aggregations without affecting the main aggregation to which they belong. + +For example, you can use aggregation-level filtering to filter the `price_ranges` aggregation based on selected brands, `BrandA` and `BrandB`, without affecting the main `price_ranges` aggregation, as shown in the following example request. This displays price ranges relevant to the selected brands while also displaying overall price ranges for all products. + +```json +GET /electronics/_search +{ + "query": { + "bool": { + "filter": { "term": { "category": "Smartphone" }} + } + }, + "aggs": { + "price_ranges": { + "range": { + "field": "price", + "ranges": [ + { "to": 500 }, + { "from": 500, "to": 1000 }, + { "from": 1000 } + ] + } + }, + "filtered_brands": { + "filter": { + "terms": { "brand": ["BrandA", "BrandB"] } + }, + "aggs": { + "price_ranges": { + "range": { + "field": "price", + "ranges": [ + { "to": 500 }, + { "from": 500, "to": 1000 }, + { "from": 1000 } + ] + } + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_search-plugins/improving-search-performance.md b/_search-plugins/improving-search-performance.md index 4a0ffafe11..4cc0a60dc0 100644 --- a/_search-plugins/improving-search-performance.md +++ b/_search-plugins/improving-search-performance.md @@ -11,4 +11,6 @@ OpenSearch offers several ways to improve search performance: - Run resource-intensive queries asynchronously with [asynchronous search]({{site.url}}{{site.baseurl}}/search-plugins/async/). -- Search segments concurrently using [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/). \ No newline at end of file +- Search segments concurrently using [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/). + +- Improve aggregation performance using a [star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/). 
diff --git a/_search-plugins/knn/api.md b/_search-plugins/knn/api.md index c7314f7ae2..d927bf1c35 100644 --- a/_search-plugins/knn/api.md +++ b/_search-plugins/knn/api.md @@ -185,7 +185,7 @@ This API operation only works with indexes created using the `nmslib` and `faiss The following request evicts the native library indexes of three indexes from the cache: ```json -GET /_plugins/_knn/clear_cache/index1,index2,index3?pretty +POST /_plugins/_knn/clear_cache/index1,index2,index3?pretty { "_shards" : { "total" : 6, @@ -200,7 +200,7 @@ The `total` parameter indicates the number of shards that the API attempted to c The k-NN clear cache API can be used with index patterns to clear one or more indexes that match the given pattern from the cache, as shown in the following example: ```json -GET /_plugins/_knn/clear_cache/index*?pretty +POST /_plugins/_knn/clear_cache/index*?pretty { "_shards" : { "total" : 6, @@ -234,7 +234,7 @@ Response field | Description `timestamp` | The date and time when the model was created. `description` | A user-provided description of the model. `error` | An error message explaining why the model is in a failed state. -`space_type` | The space type for which this model is trained, for example, Euclidean or cosine. +`space_type` | The space type for which this model is trained, for example, Euclidean or cosine. Note - this value can be set in the top-level of the request as well `dimension` | The dimensionality of the vector space for which this model is designed. `engine` | The native library used to create the model, either `faiss` or `nmslib`. @@ -351,6 +351,7 @@ Request parameter | Description `search_size` | The training data is pulled from the training index using scroll queries. This parameter defines the number of results to return per scroll query. Default is `10000`. Optional. `description` | A user-provided description of the model. Optional. `method` | The configuration of the approximate k-NN method used for search operations. For more information about the available methods, see [k-NN index method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions). The method requires training to be valid. +`space_type` | The space type for which this model is trained, for example, Euclidean or cosine. Note: This value can also be set in the `method` parameter. 
#### Usage @@ -365,10 +366,10 @@ POST /_plugins/_knn/models/{model_id}/_train?preference={node_id} "max_training_vector_count": 1200, "search_size": 100, "description": "My model", + "space_type": "l2", "method": { "name":"ivf", "engine":"faiss", - "space_type": "l2", "parameters":{ "nlist":128, "encoder":{ @@ -395,10 +396,10 @@ POST /_plugins/_knn/models/_train?preference={node_id} "max_training_vector_count": 1200, "search_size": 100, "description": "My model", + "space_type": "l2", "method": { "name":"ivf", "engine":"faiss", - "space_type": "l2", "parameters":{ "nlist":128, "encoder":{ diff --git a/_search-plugins/knn/approximate-knn.md b/_search-plugins/knn/approximate-knn.md index e9cff8562f..f8921033e0 100644 --- a/_search-plugins/knn/approximate-knn.md +++ b/_search-plugins/knn/approximate-knn.md @@ -49,9 +49,9 @@ PUT my-knn-index-1 "my_vector1": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "nmslib", "parameters": { "ef_construction": 128, @@ -62,9 +62,9 @@ PUT my-knn-index-1 "my_vector2": { "type": "knn_vector", "dimension": 4, + "space_type": "innerproduct", "method": { "name": "hnsw", - "space_type": "innerproduct", "engine": "faiss", "parameters": { "ef_construction": 256, @@ -199,10 +199,10 @@ POST /_plugins/_knn/models/my-model/_train "training_field": "train-field", "dimension": 4, "description": "My model description", + "space_type": "l2", "method": { "name": "ivf", "engine": "faiss", - "space_type": "l2", "parameters": { "nlist": 4, "nprobes": 2 @@ -308,6 +308,72 @@ Engine | Notes :--- | :--- `faiss` | If `nprobes` is present in a query, it overrides the value provided when creating the index. +### Rescoring quantized results using full precision + +Quantization can be used to significantly reduce the memory footprint of a k-NN index. For more information about quantization, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization). Because some vector representation is lost during quantization, the computed distances will be approximate. This causes the overall recall of the search to decrease. + +To improve recall while maintaining the memory savings of quantization, you can use a two-phase search approach. In the first phase, `oversample_factor * k` results are retrieved from an index using quantized vectors and the scores are approximated. In the second phase, the full-precision vectors of those `oversample_factor * k` results are loaded into memory from disk, and scores are recomputed against the full-precision query vector. The results are then reduced to the top k. + +The default rescoring behavior is determined by the `mode` and `compression_level` of the backing k-NN vector field: + +- For `in_memory` mode, no rescoring is applied by default. +- For `on_disk` mode, default rescoring is based on the configured `compression_level`. Each `compression_level` provides a default `oversample_factor`, specified in the following table. 
+ +| Compression level | Default rescore `oversample_factor` | +|:------------------|:----------------------------------| +| `32x` (default) | 3.0 | +| `16x` | 2.0 | +| `8x` | 2.0 | +| `4x` | No default rescoring | +| `2x` | No default rescoring | + +To explicitly apply rescoring, provide the `rescore` parameter in a query on a quantized index and specify the `oversample_factor`: + +```json +GET my-knn-index-1/_search +{ + "size": 2, + "query": { + "knn": { + "target-field": { + "vector": [2, 3, 5, 6], + "k": 2, + "rescore" : { + "oversample_factor": 1.2 + } + } + } + } +} +``` +{% include copy-curl.html %} + +Alternatively, set the `rescore` parameter to `true` to use a default `oversample_factor` of `1.0`: + +```json +GET my-knn-index-1/_search +{ + "size": 2, + "query": { + "knn": { + "target-field": { + "vector": [2, 3, 5, 6], + "k": 2, + "rescore" : true + } + } + } +} +``` +{% include copy-curl.html %} + +The `oversample_factor` is a floating-point number between 1.0 and 100.0, inclusive. The number of results in the first pass is calculated as `oversample_factor * k` and is guaranteed to be between 100 and 10,000, inclusive. If the calculated number of results is smaller than 100, then the number of results is set to 100. If the calculated number of results is greater than 10,000, then the number of results is set to 10,000. + +Rescoring is only supported for the `faiss` engine. + +Rescoring is not needed if quantization is not used because the scores returned are already fully precise. +{: .note} + ### Using approximate k-NN with filters To learn about using filters with k-NN search, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). @@ -322,7 +388,7 @@ To learn more about the radial search feature, see [k-NN radial search]({{site.u ### Using approximate k-NN with binary vectors -To learn more about using binary vectors with k-NN search, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors). +To learn more about using binary vectors with k-NN search, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). ## Spaces @@ -346,5 +412,5 @@ The cosine similarity formula does not include the `1 -` prefix. However, becaus With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown. {: .note } -The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors). +The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). 
{: .note} diff --git a/_search-plugins/knn/disk-based-vector-search.md b/_search-plugins/knn/disk-based-vector-search.md new file mode 100644 index 0000000000..8fe794f44c --- /dev/null +++ b/_search-plugins/knn/disk-based-vector-search.md @@ -0,0 +1,208 @@ +--- +layout: default +title: Disk-based vector search +nav_order: 16 +parent: k-NN search +has_children: false +--- + +# Disk-based vector search +**Introduced 2.17** +{: .label .label-purple} + +For low-memory environments, OpenSearch provides _disk-based vector search_, which significantly reduces the operational costs for vector workloads. Disk-based vector search uses [binary quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#binary-quantization), compressing vectors and thereby reducing the memory requirements. This memory optimization provides large memory savings at the cost of slightly increased search latency while still maintaining strong recall. + +To use disk-based vector search, set the [`mode`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#vector-workload-modes) parameter to `on_disk` for your vector field type. This parameter will configure your index to use secondary storage. + +## Creating an index for disk-based vector search + +To create an index for disk-based vector search, send the following request: + +```json +PUT my-vector-index +{ + "settings" : { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector_field": { + "type": "knn_vector", + "dimension": 8, + "space_type": "innerproduct", + "data_type": "float", + "mode": "on_disk" + } + } + } +} +``` +{% include copy-curl.html %} + +By default, the `on_disk` mode configures the index to use the `faiss` engine and `hnsw` method. The default [`compression_level`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#compression-levels) of `32x` reduces the amount of memory the vectors require by a factor of 32. To preserve the search recall, rescoring is enabled by default. A search on a disk-optimized index runs in two phases: The compressed index is searched first, and then the results are rescored using full-precision vectors loaded from disk. + +To reduce the compression level, provide the `compression_level` parameter when creating the index mapping: + +```json +PUT my-vector-index +{ + "settings" : { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector_field": { + "type": "knn_vector", + "dimension": 8, + "space_type": "innerproduct", + "data_type": "float", + "mode": "on_disk", + "compression_level": "16x" + } + } + } +} +``` +{% include copy-curl.html %} + +For more information about the `compression_level` parameter, see [Compression levels]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#compression-levels). Note that for `4x` compression, the `lucene` engine will be used. +{: .note} + +If you need more granular fine-tuning, you can override additional k-NN parameters in the method definition. 
For example, to improve recall, increase the `ef_construction` parameter value: + +```json +PUT my-vector-index +{ + "settings" : { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector_field": { + "type": "knn_vector", + "dimension": 8, + "space_type": "innerproduct", + "data_type": "float", + "mode": "on_disk", + "method": { + "params": { + "ef_construction": 512 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The `on_disk` mode only works with the `float` data type. +{: .note} + +## Ingestion + +You can perform document ingestion for a disk-optimized vector index in the same way as for a regular vector index. To index several documents in bulk, send the following request: + +```json +POST _bulk +{ "index": { "_index": "my-vector-index", "_id": "1" } } +{ "my_vector_field": [1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5], "price": 12.2 } +{ "index": { "_index": "my-vector-index", "_id": "2" } } +{ "my_vector_field": [2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5], "price": 7.1 } +{ "index": { "_index": "my-vector-index", "_id": "3" } } +{ "my_vector_field": [3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5], "price": 12.9 } +{ "index": { "_index": "my-vector-index", "_id": "4" } } +{ "my_vector_field": [4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5], "price": 1.2 } +{ "index": { "_index": "my-vector-index", "_id": "5" } } +{ "my_vector_field": [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], "price": 3.7 } +{ "index": { "_index": "my-vector-index", "_id": "6" } } +{ "my_vector_field": [6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5], "price": 10.3 } +{ "index": { "_index": "my-vector-index", "_id": "7" } } +{ "my_vector_field": [7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5], "price": 5.5 } +{ "index": { "_index": "my-vector-index", "_id": "8" } } +{ "my_vector_field": [8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5], "price": 4.4 } +{ "index": { "_index": "my-vector-index", "_id": "9" } } +{ "my_vector_field": [9.5, 9.5, 9.5, 9.5, 9.5, 9.5, 9.5, 9.5], "price": 8.9 } +``` +{% include copy-curl.html %} + +## Search + +Search is also performed in the same way as in other index configurations. The key difference is that, by default, the `oversample_factor` of the rescore parameter is set to `3.0` (unless you override the `compression_level`). For more information, see [Rescoring quantized results using full precision]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision). To perform vector search on a disk-optimized index, provide the search vector: + +```json +GET my-vector-index/_search +{ + "query": { + "knn": { + "my_vector_field": { + "vector": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5], + "k": 5 + } + } + } +} +``` +{% include copy-curl.html %} + +Similarly to other index configurations, you can override k-NN parameters in the search request: + +```json +GET my-vector-index/_search +{ + "query": { + "knn": { + "my_vector_field": { + "vector": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5], + "k": 5, + "method_parameters": { + "ef_search": 512 + }, + "rescore": { + "oversample_factor": 10.0 + } + } + } + } +} +``` +{% include copy-curl.html %} + +[Radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/) does not support disk-based vector search. 
+{: .note} + +## Model-based indexes + +For [model-based indexes]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model), you can specify the `on_disk` parameter in the training request in the same way that you would specify it during index creation. By default, `on_disk` mode will use the [Faiss IVF method]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#supported-faiss-methods) and a compression level of `32x`. To run the training API, send the following request: + +```json +POST /_plugins/_knn/models/test-model/_train +{ + "training_index": "train-index-name", + "training_field": "train-field-name", + "dimension": 8, + "max_training_vector_count": 1200, + "search_size": 100, + "description": "My model", + "space_type": "innerproduct", + "mode": "on_disk" +} +``` +{% include copy-curl.html %} + +This command assumes that training data has been ingested into the `train-index-name` index. For more information, see [Building a k-NN index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model). +{: .note} + +You can override the `compression_level` for disk-optimized indexes in the same way as for regular k-NN indexes. + + +## Next steps + +- For more information about binary quantization, see [Binary quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#binary-quantization). +- For more information about k-NN vector workload modes, see [Vector workload modes]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#vector-workload-modes). \ No newline at end of file diff --git a/_search-plugins/knn/knn-index.md b/_search-plugins/knn/knn-index.md index a6ffd922eb..b53fa997d8 100644 --- a/_search-plugins/knn/knn-index.md +++ b/_search-plugins/knn/knn-index.md @@ -25,9 +25,9 @@ PUT /test-index "my_vector1": { "type": "knn_vector", "dimension": 3, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "lucene", "parameters": { "ef_construction": 128, @@ -41,17 +41,17 @@ PUT /test-index ``` {% include copy-curl.html %} -## Lucene byte vector +## Byte vectors -Starting with k-NN plugin version 2.9, you can use `byte` vectors with the `lucene` engine to reduce the amount of storage space needed. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector). +Starting with k-NN plugin version 2.17, you can use `byte` vectors with the `faiss` and `lucene` engines to reduce the amount of required memory and storage space. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors). -## Binary vector +## Binary vectors -Starting with k-NN plugin version 2.16, you can use `binary` vectors with the `faiss` engine to reduce the amount of required storage space. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors). +Starting with k-NN plugin version 2.16, you can use `binary` vectors with the `faiss` engine to reduce the amount of required storage space. For more information, see [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). 
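For reference, the following is a minimal mapping sketch for a binary vector field using the `faiss` engine (the index and field names are illustrative). The dimension is specified in bits and must be a multiple of 8:

```json
PUT /test-binary-index
{
  "settings": {
    "index": {
      "knn": true
    }
  },
  "mappings": {
    "properties": {
      "my_binary_vector": {
        "type": "knn_vector",
        "dimension": 8,
        "data_type": "binary",
        "space_type": "hamming",
        "method": {
          "name": "hnsw",
          "engine": "faiss"
        }
      }
    }
  }
}
```
{% include copy-curl.html %}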
## SIMD optimization for the Faiss engine -Starting with version 2.13, the k-NN plugin supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency. +Starting with version 2.13, the k-NN plugin supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency. Starting with version 2.18, the k-NN plugin supports AVX512 SIMD instructions on x64 architecture. SIMD optimization is applicable only if the vector dimension is a multiple of 8. {: .note} @@ -60,14 +60,22 @@ SIMD optimization is applicable only if the vector dimension is a multiple of 8. ### x64 architecture -For the x64 architecture, two different versions of the Faiss library are built and shipped with the artifact: +For x64 architecture, the following versions of the Faiss library are built and shipped with the artifact: - `libopensearchknn_faiss.so`: The non-optimized Faiss library without SIMD instructions. -- `libopensearchknn_faiss_avx2.so`: The Faiss library that contains AVX2 SIMD instructions. +- `libopensearchknn_faiss_avx512.so`: The Faiss library containing AVX512 SIMD instructions. +- `libopensearchknn_faiss_avx2.so`: The Faiss library containing AVX2 SIMD instructions. -If your hardware supports AVX2, the k-NN plugin loads the `libopensearchknn_faiss_avx2.so` library at runtime. +When using the Faiss library, the performance ranking is as follows: AVX512 > AVX2 > no optimization. +{: .note } + +If your hardware supports AVX512, the k-NN plugin loads the `libopensearchknn_faiss_avx512.so` library at runtime. + +If your hardware supports AVX2 but doesn't support AVX512, the k-NN plugin loads the `libopensearchknn_faiss_avx2.so` library at runtime. + +To disable the AVX512 and AVX2 SIMD instructions and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx512.disabled` and `knn.faiss.avx2.disabled` static settings as `true` in `opensearch.yml` (by default, both of these are `false`). -To disable AVX2 and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx2.disabled` static setting as `true` in `opensearch.yml` (default is `false`). Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings). +Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings). 
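For example, to force the non-optimized library on x64 hardware, both SIMD variants could be disabled in `opensearch.yml`, as in the following sketch. Because these are static settings, the cluster must be restarted for the change to take effect:

```yaml
knn.faiss.avx512.disabled: true
knn.faiss.avx2.disabled: true
```
{% include copy.html %}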
### ARM64 architecture @@ -83,8 +91,8 @@ A method definition will always contain the name of the method, the space_type t Mapping parameter | Required | Default | Updatable | Description :--- | :--- | :--- | :--- | :--- `name` | true | n/a | false | The identifier for the nearest neighbor method. -`space_type` | false | l2 | false | The vector space used to calculate the distance between vectors. -`engine` | false | nmslib | false | The approximate k-NN library to use for indexing and search. The available libraries are faiss, nmslib, and Lucene. +`space_type` | false | l2 | false | The vector space used to calculate the distance between vectors. Note: This value can also be specified at the top level of the mapping. +`engine` | false | faiss | false | The approximate k-NN library to use for indexing and search. The available libraries are `faiss`, `nmslib`, and `lucene`. `parameters` | false | null | false | The parameters used for the nearest neighbor method. ### Supported nmslib methods @@ -116,7 +124,7 @@ Method name | Requires training | Supported spaces | Description For hnsw, "innerproduct" is not available when PQ is used. {: .note} -The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors). +The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). {: .note} #### HNSW parameters @@ -168,7 +176,6 @@ An index created in OpenSearch version 2.11 or earlier will still use the old `e "method": { "name":"hnsw", "engine":"lucene", - "space_type": "l2", "parameters":{ "m":2048, "ef_construction": 245 @@ -186,7 +193,6 @@ The following example method definition specifies the `hnsw` method and a `pq` e "method": { "name":"hnsw", "engine":"faiss", - "space_type": "l2", "parameters":{ "encoder":{ "name":"pq", @@ -232,7 +238,6 @@ The following example uses the `ivf` method without specifying an encoder (by d "method": { "name":"ivf", "engine":"faiss", - "space_type": "l2", "parameters":{ "nlist": 4, "nprobes": 2 @@ -246,7 +251,6 @@ The following example uses the `ivf` method with a `pq` encoder: "method": { "name":"ivf", "engine":"faiss", - "space_type": "l2", "parameters":{ "encoder":{ "name":"pq", @@ -265,7 +269,6 @@ The following example uses the `hnsw` method without specifying an encoder (by d "method": { "name":"hnsw", "engine":"faiss", - "space_type": "l2", "parameters":{ "ef_construction": 256, "m": 8 @@ -279,7 +282,6 @@ The following example uses the `hnsw` method with an `sq` encoder of type `fp16` "method": { "name":"hnsw", "engine":"faiss", - "space_type": "l2", "parameters":{ "encoder": { "name": "sq", @@ -300,7 +302,6 @@ The following example uses the `ivf` method with an `sq` encoder of type `fp16`: "method": { "name":"ivf", "engine":"faiss", - "space_type": "l2", "parameters":{ "encoder": { "name": "sq", @@ -324,7 +325,7 @@ If you want to use less memory and increase indexing speed as compared to HNSW w If memory is a concern, consider adding a PQ encoder to your HNSW or IVF index. Because PQ is a lossy encoding, query quality will drop. 
-You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#faiss-16-bit-scalar-quantization). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#lucene-byte-vector) to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/). +You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#faiss-16-bit-scalar-quantization). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors) to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/). ### Memory estimation @@ -367,6 +368,7 @@ Setting | Default | Updatable | Description :--- | :--- | :--- | :--- `index.knn` | false | false | Whether the index should build native library indexes for the `knn_vector` fields. If set to false, the `knn_vector` fields will be stored in doc values, but approximate k-NN search functionality will be disabled. `index.knn.algo_param.ef_search` | 100 | true | The size of the dynamic list used during k-NN searches. Higher values result in more accurate but slower searches. Only available for NMSLIB. +`index.knn.advanced.approximate_threshold` | 15,000 | true | The number of vectors a segment must have before creating specialized data structures for approximate search. Set to `-1` to disable building vector data structures and `0` to always build them. `index.knn.algo_param.ef_construction` | 100 | false | Deprecated in 1.0.0. Instead, use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value. `index.knn.algo_param.m` | 16 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. `index.knn.space_type` | l2 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. diff --git a/_search-plugins/knn/knn-score-script.md b/_search-plugins/knn/knn-score-script.md index d2fd883e74..a184de2d3d 100644 --- a/_search-plugins/knn/knn-score-script.md +++ b/_search-plugins/knn/knn-score-script.md @@ -302,5 +302,5 @@ Cosine similarity returns a number between -1 and 1, and because OpenSearch rele With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ... ]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown. {: .note } -The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. 
For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors). +The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). {: .note} diff --git a/_search-plugins/knn/knn-vector-quantization.md b/_search-plugins/knn/knn-vector-quantization.md index 656ce72fd2..a911dc91c9 100644 --- a/_search-plugins/knn/knn-vector-quantization.md +++ b/_search-plugins/knn/knn-vector-quantization.md @@ -11,15 +11,15 @@ has_math: true By default, the k-NN plugin supports the indexing and querying of vectors of type `float`, where each dimension of the vector occupies 4 bytes of memory. For use cases that require ingestion on a large scale, keeping `float` vectors can be expensive because OpenSearch needs to construct, load, save, and search graphs (for native `nmslib` and `faiss` engines). To reduce the memory footprint, you can use vector quantization. -OpenSearch supports many varieties of quantization. In general, the level of quantization will provide a trade-off between the accuracy of the nearest neighbor search and the size of the memory footprint consumed by the vector search. The supported types include byte vectors, 16-bit scalar quantization, and product quantization (PQ). +OpenSearch supports many varieties of quantization. In general, the level of quantization will provide a trade-off between the accuracy of the nearest neighbor search and the size of the memory footprint consumed by the vector search. The supported types include byte vectors, 16-bit scalar quantization, product quantization (PQ), and binary quantization(BQ). -## Lucene byte vector +## Byte vectors -Starting with k-NN plugin version 2.9, you can use `byte` vectors with the Lucene engine in order to reduce the amount of required memory. This requires quantizing the vectors outside of OpenSearch before ingesting them into an OpenSearch index. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector). +Starting with version 2.17, the k-NN plugin supports `byte` vectors with the `faiss` and `lucene` engines in order to reduce the amount of required memory. This requires quantizing the vectors outside of OpenSearch before ingesting them into an OpenSearch index. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors). ## Lucene scalar quantization -Starting with version 2.16, the k-NN plugin supports built-in scalar quantization for the Lucene engine. Unlike the [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector), which requires you to quantize vectors before ingesting the documents, the Lucene scalar quantizer quantizes input vectors in OpenSearch during ingestion. The Lucene scalar quantizer converts 32-bit floating-point input vectors into 7-bit integer vectors in each segment using the minimum and maximum quantiles computed based on the [`confidence_interval`](#confidence-interval) parameter. During search, the query vector is quantized in each segment using the segment's minimum and maximum quantiles in order to compute the distance between the query vector and the segment's quantized input vectors. 
+Starting with version 2.16, the k-NN plugin supports built-in scalar quantization for the Lucene engine. Unlike [byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors), which require you to quantize vectors before ingesting documents, the Lucene scalar quantizer quantizes input vectors in OpenSearch during ingestion. The Lucene scalar quantizer converts 32-bit floating-point input vectors into 7-bit integer vectors in each segment using the minimum and maximum quantiles computed based on the [`confidence_interval`](#confidence-interval) parameter. During search, the query vector is quantized in each segment using the segment's minimum and maximum quantiles in order to compute the distance between the query vector and the segment's quantized input vectors. Quantization can decrease the memory footprint by a factor of 4 in exchange for some loss in recall. Additionally, quantization slightly increases disk usage because it requires storing both the raw input vectors and the quantized vectors. @@ -40,10 +40,10 @@ PUT /test-index "my_vector1": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", "engine": "lucene", - "space_type": "l2", "parameters": { "encoder": { "name": "sq" @@ -85,10 +85,10 @@ PUT /test-index "my_vector1": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", "engine": "lucene", - "space_type": "l2", "parameters": { "encoder": { "name": "sq", @@ -115,7 +115,7 @@ In the ideal scenario, 7-bit vectors created by the Lucene scalar quantizer use #### HNSW memory estimation -The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * M)` bytes/vector, where `M` is the maximum number of bidirectional links created for each element during the construction of the graph. +The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. As an example, assume that you have 1 million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows: @@ -150,10 +150,10 @@ PUT /test-index "my_vector1": { "type": "knn_vector", "dimension": 3, + "space_type": "l2", "method": { "name": "hnsw", "engine": "faiss", - "space_type": "l2", "parameters": { "encoder": { "name": "sq" @@ -194,10 +194,10 @@ PUT /test-index "my_vector1": { "type": "knn_vector", "dimension": 3, + "space_type": "l2", "method": { "name": "hnsw", "engine": "faiss", - "space_type": "l2", "parameters": { "encoder": { "name": "sq", @@ -250,9 +250,9 @@ In the best-case scenario, 16-bit vectors produced by the Faiss SQfp16 quantizer #### HNSW memory estimation -The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (2 * dimension + 8 * M)` bytes/vector. +The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (2 * dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. -As an example, assume that you have 1 million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows: +As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. 
The memory requirement can be estimated as follows: ```r 1.1 * (2 * 256 + 8 * 16) * 1,000,000 ~= 0.656 GB @@ -260,9 +260,9 @@ As an example, assume that you have 1 million vectors with a dimension of 256 an #### IVF memory estimation -The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * d))` bytes/vector. +The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * dimension))` bytes/vector, where `nlist` is the number of buckets to partition vectors into. -As an example, assume that you have 1 million vectors with a dimension of 256 and `nlist` of 128. The memory requirement can be estimated as follows: +As an example, assume that you have 1 million vectors with a dimension of 256 and an `nlist` of 128. The memory requirement can be estimated as follows: ```r 1.1 * (((2 * 256) * 1,000,000) + (4 * 128 * 256)) ~= 0.525 GB @@ -310,3 +310,175 @@ For example, assume that you have 1 million vectors with a dimension of 256, `iv ```r 1.1*((8 / 8 * 64 + 24) * 1000000 + 100 * (2^8 * 4 * 256 + 4 * 512 * 256)) ~= 0.171 GB ``` + +## Binary quantization + +Starting with version 2.17, OpenSearch supports BQ with binary vector support for the Faiss engine. BQ compresses vectors into a binary format (0s and 1s), making it highly efficient in terms of memory usage. You can choose to represent each vector dimension using 1, 2, or 4 bits, depending on the desired precision. One of the advantages of using BQ is that the training process is handled automatically during indexing. This means that no separate training step is required, unlike other quantization techniques such as PQ. + +### Using BQ +To configure BQ for the Faiss engine, define a `knn_vector` field and specify the `mode` as `on_disk`. This configuration defaults to 1-bit BQ and both `ef_search` and `ef_construction` set to `100`: + +```json +PUT my-vector-index +{ + "mappings": { + "properties": { + "my_vector_field": { + "type": "knn_vector", + "dimension": 8, + "space_type": "l2", + "data_type": "float", + "mode": "on_disk" + } + } + } +} +``` +{% include copy-curl.html %} + +To further optimize the configuration, you can specify additional parameters, such as the compression level, and fine-tune the search parameters. For example, you can override the `ef_construction` value or define the compression level, which corresponds to the number of bits used for quantization: + +- **32x compression** for 1-bit quantization +- **16x compression** for 2-bit quantization +- **8x compression** for 4-bit quantization + +This allows for greater control over memory usage and recall performance, providing flexibility to balance between precision and storage efficiency. 
+ +To specify the compression level, set the `compression_level` parameter: + +```json +PUT my-vector-index +{ + "mappings": { + "properties": { + "my_vector_field": { + "type": "knn_vector", + "dimension": 8, + "space_type": "l2", + "data_type": "float", + "mode": "on_disk", + "compression_level": "16x", + "method": { + "params": { + "ef_construction": 16 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The following example further fine-tunes the configuration by defining `ef_construction`, `encoder`, and the number of `bits` (which can be `1`, `2`, or `4`): + +```json +PUT my-vector-index +{ + "mappings": { + "properties": { + "my_vector_field": { + "type": "knn_vector", + "dimension": 8, + "method": { + "name": "hnsw", + "engine": "faiss", + "space_type": "l2", + "params": { + "m": 16, + "ef_construction": 512, + "encoder": { + "name": "binary", + "parameters": { + "bits": 1 + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Search using binary quantized vectors + +You can perform a k-NN search on your index by providing a vector and specifying the number of nearest neighbors (k) to return: + +```json +GET my-vector-index/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector_field": { + "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5], + "k": 10 + } + } + } +} +``` +{% include copy-curl.html %} + +You can also fine-tune search by providing the `ef_search` and `oversample_factor` parameters. +The `oversample_factor` parameter controls the factor by which the search oversamples the candidate vectors before ranking them. Using a higher oversample factor means that more candidates will be considered before ranking, improving accuracy but also increasing search time. When selecting the `oversample_factor` value, consider the trade-off between accuracy and efficiency. For example, setting the `oversample_factor` to `2.0` will double the number of candidates considered during the ranking phase, which may help achieve better results. + +The following request specifies the `ef_search` and `oversample_factor` parameters: + +```json +GET my-vector-index/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector_field": { + "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5], + "k": 10, + "method_parameters": { + "ef_search": 10 + }, + "rescore": { + "oversample_factor": 10.0 + } + } + } + } +} +``` +{% include copy-curl.html %} + + +#### HNSW memory estimation + +The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. + +As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The following sections provide memory requirement estimations for various compression values. + +##### 1-bit quantization (32x compression) + +In 1-bit quantization, each dimension is represented using 1 bit, equivalent to a 32x compression factor. The memory requirement can be estimated as follows: + +```r +Memory = 1.1 * ((256 * 1 / 8) + 8 * 16) * 1,000,000 + ~= 0.176 GB +``` + +##### 2-bit quantization (16x compression) + +In 2-bit quantization, each dimension is represented using 2 bits, equivalent to a 16x compression factor. 
The memory requirement can be estimated as follows: + +```r +Memory = 1.1 * ((256 * 2 / 8) + 8 * 16) * 1,000,000 + ~= 0.211 GB +``` + +##### 4-bit quantization (8x compression) + +In 4-bit quantization, each dimension is represented using 4 bits, equivalent to an 8x compression factor. The memory requirement can be estimated as follows: + +```r +Memory = 1.1 * ((256 * 4 / 8) + 8 * 16) * 1,000,000 + ~= 0.282 GB +``` diff --git a/_search-plugins/knn/nested-search-knn.md b/_search-plugins/knn/nested-search-knn.md index d947ebc6e6..bbba6c9c1e 100644 --- a/_search-plugins/knn/nested-search-knn.md +++ b/_search-plugins/knn/nested-search-knn.md @@ -38,9 +38,9 @@ PUT my-knn-index-1 "my_vector": { "type": "knn_vector", "dimension": 3, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "lucene", "parameters": { "ef_construction": 100, @@ -324,9 +324,9 @@ PUT my-knn-index-1 "my_vector": { "type": "knn_vector", "dimension": 3, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "lucene", "parameters": { "ef_construction": 100, diff --git a/_search-plugins/knn/painless-functions.md b/_search-plugins/knn/painless-functions.md index cc27776fc4..4b2311ad65 100644 --- a/_search-plugins/knn/painless-functions.md +++ b/_search-plugins/knn/painless-functions.md @@ -51,11 +51,11 @@ The following table describes the available painless functions the k-NN plugin p Function name | Function signature | Description :--- | :--- l2Squared | `float l2Squared (float[] queryVector, doc['vector field'])` | This function calculates the square of the L2 distance (Euclidean distance) between a given query vector and document vectors. The shorter the distance, the more relevant the document is, so this example inverts the return value of the l2Squared function. If the document vector matches the query vector, the result is 0, so this example also adds 1 to the distance to avoid divide by zero errors. -l1Norm | `float l1Norm (float[] queryVector, doc['vector field'])` | This function calculates the square of the L2 distance (Euclidean distance) between a given query vector and document vectors. The shorter the distance, the more relevant the document is, so this example inverts the return value of the l2Squared function. If the document vector matches the query vector, the result is 0, so this example also adds 1 to the distance to avoid divide by zero errors. +l1Norm | `float l1Norm (float[] queryVector, doc['vector field'])` | This function calculates the L1 Norm distance (Manhattan distance) between a given query vector and document vectors. cosineSimilarity | `float cosineSimilarity (float[] queryVector, doc['vector field'])` | Cosine similarity is an inner product of the query vector and document vector normalized to both have a length of 1. If the magnitude of the query vector doesn't change throughout the query, you can pass the magnitude of the query vector to improve performance, instead of calculating the magnitude every time for every filtered document:
`float cosineSimilarity (float[] queryVector, doc['vector field'], float normQueryVector)`
In general, the range of cosine similarity is [-1, 1]. However, in the case of information retrieval, the cosine similarity of two documents ranges from 0 to 1 because the tf-idf statistic can't be negative. Therefore, the k-NN plugin adds 1.0 in order to always yield a positive cosine similarity score. hamming | `float hamming (float[] queryVector, doc['vector field'])` | This function calculates the Hamming distance between a given query vector and document vectors. The Hamming distance is the number of positions at which the corresponding elements are different. The shorter the distance, the more relevant the document is, so this example inverts the return value of the Hamming distance. -The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors). +The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). {: .note} ## Constraints @@ -73,4 +73,4 @@ The `hamming` space type is supported for binary vectors in OpenSearch version 2 Because scores can only be positive, this script ranks documents with vector fields higher than those without. With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown. -{: .note } \ No newline at end of file +{: .note } diff --git a/_search-plugins/knn/performance-tuning.md b/_search-plugins/knn/performance-tuning.md index 123b1daef1..ae2368b597 100644 --- a/_search-plugins/knn/performance-tuning.md +++ b/_search-plugins/knn/performance-tuning.md @@ -45,7 +45,7 @@ If your hardware has multiple cores, you can allow multiple threads in native li Monitor CPU utilization and choose the correct number of threads. Because native library index construction is costly, choosing more threads then you need can cause additional CPU load. -### (Expert-level) Disable vector field storage in the source field +### (Expert level) Disable vector field storage in the source field The `_source` field contains the original JSON document body that was passed at index time. This field is not indexed and is not searchable but is stored so that it can be returned when executing fetch requests such as `get` and `search`. When using vector fields within the source, you can remove the vector field to save disk space, as shown in the following example where the `location` vector is excluded: @@ -59,9 +59,9 @@ The `_source` field contains the original JSON document body that was passed at "location": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "faiss" } } @@ -85,9 +85,9 @@ In OpenSearch 2.15 or later, you can further improve indexing speed and reduce d "location": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "faiss" } } @@ -95,9 +95,74 @@ In OpenSearch 2.15 or later, you can further improve indexing speed and reduce d } ``` -This is an expert-level setting. Disabling the `_recovery_source` may lead to failures during peer-to-peer recovery. 
Before disabling the `_recovery_source`, check with your OpenSearch cluster admin to determine whether your cluster performs regular flushes before starting the peer-to-peer recovery of shards before disabling the `_recovery_source`. +This is an expert-level setting. Disabling the `_recovery_source` may lead to failures during peer-to-peer recovery. Before disabling the `_recovery_source`, check with your OpenSearch cluster admin to determine whether your cluster performs regular flushes before starting the peer-to-peer recovery of shards prior to disabling the `_recovery_source`. {: .warning} +### (Expert level) Build vector data structures on demand + +This approach is recommended only for workloads that involve a single initial bulk upload and will be used exclusively for search after force merging to a single segment. + +During indexing, vector search builds a specialized data structure for a `knn_vector` field to enable efficient approximate k-NN search. However, these structures are rebuilt during [force merge]({{site.url}}{{site.baseurl}}/api-reference/index-apis/force-merge/) on k-NN indexes. To optimize indexing speed, follow these steps: + +1. **Disable vector data structure creation**: Disable vector data structure creation for new segments by setting [`index.knn.advanced.approximate_threshold`]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#index-settings) to `-1`. + + To specify the setting at index creation, send the following request: + + ```json + PUT /test-index/ + { + "settings": { + "index.knn.advanced.approximate_threshold": "-1" + } + } + ``` + {% include copy-curl.html %} + + To specify the setting after index creation, send the following request: + + ```json + PUT /test-index/_settings + { + "index.knn.advanced.approximate_threshold": "-1" + } + ``` + {% include copy-curl.html %} + +1. **Perform bulk indexing**: Index data in [bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) without performing any searches during ingestion: + + ```json + POST _bulk + { "index": { "_index": "test-index", "_id": "1" } } + { "my_vector1": [1.5, 2.5], "price": 12.2 } + { "index": { "_index": "test-index", "_id": "2" } } + { "my_vector1": [2.5, 3.5], "price": 7.1 } + ``` + {% include copy-curl.html %} + + If searches are performed while vector data structures are disabled, they will run using exact k-NN search. + +1. **Reenable vector data structure creation**: Once indexing is complete, enable vector data structure creation by setting `index.knn.advanced.approximate_threshold` to `0`: + + ```json + PUT /test-index/_settings + { + "index.knn.advanced.approximate_threshold": "0" + } + ``` + {% include copy-curl.html %} + + If you do not reset the setting to `0` before the force merge, you will need to reindex your data. + {: .note} + +1. **Force merge segments into one segment**: Perform a force merge and specify `max_num_segments=1` to create the vector data structures only once: + + ```json + POST test-index/_forcemerge?max_num_segments=1 + ``` + {% include copy-curl.html %} + + After the force merge, new search requests will execute approximate k-NN search using the newly created data structures. 
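+
+   You can optionally confirm that the merge completed and that the index now consists of a single segment (using the `test-index` name from the preceding examples) by calling the CAT Segments API:
+
+   ```json
+   GET _cat/segments/test-index?v
+   ```
+   {% include copy-curl.html %}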
+ ## Search performance tuning Take the following steps to improve search performance: diff --git a/_search-plugins/knn/radial-search-knn.md b/_search-plugins/knn/radial-search-knn.md index 1a4a223294..e5449a0993 100644 --- a/_search-plugins/knn/radial-search-knn.md +++ b/_search-plugins/knn/radial-search-knn.md @@ -53,9 +53,9 @@ PUT knn-index-test "my_vector": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "faiss", "parameters": { "ef_construction": 100, diff --git a/_search-plugins/knn/settings.md b/_search-plugins/knn/settings.md index 1b9aa3608c..e4731ec94c 100644 --- a/_search-plugins/knn/settings.md +++ b/_search-plugins/knn/settings.md @@ -27,6 +27,7 @@ Setting | Static/Dynamic | Default | Description `knn.model.index.number_of_replicas`| Dynamic | `1` | The number of replica shards to use for the model system index. Generally, in a multi-node cluster, this value should be at least 1 in order to increase stability. `knn.model.cache.size.limit` | Dynamic | `10%` | The model cache limit cannot exceed 25% of the JVM heap. `knn.faiss.avx2.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx2.so` library and load the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine). +`knn.faiss.avx512.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx512.so` library and load the `libopensearchknn_faiss_avx2.so` library or the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine). ## Index settings diff --git a/_search-plugins/ltr/advanced-functionality.md b/_search-plugins/ltr/advanced-functionality.md new file mode 100644 index 0000000000..50a7e6de19 --- /dev/null +++ b/_search-plugins/ltr/advanced-functionality.md @@ -0,0 +1,541 @@ +--- +layout: default +title: Advanced functionality +nav_order: 80 +parent: Learning to Rank +has_children: false +--- + +# Advanced functionality + +OpenSearch Learning to Rank (LTR) offers additional functionality. It is recommended that you have a foundational understanding of OpenSearch LTR before working with these features. + +## Reusable features + +[Building features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features/) involves uploading a list of features. To avoid repeating common features across multiple sets, you can maintain a library of reusable features. 
+ +For example, if a title field query is frequently used in your feature sets, then you can create a reusable title query using the feature API: + +```json + POST _ltr/_feature/titleSearch + { + "feature": + { + "params": [ + "keywords" + ], + "template": { + "match": { + "title": "{{keywords}}" + } + } + } + } +``` +{% include copy-curl.html %} + +Normal CRUD operations apply, so you can delete a feature by using the following operation: + +```json +DELETE _ltr/_feature/titleSearch +``` +{% include copy-curl.html %} + + +To fetch an individual feature, you can use the following request: + +```json +GET _ltr/_feature/titleSearch +``` +{% include copy-curl.html %} + +To view a list of all features filtered by name prefix, you can use the following request: + +```json +GET /_ltr/_feature?prefix=t +``` +{% include copy-curl.html %} + +To create or update a feature set, you can refer to the `titleSearch` feature by using the following request: + +```json +POST /_ltr/_featureset/my_featureset/_addfeatures/titleSearch +``` +{% include copy-curl.html %} + +This adds the `titleSearch` feature to the next ordinal position within the `my_featureset` feature set. + +## Derived features + +Derived features are those that build upon other features. These can be expressed as [Lucene expressions](http://lucene.apache.org/core/7_1_0/expressions/index.html?org/apache/lucene/expressions/js/package-summary.html) and are identified by the `"template_language": "derived_expression"`. + +Additionally, derived features can accept query-time variables of type [`Number`](https://docs.oracle.com/javase/8/docs/api/java/lang/Number.html), as described in [Creating feature sets]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features#creating-feature-sets). + +### Script features + +Script features are a type of [derived feature](#derived-features). These features have access to the `feature_vector`, but they are implemented as native or Painless OpenSearch scripts rather than as [Lucene +expressions](http://lucene.apache.org/core/7_1_0/expressions/index.html?org/apache/lucene/expressions/js/package-summary.html). + +To identify these features, set the `"template_language": "script_feature""`. The custom script can access the `feature_vector` through the [Java Map](https://docs.oracle.com/javase/8/docs/api/java/util/Map.html), as described in [Create a feature set]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features#creating-feature-sets). + +Script-based features may impact the performance of your OpenSearch cluster, so it is best to avoid them if you require highly performant queries. +{: .warning} + +### Script feature parameters + +Script features are native or Painless scripts within the context of LTR. These script features can accept parameters as described in the [OpenSearch script documentation]({{site.url}}{{site.baseurl}}/api-reference/script-apis/index/). When working with LTR scripts, you can override parameter values and names. The priority for parameterization, in increasing order, is as follows: + +- The parameter name and value are passed directly to the source script, but not in the LTR script parameters. These cannot be configured at query time. +- The parameter name is passed to both the `sltr` query and the source script, allowing the script parameter values to be overridden at query time. 
+- The LTR script parameter name to native script parameter name indirection allows you to use different parameter names in your LTR feature definition than those in the underlying native script. This gives you flexibility in how you define and use scripts within the LTR context. + +For example, to set up a customizable way to rank movies in search results, considering both the title match and other adjustable factors, you can use the following request: + +```json +POST _ltr/_featureset/more_movie_features +{ + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{{keywords}}" + } + } + }, + { + "name": "custom_title_query_boost", + "params": [ + "some_multiplier", + "ltr_param_foo" + ], + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "(long)params.default_param * params.feature_vector.get('title_query') * (long)params.some_multiplier * (long) params.param_foo", + "params": { + "default_param": 10, + "some_multiplier": "some_multiplier", + "extra_script_params": { + "ltr_param_foo": "param_foo" + } + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Multiple feature stores + +A feature store corresponds to an independent LTR system, including features, feature sets, and models backed by a single index and cache. A feature store typically represents a single search problem or application, like Wikipedia or Wiktionary. To use multiple feature stores in your OpenSearch cluster, you can create and manage them using the provided API. For example, you can create a feature set for the `wikipedia` feature store as follows: + +```json +PUT _ltr/wikipedia + +POST _ltr/wikipedia/_featureset/attempt_1 +{ + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{{keywords}}" + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +When logging features, you can specify the feature store using the `store` parameter in the `sltr` section of your query, as shown in the following example structure. If you do not provide a `store` parameter, the default store is used to look up the feature set. + +```json +{ + "sltr": { + "_name": "logged_featureset", + "featureset": "attempt_1", + "store": "wikipedia", + "params": { + "keywords": "star" + } + } +} +``` +{% include copy-curl.html %} + +To delete the feature set, you can use the following operation: + +```json +DELETE _ltr/wikipedia/_featureset/attempt_1 +``` +{% include copy-curl.html %} + +## Model caching + +The Model Caching plugin uses an internal cache for compiled models. 
To force the models to be recompiled, you can clear the cache for a feature store: + +```json +POST /_ltr/_clearcache +``` +{% include copy-curl.html %} + +To get cluster-wide cache statistics for a specific store, use the following request: + +```json +GET /_ltr/_cachestats +``` +{% include copy-curl.html %} + +You can control the characteristics of the internal cache by using the following node settings: + +``` +# limit cache usage to 12 megabytes (defaults to 10mb or max_heap/10 if lower) ltr.caches.max_mem: 12mb +# Evict cache entries 10 minutes after insertion (defaults to 1hour, set to 0 to disable) ltr.caches.expire_after_write: 10m +# Evict cache entries 10 minutes after access (defaults to 1hour, set to 0 to disable) ltr.caches.expire_after_read: 10m +``` +{% include copy.html %} + +## Extra logging + +As described in [Logging features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/), you can use the logging extension to return feature values with each document. For native scripts, you can also return additional arbitrary information along with the logged features. + +For native scripts, the `extra_logging` parameter is injected into the script parameters. This parameter is a [`Supplier>`](https://docs.oracle.com/javase/8/docs/api/java/util/function/Supplier.html), which provides a non-null `Map` only during the logging fetch phase. Any values you add to this map are returned alongside the logged features: + +```java +{ + @Override + public double runAsDouble() { + ... + Map extraLoggingMap = ((Supplier>) getParams().get("extra_logging")).get(); + if (extraLoggingMap != null) { + extraLoggingMap.put("extra_float", 10.0f); + extraLoggingMap.put("extra_string", "additional_info"); + } + ... + } +} +``` +{% include copy-curl.html %} + +If the extra logging map is accessed, it is returned as an additional entry with the logged features. The format of the logged features, including the extra logging information, will appear similar to the following example: + +```json + { + "log_entry1": [ + { + "name": "title_query", + "value": 9.510193 + }, + { + "name": "body_query", + "value": 10.7808075 + }, + { + "name": "user_rating", + "value": 7.8 + }, + { + "name": "extra_logging", + "value": { + "extra_float": 10.0, + "extra_string": "additional_info" + } + } + ] +} +``` +{% include copy-curl.html %} + +## Feature score caching + +By default, the Feature Score Caching plugin calculates feature scores for both model inference and feature score logging. For example, if you write a query to rescore the top 100 documents and return the top 10 with feature scores, then the plugin calculates the feature scores of the top 100 documents for model inference and then calculates and logs the scores for the top 10 documents. + +The following query shows this behavior: + +```json +POST tmdb/_search +{ + "size": 10, + "query": { + "match": { + "_all": "rambo" + } + }, + "rescore": { + "window_size" : 100, + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } + }, + "ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "rescore_index": 0 + } + } + } +} +``` +{% include copy-curl.html %} + +In some environments, it may be faster to cache the feature scores for model inference and reuse them for logging. 
To enable feature score caching, add the `cache: "true"` +flag to the `sltr` query that is the target of feature score logging, as shown in the following example: + +```json +{ + "sltr":{ + "cache":true, + "params":{ + "keywords":"rambo" + }, + "model":"my_model" + } +} +``` +{% include copy-curl.html %} + +## Stats + +You can use the Stats API to retrieve the plugin's overall status and statistics. To do this, send the following request: + +```json +GET /_ltr/_stats +``` +{% include copy-curl.html %} + +The response includes information about the cluster, configured stores, and cache statistics for various plugin components: + +```json +{ + "_nodes":{ + "total":1, + "successful":1, + "failed":0 + }, + "cluster_name":"es-cluster", + "stores":{ + "_default_":{ + "model_count":10, + "featureset_count":1, + "feature_count":0, + "status":"green" + } + }, + "status":"green", + "nodes":{ + "2QtMvxMvRoOTymAsoQbxhw":{ + "cache":{ + "feature":{ + "eviction_count":0, + "miss_count":0, + "hit_count":0, + "entry_count":0, + "memory_usage_in_bytes":0 + }, + "featureset":{ + "eviction_count":0, + "miss_count":0, + "hit_count":0, + "entry_count":0, + "memory_usage_in_bytes":0 + }, + "model":{ + "eviction_count":0, + "miss_count":0, + "hit_count":0, + "entry_count":0, + "memory_usage_in_bytes":0 + } + } + } + } +} +``` +{% include copy-curl.html %} + +You can use filters to retrieve a single statistic by sending the following request: + +```json +GET /_ltr/_stats/{stat} +``` +{% include copy-curl.html %} + +You can limit the information to a single node in the cluster by sending the following requests: + +```json +GET /_ltr/_stats/nodes/{nodeId} +GET /_ltr/_stats/{stat}/nodes/{nodeId} +``` +{% include copy-curl.html %} + +## TermStat query +Experimental +{: .label .label-red } + +The `TermStatQuery` is in an experimental stage, and the Domain-Specific Language (DSL) may change as the code advances. For stable term-statistic access, see [ExplorerQuery]{.title-ref}. + +The `TermStatQuery` is a reimagined version of the legacy `ExplorerQuery`. It provides a clearer way to specify terms and offers more flexibility for experimentation. This query surfaces the same data as the [ExplorerQuery]{.title-ref}, but it allows you to specify a custom Lucene expression to retrieve the desired data, such as in the following example: + +```json +POST tmdb/_search +{ + "query": { + "term_stat": { + "expr": "df", + "aggr": "max", + "terms": ["rambo", "rocky"], + "fields": ["title"] + } + } +} +``` +{% include copy-curl.html %} + +The `expr` parameter is used to specify a Lucene expression. This expression is run on a per-term basis. The expression can be a simple stat type or a custom formula with multiple stat types, such as `(tf * idf) / 2`. Available stat types in the Lucene expression context are listed in the following table. + +Type | Description +:---| :--- +`df` | The direct document frequency for a term. For example, if `rambo` occurs in three movie titles across multiple documents, then the value would be `3`. +`idf` | The inverse document frequency (IDF) calculation using the formula `log((NUM_DOCS+1)/(raw_df+1)) + 1`. +`tf` | The term frequency for a document. For example, if `rambo` occurs three times in a movie synopsis in the same document, then the value would be `3`. +`tp` | The term positions for a document. Multiple positions can be returned for a single term, so you should review the behavior of the `pos_aggr` parameter. +`ttf` | The total term frequency for a term across an index. 
For example, if `rambo` is mentioned a total of 100 times in the `overview` field across all documents, then the value would be `100`. + +The `aggr` parameter specifies the type of aggregation to be applied to the collected statistics from the `expr`. For example, if you specify the terms `rambo` and `rocky`, then the query gathers statistics for both terms. Because you can only return a single value, you need to decide which statistical calculation to use. The available aggregation types are `min`, `max`, `avg`, `sum`, and `stddev`. The query also provides the following counts: `matches` (the number of terms that matched in the current document) and `unique` (the unique number of terms that were passed in the query). + +The `terms` parameter specifies an array of terms for which you want to gather statistics. Only single terms are supported, with no support for phrases or span queries. If your field is tokenized, you can pass multiple terms in one string in the array. + +The `fields` parameter specifies the fields to check for the specified `terms`. If no `analyzer` is specified, then the configured `search_analyzer` for each field is used. + +The optional parameters are listed in the following table. + +Type | Description +:---| :--- +`analyzer` | If specified, this analyzer is used instead of the configured `search_analyzer` for each field. +`pos_aggr` | Because each term can have multiple positions, you can use this parameter to specify the aggregation to apply to the term positions. This supports the same values as the `aggr` parameter and defaults to `avg`. + +### Script injection + +Script injection provides the ability to inject term statistics into a scripting context. When working with `ScriptFeatures`, you can pass a `term_stat` object with the `terms`, `fields`, and `analyzer` parameters. An injected variable named `termStats` then provides access to the raw values in your custom script. This enables advanced feature engineering by giving you access to all the underlying data. + +To access the count of matched tokens, use [`params.matchCount.get`]{.title-ref}. To access the unique token count, use [`params.uniqueTerms`]{.title-ref}. + +You can either hardcode the `term_stat` parameter in your script definition or pass the parameter to be set at query time. For example, the following example query defines a feature set with a script feature that uses hardcoded `term_stat` parameters: + +```json +POST _ltr/_featureset/test +{ + "featureset": { + "features": [ + { + "name": "injection", + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "params.termStats['df'].size()", + "params": { + "term_stat": { + "analyzer": "!standard", + "terms": ["rambo rocky"], + "fields": ["overview"] + } + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +Analyzer names must be prefixed with a bang(!) when specifying them locally. Otherwise, they are treated as the parameter lookup value. 
+{: .note} + +To set parameter lookups, you can pass the name of the parameter from which you want to pull the value, as shown in the following example request: + +```json +POST _ltr/_featureset/test +{ + "featureset": { + "features": [ + { + "name": "injection", + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "params.termStats['df'].size()", + "params": { + "term_stat": { + "analyzer": "analyzerParam", + "terms": "termsParam", + "fields": "fieldsParam" + } + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can pass the `term_stat` parameters as query-time parameters, as shown in the following request: + +```json +POST tmdb/_search +{ + "query": { + "bool": { + "filter": [ + { + "terms": { + "_id": ["7555", "1370", "1369"] + } + }, + { + "sltr": { + "_name": "logged_featureset", + "featureset": "test", + "params": { + "analyzerParam": "standard", + "termsParam": ["troutman"], + "fieldsParam": ["overview"] + } + }} + ] + } + }, + "ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "named_query": "logged_featureset" + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_search-plugins/ltr/core-concepts.md b/_search-plugins/ltr/core-concepts.md new file mode 100644 index 0000000000..4a7f73e4ce --- /dev/null +++ b/_search-plugins/ltr/core-concepts.md @@ -0,0 +1,141 @@ +--- +layout: default +title: ML ranking core concepts +nav_order: 10 +parent: Learning to Rank +has_children: false +--- + +# ML ranking core concepts + +This guide is intended for OpenSearch developers and data scientist who are interested in adding machine learning (ML) ranking capabilities to their OpenSearch system. + +## What is LTR? + +Learning to Rank (LTR) applies ML to search relevance ranking. This differs from other classic ML problems, such as the following: + +- **Regression:** The goal is to predict a variable, such as a stock price, as a function of known information, such as number of employees or revenue. The output is a direct prediction. +- **Classification:** The goal is to categorize an entity into predefined classes, for example, profitable or not profitable. The output is a category. + +The objective of LTR is not to make a direct prediction but rather to learn a function (`f`) that can rank documents in an order that best matches your perception of relevance for a given query. The output `f` does not represent a literal value but rather a prediction of the document's relative usefulness. + +For comprehensive information about LTR, see [How is Search Different From Other Machine Learning Problems?](http://opensourceconnections.com/blog/2017/08/03/search-as-machine-learning-prob/) and [What is Learning to Rank?](http://opensourceconnections.com/blog/2017/02/24/what-is-learning-to-rank/). + +## Defining the ideal ordering with judgment lists + +Judgment lists, also known as golden sets, provide a way to grade individual search results for a keyword search. These lists express the ideal ordering of search results based on your expectations. + +For example, using the [demo on GitHub](http://github.com/opensearch-project/opensearch-learning-to-rank-base/tree/main/demo/), in a search for `Rambo`, the judgment list may appear similar to the following: + +``` +grade,keywords,movie +4,Rambo,First Blood # Exactly Relevant +4,Rambo,Rambo +3,Rambo,Rambo III # Fairly Relevant +3,Rambo,Rambo First Blood Part II +2,Rambo,Rocky # Tangentially Relevant +2,Rambo,Cobra +0,Rambo,Bambi # Not even close... 
+0,Rambo,First Daughter +``` + +This judgment list establishes the ideal ordering of search results for the query `Rambo`. Metrics like [Normalized Discounted Cumulative Gain (NDCG)](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) and [Expected Reciprocal Rank (ERR)](https://dl.acm.org/doi/abs/10.1145/1645953.1646033) can then be used to evaluate how closely the actual search results match this ideal ordering. + +The ranking function `f` aims to generate results closely aligned with the judgment list, maximizing quality metrics across various training queries. This ensures maximally useful search results. + +## Understanding features as building blocks of relevance + +The ranking function `f` uses input variables to arrive at a predicted output. For example, in stock price forecasting, input variables may encompass company-specific data like employee count and revenue. Likewise, in search relevance, the predictive model must leverage features that characterize the document, the query, and their associations, such as the [term frequency–inverse document frequency (TF–IDF)](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) score of the query keywords in a field. + +Similarly, in the context of searching for movies, the ranking function must use relevant features to determine the most relevant results. These features may include: + +- Whether and to what degree the search keywords match the title field, such as `titleScore`. +- Whether and to what degree the search keywords match the description field, such as `descScore`. +- The movie's popularity, such as `popularity`. +- The movie's rating, such as `rating`. +- The number of keywords used during the search, such as `numKeywords*)`. + +The ranking function would become `f(titleScore, descScore, popularity, rating, numKeywords)`. The goal is to use the features in a way that maximizes the likelihood of the search results being useful. + +For example, in the `Rambo` use case, it seems intuitive that `titleScore` would be important. However, for the top movie _First Blood_, the keyword `Rambo` is likely only mentioned in the description. In this case, the `descScore` would become relevant. Additionally, the `popularity` and `rating` features could help differentiate between sequels and originals. If the existing features do not work for this purpose, then a new feature `isSequel` could be introduced. This new feature could then be used to make better ranking decisions. + +Selecting and experimenting with features is fundamental to LTR. Using features that fail to help predict patterns in the target variable can result in an unsatisfactory search experience, following the principle of "garbage in, garbage out" that applies to any ML problem. + +## Completing the training set by logging features + +When you have a set of defined features, the next step is to annotate the judgment list with each feature's values. These values are used when the training process begins. For example, consider the following judgment list: + +``` +grade,keywords,movie +4,Rambo,First Blood +4,Rambo,Rambo +3,Rambo,Rambo III +... +``` + +To complete the training set, add the following features: + +``` +grade,keywords,movie,titleScore,descScore,popularity,... +4,Rambo,First Blood,0.0,21.5,100,... +4,Rambo,Rambo,42.5,21.5,95,... +3,Rambo,Rambo III,53.1,40.1,50,... +``` + +The `titleScore` represents the relevance score of the `Rambo` keyword in the title field of the document, and so on. 
+ +Many LTR models are familiar with a file format introduced by Support Vector Machine for Ranking (SVMRank), an early LTR method. In this format, queries are given IDs, and the actual document identifier can be removed from the training process. Features are labeled with ordinals starting at `1`. For the preceding example, the file format would be: + +``` +4 qid:1 1:0.0 2:21.5 3:100,... +4 qid:1 1:42.5 2:21.5 3:95,... +3 qid:1 1:53.1 2:40.1 3:50,... +... +``` + +In actual systems, you might log these values and then use them later to annotate a judgment list. In other cases, the judgment list might come from user analytics, so the feature values are logged as you interact with the search application. See [Logging features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/) for more information. + +## Training a ranking function + +The following are key considerations for training a ranking function: + +- **Ranking models:** Several models, such as the following, are available for training, each with pros and cons: + + - **Tree-based models** (for example, LambdaMART, MART, Random Forests) + - Generally the most accurate. + - Large and complex, making them expensive to train. + - Tools such as [RankLib](https://sourceforge.net/p/lemur/wiki/RankLib/) and [XGBoost](https://github.com/dmlc/xgboost) focus on tree-based models. + + - **SVM-based models (SVMRank)** + - Less accurate but less expensive to train. + - See [Support Vector Machine for Ranking](https://www.cs.cornell.edu/people/tj/svm_light/svm_rank.html) for more information. + + - **Linear models** + - Perform basic linear regression on the judgment list. + - Tend to not be useful outside of the examples. + - See [Learning to Rank 101 — Linear Models](http://opensourceconnections.com/blog/2017/04/01/learning-to-rank-linear-models/) for more information. + +- **Model selection:** The choice of model can depend not only on performance but also on your level of experience and familiarity with the different approaches. + +## Testing: Is the model any good? + +When testing the quality of the ranking model, consider the following: + +- **Judgment list limitations:** A judgment list cannot include every possible query that a model may encounter in the real world. It is important to test the model on a variety of queries in order to assess its ability to generalize beyond the training data. +- **Overfitting:** A model that is overfit to the training data does not perform well on new, unseen data. To avoid this, consider doing the following: + - Preserving some judgment lists as a _test set_ that is not used during the training process. + - Evaluating the model's performance on the test set, which reflects how it may perform in unfamiliar scenarios. + - Monitoring the _test NDCG_ metric, which should remain high as the model is trained. +- **Temporal generalization:** Even after deploying the model, you should continue testing the model's performance using more recent judgment lists to ensure that it does not become overfit to seasonal or temporal situations. + +## Real-world concerns + +The following are practical considerations for using the Learning to Rank plugin: + +- **Accurate judgment lists:** How can you create judgment lists that reflect your users' perception of search quality? +- **Measuring search quality:** What metrics should you use to determine whether the search results are useful to your users? 
+- **Data collection infrastructure:** What kind of infrastructure do you need in order to collect and log user behavior and feature data? +- **Model retraining:** How will you know when your model needs to be retrained? +- **A/B testing:** How will you compare your new model to your current search solution? What key performance indicators (KPIs) will you use to determine the success of your search system? + +See [How does the plugin fit in?]({{site.url}}{{site.baseurl}}/search-plugins/ltr/fits-in/) to learn more about how the Learning to Rank plugin's functionality fits into a complete LTR system. diff --git a/_search-plugins/ltr/faq.md b/_search-plugins/ltr/faq.md new file mode 100644 index 0000000000..14db276b3a --- /dev/null +++ b/_search-plugins/ltr/faq.md @@ -0,0 +1,23 @@ +--- +layout: default +title: Common issues +nav_order: 1000 +parent: Learning to Rank +has_children: false +--- + +# Common issues + +To make the most of Learning to Rank (LTR), consider these helpful insights. + +## Negative scores + +Lucene does not allow for negative query scores. This can be problematic if your raw features include negative values. To address this, confirm that your features are non-negative _before_ training your model. You can achieve this by creating normalized fields with values shifted by the minimum value or by passing the scores through a function that produces a value greater than or equal to `0`. + +## Bugs + +If you encounter a bug while working with the plugin, you can open an issue in the [opensearch-learning-to-rank-base repository](https://github.com/opensearch-project/opensearch-learning-to-rank-base/issues). The project team regularly investigates and resolves issues. If you are seeking general support, the issue may be closed and you may be directed to the relevant support channel(s). + +## Further assistance + +If you need further assistance, join the [Relevance Slack Community](https://opensourceconnections.com/slack) and participate in the #opensearch-learn-to-rank channel to receive guidance and support from the community. diff --git a/_search-plugins/ltr/feature-engineering.md b/_search-plugins/ltr/feature-engineering.md new file mode 100644 index 0000000000..a059dcf709 --- /dev/null +++ b/_search-plugins/ltr/feature-engineering.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Feature engineering +nav_order: 40 +parent: Learning to Rank +has_children: false +--- + +# Feature engineering + +Common feature engineering tasks that you may encounter while developing a learning to rank (LTR) solution are described in the following sections. + +## Getting raw term statistics + +Many LTR solutions use raw term statistics in their training, such as the following: +- **Total term frequency (`raw_ttf`):** The total number of times that a term appears across an entire index. +- **Document frequency (`raw_df`):** The number of documents in which a term appears. +- **Term frequency (`raw_tf`):** The number of times that a term appears in a specific document. +- **Classic IDF (`classic_idf`):** The inverse document frequency (IDF) calculation `log((NUM_DOCS+1)/(raw_df+1)) + 1`. 
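+
+As a quick illustration of the classic IDF calculation: in an index of 1,000 documents, a term that appears in 9 of them yields `log((1000 + 1)/(9 + 1)) + 1 = log(100.1) + 1 ≈ 5.6` (assuming the natural logarithm used by Lucene's classic similarity).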
+ +The Learning to Rank plugin provides a `match_explorer` query primitive that can extract these statistics for you, as shown in the following example: + +```json +POST tmdb/_search +{ + "query": { + "match_explorer": { + "type": "max_raw_df", + "query": { + "match": { + "title": "rambo rocky" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query returns the highest document frequency between the terms `rambo ` and `rocky`. + +You can use operations such as `max`, `min`, `sum`, and `stddev` with the statistics to get the information you need. + +### Term position statistics + +You can prepend the `type` with the desired operation (`min`, `max`, `avg`) to calculate the corresponding statistic across the term positions. If the terms are not present in the document, then the result will be `0`. + +The available statistics include the following: + +- `min_raw_tp` (minimum raw term position): This statistic finds the earliest position of any search term in the document. For example, with the query `dance monkey`, if `dance` occurs at positions [2, 5, 9] and `monkey` occurs at [1, 4], then the minimum is 1. +- `max_raw_tp` (maximum raw term position): This statistic finds the latest position of any search term in the document. Using the preceding example, the maximum is 9. +- `avg_raw_tp` (average raw term position): This statistic calculates the average term position for any of the query terms. Using the preceding example, the average for `dance` is 5.33 [(2+5+9)/3)] and the average for `monkey` is 2.5 [(1+4)/2], with an overall average of 3.91. +- `unique_terms_count`: Provides a count of the unique search terms in the query. + +## Document-specific features + +When working on an LTR solution, you may need to incorporate features that are specific to the document rather than to the relationship between the query and the document. These document-specific features can include metrics related to popularity or recency. + +The `function_score` query provides the functionality to extract these document-specific features. The following example query shows how you can use it to incorporate the `vote_average` field as a feature: + +```json +{ + "query": { + "function_score": { + "functions": [{ + "field_value_factor": { + "field": "vote_average", + "missing": 0 + } + }], + "query": { + "match_all": {} + } + } + } +} +``` +{% include copy-curl.html %} + +In the example, the score of the query is determined by the value of the `vote_average` field, which could be a measure of document popularity or quality. + +## Index drift + +When working with an index that is regularly updated, it is important to consider that the trends and patterns you observe may not remain constant over time. Your index can drift as user behavior, content, and other factors change. For example, on an e-commerce store, you may find that sandals are popular during summer months but become almost impossible to find in the winter. Similarly, the features that drive purchases or engagement during one time period may not be as important during another. + +## Next steps + +Learn about [logging feature scores]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/). 
diff --git a/_search-plugins/ltr/fits-in.md b/_search-plugins/ltr/fits-in.md new file mode 100644 index 0000000000..30ca291b82 --- /dev/null +++ b/_search-plugins/ltr/fits-in.md @@ -0,0 +1,29 @@ +--- +layout: default +title: Scope of the plugin +nav_order: 20 +parent: Learning to Rank +has_children: false +--- + +# Scope of the plugin + +The Learning to Rank plugin for OpenSearch helps you develop and use machine learning (ML)-based ranking models for your application search operations. The following sections describe how the plugin fits into the overall LTR process. + +## What the plugin does + +The plugin provides the building blocks to develop and use LTR models, giving you the following capabilities: + +1. **Developing query-dependent features:** Create custom features that capture the relationship between a search query and a document. These features can be stored in OpenSearch. +2. **Logging feature values:** Record the feature values for documents returned in search results. Once you have logged the feature sets for your documents, you can combine this data with the judgment lists you have developed. This will give you a complete training set that you can use to test and train your ranking models. Tools such as RankLib or XGBoost can then be used to develop a satisfactory model. +3. **Deploying and using models:** Upload trained ranking models to the plugin and use them to rerank search results. The plugin offers a custom OpenSearch query domain-specific language (DSL) primitive that allows you to execute the model during the search process. + +## What the plugin does not do + +The plugin does not support the creation of judgment lists. This is a task you must handle yourself because it is domain specific. See the [Wikimedia Foundation blog](https://blog.wikimedia.org/2017/09/19/search-relevance-survey/) for an example approach to developing judgment lists for searching articles. Some domains, such as e-commerce, may focus more on conversion-related signals, while others may involve human relevance assessors (either internal experts or crowdsourced workers). + +The plugin does not handle model training or testing. This is an offline process that should be handled using the appropriate tools, such as [XGBoost](https://xgboost.ai/) and [RankLib](https://lemurproject.org/ranklib.php). The plugin integrates with these external model-building workflows. Training and testing ranking models can be a CPU-intensive task that requires data science expertise and offline testing. Most organizations prefer to have data scientists oversee the model development process rather than running it directly in their production environment. + +## Next steps + +Learn about [working with features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features/). diff --git a/_search-plugins/ltr/index.md b/_search-plugins/ltr/index.md new file mode 100644 index 0000000000..1c1f0eb0d2 --- /dev/null +++ b/_search-plugins/ltr/index.md @@ -0,0 +1,37 @@ +--- +layout: default +title: Learning to Rank +nav_order: 20 +has_children: true +has_toc: false +redirect_from: + - /search-plugins/ltr/ +--- + +# Learning to Rank + +The Learning to Rank plugin for OpenSearch enables you to use machine learning (ML) and behavioral data to fine-tune the relevance of documents. It uses models from the [XGBoost](https://xgboost.ai/) and [RankLib](https://lemurproject.org/ranklib.php) libraries. 
These models rescore the search results, considering query-dependent features such as click-through data or field matches, which can further improve relevance. + +The term _learning to rank_ is abbreviated as LTR throughout the OpenSearch documentation when the term is used in a general sense. For the plugin developer documentation, see [opensearch-learning-to-rank-base](https://github.com/opensearch-project/opensearch-learning-to-rank-base). +{: .note} + +## Getting started + +The following resources can help you get started: + +- If you are new to LTR, start with the [ML ranking core concepts]({{site.url}}{{site.baseurl}}/search-plugins/ltr/core-concepts/) documentation. +- For a quick introduction, see the demo in [hello-ltr](https://github.com/o19s/hello-ltr). +- If you are familiar with LTR, start with the [Integrating the plugin]({{site.url}}{{site.baseurl}}/search-plugins/ltr/fits-in/) documentation. + +## Installing the plugin + +Prebuilt versions of the plugin are available at [https://github.com/opensearch-project/opensearch-learning-to-rank-base/releases](https://github.com/opensearch-project/opensearch-learning-to-rank-base/releases). + +If you need a version that is compatible with your OpenSearch installation, follow the instructions in the [README](https://github.com/opensearch-project/opensearch-learning-to-rank-base#development) file or [create an issue](https://github.com/opensearch-project/opensearch-learning-to-rank-base/issues). + +Once you have an appropriate version, you can install the plugin using the command line shown in the following example: + +``` +./bin/opensearch-plugin install https://github.com/opensearch-project/opensearch-learning-to-rank-base/releases/download/ltr-plugin-v2.11.1-RC1/ltr-plugin-v2.11.1-RC1.zip +``` +{% include copy-curl.html %} diff --git a/_search-plugins/ltr/logging-features.md b/_search-plugins/ltr/logging-features.md new file mode 100644 index 0000000000..7922b8683d --- /dev/null +++ b/_search-plugins/ltr/logging-features.md @@ -0,0 +1,418 @@ +--- +layout: default +title: Logging feature scores +nav_order: 50 +parent: Learning to Rank +has_children: false +--- + +# Logging feature scores + +Feature values need to be logged in order to train a model. This is a crucial component of the Learning to Rank plugin---as you search, feature values from the feature sets are logged so that they can be used for training. This allows models that effectively predict relevance using that set of features to be discovered. + +## `sltr` query + +The `sltr` query is the primary method for running features and evaluating models. When logging, an `sltr` query is used to execute each feature query and retrieve the feature scores. A feature set structure that works with the [`hello-ltr`](https://github.com/o19s/hello-ltr) demo schema is shown in the following example request: + +```json +PUT _ltr/_featureset/more_movie_features +{ + "name": "more_movie_features", + "features": [ + { + "name": "body_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "overview": "{% raw %}{{keywords}}{% endraw %}" + } + } + }, + { + "name": "title_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "title": "{% raw %}{{keywords}}{% endraw %}" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +## Common use cases + +Common use cases for logging feature sets are described in the following sections. 
+ +### Joining feature values with a judgment list + +If the judgment list is already available, you can join feature values for each keyword/document pair to create a complete training set. For example, consider the following judgment list: + +``` +grade,keywords,docId +4,rambo,7555 +3,rambo,1370 +3,rambo,1369 +4,rocky,4241 +``` +{% include copy-curl.html %} + +The feature values need to be retrieved for all documents that have a judgment for each search term, one search term at a time. For example, starting with a `rambo` search, a filter can be created for the associated document as follows: + +```json +{ + "filter": [ + {"terms": { + "_id": ["7555", "1370", "1369"] + }} + ] +} +``` +{% include copy-curl.html %} + +The Learning to Rank plugin must point to the features to be logged. The `sltr` query, which is part of the plugin, can be used for this purpose. The `sltr` query has a `_name` (the named queries feature) used to reference it, refers to the previously created feature set `more_movie_features`, and passes the search keyword `rambo` and any other required parameters, as shown in the following example query: + +```json +{ + "sltr": { + "_name": "logged_featureset", + "featureset": "more_movie_features", + "params": { + "keywords": "rambo" + } + } +} +``` +{% include copy-curl.html %} + +[Searching with LTR]({{site.url}}{{site.baseurl}}/search-plugins/ltr/searching-with-your-model/) provides an `sltr` query to use for executing a model. This `sltr` query is used as a mechanism to direct the Learning to Rank plugin to the feature set requiring logging. +{: .note} + +To avoid influencing the score, the `sltr` query is injected as a filter, as shown in the following example: + +```json +{ + "query": { + "bool": { + "filter": [ + { + "terms": { + "_id": [ + "7555", + "1370", + "1369" + ] + } + }, + { + "sltr": { + "_name": "logged_featureset", + "featureset": "more_movie_features", + "params": { + "keywords": "rambo" + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +Executing this query returns the three expected hits. The next step is to enable feature logging to refer to the `sltr` query to be logged. + +The logging identifies the `sltr` query, runs the feature set's queries, scores each document, and returns those scores as computed fields for each document, as shown in the following example logging structure: + +```json +"ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "named_query": "logged_featureset" + } + } +} +``` +{% include copy-curl.html %} + +The log extension supports the following arguments: + +- `name`: The name of the log entry to fetch from each document. +- `named_query`: The named query that corresponds to an `sltr` query. +- `rescore_index`: If the `sltr` query is in a rescore phase, then this is the index of the query in the rescore list. +- `missing_as_zero`: Produces a `0` for missing features (when the feature does not match). Default is `false`. + +To enable the log to locate an `sltr` query, either during the normal query phase or during rescoring, either `named_query` or `rescore_index` must be set. 
+{: .note}
+
+The full example request is as follows:
+
+```json
+POST tmdb/_search
+{
+  "query": {
+    "bool": {
+      "filter": [
+        {
+          "terms": {
+            "_id": ["7555", "1370", "1369"]
+          }
+        },
+        {
+          "sltr": {
+            "_name": "logged_featureset",
+            "featureset": "more_movie_features",
+            "params": {
+              "keywords": "rambo"
+            }
+          }
+        }
+      ]
+    }
+  },
+  "ext": {
+    "ltr_log": {
+      "log_specs": {
+        "name": "log_entry1",
+        "named_query": "logged_featureset"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Each document now contains a log entry, as shown in the following example:
+
+```json
+{
+  "_index": "tmdb",
+  "_type": "movie",
+  "_id": "1370",
+  "_score": 20.291,
+  "_source": {
+    ...
+  },
+  "fields": {
+    "_ltrlog": [
+      {
+        "log_entry1": [
+          {
+            "name": "title_query",
+            "value": 9.510193
+          },
+          {
+            "name": "body_query",
+            "value": 10.7808075
+          }
+        ]
+      }
+    ]
+  },
+  "matched_queries": [
+    "logged_featureset"
+  ]
+}
+```
+{% include copy-curl.html %}
+
+The judgment list can be joined with the feature values to produce a training set. For the line corresponding to document `1370` with keyword `rambo`, the following can be added:
+
+```
+4 qid:1 1:9.510193 2:10.7808075
+```
+
+Repeat this process for all of your queries.
+
+For large judgment lists, it is recommended to batch the logs for multiple queries. You can use [multi-search]({{site.url}}{{site.baseurl}}/api-reference/multi-search/) capabilities for this purpose.
+{: .note}
+
+### Logging values for a live feature set
+
+If you are running a model in production within an `sltr` query, the live query may appear similar to the following example request:
+
+```json
+POST tmdb/_search
+{
+  "query": {
+    "match": {
+      "_all": "rambo"
+    }
+  },
+  "rescore": {
+    "query": {
+      "rescore_query": {
+        "sltr": {
+          "params": {
+            "keywords": "rambo"
+          },
+          "model": "my_model"
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+See [Searching with LTR]({{site.url}}{{site.baseurl}}/search-plugins/ltr/searching-with-your-model/) for information about model execution.
+{: .note}
+
+To log the feature values for the query, apply the appropriate logging spec to reference the `sltr` query, as shown in the following example:
+
+```json
+"ext": {
+  "ltr_log": {
+    "log_specs": {
+      "name": "log_entry1",
+      "rescore_index": 0
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+The example logs the features in the response, enabling future model retraining using the same feature set.
+
+### Modifying and logging an existing feature set
+
+Feature sets can be expanded. For example, if a new feature, such as `user_rating`, needs to be incorporated, you can add it to the existing `more_movie_features` feature set using the Feature Set Append API, as shown in the following request:
+
+```json
+POST _ltr/_featureset/more_movie_features/_addfeatures
+{
+  "features": [
+    {
+      "name": "user_rating",
+      "params": [],
+      "template_language": "mustache",
+      "template": {
+        "function_score": {
+          "functions": [
+            {
+              "field_value_factor": {
+                "field": "vote_average",
+                "missing": 0
+              }
+            }
+          ],
+          "query": {
+            "match_all": {}
+          }
+        }
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}
+
+See [Working with features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features/) for more information.
+{: .note} + +When logging is performed, the new feature is included in the output, as shown in the following example: + +``` json +{ + "log_entry1": [ + { + "name": "title_query", + "value": 9.510193 + }, + { + "name": "body_query", + "value": 10.7808075 + }, + { + "name": "user_rating", + "value": 7.8 + } + ] +} +``` +{% include copy-curl.html %} + +### Logging values for a proposed feature set + +You can create a completely new feature set for experimental purposes, for example, `other_movie_features`, as shown in the following example request: + +```json +PUT _ltr/_featureset/other_movie_features +{ + "name": "other_movie_features", + "features": [ + { + "name": "cast_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "cast.name": "{% raw %}{{keywords}}{% endraw %}" + } + } + }, + { + "name": "genre_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "genres.name": "{% raw %}{{keywords}}{% endraw %}" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +The feature set, `other_movie_features`, can be logged alongside the live production set, `more_movie_features`, by appending it as another filter, as shown in the following example request: + +```json +POST tmdb/_search +{ +"query": { + "bool": { + "filter": [ + { "sltr": { + "_name": "logged_featureset", + "featureset": "other_movie_features", + "params": { + "keywords": "rambo" + } + }}, + {"match": { + "_all": "rambo" + }} + ] + } +}, +"rescore": { + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } +} +} +``` +{% include copy-curl.html %} + +You can continue adding as many feature sets as needed for logging. + +## Logging scenarios + +Once you have covered the basics, you can consider some real-life feature logging scenarios. + +First, logging is used to develop judgment lists from user analytics to capture the exact value of a feature at the precise time of interaction. For instance, you may want to know the recency, title score, and other values at the precise time of a user's interaction. This would help you analyze which features or factors had relevance while training. To achieve this, you can build a comprehensive feature set for future experimentation. + +Second, logging can be used to retrain a model in which you already have confidence. You may want to keep your models up to date with a shifting index because models can lose their effectiveness over time. You may have A/B testing in place or be monitoring business metrics and notice gradual degradation in model performance. + +Third, logging is used during model development. You may have a judgment list but want to iterate heavily with a local copy of OpenSearch. This allows for extensive experimentation with new features, adding and removing them from the feature sets as needed. While this process may result in being slightly out of sync with the live index, the goal is to arrive at a set of satisfactory model parameters. Once this is achieved, the model can be trained with production data to confirm that the level of performance remains acceptable. + +## Next steps + +Learn more about training models in the [Uploading a trained model]({{site.url}}{{site.baseurl}}/search-plugins/ltr/training-models/) documentation. 
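The following is a minimal Python sketch of the joining workflow described in [Joining feature values with a judgment list](#joining-feature-values-with-a-judgment-list). It is an illustration only, not part of the plugin: it assumes a local, unauthenticated cluster at `http://localhost:9200`, the `tmdb` index and `more_movie_features` feature set used in the preceding examples, a `judgments.csv` file with the `grade,keywords,docId` columns shown earlier, and the Python `requests` library.

```python
import csv
import requests  # assumed to be available; any HTTP client works

OPENSEARCH = "http://localhost:9200"   # assumption: local, unauthenticated cluster
INDEX = "tmdb"

def log_features(keywords, doc_ids):
    """Run a filtered sltr logging query and return {doc_id: [feature values]}."""
    body = {
        "query": {
            "bool": {
                "filter": [
                    {"terms": {"_id": doc_ids}},
                    {"sltr": {
                        "_name": "logged_featureset",
                        "featureset": "more_movie_features",
                        "params": {"keywords": keywords}
                    }}
                ]
            }
        },
        "ext": {"ltr_log": {"log_specs": {"name": "log_entry1",
                                          "named_query": "logged_featureset"}}},
        "size": len(doc_ids)
    }
    hits = requests.post(f"{OPENSEARCH}/{INDEX}/_search", json=body).json()["hits"]["hits"]
    # Treat features without a logged value as 0 for training purposes.
    return {hit["_id"]: [f.get("value", 0.0) for f in hit["fields"]["_ltrlog"][0]["log_entry1"]]
            for hit in hits}

# Group the judgment list by keyword, then emit one training line per (keyword, document) pair.
judgments = {}  # keywords -> list of (grade, doc_id)
with open("judgments.csv") as f:
    for row in csv.DictReader(f):
        judgments.setdefault(row["keywords"], []).append((row["grade"], row["docId"]))

with open("training.txt", "w") as out:
    for qid, (keywords, graded_docs) in enumerate(judgments.items(), start=1):
        features = log_features(keywords, [doc_id for _, doc_id in graded_docs])
        for grade, doc_id in graded_docs:
            feats = " ".join(f"{i}:{v}" for i, v in enumerate(features.get(doc_id, []), start=1))
            out.write(f"{grade} qid:{qid} {feats} # {doc_id} {keywords}\n")
```

Each line written to `training.txt` uses the same `grade qid:N 1:value 2:value # docId keywords` layout as the judgment file shown in [Uploading trained models]({{site.url}}{{site.baseurl}}/search-plugins/ltr/training-models/).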
diff --git a/_search-plugins/ltr/searching-with-your-model.md b/_search-plugins/ltr/searching-with-your-model.md new file mode 100644 index 0000000000..ca1ff87307 --- /dev/null +++ b/_search-plugins/ltr/searching-with-your-model.md @@ -0,0 +1,102 @@ +--- +layout: default +title: Optimizing search with LTR +nav_order: 70 +parent: Learning to Rank +has_children: false +--- + +# Optimizing search with LTR + +After you have trained a model, you can use the `sltr` query to execute it. However, directly running the query on the entire index is not recommended because it can be CPU intensive and impact the performance of your OpenSearch cluster. The query allows you to apply your trained model to search results, as shown in the following example: + +```json + POST tmdb/_search + { + "query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } +``` +{% include copy-curl.html %} + +## Rescoring top N + +To execute your model more efficiently, you can use the built-in rescore functionality to apply your model to the top N results of a baseline relevance query, as shown in the following example query: + +```json + POST tmdb/_search + { + "query": { + "match": { + "_all": "rambo" + } + }, + "rescore": { + "window_size": 1000, + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } + } + } +``` +{% include copy-curl.html %} + +A `match` is first executed for the term `rambo` and then `my_model` is applied to the top 1,000 results. This baseline query is used to generate an initial set of results that are then scored using the default similarity BM25 probabilistic ranking framework to calculate relevance scores. + +## Rescoring a subset of features + +You can selectively score a subset of features by specifying the `active_features` in the `sltr` query, as shown in the following example. This allows you to focus the model's scoring on the selected features, while any unspecified features are marked as missing. You only need to specify the `params` relevant to the `active_features`. If you request a feature name that is not part of the assigned feature set, then the query throws an error. + +```json + POST tmdb/_search + { + "query": { + "match": { + "_all": "rambo" + } + }, + "rescore": { + "window_size": 1000, + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model", + "active_features": ["title_query"] + } + } + } + } + } +``` +{% include copy-curl.html %} + +The `my_model` model is applied but only scores the `title_query` feature. + +## Combining `sltr` with other OpenSearch features + +The `sltr` query can be integrated with the following OpenSearch features and functionalities to create more sophisticated and tailored search solutions that go beyond applying a model to your results: + +- Filtering out results based on business rules using OpenSearch filters before applying the model +- Chaining multiple rescores to refine the relevance of your results +- Rescoring once to address relevance with `sltr` and a second time for business concerns +- Downboosting relevant but low-quality content in the baseline query to prevent it from being rescored + +## Next steps + +Learn about [advanced functionality]({{site.url}}{{site.baseurl}}/search-plugins/ltr/advanced-functionality/). 
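The following example illustrates the first item in the list in [Combining `sltr` with other OpenSearch features](#combining-sltr-with-other-opensearch-features): applying business rules as filters before the model runs. It assumes the `tmdb` index and the `my_model` model used throughout this page; the `vote_average` threshold is only a placeholder for your own business rule:

```json
POST tmdb/_search
{
  "query": {
    "bool": {
      "must": {
        "match": { "_all": "rambo" }
      },
      "filter": {
        "range": { "vote_average": { "gte": 5 } }
      }
    }
  },
  "rescore": {
    "window_size": 1000,
    "query": {
      "rescore_query": {
        "sltr": {
          "params": {
            "keywords": "rambo"
          },
          "model": "my_model"
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

Because the filter runs in the query phase, documents that violate the business rule never reach the rescore window, so the model is only applied to candidates you are willing to show.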
diff --git a/_search-plugins/ltr/training-models.md b/_search-plugins/ltr/training-models.md new file mode 100644 index 0000000000..fb068cedd7 --- /dev/null +++ b/_search-plugins/ltr/training-models.md @@ -0,0 +1,335 @@
+---
+layout: default
+title: Uploading trained models
+nav_order: 60
+parent: Learning to Rank
+has_children: false
+---
+
+# Uploading trained models
+
+While model training occurs outside of the Learning to Rank plugin, you can use the plugin for [logging feature scores]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/). After you have trained a model, you can upload it to the plugin in the available serialization formats, such as RankLib and XGBoost.
+
+## RankLib model training
+
+The feature logging process generates a RankLib-consumable judgment file. In the following judgment file, the query with ID 1 (`rambo`) includes the logged features 1 (a title `TF*IDF` score) and 2 (a description `TF*IDF` score) for a set of documents:
+
+```
+4 qid:1 1:9.8376875 2:12.318446 # 7555 rambo
+3 qid:1 1:10.7808075 2:9.510193 # 1370 rambo
+3 qid:1 1:10.7808075 2:6.8449354 # 1369 rambo
+3 qid:1 1:10.7808075 2:0.0 # 1368 rambo
+```
+
+The RankLib library can be called using the following command:
+
+```
+cmd = "java -jar RankLib-2.8.jar -ranker %s -train %s -save %s -frate 1.0" % (whichModel, judgmentsWithFeaturesFile, modelOutput)
+```
+
+The `judgmentsWithFeaturesFile` is the input provided to RankLib for training. Additional parameters can be passed. See the [RankLib documentation](https://sourceforge.net/p/lemur/wiki/RankLib/) for more information.
+
+RankLib outputs the model in its own serialization format. As shown in the following example, a LambdaMART model is an ensemble of regression trees:
+
+```
+## LambdaMART
+## No. of trees = 1000
+## No. of leaves = 10
+## No. of threshold candidates = 256
+## Learning rate = 0.1
+## Stop early = 100
+
+<ensemble>
+   <tree id="1" weight="0.1">
+      <split>
+         <feature> 2 </feature>
+         ...
+```
+
+Within the RankLib model, each tree in the ensemble examines feature values, makes decisions based on these feature values, and outputs the relevance scores. The features are referred to by their ordinal position, starting from 1, which corresponds to the 0th feature in the original feature set. RankLib does not use feature names during model training.
+
+### Other RankLib models
+
+RankLib is a library that implements several other model types in addition to LambdaMART, such as MART, RankNet, RankBoost, AdaRank, Coordinate Ascent, ListNet, and Random Forests. Each of these models has its own set of parameters and training process.
+
+For example, the RankNet model is a neural network that learns to predict the probability of a document being more relevant than another document. The model is trained using a pairwise loss function that compares the predicted relevance of two documents with the actual relevance. The model is serialized in a format similar to the following example:
+
+```
+## RankNet
+## Epochs = 100
+## No. of features = 5
+## No. of hidden layers = 1
+...
+## Layer 1: 10 neurons
+1 2
+1
+10
+0 0 -0.013491530393429608 0.031183180961270988 0.06558792020112071 -0.006024092627087733 0.05729619574181734 -0.0017010373987742411 0.07684848696852313 -0.06570387602230028 0.04390491141617467 0.013371636736099578
+...
+```
+
+All these models can be used with the Learning to Rank plugin, provided that the model is serialized in the RankLib format.
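If you script your training runs, the command shown previously can be wrapped in a small helper. The following is a minimal sketch, not an official tool: it assumes that `RankLib-2.8.jar` and a `judgments_with_features.txt` training file are present in the working directory, and it uses RankLib's numeric ranker IDs (`6` selects LambdaMART):

```python
import subprocess

ranker = "6"                                   # 6 = LambdaMART in RankLib's CLI
training_file = "judgments_with_features.txt"  # judgment list joined with logged feature values
model_output = "lambdamart_model.txt"          # serialized model to upload to the plugin

subprocess.run(
    [
        "java", "-jar", "RankLib-2.8.jar",
        "-ranker", ranker,
        "-train", training_file,
        "-metric2t", "NDCG@10",                # metric to optimize during training
        "-save", model_output,
    ],
    check=True,
)
```

The saved file is what you later pass as the `definition` when uploading the model.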
+ +## XGBoost model training + +Unlike the RankLib model, the XGBoost model is serialized in a format specific to gradient-boosted decision trees, as shown in the following example: + +```json + [ { "nodeid": 0, "depth": 0, "split": "tmdb_multi", "split_condition": 11.2009, "yes": 1, "no": 2, "missing": 1, "children": [ + { "nodeid": 1, "depth": 1, "split": "tmdb_title", "split_condition": 2.20631, "yes": 3, "no": 4, "missing": 3, "children": [ + { "nodeid": 3, "leaf": -0.03125 }, + ... +``` + +## XGBoost parameters + +Optional parameters can be specified for an XGBoost model. These parameters are specified as an object, with the decision trees specified in the `splits` field. The supported parameters include `objective`, which defines the model learning objective as described in the [XGBoost documentation](https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters). This parameter can transform the final model prediction. The supported values include `binary:logistic`, `binary:logitraw`, `rank:ndcg`, `rank:map`, `rank:pairwise`, `reg:linear`, and `reg:logistic`. + +## Simple linear models + +Machine learning (ML) models, such as Support Vector Machines (SVMs), output linear weights for each feature. The LTR model supports representing these linear weights in a simple format, such as those learned from an SVM or linear regression model. In the following example output, the weights indicate the relative importance of the features in the model's prediction: + +```json +{ + "title_query" : 0.3, + "body_query" : 0.5, + "recency" : 0.1 +} +``` + +## Feature normalization + +Feature normalization is used to convert feature values to a consistent range, typically between 0 and 1 or -1 and 1. This is done during the training phase to better understand the relative impact of each feature. Some models, especially linear ones such as SVMRank, rely on normalization to function correctly. + +## Model upload process + +After training your model, the next step is to make it available for search operations. This involves uploading the model to the Learning to Rank plugin. When uploading a model, you must provide the following information: + +- Feature set used during training +- Model type, for example, RankLib or XGBoost +- Model content + +The following example request shows how to upload a RankLib model that was trained using the `more_movie_features` feature set: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_ranklib_model", + "model": { + "type": "model/ranklib", + "definition": "## LambdaMART\n + ## No. of trees = 1000 + ## No. of leaves = 10 + ## No. of threshold candidates = 256 + ## Learning rate = 0.1 + ## Stop early = 100 + + + + + 2 + ... + " + } + } + } +``` + +The following example request shows how to upload an XGBoost model that was trained using the `more_movie_features` feature set: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_xgboost_model", + "model": { + "type": "model/xgboost+json", + "definition": "[ { \"nodeid\": 0, \"depth\": 0, \"split\": \"tmdb_multi\", \"split_condition\": 11.2009, \"yes\": 1, \"no\": 2, \"missing\": 1, \"children\": [ + { \"nodeid\": 1, \"depth\": 1, \"split\": \"tmdb_title\", \"split_condition\": 2.20631, \"yes\": 3, \"no\": 4, \"missing\": 3, \"children\": [ + { \"nodeid\": 3, \"leaf\": -0.03125 }, + ..." 
+ } + } + } +``` + +The following example request shows how to upload an XGBoost model that was trained using the `more_movie_features` feature set with parameters: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_xgboost_model", + "model": { + "type": "model/xgboost+json", + "definition": "{ + \"objective\": \"reg:logistic\", + \"splits\": [ { \"nodeid\": 0, \"depth\": 0, \"split\": \"tmdb_multi\", \"split_condition\": 11.2009, \"yes\": 1, \"no\": 2, \"missing\": 1, \"children\": [ + { \"nodeid\": 1, \"depth\": 1, \"split\": \"tmdb_title\", \"split_condition\": 2.20631, \"yes\": 3, \"no\": 4, \"missing\": 3, \"children\": [ + { \"nodeid\": 3, \"leaf\": -0.03125 }, + ... + ] + }" + } + } + } +```` + +The following example request shows how to upload a simple linear model that was trained using the `more_movie_features` feature set: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_linear_model", + "model": { + "type": "model/linear", + "definition": """ + { + "title_query" : 0.3, + "body_query" : 0.5, + "recency" : 0.1 + } + """ + } + } + } +``` + +## Creating a model with feature normalization + +Feature normalization is a crucial preprocessing step that can be applied before model evaluation. LTR supports two types of feature normalization: min-max and standard normalization. + +### Standard normalization + +Standard normalization transforms features as follows: + +- Maps the mean value to 0 +- Maps one standard deviation above the mean to 1 +- Maps one standard deviation below the mean to -1 + +The following example request shows how to create a model with standard feature normalization: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_linear_model", + "model": { + "type": "model/linear", + "feature_normalizers": { + "release_year": { + "standard": { + "mean": 1970, + "standard_deviation": 30 + } + } + }, + "definition": """ + { + "release_year" : 0.3, + "body_query" : 0.5, + "recency" : 0.1 + } + """ + } + } + } +``` + +### Min-max normalization + +Min-max normalization scales features to a fixed range, typically between 0 and 1. Min-max normalization transforms features as follows: + +- Maps the specified minimum value to 0 +- Maps the specified maximum value to 1 +- Scales the values between 0 and 1 linearly + +The following example request shows how to implement min-max normalization: + +```json + "feature_normalizers": { + "vote_average": { + "min_max": { + "minimum": 0, + "maximum": 10 + } + } + } +``` + +## Model independence from feature sets + +Models are initially created with reference to a feature set. After their creation, they exist as independent top-level entities. + +### Accessing models + +To retrieve a model, use a GET request: + +``` +GET _ltr/_model/my_linear_model +``` + +To delete a model, use a DELETE request: + +``` +DELETE _ltr/_model/my_linear_model +``` + +Model names must be globally unique across all feature sets. +{: .note} + +### Model persistence + +When a model is created, its features are copied. This prevents changes to the original features from affecting existing models or model production. For example, if the feature set used to create the model is deleted, you can still access and use the model. 
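For example, assuming the `my_linear_model` model and `more_movie_features` feature set created earlier on this page, deleting the feature set does not affect the stored model (if you are following along, you can re-create the feature set afterward):

```
DELETE _ltr/_featureset/more_movie_features

GET _ltr/_model/my_linear_model
```

The `GET` request still returns the model, along with the copied feature definitions shown in the following section.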
+ +### Model response + +When retrieving a model, you receive a response that includes the features used to create it, as shown in the following example: + +```json + { + "_index": ".ltrstore", + "_type": "store", + "_id": "model-my_linear_model", + "_version": 1, + "found": true, + "_source": { + "name": "my_linear_model", + "type": "model", + "model": { + "name": "my_linear_model", + "feature_set": { + "name": "more_movie_features", + "features": [ + { + "name": "body_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "overview": "{{keywords}}" + } + } + }, + { + "name": "title_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "title": "{{keywords}}" + } + } + } + ]}}} +``` + +## Next steps + +Learn about [searching with LTR]({{site.url}}{{site.baseurl}}/search-plugins/ltr/searching-with-your-model/). diff --git a/_search-plugins/ltr/working-with-features.md b/_search-plugins/ltr/working-with-features.md new file mode 100644 index 0000000000..00ebd908d7 --- /dev/null +++ b/_search-plugins/ltr/working-with-features.md @@ -0,0 +1,270 @@ +--- +layout: default +title: Working with features +nav_order: 30 +parent: Learning to Rank +has_children: false +--- + +# Working with features + +The following sections describe the specific functionality provided by the Learning to Rank plugin. This information will help you build and upload features for your learning to rank (LTR) system. See [ML ranking core concepts]({{site.url}}{{site.baseurl}}/search-plugins/ltr/core-concepts/) and [Scope of the plugin]({{site.url}}{{site.baseurl}}/search-plugins/ltr/fits-in/) for more information about the Learning to Rank plugin's roles and functionality. + +## Understanding the role of features in the Learning to Rank plugin + +The Learning to Rank plugin defines a _feature_ as an _OpenSearch query_. When you execute an OpenSearch query using your search terms and other relevant parameters, the resulting score is the value that can be used in your training data. For example, a feature may include basic `match` queries on fields such as `title`: + +```json +{ + "query": { + "match": { + "title": "{% raw %}{{keywords}}{% endraw %}" + } + } +} +``` +{% include copy-curl.html %} + +In addition to simple query-based features, you can also use document properties, such as `popularity`, as features. For example, you can use a function score query to get the average movie rating: + +```json +{ + "query": { + "function_score": { + "functions": { + "field": "vote_average" + }, + "query": { + "match_all": {} + } + } + } +} +``` +{% include copy-curl.html %} + +Another example is a query based on location, such as a geodistance filter: + +```json +{ + "query": { + "bool" : { + "must" : { + "match_all" : {} + }, + "filter" : { + "geo_distance" : { + "distance" : "200km", + "pin.location" : { + "lat" : "{% raw %}{{users_lat}}{% endraw %}", + "lon" : "{% raw %}{{users_lon}}{% endraw %}" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +These types of queries are the building blocks that the ranking `f` function you are training combines mathematically to determine a relevance score. + +## Using Mustache templates in LTR queries + +The features in LTR queries use Mustache templates. This allows you to insert variables into your search queries. For example, you could have a query that uses `{% raw %}{{keywords}}{% endraw %}` to insert your search terms. Or you could use `{% raw %}{{users_lat}}{% endraw %}` and `{% raw %}{{users_lon}}{% endraw %}` to include the location. 
This gives you the flexibility to personalize your search. + +## Uploading and naming features + +The Learning to Rank plugin enables you to create and modify features. After you define your features, you can log them for use in model training. By combining the logged feature data with your judgment list, you can train a model. Once the model is ready, you can upload it and then apply it to your search queries. + +## Initializing the default feature store + +The Learning to Rank plugin uses a feature store to store metadata about your features and models. Typically, there is one feature store per major search implementation, for example, [Wikipedia](http://wikipedia.org) as compared to [Wikitravel](http://wikitravel.org). + +For most uses cases, you can use the default feature store and avoid managing multiple feature stores. To initialize the default feature store, run the following request: + +``` +PUT _ltr +``` +{% include copy-curl.html %} + +If you need to start again from the beginning, you can delete the default feature store by using the following operation: + +``` +DELETE _ltr +``` +{% include copy-curl.html %} + +Deleting the feature store removes all existing feature and model data. +{: .warning} + +The default feature store is used throughout the rest of this guide. + +## Working with features and feature sets + +A _feature set_ is a collection of features that have been grouped together. You can use feature sets to log multiple feature values for offline training. When creating a new model, you copy the relevant feature set into the model definition. + +## Creating feature sets + +To create a feature set, you can send a POST request. When creating the feature set, you provide a name and an optional list of features, as shown in the following example request: + +```json +POST _ltr/_featureset/more_movie_features +{ + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{% raw %}{{keywords}}{% endraw %}" + } + } + }, + { + "name": "title_query_boost", + "params": [ + "some_multiplier" + ], + "template_language": "derived_expression", + "template": "title_query * some_multiplier" + }, + { + "name": "custom_title_query_boost", + "params": [ + "some_multiplier" + ], + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "params.feature_vector.get('title_query') * (long)params.some_multiplier", + "params": { + "some_multiplier": "some_multiplier" + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Managing feature sets + +To fetch a specific feature set, you can use the following request: + +``` +GET _ltr/_featureset/more_movie_features +``` +{% include copy-curl.html %} + +To see a list of all defined feature sets, you can use the following request: + +``` +GET _ltr/_featureset +``` +{% include copy-curl.html %} + +If you have many feature sets, you can filter the list by using a prefix, as shown in the following example request: + +``` +GET _ltr/_featureset?prefix=mor +``` +{% include copy-curl.html %} + +This returns only the feature sets with names starting with `mor`. + +If you need to start over, you can delete a feature set using the following request: + +``` +DELETE _ltr/_featureset/more_movie_features +``` +{% include copy-curl.html %} + +## Validating features + +When adding new features, you should validate that the features work as expected. 
You can do this by adding a `validation` block in your feature creation request. This allows the Learning to Rank plugin to run the query before adding the feature, catching any issues early. If you do not run this validation, you may not discover until later that the query, while valid JSON, contains a malformed OpenSearch query. + +To run validation, you can specify the test parameters and the index to use, as shown in the following example validation block: + +```json +"validation": { + "params": { + "keywords": "rambo" + }, + "index": "tmdb" +}, +``` +{% include copy-curl.html %} + +Place the validation block alongside your feature set definition. In the following example, the `match` query is malformed (curly brackets are missing in the Mustache template). The validation fails, returning an error: + +```json +{ + "validation": { + "params": { + "keywords": "rambo" + }, + "index": "tmdb" + }, + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{% raw %}{{keywords{% endraw %}" + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Expanding feature sets + +You may not initially know which features are the most useful. In these cases, you can later add new features to an existing feature set for logging and model evaluation. For example, if you want to create a `user_rating` feature, you can use the Feature Set Append API, as shown in the following example request: + +```json +POST /_ltr/_featureset/my_featureset/_addfeatures +{ + "features": [{ + "name": "user_rating", + "params": [], + "template_language": "mustache", + "template" : { + "function_score": { + "functions": { + "field": "vote_average" + }, + "query": { + "match_all": {} + } + } + } + }] +} +``` +{% include copy-curl.html %} + +## Enforcing unique feature names + +The Learning to Rank plugin enforces unique names for each feature. This is because some model training libraries refer to features by name. In the preceding example, you could not add a new `user_rating` feature without causing an error because that feature name is already in use. + +## Treating feature sets as lists + +Feature sets are more like ordered lists than simple sets. Each feature has both a name and an ordinal position. Some LTR training applications, such as RankLib, refer to features by their ordinal position (for example, 1st feature, 2nd feature). Others may use the feature name. When working with logged features, you may need to handle both the ordinal and the name because the ordinal is preserved to maintain the list order. + +## Next steps + +Learn about [feature engineering]({{site.url}}{{site.baseurl}}/search-plugins/ltr/feature-engineering/) and [advanced functionality]({{site.url}}{{site.baseurl}}/search-plugins/ltr/advanced-functionality/). 
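As a companion to the [Validating features](#validating-features) section earlier on this page, the following is a complete, correctly formed request that creates a feature set and validates its single feature against the `tmdb` index before saving it. The `validated_movie_features` name, index name, and parameters are placeholders for your own data:

```json
POST _ltr/_featureset/validated_movie_features
{
  "validation": {
    "params": {
      "keywords": "rambo"
    },
    "index": "tmdb"
  },
  "featureset": {
    "features": [
      {
        "name": "title_query",
        "params": [
          "keywords"
        ],
        "template_language": "mustache",
        "template": {
          "match": {
            "title": "{% raw %}{{keywords}}{% endraw %}"
          }
        }
      }
    ]
  }
}
```
{% include copy-curl.html %}

If the template is malformed, as in the earlier failing example, the request returns an error before the feature set is stored.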
diff --git a/_search-plugins/neural-search-tutorial.md b/_search-plugins/neural-search-tutorial.md index 5f5a5fe79e..9c1b224cb8 100644 --- a/_search-plugins/neural-search-tutorial.md +++ b/_search-plugins/neural-search-tutorial.md @@ -55,13 +55,9 @@ For this simple setup, you'll use an OpenSearch-provided machine learning (ML) m PUT _cluster/settings { "persistent": { - "plugins": { - "ml_commons": { - "only_run_on_ml_node": "false", - "model_access_control_enabled": "true", - "native_memory_threshold": "99" - } - } + "plugins.ml_commons.only_run_on_ml_node": "false", + "plugins.ml_commons.model_access_control_enabled": "true", + "plugins.ml_commons.native_memory_threshold": "99" } } ``` diff --git a/_search-plugins/search-pipelines/collapse-processor.md b/_search-plugins/search-pipelines/collapse-processor.md index 8a2723efa7..a802b48aca 100644 --- a/_search-plugins/search-pipelines/collapse-processor.md +++ b/_search-plugins/search-pipelines/collapse-processor.md @@ -20,7 +20,7 @@ Using the `collapse` response processor will likely result in fewer than `size` from a set whose size is already less than or equal to `size`. To increase the likelihood of returning `size` hits, use the `oversample` request processor and `truncate_hits` response processor, as shown in [this example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/#oversample-collapse-and-truncate-hits). -## Request fields +## Request body fields The following table lists all request fields. diff --git a/_search-plugins/search-pipelines/filter-query-processor.md b/_search-plugins/search-pipelines/filter-query-processor.md index 799d393e42..a55180f53c 100644 --- a/_search-plugins/search-pipelines/filter-query-processor.md +++ b/_search-plugins/search-pipelines/filter-query-processor.md @@ -13,7 +13,7 @@ Introduced 2.8 The `filter_query` search request processor intercepts a search request and applies an additional query to the request, filtering the results. This is useful when you don't want to rewrite existing queries in your application but need additional filtering of the results. -## Request fields +## Request body fields The following table lists all available request fields. diff --git a/_search-plugins/search-pipelines/ml-inference-search-response.md b/_search-plugins/search-pipelines/ml-inference-search-response.md index e2ed7889c7..b0573d17be 100644 --- a/_search-plugins/search-pipelines/ml-inference-search-response.md +++ b/_search-plugins/search-pipelines/ml-inference-search-response.md @@ -48,7 +48,7 @@ The following is the syntax for the `ml-inference` search response processor: ``` {% include copy-curl.html %} -## Request fields +## Request body fields The following table lists the required and optional parameters for the `ml-inference` search response processor. @@ -96,7 +96,188 @@ For local models, you must provide a `model_input` field that specifies the mode For remote models, the `model_input` field is optional, and its default value is `"{ \"parameters\": ${ml_inference.parameters} }`. -### Example: Externally hosted model +### Example: Local model + +The following example shows you how to configure an `ml_inference` search response processor with a local model. + +**Step 1: Create a pipeline** + +The following example shows you how to create a search pipeline for the `huggingface/sentence-transformers/all-distilroberta-v1` local model. 
The model is a [pretrained sentence transformer model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sentence-transformers) hosted in your OpenSearch cluster. + +If you invoke the model using the Predict API, then the request appears as follows: + +```json +POST /_plugins/_ml/_predict/text_embedding/cleMb4kBJ1eYAeTMFFg4 +{ + "text_docs":[ "today is sunny"], + "return_number": true, + "target_response": ["sentence_embedding"] +} +``` + +Using this schema, specify the `model_input` as follows: + +```json + "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }" +``` + +In the `input_map`, map the `passage_text` document field to the `text_docs` field expected by the model: + +```json +"input_map": [ + { + "text_docs": "passage_text" + } +] +``` + +Because you specified the field to be converted into embeddings as a JSON path, you need to set the `full_response_path` to `true`. Then the full JSON document is parsed in order to obtain the input field: + +```json +"full_response_path": true +``` + +The text in the `passage_text` field will be used to generate embeddings: + +```json +{ + "passage_text": "hello world" +} +``` + +The Predict API request returns the following response: + +```json +{ + "inference_results" : [ + { + "output" : [ + { + "name" : "sentence_embedding", + "data_type" : "FLOAT32", + "shape" : [ + 768 + ], + "data" : [ + 0.25517133, + -0.28009856, + 0.48519906, + ... + ] + } + ] + } + ] +} +``` + +The model generates embeddings in the `$.inference_results.*.output.*.data` field. The `output_map` maps this field to the newly created `passage_embedding` field in the search response document: + +```json +"output_map": [ + { + "passage_embedding": "$.inference_results.*.output.*.data" + } +] +``` + +To configure an `ml_inference` search response processor with a local model, specify the `function_name` explicitly. In this example, the `function_name` is `text_embedding`. For information about valid `function_name` values, see [Request fields](#request-body-fields). 
+ +The following is the final configuration of the `ml_inference` search response processor with the local model: + +```json +PUT /_search/pipeline/ml_inference_pipeline_local +{ + "description": "search passage and generates embeddings", + "processors": [ + { + "ml_inference": { + "function_name": "text_embedding", + "full_response_path": true, + "model_id": "", + "model_config": { + "return_number": true, + "target_response": ["sentence_embedding"] + }, + "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }", + "input_map": [ + { + "text_docs": "passage_text" + } + ], + "output_map": [ + { + "passage_embedding": "$.inference_results.*.output.*.data" + } + ], + "ignore_missing": true, + "ignore_failure": true + } + } + ] +} +``` +{% include copy-curl.html %} + +**Step 2: Run the pipeline** + +Run the following query, providing the pipeline name in the request: + +```json +GET /my_index/_search?search_pipeline=ml_inference_pipeline_local +{ +"query": { + "term": { + "passage_text": { + "value": "hello" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Response + +The response confirms that the processor has generated text embeddings in the `passage_embedding` field: + +```json +{ + "took": 288, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.00009405752, + "hits": [ + { + "_index": "my_index", + "_id": "1", + "_score": 0.00009405752, + "_source": { + "passage_text": "hello world", + "passage_embedding": [ + 0.017304314, + -0.021530833, + 0.050184276, + 0.08962978, + ...] + } + } + ] + } +} +``` + +### Example: Externally hosted text embedding model The following example shows you how to configure an `ml_inference` search response processor with an externally hosted model. @@ -172,7 +353,6 @@ GET /my_index/_search?search_pipeline=ml_inference_pipeline_local The response confirms that the processor has generated text embeddings in the `passage_embedding` field. The document within `_source` now contains both the `passage_text` and `passage_embedding` fields: ```json - { "took": 288, "timed_out": false, @@ -209,140 +389,312 @@ The response confirms that the processor has generated text embeddings in the `p } ``` -### Example: Local model +### Example: Externally hosted large language model -The following example shows you how to configure an `ml_inference` search response processor with a local model. +This example demonstrates how to configure an `ml_inference` search response processor to work with an externally hosted large language model (LLM) and map the model's response to the search extension object. Using the `ml_inference` processor, you can enable an LLM to summarize search results directly within the response. The summary is included in the `ext` field of the search response, providing seamless access to AI-generated insights alongside the original search results. -**Step 1: Create a pipeline** +**Prerequisite** -The following example shows you how to create a search pipeline for the `huggingface/sentence-transformers/all-distilroberta-v1` local model. The model is a [pretrained sentence transformer model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sentence-transformers) hosted in your OpenSearch cluster. +You must configure an externally hosted LLM for this use case. 
For more information about externally hosted models, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). Once you register the LLM, you can use the following request to test it. This request requires providing the `prompt` and `context` fields: -If you invoke the model using the Predict API, then the request appears as follows: +```json +POST /_plugins/_ml/models/KKne6JIBAs32TwoK-FFR/_predict +{ + "parameters": { + "prompt":"\n\nHuman: You are a professional data analysist. You will always answer question: Which month had the lowest customer acquisition cost per new customer? based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say I don't know. Context: ${parameters.context.toString()}. \n\n Assistant:", + "context":"Customer acquisition cost: January: $50, February: $45, March: $40. New customers: January: 500, February: 600, March: 750" + } +} +``` +{% include copy-curl.html %} + +The response contains the model output in the `inference_results` field: ```json -POST /_plugins/_ml/_predict/text_embedding/cleMb4kBJ1eYAeTMFFg4 { - "text_docs":[ "today is sunny"], - "return_number": true, - "target_response": ["sentence_embedding"] + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "response": """ Based on the data provided: + + - Customer acquisition cost in January was $50 and new customers were 500. So cost per new customer was $50/500 = $0.10 + - Customer acquisition cost in February was $45 and new customers were 600. So cost per new customer was $45/600 = $0.075 + - Customer acquisition cost in March was $40 and new customers were 750. So cost per new customer was $40/750 = $0.053 + + Therefore, the month with the lowest customer acquisition cost per new customer was March, at $0.053.""" + } + } + ], + "status_code": 200 + } + ] } ``` -Using this schema, specify the `model_input` as follows: +**Step 1: Create a pipeline** + +Create a search pipeline for the registered model. The model requires a `context` field as input. The model response summarizes the text in the `review` field and stores the summary in the `ext.ml_inference.llm_response` field of the search response: ```json - "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }" +PUT /_search/pipeline/my_pipeline_request_review_llm +{ + "response_processors": [ + { + "ml_inference": { + "tag": "ml_inference", + "description": "This processor is going to run llm", + "model_id": "EOF6wJIBtDGAJRTD4kNg", + "function_name": "REMOTE", + "input_map": [ + { + "context": "review" + } + ], + "output_map": [ + { + "ext.ml_inference.llm_response": "response" + } + ], + "model_config": { + "prompt": "\n\nHuman: You are a professional data analysist. You will always answer question: Which month had the lowest customer acquisition cost per new customer? based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say I don't know. Context: ${parameters.context.toString()}. 
\n\n Assistant:" + }, + "ignore_missing": false, + "ignore_failure": false + } + } + ] +} ``` +{% include copy-curl.html %} -In the `input_map`, map the `passage_text` document field to the `text_docs` field expected by the model: +In this configuration, you've provided the following parameters: + +- The `model_id` parameter specifies the ID of the generative AI model. +- The `function_name` parameter is set to `REMOTE`, indicating that the model is hosted externally. +- The `input_map` parameter maps the review field from the document to the context field expected by the model. +- The `output_map` parameter specifies that the model's response should be stored in `ext.ml_inference.llm_response` in the search response. +- The `model_config` parameter includes a prompt that tells the model how to process the input and generate a summary. + +**Step 2: Index sample documents** + +Index some sample documents to test the pipeline: ```json -"input_map": [ - { - "text_docs": "passage_text" +POST /_bulk +{"index":{"_index":"review_string_index","_id":"1"}} +{"review":"Customer acquisition cost: January: $50, New customers: January: 500."} +{"index":{"_index":"review_string_index","_id":"2"}} +{"review":"Customer acquisition cost: February: $45, New customers: February: 600."} +{"index":{"_index":"review_string_index","_id":"3"}} +{"review":"Customer acquisition cost: March: $40, New customers: March: 750."} +``` +{% include copy-curl.html %} + +**Step 3: Run the pipeline** + +Run a search query using the pipeline: + +```json +GET /review_string_index/_search?search_pipeline=my_pipeline_request_review_llm +{ + "query": { + "match_all": {} } -] +} ``` +{% include copy-curl.html %} -Because you specified the field to be converted into embeddings as a JSON path, you need to set the `full_response_path` to `true`. Then the full JSON document is parsed in order to obtain the input field: +The response includes the original documents and the generated summary in the `ext.ml_inference.llm_response` field: ```json -"full_response_path": true +{ + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "review_string_index", + "_id": "1", + "_score": 1, + "_source": { + "review": "Customer acquisition cost: January: $50, New customers: January: 500." + } + }, + { + "_index": "review_string_index", + "_id": "2", + "_score": 1, + "_source": { + "review": "Customer acquisition cost: February: $45, New customers: February: 600." + } + }, + { + "_index": "review_string_index", + "_id": "3", + "_score": 1, + "_source": { + "review": "Customer acquisition cost: March: $40, New customers: March: 750." + } + } + ] + }, + "ext": { + "ml_inference": { + "llm_response": """ Based on the context provided: + + - Customer acquisition cost in January was $50 and new customers were 500. So the cost per new customer was $50/500 = $0.10 + + - Customer acquisition cost in February was $45 and new customers were 600. So the cost per new customer was $45/600 = $0.075 + + - Customer acquisition cost in March was $40 and new customers were 750. 
So the cost per new customer was $40/750 = $0.053 + + Therefore, the month with the lowest customer acquisition cost per new customer was March, as it had the lowest cost per customer of $0.053.""" + } + } +} ``` -The text in the `passage_text` field will be used to generate embeddings: +### Example: Reranking search results using a text similarity model + +The following example shows you how to configure an `ml_inference` search response processor with a text similarity model. + +**Prerequisite** + +You must configure an externally hosted text similarity model for this use case. For more information about externally hosted models, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). Once you register the text similarity model, you can use the following request to test it. This request requires that you provide the `text` and `text_pair` fields within the `inputs` field: ```json +POST /_plugins/_ml/models/Ialx65IBAs32TwoK1lXf/_predict { - "passage_text": "hello world" + "parameters": { + "inputs": + { + "text": "I like you", + "text_pair": "I hate you" + } + } } ``` +{% include copy-curl.html %} -The Predict API request returns the following response: +The model returns similarity scores for each input document: ```json { - "inference_results" : [ + "inference_results": [ { - "output" : [ + "output": [ { - "name" : "sentence_embedding", - "data_type" : "FLOAT32", - "shape" : [ - 768 - ], - "data" : [ - 0.25517133, - -0.28009856, - 0.48519906, - ... - ] + "name": "response", + "dataAsMap": { + "label": "LABEL_0", + "score": 0.022704314440488815 + } } - ] + ], + "status_code": 200 } ] } ``` +{% include copy-curl.html %} -The model generates embeddings in the `$.inference_results.*.output.*.data` field. The `output_map` maps this field to the newly created `passage_embedding` field in the search response document: +**Step 1: Index sample documents** + +Create an index and add some sample documents: ```json -"output_map": [ - { - "passage_embedding": "$.inference_results.*.output.*.data" - } -] +POST _bulk +{"index":{"_index":"demo-index-0","_id":"1"}} +{"diary":"I hate you"} +{"index":{"_index":"demo-index-0","_id":"2"}} +{"diary":"I love you"} +{"index":{"_index":"demo-index-0","_id":"3"}} +{"diary":"I dislike you"} ``` +{% include copy-curl.html %} -To configure an `ml_inference` search response processor with a local model, specify the `function_name` explicitly. In this example, the `function_name` is `text_embedding`. For information about valid `function_name` values, see [Request fields](#request-fields). +**Step 2: Create a search pipeline** -The following is the final configuration of the `ml_inference` search response processor with the local model: +For this example, you'll create a search pipeline that uses a text similarity model in a `one-to-one` inference mode, processing each document in the search results individually. This setup allows the model to make one prediction request per document, providing specific relevance insights for each search hit. 
When using `input_map` to map the search request to query text, the JSON path must start with `$._request` or `_request`: ```json -PUT /_search/pipeline/ml_inference_pipeline_local +PUT /_search/pipeline/my_rerank_pipeline { - "description": "search passage and generates embeddings", - "processors": [ + "response_processors": [ { "ml_inference": { - "function_name": "text_embedding", - "full_response_path": true, - "model_id": "", - "model_config": { - "return_number": true, - "target_response": ["sentence_embedding"] - }, - "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }", + "tag": "ml_inference", + "description": "This processor runs ml inference during search response", + "model_id": "Ialx65IBAs32TwoK1lXf", + "model_input":"""{"parameters":{"inputs":{"text":"${input_map.text}","text_pair":"${input_map.text_pair}"}}}""", + "function_name": "REMOTE", "input_map": [ { - "text_docs": "passage_text" + "text": "diary", + "text_pair":"$._request.query.term.diary.value" } ], "output_map": [ { - "passage_embedding": "$.inference_results.*.output.*.data" + "rank_score": "$.score" } ], - "ignore_missing": true, - "ignore_failure": true - } + "full_response_path": false, + "model_config": {}, + "ignore_missing": false, + "ignore_failure": false, + "one_to_one": true + }, + "rerank": { + "by_field": { + "target_field": "rank_score", + "remove_target_field": true + } + } } ] } ``` {% include copy-curl.html %} -**Step 2: Run the pipeline** +In this configuration, you've provided the following parameters: -Run the following query, providing the pipeline name in the request: +- The `model_id` parameter specifies the unique identifier of the text similarity model. +- The `function_name` parameter is set to `REMOTE`, indicating that the model is hosted externally. +- The `input_map` parameter maps the `diary` field from each document to the `text` input of the model as well as the search query term to the `text_pair` input. +- The `output_map` parameter maps the model's score to a field named `rank_score` in each document. +- The `model_input` parameter formats the input for the model, ensuring that it matches the structure expected by the Predict API. +- The `one_to_one` parameter is set to `true`, ensuring that the model processes each document individually rather than batching multiple documents together. +- The `ignore_missing` parameter is set to `false`, causing the processor to fail if the mapped fields are missing from a document. +- The `ignore_failure` parameter is set to `false`, causing the entire pipeline to fail if the ML inference processor encounters an error. + +The `rerank` processor is applied after ML inference. It reorders the documents based on the `rank_score` field generated by the ML model and then removes this field from the final results. 
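+
+To illustrate how these mappings are resolved, consider a document whose `diary` field contains `"I hate you"` and a search request whose term value is `"today"`. The processor substitutes these values into the `model_input` template and sends the model a Predict request body roughly equivalent to the following (shown only as a sketch for clarity; the processor issues this call internally, so you don't run it yourself):
+
+```json
+{
+  "parameters": {
+    "inputs": {
+      "text": "I hate you",
+      "text_pair": "today"
+    }
+  }
+}
+```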
+
+**Step 3: Run the pipeline**
+
+Now perform a search using the created pipeline:
 
 ```json
-GET /my_index/_search?search_pipeline=ml_inference_pipeline_local
+GET /demo-index-0/_search?search_pipeline=my_rerank_pipeline
 {
-"query": {
-  "term": {
-    "passage_text": {
-      "value": "hello"
+  "query": {
+    "term": {
+      "diary": {
+        "value": "today"
       }
     }
   }
@@ -350,13 +702,11 @@ GET /my_index/_search?search_pipeline=ml_inference_pipeline_local
 ```
 {% include copy-curl.html %}
 
-#### Response
-
-The response confirms that the processor has generated text embeddings in the `passage_embedding` field:
+The response includes the original documents and their reranked scores:
 
 ```json
 {
-  "took": 288,
+  "took": 2,
   "timed_out": false,
   "_shards": {
     "total": 1,
@@ -366,26 +716,43 @@ The response confirms that the processor has generated text embeddings in the `p
   },
   "hits": {
     "total": {
-      "value": 1,
+      "value": 3,
       "relation": "eq"
     },
-    "max_score": 0.00009405752,
+    "max_score": 0.040183373,
     "hits": [
       {
-        "_index": "my_index",
+        "_index": "demo-index-0",
         "_id": "1",
-        "_score": 0.00009405752,
+        "_score": 0.040183373,
         "_source": {
-          "passage_text": "hello world",
-          "passage_embedding": [
-            0.017304314,
-            -0.021530833,
-            0.050184276,
-            0.08962978,
-            ...]
+          "diary": "I hate you"
+        }
+      },
+      {
+        "_index": "demo-index-0",
+        "_id": "2",
+        "_score": 0.022628736,
+        "_source": {
+          "diary": "I love you"
+        }
+      },
+      {
+        "_index": "demo-index-0",
+        "_id": "3",
+        "_score": 0.0073115323,
+        "_source": {
+          "diary": "I dislike you"
         }
       }
     ]
+  },
+  "profile": {
+    "shards": []
   }
 }
-```
\ No newline at end of file
+```
+
+## Next steps
+
+- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/).
\ No newline at end of file
diff --git a/_search-plugins/search-pipelines/neural-query-enricher.md b/_search-plugins/search-pipelines/neural-query-enricher.md
index 683eaa7b85..a237f8841d 100644
--- a/_search-plugins/search-pipelines/neural-query-enricher.md
+++ b/_search-plugins/search-pipelines/neural-query-enricher.md
@@ -13,7 +13,7 @@ Introduced 2.11
 
 The `neural_query_enricher` search request processor is designed to set a default machine learning (ML) model ID at the index or field level for [neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/) queries. To learn more about ML models, see [Using ML models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/) and [Connecting to remote models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/).
 
-## Request fields
+## Request body fields
 
 The following table lists all available request fields.
diff --git a/_search-plugins/search-pipelines/neural-sparse-query-two-phase-processor.md b/_search-plugins/search-pipelines/neural-sparse-query-two-phase-processor.md
index 3ba1e21405..41119e643a 100644
--- a/_search-plugins/search-pipelines/neural-sparse-query-two-phase-processor.md
+++ b/_search-plugins/search-pipelines/neural-sparse-query-two-phase-processor.md
@@ -15,7 +15,7 @@ The `neural_sparse_two_phase_processor` search processor is designed to provide
 1. High-weight tokens score the documents and filter out the top documents.
 2. Low-weight tokens rescore the top documents.
 
-## Request fields
+## Request body fields
 
 The following table lists all available request fields.
diff --git a/_search-plugins/search-pipelines/normalization-processor.md b/_search-plugins/search-pipelines/normalization-processor.md index ac29b079f1..e70f815bdd 100644 --- a/_search-plugins/search-pipelines/normalization-processor.md +++ b/_search-plugins/search-pipelines/normalization-processor.md @@ -25,7 +25,7 @@ OpenSearch supports two search types: `query_then_fetch` and `dfs_query_then_fet When you send a search request to a node, the node becomes a _coordinating node_. During the first phase of search, the _query phase_, the coordinating node routes the search request to all shards in the index, including primary and replica shards. Each shard then runs the search query locally and returns metadata about the matching documents, which includes their document IDs and relevance scores. The `normalization-processor` then normalizes and combines scores from different query clauses. The coordinating node merges and sorts the local lists of results, compiling a global list of top documents that match the query. After that, search execution enters a _fetch phase_, in which the coordinating node requests the documents in the global list from the shards where they reside. Each shard returns the documents' `_source` to the coordinating node. Finally, the coordinating node sends a search response containing the results back to you. -## Request fields +## Request body fields The following table lists all available request fields. diff --git a/_search-plugins/search-pipelines/oversample-processor.md b/_search-plugins/search-pipelines/oversample-processor.md index 81f4252f3d..e881d5f32c 100644 --- a/_search-plugins/search-pipelines/oversample-processor.md +++ b/_search-plugins/search-pipelines/oversample-processor.md @@ -13,7 +13,7 @@ Introduced 2.12 The `oversample` request processor multiplies the `size` parameter of the search request by a specified `sample_factor` (>= 1.0), saving the original value in the `original_size` pipeline variable. The `oversample` processor is designed to work with the [`truncate_hits` response processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/) but may be used on its own. -## Request fields +## Request body fields The following table lists all request fields. diff --git a/_search-plugins/search-pipelines/personalize-search-ranking.md b/_search-plugins/search-pipelines/personalize-search-ranking.md index b63ba4b966..32ff251cae 100644 --- a/_search-plugins/search-pipelines/personalize-search-ranking.md +++ b/_search-plugins/search-pipelines/personalize-search-ranking.md @@ -16,7 +16,7 @@ The `personalize_search_ranking` search response processor intercepts a search r To use the `personalize_search_ranking` processor, you must first install the Amazon Personalize Search Ranking (`opensearch-search-processor`) plugin. For detailed instructions, see [Installing and configuring the Amazon Personalize Search Ranking plugin](https://docs.aws.amazon.com/personalize/latest/dg/opensearch-install.html). {: .important} -## Request fields +## Request body fields The following table lists all available request fields. 
diff --git a/_search-plugins/search-pipelines/rag-processor.md b/_search-plugins/search-pipelines/rag-processor.md index 60257ebd05..e9fca2e2c5 100644 --- a/_search-plugins/search-pipelines/rag-processor.md +++ b/_search-plugins/search-pipelines/rag-processor.md @@ -16,7 +16,7 @@ The `retrieval_augmented_generation` processor is a search results processor tha As of OpenSearch 2.12, the `retrieval_augmented_generation` processor supports only OpenAI and Amazon Bedrock models. {: .note} -## Request fields +## Request body fields The following table lists all available request fields. diff --git a/_search-plugins/search-pipelines/rename-field-processor.md b/_search-plugins/search-pipelines/rename-field-processor.md index 9c734af656..4715a6308a 100644 --- a/_search-plugins/search-pipelines/rename-field-processor.md +++ b/_search-plugins/search-pipelines/rename-field-processor.md @@ -13,7 +13,7 @@ Introduced 2.8 The `rename_field` search response processor intercepts a search response and renames the specified field. This is useful when your index and your application use different names for the same field. For example, if you rename a field in your index, the `rename_field` processor can change the new name to the old one before sending the response to your application. -## Request fields +## Request body fields The following table lists all available request fields. diff --git a/_search-plugins/search-pipelines/rerank-processor.md b/_search-plugins/search-pipelines/rerank-processor.md index 313ae5f74d..11691eff95 100644 --- a/_search-plugins/search-pipelines/rerank-processor.md +++ b/_search-plugins/search-pipelines/rerank-processor.md @@ -11,33 +11,49 @@ grand_parent: Search pipelines Introduced 2.12 {: .label .label-purple } -The `rerank` search request processor intercepts search results and passes them to a cross-encoder model to be reranked. The model reranks the results, taking into account the scoring context. Then the processor orders documents in the search results based on their new scores. +The `rerank` search response processor intercepts and reranks search results. The processor orders documents in the search results based on their new scores. -## Request fields +OpenSearch supports the following rerank types. + +Type | Description | Earliest available version +:--- | :--- | :--- +[`ml_opensearch`](#the-ml_opensearch-rerank-type) | Applies an OpenSearch-provided cross-encoder model. | 2.12 +[`by_field`](#the-by_field-rerank-type) | Applies reranking based on a user-provided field. | 2.18 + +## Request body fields The following table lists all available request fields. -Field | Data type | Description -:--- | :--- | :--- -`` | Object | The reranker type provides the rerank processor with static information needed across all reranking calls. Required. -`context` | Object | Provides the rerank processor with information necessary for generating reranking context at query time. -`tag` | String | The processor's identifier. Optional. -`description` | String | A description of the processor. Optional. -`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`` | Object | Required | The rerank type for document reranking. 
Valid values are `ml_opensearch` and `by_field`.
+`context` | Object | Required for the `ml_opensearch` rerank type. Optional and does not affect the results for the `by_field` rerank type. | Provides the `rerank` processor with information necessary for reranking at query time.
+`tag` | String | Optional | The processor's identifier.
+`description` | String | Optional | A description of the processor.
+`ignore_failure` | Boolean | Optional | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Default is `false`.
+
+
+## The ml_opensearch rerank type
+
+Introduced 2.12
+{: .label .label-purple }
 
-### The `ml_opensearch` reranker type
+To rerank results using a cross-encoder model, specify the `ml_opensearch` rerank type.
 
-The `ml_opensearch` reranker type is designed to work with the cross-encoder model provided by OpenSearch. For this reranker type, specify the following fields.
+### Prerequisite
+
+Before using the `ml_opensearch` rerank type, you must configure a cross-encoder model. For information about using an OpenSearch-provided model, see [Cross-encoder models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#cross-encoder-models). For information about using a custom model, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/).
+
+The `ml_opensearch` rerank type supports the following fields. All fields are required.
 
 Field | Data type | Description
 :--- | :--- | :---
-`ml_opensearch` | Object | Provides the rerank processor with model information. Required.
-`ml_opensearch.model_id` | String | The model ID for the cross-encoder model. Required. For more information, see [Using ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/).
-`context.document_fields` | Array | An array of document fields that specifies the fields from which to retrieve context for the cross-encoder model. Required.
+`ml_opensearch.model_id` | String | The model ID of the cross-encoder model for reranking. For more information, see [Using ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/).
+`context.document_fields` | Array | An array of document fields that specifies the fields from which to retrieve context for the cross-encoder model.
 
-## Example
+### Example
 
-The following example demonstrates using a search pipeline with a `rerank` processor.
+The following example demonstrates using a search pipeline with a `rerank` processor implemented using the `ml_opensearch` rerank type. For a complete example, see [Reranking using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/).
 
 ### Creating a search pipeline
 
@@ -108,11 +124,72 @@ POST /_search?search_pipeline=rerank_pipeline
 ```
 {% include copy-curl.html %}
 
-The `query_context` object contains the following fields.
+The `query_context` object contains the following fields. You must provide either `query_text` or `query_text_path` but cannot provide both simultaneously.
+
+Field name | Required/Optional | Description
+:--- | :--- | :---
+`query_text` | Exactly one of `query_text` or `query_text_path` is required. | The natural language text of the question that you want to use to rerank the search results.
+`query_text_path` | Exactly one of `query_text` or `query_text_path` is required.
| The full JSON path to the text of the question that you want to use to rerank the search results. The maximum number of characters allowed in the path is `1000`. + + + +## The by_field rerank type + +Introduced 2.18 +{: .label .label-purple } + +To rerank results by a document field, specify the `by_field` rerank type. + +The `by_field` object supports the following fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`target_field` | String | Required | Specifies the field name or a dot path to the field containing the score to use for reranking. +`remove_target_field` | Boolean | Optional | If `true`, the response does not include the `target_field` used to perform reranking. Default is `false`. +`keep_previous_score` | Boolean | Optional | If `true`, the response includes a `previous_score` field, which contains the score calculated before reranking and can be useful when debugging. Default is `false`. + +### Example + +The following example demonstrates using a search pipeline with a `rerank` processor implemented using the `by_field` rerank type. For a complete example, see [Reranking by a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). + +### Creating a search pipeline + +The following request creates a search pipeline with a `by_field` rerank type response processor that ranks the documents by the `reviews.stars` field and specifies to return the original document score: + +```json +PUT /_search/pipeline/rerank_byfield_pipeline +{ + "response_processors": [ + { + "rerank": { + "by_field": { + "target_field": "reviews.stars", + "keep_previous_score" : true + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using the search pipeline + +To apply the search pipeline to a query, provide the search pipeline name in the query parameter: + +```json +POST /book-index/_search?search_pipeline=rerank_byfield_pipeline +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} -Field name | Description -:--- | :--- -`query_text` | The natural language text of the question that you want to use to rerank the search results. Either `query_text` or `query_text_path` (not both) is required. -`query_text_path` | The full JSON path to the text of the question that you want to use to rerank the search results. Either `query_text` or `query_text_path` (not both) is required. The maximum number of characters in the path is `1000`. +## Next steps -For more information about setting up reranking, see [Reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/). \ No newline at end of file +- Learn more about [reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/). +- See a complete example of [reranking using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/). +- See a complete example of [reranking by a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). +- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). 
\ No newline at end of file diff --git a/_search-plugins/search-pipelines/script-processor.md b/_search-plugins/search-pipelines/script-processor.md index 1fd1d08e57..d927348d33 100644 --- a/_search-plugins/search-pipelines/script-processor.md +++ b/_search-plugins/search-pipelines/script-processor.md @@ -26,7 +26,7 @@ The `script` search request processor intercepts a search request and adds an in For request field definitions, see [search request fields]({{site.url}}{{site.baseurl}}/api-reference/search#request-body). -## Request fields +## Request body fields The following table lists all available request fields. diff --git a/_search-plugins/search-pipelines/sort-processor.md b/_search-plugins/search-pipelines/sort-processor.md index 6df2352c1e..1b4a28d2a9 100644 --- a/_search-plugins/search-pipelines/sort-processor.md +++ b/_search-plugins/search-pipelines/sort-processor.md @@ -13,7 +13,7 @@ Introduced 2.16 The `sort` processor sorts an array of items in either ascending or descending order. Numeric arrays are sorted numerically, while string or mixed arrays (strings and numbers) are sorted lexicographically. The processor throws an error if the input is not an array. -## Request fields +## Request body fields The following table lists all available request fields. diff --git a/_search-plugins/search-pipelines/split-processor.md b/_search-plugins/search-pipelines/split-processor.md index c524386262..0ffcc6fea9 100644 --- a/_search-plugins/search-pipelines/split-processor.md +++ b/_search-plugins/search-pipelines/split-processor.md @@ -13,7 +13,7 @@ Introduced 2.17 The `split` processor splits a string field into an array of substrings based on a specified delimiter. -## Request fields +## Request body fields The following table lists all available request fields. diff --git a/_search-plugins/search-pipelines/truncate-hits-processor.md b/_search-plugins/search-pipelines/truncate-hits-processor.md index 7bba627734..12426f3890 100644 --- a/_search-plugins/search-pipelines/truncate-hits-processor.md +++ b/_search-plugins/search-pipelines/truncate-hits-processor.md @@ -22,7 +22,7 @@ The following is a common usage pattern: 1. In the response pipeline, apply a reranking processor (which may promote results from beyond the originally requested top N) or the `collapse` processor (which may discard results after deduplication). 1. Apply the `truncate` processor to return (at most) the originally requested number of hits. -## Request fields +## Request body fields The following table lists all request fields. diff --git a/_search-plugins/search-pipelines/using-search-pipeline.md b/_search-plugins/search-pipelines/using-search-pipeline.md index ecb988ad11..b6dbbdc5d0 100644 --- a/_search-plugins/search-pipelines/using-search-pipeline.md +++ b/_search-plugins/search-pipelines/using-search-pipeline.md @@ -17,14 +17,45 @@ You can use a search pipeline in the following ways: ## Specifying an existing search pipeline for a request -After you [create a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/), you can use the pipeline with a query by specifying the pipeline name in the `search_pipeline` query parameter: +After you [create a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/), you can use the pipeline with a query in the following ways. 
For a complete example of using a search pipeline with a `filter_query` processor, see [`filter_query` processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor#example). + +### Specifying the pipeline in a query parameter + +You can specify the pipeline name in the `search_pipeline` query parameter as follows: ```json GET /my_index/_search?search_pipeline=my_pipeline ``` {% include copy-curl.html %} -For a complete example of using a search pipeline with a `filter_query` processor, see [`filter_query` processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor#example). +### Specifying the pipeline in the request body + +You can provide a search pipeline ID in the search request body as follows: + +```json +GET /my-index/_search +{ + "query": { + "match_all": {} + }, + "from": 0, + "size": 10, + "search_pipeline": "my_pipeline" +} +``` +{% include copy-curl.html %} + +For multi-search, you can provide a search pipeline ID in the search request body as follows: + +```json +GET /_msearch +{ "index": "test"} +{ "query": { "match_all": {} }, "from": 0, "size": 10, "search_pipeline": "my_pipeline"} +{ "index": "test-1", "search_type": "dfs_query_then_fetch"} +{ "query": { "match_all": {} }, "search_pipeline": "my_pipeline1" } + +``` +{% include copy-curl.html %} ## Using a temporary search pipeline for a request diff --git a/_search-plugins/search-relevance/rerank-by-field-cross-encoder.md b/_search-plugins/search-relevance/rerank-by-field-cross-encoder.md new file mode 100644 index 0000000000..7f30689491 --- /dev/null +++ b/_search-plugins/search-relevance/rerank-by-field-cross-encoder.md @@ -0,0 +1,276 @@ +--- +layout: default +title: Reranking by a field using a cross-encoder +parent: Reranking search results +grand_parent: Search relevance +has_children: false +nav_order: 30 +--- + +# Reranking by a field using an externally hosted cross-encoder model +Introduced 2.18 +{: .label .label-purple } + +In this tutorial, you'll learn how to use a cross-encoder model hosted on Amazon SageMaker to rerank search results and improve search relevance. + +To rerank documents, you'll configure a search pipeline that processes search results at query time. The pipeline intercepts search results and passes them to the [`ml_inference` search response processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-response/), which invokes the cross-encoder model. The model generates scores used to rerank the matching documents [`by_field`]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). + +## Prerequisite: Deploy a model on Amazon SageMaker + +Run the following code to deploy a model on Amazon SageMaker. For this example, you'll use the [`ms-marco-MiniLM-L-6-v2`](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) Hugging Face cross-encoder model hosted on Amazon SageMaker. 
We recommend using a GPU for better performance: + +```python +import sagemaker +import boto3 +from sagemaker.huggingface import HuggingFaceModel + +sess = sagemaker.Session() +role = sagemaker.get_execution_role() + +hub = { + 'HF_MODEL_ID':'cross-encoder/ms-marco-MiniLM-L-6-v2', + 'HF_TASK':'text-classification' +} +huggingface_model = HuggingFaceModel( + transformers_version='4.37.0', + pytorch_version='2.1.0', + py_version='py310', + env=hub, + role=role, +) +predictor = huggingface_model.deploy( + initial_instance_count=1, # number of instances + instance_type='ml.m5.xlarge' # ec2 instance type +) +``` +{% include copy.html %} + +After deploying the model, you can find the model endpoint by going to the Amazon SageMaker console in the AWS Management Console and selecting **Inference > Endpoints** on the left tab. Note the URL for the created model; you'll use it to create a connector. + +## Running a search with reranking + +To run a search with reranking, follow these steps: + +1. [Create a connector](#step-1-create-a-connector). +1. [Register the model](#step-2-register-the-model). +1. [Ingest documents into an index](#step-3-ingest-documents-into-an-index). +1. [Create a search pipeline](#step-4-create-a-search-pipeline). +1. [Search using reranking](#step-5-search-using-reranking). + +## Step 1: Create a connector + +Create a connector to the cross-encoder model by providing the model URL in the `actions.url` parameter: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "SageMaker cross-encoder model", + "description": "Test connector for SageMaker cross-encoder hosted model", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "parameters": { + "region": "", + "service_name": "sagemaker" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "", + "headers": { + "content-type": "application/json" + }, + "request_body": "{ \"inputs\": { \"text\": \"${parameters.text}\", \"text_pair\": \"${parameters.text_pair}\" }}" + } + ] +} +``` +{% include copy-curl.html %} + +Note the connector ID contained in the response; you'll use it in the following step. + +## Step 2: Register the model + +To register the model, provide the connector ID in the `connector_id` parameter: + +```json +POST /_plugins/_ml/models/_register +{ + "name": "Cross encoder model", + "version": "1.0.1", + "function_name": "remote", + "description": "Using a SageMaker endpoint to apply a cross encoder model", + "connector_id": "" +} +``` +{% include copy-curl.html %} + + +## Step 3: Ingest documents into an index + +Create an index and ingest sample documents containing facts about the New York City boroughs: + +```json +POST /nyc_areas/_bulk +{ "index": { "_id": 1 } } +{ "borough": "Queens", "area_name": "Astoria", "description": "Astoria is a neighborhood in the western part of Queens, New York City, known for its diverse community and vibrant cultural scene.", "population": 93000, "facts": "Astoria is home to many artists and has a large Greek-American community. The area also boasts some of the best Mediterranean food in NYC." } +{ "index": { "_id": 2 } } +{ "borough": "Queens", "area_name": "Flushing", "description": "Flushing is a neighborhood in the northern part of Queens, famous for its Asian-American population and bustling business district.", "population": 227000, "facts": "Flushing is one of the most ethnically diverse neighborhoods in NYC, with a large Chinese and Korean population. 
It is also home to the USTA Billie Jean King National Tennis Center." } +{ "index": { "_id": 3 } } +{ "borough": "Brooklyn", "area_name": "Williamsburg", "description": "Williamsburg is a trendy neighborhood in Brooklyn known for its hipster culture, vibrant art scene, and excellent restaurants.", "population": 150000, "facts": "Williamsburg is a hotspot for young professionals and artists. The neighborhood has seen rapid gentrification over the past two decades." } +{ "index": { "_id": 4 } } +{ "borough": "Manhattan", "area_name": "Harlem", "description": "Harlem is a historic neighborhood in Upper Manhattan, known for its significant African-American cultural heritage.", "population": 116000, "facts": "Harlem was the birthplace of the Harlem Renaissance, a cultural movement that celebrated Black culture through art, music, and literature." } +{ "index": { "_id": 5 } } +{ "borough": "The Bronx", "area_name": "Riverdale", "description": "Riverdale is a suburban-like neighborhood in the Bronx, known for its leafy streets and affluent residential areas.", "population": 48000, "facts": "Riverdale is one of the most affluent areas in the Bronx, with beautiful parks, historic homes, and excellent schools." } +{ "index": { "_id": 6 } } +{ "borough": "Staten Island", "area_name": "St. George", "description": "St. George is the main commercial and cultural center of Staten Island, offering stunning views of Lower Manhattan.", "population": 15000, "facts": "St. George is home to the Staten Island Ferry terminal and is a gateway to Staten Island, offering stunning views of the Statue of Liberty and Ellis Island." } +``` +{% include copy-curl.html %} + +## Step 4: Create a search pipeline + +Next, create a search pipeline for reranking. In the search pipeline configuration, the `input_map` and `output_map` define how the input data is prepared for the cross-encoder model and how the model's output is interpreted for reranking: + +- The `input_map` specifies which fields in the search documents and the query should be used as model inputs: + - The `text` field maps to the `facts` field in the indexed documents. It provides the document-specific content that the model will analyze. + - The `text_pair` field dynamically retrieves the search query text (`multi_match.query`) from the search request. + + The combination of `text` (document `facts`) and `text_pair` (search `query`) allows the cross-encoder model to compare the relevance of the document to the query, considering their semantic relationship. + +- The `output_map` field specifies how the output of the model is mapped to the fields in the response: + - The `rank_score` field in the response will store the model's relevance score, which will be used to perform reranking. + +When using the `by_field` rerank type, the `rank_score` field will contain the same score as the `_score` field. To remove the `rank_score` field from the search results, set `remove_target_field` to `true`. The original BM25 score, before reranking, is included for debugging purposes by setting `keep_previous_score` to `true`. This allows you to compare the original score with the reranked score to evaluate improvements in search relevance. 
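+
+To illustrate how these mappings are resolved at query time, consider the Harlem document and the search query used later in this tutorial (`"artists art creative community"`). The processor substitutes the document's `facts` field and the query text into the connector's `request_body` template, producing a SageMaker payload roughly equivalent to the following (shown only as a sketch; the processor builds and sends this request internally):
+
+```json
+{
+  "inputs": {
+    "text": "Harlem was the birthplace of the Harlem Renaissance, a cultural movement that celebrated Black culture through art, music, and literature.",
+    "text_pair": "artists art creative community"
+  }
+}
+```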
+ +To create the search pipeline, send the following request: + +```json +PUT /_search/pipeline/my_pipeline +{ + "response_processors": [ + { + "ml_inference": { + "tag": "ml_inference", + "description": "This processor runs ml inference during search response", + "model_id": "", + "function_name": "REMOTE", + "input_map": [ + { + "text": "facts", + "text_pair":"$._request.query.multi_match.query" + } + ], + "output_map": [ + { + "rank_score": "$.score" + } + ], + "full_response_path": false, + "model_config": {}, + "ignore_missing": false, + "ignore_failure": false, + "one_to_one": true + }, + + "rerank": { + "by_field": { + "target_field": "rank_score", + "remove_target_field": true, + "keep_previous_score" : true + } + } + + } + ] +} +``` +{% include copy-curl.html %} + +## Step 5: Search using reranking + +Use the following request to search indexed documents and rerank them using the cross-encoder model. The request retrieves documents containing any of the specified terms in the `description` or `facts` fields. These terms are then used to compare and rerank the matched documents: + +```json +POST /nyc_areas/_search?search_pipeline=my_pipeline +{ + "query": { + "multi_match": { + "query": "artists art creative community", + "fields": ["description", "facts"] + } + } +} +``` +{% include copy-curl.html %} + +In the response, the `previous_score` field contains the document's BM25 score, which it would have received if you hadn't applied the pipeline. Note that while BM25 ranked "Astoria" the highest, the cross-encoder model prioritized "Harlem" because it matched more search terms: + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 0.03418137, + "hits": [ + { + "_index": "nyc_areas", + "_id": "4", + "_score": 0.03418137, + "_source": { + "area_name": "Harlem", + "description": "Harlem is a historic neighborhood in Upper Manhattan, known for its significant African-American cultural heritage.", + "previous_score": 1.6489418, + "borough": "Manhattan", + "facts": "Harlem was the birthplace of the Harlem Renaissance, a cultural movement that celebrated Black culture through art, music, and literature.", + "population": 116000 + } + }, + { + "_index": "nyc_areas", + "_id": "1", + "_score": 0.0090838, + "_source": { + "area_name": "Astoria", + "description": "Astoria is a neighborhood in the western part of Queens, New York City, known for its diverse community and vibrant cultural scene.", + "previous_score": 2.519608, + "borough": "Queens", + "facts": "Astoria is home to many artists and has a large Greek-American community. The area also boasts some of the best Mediterranean food in NYC.", + "population": 93000 + } + }, + { + "_index": "nyc_areas", + "_id": "3", + "_score": 0.0032599436, + "_source": { + "area_name": "Williamsburg", + "description": "Williamsburg is a trendy neighborhood in Brooklyn known for its hipster culture, vibrant art scene, and excellent restaurants.", + "previous_score": 1.5632852, + "borough": "Brooklyn", + "facts": "Williamsburg is a hotspot for young professionals and artists. 
The neighborhood has seen rapid gentrification over the past two decades.", + "population": 150000 + } + } + ] + }, + "profile": { + "shards": [] + } +} +``` + \ No newline at end of file diff --git a/_search-plugins/search-relevance/rerank-by-field.md b/_search-plugins/search-relevance/rerank-by-field.md new file mode 100644 index 0000000000..e6f65a4d25 --- /dev/null +++ b/_search-plugins/search-relevance/rerank-by-field.md @@ -0,0 +1,209 @@ +--- +layout: default +title: Reranking by a field +parent: Reranking search results +grand_parent: Search relevance +has_children: false +nav_order: 20 +--- + +# Reranking search results by a field +Introduced 2.18 +{: .label .label-purple } + +You can use a `by_field` rerank type to rerank search results by a document field. Reranking search results by a field is useful if a model has already run and produced a numerical score for your documents or if a previous search response processor was applied and you want to rerank documents differently based on an aggregated field. + +To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) to them. The `rerank` processor evaluates the search results and sorts them based on the new scores obtained from a document field. + +## Running a search with reranking + +To run a search with reranking, follow these steps: + +1. [Configure a search pipeline](#step-1-configure-a-search-pipeline). +1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). +1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). +1. [Search using reranking](#step-4-search-using-reranking). + +## Step 1: Configure a search pipeline + +Configure a search pipeline with a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) and specify the `by_field` rerank type. The pipeline sorts by the `reviews.stars` field (specified by a complete dot path to the field) and returns the original query scores for all documents along with their new scores: + +```json +PUT /_search/pipeline/rerank_byfield_pipeline +{ + "response_processors": [ + { + "rerank": { + "by_field": { + "target_field": "reviews.stars", + "keep_previous_score" : true + } + } + } + ] +} +``` +{% include copy-curl.html %} + +For more information about the request fields, see [Request fields]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#request-body-fields). 
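+
+In the next step, this pipeline is set as the default pipeline for the index. If you prefer to leave the index settings unchanged, you can instead apply the pipeline to individual requests (after the index is created) by passing its name in the `search_pipeline` query parameter, for example:
+
+```json
+POST /book-index/_search?search_pipeline=rerank_byfield_pipeline
+{
+  "query": {
+    "match_all": {}
+  }
+}
+```
+{% include copy-curl.html %}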
+ +## Step 2: Create an index for ingestion + +In order to use the `rerank` processor defined in your pipeline, create an OpenSearch index and add the pipeline created in the previous step as the default pipeline: + +```json +PUT /book-index +{ + "settings": { + "index.search.default_pipeline" : "rerank_byfield_pipeline" + }, + "mappings": { + "properties": { + "title": { + "type": "text" + }, + "author": { + "type": "text" + }, + "genre": { + "type": "keyword" + }, + "reviews": { + "properties": { + "stars": { + "type": "float" + } + } + }, + "description": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Step 3: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following bulk request: + +```json +POST /_bulk +{ "index": { "_index": "book-index", "_id": "1" } } +{ "title": "The Lost City", "author": "Jane Doe", "genre": "Adventure Fiction", "reviews": { "stars": 4.2 }, "description": "An exhilarating journey through a hidden civilization in the Amazon rainforest." } +{ "index": { "_index": "book-index", "_id": "2" } } +{ "title": "Whispers of the Past", "author": "John Smith", "genre": "Historical Mystery", "reviews": { "stars": 4.7 }, "description": "A gripping tale set in Victorian England, unraveling a century-old mystery." } +{ "index": { "_index": "book-index", "_id": "3" } } +{ "title": "Starlit Dreams", "author": "Emily Clark", "genre": "Science Fiction", "reviews": { "stars": 4.5 }, "description": "In a future where dreams can be shared, one girl discovers her imaginations power." } +{ "index": { "_index": "book-index", "_id": "4" } } +{ "title": "The Enchanted Garden", "author": "Alice Green", "genre": "Fantasy", "reviews": { "stars": 4.8 }, "description": "A magical garden holds the key to a young girls destiny and friendship." } + +``` +{% include copy-curl.html %} + +## Step 4: Search using reranking + +As an example, run a `match_all` query on your index: + +```json +POST /book-index/_search +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} + +The response contains documents sorted in descending order based on the `reviews.stars` field. 
Each document contains the original query score in the `previous_score` field: + +```json +{ + "took": 33, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 4.8, + "hits": [ + { + "_index": "book-index", + "_id": "4", + "_score": 4.8, + "_source": { + "reviews": { + "stars": 4.8 + }, + "author": "Alice Green", + "genre": "Fantasy", + "description": "A magical garden holds the key to a young girls destiny and friendship.", + "previous_score": 1, + "title": "The Enchanted Garden" + } + }, + { + "_index": "book-index", + "_id": "2", + "_score": 4.7, + "_source": { + "reviews": { + "stars": 4.7 + }, + "author": "John Smith", + "genre": "Historical Mystery", + "description": "A gripping tale set in Victorian England, unraveling a century-old mystery.", + "previous_score": 1, + "title": "Whispers of the Past" + } + }, + { + "_index": "book-index", + "_id": "3", + "_score": 4.5, + "_source": { + "reviews": { + "stars": 4.5 + }, + "author": "Emily Clark", + "genre": "Science Fiction", + "description": "In a future where dreams can be shared, one girl discovers her imaginations power.", + "previous_score": 1, + "title": "Starlit Dreams" + } + }, + { + "_index": "book-index", + "_id": "1", + "_score": 4.2, + "_source": { + "reviews": { + "stars": 4.2 + }, + "author": "Jane Doe", + "genre": "Adventure Fiction", + "description": "An exhilarating journey through a hidden civilization in the Amazon rainforest.", + "previous_score": 1, + "title": "The Lost City" + } + } + ] + }, + "profile": { + "shards": [] + } +} +``` + +## Next steps + +- Learn more about the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). +- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). \ No newline at end of file diff --git a/_search-plugins/search-relevance/rerank-cross-encoder.md b/_search-plugins/search-relevance/rerank-cross-encoder.md new file mode 100644 index 0000000000..64f93c886c --- /dev/null +++ b/_search-plugins/search-relevance/rerank-cross-encoder.md @@ -0,0 +1,122 @@ +--- +layout: default +title: Reranking using a cross-encoder model +parent: Reranking search results +grand_parent: Search relevance +has_children: false +nav_order: 10 +--- + +# Reranking search results using a cross-encoder model +Introduced 2.12 +{: .label .label-purple } + +You can rerank search results using a cross-encoder model in order to improve search relevance. To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) to them. The `rerank` processor evaluates the search results and sorts them based on the new scores provided by the cross-encoder model. + +**PREREQUISITE**
+Before configuring a reranking pipeline, you must set up a cross-encoder model. For information about using an OpenSearch-provided model, see [Cross-encoder models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#cross-encoder-models). For information about using a custom model, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). +{: .note} + +## Running a search with reranking + +To run a search with reranking, follow these steps: + +1. [Configure a search pipeline](#step-1-configure-a-search-pipeline). +1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). +1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). +1. [Search using reranking](#step-4-search-using-reranking). + +## Step 1: Configure a search pipeline + +Next, configure a search pipeline with a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) and specify the `ml_opensearch` rerank type. In the request, provide a model ID for the cross-encoder model and the document fields to use as context: + +```json +PUT /_search/pipeline/my_pipeline +{ + "description": "Pipeline for reranking with a cross-encoder", + "response_processors": [ + { + "rerank": { + "ml_opensearch": { + "model_id": "gnDIbI0BfUsSoeNT_jAw" + }, + "context": { + "document_fields": [ + "passage_text" + ] + } + } + } + ] +} +``` +{% include copy-curl.html %} + +For more information about the request fields, see [Request fields]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#request-body-fields). + +## Step 2: Create an index for ingestion + +In order to use the `rerank` processor defined in your pipeline, create an OpenSearch index and add the pipeline created in the previous step as the default pipeline: + +```json +PUT /my-index +{ + "settings": { + "index.search.default_pipeline" : "my_pipeline" + }, + "mappings": { + "properties": { + "passage_text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Step 3: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following bulk request: + +```json +POST /_bulk +{ "index": { "_index": "my-index" } } +{ "passage_text" : "I said welcome to them and we entered the house" } +{ "index": { "_index": "my-index" } } +{ "passage_text" : "I feel welcomed in their family" } +{ "index": { "_index": "my-index" } } +{ "passage_text" : "Welcoming gifts are great" } + +``` +{% include copy-curl.html %} + +## Step 4: Search using reranking + +To perform a reranking search on your index, use any OpenSearch query and provide an additional `ext.rerank` field: + +```json +POST /my-index/_search +{ + "query": { + "match": { + "passage_text": "how to welcome in family" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text": "how to welcome in family" + } + } + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can provide the full path to the field containing the context. For more information, see [Rerank processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#example). + +## Next steps + +- Learn more about the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). 
+- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). \ No newline at end of file diff --git a/_search-plugins/search-relevance/reranking-search-results.md b/_search-plugins/search-relevance/reranking-search-results.md index 4b4deaeb92..065e069b5a 100644 --- a/_search-plugins/search-relevance/reranking-search-results.md +++ b/_search-plugins/search-relevance/reranking-search-results.md @@ -2,7 +2,7 @@ layout: default title: Reranking search results parent: Search relevance -has_children: false +has_children: true nav_order: 60 --- @@ -10,112 +10,12 @@ nav_order: 60 Introduced 2.12 {: .label .label-purple } -You can rerank search results using a cross-encoder reranker in order to improve search relevance. To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) to them. The `rerank` processor evaluates the search results and sorts them based on the new scores provided by the cross-encoder model. +You can rerank search results using a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) in order to improve search relevance. To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the `rerank` processor to them. The `rerank` processor evaluates the search results and sorts them based on the new scores. -**PREREQUISITE**
-Before configuring a reranking pipeline, you must set up a cross-encoder model. For information about using an OpenSearch-provided model, see [Cross-encoder models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#cross-encoder-models). For information about using a custom model, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). -{: .note} - -## Running a search with reranking - -To run a search with reranking, follow these steps: - -1. [Configure a search pipeline](#step-1-configure-a-search-pipeline). -1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). -1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). -1. [Search using reranking](#step-4-search-using-reranking). - -## Step 1: Configure a search pipeline - -Next, configure a search pipeline with a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). - -The following example request creates a search pipeline with an `ml_opensearch` rerank processor. In the request, provide a model ID for the cross-encoder model and the document fields to use as context: - -```json -PUT /_search/pipeline/my_pipeline -{ - "description": "Pipeline for reranking with a cross-encoder", - "response_processors": [ - { - "rerank": { - "ml_opensearch": { - "model_id": "gnDIbI0BfUsSoeNT_jAw" - }, - "context": { - "document_fields": [ - "passage_text" - ] - } - } - } - ] -} -``` -{% include copy-curl.html %} - -For more information about the request fields, see [Request fields]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#request-fields). - -## Step 2: Create an index for ingestion - -In order to use the rerank processor defined in your pipeline, create an OpenSearch index and add the pipeline created in the previous step as the default pipeline: +You can rerank results in the following ways: -```json -PUT /my-index -{ - "settings": { - "index.search.default_pipeline" : "my_pipeline" - }, - "mappings": { - "properties": { - "passage_text": { - "type": "text" - } - } - } -} -``` -{% include copy-curl.html %} - -## Step 3: Ingest documents into the index - -To ingest documents into the index created in the previous step, send the following bulk request: - -```json -POST /_bulk -{ "index": { "_index": "my-index" } } -{ "passage_text" : "I said welcome to them and we entered the house" } -{ "index": { "_index": "my-index" } } -{ "passage_text" : "I feel welcomed in their family" } -{ "index": { "_index": "my-index" } } -{ "passage_text" : "Welcoming gifts are great" } - -``` -{% include copy-curl.html %} - -## Step 4: Search using reranking - -To perform reranking search on your index, use any OpenSearch query and provide an additional `ext.rerank` field: - -```json -POST /my-index/_search -{ - "query": { - "match": { - "passage_text": "how to welcome in family" - } - }, - "ext": { - "rerank": { - "query_context": { - "query_text": "how to welcome in family" - } - } - } -} -``` -{% include copy-curl.html %} - -Alternatively, you can provide the full path to the field containing the context. For more information, see [Rerank processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#example). 
+- [Using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/) +- [By a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/) ## Using rerank and normalization processors together @@ -130,4 +30,10 @@ The processing order is as follows: This processing order has the following implications: - Score modification: The rerank processor modifies the scores that were initially adjusted by the normalization processor, potentially leading to different ranking results than initially expected. -- Hybrid queries: In the context of hybrid queries, where multiple types of queries and scoring mechanisms are combined, this behavior is particularly noteworthy. The combined scores from the initial query are normalized first and then reranked, resulting in a two-stage scoring modification. \ No newline at end of file +- Hybrid queries: In the context of hybrid queries, where multiple types of queries and scoring mechanisms are combined, this behavior is particularly noteworthy. The combined scores from the initial query are normalized first and then reranked, resulting in a two-phase scoring modification. + +## Next steps + +- See a complete example of [reranking using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/). +- See a complete example of [reranking by a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). +- Learn more about the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). \ No newline at end of file diff --git a/_search-plugins/search-relevance/stats-api.md b/_search-plugins/search-relevance/stats-api.md index 3be40c70f6..d5bd286187 100644 --- a/_search-plugins/search-relevance/stats-api.md +++ b/_search-plugins/search-relevance/stats-api.md @@ -83,7 +83,7 @@ The following is the response for the preceding request: } ``` -## Response fields +## Response body fields The following table lists all response fields. diff --git a/_search-plugins/searching-data/index.md b/_search-plugins/searching-data/index.md index 279958d97c..42ce7654a0 100644 --- a/_search-plugins/searching-data/index.md +++ b/_search-plugins/searching-data/index.md @@ -19,4 +19,4 @@ Feature | Description [Sort results]({{site.url}}{{site.baseurl}}/opensearch/search/sort/) | Allow sorting of results by different criteria. [Highlight query matches]({{site.url}}{{site.baseurl}}/opensearch/search/highlight/) | Highlight the search term in the results. [Retrieve inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/) | Retrieve underlying hits in nested and parent-join objects. 
-[Retrieve specific fields]({{site.url}}{{site.baseurl}}search-plugins/searching-data/retrieve-specific-fields/) | Retrieve only the specific fields +[Retrieve specific fields]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/retrieve-specific-fields/) | Retrieve only the specific fields diff --git a/_search-plugins/searching-data/inner-hits.md b/_search-plugins/searching-data/inner-hits.md index 395e9e748a..38fc7a491d 100644 --- a/_search-plugins/searching-data/inner-hits.md +++ b/_search-plugins/searching-data/inner-hits.md @@ -139,8 +139,8 @@ The preceding query searches for nested user objects containing the name John an } } ``` -## Inner hits with parent-child objects -Parent-join relationships allow you to create relationships between documents of different types within the same index. The following example request searches with `inner_hits` using parent-child objects. +## Inner hits with parent/child objects +Parent-join relationships allow you to create relationships between documents of different types within the same index. The following example request searches with `inner_hits` using parent/child objects. 1. Create an index with a parent-join field: @@ -806,4 +806,8 @@ The following is the expected result: Using `inner_hits` provides contextual relevance by showing exactly which nested or child documents match the query criteria. This is crucial for applications in which the relevance of results depends on a specific part of the document that matches the query. - Example use case: In a customer support system, you have tickets as parent documents and comments or updates as nested or child documents. You can determine which specific comment matches the search in order to better understand the context of the ticket search. \ No newline at end of file + Example use case: In a customer support system, you have tickets as parent documents and comments or updates as nested or child documents. You can determine which specific comment matches the search in order to better understand the context of the ticket search. + +## Next steps + +- Learn about [joining queries]({{site.url}}{{site.baseurl}}/query-dsl/joining/) on [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) or [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) fields. \ No newline at end of file diff --git a/_search-plugins/searching-data/point-in-time-api.md b/_search-plugins/searching-data/point-in-time-api.md index d386d138cc..6a3d867052 100644 --- a/_search-plugins/searching-data/point-in-time-api.md +++ b/_search-plugins/searching-data/point-in-time-api.md @@ -71,7 +71,7 @@ POST /my-index-1/_search/point_in_time?keep_alive=100m } ``` -### Response fields +### Response body fields Field | Data type | Description :--- | :--- | :--- @@ -142,7 +142,7 @@ GET /_search/point_in_time/_all } ``` -### Response fields +### Response body fields Field | Data type | Description :--- | :--- | :--- @@ -176,7 +176,7 @@ DELETE /_search/point_in_time/_all If you want to delete one or several PITs, specify their PIT IDs in the request body. 
-### Request fields +### Request body fields Field | Data type | Description :--- | :--- | :--- @@ -214,7 +214,7 @@ For each PIT, the response contains a JSON object with a PIT ID and a `successfu } ``` -### Response fields +### Response body fields Field | Data type | Description :--- | :--- | :--- @@ -235,7 +235,7 @@ GET /_cat/pit_segments/_all If you want to list segments for one or several PITs, specify their PIT IDs in the request body. -### Request fields +### Request body fields Field | Data type | Description :--- | :--- | :--- diff --git a/_search-plugins/searching-data/point-in-time.md b/_search-plugins/searching-data/point-in-time.md index ee09354c0f..e1ecd350b1 100644 --- a/_search-plugins/searching-data/point-in-time.md +++ b/_search-plugins/searching-data/point-in-time.md @@ -37,6 +37,10 @@ The create PIT operation returns a PIT ID, which you can use to run multiple que In case of a cluster or node failure, all PIT data is lost. {: .note} +### PIT in SQL + +The [SQL plugin]({{site.url}}{{site.baseurl}}/search-plugins/sql/index/) also supports pagination using PIT. When the `plugin.sql.pagination.api` setting is enabled (the default), SQL search queries in OpenSearch automatically use PIT internally. For more information, see [Pagination in SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/sql-ppl-api/#paginating-results). + ## Pagination with PIT and search_after When you run a query with a PIT ID, you can use the `search_after` parameter to retrieve the next page of results. This gives you control over the order of documents in the pages of results. diff --git a/_search-plugins/sql/limitation.md b/_search-plugins/sql/limitation.md index ac4a6ed619..3382a41912 100644 --- a/_search-plugins/sql/limitation.md +++ b/_search-plugins/sql/limitation.md @@ -94,7 +94,7 @@ Such queries are successfully executed by the `V2` engine unless they have `V1`- * `json` formatted output is supported in `V1` engine only. * The `V2` engine does not track query execution time, so slow queries are not reported. * The `V2` query engine not only runs queries in the OpenSearch engine but also supports post-processing for complex queries. Accordingly, the `explain` output is no longer OpenSearch domain-specific language (DSL) but also includes query plan information from the `V2` query engine. -Suggested change * The `V2` query engine does not support aggregation queries such as `histogram`, `date_histogram`, `percentiles`, `topHits`, `stats`, `extended_stats`, `terms`, or `range`. * JOINs and sub-queries are not supported. To stay up to date on the development for JOINs and sub-queries, track [GitHub issue #1441](https://github.com/opensearch-project/sql/issues/1441) and [GitHub issue #892](https://github.com/opensearch-project/sql/issues/892). -* PartiQL syntax for `nested` queries are not supported. Additionally, arrays of objects and primitive types return the first index of the array, while in `V1` they return the entire array as a JSON object. +* OpenSearch does not natively support the array data type but does allow multi-value fields implicitly. The SQL/PPL plugin adheres strictly to the data type semantics defined in index mappings. When parsing OpenSearch responses, it expects data to match the declared type and does not interpret all data in an array. 
If the [`plugins.query.field_type_tolerance`](https://github.com/opensearch-project/sql/blob/main/docs/user/admin/settings.rst#plugins-query-field-type-tolerance) setting is enabled, the SQL/PPL plugin handles array datasets by returning scalar data types, allowing basic queries (for example, `SELECT * FROM tbl WHERE condition`). However, using multi-value fields in expressions or functions will result in exceptions. If this setting is disabled or not set, only the first element of an array is returned, preserving the default behavior. +* PartiQL syntax for `nested` queries is not supported. diff --git a/_search-plugins/sql/settings.md b/_search-plugins/sql/settings.md index 4842f98449..28d8c05da3 100644 --- a/_search-plugins/sql/settings.md +++ b/_search-plugins/sql/settings.md @@ -79,6 +79,7 @@ Setting | Default | Description `plugins.query.memory_limit` | 85% | Configures the heap memory usage limit for the circuit breaker of the query engine. `plugins.query.size_limit` | 200 | Sets the default size of index that the query engine fetches from OpenSearch. `plugins.query.datasources.enabled` | true | Change to `false` to disable support for data sources in the plugin. +`plugins.query.field_type_tolerance` | true | If `false`, then an array is reduced to the first non-array value at any nesting level. For example, `[[1, 2], [3, 4]]` will be reduced to `1`. If `true`, then the array is preserved. Default is `true`. ## Spark connector settings diff --git a/_search-plugins/sql/sql-ppl-api.md b/_search-plugins/sql/sql-ppl-api.md index fefd612ceb..26f5f2cc81 100644 --- a/_search-plugins/sql/sql-ppl-api.md +++ b/_search-plugins/sql/sql-ppl-api.md @@ -20,7 +20,7 @@ Parameter | Data Type | Description [format]({{site.url}}{{site.baseurl}}/search-plugins/sql/response-formats/) | String | The format for the response. The `_sql` endpoint supports `jdbc`, `csv`, `raw`, and `json` formats. The `_ppl` endpoint supports `jdbc`, `csv`, and `raw` formats. Default is `jdbc`. sanitize | Boolean | Specifies whether to escape special characters in the results. See [Response formats]({{site.url}}{{site.baseurl}}/search-plugins/sql/response-formats/) for more information. Default is `true`. -### Request fields +### Request body fields Field | Data Type | Description :--- | :--- | :--- @@ -149,7 +149,7 @@ The response contains the schema and the results: } ``` -### Response fields +### Response body fields Field | Data Type | Description :--- | :--- | :--- diff --git a/_search-plugins/star-tree-index.md b/_search-plugins/star-tree-index.md new file mode 100644 index 0000000000..23d4b11c15 --- /dev/null +++ b/_search-plugins/star-tree-index.md @@ -0,0 +1,190 @@ +--- +layout: default +title: Star-tree index +parent: Improving search performance +nav_order: 54 +--- + +# Star-tree index + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +A star-tree index is a multi-field index that improves the performance of aggregations. + +OpenSearch will automatically use a star-tree index to optimize aggregations if the queried fields are part of dimension fields and the aggregations are on star-tree metric fields. No changes are required in the query syntax or the request parameters. + +## When to use a star-tree index + +A star-tree index can be used to perform faster aggregations. 
Consider the following criteria and features when deciding to use a star-tree index: + +- Star-tree indexes natively support multi-field aggregations. +- Star-tree indexes are created in real time as part of the indexing process, so the data in a star-tree will always be up to date. +- A star-tree index consolidates data, increasing index paging efficiency and using less IO for search queries. + +## Limitations + +Star-tree indexes have the following limitations: + +- A star-tree index should only be enabled on indexes whose data is not updated or deleted because updates and deletions are not accounted for in a star-tree index. +- A star-tree index can be used for aggregation queries only if the queried fields are a subset of the star-tree's dimensions and the aggregated fields are a subset of the star-tree's metrics. +- After a star-tree index is enabled, it cannot be disabled. In order to disable a star-tree index, the data in the index must be reindexed without the star-tree mapping. Furthermore, changing a star-tree configuration will also require a reindex operation. +- [Multi-values/array values]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/index/#arrays) are not supported. +- Only [limited queries and aggregations](#supported-queries-and-aggregations) are supported. Support for more features will be added in future versions. +- The cardinality of the dimensions should not be very high (as with `_id` fields). Higher cardinality leads to increased storage usage and query latency. + +## Star-tree index structure + +The following image illustrates a standard star-tree index structure. + +A star-tree index containing two dimensions and two metrics + +Sorted and aggregated star-tree documents are backed by `doc_values` in an index. The columnar data found in `doc_values` is stored using the following properties: + +- The values are sorted based on the fields set in the `ordered_dimension` setting. In the preceding image, the dimensions are determined by the `status` setting and then by the `port` for each status. +- For each unique dimension/value combination, the aggregated values for all the metrics, such as `avg(size)` and `count(requests)`, are precomputed during ingestion. + +### Leaf nodes + +Each node in a star-tree index points to a range of star-tree documents. Nodes can be further split into child nodes based on the [max_leaf_docs configuration]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/star-tree/#star-tree-index-configuration-options). The number of documents that a leaf node points to is less than or equal to the value set in `max_leaf_docs`. This ensures that the maximum number of documents that need to traverse nodes to derive an aggregated value is at most the number of `max_leaf_docs`, which provides predictable latency. + +### Star nodes + +A star node contains the aggregated data of all the other nodes for a particular dimension, acting as a "catch-all" node. When a star node is found in a dimension, that dimension is skipped during aggregation. This groups together all values of that dimension and allows a query to skip non-competitive nodes when fetching the aggregated value of a particular field. + +The star-tree index structure diagram contains the following three examples demonstrating how a query behaves when retrieving aggregations from nodes in the star-tree: + +- **Blue**: In a `terms` query that searches for the average request size aggregation, the `port` equals `8443` and the status equals `200`. 
Because the query contains values in both the `status` and `port` dimensions, the query traverses status node `200` and returns the aggregations from child node `8443`. +- **Green**: In a `term` query that searches for the number of aggregation requests, the `status` equals `200`. Because the query only contains a value from the `status` dimension, the query traverses the `200` node's child star node, which contains the aggregated value of all the `port` child nodes. +- **Red**: In a `term` query that searches for the average request size aggregation, the port equals `5600`. Because the query does not contain a value from the `status` dimension, the query traverses a star node and returns the aggregated result from the `5600` child node. + +Support for the `Terms` query will be added in a future version. For more information, see [GitHub issue #15257](https://github.com/opensearch-project/OpenSearch/issues/15257). +{: .note} + +## Enabling a star-tree index + +To use a star-tree index, modify the following settings: + +- Set the feature flag `opensearch.experimental.feature.composite_index.star_tree.enabled` to `true`. For more information about enabling and disabling feature flags, see [Enabling experimental features]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/experimental/). +- Set the `indices.composite_index.star_tree.enabled` setting to `true`. For instructions on how to configure OpenSearch, see [Configuring settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings). +- Set the `index.composite_index` index setting to `true` during index creation. +- Ensure that the `doc_values` parameter is enabled for the `dimensions` and `metrics` fields used in your star-tree mapping. + + +## Example mapping + +In the following example, index mappings define the star-tree configuration. The star-tree index precomputes aggregations in the `logs` index. The aggregations are calculated on the `size` and `latency` fields for all the combinations of values indexed in the `port` and `status` fields: + +```json +PUT logs +{ + "settings": { + "index.number_of_shards": 1, + "index.number_of_replicas": 0, + "index.composite_index": true + }, + "mappings": { + "composite": { + "request_aggs": { + "type": "star_tree", + "config": { + "ordered_dimensions": [ + { + "name": "status" + }, + { + "name": "port" + } + ], + "metrics": [ + { + "name": "size", + "stats": [ + "sum" + ] + }, + { + "name": "latency", + "stats": [ + "avg" + ] + } + ] + } + } + }, + "properties": { + "status": { + "type": "integer" + }, + "port": { + "type": "integer" + }, + "size": { + "type": "integer" + }, + "latency": { + "type": "scaled_float", + "scaling_factor": 10 + } + } + } +} +``` + +For detailed information about star-tree index mappings and parameters, see [Star-tree field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/star-tree/). + +## Supported queries and aggregations + +Star-tree indexes can be used to optimize queries and aggregations. + +### Supported queries + +The following queries are supported as of OpenSearch 2.18: + +- [Term query](https://opensearch.org/docs/latest/query-dsl/term/term/) +- [Match all docs query](https://opensearch.org/docs/latest/query-dsl/match-all/) + +To use a query with a star-tree index, the query's fields must be present in the `ordered_dimensions` section of the star-tree configuration. Queries must also be paired with a supported aggregation. 
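As an additional illustration, the following sketch pairs a supported match-all query with an `avg` aggregation on the `latency` metric from the preceding example mapping. Only the `logs` index name and field names from that mapping are assumed; the aggregation types that qualify are listed in the next section:

```json
POST /logs/_search
{
  "query": {
    "match_all": {}
  },
  "aggs": {
    "avg_latency": {
      "avg": {
        "field": "latency"
      }
    }
  }
}
```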
+ +### Supported aggregations + +The following metric aggregations are supported as of OpenSearch 2.18: +- [Sum](https://opensearch.org/docs/latest/aggregations/metric/sum/) +- [Minimum](https://opensearch.org/docs/latest/aggregations/metric/minimum/) +- [Maximum](https://opensearch.org/docs/latest/aggregations/metric/maximum/) +- [Value count](https://opensearch.org/docs/latest/aggregations/metric/value-count/) +- [Average](https://opensearch.org/docs/latest/aggregations/metric/average/) + +To use aggregations: + +- The fields must be present in the `metrics` section of the star-tree configuration. +- The metric aggregation type must be part of the `stats` parameter. + +### Aggregation example + +The following example gets the sum of all the values in the `size` field for all error logs with `status=500`, using the [example mapping](#example-mapping): + +```json +POST /logs/_search +{ + "query": { + "term": { + "status": "500" + } + }, + "aggs": { + "sum_size": { + "sum": { + "field": "size" + } + } + } +} +``` + +Using a star-tree index, the result will be retrieved from a single aggregated document as it traverses the `status=500` node, as opposed to scanning through all of the matching documents. This results in lower query latency. + +## Using queries without a star-tree index + +Set the `indices.composite_index.star_tree.enabled` setting to `false` to run queries without using a star-tree index. diff --git a/_search-plugins/vector-search.md b/_search-plugins/vector-search.md index cd893f4144..f19030bf90 100644 --- a/_search-plugins/vector-search.md +++ b/_search-plugins/vector-search.md @@ -37,9 +37,9 @@ PUT test-index "my_vector1": { "type": "knn_vector", "dimension": 1024, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "nmslib", "parameters": { "ef_construction": 128, @@ -57,7 +57,7 @@ PUT test-index You must designate the field that will store vectors as a [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) field type. OpenSearch supports vectors of up to 16,000 dimensions, each of which is represented as a 32-bit or 16-bit float. -To save storage space, you can use `byte` or `binary` vectors. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector) and [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors). +To save storage space, you can use `byte` or `binary` vectors. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors) and [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). 
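For instance, a byte vector mapping might look like the following sketch. The index and field names are illustrative, and the `data_type` parameter is assumed based on the linked byte vector documentation; byte vectors store integer values between -128 and 127:

```json
PUT /byte-vector-index
{
  "settings": {
    "index.knn": true
  },
  "mappings": {
    "properties": {
      "my_byte_vector": {
        "type": "knn_vector",
        "dimension": 8,
        "data_type": "byte",
        "space_type": "l2",
        "method": {
          "name": "hnsw",
          "engine": "lucene"
        }
      }
    }
  }
}
```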
### k-NN vector search @@ -131,9 +131,9 @@ PUT /hotels-index "location": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "lucene", "parameters": { "ef_construction": 100, diff --git a/_security-analytics/api-tools/alert-finding-api.md b/_security-analytics/api-tools/alert-finding-api.md index ed977361c5..582e892fc7 100644 --- a/_security-analytics/api-tools/alert-finding-api.md +++ b/_security-analytics/api-tools/alert-finding-api.md @@ -71,7 +71,7 @@ GET /_plugins/_security_analytics/alerts?detectorType=windows } ``` -#### Response fields +#### Response body fields Alerts persist until you resolve the root cause and have the following states: diff --git a/_security-analytics/api-tools/correlation-eng.md b/_security-analytics/api-tools/correlation-eng.md index c2a6ca56b1..449563d876 100644 --- a/_security-analytics/api-tools/correlation-eng.md +++ b/_security-analytics/api-tools/correlation-eng.md @@ -20,7 +20,7 @@ POST /_plugins/_security_analytics/correlation/rules ``` {% include copy-curl.html %} -### Request fields +### Request body fields | Field | Type | Description | | :--- | :--- |:--- | @@ -94,7 +94,7 @@ POST /_plugins/_security_analytics/correlation/rules ``` {% include copy-curl.html %} -### Response fields +### Response body fields | Field | Type | Description | | :--- | :--- |:--- | @@ -144,7 +144,7 @@ GET /_plugins/_security_analytics/correlations?start_timestamp=1689289210000&end ``` {% include copy-curl.html %} -### Response fields +### Response body fields | Field | Type | Description | | :--- | :--- |:--- | @@ -221,7 +221,7 @@ GET /_plugins/_security_analytics/findings/correlate?finding=425dce0b-f5ee-4889- ``` {% include copy-curl.html %} -### Response fields +### Response body fields | Field | Type | Description | | :--- | :--- |:--- | diff --git a/_security-analytics/api-tools/detector-api.md b/_security-analytics/api-tools/detector-api.md index e9de8698a4..5b0f9a9eb0 100644 --- a/_security-analytics/api-tools/detector-api.md +++ b/_security-analytics/api-tools/detector-api.md @@ -18,7 +18,7 @@ Creates a new detector. POST _plugins/_security_analytics/detectors ``` -### Request fields +### Request body fields You can specify the following fields when creating a detector. @@ -224,7 +224,7 @@ The Update Detector API can be used to update a detector definition. It requires PUT /_plugins/_security_analytics/detectors/ ``` -### Request fields +### Request body fields You can specify the following fields when updating a detector. @@ -332,7 +332,7 @@ PUT /_plugins/_security_analytics/detectors/J1RX1IMByX0LvTiGTddR } ``` -#### Response fields +#### Response body fields Field | Type | Description :--- | :--- |:--- | @@ -428,7 +428,7 @@ GET /_plugins/_security_analytics/detectors/ The Search Detector API searches for detector matches by detector ID, detector name, or detector type. -### Request fields +### Request body fields Field | Type | Description :--- | :--- |:--- | diff --git a/_security-analytics/api-tools/mappings-api.md b/_security-analytics/api-tools/mappings-api.md index b1e8ab2c51..64e83c7a0a 100644 --- a/_security-analytics/api-tools/mappings-api.md +++ b/_security-analytics/api-tools/mappings-api.md @@ -14,7 +14,7 @@ The following APIs can be used for a number of tasks related to mappings, from c This API returns a view of the fields contained in an index used as a log source. -### Request fields +### Request body fields The following fields are used to get field mappings. 
diff --git a/_security-analytics/threat-intelligence/api/findings.md b/_security-analytics/threat-intelligence/api/findings.md index 3d1b3e8951..5c648ab2ae 100644 --- a/_security-analytics/threat-intelligence/api/findings.md +++ b/_security-analytics/threat-intelligence/api/findings.md @@ -77,7 +77,7 @@ GET /_plugins/_security_analytics/threat_intel/alerts } ``` -### Response fields +### Response body fields A threat intelligence alert can have one of the following states. diff --git a/_security-analytics/threat-intelligence/api/monitor.md b/_security-analytics/threat-intelligence/api/monitor.md index 965fd79af3..e22b31f156 100644 --- a/_security-analytics/threat-intelligence/api/monitor.md +++ b/_security-analytics/threat-intelligence/api/monitor.md @@ -25,7 +25,7 @@ POST _plugins/_security_analytics/threat_intel/monitors PUT _plugins/_security_analytics/threat_intel/monitors/ ``` -### Request fields +### Request body fields You can specify the following fields in the request body. diff --git a/_security-analytics/threat-intelligence/api/source.md b/_security-analytics/threat-intelligence/api/source.md index 7cfadfd813..e9bd540477 100644 --- a/_security-analytics/threat-intelligence/api/source.md +++ b/_security-analytics/threat-intelligence/api/source.md @@ -21,7 +21,7 @@ POST _plugins/_security_analytics/threat_intel/sources PUT _plugins/_security_analytics/threat_intel/sources/ ``` -### Request fields +### Request body fields | Field | Type | Description | | :--- | :--- | :---- | @@ -60,7 +60,7 @@ The following fields modify the `ioc_types` option. | `spec_version` | String | The specification version used for the IOC. | | `version` | Integer | A version number for the IOC. | -### Response fields +### Response body fields | Field | Data type | Description | | :---- | :--- |:----- | diff --git a/_security-analytics/threat-intelligence/getting-started.md b/_security-analytics/threat-intelligence/getting-started.md index 366bc2674c..b26063bed0 100644 --- a/_security-analytics/threat-intelligence/getting-started.md +++ b/_security-analytics/threat-intelligence/getting-started.md @@ -50,15 +50,64 @@ Local files uploaded as the threat intelligence source must use the following sp When using the `S3_SOURCE` as a remote store, the following connection information must be provided: -- **IAM Role ARN**: The Amazon Resource Name (ARN) for an AWS Identity and Access Management (IAM) role. -- **S3 bucket directory**: The name of the Amazon Simple Storage Service (Amazon S3) bucket in which the `STIX2` file is stored. -- **Specify a directory or file**: The object key or directory path for the `STIX2` file in the S3 bucket. +- **IAM Role ARN**: The Amazon Resource Name (ARN) for an AWS Identity and Access Management (IAM) role. When using the AWS OpenSearch Service, the role ARN needs to be in the same account as the OpenSearch domain. For more information about adding a new role for the AWS OpenSearch Service, see [Add service ARN](#add-aws-opensearch-service-arn). +- **S3 bucket directory**: The name of the Amazon Simple Storage Service (Amazon S3) bucket in which the `STIX2` file is stored. To access an S3 bucket in a different AWS account, see the [Cross-account S3 bucket connection](#cross-account-s3-bucket-connection) section for more details. +- **Specify a file**: The object key for the `STIX2` file in the S3 bucket. - **Region**: The AWS Region for the S3 bucket. 
You can also set the **Download schedule**, which determines to where OpenSearch downloads an updated `STIX2` file from the connected S3 bucket. The default interval is once a day. Only daily intervals are supported. Alternatively, you can check the **Download on demand** option, which prevents new data from the bucket from being automatically downloaded. +#### Add AWS OpenSearch Service ARN + +If you're using the AWS OpenSearch Service, create a new ARN role with a custom trust policy. For instructions on how to create the role, see [Creating a role for an AWS service](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-service.html#roles-creatingrole-service-console). + +When creating the role, customize the following settings: + +- Add the following custom trust policy: + + ```bash + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "opensearchservice.amazonaws.com" + ] + }, + "Action": "sts:AssumeRole" + } + ] + } + ``` + +- On the Permissions policies page, add the `AmazonS3ReadOnlyAccess` permission. + + +#### Cross-account S3 bucket connection + +Because the role ARN needs to be in the same account as the OpenSearch domain, a trust policy needs to be configured that allows the OpenSearch domain to download from S3 buckets from the same account. + +To download from an S3 bucket in another account, the trust policy for that bucket needs to give the role ARN permission to read from the object, as shown in the following example: + +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::123456789012:role/account-1-threat-intel-role" + }, + "Action": "s3:*", + "Resource": "arn:aws:s3:::account-2-threat-intel-bucket/*" + } + ] +} +``` ## Step 2: Set up scanning for your log sources diff --git a/_security/access-control/api.md b/_security/access-control/api.md index 63717d621a..0a6aedbb95 100644 --- a/_security/access-control/api.md +++ b/_security/access-control/api.md @@ -178,7 +178,7 @@ PUT _plugins/_security/api/account ``` {% include copy-curl.html %} -#### Request fields +#### Request body fields | Field | Data type | Description | Required | |:-------------------|:-----------|:-------------------------------|:----------| @@ -206,7 +206,7 @@ PUT _plugins/_security/api/account } ``` -#### Response fields +#### Response body fields | Field | Data type | Description | |:---------|:-----------|:------------------------------| @@ -1426,7 +1426,7 @@ GET _plugins/_security/api/_upgrade_check } ``` -#### Response fields +#### Response body fields | Field | Data type | Description | |:---------|:-----------|:------------------------------| @@ -1453,7 +1453,7 @@ POST _plugins/_security/api/_upgrade_perform ``` {% include copy-curl.html %} -#### Request fields +#### Request body fields | Field | Data type | Description | Required | |:----------------|:-----------|:------------------------------------------------------------------------------------------------------------------|:---------| @@ -1473,7 +1473,7 @@ POST _plugins/_security/api/_upgrade_perform } ``` -#### Response fields +#### Response body fields | Field | Data type | Description | |:---------|:-----------|:------------------------------| @@ -1575,7 +1575,7 @@ PATCH _plugins/_security/api/nodesdn ``` {% include copy-curl.html %} -#### Request fields +#### Request body fields | Field | Data type | Description | Required | 
|:----------------|:-----------|:------------------------------------------------------------------------------------------------------------------|:---------| @@ -1607,7 +1607,7 @@ PATCH _plugins/_security/api/nodesdn } ``` -#### Response fields +#### Response body fields | Field | Data type | Description | |:--------|:----------|:---------------------| @@ -1705,7 +1705,7 @@ curl -X PUT "https://your-opensearch-cluster/_plugins/_security/api/ssl/transpor } ``` -#### Response fields +#### Response body fields | Field | Data type | Description | |:--------|:----------|:----------------------------------------------------------------------------------| @@ -1741,7 +1741,7 @@ curl -X PUT "https://your-opensearch-cluster/_plugins/_security/api/ssl/http/rel } ``` -#### Response fields +#### Response body fields | Field | Data type | Description | |:--------|:----------|:--------------------------------------------------------------------| @@ -1822,7 +1822,7 @@ For details on using audit logging to track access to OpenSearch clusters, as we You can do an initial configuration of audit logging in the `audit.yml` file, found in the `opensearch-project/security/config` directory. Thereafter, you can use the REST API or Dashboards for further changes to the configuration. {: note.} -#### Request fields +#### Request body fields Field | Data type | Description :--- | :--- | :--- diff --git a/_security/access-control/document-level-security.md b/_security/access-control/document-level-security.md index 352fe06a61..b17b60e147 100644 --- a/_security/access-control/document-level-security.md +++ b/_security/access-control/document-level-security.md @@ -13,6 +13,8 @@ Document-level security lets you restrict a role to a subset of documents in an ![Document- and field-level security screen in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/images/security-dls.png) +The maximum size for the document-level security configuration is 1024 KB (1,048,404 characters). +{: .warning} ## Simple roles diff --git a/_security/audit-logs/storage-types.md b/_security/audit-logs/storage-types.md index 719287ad7f..a07d98db59 100644 --- a/_security/audit-logs/storage-types.md +++ b/_security/audit-logs/storage-types.md @@ -16,6 +16,7 @@ Setting | Description :--- | :--- debug | Outputs to stdout. Useful for testing and debugging. internal_opensearch | Writes to an audit index on the current OpenSearch cluster. +internal_opensearch_data_stream | Writes to an audit log data stream on the current OpenSearch cluster. external_opensearch | Writes to an audit index on a remote OpenSearch cluster. webhook | Sends events to an arbitrary HTTP endpoint. log4j | Writes the events to a Log4j logger. You can use any Log4j [appender](https://logging.apache.org/log4j/2.x/manual/appenders.html), such as SNMP, JDBC, Cassandra, and Kafka. @@ -23,10 +24,29 @@ log4j | Writes the events to a Log4j logger. You can use any Log4j [appender](ht You configure the output location in `opensearch.yml`: ``` -plugins.security.audit.type: +plugins.security.audit.type: ``` -`external_opensearch`, `webhook`, and `log4j` all have additional configuration options. Details follow. +`internal_opensearch_data_stream`, `external_opensearch`, `webhook`, and `log4j` can be customized with additional configuration options. For more information, see [Internal OpenSearch data streams](#internal-opensearch-data-streams). + + +## Internal OpenSearch data streams + +You can configure the `internal_opensearch_data_stream` type with the following parameters. 
+ + +Name | Data type | Description +:--- | :--- | :--- +`plugins.security.audit.config.data_stream.name` | String | The name of the audit log data stream. Default is `opensearch-security-auditlog`. + +### Template settings + +Name | Data type | Description +:--- | :--- | :--- +`plugins.security.audit.config.data_stream.template.manage` | Boolean | When `true`, the template for the data stream is managed by OpenSearch. Default is `true`. +`plugins.security.audit.config.data_stream.template.name` | String | The name of the data stream template. Default is `opensearch-security-auditlog`. +`plugins.security.audit.config.data_stream.template.number_of_replicas` | Integer | The number of replicas for the data stream. Default is `0`. +`plugins.security.audit.config.data_stream.template.number_of_shards` | Integer | The number of shards for the data stream. Default is `1`. ## External OpenSearch diff --git a/_security/authentication-backends/jwt.md b/_security/authentication-backends/jwt.md index 3f28dfecfd..ef32b9f71a 100644 --- a/_security/authentication-backends/jwt.md +++ b/_security/authentication-backends/jwt.md @@ -117,7 +117,7 @@ The following table lists the configuration parameters. Name | Description :--- | :--- -`signing_key` | The signing key to use when verifying the token. If you use a symmetric key algorithm, it is the base64-encoded shared secret. If you use an asymmetric algorithm, it contains the public key. +`signing_key` | The signing key(s) used to verify the token. If you use a symmetric key algorithm, this is the Base64-encoded shared secret. If you use an asymmetric algorithm, the algorithm contains the public key. To pass multiple keys, use a comma-separated list or enumerate the keys. `jwt_header` | The HTTP header in which the token is transmitted. This is typically the `Authorization` header with the `Bearer` schema,`Authorization: Bearer `. Default is `Authorization`. Replacing this field with a value other than `Authorization` prevents the audit log from properly redacting the JWT header from audit messages. It is recommended that users only use `Authorization` when using JWTs with audit logging. `jwt_url_parameter` | If the token is not transmitted in the HTTP header but rather as an URL parameter, define the name of the parameter here. `subject_key` | The key in the JSON payload that stores the username. If not set, the [subject](https://tools.ietf.org/html/rfc7519#section-4.1.2) registered claim is used. @@ -177,7 +177,7 @@ The default name of the header is `Authorization`. If required by your authentic As with HTTP basic authentication, you should use HTTPS instead of HTTP when transmitting JWTs in HTTP requests. -### URL parameters for HTTP requests +### Query parameters for HTTP requests Although the most common way to transmit JWTs in HTTP requests is to use a header field, the Security plugin also supports parameters. Configure the name of the `GET` parameter using the following key: diff --git a/_security/configuration/disable-enable-security.md b/_security/configuration/disable-enable-security.md index 811fd2a69f..38bcc01cdd 100755 --- a/_security/configuration/disable-enable-security.md +++ b/_security/configuration/disable-enable-security.md @@ -155,22 +155,22 @@ Use the following steps to reinstall the plugin: 1. 
Disable shard allocation and stop all nodes so that shards don't move when the cluster is restarted: - ```json - curl -XPUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d '{ - "transient": { - "cluster.routing.allocation.enable": "none" - } - }' - ``` - {% include copy.html %} + ```json + curl -XPUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d '{ + "transient": { + "cluster.routing.allocation.enable": "none" + } + }' + ``` + {% include copy.html %} 2. Install the Security plugin on all nodes in your cluster using one of the [installation methods]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#install): - ```bash - bin/opensearch-plugin install opensearch-security - ``` - {% include copy.html %} - + ```bash + bin/opensearch-plugin install opensearch-security + ``` + {% include copy.html %} + 3. Add the necessary configuration to `opensearch.yml` for TLS encryption. See [Configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/) for information about the settings that need to be configured. diff --git a/_security/configuration/index.md b/_security/configuration/index.md index e351e8865f..f68667d92d 100644 --- a/_security/configuration/index.md +++ b/_security/configuration/index.md @@ -3,7 +3,7 @@ layout: default title: Configuration nav_order: 2 has_children: true -has_toc: false +has_toc: true redirect_from: - /security-plugin/configuration/ - /security-plugin/configuration/index/ @@ -11,21 +11,105 @@ redirect_from: # Security configuration -The plugin includes demo certificates so that you can get up and running quickly. To use OpenSearch in a production environment, you must configure it manually: +The Security plugin includes demo certificates so that you can get up and running quickly. To use OpenSearch with the Security plugin in a production environment, you must make changes to the demo certificates and other configuration options manually. -1. [Replace the demo certificates]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/docker/#configuring-basic-security-settings). -1. [Reconfigure `opensearch.yml` to use your certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls). -1. [Reconfigure `config.yml` to use your authentication backend]({{site.url}}{{site.baseurl}}/security/configuration/configuration/) (if you don't plan to use the internal user database). -1. [Modify the configuration YAML files]({{site.url}}{{site.baseurl}}/security/configuration/yaml). -1. If you plan to use the internal user database, [set a password policy in `opensearch.yml`]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#opensearchyml). -1. [Apply changes using the `securityadmin` script]({{site.url}}{{site.baseurl}}/security/configuration/security-admin). -1. Start OpenSearch. -1. [Add users, roles, role mappings, and tenants]({{site.url}}{{site.baseurl}}/security/access-control/index/). +## Replace the demo certificates -If you don't want to use the plugin, see [Disable security]({{site.url}}{{site.baseurl}}/security/configuration/disable-enable-security/). +OpenSearch ships with demo certificates intended for quick setup and demonstration purposes. 
For a production environment, it's critical to replace these with your own trusted certificates, using the following steps, to ensure secure communication: -The Security plugin has several default users, roles, action groups, permissions, and settings for OpenSearch Dashboards that use kibana in their names. We will change these names in a future release. +1. **Generate your own certificates:** Use tools like OpenSSL or a certificate authority (CA) to generate your own certificates. For more information about generating certificates with OpenSSL, see [Generating self-signed certificates]({{site.url}}{{site.baseurl}}/security/configuration/generate-certificates/). +2. **Store the generated certificates and private key in the appropriate directory:** Generated certificates are typically stored in `/config/`. For more information, see [Add certificate files to opensearch.yml]({{site.url}}{{site.baseurl}}/security/configuration/generate-certificates/#add-certificate-files-to-opensearchyml). +3. **Set the following file permissions:** + - Private key (.key files): Set the file mode to `600`. This restricts access so that only the file owner (the OpenSearch user) can read and write to the file, ensuring that the private key remains secure and inaccessible to unauthorized users. + - Public certificates (.crt, .pem files): Set the file mode to `644`. This allows the file owner to read and write to the file, while other users can only read it. + +For additional guidance on file modes, see the following table. + + | Item | Sample | Numeric | Bitwise | + |-------------|---------------------|---------|--------------| + | Public key | `~/.ssh/id_rsa.pub` | `644` | `-rw-r--r--` | + | Private key | `~/.ssh/id_rsa` | `600` | `-rw-------` | + | SSH folder | `~/.ssh` | `700` | `drwx------` | + +For more information, see [Configuring basic security settings]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/docker/#configuring-basic-security-settings). + +## Reconfigure `opensearch.yml` to use your certificates + +The `opensearch.yml` file is the main configuration file for OpenSearch; you can find the file at `/config/opensearch.yml`. Use the following steps to update this file to point to your custom certificates: + +In `opensearch.yml`, set the correct paths for your certificates and keys, as shown in the following example: + ``` + plugins.security.ssl.transport.pemcert_filepath: /path/to/your/cert.pem + plugins.security.ssl.transport.pemkey_filepath: /path/to/your/key.pem + plugins.security.ssl.transport.pemtrustedcas_filepath: /path/to/your/ca.pem + plugins.security.ssl.http.enabled: true + plugins.security.ssl.http.pemcert_filepath: /path/to/your/cert.pem + plugins.security.ssl.http.pemkey_filepath: /path/to/your/key.pem + plugins.security.ssl.http.pemtrustedcas_filepath: /path/to/your/ca.pem + ``` +For more information, see [Configuring TLS certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls/). + +## Reconfigure `config.yml` to use your authentication backend + +The `config.yml` file allows you to configure the authentication and authorization mechanisms for OpenSearch. Update the authentication backend settings in `/config/opensearch-security/config.yml` according to your requirements. 
+ +For example, the following settings configure HTTP basic authentication against the internal user database. To use a different backend, such as LDAP, change the `authentication_backend` type accordingly: + + ``` + authc: + basic_internal_auth: + http_enabled: true + transport_enabled: true + order: 1 + http_authenticator: + type: basic + challenge: true + authentication_backend: + type: internal + ``` +For more information, see [Configuring the Security backend]({{site.url}}{{site.baseurl}}/security/configuration/configuration/). + +## Modify the configuration YAML files + +Determine whether any additional YAML files need modification, for example, the `roles.yml`, `roles_mapping.yml`, or `internal_users.yml` files. Update the files with any additional configuration information. For more information, see [Modifying the YAML files]({{site.url}}{{site.baseurl}}/security/configuration/yaml/). + +## Set a password policy + +When using the internal user database, we recommend enforcing a password policy to ensure that strong passwords are used. For information about strong password policies, see [Password settings]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#password-settings). + +## Apply changes using the `securityadmin` script + +The following steps do not apply to first-time users because the security index is automatically initialized from the YAML configuration files when OpenSearch starts. +{: .note} + +After initial setup, if you make changes to your security configuration or disable automatic initialization by setting `plugins.security.allow_default_init_securityindex` to `false` (which prevents security index initialization from `yaml` files), you need to manually apply changes using the `securityadmin` script: + +1. Find the `securityadmin` script. The script is typically stored in the OpenSearch plugins directory, `plugins/opensearch-security/tools/securityadmin.[sh|bat]`. + - Note: If you're using OpenSearch 1.x, the `securityadmin` script is located in the `plugins/opendistro_security/tools/` directory. + - For more information, see [Basic usage](https://opensearch.org/docs/latest/security/configuration/security-admin/#basic-usage). +2. Run the script by using the following command: + ``` + ./plugins/opensearch-security/tools/securityadmin.[sh|bat] + ``` +3. Check the OpenSearch logs and configuration to ensure that the changes have been successfully applied. + +For more information about using the `securityadmin` script, see [Applying changes to configuration files]({{site.url}}{{site.baseurl}}/security/configuration/security-admin/). + +## Add users, roles, role mappings, and tenants + +After you configure the Security plugin, create the users, roles, role mappings, and tenants that your environment requires. For more information, see [Access control]({{site.url}}{{site.baseurl}}/security/access-control/index/). + +## Disable the Security plugin + +If you don't want to use the Security plugin, you can disable it by adding the following setting to the `opensearch.yml` file: + +``` +plugins.security.disabled: true +``` + +You can then enable the plugin by removing the `plugins.security.disabled` setting. + +For more information about disabling the Security plugin, see [Disable security]({{site.url}}{{site.baseurl}}/security/configuration/disable-enable-security/). + +The Security plugin has several default users, roles, action groups, permissions, and settings for OpenSearch Dashboards that contain "Kibana" in their names. We will change these names in a future version. {: .note } -For a full list of `opensearch.yml` Security plugin settings, Security plugin settings, see [Security settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/).
+For a full list of `opensearch.yml` Security plugin settings, see [Security settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/). {: .note} + diff --git a/_security/configuration/yaml.md b/_security/configuration/yaml.md index 1686c8332e..2694e3a24f 100644 --- a/_security/configuration/yaml.md +++ b/_security/configuration/yaml.md @@ -265,7 +265,7 @@ kibana_server: ## roles.yml -This file contains any initial roles that you want to add to the Security plugin. Aside from some metadata, the default file is empty, because the Security plugin has a number of static roles that it adds automatically. +This file contains any initial roles that you want to add to the Security plugin. By default, this file contains predefined roles that grant access to the plugins included in the default distribution of OpenSearch. The Security plugin will also add a number of static roles automatically. ```yml --- diff --git a/_tools/index.md b/_tools/index.md index 108f10da97..c9d446a81a 100644 --- a/_tools/index.md +++ b/_tools/index.md @@ -18,6 +18,7 @@ This section provides documentation for OpenSearch-supported tools, including: - [OpenSearch CLI](#opensearch-cli) - [OpenSearch Kubernetes operator](#opensearch-kubernetes-operator) - [OpenSearch upgrade, migration, and comparison tools](#opensearch-upgrade-migration-and-comparison-tools) +- [Sycamore](#sycamore) for AI-powered extract, transform, load (ETL) on complex documents for vector and hybrid search For information about Data Prepper, the server-side data collector for filtering, enriching, transforming, normalizing, and aggregating data for downstream analytics and visualization, see [Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/index/). @@ -122,3 +123,9 @@ The OpenSearch Kubernetes Operator is an open-source Kubernetes operator that he OpenSearch migration tools facilitate migrations to OpenSearch and upgrades to newer versions of OpenSearch. These can help you set up a proof-of-concept environment locally using Docker containers or deploy to AWS using a one-click deployment script. This empowers you to fine-tune cluster configurations and manage workloads more effectively before migration. For more information about OpenSearch migration tools, see the documentation in the [OpenSearch Migration GitHub repository](https://github.com/opensearch-project/opensearch-migrations/tree/capture-and-replay-v0.1.0). + +## Sycamore + +[Sycamore](https://github.com/aryn-ai/sycamore) is an open-source, AI-powered document processing engine designed to prepare unstructured data for retrieval-augmented generation (RAG) and semantic search using Python. Sycamore supports chunking and enriching a wide range of complex document types, including reports, presentations, transcripts, and manuals. Additionally, Sycamore can extract and process embedded elements, such as tables, figures, graphs, and other infographics. It can then load the data into target indexes, including vector and keyword indexes, using an [OpenSearch connector](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html). + +For more information, see [Sycamore]({{site.url}}{{site.baseurl}}/tools/sycamore/). diff --git a/_tools/k8s-operator.md b/_tools/k8s-operator.md index 7ee1c1adee..5027dcf304 100644 --- a/_tools/k8s-operator.md +++ b/_tools/k8s-operator.md @@ -63,40 +63,40 @@ Then install the OpenSearch Kubernetes Operator using the following steps: 3. Enter `make build manifests`. 4. Start a Kubernetes cluster.
When using minikube, open a new terminal window and enter `minikube start`. Kubernetes will now use a containerized minikube cluster with a namespace called `default`. Make sure that `~/.kube/config` points to the cluster. -```yml -apiVersion: v1 -clusters: -- cluster: - certificate-authority: /Users/naarcha/.minikube/ca.crt - extensions: - - extension: - last-update: Mon, 29 Aug 2022 10:11:47 CDT - provider: minikube.sigs.k8s.io - version: v1.26.1 - name: cluster_info - server: https://127.0.0.1:61661 - name: minikube -contexts: -- context: - cluster: minikube - extensions: - - extension: - last-update: Mon, 29 Aug 2022 10:11:47 CDT - provider: minikube.sigs.k8s.io - version: v1.26.1 - name: context_info - namespace: default - user: minikube - name: minikube -current-context: minikube -kind: Config -preferences: {} -users: -- name: minikube - user: - client-certificate: /Users/naarcha/.minikube/profiles/minikube/client.crt - client-key: /Users/naarcha/.minikube/profiles/minikube/client.key -``` + ```yml + apiVersion: v1 + clusters: + - cluster: + certificate-authority: /Users/naarcha/.minikube/ca.crt + extensions: + - extension: + last-update: Mon, 29 Aug 2022 10:11:47 CDT + provider: minikube.sigs.k8s.io + version: v1.26.1 + name: cluster_info + server: https://127.0.0.1:61661 + name: minikube + contexts: + - context: + cluster: minikube + extensions: + - extension: + last-update: Mon, 29 Aug 2022 10:11:47 CDT + provider: minikube.sigs.k8s.io + version: v1.26.1 + name: context_info + namespace: default + user: minikube + name: minikube + current-context: minikube + kind: Config + preferences: {} + users: + - name: minikube + user: + client-certificate: /Users/naarcha/.minikube/profiles/minikube/client.crt + client-key: /Users/naarcha/.minikube/profiles/minikube/client.key + ``` 5. Enter `make install` to create the CustomResourceDefinition that runs in your Kubernetes cluster. 6. Start the OpenSearch Kubernetes Operator. Enter `make run`. @@ -146,4 +146,4 @@ kubectl delete -f opensearch-cluster.yaml To learn more about how to customize your Kubernetes OpenSearch cluster, including data persistence, authentication methods, and scaling, see the [OpenSearch Kubernetes Operator User Guide](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/userguide/main.md). -If you want to contribute to the development of the OpenSearch Kubernetes Operator, see the repo [design documents](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/designs/high-level.md). \ No newline at end of file +If you want to contribute to the development of the OpenSearch Kubernetes Operator, see the repo [design documents](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/designs/high-level.md). diff --git a/_tools/sycamore.md b/_tools/sycamore.md new file mode 100644 index 0000000000..9b3986dbf3 --- /dev/null +++ b/_tools/sycamore.md @@ -0,0 +1,48 @@ +--- +layout: default +title: Sycamore +nav_order: 210 +has_children: false +--- + +# Sycamore + +[Sycamore](https://github.com/aryn-ai/sycamore) is an open-source, AI-powered document processing engine designed to prepare unstructured data for retrieval-augmented generation (RAG) and semantic search using Python. Sycamore supports chunking and enriching a wide range of complex document types, including reports, presentations, transcripts, and manuals. Additionally, Sycamore can extract and process embedded elements, such as tables, figures, graphs, and other infographics. 
It can then load the data into target indexes, including vector and keyword indexes, using a connector like the [OpenSearch connector](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html). + +To get started, visit the [Sycamore documentation](https://sycamore.readthedocs.io/en/stable/sycamore/get_started.html). + +## Sycamore ETL pipeline structure + +A Sycamore extract, transform, load (ETL) pipeline applies a series of transformations to a [DocSet](https://sycamore.readthedocs.io/en/stable/sycamore/get_started/concepts.html#docsets), which is a collection of documents and their constituent elements (for example, tables, blocks of text, or headers). At the end of the pipeline, the DocSet is loaded into OpenSearch vector and keyword indexes. + +A typical pipeline for preparing unstructured data for vector or hybrid search in OpenSearch consists of the following steps: + +* Read documents into a [DocSet](https://sycamore.readthedocs.io/en/stable/sycamore/get_started/concepts.html#docsets). +* [Partition documents](https://sycamore.readthedocs.io/en/stable/sycamore/transforms/partition.html) into structured JSON elements. +* Extract metadata and filter and clean data using [transforms](https://sycamore.readthedocs.io/en/stable/sycamore/APIs/docset.html). +* Create [chunks](https://sycamore.readthedocs.io/en/stable/sycamore/transforms/merge.html) from groups of elements. +* Embed the chunks using the model of your choice. +* [Load](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html) the embeddings, metadata, and text into OpenSearch vector and keyword indexes. + +For an example pipeline that uses this workflow, see [this notebook](https://github.com/aryn-ai/sycamore/blob/main/notebooks/opensearch_docs_etl.ipynb). + + +## Install Sycamore + +We recommend installing the Sycamore library using `pip`. The connector for OpenSearch can be specified and installed using extras. For example: + +```bash +pip install sycamore-ai[opensearch] +``` +{% include copy.html %} + +By default, Sycamore works with the Aryn Partitioning Service to process PDFs. To run inference locally for partitioning or embedding, install Sycamore with the `local-inference` extra as follows: + +```bash +pip install sycamore-ai[opensearch,local-inference] +``` +{% include copy.html %} + +## Next steps + +For more information, visit the [Sycamore documentation](https://sycamore.readthedocs.io/en/stable/sycamore/get_started.html). diff --git a/_troubleshoot/tls.md b/_troubleshoot/tls.md index 93e9a2c490..6c777ad5b8 100644 --- a/_troubleshoot/tls.md +++ b/_troubleshoot/tls.md @@ -207,7 +207,7 @@ plugins.security.ssl.http.enabled_protocols: TLS relies on the server and client negotiating a common cipher suite. Depending on your system, the available ciphers will vary. They depend on the JDK or OpenSSL version you're using, and whether or not the `JCE Unlimited Strength Jurisdiction Policy Files` are installed. -For legal reasons, the JDK does not include strong ciphers like AES256. In order to use strong ciphers you need to download and install the [Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction Policy Files](https://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html). If you don't have them installed, you might see an error message on startup: +For legal reasons, the JDK does not include strong ciphers like AES256. 
In order to use strong ciphers you need to download and install the [Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction Policy Files](https://www.oracle.com/java/technologies/javase-jce8-downloads.html). If you don't have them installed, you might see an error message on startup: ``` [INFO ] AES-256 not supported, max key length for AES is 128 bit. diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-store-stats-api.md b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-store-stats-api.md index b0739c263a..b184930e1d 100644 --- a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-store-stats-api.md +++ b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-store-stats-api.md @@ -257,7 +257,7 @@ GET _remotestore/stats/ ``` -### Response fields +### Response body fields The response body of the Remote Store Stats API is split into three categories: diff --git a/_tuning-your-cluster/availability-and-recovery/search-backpressure.md b/_tuning-your-cluster/availability-and-recovery/search-backpressure.md index 58c0d29883..29982247a7 100644 --- a/_tuning-your-cluster/availability-and-recovery/search-backpressure.md +++ b/_tuning-your-cluster/availability-and-recovery/search-backpressure.md @@ -102,23 +102,23 @@ search_backpressure.node_duress.num_successive_breaches | 3 | The number of succ search_backpressure.node_duress.cpu_threshold | 90% | The CPU usage threshold (as a percentage) required for a node to be considered to be under duress. search_backpressure.node_duress.heap_threshold | 70% | The heap usage threshold (as a percentage) required for a node to be considered to be under duress. search_backpressure.search_task.elapsed_time_millis_threshold | 45,000 | The elapsed time threshold (in milliseconds) required for an individual parent task before it is considered for cancellation. -search_backpressure.search_task.cancellation_ratio | 0.1 | The maximum number of search tasks to cancel, as a percentage of successful search task completions. -search_backpressure.search_task.cancellation_rate| 0.003 | The maximum number of search tasks to cancel per millisecond of elapsed time. -search_backpressure.search_task.cancellation_burst | 5 | The maximum number of search tasks to cancel in a single iteration of the observer thread. -search_backpressure.search_task.heap_percent_threshold | 2% | The heap usage threshold (as a percentage) required for an individual parent task before it is considered for cancellation. -search_backpressure.search_task.total_heap_percent_threshold | 5% | The heap usage threshold (as a percentage) required for the sum of heap usages of all search tasks before cancellation is applied. -search_backpressure.search_task.heap_variance | 2.0 | The heap usage variance required for an individual parent task before it is considered for cancellation. A task is considered for cancellation when `taskHeapUsage` is greater than or equal to `heapUsageMovingAverage` * `variance`. -search_backpressure.search_task.heap_moving_average_window_size | 10 | The window size used to calculate the rolling average of the heap usage for the completed parent tasks. -search_backpressure.search_task.cpu_time_millis_threshold | 30,000 | The CPU usage threshold (in milliseconds) required for an individual parent task before it is considered for cancellation. 
-search_backpressure.search_shard_task.elapsed_time_millis_threshold | 30,000 | The elapsed time threshold (in milliseconds) required for a single search shard task before it is considered for cancellation. -search_backpressure.search_shard_task.cancellation_ratio | 0.1 | The maximum number of search shard tasks to cancel, as a percentage of successful search shard task completions. -search_backpressure.search_shard_task.cancellation_rate | 0.003 | The maximum number of search shard tasks to cancel per millisecond of elapsed time. -search_backpressure.search_shard_task.cancellation_burst | 10 | The maximum number of search shard tasks to cancel in a single iteration of the observer thread. -search_backpressure.search_shard_task.heap_percent_threshold | 0.5% | The heap usage threshold (as a percentage) required for a single search shard task before it is considered for cancellation. -search_backpressure.search_shard_task.total_heap_percent_threshold | 5% | The heap usage threshold (as a percentage) required for the sum of heap usages of all search shard tasks before cancellation is applied. -search_backpressure.search_shard_task.heap_variance | 2.0 | The minimum variance required for a single search shard task's heap usage compared to the rolling average of previously completed tasks before it is considered for cancellation. -search_backpressure.search_shard_task.heap_moving_average_window_size | 100 | The number of previously completed search shard tasks to consider when calculating the rolling average of heap usage. -search_backpressure.search_shard_task.cpu_time_millis_threshold | 15,000 | The CPU usage threshold (in milliseconds) required for a single search shard task before it is considered for cancellation. +search_backpressure.search_task.cancellation_ratio | 0.1 | The maximum number of search tasks to cancel, as a percentage of successful search task completions. The value range is (0, 1]. +search_backpressure.search_task.cancellation_rate| 0.003 | The maximum number of search tasks to cancel per millisecond of elapsed time. The value must be greater than 0. +search_backpressure.search_task.cancellation_burst | 5 | The maximum number of search tasks to cancel in a single iteration of the observer thread. The value must be greater than or equal to 1. +search_backpressure.search_task.heap_percent_threshold | 2% | The heap usage threshold (as a percentage) required for an individual parent task before it is considered for cancellation. The value range is [0%, 100%]. +search_backpressure.search_task.total_heap_percent_threshold | 5% | The heap usage threshold (as a percentage) required for the sum of heap usages of all search tasks before cancellation is applied. The value range is [0%, 100%]. +search_backpressure.search_task.heap_variance | 2.0 | The heap usage variance required for an individual parent task before it is considered for cancellation. A task is considered for cancellation when `taskHeapUsage` is greater than or equal to `heapUsageMovingAverage` * `variance`. The value must be greater than or equal to 0. +search_backpressure.search_task.heap_moving_average_window_size | 10 | The window size used to calculate the rolling average of the heap usage for the completed parent tasks. The value must be greater than or equal to 0. +search_backpressure.search_task.cpu_time_millis_threshold | 30,000 | The CPU usage threshold (in milliseconds) required for an individual parent task before it is considered for cancellation. The value must be greater than or equal to 0. 
+search_backpressure.search_shard_task.elapsed_time_millis_threshold | 30,000 | The elapsed time threshold (in milliseconds) required for a single search shard task before it is considered for cancellation. The value must be greater than or equal to 0. +search_backpressure.search_shard_task.cancellation_ratio | 0.1 | The maximum number of search shard tasks to cancel, as a percentage of successful search shard task completions. The value range is (0, 1]. +search_backpressure.search_shard_task.cancellation_rate | 0.003 | The maximum number of search shard tasks to cancel per millisecond of elapsed time. The value must be greater than 0. +search_backpressure.search_shard_task.cancellation_burst | 10 | The maximum number of search shard tasks to cancel in a single iteration of the observer thread. The value must be greater than or equal to 1. +search_backpressure.search_shard_task.heap_percent_threshold | 0.5% | The heap usage threshold (as a percentage) required for a single search shard task before it is considered for cancellation. The value range is [0%, 100%]. +search_backpressure.search_shard_task.total_heap_percent_threshold | 5% | The heap usage threshold (as a percentage) required for the sum of heap usages of all search shard tasks before cancellation is applied. The value range is [0%, 100%]. +search_backpressure.search_shard_task.heap_variance | 2.0 | The minimum variance required for a single search shard task's heap usage compared to the rolling average of previously completed tasks before it is considered for cancellation. The value must be greater than or equal to 0. +search_backpressure.search_shard_task.heap_moving_average_window_size | 100 | The number of previously completed search shard tasks to consider when calculating the rolling average of heap usage. The value must be greater than or equal to 0. +search_backpressure.search_shard_task.cpu_time_millis_threshold | 15,000 | The CPU usage threshold (in milliseconds) required for a single search shard task before it is considered for cancellation. The value must be greater than or equal to 0. ## Search Backpressure Stats API Introduced 2.4 @@ -216,7 +216,7 @@ The response contains server-side request cancellation statistics: } ``` -### Response fields +### Response body fields The response contains the following fields. diff --git a/_tuning-your-cluster/availability-and-recovery/segment-replication/backpressure.md b/_tuning-your-cluster/availability-and-recovery/segment-replication/backpressure.md index 16b97a380a..498aae55fc 100644 --- a/_tuning-your-cluster/availability-and-recovery/segment-replication/backpressure.md +++ b/_tuning-your-cluster/availability-and-recovery/segment-replication/backpressure.md @@ -13,7 +13,7 @@ Segment replication backpressure is a shard-level rejection mechanism that dynam Replica shards are also monitored to determine whether the shards are stuck or lagging for an extended period of time. When replica shards are stuck or lagging for more than double the amount of time defined by the `segrep.pressure.time.limit` field, the shards are removed and replaced with new replica shards. -## Request fields +## Request body fields Segment replication backpressure is disabled by default. To enable it, set `segrep.pressure.enabled` to `true`. You can update the following dynamic cluster settings using the [cluster settings]({{site.url}}{{site.baseurl}}/api-reference/cluster-api/cluster-settings/) API endpoint. 
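For example, a minimal sketch of enabling this setting dynamically through the Cluster Settings API linked above (only the `segrep.pressure.enabled` setting named in the preceding paragraph is used here):

```json
PUT _cluster/settings
{
  "persistent": {
    "segrep.pressure.enabled": true
  }
}
```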
diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md index b9e35b2697..7076c792e2 100644 --- a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md @@ -18,7 +18,7 @@ The searchable snapshot feature incorporates techniques like caching frequently To configure the searchable snapshots feature, create a node in your `opensearch.yml file` and define the node role as `search`. Optionally, you can also configure the `cache.size` property for the node. -A `search` node reserves storage for the cache to perform searchable snapshot queries. In the case of a dedicated search node where the node exclusively has the `search` role, this value defaults to a fixed percentage of available storage. In other cases, the value needs to be configured by the user using the `node.search.cache.size` setting. +A `search` node reserves storage for the cache to perform searchable snapshot queries. In the case of a dedicated search node where the node exclusively has the `search` role, this value defaults to a fixed percentage (80%) of available storage. In other cases, the value needs to be configured by the user using the `node.search.cache.size` setting. Parameter | Type | Description :--- | :--- | :--- @@ -46,7 +46,7 @@ services: - node.search.cache.size=50gb ``` - +- Starting with version 2.18, k-NN indexes support searchable snapshots for the NMSLIB and Faiss engines. ## Create a searchable snapshot index @@ -109,4 +109,3 @@ The following are known limitations of the searchable snapshots feature: - Searching remote data can impact the performance of other queries running on the same node. We recommend that users provision dedicated nodes with the `search` role for performance-critical applications. - For better search performance, consider [force merging]({{site.url}}{{site.baseurl}}/api-reference/index-apis/force-merge/) indexes into a smaller number of segments before taking a snapshot. For the best performance, at the cost of using compute resources prior to snapshotting, force merge your index into one segment. - We recommend configuring a maximum ratio of remote data to local disk cache size using the `cluster.filecache.remote_data_ratio` setting. A ratio of 5 is a good starting point for most workloads to ensure good query performance. If the ratio is too large, then there may not be sufficient disk space to handle the search workload. For more details on the maximum ratio of remote data, see issue [#11676](https://github.com/opensearch-project/OpenSearch/issues/11676). -- k-NN native-engine-based indexes using `faiss` and `nmslib` engines are incompatible with searchable snapshots. diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md b/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md index 812d5104c7..ac717633f6 100644 --- a/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md @@ -110,6 +110,20 @@ You will most likely not need to specify any parameters except for `location`. F sudo ./bin/opensearch-keystore add s3.client.default.secret_key ``` +1. 
(Optional) If you're using a custom S3 endpoint (for example, MinIO), disable the Amazon EC2 metadata connection: + + ```bash + export AWS_EC2_METADATA_DISABLED=true + ``` + + If you're installing OpenSearch using Helm, update the following settings in your values file: + + ```yml + extraEnvs: + - name: AWS_EC2_METADATA_DISABLED + value: "true" + ``` + 1. (Optional) If you're using temporary credentials, add your session token: ```bash @@ -479,15 +493,19 @@ Request parameters | Description `include_global_state` | Whether to restore the cluster state. Default is `false`. `include_aliases` | Whether to restore aliases alongside their associated indexes. Default is `true`. `partial` | Whether to allow the restoration of partial snapshots. Default is `false`. -`rename_pattern` | If you want to rename indexes as you restore them, use this option to specify a regular expression that matches all indexes you want to restore. Use capture groups (`()`) to reuse portions of the index name. -`rename_replacement` | If you want to rename indexes as you restore them, use this option to specify the replacement pattern. Use `$0` to include the entire matching index name, `$1` to include the content of the first capture group, and so on. +`rename_pattern` | If you want to rename indexes, use this option to specify a regular expression that matches all the indexes that you want to restore and rename. Use capture groups (`()`) to reuse portions of the index name. +`rename_replacement` | If you want to rename indexes, use this option to specify the name replacement pattern. Use `$0` to include the entire matching index name or the number of the capture group. For example, `$1` would include the content of the first capture group. +`rename_alias_pattern` | If you want to rename aliases, use this option to specify a regular expression that matches all the aliases you want to restore and rename. Use capture groups (`()`) to reuse portions of the alias name. +`rename_alias_replacement` | If you want to rename aliases, use this option to specify the name replacement pattern. Use `$0` to include the entire matching alias name or the number of the capture group. For example, `$1` would include the content of the first capture group. `index_settings` | If you want to change [index settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/) applied during the restore operation, specify them here. You cannot change `index.number_of_shards`. `ignore_index_settings` | Rather than explicitly specifying new settings with `index_settings`, you can ignore certain index settings in the snapshot and use the cluster defaults applied during restore. You cannot ignore `index.number_of_shards`, `index.number_of_replicas`, or `index.auto_expand_replicas`. `storage_type` | `local` indicates that all snapshot metadata and index data will be downloaded to local storage.

`remote_snapshot` indicates that snapshot metadata will be downloaded to the cluster, but the remote repository will remain the authoritative store of the index data. Data will be downloaded and cached as necessary to service queries. At least one node in the cluster must be configured with the [search role]({{site.url}}{{site.baseurl}}/security/access-control/users-roles/) in order to restore a snapshot using the type `remote_snapshot`.

Defaults to `local`. ### Conflicts and compatibility -One way to avoid naming conflicts when restoring indexes is to use the `rename_pattern` and `rename_replacement` options. You can then, if necessary, use the `_reindex` API to combine the two. The simpler way is to delete existing indexes prior to restoring from a snapshot. +One way to avoid index naming conflicts when restoring indexes is to use the `rename_pattern` and `rename_replacement` options. You can then, if necessary, use the `_reindex` API to combine the two. However, it may be simpler to delete the indexes that caused the conflict prior to restoring them from a snapshot. + +Similarly, to avoid alias naming conflicts when restoring indexes with aliases, you can use the `rename_alias_pattern` and `rename_alias_replacement` options. You can use the `_close` API to close existing indexes prior to restoring from a snapshot, but the index in the snapshot has to have the same number of shards as the existing index. diff --git a/_tuning-your-cluster/availability-and-recovery/workload-management/query-group-lifecycle-api.md b/_tuning-your-cluster/availability-and-recovery/workload-management/query-group-lifecycle-api.md new file mode 100644 index 0000000000..0fb0b0b65c --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/workload-management/query-group-lifecycle-api.md @@ -0,0 +1,125 @@ +--- +layout: default +title: Query Group Lifecycle API +nav_order: 20 +parent: Workload management +grand_parent: Availability and recovery +--- + +# Query Group Lifecycle API + +The Query Group Lifecycle API creates, updates, retrieves, and deletes query groups. The API categorizes queries into specific groups, called _query groups_, based on desired resource limits. + +## Path and HTTP methods + +```json +PUT _wlm/query_group +PUT _wlm/query_group/ +GET _wlm/query_group +GET _wlm/query_group/ +DELETE _wlm/query_group/ +``` + +## Request body fields + +| Field | Description | +| :--- | :--- | +| `_id` | The ID of the query group, which can be used to associate query requests with the group and enforce the group's resource limits. | +| `name` | The name of the query group. | +| `resiliency_mode` | The resiliency mode of the query group. Valid modes are `enforced`, `soft`, and `monitor`. For more information about resiliency modes, see [Operating modes](https://opensearch.org/docs/latest/tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview/#operating-modes). | +| `resource_limits` | The resource limits for query requests in the query group. Valid resources are `cpu` and `memory`. | + +When creating a query group, make sure that the sum of the resource limits for a single resource, either `cpu` or `memory`, does not exceed 1. + +## Example requests + +The following example requests show how to use the Query Group Lifecycle API. 
+ +### Creating a query group + +```json +PUT _wlm/query_group +{ + "name": "analytics", + "resiliency_mode": "enforced", + "resource_limits": { + "cpu": 0.4, + "memory": 0.2 + } +} +``` +{% include copy-curl.html %} + +### Updating a query group + +```json +PUT _wlm/query_group/analytics +{ + "resiliency_mode": "monitor", + "resource_limits": { + "cpu": 0.41, + "memory": 0.21 + } +} +``` +{% include copy-curl.html %} + +### Getting a query group + +```json +GET _wlm/query_group/analytics +``` +{% include copy-curl.html %} + +### Deleting a query group + +```json +DELETE _wlm/query_group/analytics +``` +{% include copy-curl.html %} + +## Example responses + +OpenSearch returns responses similar to the following. + +### Creating a query group + +```json +{ + "_id":"preXpc67RbKKeCyka72_Gw", + "name":"analytics", + "resiliency_mode":"enforced", + "resource_limits":{ + "cpu":0.4, + "memory":0.2 + }, + "updated_at":1726270184642 +} +``` + +### Updating a query group + +```json +{ + "_id":"preXpc67RbKKeCyka72_Gw", + "name":"analytics", + "resiliency_mode":"monitor", + "resource_limits":{ + "cpu":0.41, + "memory":0.21 + }, + "updated_at":1726270333804 +} +``` + +## Response body fields + +| Field | Description | +| :--- | :--- | +| `_id` | The ID of the query group. | +| `name` | The name of the query group. Required when creating a new query group. | +| `resiliency_mode` | The resiliency mode of the query group. | +| `resource_limits` | The resource limits of the query group. | +| `updated_at` | The time at which the query group was last updated. | + + diff --git a/_tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview.md b/_tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview.md new file mode 100644 index 0000000000..956a01a774 --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview.md @@ -0,0 +1,194 @@ +--- +layout: default +title: Workload management +nav_order: 70 +has_children: true +parent: Availability and recovery +--- + +Introduced 2.18 +{: .label .label-purple } + +# Workload management + +Workload management allows you to group search traffic and isolate network resources, preventing the overuse of network resources by specific requests. It offers the following benefits: + +- Tenant-level admission control and reactive query management. When resource usage exceeds configured limits, it automatically identifies and cancels demanding queries, ensuring fair resource distribution. + +- Tenant-level isolation within the cluster for search workloads, operating at the node level. + +## Installing workload management + +To install workload management, use the following command: + +```json +./bin/opensearch-plugin install workload-management +``` +{% include copy-curl.html %} + +## Query groups + +A _query group_ is a logical grouping of tasks with defined resource limits. System administrators can dynamically manage query groups using the Workload Management APIs. These query groups can be used to create search requests with resource limits. + +### Permissions + +Only users with administrator-level permissions can create and update query groups using the Workload Management APIs. + +### Operating modes + +The following operating modes determine the operating level for a query group: + +- **Disabled mode**: Workload management is disabled. 
+ +- **Enabled mode**: Workload management is enabled and will cancel and reject queries once the query group's configured thresholds are reached. + +- **Monitor_only mode** (Default): Workload management will monitor tasks but will not cancel or reject any queries. + +### Example request + +The following example request adds a query group named `analytics`: + +```json +PUT _wlm/query_group +{ + "name": "analytics", + "resiliency_mode": "enforced", + "resource_limits": { + "cpu": 0.4, + "memory": 0.2 + } +} +``` +{% include copy-curl.html %} + +When creating a query group, make sure that the sum of the resource limits for a single resource, such as `cpu` or `memory`, does not exceed `1`. + +### Example response + +OpenSearch responds with the set resource limits and the `_id` for the query group: + +```json +{ + "_id":"preXpc67RbKKeCyka72_Gw", + "name":"analytics", + "resiliency_mode":"enforced", + "resource_limits":{ + "cpu":0.4, + "memory":0.2 + }, + "updated_at":1726270184642 +} +``` + +## Using `queryGroupID` + +You can associate a query request with a `queryGroupID` to manage and allocate resources within the limits defined by the query group. By using this ID, request routing and tracking are associated with the query group, ensuring resource quotas and task limits are maintained. + +The following example query uses the `queryGroupId` to ensure that the query does not exceed that query group's resource limits: + +```json +GET testindex/_search +Host: localhost:9200 +Content-Type: application/json +queryGroupId: preXpc67RbKKeCyka72_Gw +{ + "query": { + "match": { + "field_name": "value" + } + } +} +``` +{% include copy-curl.html %} + +## Workload management settings + +The following settings can be used to customize workload management using the `_cluster/settings` API. + +| **Setting name** | **Description** | +| :--- | :--- | +| `wlm.query_group.duress_streak` | Determines the node duress threshold. Once the threshold is reached, the node is marked as `in duress`. | +| `wlm.query_group.enforcement_interval` | Defines the monitoring interval. | +| `wlm.query_group.mode` | Defines the [operating mode](#operating-modes). | +| `wlm.query_group.node.memory_rejection_threshold` | Defines the query group level `memory` threshold. When the threshold is reached, the request is rejected. | +| `wlm.query_group.node.cpu_rejection_threshold` | Defines the query group level `cpu` threshold. When the threshold is reached, the request is rejected. | +| `wlm.query_group.node.memory_cancellation_threshold` | Controls whether the node is considered to be in duress when the `memory` threshold is reached. Requests routed to nodes in duress are canceled. | +| `wlm.query_group.node.cpu_cancellation_threshold` | Controls whether the node is considered to be in duress when the `cpu` threshold is reached. Requests routed to nodes in duress are canceled. | + +When setting rejection and cancellation thresholds, remember that the rejection threshold for a resource should always be lower than the cancellation threshold.
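For example, a minimal sketch of adjusting these thresholds through the `_cluster/settings` API; the threshold values shown are illustrative assumptions, chosen only so that each rejection threshold stays below its cancellation threshold:

```json
PUT _cluster/settings
{
  "transient": {
    "wlm.query_group.node.cpu_rejection_threshold": 0.8,
    "wlm.query_group.node.cpu_cancellation_threshold": 0.9,
    "wlm.query_group.node.memory_rejection_threshold": 0.7,
    "wlm.query_group.node.memory_cancellation_threshold": 0.8
  }
}
```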
+ +## Workload Management Stats API + +The Workload Management Stats API returns workload management metrics for a query group, using the following method: + +```json +GET _wlm/stats +``` +{% include copy-curl.html %} + +### Example response + +```json +{ + "_nodes": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "cluster_name": "XXXXXXYYYYYYYY", + "A3L9EfBIQf2anrrUhh_goA": { + "query_groups": { + "16YGxFlPRdqIO7K4EACJlw": { + "total_completions": 33570, + "total_rejections": 0, + "total_cancellations": 0, + "cpu": { + "current_usage": 0.03319935314357281, + "cancellations": 0, + "rejections": 0 + }, + "memory": { + "current_usage": 0.002306486276211217, + "cancellations": 0, + "rejections": 0 + } + }, + "DEFAULT_QUERY_GROUP": { + "total_completions": 42572, + "total_rejections": 0, + "total_cancellations": 0, + "cpu": { + "current_usage": 0, + "cancellations": 0, + "rejections": 0 + }, + "memory": { + "current_usage": 0, + "cancellations": 0, + "rejections": 0 + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Response body fields + +| Field name | Description | +| :--- | :--- | +| `total_completions` | The total number of request completions in the `query_group` at the given node. This includes all shard-level and coordinator-level requests. | +| `total_rejections` | The total number of request rejections in the `query_group` at the given node. This includes all shard-level and coordinator-level requests. | +| `total_cancellations` | The total number of cancellations in the `query_group` at the given node. This includes all shard-level and coordinator-level requests. | +| `cpu` | The `cpu` resource type statistics for the `query_group`. | +| `memory` | The `memory` resource type statistics for the `query_group`. | + +### Resource type statistics + +| Field name | Description | +| :--- | :---- | +| `current_usage` | The resource usage for the `query_group` at the given node based on the last run of the monitoring thread. This value is updated based on the `wlm.query_group.enforcement_interval`. | +| `cancellations` | The number of cancellations resulting from the cancellation threshold being reached. | +| `rejections` | The number of rejections resulting from the rejection threshold being reached. | + diff --git a/_tuning-your-cluster/index.md b/_tuning-your-cluster/index.md index 99db78565f..f434c2b5ec 100644 --- a/_tuning-your-cluster/index.md +++ b/_tuning-your-cluster/index.md @@ -20,7 +20,7 @@ To create and deploy an OpenSearch cluster according to your requirements, it’s There are many ways to design a cluster. The following illustration shows a basic architecture that includes a four-node cluster that has one dedicated cluster manager node, one dedicated coordinating node, and two data nodes that are cluster manager eligible and also used for ingesting data. - The nomenclature for the cluster manager node is now referred to as the cluster manager node. + The master node is now referred to as the cluster manager node. {: .note } ![multi-node cluster architecture diagram]({{site.url}}{{site.baseurl}}/images/cluster.png) @@ -192,11 +192,27 @@ To better understand and monitor your cluster, use the [CAT API]({{site.url}}{{s ## (Advanced) Step 6: Configure shard allocation awareness or forced awareness +To further fine-tune your shard allocation, you can set custom node attributes for shard allocation awareness or forced awareness.
+ ### Shard allocation awareness -If your nodes are spread across several geographical zones, you can configure shard allocation awareness to allocate all replica shards to a zone that’s different from their primary shard. +You can set custom node attributes on OpenSearch nodes to be used for shard allocation awareness. For example, you can set the `zone` attribute on each node to represent the zone in which the node is located. You can also use the `zone` attribute to ensure that the primary shard and its replica shards are allocated in a balanced manner across available, distinct zones. In this scenario, maximum shard copies per zone would equal `ceil (number_of_shard_copies/number_of_distinct_zones)`. + +OpenSearch, by default, allocates shard copies of a single shard across different nodes. When only 1 zone is available, such as after a zone failure, OpenSearch allocates replica shards to the only remaining zone---it considers only available zones (attribute values) when calculating the maximum number of allowed shard copies per zone. + +For example, if your index has a total of 5 shard copies (1 primary and 4 replicas) and nodes in 3 distinct zones, then OpenSearch will perform the following to allocate all 5 shard copies: + +- Allocate no more than 2 shards per zone, which will require at least 2 nodes in 2 zones. +- Allocate the last shard in the third zone, with at least 1 node needed in the third zone. -With shard allocation awareness, if the nodes in one of your zones fail, you can be assured that your replica shards are spread across your other zones. It adds a layer of fault tolerance to ensure your data survives a zone failure beyond just individual node failures. +Alternatively, if you have 3 nodes in the first zone and 1 node in each remaining zone, then OpenSearch will allocate: + +- 2 shard copies in the first zone. +- 1 shard copy in the remaining 2 zones. + +The final shard copy will remain unallocated due to the lack of nodes. + +With shard allocation awareness, if the nodes in one of your zones fail, you can be assured that your replica shards are spread across your other zones, adding a layer of fault tolerance to ensure that your data survives zone failures. To configure shard allocation awareness, add zone attributes to `opensearch-d1` and `opensearch-d2`, respectively: @@ -219,6 +235,8 @@ PUT _cluster/settings } ``` +You can also use multiple attributes for shard allocation awareness by providing the attributes as a comma-separated string, for example, `zone,rack`. + You can either use `persistent` or `transient` settings. We recommend the `persistent` setting because it persists through a cluster reboot. Transient settings don't persist through a cluster reboot. Shard allocation awareness attempts to separate primary and replica shards across multiple zones. However, if only one zone is available (such as after a zone failure), OpenSearch allocates replica shards to the only remaining zone. 
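For example, a minimal sketch of the comma-separated attribute form mentioned above; the setting name `cluster.routing.allocation.awareness.attributes` is assumed from the standard shard allocation awareness configuration and is not shown in this excerpt:

```json
PUT _cluster/settings
{
  "persistent": {
    "cluster.routing.allocation.awareness.attributes": "zone,rack"
  }
}
```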
diff --git a/_tuning-your-cluster/performance.md b/_tuning-your-cluster/performance.md index 28f47aeacb..b5066a890c 100644 --- a/_tuning-your-cluster/performance.md +++ b/_tuning-your-cluster/performance.md @@ -32,12 +32,9 @@ An increased `index.translog.flush_threshold_size` can also increase the time th Before increasing `index.translog.flush_threshold_size`, call the following API operation to get current flush operation statistics: ```json -curl -XPOST "os-endpoint/index-name/_stats/flush?pretty" +GET //_stats/flush?pretty ``` -{% include copy.html %} - - -Replace the `os-endpoint` and `index-name` with your endpoint and index name. +{% include copy-curl.html %} In the output, note the number of flushes and the total time. The following example output shows that there are 124 flushes, which took 17,690 milliseconds: @@ -53,9 +50,15 @@ In the output, note the number of flushes and the total time. The following exam To increase the flush threshold size, call the following API operation: ```json -curl -XPUT "os-endpoint/index-name/_settings?pretty" -d "{"index":{"translog.flush_threshold_size" : "1024MB"}}" +PUT //_settings +{ + "index": + { + "translog.flush_threshold_size" : "1024MB" + } +} ``` -{% include copy.html %} +{% include copy-curl.html %} In this example, the flush threshold size is set to 1024 MB, which is ideal for instances that have more than 32 GB of memory. @@ -65,9 +68,9 @@ Choose the appropriate threshold size for your cluster. Run the stats API operation again to see whether the flush activity changed: ```json -curl -XGET "os-endpoint/index-name/_stats/flush?pretty" +GET //_stats/flush ``` -{% include copy.html %} +{% include copy-curl.html %} It's a best practice to increase the `index.translog.flush_threshold_size` only for the current index. After you confirm the outcome, apply the changes to the index template. {: .note} @@ -127,14 +130,14 @@ To reduce the size of the OpenSearch response, use the `filter_path` parameter t In the following example, the `index-name`, `type-name`, and `took` fields are excluded from the response: ```json -curl -XPOST "es-endpoint/index-name/type-name/_bulk?pretty&filter_path=-took,-items.index._index,-items.index._type" -H 'Content-Type: application/json' -d' +POST /_bulk?pretty&filter_path=-took,-items.index._index,-items.index._type { "index" : { "_index" : "test2", "_id" : "1" } } { "user" : "testuser" } { "update" : {"_id" : "1", "_index" : "test2"} } { "doc" : {"user" : "example"} } ``` -{% include copy.html %} +{% include copy-curl.html %} ## Compression codecs -In OpenSearch 2.9 and later, there are two new codecs for compression: `zstd` and `zstd_no_dict`. You can optionally specify a compression level for these in the `index.codec.compression_level` setting with values in the [1, 6] range. [Benchmark]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#benchmarking) data shows that `zstd` provides a 7% better write throughput and `zstd_no_dict` provides a 14% better throughput, along with a 30% improvement in storage compared with the `default` codec. For more information about compression, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/). \ No newline at end of file +In OpenSearch 2.9 and later, there are two new codecs for compression: `zstd` and `zstd_no_dict`. You can optionally specify a compression level for these in the `index.codec.compression_level` setting with values in the [1, 6] range. 
[Benchmark]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#benchmarking) data shows that `zstd` provides a 7% better write throughput and `zstd_no_dict` provides a 14% better throughput, along with a 30% improvement in storage compared with the `default` codec. For more information about compression, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/). diff --git a/_tuning-your-cluster/replication-plugin/auto-follow.md b/_tuning-your-cluster/replication-plugin/auto-follow.md index 828b835387..92e7a6c144 100644 --- a/_tuning-your-cluster/replication-plugin/auto-follow.md +++ b/_tuning-your-cluster/replication-plugin/auto-follow.md @@ -98,9 +98,9 @@ To delete a replication rule, send the following request to the follower cluster ```bash curl -XDELETE -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_plugins/_replication/_autofollow?pretty' -d ' { - "leader_alias" : "my-conection-alias", + "leader_alias" : "my-connection-alias", "name": "my-replication-rule" }' ``` -When you delete a replication rule, OpenSearch stops replicating *new* indexes that match the pattern, but existing indexes that the rule previously created remain read-only and continue to replicate. If you need to stop existing replication activity and open the indexes up for writes, use the [stop replication API operation]({{site.url}}{{site.baseurl}}/replication-plugin/api/#stop-replication). \ No newline at end of file +When you delete a replication rule, OpenSearch stops replicating *new* indexes that match the pattern, but existing indexes that the rule previously created remain read-only and continue to replicate. If you need to stop existing replication activity and open the indexes up for writes, use the [stop replication API operation]({{site.url}}{{site.baseurl}}/replication-plugin/api/#stop-replication). diff --git a/_upgrade-to/index.md b/_upgrade-to/index.md index 0eea3d6209..696be88c21 100644 --- a/_upgrade-to/index.md +++ b/_upgrade-to/index.md @@ -1,6 +1,6 @@ --- layout: default -title: About the migration process +title: Upgrading OpenSearch nav_order: 1 nav_exclude: true permalink: /upgrade-to/ @@ -8,15 +8,14 @@ redirect_from: - /upgrade-to/index/ --- -# About the migration process +# Upgrading OpenSearch -The process of migrating from Elasticsearch OSS to OpenSearch varies depending on your current version of Elasticsearch OSS, installation type, tolerance for downtime, and cost-sensitivity. Rather than concrete steps to cover every situation, we have general guidance for the process. +The process of upgrading your OpenSearch version varies depending on your current version of OpenSearch, installation type, tolerance for downtime, and cost-sensitivity. For migrating to OpenSearch, we provide a [Migration Assistant]({{site.url}}{{site.baseurl}}/migration-assistant/). -Three approaches exist: +Two upgrade approaches exist: -- Use a snapshot to [migrate your Elasticsearch OSS data]({{site.url}}{{site.baseurl}}/upgrade-to/snapshot-migrate/) to a new OpenSearch cluster. This method may incur downtime. -- Perform a [restart upgrade or a rolling upgrade]({{site.url}}{{site.baseurl}}/upgrade-to/upgrade-to/) on your existing nodes. A restart upgrade involves upgrading the entire cluster and restarting it, whereas a rolling upgrade requires upgrading and restarting nodes in the cluster one by one. -- Replace existing Elasticsearch OSS nodes with new OpenSearch nodes.
Node replacement is most popular when upgrading [Docker clusters]({{site.url}}{{site.baseurl}}/upgrade-to/docker-upgrade-to/). +- Perform a [restart upgrade or a rolling upgrade]({{site.url}}{{site.baseurl}}/upgrade-to/snapshot-migrate/) on your existing nodes. A restart upgrade involves upgrading the entire cluster and restarting it, whereas a rolling upgrade requires upgrading and restarting nodes in the cluster one by one. +- Replace existing OpenSearch nodes with new OpenSearch nodes. Node replacement is most popular when upgrading [Docker clusters]({{site.url}}{{site.baseurl}}/upgrade-to/docker-upgrade-to/). Regardless of your approach, to safeguard against data loss, we recommend that you take a [snapshot]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore) of all indexes prior to any migration. diff --git a/_upgrade-to/upgrade-to.md b/_upgrade-to/upgrade-to.md index 340055b214..00950687a5 100644 --- a/_upgrade-to/upgrade-to.md +++ b/_upgrade-to/upgrade-to.md @@ -6,6 +6,10 @@ nav_order: 15 # Migrating from Elasticsearch OSS to OpenSearch + +OpenSearch provides a [Migration Assistant]({{site.url}}{{site.baseurl}}/migration-assistant/) to assist you in migrating from other search solutions. +{: .warning} + If you want to migrate from an existing Elasticsearch OSS cluster to OpenSearch and find the [snapshot approach]({{site.url}}{{site.baseurl}}/upgrade-to/snapshot-migrate/) unappealing, you can migrate your existing nodes from Elasticsearch OSS to OpenSearch. If your existing cluster runs an older version of Elasticsearch OSS, the first step is to upgrade to version 6.x or 7.x. diff --git a/assets/examples/ecommerce.json b/assets/examples/ecommerce.ndjson similarity index 100% rename from assets/examples/ecommerce.json rename to assets/examples/ecommerce.ndjson diff --git a/assets/js/copy-button.js b/assets/js/copy-button.js index c26cedfd1c..cb784f07d7 100644 --- a/assets/js/copy-button.js +++ b/assets/js/copy-button.js @@ -62,7 +62,7 @@ function addCurl(textToCopy) { result += path + "\""; if (body.length > 0) { - result += " -H 'Content-Type: application/json' -d'\n" + body + "'"; + result += " -H 'Content-Type: application/json' -d'\n" + body + "\n'"; } return result; diff --git a/assets/js/search.js b/assets/js/search.js index 8d9cab2ec5..86970d9544 100644 --- a/assets/js/search.js +++ b/assets/js/search.js @@ -173,7 +173,10 @@ const showNoResults = () => { emptyResults(); - elResults.appendChild(document.createRange().createContextualFragment('No results found!')); + const resultElement = document.createElement('div'); + resultElement.classList.add('search-page--results--no-results'); + resultElement.appendChild(document.createRange().createContextualFragment('No results found.')); + elResults.appendChild(resultElement); showResults(); elSpinner?.classList.remove(CLASSNAME_SPINNING); }; @@ -278,8 +281,6 @@ window.doResultsPageSearch = async (query, type, version) => { - console.log("Running results page search!"); - const searchResultsContainer = document.getElementById('searchPageResultsContainer'); try { @@ -291,7 +292,7 @@ window.doResultsPageSearch = async (query, type, version) => { if (data.results && data.results.length > 0) { data.results.forEach(result => { const resultElement = document.createElement('div'); - resultElement.classList.add('search-page--results--diplay--container--item'); + resultElement.classList.add('search-page--results--display--container--item'); const contentCite = document.createElement('cite'); const crumbs = 
[...result.ancestors]; @@ -302,11 +303,9 @@ window.doResultsPageSearch = async (query, type, version) => { const titleLink = document.createElement('a'); titleLink.href = result.url; + titleLink.classList.add('search-page--results--display--container--item--link'); titleLink.textContent = result.title; - titleLink.style.fontSize = '1.5em'; - titleLink.style.fontWeight = 'bold'; - titleLink.style.display = 'block'; - + const contentSpan = document.createElement('span'); contentSpan.textContent = result.content; contentSpan.style.display = 'block'; @@ -317,16 +316,10 @@ window.doResultsPageSearch = async (query, type, version) => { // Append the result element to the searchResultsContainer searchResultsContainer.appendChild(resultElement); - - const breakline = document.createElement('hr'); - breakline.style.border = '.5px solid #ccc'; - breakline.style.margin = 'auto'; - searchResultsContainer.appendChild(breakline); }); } else { const noResultsElement = document.createElement('div'); noResultsElement.textContent = 'No results found.'; - noResultsElement.style.fontSize = '2em'; searchResultsContainer.appendChild(noResultsElement); } } catch (error) { diff --git a/build.sh b/build.sh index 060bbfa666..85ef617931 100755 --- a/build.sh +++ b/build.sh @@ -1,3 +1,9 @@ #!/usr/bin/env bash -JEKYLL_LINK_CHECKER=internal bundle exec jekyll serve --host localhost --port 4000 --incremental --livereload --open-url --trace +host="localhost" + +if [[ "$DOCKER_BUILD" == "true" ]]; then + host="0.0.0.0" +fi + +JEKYLL_LINK_CHECKER=internal bundle exec jekyll serve --host ${host} --port 4000 --incremental --livereload --open-url --trace diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 0000000000..04dd007db9 --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,14 @@ +version: "3" + +services: + doc_builder: + image: ruby:3.2.4 + volumes: + - .:/app + working_dir: /app + ports: + - "4000:4000" + command: bash -c "bundler install && bash build.sh" + environment: + BUNDLE_PATH: /app/vendor/bundle # Avoid installing gems globally. + DOCKER_BUILD: true # Signify build.sh to bind to 0.0.0.0 for effective doc access from host. 
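As a usage note for the development Compose file above, a typical invocation (assumed from standard Docker Compose usage; the exact command is not stated in this diff) is `docker compose -f docker-compose.dev.yml up`, which installs the gems inside the container and serves the documentation on `http://localhost:4000`, matching the `4000:4000` port mapping.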
diff --git a/images/dashboards-assistant/alert-insight-insight.png b/images/dashboards-assistant/alert-insight-insight.png new file mode 100644 index 0000000000..3d65276d42 Binary files /dev/null and b/images/dashboards-assistant/alert-insight-insight.png differ diff --git a/images/dashboards-assistant/alert-insight-start.png b/images/dashboards-assistant/alert-insight-start.png new file mode 100644 index 0000000000..b55b6296bc Binary files /dev/null and b/images/dashboards-assistant/alert-insight-start.png differ diff --git a/images/dashboards-assistant/alert-insight-summary.png b/images/dashboards-assistant/alert-insight-summary.png new file mode 100644 index 0000000000..1e98170cda Binary files /dev/null and b/images/dashboards-assistant/alert-insight-summary.png differ diff --git a/images/dashboards-assistant/data-summary.png b/images/dashboards-assistant/data-summary.png new file mode 100644 index 0000000000..dc2e4e22f0 Binary files /dev/null and b/images/dashboards-assistant/data-summary.png differ diff --git a/images/dashboards-assistant/info-icon.png b/images/dashboards-assistant/info-icon.png new file mode 100644 index 0000000000..29e6b7b97b Binary files /dev/null and b/images/dashboards-assistant/info-icon.png differ diff --git a/images/dashboards-assistant/sparkle-icon.png b/images/dashboards-assistant/sparkle-icon.png new file mode 100644 index 0000000000..04b6d2b876 Binary files /dev/null and b/images/dashboards-assistant/sparkle-icon.png differ diff --git a/images/dashboards-assistant/suggestAD-UI.png b/images/dashboards-assistant/suggestAD-UI.png new file mode 100644 index 0000000000..dd7e32d6e2 Binary files /dev/null and b/images/dashboards-assistant/suggestAD-UI.png differ diff --git a/images/dashboards-assistant/suggestAD-button.png b/images/dashboards-assistant/suggestAD-button.png new file mode 100644 index 0000000000..a87fe862fe Binary files /dev/null and b/images/dashboards-assistant/suggestAD-button.png differ diff --git a/images/dashboards-assistant/t2viz-ask-question.png b/images/dashboards-assistant/t2viz-ask-question.png new file mode 100644 index 0000000000..e5b7e86f64 Binary files /dev/null and b/images/dashboards-assistant/t2viz-ask-question.png differ diff --git a/images/dashboards-assistant/t2viz-edit-visual-response.png b/images/dashboards-assistant/t2viz-edit-visual-response.png new file mode 100644 index 0000000000..1fd35425a7 Binary files /dev/null and b/images/dashboards-assistant/t2viz-edit-visual-response.png differ diff --git a/images/dashboards-assistant/t2viz-edit-visual.png b/images/dashboards-assistant/t2viz-edit-visual.png new file mode 100644 index 0000000000..0c57dd58aa Binary files /dev/null and b/images/dashboards-assistant/t2viz-edit-visual.png differ diff --git a/images/dashboards-assistant/t2viz-select-data-source.png b/images/dashboards-assistant/t2viz-select-data-source.png new file mode 100644 index 0000000000..172e136a5b Binary files /dev/null and b/images/dashboards-assistant/t2viz-select-data-source.png differ diff --git a/images/dashboards-assistant/t2viz-start.png b/images/dashboards-assistant/t2viz-start.png new file mode 100644 index 0000000000..f6d46a21e5 Binary files /dev/null and b/images/dashboards-assistant/t2viz-start.png differ diff --git a/images/migrations/migration-architecture-overview.svg b/images/migrations/migration-architecture-overview.svg new file mode 100644 index 0000000000..cf758653aa --- /dev/null +++ b/images/migrations/migration-architecture-overview.svg @@ -0,0 +1,2 @@ + +


[Diagram text content of migration-architecture-overview.svg omitted. The diagram depicts the migration architecture in the AWS Cloud: client traffic flows through an Application Load Balancer to a Capture Proxy ALB target group (Capture Proxy 1..N on Amazon ECS); captured events are streamed through Amazon MSK; Replayer and Reindex-from-Snapshot (RFS) tasks on Amazon ECS replay traffic and reindex the source cluster snapshot (stored in Amazon S3) into the target cluster; supporting components include a Migration Console, Amazon EFS, and Amazon CloudWatch for monitoring and analysis. Source and target clusters can be Amazon OpenSearch or self-managed Elasticsearch/OpenSearch.]
\ No newline at end of file diff --git a/images/migrations/migrations-architecture-overview.png b/images/migrations/migrations-architecture-overview.png new file mode 100644 index 0000000000..3002da3a87 Binary files /dev/null and b/images/migrations/migrations-architecture-overview.png differ diff --git a/images/star-tree-index.png b/images/star-tree-index.png new file mode 100644 index 0000000000..81309e1195 Binary files /dev/null and b/images/star-tree-index.png differ diff --git a/release-notes/opensearch-documentation-release-notes-2.17.0.md b/release-notes/opensearch-documentation-release-notes-2.17.0.md new file mode 100644 index 0000000000..d9ed51737c --- /dev/null +++ b/release-notes/opensearch-documentation-release-notes-2.17.0.md @@ -0,0 +1,36 @@ +# OpenSearch Documentation Website 2.17.0 Release Notes + +The OpenSearch 2.17.0 documentation includes the following additions and updates. + +## New documentation for 2.17.0 + +- Get offline batch inference details using task API in m [#8305](https://github.com/opensearch-project/documentation-website/pull/8305) +- Documentation for Binary Quantization Support with KNN Vector Search [#8281](https://github.com/opensearch-project/documentation-website/pull/8281) +- add offline batch ingestion tech doc [#8251](https://github.com/opensearch-project/documentation-website/pull/8251) +- Add documentation changes for disk-based k-NN [#8246](https://github.com/opensearch-project/documentation-website/pull/8246) +- Derived field updates for 2.17 [#8244](https://github.com/opensearch-project/documentation-website/pull/8244) +- Add changes for multiple signing keys [#8243](https://github.com/opensearch-project/documentation-website/pull/8243) +- Add documentation changes for Snapshot Status API [#8235](https://github.com/opensearch-project/documentation-website/pull/8235) +- Update flow framework additional fields in previous_node_inputs [#8233](https://github.com/opensearch-project/documentation-website/pull/8233) +- Add documentation changes for shallow snapshot v2 [#8207](https://github.com/opensearch-project/documentation-website/pull/8207) +- Add documentation for context and ABC templates [#8197](https://github.com/opensearch-project/documentation-website/pull/8197) +- Create documentation for snapshots with hashed prefix path type [#8196](https://github.com/opensearch-project/documentation-website/pull/8196) +- Adding documentation for remote index use in AD [#8191](https://github.com/opensearch-project/documentation-website/pull/8191) +- Doc update for concurrent search [#8181](https://github.com/opensearch-project/documentation-website/pull/8181) +- Adding new cluster search setting docs [#8180](https://github.com/opensearch-project/documentation-website/pull/8180) +- Add new settings for remote publication [#8176](https://github.com/opensearch-project/documentation-website/pull/8176) +- Grouping Top N queries documentation [#8173](https://github.com/opensearch-project/documentation-website/pull/8173) +- Document reprovision param for Update Workflow API [#8172](https://github.com/opensearch-project/documentation-website/pull/8172) +- Add documentation for Faiss byte vector [#8170](https://github.com/opensearch-project/documentation-website/pull/8170) +- Terms query can accept encoded terms input as bitmap [#8133](https://github.com/opensearch-project/documentation-website/pull/8133) +- Update doc for adding new param in cat shards action for cancellation… [#8127](https://github.com/opensearch-project/documentation-website/pull/8127) +- 
Add docs on skip_validating_missing_parameters in ml-commons connector [#8118](https://github.com/opensearch-project/documentation-website/pull/8118) +- Add Split Response Processor to 2.17 Search Pipeline docs [#8081](https://github.com/opensearch-project/documentation-website/pull/8081) +- Added documentation for FGAC for Flow Framework [#8076](https://github.com/opensearch-project/documentation-website/pull/8076) +- Remove composite agg limitations for concurrent search [#7904](https://github.com/opensearch-project/documentation-website/pull/7904) +- Add doc for nodes stats search.request.took fields [#7887](https://github.com/opensearch-project/documentation-website/pull/7887) +- Add documentation for ignore_hosts config option for ip-based rate limiting [#7859](https://github.com/opensearch-project/documentation-website/pull/7859) + +## Documentation for 2.17.0 experimental features + +- Document new experimental ingestion streaming APIs [#8123](https://github.com/opensearch-project/documentation-website/pull/8123) diff --git a/release-notes/opensearch-documentation-release-notes-2.18.0.md b/release-notes/opensearch-documentation-release-notes-2.18.0.md new file mode 100644 index 0000000000..30147a37f0 --- /dev/null +++ b/release-notes/opensearch-documentation-release-notes-2.18.0.md @@ -0,0 +1,39 @@ +# OpenSearch Documentation Website 2.18.0 Release Notes + +The OpenSearch 2.18.0 documentation includes the following additions and updates. + +## New documentation for 2.18.0 + +- Update SQL/PPL multiple value field limitation [#8646](https://github.com/opensearch-project/documentation-website/pull/8646) +- Add new use cases to ML Inference Search Response Processor [#8639](https://github.com/opensearch-project/documentation-website/pull/8639) +- Documentation for query field name and datatype in query shape [#8631](https://github.com/opensearch-project/documentation-website/pull/8631) +- add document for Query Insights health_stats API [#8627](https://github.com/opensearch-project/documentation-website/pull/8627) +- Add new indexing parameter and update performance tuning instruction [#8623](https://github.com/opensearch-project/documentation-website/pull/8623) +- Update default engine from nmslib to faiss [#8620](https://github.com/opensearch-project/documentation-website/pull/8620) +- Update documentation for coordination settings and batch size [#8604](https://github.com/opensearch-project/documentation-website/pull/8604) +- Update JDK version for 2.x distributions [#8603](https://github.com/opensearch-project/documentation-website/pull/8603) +- Add documentation for star tree index feature [#8598](https://github.com/opensearch-project/documentation-website/pull/8598) +- Add URI paths for cluster stats filtering. 
[#8595](https://github.com/opensearch-project/documentation-website/pull/8595) +- Adding documentation for _list APIs [#8594](https://github.com/opensearch-project/documentation-website/pull/8594) +- Adds documentation about byField rerank processor [#8593](https://github.com/opensearch-project/documentation-website/pull/8593) +- Updating tiered caching settings [#8592](https://github.com/opensearch-project/documentation-website/pull/8592) +- Add documentation changes for cluster level dynamic limit settings to block cat/indices, _cat/shards and _cat/segments [#8590](https://github.com/opensearch-project/documentation-website/pull/8590) +- Add doc for dynamic threadpool settings [#8588](https://github.com/opensearch-project/documentation-website/pull/8588) +- Add value range for the search backpressure settings [#8555](https://github.com/opensearch-project/documentation-website/pull/8555) +- Add new rename_alias parameters for restore-snapshot [#8544](https://github.com/opensearch-project/documentation-website/pull/8544) +- Add SQL PIT reference [#8541](https://github.com/opensearch-project/documentation-website/pull/8541) +- Document cluster.default_number_of_replicas and update index.number_of_replicas [#8526](https://github.com/opensearch-project/documentation-website/pull/8526) +- Msearch template API returns status code in each search response [#8522](https://github.com/opensearch-project/documentation-website/pull/8522) +- document the new `analysis-phonenumber` plugin [#8469](https://github.com/opensearch-project/documentation-website/pull/8469) +- Adds documentation for providing search pipeline id in the search/msearch request [#8372](https://github.com/opensearch-project/documentation-website/pull/8372) +- Data Stream support for Audit- Log [#8356](https://github.com/opensearch-project/documentation-website/pull/8356) +- Update documentation to reflect k-NN FAISS AVX512 support [#8307](https://github.com/opensearch-project/documentation-website/pull/8307) +- [Feature]: add ignore missing to text chunking processor [#8266](https://github.com/opensearch-project/documentation-website/pull/8266) +- Add documentation for workload management [#8228](https://github.com/opensearch-project/documentation-website/pull/8228) + +## In progress documentation for 2.18.0 + +- [Workspace] Add documentation for workspace and ACL [#8643](https://github.com/opensearch-project/documentation-website/pull/8643) +- add wlm feature overview [#8632](https://github.com/opensearch-project/documentation-website/pull/8632) +- Add querygroup lifecycle api documentation [#8628](https://github.com/opensearch-project/documentation-website/pull/8628) +- [Workload Management] Querygroup Lifecyle API docs [#8249](https://github.com/opensearch-project/documentation-website/pull/8249) diff --git a/spec-insert/.gitignore b/spec-insert/.gitignore new file mode 100644 index 0000000000..c9958b86d2 --- /dev/null +++ b/spec-insert/.gitignore @@ -0,0 +1,2 @@ +opensearch-openapi.yaml +rspec_examples.txt diff --git a/spec-insert/.rspec b/spec-insert/.rspec new file mode 100644 index 0000000000..c99d2e7396 --- /dev/null +++ b/spec-insert/.rspec @@ -0,0 +1 @@ +--require spec_helper diff --git a/spec-insert/.rubocop.yml b/spec-insert/.rubocop.yml new file mode 100644 index 0000000000..5b88e922f4 --- /dev/null +++ b/spec-insert/.rubocop.yml @@ -0,0 +1,29 @@ +require: rubocop-rake +AllCops: + Include: + - 'lib/**/*.rb' + - 'Rakefile' + NewCops: enable + +Metrics/CyclomaticComplexity: + Enabled: false +Metrics/MethodLength: + 
Enabled: false +Metrics/ParameterLists: + Enabled: false +Metrics/AbcSize: + Enabled: false +Metrics/PerceivedComplexity: + Enabled: false + +Layout/EmptyLineAfterGuardClause: + Enabled: false + +Style/MultilineBlockChain: + Enabled: false +Style/SingleLineMethods: + Enabled: false + +Naming/FileName: + Exclude: + - 'lib/jekyll-spec-insert.rb' # For Jekyll to recognize the plugin diff --git a/spec-insert/jekyll-spec-insert.gemspec b/spec-insert/jekyll-spec-insert.gemspec new file mode 100644 index 0000000000..d397f40af2 --- /dev/null +++ b/spec-insert/jekyll-spec-insert.gemspec @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +Gem::Specification.new do |spec| + spec.name = 'jekyll-spec-insert' + spec.version = '0.1.0' + spec.authors = ['Theo Truong'] + spec.email = ['theo.nam.truong@gmail.com'] + + spec.summary = 'A Jekyll plugin for inserting OpenSearch OpenAPI specifications into Jekyll sites.' + + spec.files = Dir['lib/**/*.rb'] + spec.require_paths = ['lib'] + + spec.metadata['rubygems_mfa_required'] = 'true' + spec.required_ruby_version = '>= 3.1.0' +end diff --git a/spec-insert/lib/api/action.rb b/spec-insert/lib/api/action.rb new file mode 100644 index 0000000000..5ad3dded77 --- /dev/null +++ b/spec-insert/lib/api/action.rb @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. + +# frozen_string_literal: true + +require_relative 'parameter' +require_relative 'operation' + +# A collection of operations that comprise a single API Action +# AKA operation-group +class Action + # @param [SpecHash] spec Parsed OpenAPI spec + def self.actions=(spec) + operations = spec.paths.flat_map do |url, ops| + ops.filter_map { |verb, op| Operation.new(op, url, verb) unless op['x-ignorable'] } + end + @actions = operations.group_by(&:group).values.map { |ops| Action.new(ops) }.index_by(&:full_name) + end + + # @return [Hash] API Actions indexed by operation-group + def self.actions + raise 'Actions not set' unless @actions + @actions + end + + # @return [Array] Operations in the action + attr_reader :operations + + # @param [Array] operations + def initialize(operations) + @operations = operations + @operation = operations.first + @spec = @operation&.spec + end + + # @return [Array] Input arguments. + def arguments; @arguments ||= Parameter.from_operations(@operations.map(&:spec)); end + + # @return [String] Full name of the action (i.e. 
namespace.action) + def full_name; @operation&.group; end + + # return [String] Name of the action + def name; @operation&.action; end + + # @return [String] Namespace of the action + def namespace; @operation&.namespace; end + + # @return [Array] Sorted unique HTTP verbs + def http_verbs; @operations.map(&:http_verb).uniq.sort; end + + # @return [Array] Unique URLs + def urls; @operations.map(&:url).uniq; end + + # @return [String] Description of the action + def description; @spec&.description; end + + # @return [Boolean] Whether the action is deprecated + def deprecated; @spec&.deprecated; end + + # @return [String] Deprecation message + def deprecation_message; @spec['x-deprecation-message']; end + + # @return [String] API reference + def api_reference; @operation&.external_docs&.url; end +end diff --git a/spec-insert/lib/api/operation.rb b/spec-insert/lib/api/operation.rb new file mode 100644 index 0000000000..6f9fb44cc4 --- /dev/null +++ b/spec-insert/lib/api/operation.rb @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. + +# frozen_string_literal: true + +# An API Operation +class Operation + # @return [Openapi3Parser::Node::Operation] Operation Spec + attr_reader :spec + # @return [String] URL + attr_reader :url + # @return [String] HTTP Verb + attr_reader :http_verb + # @return [String] Operation Group + attr_reader :group + # @return [String] API Action + attr_reader :action + # @return [String] API Namespace + attr_reader :namespace + + # @param [Openapi3Parser::Node::Operation] spec Operation Spec + # @param [String] url + # @param [String] http_verb + def initialize(spec, url, http_verb) + @spec = spec + @url = url + @http_verb = http_verb.upcase + @group = spec['x-operation-group'] + @action, @namespace = @group.split('.').reverse + end +end diff --git a/spec-insert/lib/api/parameter.rb b/spec-insert/lib/api/parameter.rb new file mode 100644 index 0000000000..fbd87fd50e --- /dev/null +++ b/spec-insert/lib/api/parameter.rb @@ -0,0 +1,94 @@ +# frozen_string_literal: true + +module ArgLocation + PATH = :path + QUERY = :query +end + +# Represents a parameter of an API action +class Parameter + # @return [String] The name of the parameter + attr_reader :name + # @return [String] The description of the parameter + attr_reader :description + # @return [Boolean] Whether the parameter is required + attr_reader :required + # @return [SpecHash] The JSON schema of the parameter + attr_reader :schema + # @return [String] Argument type in documentation + attr_reader :doc_type + # @return [String] The default value of the parameter + attr_reader :default + # @return [Boolean] Whether the parameter is deprecated + attr_reader :deprecated + # @return [String] The deprecation message + attr_reader :deprecation_message + # @return [String] The OpenSearch version when the parameter was deprecated + attr_reader :version_deprecated + # @return [ArgLocation] The location of the parameter + attr_reader :location + + def initialize(name:, description:, required:, schema:, default:, deprecated:, deprecation_message:, + version_deprecated:, location:) + @name = name + @description = description + @required = required + @schema = schema + @doc_type = get_doc_type(schema).gsub('String / List', 'List').gsub('List / String', 'List') + @default = default + @deprecated = deprecated + @deprecation_message = deprecation_message + @version_deprecated 
= version_deprecated + @location = location + end + + # @param [SpecHash | nil] schema + # @return [String | nil] Documentation type + def get_doc_type(schema) + return nil if schema.nil? + union = schema.anyOf || schema.oneOf + return union.map { |sch| get_doc_type(sch) }.join(' / ') unless union.nil? + return 'Integer' if schema.type == 'integer' + return 'Float' if schema.type == 'number' + return 'Boolean' if schema.type == 'boolean' + return 'String' if schema.type == 'string' + return 'NULL' if schema.type == 'null' + return 'List' if schema.type == 'array' + 'Object' + end + + # @param [SpecHash] Full OpenAPI spec + def self.global=(spec) + @global = spec.components.parameters.filter { |_, p| p['x-global'] }.map { |_, p| from_parameters([p], 1) } + end + + # @return [Array] Global parameters + def self.global + raise 'Global parameters not set' unless @global + @global + end + + # @param [Array] operations List of operations of the same group + # @return [Array] List of parameters of the operation group + def self.from_operations(operations) + operations.flat_map(&:parameters).filter { |param| !param['x-global'] } + .group_by(&:name).values.map { |params| from_parameters(params, operations.size) } + end + + # @param [Array] params List of parameters of the same name + # @param [Integer] opts_count Number of operations involved + # @return [Parameter] Single parameter distilled from the list + def self.from_parameters(params, opts_count) + param = params.first || SpecHash.new + schema = param&.schema || SpecHash.new + Parameter.new(name: param.name, + description: param.description || schema.description, + required: params.filter(&:required).size >= opts_count, + schema:, + default: param.default || schema.default, + deprecated: param.deprecated || schema.deprecated, + deprecation_message: param['x-deprecation-message'] || schema['x-deprecation-message'], + version_deprecated: param['x-version-deprecated'] || schema['x-version-deprecated'], + location: params.any? { |p| p.in == 'path' } ? ArgLocation::PATH : ArgLocation::QUERY) + end +end diff --git a/spec-insert/lib/doc_processor.rb b/spec-insert/lib/doc_processor.rb new file mode 100644 index 0000000000..0aaa01061a --- /dev/null +++ b/spec-insert/lib/doc_processor.rb @@ -0,0 +1,62 @@ +# frozen_string_literal: true + +require 'pathname' +require_relative 'renderers/spec_insert' +require_relative 'spec_insert_error' + +# Processes a file, replacing spec-insert blocks with rendered content +class DocProcessor + START_MARKER = /<!-- spec_insert_start/ + END_MARKER = /<!-- spec_insert_end -->/ + + def initialize(file_path, logger:) + @file_path = Pathname(file_path) + @logger = logger + end + + # Processes the file, replacing spec-insert blocks with rendered content + # @param [Boolean] write_to_file Whether to write the changes back to the file + def process(write_to_file: true) + relative_path = @file_path.relative_path_from(Pathname.new(Dir.pwd)) + lines = File.readlines(@file_path) + original_content = lines.join + insertions = find_insertions(lines) + return if insertions.empty? + + insertions.reverse_each { |start, finish, insert| lines[start..finish] = insert.render } + rendered_content = lines.join + if write_to_file && rendered_content != original_content + File.write(@file_path, rendered_content) + @logger.info "Spec components inserted into #{relative_path} successfully." + end + rendered_content + rescue SpecInsertError => e + @logger.error "Error processing #{relative_path}.
#{e.message}" + end + + private + + # @return Array<[Integer, Integer, SpecInsert]> + def find_insertions(lines) + start_indices = lines.each_with_index + .filter { |line, _index| line.match?(START_MARKER) } + .map { |_line, index| index } + end_indices = start_indices.map do |index| + (index..lines.length - 1).find { |i| lines[i].match?(END_MARKER) } + end.compact + + validate_markers!(start_indices, end_indices) + + start_indices.zip(end_indices).map do |start, finish| + [start, finish, SpecInsert.new(lines[start..finish])] + end + end + + # @param [Array] start_indices + # @param [Array] end_indices + def validate_markers!(start_indices, end_indices) + return if start_indices.length == end_indices.length && + start_indices.zip(end_indices).flatten.each_cons(2).all? { |a, b| a < b } + raise SpecInsertError, 'Mismatched "spec_insert_start" and "spec_insert_end" markers.' + end +end diff --git a/spec-insert/lib/insert_arguments.rb b/spec-insert/lib/insert_arguments.rb new file mode 100644 index 0000000000..6216b8a3e0 --- /dev/null +++ b/spec-insert/lib/insert_arguments.rb @@ -0,0 +1,72 @@ +# frozen_string_literal: true + +# Doc Insert Arguments +class InsertArguments + COLUMNS = %w[Parameter Description Required Type Default].freeze + DEFAULT_COLUMNS = %w[Parameter Type Description].freeze + attr_reader :raw + + # @param [Array] lines the lines between + def initialize(lines) + end_index = lines.each_with_index.find { |line, _index| line.match?(/^\s*-->/) }&.last&.- 1 + @raw = lines[1..end_index].filter { |line| line.include?(':') }.to_h do |line| + key, value = line.split(':') + [key.strip, value.strip] + end + end + + # @return [String] + def api + @raw['api'] + end + + # @return [String] + def component + @raw['component'] + end + + # @return [Array] + def columns + cols = parse_array(@raw['columns']) || DEFAULT_COLUMNS + invalid = cols - COLUMNS + raise ArgumentError, "Invalid column(s): #{invalid.join(', ')}" unless invalid.empty? + cols + end + + # @return [Boolean] + def pretty + parse_boolean(@raw['pretty'], default: false) + end + + # @return [Boolean] + def include_global + parse_boolean(@raw['include_global'], default: false) + end + + # @return [Boolean] + def include_deprecated + parse_boolean(@raw['include_deprecated'], default: true) + end + + # @return [Boolean] + def omit_header + parse_boolean(@raw['omit_header'], default: false) + end + + private + + # @param [String] value comma-separated array + def parse_array(value) + return nil if value.nil? + value.split(',').map(&:strip) + end + + # @param [String] value + # @param [Boolean] default value to return when nil + def parse_boolean(value, default:) + return default if value.nil? 
+ return true if value.in?(%w[true True TRUE yes Yes YES 1]) + return false if value.in?(%w[false False FALSE no No NO 0]) + raise ArgumentError, "Invalid boolean value: #{value}" + end +end diff --git a/spec-insert/lib/jekyll-spec-insert.rb b/spec-insert/lib/jekyll-spec-insert.rb new file mode 100644 index 0000000000..14a8997cc8 --- /dev/null +++ b/spec-insert/lib/jekyll-spec-insert.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +require 'active_support/all' +require 'listen' +require 'yaml' +require_relative 'spec_hash' +require_relative 'doc_processor' + +# Jekyll plugin to insert document components generated from the spec into the Jekyll site +class JekyllSpecInsert < Jekyll::Command + # @param [Mercenary::Program] prog + def self.init_with_program(prog) + prog.command(:'spec-insert') do |c| + c.syntax 'spec-insert [options]' + c.option 'watch', '--watch', '-W', 'Watch for changes and rebuild' + c.option 'refresh-spec', '--refresh-spec', '-R', 'Redownload the OpenSearch API specification' + c.action do |_args, options| + spec_file = File.join(Dir.pwd, 'spec-insert/opensearch-openapi.yaml') + excluded_paths = YAML.load_file('_config.yml')['exclude'] + download_spec(spec_file, forced: options['refresh-spec']) + SpecHash.load_file(spec_file) + run_once(excluded_paths) + watch(excluded_paths) if options['watch'] + end + end + end + + def self.download_spec(spec_file, forced: false) + return if !forced && File.exist?(spec_file) && (File.mtime(spec_file) > 1.day.ago) + Jekyll.logger.info 'Downloading OpenSearch API specification...' + system 'curl -L -X GET ' \ + 'https://github.com/opensearch-project/opensearch-api-specification' \ + '/releases/download/main-latest/opensearch-openapi.yaml ' \ + "-o #{spec_file}" + end + + def self.run_once(excluded_paths) + excluded_paths = excluded_paths.map { |path| File.join(Dir.pwd, path) } + Dir.glob(File.join(Dir.pwd, '**/*.md')) + .filter { |file| excluded_paths.none? 
{ |excluded| file.start_with?(excluded) } } + .each { |file| DocProcessor.new(file, logger: Jekyll.logger).process } + end + + def self.watch(excluded_paths) + Jekyll.logger.info "\nWatching for changes...\n" + excluded_paths = excluded_paths.map { |path| /\.#{path}$/ } + + Listen.to(Dir.pwd, only: /\.md$/, ignore: excluded_paths) do |modified, added, _removed| + (modified + added).each { |file| DocProcessor.new(file, logger: Jekyll.logger).process } + end.start + + trap('INT') { exit } + trap('TERM') { exit } + sleep + end +end diff --git a/spec-insert/lib/renderers/base_mustache_renderer.rb b/spec-insert/lib/renderers/base_mustache_renderer.rb new file mode 100644 index 0000000000..b3d756304c --- /dev/null +++ b/spec-insert/lib/renderers/base_mustache_renderer.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +require 'mustache' + +# Base Mustache Renderer +class BaseMustacheRenderer < Mustache + self.template_path = "#{__dir__}/templates" + + # @param [Action] action API Action + # @param [InsertArguments] args + def initialize(action, args) + super() + @action = action + @args = args + end + + def omit_header + @args.omit_header + end +end diff --git a/spec-insert/lib/renderers/parameter_table_renderer.rb b/spec-insert/lib/renderers/parameter_table_renderer.rb new file mode 100644 index 0000000000..23312962d8 --- /dev/null +++ b/spec-insert/lib/renderers/parameter_table_renderer.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +require_relative 'table_renderer' + +# Renders a table of parameters of an API action +class ParameterTableRenderer + # @param [Array] parameters + # @param [InsertArguments] args + def initialize(parameters, args) + @columns = args.columns + @pretty = args.pretty + @parameters = parameters + @parameters = @parameters.reject(&:deprecated) unless args.include_deprecated + @parameters = @parameters.sort_by { |arg| [arg.required ? 0 : 1, arg.deprecated ? 1 : 0, arg.name] } + end + + # @return [String] + def render + columns = @columns.map { |col| TableRenderer::Column.new(col, col) } + rows = @parameters.map { |arg| row(arg) } + TableRenderer.new(columns, rows, pretty: @pretty).render_lines.join("\n") + end + + private + + def row(param) + { + 'Parameter' => "`#{param.name}`#{'
_DEPRECATED_' if param.deprecated}", + 'Description' => description(param), + 'Required' => param.required ? 'Required' : nil, + 'Type' => param.doc_type, + 'Default' => param.default + } + end + + def description(param) + deprecation = deprecation(param) + required = param.required && @columns.exclude?('Required') ? '**(Required)** ' : '' + description = param.description.gsub("\n", ' ') + default = param.default.nil? || @columns.include?('Default') ? '' : " _(Default: #{param.default})_" + + "#{deprecation}#{required}#{description}#{default}" + end + + def deprecation(param) + message = ": #{param.deprecation_message}" if param.deprecation_message.present? + since = " since #{param.version_deprecated}" if param.version_deprecated.present? + "_(Deprecated#{since}#{message})_ " if param.deprecated + end +end diff --git a/spec-insert/lib/renderers/path_parameters.rb b/spec-insert/lib/renderers/path_parameters.rb new file mode 100644 index 0000000000..b1265bcf53 --- /dev/null +++ b/spec-insert/lib/renderers/path_parameters.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +require_relative 'base_mustache_renderer' +require_relative 'parameter_table_renderer' + +# Renders path parameters +class PathParameters < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/path_parameters.mustache" + + def table + params = @action.arguments.select { |arg| arg.location == ArgLocation::PATH } + ParameterTableRenderer.new(params, @args).render + end +end diff --git a/spec-insert/lib/renderers/paths_and_methods.rb b/spec-insert/lib/renderers/paths_and_methods.rb new file mode 100644 index 0000000000..0685c03b36 --- /dev/null +++ b/spec-insert/lib/renderers/paths_and_methods.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +require_relative 'base_mustache_renderer' + +# Renders paths and http methods +class PathsAndMethods < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/paths_and_methods.mustache" + + def operations + ljust = @action.operations.map { |op| op.http_verb.length }.max + @action.operations + .sort_by { |op| [op.url.length, op.http_verb] } + .map { |op| { verb: op.http_verb.ljust(ljust), path: op.url } } + end +end diff --git a/spec-insert/lib/renderers/query_parameters.rb b/spec-insert/lib/renderers/query_parameters.rb new file mode 100644 index 0000000000..37058ba5f1 --- /dev/null +++ b/spec-insert/lib/renderers/query_parameters.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +require_relative 'base_mustache_renderer' +require_relative 'parameter_table_renderer' + +# Renders query parameters
class QueryParameters < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/query_parameters.mustache" + + def table + params = @action.arguments.select { |arg| arg.location == ArgLocation::QUERY } + params += Parameter.global if @args.include_global + ParameterTableRenderer.new(params, @args).render + end +end diff --git a/spec-insert/lib/renderers/spec_insert.rb b/spec-insert/lib/renderers/spec_insert.rb new file mode 100644 index 0000000000..4d5ddb3803 --- /dev/null +++ b/spec-insert/lib/renderers/spec_insert.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true + +require_relative 'base_mustache_renderer' +require_relative '../insert_arguments' +require_relative '../api/action' +require_relative '../spec_insert_error' +require_relative 'paths_and_methods' +require_relative 'path_parameters' +require_relative 'query_parameters' + +# Class to render spec insertions +class SpecInsert < BaseMustacheRenderer + COMPONENTS = Set.new(%w[query_params
path_params paths_and_http_methods]).freeze + self.template_file = "#{__dir__}/templates/spec_insert.mustache" + + # @param [Array] arg_lines the lines between "" + def initialize(arg_lines) + args = InsertArguments.new(arg_lines) + action = Action.actions[args.api] + super(action, args) + raise SpecInsertError, '`api` argument not specified.' unless @args.api + raise SpecInsertError, "API Action '#{@args.api}' does not exist in the spec." unless @action + end + + def arguments + @args.raw.map { |key, value| { key:, value: } } + end + + def content + raise SpecInsertError, '`component` argument not specified.' unless @args.component + case @args.component.to_sym + when :query_parameters + QueryParameters.new(@action, @args).render + when :path_parameters + PathParameters.new(@action, @args).render + when :paths_and_http_methods + PathsAndMethods.new(@action, @args).render + else + raise SpecInsertError, "Invalid component: #{@args.component}" + end + end +end diff --git a/spec-insert/lib/renderers/table_renderer.rb b/spec-insert/lib/renderers/table_renderer.rb new file mode 100644 index 0000000000..1cabc435bd --- /dev/null +++ b/spec-insert/lib/renderers/table_renderer.rb @@ -0,0 +1,58 @@ +# frozen_string_literal: true + +# TableRenderer renders a markdown table with the given columns and rows +class TableRenderer + # Column object for rendering markdown tables + class Column + attr_reader :title, :key + attr_accessor :width + + # @param [String] title display title + # @param [String | Symbol] key key to access in row hash + def initialize(title, key) + @title = title + @key = key + @width = 0 + end + end + + # @param [Array] columns + # @param [Array] rows + # @param [Boolean] pretty whether to render a pretty table or a compact one + def initialize(columns, rows, pretty:) + @column = columns + @rows = rows + @pretty = pretty + end + + # @return [Array] + def render_lines + calculate_column_widths if @pretty + [render_column, render_divider] + render_rows + end + + private + + def calculate_column_widths + @column.each do |column| + column.width = [@rows.map { |row| row[column.key].to_s.length }.max || 0, column.title.length].max + end + end + + def render_column + columns = @column.map { |column| column.title.ljust(column.width) }.join(' | ') + @pretty ? "| #{columns} |" : columns + end + + def render_divider + dividers = @column.map { |column| ":#{'-' * [column.width + 1, 3].max}" } + @pretty ? "|#{dividers.join('|')}|" : dividers.join(' | ') + end + + def render_rows + @rows.map do |row| + cells = @column.map { |column| row[column.key].to_s.ljust(column.width).gsub('|', '\|') }.join(' | ') + @pretty ? 
"| #{cells} |" : cells + end + end +end diff --git a/spec-insert/lib/renderers/templates/path_parameters.mustache b/spec-insert/lib/renderers/templates/path_parameters.mustache new file mode 100644 index 0000000000..9d9a2df9d4 --- /dev/null +++ b/spec-insert/lib/renderers/templates/path_parameters.mustache @@ -0,0 +1,4 @@ +{{^omit_header}} +## Path parameters +{{/omit_header}} +{{{table}}} \ No newline at end of file diff --git a/spec-insert/lib/renderers/templates/paths_and_methods.mustache b/spec-insert/lib/renderers/templates/paths_and_methods.mustache new file mode 100644 index 0000000000..3c2df68011 --- /dev/null +++ b/spec-insert/lib/renderers/templates/paths_and_methods.mustache @@ -0,0 +1,8 @@ +{{^omit_header}} +## Paths and HTTP methods +{{/omit_header}} +```json +{{#operations}} +{{{verb}}} {{{path}}} +{{/operations}} +``` \ No newline at end of file diff --git a/spec-insert/lib/renderers/templates/query_parameters.mustache b/spec-insert/lib/renderers/templates/query_parameters.mustache new file mode 100644 index 0000000000..d7331d8f5a --- /dev/null +++ b/spec-insert/lib/renderers/templates/query_parameters.mustache @@ -0,0 +1,7 @@ +{{^omit_header}} +## Query parameters +{{#optional}} +All query parameters are optional. +{{/optional}} +{{/omit_header}} +{{{table}}} \ No newline at end of file diff --git a/spec-insert/lib/renderers/templates/spec_insert.mustache b/spec-insert/lib/renderers/templates/spec_insert.mustache new file mode 100644 index 0000000000..63b6323d48 --- /dev/null +++ b/spec-insert/lib/renderers/templates/spec_insert.mustache @@ -0,0 +1,7 @@ + +{{{content}}} + diff --git a/spec-insert/lib/spec_hash.rb b/spec-insert/lib/spec_hash.rb new file mode 100644 index 0000000000..06a872c9b9 --- /dev/null +++ b/spec-insert/lib/spec_hash.rb @@ -0,0 +1,60 @@ +# frozen_string_literal: true + +require 'yaml' +require_relative 'api/action' +require_relative 'api/parameter' + +# Spec class for parsing OpenAPI spec +# It's basically a wrapper around a Hash that allows for accessing hash values as object attributes +# and resolving of $refs +class SpecHash + def self.load_file(file_path) + @raw = YAML.load_file(file_path) + @parsed = SpecHash.new(@raw, parsed: false) + Action.actions = @parsed + Parameter.global = @parsed + end + + # @return [Hash] Raw OpenAPI Spec + class << self; attr_reader :raw; end + + # @return [Spec] Parsed OpenAPI Spec + class << self; attr_reader :parsed; end + + attr_reader :hash + + # @param [Hash] hash + def initialize(hash = {}, parsed: true) + @hash = parsed ? hash : parse(hash) + end + + def [](key) + parse(@hash[key]) + end + + def respond_to_missing?(name, include_private = false) + @hash.key?(name.to_s) || @hash.respond_to?(name) || super + end + + def method_missing(name, ...) + return @hash.send(name, ...) if @hash.respond_to?(name) + parse(@hash[name.to_s]) + end + + private + + def parse(value) + return value.map { |v| parse(v) } if value.is_a?(Array) + return value unless value.is_a?(Hash) + ref = value.delete('$ref') + value.transform_values! 
{ |v| parse(v) } + return SpecHash.new(value) unless ref + SpecHash.new(parse(resolve(ref)).merge(value)) + end + + def resolve(ref) + parts = ref.split('/') + parts.shift + self.class.raw.dig(*parts) + end +end diff --git a/spec-insert/lib/spec_insert_error.rb b/spec-insert/lib/spec_insert_error.rb new file mode 100644 index 0000000000..0ee5ccf159 --- /dev/null +++ b/spec-insert/lib/spec_insert_error.rb @@ -0,0 +1,4 @@ +# frozen_string_literal: true + +# Error unique to the SpecInsert process +class SpecInsertError < StandardError; end diff --git a/spec-insert/spec/_fixtures/actual_output/.gitignore b/spec-insert/spec/_fixtures/actual_output/.gitignore new file mode 100644 index 0000000000..de056073af --- /dev/null +++ b/spec-insert/spec/_fixtures/actual_output/.gitignore @@ -0,0 +1 @@ +**/*.md diff --git a/spec-insert/spec/_fixtures/expected_output/param_tables.md b/spec-insert/spec/_fixtures/expected_output/param_tables.md new file mode 100644 index 0000000000..596f185458 --- /dev/null +++ b/spec-insert/spec/_fixtures/expected_output/param_tables.md @@ -0,0 +1,43 @@ +Typical Path Parameters Example + + +## Path parameters +Parameter | Type | Description +:--- | :--- | :--- +`index` | List | Comma-separated list of data streams, indexes, and aliases to search. Supports wildcards (`*`). To search all data streams and indexes, omit this parameter or use `*` or `_all`. + + +Query Parameters Example with Global Parameters, Pretty Print, and Custom Columns + + +## Query parameters +| Type | Parameter | Description | Required | Default | +|:--------|:--------------------------|:-----------------------------------------------------------------------------------------------------------------------------------|:---------|:--------| +| Boolean | `analyze_wildcard` | If true, wildcard and prefix queries are analyzed. This parameter can only be used when the q query string parameter is specified. | Required | | +| String | `analyzer` | Analyzer to use for the query string. This parameter can only be used when the q query string parameter is specified. | | | +| Boolean | `pretty` | Whether to pretty format the returned JSON response. | | | +| Boolean | `human`
_DEPRECATED_ | _(Deprecated since 3.0: Use the `format` parameter instead.)_ Whether to return human readable values for statistics. | | | + + +Query Parameters Example with only Parameter and Description Columns + + +Parameter | Description +:--- | :--- +`analyze_wildcard` | **(Required)** If true, wildcard and prefix queries are analyzed. This parameter can only be used when the q query string parameter is specified. +`analyzer` | Analyzer to use for the query string. This parameter can only be used when the q query string parameter is specified. + diff --git a/spec-insert/spec/_fixtures/expected_output/paths_and_http_methods.md b/spec-insert/spec/_fixtures/expected_output/paths_and_http_methods.md new file mode 100644 index 0000000000..8ca1569b52 --- /dev/null +++ b/spec-insert/spec/_fixtures/expected_output/paths_and_http_methods.md @@ -0,0 +1,13 @@ + + +## Paths and HTTP methods +```json +GET /_search +POST /_search +GET /{index}/_search +POST /{index}/_search +``` + diff --git a/spec-insert/spec/_fixtures/input/param_tables.md b/spec-insert/spec/_fixtures/input/param_tables.md new file mode 100644 index 0000000000..d9f24e23f9 --- /dev/null +++ b/spec-insert/spec/_fixtures/input/param_tables.md @@ -0,0 +1,39 @@ +Typical Path Parameters Example + + +THIS + TEXT + SHOULD + BE + REPLACED + + +Query Parameters Example with Global Parameters, Pretty Print, and Custom Columns + + + THIS TEXT SHOULD BE REPLACED + + +Query Parameters Example with only Parameter and Description Columns + + +THIS +TEXT +SHOULD +BE +REPLACED + diff --git a/spec-insert/spec/_fixtures/input/paths_and_http_methods.md b/spec-insert/spec/_fixtures/input/paths_and_http_methods.md new file mode 100644 index 0000000000..0e92b8af8e --- /dev/null +++ b/spec-insert/spec/_fixtures/input/paths_and_http_methods.md @@ -0,0 +1,6 @@ + + + diff --git a/spec-insert/spec/_fixtures/opensearch_spec.yaml b/spec-insert/spec/_fixtures/opensearch_spec.yaml new file mode 100644 index 0000000000..7c67f27e69 --- /dev/null +++ b/spec-insert/spec/_fixtures/opensearch_spec.yaml @@ -0,0 +1,120 @@ +openapi: 3.1.0 +info: + title: OpenSearch API Specification + version: 1.0.0 + x-api-version: 2.16.0 +paths: + /_search: + get: + operationId: search.0 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' + post: + operationId: search.1 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' + /{index}/_search: + get: + operationId: search.2 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___path.index' + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' + post: + operationId: search.3 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. 
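+ # Shares x-operation-group 'search' with the operations above; the plugin groups operations by this value into a single Action.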
+ externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___path.index' + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' +components: + + parameters: + + _global___query.pretty: + name: pretty + in: query + description: Whether to pretty format the returned JSON response. + schema: + type: boolean + default: false + x-global: true + + _global___query.human: + name: human + in: query + description: Whether to return human readable values for statistics. + schema: + type: boolean + default: true + x-global: true + deprecated: true + x-version-deprecated: '3.0' + x-deprecation-message: Use the `format` parameter instead. + + search___path.index: + in: path + name: index + description: |- + Comma-separated list of data streams, indexes, and aliases to search. + Supports wildcards (`*`). + To search all data streams and indexes, omit this parameter or use `*` or `_all`. + required: true + schema: + $ref: '#/components/schemas/_common___Indices' + style: simple + + search___query.analyze_wildcard: + in: query + name: analyze_wildcard + required: true + description: |- + If true, wildcard and prefix queries are analyzed. + This parameter can only be used when the q query string parameter is specified. + schema: + type: boolean + default: false + style: form + + search___query.analyzer: + in: query + name: analyzer + description: |- + Analyzer to use for the query string. + This parameter can only be used when the q query string parameter is specified. + schema: + type: string + style: form + + schemas: + + _common___Indices: + oneOf: + - $ref: '#/components/schemas/_common___IndexName' + - type: array + items: + $ref: '#/components/schemas/_common___IndexName' + + _common___IndexName: + type: string diff --git a/spec-insert/spec/doc_processor_spec.rb b/spec-insert/spec/doc_processor_spec.rb new file mode 100644 index 0000000000..073613a2a9 --- /dev/null +++ b/spec-insert/spec/doc_processor_spec.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +require_relative 'spec_helper' +require_relative '../lib/doc_processor' +require_relative '../lib/spec_hash' + +describe DocProcessor do + SpecHash.load_file('spec/_fixtures/opensearch_spec.yaml') + + def test_file(file_name) + expected_output = File.read("#{__dir__}/_fixtures/expected_output/#{file_name}.md") + actual_output = described_class.new("#{__dir__}/_fixtures/input/#{file_name}.md", logger: Logger.new($stdout)).process(write_to_file: false) + File.write("./spec/_fixtures/actual_output/#{file_name}.md", actual_output) + expect(actual_output).to eq(expected_output) + end + + it 'inserts the param tables correctly' do + test_file('param_tables') + end + + it 'inserts the paths and http methods correctly' do + test_file('paths_and_http_methods') + end +end diff --git a/spec-insert/spec/spec_helper.rb b/spec-insert/spec/spec_helper.rb new file mode 100644 index 0000000000..74d9dc9bb9 --- /dev/null +++ b/spec-insert/spec/spec_helper.rb @@ -0,0 +1,102 @@ +# This file was generated by the `rspec --init` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. 
Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. + config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. + config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + + # This option will default to `:apply_to_host_groups` in RSpec 4 (and will + # have no way to turn it off -- the option exists only for backwards + # compatibility in RSpec 3). It causes shared context metadata to be + # inherited by the metadata hash of host groups and examples, rather than + # triggering implicit auto-inclusion in groups with matching metadata. + config.shared_context_metadata_behavior = :apply_to_host_groups + + # The settings below are suggested to provide a good initial experience + # with RSpec, but feel free to customize to your heart's content. + + # This allows you to limit a spec run to individual examples or groups + # you care about by tagging them with `:focus` metadata. When nothing + # is tagged with `:focus`, all examples get run. RSpec also provides + # aliases for `it`, `describe`, and `context` that include `:focus` + # metadata: `fit`, `fdescribe` and `fcontext`, respectively. + config.filter_run_when_matching :focus + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. We recommend + # you configure your source control system to ignore this file. + config.example_status_persistence_file_path = 'rspec_examples.txt' + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. For more details, see: + # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/ + config.disable_monkey_patching! + + # This setting enables warnings. It's recommended, but in some cases may + # be too noisy due to issues in dependencies. + config.warnings = true + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose expected_output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed expected_output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). 
+ config.default_formatter = 'doc' + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. + Kernel.srand config.seed + + config.expose_dsl_globally = true +end + +require 'active_support/all' +require 'rspec' diff --git a/templates/EXPERIMENTAL_TEMPLATE.md b/templates/EXPERIMENTAL_TEMPLATE.md index 6aa06c5824..b954a98a5d 100644 --- a/templates/EXPERIMENTAL_TEMPLATE.md +++ b/templates/EXPERIMENTAL_TEMPLATE.md @@ -10,5 +10,5 @@ parent: This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://example.issue.link). {: .warning} -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress the feature or if you want to leave feedback, join the discussion in the [OpenSearch forum](https://forum.opensearch.org/). -{: .warning} \ No newline at end of file +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning}
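For reviewers who want to exercise the new spec-insert plugin outside of a full Jekyll build, the following is a minimal sketch based on the classes added in this change (`SpecHash`, `Action`, and `DocProcessor`) and on the calls used in `spec/doc_processor_spec.rb`. It assumes the plugin's gems (`mustache`, `activesupport`) are installed via Bundler; the page path `docs/example-page.md` is a hypothetical example, while the fixture spec path is the one added under `spec-insert/spec/_fixtures/`.

```ruby
# Minimal sketch: render spec-insert components for a single Markdown page.
# Assumes it runs from the repository root; 'docs/example-page.md' is a
# hypothetical page that contains spec_insert_start/spec_insert_end markers.
require 'logger'
require 'active_support/all' # Action and Parameter use ActiveSupport helpers such as index_by
require_relative 'spec-insert/lib/spec_hash'
require_relative 'spec-insert/lib/doc_processor'

# Parse an OpenAPI spec and index its operations into Action objects.
SpecHash.load_file('spec-insert/spec/_fixtures/opensearch_spec.yaml')

# Inspect the grouped "search" action the same way the renderers do
# (expected values shown for the fixture spec above).
search = Action.actions['search']
puts search.http_verbs.inspect # => ["GET", "POST"]
puts search.urls.inspect       # => ["/_search", "/{index}/_search"]

# Render the page in memory without writing the result back to disk.
rendered = DocProcessor.new('docs/example-page.md', logger: Logger.new($stdout))
                       .process(write_to_file: false)
puts rendered
```

The same flow is what `bundle exec jekyll spec-insert` drives across the site's Markdown files, after first downloading the published OpenSearch API spec.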