From f443445fe0dca943edefbcc3ebe219a0cd5092e8 Mon Sep 17 00:00:00 2001
From: AntonEliatra
Date: Thu, 1 Aug 2024 12:04:24 +0100
Subject: [PATCH] adding apostrophe token filter page #7871

Signed-off-by: AntonEliatra
---
 .../token-filters/apostrophe-token-filter.md | 116 ++++++++++++++++++
 _analyzers/token-filters/index.md            |   2 +-
 2 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 _analyzers/token-filters/apostrophe-token-filter.md

diff --git a/_analyzers/token-filters/apostrophe-token-filter.md b/_analyzers/token-filters/apostrophe-token-filter.md
new file mode 100644
index 0000000000..0ee06e79e5
--- /dev/null
+++ b/_analyzers/token-filters/apostrophe-token-filter.md
@@ -0,0 +1,116 @@
+---
+layout: default
+title: Apostrophe token filter
+parent: Token filters
+nav_order: 110
+---
+
+# Apostrophe token filter
+
+The `apostrophe` token filter removes the apostrophe and all characters following it from each token. This is especially useful for analyzing text in languages that rely heavily on apostrophes, such as Turkish, in which apostrophes separate the root word from suffixes, including possessive suffixes, case markers, and other grammatical endings.
+
+## Example
+
+The following example creates a new index named `custom_text_index` with a custom analyzer configured in `settings` and applied in `mappings`. The analyzer uses the `standard` tokenizer, which splits text into words, followed by the `lowercase` and `apostrophe` token filters.
+
+```
+PUT /custom_text_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "custom_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "apostrophe"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "custom_analyzer"
+      }
+    }
+  }
+}
+```
+
+## Check generated tokens
+
+You can use the following command to examine the tokens generated by the new analyzer.
+
+```
+POST /custom_text_index/_analyze
+{
+  "analyzer": "custom_analyzer",
+  "text": "John's car is faster than Peter's bike"
+}
+```
+
+Expected result:
+
+```
+{
+  "tokens": [
+    {
+      "token": "john",
+      "start_offset": 0,
+      "end_offset": 6,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "car",
+      "start_offset": 7,
+      "end_offset": 10,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "is",
+      "start_offset": 11,
+      "end_offset": 13,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "faster",
+      "start_offset": 14,
+      "end_offset": 20,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "than",
+      "start_offset": 21,
+      "end_offset": 25,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "peter",
+      "start_offset": 26,
+      "end_offset": 33,
+      "type": "<ALPHANUM>",
+      "position": 5
+    },
+    {
+      "token": "bike",
+      "start_offset": 34,
+      "end_offset": 38,
+      "type": "<ALPHANUM>",
+      "position": 6
+    }
+  ]
+}
+```
+
+The built-in `apostrophe` token filter is not suitable for languages such as French, in which apostrophes appear at the beginning of words. For example, "C'est l'amour de l'école" results in four tokens: "C", "l", "de", and "l".
+{: .note}

diff --git a/_analyzers/token-filters/index.md b/_analyzers/token-filters/index.md
index e6d9875736..6cbddebe70 100644
--- a/_analyzers/token-filters/index.md
+++ b/_analyzers/token-filters/index.md
@@ -13,7 +13,7 @@ Token filters receive the stream of tokens from the tokenizer and add, remove, o
 The following table lists all token filters that OpenSearch supports.
 
 Token filter | Underlying Lucene token filter | Description
-`apostrophe` | [ApostropheFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/ApostropheFilter.html) | In each token that contains an apostrophe, the `apostrophe` token filter removes the apostrophe itself and all characters following the apostrophe.
+[`apostrophe`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/apostrophe-token-filter/) | [ApostropheFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/ApostropheFilter.html) | In each token that contains an apostrophe, the `apostrophe` token filter removes the apostrophe itself and all characters following the apostrophe.
 `asciifolding` | [ASCIIFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html) | Converts alphabetic, numeric, and symbolic characters.
 `cjk_bigram` | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens.
 `cjk_width` | [CJKWidthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html) | Normalizes Chinese, Japanese, and Korean (CJK) tokens according to the following rules: <br> - Folds full-width ASCII character variants into the equivalent basic Latin characters. <br> - Folds half-width Katakana character variants into the equivalent Kana characters.
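
As a quick check of the French-language caveat noted on the new page, the filter can be exercised directly using the `_analyze` API with an inline analyzer definition. This is a minimal sketch that assumes only a running OpenSearch cluster; no index is required:

```
POST /_analyze
{
  "tokenizer": "standard",
  "filter": ["apostrophe"],
  "text": "C'est l'amour de l'école"
}
```

Because the `standard` tokenizer keeps each apostrophe inside its token, the filter truncates every token at its apostrophe, leaving only "C", "l", "de", and "l" and discarding the root words. For French text, a filter such as `elision` is generally the better choice.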