From 9771b035c205b74eea28a138855bc721e09047c8 Mon Sep 17 00:00:00 2001 From: "Daniel (dB.) Doubrovkine" Date: Mon, 30 Sep 2024 18:39:18 -0400 Subject: [PATCH] Added persian_stem. (#592) Signed-off-by: dblock --- .cspell | 7 ++- CHANGELOG.md | 1 + spec/schemas/_common.analysis.yaml | 12 ++++ tests/default/_core/analyze.yaml | 60 ------------------- .../_core/analyze/filter/asciifolding.yaml | 23 +++++++ .../_core/analyze/filter/lowercase.yaml | 24 ++++++++ .../_core/analyze/filter/persian_stem.yaml | 23 +++++++ .../_core/analyze/filter/porterstem.yaml | 23 +++++++ tests/default/_core/analyze/filter/stop.yaml | 26 ++++++++ .../_core/analyze/filter/uppercase.yaml | 23 +++++++ 10 files changed, 160 insertions(+), 62 deletions(-) create mode 100644 tests/default/_core/analyze/filter/asciifolding.yaml create mode 100644 tests/default/_core/analyze/filter/lowercase.yaml create mode 100644 tests/default/_core/analyze/filter/persian_stem.yaml create mode 100644 tests/default/_core/analyze/filter/porterstem.yaml create mode 100644 tests/default/_core/analyze/filter/stop.yaml create mode 100644 tests/default/_core/analyze/filter/uppercase.yaml diff --git a/.cspell b/.cspell index df8953cd6..1576a9ec4 100644 --- a/.cspell +++ b/.cspell @@ -31,6 +31,7 @@ datarows decompounder Decompounder dedup +deprovision determinized distilbert DNFOF @@ -75,6 +76,7 @@ kstem kuromoji Kuromoji languageset +Léon localstats Lovins lucene @@ -137,6 +139,7 @@ Reindex relo reloadcerts remotestore +reprovision rerank Rerank Reranker @@ -192,5 +195,5 @@ vectory whoamiprotected wordnet Yrtsd -reprovision -deprovision \ No newline at end of file +جامد +جامدات \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 79f9c5354..0fb4ed38f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -98,6 +98,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Added `/_bulk/stream` ([#584](https://github.com/opensearch-project/opensearch-api-specification/pull/584)) - Added 
`/_plugins/_ml/agents/_register`, `/_plugins/_ml/connectors/_create`, `DELETE /_plugins/_ml/agents/{agent_id}`, `DELETE /_plugins/_ml/connectors/{connector_id}` ([#228](https://github.com/opensearch-project/opensearch-api-specification/issues/228)) - Added the `context` query param to the `put_script` APIs ([#586](https://github.com/opensearch-project/opensearch-api-specification/pull/586)) +- Added `persian_stem` filter ([#592](https://github.com/opensearch-project/opensearch-api-specification/pull/592)) ### Changed diff --git a/spec/schemas/_common.analysis.yaml b/spec/schemas/_common.analysis.yaml index e917bf0d0..f91d6b4b1 100644 --- a/spec/schemas/_common.analysis.yaml +++ b/spec/schemas/_common.analysis.yaml @@ -474,6 +474,7 @@ components: - $ref: '#/components/schemas/NoriPartOfSpeechTokenFilter' - $ref: '#/components/schemas/PatternCaptureTokenFilter' - $ref: '#/components/schemas/PatternReplaceTokenFilter' + - $ref: '#/components/schemas/PersianStemTokenFilter' - $ref: '#/components/schemas/PorterStemTokenFilter' - $ref: '#/components/schemas/PredicateTokenFilter' - $ref: '#/components/schemas/RemoveDuplicatesTokenFilter' @@ -894,6 +895,17 @@ components: required: - pattern - type + PersianStemTokenFilter: + allOf: + - $ref: '#/components/schemas/TokenFilterBase' + - type: object + properties: + type: + type: string + enum: + - persian_stem + required: + - type PorterStemTokenFilter: allOf: - $ref: '#/components/schemas/TokenFilterBase' diff --git a/tests/default/_core/analyze.yaml b/tests/default/_core/analyze.yaml index 659ec9a96..a5969d930 100644 --- a/tests/default/_core/analyze.yaml +++ b/tests/default/_core/analyze.yaml @@ -30,63 +30,3 @@ chapters: - Moneyball, directed by Bennett Miller response: status: 200 - - synopsis: Apply a filter. 
- path: /_analyze - method: GET - request: - payload: - tokenizer: keyword - filter: - - uppercase - text: Moneyball - response: - status: 200 - payload: - tokens: - - token: MONEYBALL - type: word - start_offset: 0 - end_offset: 9 - position: 0 - - synopsis: Apply a character filter. - path: /_analyze - method: GET - request: - payload: - tokenizer: keyword - filter: - - lowercase - char_filter: - - html_strip - text: <b>Moneyball</b> - response: - status: 200 - payload: - tokens: - - token: moneyball - type: word - start_offset: 3 - end_offset: 16 - position: 0 - - synopsis: Combine a lowercase translation with a stop filter. - path: /_analyze - method: GET - request: - payload: - tokenizer: whitespace - filter: - - lowercase - - type: stop - stopwords: - - in - - to - text: Moneyball directed by Bennett Miller - response: - status: 200 - payload: - tokens: - - token: moneyball - type: word - start_offset: 0 - end_offset: 9 - position: 0 diff --git a/tests/default/_core/analyze/filter/asciifolding.yaml b/tests/default/_core/analyze/filter/asciifolding.yaml new file mode 100644 index 000000000..b2af17dcf --- /dev/null +++ b/tests/default/_core/analyze/filter/asciifolding.yaml @@ -0,0 +1,23 @@ +$schema: ../../../../../json_schemas/test_story.schema.yaml + +description: Test /_analyze with a filter. +version: '>= 2.17' +chapters: + - synopsis: Apply an asciifolding filter. 
+ path: /_analyze + method: GET + request: + payload: + tokenizer: keyword + filter: + - asciifolding + text: Léon + response: + status: 200 + payload: + tokens: + - token: Leon + type: word + start_offset: 0 + end_offset: 4 + position: 0 \ No newline at end of file diff --git a/tests/default/_core/analyze/filter/lowercase.yaml b/tests/default/_core/analyze/filter/lowercase.yaml new file mode 100644 index 000000000..cf80ae09b --- /dev/null +++ b/tests/default/_core/analyze/filter/lowercase.yaml @@ -0,0 +1,24 @@ +$schema: ../../../../../json_schemas/test_story.schema.yaml + +description: Test /_analyze with a filter. +chapters: + - synopsis: Apply a lowercase filter with an html_strip character filter. + path: /_analyze + method: GET + request: + payload: + tokenizer: keyword + filter: + - lowercase + char_filter: + - html_strip + text: <b>Moneyball</b> + response: + status: 200 + payload: + tokens: + - token: moneyball + type: word + start_offset: 3 + end_offset: 16 + position: 0 \ No newline at end of file diff --git a/tests/default/_core/analyze/filter/persian_stem.yaml b/tests/default/_core/analyze/filter/persian_stem.yaml new file mode 100644 index 000000000..973ecbaae --- /dev/null +++ b/tests/default/_core/analyze/filter/persian_stem.yaml @@ -0,0 +1,23 @@ +$schema: ../../../../../json_schemas/test_story.schema.yaml + +description: Test /_analyze with a filter. +version: '>= 2.17' +chapters: + - synopsis: Apply a persian_stem filter. 
+ path: /_analyze + method: GET + request: + payload: + tokenizer: keyword + filter: + - persian_stem + text: جامدات + response: + status: 200 + payload: + tokens: + - token: جامد + type: word + start_offset: 0 + end_offset: 6 + position: 0 \ No newline at end of file diff --git a/tests/default/_core/analyze/filter/porterstem.yaml b/tests/default/_core/analyze/filter/porterstem.yaml new file mode 100644 index 000000000..d7b9d4a06 --- /dev/null +++ b/tests/default/_core/analyze/filter/porterstem.yaml @@ -0,0 +1,23 @@ +$schema: ../../../../../json_schemas/test_story.schema.yaml + +description: Test /_analyze with a filter. +version: '>= 2.17' +chapters: + - synopsis: Apply a porter_stem filter. + path: /_analyze + method: GET + request: + payload: + tokenizer: keyword + filter: + - porter_stem + text: Directed by Bennett Miller + response: + status: 200 + payload: + tokens: + - token: Directed by Bennett Mil + type: word + start_offset: 0 + end_offset: 26 + position: 0 \ No newline at end of file diff --git a/tests/default/_core/analyze/filter/stop.yaml b/tests/default/_core/analyze/filter/stop.yaml new file mode 100644 index 000000000..bdcf78480 --- /dev/null +++ b/tests/default/_core/analyze/filter/stop.yaml @@ -0,0 +1,26 @@ +$schema: ../../../../../json_schemas/test_story.schema.yaml + +description: Test /_analyze with a filter. +chapters: + - synopsis: Combine a lowercase translation with a stop filter. 
+ path: /_analyze + method: GET + request: + payload: + tokenizer: whitespace + filter: + - lowercase + - type: stop + stopwords: + - in + - to + text: Moneyball directed by Bennett Miller + response: + status: 200 + payload: + tokens: + - token: moneyball + type: word + start_offset: 0 + end_offset: 9 + position: 0 diff --git a/tests/default/_core/analyze/filter/uppercase.yaml b/tests/default/_core/analyze/filter/uppercase.yaml new file mode 100644 index 000000000..e689916ff --- /dev/null +++ b/tests/default/_core/analyze/filter/uppercase.yaml @@ -0,0 +1,23 @@ +$schema: ../../../../../json_schemas/test_story.schema.yaml + +description: Test /_analyze with a filter. +chapters: + - synopsis: Apply an uppercase filter. + path: /_analyze + method: GET + request: + payload: + tokenizer: keyword + filter: + - uppercase + text: Moneyball + response: + status: 200 + payload: + tokens: + - token: MONEYBALL + type: word + start_offset: 0 + end_offset: 9 + position: 0 +