From ce7a98330447d0eab373d5f1fad277cdd85d3489 Mon Sep 17 00:00:00 2001 From: Joe McElroy Date: Mon, 24 Jun 2024 16:17:09 +0100 Subject: [PATCH] [Search] [Playground] [Bug Fix] Model id detection fix when using search based dynamic template (#186665) ## Summary model_id detection relies on creating an aggregation for the model_id field, which requires a keyword field. This is usually added as the `.keyword` multi-field. The issue is that when using the `search` dynamic template, all text-based fields create their keyword field under the `enum` key instead. This fix iterates through all the subfields of the model_id field to find a keyword-based one, rather than relying on the `.keyword` subfield. ### Checklist Delete any items that are not applicable to this PR. - [ ] Any text added follows [EUI's writing guidelines](https://elastic.github.io/eui/#/guidelines/writing), uses sentence case text and includes [i18n support](https://github.com/elastic/kibana/blob/main/packages/kbn-i18n/README.md) - [ ] [Documentation](https://www.elastic.co/guide/en/kibana/master/development-documentation.html) was added for features that require explanation or tutorials - [x] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios - [ ] [Flaky Test Runner](https://ci-stats.kibana.dev/trigger_flaky_test_runner/1) was used on any tests changed - [ ] Any UI touched in this PR is usable by keyboard only (learn more about [keyboard accessibility](https://webaim.org/techniques/keyboard/)) - [ ] Any UI touched in this PR does not create any new axe failures (run axe in browser: [FF](https://addons.mozilla.org/en-US/firefox/addon/axe-devtools/), [Chrome](https://chrome.google.com/webstore/detail/axe-web-accessibility-tes/lhdoppojpmngadmnindnejefpokejbdd?hl=en-US)) - [ ] If a plugin configuration key changed, check if it needs to be allowlisted in the cloud and added to the [docker 
list](https://github.com/elastic/kibana/blob/main/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker) - [ ] This renders correctly on smaller devices using a responsive layout. (You can test this [in your browser](https://www.browserstack.com/guide/responsive-testing-on-local-server)) - [ ] This was checked for [cross-browser compatibility](https://www.elastic.co/support/matrix#matrix_browsers) (cherry picked from commit 175b41a6fa341c4f19c7bdeb1931f27baa344bf7) # Conflicts: # x-pack/plugins/search_playground/__mocks__/fetch_query_source_fields.mock.ts # x-pack/plugins/search_playground/server/lib/fetch_query_source_fields.test.ts --- .../fetch_query_source_fields.mock.ts | 817 +++++++++++++++++- .../lib/fetch_query_source_fields.test.ts | 50 ++ .../server/lib/fetch_query_source_fields.ts | 20 +- 3 files changed, 871 insertions(+), 16 deletions(-) diff --git a/x-pack/plugins/search_playground/__mocks__/fetch_query_source_fields.mock.ts b/x-pack/plugins/search_playground/__mocks__/fetch_query_source_fields.mock.ts index 355494b4a217e..7ab5f261989a4 100644 --- a/x-pack/plugins/search_playground/__mocks__/fetch_query_source_fields.mock.ts +++ b/x-pack/plugins/search_playground/__mocks__/fetch_query_source_fields.mock.ts @@ -5,7 +5,256 @@ * 2.0. 
*/ -import { SearchResponse } from '@elastic/elasticsearch/lib/api/types'; +import { IndicesGetMappingResponse, SearchResponse } from '@elastic/elasticsearch/lib/api/types'; + +export const SPARSE_SEMANTIC_FIELD_FIELD_CAPS = { + indices: ['test-index2'], + fields: { + infer_field: { + semantic_text: { + type: 'semantic_text', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + 'infer_field.inference.chunks.embeddings': { + sparse_vector: { + type: 'sparse_vector', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + non_infer_field: { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'infer_field.inference.chunks.text': { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + 'infer_field.inference': { + object: { + type: 'object', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + 'infer_field.inference.chunks': { + nested: { + type: 'nested', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + }, +}; + +export const SPARSE_SEMANTIC_FIELD_MAPPINGS = { + 'test-index2': { + mappings: { + properties: { + infer_field: { + type: 'semantic_text', + inference_id: 'elser-endpoint', + model_settings: { + task_type: 'sparse_embedding', + }, + }, + non_infer_field: { + type: 'text', + }, + }, + }, + }, +} as any as IndicesGetMappingResponse; + +export const DENSE_SEMANTIC_FIELD_MAPPINGS = { + 'test-index2': { + mappings: { + properties: { + infer_field: { + type: 'semantic_text', + inference_id: 'cohere', + model_settings: { + task_type: 'text_embedding', + dimensions: 1536, + similarity: 'dot_product', + }, + }, + non_infer_field: { + type: 'text', + }, + }, + }, + }, +} as any as IndicesGetMappingResponse; + +// for when semantic_text field hasn't been mapped with task_type +// when theres no data / no inference has been performed in the field 
+export const DENSE_SEMANTIC_FIELD_MAPPINGS_MISSING_TASK_TYPE = { + 'test-index2': { + mappings: { + properties: { + infer_field: { + type: 'semantic_text', + inference_id: 'cohere', + model_settings: { + dimensions: 1536, + similarity: 'dot_product', + }, + }, + non_infer_field: { + type: 'text', + }, + }, + }, + }, +} as any as IndicesGetMappingResponse; + +export const DENSE_SEMANTIC_FIELD_FIELD_CAPS = { + indices: ['test-index2'], + fields: { + infer_field: { + semantic_text: { + type: 'semantic_text', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + 'infer_field.inference.chunks.embeddings': { + sparse_vector: { + type: 'dense_vector', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + non_infer_field: { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'infer_field.inference.chunks.text': { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + 'infer_field.inference': { + object: { + type: 'object', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + 'infer_field.inference.chunks': { + nested: { + type: 'nested', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + }, +}; + +export const DENSE_SPARSE_SAME_FIELD_NAME_CAPS = { + indices: ['cohere-embeddings', 'elser_index'], + fields: { + text_embedding: { + sparse_vector: { + type: 'sparse_vector', + metadata_field: false, + searchable: true, + aggregatable: false, + indices: ['elser_index'], + }, + dense_vector: { + type: 'dense_vector', + metadata_field: false, + searchable: true, + aggregatable: false, + indices: ['cohere-embeddings'], + }, + }, + model_id: { + text: { type: 'text', metadata_field: false, searchable: true, aggregatable: false }, + }, + text: { text: { type: 'text', metadata_field: false, searchable: true, aggregatable: false } }, + 'model_id.keyword': { + 
keyword: { type: 'keyword', metadata_field: false, searchable: true, aggregatable: true }, + }, + }, +}; + +export const DENSE_OLD_PIPELINE_DOCS = [ + { + took: 1, + timed_out: false, + _shards: { total: 1, successful: 1, skipped: 0, failed: 0 }, + hits: { total: { value: 1, relation: 'eq' }, max_score: null, hits: [] }, + aggregations: { + 'ml.inference.body_content.model_id': { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [ + { + key: '.multilingual-e5-small_linux-x86_64', + doc_count: 1, + }, + ], + }, + }, + } as SearchResponse, +]; + +export const DENSE_SPARSE_SAME_FIELD_NAME_DOCS = [ + { + took: 1, + timed_out: false, + _shards: { total: 1, successful: 1, skipped: 0, failed: 0 }, + hits: { total: { value: 1, relation: 'eq' }, max_score: null, hits: [] }, + aggregations: { + model_id: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [{ key: 'cohere_embeddings', doc_count: 1 }], + }, + }, + } as SearchResponse, + { + took: 0, + timed_out: false, + _shards: { total: 1, successful: 1, skipped: 0, failed: 0 }, + hits: { total: { value: 1, relation: 'eq' }, max_score: null, hits: [] }, + aggregations: { + model_id: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [{ key: 'my-elser-model', doc_count: 1 }], + }, + }, + } as SearchResponse, +]; export const ELSER_PASSAGE_CHUNKED_TWO_INDICES_DOCS = [ { @@ -731,6 +980,572 @@ export const ELSER_PASSAGE_CHUNKED_TWO_INDICES = { }, }; +export const DENSE_PIPELINE_FIELD_CAPS = { + indices: ['search-test-e5'], + fields: { + additional_urls: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + 'title.stem': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + '_ingest.processors.pipeline.stem': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'headings.delimiter': { + text: { + type: 
'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'ml.inference.body_content.model_id.prefix': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'headings.enum': { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + '_ingest.processors': { + object: { + type: 'object', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + '_ingest.processors.types.joined': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'body_content.enum': { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + links: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + id: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + 'ml.inference.body_content.model_id.joined': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + ml: { + object: { + type: 'object', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + 'ml.inference.body_content.model_id': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'ml.inference': { + object: { + type: 'object', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + body_content: { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + '_ingest.processors.pipeline.joined': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + domains: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + '_ingest.processors.model_version.enum': { + 
keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + 'body_content.joined': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + url_scheme: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + meta_description: { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'ml.inference.body_content': { + object: { + type: 'object', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + headings: { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + '_ingest.processors.types.enum': { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + last_crawled_at: { + date: { + type: 'date', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + '_ingest.processors.model_version.joined': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'title.delimiter': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'headings.prefix': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'title.enum': { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + '_ingest.processors.pipeline.enum': { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + '_ingest.processors.pipeline.prefix': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'meta_description.prefix': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 
'_ingest.processors.types.delimiter': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'title.joined': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'body_content.stem': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + '_ingest.processors.types.stem': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'ml.inference.body_content.model_id.enum': { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + title: { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + meta_keywords: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + '_ingest.processors.processed_timestamp': { + date: { + type: 'date', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + 'ml.inference.body_content.model_id.delimiter': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'meta_description.enum': { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + 'meta_description.delimiter': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'title.prefix': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + '_ingest.processors.pipeline': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + _ingest: { + object: { + type: 'object', + metadata_field: false, + searchable: false, + aggregatable: false, + }, + }, + 'ml.inference.body_content.is_truncated': { + boolean: { + type: 'boolean', + metadata_field: 
false, + searchable: true, + aggregatable: true, + }, + }, + '_ingest.processors.model_version.prefix': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + '_ingest.processors.model_version.delimiter': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + url_host: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + url_path: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + '_ingest.processors.model_version': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + url_path_dir3: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + '_ingest.processors.pipeline.delimiter': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'headings.joined': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + '_ingest.processors.types': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'meta_description.joined': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'ml.inference.body_content.predicted_value': { + dense_vector: { + type: 'dense_vector', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + url: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + 'meta_description.stem': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'ml.inference.body_content.model_id.stem': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, 
+ url_port: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + 'body_content.delimiter': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + '_ingest.processors.model_version.stem': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + url_path_dir2: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + url_path_dir1: { + keyword: { + type: 'keyword', + metadata_field: false, + searchable: true, + aggregatable: true, + }, + }, + '_ingest.processors.types.prefix': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'headings.stem': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + 'body_content.prefix': { + text: { + type: 'text', + metadata_field: false, + searchable: true, + aggregatable: false, + }, + }, + }, +}; + export const ELSER_PASSAGE_CHUNKED = { indices: ['search-nethys'], fields: { diff --git a/x-pack/plugins/search_playground/server/lib/fetch_query_source_fields.test.ts b/x-pack/plugins/search_playground/server/lib/fetch_query_source_fields.test.ts index 108ddabb0a73f..fccc337ed3561 100644 --- a/x-pack/plugins/search_playground/server/lib/fetch_query_source_fields.test.ts +++ b/x-pack/plugins/search_playground/server/lib/fetch_query_source_fields.test.ts @@ -19,6 +19,8 @@ import { SPARSE_INPUT_OUTPUT_ONE_INDEX, SPARSE_INPUT_OUTPUT_ONE_INDEX_FIELD_CAPS, SPARSE_INPUT_OUTPUT_ONE_INDEX_FIELD_CAPS_MODEL_ID_KEYWORD, + DENSE_PIPELINE_FIELD_CAPS, + DENSE_OLD_PIPELINE_DOCS, } from '../../__mocks__/fetch_query_source_fields.mock'; import { fetchFields, @@ -320,6 +322,54 @@ describe('fetch_query_source_fields', () => { }); }); + it('should perform a search request with the correct modelid for old style inference', async () => 
{ + const client = { + asCurrentUser: { + fieldCaps: jest.fn().mockResolvedValue(DENSE_PIPELINE_FIELD_CAPS), + search: jest.fn().mockResolvedValue(DENSE_OLD_PIPELINE_DOCS[0]), + indices: { + getMapping: jest.fn().mockResolvedValue({ + 'search-test-e5': { + mappings: {}, + }, + }), + }, + }, + } as any; + const indices = ['search-test-e5']; + const response = await fetchFields(client, indices); + expect(client.asCurrentUser.search).toHaveBeenCalledWith({ + index: 'search-test-e5', + body: { + size: 0, + aggs: { + 'ml.inference.body_content.model_id': { + terms: { + field: 'ml.inference.body_content.model_id.enum', + size: 1, + }, + }, + }, + }, + }); + expect(response).toEqual({ + 'search-test-e5': { + bm25_query_fields: expect.any(Array), + dense_vector_query_fields: [ + { + field: 'ml.inference.body_content.predicted_value', + indices: ['search-test-e5'], + model_id: '.multilingual-e5-small_linux-x86_64', + }, + ], + elser_query_fields: [], + semantic_fields: [], + source_fields: expect.any(Array), + skipped_fields: 30, + }, + }); + }); + it('should perform a search request with the correct parameters with top level model id', async () => { const client = { asCurrentUser: { diff --git a/x-pack/plugins/search_playground/server/lib/fetch_query_source_fields.ts b/x-pack/plugins/search_playground/server/lib/fetch_query_source_fields.ts index b333a0e95962b..2a6068fcb6902 100644 --- a/x-pack/plugins/search_playground/server/lib/fetch_query_source_fields.ts +++ b/x-pack/plugins/search_playground/server/lib/fetch_query_source_fields.ts @@ -22,25 +22,15 @@ interface IndexFieldModel { export const getModelIdFields = (fieldCapsResponse: FieldCapsResponse) => { const { fields } = fieldCapsResponse; return Object.keys(fields).reduce>((acc, fieldKey) => { - const field = fields[fieldKey]; if (fieldKey.endsWith('model_id')) { - if ('keyword' in field && field.keyword.aggregatable) { - acc.push({ - path: fieldKey, - aggField: fieldKey, - }); - return acc; - } - const 
keywordModelIdField = fields[fieldKey + '.keyword']; + const multiField = Object.keys(fields) + .filter((key) => key.startsWith(fieldKey)) + .find((key) => fields[key].keyword && fields[key].keyword.aggregatable); - if ( - keywordModelIdField && - `keyword` in keywordModelIdField && - keywordModelIdField.keyword.aggregatable - ) { + if (multiField) { acc.push({ path: fieldKey, - aggField: fieldKey + '.keyword', + aggField: multiField, }); return acc; }