From 2f4e5039b0c60bd5ebd55ea4ab61e147fcaf14a0 Mon Sep 17 00:00:00 2001 From: Alejandro Hernandez Date: Mon, 29 Jan 2024 16:05:38 -0500 Subject: [PATCH] Elasticsearch provider: only highlight text fields --- .changeset/wet-maps-shout.md | 8 ++ .../src/example-types/results/README.md | 15 +-- .../results/highlighting/request.js | 127 ++++++++++-------- .../results/highlighting/request.test.js | 126 +++++++---------- .../results/highlighting/response.js | 40 +++--- .../results/highlighting/response.test.js | 35 ++--- .../results/highlighting/search.js | 10 +- .../results/highlighting/testSchema.js | 6 - .../results/highlighting/util.js | 10 +- 9 files changed, 182 insertions(+), 195 deletions(-) create mode 100644 .changeset/wet-maps-shout.md diff --git a/.changeset/wet-maps-shout.md b/.changeset/wet-maps-shout.md new file mode 100644 index 000000000..54fc4e132 --- /dev/null +++ b/.changeset/wet-maps-shout.md @@ -0,0 +1,8 @@ +--- +'contexture-elasticsearch': minor +--- + +Remove `subFields` configuration in the schema. Instead only send fields of type +`text` for highlighting. This both simplifies the API and reduces payload to +elastic, as well as fixing an issue where non-text top-level fields such as +`keyword` type fields were being highlighted when they should not be. diff --git a/packages/provider-elasticsearch/src/example-types/results/README.md b/packages/provider-elasticsearch/src/example-types/results/README.md index 79b0b4ce5..2ce47f6eb 100644 --- a/packages/provider-elasticsearch/src/example-types/results/README.md +++ b/packages/provider-elasticsearch/src/example-types/results/README.md @@ -14,7 +14,7 @@ We assume that users want to highlight all the fields present in the query. The #### 1. Sub-fields -Whitelisted sub-fields are sent for highlighting, since they could be present in the query: +All sub-fields of type `text` are sent for highlighting, since they could be present in the query:
@@ -22,23 +22,16 @@ Whitelisted sub-fields are sent for highlighting, since they could be present in ```jsonc { - "elasticsearch": { - "subFields": { - // `{field}.keyword` will *not* be sent for highlighting. - "keyword": { "highlight": false }, - // `{field}.subfield` will be sent for highlighting. - "subfield": { "highlight": true } - } - }, "fields": { // `state` will be sent for highlighting. "state": { "elasticsearch": { "mapping": { "fields": { - "keyword": {}, + // `state.keyword` will not be sent for highlighting. + "keyword": { "type": "keyword" }, // `state.subfield` will be sent for highlighting. - "subfield": {} + "subfield": { "type": "text" } } } } diff --git a/packages/provider-elasticsearch/src/example-types/results/highlighting/request.js b/packages/provider-elasticsearch/src/example-types/results/highlighting/request.js index 87b938fe3..35d93b95c 100644 --- a/packages/provider-elasticsearch/src/example-types/results/highlighting/request.js +++ b/packages/provider-elasticsearch/src/example-types/results/highlighting/request.js @@ -1,8 +1,7 @@ import _ from 'lodash/fp.js' import F from 'futil' import { minimatch } from 'minimatch' -import { CartesianProduct } from 'js-combinatorics' -import { isLeafField, isBlobField, isArrayOfObjectsField } from './util.js' +import { getFieldType, isBlobField, isArrayOfObjectsField } from './util.js' /* * Expand schema paths with wildcards into a list of paths without wildcards. @@ -14,7 +13,7 @@ let expandGlobs = (schema, globs) => { let fieldsNames = _.keys(schema.fields) let expandGlob = (glob) => - isLeafField(schema.fields[glob]) + getFieldType(schema.fields[glob]) ? [glob] : minimatch.match(fieldsNames, `${glob}*`) @@ -29,7 +28,7 @@ let expandGlobs = (schema, globs) => { * Add given paths to source with includes/excludes lists. Paths get added to * source.includes and removed from source.excludes as necessary. * - * Returns added paths. + * Returns object with source includes, excludes, and added paths. */ export let addPathsToRequestSource = (schema, source = {}, pathsToAdd = []) => { // There's nothing to add. @@ -69,64 +68,80 @@ export let addPathsToRequestSource = (schema, source = {}, pathsToAdd = []) => { return F.omitBlank({ ...result, addedPaths }) } -/* - * Names of all subfields that can be highlighted. +/** + * Map a field's subfields to a structure that looks like a top-level field. */ -let getHighlightSubFieldsNames = (schema) => - _.keys(_.pickBy('highlight', schema.elasticsearch?.subFields)) +let createTopLevelSubFields = (field) => + F.mapValuesIndexed( + (subField, subFieldName) => + F.omitBlank({ + // Reuse the parent multi-field `subType` so that we can generate the + // correct highlighting configuration. + subType: field.subType, + elasticsearch: F.omitBlank({ + mapping: F.omitBlank({ + ...subField, + copy_to: _.map( + (path) => `${path}.${subFieldName}`, + field.elasticsearch.mapping?.copy_to + ), + }), + }), + }), + field.elasticsearch?.mapping?.fields + ) -/* - * Paths of all group fields and their subfields that can be highlighted. +/** + * Returns object of all subfields in a schema. */ -export let getHighlightGroupFieldsPaths = _.memoize((schema) => { - let subFieldsNames = getHighlightSubFieldsNames(schema) - return _.flatMap((field) => { - let copy_to = field.elasticsearch?.mapping?.copy_to - if (_.isEmpty(copy_to)) return [] - let subFieldTuples = [...new CartesianProduct(copy_to, subFieldsNames)] - let product = [...copy_to, ..._.map(_.join('.'), subFieldTuples)] - return product - }, schema.fields) -}, _.get('elasticsearch.index')) - -let isGroupFieldPath = _.curry((schema, path) => - _.find(_.eq(path), getHighlightGroupFieldsPaths(schema)) -) +let getSchemaSubFields = (schema) => + F.reduceIndexed( + (acc, field, path) => + F.mergeOn( + acc, + _.mapKeys((k) => `${path}.${k}`, createTopLevelSubFields(field)) + ), + {}, + schema.fields + ) -/* - * Object of all fields and their subfields that can be highlighted. +/** + * Returns object of all group fields and their subfields in a schema. */ -export let getAllHighlightFields = _.memoize((schema) => { - let subFieldsNames = getHighlightSubFieldsNames(schema) - return F.reduceIndexed( - (acc, field, path) => { - if (!isLeafField(field) || isGroupFieldPath(schema, path)) { - return acc - } - acc[path] = field - let subFields = _.pick( - subFieldsNames, - field.elasticsearch?.mapping?.fields +let getSchemaGroupFields = _.memoize((schema) => { + let groupFields = _.pick( + _.uniq( + _.flatMap( + (field) => field.elasticsearch?.mapping?.copy_to ?? [], + schema.fields ) - for (let name in subFields) { - acc[`${path}.${name}`] = F.omitBlank({ - subType: field.subType, - elasticsearch: F.omitBlank({ - mapping: F.omitBlank({ - ...subFields[name], - copy_to: _.map( - (path) => `${path}.${name}`, - field.elasticsearch.mapping?.copy_to - ), - }), - }), - }) - } - return acc - }, - {}, + ), schema.fields ) + return { + ...groupFields, + ...getSchemaSubFields({ fields: groupFields }), + } +}, _.get('elasticsearch.index')) + +/* + * Return object of all fields and their subfields that can be highlighted. + */ +export let getAllHighlightFields = _.memoize((schema) => { + let groupFields = getSchemaGroupFields(schema) + + let canHighlightField = (field, path) => + // Only highlight text fields. + getFieldType(field) === 'text' && + // Omit group fields from highlighting. We assume users want to + // highlight fields that were copied over instead of the group fields + // themselves. + !_.has(path, groupFields) + + return F.pickByIndexed(canHighlightField, { + ...schema.fields, + ...getSchemaSubFields(schema), + }) }, _.get('elasticsearch.index')) let collectKeysAndValues = (f, coll) => @@ -146,8 +161,10 @@ let blobConfiguration = { * Get configuration for highlight fields to send in the elastic request. */ export let getRequestHighlightFields = (schema, node) => { + let groupFields = getSchemaGroupFields(schema) + let groupFieldsInQuery = collectKeysAndValues( - isGroupFieldPath(schema), + F.getIn(groupFields), node._meta?.relevantFilters ) diff --git a/packages/provider-elasticsearch/src/example-types/results/highlighting/request.test.js b/packages/provider-elasticsearch/src/example-types/results/highlighting/request.test.js index fb336e4b7..05a439e14 100644 --- a/packages/provider-elasticsearch/src/example-types/results/highlighting/request.test.js +++ b/packages/provider-elasticsearch/src/example-types/results/highlighting/request.test.js @@ -2,72 +2,28 @@ import { schema } from './testSchema.js' import { addPathsToRequestSource, getAllHighlightFields, - getHighlightGroupFieldsPaths, getRequestHighlightFields, } from './request.js' -describe('getHighlightGroupFieldsPaths', () => { - it('should return all combinations of group fields and sub-fields', () => { +describe('getAllHighlightFields', () => { + it('should only include text fields and subfields', () => { let schema = { - elasticsearch: { - subFields: { - keyword: { highlight: false }, - subfield1: { highlight: true }, - subfield2: { highlight: true }, - }, - }, fields: { - groupField1: { - elasticsearch: { - dataType: 'text', - mapping: { - fields: { keyword: {}, subfield1: {}, subfield2: {} }, - }, - }, - }, - groupField2: { - elasticsearch: { - dataType: 'text', - mapping: { - fields: { keyword: {}, subfield1: {}, subfield2: {} }, - }, - }, - }, state: { + subType: 'blob', elasticsearch: { dataType: 'text', mapping: { - copy_to: ['groupField1', 'groupField2'], + fields: { + keyword: { type: 'keyword' }, + subfield: { type: 'text' }, + }, }, }, }, - }, - } - expect(getHighlightGroupFieldsPaths(schema)).toEqual([ - 'groupField1', - 'groupField2', - 'groupField1.subfield1', - 'groupField2.subfield1', - 'groupField1.subfield2', - 'groupField2.subfield2', - ]) - }) -}) - -describe('getAllHighlightFields', () => { - it('should include subfields that can be highlighted', () => { - let schema = { - elasticsearch: { - subFields: { - keyword: { highlight: false }, - subfield: { highlight: true }, - }, - }, - fields: { - state: { + location: { elasticsearch: { - dataType: 'text', - mapping: { fields: { keyword: {}, subfield: {} } }, + dataType: 'geo_point', }, }, }, @@ -75,20 +31,37 @@ describe('getAllHighlightFields', () => { let actual = getAllHighlightFields(schema) expect(actual).toEqual({ state: { + subType: 'blob', elasticsearch: { dataType: 'text', - mapping: { fields: { keyword: {}, subfield: {} } }, + mapping: { + fields: { + keyword: { type: 'keyword' }, + subfield: { type: 'text' }, + }, + }, + }, + }, + 'state.subfield': { + subType: 'blob', + elasticsearch: { + mapping: { + type: 'text', + }, }, }, - 'state.subfield': {}, }) }) it('should exclude groups fields', () => { let schema = { fields: { - all: { elasticsearch: { dataType: 'text' } }, - address: { elasticsearch: { dataType: 'text' } }, + all: { + elasticsearch: { dataType: 'text' }, + }, + address: { + elasticsearch: { dataType: 'text' }, + }, state: { elasticsearch: { dataType: 'text', @@ -317,25 +290,29 @@ describe('getRequestHighlightFields()', () => { }) }) - it('should include whitelisted sub fields', () => { + it('should include text sub fields', () => { let schema = { - elasticsearch: { - subFields: { - keyword: { highlight: false }, - subfield: { highlight: true }, - }, - }, fields: { state: { elasticsearch: { dataType: 'text', - mapping: { fields: { keyword: {}, subfield: {} } }, + mapping: { + fields: { + keyword: { type: 'keyword' }, + subfield: { type: 'text' }, + }, + }, }, }, 'city.street': { elasticsearch: { dataType: 'text', - mapping: { fields: { keyword: {}, subfield: {} } }, + mapping: { + fields: { + keyword: { type: 'keyword' }, + subfield: { type: 'text' }, + }, + }, }, }, }, @@ -352,13 +329,6 @@ describe('getRequestHighlightFields()', () => { it('should generate configuration for blob text fields', () => { let schema = { - elasticsearch: { - subFields: { - subfield: { - highlight: true, - }, - }, - }, fields: { state: { subType: 'blob', @@ -437,15 +407,15 @@ describe('getRequestHighlightFields()', () => { it('should generate highlight_query with group fields replaced for sub fields', () => { let schema = { - elasticsearch: { - subFields: { - subfield: { highlight: true }, - }, - }, fields: { address: { elasticsearch: { dataType: 'text', + mapping: { + fields: { + subfield: { type: 'text' }, + }, + }, }, }, state: { diff --git a/packages/provider-elasticsearch/src/example-types/results/highlighting/response.js b/packages/provider-elasticsearch/src/example-types/results/highlighting/response.js index c6f98e016..7ac2d9af0 100644 --- a/packages/provider-elasticsearch/src/example-types/results/highlighting/response.js +++ b/packages/provider-elasticsearch/src/example-types/results/highlighting/response.js @@ -32,21 +32,21 @@ let groupByMultiField = _.curry((schema, highlight) => /** * Group nested fields under their parent array of objects path. */ -export let groupByArrayOfObjectsFields = _.curry((schema, highlight) => { - let arrayOfObjectsPaths = _.keys(getArrayOfObjectsPathsMap(schema)) - return F.reduceIndexed( - (acc, fragments, path) => { - let arrayPath = findByPrefixIn(arrayOfObjectsPaths, path) - if (arrayPath) { - let nestedPath = stripParentPath(arrayPath, path) - return _.update([arrayPath], _.set([nestedPath], fragments), acc) - } - return _.set([path], fragments, acc) - }, - {}, - highlight - ) -}) +export let groupByArrayOfObjectsFields = _.curry( + (arrayOfObjectsPaths, highlight) => + F.reduceIndexed( + (acc, fragments, path) => { + let arrayPath = findByPrefixIn(arrayOfObjectsPaths, path) + if (arrayPath) { + let nestedPath = stripParentPath(arrayPath, path) + return _.update([arrayPath], _.set([nestedPath], fragments), acc) + } + return _.set([path], fragments, acc) + }, + {}, + highlight + ) +) /** * Convert an array of fragments to an object where keys are corresponding @@ -97,12 +97,11 @@ export let getArrayOfObjectsFragments = (tags, source, fragmentsMap) => * 2. Fragments for large (blob) text fields get concatenated. * 3. Fragments for arrays get ordered based on the source array */ -export let getResponseHighlight = (schema, hit, tags, copySourcePaths) => { - let pathsMap = getNestedPathsMap(schema, copySourcePaths) - return _.flow( +export let getResponseHighlight = (schema, hit, tags, nestedPathsMap) => + _.flow( groupByMultiField(schema), _.mapValues(_.flatten), - groupByArrayOfObjectsFields(schema), + groupByArrayOfObjectsFields(_.keys(getArrayOfObjectsPathsMap(schema))), F.mapValuesIndexed((fragments, path) => { let field = schema.fields[path] @@ -118,7 +117,7 @@ export let getResponseHighlight = (schema, hit, tags, copySourcePaths) => { if (isArrayOfObjectsField(field)) { let sourceArray = _.get(path, hit._source) let result = getArrayOfObjectsFragments(tags, sourceArray, fragments) - let copyPaths = pathsMap[path] + let copyPaths = nestedPathsMap?.[path] if (_.isEmpty(copyPaths)) return result return F.mapValuesIndexed( (to, index) => _.merge(_.pick(copyPaths, sourceArray[index]), to), @@ -129,7 +128,6 @@ export let getResponseHighlight = (schema, hit, tags, copySourcePaths) => { return mergeHighlights(tags, fragments) }) )(hit.highlight) -} /** * Remove each path in `paths` from `hit._source`. diff --git a/packages/provider-elasticsearch/src/example-types/results/highlighting/response.test.js b/packages/provider-elasticsearch/src/example-types/results/highlighting/response.test.js index 4e7072b7e..ddd3a4592 100644 --- a/packages/provider-elasticsearch/src/example-types/results/highlighting/response.test.js +++ b/packages/provider-elasticsearch/src/example-types/results/highlighting/response.test.js @@ -27,19 +27,22 @@ describe('groupByArrayOfObjectsFields', () => { 'James Joyce', ], } - expect(groupByArrayOfObjectsFields(schema, highlight)).toEqual({ - 'library.categories': [ - 'Alternative Medicine', - 'Ethnic & Cultural', - ], - 'library.books': { - 'cover.title': [ - 'Nineteen Eighty-Four', - 'The Great Gatsby', + let arrayOfObjectsPaths = ['library.books'] + expect(groupByArrayOfObjectsFields(arrayOfObjectsPaths, highlight)).toEqual( + { + 'library.categories': [ + 'Alternative Medicine', + 'Ethnic & Cultural', ], - 'cover.author': ['George Orwell', 'James Joyce'], - }, - }) + 'library.books': { + 'cover.title': [ + 'Nineteen Eighty-Four', + 'The Great Gatsby', + ], + 'cover.author': ['George Orwell', 'James Joyce'], + }, + } + ) }) }) @@ -268,8 +271,8 @@ describe('getResponseHighlight()', () => { ], }, } - let copySourcePaths = ['library.books.cover.author'] - expect(getResponseHighlight(schema, hit, tags, copySourcePaths)).toEqual({ + let nestedPathsMap = { 'library.books': ['cover.author'] } + expect(getResponseHighlight(schema, hit, tags, nestedPathsMap)).toEqual({ 'library.books': { 0: { cover: { @@ -311,8 +314,8 @@ describe('getResponseHighlight()', () => { ], }, } - let copySourcePaths = ['library.books.cover.title'] - expect(getResponseHighlight(schema, hit, tags, copySourcePaths)).toEqual({ + let nestedPathsMap = { 'library.books': ['cover.title'] } + expect(getResponseHighlight(schema, hit, tags, nestedPathsMap)).toEqual({ 'library.books': { 0: { cover: { diff --git a/packages/provider-elasticsearch/src/example-types/results/highlighting/search.js b/packages/provider-elasticsearch/src/example-types/results/highlighting/search.js index f33bc23c5..febea113b 100644 --- a/packages/provider-elasticsearch/src/example-types/results/highlighting/search.js +++ b/packages/provider-elasticsearch/src/example-types/results/highlighting/search.js @@ -1,6 +1,6 @@ import _ from 'lodash/fp.js' import F from 'futil' -import { getArrayOfObjectsPathsMap } from './util.js' +import { getArrayOfObjectsPathsMap, getNestedPathsMap } from './util.js' import { addPathsToRequestSource, getRequestHighlightFields, @@ -43,9 +43,13 @@ export let searchWithHighlights = (node, search, schema) => async (body) => { }, }) + const nestedPathsMap = getNestedPathsMap( + schema, + node.highlight?.copySourcePaths + ) + for (let hit of response.hits.hits) { - let copySourcePaths = node.highlight?.copySourcePaths - hit.highlight = getResponseHighlight(schema, hit, tags, copySourcePaths) + hit.highlight = getResponseHighlight(schema, hit, tags, nestedPathsMap) removePathsFromSource(schema, hit, addedPaths) mergeHighlightsOnSource(schema, hit) } diff --git a/packages/provider-elasticsearch/src/example-types/results/highlighting/testSchema.js b/packages/provider-elasticsearch/src/example-types/results/highlighting/testSchema.js index 2eb939729..7f5cbe1d4 100644 --- a/packages/provider-elasticsearch/src/example-types/results/highlighting/testSchema.js +++ b/packages/provider-elasticsearch/src/example-types/results/highlighting/testSchema.js @@ -1,10 +1,4 @@ export let schema = { - elasticsearch: { - subFields: { - keyword: { highlight: false }, - subfield: { highlight: true }, - }, - }, fields: { groupField: { elasticsearch: { diff --git a/packages/provider-elasticsearch/src/example-types/results/highlighting/util.js b/packages/provider-elasticsearch/src/example-types/results/highlighting/util.js index f127ed7f1..265f1545a 100644 --- a/packages/provider-elasticsearch/src/example-types/results/highlighting/util.js +++ b/packages/provider-elasticsearch/src/example-types/results/highlighting/util.js @@ -1,19 +1,19 @@ import _ from 'lodash/fp.js' import F from 'futil' -export let isLeafField = (field) => - !!field?.elasticsearch?.dataType || !!field?.elasticsearch?.mapping?.type +export let getFieldType = (field) => + field?.elasticsearch?.dataType || field?.elasticsearch?.mapping?.type export let isBlobField = (field) => - field?.subType === 'blob' && isLeafField(field) + field?.subType === 'blob' && !!getFieldType(field) export let isArrayField = (field) => field?.subType === 'array' export let isArrayOfScalarsField = (field) => - isArrayField(field) && isLeafField(field) + isArrayField(field) && !!getFieldType(field) export let isArrayOfObjectsField = (field) => - isArrayField(field) && !isLeafField(field) + isArrayField(field) && !getFieldType(field) export let stripParentPath = _.curry((parentPath, path) => _.startsWith(`${parentPath}.`, path)