From 9cba371189b12e5782589bce41c29c220d7fbb6b Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Fri, 27 Dec 2024 18:45:10 -0800 Subject: [PATCH] Add auto-generation of index mappings & settings based on processors (#552) (#553) (cherry picked from commit 08b97d30b72d547c063401d004847cf1fde70ec5) Signed-off-by: Tyler Ohlsen Signed-off-by: github-actions[bot] Co-authored-by: github-actions[bot] --- common/constants.ts | 92 +++++++--- .../ingest_inputs/advanced_settings.tsx | 109 +++++++++++- .../new_workflow/quick_configure_inputs.tsx | 36 +--- public/utils/utils.ts | 161 +++++++++++++++++- 4 files changed, 336 insertions(+), 62 deletions(-) diff --git a/common/constants.ts b/common/constants.ts index 76435d0b..e52e4028 100644 --- a/common/constants.ts +++ b/common/constants.ts @@ -77,31 +77,81 @@ export const SEARCH_CONNECTORS_NODE_API_PATH = `${BASE_CONNECTOR_NODE_API_PATH}/ * based on the specified remote model from a remote service, if found */ -// Cohere -export const COHERE_DIMENSIONS = { - [`embed-english-v3.0`]: 1024, - [`embed-english-light-v3.0`]: 384, - [`embed-multilingual-v3.0`]: 1024, - [`embed-multilingual-light-v3.0`]: 384, - [`embed-english-v2.0`]: 4096, - [`embed-english-light-v2.0`]: 1024, - [`embed-multilingual-v2.0`]: 768, +interface RemoteEmbeddingModelConfig { + dimension: number; + fieldName: string; +} + +// Amazon BedRock +export const BEDROCK_CONFIGS = { + [`amazon.titan-embed-text-v1`]: { + dimension: 1536, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, + [`amazon.titan-embed-text-v2`]: { + dimension: 1024, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, + [`amazon.titan-embed-image-v1`]: { + dimension: 1024, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, + [`cohere.embed-english-v3`]: { + dimension: 1024, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`cohere.embed-multilingual-v3`]: { + dimension: 
1024, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, }; -// OpenAI -export const OPENAI_DIMENSIONS = { - [`text-embedding-3-small`]: 1536, - [`text-embedding-3-large`]: 3072, - [`text-embedding-ada-002`]: 1536, +// Cohere +export const COHERE_CONFIGS = { + [`embed-english-v3.0`]: { + dimension: 1024, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-english-light-v3.0`]: { + dimension: 384, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-multilingual-v3.0`]: { + dimension: 1024, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-multilingual-light-v3.0`]: { + dimension: 384, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-english-v2.0`]: { + dimension: 4096, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-english-light-v2.0`]: { + dimension: 1024, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-multilingual-v2.0`]: { + dimension: 768, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, }; -// Amazon BedRock -export const BEDROCK_DIMENSIONS = { - [`amazon.titan-embed-text-v1`]: 1536, - [`amazon.titan-embed-text-v2`]: 1024, - [`amazon.titan-embed-image-v1`]: 1024, - [`cohere.embed-english-v3`]: 1024, // same as Cohere directly - [`cohere.embed-multilingual-v3`]: 1024, // same as Cohere directly +// OpenAI +export const OPENAI_CONFIGS = { + [`text-embedding-3-small`]: { + dimension: 1536, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, + [`text-embedding-3-large`]: { + dimension: 3072, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, + [`text-embedding-ada-002`]: { + dimension: 1536, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, }; /** diff --git a/public/pages/workflow_detail/workflow_inputs/ingest_inputs/advanced_settings.tsx b/public/pages/workflow_detail/workflow_inputs/ingest_inputs/advanced_settings.tsx index 7402ba3e..cfbebf13 100644 --- 
a/public/pages/workflow_detail/workflow_inputs/ingest_inputs/advanced_settings.tsx +++ b/public/pages/workflow_detail/workflow_inputs/ingest_inputs/advanced_settings.tsx @@ -3,7 +3,9 @@ * SPDX-License-Identifier: Apache-2.0 */ -import React from 'react'; +import React, { useEffect } from 'react'; +import { useSelector } from 'react-redux'; +import { isEmpty } from 'lodash'; import { EuiAccordion, EuiFlexGroup, @@ -11,6 +13,17 @@ import { EuiSpacer, } from '@elastic/eui'; import { JsonField } from '../input_fields'; +import { getIn, useFormikContext } from 'formik'; +import { WorkflowFormValues } from '../../../../../common'; +import { AppState } from '../../../../store'; +import { + getEmbeddingField, + getEmbeddingModelDimensions, + getUpdatedIndexMappings, + getUpdatedIndexSettings, + isKnnIndex, + removeVectorFieldFromIndexMappings, +} from '../../../../utils'; interface AdvancedSettingsProps {} @@ -18,6 +31,90 @@ interface AdvancedSettingsProps {} * Input component for configuring ingest-side advanced settings */ export function AdvancedSettings(props: AdvancedSettingsProps) { + const { values, setFieldValue } = useFormikContext(); + const { models, connectors } = useSelector((state: AppState) => state.ml); + const ingestMLProcessors = (Object.values( + values?.ingest?.enrich + ) as any[]).filter((ingestProcessor) => ingestProcessor?.model !== undefined); + const ingestProcessorModelIds = ingestMLProcessors + .map((ingestProcessor) => ingestProcessor?.model?.id as string | undefined) + .filter((modelId) => !isEmpty(modelId)); + const indexMappingsPath = 'ingest.index.mappings'; + const indexSettingsPath = 'ingest.index.settings'; + const curMappings = getIn(values, indexMappingsPath); + const curSettings = getIn(values, indexSettingsPath); + + // listen on when processor with models are added / removed. dynamically update index + // settings to be knn-enabled or knn-disabled. 
+ useEffect(() => { + if (ingestProcessorModelIds.length > 0) { + ingestProcessorModelIds.forEach((ingestProcessorModelId) => { + const processorModel = Object.values(models).find( + (model) => model.id === ingestProcessorModelId + ); + if (processorModel?.connectorId !== undefined) { + const processorConnector = connectors[processorModel?.connectorId]; + const dimension = getEmbeddingModelDimensions(processorConnector); + + // If a dimension is found, it is a known embedding model. + // Ensure the index is configured to be knn-enabled. + if (dimension !== undefined) { + if (!isKnnIndex(curSettings)) { + setFieldValue( + indexSettingsPath, + getUpdatedIndexSettings(curSettings, true) + ); + } + } + } + }); + } else { + if (isKnnIndex(curSettings)) { + setFieldValue( + indexSettingsPath, + getUpdatedIndexSettings(curSettings, false) + ); + } + } + }, [ingestProcessorModelIds.length]); + + // listener on when there are updates to any ingest processors. Try to update + // any index mappings accordingly, such as setting the knn_vector mappings + // for models that output vector embeddings, or removing any mappings, if no ML + // processor defined. 
+ useEffect(() => { + if (ingestMLProcessors.length > 0) { + ingestMLProcessors.forEach((ingestMLProcessor) => { + const processorModel = Object.values(models).find( + (model) => model.id === ingestMLProcessor?.model?.id + ); + if (processorModel?.connectorId !== undefined) { + const processorConnector = connectors[processorModel?.connectorId]; + const dimension = getEmbeddingModelDimensions(processorConnector); + const embeddingFieldName = getEmbeddingField( + processorConnector, + ingestMLProcessor + ); + if (embeddingFieldName !== undefined && dimension !== undefined) { + setFieldValue( + indexMappingsPath, + getUpdatedIndexMappings( + curMappings, + embeddingFieldName, + dimension + ) + ); + } + } + }); + } else { + setFieldValue( + indexMappingsPath, + removeVectorFieldFromIndexMappings(curMappings) + ); + } + }, [getIn(values, 'ingest.enrich')]); + return ( @@ -25,16 +122,10 @@ export function AdvancedSettings(props: AdvancedSettingsProps) { - + - + diff --git a/public/pages/workflows/new_workflow/quick_configure_inputs.tsx b/public/pages/workflows/new_workflow/quick_configure_inputs.tsx index e590ae4f..90626b5a 100644 --- a/public/pages/workflows/new_workflow/quick_configure_inputs.tsx +++ b/public/pages/workflows/new_workflow/quick_configure_inputs.tsx @@ -16,8 +16,6 @@ import { EuiCompressedFieldNumber, } from '@elastic/eui'; import { - BEDROCK_DIMENSIONS, - COHERE_DIMENSIONS, DEFAULT_IMAGE_FIELD, DEFAULT_LLM_RESPONSE_FIELD, DEFAULT_TEXT_FIELD, @@ -25,12 +23,11 @@ import { MODEL_STATE, Model, ModelInterface, - OPENAI_DIMENSIONS, QuickConfigureFields, WORKFLOW_TYPE, } from '../../../../common'; import { AppState } from '../../../store'; -import { parseModelInputs } from '../../../utils'; +import { getEmbeddingModelDimensions, parseModelInputs } from '../../../utils'; import { get } from 'lodash'; interface QuickConfigureInputsProps { @@ -121,33 +118,10 @@ export function QuickConfigureInputs(props: QuickConfigureInputsProps) { if (selectedModel?.connectorId 
!== undefined) { const connector = connectors[selectedModel.connectorId]; if (connector !== undefined) { - // some APIs allow specifically setting the dimensions at runtime, - // so we check for that first. - if (connector.parameters?.dimensions !== undefined) { - setFieldValues({ - ...fieldValues, - embeddingLength: connector.parameters?.dimensions, - }); - } else if (connector.parameters?.model !== undefined) { - const dimensions = - // @ts-ignore - COHERE_DIMENSIONS[connector.parameters?.model] || - // @ts-ignore - OPENAI_DIMENSIONS[connector.parameters?.model] || - // @ts-ignore - BEDROCK_DIMENSIONS[connector.parameters?.model]; - if (dimensions !== undefined) { - setFieldValues({ - ...fieldValues, - embeddingLength: dimensions, - }); - } - } else { - setFieldValues({ - ...fieldValues, - embeddingLength: undefined, - }); - } + setFieldValues({ + ...fieldValues, + embeddingLength: getEmbeddingModelDimensions(connector), + }); } } }, [fieldValues.modelId, deployedModels, connectors]); diff --git a/public/utils/utils.ts b/public/utils/utils.ts index 6db1381e..26e64d62 100644 --- a/public/utils/utils.ts +++ b/public/utils/utils.ts @@ -5,7 +5,7 @@ import yaml from 'js-yaml'; import jsonpath from 'jsonpath'; -import { escape, get, isEmpty } from 'lodash'; +import { escape, findKey, get, isEmpty, set, unset } from 'lodash'; import semver from 'semver'; import queryString from 'query-string'; import { useLocation } from 'react-router-dom'; @@ -27,14 +27,22 @@ import { WORKFLOW_STEP_TYPE, Workflow, WorkflowResource, + BEDROCK_CONFIGS, + COHERE_CONFIGS, + OPENAI_CONFIGS, + customStringify, + NO_TRANSFORMATION, + TRANSFORM_TYPE, } from '../../common'; import { getCore, getDataSourceEnabled } from '../services'; import { + Connector, InputMapEntry, MDSQueryParams, ModelInputMap, ModelOutputMap, OutputMapEntry, + OutputMapFormValue, QueryParam, } from '../../common/interfaces'; import * as pluginManifest from '../../opensearch_dashboards.json'; @@ -603,3 +611,154 @@ export 
// (Preceding diff context: the unchanged tail of injectParameters.)

/** Shape of one entry in the known remote-embedding-model lookup tables. */
type KnownEmbeddingModelConfig = { dimension: number; fieldName: string };

/**
 * Look up the connector's `parameters.model` in the Cohere, OpenAI and Bedrock
 * config tables, in that order.
 *
 * @param connector - connector to inspect; may be undefined when the model's
 *   connector has not been loaded into the store.
 * @returns the matching table entry, or undefined if there is no connector,
 *   no `model` parameter, or the model is not listed in any table.
 */
function getKnownEmbeddingModelConfig(
  connector: Connector | undefined
): KnownEmbeddingModelConfig | undefined {
  const modelName = connector?.parameters?.model;
  if (modelName === undefined || modelName === null) {
    return undefined;
  }
  const configTables = [
    COHERE_CONFIGS,
    OPENAI_CONFIGS,
    BEDROCK_CONFIGS,
  ] as Record<string, KnownEmbeddingModelConfig | undefined>[];
  for (const configTable of configTables) {
    // own-property check so names like "constructor" never match the prototype
    if (Object.prototype.hasOwnProperty.call(configTable, modelName)) {
      return configTable[modelName];
    }
  }
  return undefined;
}

/**
 * Fetch embedding dimensions, if the selected model is a known one.
 *
 * Some APIs allow specifically setting the dimensions at runtime, so an
 * explicit `parameters.dimensions` on the connector takes precedence over the
 * lookup tables.
 *
 * @param connector - connector backing the model. Callers index a connector
 *   map by ID and may pass undefined; that is tolerated and yields undefined.
 * @returns the dimension, or undefined if it cannot be determined.
 */
export function getEmbeddingModelDimensions(
  connector: Connector | undefined
): number | undefined {
  const runtimeDimensions = connector?.parameters?.dimensions;
  if (runtimeDimensions !== undefined) {
    return runtimeDimensions;
  }
  return getKnownEmbeddingModelConfig(connector)?.dimension;
}

/**
 * Check if an index is a knn index.
 *
 * @param existingSettings - index settings as a JSON string.
 * @returns true only when `index.knn` is the boolean `true` or the string
 *   `"true"`; any other value (including the string `"false"`), a missing
 *   setting, or unparseable JSON yields false.
 */
export function isKnnIndex(existingSettings: string): boolean {
  try {
    const knnSetting: unknown = get(
      JSON.parse(existingSettings),
      'index.knn',
      false
    );
    // normalize: a raw string "false" would otherwise be truthy for callers
    return knnSetting === true || knnSetting === 'true';
  } catch (error) {
    console.error('Could not parse index settings: ', error);
    return false;
  }
}

/**
 * Update the index settings based on parameters passed.
 * Currently just used for updating the `knn` flag.
 *
 * @param existingSettings - index settings as a JSON string.
 * @param knnBool - value to write at `index.knn`.
 * @returns the re-serialized settings, or `existingSettings` unchanged if it
 *   cannot be parsed or serialized (deliberate best-effort).
 */
export function getUpdatedIndexSettings(
  existingSettings: string,
  knnBool: boolean
): string {
  try {
    const settingsObj = JSON.parse(existingSettings);
    return customStringify(set(settingsObj, 'index.knn', knnBool));
  } catch {
    return existingSettings;
  }
}

/**
 * Get the embedding output field name, if the connector points at a known
 * embedding model.
 */
function getEmbeddingFieldFromConnector(
  connector: Connector | undefined
): string | undefined {
  return getKnownEmbeddingModelConfig(connector)?.fieldName;
}

/**
 * Try to determine the embedding field based on the processor config.
 *
 * First check if it is a known model, then refine the guess from the first
 * output map of the processor, if it has any entries:
 *  - known model: use the entry whose key equals the model's output field;
 *  - unknown model: use the first entry.
 * The chosen entry's transform decides the final name: the target field for a
 * field transform, the first nested variable name for an expression
 * transform, otherwise the entry key itself.
 *
 * NOTE(review): when the model is known but no output-map entry has its field
 * as key, the result is undefined (the known name is discarded) — confirm
 * this is intended.
 *
 * @param connector - connector backing the processor's model; may be undefined.
 * @param processorConfig - form values of the ML processor.
 * @returns the index field expected to hold the embedding, or undefined.
 */
export function getEmbeddingField(
  connector: Connector | undefined,
  processorConfig: any
): string | undefined {
  let embeddingField = getEmbeddingFieldFromConnector(connector);
  const outputMap = processorConfig?.output_map as
    | OutputMapFormValue
    | undefined
    | null;
  const firstOutputMap = outputMap?.[0];
  if (!Array.isArray(firstOutputMap) || firstOutputMap.length === 0) {
    // nothing configured: fall back to the known model field (may be undefined)
    return embeddingField;
  }

  const relevantOutputMapEntry =
    embeddingField !== undefined
      ? firstOutputMap.find(
          (outputMapEntry) => outputMapEntry.key === embeddingField
        )
      : firstOutputMap[0];
  switch (relevantOutputMapEntry?.value?.transformType) {
    case TRANSFORM_TYPE.FIELD: {
      embeddingField = relevantOutputMapEntry?.value?.value;
      break;
    }
    case TRANSFORM_TYPE.EXPRESSION: {
      embeddingField = get(relevantOutputMapEntry, 'value.nestedVars.0.name');
      break;
    }
    case NO_TRANSFORMATION:
    case undefined:
    default: {
      embeddingField = relevantOutputMapEntry?.key;
      break;
    }
  }
  return embeddingField;
}

// Update the index mappings based on parameters passed.
// Currently used for updating the knn_vector field configuration, & removing
// any old/existing knn_vector field in the process.
/**
 * Return the index mappings with a single `knn_vector` field configured.
 *
 * Any existing top-level `knn_vector` property is removed first (see
 * removeVectorFieldFromIndexMappings), then `properties[embeddingFieldName]`
 * is set to `{ type: 'knn_vector', dimension }`.
 *
 * @param existingMappings - index mappings as a JSON string.
 * @param embeddingFieldName - property name to hold the vector. It is used as
 *   one literal key, so names containing `.` or `[]` are not split into a
 *   nested path.
 * @param dimension - vector dimension to write into the mapping.
 * @returns the re-serialized mappings, or `existingMappings` unchanged if it
 *   cannot be parsed or serialized (deliberate best-effort).
 */
export function getUpdatedIndexMappings(
  existingMappings: string,
  embeddingFieldName: string,
  dimension: number
): string {
  try {
    const mappingsWithRemovedVectorField = removeVectorFieldFromIndexMappings(
      existingMappings
    );
    const mappingsObj = JSON.parse(mappingsWithRemovedVectorField);
    // array path: lodash treats each element as a literal key, whereas a
    // `properties.${name}` string would be parsed as a deep path
    return customStringify(
      set(mappingsObj, ['properties', embeddingFieldName], {
        type: 'knn_vector',
        dimension,
      })
    );
  } catch {
    return existingMappings;
  }
}

/**
 * Remove the first top-level property of type `knn_vector` from the index
 * mappings, if there is one.
 *
 * @param existingMappings - index mappings as a JSON string.
 * @returns the re-serialized mappings, or `existingMappings` unchanged if it
 *   cannot be parsed or serialized (deliberate best-effort).
 */
export function removeVectorFieldFromIndexMappings(
  existingMappings: string
): string {
  try {
    const existingMappingsObj = JSON.parse(existingMappings);
    const existingEmbeddingField = findKey(
      existingMappingsObj?.properties,
      // optional chaining: a null/primitive property must not abort the scan
      (field) => field?.type === 'knn_vector'
    );
    if (existingEmbeddingField !== undefined) {
      // array path so the key is removed literally, even if it contains dots
      unset(existingMappingsObj?.properties, [existingEmbeddingField]);
    }
    return customStringify(existingMappingsObj);
  } catch {
    return existingMappings;
  }
}