From 12631a4930a588a4aa4fd192225827847c1b5226 Mon Sep 17 00:00:00 2001 From: Ilya Nikokoshev Date: Wed, 9 Oct 2024 15:24:00 +0300 Subject: [PATCH] [Auto Import] Improve the ECS mapping extraction logic (#195167) ## Release Notes Automatic Import is more forgiving if an LLM returns an ECS mapping in a slightly unexpected format. ## Summary When implementing https://github.com/elastic/kibana/pull/194386 we encountered an issue where Claude returns the field name `date_format` instead of the expected `date_formats` and the ECS chain breaks down. We add this case as a test to `x-pack/plugins/integration_assistant/server/graphs/ecs/validate.test.ts`. Without the changes in this PR, the list returned by `findInvalidEcsFields` is ``` [ 'Reserved ECS field mapping identified for event.created : ai_postgres_202410050058.logs.column1.target', 'Invalid ECS field mapping identified for 0.9 : ai_postgres_202410050058.logs.column1.confidence, ai_postgres_202410050058.logs.column5.confidence', 'Invalid ECS field mapping identified for date : ai_postgres_202410050058.logs.column1.type, ai_postgres_202410050058.logs.column9.type', 'Invalid ECS field mapping identified for 0.95 : ai_postgres_202410050058.logs.column12.confidence', 'Invalid ECS field mapping identified for string : ai_postgres_202410050058.logs.column12.type, ai_postgres_202410050058.logs.column14.type, ai_postgres_202410050058.logs.column24.type, ai_postgres_202410050058.logs.column5.type, ai_postgres_202410050058.logs.column3.type, ai_postgres_202410050058.logs.column2.type', 'Invalid ECS field mapping identified for 0.8 : ai_postgres_202410050058.logs.column9.confidence, ai_postgres_202410050058.logs.column3.confidence', 'Invalid ECS field mapping identified for 0.7 : ai_postgres_202410050058.logs.column14.confidence, ai_postgres_202410050058.logs.column2.confidence', 'Invalid ECS field mapping identified for 0.85 : ai_postgres_202410050058.logs.column24.confidence' ] ``` while with these changes the result does not 
contain any `Invalid ECS field` messages. The key changes are in the `processMapping` function: 1. We made the function more forgiving with regard to its input, accepting `date_format` in lieu of `date_formats`. 2. We removed the collection of "other paths", that is, the reverse index for simple values like `0.8`. The latter change generally limits the impact of any other format issues in the ECS mapping in the future. Additionally, the function has been renamed to `extractECSMapping`, its output type validated, and documentation has been added. --------- Co-authored-by: Elastic Machine (cherry picked from commit 637d796071f067f8cab37165dd8f80111251ae81) --- .../server/graphs/ecs/validate.test.ts | 110 +++++++++++++++- .../server/graphs/ecs/validate.ts | 119 +++++++++++++----- 2 files changed, 193 insertions(+), 36 deletions(-) diff --git a/x-pack/plugins/integration_assistant/server/graphs/ecs/validate.test.ts b/x-pack/plugins/integration_assistant/server/graphs/ecs/validate.test.ts index a7fb5962b5558..39c4e3ac4bab3 100644 --- a/x-pack/plugins/integration_assistant/server/graphs/ecs/validate.test.ts +++ b/x-pack/plugins/integration_assistant/server/graphs/ecs/validate.test.ts @@ -8,14 +8,14 @@ import { ECS_RESERVED } from './constants'; import { + extractECSMapping, findDuplicateFields, findInvalidEcsFields, - processMapping, removeReservedFields, } from './validate'; describe('Testing ecs handler', () => { - it('processMapping()', async () => { + it('extractECSMapping()', async () => { const path: string[] = []; const value = { checkpoint: { @@ -50,7 +50,7 @@ describe('Testing ecs handler', () => { }, }; const output: Record = {}; - await processMapping(path, value, output); + await extractECSMapping(path, value, output); expect(output).toEqual({ 'source.address': [['checkpoint', 'firewall', 'origin']], 'user.name': [['checkpoint', 'firewall', 'administrator']], @@ -96,6 +96,110 @@ describe('findInvalidEcsFields', () => { const invalid = 
findInvalidEcsFields(ecsMappingReserved); expect(invalid.length).toBe(1); }); + + it('invalid: date_format fields (natural example)', async () => { + const misspelledDateFormatMapping = { + ai_postgres_202410050058: { + logs: { + column1: { + target: 'event.created', + confidence: 0.9, + type: 'date', + date_format: ['yyyy-MM-dd HH:mm:ss.SSS z'], + }, + column12: { + target: 'log.level', + confidence: 0.95, + type: 'string', + date_format: [], + }, + column11: null, + column4: null, + column9: { + target: 'event.start', + confidence: 0.8, + type: 'date', + date_format: ['yyyy-MM-dd HH:mm:ss z'], + }, + column7: null, + column6: null, + column14: { + target: 'event.reason', + confidence: 0.7, + type: 'string', + date_format: [], + }, + column13: null, + column24: { + target: 'process.name', + confidence: 0.85, + type: 'string', + date_format: [], + }, + column23: null, + column10: null, + column5: { + target: 'source.address', + confidence: 0.9, + type: 'string', + date_format: [], + }, + column3: { + target: 'user.name', + confidence: 0.8, + type: 'string', + date_format: [], + }, + column2: { + target: 'destination.user.name', + confidence: 0.7, + type: 'string', + date_format: [], + }, + column8: null, + }, + }, + }; + + const invalid = findInvalidEcsFields(misspelledDateFormatMapping); + expect(invalid.length).toBe(1); + }); + + it('invalid: date_format fields (handcrafted example)', async () => { + const mixedMapping = { + some_title: { + logs: { + column1: { + target: 'event.created', + confidence: 0.9, + type: 'date', + date_format: ['yyyy-MM-dd HH:mm:ss.SSS z'], + }, + column12: { + target: 'log.level', + confidence: 0.95, + type: 'string', + date_formats: [], + }, + column11: null, + column4: null, + column9: { + target: 'event.start', + confidence: 0.8, + type: 'date', + date_format: 'yyyy-MM-dd HH:mm:ss z', + }, + column2: { + target: 'destination.user.name', + type: 'string', + date_format: [], + }, + }, + }, + }; + const invalid = 
findInvalidEcsFields(mixedMapping); + expect(invalid.length).toBe(1); + }); }); describe('findDuplicateFields', () => { diff --git a/x-pack/plugins/integration_assistant/server/graphs/ecs/validate.ts b/x-pack/plugins/integration_assistant/server/graphs/ecs/validate.ts index 74334893ff634..3061e70f242a5 100644 --- a/x-pack/plugins/integration_assistant/server/graphs/ecs/validate.ts +++ b/x-pack/plugins/integration_assistant/server/graphs/ecs/validate.ts @@ -10,7 +10,6 @@ import { mergeSamples } from '../../util/samples'; import { ECS_RESERVED } from './constants'; import type { EcsBaseNodeParams } from './types'; -const valueFieldKeys = new Set(['target', 'confidence', 'date_formats', 'type']); type AnyObject = Record; function extractKeys(data: AnyObject, prefix: string = ''): Set { @@ -46,43 +45,97 @@ function findMissingFields(combinedSamples: string, ecsMapping: AnyObject): stri return missingKeys; } -export function processMapping( +// Describes an LLM-generated ECS mapping candidate. +interface ECSFieldTarget { + target: string; + type: string; + confidence: number; + date_formats: string[]; +} + +/** + * Parses a given object as an ECSFieldTarget object if it meets the required structure. + * + * @param value - The value to be converted to an ECSMapping object. It should be an object + * with properties `target` and `type`. It should have `confidence` field and + * either `date_formats` or `date_format`, though we also fill in these otherwise. + * @returns An ECSFieldTarget object if the conversion succeeded, otherwise null. 
+ */ +function asECSFieldTarget(value: any): ECSFieldTarget | null { + if (value === null || typeof value !== 'object' || Array.isArray(value)) { + return null; + } + + if ( + value.target && + typeof value.target === 'string' && + value.type && + typeof value.type === 'string' + ) { + let confidence = 0.5; + if (value.confidence && typeof value.confidence === 'number') { + confidence = value.confidence; + } + let dateFormats: string[] = []; + if (value.date_formats && Array.isArray(value.date_formats)) { + dateFormats = value.date_formats; + } else if (value.date_format && Array.isArray(value.date_format)) { + dateFormats = value.date_format; + } else if (value.date_format && typeof value.date_format === 'string') { + dateFormats = [value.date_format]; + } + return { + target: value.target, + type: value.type, + confidence, + date_formats: dateFormats, + }; + } + + return null; +} + +/** + * Extracts ECS (Elastic Common Schema) field mapping dictionary from the LLM output. + * + * @param path - The current path in the object being traversed (an array of strings). + * @param value - The value to be processed, which can be an array, object, or other types. + * @param output - A record where the extracted ECS mappings will be stored. The keys are ECS targets, and the values are arrays of paths. + * + * This function recursively traverses the provided value. If the value is an array, it processes each item in the array. + * If the value can be interpreted as an ECS mapping, it adds the path to the output record under the appropriate ECS target. + * If the value is a regular object, it continues traversing its properties. + */ +export function extractECSMapping( path: string[], value: any, output: Record ): void { - if (typeof value === 'object' && value !== null) { - if (!Array.isArray(value)) { - // If the value is a dict with all the keys returned for each source field, this is the full path of the field. 
- const valueKeys = new Set(Object.keys(value)); - - if ([...valueFieldKeys].every((k) => valueKeys.has(k))) { - if (value?.target !== null) { - if (!output[value?.target]) { - output[value.target] = []; - } - output[value.target].push(path); - } - } else { - // Regular dictionary, continue traversing - for (const [k, v] of Object.entries(value)) { - processMapping([...path, k], v, output); - } - } - } else { - // If the value is an array, iterate through items and process them - for (const item of value) { - if (typeof item === 'object' && item !== null) { - processMapping(path, item, output); - } + if (Array.isArray(value)) { + // If the value is an array, iterate through items and process them. + for (const item of value) { + if (typeof item === 'object' && item !== null) { + extractECSMapping(path, item, output); } } - } else if (value !== null) { - // Direct value, accumulate path - if (!output[value]) { - output[value] = []; + return; + } + + const ecsFieldTarget = asECSFieldTarget(value); + if (ecsFieldTarget) { + // If we can interpret the value as an ECSFieldTarget. + if (!output[ecsFieldTarget.target]) { + output[ecsFieldTarget.target] = []; + } + output[ecsFieldTarget.target].push(path); + return; + } + + if (typeof value === 'object' && value !== null) { + // Regular dictionary, continue traversing. 
+ for (const [k, v] of Object.entries(value)) { + extractECSMapping([...path, k], v, output); } - output[value].push(path); } } @@ -96,7 +149,7 @@ export function findDuplicateFields(prefixedSamples: string[], ecsMapping: AnyOb const output: Record = {}; // Get all keys for each target ECS mapping field - processMapping([], ecsMapping, output); + extractECSMapping([], ecsMapping, output); // Filter out any ECS field that does not have multiple source fields mapped to it const filteredOutput = Object.fromEntries( @@ -138,7 +191,7 @@ export function findInvalidEcsFields(currentMapping: AnyObject): string[] { const ecsDict = ECS_FULL; const ecsReserved = ECS_RESERVED; - processMapping([], currentMapping, output); + extractECSMapping([], currentMapping, output); const filteredOutput = Object.fromEntries( Object.entries(output).filter(([key, _]) => key !== null) );