Skip to content

Commit

Permalink
[Auto Import] Improve the ECS mapping extraction logic (#195167)
Browse files Browse the repository at this point in the history
## Release Notes

Automatic Import is more forgiving if an LLM returns an ECS mapping in a
slightly unexpected format.

## Summary

While implementing #194386, we encountered an issue where Claude returns
the field name `date_format` instead of the expected `date_formats`, and
the ECS chain breaks down.

We add this case as a test to
`x-pack/plugins/integration_assistant/server/graphs/ecs/validate.test`.

Without the changes in this PR the list returned by
`findInvalidEcsFields` is

```
      [
        'Reserved ECS field mapping identified for event.created : ai_postgres_202410050058.logs.column1.target',
        'Invalid ECS field mapping identified for 0.9 : ai_postgres_202410050058.logs.column1.confidence, ai_postgres_202410050058.logs.column5.confidence',
        'Invalid ECS field mapping identified for date : ai_postgres_202410050058.logs.column1.type, ai_postgres_202410050058.logs.column9.type',
        'Invalid ECS field mapping identified for 0.95 : ai_postgres_202410050058.logs.column12.confidence',
        'Invalid ECS field mapping identified for string : ai_postgres_202410050058.logs.column12.type, ai_postgres_202410050058.logs.column14.type, ai_postgres_202410050058.logs.column24.type, ai_postgres_202410050058.logs.column5.type, ai_postgres_202410050058.logs.column3.type, ai_postgres_202410050058.logs.column2.type',
        'Invalid ECS field mapping identified for 0.8 : ai_postgres_202410050058.logs.column9.confidence, ai_postgres_202410050058.logs.column3.confidence',
        'Invalid ECS field mapping identified for 0.7 : ai_postgres_202410050058.logs.column14.confidence, ai_postgres_202410050058.logs.column2.confidence',
        'Invalid ECS field mapping identified for 0.85 : ai_postgres_202410050058.logs.column24.confidence'
      ]
```

while with these changes the result does not contain any `Invalid ECS field` messages.

The key changes are in the `processMapping` function:

1. We made the function more forgiving with regard to its input, accepting
`date_format` in lieu of `date_formats`.
2. We have removed the collection of "other paths", that is, the reverse
index for simple values like `0.8`.

The latter change generally limits the impact of any other format issues
in the ECS mapping in the future.

Additionally, the function has been renamed to `extractECSMapping`, its
output type validated, and documentation has been added.

---------

Co-authored-by: Elastic Machine <[email protected]>
(cherry picked from commit 637d796)
  • Loading branch information
ilyannn committed Oct 9, 2024
1 parent 3933429 commit 12631a4
Show file tree
Hide file tree
Showing 2 changed files with 193 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
import { ECS_RESERVED } from './constants';

import {
extractECSMapping,
findDuplicateFields,
findInvalidEcsFields,
processMapping,
removeReservedFields,
} from './validate';

describe('Testing ecs handler', () => {
it('processMapping()', async () => {
it('extractECSMapping()', async () => {
const path: string[] = [];
const value = {
checkpoint: {
Expand Down Expand Up @@ -50,7 +50,7 @@ describe('Testing ecs handler', () => {
},
};
const output: Record<string, string[][]> = {};
await processMapping(path, value, output);
await extractECSMapping(path, value, output);
expect(output).toEqual({
'source.address': [['checkpoint', 'firewall', 'origin']],
'user.name': [['checkpoint', 'firewall', 'administrator']],
Expand Down Expand Up @@ -96,6 +96,110 @@ describe('findInvalidEcsFields', () => {
const invalid = findInvalidEcsFields(ecsMappingReserved);
expect(invalid.length).toBe(1);
});

it('invalid: date_format fields (natural example)', async () => {
const misspelledDateFormatMapping = {
ai_postgres_202410050058: {
logs: {
column1: {
target: 'event.created',
confidence: 0.9,
type: 'date',
date_format: ['yyyy-MM-dd HH:mm:ss.SSS z'],
},
column12: {
target: 'log.level',
confidence: 0.95,
type: 'string',
date_format: [],
},
column11: null,
column4: null,
column9: {
target: 'event.start',
confidence: 0.8,
type: 'date',
date_format: ['yyyy-MM-dd HH:mm:ss z'],
},
column7: null,
column6: null,
column14: {
target: 'event.reason',
confidence: 0.7,
type: 'string',
date_format: [],
},
column13: null,
column24: {
target: 'process.name',
confidence: 0.85,
type: 'string',
date_format: [],
},
column23: null,
column10: null,
column5: {
target: 'source.address',
confidence: 0.9,
type: 'string',
date_format: [],
},
column3: {
target: 'user.name',
confidence: 0.8,
type: 'string',
date_format: [],
},
column2: {
target: 'destination.user.name',
confidence: 0.7,
type: 'string',
date_format: [],
},
column8: null,
},
},
};

const invalid = findInvalidEcsFields(misspelledDateFormatMapping);
expect(invalid.length).toBe(1);
});

it('invalid: date_format fields (handcrafted example)', async () => {
const mixedMapping = {
some_title: {
logs: {
column1: {
target: 'event.created',
confidence: 0.9,
type: 'date',
date_format: ['yyyy-MM-dd HH:mm:ss.SSS z'],
},
column12: {
target: 'log.level',
confidence: 0.95,
type: 'string',
date_formats: [],
},
column11: null,
column4: null,
column9: {
target: 'event.start',
confidence: 0.8,
type: 'date',
date_format: 'yyyy-MM-dd HH:mm:ss z',
},
column2: {
target: 'destination.user.name',
type: 'string',
date_format: [],
},
},
},
};
const invalid = findInvalidEcsFields(mixedMapping);
expect(invalid.length).toBe(1);
});
});

describe('findDuplicateFields', () => {
Expand Down
119 changes: 86 additions & 33 deletions x-pack/plugins/integration_assistant/server/graphs/ecs/validate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import { mergeSamples } from '../../util/samples';
import { ECS_RESERVED } from './constants';
import type { EcsBaseNodeParams } from './types';

const valueFieldKeys = new Set(['target', 'confidence', 'date_formats', 'type']);
type AnyObject = Record<string, any>;

function extractKeys(data: AnyObject, prefix: string = ''): Set<string> {
Expand Down Expand Up @@ -46,43 +45,97 @@ function findMissingFields(combinedSamples: string, ecsMapping: AnyObject): stri
return missingKeys;
}

export function processMapping(
/** Describes an LLM-generated ECS mapping candidate in its normalized form. */
interface ECSFieldTarget {
  /** Name of the ECS field the source field is mapped to (e.g. `event.created`). */
  target: string;
  /** Type reported for the field (e.g. `date`, `string`). */
  type: string;
  /** Confidence reported by the LLM; 0.5 when it was missing or not a number. */
  confidence: number;
  /** Date format patterns; empty when none were supplied. */
  date_formats: string[];
}

/**
 * Normalizes a raw date-format value into a list of strings.
 *
 * @param raw - Either an array (non-string entries are dropped) or a single non-empty string.
 * @returns The normalized list, or null when `raw` has neither of these shapes.
 */
function toDateFormatList(raw: unknown): string[] | null {
  if (Array.isArray(raw)) {
    return raw.filter((item): item is string => typeof item === 'string');
  }
  if (typeof raw === 'string' && raw !== '') {
    return [raw];
  }
  return null;
}

/**
 * Parses a given value as an ECSFieldTarget if it meets the required structure.
 *
 * @param value - The value to be converted to an ECSFieldTarget. It must be a plain
 *                (non-array) object with non-empty string properties `target` and `type`.
 *                `confidence` is used when it is a number (including 0) and defaults to
 *                0.5 otherwise. Date formats are read from `date_formats` or, failing that,
 *                from the commonly misspelled `date_format`; each may be an array or a
 *                single string, and they default to an empty list.
 * @returns An ECSFieldTarget object if the conversion succeeded, otherwise null.
 */
function asECSFieldTarget(value: unknown): ECSFieldTarget | null {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return null;
  }

  const candidate = value as Record<string, unknown>;
  const { target, type } = candidate;
  if (typeof target !== 'string' || target === '' || typeof type !== 'string' || type === '') {
    return null;
  }

  // Check the type rather than truthiness so that a legitimate confidence of 0 is not
  // replaced by the default; NaN still falls back to the default.
  const rawConfidence = candidate.confidence;
  const confidence =
    typeof rawConfidence === 'number' && !Number.isNaN(rawConfidence) ? rawConfidence : 0.5;

  // Prefer the expected key, then fall back to the singular spelling some LLMs return.
  const dateFormats =
    toDateFormatList(candidate.date_formats) ?? toDateFormatList(candidate.date_format) ?? [];

  return {
    target,
    type,
    confidence,
    date_formats: dateFormats,
  };
}

/**
* Extracts ECS (Elastic Common Schema) field mapping dictionary from the LLM output.
*
* @param path - The current path in the object being traversed (an array of strings).
* @param value - The value to be processed, which can be an array, object, or other types.
* @param output - A record where the extracted ECS mappings will be stored. The keys are ECS targets, and the values are arrays of paths.
*
* This function recursively traverses the provided value. If the value is an array, it processes each item in the array.
* If the value can be interpreted as an ECS mapping, it adds the path to the output record under the appropriate ECS target.
* If the value is a regular object, it continues traversing its properties.
*/
export function extractECSMapping(
path: string[],
value: any,
output: Record<string, string[][]>
): void {
if (typeof value === 'object' && value !== null) {
if (!Array.isArray(value)) {
// If the value is a dict with all the keys returned for each source field, this is the full path of the field.
const valueKeys = new Set(Object.keys(value));

if ([...valueFieldKeys].every((k) => valueKeys.has(k))) {
if (value?.target !== null) {
if (!output[value?.target]) {
output[value.target] = [];
}
output[value.target].push(path);
}
} else {
// Regular dictionary, continue traversing
for (const [k, v] of Object.entries(value)) {
processMapping([...path, k], v, output);
}
}
} else {
// If the value is an array, iterate through items and process them
for (const item of value) {
if (typeof item === 'object' && item !== null) {
processMapping(path, item, output);
}
if (Array.isArray(value)) {
// If the value is an array, iterate through items and process them.
for (const item of value) {
if (typeof item === 'object' && item !== null) {
extractECSMapping(path, item, output);
}
}
} else if (value !== null) {
// Direct value, accumulate path
if (!output[value]) {
output[value] = [];
return;
}

const ecsFieldTarget = asECSFieldTarget(value);
if (ecsFieldTarget) {
// If we can interpret the value as an ECSFieldTarget.
if (!output[ecsFieldTarget.target]) {
output[ecsFieldTarget.target] = [];
}
output[ecsFieldTarget.target].push(path);
return;
}

if (typeof value === 'object' && value !== null) {
// Regular dictionary, continue traversing.
for (const [k, v] of Object.entries(value)) {
extractECSMapping([...path, k], v, output);
}
output[value].push(path);
}
}

Expand All @@ -96,7 +149,7 @@ export function findDuplicateFields(prefixedSamples: string[], ecsMapping: AnyOb
const output: Record<string, string[][]> = {};

// Get all keys for each target ECS mapping field
processMapping([], ecsMapping, output);
extractECSMapping([], ecsMapping, output);

// Filter out any ECS field that does not have multiple source fields mapped to it
const filteredOutput = Object.fromEntries(
Expand Down Expand Up @@ -138,7 +191,7 @@ export function findInvalidEcsFields(currentMapping: AnyObject): string[] {
const ecsDict = ECS_FULL;
const ecsReserved = ECS_RESERVED;

processMapping([], currentMapping, output);
extractECSMapping([], currentMapping, output);
const filteredOutput = Object.fromEntries(
Object.entries(output).filter(([key, _]) => key !== null)
);
Expand Down

0 comments on commit 12631a4

Please sign in to comment.