Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[8.x] [Auto Import] Improve the ECS mapping extraction logic (#195167) #195586

Merged
merged 1 commit into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
import { ECS_RESERVED } from './constants';

import {
extractECSMapping,
findDuplicateFields,
findInvalidEcsFields,
processMapping,
removeReservedFields,
} from './validate';

describe('Testing ecs handler', () => {
it('processMapping()', async () => {
it('extractECSMapping()', async () => {
const path: string[] = [];
const value = {
checkpoint: {
Expand Down Expand Up @@ -50,7 +50,7 @@ describe('Testing ecs handler', () => {
},
};
const output: Record<string, string[][]> = {};
await processMapping(path, value, output);
await extractECSMapping(path, value, output);
expect(output).toEqual({
'source.address': [['checkpoint', 'firewall', 'origin']],
'user.name': [['checkpoint', 'firewall', 'administrator']],
Expand Down Expand Up @@ -96,6 +96,110 @@ describe('findInvalidEcsFields', () => {
const invalid = findInvalidEcsFields(ecsMappingReserved);
expect(invalid.length).toBe(1);
});

// Fixture shaped like real LLM output: every mapped column carries the
// singular `date_format` key and unmapped columns are `null`.
it('invalid: date_format fields (natural example)', async () => {
  const llmMapping = {
    ai_postgres_202410050058: {
      logs: {
        column1: {
          target: 'event.created',
          confidence: 0.9,
          type: 'date',
          date_format: ['yyyy-MM-dd HH:mm:ss.SSS z'],
        },
        column12: {
          target: 'log.level',
          confidence: 0.95,
          type: 'string',
          date_format: [],
        },
        column11: null,
        column4: null,
        column9: {
          target: 'event.start',
          confidence: 0.8,
          type: 'date',
          date_format: ['yyyy-MM-dd HH:mm:ss z'],
        },
        column7: null,
        column6: null,
        column14: {
          target: 'event.reason',
          confidence: 0.7,
          type: 'string',
          date_format: [],
        },
        column13: null,
        column24: {
          target: 'process.name',
          confidence: 0.85,
          type: 'string',
          date_format: [],
        },
        column23: null,
        column10: null,
        column5: {
          target: 'source.address',
          confidence: 0.9,
          type: 'string',
          date_format: [],
        },
        column3: {
          target: 'user.name',
          confidence: 0.8,
          type: 'string',
          date_format: [],
        },
        column2: {
          target: 'destination.user.name',
          confidence: 0.7,
          type: 'string',
          date_format: [],
        },
        column8: null,
      },
    },
  };

  // Exactly one of the mapped targets is expected to be reported as invalid.
  const invalidFields = findInvalidEcsFields(llmMapping);
  expect(invalidFields.length).toBe(1);
});

// Hand-built fixture mixing the shapes a mapping entry can take:
//  - column1:  `date_format` as an array
//  - column12: `date_formats` (plural) as an array
//  - column9:  `date_format` as a bare string
//  - column2:  no `confidence` at all
it('invalid: date_format fields (handcrafted example)', async () => {
  const handcraftedMapping = {
    some_title: {
      logs: {
        column1: {
          target: 'event.created',
          confidence: 0.9,
          type: 'date',
          date_format: ['yyyy-MM-dd HH:mm:ss.SSS z'],
        },
        column12: {
          target: 'log.level',
          confidence: 0.95,
          type: 'string',
          date_formats: [],
        },
        column11: null,
        column4: null,
        column9: {
          target: 'event.start',
          confidence: 0.8,
          type: 'date',
          date_format: 'yyyy-MM-dd HH:mm:ss z',
        },
        column2: {
          target: 'destination.user.name',
          type: 'string',
          date_format: [],
        },
      },
    },
  };

  // Exactly one of the mapped targets is expected to be reported as invalid.
  const invalidFields = findInvalidEcsFields(handcraftedMapping);
  expect(invalidFields.length).toBe(1);
});
});

describe('findDuplicateFields', () => {
Expand Down
119 changes: 86 additions & 33 deletions x-pack/plugins/integration_assistant/server/graphs/ecs/validate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import { mergeSamples } from '../../util/samples';
import { ECS_RESERVED } from './constants';
import type { EcsBaseNodeParams } from './types';

const valueFieldKeys = new Set(['target', 'confidence', 'date_formats', 'type']);
type AnyObject = Record<string, any>;

function extractKeys(data: AnyObject, prefix: string = ''): Set<string> {
Expand Down Expand Up @@ -46,43 +45,97 @@ function findMissingFields(combinedSamples: string, ecsMapping: AnyObject): stri
return missingKeys;
}

export function processMapping(
/**
 * An LLM-generated ECS mapping candidate for a single source field, in the
 * normalized form returned by `asECSFieldTarget`.
 */
interface ECSFieldTarget {
  /** Name of the ECS field the source field maps to, e.g. `source.address`. */
  target: string;
  /** Field type reported for the mapping, e.g. `string` or `date`. */
  type: string;
  /** Reported confidence; 0.5 when the input had no usable numeric value. */
  confidence: number;
  /** Date format patterns; empty when the input provided none. */
  date_formats: string[];
}

/**
 * Parses a given value as an ECSFieldTarget if it meets the required structure.
 *
 * @param value - The value to convert. It must be a non-array object with
 *                non-empty string properties `target` and `type`. `confidence`
 *                and the date formats are optional and are filled in when absent:
 *                `confidence` defaults to 0.5, and the formats are read from
 *                `date_formats` or `date_format` (each accepted as an array or
 *                as a single non-empty string), defaulting to `[]`.
 * @returns A normalized ECSFieldTarget if the conversion succeeded, otherwise null.
 */
function asECSFieldTarget(value: any): ECSFieldTarget | null {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return null;
  }

  const { target, type } = value;
  if (typeof target !== 'string' || target === '' || typeof type !== 'string' || type === '') {
    return null;
  }

  // Check the type rather than truthiness so that a legitimate confidence of 0
  // is preserved instead of being replaced by the default. NaN still falls back.
  const confidence =
    typeof value.confidence === 'number' && !Number.isNaN(value.confidence)
      ? value.confidence
      : 0.5;

  // Arrays win over strings, and `date_formats` wins over `date_format` within
  // each kind. Arrays are copied so the result does not alias the input.
  let dateFormats: string[] = [];
  if (Array.isArray(value.date_formats)) {
    dateFormats = [...value.date_formats];
  } else if (Array.isArray(value.date_format)) {
    dateFormats = [...value.date_format];
  } else if (typeof value.date_format === 'string' && value.date_format !== '') {
    dateFormats = [value.date_format];
  } else if (typeof value.date_formats === 'string' && value.date_formats !== '') {
    dateFormats = [value.date_formats];
  }

  return {
    target,
    type,
    confidence,
    date_formats: dateFormats,
  };
}

/**
 * Extracts the ECS (Elastic Common Schema) field mapping dictionary from the LLM output.
 *
 * The function recursively traverses `value`:
 *  - If it is an array, each object item is processed with the same `path`
 *    (array indices do not become path segments); non-object items are ignored.
 *  - If `asECSFieldTarget` can interpret it as an ECS mapping, `path` is recorded
 *    in `output` under the mapping's ECS target and traversal stops there.
 *  - If it is any other object, each property is traversed with its key appended
 *    to the path.
 *  - Anything else (null, primitives) is ignored.
 *
 * @param path - The current path in the object being traversed (an array of strings).
 * @param value - The value to be processed, which can be an array, object, or other types.
 * @param output - A record, mutated in place, where the extracted ECS mappings are stored.
 *                 The keys are ECS targets, and the values are arrays of paths.
 */
export function extractECSMapping(
  path: string[],
  value: any,
  output: Record<string, string[][]>
): void {
  if (Array.isArray(value)) {
    // If the value is an array, iterate through items and process them.
    for (const item of value) {
      if (typeof item === 'object' && item !== null) {
        extractECSMapping(path, item, output);
      }
    }
    return;
  }

  const ecsFieldTarget = asECSFieldTarget(value);
  if (ecsFieldTarget) {
    // The value is an ECS mapping leaf: record the path under its target.
    const { target } = ecsFieldTarget;
    // Use an own-property check: a plain truthiness test would find inherited
    // members for targets such as `constructor` or `toString`, and the
    // subsequent `.push` would throw.
    if (!Object.prototype.hasOwnProperty.call(output, target)) {
      output[target] = [];
    }
    output[target].push(path);
    return;
  }

  if (typeof value === 'object' && value !== null) {
    // Regular dictionary, continue traversing.
    for (const [key, child] of Object.entries(value)) {
      extractECSMapping([...path, key], child, output);
    }
  }
}

Expand All @@ -96,7 +149,7 @@ export function findDuplicateFields(prefixedSamples: string[], ecsMapping: AnyOb
const output: Record<string, string[][]> = {};

// Get all keys for each target ECS mapping field
processMapping([], ecsMapping, output);
extractECSMapping([], ecsMapping, output);

// Filter out any ECS field that does not have multiple source fields mapped to it
const filteredOutput = Object.fromEntries(
Expand Down Expand Up @@ -138,7 +191,7 @@ export function findInvalidEcsFields(currentMapping: AnyObject): string[] {
const ecsDict = ECS_FULL;
const ecsReserved = ECS_RESERVED;

processMapping([], currentMapping, output);
extractECSMapping([], currentMapping, output);
const filteredOutput = Object.fromEntries(
Object.entries(output).filter(([key, _]) => key !== null)
);
Expand Down