Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Auto Import] Improve the ECS mapping extraction logic #195167

Merged
merged 10 commits into from
Oct 9, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
* 2.0.
*/

import { findDuplicateFields, findInvalidEcsFields, processMapping } from './validate';
import { findDuplicateFields, findInvalidEcsFields, extractECSMapping } from './validate';

describe('Testing ecs handler', () => {
it('processMapping()', async () => {
it('extractECSMapping()', async () => {
const path: string[] = [];
const value = {
checkpoint: {
Expand Down Expand Up @@ -43,7 +43,7 @@ describe('Testing ecs handler', () => {
},
};
const output: Record<string, string[][]> = {};
await processMapping(path, value, output);
await extractECSMapping(path, value, output);
expect(output).toEqual({
'source.address': [['checkpoint', 'firewall', 'origin']],
'user.name': [['checkpoint', 'firewall', 'administrator']],
Expand Down Expand Up @@ -89,6 +89,74 @@ describe('findInvalidEcsFields', () => {
const invalid = findInvalidEcsFields(ecsMappingReserved);
expect(invalid.length).toBe(1);
});

it('invalid: date_format fields', async () => {
const misspelledDateFormatMapping = {
ai_postgres_202410050058: {
logs: {
column1: {
target: 'event.created',
confidence: 0.9,
type: 'date',
date_format: ['yyyy-MM-dd HH:mm:ss.SSS z'],
bhapas marked this conversation as resolved.
Show resolved Hide resolved
},
column12: {
target: 'log.level',
confidence: 0.95,
type: 'string',
date_format: [],
},
column11: null,
column4: null,
column9: {
target: 'event.start',
confidence: 0.8,
type: 'date',
date_format: ['yyyy-MM-dd HH:mm:ss z'],
},
column7: null,
column6: null,
column14: {
target: 'event.reason',
confidence: 0.7,
type: 'string',
date_format: [],
},
column13: null,
column24: {
target: 'process.name',
confidence: 0.85,
type: 'string',
date_format: [],
},
column23: null,
column10: null,
column5: {
target: 'source.address',
confidence: 0.9,
type: 'string',
date_format: [],
},
column3: {
target: 'user.name',
confidence: 0.8,
type: 'string',
date_format: [],
},
column2: {
target: 'destination.user.name',
confidence: 0.7,
type: 'string',
date_format: [],
},
column8: null,
},
},
};

const invalid = findInvalidEcsFields(misspelledDateFormatMapping);
expect(invalid.length).toBe(1);
});
});

describe('findDuplicateFields', () => {
Expand Down
119 changes: 86 additions & 33 deletions x-pack/plugins/integration_assistant/server/graphs/ecs/validate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import { mergeSamples } from '../../util/samples';
import { ECS_RESERVED } from './constants';
import type { EcsBaseNodeParams } from './types';

const valueFieldKeys = new Set(['target', 'confidence', 'date_formats', 'type']);
type AnyObject = Record<string, any>;

function extractKeys(data: AnyObject, prefix: string = ''): Set<string> {
Expand Down Expand Up @@ -46,43 +45,97 @@ function findMissingFields(combinedSamples: string, ecsMapping: AnyObject): stri
return missingKeys;
}

export function processMapping(
/**
 * Describes an LLM-generated ECS mapping candidate.
 *
 * `target` is the ECS field name the source field is mapped to; the remaining
 * properties carry the metadata the LLM attached to that mapping.
 */
interface ECSFieldTarget {
  target: string;
  type: string;
  confidence: number;
  date_formats: string[];
}

/**
 * Parses a given value as an ECSFieldTarget if it meets the required structure.
 *
 * @param value - The value to be converted to an ECSFieldTarget. It must be a non-array
 *                object with non-empty string properties `target` and `type`. It should
 *                also have a numeric `confidence` and either `date_formats` or the
 *                (misspelled) `date_format`, given as an array or as a single string;
 *                when these are missing or malformed they are filled in with defaults
 *                (`0.5` and `[]` respectively).
 * @returns An ECSFieldTarget if the conversion succeeded, otherwise null.
 */
function asECSFieldTarget(value: unknown): ECSFieldTarget | null {
  if (typeof value !== 'object' || value === null || Array.isArray(value)) {
    return null;
  }

  const candidate = value as Record<string, unknown>;
  const { target, type } = candidate;
  if (typeof target !== 'string' || target === '' || typeof type !== 'string' || type === '') {
    return null;
  }

  // Check the type rather than truthiness so that an explicit confidence of 0 is kept.
  const confidence =
    typeof candidate.confidence === 'number' && !Number.isNaN(candidate.confidence)
      ? candidate.confidence
      : 0.5;

  // The correctly spelled key wins; the singular spelling is accepted as a fallback.
  const dateFormats =
    readDateFormats(candidate.date_formats) ?? readDateFormats(candidate.date_format) ?? [];

  return {
    target,
    type,
    confidence,
    date_formats: dateFormats,
  };
}

/**
 * Normalizes a raw date-format value into a list of strings.
 *
 * @param raw - An array (non-string entries are dropped) or a single non-empty string.
 * @returns A new string array, or undefined when `raw` has neither of those shapes.
 */
function readDateFormats(raw: unknown): string[] | undefined {
  if (Array.isArray(raw)) {
    return raw.filter((item): item is string => typeof item === 'string');
  }
  if (typeof raw === 'string' && raw !== '') {
    return [raw];
  }
  return undefined;
}

/**
* Extracts ECS (Elastic Common Schema) field mapping dictionary from the LLM output.
*
* @param path - The current path in the object being traversed (an array of strings).
* @param value - The value to be processed, which can be an array, object, or other types.
* @param output - A record where the extracted ECS mappings will be stored. The keys are ECS targets, and the values are arrays of paths.
*
* This function recursively traverses the provided value. If the value is an array, it processes each item in the array.
* If the value can be interpreted as an ECS mapping, it adds the path to the output record under the appropriate ECS target.
* If the value is a regular object, it continues traversing its properties.
*/
export function extractECSMapping(
path: string[],
value: any,
output: Record<string, string[][]>
): void {
if (typeof value === 'object' && value !== null) {
if (!Array.isArray(value)) {
// If the value is a dict with all the keys returned for each source field, this is the full path of the field.
const valueKeys = new Set(Object.keys(value));

if ([...valueFieldKeys].every((k) => valueKeys.has(k))) {
if (value?.target !== null) {
if (!output[value?.target]) {
output[value.target] = [];
}
output[value.target].push(path);
}
} else {
// Regular dictionary, continue traversing
for (const [k, v] of Object.entries(value)) {
processMapping([...path, k], v, output);
}
}
} else {
// If the value is an array, iterate through items and process them
for (const item of value) {
if (typeof item === 'object' && item !== null) {
processMapping(path, item, output);
}
if (Array.isArray(value)) {
// If the value is an array, iterate through items and process them.
for (const item of value) {
if (typeof item === 'object' && item !== null) {
extractECSMapping(path, item, output);
Comment on lines +115 to +118
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could try something like the following. Just a suggestion; feel free to pick whichever you prefer.

value
 .filter((item) => (typeof item === 'object' && item !== null))
 .forEach((item) => extractECSMapping(path, item, output));

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree it can be rewritten to be cleaner, but let's keep it like this because this makes it clear I just moved the lines around.

}
}
} else if (value !== null) {
// Direct value, accumulate path
if (!output[value]) {
output[value] = [];
return;
}

const ecsFieldTarget = asECSFieldTarget(value);
if (ecsFieldTarget) {
// If we can interpret the value as an ECSFieldTarget.
if (!output[ecsFieldTarget.target]) {
output[ecsFieldTarget.target] = [];
bhapas marked this conversation as resolved.
Show resolved Hide resolved
}
output[ecsFieldTarget.target].push(path);
return;
}

if (typeof value === 'object' && value !== null) {
// Regular dictionary, continue traversing.
for (const [k, v] of Object.entries(value)) {
extractECSMapping([...path, k], v, output);
}
output[value].push(path);
}
}

Expand All @@ -96,7 +149,7 @@ export function findDuplicateFields(prefixedSamples: string[], ecsMapping: AnyOb
const output: Record<string, string[][]> = {};

// Get all keys for each target ECS mapping field
processMapping([], ecsMapping, output);
extractECSMapping([], ecsMapping, output);

// Filter out any ECS field that does not have multiple source fields mapped to it
const filteredOutput = Object.fromEntries(
Expand Down Expand Up @@ -129,7 +182,7 @@ export function findInvalidEcsFields(currentMapping: AnyObject): string[] {
const ecsDict = ECS_FULL;
const ecsReserved = ECS_RESERVED;

processMapping([], currentMapping, output);
extractECSMapping([], currentMapping, output);
const filteredOutput = Object.fromEntries(
Object.entries(output).filter(([key, _]) => key !== null)
);
Expand Down