Skip to content

Commit

Permalink
[Security Solution] Classify EQL verification and ML job missing erro…
Browse files Browse the repository at this point in the history
…rs as user errors (elastic#180094)

## Summary

Building on elastic#180040

Detection rules commonly fail when prebuilt rules are imported and
enabled without the appropriate indices (for EQL) or ML jobs (for ML
rules). EQL rules fail with a `verification_exception` because the EQL
search API validates the fields in the search request against the
indices in the request; if there are no indices then it returns an
exception. ML rules fail with a `<job name> missing` exception on the
search request if the job is not found. Both of these errors do not mean
that the system is overloaded or performing incorrectly in some way, but
they are still showing up in large volumes on SLO dashboards.

This PR builds on elastic#180040, which introduces the ability to classify
errors as "user errors" when the error is not due to some kind of system
malfunction, but more related to incorrect (or insufficient) user
actions.

### Testing
#### EQL
1. Create 2 indices, `test` and `test2`
```
PUT /test
{
  "mappings": {
    "properties": {
      "@timestamp": {
        "type": "date"
      }
    }
  }
}

PUT /test2
{
  "mappings": {
    "properties": {
      "@timestamp": {
        "type": "date"
      },
      "event.category": {
        "type": "keyword"
      }
    }
  }
}
```

2. Create (disabled) an EQL rule that queries `test*` and uses a simple
query like `file where true`
3. Delete the index `test2`
4. Enable the rule. The rule will fail with a `verification_exception`
because `test` does not have `event.category`.
5. Use your favorite debugging method to verify that `userError` was
`true` in `addLastRunError` in
`x-pack/plugins/alerting/server/monitoring/rule_result_service.ts`
(hopefully this property will be added to the rule SO so we can check it
there in API integration tests)

#### ML rules
1. Import a prebuilt ML rule (`Unusual Process Spawned by a User`, for
example)
2. Enable the rule. The rule will fail with `An error occurred during
rule execution: message: "problem_child_rare_process_by_user missing"`
3. Use your favorite debugging method to verify that `userError` was
`true` in `addLastRunError` in
`x-pack/plugins/alerting/server/monitoring/rule_result_service.ts`
(hopefully this property will be added to the rule SO so we can check it
there in API integration tests)
  • Loading branch information
marshallmain authored and semd committed Apr 10, 2024
1 parent 8ef7deb commit 9242afb
Show file tree
Hide file tree
Showing 10 changed files with 217 additions and 48 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ export const createRuleExecutionLogClientForExecutors = (
};

const writeStatusChangeToRuleObject = async (args: NormalizedStatusChangeArgs): Promise<void> => {
const { newStatus, message, metrics } = args;
const { newStatus, message, metrics, userError } = args;

if (newStatus === RuleExecutionStatusEnum.running) {
return;
Expand All @@ -189,7 +189,7 @@ export const createRuleExecutionLogClientForExecutors = (
}

if (newStatus === RuleExecutionStatusEnum.failed) {
ruleResultService.addLastRunError(message);
ruleResultService.addLastRunError(message, userError ?? false);
} else if (newStatus === RuleExecutionStatusEnum['partial failure']) {
ruleResultService.addLastRunWarning(message);
}
Expand Down Expand Up @@ -233,6 +233,7 @@ interface NormalizedStatusChangeArgs {
newStatus: RuleExecutionStatus;
message: string;
metrics?: RuleExecutionMetrics;
userError?: boolean;
}

const normalizeStatusChangeArgs = (args: StatusChangeArgs): NormalizedStatusChangeArgs => {
Expand All @@ -242,7 +243,7 @@ const normalizeStatusChangeArgs = (args: StatusChangeArgs): NormalizedStatusChan
message: '',
};
}
const { newStatus, message, metrics } = args;
const { newStatus, message, metrics, userError } = args;

return {
newStatus,
Expand All @@ -255,6 +256,7 @@ const normalizeStatusChangeArgs = (args: StatusChangeArgs): NormalizedStatusChan
execution_gap_duration_s: normalizeGap(metrics.executionGap),
}
: undefined,
userError,
};
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ export interface StatusChangeArgs {
newStatus: RuleExecutionStatus;
message?: string;
metrics?: MetricsArgs;
userError?: boolean;
}

export interface MetricsArgs {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ export const createSecurityRuleTypeWrapper: CreateSecurityRuleTypeWrapper =
success: result.success && runResult.success,
warning: warningMessages.length > 0,
warningMessages,
userError: runResult.userError,
};
runState = runResult.state;
}
Expand Down Expand Up @@ -516,6 +517,7 @@ export const createSecurityRuleTypeWrapper: CreateSecurityRuleTypeWrapper =
indexingDurations: result.bulkCreateTimes,
enrichmentDurations: result.enrichmentTimes,
},
userError: result.userError,
});
}
} catch (error) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,29 @@ describe('eql_executor', () => {
}`,
]);
});

it('should classify EQL verification exceptions as "user errors" when reporting to the framework', async () => {
alertServices.scopedClusterClient.asCurrentUser.eql.search.mockRejectedValue({
name: 'ResponseError',
message:
'verification_exception\n\tRoot causes:\n\t\tverification_exception: Found 1 problem\nline 1:1: Unknown column [event.category]',
});
const result = await eqlExecutor({
inputIndex: DEFAULT_INDEX_PATTERN,
runtimeMappings: {},
completeRule: eqlCompleteRule,
tuple,
ruleExecutionLogger,
services: alertServices,
version,
bulkCreate: jest.fn(),
wrapHits: jest.fn(),
wrapSequences: jest.fn(),
primaryTimestamp: '@timestamp',
exceptionFilter: undefined,
unprocessedExceptions: [],
});
expect(result.userError).toEqual(true);
});
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -100,40 +100,54 @@ export const eqlExecutor = async ({
}
const eqlSignalSearchStart = performance.now();

const response = await services.scopedClusterClient.asCurrentUser.eql.search<SignalSource>(
request
);
try {
const response = await services.scopedClusterClient.asCurrentUser.eql.search<SignalSource>(
request
);

const eqlSignalSearchEnd = performance.now();
const eqlSearchDuration = makeFloatString(eqlSignalSearchEnd - eqlSignalSearchStart);
result.searchAfterTimes = [eqlSearchDuration];
const eqlSignalSearchEnd = performance.now();
const eqlSearchDuration = makeFloatString(eqlSignalSearchEnd - eqlSignalSearchStart);
result.searchAfterTimes = [eqlSearchDuration];

let newSignals: Array<WrappedFieldsLatest<BaseFieldsLatest>> | undefined;
if (response.hits.sequences !== undefined) {
newSignals = wrapSequences(response.hits.sequences, buildReasonMessageForEqlAlert);
} else if (response.hits.events !== undefined) {
newSignals = wrapHits(response.hits.events, buildReasonMessageForEqlAlert);
} else {
throw new Error(
'eql query response should have either `sequences` or `events` but had neither'
);
}
let newSignals: Array<WrappedFieldsLatest<BaseFieldsLatest>> | undefined;
if (response.hits.sequences !== undefined) {
newSignals = wrapSequences(response.hits.sequences, buildReasonMessageForEqlAlert);
} else if (response.hits.events !== undefined) {
newSignals = wrapHits(response.hits.events, buildReasonMessageForEqlAlert);
} else {
throw new Error(
'eql query response should have either `sequences` or `events` but had neither'
);
}

if (newSignals?.length) {
const createResult = await bulkCreate(
newSignals,
undefined,
createEnrichEventsFunction({
services,
logger: ruleExecutionLogger,
})
);
if (newSignals?.length) {
const createResult = await bulkCreate(
newSignals,
undefined,
createEnrichEventsFunction({
services,
logger: ruleExecutionLogger,
})
);

addToSearchAfterReturn({ current: result, next: createResult });
}
if (response.hits.total && response.hits.total.value >= ruleParams.maxSignals) {
result.warningMessages.push(getMaxSignalsWarning());
addToSearchAfterReturn({ current: result, next: createResult });
}
if (response.hits.total && response.hits.total.value >= ruleParams.maxSignals) {
result.warningMessages.push(getMaxSignalsWarning());
}
return result;
} catch (error) {
if (
typeof error.message === 'string' &&
(error.message as string).includes('verification_exception')
) {
// We report errors that are more related to user configuration of rules rather than system outages as "user errors"
// so SLO dashboards can show less noise around system outages
result.userError = true;
}
result.errors.push(error.message);
result.success = false;
return result;
}
return result;
});
};
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ describe('ml_executor', () => {
errors: [],
createdItems: [],
});
jobsSummaryMock.mockResolvedValue([]);
});

it('should throw an error if ML plugin was not available', async () => {
Expand Down Expand Up @@ -131,4 +132,26 @@ describe('ml_executor', () => {
);
expect(response.warningMessages.length).toEqual(1);
});

it('should report job missing errors as user errors', async () => {
(findMlSignals as jest.Mock).mockRejectedValue({
message: 'my_test_job_name missing',
});

const result = await mlExecutor({
completeRule: mlCompleteRule,
tuple,
ml: mlMock,
services: alertServices,
ruleExecutionLogger,
listClient,
bulkCreate: jest.fn(),
wrapHits: jest.fn(),
exceptionFilter: undefined,
unprocessedExceptions: [],
});
expect(result.userError).toEqual(true);
expect(result.success).toEqual(false);
expect(result.errors).toEqual(['my_test_job_name missing']);
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
* 2.0.
*/

/* eslint require-atomic-updates: ["error", { "allowProperties": true }] */

import type { KibanaRequest } from '@kbn/core/server';
import type { ExceptionListItemSchema } from '@kbn/securitysolution-io-ts-list-types';
import type {
Expand All @@ -30,6 +32,7 @@ import {
import type { SetupPlugins } from '../../../../plugin';
import { withSecuritySpan } from '../../../../utils/with_security_span';
import type { IRuleExecutionLogForExecutors } from '../../rule_monitoring';
import type { AnomalyResults } from '../../../machine_learning';

export const mlExecutor = async ({
completeRule,
Expand Down Expand Up @@ -93,19 +96,29 @@ export const mlExecutor = async ({
result.warning = true;
}

const anomalyResults = await findMlSignals({
ml,
// Using fake KibanaRequest as it is needed to satisfy the ML Services API, but can be empty as it is
// currently unused by the mlAnomalySearch function.
request: {} as unknown as KibanaRequest,
savedObjectsClient: services.savedObjectsClient,
jobIds: ruleParams.machineLearningJobId,
anomalyThreshold: ruleParams.anomalyThreshold,
from: tuple.from.toISOString(),
to: tuple.to.toISOString(),
maxSignals: tuple.maxSignals,
exceptionFilter,
});
let anomalyResults: AnomalyResults;
try {
anomalyResults = await findMlSignals({
ml,
// Using fake KibanaRequest as it is needed to satisfy the ML Services API, but can be empty as it is
// currently unused by the mlAnomalySearch function.
request: {} as unknown as KibanaRequest,
savedObjectsClient: services.savedObjectsClient,
jobIds: ruleParams.machineLearningJobId,
anomalyThreshold: ruleParams.anomalyThreshold,
from: tuple.from.toISOString(),
to: tuple.to.toISOString(),
maxSignals: tuple.maxSignals,
exceptionFilter,
});
} catch (error) {
if (typeof error.message === 'string' && (error.message as string).endsWith('missing')) {
result.userError = true;
}
result.errors.push(error.message);
result.success = false;
return result;
}

if (
anomalyResults.hits.total &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ export interface SecurityAlertTypeReturnValue<TState extends RuleTypeState> {
createdSignalsCount: number;
createdSignals: unknown[];
errors: string[];
userError?: boolean;
lastLookbackDate?: Date | null;
searchAfterTimes: string[];
state: TState;
Expand Down Expand Up @@ -388,6 +389,7 @@ export interface SearchAfterAndBulkCreateReturnType {
createdSignalsCount: number;
createdSignals: unknown[];
errors: string[];
userError?: boolean;
warningMessages: string[];
suppressedAlertsCount?: number;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
*/

import { v4 as uuidv4 } from 'uuid';
import supertestLib from 'supertest';
import url from 'url';
import expect from '@kbn/expect';
import {
ALERT_REASON,
Expand All @@ -30,7 +32,10 @@ import {
ALERT_GROUP_ID,
} from '@kbn/security-solution-plugin/common/field_maps/field_names';
import { getMaxSignalsWarning as getMaxAlertsWarning } from '@kbn/security-solution-plugin/server/lib/detection_engine/rule_types/utils/utils';
import { ENABLE_ASSET_CRITICALITY_SETTING } from '@kbn/security-solution-plugin/common/constants';
import {
DETECTION_ENGINE_RULES_URL,
ENABLE_ASSET_CRITICALITY_SETTING,
} from '@kbn/security-solution-plugin/common/constants';
import {
getEqlRuleForAlertTesting,
getAlerts,
Expand All @@ -42,6 +47,8 @@ import {
createRule,
deleteAllRules,
deleteAllAlerts,
waitForRuleFailure,
routeWithNamespace,
} from '../../../../../../../common/utils/security_solution';
import { FtrProviderContext } from '../../../../../../ftr_provider_context';
import { EsArchivePathBuilder } from '../../../../../../es_archive_path_builder';
Expand All @@ -62,6 +69,7 @@ export default ({ getService }: FtrProviderContext) => {

// TODO: add a new service for loading archiver files similar to "getService('es')"
const config = getService('config');
const request = supertestLib(url.format(config.get('servers.kibana')));
const isServerless = config.get('serverless');
const dataPathBuilder = new EsArchivePathBuilder(isServerless);
const auditPath = dataPathBuilder.getPath('auditbeat/hosts');
Expand Down Expand Up @@ -196,6 +204,42 @@ export default ({ getService }: FtrProviderContext) => {
});
});

it('classifies verification_exception errors as user errors', async () => {
function getMetricsRequest(reset: boolean = false) {
return request
.get(`/api/task_manager/metrics${reset ? '' : '?reset=false'}`)
.set('kbn-xsrf', 'foo')
.expect(200)
.then((response) => response.body);
}

await getMetricsRequest(true);
const rule: EqlRuleCreateProps = {
...getEqlRuleForAlertTesting(['auditbeat-*']),
query: 'file where field.doesnt.exist == true',
};
const createdRule = await createRule(supertest, log, rule);
await waitForRuleFailure({ supertest, log, id: createdRule.id });

const route = routeWithNamespace(DETECTION_ENGINE_RULES_URL);
const response = await supertest
.get(route)
.set('kbn-xsrf', 'true')
.set('elastic-api-version', '2023-10-31')
.query({ id: createdRule.id })
.expect(200);

const ruleResponse = response.body;
expect(
ruleResponse.execution_summary.last_execution.message.includes('verification_exception')
).eql(true);

const metricsResponse = await getMetricsRequest();
expect(
metricsResponse.metrics.task_run.value.by_type['alerting:siem__eqlRule'].user_errors
).eql(1);
});

it('generates up to max_alerts for non-sequence EQL queries', async () => {
const maxAlerts = 200;
const rule: EqlRuleCreateProps = {
Expand Down
Loading

0 comments on commit 9242afb

Please sign in to comment.