[Security Solution] Classify EQL verification and ML job missing erro…

…rs as user errors (elastic#180094) ## Summary Building on elastic#180040 Detection rules commonly fail when prebuilt rules are imported and enabled without the appropriate indices (for EQL) or ML jobs (for ML rules). EQL rules fail with a `verification_exception` because the EQL search API validates the fields in the search request against the indices in the request; if there are no indices then it returns an exception. ML rules fail with a `<job name> missing` exception on the search request if the job is not found. Both of these errors do not mean that the system is overloaded or performing incorrectly in some way, but they are still showing up in large volumes on SLO dashboards. This PR builds on elastic#180040, which introduces the ability to classify errors as "user errors" when the error is not due to some kind of system malfunction, but more related to incorrect (or insufficient) user actions. ### Testing #### EQL 1. Create 2 indices, `test` and `test2` ``` PUT /test { "mappings": { "properties": { "@timestamp": { "type": "date" } } } } PUT /test2 { "mappings": { "properties": { "@timestamp": { "type": "date" }, "event.category": { "type": "keyword" } } } } ``` 2. Create (disabled) an EQL rule that queries `test*` and uses a simple query like `file where true` 3. Delete the index `test2` 4. Enable the rule. The rule will fail with a `verification_exception` because `test` does not have `event.category`. 5. Use your favorite debugging method to verify that `userError` was `true` in `addLastRunError` in `x-pack/plugins/alerting/server/monitoring/rule_result_service.ts` (hopefully this property will be added to the rule SO so we can check it there in API integration tests) #### ML rules 1. Import a prebuilt ML rule (`Unusual Process Spawned by a User`, for example) 2. Enable the rule. The rule will fail with `An error occurred during rule execution: message: "problem_child_rare_process_by_user missing"` 3. Use your favorite debugging method to verify that `userError` was `true` in `addLastRunError` in `x-pack/plugins/alerting/server/monitoring/rule_result_service.ts` (hopefully this property will be added to the rule SO so we can check it there in API integration tests)
semd · Apr 10, 2024 · 9242afb · 9242afb
1 parent 8ef7deb
commit 9242afb
Show file tree

Hide file tree

Showing 10 changed files with 217 additions and 48 deletions.
diff --git a/.../detection_engine/rule_monitoring/logic/rule_execution_log/client_for_executors/client.ts b/.../detection_engine/rule_monitoring/logic/rule_execution_log/client_for_executors/client.ts
@@ -164,7 +164,7 @@ export const createRuleExecutionLogClientForExecutors = (
   };
 
   const writeStatusChangeToRuleObject = async (args: NormalizedStatusChangeArgs): Promise<void> => {
-    const { newStatus, message, metrics } = args;
+    const { newStatus, message, metrics, userError } = args;
 
     if (newStatus === RuleExecutionStatusEnum.running) {
       return;
@@ -189,7 +189,7 @@ export const createRuleExecutionLogClientForExecutors = (
     }
 
     if (newStatus === RuleExecutionStatusEnum.failed) {
-      ruleResultService.addLastRunError(message);
+      ruleResultService.addLastRunError(message, userError ?? false);
     } else if (newStatus === RuleExecutionStatusEnum['partial failure']) {
       ruleResultService.addLastRunWarning(message);
     }
@@ -233,6 +233,7 @@ interface NormalizedStatusChangeArgs {
   newStatus: RuleExecutionStatus;
   message: string;
   metrics?: RuleExecutionMetrics;
+  userError?: boolean;
 }
 
 const normalizeStatusChangeArgs = (args: StatusChangeArgs): NormalizedStatusChangeArgs => {
@@ -242,7 +243,7 @@ const normalizeStatusChangeArgs = (args: StatusChangeArgs): NormalizedStatusChan
       message: '',
     };
   }
-  const { newStatus, message, metrics } = args;
+  const { newStatus, message, metrics, userError } = args;
 
   return {
     newStatus,
@@ -255,6 +256,7 @@ const normalizeStatusChangeArgs = (args: StatusChangeArgs): NormalizedStatusChan
           execution_gap_duration_s: normalizeGap(metrics.executionGap),
         }
       : undefined,
+    userError,
   };
 };
 

diff --git a/..._engine/rule_monitoring/logic/rule_execution_log/client_for_executors/client_interface.ts b/..._engine/rule_monitoring/logic/rule_execution_log/client_for_executors/client_interface.ts
@@ -122,6 +122,7 @@ export interface StatusChangeArgs {
   newStatus: RuleExecutionStatus;
   message?: string;
   metrics?: MetricsArgs;
+  userError?: boolean;
 }
 
 export interface MetricsArgs {

diff --git a/...rity_solution/server/lib/detection_engine/rule_types/create_security_rule_type_wrapper.ts b/...rity_solution/server/lib/detection_engine/rule_types/create_security_rule_type_wrapper.ts
@@ -440,6 +440,7 @@ export const createSecurityRuleTypeWrapper: CreateSecurityRuleTypeWrapper =
                   success: result.success && runResult.success,
                   warning: warningMessages.length > 0,
                   warningMessages,
+                  userError: runResult.userError,
                 };
                 runState = runResult.state;
               }
@@ -516,6 +517,7 @@ export const createSecurityRuleTypeWrapper: CreateSecurityRuleTypeWrapper =
                   indexingDurations: result.bulkCreateTimes,
                   enrichmentDurations: result.enrichmentTimes,
                 },
+                userError: result.userError,
               });
             }
           } catch (error) {

diff --git a/x-pack/plugins/security_solution/server/lib/detection_engine/rule_types/eql/eql.test.ts b/x-pack/plugins/security_solution/server/lib/detection_engine/rule_types/eql/eql.test.ts
@@ -66,5 +66,29 @@ describe('eql_executor', () => {
         }`,
       ]);
     });
+
+    it('should classify EQL verification exceptions as "user errors" when reporting to the framework', async () => {
+      alertServices.scopedClusterClient.asCurrentUser.eql.search.mockRejectedValue({
+        name: 'ResponseError',
+        message:
+          'verification_exception\n\tRoot causes:\n\t\tverification_exception: Found 1 problem\nline 1:1: Unknown column [event.category]',
+      });
+      const result = await eqlExecutor({
+        inputIndex: DEFAULT_INDEX_PATTERN,
+        runtimeMappings: {},
+        completeRule: eqlCompleteRule,
+        tuple,
+        ruleExecutionLogger,
+        services: alertServices,
+        version,
+        bulkCreate: jest.fn(),
+        wrapHits: jest.fn(),
+        wrapSequences: jest.fn(),
+        primaryTimestamp: '@timestamp',
+        exceptionFilter: undefined,
+        unprocessedExceptions: [],
+      });
+      expect(result.userError).toEqual(true);
+    });
   });
 });
diff --git a/x-pack/plugins/security_solution/server/lib/detection_engine/rule_types/eql/eql.ts b/x-pack/plugins/security_solution/server/lib/detection_engine/rule_types/eql/eql.ts
@@ -100,40 +100,54 @@ export const eqlExecutor = async ({
     }
     const eqlSignalSearchStart = performance.now();
 
-    const response = await services.scopedClusterClient.asCurrentUser.eql.search<SignalSource>(
-      request
-    );
+    try {
+      const response = await services.scopedClusterClient.asCurrentUser.eql.search<SignalSource>(
+        request
+      );
 
-    const eqlSignalSearchEnd = performance.now();
-    const eqlSearchDuration = makeFloatString(eqlSignalSearchEnd - eqlSignalSearchStart);
-    result.searchAfterTimes = [eqlSearchDuration];
+      const eqlSignalSearchEnd = performance.now();
+      const eqlSearchDuration = makeFloatString(eqlSignalSearchEnd - eqlSignalSearchStart);
+      result.searchAfterTimes = [eqlSearchDuration];
 
-    let newSignals: Array<WrappedFieldsLatest<BaseFieldsLatest>> | undefined;
-    if (response.hits.sequences !== undefined) {
-      newSignals = wrapSequences(response.hits.sequences, buildReasonMessageForEqlAlert);
-    } else if (response.hits.events !== undefined) {
-      newSignals = wrapHits(response.hits.events, buildReasonMessageForEqlAlert);
-    } else {
-      throw new Error(
-        'eql query response should have either `sequences` or `events` but had neither'
-      );
-    }
+      let newSignals: Array<WrappedFieldsLatest<BaseFieldsLatest>> | undefined;
+      if (response.hits.sequences !== undefined) {
+        newSignals = wrapSequences(response.hits.sequences, buildReasonMessageForEqlAlert);
+      } else if (response.hits.events !== undefined) {
+        newSignals = wrapHits(response.hits.events, buildReasonMessageForEqlAlert);
+      } else {
+        throw new Error(
+          'eql query response should have either `sequences` or `events` but had neither'
+        );
+      }
 
-    if (newSignals?.length) {
-      const createResult = await bulkCreate(
-        newSignals,
-        undefined,
-        createEnrichEventsFunction({
-          services,
-          logger: ruleExecutionLogger,
-        })
-      );
+      if (newSignals?.length) {
+        const createResult = await bulkCreate(
+          newSignals,
+          undefined,
+          createEnrichEventsFunction({
+            services,
+            logger: ruleExecutionLogger,
+          })
+        );
 
-      addToSearchAfterReturn({ current: result, next: createResult });
-    }
-    if (response.hits.total && response.hits.total.value >= ruleParams.maxSignals) {
-      result.warningMessages.push(getMaxSignalsWarning());
+        addToSearchAfterReturn({ current: result, next: createResult });
+      }
+      if (response.hits.total && response.hits.total.value >= ruleParams.maxSignals) {
+        result.warningMessages.push(getMaxSignalsWarning());
+      }
+      return result;
+    } catch (error) {
+      if (
+        typeof error.message === 'string' &&
+        (error.message as string).includes('verification_exception')
+      ) {
+        // We report errors that are more related to user configuration of rules rather than system outages as "user errors"
+        // so SLO dashboards can show less noise around system outages
+        result.userError = true;
+      }
+      result.errors.push(error.message);
+      result.success = false;
+      return result;
     }
-    return result;
   });
 };
diff --git a/x-pack/plugins/security_solution/server/lib/detection_engine/rule_types/ml/ml.test.ts b/x-pack/plugins/security_solution/server/lib/detection_engine/rule_types/ml/ml.test.ts
@@ -64,6 +64,7 @@ describe('ml_executor', () => {
       errors: [],
       createdItems: [],
     });
+    jobsSummaryMock.mockResolvedValue([]);
   });
 
   it('should throw an error if ML plugin was not available', async () => {
@@ -131,4 +132,26 @@ describe('ml_executor', () => {
     );
     expect(response.warningMessages.length).toEqual(1);
   });
+
+  it('should report job missing errors as user errors', async () => {
+    (findMlSignals as jest.Mock).mockRejectedValue({
+      message: 'my_test_job_name missing',
+    });
+
+    const result = await mlExecutor({
+      completeRule: mlCompleteRule,
+      tuple,
+      ml: mlMock,
+      services: alertServices,
+      ruleExecutionLogger,
+      listClient,
+      bulkCreate: jest.fn(),
+      wrapHits: jest.fn(),
+      exceptionFilter: undefined,
+      unprocessedExceptions: [],
+    });
+    expect(result.userError).toEqual(true);
+    expect(result.success).toEqual(false);
+    expect(result.errors).toEqual(['my_test_job_name missing']);
+  });
 });
diff --git a/x-pack/plugins/security_solution/server/lib/detection_engine/rule_types/ml/ml.ts b/x-pack/plugins/security_solution/server/lib/detection_engine/rule_types/ml/ml.ts
@@ -5,6 +5,8 @@
  * 2.0.
  */
 
+/* eslint require-atomic-updates: ["error", { "allowProperties": true }] */
+
 import type { KibanaRequest } from '@kbn/core/server';
 import type { ExceptionListItemSchema } from '@kbn/securitysolution-io-ts-list-types';
 import type {
@@ -30,6 +32,7 @@ import {
 import type { SetupPlugins } from '../../../../plugin';
 import { withSecuritySpan } from '../../../../utils/with_security_span';
 import type { IRuleExecutionLogForExecutors } from '../../rule_monitoring';
+import type { AnomalyResults } from '../../../machine_learning';
 
 export const mlExecutor = async ({
   completeRule,
@@ -93,19 +96,29 @@ export const mlExecutor = async ({
       result.warning = true;
     }
 
-    const anomalyResults = await findMlSignals({
-      ml,
-      // Using fake KibanaRequest as it is needed to satisfy the ML Services API, but can be empty as it is
-      // currently unused by the mlAnomalySearch function.
-      request: {} as unknown as KibanaRequest,
-      savedObjectsClient: services.savedObjectsClient,
-      jobIds: ruleParams.machineLearningJobId,
-      anomalyThreshold: ruleParams.anomalyThreshold,
-      from: tuple.from.toISOString(),
-      to: tuple.to.toISOString(),
-      maxSignals: tuple.maxSignals,
-      exceptionFilter,
-    });
+    let anomalyResults: AnomalyResults;
+    try {
+      anomalyResults = await findMlSignals({
+        ml,
+        // Using fake KibanaRequest as it is needed to satisfy the ML Services API, but can be empty as it is
+        // currently unused by the mlAnomalySearch function.
+        request: {} as unknown as KibanaRequest,
+        savedObjectsClient: services.savedObjectsClient,
+        jobIds: ruleParams.machineLearningJobId,
+        anomalyThreshold: ruleParams.anomalyThreshold,
+        from: tuple.from.toISOString(),
+        to: tuple.to.toISOString(),
+        maxSignals: tuple.maxSignals,
+        exceptionFilter,
+      });
+    } catch (error) {
+      if (typeof error.message === 'string' && (error.message as string).endsWith('missing')) {
+        result.userError = true;
+      }
+      result.errors.push(error.message);
+      result.success = false;
+      return result;
+    }
 
     if (
       anomalyResults.hits.total &&

diff --git a/x-pack/plugins/security_solution/server/lib/detection_engine/rule_types/types.ts b/x-pack/plugins/security_solution/server/lib/detection_engine/rule_types/types.ts
@@ -65,6 +65,7 @@ export interface SecurityAlertTypeReturnValue<TState extends RuleTypeState> {
   createdSignalsCount: number;
   createdSignals: unknown[];
   errors: string[];
+  userError?: boolean;
   lastLookbackDate?: Date | null;
   searchAfterTimes: string[];
   state: TState;
@@ -388,6 +389,7 @@ export interface SearchAfterAndBulkCreateReturnType {
   createdSignalsCount: number;
   createdSignals: unknown[];
   errors: string[];
+  userError?: boolean;
   warningMessages: string[];
   suppressedAlertsCount?: number;
 }

diff --git a/.../detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/eql.ts b/.../detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/eql.ts
@@ -6,6 +6,8 @@
  */
 
 import { v4 as uuidv4 } from 'uuid';
+import supertestLib from 'supertest';
+import url from 'url';
 import expect from '@kbn/expect';
 import {
   ALERT_REASON,
@@ -30,7 +32,10 @@ import {
   ALERT_GROUP_ID,
 } from '@kbn/security-solution-plugin/common/field_maps/field_names';
 import { getMaxSignalsWarning as getMaxAlertsWarning } from '@kbn/security-solution-plugin/server/lib/detection_engine/rule_types/utils/utils';
-import { ENABLE_ASSET_CRITICALITY_SETTING } from '@kbn/security-solution-plugin/common/constants';
+import {
+  DETECTION_ENGINE_RULES_URL,
+  ENABLE_ASSET_CRITICALITY_SETTING,
+} from '@kbn/security-solution-plugin/common/constants';
 import {
   getEqlRuleForAlertTesting,
   getAlerts,
@@ -42,6 +47,8 @@ import {
   createRule,
   deleteAllRules,
   deleteAllAlerts,
+  waitForRuleFailure,
+  routeWithNamespace,
 } from '../../../../../../../common/utils/security_solution';
 import { FtrProviderContext } from '../../../../../../ftr_provider_context';
 import { EsArchivePathBuilder } from '../../../../../../es_archive_path_builder';
@@ -62,6 +69,7 @@ export default ({ getService }: FtrProviderContext) => {
 
   // TODO: add a new service for loading archiver files similar to "getService('es')"
   const config = getService('config');
+  const request = supertestLib(url.format(config.get('servers.kibana')));
   const isServerless = config.get('serverless');
   const dataPathBuilder = new EsArchivePathBuilder(isServerless);
   const auditPath = dataPathBuilder.getPath('auditbeat/hosts');
@@ -196,6 +204,42 @@ export default ({ getService }: FtrProviderContext) => {
       });
     });
 
+    it('classifies verification_exception errors as user errors', async () => {
+      function getMetricsRequest(reset: boolean = false) {
+        return request
+          .get(`/api/task_manager/metrics${reset ? '' : '?reset=false'}`)
+          .set('kbn-xsrf', 'foo')
+          .expect(200)
+          .then((response) => response.body);
+      }
+
+      await getMetricsRequest(true);
+      const rule: EqlRuleCreateProps = {
+        ...getEqlRuleForAlertTesting(['auditbeat-*']),
+        query: 'file where field.doesnt.exist == true',
+      };
+      const createdRule = await createRule(supertest, log, rule);
+      await waitForRuleFailure({ supertest, log, id: createdRule.id });
+
+      const route = routeWithNamespace(DETECTION_ENGINE_RULES_URL);
+      const response = await supertest
+        .get(route)
+        .set('kbn-xsrf', 'true')
+        .set('elastic-api-version', '2023-10-31')
+        .query({ id: createdRule.id })
+        .expect(200);
+
+      const ruleResponse = response.body;
+      expect(
+        ruleResponse.execution_summary.last_execution.message.includes('verification_exception')
+      ).eql(true);
+
+      const metricsResponse = await getMetricsRequest();
+      expect(
+        metricsResponse.metrics.task_run.value.by_type['alerting:siem__eqlRule'].user_errors
+      ).eql(1);
+    });
+
     it('generates up to max_alerts for non-sequence EQL queries', async () => {
       const maxAlerts = 200;
       const rule: EqlRuleCreateProps = {