From bdb6ff128c8e9fbf83d5e38e9a771853a171fdd2 Mon Sep 17 00:00:00 2001 From: Juan Pablo Djeredjian Date: Wed, 6 Nov 2024 09:50:44 -0300 Subject: [PATCH] [Security Solution] `/upgrade/_perform` performance testing (#197898) ## Summary - Creates a new `withSyncSecuritySpan` wrapper to measure sync functions in APM. Adds this wrapper to new CPU intensive logic in the `/upgrade/_perform` endpoint. - Do performance testing on the endpoint. See results below. ## Performance testing ### Possible OOMs in production Serverless Created an Serverless instance and manually installed different Prebuilt Rules package to force rule upgrades. - With the current published prebuilt packages, a user cannot update more than 950 rules with a single request. - This number is expected to grow, but at a slower pace than the actual number of rules being published. - Also, as users start customizing rules, rules with conflicts will be excluded from bulk requests, which will **make payloads even smaller.** - Testing the biggest possible upgrade request, Serverless behaved reliably and no **timeouts** or **OOMs** occurred: | From version | To version | Rule Updates | Request time | |---------|--------|---------|--------| | 8.9.9 | 8.15.9 | 913 | 47.3s | | 8.9.12 | 8.15.9 | 917 | 52.34s | | 8.9.15 | 8.15.9 | 928 | 56.08s | | 8.10.4 | 8.15.9 | 872 | 43.29s | | 8.10.5 | 8.15.9 | 910 | 52.21s | | 8.10.6 | 8.15.9 | 913 | 55.92s | | 8.10.7 | 8.15.9 | 924 | 49.89s | | 8.11.2 | 8.15.9 | 910 | 56.48s | | 8.11.5 | 8.15.9 | 928 | 49.22s | | 8.11.16 | 8.15.9 | 695 | 38.91s | | 8.12.6 | 8.15.9 | 947 | 51.13s | | 8.13.11 | 8.15.9 | 646 | 42.98s | - Given the positive results for much bigger payloads seen in the **Memory profiling with limited memory in Kibana production mode** below, we can assume that there's no risk of OOMs in Serverless at the moment. ### Memory profiling with limited memory in Kibana production mode - Launched Kibana in Production mode, and set a **memory limit of 700mb** to mimic as closely as possible the Serverless environment (where memory is a hard constraint) - Stress tested with big number of requests and saw the following behaviour: | Rule Updates | Request time (min) | OOM error? | Metrics | |---------|--------|--------|--------| | 1500 | 1.1 | No |
Unfold![image](https://github.com/user-attachments/assets/46303a1a-a929-4c00-8777-8d1f23face17)
| | 2000 | 1.5 | No |
Unfold![image](https://github.com/user-attachments/assets/bd33d259-50fd-42df-947d-3a2e7c5c78c3)
| | 2500 | 1.8 | No |
Unfold![image](https://github.com/user-attachments/assets/9145d2e7-e87c-4ba6-8633-7fe1087c29fb)
| | 2750 | 1.9 | No |
Unfold![image](https://github.com/user-attachments/assets/9009163e-f58d-4be3-8a1f-87844760a037)
| | 3000 | - | YES | | - Rule upgrade OOM's consistently when the payload is >= 3000 rules, but behaves reliably below that. Good enough buffer for growth of the Prebuilt Rules package. - Also, the saw-toothed shape of the heap used graphics shows that garbage collection works properly for payloads under 3000 rules. ### APM request profiling - Connected Kibana in production mode to a APM server to measure spans of the `/upgrade/_perform` request. - Additionally, measured new CPU-intensive logic which calculates rule diffs and create rule payloads for upgrades. - An example span for a successful upgrade of 2500 rules: image - The new spans for CPU-intensive tasks `getUpgradeableRules` and `createModifiedPrebuiltRuleAssets`, which are displayed as `blocking`, have an acceptable span length, and do not have a considerable overhead on the total length of the request. ### Timeout testing - Tested Kibana with `--no-base-path` in order to check for potential timeouts in long running requests (ESS Cloud proxy is supposed to have a request timeout config of 2.5 mins) - Still [confirming](https://elastic.slack.com/archives/C5UDAFZQU/p1730297621776729) with Kibana Operations the behaviour of the timeouts in ESS and Serverless envs: - Tested with mock rules (indexed directly to ES) and **no timeouts occurred**: | Rule Updates | Request time (min) | |---------|--------| | 2000 | 2.1 | | 2000 | 2.1 | | 2250 | 2.3 | | 2500 | 2.6 | | 3000 | 3.1 | ### Conclusion The results show that the `/upgrade/_perform` endpoint performs reliably under stress, given the currentexpected request payloads. The only question to triple check is the behaviour of server request timeouts in Serverless: I'm waiting the Kibana ops team to get back to me, even though testing here did not show cases of timeouts. --------- Co-authored-by: Elastic Machine Co-authored-by: Dmitrii Shevchenko --- .../create_upgradeable_rules_payload.ts | 142 +++++++++--------- .../get_upgradeable_rules.ts | 102 +++++++------ .../server/utils/with_security_span.ts | 15 +- 3 files changed, 140 insertions(+), 119 deletions(-) diff --git a/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/create_upgradeable_rules_payload.ts b/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/create_upgradeable_rules_payload.ts index 97e587646e524..b25320e1131ef 100644 --- a/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/create_upgradeable_rules_payload.ts +++ b/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/create_upgradeable_rules_payload.ts @@ -5,6 +5,7 @@ * 2.0. */ import { pickBy } from 'lodash'; +import { withSecuritySpanSync } from '../../../../../utils/with_security_span'; import type { PromisePoolError } from '../../../../../utils/promise_pool'; import { PickVersionValuesEnum, @@ -36,77 +37,82 @@ export const createModifiedPrebuiltRuleAssets = ({ upgradeableRules, requestBody, }: CreateModifiedPrebuiltRuleAssetsProps) => { - const { pick_version: globalPickVersion = PickVersionValuesEnum.MERGED, mode } = requestBody; - - const { modifiedPrebuiltRuleAssets, processingErrors } = upgradeableRules.reduce( - (processedRules, upgradeableRule) => { - const targetRuleType = upgradeableRule.target.type; - const ruleId = upgradeableRule.target.rule_id; - const fieldNames = FIELD_NAMES_BY_RULE_TYPE_MAP.get(targetRuleType); - - try { - if (fieldNames === undefined) { - throw new Error(`Unexpected rule type: ${targetRuleType}`); - } - - const { current, target } = upgradeableRule; - if (current.type !== target.type) { - assertPickVersionIsTarget({ ruleId, requestBody }); - } - - const calculatedRuleDiff = calculateRuleFieldsDiff({ - base_version: upgradeableRule.base - ? convertRuleToDiffable(convertPrebuiltRuleAssetToRuleResponse(upgradeableRule.base)) - : MissingVersion, - current_version: convertRuleToDiffable(upgradeableRule.current), - target_version: convertRuleToDiffable( - convertPrebuiltRuleAssetToRuleResponse(upgradeableRule.target) - ), - }) as AllFieldsDiff; - - if (mode === 'ALL_RULES' && globalPickVersion === 'MERGED') { - const fieldsWithConflicts = Object.keys(getFieldsDiffConflicts(calculatedRuleDiff)); - if (fieldsWithConflicts.length > 0) { - // If the mode is ALL_RULES, no fields can be overriden to any other pick_version - // than "MERGED", so throw an error for the fields that have conflicts. - throw new Error( - `Merge conflicts found in rule '${ruleId}' for fields: ${fieldsWithConflicts.join( - ', ' - )}. Please resolve the conflict manually or choose another value for 'pick_version'` - ); + return withSecuritySpanSync(createModifiedPrebuiltRuleAssets.name, () => { + const { pick_version: globalPickVersion = PickVersionValuesEnum.MERGED, mode } = requestBody; + + const { modifiedPrebuiltRuleAssets, processingErrors } = + upgradeableRules.reduce( + (processedRules, upgradeableRule) => { + const targetRuleType = upgradeableRule.target.type; + const ruleId = upgradeableRule.target.rule_id; + const fieldNames = FIELD_NAMES_BY_RULE_TYPE_MAP.get(targetRuleType); + + try { + if (fieldNames === undefined) { + throw new Error(`Unexpected rule type: ${targetRuleType}`); + } + + const { current, target } = upgradeableRule; + if (current.type !== target.type) { + assertPickVersionIsTarget({ ruleId, requestBody }); + } + + const calculatedRuleDiff = calculateRuleFieldsDiff({ + base_version: upgradeableRule.base + ? convertRuleToDiffable( + convertPrebuiltRuleAssetToRuleResponse(upgradeableRule.base) + ) + : MissingVersion, + current_version: convertRuleToDiffable(upgradeableRule.current), + target_version: convertRuleToDiffable( + convertPrebuiltRuleAssetToRuleResponse(upgradeableRule.target) + ), + }) as AllFieldsDiff; + + if (mode === 'ALL_RULES' && globalPickVersion === 'MERGED') { + const fieldsWithConflicts = Object.keys(getFieldsDiffConflicts(calculatedRuleDiff)); + if (fieldsWithConflicts.length > 0) { + // If the mode is ALL_RULES, no fields can be overriden to any other pick_version + // than "MERGED", so throw an error for the fields that have conflicts. + throw new Error( + `Merge conflicts found in rule '${ruleId}' for fields: ${fieldsWithConflicts.join( + ', ' + )}. Please resolve the conflict manually or choose another value for 'pick_version'` + ); + } + } + + const modifiedPrebuiltRuleAsset = createModifiedPrebuiltRuleAsset({ + upgradeableRule, + fieldNames, + requestBody, + globalPickVersion, + calculatedRuleDiff, + }); + + processedRules.modifiedPrebuiltRuleAssets.push(modifiedPrebuiltRuleAsset); + + return processedRules; + } catch (err) { + processedRules.processingErrors.push({ + error: err, + item: { rule_id: ruleId }, + }); + + return processedRules; } + }, + { + modifiedPrebuiltRuleAssets: [], + processingErrors: [], } + ); - const modifiedPrebuiltRuleAsset = createModifiedPrebuiltRuleAsset({ - upgradeableRule, - fieldNames, - requestBody, - globalPickVersion, - calculatedRuleDiff, - }); - - processedRules.modifiedPrebuiltRuleAssets.push(modifiedPrebuiltRuleAsset); - - return processedRules; - } catch (err) { - processedRules.processingErrors.push({ - error: err, - item: { rule_id: ruleId }, - }); - - return processedRules; - } - }, - { - modifiedPrebuiltRuleAssets: [], - processingErrors: [], - } - ); - - return { - modifiedPrebuiltRuleAssets, - processingErrors, - }; + return { + modifiedPrebuiltRuleAssets, + processingErrors, + }; + }); }; interface CreateModifiedPrebuiltRuleAssetParams { diff --git a/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/get_upgradeable_rules.ts b/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/get_upgradeable_rules.ts index acfdb674c309a..750561b9858a9 100644 --- a/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/get_upgradeable_rules.ts +++ b/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/get_upgradeable_rules.ts @@ -4,7 +4,7 @@ * 2.0; you may not use this file except in compliance with the Elastic License * 2.0. */ - +import { withSecuritySpanSync } from '../../../../../utils/with_security_span'; import type { RuleResponse, RuleUpgradeSpecifier, @@ -26,58 +26,60 @@ export const getUpgradeableRules = ({ versionSpecifiers?: RuleUpgradeSpecifier[]; mode: Mode; }) => { - const upgradeableRules = new Map( - rawUpgradeableRules.map((_rule) => [_rule.current.rule_id, _rule]) - ); - const fetchErrors: Array> = []; - const skippedRules: SkippedRuleUpgrade[] = []; + return withSecuritySpanSync(getUpgradeableRules.name, () => { + const upgradeableRules = new Map( + rawUpgradeableRules.map((_rule) => [_rule.current.rule_id, _rule]) + ); + const fetchErrors: Array> = []; + const skippedRules: SkippedRuleUpgrade[] = []; - if (mode === ModeEnum.SPECIFIC_RULES) { - const installedRuleIds = new Set(currentRules.map((rule) => rule.rule_id)); - const upgradeableRuleIds = new Set(rawUpgradeableRules.map(({ current }) => current.rule_id)); - versionSpecifiers?.forEach((rule) => { - // Check that the requested rule was found - if (!installedRuleIds.has(rule.rule_id)) { - fetchErrors.push({ - error: new Error( - `Rule with rule_id "${rule.rule_id}" and version "${rule.version}" not found` - ), - item: rule, - }); - return; - } + if (mode === ModeEnum.SPECIFIC_RULES) { + const installedRuleIds = new Set(currentRules.map((rule) => rule.rule_id)); + const upgradeableRuleIds = new Set(rawUpgradeableRules.map(({ current }) => current.rule_id)); + versionSpecifiers?.forEach((rule) => { + // Check that the requested rule was found + if (!installedRuleIds.has(rule.rule_id)) { + fetchErrors.push({ + error: new Error( + `Rule with rule_id "${rule.rule_id}" and version "${rule.version}" not found` + ), + item: rule, + }); + return; + } - // Check that the requested rule is upgradeable - if (!upgradeableRuleIds.has(rule.rule_id)) { - skippedRules.push({ - rule_id: rule.rule_id, - reason: SkipRuleUpgradeReasonEnum.RULE_UP_TO_DATE, - }); - return; - } + // Check that the requested rule is upgradeable + if (!upgradeableRuleIds.has(rule.rule_id)) { + skippedRules.push({ + rule_id: rule.rule_id, + reason: SkipRuleUpgradeReasonEnum.RULE_UP_TO_DATE, + }); + return; + } - // Check that rule revisions match (no update slipped in since the user reviewed the list) - const currentRevision = currentRules.find( - (currentRule) => currentRule.rule_id === rule.rule_id - )?.revision; - if (rule.revision !== currentRevision) { - fetchErrors.push({ - error: new Error( - `Revision mismatch for rule_id ${rule.rule_id}: expected ${currentRevision}, got ${rule.revision}` - ), - item: rule, - }); - // Remove the rule from the list of upgradeable rules - if (upgradeableRules.has(rule.rule_id)) { - upgradeableRules.delete(rule.rule_id); + // Check that rule revisions match (no update slipped in since the user reviewed the list) + const currentRevision = currentRules.find( + (currentRule) => currentRule.rule_id === rule.rule_id + )?.revision; + if (rule.revision !== currentRevision) { + fetchErrors.push({ + error: new Error( + `Revision mismatch for rule_id ${rule.rule_id}: expected ${currentRevision}, got ${rule.revision}` + ), + item: rule, + }); + // Remove the rule from the list of upgradeable rules + if (upgradeableRules.has(rule.rule_id)) { + upgradeableRules.delete(rule.rule_id); + } } - } - }); - } + }); + } - return { - upgradeableRules: Array.from(upgradeableRules.values()), - fetchErrors, - skippedRules, - }; + return { + upgradeableRules: Array.from(upgradeableRules.values()), + fetchErrors, + skippedRules, + }; + }); }; diff --git a/x-pack/plugins/security_solution/server/utils/with_security_span.ts b/x-pack/plugins/security_solution/server/utils/with_security_span.ts index 58787dc45d09b..f9f78600cfb8d 100644 --- a/x-pack/plugins/security_solution/server/utils/with_security_span.ts +++ b/x-pack/plugins/security_solution/server/utils/with_security_span.ts @@ -6,7 +6,7 @@ */ import type { SpanOptions } from '@kbn/apm-utils'; import { withSpan } from '@kbn/apm-utils'; -import type agent from 'elastic-apm-node'; +import agent from 'elastic-apm-node'; import { APP_ID } from '../../common/constants'; type Span = Exclude; @@ -35,3 +35,16 @@ export const withSecuritySpan = ( }, cb ); + +export const withSecuritySpanSync = (name: string, fn: (span: Span | null) => T): T => { + const span = agent.startSpan(name, APP_ID); + + try { + const result = fn(span); + return result; + } finally { + if (span) { + span.end(); + } + } +};