From 40d62be897e213d629682481103c286b2e84b54f Mon Sep 17 00:00:00 2001
From: Juan Pablo Djeredjian <jpdjeredjian@gmail.com>
Date: Wed, 6 Nov 2024 09:50:44 -0300
Subject: [PATCH] [Security Solution] `/upgrade/_perform` performance testing
 (#197898)

## Summary

- Creates a new `withSyncSecuritySpan` wrapper to measure sync functions
in APM. Adds this wrapper to new CPU intensive logic in the
`/upgrade/_perform` endpoint.
- Do performance testing on the endpoint. See results below.


## Performance testing

### Possible OOMs in production Serverless

Created an Serverless instance and manually installed different Prebuilt
Rules package to force rule upgrades.
- With the current published prebuilt packages, a user cannot update
more than 950 rules with a single request.
- This number is expected to grow, but at a slower pace than the actual
number of rules being published.
- Also, as users start customizing rules, rules with conflicts will be
excluded from bulk requests, which will **make payloads even smaller.**
- Testing the biggest possible upgrade request, Serverless behaved
reliably and no **timeouts** or **OOMs** occurred:

| From version   | To version | Rule Updates | Request time   |
|---------|--------|---------|--------|
| 8.9.9   | 8.15.9 | 913     | 47.3s  |
| 8.9.12  | 8.15.9 | 917     | 52.34s |
| 8.9.15  | 8.15.9 | 928     | 56.08s |
| 8.10.4  | 8.15.9 | 872     | 43.29s |
| 8.10.5  | 8.15.9 | 910     | 52.21s |
| 8.10.6  | 8.15.9 | 913     | 55.92s |
| 8.10.7  | 8.15.9 | 924     | 49.89s |
| 8.11.2  | 8.15.9 | 910     | 56.48s |
| 8.11.5  | 8.15.9 | 928     | 49.22s |
| 8.11.16 | 8.15.9 | 695     | 38.91s |
| 8.12.6  | 8.15.9 | 947     | 51.13s |
| 8.13.11 | 8.15.9 | 646     | 42.98s |

- Given the positive results for much bigger payloads seen in the
**Memory profiling with limited memory in Kibana production mode**
below, we can assume that there's no risk of OOMs in Serverless at the
moment.

### Memory profiling with limited memory in Kibana production mode

- Launched Kibana in Production mode, and set a **memory limit of
700mb** to mimic as closely as possible the Serverless environment
(where memory is a hard constraint)
- Stress tested with big number of requests and saw the following
behaviour:

| Rule Updates   | Request time (min) | OOM error? | Metrics |
|---------|--------|--------|--------|
| 1500 | 1.1 | No |
<details><summary>Unfold</summary>![image](https://github.com/user-attachments/assets/46303a1a-a929-4c00-8777-8d1f23face17)</details>
|
| 2000 | 1.5 | No |
<details><summary>Unfold</summary>![image](https://github.com/user-attachments/assets/bd33d259-50fd-42df-947d-3a2e7c5c78c3)</details>
|
| 2500 | 1.8 | No |
<details><summary>Unfold</summary>![image](https://github.com/user-attachments/assets/9145d2e7-e87c-4ba6-8633-7fe1087c29fb)</details>
|
| 2750 | 1.9 | No |
<details><summary>Unfold</summary>![image](https://github.com/user-attachments/assets/9009163e-f58d-4be3-8a1f-87844760a037)</details>
|
| 3000  | - | YES |  |

- Rule upgrade OOM's consistently when the payload is >= 3000 rules, but
behaves reliably below that. Good enough buffer for growth of the
Prebuilt Rules package.
- Also, the saw-toothed shape of the heap used graphics shows that
garbage collection works properly for payloads under 3000 rules.

### APM request profiling

- Connected Kibana in production mode to a APM server to measure spans
of the `/upgrade/_perform` request.
- Additionally, measured new CPU-intensive logic which calculates rule
diffs and create rule payloads for upgrades.
- An example span for a successful upgrade of 2500 rules:
<img width="1722" alt="image"
src="https://github.com/user-attachments/assets/07aa3079-5ce4-4b87-ab41-2a3e133316ef">
- The new spans for CPU-intensive tasks `getUpgradeableRules` and
`createModifiedPrebuiltRuleAssets`, which are displayed as `blocking`,
have an acceptable span length, and do not have a considerable overhead
on the total length of the request.

### Timeout testing

- Tested Kibana with `--no-base-path` in order to check for potential
timeouts in long running requests (ESS Cloud proxy is supposed to have a
request timeout config of 2.5 mins)
- Still
[confirming](https://elastic.slack.com/archives/C5UDAFZQU/p1730297621776729)
with Kibana Operations the behaviour of the timeouts in ESS and
Serverless envs:
- Tested with mock rules (indexed directly to ES) and **no timeouts
occurred**:

| Rule Updates   | Request time (min) |
|---------|--------|
| 2000  | 2.1 |
| 2000  | 2.1 |
| 2250  | 2.3 |
| 2500  | 2.6 |
| 3000  | 3.1 |



### Conclusion

The results show that the `/upgrade/_perform` endpoint performs reliably
under stress, given the currentexpected request payloads.

The only question to triple check is the behaviour of server request
timeouts in Serverless: I'm waiting the Kibana ops team to get back to
me, even though testing here did not show cases of timeouts.

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
Co-authored-by: Dmitrii Shevchenko <dmitrii.shevchenko@elastic.co>
---
 .../create_upgradeable_rules_payload.ts       | 142 +++++++++---------
 .../get_upgradeable_rules.ts                  | 102 +++++++------
 .../server/utils/with_security_span.ts        |  15 +-
 3 files changed, 140 insertions(+), 119 deletions(-)
diff --git a/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/create_upgradeable_rules_payload.ts b/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/create_upgradeable_rules_payload.ts
index 97e587646e524..b25320e1131ef 100644
--- a/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/create_upgradeable_rules_payload.ts
+++ b/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/create_upgradeable_rules_payload.ts
@@ -5,6 +5,7 @@
  * 2.0.
  */
 import { pickBy } from 'lodash';
+import { withSecuritySpanSync } from '../../../../../utils/with_security_span';
 import type { PromisePoolError } from '../../../../../utils/promise_pool';
 import {
   PickVersionValuesEnum,
@@ -36,77 +37,82 @@ export const createModifiedPrebuiltRuleAssets = ({
   upgradeableRules,
   requestBody,
 }: CreateModifiedPrebuiltRuleAssetsProps) => {
-  const { pick_version: globalPickVersion = PickVersionValuesEnum.MERGED, mode } = requestBody;
-
-  const { modifiedPrebuiltRuleAssets, processingErrors } = upgradeableRules.reduce<ProcessedRules>(
-    (processedRules, upgradeableRule) => {
-      const targetRuleType = upgradeableRule.target.type;
-      const ruleId = upgradeableRule.target.rule_id;
-      const fieldNames = FIELD_NAMES_BY_RULE_TYPE_MAP.get(targetRuleType);
-
-      try {
-        if (fieldNames === undefined) {
-          throw new Error(`Unexpected rule type: ${targetRuleType}`);
-        }
-
-        const { current, target } = upgradeableRule;
-        if (current.type !== target.type) {
-          assertPickVersionIsTarget({ ruleId, requestBody });
-        }
-
-        const calculatedRuleDiff = calculateRuleFieldsDiff({
-          base_version: upgradeableRule.base
-            ? convertRuleToDiffable(convertPrebuiltRuleAssetToRuleResponse(upgradeableRule.base))
-            : MissingVersion,
-          current_version: convertRuleToDiffable(upgradeableRule.current),
-          target_version: convertRuleToDiffable(
-            convertPrebuiltRuleAssetToRuleResponse(upgradeableRule.target)
-          ),
-        }) as AllFieldsDiff;
-
-        if (mode === 'ALL_RULES' && globalPickVersion === 'MERGED') {
-          const fieldsWithConflicts = Object.keys(getFieldsDiffConflicts(calculatedRuleDiff));
-          if (fieldsWithConflicts.length > 0) {
-            // If the mode is ALL_RULES, no fields can be overriden to any other pick_version
-            // than "MERGED", so throw an error for the fields that have conflicts.
-            throw new Error(
-              `Merge conflicts found in rule '${ruleId}' for fields: ${fieldsWithConflicts.join(
-                ', '
-              )}. Please resolve the conflict manually or choose another value for 'pick_version'`
-            );
+  return withSecuritySpanSync(createModifiedPrebuiltRuleAssets.name, () => {
+    const { pick_version: globalPickVersion = PickVersionValuesEnum.MERGED, mode } = requestBody;
+
+    const { modifiedPrebuiltRuleAssets, processingErrors } =
+      upgradeableRules.reduce<ProcessedRules>(
+        (processedRules, upgradeableRule) => {
+          const targetRuleType = upgradeableRule.target.type;
+          const ruleId = upgradeableRule.target.rule_id;
+          const fieldNames = FIELD_NAMES_BY_RULE_TYPE_MAP.get(targetRuleType);
+
+          try {
+            if (fieldNames === undefined) {
+              throw new Error(`Unexpected rule type: ${targetRuleType}`);
+            }
+
+            const { current, target } = upgradeableRule;
+            if (current.type !== target.type) {
+              assertPickVersionIsTarget({ ruleId, requestBody });
+            }
+
+            const calculatedRuleDiff = calculateRuleFieldsDiff({
+              base_version: upgradeableRule.base
+                ? convertRuleToDiffable(
+                    convertPrebuiltRuleAssetToRuleResponse(upgradeableRule.base)
+                  )
+                : MissingVersion,
+              current_version: convertRuleToDiffable(upgradeableRule.current),
+              target_version: convertRuleToDiffable(
+                convertPrebuiltRuleAssetToRuleResponse(upgradeableRule.target)
+              ),
+            }) as AllFieldsDiff;
+
+            if (mode === 'ALL_RULES' && globalPickVersion === 'MERGED') {
+              const fieldsWithConflicts = Object.keys(getFieldsDiffConflicts(calculatedRuleDiff));
+              if (fieldsWithConflicts.length > 0) {
+                // If the mode is ALL_RULES, no fields can be overriden to any other pick_version
+                // than "MERGED", so throw an error for the fields that have conflicts.
+                throw new Error(
+                  `Merge conflicts found in rule '${ruleId}' for fields: ${fieldsWithConflicts.join(
+                    ', '
+                  )}. Please resolve the conflict manually or choose another value for 'pick_version'`
+                );
+              }
+            }
+
+            const modifiedPrebuiltRuleAsset = createModifiedPrebuiltRuleAsset({
+              upgradeableRule,
+              fieldNames,
+              requestBody,
+              globalPickVersion,
+              calculatedRuleDiff,
+            });
+
+            processedRules.modifiedPrebuiltRuleAssets.push(modifiedPrebuiltRuleAsset);
+
+            return processedRules;
+          } catch (err) {
+            processedRules.processingErrors.push({
+              error: err,
+              item: { rule_id: ruleId },
+            });
+
+            return processedRules;
           }
+        },
+        {
+          modifiedPrebuiltRuleAssets: [],
+          processingErrors: [],
         }
+      );
 
-        const modifiedPrebuiltRuleAsset = createModifiedPrebuiltRuleAsset({
-          upgradeableRule,
-          fieldNames,
-          requestBody,
-          globalPickVersion,
-          calculatedRuleDiff,
-        });
-
-        processedRules.modifiedPrebuiltRuleAssets.push(modifiedPrebuiltRuleAsset);
-
-        return processedRules;
-      } catch (err) {
-        processedRules.processingErrors.push({
-          error: err,
-          item: { rule_id: ruleId },
-        });
-
-        return processedRules;
-      }
-    },
-    {
-      modifiedPrebuiltRuleAssets: [],
-      processingErrors: [],
-    }
-  );
-
-  return {
-    modifiedPrebuiltRuleAssets,
-    processingErrors,
-  };
+    return {
+      modifiedPrebuiltRuleAssets,
+      processingErrors,
+    };
+  });
 };
 
 interface CreateModifiedPrebuiltRuleAssetParams {
diff --git a/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/get_upgradeable_rules.ts b/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/get_upgradeable_rules.ts
index acfdb674c309a..750561b9858a9 100644
--- a/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/get_upgradeable_rules.ts
+++ b/x-pack/plugins/security_solution/server/lib/detection_engine/prebuilt_rules/api/perform_rule_upgrade/get_upgradeable_rules.ts
@@ -4,7 +4,7 @@
  * 2.0; you may not use this file except in compliance with the Elastic License
  * 2.0.
  */
-
+import { withSecuritySpanSync } from '../../../../../utils/with_security_span';
 import type {
   RuleResponse,
   RuleUpgradeSpecifier,
@@ -26,58 +26,60 @@ export const getUpgradeableRules = ({
   versionSpecifiers?: RuleUpgradeSpecifier[];
   mode: Mode;
 }) => {
-  const upgradeableRules = new Map(
-    rawUpgradeableRules.map((_rule) => [_rule.current.rule_id, _rule])
-  );
-  const fetchErrors: Array<PromisePoolError<{ rule_id: string }, Error>> = [];
-  const skippedRules: SkippedRuleUpgrade[] = [];
+  return withSecuritySpanSync(getUpgradeableRules.name, () => {
+    const upgradeableRules = new Map(
+      rawUpgradeableRules.map((_rule) => [_rule.current.rule_id, _rule])
+    );
+    const fetchErrors: Array<PromisePoolError<{ rule_id: string }, Error>> = [];
+    const skippedRules: SkippedRuleUpgrade[] = [];
 
-  if (mode === ModeEnum.SPECIFIC_RULES) {
-    const installedRuleIds = new Set(currentRules.map((rule) => rule.rule_id));
-    const upgradeableRuleIds = new Set(rawUpgradeableRules.map(({ current }) => current.rule_id));
-    versionSpecifiers?.forEach((rule) => {
-      // Check that the requested rule was found
-      if (!installedRuleIds.has(rule.rule_id)) {
-        fetchErrors.push({
-          error: new Error(
-            `Rule with rule_id "${rule.rule_id}" and version "${rule.version}" not found`
-          ),
-          item: rule,
-        });
-        return;
-      }
+    if (mode === ModeEnum.SPECIFIC_RULES) {
+      const installedRuleIds = new Set(currentRules.map((rule) => rule.rule_id));
+      const upgradeableRuleIds = new Set(rawUpgradeableRules.map(({ current }) => current.rule_id));
+      versionSpecifiers?.forEach((rule) => {
+        // Check that the requested rule was found
+        if (!installedRuleIds.has(rule.rule_id)) {
+          fetchErrors.push({
+            error: new Error(
+              `Rule with rule_id "${rule.rule_id}" and version "${rule.version}" not found`
+            ),
+            item: rule,
+          });
+          return;
+        }
 
-      // Check that the requested rule is upgradeable
-      if (!upgradeableRuleIds.has(rule.rule_id)) {
-        skippedRules.push({
-          rule_id: rule.rule_id,
-          reason: SkipRuleUpgradeReasonEnum.RULE_UP_TO_DATE,
-        });
-        return;
-      }
+        // Check that the requested rule is upgradeable
+        if (!upgradeableRuleIds.has(rule.rule_id)) {
+          skippedRules.push({
+            rule_id: rule.rule_id,
+            reason: SkipRuleUpgradeReasonEnum.RULE_UP_TO_DATE,
+          });
+          return;
+        }
 
-      // Check that rule revisions match (no update slipped in since the user reviewed the list)
-      const currentRevision = currentRules.find(
-        (currentRule) => currentRule.rule_id === rule.rule_id
-      )?.revision;
-      if (rule.revision !== currentRevision) {
-        fetchErrors.push({
-          error: new Error(
-            `Revision mismatch for rule_id ${rule.rule_id}: expected ${currentRevision}, got ${rule.revision}`
-          ),
-          item: rule,
-        });
-        // Remove the rule from the list of upgradeable rules
-        if (upgradeableRules.has(rule.rule_id)) {
-          upgradeableRules.delete(rule.rule_id);
+        // Check that rule revisions match (no update slipped in since the user reviewed the list)
+        const currentRevision = currentRules.find(
+          (currentRule) => currentRule.rule_id === rule.rule_id
+        )?.revision;
+        if (rule.revision !== currentRevision) {
+          fetchErrors.push({
+            error: new Error(
+              `Revision mismatch for rule_id ${rule.rule_id}: expected ${currentRevision}, got ${rule.revision}`
+            ),
+            item: rule,
+          });
+          // Remove the rule from the list of upgradeable rules
+          if (upgradeableRules.has(rule.rule_id)) {
+            upgradeableRules.delete(rule.rule_id);
+          }
         }
-      }
-    });
-  }
+      });
+    }
 
-  return {
-    upgradeableRules: Array.from(upgradeableRules.values()),
-    fetchErrors,
-    skippedRules,
-  };
+    return {
+      upgradeableRules: Array.from(upgradeableRules.values()),
+      fetchErrors,
+      skippedRules,
+    };
+  });
 };
diff --git a/x-pack/plugins/security_solution/server/utils/with_security_span.ts b/x-pack/plugins/security_solution/server/utils/with_security_span.ts
index 58787dc45d09b..f9f78600cfb8d 100644
--- a/x-pack/plugins/security_solution/server/utils/with_security_span.ts
+++ b/x-pack/plugins/security_solution/server/utils/with_security_span.ts
@@ -6,7 +6,7 @@
  */
 import type { SpanOptions } from '@kbn/apm-utils';
 import { withSpan } from '@kbn/apm-utils';
-import type agent from 'elastic-apm-node';
+import agent from 'elastic-apm-node';
 import { APP_ID } from '../../common/constants';
 
 type Span = Exclude<typeof agent.currentSpan, undefined | null>;
@@ -35,3 +35,16 @@ export const withSecuritySpan = <T>(
     },
     cb
   );
+
+export const withSecuritySpanSync = <T>(name: string, fn: (span: Span | null) => T): T => {
+  const span = agent.startSpan(name, APP_ID);
+
+  try {
+    const result = fn(span);
+    return result;
+  } finally {
+    if (span) {
+      span.end();
+    }
+  }
+};