From 71f58a9287a7941b45629df1a14b079b646fd08a Mon Sep 17 00:00:00 2001 From: Dima Arnautov Date: Tue, 26 Nov 2024 11:33:04 +0100 Subject: [PATCH] [ML] Trained Model: Fix start deployment with ML autoscaling and 0 active nodes (#201256) ## Summary During my testing, I used the current user with all required privileges but failed to notice that, after switching to the internal `kibana_system` user, it lacked the `manage_autoscaling` privilege required for the `GET /_autoscaling/policy` API. As a result, the `isMlAutoscalingEnabled` flag, which we rely on in the Start Deployment modal, was always set to false. This caused a bug in scenarios with zero active ML nodes, where falling back to deriving available processors from ML limits was not possible. You can check the created deployment; it correctly identifies ML autoscaling: image Also fixes restoring vCPU levels from the API deployment params. ### Checklist Check that the PR satisfies the following conditions. - [x] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios (cherry picked from commit 9827a07b5891d643a61a53e09350ff6e4ab25889) --- .../deployment_params_mapper.test.ts | 291 ++++++++++++++++++ .../deployment_params_mapper.ts | 22 +- x-pack/plugins/ml/server/lib/node_utils.ts | 2 +- x-pack/plugins/ml/server/routes/system.ts | 11 +- 4 files changed, 315 insertions(+), 11 deletions(-) diff --git a/x-pack/plugins/ml/public/application/model_management/deployment_params_mapper.test.ts b/x-pack/plugins/ml/public/application/model_management/deployment_params_mapper.test.ts index 34875b893a867..4193251d76f3a 100644 --- a/x-pack/plugins/ml/public/application/model_management/deployment_params_mapper.test.ts +++ b/x-pack/plugins/ml/public/application/model_management/deployment_params_mapper.test.ts @@ -627,6 +627,297 @@ describe('DeploymentParamsMapper', () => { }, }); }); + + describe('mapApiToUiDeploymentParams', () => { + 
it('should map API params to UI correctly', () => { + // Optimized for search + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 16, + number_of_allocations: 2, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: false, + vCPUUsage: 'medium', + }); + + // Lower value + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 16, + number_of_allocations: 1, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: false, + vCPUUsage: 'medium', + }); + + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 8, + number_of_allocations: 2, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: false, + vCPUUsage: 'medium', + }); + + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 2, + number_of_allocations: 1, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: false, + vCPUUsage: 'low', + }); + + // Exact match + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 16, + number_of_allocations: 8, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 
'optimizedForSearch', + adaptiveResources: false, + vCPUUsage: 'high', + }); + + // Higher value + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 16, + number_of_allocations: 12, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: false, + vCPUUsage: 'high', + }); + + // Lower value + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 16, + number_of_allocations: 5, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: false, + vCPUUsage: 'high', + }); + + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 16, + number_of_allocations: 6, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: false, + vCPUUsage: 'high', + }); + + // Optimized for ingest + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 1, + number_of_allocations: 1, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForIngest', + adaptiveResources: false, + vCPUUsage: 'low', + }); + + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 1, + number_of_allocations: 2, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 
'test-deployment', + optimized: 'optimizedForIngest', + adaptiveResources: false, + vCPUUsage: 'low', + }); + + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 1, + number_of_allocations: 6, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForIngest', + adaptiveResources: false, + vCPUUsage: 'medium', + }); + }); + + it('should map API params to UI correctly with adaptive resources', () => { + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 8, + adaptive_allocations: { + enabled: true, + min_number_of_allocations: 2, + max_number_of_allocations: 2, + }, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: true, + vCPUUsage: 'medium', + }); + + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 2, + adaptive_allocations: { + enabled: true, + min_number_of_allocations: 2, + max_number_of_allocations: 2, + }, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: true, + vCPUUsage: 'medium', + }); + + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 1, + adaptive_allocations: { + enabled: true, + min_number_of_allocations: 1, + max_number_of_allocations: 1, + }, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForIngest', + adaptiveResources: true, + vCPUUsage: 
'low', + }); + + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 2, + adaptive_allocations: { + enabled: true, + min_number_of_allocations: 0, + max_number_of_allocations: 1, + }, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: true, + vCPUUsage: 'low', + }); + + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 1, + adaptive_allocations: { + enabled: true, + min_number_of_allocations: 0, + max_number_of_allocations: 64, + }, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForIngest', + adaptiveResources: true, + vCPUUsage: 'high', + }); + + expect( + mapper.mapApiToUiDeploymentParams({ + model_id: modelId, + deployment_id: 'test-deployment', + priority: 'normal', + threads_per_allocation: 16, + adaptive_allocations: { + enabled: true, + min_number_of_allocations: 0, + max_number_of_allocations: 12, + }, + } as unknown as MlTrainedModelAssignmentTaskParametersAdaptive) + ).toEqual({ + deploymentId: 'test-deployment', + optimized: 'optimizedForSearch', + adaptiveResources: true, + vCPUUsage: 'high', + }); + }); + }); }); }); }); diff --git a/x-pack/plugins/ml/public/application/model_management/deployment_params_mapper.ts b/x-pack/plugins/ml/public/application/model_management/deployment_params_mapper.ts index ecb8a06198b1c..96ba5f0755caa 100644 --- a/x-pack/plugins/ml/public/application/model_management/deployment_params_mapper.ts +++ b/x-pack/plugins/ml/public/application/model_management/deployment_params_mapper.ts @@ -25,7 +25,7 @@ type VCPUBreakpoints = Record< max: number; /** * Static value is used for the number of vCPUs when the adaptive resources 
are disabled. - * Not allowed in certain environments. + * Not allowed in certain environments, Obs and Security serverless projects. */ static?: number; } @@ -89,6 +89,7 @@ export class DeploymentParamsMapper { ) { /** * Initial value can be different for serverless and ESS with autoscaling. + * Also not available with 0 ML active nodes. */ const maxSingleMlNodeProcessors = this.mlServerLimits.max_single_ml_node_processors; @@ -236,18 +237,25 @@ export class DeploymentParamsMapper { ? input.adaptive_allocations!.max_number_of_allocations! : input.number_of_allocations); + // The deployment can be created via API with a number of allocations that do not exactly match our vCPU ranges. + // In this case, we should find the closest vCPU range that does not exceed the max or static value of the range. const [vCPUUsage] = Object.entries(this.vCpuBreakpoints) - .reverse() - .find(([key, val]) => vCPUs >= val.min) as [ - DeploymentParamsUI['vCPUUsage'], - { min: number; max: number } - ]; + .filter(([, range]) => vCPUs <= (adaptiveResources ? range.max : range.static!)) + .reduce( + (prev, curr) => { + const prevValue = adaptiveResources ? prev[1].max : prev[1].static!; + const currValue = adaptiveResources ? curr[1].max : curr[1].static!; + return Math.abs(vCPUs - prevValue) <= Math.abs(vCPUs - currValue) ? 
prev : curr; + }, + // in case allocation params exceed the max value of the high range + ['high', this.vCpuBreakpoints.high] + ); return { deploymentId: input.deployment_id, optimized, adaptiveResources, - vCPUUsage, + vCPUUsage: vCPUUsage as DeploymentParamsUI['vCPUUsage'], }; } } diff --git a/x-pack/plugins/ml/server/lib/node_utils.ts b/x-pack/plugins/ml/server/lib/node_utils.ts index d8098e3c43f42..9c5c0348f03da 100644 --- a/x-pack/plugins/ml/server/lib/node_utils.ts +++ b/x-pack/plugins/ml/server/lib/node_utils.ts @@ -33,7 +33,7 @@ export async function getMlNodeCount(client: IScopedClusterClient): Promise { const body = await client.asInternalUser.cluster.getSettings( { include_defaults: true, diff --git a/x-pack/plugins/ml/server/routes/system.ts b/x-pack/plugins/ml/server/routes/system.ts index 0804a8dc02348..04062db91305c 100644 --- a/x-pack/plugins/ml/server/routes/system.ts +++ b/x-pack/plugins/ml/server/routes/system.ts @@ -14,7 +14,7 @@ import { mlLog } from '../lib/log'; import { capabilitiesProvider } from '../lib/capabilities'; import { spacesUtilsProvider } from '../lib/spaces_utils'; import type { RouteInitialization, SystemRouteDeps } from '../types'; -import { getMlNodeCount } from '../lib/node_utils'; +import { getLazyMlNodeCount, getMlNodeCount } from '../lib/node_utils'; /** * System routes @@ -174,10 +174,15 @@ export function systemRoutes( let isMlAutoscalingEnabled = false; try { - await client.asInternalUser.autoscaling.getAutoscalingPolicy({ name: 'ml' }); + // kibana_system user does not have the manage_autoscaling cluster privilege. + // perform this check as a current user. + await client.asCurrentUser.autoscaling.getAutoscalingPolicy({ name: 'ml' }); isMlAutoscalingEnabled = true; } catch (e) { - // If doesn't exist, then keep the false + // If ml autoscaling policy doesn't exist or the user does not have privileges to fetch it, + // check the number of lazy ml nodes to determine if autoscaling is enabled. 
+ const lazyMlNodeCount = await getLazyMlNodeCount(client); + isMlAutoscalingEnabled = lazyMlNodeCount > 0; } return response.ok({