diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/TrainedModelAssignmentClusterService.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/TrainedModelAssignmentClusterService.java index 2caf338d2a3c7..ca9a99f2d96f5 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/TrainedModelAssignmentClusterService.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/TrainedModelAssignmentClusterService.java @@ -164,6 +164,8 @@ public void clusterChanged(ClusterChangedEvent event) { } if (eventStateMinTransportVersionIsBeforeDistributedModelAllocationTransportVersion(event)) { + logger.trace("min transport version is before assignment change on " + event.state().nodes().getAllNodes().size() + " nodes"); + // we should not try to rebalance assignments while there may be nodes running on a version // prior to introducing distributed model allocation. // But we should remove routing to removed or shutting down nodes. @@ -238,6 +240,7 @@ public void onFailure(Exception e) { } private void removeRoutingToRemovedOrShuttingDownNodes(ClusterChangedEvent event) { + logger.trace("remove routing to removed or shutting down nodes "); if (areAssignedNodesRemoved(event)) { submitUnbatchedTask("removing routing entries for removed or shutting down nodes", new ClusterStateUpdateTask() { @Override @@ -282,6 +285,7 @@ static boolean areAssignedNodesRemoved(ClusterChangedEvent event) { // Visible for testing static ClusterState removeRoutingToUnassignableNodes(ClusterState currentState) { + logger.trace("remove routing to unassignable nodes"); Set assignableNodes = getAssignableNodes(currentState).stream().map(DiscoveryNode::getId).collect(Collectors.toSet()); TrainedModelAssignmentMetadata metadata = TrainedModelAssignmentMetadata.fromState(currentState); TrainedModelAssignmentMetadata.Builder builder = TrainedModelAssignmentMetadata.builder(currentState); @@ -431,6 +435,7 @@ public void createNewModelAssignment( } public void setModelAssignmentToStopping(String modelId, ActionListener listener) { + logger.trace("set to stopping"); submitUnbatchedTask("set model assignment stopping", new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { @@ -450,6 +455,7 @@ public void clusterStateProcessed(ClusterState oldState, ClusterState newState) } public void removeModelAssignment(String deploymentId, ActionListener listener) { + logger.trace("remove model assignments"); submitUnbatchedTask("delete model deployment assignment", new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { @@ -486,6 +492,7 @@ public void clusterStateProcessed(ClusterState oldState, ClusterState newState) // Used by the reset action directly public void removeAllModelAssignments(ActionListener listener) { + logger.trace("remove all assignments"); submitUnbatchedTask("delete all model assignments", new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { @@ -518,9 +525,11 @@ private static ClusterState forceUpdate(ClusterState currentState, TrainedModelA logger.debug(() -> format("updated assignments: %s", modelAssignments.build())); Metadata.Builder metadata = Metadata.builder(currentState.metadata()); if (currentState.getMinTransportVersion().onOrAfter(RENAME_ALLOCATION_TO_ASSIGNMENT_TRANSPORT_VERSION)) { + logger.trace("putting custom new name"); metadata.putCustom(TrainedModelAssignmentMetadata.NAME, modelAssignments.build()) .removeCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME); } else { + logger.trace("putting custom old name"); metadata.putCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME, modelAssignments.buildOld()); } return ClusterState.builder(currentState).metadata(metadata).build(); @@ -616,6 +625,7 @@ ClusterState stopPlatformSpecificModelsInHeterogeneousClusters( modelToAdd.get().getModelId(), mlNodesArchitectures ); + logger.info(reasonToStop); updatedState = callSetToStopping(reasonToStop, modelToAdd.get().getDeploymentId(), clusterState); } return updatedState; diff --git a/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MLModelDeploymentsUpgradeIT.java b/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MLModelDeploymentsUpgradeIT.java index b9fbf0b6b1f03..87d605d29fa86 100644 --- a/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MLModelDeploymentsUpgradeIT.java +++ b/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MLModelDeploymentsUpgradeIT.java @@ -73,7 +73,7 @@ public void setUpLogging() throws IOException { { "persistent": { "logger.org.elasticsearch.xpack.ml.inference": "TRACE", - "logger.org.elasticsearch.xpack.ml.inference.assignments": "DEBUG", + "logger.org.elasticsearch.xpack.ml.inference.assignments": "TRACE", "logger.org.elasticsearch.xpack.ml.process": "DEBUG", "logger.org.elasticsearch.xpack.ml.action": "TRACE" } @@ -97,6 +97,7 @@ public void removeLogging() throws IOException { client().performRequest(request); } + @AwaitsFix(bugUrl = "mute to try and reproduce https://github.com/elastic/elasticsearch/issues/100379") public void testTrainedModelDeployment() throws Exception { assumeTrue("NLP model deployments added in 8.0", UPGRADE_FROM_VERSION.onOrAfter(Version.V_8_0_0));