From ecfe8c71079d9695480ec17c3686439d02987879 Mon Sep 17 00:00:00 2001 From: Ruirui Zhang Date: Thu, 10 Oct 2024 13:12:22 -0700 Subject: [PATCH] add workload management IT Signed-off-by: Ruirui Zhang --- CHANGELOG.md | 3 +- .../backpressure/SearchBackpressureIT.java | 10 +- .../opensearch/wlm/WorkloadManagementIT.java | 403 ++++++++++++++++++ 3 files changed, 412 insertions(+), 4 deletions(-) create mode 100644 server/src/internalClusterTest/java/org/opensearch/wlm/WorkloadManagementIT.java diff --git a/CHANGELOG.md b/CHANGELOG.md index f5622a177ee6c..f5c343ff7f4e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Fallback to Remote cluster-state on Term-Version check mismatch - ([#15424](https://github.com/opensearch-project/OpenSearch/pull/15424)) - Implement WithFieldName interface in ValuesSourceAggregationBuilder & FieldSortBuilder ([#15916](https://github.com/opensearch-project/OpenSearch/pull/15916)) - Add successfulSearchShardIndices in searchRequestContext ([#15967](https://github.com/opensearch-project/OpenSearch/pull/15967), [#16110](https://github.com/opensearch-project/OpenSearch/pull/16110)) -- [Tiered Caching] Segmented cache changes ([#16047](https://github.com/opensearch-project/OpenSearch/pull/16047)) +- [Tiered Caching] Segmented cache changes ([#16047](https://github.com/opensearch-project/OpenSearch/pull/16047))s +- [Workload Management] Add Workload Management IT ([#16359](https://github.com/opensearch-project/OpenSearch/pull/16359)) - Add support for msearch API to pass search pipeline name - ([#15923](https://github.com/opensearch-project/OpenSearch/pull/15923)) - Add _list/indices API as paginated alternate to _cat/indices ([#14718](https://github.com/opensearch-project/OpenSearch/pull/14718)) - Add success and failure metrics for async shard fetch ([#15976](https://github.com/opensearch-project/OpenSearch/pull/15976)) diff --git a/server/src/internalClusterTest/java/org/opensearch/search/backpressure/SearchBackpressureIT.java b/server/src/internalClusterTest/java/org/opensearch/search/backpressure/SearchBackpressureIT.java index 40c9301ef4bce..d200b9177353a 100644 --- a/server/src/internalClusterTest/java/org/opensearch/search/backpressure/SearchBackpressureIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/search/backpressure/SearchBackpressureIT.java @@ -314,7 +314,7 @@ public void testSearchCancellationWithBackpressureDisabled() throws InterruptedE assertNull("SearchShardTask shouldn't have cancelled for monitor_only mode", caughtException); } - private static class ExceptionCatchingListener implements ActionListener { + public static class ExceptionCatchingListener implements ActionListener { private final CountDownLatch latch; private Exception exception = null; @@ -333,7 +333,11 @@ public void onFailure(Exception e) { latch.countDown(); } - private Exception getException() { + public CountDownLatch getLatch() { + return latch; + } + + public Exception getException() { return exception; } } @@ -349,7 +353,7 @@ private Supplier descriptionSupplier(String description) { return () -> description; } - interface TaskFactory { + public interface TaskFactory { T createTask(long id, String type, String action, String description, TaskId parentTaskId, Map headers); } diff --git a/server/src/internalClusterTest/java/org/opensearch/wlm/WorkloadManagementIT.java b/server/src/internalClusterTest/java/org/opensearch/wlm/WorkloadManagementIT.java new file mode 100644 index 0000000000000..060af2a89853d --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/wlm/WorkloadManagementIT.java @@ -0,0 +1,403 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.wlm; + +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.opensearch.action.ActionRequest; +import org.opensearch.action.ActionRequestValidationException; +import org.opensearch.action.ActionType; +import org.opensearch.action.search.SearchTask; +import org.opensearch.action.support.ActionFilters; +import org.opensearch.action.support.HandledTransportAction; +import org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.ClusterStateUpdateTask; +import org.opensearch.cluster.metadata.Metadata; +import org.opensearch.cluster.metadata.QueryGroup; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.inject.Inject; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.core.action.ActionListener; +import org.opensearch.core.action.ActionResponse; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.tasks.TaskCancelledException; +import org.opensearch.core.tasks.TaskId; +import org.opensearch.plugins.ActionPlugin; +import org.opensearch.plugins.Plugin; +import org.opensearch.search.backpressure.SearchBackpressureIT.ExceptionCatchingListener; +import org.opensearch.search.backpressure.SearchBackpressureIT.TaskFactory; +import org.opensearch.search.backpressure.SearchBackpressureIT.TestResponse; +import org.opensearch.tasks.CancellableTask; +import org.opensearch.tasks.Task; +import org.opensearch.test.OpenSearchIntegTestCase; +import org.opensearch.test.ParameterizedStaticSettingsOpenSearchIntegTestCase; +import org.opensearch.threadpool.ThreadPool; +import org.opensearch.transport.TransportService; +import org.hamcrest.MatcherAssert; +import org.junit.After; +import org.junit.Before; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import static org.opensearch.search.SearchService.CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING; +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; +import static org.opensearch.wlm.QueryGroupTask.QUERY_GROUP_ID_HEADER; +import static org.hamcrest.Matchers.instanceOf; + +@OpenSearchIntegTestCase.ClusterScope(numDataNodes = 0) +public class WorkloadManagementIT extends ParameterizedStaticSettingsOpenSearchIntegTestCase { + final static String PUT = "PUT"; + final static String MEMORY = "MEMORY"; + final static String CPU = "CPU"; + final static String ENABLED = "enabled"; + final static String DELETE = "DELETE"; + private static final TimeValue TIMEOUT = new TimeValue(10, TimeUnit.SECONDS); + + public WorkloadManagementIT(Settings nodeSettings) { + super(nodeSettings); + } + + @ParametersFactory + public static Collection parameters() { + return Arrays.asList( + new Object[] { Settings.builder().put(CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING.getKey(), false).build() }, + new Object[] { Settings.builder().put(CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING.getKey(), true).build() } + ); + } + + @Override + protected Collection> nodePlugins() { + final List> plugins = new ArrayList<>(super.nodePlugins()); + plugins.add(TestClusterUpdatePlugin.class); + return plugins; + } + + @Before + public final void setupNodeSettings() { + Settings request = Settings.builder() + .put(WorkloadManagementSettings.NODE_LEVEL_MEMORY_REJECTION_THRESHOLD.getKey(), 0.8) + .put(WorkloadManagementSettings.NODE_LEVEL_MEMORY_CANCELLATION_THRESHOLD.getKey(), 0.9) + .put(WorkloadManagementSettings.NODE_LEVEL_CPU_REJECTION_THRESHOLD.getKey(), 0.8) + .put(WorkloadManagementSettings.NODE_LEVEL_CPU_CANCELLATION_THRESHOLD.getKey(), 0.9) + .build(); + assertAcked(client().admin().cluster().prepareUpdateSettings().setPersistentSettings(request).get()); + } + + @After + public final void cleanupNodeSettings() { + assertAcked( + client().admin() + .cluster() + .prepareUpdateSettings() + .setPersistentSettings(Settings.builder().putNull("*")) + .setTransientSettings(Settings.builder().putNull("*")) + ); + } + + public void testHighCPUInEnforcedMode() throws InterruptedException { + Settings request = Settings.builder().put(WorkloadManagementSettings.WLM_MODE_SETTING.getKey(), ENABLED).build(); + assertAcked(client().admin().cluster().prepareUpdateSettings().setPersistentSettings(request).get()); + QueryGroup queryGroup = new QueryGroup( + "name", + new MutableQueryGroupFragment( + MutableQueryGroupFragment.ResiliencyMode.ENFORCED, + Map.of(ResourceType.CPU, 0.01, ResourceType.MEMORY, 0.01) + ) + ); + updateQueryGroupInClusterState(PUT, queryGroup); + Exception caughtException = executeQueryGroupTask(CPU, queryGroup.get_id()); + assertNotNull("SearchTask should have been cancelled with TaskCancelledException", caughtException); + MatcherAssert.assertThat(caughtException, instanceOf(TaskCancelledException.class)); + updateQueryGroupInClusterState(DELETE, queryGroup); + } + + public void testHighCPUInMonitorMode() throws InterruptedException { + QueryGroup queryGroup = new QueryGroup( + "name", + new MutableQueryGroupFragment( + MutableQueryGroupFragment.ResiliencyMode.ENFORCED, + Map.of(ResourceType.CPU, 0.01, ResourceType.MEMORY, 0.01) + ) + ); + updateQueryGroupInClusterState(PUT, queryGroup); + Exception caughtException = executeQueryGroupTask(CPU, queryGroup.get_id()); + assertNull(caughtException); + updateQueryGroupInClusterState(DELETE, queryGroup); + } + + public void testHighMemoryInEnforcedMode() throws InterruptedException { + Settings request = Settings.builder().put(WorkloadManagementSettings.WLM_MODE_SETTING.getKey(), ENABLED).build(); + assertAcked(client().admin().cluster().prepareUpdateSettings().setPersistentSettings(request).get()); + QueryGroup queryGroup = new QueryGroup( + "name", + new MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.MEMORY, 0.01)) + ); + updateQueryGroupInClusterState(PUT, queryGroup); + Exception caughtException = executeQueryGroupTask(MEMORY, queryGroup.get_id()); + assertNotNull("SearchTask should have been cancelled with TaskCancelledException", caughtException); + MatcherAssert.assertThat(caughtException, instanceOf(TaskCancelledException.class)); + updateQueryGroupInClusterState(DELETE, queryGroup); + } + + public void testHighMemoryInMonitorMode() throws InterruptedException { + QueryGroup queryGroup = new QueryGroup( + "name", + new MutableQueryGroupFragment(MutableQueryGroupFragment.ResiliencyMode.ENFORCED, Map.of(ResourceType.MEMORY, 0.01)) + ); + updateQueryGroupInClusterState(PUT, queryGroup); + Exception caughtException = executeQueryGroupTask(MEMORY, queryGroup.get_id()); + assertNull("SearchTask should have been cancelled with TaskCancelledException", caughtException); + updateQueryGroupInClusterState(DELETE, queryGroup); + } + + public void testNoCancellation() throws InterruptedException { + QueryGroup queryGroup = new QueryGroup( + "name", + new MutableQueryGroupFragment( + MutableQueryGroupFragment.ResiliencyMode.ENFORCED, + Map.of(ResourceType.CPU, 0.8, ResourceType.MEMORY, 0.8) + ) + ); + updateQueryGroupInClusterState(PUT, queryGroup); + Exception caughtException = executeQueryGroupTask(CPU, queryGroup.get_id()); + assertNull(caughtException); + updateQueryGroupInClusterState(DELETE, queryGroup); + } + + public Exception executeQueryGroupTask(String resourceType, String queryGroupId) throws InterruptedException { + ExceptionCatchingListener listener = new ExceptionCatchingListener(); + client().execute( + TestQueryGroupTaskTransportAction.ACTION, + new TestQueryGroupTaskRequest( + resourceType, + queryGroupId, + (TaskFactory) (id, type, action, description, parentTaskId, headers) -> new SearchTask( + id, + type, + action, + () -> description, + parentTaskId, + headers + ) + ), + listener + ); + assertTrue(listener.getLatch().await(TIMEOUT.getSeconds() + 1, TimeUnit.SECONDS)); + return listener.getException(); + } + + public void updateQueryGroupInClusterState(String method, QueryGroup queryGroup) throws InterruptedException { + ExceptionCatchingListener listener = new ExceptionCatchingListener(); + client().execute(TestClusterUpdateTransportAction.ACTION, new TestClusterUpdateRequest(queryGroup, method), listener); + assertTrue(listener.getLatch().await(TIMEOUT.getSeconds(), TimeUnit.SECONDS)); + assertEquals(0, listener.getLatch().getCount()); + } + + public static class TestClusterUpdateRequest extends ActionRequest { + final private String method; + final private QueryGroup queryGroup; + + public TestClusterUpdateRequest(QueryGroup queryGroup, String method) { + this.method = method; + this.queryGroup = queryGroup; + } + + public TestClusterUpdateRequest(StreamInput in) throws IOException { + super(in); + this.method = in.readString(); + this.queryGroup = new QueryGroup(in); + } + + @Override + public ActionRequestValidationException validate() { + return null; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeString(method); + queryGroup.writeTo(out); + } + + public QueryGroup getQueryGroup() { + return queryGroup; + } + + public String getMethod() { + return method; + } + } + + public static class TestClusterUpdateTransportAction extends HandledTransportAction { + public static final ActionType ACTION = new ActionType<>("internal::test_cluster_update_action", TestResponse::new); + private final ClusterService clusterService; + + @Inject + public TestClusterUpdateTransportAction( + TransportService transportService, + ClusterService clusterService, + ActionFilters actionFilters + ) { + super(ACTION.name(), transportService, actionFilters, TestClusterUpdateRequest::new); + this.clusterService = clusterService; + } + + @Override + protected void doExecute(Task task, TestClusterUpdateRequest request, ActionListener listener) { + clusterService.submitStateUpdateTask("query-group-persistence-service", new ClusterStateUpdateTask() { + @Override + public ClusterState execute(ClusterState currentState) { + Map currentGroups = currentState.metadata().queryGroups(); + QueryGroup queryGroup = request.getQueryGroup(); + String id = queryGroup.get_id(); + String method = request.getMethod(); + Metadata metadata; + if (method.equals(PUT)) { // create + metadata = Metadata.builder(currentState.metadata()).put(queryGroup).build(); + } else { // delete + metadata = Metadata.builder(currentState.metadata()).remove(currentGroups.get(id)).build(); + } + return ClusterState.builder(currentState).metadata(metadata).build(); + } + + @Override + public void onFailure(String source, Exception e) { + listener.onFailure(e); + } + + @Override + public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { + listener.onResponse(new TestResponse()); + } + }); + } + } + + public static class TestQueryGroupTaskRequest extends ActionRequest { + private final String type; + private final String queryGroupId; + private TaskFactory taskFactory; + + public TestQueryGroupTaskRequest(String type, String queryGroupId, TaskFactory taskFactory) { + this.type = type; + this.queryGroupId = queryGroupId; + this.taskFactory = taskFactory; + } + + public TestQueryGroupTaskRequest(StreamInput in) throws IOException { + super(in); + this.type = in.readString(); + this.queryGroupId = in.readString(); + } + + @Override + public ActionRequestValidationException validate() { + return null; + } + + @Override + public Task createTask(long id, String type, String action, TaskId parentTaskId, Map headers) { + return taskFactory.createTask(id, type, action, "", parentTaskId, headers); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeString(type); + out.writeString(queryGroupId); + } + + public String getType() { + return type; + } + + public String getQueryGroupId() { + return queryGroupId; + } + } + + public static class TestQueryGroupTaskTransportAction extends HandledTransportAction { + public static final ActionType ACTION = new ActionType<>("internal::test_query_group_task_action", TestResponse::new); + private final ThreadPool threadPool; + + @Inject + public TestQueryGroupTaskTransportAction(TransportService transportService, ThreadPool threadPool, ActionFilters actionFilters) { + super(ACTION.name(), transportService, actionFilters, TestQueryGroupTaskRequest::new); + this.threadPool = threadPool; + } + + @Override + protected void doExecute(Task task, TestQueryGroupTaskRequest request, ActionListener listener) { + threadPool.getThreadContext().putHeader(QUERY_GROUP_ID_HEADER, request.getQueryGroupId()); + threadPool.executor(ThreadPool.Names.SEARCH).execute(() -> { + try { + CancellableTask cancellableTask = (CancellableTask) task; + ((QueryGroupTask) task).setQueryGroupId(threadPool.getThreadContext()); + assertEquals(request.getQueryGroupId(), ((QueryGroupTask) task).getQueryGroupId()); + long startTime = System.nanoTime(); + + while (System.nanoTime() - startTime < TIMEOUT.getNanos()) { + doWork(request); + if (cancellableTask.isCancelled()) { + break; + } + } + if (cancellableTask.isCancelled()) { + throw new TaskCancelledException(cancellableTask.getReasonCancelled()); + } else { + listener.onResponse(new TestResponse()); + } + } catch (Exception e) { + listener.onFailure(e); + } + }); + } + + private void doWork(TestQueryGroupTaskRequest request) throws InterruptedException { + switch (request.getType()) { + case "CPU": + long i = 0, j = 1, k = 1, iterations = 1000; + do { + j += i; + k *= j; + i++; + } while (i < iterations); + break; + case "MEMORY": + Byte[] bytes = new Byte[100000]; + int[] ints = new int[1000000]; + break; + } + } + } + + public static class TestClusterUpdatePlugin extends Plugin implements ActionPlugin { + @Override + public List> getActions() { + return Arrays.asList( + new ActionHandler<>(TestClusterUpdateTransportAction.ACTION, TestClusterUpdateTransportAction.class), + new ActionHandler<>(TestQueryGroupTaskTransportAction.ACTION, TestQueryGroupTaskTransportAction.class) + ); + } + + @Override + public List> getClientActions() { + return Arrays.asList(TestClusterUpdateTransportAction.ACTION, TestQueryGroupTaskTransportAction.ACTION); + } + } +}