Skip to content

Commit

Permalink
[ML] Exclude quantiles when fetching model snapshots where possible (e…
Browse files Browse the repository at this point in the history
…lastic#103530)

As a followup to elastic#70376 this change further reduces the number of
places where we fetch the `quantiles` field of model snapshot
documents.

The quantiles can be very large and can cause out-of-memory errors
on small nodes, especially if more than one document containing
quantiles is loaded into memory at one time. The method
`JobManager.validateModelSnapshotIdUpdate` was a place where
two model snapshot documents were being loaded simultaneously,
both with their quantiles unnecessarily included. Following this
change there should be no risk of that method causing an
out-of-memory exception.
  • Loading branch information
droberts195 committed Dec 19, 2023
1 parent 01e6a46 commit 07ffedd
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 23 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/103530.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 103530
summary: Exclude quantiles when fetching model snapshots where possible
area: Machine Learning
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.Quantiles;
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.TimingStats;
import org.elasticsearch.xpack.core.ml.job.results.Result;
import org.elasticsearch.xpack.core.ml.utils.MlIndexAndAlias;
import org.elasticsearch.xpack.core.ml.utils.ToXContentParams;
import org.elasticsearch.xpack.ml.MlSingleNodeTestCase;
Expand Down Expand Up @@ -846,6 +847,16 @@ public void testGetSnapshots() {
assertNull(snapshots.get(3).getQuantiles());
assertNull(snapshots.get(4).getQuantiles());

// test get single snapshot
PlainActionFuture<Result<ModelSnapshot>> singleFuture = new PlainActionFuture<>();
jobProvider.getModelSnapshot(jobId, "snap_1", true, singleFuture::onResponse, singleFuture::onFailure);
ModelSnapshot withQuantiles = singleFuture.actionGet().result;
assertThat(withQuantiles.getQuantiles().getTimestamp().getTime(), equalTo(11L));

singleFuture = new PlainActionFuture<>();
jobProvider.getModelSnapshot(jobId, "snap_2", false, singleFuture::onResponse, singleFuture::onFailure);
ModelSnapshot withoutQuantiles = singleFuture.actionGet().result;
assertNull(withoutQuantiles.getQuantiles());
}

public void testGetAutodetectParams() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ private static void getModelSnapshot(
return;
}

provider.getModelSnapshot(request.getJobId(), request.getSnapshotId(), modelSnapshot -> {
provider.getModelSnapshot(request.getJobId(), request.getSnapshotId(), true, modelSnapshot -> {
if (modelSnapshot == null) {
throw missingSnapshotException(request);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ protected void doExecute(
ActionListener<UpdateModelSnapshotAction.Response> listener
) {
logger.debug("Received request to update model snapshot [{}] for job [{}]", request.getSnapshotId(), request.getJobId());
jobResultsProvider.getModelSnapshot(request.getJobId(), request.getSnapshotId(), modelSnapshot -> {
// Even though the quantiles can be large we have to fetch them initially so that the updated document is complete
jobResultsProvider.getModelSnapshot(request.getJobId(), request.getSnapshotId(), true, modelSnapshot -> {
if (modelSnapshot == null) {
listener.onFailure(
new ResourceNotFoundException(
Expand All @@ -81,8 +82,7 @@ protected void doExecute(
} else {
Result<ModelSnapshot> updatedSnapshot = applyUpdate(request, modelSnapshot);
indexModelSnapshot(updatedSnapshot, b -> {
// The quantiles can be large, and totally dominate the output -
// it's clearer to remove them
// The quantiles can be large, and totally dominate the output - it's clearer to remove them at this stage
listener.onResponse(
new UpdateModelSnapshotAction.Response(new ModelSnapshot.Builder(updatedSnapshot.result).setQuantiles(null).build())
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ protected void masterOperation(Task task, Request request, ClusterState state, A
jobResultsProvider.getModelSnapshot(
request.getJobId(),
request.getSnapshotId(),
false,
getSnapshotHandler::onResponse,
getSnapshotHandler::onFailure
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -478,24 +478,27 @@ private void validate(Job job, JobUpdate jobUpdate, ActionListener<Void> handler

private void validateModelSnapshotIdUpdate(Job job, String modelSnapshotId, VoidChainTaskExecutor voidChainTaskExecutor) {
if (modelSnapshotId != null && ModelSnapshot.isTheEmptySnapshot(modelSnapshotId) == false) {
voidChainTaskExecutor.add(listener -> jobResultsProvider.getModelSnapshot(job.getId(), modelSnapshotId, newModelSnapshot -> {
if (newModelSnapshot == null) {
String message = Messages.getMessage(Messages.REST_NO_SUCH_MODEL_SNAPSHOT, modelSnapshotId, job.getId());
listener.onFailure(new ResourceNotFoundException(message));
return;
}
jobResultsProvider.getModelSnapshot(job.getId(), job.getModelSnapshotId(), oldModelSnapshot -> {
if (oldModelSnapshot != null && newModelSnapshot.result.getTimestamp().before(oldModelSnapshot.result.getTimestamp())) {
String message = "Job ["
+ job.getId()
+ "] has a more recent model snapshot ["
+ oldModelSnapshot.result.getSnapshotId()
+ "]";
listener.onFailure(new IllegalArgumentException(message));
voidChainTaskExecutor.add(
listener -> jobResultsProvider.getModelSnapshot(job.getId(), modelSnapshotId, false, newModelSnapshot -> {
if (newModelSnapshot == null) {
String message = Messages.getMessage(Messages.REST_NO_SUCH_MODEL_SNAPSHOT, modelSnapshotId, job.getId());
listener.onFailure(new ResourceNotFoundException(message));
return;
}
listener.onResponse(null);
}, listener::onFailure);
}, listener::onFailure));
jobResultsProvider.getModelSnapshot(job.getId(), job.getModelSnapshotId(), false, oldModelSnapshot -> {
if (oldModelSnapshot != null
&& newModelSnapshot.result.getTimestamp().before(oldModelSnapshot.result.getTimestamp())) {
String message = "Job ["
+ job.getId()
+ "] has a more recent model snapshot ["
+ oldModelSnapshot.result.getSnapshotId()
+ "]";
listener.onFailure(new IllegalArgumentException(message));
}
listener.onResponse(null);
}, listener::onFailure);
}, listener::onFailure)
);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1257,11 +1257,13 @@ public BatchedResultsIterator<Influencer> newBatchedInfluencersIterator(String j
}

/**
* Get a job's model snapshot by its id
* Get a job's model snapshot by its id.
* Quantiles should only be included when strictly required, because they can be very large and consume a lot of heap.
*/
public void getModelSnapshot(
String jobId,
@Nullable String modelSnapshotId,
boolean includeQuantiles,
Consumer<Result<ModelSnapshot>> handler,
Consumer<Exception> errorHandler
) {
Expand All @@ -1271,6 +1273,9 @@ public void getModelSnapshot(
}
String resultsIndex = AnomalyDetectorsIndex.jobResultsAliasedName(jobId);
SearchRequestBuilder search = createDocIdSearch(resultsIndex, ModelSnapshot.documentId(jobId, modelSnapshotId));
if (includeQuantiles == false) {
search.setFetchSource(null, ModelSnapshot.QUANTILES.getPreferredName());
}
searchSingleResult(
jobId,
ModelSnapshot.TYPE.getPreferredName(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,6 @@ private void deleteSnapshotAndFailTask(AllocatedPersistentTask task, String jobI
);

});
jobResultsProvider.getModelSnapshot(jobId, snapshotId, modelSnapshotListener::onResponse, modelSnapshotListener::onFailure);
jobResultsProvider.getModelSnapshot(jobId, snapshotId, false, modelSnapshotListener::onResponse, modelSnapshotListener::onFailure);
}
}

0 comments on commit 07ffedd

Please sign in to comment.