diff --git a/docs/changelog/91917.yaml b/docs/changelog/91917.yaml
new file mode 100644
index 0000000000000..92304d353c949
--- /dev/null
+++ b/docs/changelog/91917.yaml
@@ -0,0 +1,6 @@
+pr: 91917
+summary: ML stats failures should not stop the usage API working
+area: Machine Learning
+type: bug
+issues:
+ - 91893
diff --git a/x-pack/plugin/ml/qa/ml-with-security/build.gradle b/x-pack/plugin/ml/qa/ml-with-security/build.gradle
index 478736168d330..50b8b16c2dd0e 100644
--- a/x-pack/plugin/ml/qa/ml-with-security/build.gradle
+++ b/x-pack/plugin/ml/qa/ml-with-security/build.gradle
@@ -197,6 +197,7 @@ tasks.named("yamlRestTest").configure {
     'ml/jobs_get_result_overall_buckets/Test overall buckets given invalid start param',
     'ml/jobs_get_result_overall_buckets/Test overall buckets given invalid end param',
     'ml/jobs_get_result_overall_buckets/Test overall buckets given bucket_span is smaller than max job bucket_span',
+    'ml/jobs_get_stats/Test closed results index',
     'ml/jobs_get_stats/Test get job stats given missing job',
     'ml/jobs_get_stats/Test no exception on get job stats with missing index',
     'ml/job_groups/Test put job with empty group',
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningUsageTransportAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningUsageTransportAction.java
index c99af373e5256..5129d0d45fde3 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningUsageTransportAction.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningUsageTransportAction.java
@@ -20,6 +20,8 @@
 import org.elasticsearch.common.util.Maps;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.license.XPackLicenseState;
+import org.elasticsearch.logging.LogManager;
+import org.elasticsearch.logging.Logger;
 import org.elasticsearch.protocol.xpack.XPackUsageRequest;
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.threadpool.ThreadPool;
@@ -66,6 +68,8 @@
 
 public class MachineLearningUsageTransportAction extends XPackUsageFeatureTransportAction {
 
+    private static final Logger logger = LogManager.getLogger(MachineLearningUsageTransportAction.class);
+
     private final Client client;
     private final XPackLicenseState licenseState;
     private final JobManagerHolder jobManagerHolder;
@@ -124,8 +128,8 @@ protected void masterOperation(
         int nodeCount = mlNodeCount(state);
 
         // Step 5. return final ML usage
-        ActionListener<Map<String, Object>> inferenceUsageListener = ActionListener.wrap(inferenceUsage -> {
-            listener.onResponse(
+        ActionListener<Map<String, Object>> inferenceUsageListener = ActionListener.wrap(
+            inferenceUsage -> listener.onResponse(
                 new XPackUsageFeatureResponse(
                     new MachineLearningFeatureSetUsage(
                         MachineLearningField.ML_API_FEATURE.checkWithoutTracking(licenseState),
@@ -137,45 +141,76 @@ protected void masterOperation(
                         nodeCount
                     )
                 )
-            );
-        }, listener::onFailure);
+            ),
+            e -> {
+                logger.warn("Failed to get inference usage to include in ML usage", e);
+                listener.onResponse(
+                    new XPackUsageFeatureResponse(
+                        new MachineLearningFeatureSetUsage(
+                            MachineLearningField.ML_API_FEATURE.checkWithoutTracking(licenseState),
+                            enabled,
+                            jobsUsage,
+                            datafeedsUsage,
+                            analyticsUsage,
+                            Collections.emptyMap(),
+                            nodeCount
+                        )
+                    )
+                );
+            }
+        );
 
         // Step 4. Extract usage from data frame analytics configs and then get inference usage
         ActionListener<GetDataFrameAnalyticsAction.Response> dataframeAnalyticsListener = ActionListener.wrap(response -> {
             addDataFrameAnalyticsUsage(response, analyticsUsage);
             addInferenceUsage(inferenceUsageListener);
-        }, listener::onFailure);
+        }, e -> {
+            logger.warn("Failed to get data frame analytics configs to include in ML usage", e);
+            addInferenceUsage(inferenceUsageListener);
+        });
 
         // Step 3. Extract usage from data frame analytics stats and then request data frame analytics configs
+        GetDataFrameAnalyticsAction.Request getDfaRequest = new GetDataFrameAnalyticsAction.Request(Metadata.ALL);
+        getDfaRequest.setPageParams(new PageParams(0, 10_000));
         ActionListener<GetDataFrameAnalyticsStatsAction.Response> dataframeAnalyticsStatsListener = ActionListener.wrap(response -> {
             addDataFrameAnalyticsStatsUsage(response, analyticsUsage);
-            GetDataFrameAnalyticsAction.Request getDfaRequest = new GetDataFrameAnalyticsAction.Request(Metadata.ALL);
-            getDfaRequest.setPageParams(new PageParams(0, 10_000));
             client.execute(GetDataFrameAnalyticsAction.INSTANCE, getDfaRequest, dataframeAnalyticsListener);
-        }, listener::onFailure);
+        }, e -> {
+            logger.warn("Failed to get data frame analytics stats to include in ML usage", e);
+            client.execute(GetDataFrameAnalyticsAction.INSTANCE, getDfaRequest, dataframeAnalyticsListener);
+        });
 
         // Step 2. Extract usage from datafeeds stats and then request stats for data frame analytics
+        GetDataFrameAnalyticsStatsAction.Request dataframeAnalyticsStatsRequest = new GetDataFrameAnalyticsStatsAction.Request(
+            Metadata.ALL
+        );
+        dataframeAnalyticsStatsRequest.setPageParams(new PageParams(0, 10_000));
         ActionListener<GetDatafeedsStatsAction.Response> datafeedStatsListener = ActionListener.wrap(response -> {
             addDatafeedsUsage(response, datafeedsUsage);
-            GetDataFrameAnalyticsStatsAction.Request dataframeAnalyticsStatsRequest = new GetDataFrameAnalyticsStatsAction.Request(
-                Metadata.ALL
-            );
-            dataframeAnalyticsStatsRequest.setPageParams(new PageParams(0, 10_000));
             client.execute(GetDataFrameAnalyticsStatsAction.INSTANCE, dataframeAnalyticsStatsRequest, dataframeAnalyticsStatsListener);
-        }, listener::onFailure);
+        }, e -> {
+            logger.warn("Failed to get datafeed stats to include in ML usage", e);
+            client.execute(GetDataFrameAnalyticsStatsAction.INSTANCE, dataframeAnalyticsStatsRequest, dataframeAnalyticsStatsListener);
+        });
 
         // Step 1. Extract usage from jobs stats and then request stats for all datafeeds
-        GetJobsStatsAction.Request jobStatsRequest = new GetJobsStatsAction.Request(Metadata.ALL);
+        GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(Metadata.ALL);
         ActionListener<GetJobsStatsAction.Response> jobStatsListener = ActionListener.wrap(
             response -> jobManagerHolder.getJobManager().expandJobs(Metadata.ALL, true, ActionListener.wrap(jobs -> {
                 addJobsUsage(response, jobs.results(), jobsUsage);
-                GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(Metadata.ALL);
                 client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
-            }, listener::onFailure)),
-            listener::onFailure
+            }, e -> {
+                logger.warn("Failed to get job configs to include in ML usage", e);
+                client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
+            })),
+            e -> {
+                logger.warn("Failed to get job stats to include in ML usage", e);
+                client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
+            }
         );
 
         // Step 0. Kick off the chain of callbacks by requesting jobs stats
+        GetJobsStatsAction.Request jobStatsRequest = new GetJobsStatsAction.Request(Metadata.ALL);
         client.execute(GetJobsStatsAction.INSTANCE, jobStatsRequest, jobStatsListener);
     }
 
diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_get_stats.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_get_stats.yml
index 798288d72700c..6aab1fb9e894a 100644
--- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_get_stats.yml
+++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_get_stats.yml
@@ -397,3 +397,57 @@ setup:
   - is_false: jobs.1.timing_stats.maximum_bucket_processing_time_ms
   - is_false: jobs.1.timing_stats.average_bucket_processing_time_ms
   - is_false: jobs.1.timing_stats.exponential_average_bucket_processing_time_ms
+
+---
+"Test closed results index":
+
+  - skip:
+      features:
+        - "warnings"
+
+  - do:
+      warnings:
+        - 'Posting data directly to anomaly detection jobs is deprecated, in a future major version it will be compulsory to use a datafeed'
+      ml.post_data:
+        job_id: job-stats-test
+        body: >
+          {"airline":"AAL","responsetime":"132.2046","time":"1403481600"}
+          {"airline":"JZA","responsetime":"990.4628","time":"1403481600"}
+
+  - do:
+      ml.close_job:
+        job_id: jobs-get-stats-datafeed-job
+  - match: { closed: true }
+
+  - do:
+      ml.close_job:
+        job_id: job-stats-test
+  - match: { closed: true }
+
+  - do:
+      ml.get_job_stats: {}
+  - length: { jobs : 2 }
+
+  - do:
+      xpack.usage: {}
+  - match: { ml.available: true }
+  - match: { ml.enabled: true }
+  - match: { ml.jobs.closed.count: 2 }
+
+  - do:
+      indices.close:
+        index: .ml-anomalies-shared
+
+  # With the index closed the low level ML API reports a problem
+  - do:
+      catch: /type=cluster_block_exception, reason=index \[.ml-anomalies-shared\] blocked by. \[FORBIDDEN\/.\/index closed\]/
+      ml.get_job_stats: {}
+
+  # But the high level X-Pack API returns what it can - we do this
+  # so that corruption to ML doesn't blind observers of the general
+  # cluster status
+  - do:
+      xpack.usage: {}
+  - match: { ml.available: true }
+  - match: { ml.enabled: true }
+  - is_false: ml.jobs.closed.count