From 2d74bb795f1e7619a806aad3059eafec677a36b6 Mon Sep 17 00:00:00 2001
From: David Roberts <dave.roberts@elastic.co>
Date: Thu, 24 Nov 2022 17:53:15 +0000
Subject: [PATCH] [ML] ML stats failures should not stop the usage API working
 (#91917)

It is possible to meddle with internal ML state such that calls
to the ML stats APIs return errors. It is justifiable for these
single-purpose APIs to return errors when the internal state of
ML is corrupted. However, it is undesirable for these low-level
problems to completely prevent the overall usage API from returning,
because then callers cannot find out usage information from any
part of the system.

This change makes errors in the ML stats APIs non-fatal to the
overall response of the usage API. When an ML stats API returns
an error, the corresponding section of the ML usage information
will be blank.

Fixes #91893
---
 docs/changelog/91917.yaml                     |  6 ++
 .../ml/qa/ml-with-security/build.gradle       |  1 +
 .../MachineLearningUsageTransportAction.java  | 69 ++++++++++++++-----
 .../rest-api-spec/test/ml/jobs_get_stats.yml  | 54 +++++++++++++++
 4 files changed, 113 insertions(+), 17 deletions(-)
 create mode 100644 docs/changelog/91917.yaml

diff --git a/docs/changelog/91917.yaml b/docs/changelog/91917.yaml
new file mode 100644
index 0000000000000..92304d353c949
--- /dev/null
+++ b/docs/changelog/91917.yaml
@@ -0,0 +1,6 @@
+pr: 91917
+summary: ML stats failures should not stop the usage API working
+area: Machine Learning
+type: bug
+issues:
+ - 91893
diff --git a/x-pack/plugin/ml/qa/ml-with-security/build.gradle b/x-pack/plugin/ml/qa/ml-with-security/build.gradle
index 478736168d330..50b8b16c2dd0e 100644
--- a/x-pack/plugin/ml/qa/ml-with-security/build.gradle
+++ b/x-pack/plugin/ml/qa/ml-with-security/build.gradle
@@ -197,6 +197,7 @@ tasks.named("yamlRestTest").configure {
     'ml/jobs_get_result_overall_buckets/Test overall buckets given invalid start param',
     'ml/jobs_get_result_overall_buckets/Test overall buckets given invalid end param',
     'ml/jobs_get_result_overall_buckets/Test overall buckets given bucket_span is smaller than max job bucket_span',
+    'ml/jobs_get_stats/Test closed results index',
     'ml/jobs_get_stats/Test get job stats given missing job',
     'ml/jobs_get_stats/Test no exception on get job stats with missing index',
     'ml/job_groups/Test put job with empty group',
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningUsageTransportAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningUsageTransportAction.java
index c99af373e5256..5129d0d45fde3 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningUsageTransportAction.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearningUsageTransportAction.java
@@ -20,6 +20,8 @@
 import org.elasticsearch.common.util.Maps;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.license.XPackLicenseState;
+import org.elasticsearch.logging.LogManager;
+import org.elasticsearch.logging.Logger;
 import org.elasticsearch.protocol.xpack.XPackUsageRequest;
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.threadpool.ThreadPool;
@@ -66,6 +68,8 @@
 
 public class MachineLearningUsageTransportAction extends XPackUsageFeatureTransportAction {
 
+    private static final Logger logger = LogManager.getLogger(MachineLearningUsageTransportAction.class);
+
     private final Client client;
     private final XPackLicenseState licenseState;
     private final JobManagerHolder jobManagerHolder;
@@ -124,8 +128,8 @@ protected void masterOperation(
         int nodeCount = mlNodeCount(state);
 
         // Step 5. return final ML usage
-        ActionListener<Map<String, Object>> inferenceUsageListener = ActionListener.wrap(inferenceUsage -> {
-            listener.onResponse(
+        ActionListener<Map<String, Object>> inferenceUsageListener = ActionListener.wrap(
+            inferenceUsage -> listener.onResponse(
                 new XPackUsageFeatureResponse(
                     new MachineLearningFeatureSetUsage(
                         MachineLearningField.ML_API_FEATURE.checkWithoutTracking(licenseState),
@@ -137,45 +141,76 @@ protected void masterOperation(
                         nodeCount
                     )
                 )
-            );
-        }, listener::onFailure);
+            ),
+            e -> {
+                logger.warn("Failed to get inference usage to include in ML usage", e);
+                listener.onResponse(
+                    new XPackUsageFeatureResponse(
+                        new MachineLearningFeatureSetUsage(
+                            MachineLearningField.ML_API_FEATURE.checkWithoutTracking(licenseState),
+                            enabled,
+                            jobsUsage,
+                            datafeedsUsage,
+                            analyticsUsage,
+                            Collections.emptyMap(),
+                            nodeCount
+                        )
+                    )
+                );
+            }
+        );
 
         // Step 4. Extract usage from data frame analytics configs and then get inference usage
         ActionListener<GetDataFrameAnalyticsAction.Response> dataframeAnalyticsListener = ActionListener.wrap(response -> {
             addDataFrameAnalyticsUsage(response, analyticsUsage);
             addInferenceUsage(inferenceUsageListener);
-        }, listener::onFailure);
+        }, e -> {
+            logger.warn("Failed to get data frame analytics configs to include in ML usage", e);
+            addInferenceUsage(inferenceUsageListener);
+        });
 
         // Step 3. Extract usage from data frame analytics stats and then request data frame analytics configs
+        GetDataFrameAnalyticsAction.Request getDfaRequest = new GetDataFrameAnalyticsAction.Request(Metadata.ALL);
+        getDfaRequest.setPageParams(new PageParams(0, 10_000));
         ActionListener<GetDataFrameAnalyticsStatsAction.Response> dataframeAnalyticsStatsListener = ActionListener.wrap(response -> {
             addDataFrameAnalyticsStatsUsage(response, analyticsUsage);
-            GetDataFrameAnalyticsAction.Request getDfaRequest = new GetDataFrameAnalyticsAction.Request(Metadata.ALL);
-            getDfaRequest.setPageParams(new PageParams(0, 10_000));
             client.execute(GetDataFrameAnalyticsAction.INSTANCE, getDfaRequest, dataframeAnalyticsListener);
-        }, listener::onFailure);
+        }, e -> {
+            logger.warn("Failed to get data frame analytics stats to include in ML usage", e);
+            client.execute(GetDataFrameAnalyticsAction.INSTANCE, getDfaRequest, dataframeAnalyticsListener);
+        });
 
         // Step 2. Extract usage from datafeeds stats and then request stats for data frame analytics
+        GetDataFrameAnalyticsStatsAction.Request dataframeAnalyticsStatsRequest = new GetDataFrameAnalyticsStatsAction.Request(
+            Metadata.ALL
+        );
+        dataframeAnalyticsStatsRequest.setPageParams(new PageParams(0, 10_000));
         ActionListener<GetDatafeedsStatsAction.Response> datafeedStatsListener = ActionListener.wrap(response -> {
             addDatafeedsUsage(response, datafeedsUsage);
-            GetDataFrameAnalyticsStatsAction.Request dataframeAnalyticsStatsRequest = new GetDataFrameAnalyticsStatsAction.Request(
-                Metadata.ALL
-            );
-            dataframeAnalyticsStatsRequest.setPageParams(new PageParams(0, 10_000));
             client.execute(GetDataFrameAnalyticsStatsAction.INSTANCE, dataframeAnalyticsStatsRequest, dataframeAnalyticsStatsListener);
-        }, listener::onFailure);
+        }, e -> {
+            logger.warn("Failed to get datafeed stats to include in ML usage", e);
+            client.execute(GetDataFrameAnalyticsStatsAction.INSTANCE, dataframeAnalyticsStatsRequest, dataframeAnalyticsStatsListener);
+        });
 
         // Step 1. Extract usage from jobs stats and then request stats for all datafeeds
-        GetJobsStatsAction.Request jobStatsRequest = new GetJobsStatsAction.Request(Metadata.ALL);
+        GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(Metadata.ALL);
         ActionListener<GetJobsStatsAction.Response> jobStatsListener = ActionListener.wrap(
             response -> jobManagerHolder.getJobManager().expandJobs(Metadata.ALL, true, ActionListener.wrap(jobs -> {
                 addJobsUsage(response, jobs.results(), jobsUsage);
-                GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(Metadata.ALL);
                 client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
-            }, listener::onFailure)),
-            listener::onFailure
+            }, e -> {
+                logger.warn("Failed to get job configs to include in ML usage", e);
+                client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
+            })),
+            e -> {
+                logger.warn("Failed to get job stats to include in ML usage", e);
+                client.execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest, datafeedStatsListener);
+            }
         );
 
         // Step 0. Kick off the chain of callbacks by requesting jobs stats
+        GetJobsStatsAction.Request jobStatsRequest = new GetJobsStatsAction.Request(Metadata.ALL);
         client.execute(GetJobsStatsAction.INSTANCE, jobStatsRequest, jobStatsListener);
     }
 
diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_get_stats.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_get_stats.yml
index 798288d72700c..6aab1fb9e894a 100644
--- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_get_stats.yml
+++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_get_stats.yml
@@ -397,3 +397,57 @@ setup:
   - is_false: jobs.1.timing_stats.maximum_bucket_processing_time_ms
   - is_false: jobs.1.timing_stats.average_bucket_processing_time_ms
   - is_false: jobs.1.timing_stats.exponential_average_bucket_processing_time_ms
+
+---
+"Test closed results index":
+
+  - skip:
+      features:
+        - "warnings"
+
+  - do:
+      warnings:
+        - 'Posting data directly to anomaly detection jobs is deprecated, in a future major version it will be compulsory to use a datafeed'
+      ml.post_data:
+        job_id: job-stats-test
+        body: >
+          {"airline":"AAL","responsetime":"132.2046","time":"1403481600"}
+          {"airline":"JZA","responsetime":"990.4628","time":"1403481600"}
+
+  - do:
+      ml.close_job:
+        job_id: jobs-get-stats-datafeed-job
+  - match: { closed: true }
+
+  - do:
+      ml.close_job:
+        job_id: job-stats-test
+  - match: { closed: true }
+
+  - do:
+      ml.get_job_stats: {}
+  - length: { jobs : 2 }
+
+  - do:
+      xpack.usage: {}
+  - match: { ml.available: true }
+  - match: { ml.enabled: true }
+  - match: { ml.jobs.closed.count: 2 }
+
+  - do:
+      indices.close:
+        index: .ml-anomalies-shared
+
+  # With the index closed the low-level ML API reports a problem
+  - do:
+      catch: /type=cluster_block_exception, reason=index \[.ml-anomalies-shared\] blocked by. \[FORBIDDEN\/.\/index closed\]/
+      ml.get_job_stats: {}
+
+  # But the high-level X-Pack API returns what it can - we do this
+  # so that corruption of ML state doesn't blind observers of the
+  # general cluster status
+  - do:
+      xpack.usage: {}
+  - match: { ml.available: true }
+  - match: { ml.enabled: true }
+  - is_false: ml.jobs.closed.count