From c8456773a19b7f7ca0bc80a0c094e07658916ba8 Mon Sep 17 00:00:00 2001
From: Ilia Kurenkov <ilia.kurenkov@datadoghq.com>
Date: Tue, 8 Aug 2023 14:05:09 +0200
Subject: [PATCH] Submit critical service check whenever connection fails
 (#15208)

* Submit critical service check whenever connection fails

* clean up variable

* style fixes

* remove outdated todo comment

* fix integration tests

* catch operationfailure as well

* Extend and document critical failures

* Update mongo/datadog_checks/mongo/mongo.py

Co-authored-by: Alex Lopez <alex.lopez.zorzano@gmail.com>

* Update mongo/datadog_checks/mongo/mongo.py

Co-authored-by: Florent Clarret <florent.clarret@datadoghq.com>

* add import

* misc changes

* More errors from docs, test all errors in unit test

* style fixes

* reraise exception so it shows up in agent logs

* adjust tests to exception-raising

* Update mongo/tests/test_integration.py

Co-authored-by: Alex Lopez <alex.lopez.zorzano@gmail.com>

* fix formatting

---------

Co-authored-by: Alex Lopez <alex.lopez.zorzano@gmail.com>
Co-authored-by: Florent Clarret <florent.clarret@datadoghq.com>
---
 mongo/datadog_checks/mongo/api.py             | 33 ++++++++--
 .../mongo/collectors/custom_queries.py        |  3 +
 .../mongo/collectors/index_stats.py           |  1 +
 .../mongo/collectors/session_stats.py         |  4 +-
 mongo/datadog_checks/mongo/mongo.py           | 62 ++++++++++---------
 mongo/tests/test_integration.py               |  6 +-
 mongo/tests/test_unit.py                      | 39 +++++++-----
 7 files changed, 94 insertions(+), 54 deletions(-)

diff --git a/mongo/datadog_checks/mongo/api.py b/mongo/datadog_checks/mongo/api.py
index a0036123e9cac..fd15555c9eae5 100644
--- a/mongo/datadog_checks/mongo/api.py
+++ b/mongo/datadog_checks/mongo/api.py
@@ -3,7 +3,13 @@
 # Licensed under a 3-clause BSD style license (see LICENSE)
 
 from pymongo import MongoClient, ReadPreference
-from pymongo.errors import ConnectionFailure
+from pymongo.errors import (
+    ConfigurationError,
+    ConnectionFailure,
+    OperationFailure,
+    ProtocolError,
+    ServerSelectionTimeoutError,
+)
 
 from datadog_checks.mongo.common import MongosDeployment, ReplicaSetDeployment, StandaloneDeployment
 
@@ -11,6 +17,18 @@
 # the server log upon establishing each connection. It is also recorded in the slow query log and profile collections.
 DD_APP_NAME = 'datadog-agent'
 
+# We collect here all pymongo exceptions that would result in a CRITICAL service check.
+CRITICAL_FAILURE = (
+    ConfigurationError,  # This occurs when TLS is misconfigured.
+    ConnectionFailure,  # This is a generic exception for any problems when connecting to mongodb.
+    OperationFailure,  # This occurs when authentication is incorrect.
+    # This means either no server is available or a replicaset has not elected a primary in the timeout window.
+    # In both cases it makes sense to submit a CRITICAL service check to Datadog.
+    ServerSelectionTimeoutError,
+    # Errors at the level of the protocol result in a lost/degraded connection. We can issue a CRITICAL check for this.
+    ProtocolError,
+)
+
 
 class MongoApi(object):
     """Mongodb connection through pymongo.MongoClient
@@ -77,7 +95,7 @@ def _get_rs_deployment_from_status_payload(repl_set_payload, cluster_role):
         replset_state = repl_set_payload["myState"]
         return ReplicaSetDeployment(replset_name, replset_state, cluster_role=cluster_role)
 
-    def get_deployment_type(self):
+    def refresh_deployment_type(self):
         # getCmdLineOpts is the runtime configuration of the mongo instance. Helpful to know whether the node is
         # a mongos or mongod, if the mongod is in a shard, if it's in a replica set, etc.
         try:
@@ -87,12 +105,14 @@ def get_deployment_type(self):
                 "Unable to run `getCmdLineOpts`, got: %s. Assuming this is an Alibaba ApsaraDB instance.", str(e)
             )
             # `getCmdLineOpts` is forbidden on Alibaba ApsaraDB
-            return self._get_alibaba_deployment_type()
+            self.deployment_type = self._get_alibaba_deployment_type()
+            return
         cluster_role = None
         if 'sharding' in options:
             if 'configDB' in options['sharding']:
                 self._log.debug("Detected MongosDeployment. Node is principal.")
-                return MongosDeployment()
+                self.deployment_type = MongosDeployment()
+                return
             elif 'clusterRole' in options['sharding']:
                 cluster_role = options['sharding']['clusterRole']
 
@@ -103,10 +123,11 @@ def get_deployment_type(self):
             is_principal = replica_set_deployment.is_principal()
             is_principal_log = "" if is_principal else "not "
             self._log.debug("Detected ReplicaSetDeployment. Node is %sprincipal.", is_principal_log)
-            return replica_set_deployment
+            self.deployment_type = replica_set_deployment
+            return
 
         self._log.debug("Detected StandaloneDeployment. Node is principal.")
-        return StandaloneDeployment()
+        self.deployment_type = StandaloneDeployment()
 
     def _get_alibaba_deployment_type(self):
         is_master_payload = self['admin'].command('isMaster')
diff --git a/mongo/datadog_checks/mongo/collectors/custom_queries.py b/mongo/datadog_checks/mongo/collectors/custom_queries.py
index cc9ec8b9ec6a0..5db504b7f7122 100644
--- a/mongo/datadog_checks/mongo/collectors/custom_queries.py
+++ b/mongo/datadog_checks/mongo/collectors/custom_queries.py
@@ -10,6 +10,7 @@
 import pymongo
 from dateutil.tz import tzutc
 
+from datadog_checks.mongo.api import CRITICAL_FAILURE
 from datadog_checks.mongo.collectors.base import MongoCollector
 from datadog_checks.mongo.common import (
     ALLOWED_CUSTOM_METRICS_TYPES,
@@ -202,6 +203,8 @@ def collect(self, api):
         for raw_query in self.custom_queries:
             try:
                 self._collect_custom_metrics_for_query(api, raw_query)
+            except CRITICAL_FAILURE as e:
+                raise e  # Critical failures must bubble up to trigger a CRITICAL service check.
             except Exception as e:
                 metric_prefix = raw_query.get('metric_prefix')
                 self.log.warning("Errors while collecting custom metrics with prefix %s", metric_prefix, exc_info=e)
diff --git a/mongo/datadog_checks/mongo/collectors/index_stats.py b/mongo/datadog_checks/mongo/collectors/index_stats.py
index 068022fd981a3..219c685d80ee5 100644
--- a/mongo/datadog_checks/mongo/collectors/index_stats.py
+++ b/mongo/datadog_checks/mongo/collectors/index_stats.py
@@ -30,3 +30,4 @@ def collect(self, api):
                     self.gauge('mongodb.collection.indexes.accesses.ops', val, idx_tags)
             except Exception as e:
                 self.log.error("Could not fetch indexes stats for collection %s: %s", coll_name, e)
+                raise e
diff --git a/mongo/datadog_checks/mongo/collectors/session_stats.py b/mongo/datadog_checks/mongo/collectors/session_stats.py
index 74a0145b34162..1858c738c976d 100644
--- a/mongo/datadog_checks/mongo/collectors/session_stats.py
+++ b/mongo/datadog_checks/mongo/collectors/session_stats.py
@@ -22,8 +22,8 @@ def collect(self, api):
             sessions_count = next(
                 config_db['system.sessions'].aggregate([{"$listSessions": {"allUsers": True}}, {"$count": "total"}])
             )['total']
-        except Exception:
+        except Exception as e:
             self.log.info('Unable to fetch system.session statistics.')
-            return
+            raise e
         metric_name = self._normalize("sessions.count", AgentCheck.gauge)
         self.check.gauge(metric_name, sessions_count, tags=self.base_tags)
diff --git a/mongo/datadog_checks/mongo/mongo.py b/mongo/datadog_checks/mongo/mongo.py
index 95c22c50ec97b..7ece04cebeed8 100644
--- a/mongo/datadog_checks/mongo/mongo.py
+++ b/mongo/datadog_checks/mongo/mongo.py
@@ -4,11 +4,12 @@
 from __future__ import division
 
 from copy import deepcopy
+from functools import cached_property
 
 from packaging.version import Version
 
 from datadog_checks.base import AgentCheck, is_affirmative
-from datadog_checks.mongo.api import MongoApi
+from datadog_checks.mongo.api import CRITICAL_FAILURE, MongoApi
 from datadog_checks.mongo.collectors import (
     CollStatsCollector,
     CustomQueriesCollector,
@@ -74,9 +75,10 @@ def __init__(self, name, init_config, instances=None):
         self._api_client = None
         self._mongo_version = None
 
-    @property
+    @cached_property
     def api_client(self):
-        return self._api_client
+        # This needs to be a property for our unit test mocks to work.
+        return MongoApi(self._config, self.log)
 
     def refresh_collectors(self, deployment_type, all_dbs, tags):
         collect_tcmalloc_metrics = 'tcmalloc' in self._config.additional_metrics
@@ -159,36 +161,33 @@ def _build_metric_list_to_collect(self):
         return metrics_to_collect
 
     def _refresh_replica_role(self):
-        if self._api_client and (
-            self._api_client.deployment_type is None
-            or isinstance(self._api_client.deployment_type, ReplicaSetDeployment)
-        ):
+        if self.api_client.deployment_type is None or isinstance(self.api_client.deployment_type, ReplicaSetDeployment):
             self.log.debug("Refreshing deployment type")
-            self._api_client.deployment_type = self._api_client.get_deployment_type()
+            self.api_client.refresh_deployment_type()
 
     def check(self, _):
-        if self._connect():
-            self._check()
+        try:
+            self._refresh_metadata()
+            self._collect_metrics()
+        except CRITICAL_FAILURE as e:
+            self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self._config.service_check_tags)
+            self._unset_metadata()
+            raise e  # Let exception bubble up to global handler and show full error in the logs.
+        else:
+            self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self._config.service_check_tags)
 
-    def _connect(self) -> bool:
-        if self._api_client is None:
-            try:
-                self._api_client = MongoApi(self._config, self.log)
-                self.log.debug("Connecting to '%s'", self._config.hosts)
-                self._api_client.connect()
-                self.log.debug("Connected!")
-                self._mongo_version = self.api_client.server_info().get('version', '0.0')
-                self.set_metadata('version', self._mongo_version)
-                self.log.debug('version: %s', self._mongo_version)
-            except Exception as e:
-                self._api_client = None
-                self.log.error('Exception: %s', e)
-                self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self._config.service_check_tags)
-                return False
-        self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self._config.service_check_tags)
-        return True
-
-    def _check(self):
+    def _refresh_metadata(self):
+        if self._mongo_version is None:
+            self.log.debug('No metadata present, refreshing it.')
+            self._mongo_version = self.api_client.server_info().get('version', '0.0')
+            self.set_metadata('version', self._mongo_version)
+            self.log.debug('version: %s', self._mongo_version)
+
+    def _unset_metadata(self):
+        self.log.debug('Due to connection failure we will need to reset the metadata.')
+        self._mongo_version = None
+
+    def _collect_metrics(self):
         self._refresh_replica_role()
         tags = deepcopy(self._config.metric_tags)
         deployment = self.api_client.deployment_type
@@ -209,6 +208,11 @@ def _check(self):
         for collector in self.collectors:
             try:
                 collector.collect(self.api_client)
+            except CRITICAL_FAILURE as e:
+                self.log.info(
+                    "Unable to collect logs from collector %s. Some metrics will be missing.", collector, exc_info=True
+                )
+                raise e  # Critical failures must bubble up to trigger a CRITICAL service check.
             except Exception:
                 self.log.info(
                     "Unable to collect logs from collector %s. Some metrics will be missing.", collector, exc_info=True
diff --git a/mongo/tests/test_integration.py b/mongo/tests/test_integration.py
index ba3564593e445..479121df48393 100644
--- a/mongo/tests/test_integration.py
+++ b/mongo/tests/test_integration.py
@@ -621,7 +621,8 @@ def test_mongod_bad_auth(check, dd_run_check, aggregator, username, password):
         'options': {'authSource': 'authDB'},
     }
     mongo_check = check(instance)
-    dd_run_check(mongo_check)
+    with pytest.raises(Exception, match="pymongo.errors.OperationFailure: Authentication failed"):
+        dd_run_check(mongo_check)
     aggregator.assert_service_check('mongodb.can_connect', status=MongoDb.CRITICAL)
 
 
@@ -651,5 +652,6 @@ def test_mongod_tls_fail(check, dd_run_check, aggregator):
         'tls_ca_file': '{}/ca.pem'.format(TLS_CERTS_FOLDER),
     }
     mongo_check = check(instance)
-    dd_run_check(mongo_check)
+    with pytest.raises(Exception, match=("pymongo.errors.ConfigurationError: Private key doesn't match certificate")):
+        dd_run_check(mongo_check)
     aggregator.assert_service_check('mongodb.can_connect', status=MongoDb.CRITICAL)
diff --git a/mongo/tests/test_unit.py b/mongo/tests/test_unit.py
index 566c039d3bb80..8241ebee974c5 100644
--- a/mongo/tests/test_unit.py
+++ b/mongo/tests/test_unit.py
@@ -12,7 +12,7 @@
 
 from datadog_checks.base import ConfigurationError
 from datadog_checks.mongo import MongoDb, metrics
-from datadog_checks.mongo.api import MongoApi
+from datadog_checks.mongo.api import CRITICAL_FAILURE, MongoApi
 from datadog_checks.mongo.collectors import MongoCollector
 from datadog_checks.mongo.common import MongosDeployment, ReplicaSetDeployment, get_state_name
 from datadog_checks.mongo.config import MongoConfig
@@ -46,12 +46,13 @@ def test_emits_critical_service_check_when_service_is_not_available(mock_command
     # Given
     check = MongoDb('mongo', {}, [{'hosts': ['localhost']}])
     # When
-    dd_run_check(check)
+    with pytest.raises(Exception, match="pymongo.errors.ConnectionFailure: Service not available"):
+        dd_run_check(check)
     # Then
     aggregator.assert_service_check('mongodb.can_connect', MongoDb.CRITICAL)
 
 
-@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}])
+@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}])
 @mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
 @mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
 def test_emits_ok_service_check_when_service_is_available(
@@ -66,7 +67,7 @@ def test_emits_ok_service_check_when_service_is_available(
     aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
 
 
-@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}])
+@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}])
 @mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
 @mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
 def test_emits_ok_service_check_each_run_when_service_is_available(
@@ -82,7 +83,7 @@ def test_emits_ok_service_check_each_run_when_service_is_available(
     aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK, count=2)
 
 
-@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}])
+@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}])
 @mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
 @mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
 def test_version_metadata(
@@ -109,7 +110,7 @@ def test_version_metadata(
 
 @mock.patch(
     'pymongo.database.Database.command',
-    side_effect=[{'ok': 1}, Exception('getCmdLineOpts exception'), {'msg': 'isdbgrid'}],
+    side_effect=[Exception('getCmdLineOpts exception'), {'msg': 'isdbgrid'}],
 )
 @mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
 @mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
@@ -123,7 +124,7 @@ def test_emits_ok_service_check_when_alibaba_mongos_deployment(
     dd_run_check(check)
     # Then
     aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
-    mock_command.assert_has_calls([mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster')])
+    mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster')])
     mock_server_info.assert_called_once()
     mock_list_database_names.assert_called_once()
 
@@ -131,7 +132,6 @@ def test_emits_ok_service_check_when_alibaba_mongos_deployment(
 @mock.patch(
     'pymongo.database.Database.command',
     side_effect=[
-        {'ok': 1},
         Exception('getCmdLineOpts exception'),
         {},
         {'configsvr': True, 'set': 'replset', "myState": 1},
@@ -149,9 +149,7 @@ def test_emits_ok_service_check_when_alibaba_replicaset_role_configsvr_deploymen
     dd_run_check(check)
     # Then
     aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
-    mock_command.assert_has_calls(
-        [mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')]
-    )
+    mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')])
     mock_server_info.assert_called_once()
     mock_list_database_names.assert_called_once()
 
@@ -159,7 +157,6 @@ def test_emits_ok_service_check_when_alibaba_replicaset_role_configsvr_deploymen
 @mock.patch(
     'pymongo.database.Database.command',
     side_effect=[
-        {'ok': 1},
         Exception('getCmdLineOpts exception'),
         {},
         {'configsvr': True, 'set': 'replset', "myState": 3},
@@ -177,9 +174,7 @@ def test_when_replicaset_state_recovering_then_database_names_not_called(
     dd_run_check(check)
     # Then
     aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
-    mock_command.assert_has_calls(
-        [mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')]
-    )
+    mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')])
     mock_server_info.assert_called_once()
     mock_list_database_names.assert_not_called()
 
@@ -601,3 +596,17 @@ def test_when_version_lower_than_3_6_then_no_session_metrics_reported(aggregator
         dd_run_check(check)
     # Then
     aggregator.assert_metric('mongodb.sessions.count', count=0)
+
+
+@pytest.mark.parametrize("error_cls", CRITICAL_FAILURE)
+def test_service_check_critical_when_connection_dies(error_cls, aggregator, check, instance, dd_run_check):
+    check = check(instance)
+    with mock_pymongo('standalone') as mocked_client:
+        dd_run_check(check)
+        aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
+        aggregator.reset()
+        msg = "Testing"
+        mocked_client.list_database_names = mock.MagicMock(side_effect=error_cls(msg))
+        with pytest.raises(Exception, match=f"{error_cls.__name__}: {msg}"):
+            dd_run_check(check)
+        aggregator.assert_service_check('mongodb.can_connect', MongoDb.CRITICAL)