From c8456773a19b7f7ca0bc80a0c094e07658916ba8 Mon Sep 17 00:00:00 2001 From: Ilia Kurenkov Date: Tue, 8 Aug 2023 14:05:09 +0200 Subject: [PATCH] Submit critical service check whenever connection fails (#15208) * Submit critical service check whenever connection fails * clean up variable * style fixes * remove outdated todo comment * fix integration tests * catch operationfailure as well * Extend and document critical failures * Update mongo/datadog_checks/mongo/mongo.py Co-authored-by: Alex Lopez * Update mongo/datadog_checks/mongo/mongo.py Co-authored-by: Florent Clarret * add import * misc changes * More errors from docs, test all errors in unit test * style fixes * reraise exception so it shows up in agent logs * adjust tests to exception-raising * Update mongo/tests/test_integration.py Co-authored-by: Alex Lopez * fix formatting --------- Co-authored-by: Alex Lopez Co-authored-by: Florent Clarret --- mongo/datadog_checks/mongo/api.py | 33 ++++++++-- .../mongo/collectors/custom_queries.py | 3 + .../mongo/collectors/index_stats.py | 1 + .../mongo/collectors/session_stats.py | 4 +- mongo/datadog_checks/mongo/mongo.py | 62 ++++++++++--------- mongo/tests/test_integration.py | 6 +- mongo/tests/test_unit.py | 39 +++++++----- 7 files changed, 94 insertions(+), 54 deletions(-) diff --git a/mongo/datadog_checks/mongo/api.py b/mongo/datadog_checks/mongo/api.py index a0036123e9cac..fd15555c9eae5 100644 --- a/mongo/datadog_checks/mongo/api.py +++ b/mongo/datadog_checks/mongo/api.py @@ -3,7 +3,13 @@ # Licensed under a 3-clause BSD style license (see LICENSE) from pymongo import MongoClient, ReadPreference -from pymongo.errors import ConnectionFailure +from pymongo.errors import ( + ConfigurationError, + ConnectionFailure, + OperationFailure, + ProtocolError, + ServerSelectionTimeoutError, +) from datadog_checks.mongo.common import MongosDeployment, ReplicaSetDeployment, StandaloneDeployment @@ -11,6 +17,18 @@ # the server log upon establishing each connection. It is also recorded in the slow query log and profile collections. DD_APP_NAME = 'datadog-agent' +# We collect here all pymongo exceptions that would result in a CRITICAL service check. +CRITICAL_FAILURE = ( + ConfigurationError, # This occurs when TLS is misconfigured. + ConnectionFailure, # This is a generic exception for any problems when connecting to mongodb. + OperationFailure, # This occurs when authentication is incorrect. + # This means either no server is available or a replicaset has not elected a primary in the timeout window. + # In both cases it makes sense to submit a CRITICAL service check to Datadog. + ServerSelectionTimeoutError, + # Errors at the level of the protocol result in a lost/degraded connection. We can issue a CRITICAL check for this. + ProtocolError, +) + class MongoApi(object): """Mongodb connection through pymongo.MongoClient @@ -77,7 +95,7 @@ def _get_rs_deployment_from_status_payload(repl_set_payload, cluster_role): replset_state = repl_set_payload["myState"] return ReplicaSetDeployment(replset_name, replset_state, cluster_role=cluster_role) - def get_deployment_type(self): + def refresh_deployment_type(self): # getCmdLineOpts is the runtime configuration of the mongo instance. Helpful to know whether the node is # a mongos or mongod, if the mongod is in a shard, if it's in a replica set, etc. try: @@ -87,12 +105,14 @@ def get_deployment_type(self): "Unable to run `getCmdLineOpts`, got: %s. Assuming this is an Alibaba ApsaraDB instance.", str(e) ) # `getCmdLineOpts` is forbidden on Alibaba ApsaraDB - return self._get_alibaba_deployment_type() + self.deployment_type = self._get_alibaba_deployment_type() + return cluster_role = None if 'sharding' in options: if 'configDB' in options['sharding']: self._log.debug("Detected MongosDeployment. Node is principal.") - return MongosDeployment() + self.deployment_type = MongosDeployment() + return elif 'clusterRole' in options['sharding']: cluster_role = options['sharding']['clusterRole'] @@ -103,10 +123,11 @@ def get_deployment_type(self): is_principal = replica_set_deployment.is_principal() is_principal_log = "" if is_principal else "not " self._log.debug("Detected ReplicaSetDeployment. Node is %sprincipal.", is_principal_log) - return replica_set_deployment + self.deployment_type = replica_set_deployment + return self._log.debug("Detected StandaloneDeployment. Node is principal.") - return StandaloneDeployment() + self.deployment_type = StandaloneDeployment() def _get_alibaba_deployment_type(self): is_master_payload = self['admin'].command('isMaster') diff --git a/mongo/datadog_checks/mongo/collectors/custom_queries.py b/mongo/datadog_checks/mongo/collectors/custom_queries.py index cc9ec8b9ec6a0..5db504b7f7122 100644 --- a/mongo/datadog_checks/mongo/collectors/custom_queries.py +++ b/mongo/datadog_checks/mongo/collectors/custom_queries.py @@ -10,6 +10,7 @@ import pymongo from dateutil.tz import tzutc +from datadog_checks.mongo.api import CRITICAL_FAILURE from datadog_checks.mongo.collectors.base import MongoCollector from datadog_checks.mongo.common import ( ALLOWED_CUSTOM_METRICS_TYPES, @@ -202,6 +203,8 @@ def collect(self, api): for raw_query in self.custom_queries: try: self._collect_custom_metrics_for_query(api, raw_query) + except CRITICAL_FAILURE as e: + raise e # Critical failures must bubble up to trigger a CRITICAL service check. except Exception as e: metric_prefix = raw_query.get('metric_prefix') self.log.warning("Errors while collecting custom metrics with prefix %s", metric_prefix, exc_info=e) diff --git a/mongo/datadog_checks/mongo/collectors/index_stats.py b/mongo/datadog_checks/mongo/collectors/index_stats.py index 068022fd981a3..219c685d80ee5 100644 --- a/mongo/datadog_checks/mongo/collectors/index_stats.py +++ b/mongo/datadog_checks/mongo/collectors/index_stats.py @@ -30,3 +30,4 @@ def collect(self, api): self.gauge('mongodb.collection.indexes.accesses.ops', val, idx_tags) except Exception as e: self.log.error("Could not fetch indexes stats for collection %s: %s", coll_name, e) + raise e diff --git a/mongo/datadog_checks/mongo/collectors/session_stats.py b/mongo/datadog_checks/mongo/collectors/session_stats.py index 74a0145b34162..1858c738c976d 100644 --- a/mongo/datadog_checks/mongo/collectors/session_stats.py +++ b/mongo/datadog_checks/mongo/collectors/session_stats.py @@ -22,8 +22,8 @@ def collect(self, api): sessions_count = next( config_db['system.sessions'].aggregate([{"$listSessions": {"allUsers": True}}, {"$count": "total"}]) )['total'] - except Exception: + except Exception as e: self.log.info('Unable to fetch system.session statistics.') - return + raise e metric_name = self._normalize("sessions.count", AgentCheck.gauge) self.check.gauge(metric_name, sessions_count, tags=self.base_tags) diff --git a/mongo/datadog_checks/mongo/mongo.py b/mongo/datadog_checks/mongo/mongo.py index 95c22c50ec97b..7ece04cebeed8 100644 --- a/mongo/datadog_checks/mongo/mongo.py +++ b/mongo/datadog_checks/mongo/mongo.py @@ -4,11 +4,12 @@ from __future__ import division from copy import deepcopy +from functools import cached_property from packaging.version import Version from datadog_checks.base import AgentCheck, is_affirmative -from datadog_checks.mongo.api import MongoApi +from datadog_checks.mongo.api import CRITICAL_FAILURE, MongoApi from datadog_checks.mongo.collectors import ( CollStatsCollector, CustomQueriesCollector, @@ -74,9 +75,10 @@ def __init__(self, name, init_config, instances=None): self._api_client = None self._mongo_version = None - @property + @cached_property def api_client(self): - return self._api_client + # This needs to be a property for our unit test mocks to work. + return MongoApi(self._config, self.log) def refresh_collectors(self, deployment_type, all_dbs, tags): collect_tcmalloc_metrics = 'tcmalloc' in self._config.additional_metrics @@ -159,36 +161,33 @@ def _build_metric_list_to_collect(self): return metrics_to_collect def _refresh_replica_role(self): - if self._api_client and ( - self._api_client.deployment_type is None - or isinstance(self._api_client.deployment_type, ReplicaSetDeployment) - ): + if self.api_client.deployment_type is None or isinstance(self.api_client.deployment_type, ReplicaSetDeployment): self.log.debug("Refreshing deployment type") - self._api_client.deployment_type = self._api_client.get_deployment_type() + self.api_client.refresh_deployment_type() def check(self, _): - if self._connect(): - self._check() + try: + self._refresh_metadata() + self._collect_metrics() + except CRITICAL_FAILURE as e: + self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self._config.service_check_tags) + self._unset_metadata() + raise e # Let exception bubble up to global handler and show full error in the logs. + else: + self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self._config.service_check_tags) - def _connect(self) -> bool: - if self._api_client is None: - try: - self._api_client = MongoApi(self._config, self.log) - self.log.debug("Connecting to '%s'", self._config.hosts) - self._api_client.connect() - self.log.debug("Connected!") - self._mongo_version = self.api_client.server_info().get('version', '0.0') - self.set_metadata('version', self._mongo_version) - self.log.debug('version: %s', self._mongo_version) - except Exception as e: - self._api_client = None - self.log.error('Exception: %s', e) - self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self._config.service_check_tags) - return False - self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self._config.service_check_tags) - return True - - def _check(self): + def _refresh_metadata(self): + if self._mongo_version is None: + self.log.debug('No metadata present, refreshing it.') + self._mongo_version = self.api_client.server_info().get('version', '0.0') + self.set_metadata('version', self._mongo_version) + self.log.debug('version: %s', self._mongo_version) + + def _unset_metadata(self): + self.log.debug('Due to connection failure we will need to reset the metadata.') + self._mongo_version = None + + def _collect_metrics(self): self._refresh_replica_role() tags = deepcopy(self._config.metric_tags) deployment = self.api_client.deployment_type @@ -209,6 +208,11 @@ def _check(self): for collector in self.collectors: try: collector.collect(self.api_client) + except CRITICAL_FAILURE as e: + self.log.info( + "Unable to collect logs from collector %s. Some metrics will be missing.", collector, exc_info=True + ) + raise e # Critical failures must bubble up to trigger a CRITICAL service check. except Exception: self.log.info( "Unable to collect logs from collector %s. Some metrics will be missing.", collector, exc_info=True diff --git a/mongo/tests/test_integration.py b/mongo/tests/test_integration.py index ba3564593e445..479121df48393 100644 --- a/mongo/tests/test_integration.py +++ b/mongo/tests/test_integration.py @@ -621,7 +621,8 @@ def test_mongod_bad_auth(check, dd_run_check, aggregator, username, password): 'options': {'authSource': 'authDB'}, } mongo_check = check(instance) - dd_run_check(mongo_check) + with pytest.raises(Exception, match="pymongo.errors.OperationFailure: Authentication failed"): + dd_run_check(mongo_check) aggregator.assert_service_check('mongodb.can_connect', status=MongoDb.CRITICAL) @@ -651,5 +652,6 @@ def test_mongod_tls_fail(check, dd_run_check, aggregator): 'tls_ca_file': '{}/ca.pem'.format(TLS_CERTS_FOLDER), } mongo_check = check(instance) - dd_run_check(mongo_check) + with pytest.raises(Exception, match=("pymongo.errors.ConfigurationError: Private key doesn't match certificate")): + dd_run_check(mongo_check) aggregator.assert_service_check('mongodb.can_connect', status=MongoDb.CRITICAL) diff --git a/mongo/tests/test_unit.py b/mongo/tests/test_unit.py index 566c039d3bb80..8241ebee974c5 100644 --- a/mongo/tests/test_unit.py +++ b/mongo/tests/test_unit.py @@ -12,7 +12,7 @@ from datadog_checks.base import ConfigurationError from datadog_checks.mongo import MongoDb, metrics -from datadog_checks.mongo.api import MongoApi +from datadog_checks.mongo.api import CRITICAL_FAILURE, MongoApi from datadog_checks.mongo.collectors import MongoCollector from datadog_checks.mongo.common import MongosDeployment, ReplicaSetDeployment, get_state_name from datadog_checks.mongo.config import MongoConfig @@ -46,12 +46,13 @@ def test_emits_critical_service_check_when_service_is_not_available(mock_command # Given check = MongoDb('mongo', {}, [{'hosts': ['localhost']}]) # When - dd_run_check(check) + with pytest.raises(Exception, match="pymongo.errors.ConnectionFailure: Service not available"): + dd_run_check(check) # Then aggregator.assert_service_check('mongodb.can_connect', MongoDb.CRITICAL) -@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}]) +@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}]) @mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'}) @mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[]) def test_emits_ok_service_check_when_service_is_available( @@ -66,7 +67,7 @@ def test_emits_ok_service_check_when_service_is_available( aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK) -@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}]) +@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}]) @mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'}) @mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[]) def test_emits_ok_service_check_each_run_when_service_is_available( @@ -82,7 +83,7 @@ def test_emits_ok_service_check_each_run_when_service_is_available( aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK, count=2) -@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}]) +@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}]) @mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'}) @mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[]) def test_version_metadata( @@ -109,7 +110,7 @@ def test_version_metadata( @mock.patch( 'pymongo.database.Database.command', - side_effect=[{'ok': 1}, Exception('getCmdLineOpts exception'), {'msg': 'isdbgrid'}], + side_effect=[Exception('getCmdLineOpts exception'), {'msg': 'isdbgrid'}], ) @mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'}) @mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[]) @@ -123,7 +124,7 @@ def test_emits_ok_service_check_when_alibaba_mongos_deployment( dd_run_check(check) # Then aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK) - mock_command.assert_has_calls([mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster')]) + mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster')]) mock_server_info.assert_called_once() mock_list_database_names.assert_called_once() @@ -131,7 +132,6 @@ def test_emits_ok_service_check_when_alibaba_mongos_deployment( @mock.patch( 'pymongo.database.Database.command', side_effect=[ - {'ok': 1}, Exception('getCmdLineOpts exception'), {}, {'configsvr': True, 'set': 'replset', "myState": 1}, @@ -149,9 +149,7 @@ def test_emits_ok_service_check_when_alibaba_replicaset_role_configsvr_deploymen dd_run_check(check) # Then aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK) - mock_command.assert_has_calls( - [mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')] - ) + mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')]) mock_server_info.assert_called_once() mock_list_database_names.assert_called_once() @@ -159,7 +157,6 @@ def test_emits_ok_service_check_when_alibaba_replicaset_role_configsvr_deploymen @mock.patch( 'pymongo.database.Database.command', side_effect=[ - {'ok': 1}, Exception('getCmdLineOpts exception'), {}, {'configsvr': True, 'set': 'replset', "myState": 3}, @@ -177,9 +174,7 @@ def test_when_replicaset_state_recovering_then_database_names_not_called( dd_run_check(check) # Then aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK) - mock_command.assert_has_calls( - [mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')] - ) + mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')]) mock_server_info.assert_called_once() mock_list_database_names.assert_not_called() @@ -601,3 +596,17 @@ def test_when_version_lower_than_3_6_then_no_session_metrics_reported(aggregator dd_run_check(check) # Then aggregator.assert_metric('mongodb.sessions.count', count=0) + + +@pytest.mark.parametrize("error_cls", CRITICAL_FAILURE) +def test_service_check_critical_when_connection_dies(error_cls, aggregator, check, instance, dd_run_check): + check = check(instance) + with mock_pymongo('standalone') as mocked_client: + dd_run_check(check) + aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK) + aggregator.reset() + msg = "Testing" + mocked_client.list_database_names = mock.MagicMock(side_effect=error_cls(msg)) + with pytest.raises(Exception, match=f"{error_cls.__name__}: {msg}"): + dd_run_check(check) + aggregator.assert_service_check('mongodb.can_connect', MongoDb.CRITICAL)