Skip to content

Commit

Permalink
Submit critical service check whenever connection fails (#15208)
Browse files Browse the repository at this point in the history
* Submit critical service check whenever connection fails

* clean up variable

* style fixes

* remove outdated todo comment

* fix integration tests

* catch operationfailure as well

* Extend and document critical failures

* Update mongo/datadog_checks/mongo/mongo.py

Co-authored-by: Alex Lopez <[email protected]>

* Update mongo/datadog_checks/mongo/mongo.py

Co-authored-by: Florent Clarret <[email protected]>

* add import

* misc changes

* More errors from docs, test all errors in unit test

* style fixes

* reraise exception so it shows up in agent logs

* adjust tests to exception-raising

* Update mongo/tests/test_integration.py

Co-authored-by: Alex Lopez <[email protected]>

* fix formatting

---------

Co-authored-by: Alex Lopez <[email protected]>
Co-authored-by: Florent Clarret <[email protected]>
  • Loading branch information
3 people authored Aug 8, 2023
1 parent 1574b9a commit c845677
Show file tree
Hide file tree
Showing 7 changed files with 94 additions and 54 deletions.
33 changes: 27 additions & 6 deletions mongo/datadog_checks/mongo/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,32 @@
# Licensed under a 3-clause BSD style license (see LICENSE)

from pymongo import MongoClient, ReadPreference
from pymongo.errors import ConnectionFailure
from pymongo.errors import (
ConfigurationError,
ConnectionFailure,
OperationFailure,
ProtocolError,
ServerSelectionTimeoutError,
)

from datadog_checks.mongo.common import MongosDeployment, ReplicaSetDeployment, StandaloneDeployment

# The name of the application that created this MongoClient instance. MongoDB 3.4 and newer will print this value in
# the server log upon establishing each connection. It is also recorded in the slow query log and profile collections.
DD_APP_NAME = 'datadog-agent'

# We collect here all pymongo exceptions that would result in a CRITICAL service check.
CRITICAL_FAILURE = (
ConfigurationError, # This occurs when TLS is misconfigured.
ConnectionFailure, # This is a generic exception for any problems when connecting to mongodb.
OperationFailure, # This occurs when authentication is incorrect.
# This means either no server is available or a replicaset has not elected a primary in the timeout window.
# In both cases it makes sense to submit a CRITICAL service check to Datadog.
ServerSelectionTimeoutError,
# Errors at the level of the protocol result in a lost/degraded connection. We can issue a CRITICAL check for this.
ProtocolError,
)


class MongoApi(object):
"""Mongodb connection through pymongo.MongoClient
Expand Down Expand Up @@ -77,7 +95,7 @@ def _get_rs_deployment_from_status_payload(repl_set_payload, cluster_role):
replset_state = repl_set_payload["myState"]
return ReplicaSetDeployment(replset_name, replset_state, cluster_role=cluster_role)

def get_deployment_type(self):
def refresh_deployment_type(self):
# getCmdLineOpts is the runtime configuration of the mongo instance. Helpful to know whether the node is
# a mongos or mongod, if the mongod is in a shard, if it's in a replica set, etc.
try:
Expand All @@ -87,12 +105,14 @@ def get_deployment_type(self):
"Unable to run `getCmdLineOpts`, got: %s. Assuming this is an Alibaba ApsaraDB instance.", str(e)
)
# `getCmdLineOpts` is forbidden on Alibaba ApsaraDB
return self._get_alibaba_deployment_type()
self.deployment_type = self._get_alibaba_deployment_type()
return
cluster_role = None
if 'sharding' in options:
if 'configDB' in options['sharding']:
self._log.debug("Detected MongosDeployment. Node is principal.")
return MongosDeployment()
self.deployment_type = MongosDeployment()
return
elif 'clusterRole' in options['sharding']:
cluster_role = options['sharding']['clusterRole']

Expand All @@ -103,10 +123,11 @@ def get_deployment_type(self):
is_principal = replica_set_deployment.is_principal()
is_principal_log = "" if is_principal else "not "
self._log.debug("Detected ReplicaSetDeployment. Node is %sprincipal.", is_principal_log)
return replica_set_deployment
self.deployment_type = replica_set_deployment
return

self._log.debug("Detected StandaloneDeployment. Node is principal.")
return StandaloneDeployment()
self.deployment_type = StandaloneDeployment()

def _get_alibaba_deployment_type(self):
is_master_payload = self['admin'].command('isMaster')
Expand Down
3 changes: 3 additions & 0 deletions mongo/datadog_checks/mongo/collectors/custom_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pymongo
from dateutil.tz import tzutc

from datadog_checks.mongo.api import CRITICAL_FAILURE
from datadog_checks.mongo.collectors.base import MongoCollector
from datadog_checks.mongo.common import (
ALLOWED_CUSTOM_METRICS_TYPES,
Expand Down Expand Up @@ -202,6 +203,8 @@ def collect(self, api):
for raw_query in self.custom_queries:
try:
self._collect_custom_metrics_for_query(api, raw_query)
except CRITICAL_FAILURE as e:
raise e # Critical failures must bubble up to trigger a CRITICAL service check.
except Exception as e:
metric_prefix = raw_query.get('metric_prefix')
self.log.warning("Errors while collecting custom metrics with prefix %s", metric_prefix, exc_info=e)
1 change: 1 addition & 0 deletions mongo/datadog_checks/mongo/collectors/index_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ def collect(self, api):
self.gauge('mongodb.collection.indexes.accesses.ops', val, idx_tags)
except Exception as e:
self.log.error("Could not fetch indexes stats for collection %s: %s", coll_name, e)
raise e
4 changes: 2 additions & 2 deletions mongo/datadog_checks/mongo/collectors/session_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def collect(self, api):
sessions_count = next(
config_db['system.sessions'].aggregate([{"$listSessions": {"allUsers": True}}, {"$count": "total"}])
)['total']
except Exception:
except Exception as e:
self.log.info('Unable to fetch system.session statistics.')
return
raise e
metric_name = self._normalize("sessions.count", AgentCheck.gauge)
self.check.gauge(metric_name, sessions_count, tags=self.base_tags)
62 changes: 33 additions & 29 deletions mongo/datadog_checks/mongo/mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
from __future__ import division

from copy import deepcopy
from functools import cached_property

from packaging.version import Version

from datadog_checks.base import AgentCheck, is_affirmative
from datadog_checks.mongo.api import MongoApi
from datadog_checks.mongo.api import CRITICAL_FAILURE, MongoApi
from datadog_checks.mongo.collectors import (
CollStatsCollector,
CustomQueriesCollector,
Expand Down Expand Up @@ -74,9 +75,10 @@ def __init__(self, name, init_config, instances=None):
self._api_client = None
self._mongo_version = None

@property
@cached_property
def api_client(self):
return self._api_client
# This needs to be a property for our unit test mocks to work.
return MongoApi(self._config, self.log)

def refresh_collectors(self, deployment_type, all_dbs, tags):
collect_tcmalloc_metrics = 'tcmalloc' in self._config.additional_metrics
Expand Down Expand Up @@ -159,36 +161,33 @@ def _build_metric_list_to_collect(self):
return metrics_to_collect

def _refresh_replica_role(self):
if self._api_client and (
self._api_client.deployment_type is None
or isinstance(self._api_client.deployment_type, ReplicaSetDeployment)
):
if self.api_client.deployment_type is None or isinstance(self.api_client.deployment_type, ReplicaSetDeployment):
self.log.debug("Refreshing deployment type")
self._api_client.deployment_type = self._api_client.get_deployment_type()
self.api_client.refresh_deployment_type()

def check(self, _):
if self._connect():
self._check()
try:
self._refresh_metadata()
self._collect_metrics()
except CRITICAL_FAILURE as e:
self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self._config.service_check_tags)
self._unset_metadata()
raise e # Let exception bubble up to global handler and show full error in the logs.
else:
self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self._config.service_check_tags)

def _connect(self) -> bool:
if self._api_client is None:
try:
self._api_client = MongoApi(self._config, self.log)
self.log.debug("Connecting to '%s'", self._config.hosts)
self._api_client.connect()
self.log.debug("Connected!")
self._mongo_version = self.api_client.server_info().get('version', '0.0')
self.set_metadata('version', self._mongo_version)
self.log.debug('version: %s', self._mongo_version)
except Exception as e:
self._api_client = None
self.log.error('Exception: %s', e)
self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self._config.service_check_tags)
return False
self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self._config.service_check_tags)
return True

def _check(self):
def _refresh_metadata(self):
if self._mongo_version is None:
self.log.debug('No metadata present, refreshing it.')
self._mongo_version = self.api_client.server_info().get('version', '0.0')
self.set_metadata('version', self._mongo_version)
self.log.debug('version: %s', self._mongo_version)

def _unset_metadata(self):
self.log.debug('Due to connection failure we will need to reset the metadata.')
self._mongo_version = None

def _collect_metrics(self):
self._refresh_replica_role()
tags = deepcopy(self._config.metric_tags)
deployment = self.api_client.deployment_type
Expand All @@ -209,6 +208,11 @@ def _check(self):
for collector in self.collectors:
try:
collector.collect(self.api_client)
except CRITICAL_FAILURE as e:
self.log.info(
"Unable to collect logs from collector %s. Some metrics will be missing.", collector, exc_info=True
)
raise e # Critical failures must bubble up to trigger a CRITICAL service check.
except Exception:
self.log.info(
"Unable to collect logs from collector %s. Some metrics will be missing.", collector, exc_info=True
Expand Down
6 changes: 4 additions & 2 deletions mongo/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,8 @@ def test_mongod_bad_auth(check, dd_run_check, aggregator, username, password):
'options': {'authSource': 'authDB'},
}
mongo_check = check(instance)
dd_run_check(mongo_check)
with pytest.raises(Exception, match="pymongo.errors.OperationFailure: Authentication failed"):
dd_run_check(mongo_check)
aggregator.assert_service_check('mongodb.can_connect', status=MongoDb.CRITICAL)


Expand Down Expand Up @@ -651,5 +652,6 @@ def test_mongod_tls_fail(check, dd_run_check, aggregator):
'tls_ca_file': '{}/ca.pem'.format(TLS_CERTS_FOLDER),
}
mongo_check = check(instance)
dd_run_check(mongo_check)
with pytest.raises(Exception, match=("pymongo.errors.ConfigurationError: Private key doesn't match certificate")):
dd_run_check(mongo_check)
aggregator.assert_service_check('mongodb.can_connect', status=MongoDb.CRITICAL)
39 changes: 24 additions & 15 deletions mongo/tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from datadog_checks.base import ConfigurationError
from datadog_checks.mongo import MongoDb, metrics
from datadog_checks.mongo.api import MongoApi
from datadog_checks.mongo.api import CRITICAL_FAILURE, MongoApi
from datadog_checks.mongo.collectors import MongoCollector
from datadog_checks.mongo.common import MongosDeployment, ReplicaSetDeployment, get_state_name
from datadog_checks.mongo.config import MongoConfig
Expand Down Expand Up @@ -46,12 +46,13 @@ def test_emits_critical_service_check_when_service_is_not_available(mock_command
# Given
check = MongoDb('mongo', {}, [{'hosts': ['localhost']}])
# When
dd_run_check(check)
with pytest.raises(Exception, match="pymongo.errors.ConnectionFailure: Service not available"):
dd_run_check(check)
# Then
aggregator.assert_service_check('mongodb.can_connect', MongoDb.CRITICAL)


@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}])
@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}])
@mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
@mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
def test_emits_ok_service_check_when_service_is_available(
Expand All @@ -66,7 +67,7 @@ def test_emits_ok_service_check_when_service_is_available(
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)


@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}])
@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}])
@mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
@mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
def test_emits_ok_service_check_each_run_when_service_is_available(
Expand All @@ -82,7 +83,7 @@ def test_emits_ok_service_check_each_run_when_service_is_available(
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK, count=2)


@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}])
@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}])
@mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
@mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
def test_version_metadata(
Expand All @@ -109,7 +110,7 @@ def test_version_metadata(

@mock.patch(
'pymongo.database.Database.command',
side_effect=[{'ok': 1}, Exception('getCmdLineOpts exception'), {'msg': 'isdbgrid'}],
side_effect=[Exception('getCmdLineOpts exception'), {'msg': 'isdbgrid'}],
)
@mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
@mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
Expand All @@ -123,15 +124,14 @@ def test_emits_ok_service_check_when_alibaba_mongos_deployment(
dd_run_check(check)
# Then
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
mock_command.assert_has_calls([mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster')])
mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster')])
mock_server_info.assert_called_once()
mock_list_database_names.assert_called_once()


@mock.patch(
'pymongo.database.Database.command',
side_effect=[
{'ok': 1},
Exception('getCmdLineOpts exception'),
{},
{'configsvr': True, 'set': 'replset', "myState": 1},
Expand All @@ -149,17 +149,14 @@ def test_emits_ok_service_check_when_alibaba_replicaset_role_configsvr_deploymen
dd_run_check(check)
# Then
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
mock_command.assert_has_calls(
[mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')]
)
mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')])
mock_server_info.assert_called_once()
mock_list_database_names.assert_called_once()


@mock.patch(
'pymongo.database.Database.command',
side_effect=[
{'ok': 1},
Exception('getCmdLineOpts exception'),
{},
{'configsvr': True, 'set': 'replset', "myState": 3},
Expand All @@ -177,9 +174,7 @@ def test_when_replicaset_state_recovering_then_database_names_not_called(
dd_run_check(check)
# Then
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
mock_command.assert_has_calls(
[mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')]
)
mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')])
mock_server_info.assert_called_once()
mock_list_database_names.assert_not_called()

Expand Down Expand Up @@ -601,3 +596,17 @@ def test_when_version_lower_than_3_6_then_no_session_metrics_reported(aggregator
dd_run_check(check)
# Then
aggregator.assert_metric('mongodb.sessions.count', count=0)


@pytest.mark.parametrize("error_cls", CRITICAL_FAILURE)
def test_service_check_critical_when_connection_dies(error_cls, aggregator, check, instance, dd_run_check):
check = check(instance)
with mock_pymongo('standalone') as mocked_client:
dd_run_check(check)
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
aggregator.reset()
msg = "Testing"
mocked_client.list_database_names = mock.MagicMock(side_effect=error_cls(msg))
with pytest.raises(Exception, match=f"{error_cls.__name__}: {msg}"):
dd_run_check(check)
aggregator.assert_service_check('mongodb.can_connect', MongoDb.CRITICAL)

0 comments on commit c845677

Please sign in to comment.