Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Submit critical service check whenever connection fails #15208

Merged
merged 18 commits into from
Aug 8, 2023
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 27 additions & 6 deletions mongo/datadog_checks/mongo/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,32 @@
# Licensed under a 3-clause BSD style license (see LICENSE)

from pymongo import MongoClient, ReadPreference
from pymongo.errors import ConnectionFailure
from pymongo.errors import (
ConfigurationError,
ConnectionFailure,
OperationFailure,
ProtocolError,
ServerSelectionTimeoutError,
)

from datadog_checks.mongo.common import MongosDeployment, ReplicaSetDeployment, StandaloneDeployment

# The name of the application that created this MongoClient instance. MongoDB 3.4 and newer will print this value in
# the server log upon establishing each connection. It is also recorded in the slow query log and profile collections.
DD_APP_NAME = 'datadog-agent'

# All pymongo exceptions that should result in a CRITICAL service check are collected here.
# The tuple is used directly in `except CRITICAL_FAILURE` clauses, so every member must be an exception class.
CRITICAL_FAILURE = (
ConfigurationError, # Raised when something is incorrectly configured, e.g. when TLS is misconfigured.
ConnectionFailure, # This is a generic exception for any problems when connecting to mongodb.
OperationFailure, # Raised when a database operation fails, e.g. when authentication is incorrect.
# This means either no server is available or a replica set has not elected a primary in the timeout window.
# In both cases it makes sense to submit a CRITICAL service check to Datadog.
ServerSelectionTimeoutError,
# Errors at the level of the protocol result in a lost/degraded connection. We can issue a CRITICAL check for this.
ProtocolError,
)


class MongoApi(object):
"""Mongodb connection through pymongo.MongoClient
Expand Down Expand Up @@ -77,7 +95,7 @@ def _get_rs_deployment_from_status_payload(repl_set_payload, cluster_role):
replset_state = repl_set_payload["myState"]
return ReplicaSetDeployment(replset_name, replset_state, cluster_role=cluster_role)

def get_deployment_type(self):
def refresh_deployment_type(self):
# getCmdLineOpts is the runtime configuration of the mongo instance. Helpful to know whether the node is
# a mongos or mongod, if the mongod is in a shard, if it's in a replica set, etc.
try:
Expand All @@ -87,12 +105,14 @@ def get_deployment_type(self):
"Unable to run `getCmdLineOpts`, got: %s. Assuming this is an Alibaba ApsaraDB instance.", str(e)
)
# `getCmdLineOpts` is forbidden on Alibaba ApsaraDB
return self._get_alibaba_deployment_type()
self.deployment_type = self._get_alibaba_deployment_type()
return
cluster_role = None
if 'sharding' in options:
if 'configDB' in options['sharding']:
self._log.debug("Detected MongosDeployment. Node is principal.")
return MongosDeployment()
self.deployment_type = MongosDeployment()
return
elif 'clusterRole' in options['sharding']:
cluster_role = options['sharding']['clusterRole']

Expand All @@ -103,10 +123,11 @@ def get_deployment_type(self):
is_principal = replica_set_deployment.is_principal()
is_principal_log = "" if is_principal else "not "
self._log.debug("Detected ReplicaSetDeployment. Node is %sprincipal.", is_principal_log)
return replica_set_deployment
self.deployment_type = replica_set_deployment
return

self._log.debug("Detected StandaloneDeployment. Node is principal.")
return StandaloneDeployment()
self.deployment_type = StandaloneDeployment()

def _get_alibaba_deployment_type(self):
is_master_payload = self['admin'].command('isMaster')
Expand Down
3 changes: 3 additions & 0 deletions mongo/datadog_checks/mongo/collectors/custom_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pymongo
from dateutil.tz import tzutc

from datadog_checks.mongo.api import CRITICAL_FAILURE
from datadog_checks.mongo.collectors.base import MongoCollector
from datadog_checks.mongo.common import (
ALLOWED_CUSTOM_METRICS_TYPES,
Expand Down Expand Up @@ -202,6 +203,8 @@ def collect(self, api):
for raw_query in self.custom_queries:
try:
self._collect_custom_metrics_for_query(api, raw_query)
except CRITICAL_FAILURE as e:
raise e # Critical failures must bubble up to trigger a CRITICAL service check.
iliakur marked this conversation as resolved.
Show resolved Hide resolved
except Exception as e:
metric_prefix = raw_query.get('metric_prefix')
self.log.warning("Errors while collecting custom metrics with prefix %s", metric_prefix, exc_info=e)
1 change: 1 addition & 0 deletions mongo/datadog_checks/mongo/collectors/index_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ def collect(self, api):
self.gauge('mongodb.collection.indexes.accesses.ops', val, idx_tags)
except Exception as e:
self.log.error("Could not fetch indexes stats for collection %s: %s", coll_name, e)
raise e
4 changes: 2 additions & 2 deletions mongo/datadog_checks/mongo/collectors/session_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def collect(self, api):
sessions_count = next(
config_db['system.sessions'].aggregate([{"$listSessions": {"allUsers": True}}, {"$count": "total"}])
)['total']
except Exception:
except Exception as e:
self.log.info('Unable to fetch system.session statistics.')
iliakur marked this conversation as resolved.
Show resolved Hide resolved
return
raise e
metric_name = self._normalize("sessions.count", AgentCheck.gauge)
self.check.gauge(metric_name, sessions_count, tags=self.base_tags)
62 changes: 33 additions & 29 deletions mongo/datadog_checks/mongo/mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
from __future__ import division

from copy import deepcopy
from functools import cached_property

from packaging.version import Version

from datadog_checks.base import AgentCheck, is_affirmative
from datadog_checks.mongo.api import MongoApi
from datadog_checks.mongo.api import CRITICAL_FAILURE, MongoApi
from datadog_checks.mongo.collectors import (
CollStatsCollector,
CustomQueriesCollector,
Expand Down Expand Up @@ -74,9 +75,10 @@ def __init__(self, name, init_config, instances=None):
self._api_client = None
self._mongo_version = None

@property
@cached_property
def api_client(self):
return self._api_client
# This needs to be a property for our unit test mocks to work.
return MongoApi(self._config, self.log)

def refresh_collectors(self, deployment_type, all_dbs, tags):
collect_tcmalloc_metrics = 'tcmalloc' in self._config.additional_metrics
Expand Down Expand Up @@ -159,36 +161,33 @@ def _build_metric_list_to_collect(self):
return metrics_to_collect

def _refresh_replica_role(self):
if self._api_client and (
self._api_client.deployment_type is None
or isinstance(self._api_client.deployment_type, ReplicaSetDeployment)
):
if self.api_client.deployment_type is None or isinstance(self.api_client.deployment_type, ReplicaSetDeployment):
self.log.debug("Refreshing deployment type")
self._api_client.deployment_type = self._api_client.get_deployment_type()
self.api_client.refresh_deployment_type()

def check(self, _):
if self._connect():
self._check()
try:
self._refresh_metadata()
self._collect_metrics()
except CRITICAL_FAILURE as e:
self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self._config.service_check_tags)
self._unset_metadata()
raise e # Let exception bubble up to global handler and show full error in the logs.
else:
self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self._config.service_check_tags)

def _connect(self) -> bool:
if self._api_client is None:
try:
self._api_client = MongoApi(self._config, self.log)
self.log.debug("Connecting to '%s'", self._config.hosts)
self._api_client.connect()
self.log.debug("Connected!")
self._mongo_version = self.api_client.server_info().get('version', '0.0')
self.set_metadata('version', self._mongo_version)
self.log.debug('version: %s', self._mongo_version)
except Exception as e:
self._api_client = None
self.log.error('Exception: %s', e)
self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self._config.service_check_tags)
return False
self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self._config.service_check_tags)
return True

def _check(self):
def _refresh_metadata(self):
if self._mongo_version is None:
self.log.debug('No metadata present, refreshing it.')
self._mongo_version = self.api_client.server_info().get('version', '0.0')
self.set_metadata('version', self._mongo_version)
self.log.debug('version: %s', self._mongo_version)

def _unset_metadata(self):
self.log.debug('Due to connection failure we will need to reset the metadata.')
self._mongo_version = None

def _collect_metrics(self):
self._refresh_replica_role()
tags = deepcopy(self._config.metric_tags)
deployment = self.api_client.deployment_type
Expand All @@ -209,6 +208,11 @@ def _check(self):
for collector in self.collectors:
try:
collector.collect(self.api_client)
except CRITICAL_FAILURE as e:
self.log.info(
"Unable to collect logs from collector %s. Some metrics will be missing.", collector, exc_info=True
)
raise e # Critical failures must bubble up to trigger a CRITICAL service check.
except Exception:
self.log.info(
"Unable to collect logs from collector %s. Some metrics will be missing.", collector, exc_info=True
Expand Down
12 changes: 10 additions & 2 deletions mongo/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,8 @@ def test_mongod_bad_auth(check, dd_run_check, aggregator, username, password):
'options': {'authSource': 'authDB'},
}
mongo_check = check(instance)
dd_run_check(mongo_check)
with pytest.raises(Exception, match="pymongo.errors.OperationFailure: Authentication failed"):
dd_run_check(mongo_check)
aggregator.assert_service_check('mongodb.can_connect', status=MongoDb.CRITICAL)


Expand Down Expand Up @@ -651,5 +652,12 @@ def test_mongod_tls_fail(check, dd_run_check, aggregator):
'tls_ca_file': '{}/ca.pem'.format(TLS_CERTS_FOLDER),
}
mongo_check = check(instance)
dd_run_check(mongo_check)
with pytest.raises(
Exception,
match=(
"pymongo.errors.ConfigurationError:"
r" Private key doesn't match certificate: \[SSL\] PEM lib \(_ssl.c:4065\)"
iliakur marked this conversation as resolved.
Show resolved Hide resolved
),
):
dd_run_check(mongo_check)
aggregator.assert_service_check('mongodb.can_connect', status=MongoDb.CRITICAL)
39 changes: 24 additions & 15 deletions mongo/tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from datadog_checks.base import ConfigurationError
from datadog_checks.mongo import MongoDb, metrics
from datadog_checks.mongo.api import MongoApi
from datadog_checks.mongo.api import CRITICAL_FAILURE, MongoApi
from datadog_checks.mongo.collectors import MongoCollector
from datadog_checks.mongo.common import MongosDeployment, ReplicaSetDeployment, get_state_name
from datadog_checks.mongo.config import MongoConfig
Expand Down Expand Up @@ -46,12 +46,13 @@ def test_emits_critical_service_check_when_service_is_not_available(mock_command
# Given
check = MongoDb('mongo', {}, [{'hosts': ['localhost']}])
# When
dd_run_check(check)
with pytest.raises(Exception, match="pymongo.errors.ConnectionFailure: Service not available"):
dd_run_check(check)
# Then
aggregator.assert_service_check('mongodb.can_connect', MongoDb.CRITICAL)


@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}])
@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}])
@mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
@mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
def test_emits_ok_service_check_when_service_is_available(
Expand All @@ -66,7 +67,7 @@ def test_emits_ok_service_check_when_service_is_available(
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)


@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}])
@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}])
@mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
@mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
def test_emits_ok_service_check_each_run_when_service_is_available(
Expand All @@ -82,7 +83,7 @@ def test_emits_ok_service_check_each_run_when_service_is_available(
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK, count=2)


@mock.patch('pymongo.database.Database.command', side_effect=[{'ok': 1}, {'parsed': {}}])
@mock.patch('pymongo.database.Database.command', side_effect=[{'parsed': {}}])
@mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
@mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
def test_version_metadata(
Expand All @@ -109,7 +110,7 @@ def test_version_metadata(

@mock.patch(
'pymongo.database.Database.command',
side_effect=[{'ok': 1}, Exception('getCmdLineOpts exception'), {'msg': 'isdbgrid'}],
side_effect=[Exception('getCmdLineOpts exception'), {'msg': 'isdbgrid'}],
)
@mock.patch('pymongo.mongo_client.MongoClient.server_info', return_value={'version': '5.0.0'})
@mock.patch('pymongo.mongo_client.MongoClient.list_database_names', return_value=[])
Expand All @@ -123,15 +124,14 @@ def test_emits_ok_service_check_when_alibaba_mongos_deployment(
dd_run_check(check)
# Then
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
mock_command.assert_has_calls([mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster')])
mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster')])
mock_server_info.assert_called_once()
mock_list_database_names.assert_called_once()


@mock.patch(
'pymongo.database.Database.command',
side_effect=[
{'ok': 1},
Exception('getCmdLineOpts exception'),
{},
{'configsvr': True, 'set': 'replset', "myState": 1},
Expand All @@ -149,17 +149,14 @@ def test_emits_ok_service_check_when_alibaba_replicaset_role_configsvr_deploymen
dd_run_check(check)
# Then
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
mock_command.assert_has_calls(
[mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')]
)
mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')])
mock_server_info.assert_called_once()
mock_list_database_names.assert_called_once()


@mock.patch(
'pymongo.database.Database.command',
side_effect=[
{'ok': 1},
Exception('getCmdLineOpts exception'),
{},
{'configsvr': True, 'set': 'replset', "myState": 3},
Expand All @@ -177,9 +174,7 @@ def test_when_replicaset_state_recovering_then_database_names_not_called(
dd_run_check(check)
# Then
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
mock_command.assert_has_calls(
[mock.call('ping'), mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')]
)
mock_command.assert_has_calls([mock.call('getCmdLineOpts'), mock.call('isMaster'), mock.call('replSetGetStatus')])
mock_server_info.assert_called_once()
mock_list_database_names.assert_not_called()

Expand Down Expand Up @@ -601,3 +596,17 @@ def test_when_version_lower_than_3_6_then_no_session_metrics_reported(aggregator
dd_run_check(check)
# Then
aggregator.assert_metric('mongodb.sessions.count', count=0)


@pytest.mark.parametrize("error_cls", CRITICAL_FAILURE)
def test_service_check_critical_when_connection_dies(error_cls, aggregator, check, instance, dd_run_check):
check = check(instance)
with mock_pymongo('standalone') as mocked_client:
dd_run_check(check)
aggregator.assert_service_check('mongodb.can_connect', MongoDb.OK)
aggregator.reset()
msg = "Testing"
mocked_client.list_database_names = mock.MagicMock(side_effect=error_cls(msg))
with pytest.raises(Exception, match=f"{error_cls.__name__}: {msg}"):
dd_run_check(check)
aggregator.assert_service_check('mongodb.can_connect', MongoDb.CRITICAL)
Loading