From d9dbbf73c3e00c53f03b04750107e28938fbe181 Mon Sep 17 00:00:00 2001 From: Graham Herceg Date: Thu, 29 Aug 2024 11:08:01 -0400 Subject: [PATCH 1/3] Wrap health check in try/except --- corehq/apps/hqadmin/service_checks.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/corehq/apps/hqadmin/service_checks.py b/corehq/apps/hqadmin/service_checks.py index ec32f18cd751..bcdaeb42ed54 100644 --- a/corehq/apps/hqadmin/service_checks.py +++ b/corehq/apps/hqadmin/service_checks.py @@ -123,7 +123,11 @@ def check_kafka(): @change_log_level('urllib3.connectionpool', logging.WARNING) def check_elasticsearch(): - cluster_health = check_es_cluster_health() + try: + cluster_health = check_es_cluster_health() + except Exception: + return ServiceStatus(False, "Something went wrong checking cluster health") + if cluster_health == 'red': return ServiceStatus(False, "Cluster health at %s" % cluster_health) From ccfc4345244a47ead689a16f512a5fa5893f4dbd Mon Sep 17 00:00:00 2001 From: Graham Herceg Date: Thu, 29 Aug 2024 11:09:42 -0400 Subject: [PATCH 2/3] Fix lint errors --- corehq/apps/hqadmin/service_checks.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/corehq/apps/hqadmin/service_checks.py b/corehq/apps/hqadmin/service_checks.py index bcdaeb42ed54..be81a0b19e5f 100644 --- a/corehq/apps/hqadmin/service_checks.py +++ b/corehq/apps/hqadmin/service_checks.py @@ -78,9 +78,7 @@ def check_all_rabbitmq(): return ServiceStatus(True, 'RabbitMQ OK') else: - return ServiceStatus(False, '; '.join(['{}:{}'.format(rabbit[0], rabbit[1]) - for rabbit in unwell_rabbits]) - ) + return ServiceStatus(False, "; ".join(["{}:{}".format(rabbit[0], rabbit[1]) for rabbit in unwell_rabbits])) def check_rabbitmq(broker_url): @@ -180,8 +178,9 @@ def check_celery(): bad_queues.append( f"{queue} has been blocked for {blockage_duration} (max allowed is {threshold})" ) - elif (heartbeat_time_to_start is not None and - heartbeat_time_to_start > max(threshold, datetime.timedelta(minutes=5))): + elif heartbeat_time_to_start is not None and heartbeat_time_to_start > max( + threshold, datetime.timedelta(minutes=5) + ): bad_queues.append( f"{queue} is delayed for {heartbeat_time_to_start} (max allowed is {threshold})" ) @@ -198,7 +197,7 @@ def check_postgres(): for db in settings.DATABASES: db_conn = connections[db] try: - c = db_conn.cursor() + db_conn.cursor() c_status = 'OK' except OperationalError: c_status = 'FAIL' From 8072d469cf260038e2b4ae92694efa2ef903a732 Mon Sep 17 00:00:00 2001 From: Graham Herceg Date: Thu, 29 Aug 2024 13:21:06 -0400 Subject: [PATCH 3/3] Send error to sentry --- corehq/apps/hqadmin/service_checks.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/corehq/apps/hqadmin/service_checks.py b/corehq/apps/hqadmin/service_checks.py index be81a0b19e5f..57883a20095b 100644 --- a/corehq/apps/hqadmin/service_checks.py +++ b/corehq/apps/hqadmin/service_checks.py @@ -19,6 +19,9 @@ import attr import gevent + +from dimagi.utils.logging import notify_exception + from corehq.apps.app_manager.models import Application from corehq.apps.change_feed.connection import ( get_kafka_client, @@ -124,6 +127,7 @@ def check_elasticsearch(): try: cluster_health = check_es_cluster_health() except Exception: + notify_exception(None, message="Error while checking elasticsearch cluster health") return ServiceStatus(False, "Something went wrong checking cluster health") if cluster_health == 'red':