From 83c2b4482461ac30908624a6eca89a628a7df1d2 Mon Sep 17 00:00:00 2001
From: Wen Guan
Date: Tue, 12 Nov 2024 14:03:42 +0100
Subject: [PATCH] add health monitor

---
 Dockerfile                                    |   1 +
 main/config_default/healthmonitor_daemon      |   4 +
 main/config_default/idds_health_check.py      | 272 ++++++++++++++++++
 main/config_default/logrotate_daemon          |   4 +-
 .../supervisord_healthmonitor.ini             |  17 ++
 start-daemon.sh                               |   8 +
 6 files changed, 305 insertions(+), 1 deletion(-)
 create mode 100755 main/config_default/healthmonitor_daemon
 create mode 100644 main/config_default/idds_health_check.py
 create mode 100644 main/config_default/supervisord_healthmonitor.ini

diff --git a/Dockerfile b/Dockerfile
index 054a8910..81a3757f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -157,6 +157,7 @@ RUN ln -fs /opt/idds/config/idds/supervisord_idds.ini /etc/supervisord.d/idds.in
 RUN ln -fs /opt/idds/config/idds/supervisord_httpd.ini /etc/supervisord.d/httpd.ini
 # RUN ln -fs /opt/idds/config/idds/supervisord_syslog-ng.ini /etc/supervisord.d/syslog-ng.ini
 RUN ln -fs /opt/idds/config/idds/supervisord_logrotate.ini /etc/supervisord.d/logrotate.ini
+RUN ln -fs /opt/idds/config/idds/supervisord_healthmonitor.ini /etc/supervisord.d/healthmonitor.ini
 RUN ln -fs /opt/idds/config/idds/logrotate_idds /etc/logrotate.d/idds
 
 # for syslog-ng
diff --git a/main/config_default/healthmonitor_daemon b/main/config_default/healthmonitor_daemon
new file mode 100755
index 00000000..39acb949
--- /dev/null
+++ b/main/config_default/healthmonitor_daemon
@@ -0,0 +1,4 @@
+#!/bin/bash
+# while true; do /usr/sbin/logrotate -s /var/log/idds/logrotate.status -d /etc/logrotate.d/idds; sleep 86400; done
+
+while true; do python /opt/idds/config/idds/idds_health_check.py; sleep 600; done
diff --git a/main/config_default/idds_health_check.py b/main/config_default/idds_health_check.py
new file mode 100644
index 00000000..f6002103
--- /dev/null
+++ b/main/config_default/idds_health_check.py
@@ -0,0 +1,272 @@
+#!/usr/bin/python
+
+"""
+check iDDS health
+"""
+
+import json
+import os
+import re
+import shlex
+import subprocess
+import time
+
+
+def check_command(command, check_string):
+    """
+    Run a command without a shell and search its standard output.
+
+    :param command: command line to run, as a single string.
+    :param check_string: regular expression to search for in the output.
+    :returns: 100 if check_string is found in the output, otherwise 0.
+    """
+    print("Checking command : {0}".format(command))
+    print("For string : {0}".format(check_string))
+
+    # No shell is involved: shlex keeps quoted arguments (e.g. HTTP headers)
+    # together, and environment variable references are expanded explicitly.
+    tmp_array = [os.path.expandvars(arg) for arg in shlex.split(command)]
+    try:
+        output = (
+            subprocess.Popen(tmp_array, stdout=subprocess.PIPE)
+            .communicate()[0]
+            .decode("utf-8", errors="replace")
+        )
+    except OSError as ex:
+        print("Failed to run the command: %s" % str(ex))
+        return 0
+
+    if re.search(check_string, output):
+        print("Found the string, return 100")
+        return 100
+    else:
+        print("String not found, return 0")
+        return 0
+
+
+def count_processes(pipeline):
+    """
+    Run a shell pipeline ending in 'wc -l' and return the reported count.
+
+    :param pipeline: shell pipeline whose output is a single integer.
+    :returns: the count, or 0 if the output cannot be parsed as an integer.
+    """
+    output = (
+        subprocess.Popen(pipeline, stdout=subprocess.PIPE, shell=True)
+        .communicate()[0]
+        .decode("utf-8", errors="replace")
+    )
+
+    try:
+        return int(output.strip())
+    except ValueError:
+        print(
+            "The string has an unexpected format and couldn't be converted to an integer."
+        )
+        return 0
+
+
+def is_logrotate_running():
+    """
+    Check for processes whose command line mentions 'logrotate'.
+
+    :returns: True if at least one matching process is found, otherwise False.
+    """
+    # NOTE(review): the pattern also matches the long-running logrotate_daemon
+    # wrapper script, not only the logrotate binary -- confirm this is intended.
+    n_logrotate_processes = count_processes(
+        "ps -eo pgid,args | grep logrotate | grep -v grep | wc -l"
+    )
+
+    if n_logrotate_processes >= 1:
+        print("Logrotate is running")
+        return True
+
+    return False
+
+
+def is_restarting():
+    """
+    Check for processes whose command line mentions 'restart' and 'http'.
+
+    :returns: True if at least one matching process is found, otherwise False.
+    """
+    n_restarting_processes = count_processes(
+        "ps -eo pgid,args | grep restart|grep http | grep -v grep | wc -l"
+    )
+
+    if n_restarting_processes >= 1:
+        print("http is restarting")
+        return True
+
+    return False
+
+
+def http_availability(host):
+    """
+    Check the iDDS REST service by pinging it with curl.
+
+    An authenticated ping is tried first (X509 proxy or OIDC token, depending
+    on the environment). If it fails, an unauthenticated ping is tried: an
+    IDDSException in the reply still shows that the service is answering.
+
+    :param host: host name of the iDDS REST service.
+    :returns: 100 if the service answers, 1 if it does not answer while log
+              rotation is running and http is restarting, otherwise 0.
+    """
+    avail = 0
+    if os.environ.get('X509_USER_PROXY', None):
+        curl = "curl -i -k --cert $X509_USER_PROXY --key $X509_USER_PROXY --cacert $X509_USER_PROXY https://%s:8443/idds/ping" % host
+        avail = check_command(curl, '"Status": "OK"')
+        print("http check availability (with proxy): %s" % avail)
+    elif os.environ.get('PANDA_AUTH', None) and os.environ.get('PANDA_AUTH_VO', None) and os.environ.get('PANDA_AUTH_ID_TOKEN', None):
+        curl = "curl -i -k -H \"X-IDDS-Auth-Type: ${PANDA_AUTH}\" -H \"X-IDDS-Auth-VO: ${PANDA_AUTH_VO}\" -H \"X-Idds-Auth-Token: ${PANDA_AUTH_ID_TOKEN}\" https://%s:8443/idds/ping" % host
+        avail = check_command(curl, '"Status": "OK"')
+        print("http check availability (with oidc token): %s" % avail)
+    if not avail:
+        curl = "curl -i -k https://%s:8443/idds/ping" % host
+        avail = check_command(curl, 'IDDSException')
+        print("http check availability (without proxy): %s" % avail)
+
+    if not avail:
+        logrotate_running = is_logrotate_running()
+        restarting = is_restarting()
+        if logrotate_running and restarting:
+            print("log rotation is running and http is restarting")
+            return 1
+    return avail
+
+
+def process_availability():
+    """
+    Check whether an iDDS agent process (idds/agents/main.py) is running.
+
+    :returns: 100 if at least one agent process is found, otherwise 0.
+    """
+    output = (
+        subprocess.Popen(
+            "ps -eo pgid,args | grep 'idds/agents/main.py' | grep -v grep | uniq",
+            stdout=subprocess.PIPE,
+            shell=True,
+        )
+        .communicate()[0]
+        .decode("utf-8", errors="replace")
+    )
+    count = sum(1 for line in output.split("\n") if line.strip())
+    process_avail = 100 if count >= 1 else 0
+
+    print("agent process check availability: %s" % process_avail)
+    return process_avail
+
+
+def heartbeat_availability(log_location):
+    """
+    Derive the agent availability from the heartbeat file.
+
+    :param log_location: directory containing the 'idds_availability' file.
+    :returns: tuple (avail, hang_workers). avail is 0 if the file is missing
+              or older than 30 minutes, 50 if it cannot be parsed, otherwise
+              the lowest percentage of non-hanging workers over all agents.
+    """
+    avail = 100
+    hang_workers = 0
+    heartbeat_file = os.path.join(log_location, 'idds_availability')
+    if not os.path.exists(heartbeat_file):
+        avail = 0
+        print("idds_heartbeat at %s not exist, avail: %s" % (heartbeat_file, avail))
+        return avail, hang_workers
+
+    mod_time = os.path.getmtime(heartbeat_file)
+    print("idds_heartbeat updated at %s (currently is %s, %s seconds ago)" % (mod_time, time.time(), time.time() - mod_time))
+    if mod_time < time.time() - 1800:
+        avail = 0
+        return avail, hang_workers
+
+    try:
+        with open(heartbeat_file, 'r') as f:
+            d = json.load(f)
+        for agent in d:
+            info = d[agent]
+            num_hang_workers = info['num_hang_workers']
+            num_active_workers = info['num_active_workers']
+            if num_active_workers > 0 and num_hang_workers > 0:
+                hang_workers += num_hang_workers
+                # availability is the share of workers that are not hanging
+                hang_percent = int(num_hang_workers * 100 / num_active_workers)
+                agent_avail = max(0, 100 - hang_percent)
+                if agent_avail < avail:
+                    avail = agent_avail
+                print("iDDS agent %s has %s hang workers" % (agent, num_hang_workers))
+    except Exception as ex:
+        print("Failed to parse idds_heartbeat: %s" % str(ex))
+        avail = 50
+
+    return avail, hang_workers
+
+
+def idds_availability(host, log_location):
+    """
+    Combine the http, agent process and heartbeat checks.
+
+    :param host: host name of the iDDS REST service.
+    :param log_location: directory containing the heartbeat file.
+    :returns: tuple (availability, avail_info, infos).
+    """
+    infos = {}
+    http_avail = http_availability(host)
+    print(f"http avail: {http_avail}")
+
+    process_avail = process_availability()
+    print(f"agent daemon avail: {process_avail}")
+
+    heartbeat_avail, hang_workers = heartbeat_availability(log_location)
+    print(f"heartbeat avail: {heartbeat_avail}, hang workers: {hang_workers}")
+    infos['num_hang_workers'] = hang_workers
+
+    if not http_avail:
+        availability = 0
+        avail_info = "iDDS http rest service is not running"
+    elif not process_avail:
+        availability = 50
+        avail_info = "iDDS agents are not running"
+    else:
+        if hang_workers:
+            availability = heartbeat_avail
+            avail_info = "iDDS agents are running. However there are hanging workers"
+        elif not heartbeat_avail:
+            availability = 50
+            avail_info = "iDDS agents are running. However heartbeat file is not found (or not renewed)"
+        elif heartbeat_avail < 100:
+            availability = heartbeat_avail
+            avail_info = "iDDS agents are running. However heartbeat file can not be parsed"
+        else:
+            availability = heartbeat_avail
+            avail_info = "iDDS is OK"
+
+    print("availability: %s, avail_info: %s, infos: %s" % (availability, avail_info, infos))
+
+    return availability, avail_info, infos
+
+
+def main():
+    """
+    Run the health check and maintain the 'idds_health' marker file.
+
+    The file is written with 'OK' when the availability is at least 100 and
+    removed otherwise.
+    """
+    host = 'localhost'
+    log_location = '/var/log/idds'
+    avail, avail_info, infos = idds_availability(host, log_location)
+
+    health_file = os.path.join(log_location, 'idds_health')
+    if avail >= 100:
+        with open(health_file, 'w') as f:
+            f.write('OK')
+    else:
+        if os.path.exists(health_file):
+            os.remove(health_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/main/config_default/logrotate_daemon b/main/config_default/logrotate_daemon
index 70091208..668f7dd9 100755
--- a/main/config_default/logrotate_daemon
+++ b/main/config_default/logrotate_daemon
@@ -1,4 +1,6 @@
 #!/bin/bash
 # while true; do /usr/sbin/logrotate -s /var/log/idds/logrotate.status -d /etc/logrotate.d/idds; sleep 86400; done
 
-while true; do /usr/sbin/logrotate -s /var/log/idds/logrotate.status /etc/logrotate.d/idds >> /var/log/idds/logrotate.log 2>&1; sleep 3600; done
+# while true; do /usr/sbin/logrotate -s /var/log/idds/logrotate.status /etc/logrotate.d/idds >> /var/log/idds/logrotate.log 2>&1; sleep 3600; done
+
+while true; do /usr/sbin/logrotate -s /var/log/idds/logrotate.status /etc/logrotate.d/idds; sleep 86400; done
diff --git a/main/config_default/supervisord_healthmonitor.ini b/main/config_default/supervisord_healthmonitor.ini
new file mode 100644
index 00000000..c23a9f56
--- /dev/null
+++ b/main/config_default/supervisord_healthmonitor.ini
@@ -0,0 +1,17 @@
+[program:health-monitor]
+# command=/usr/sbin/logrotate -s /var/log/idds/logrotate.status -d /etc/logrotate.d/idds
+command=/opt/idds/config/idds/healthmonitor_daemon
+# process_name=%(process_num)02d
+# user=atlpan
+childlogdir=/var/log/idds
+stdout_logfile=/var/log/idds/%(program_name)s-stdout.log
+stderr_logfile=/var/log/idds/%(program_name)s-stderr.log
+stdout_logfile_maxbytes=2GB
+stderr_logfile_maxbytes=2GB
+stdout_logfile_backups=1
+stderr_logfile_backups=1
+redirect_stderr=false
+autorestart=true
+stopsignal=TERM
+stopasgroup=true
+exitcodes=1
diff --git a/start-daemon.sh b/start-daemon.sh
index 8d050e7b..703277f9 100755
--- a/start-daemon.sh
+++ b/start-daemon.sh
@@ -134,11 +134,19 @@ else
     cp /opt/idds/config_default/supervisord_httpd.ini /opt/idds/config/idds/supervisord_httpd.ini
     # cp /opt/idds/config_default/supervisord_syslog-ng.ini /opt/idds/config/idds/supervisord_syslog-ng.ini
 
+    echo "setup log rotation"
     cp /opt/idds/config_default/supervisord_logrotate.ini /opt/idds/config/idds/supervisord_logrotate.ini
     cp /opt/idds/config_default/logrotate_idds /opt/idds/config/idds/logrotate_idds
     cp /opt/idds/config_default/logrotate_daemon /opt/idds/config/idds/logrotate_daemon
     chmod +x /opt/idds/config/idds/logrotate_daemon
     chown root /opt/idds/config/idds/logrotate_idds
+
+    echo "setup health monitor"
+    cp /opt/idds/config_default/supervisord_healthmonitor.ini /opt/idds/config/idds/
+    cp /opt/idds/config_default/healthmonitor_daemon /opt/idds/config/idds/
+    cp /opt/idds/config_default/idds_health_check.py /opt/idds/config/idds/
+    chmod +x /opt/idds/config/idds/healthmonitor_daemon
+    chmod +x /opt/idds/config/idds/idds_health_check.py
 fi
 
 if [ -f /etc/grid-security/hostkey.pem ]; then