Skip to content

Commit

Permalink
add health monitor
Browse files Browse the repository at this point in the history
  • Loading branch information
wguanicedew committed Nov 12, 2024
1 parent 6d5c7af commit 83c2b44
Show file tree
Hide file tree
Showing 6 changed files with 256 additions and 1 deletion.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ RUN ln -fs /opt/idds/config/idds/supervisord_idds.ini /etc/supervisord.d/idds.in
RUN ln -fs /opt/idds/config/idds/supervisord_httpd.ini /etc/supervisord.d/httpd.ini
# RUN ln -fs /opt/idds/config/idds/supervisord_syslog-ng.ini /etc/supervisord.d/syslog-ng.ini
RUN ln -fs /opt/idds/config/idds/supervisord_logrotate.ini /etc/supervisord.d/logrotate.ini
RUN ln -fs /opt/idds/config/idds/supervisord_healthmonitor.ini /etc/supervisord.d/healthmonitor.ini
RUN ln -fs /opt/idds/config/idds/logrotate_idds /etc/logrotate.d/idds

# for syslog-ng
Expand Down
4 changes: 4 additions & 0 deletions main/config_default/healthmonitor_daemon
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Health monitor loop: run the iDDS health check script, wait 600 seconds,
# and repeat until this process is stopped.

while true
do
    python /opt/idds/config/idds/idds_health_check.py
    sleep 600
done
223 changes: 223 additions & 0 deletions main/config_default/idds_health_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
#!/usr/bin/python

"""
check iDDS health
"""

import json
import os
import re
import subprocess
import time


def check_command(command, check_string, timeout=60):
    """Run *command* and report whether its standard output matches *check_string*.

    The command is executed without a shell. It is therefore tokenised here
    with shell-like quoting rules (so a quoted argument containing spaces
    stays one argument) and ``$VAR`` / ``${VAR}`` references are expanded from
    the environment of this process. Unset variables are left untouched.

    Args:
        command: Command line to execute, given as one string.
        check_string: Pattern looked up in the command's stdout with
            ``re.search``.
        timeout: Maximum number of seconds to wait for the command; ``None``
            waits indefinitely.

    Returns:
        100 if the pattern was found in the output, otherwise 0. 0 is also
        returned when the command line cannot be parsed, the executable cannot
        be started, or the command does not finish within *timeout*.
    """
    import shlex  # local import: the file-level import block does not provide shlex

    print("Checking command : {0}".format(command))
    print("For string : {0}".format(check_string))

    try:
        # Split first, expand afterwards, so that an expanded value containing
        # spaces or quotes can never change the argument boundaries.
        argv = [os.path.expandvars(token) for token in shlex.split(command)]
    except ValueError as ex:
        # shlex raises ValueError on unbalanced quotes.
        print("Cannot parse command: {0}".format(ex))
        return 0

    if not argv:
        print("Empty command, return 0")
        return 0

    try:
        completed = subprocess.run(argv, stdout=subprocess.PIPE, timeout=timeout)
    except (OSError, subprocess.TimeoutExpired) as ex:
        # Missing executable or a hung command must count as "unavailable"
        # rather than crash or block the health check.
        print("Failed to run command: {0}".format(ex))
        return 0

    # Be lenient with the encoding: a stray non-ASCII byte in the response
    # must not abort the check.
    output = completed.stdout.decode("utf-8", errors="replace")

    if re.search(check_string, output):
        print("Found the string, return 100")
        return 100

    print("String not found, return 0")
    return 0


def is_logrotate_running():
    """Tell whether at least one process mentioning "logrotate" is running.

    The process table is listed with ``ps`` and the lines containing
    "logrotate" (excluding the grep helpers themselves) are counted.

    Returns:
        True if one or more matching processes were counted, False otherwise.
        False is also returned when the count cannot be parsed.
    """
    # NOTE(review): the pattern matches any command line containing
    # "logrotate", so a long-running wrapper script with that word in its
    # name would presumably also be counted - confirm this is intended.
    output = (
        subprocess.Popen(
            "ps -eo pgid,args | grep logrotate | grep -v grep | wc -l",
            stdout=subprocess.PIPE,
            shell=True,
        )
        .communicate()[0]
        .decode("ascii")
    )

    try:
        n_logrotate_processes = int(output.strip())
    except ValueError:
        print(
            "The string has an unexpected format and couldn't be converted to an integer."
        )
        # Without a usable count we cannot claim logrotate is running.
        return False

    if n_logrotate_processes >= 1:
        print("Logrotate is running")
        return True

    return False


def is_restarting():
    """Tell whether an http restart appears to be in progress.

    The process table is listed with ``ps`` and the lines containing both
    "restart" and "http" (excluding the grep helpers themselves) are counted.

    Returns:
        True if one or more matching processes were counted, False otherwise.
        False is also returned when the count cannot be parsed.
    """
    output = (
        subprocess.Popen(
            "ps -eo pgid,args | grep restart|grep http | grep -v grep | wc -l",
            stdout=subprocess.PIPE,
            shell=True,
        )
        .communicate()[0]
        .decode("ascii")
    )

    try:
        n_restarting_processes = int(output.strip())
    except ValueError:
        print(
            "The string has an unexpected format and couldn't be converted to an integer."
        )
        # Without a usable count we cannot claim a restart is in progress.
        return False

    if n_restarting_processes >= 1:
        print("http is restarting")
        return True

    return False


def http_availability(host):
    """Probe the iDDS REST ping endpoint on *host* and return a score.

    Probing order:
      1. If X509_USER_PROXY is set, ping with the proxy certificate.
      2. Otherwise, if PANDA_AUTH, PANDA_AUTH_VO and PANDA_AUTH_ID_TOKEN are
         all set, ping with the token headers.
      3. If nothing succeeded so far, ping without credentials and look for
         'IDDSException' in the reply (presumably the server's answer to an
         unauthenticated request - confirm).

    The ``$VAR`` placeholders in the command strings are handed to
    ``check_command`` verbatim.

    Returns:
        The value from ``check_command`` (100 on a match, 0 otherwise), except
        that 1 is returned when every probe failed while both logrotate is
        running and http is restarting.
    """
    ok_pattern = '"Status": "OK"'
    env = os.environ
    avail = 0

    if env.get('X509_USER_PROXY', None):
        curl = "curl -i -k --cert $X509_USER_PROXY --key $X509_USER_PROXY --cacert $X509_USER_PROXY https://%s:8443/idds/ping" % host
        avail = check_command(curl, ok_pattern)
        print("http check availability (with proxy): %s" % avail)
    elif env.get('PANDA_AUTH', None) and env.get('PANDA_AUTH_VO', None) and env.get('PANDA_AUTH_ID_TOKEN', None):
        curl = "curl -i -k -H \"X-IDDS-Auth-Type: ${PANDA_AUTH}\" -H \"X-IDDS-Auth-VO: ${PANDA_AUTH_VO}\" -H \"X-Idds-Auth-Token: ${PANDA_AUTH_ID_TOKEN}\" https://%s:8443/idds/ping" % host
        avail = check_command(curl, ok_pattern)
        print("http check availability (with oidc token): %s" % avail)

    # Fall back to an unauthenticated ping when no authenticated probe passed.
    if not avail:
        curl = "curl -i -k https://%s:8443/idds/ping" % host
        avail = check_command(curl, 'IDDSException')
        print("http check availability (without proxy): %s" % avail)

    if avail:
        return avail

    # Everything failed: both checks are always evaluated, then a restart
    # during log rotation is reported with the small non-zero score 1.
    logrotate_running = is_logrotate_running()
    restarting = is_restarting()
    if logrotate_running and restarting:
        print("log rotation is running and http is restarting")
        return 1
    return avail


def process_availability():
    """Check whether any iDDS agent process is present in the process table.

    Lists processes with ``ps`` and keeps the lines mentioning
    'idds/agents/main.py' (the grep helpers themselves are filtered out).

    Returns:
        100 if at least one matching line was found, otherwise 0.
    """
    listing = subprocess.Popen(
        "ps -eo pgid,args | grep 'idds/agents/main.py' | grep -v grep | uniq",
        stdout=subprocess.PIPE,
        shell=True,
    )
    raw_stdout, _ = listing.communicate()
    text = raw_stdout.decode("ascii")

    # Only non-blank lines count as a running agent process.
    matching_lines = [entry for entry in text.split("\n") if entry.strip() != ""]
    process_avail = 100 if len(matching_lines) >= 1 else 0

    print("agent process check availability: %s" % process_avail)
    return process_avail


def heartbeat_availability(log_location, max_age=1800):
    """Derive an availability score from the agents' heartbeat file.

    The heartbeat file is ``<log_location>/idds_availability``. It is read as
    JSON mapping an agent name to a dict that provides ``num_hang_workers``
    and ``num_active_workers``.

    Args:
        log_location: Directory that contains the heartbeat file.
        max_age: Maximum age of the file in seconds; an older file is treated
            as stale.

    Returns:
        A tuple ``(avail, hang_workers)``:
          * ``avail`` is 0 when the file is missing or stale, 50 when it
            cannot be parsed, and otherwise the lowest per-agent score, where
            an agent's score is the percentage of its active workers that are
            not hanging (100 when no worker hangs).
          * ``hang_workers`` is the number of hanging workers summed over the
            agents processed.
    """
    avail = 100
    hang_workers = 0
    heartbeat_file = os.path.join(log_location, 'idds_availability')

    # Ask for the mtime directly instead of exists()+getmtime(): the file may
    # disappear between the two calls.
    try:
        mod_time = os.path.getmtime(heartbeat_file)
    except OSError:
        avail = 0
        print("idds_heartbeat at %s not exist, avail: %s" % (heartbeat_file, avail))
        return avail, hang_workers

    now = time.time()
    print("idds_heartbeat updated at %s (currently is %s, %s seconds ago)" % (mod_time, now, now - mod_time))
    if mod_time < now - max_age:
        # Stale heartbeat: the agents stopped reporting.
        avail = 0
        return avail, hang_workers

    try:
        with open(heartbeat_file, 'r') as f:
            d = json.load(f)
        for agent, info in d.items():
            num_hang_workers = info['num_hang_workers']
            num_active_workers = info['num_active_workers']
            if num_active_workers > 0 and num_hang_workers > 0:
                hang_workers += num_hang_workers
                # Availability is the share of workers that are NOT hanging;
                # clamp at 0 in case more hanging than active workers are
                # reported.
                hang_percent = num_hang_workers * 100 / num_active_workers
                agent_avail = max(0, int(100 - hang_percent))
                avail = min(avail, agent_avail)
                print("iDDS agent %s has %s hang workers" % (agent, num_hang_workers))
    except Exception as ex:
        # Best effort: an unreadable or malformed heartbeat degrades the
        # score instead of aborting the whole health check.
        print("Failed to parse idds_heartbeat: %s" % str(ex))
        avail = 50

    return avail, hang_workers


def idds_availability(host, log_location):
    """Combine the http, agent-process and heartbeat checks into one verdict.

    Args:
        host: Host name passed to ``http_availability``.
        log_location: Directory passed to ``heartbeat_availability``.

    Returns:
        A tuple ``(availability, avail_info, infos)`` where ``availability``
        is the overall score, ``avail_info`` a human-readable explanation and
        ``infos`` a dict holding ``num_hang_workers``.
    """
    http_avail = http_availability(host)
    print(f"http avail: {http_avail}")

    process_avail = process_availability()
    print(f"agent daemon avail: {process_avail}")

    heartbeat_avail, hang_workers = heartbeat_availability(log_location)
    print(f"heartbeat avail: {heartbeat_avail}, hang workers: {hang_workers}")

    infos = {'num_hang_workers': hang_workers}

    # The checks are ranked: http first, then the agent processes, then the
    # heartbeat details.
    if not http_avail:
        availability, avail_info = 0, "iDDS http rest service is not running"
    elif not process_avail:
        availability, avail_info = 50, "iDDS agents are not running"
    elif not heartbeat_avail:
        availability = 50
        avail_info = "iDDS agents are running. However heartbeat file is not found (or not renewed)"
    elif heartbeat_avail < 100:
        availability = heartbeat_avail
        avail_info = "iDDS agents are running. However there are hanging workers"
    else:
        availability, avail_info = heartbeat_avail, "iDDS is OK"

    print("availability: %s, avail_info: %s, infos: %s" % (availability, avail_info, infos))

    return availability, avail_info, infos


def main(host='localhost', log_location='/var/log/idds'):
    """Run the health check and publish the result as a marker file.

    When the overall availability is at least 100, the file
    ``<log_location>/idds_health`` is (re)written with the text ``OK``;
    otherwise the file is removed if it exists.

    Args:
        host: Host whose iDDS REST service is probed.
        log_location: Directory holding the heartbeat file and receiving the
            ``idds_health`` marker file.
    """
    avail, _avail_info, _infos = idds_availability(host, log_location)

    health_file = os.path.join(log_location, 'idds_health')
    if avail >= 100:
        with open(health_file, 'w') as f:
            f.write('OK')
    else:
        # Remove directly and tolerate absence: avoids the race between an
        # existence check and the removal.
        try:
            os.remove(health_file)
        except FileNotFoundError:
            pass


if __name__ == '__main__':
    main()
4 changes: 3 additions & 1 deletion main/config_default/logrotate_daemon
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/bin/bash
# while true; do /usr/sbin/logrotate -s /var/log/idds/logrotate.status -d /etc/logrotate.d/idds; sleep 86400; done

while true; do /usr/sbin/logrotate -s /var/log/idds/logrotate.status /etc/logrotate.d/idds >> /var/log/idds/logrotate.log 2>&1; sleep 3600; done
# while true; do /usr/sbin/logrotate -s /var/log/idds/logrotate.status /etc/logrotate.d/idds >> /var/log/idds/logrotate.log 2>&1; sleep 3600; done

while true; do /usr/sbin/logrotate -s /var/log/idds/logrotate.status /etc/logrotate.d/idds; sleep 86400; done
17 changes: 17 additions & 0 deletions main/config_default/supervisord_healthmonitor.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# supervisord program definition for the iDDS health monitor: runs the
# healthmonitor_daemon script and keeps its stdout and stderr in separate
# log files under /var/log/idds.
[program:health-monitor]
# NOTE(review): the commented-out logrotate command below looks like a
# leftover from the logrotate setup - confirm and remove.
# command=/usr/sbin/logrotate -s /var/log/idds/logrotate.status -d /etc/logrotate.d/idds
command=/opt/idds/config/idds/healthmonitor_daemon
# process_name=%(process_num)02d
# user=atlpan
# NOTE(review): childlogdir is presumably a [supervisord]-section option
# rather than a [program:x] one - verify against the supervisor docs.
childlogdir=/var/log/idds
stdout_logfile=/var/log/idds/%(program_name)s-stdout.log
stderr_logfile=/var/log/idds/%(program_name)s-stderr.log
stdout_logfile_maxbytes=2GB
stderr_logfile_maxbytes=2GB
stdout_logfile_backups=1
stderr_logfile_backups=1
# stderr goes to its own log file (stderr_logfile above), not into stdout.
redirect_stderr=false
autorestart=true
stopsignal=TERM
stopasgroup=true
# NOTE(review): exitcodes presumably only matters with autorestart=unexpected;
# with autorestart=true it likely has no effect - confirm.
exitcodes=1
8 changes: 8 additions & 0 deletions start-daemon.sh
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,19 @@ else
cp /opt/idds/config_default/supervisord_httpd.ini /opt/idds/config/idds/supervisord_httpd.ini
# cp /opt/idds/config_default/supervisord_syslog-ng.ini /opt/idds/config/idds/supervisord_syslog-ng.ini

echo "setup log rotation"
cp /opt/idds/config_default/supervisord_logrotate.ini /opt/idds/config/idds/supervisord_logrotate.ini
cp /opt/idds/config_default/logrotate_idds /opt/idds/config/idds/logrotate_idds
cp /opt/idds/config_default/logrotate_daemon /opt/idds/config/idds/logrotate_daemon
chmod +x /opt/idds/config/idds/logrotate_daemon
chown root /opt/idds/config/idds/logrotate_idds

echo "setup health monitor"
cp /opt/idds/config_default/supervisord_healthmonitor.ini /opt/idds/config/idds/
cp /opt/idds/config_default/healthmonitor_daemon /opt/idds/config/idds/
cp /opt/idds/config_default/idds_health_check.py /opt/idds/config/idds/
chmod +x /opt/idds/config/idds/healthmonitor_daemon
chmod +x /opt/idds/config/idds/idds_health_check.py
fi

if [ -f /etc/grid-security/hostkey.pem ]; then
Expand Down

0 comments on commit 83c2b44

Please sign in to comment.