Skip to content

Commit

Permalink
Merge pull request #693 from Alignak-monitoring/improve-stats
Browse files Browse the repository at this point in the history
Improve stats
  • Loading branch information
ddurieux authored Jan 19, 2017
2 parents 7b59276 + b585571 commit 09c8c91
Show file tree
Hide file tree
Showing 8 changed files with 598 additions and 56 deletions.
2 changes: 1 addition & 1 deletion alignak/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -1169,7 +1169,7 @@ def hook_point(self, hook_name):
'and set it to restart later', inst.get_name(), str(exp))
logger.exception('Exception %s', exp)
self.modules_manager.set_to_restart(inst)
statsmgr.incr('core.hook.%s' % hook_name, time.time() - _t0)
statsmgr.timer('core.hook.%s' % hook_name, time.time() - _t0)

def get_retention_data(self): # pylint: disable=R0201
"""Basic function to get retention data,
Expand Down
12 changes: 6 additions & 6 deletions alignak/daemons/arbiterdaemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ def load_modules_configuration_objects(self, raw_objects):
logger.error("Back trace of this remove: %s", output.getvalue())
output.close()
continue
statsmgr.incr('hook.get-objects', time.time() - _t0)
statsmgr.timer('core.hook.get_objects', time.time() - _t0)
types_creations = self.conf.types_creations
for type_c in types_creations:
(_, _, prop, dummy) = types_creations[type_c]
Expand Down Expand Up @@ -763,21 +763,21 @@ def run(self):
# Now the dispatcher job
_t0 = time.time()
self.dispatcher.check_alive()
statsmgr.incr('core.check-alive', time.time() - _t0)
statsmgr.timer('core.check-alive', time.time() - _t0)

_t0 = time.time()
self.dispatcher.check_dispatch()
statsmgr.incr('core.check-dispatch', time.time() - _t0)
statsmgr.timer('core.check-dispatch', time.time() - _t0)

# REF: doc/alignak-conf-dispatching.png (3)
_t0 = time.time()
self.dispatcher.prepare_dispatch()
self.dispatcher.dispatch()
statsmgr.incr('core.dispatch', time.time() - _t0)
statsmgr.timer('core.dispatch', time.time() - _t0)

_t0 = time.time()
self.dispatcher.check_bad_dispatch()
statsmgr.incr('core.check-bad-dispatch', time.time() - _t0)
statsmgr.timer('core.check-bad-dispatch', time.time() - _t0)

# Now get things from our module instances
self.get_objects_from_from_queues()
Expand All @@ -798,7 +798,7 @@ def run(self):

_t0 = time.time()
self.push_external_commands_to_schedulers()
statsmgr.incr('core.push-external-commands', time.time() - _t0)
statsmgr.timer('core.push-external-commands', time.time() - _t0)

# It's sent, do not keep them
# TODO: check if really sent. Queue by scheduler?
Expand Down
19 changes: 14 additions & 5 deletions alignak/daemons/brokerdaemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def pynag_con_init(self, _id, i_type='scheduler'):
"""
_t0 = time.time()
res = self.do_pynag_con_init(_id, i_type)
statsmgr.incr('con-init.%s' % i_type, time.time() - _t0)
statsmgr.timer('con-init.%s' % i_type, time.time() - _t0)
return res

def do_pynag_con_init(self, s_id, i_type='scheduler'):
Expand Down Expand Up @@ -325,7 +325,9 @@ def manage_brok(self, brok):
# Call all modules if they catch the call
for mod in self.modules_manager.get_internal_instances():
try:
_t0 = time.time()
mod.manage_brok(brok)
statsmgr.timer('core.manage-broks.%s' % mod.get_name(), time.time() - _t0)
except Exception as exp: # pylint: disable=broad-except
logger.warning("The mod %s raise an exception: %s, I'm tagging it to restart later",
mod.get_name(), str(exp))
Expand Down Expand Up @@ -779,19 +781,24 @@ def do_loop_turn(self):
if self.new_conf:
self.setup_new_conf()

# Maybe the last loop we raised some broks internally
# Maybe the last loop we raised some broks internally
_t0 = time.time()
# we should integrate them in broks
self.interger_internal_broks()
statsmgr.timer('get-new-broks.broker', time.time() - _t0)

_t0 = time.time()
# Also reap broks sent from the arbiters
self.interger_arbiter_broks()
statsmgr.timer('get-new-broks.arbiter', time.time() - _t0)

# Main job, go get broks in our distant daemons
types = ['scheduler', 'poller', 'reactionner', 'receiver']
for _type in types:
_t0 = time.time()
# And from schedulers
self.get_new_broks(i_type=_type)
statsmgr.incr('get-new-broks.%s' % _type, time.time() - _t0)
statsmgr.timer('get-new-broks.%s' % _type, time.time() - _t0)

# Sort the brok list by id
self.broks.sort(sort_by_ids)
Expand All @@ -809,7 +816,9 @@ def do_loop_turn(self):
# instead of killing ourselves :)
for mod in ext_modules:
try:
t000 = time.time()
mod.to_q.put(to_send)
statsmgr.timer('core.put-to-external-queue.%s' % mod.get_name(), time.time() - t000)
except Exception as exp: # pylint: disable=broad-except
# first we must find the modules
logger.warning("The mod %s queue raise an exception: %s, "
Expand All @@ -821,7 +830,7 @@ def do_loop_turn(self):
# No more need to send them
for brok in to_send:
brok.need_send_to_ext = False
statsmgr.incr('core.put-to-external-queue', time.time() - t00)
statsmgr.timer('core.put-to-external-queue', time.time() - t00)
logger.debug("Time to send %s broks (%d secs)", len(to_send), time.time() - t00)

# We must add new broks at the end of the list, so we reverse the list
Expand All @@ -842,7 +851,7 @@ def do_loop_turn(self):
brok.prepare()
_t0 = time.time()
self.manage_brok(brok)
statsmgr.incr('core.manage-brok', time.time() - _t0)
statsmgr.timer('core.manage-broks', time.time() - _t0)

nb_broks = len(self.broks)

Expand Down
9 changes: 7 additions & 2 deletions alignak/daemons/receiverdaemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ def push_external_commands_to_schedulers(self):
commands_to_process = self.unprocessed_external_commands
self.unprocessed_external_commands = []
logger.debug("Commands: %s", commands_to_process)
statsmgr.gauge('external-commands.pushed', len(self.unprocessed_external_commands))

# Now get all external commands and put them into the
# good schedulers
Expand Down Expand Up @@ -363,10 +364,10 @@ def push_external_commands_to_schedulers(self):
logger.error("A satellite raised an unknown exception: %s (%s)", exp, type(exp))
raise

# Wether we sent the commands or not, clean the scheduler list
# Whether we sent the commands or not, clean the scheduler list
self.schedulers[sched_id]['external_commands'] = []

# If we didn't send them, add the commands to the arbiter list
# If we didn't send them, add the commands to the arbiter list
if not sent:
for extcmd in extcmds:
self.external_commands.append(extcmd)
Expand All @@ -389,9 +390,13 @@ def do_loop_turn(self):

# Maybe external modules raised 'objects'
# we should get them
_t0 = time.time()
self.get_objects_from_from_queues()
statsmgr.timer('core.get-objects-from-queues', time.time() - _t0)

_t0 = time.time()
self.push_external_commands_to_schedulers()
statsmgr.timer('core.push-external-commands', time.time() - _t0)

# Maybe we do not have something to do, so we wait a little
if len(self.broks) == 0:
Expand Down
12 changes: 6 additions & 6 deletions alignak/satellite.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def pynag_con_init(self, _id):
"""
_t0 = time.time()
res = self.do_pynag_con_init(_id)
statsmgr.incr('con-init.scheduler', time.time() - _t0)
statsmgr.timer('con-init.scheduler', time.time() - _t0)
return res

def do_pynag_con_init(self, s_id):
Expand Down Expand Up @@ -336,7 +336,7 @@ def manage_returns(self):
"""
_t0 = time.time()
self.do_manage_returns()
statsmgr.incr('core.manage-returns', time.time() - _t0)
statsmgr.timer('core.manage-returns', time.time() - _t0)

def do_manage_returns(self):
"""Manage the checks and then
Expand Down Expand Up @@ -653,7 +653,7 @@ def get_new_actions(self):
"""
_t0 = time.time()
self.do_get_new_actions()
statsmgr.incr('core.get-new-actions', time.time() - _t0)
statsmgr.timer('core.get-new-actions', time.time() - _t0)

def do_get_new_actions(self):
"""Get new actions from schedulers
Expand Down Expand Up @@ -806,7 +806,7 @@ def do_loop_turn(self):
sched_id, sched['name'], mod,
index, queue.qsize(), self.get_returns_queue_len())
# also update the stats module
statsmgr.incr('core.worker-%s.queue-size' % mod, queue.qsize())
statsmgr.gauge('core.worker-%s.queue-size' % mod, queue.qsize())

# Before return or get new actions, see how we manage
# old ones: are they still in queue (s)? If True, we
Expand All @@ -827,14 +827,14 @@ def do_loop_turn(self):
self.wait_ratio.update_load(self.polling_interval)
wait_ratio = self.wait_ratio.get_load()
logger.debug("Wait ratio: %f", wait_ratio)
statsmgr.incr('core.wait-ratio', wait_ratio)
statsmgr.timer('core.wait-ratio', wait_ratio)

# We can wait more than 1s if needed,
# no more than 5s, but no less than 1
timeout = self.timeout * wait_ratio
timeout = max(self.polling_interval, timeout)
self.timeout = min(5 * self.polling_interval, timeout)
statsmgr.incr('core.timeout', wait_ratio)
statsmgr.timer('core.wait-arbiter', self.timeout)

# Maybe we do not have enough workers, we check for it
# and launch the new ones if needed
Expand Down
38 changes: 35 additions & 3 deletions alignak/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,18 @@ def load_conf(self, conf):
self.triggers.load_objects(self)
self.escalations = conf.escalations

# Internal statistics
statsmgr.gauge('configuration.hosts', len(self.hosts))
statsmgr.gauge('configuration.services', len(self.services))
statsmgr.gauge('configuration.hostgroups', len(self.hostgroups))
statsmgr.gauge('configuration.servicegroups', len(self.servicegroups))
statsmgr.gauge('configuration.contacts', len(self.contacts))
statsmgr.gauge('configuration.contactgroups', len(self.contactgroups))
statsmgr.gauge('configuration.timeperiods', len(self.timeperiods))
statsmgr.gauge('configuration.commands', len(self.commands))
statsmgr.gauge('configuration.notificationways', len(self.notificationways))
statsmgr.gauge('configuration.escalations', len(self.escalations))

# self.status_file = StatusFile(self)
# External status file
# From Arbiter. Use for Broker to differentiate schedulers
Expand Down Expand Up @@ -386,9 +398,11 @@ def run_external_commands(self, cmds):
:type cmds: list
:return: None
"""
_t0 = time.time()
logger.debug("Scheduler '%s' got %d commands", self.instance_name, len(cmds))
for command in cmds:
self.run_external_command(command)
statsmgr.timer('core.run_external_commands', time.time() - _t0)

def run_external_command(self, command):
"""Run a single external command
Expand Down Expand Up @@ -541,6 +555,7 @@ def hook_point(self, hook_name):
:return:None
TODO: find a way to merge this and the version in daemon.py
"""
_t0 = time.time()
for inst in self.sched_daemon.modules_manager.instances:
full_hook_name = 'hook_' + hook_name
logger.debug("hook_point: %s: %s %s",
Expand All @@ -559,6 +574,7 @@ def hook_point(self, hook_name):
logger.error("Exception trace follows: %s", output.getvalue())
output.close()
self.sched_daemon.modules_manager.set_to_restart(inst)
statsmgr.timer('core.hook.%s' % hook_name, time.time() - _t0)

def clean_queues(self):
"""Reduces internal list size to max allowed
Expand Down Expand Up @@ -1435,6 +1451,7 @@ def restore_retention_data(self, data):
host = self.hosts.find_by_name(ret_h_name)
if host is not None:
self.restore_retention_data_item(h_dict, host)
statsmgr.gauge('retention.hosts', len(ret_hosts))

# Same for services
ret_services = data['services']
Expand All @@ -1445,6 +1462,7 @@ def restore_retention_data(self, data):

if serv is not None:
self.restore_retention_data_item(s_dict, serv)
statsmgr.gauge('retention.services', len(ret_services))

def restore_retention_data_item(self, data, item):
"""
Expand Down Expand Up @@ -2144,7 +2162,9 @@ def run(self):

# Ok, now all is initialized, we can make the initial broks
logger.info("[%s] First scheduling launched", self.instance_name)
_t1 = time.time()
self.schedule()
statsmgr.timer('first_scheduling', time.time() - _t1)
logger.info("[%s] First scheduling done", self.instance_name)

# Now connect to the passive satellites if needed
Expand Down Expand Up @@ -2183,6 +2203,9 @@ def run(self):
load = min(100, 100.0 - self.load_one_min.get_load() * 100)
logger.debug("Load: (sleep) %.2f (average: %.2f) -> %d%%",
self.sched_daemon.sleep_time, self.load_one_min.get_load(), load)
statsmgr.gauge('load.sleep', self.sched_daemon.sleep_time)
statsmgr.gauge('load.average', self.load_one_min.get_load())
statsmgr.gauge('load.load', load)

self.sched_daemon.sleep_time = 0.0

Expand All @@ -2200,12 +2223,16 @@ def run(self):
# Call it and save the time spend in it
_t0 = time.time()
fun()
statsmgr.incr('loop.%s' % name, time.time() - _t0)
statsmgr.incr('complete_loop', time.time() - _t1)
statsmgr.timer('loop.%s' % name, time.time() - _t0)
statsmgr.timer('loop.whole', time.time() - _t1)

# DBG: push actions to passives?
_t1 = time.time()
self.push_actions_to_passives_satellites()
statsmgr.timer('push_actions_to_passives_satellites', time.time() - _t1)
_t1 = time.time()
self.get_actions_from_passives_satellites()
statsmgr.timer('get_actions_from_passives_satellites', time.time() - _t1)

# stats
nb_scheduled = nb_inpoller = nb_zombies = 0
Expand All @@ -2221,6 +2248,11 @@ def run(self):
logger.debug("Checks: total %s, scheduled %s,"
"inpoller %s, zombies %s, notifications %s",
len(self.checks), nb_scheduled, nb_inpoller, nb_zombies, nb_notifications)
statsmgr.gauge('checks.total', len(self.checks))
statsmgr.gauge('checks.scheduled', nb_scheduled)
statsmgr.gauge('checks.inpoller', nb_inpoller)
statsmgr.gauge('checks.zombie', nb_zombies)
statsmgr.gauge('actions.notifications', nb_notifications)

now = time.time()

Expand All @@ -2246,6 +2278,6 @@ def run(self):

self.hook_point('scheduler_tick')

# WE must save the retention at the quit BY OURSELVES
# We must save the retention at the quit BY OURSELVES
# because our daemon will not be able to do it for us
self.update_retention_file(True)
Loading

0 comments on commit 09c8c91

Please sign in to comment.