Skip to content

Commit

Permalink
Merge pull request #693 from Alignak-monitoring/improve-stats
Browse files Browse the repository at this point in the history
Improve stats
  • Loading branch information
ddurieux authored Jan 19, 2017
2 parents 7b59276 + b585571 commit 09c8c91
Show file tree
Hide file tree
Showing 8 changed files with 598 additions and 56 deletions.
2 changes: 1 addition & 1 deletion alignak/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -1169,7 +1169,7 @@ def hook_point(self, hook_name):
'and set it to restart later', inst.get_name(), str(exp))
logger.exception('Exception %s', exp)
self.modules_manager.set_to_restart(inst)
statsmgr.incr('core.hook.%s' % hook_name, time.time() - _t0)
statsmgr.timer('core.hook.%s' % hook_name, time.time() - _t0)

def get_retention_data(self): # pylint: disable=R0201
"""Basic function to get retention data,
Expand Down
12 changes: 6 additions & 6 deletions alignak/daemons/arbiterdaemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ def load_modules_configuration_objects(self, raw_objects):
logger.error("Back trace of this remove: %s", output.getvalue())
output.close()
continue
statsmgr.incr('hook.get-objects', time.time() - _t0)
statsmgr.timer('core.hook.get_objects', time.time() - _t0)
types_creations = self.conf.types_creations
for type_c in types_creations:
(_, _, prop, dummy) = types_creations[type_c]
Expand Down Expand Up @@ -763,21 +763,21 @@ def run(self):
# Now the dispatcher job
_t0 = time.time()
self.dispatcher.check_alive()
statsmgr.incr('core.check-alive', time.time() - _t0)
statsmgr.timer('core.check-alive', time.time() - _t0)

_t0 = time.time()
self.dispatcher.check_dispatch()
statsmgr.incr('core.check-dispatch', time.time() - _t0)
statsmgr.timer('core.check-dispatch', time.time() - _t0)

# REF: doc/alignak-conf-dispatching.png (3)
_t0 = time.time()
self.dispatcher.prepare_dispatch()
self.dispatcher.dispatch()
statsmgr.incr('core.dispatch', time.time() - _t0)
statsmgr.timer('core.dispatch', time.time() - _t0)

_t0 = time.time()
self.dispatcher.check_bad_dispatch()
statsmgr.incr('core.check-bad-dispatch', time.time() - _t0)
statsmgr.timer('core.check-bad-dispatch', time.time() - _t0)

# Now get things from our module instances
self.get_objects_from_from_queues()
Expand All @@ -798,7 +798,7 @@ def run(self):

_t0 = time.time()
self.push_external_commands_to_schedulers()
statsmgr.incr('core.push-external-commands', time.time() - _t0)
statsmgr.timer('core.push-external-commands', time.time() - _t0)

# It's sent, do not keep them
# TODO: check if really sent. Queue by scheduler?
Expand Down
19 changes: 14 additions & 5 deletions alignak/daemons/brokerdaemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def pynag_con_init(self, _id, i_type='scheduler'):
"""
_t0 = time.time()
res = self.do_pynag_con_init(_id, i_type)
statsmgr.incr('con-init.%s' % i_type, time.time() - _t0)
statsmgr.timer('con-init.%s' % i_type, time.time() - _t0)
return res

def do_pynag_con_init(self, s_id, i_type='scheduler'):
Expand Down Expand Up @@ -325,7 +325,9 @@ def manage_brok(self, brok):
# Call all modules if they catch the call
for mod in self.modules_manager.get_internal_instances():
try:
_t0 = time.time()
mod.manage_brok(brok)
statsmgr.timer('core.manage-broks.%s' % mod.get_name(), time.time() - _t0)
except Exception as exp: # pylint: disable=broad-except
logger.warning("The mod %s raise an exception: %s, I'm tagging it to restart later",
mod.get_name(), str(exp))
Expand Down Expand Up @@ -779,19 +781,24 @@ def do_loop_turn(self):
if self.new_conf:
self.setup_new_conf()

# Maybe the last loop we raised some broks internally
# Maybe the last loop we raised some broks internally
_t0 = time.time()
# we should integrate them in broks
self.interger_internal_broks()
statsmgr.timer('get-new-broks.broker', time.time() - _t0)

_t0 = time.time()
# Also reap broks sent from the arbiters
self.interger_arbiter_broks()
statsmgr.timer('get-new-broks.arbiter', time.time() - _t0)

# Main job, go get broks in our distant daemons
types = ['scheduler', 'poller', 'reactionner', 'receiver']
for _type in types:
_t0 = time.time()
# And from schedulers
self.get_new_broks(i_type=_type)
statsmgr.incr('get-new-broks.%s' % _type, time.time() - _t0)
statsmgr.timer('get-new-broks.%s' % _type, time.time() - _t0)

# Sort the brok list by id
self.broks.sort(sort_by_ids)
Expand All @@ -809,7 +816,9 @@ def do_loop_turn(self):
# instead of killing ourselves :)
for mod in ext_modules:
try:
t000 = time.time()
mod.to_q.put(to_send)
statsmgr.timer('core.put-to-external-queue.%s' % mod.get_name(), time.time() - t000)
except Exception as exp: # pylint: disable=broad-except
# first we must find the modules
logger.warning("The mod %s queue raise an exception: %s, "
Expand All @@ -821,7 +830,7 @@ def do_loop_turn(self):
# No more need to send them
for brok in to_send:
brok.need_send_to_ext = False
statsmgr.incr('core.put-to-external-queue', time.time() - t00)
statsmgr.timer('core.put-to-external-queue', time.time() - t00)
logger.debug("Time to send %s broks (%d secs)", len(to_send), time.time() - t00)

# We must add new broks at the end of the list, so we reverse the list
Expand All @@ -842,7 +851,7 @@ def do_loop_turn(self):
brok.prepare()
_t0 = time.time()
self.manage_brok(brok)
statsmgr.incr('core.manage-brok', time.time() - _t0)
statsmgr.timer('core.manage-broks', time.time() - _t0)

nb_broks = len(self.broks)

Expand Down
9 changes: 7 additions & 2 deletions alignak/daemons/receiverdaemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ def push_external_commands_to_schedulers(self):
commands_to_process = self.unprocessed_external_commands
self.unprocessed_external_commands = []
logger.debug("Commands: %s", commands_to_process)
statsmgr.gauge('external-commands.pushed', len(self.unprocessed_external_commands))

# Now get all external commands and put them into the
# good schedulers
Expand Down Expand Up @@ -363,10 +364,10 @@ def push_external_commands_to_schedulers(self):
logger.error("A satellite raised an unknown exception: %s (%s)", exp, type(exp))
raise

# Wether we sent the commands or not, clean the scheduler list
# Whether we sent the commands or not, clean the scheduler list
self.schedulers[sched_id]['external_commands'] = []

# If we didn't send them, add the commands to the arbiter list
# If we didn't send them, add the commands to the arbiter list
if not sent:
for extcmd in extcmds:
self.external_commands.append(extcmd)
Expand All @@ -389,9 +390,13 @@ def do_loop_turn(self):

# Maybe external modules raised 'objects'
# we should get them
_t0 = time.time()
self.get_objects_from_from_queues()
statsmgr.timer('core.get-objects-from-queues', time.time() - _t0)

_t0 = time.time()
self.push_external_commands_to_schedulers()
statsmgr.timer('core.push-external-commands', time.time() - _t0)

# Maybe we do not have something to do, so we wait a little
if len(self.broks) == 0:
Expand Down
12 changes: 6 additions & 6 deletions alignak/satellite.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def pynag_con_init(self, _id):
"""
_t0 = time.time()
res = self.do_pynag_con_init(_id)
statsmgr.incr('con-init.scheduler', time.time() - _t0)
statsmgr.timer('con-init.scheduler', time.time() - _t0)
return res

def do_pynag_con_init(self, s_id):
Expand Down Expand Up @@ -336,7 +336,7 @@ def manage_returns(self):
"""
_t0 = time.time()
self.do_manage_returns()
statsmgr.incr('core.manage-returns', time.time() - _t0)
statsmgr.timer('core.manage-returns', time.time() - _t0)

def do_manage_returns(self):
"""Manage the checks and then
Expand Down Expand Up @@ -653,7 +653,7 @@ def get_new_actions(self):
"""
_t0 = time.time()
self.do_get_new_actions()
statsmgr.incr('core.get-new-actions', time.time() - _t0)
statsmgr.timer('core.get-new-actions', time.time() - _t0)

def do_get_new_actions(self):
"""Get new actions from schedulers
Expand Down Expand Up @@ -806,7 +806,7 @@ def do_loop_turn(self):
sched_id, sched['name'], mod,
index, queue.qsize(), self.get_returns_queue_len())
# also update the stats module
statsmgr.incr('core.worker-%s.queue-size' % mod, queue.qsize())
statsmgr.gauge('core.worker-%s.queue-size' % mod, queue.qsize())

# Before return or get new actions, see how we manage
# old ones: are they still in queue (s)? If True, we
Expand All @@ -827,14 +827,14 @@ def do_loop_turn(self):
self.wait_ratio.update_load(self.polling_interval)
wait_ratio = self.wait_ratio.get_load()
logger.debug("Wait ratio: %f", wait_ratio)
statsmgr.incr('core.wait-ratio', wait_ratio)
statsmgr.timer('core.wait-ratio', wait_ratio)

# We can wait more than 1s if needed,
# no more than 5s, but no less than 1
timeout = self.timeout * wait_ratio
timeout = max(self.polling_interval, timeout)
self.timeout = min(5 * self.polling_interval, timeout)
statsmgr.incr('core.timeout', wait_ratio)
statsmgr.timer('core.wait-arbiter', self.timeout)

# Maybe we do not have enough workers, we check for it
# and launch the new ones if needed
Expand Down
38 changes: 35 additions & 3 deletions alignak/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,18 @@ def load_conf(self, conf):
self.triggers.load_objects(self)
self.escalations = conf.escalations

# Internal statistics
statsmgr.gauge('configuration.hosts', len(self.hosts))
statsmgr.gauge('configuration.services', len(self.services))
statsmgr.gauge('configuration.hostgroups', len(self.hostgroups))
statsmgr.gauge('configuration.servicegroups', len(self.servicegroups))
statsmgr.gauge('configuration.contacts', len(self.contacts))
statsmgr.gauge('configuration.contactgroups', len(self.contactgroups))
statsmgr.gauge('configuration.timeperiods', len(self.timeperiods))
statsmgr.gauge('configuration.commands', len(self.commands))
statsmgr.gauge('configuration.notificationways', len(self.notificationways))
statsmgr.gauge('configuration.escalations', len(self.escalations))

# self.status_file = StatusFile(self)
# External status file
# From Arbiter. Use for Broker to differentiate schedulers
Expand Down Expand Up @@ -386,9 +398,11 @@ def run_external_commands(self, cmds):
:type cmds: list
:return: None
"""
_t0 = time.time()
logger.debug("Scheduler '%s' got %d commands", self.instance_name, len(cmds))
for command in cmds:
self.run_external_command(command)
statsmgr.timer('core.run_external_commands', time.time() - _t0)

def run_external_command(self, command):
"""Run a single external command
Expand Down Expand Up @@ -541,6 +555,7 @@ def hook_point(self, hook_name):
:return:None
TODO: find a way to merge this and the version in daemon.py
"""
_t0 = time.time()
for inst in self.sched_daemon.modules_manager.instances:
full_hook_name = 'hook_' + hook_name
logger.debug("hook_point: %s: %s %s",
Expand All @@ -559,6 +574,7 @@ def hook_point(self, hook_name):
logger.error("Exception trace follows: %s", output.getvalue())
output.close()
self.sched_daemon.modules_manager.set_to_restart(inst)
statsmgr.timer('core.hook.%s' % hook_name, time.time() - _t0)

def clean_queues(self):
"""Reduces internal list size to max allowed
Expand Down Expand Up @@ -1435,6 +1451,7 @@ def restore_retention_data(self, data):
host = self.hosts.find_by_name(ret_h_name)
if host is not None:
self.restore_retention_data_item(h_dict, host)
statsmgr.gauge('retention.hosts', len(ret_hosts))

# Same for services
ret_services = data['services']
Expand All @@ -1445,6 +1462,7 @@ def restore_retention_data(self, data):

if serv is not None:
self.restore_retention_data_item(s_dict, serv)
statsmgr.gauge('retention.services', len(ret_services))

def restore_retention_data_item(self, data, item):
"""
Expand Down Expand Up @@ -2144,7 +2162,9 @@ def run(self):

# Ok, now all is initialized, we can make the initial broks
logger.info("[%s] First scheduling launched", self.instance_name)
_t1 = time.time()
self.schedule()
statsmgr.timer('first_scheduling', time.time() - _t1)
logger.info("[%s] First scheduling done", self.instance_name)

# Now connect to the passive satellites if needed
Expand Down Expand Up @@ -2183,6 +2203,9 @@ def run(self):
load = min(100, 100.0 - self.load_one_min.get_load() * 100)
logger.debug("Load: (sleep) %.2f (average: %.2f) -> %d%%",
self.sched_daemon.sleep_time, self.load_one_min.get_load(), load)
statsmgr.gauge('load.sleep', self.sched_daemon.sleep_time)
statsmgr.gauge('load.average', self.load_one_min.get_load())
statsmgr.gauge('load.load', load)

self.sched_daemon.sleep_time = 0.0

Expand All @@ -2200,12 +2223,16 @@ def run(self):
# Call it and save the time spend in it
_t0 = time.time()
fun()
statsmgr.incr('loop.%s' % name, time.time() - _t0)
statsmgr.incr('complete_loop', time.time() - _t1)
statsmgr.timer('loop.%s' % name, time.time() - _t0)
statsmgr.timer('loop.whole', time.time() - _t1)

# DBG: push actions to passives?
_t1 = time.time()
self.push_actions_to_passives_satellites()
statsmgr.timer('push_actions_to_passives_satellites', time.time() - _t1)
_t1 = time.time()
self.get_actions_from_passives_satellites()
statsmgr.timer('get_actions_from_passives_satellites', time.time() - _t1)

# stats
nb_scheduled = nb_inpoller = nb_zombies = 0
Expand All @@ -2221,6 +2248,11 @@ def run(self):
logger.debug("Checks: total %s, scheduled %s,"
"inpoller %s, zombies %s, notifications %s",
len(self.checks), nb_scheduled, nb_inpoller, nb_zombies, nb_notifications)
statsmgr.gauge('checks.total', len(self.checks))
statsmgr.gauge('checks.scheduled', nb_scheduled)
statsmgr.gauge('checks.inpoller', nb_inpoller)
statsmgr.gauge('checks.zombie', nb_zombies)
statsmgr.gauge('actions.notifications', nb_notifications)

now = time.time()

Expand All @@ -2246,6 +2278,6 @@ def run(self):

self.hook_point('scheduler_tick')

# WE must save the retention at the quit BY OURSELVES
# We must save the retention at the quit BY OURSELVES
# because our daemon will not be able to do it for us
self.update_retention_file(True)
Loading

0 comments on commit 09c8c91

Please sign in to comment.