Merge pull request #18 from paulgear/feature/prometheus-exporter

Enable prometheus exporter
paulgear · Sep 25, 2023 · b12ea6e · b12ea6e
2 parents 20bb8dc + 6ad3b87
commit b12ea6e
Show file tree

Hide file tree

Showing 4 changed files with 140 additions and 32 deletions.
diff --git a/README.md b/README.md
@@ -2,6 +2,8 @@
 
 Copyright (c) 2015-2023 Paul D. Gear <https://libertysys.com.au/>
 
+Juju layer copyright (c) 2017-2018 Canonical Ltd <https://charmhub.io/ntp>
+
 ## License
 
 GPLv3 - see COPYING.txt for details
@@ -12,8 +14,8 @@ GPLv3 - see COPYING.txt for details
 NTPmon is a program which is designed to report on essential health metrics for
 NTP.  It provides a Nagios check which can be used with many alerting systems,
 including support for Nagios performance data.  NTPmon can also run as a daemon
-for sending metrics to collectd or telegraf.  It supports both `ntpd` and
-`chronyd`.
+for sending metrics to collectd, prometheus, or telegraf.  It supports both
+`ntpd` and `chronyd`.
 
 
 ## Prerequisites
@@ -22,12 +24,15 @@ NTPmon is written in python, and requires python 3.3 or later.  It uses modules
 from the standard python library, and also requires the `psutil` library, which
 is available from pypi or your operating system repositories. NTPmon also
 requires `ntpq` and `ntptrace` from the NTP distribution (or `chronyc` if you're
-using chrony instead).
+using chrony).
 
 On Ubuntu (and probably other Debian-based Linux distributions), you can install
 all the prerequisites by running:
 
-    sudo apt-get install ntp python3-psutil
+    sudo apt-get install ntp python3-prometheus-client python3-psutil
+
+(The python3-prometheus-client package is only needed if you intend to run in
+prometheus exporter mode - see below.)
 
 ## Usage
 
@@ -38,7 +43,6 @@ To start running NTPmon, run:
     cd ntpmon
     ./src/ntpmon.py --help
 
-
 ## Metrics
 
 NTPmon alerts on the following metrics of the local NTP server:
@@ -62,19 +66,18 @@ than 75% total reachability of all configured peers.
 
 #### offset
 
-Is the clock offset from its sync peer (or other peers, if the sync peer
-is not available) acceptable?  Return CRITICAL for 50 milliseconds or more
-average difference, WARNING for 10 ms or more average difference, and OK
-for anything less.
+Is the clock offset from its sync peer (or other peers, if the sync peer is not
+available) acceptable?  Return CRITICAL for 50 milliseconds or more average
+difference, WARNING for 10 ms or more average difference, and OK for anything
+less.
 
 #### traceloop
 
-Is there a sync loop between the local server and the stratum 1 servers?
-If so, return CRITICAL.  Most public NTP servers do not support tracing,
-so for anything other than a loop (including a timeout), return OK.
-Traceloop is disabled by default and may be deprecated in a future
-release, since it produces additional NTP traffic which is not useful in
-most cases.
+Is there a sync loop between the local server and the stratum 1 servers? If so,
+return CRITICAL; for anything other than a loop (including a timeout), return
+OK.  Most public NTP servers do not support tracing, so using this produces
+additional NTP traffic which is not useful in most cases. Trace loop detection
+is deprecated, disabled by default, and is not supported for prometheus.
 
 ### System metrics
 
@@ -91,6 +94,15 @@ server (using `ntpq -nc readvar`):
 See the [NTP documentation](http://doc.ntp.org/current-stable/ntpq.html#system)
 for the meaning of these metrics.
 
+### Prometheus exporter
+
+When run in prometheus mode, NTPmon uses the [prometheus python
+client](https://pypi.python.org/pypi/prometheus_client) to expose metrics via
+the HTTP server built into that library.  No security testing or validation has
+been performed on this library by the NTPmon author; users are advised not to
+expose it on untrusted networks, and are reminded that - as stated in the GNU
+General Public License terms - this software comes with no warranty.
+
 ## Changes from previous version
 
 NTPmon has been rewritten from version 1.0.0 of check_ntpmon.  Changes from
@@ -112,15 +124,13 @@ the original check_ntpmon are:
 - Removed support for changing thresholds; if the one person on the Internet
   who actually uses this really wants it, I might add it back. :-)
 
-
 ## Startup delay
 
 By default, until ntpd has been running for 512 seconds (the minimum time for
 8 polls at 64-second intervals), check_ntpmon will return OK (zero return code).
 This is to prevent false positives on startup or for short-lived VMs.  To
 ignore this safety precaution, use --run-time with a low number (e.g. 1 sec).
 
-
 ## To do
 
 - Better/more documentation.

diff --git a/src/alert.py b/src/alert.py
@@ -1,6 +1,6 @@
 
 #
-# Copyright:    (c) 2016 Paul D. Gear
+# Copyright:    (c) 2016, 2019 Paul D. Gear
 # License:      GPLv3 <http://www.gnu.org/licenses/gpl.html>
 #
 # This program is free software: you can redistribute it and/or modify it under
@@ -136,6 +136,23 @@
 
 }
 
+"""
+Metric types and suffixes for prometheus
+"""
+_prometheus_types = {  # metric name -> (type code, suffix appended to 'ntpmon_<name>' or None, help text)
+
+    'frequency': (None, '_hertz', 'Frequency error of the local clock'),
+    'offset': (None, '_seconds', 'Mean clock offset of peers'),
+    'reach': ('%', '_ratio', 'Peer reachability over the last 8 polls'),  # '%': alert_prometheus divides the value by 100
+    'rootdelay': (None, '_seconds', 'Network delay to stratum 0 sources'),
+    'rootdisp': (None, '_seconds', 'Maximum calculated offset from stratum 0 sources'),
+    'runtime': (None, '_duration_seconds', 'Duration NTP service has been running'),
+    'stratum': ('i', None, 'NTP stratum of this server'),  # 'i': debug output uses '%d' instead of '%.9f'
+    'sysjitter': (None, '_seconds', 'RMS average of most recent system peer offset differences'),
+    'sysoffset': (None, '_seconds', 'Current clock offset of selected system peer'),
+
+}
+
 """
 Metric types for telegraf
 """
@@ -158,12 +175,13 @@
 
 
 class NTPAlerter(object):
- 
+
     def __init__(self, checks):
         self.checks = checks
         self.mc = MetricClassifier(_metricdefs)
         self.metrics = {}
         self.objs = {}
+        self.prometheus_objs = {}
 
     def collectmetrics(self, checkobjs, debug):
         """
@@ -234,7 +252,7 @@ def custom_message_tracehosts(self, result):
             ', '.join(trace.hostlist)
         )
 
-    def alert(self, checkobjs, hostname, interval, format):
+    def alert(self, checkobjs, hostname, interval, format, debug=False):
         """
         Produce the metrics
         """
@@ -244,9 +262,11 @@ def alert(self, checkobjs, hostname, interval, format):
         self.metrics['result'] = self.return_code()
         if format == 'collectd':
             self.alert_collectd(hostname, interval)
+        elif format == 'prometheus':
+            self.alert_prometheus(debug=debug)
         elif format == 'telegraf':
             self.alert_telegraf()
-        self.alert_peers(hostname, interval, format)
+        self.alert_peers(hostname, interval, format, debug)
         self.finished_output()
 
     def alert_collectd(self, hostname, interval):
@@ -262,6 +282,47 @@ def alert_collectd(self, hostname, interval):
                     self.metrics[metric],
                 ))
 
+    def set_prometheus_metric(self, name, description, value, peertype=None):
+        import prometheus_client
+        if name in self.prometheus_objs:
+            g = self.prometheus_objs[name]
+            if peertype is not None:
+                g = g.labels(peertype=peertype)
+        else:
+            if peertype is not None:
+                g = prometheus_client.Gauge(name, description, ['peertype'])
+                self.prometheus_objs[name] = g
+                g = g.labels(peertype=peertype)
+            else:
+                g = prometheus_client.Gauge(name, description)
+                self.prometheus_objs[name] = g
+        g.set(value)
+
+    def alert_prometheus(self, debug=False):
+
+        def emit_metric(name, description, metrictype, value, format):
+            if debug:
+                valuestr = format % (value,)
+                print('# HELP %s %s' % (name, description))
+                print('# TYPE %s gauge' % (name,))
+                print('%s %s' % (name, valuestr))
+            else:
+                self.set_prometheus_metric(name, description, value)
+
+        for metric in sorted(_prometheus_types.keys()):
+            if metric in self.metrics:
+                (metrictype, suffix, description) = _prometheus_types[metric]
+                s = 'ntpmon_' + metric
+                if suffix is not None:
+                    s += suffix
+                val = self.metrics[metric]
+                fmt = '%.9f'
+                if metrictype == 'i':
+                    fmt = '%d'
+                elif metrictype == '%':
+                    val /= 100
+                emit_metric(s, description, metrictype, val, fmt)
+
     def alert_telegraf(self):
         print('ntpmon ', end='')
         telegraf_metrics = []
@@ -275,7 +336,9 @@ def alert_telegraf(self):
                 telegraf_metrics.append(s)
         print(','.join(telegraf_metrics))
 
-    def alert_peers(self, hostname, interval, format):
+    def alert_peers(self, hostname, interval, format, debug=False):
+        if debug and format == 'prometheus':
+            print('# TYPE ntpmon_peers gauge')
         for metric in _peer_types:
             value = self.metrics.get(metric)
             if format == 'collectd':
@@ -285,6 +348,11 @@ def alert_peers(self, hostname, interval, format):
                     interval,
                     value,
                 ))
+            elif format == 'prometheus':
+                if debug:
+                    print('ntpmon_peers{peertype="%s"} %d' % (metric, value))
+                else:
+                    self.set_prometheus_metric('ntpmon_peers', 'NTP peer count', value, metric)
             elif format == 'telegraf':
                 print('ntpmon_peers,peertype=%s count=%di' % (metric, value))
 

diff --git a/src/ntpmon-prometheus.systemd b/src/ntpmon-prometheus.systemd
@@ -0,0 +1,16 @@
+[Unit]
+Description=NTP statistics monitor
+Documentation=https://github.com/paulgear/ntpmon
+After=chrony.service ntp.service
+Wants=chrony.service ntp.service
+
+[Service]
+ExecStart={{ install_dir }}/ntpmon.py --interval 60 --mode prometheus --implementation {{ implementation }}
+KillMode=process
+Restart=on-failure
+RestartSec=42s
+User={{ user }}
+Group={{ group }}
+
+[Install]
+WantedBy=multi-user.target
diff --git a/src/ntpmon.py b/src/ntpmon.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 #
-# Copyright:    (c) 2016 Paul D. Gear
+# Copyright:    (c) 2016, 2019 Paul D. Gear
 # License:      GPLv3 <http://www.gnu.org/licenses/gpl.html>
 #
 # This program is free software: you can redistribute it and/or modify it under
@@ -32,7 +32,11 @@ def get_args():
     parser.add_argument(
         '--mode',
         type=str,
-        choices=['collectd', 'telegraf'],
+        choices=[
+            'collectd',
+            'prometheus',
+            'telegraf',
+        ],
         help='Collectd is the default if collectd environment variables are detected.',
     )
     parser.add_argument(
@@ -47,6 +51,12 @@ def get_args():
         help='How often to report statistics (default: the value of the COLLECTD_INTERVAL environment variable, '
              'or 60 seconds if COLLECTD_INTERVAL is not set).',
     )
+    parser.add_argument(
+        '--port',
+        type=int,
+        help='TCP port on which to listen when acting as a prometheus exporter (default: 9648)',
+        default=9648,
+    )
     args = parser.parse_args()
     return args
 
@@ -82,12 +92,17 @@ def main():
     if args.interval is None:
         args.interval = 60
 
-    if args.mode == 'telegraf' and not sys.stdout.isatty():
-        (host, port) = args.connect.split(':')
-        port = int(port)
-        s = socket.socket()
-        s.connect((host, port))
-        sys.stdout = s.makefile(mode='w')
+    debug = sys.stdout.isatty()
+    if not debug:
+        if args.mode == 'telegraf':
+            (host, port) = args.connect.split(':')
+            port = int(port)
+            s = socket.socket()
+            s.connect((host, port))
+            sys.stdout = s.makefile(mode='w')
+        elif args.mode == 'prometheus':
+            import prometheus_client
+            prometheus_client.start_http_server(args.port)
 
     alerter = alert.NTPAlerter(checks)
     implementation = None
@@ -100,11 +115,10 @@ def main():
             # run the checks
             checkobjs = process.ntpchecks(checks, debug=False, implementation=implementation)
             # alert on what we've collected
-            alerter.alert(checkobjs=checkobjs, hostname=hostname, interval=args.interval, format=args.mode)
+            alerter.alert(checkobjs=checkobjs, hostname=hostname, interval=args.interval, format=args.mode, debug=debug)
 
         sleep_until(args.interval)
 
 
 if __name__ == '__main__':
     main()
-