From 4fbf17fb373435ebeba6e49f3144dbea3a1b3b69 Mon Sep 17 00:00:00 2001 From: Sanskar Jaiswal Date: Fri, 13 Dec 2024 16:36:54 +0530 Subject: [PATCH] feat(prometheus): expose controlplane connectivity state as a gauge Add a new Prometheus gauge metric `control_plane_reachable`. Similar to `datastore_reachable` gauge, 0 means the connection is not healthy; 1 means that the connection is healthy. We mark the connection as unhealthy under the following circumstances: * Failure while establishing a websocket connection * Failure while sending basic information to controlplane * Failure while sending ping to controlplane * Failure while receiving a packet from the websocket connection This is helpful for users running a significant number of gateways to be alerted about potential issues any gateway(s) may be facing while talking to the controlplane. Signed-off-by: Sanskar Jaiswal --- kong/clustering/data_plane.lua | 25 ++++++++++++++++++++++--- kong/plugins/prometheus/exporter.lua | 13 +++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/kong/clustering/data_plane.lua b/kong/clustering/data_plane.lua index 63e566863981..5552a3a80df3 100644 --- a/kong/clustering/data_plane.lua +++ b/kong/clustering/data_plane.lua @@ -74,11 +74,20 @@ function _M.new(clustering) end +local function set_control_plane_reachable(reachable) + local ok, err = ngx.shared.kong:safe_set("control_plane_reachable", reachable) + if not ok then + ngx_log(ngx_ERR, _log_prefix, "failed to set controlplane_reachable key in shm to " .. reachable .. 
" :", err) + end +end + + function _M:init_worker(basic_info) -- ROLE = "data_plane" self.plugins_list = basic_info.plugins self.filters = basic_info.filters + set_control_plane_reachable(false) -- only run in process which worker_id() == 0 assert(ngx.timer.at(0, function(premature) @@ -98,13 +107,17 @@ local function send_ping(c, log_suffix) local _, err = c:send_ping(hash) if err then + set_control_plane_reachable(false) ngx_log(is_timeout(err) and ngx_NOTICE or ngx_WARN, _log_prefix, "unable to send ping frame to control plane: ", err, log_suffix) -- only log a ping if the hash changed - elseif hash ~= prev_hash then - prev_hash = hash - ngx_log(ngx_INFO, _log_prefix, "sent ping frame to control plane with hash: ", hash, log_suffix) + else + set_control_plane_reachable(true) + if hash ~= prev_hash then + prev_hash = hash + ngx_log(ngx_INFO, _log_prefix, "sent ping frame to control plane with hash: ", hash, log_suffix) + end end end @@ -156,6 +169,7 @@ function _M:communicate(premature) local c, uri, err = clustering_utils.connect_cp(self, "/v1/outlet") if not c then + set_control_plane_reachable(false) ngx_log(ngx_WARN, _log_prefix, "connection to control plane ", uri, " broken: ", err, " (retrying after ", reconnection_delay, " seconds)", log_suffix) @@ -188,6 +202,7 @@ function _M:communicate(premature) filters = self.filters, labels = labels, })) if err then + set_control_plane_reachable(false) ngx_log(ngx_ERR, _log_prefix, "unable to send basic information to control plane: ", uri, " err: ", err, " (retrying after ", reconnection_delay, " seconds)", log_suffix) @@ -197,6 +212,7 @@ function _M:communicate(premature) end)) return end + set_control_plane_reachable(true) local config_semaphore = semaphore.new(0) @@ -302,16 +318,19 @@ function _M:communicate(premature) local data, typ, err = c:recv_frame() if err then if not is_timeout(err) then + set_control_plane_reachable(false) return nil, "error while receiving frame from control plane: " .. 
err end local waited = ngx_time() - last_seen if waited > PING_WAIT then + set_control_plane_reachable(false) return nil, "did not receive pong frame from control plane within " .. PING_WAIT .. " seconds" end goto continue end + set_control_plane_reachable(true) if typ == "close" then ngx_log(ngx_DEBUG, _log_prefix, "received close frame from control plane", log_suffix) diff --git a/kong/plugins/prometheus/exporter.lua b/kong/plugins/prometheus/exporter.lua index 4c37287da5ab..30d6f02c3628 100644 --- a/kong/plugins/prometheus/exporter.lua +++ b/kong/plugins/prometheus/exporter.lua @@ -10,6 +10,7 @@ local lower = string.lower local ngx_timer_pending_count = ngx.timer.pending_count local ngx_timer_running_count = ngx.timer.running_count local get_all_upstreams = balancer.get_all_upstreams + if not balancer.get_all_upstreams then -- API changed since after Kong 2.5 get_all_upstreams = require("kong.runloop.balancer.upstreams").get_all_upstreams end @@ -65,6 +66,11 @@ local function init() "0 is unreachable", nil, prometheus.LOCAL_STORAGE) + metrics.cp_reachable = prometheus:gauge("control_plane_reachable", + "Control plane reachable from gateway, " .. + "0 is unreachable", + nil, + prometheus.LOCAL_STORAGE) metrics.node_info = prometheus:gauge("node_info", "Kong Node metadata information", {"node_id", "version"}, @@ -449,6 +455,13 @@ local function metric_data(write_fn) kong.log.err("prometheus: failed to reach database while processing", "/metrics endpoint: ", err) end + + local cp_reachable = ngx.shared.kong:get("control_plane_reachable") + if cp_reachable then + metrics.cp_reachable:set(1) + else + metrics.cp_reachable:set(0) + end end local phase = get_phase()