Skip to content

Commit

Permalink
feat(prometheus): expose controlplane connectivity state as a gauge
Browse files Browse the repository at this point in the history
Add a new Prometheus gauge metric `control_plane_reachable`. Similar to
the `datastore_reachable` gauge, 0 means the connection is not healthy; 1
means that the connection is healthy. We mark the connection as
unhealthy under the following circumstances:
* Failure while establishing a websocket connection
* Failure while sending basic information to controlplane
* Failure while sending ping to controlplane
* Failure while receiving a packet from the websocket connection

This is helpful for users running a significant number of gateways to be
alerted about potential issues any gateway(s) may be facing while
talking to the controlplane.

Signed-off-by: Sanskar Jaiswal <[email protected]>
  • Loading branch information
aryan9600 committed Dec 13, 2024
1 parent 358fff3 commit 0f59660
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 3 deletions.
22 changes: 19 additions & 3 deletions kong/clustering/data_plane.lua
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ local PING_WAIT = PING_INTERVAL * 1.5
local _log_prefix = "[clustering] "
local DECLARATIVE_EMPTY_CONFIG_HASH = constants.DECLARATIVE_EMPTY_CONFIG_HASH
local prev_hash
local cp_reachable = true

local endswith = require("pl.stringx").endswith

Expand Down Expand Up @@ -74,6 +75,11 @@ function _M.new(clustering)
end


--- Report the last recorded control plane connectivity state.
-- Exposes the file-local `cp_reachable` flag so that other modules can
-- read it without touching this module's internals.
-- @return the current value of `cp_reachable` (true or false in the
-- assignments visible in this file)
function _M.is_cp_reachable()
  local reachable = cp_reachable
  return reachable
end


function _M:init_worker(basic_info)
-- ROLE = "data_plane"

Expand All @@ -98,13 +104,17 @@ local function send_ping(c, log_suffix)

local _, err = c:send_ping(hash)
if err then
cp_reachable = false
ngx_log(is_timeout(err) and ngx_NOTICE or ngx_WARN, _log_prefix,
"unable to send ping frame to control plane: ", err, log_suffix)

-- only log a ping if the hash changed
elseif hash ~= prev_hash then
prev_hash = hash
ngx_log(ngx_INFO, _log_prefix, "sent ping frame to control plane with hash: ", hash, log_suffix)
else
cp_reachable = true
if hash ~= prev_hash then
prev_hash = hash
ngx_log(ngx_INFO, _log_prefix, "sent ping frame to control plane with hash: ", hash, log_suffix)
end
end
end

Expand Down Expand Up @@ -156,6 +166,7 @@ function _M:communicate(premature)

local c, uri, err = clustering_utils.connect_cp(self, "/v1/outlet")
if not c then
cp_reachable = false
ngx_log(ngx_WARN, _log_prefix, "connection to control plane ", uri, " broken: ", err,
" (retrying after ", reconnection_delay, " seconds)", log_suffix)

Expand Down Expand Up @@ -188,6 +199,7 @@ function _M:communicate(premature)
filters = self.filters,
labels = labels, }))
if err then
cp_reachable = false
ngx_log(ngx_ERR, _log_prefix, "unable to send basic information to control plane: ", uri,
" err: ", err, " (retrying after ", reconnection_delay, " seconds)", log_suffix)

Expand All @@ -197,6 +209,7 @@ function _M:communicate(premature)
end))
return
end
cp_reachable = true

local config_semaphore = semaphore.new(0)

Expand Down Expand Up @@ -302,16 +315,19 @@ function _M:communicate(premature)
local data, typ, err = c:recv_frame()
if err then
if not is_timeout(err) then
cp_reachable = false
return nil, "error while receiving frame from control plane: " .. err
end

local waited = ngx_time() - last_seen
if waited > PING_WAIT then
cp_reachable = false
return nil, "did not receive pong frame from control plane within " .. PING_WAIT .. " seconds"
end

goto continue
end
cp_reachable = true

if typ == "close" then
ngx_log(ngx_DEBUG, _log_prefix, "received close frame from control plane", log_suffix)
Expand Down
14 changes: 14 additions & 0 deletions kong/plugins/prometheus/exporter.lua
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ local lower = string.lower
local ngx_timer_pending_count = ngx.timer.pending_count
local ngx_timer_running_count = ngx.timer.running_count
local get_all_upstreams = balancer.get_all_upstreams
local data_plane = require("kong.clustering.data_plane")

if not balancer.get_all_upstreams then -- API changed since after Kong 2.5
get_all_upstreams = require("kong.runloop.balancer.upstreams").get_all_upstreams
end
Expand Down Expand Up @@ -65,6 +67,11 @@ local function init()
"0 is unreachable",
nil,
prometheus.LOCAL_STORAGE)
metrics.cp_reachable = prometheus:gauge("control_plane_reachable",
"Control plane reachable from gateway, " ..
"0 is unreachable",
nil,
prometheus.LOCAL_STORAGE)
metrics.node_info = prometheus:gauge("node_info",
"Kong Node metadata information",
{"node_id", "version"},
Expand Down Expand Up @@ -449,6 +456,13 @@ local function metric_data(write_fn)
kong.log.err("prometheus: failed to reach database while processing",
"/metrics endpoint: ", err)
end

local cp_reachable = data_plane.is_cp_reachable()
if cp_reachable then
metrics.cp_reachable:set(1)
else
metrics.cp_reachable:set(0)
end
end

local phase = get_phase()
Expand Down

0 comments on commit 0f59660

Please sign in to comment.