From a303a71a61912d06c7bb0300072e21d199122081 Mon Sep 17 00:00:00 2001
From: Sanskar Jaiswal
Date: Fri, 13 Dec 2024 16:36:54 +0530
Subject: [PATCH 1/2] feat(prometheus): expose controlplane connectivity state
 as a gauge

Add a new Prometheus gauge metric `control_plane_reachable`. Similar to
`datastore_reachable` gauge, 0 means the connection is not healthy; 1
means that the connection is healthy. We mark the connection as
unhealthy under the following circumstances:

* Failure while establishing a websocket connection
* Failure while sending basic information to controlplane
* Failure while sending ping to controlplane
* Failure while receiving a packet from the websocket connection

This is helpful for users running a significant number of gateways to be
alerted about potential issues any gateway(s) may be facing while
talking to the controlplane.

Signed-off-by: Sanskar Jaiswal
---
 .../add-cp-connectivity-metric-prometheus.yml |  4 +++
 kong/clustering/data_plane.lua                | 31 ++++++++++++++++---
 kong/plugins/prometheus/exporter.lua          | 20 ++++++++++++
 3 files changed, 52 insertions(+), 3 deletions(-)
 create mode 100644 changelog/unreleased/kong/add-cp-connectivity-metric-prometheus.yml

diff --git a/changelog/unreleased/kong/add-cp-connectivity-metric-prometheus.yml b/changelog/unreleased/kong/add-cp-connectivity-metric-prometheus.yml
new file mode 100644
index 000000000000..f384c8c5e898
--- /dev/null
+++ b/changelog/unreleased/kong/add-cp-connectivity-metric-prometheus.yml
@@ -0,0 +1,4 @@
+message: |
+  **Prometheus**: Added gauge to expose connectivity state to controlplane.
+type: feature
+scope: Plugin
diff --git a/kong/clustering/data_plane.lua b/kong/clustering/data_plane.lua
index 63e566863981..5552a3a80df3 100644
--- a/kong/clustering/data_plane.lua
+++ b/kong/clustering/data_plane.lua
@@ -74,11 +74,26 @@ function _M.new(clustering)
 end
 
 
+-- Records the DP -> CP connection state under the "control_plane_reachable"
+-- key of the `kong` shared dict; the Prometheus exporter reads this key.
+-- @tparam boolean reachable true when the connection is considered healthy
+local function set_control_plane_reachable(reachable)
+  local ok, err = ngx.shared.kong:safe_set("control_plane_reachable", reachable)
+  if not ok then
+    -- `reachable` is a boolean: concatenating it with `..` would raise, so
+    -- stringify it and let ngx_log join the arguments instead
+    ngx_log(ngx_ERR, _log_prefix, "failed to set control_plane_reachable key in shm to ",
+            tostring(reachable), ": ", err)
+  end
+end
+
+
 function _M:init_worker(basic_info)
   -- ROLE = "data_plane"
 
   self.plugins_list = basic_info.plugins
   self.filters = basic_info.filters
+  set_control_plane_reachable(false)
 
   -- only run in process which worker_id() == 0
   assert(ngx.timer.at(0, function(premature)
@@ -98,13 +113,17 @@ local function send_ping(c, log_suffix)
 
   local _, err = c:send_ping(hash)
   if err then
+    set_control_plane_reachable(false)
     ngx_log(is_timeout(err) and ngx_NOTICE or ngx_WARN, _log_prefix,
             "unable to send ping frame to control plane: ", err, log_suffix)
 
   -- only log a ping if the hash changed
-  elseif hash ~= prev_hash then
-    prev_hash = hash
-    ngx_log(ngx_INFO, _log_prefix, "sent ping frame to control plane with hash: ", hash, log_suffix)
+  else
+    set_control_plane_reachable(true)
+    if hash ~= prev_hash then
+      prev_hash = hash
+      ngx_log(ngx_INFO, _log_prefix, "sent ping frame to control plane with hash: ", hash, log_suffix)
+    end
   end
 end
 
@@ -156,6 +175,7 @@ function _M:communicate(premature)
 
   local c, uri, err = clustering_utils.connect_cp(self, "/v1/outlet")
   if not c then
+    set_control_plane_reachable(false)
     ngx_log(ngx_WARN, _log_prefix, "connection to control plane ", uri, " broken: ", err,
                  " (retrying after ", reconnection_delay, " seconds)", log_suffix)
 
@@ -188,6 +208,7 @@ function _M:communicate(premature)
                                         filters = self.filters,
                                         labels = labels, }))
   if err then
+    set_control_plane_reachable(false)
     ngx_log(ngx_ERR, _log_prefix, "unable to send basic information to control plane: ", uri,
                      " err: ", err, " (retrying after ", reconnection_delay, " seconds)", log_suffix)
 
@@ -197,6 +218,7 @@ function _M:communicate(premature)
     end))
     return
   end
+  set_control_plane_reachable(true)
 
   local config_semaphore = semaphore.new(0)
 
@@ -302,16 +324,19 @@ function _M:communicate(premature)
       local data, typ, err = c:recv_frame()
       if err then
         if not is_timeout(err) then
+          set_control_plane_reachable(false)
           return nil, "error while receiving frame from control plane: " .. err
         end
 
         local waited = ngx_time() - last_seen
         if waited > PING_WAIT then
+          set_control_plane_reachable(false)
           return nil, "did not receive pong frame from control plane within " .. PING_WAIT .. " seconds"
         end
 
         goto continue
       end
+      set_control_plane_reachable(true)
 
       if typ == "close" then
         ngx_log(ngx_DEBUG, _log_prefix, "received close frame from control plane", log_suffix)
diff --git a/kong/plugins/prometheus/exporter.lua b/kong/plugins/prometheus/exporter.lua
index 4c37287da5ab..e22fbe983785 100644
--- a/kong/plugins/prometheus/exporter.lua
+++ b/kong/plugins/prometheus/exporter.lua
@@ -10,6 +10,7 @@ local lower = string.lower
 local ngx_timer_pending_count = ngx.timer.pending_count
 local ngx_timer_running_count = ngx.timer.running_count
 local get_all_upstreams = balancer.get_all_upstreams
+
 if not balancer.get_all_upstreams then -- API changed since after Kong 2.5
   get_all_upstreams = require("kong.runloop.balancer.upstreams").get_all_upstreams
 end
@@ -65,6 +66,14 @@ local function init()
                                           "0 is unreachable",
                                           nil,
                                           prometheus.LOCAL_STORAGE)
+  if role == "data_plane" then
+    metrics.cp_reachable = prometheus:gauge("control_plane_reachable",
+                                            "Control plane reachable from gateway, " ..
+                                            "0 is unreachable",
+                                            nil,
+                                            prometheus.LOCAL_STORAGE)
+  end
+
   metrics.node_info = prometheus:gauge("node_info",
                                        "Kong Node metadata information",
                                        {"node_id", "version"},
@@ -449,6 +458,17 @@ local function metric_data(write_fn)
     kong.log.err("prometheus: failed to reach database while processing",
                  "/metrics endpoint: ", err)
   end
+
+  if role == "data_plane" then
+    -- the key is written by kong/clustering/data_plane.lua; a missing key
+    -- (nil) is reported as unreachable, same as an explicit false
+    local cp_reachable = ngx.shared.kong:get("control_plane_reachable")
+    if cp_reachable then
+      metrics.cp_reachable:set(1)
+    else
+      metrics.cp_reachable:set(0)
+    end
+  end
 end
 
 local phase = get_phase()

From 697c1e64a8a7f5e4a730e15d225de024879fa70b Mon Sep 17 00:00:00 2001
From: Sanskar Jaiswal
Date: Mon, 23 Dec 2024 18:45:52 +0530
Subject: [PATCH 2/2] tests(prometheus): add tests to assert controlplane
 connectivity behaviour

Signed-off-by: Sanskar Jaiswal
---
 .../26-prometheus/04-status_api_spec.lua      | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/spec/03-plugins/26-prometheus/04-status_api_spec.lua b/spec/03-plugins/26-prometheus/04-status_api_spec.lua
index 2fec1a089b03..ad6a393960b7 100644
--- a/spec/03-plugins/26-prometheus/04-status_api_spec.lua
+++ b/spec/03-plugins/26-prometheus/04-status_api_spec.lua
@@ -529,3 +529,79 @@ describe("Plugin: prometheus (access) granular metrics switch", function()
 
 end)
 end
+
+describe("CP/DP connectivity state #", function ()
+  local status_client
+
+  local function get_metrics()
+    if not status_client then
+      status_client = helpers.http_client("127.0.0.1", tcp_status_port, 20000)
+      status_client.reopen = true -- retry on a closed connection
+    end
+
+    local res, err = status_client:get("/metrics")
+
+    assert.is_nil(err, "failed GET /metrics: " .. tostring(err))
+    return assert.res_status(200, res)
+  end
+
+  -- The gauge is updated asynchronously by the DP (on connect, ping and
+  -- frame receipt), so poll /metrics until the expected sample shows up
+  -- instead of asserting on a single scrape.
+  local function wait_for_metric(expected)
+    helpers.pwait_until(function()
+      assert.matches(expected, get_metrics(), nil, true)
+    end, 30)
+  end
+
+  setup(function()
+    local bp = helpers.get_db_utils()
+
+    bp.plugins:insert {
+      protocols = { "http", "https", "grpc", "grpcs", "tcp", "tls" },
+      name = "prometheus",
+    }
+
+    assert(helpers.start_kong({
+      role = "control_plane",
+      prefix = "prom_cp",
+      cluster_cert = "spec/fixtures/kong_clustering.crt",
+      cluster_cert_key = "spec/fixtures/kong_clustering.key",
+      cluster_listen = "127.0.0.1:9005",
+      plugins = "bundled, prometheus",
+    }))
+
+    assert(helpers.start_kong({
+      role = "data_plane",
+      database = "off",
+      prefix = "prom_dp",
+      cluster_cert = "spec/fixtures/kong_clustering.crt",
+      cluster_cert_key = "spec/fixtures/kong_clustering.key",
+      cluster_control_plane = "127.0.0.1:9005",
+      proxy_listen = "0.0.0.0:9000",
+      worker_state_update_frequency = 1,
+      status_listen = "0.0.0.0:" .. tcp_status_port,
+      nginx_worker_processes = 1,
+      dedicated_config_processing = "on",
+      plugins = "bundled, prometheus",
+    }))
+  end)
+
+  teardown(function()
+    if status_client then
+      status_client:close()
+    end
+
+    helpers.stop_kong("prom_dp")
+    -- the test normally stops the CP itself; stop it here as well so it is
+    -- not leaked when the test fails early (return value is ignored)
+    helpers.stop_kong("prom_cp")
+  end)
+
+  it("exposes controlplane connectivity status", function ()
+    wait_for_metric('kong_control_plane_reachable 1')
+
+    helpers.stop_kong("prom_cp")
+    wait_for_metric('kong_control_plane_reachable 0')
+  end)
+end)