Skip to content

Commit

Permalink
feat(request-debugging): improve the time resolution and latency field (
Browse files Browse the repository at this point in the history
#13460)

Using `time_ns` to measure the context duration and introduced a field
`total_time_without_upstream` to represent total latency without upstream.

KAG-4733
  • Loading branch information
Oyami-Srk authored Oct 11, 2024
1 parent 4a8c3fb commit ad758d7
Show file tree
Hide file tree
Showing 3 changed files with 172 additions and 83 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
message: |
Improved the output of the request debugger:
- Now the resolution of field `total_time` is microseconds.
- A new field `total_time_without_upstream` on the top level shows the latency only introduced by Kong.
type: feature
scope: Core

43 changes: 37 additions & 6 deletions kong/timing/context.lua
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
local cjson = require("cjson.safe")
local cjson = require("cjson.safe").new()

local ngx_get_phase = ngx.get_phase
local ngx_re_gmatch = ngx.re.gmatch
Expand All @@ -8,13 +8,15 @@ local setmetatable = setmetatable
local table_insert = table.insert
local table_remove = table.remove

local get_cur_msec = require("kong.tools.time").get_updated_monotonic_ms
local time_ns = require("kong.tools.time").time_ns

local assert = assert

local _M = {}
local _MT = { __index = _M }

-- Set number precision smaller than 16 to avoid floating point errors
cjson.encode_number_precision(14)

function _M:enter_subcontext(name)
assert(name ~= nil, "name is required")
Expand All @@ -29,14 +31,13 @@ function _M:enter_subcontext(name)
end

self.current_subcontext = self.current_subcontext.child[name]
self.current_subcontext.____start____ = get_cur_msec()
self.current_subcontext.____start____ = time_ns()
end


function _M:leave_subcontext(attributes)
assert(#self.sub_context_stack > 0, "subcontext stack underflow")

local elapsed = get_cur_msec() - self.current_subcontext.____start____
local elapsed = (time_ns() - self.current_subcontext.____start____) / 1e6
local old_total_time = self.current_subcontext.total_time or 0
self.current_subcontext.total_time = old_total_time + elapsed
self.current_subcontext.____start____ = nil
Expand Down Expand Up @@ -75,11 +76,37 @@ function _M:set_root_context_prop(k, v)
end


function _M:finalize(subcontext)
-- finalize total_time optionally rounding to the nearest integer
if subcontext.total_time then
-- round to 2 decimal places
subcontext.total_time = math_floor(subcontext.total_time * 100) / 100
end

if subcontext.child then
for _, child in pairs(subcontext.child) do
self:finalize(child)
end
end
end


function _M:get_total_time_without_upstream()
local total_time = 0
for k, child in pairs(self.root_context.child) do
if k ~= "upstream" and child.total_time then
total_time = total_time + child.total_time
end
end
return total_time
end


function _M:to_json()
local dangling = nil

-- `> 1` means we have at least one subcontext (the root context)
-- We always call this function at then end of the header_filter and
-- We always call this function at then end of the header_filter and
-- log phases, so we should always have at least one subcontext.
while #self.sub_context_stack > 1 do
self:set_context_prop("dangling", true)
Expand All @@ -92,6 +119,10 @@ function _M:to_json()
end

self:set_root_context_prop("dangling", dangling)
self:finalize(self.root_context)
if self.root_context.child ~= nil then
self.root_context.total_time_without_upstream = self:get_total_time_without_upstream()
end
return assert(cjson.encode(self.root_context))
end

Expand Down
205 changes: 128 additions & 77 deletions spec/02-integration/21-request-debug/01-request-debug_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,26 @@ local function assert_plugin_has_span(plugin_span, span_name)
return true
end

local function assert_total_time_is_float(subcontext, current_path)
local current_path = current_path or '$'

if subcontext.total_time then
-- body filter is mostly zero
if subcontext.total_time ~= 0 then
assert.are_not.equal(subcontext.total_time % 1, 0, current_path .. ".total_time is not a float number");
end
end

if subcontext.child then
for path, child in pairs(subcontext.child) do
-- Upstream time is measured by nginx and it's always decimal rather than float
if path ~= "upstream" then
assert_total_time_is_float(child, current_path .. "." .. path)
end
end
end
end


local function start_kong(strategy, deployment, disable_req_dbg, token)
local request_debug = nil
Expand Down Expand Up @@ -500,22 +520,28 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/slow-streaming", "*")
local log_output = assert_has_output_log(deployment, "/slow-streaming", "*")
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/slow-streaming", "*")
local log_output = assert_has_output_log(deployment, "/slow-streaming", "*")

local total_header = assert(tonumber(header_output.child.upstream.total_time))
local tfb_header = assert(tonumber(header_output.child.upstream.child.time_to_first_byte.total_time))
assert.falsy(header_output.child.upstream.child.streaming)
assert.same(total_header, tfb_header)

local total_log = assert(tonumber(log_output.child.upstream.total_time))
local tfb_log = assert(tonumber(log_output.child.upstream.child.time_to_first_byte.total_time))
local streaming = assert(tonumber(log_output.child.upstream.child.streaming.total_time))

local total_header = assert(tonumber(header_output.child.upstream.total_time))
local tfb_header = assert(tonumber(header_output.child.upstream.child.time_to_first_byte.total_time))
assert.falsy(header_output.child.upstream.child.streaming)
assert.same(total_header, tfb_header)
assert_total_time_is_float(header_output)
assert_total_time_is_float(log_output)

local total_log = assert(tonumber(log_output.child.upstream.total_time))
local tfb_log = assert(tonumber(log_output.child.upstream.child.time_to_first_byte.total_time))
local streaming = assert(tonumber(log_output.child.upstream.child.streaming.total_time))
assert.near(tfb_header, tfb_log, 50)
assert.same(total_log, tfb_log + streaming)
assert.near(tfb_header, tfb_log, 50)
assert.same(total_log, tfb_log + streaming)

assert.near(TIME_TO_FIRST_BYTE, tfb_log, 50)
assert.near(STREAMING, streaming, 50)
assert.near(TIME_TO_FIRST_BYTE, tfb_log, 50)
assert.near(STREAMING, streaming, 50)
end, 10)
end)

it("rewrite, access, balancer, header_filter, body_filter, log, plugin span, dns span", function()
Expand All @@ -529,26 +555,31 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/mutiple-spans", "*")
local log_output = assert_has_output_log(deployment, "/mutiple-spans", "*")

assert.truthy(header_output.child.rewrite)
assert.truthy(header_output.child.access)
assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert(header_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded")
assert.truthy(header_output.child.balancer)
assert.truthy(header_output.child.header_filter)

assert.truthy(log_output.child.rewrite)
assert.truthy(log_output.child.access)
assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(log_output.child.access.child.router) -- router is executed in access phase
assert(log_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded")
assert.truthy(log_output.child.balancer)
assert.truthy(log_output.child.header_filter)
assert.truthy(log_output.child.body_filter)
assert.truthy(log_output.child.log)
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/mutiple-spans", "*")
local log_output = assert_has_output_log(deployment, "/mutiple-spans", "*")

assert.truthy(header_output.child.rewrite)
assert.truthy(header_output.child.access)
assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert(header_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded")
assert.truthy(header_output.child.balancer)
assert.truthy(header_output.child.header_filter)

assert.truthy(log_output.child.rewrite)
assert.truthy(log_output.child.access)
assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(log_output.child.access.child.router) -- router is executed in access phase
assert(log_output.child.access.child.dns.child.localhost.child.resolve.cache_hit ~= nil, "dns cache hit should be recorded")
assert.truthy(log_output.child.balancer)
assert.truthy(log_output.child.header_filter)
assert.truthy(log_output.child.body_filter)
assert.truthy(log_output.child.log)

assert_total_time_is_float(header_output)
assert_total_time_is_float(log_output)
end, 10)
end)

it("subrequests involved", function()
Expand All @@ -568,34 +599,39 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/subrequests", "*")
local log_output = assert_has_output_log(deployment, "/subrequests", "*")

-- spans of main request
assert.truthy(header_output.child.rewrite)
assert.truthy(header_output.child.access)
assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert.truthy(header_output.child.response)

assert.truthy(log_output.child.rewrite)
assert.truthy(log_output.child.access)
assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert.truthy(log_output.child.body_filter)
assert.truthy(log_output.child.log)

-- spans of subrequest
assert.truthy(header_output.child.response.child.balancer)
assert.truthy(header_output.child.response.child.header_filter)
assert.truthy(header_output.child.response.child.plugins)
assert.truthy(header_output.child.response.child.plugins.child["enable-buffering-response"])

assert.truthy(log_output.child.response.child.balancer)
assert.truthy(log_output.child.response.child.header_filter)
assert.truthy(log_output.child.response.child.body_filter)
assert.truthy(log_output.child.response.child.plugins)
assert.truthy(log_output.child.response.child.plugins.child["enable-buffering-response"])
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/subrequests", "*")
local log_output = assert_has_output_log(deployment, "/subrequests", "*")

-- spans of main request
assert.truthy(header_output.child.rewrite)
assert.truthy(header_output.child.access)
assert.truthy(header_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert.truthy(header_output.child.response)

assert.truthy(log_output.child.rewrite)
assert.truthy(log_output.child.access)
assert.truthy(log_output.child.access.child.dns) -- upstream is resolved in access phase
assert.truthy(header_output.child.access.child.router) -- router is executed in access phase
assert.truthy(log_output.child.body_filter)
assert.truthy(log_output.child.log)

-- spans of subrequest
assert.truthy(header_output.child.response.child.balancer)
assert.truthy(header_output.child.response.child.header_filter)
assert.truthy(header_output.child.response.child.plugins)
assert.truthy(header_output.child.response.child.plugins.child["enable-buffering-response"])

assert.truthy(log_output.child.response.child.balancer)
assert.truthy(log_output.child.response.child.header_filter)
assert.truthy(log_output.child.response.child.body_filter)
assert.truthy(log_output.child.response.child.plugins)
assert.truthy(log_output.child.response.child.plugins.child["enable-buffering-response"])

assert_total_time_is_float(header_output)
assert_total_time_is_float(log_output)
end, 10)
end)

it("external_http span", function()
Expand All @@ -614,14 +650,19 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/external_http", "*")
local log_output = assert_has_output_log(deployment, "/external_http", "*")
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/external_http", "*")
local log_output = assert_has_output_log(deployment, "/external_http", "*")

local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["muti-external-http-calls"].child)
assert_plugin_has_span(plugin_span, "external_http")
local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["muti-external-http-calls"].child)
assert_plugin_has_span(plugin_span, "external_http")

plugin_span = assert.truthy(log_output.child.access.child.plugins.child["muti-external-http-calls"].child)
assert_plugin_has_span(plugin_span, "external_http")
plugin_span = assert.truthy(log_output.child.access.child.plugins.child["muti-external-http-calls"].child)
assert_plugin_has_span(plugin_span, "external_http")

assert_total_time_is_float(header_output)
assert_total_time_is_float(log_output)
end, 10)
end)

it("redis span", function()
Expand Down Expand Up @@ -649,14 +690,19 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/redis", "*")
local log_output = assert_has_output_log(deployment, "/redis", "*")
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/redis", "*")
local log_output = assert_has_output_log(deployment, "/redis", "*")

local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["rate-limiting"].child)
assert_plugin_has_span(plugin_span, "redis")

local plugin_span = assert.truthy(header_output.child.access.child.plugins.child["rate-limiting"].child)
assert_plugin_has_span(plugin_span, "redis")
plugin_span = assert.truthy(log_output.child.access.child.plugins.child["rate-limiting"].child)
assert_plugin_has_span(plugin_span, "redis")

plugin_span = assert.truthy(log_output.child.access.child.plugins.child["rate-limiting"].child)
assert_plugin_has_span(plugin_span, "redis")
assert_total_time_is_float(header_output)
assert_total_time_is_float(log_output)
end, 10)
end)

it("truncate/split too large debug output", function()
Expand All @@ -675,11 +721,16 @@ describe(desc, function()

helpers.wait_for_all_config_update()

local header_output = assert_has_output_header(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN)
local _, truncated = assert_has_output_log(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN)
helpers.pwait_until(function()
local header_output = assert_has_output_header(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN)
local _, truncated = assert_has_output_log(deployment, "/large_debug_output", "*", "1.1.1.1", TOKEN)

assert.truthy(header_output.truncated)
assert.truthy(truncated)

assert.truthy(header_output.truncated)
assert.truthy(truncated)
assert_total_time_is_float(header_output)
assert_total_time_is_float(truncated)
end, 10)
end)

it("invalid X-Kong-Request-Debug request header should not trigger this feature", function()
Expand Down

1 comment on commit ad758d7

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bazel Build

Docker image available kong/kong:ad758d7bd6680bdf6a98306fb8a8dfe094eed75b
Artifacts available https://github.com/Kong/kong/actions/runs/11285843615

Please sign in to comment.