Skip to content

Commit

Permalink
feat(opentelemetry): sampling rate configuration option (Kong#12054)
Browse files Browse the repository at this point in the history
The sampling rate can now be set via the OpenTelemetry plugin instead of
being only a global setting for the gateway.

It also fixes a small bug where, in the edge case of opentelemetry being
used for propagation only (instrumentations disabled), the `sampled`
flag was incorrectly set to `true` although no span was sampled for that
request.

Includes tests to cover more configuration scenarios (esp. different
sampling rates) and verify propagation is done correctly.
  • Loading branch information
samugi authored and chobits committed Dec 22, 2023
1 parent 46af157 commit 753e369
Show file tree
Hide file tree
Showing 9 changed files with 251 additions and 75 deletions.
5 changes: 5 additions & 0 deletions changelog/unreleased/kong/tracing-sampling-rate-scope.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
message: >
Tracing Sampling Rate can now be set via the `config.sampling_rate` property
of the OpenTelemetry plugin instead of it just being a global setting for the gateway.
type: feature
scope: Plugin
7 changes: 7 additions & 0 deletions kong/clustering/compat/removed_fields.lua
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,11 @@ return {
"read_body_for_logout",
},
},

-- Any dataplane older than 3.6.0
[3006000000] = {
opentelemetry = {
"sampling_rate",
},
},
}
98 changes: 70 additions & 28 deletions kong/pdk/tracing.lua
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ local tablepool = require "tablepool"
local new_tab = require "table.new"
local utils = require "kong.tools.utils"
local phase_checker = require "kong.pdk.private.phases"
local tracing_context = require "kong.tracing.tracing_context"

local ngx = ngx
local type = type
Expand Down Expand Up @@ -63,34 +64,29 @@ local function generate_span_id()
return rand_bytes(8)
end

--- Built-in sampler
local function always_on_sampler()
return true
end

local function always_off_sampler()
return false
end

-- Fractions >= 1 will always sample. Fractions < 0 are treated as zero.
-- spec: https://github.com/c24t/opentelemetry-specification/blob/3b3d321865cf46364bdfb292c179b6444dc96bf9/specification/sdk-tracing.md#probability-sampler-algorithm
local function get_trace_id_based_sampler(rate)
if type(rate) ~= "number" then
error("invalid fraction", 2)
end
local function get_trace_id_based_sampler(options_sampling_rate)
return function(trace_id, sampling_rate)
sampling_rate = sampling_rate or options_sampling_rate

if rate >= 1 then
return always_on_sampler
end
if type(sampling_rate) ~= "number" then
error("invalid fraction", 2)
end

if rate <= 0 then
return always_off_sampler
end
-- always on sampler
if sampling_rate >= 1 then
return true
end

-- always off sampler
if sampling_rate <= 0 then
return false
end

local bound = rate * BOUND_MAX
-- probability sampler
local bound = sampling_rate * BOUND_MAX

-- TODO: is this a sound method to sample?
return function(trace_id)
if #trace_id < SAMPLING_BYTE then
error(TOO_SHORT_MESSAGE, 2)
end
Expand Down Expand Up @@ -200,17 +196,17 @@ local function create_span(tracer, options)
span.span_id = generate_span_id()
span.trace_id = trace_id
span.kind = options.span_kind or SPAN_KIND.INTERNAL
-- get_sampling_decision() can be used to dynamically run the sampler's logic
-- and obtain the sampling decision for the span. This way plugins can apply
-- their configured sampling rate dynamically. The sampled flag can then be
-- overwritten by set_should_sample.
span.should_sample = sampled

setmetatable(span, span_mt)
return span
end

local function link_span(tracer, span, name, options)
if not span.should_sample then
kong.log.debug("skipping non-sampled span")
return
end
if tracer and type(tracer) ~= "table" then
error("invalid tracer", 2)
end
Expand Down Expand Up @@ -270,8 +266,8 @@ end
-- local time = ngx.now()
-- span:finish(time * 100000000)
function span_mt:finish(end_time_ns)
if self.end_time_ns ~= nil or not self.should_sample then
-- span is finished, and already processed or not sampled
if self.end_time_ns ~= nil then
-- span is finished, and already processed
return
end

Expand Down Expand Up @@ -426,6 +422,7 @@ noop_tracer.active_span = NOOP
noop_tracer.set_active_span = NOOP
noop_tracer.process_span = NOOP
noop_tracer.set_should_sample = NOOP
noop_tracer.get_sampling_decision = NOOP

local VALID_TRACING_PHASES = {
rewrite = true,
Expand Down Expand Up @@ -554,6 +551,51 @@ local function new_tracer(name, options)
end
end

--- Compute the sampling decision for the current request's trace.
--
-- The decision is resolved in the following order:
--
--   1. `false` when there is no root span, or when the root span carries
--      the `kong.propagation_only` attribute (a placeholder span that only
--      exists so tracing headers can be propagated).
--   2. The parent's decision, coerced to a boolean, when the parent was
--      explicitly not sampled (`parent_should_sample == false`) or when no
--      trace id is available.
--   3. The root span's own `should_sample` flag when no `sampling_rate`
--      argument is given.
--   4. Otherwise, whatever the tracer's sampler returns for the trace id and
--      the given `sampling_rate`.
--
-- @function kong.tracing:get_sampling_decision
-- @tparam bool parent_should_sample sampled flag of the parent span, as
--   extracted from the incoming tracing headers (may be nil)
-- @tparam[opt] number sampling_rate sampling rate handed to the
--   probability sampler
-- @treturn bool the sampled value for this trace (always a real boolean)
function self:get_sampling_decision(parent_should_sample, sampling_rate)
  local ctx = ngx.ctx

  local spans = ctx.KONG_SPANS
  local root_span = spans and spans[1]
  local trace_id = tracing_context.get_raw_trace_id(ctx)

  -- NOTE(review): `root_span.attributes` is indexed without a nil check;
  -- confirm every root span reaching this point has an attributes table.
  if not root_span or root_span.attributes["kong.propagation_only"] then
    -- nothing to sample: either no root span exists, or it is a dummy span
    -- created solely for header propagation
    return false
  end

  if parent_should_sample == false or not trace_id then
    -- inherit the parent's decision; the trace id can be nil when tracing
    -- instrumentations are disabled and only header propagation is done.
    -- A nil parent flag is coerced to false here.
    return not not parent_should_sample
  end

  if not sampling_rate then
    -- no custom rate supplied: stick with the root span's existing result
    return root_span.should_sample == true
  end

  -- probability-based sampler; coerce its result to a strict boolean
  return not not self.sampler(trace_id, sampling_rate)
end

tracer_memo[name] = setmetatable(self, tracer_mt)
return tracer_memo[name]
end
Expand Down
41 changes: 29 additions & 12 deletions kong/plugins/opentelemetry/handler.lua
Original file line number Diff line number Diff line change
Expand Up @@ -94,34 +94,32 @@ end
function OpenTelemetryHandler:access(conf)
local headers = ngx_get_headers()
local root_span = ngx.ctx.KONG_SPANS and ngx.ctx.KONG_SPANS[1]
local tracer = kong.tracing.new("otel")

-- make propagation work when tracing instrumentation is not enabled
-- get the global tracer when available, or instantiate a new one
local tracer = kong.tracing.name == "noop" and kong.tracing.new("otel")
or kong.tracing

-- make propagation work with tracing disabled
if not root_span then
root_span = tracer.start_span("root")
root_span:set_attribute("kong.propagation_only", true)

-- this span is created only for propagation and will be skipped by the exporter
-- since tracing is disabled, turn off sampling entirely for this trace
kong.ctx.plugin.should_sample = false
end

local injected_parent_span = tracing_context.get_unlinked_span("balancer") or root_span
local header_type, trace_id, span_id, parent_id, parent_sampled, _ = propagation_parse(headers, conf.header_type)

local header_type, trace_id, span_id, parent_id, should_sample, _ = propagation_parse(headers, conf.header_type)
if should_sample == false then
tracer:set_should_sample(should_sample)
injected_parent_span.should_sample = should_sample
end

-- overwrite trace id
-- as we are in a chain of existing trace
-- Overwrite trace ids
-- with the value extracted from incoming tracing headers
if trace_id then
-- to propagate the correct trace ID we have to set it here
-- before passing this span to propagation.set()
injected_parent_span.trace_id = trace_id
-- update the Tracing Context with the trace ID extracted from headers
tracing_context.set_raw_trace_id(trace_id)
end

-- overwrite root span's parent_id
if span_id then
root_span.parent_id = span_id
Expand All @@ -130,6 +128,25 @@ function OpenTelemetryHandler:access(conf)
root_span.parent_id = parent_id
end

-- Configure the sampled flags
local sampled
if kong.ctx.plugin.should_sample == false then
sampled = false

else
-- Sampling decision for the current trace.
local err
-- get_sampling_decision() depends on the value of the trace id: call it
-- after the trace_id is updated
sampled, err = tracer:get_sampling_decision(parent_sampled, conf.sampling_rate)
if err then
ngx_log(ngx_ERR, _log_prefix, "sampler failure: ", err)
end
end
tracer:set_should_sample(sampled)
-- Set the sampled flag for the outgoing header's span
injected_parent_span.should_sample = sampled

propagation_set(conf.header_type, header_type, injected_parent_span, "w3c")
end

Expand Down
7 changes: 7 additions & 0 deletions kong/plugins/opentelemetry/schema.lua
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ return {
required = false,
default = "preserve",
one_of = { "preserve", "ignore", "b3", "b3-single", "w3c", "jaeger", "ot", "aws", "gcp" } } },
{ sampling_rate = {
description = "Tracing sampling rate for configuring the probability-based sampler. When set, this value supersedes the global `tracing_sampling_rate` setting from kong.conf.",
type = "number",
between = {0, 1},
required = false,
default = nil,
} },
},
entity_checks = {
{ custom_entity_check = {
Expand Down
2 changes: 2 additions & 0 deletions spec/02-integration/09-hybrid_mode/09-config-compat_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ describe("CP/DP config compat transformations #" .. strategy, function()

local expected_otel_prior_35 = utils.cycle_aware_deep_copy(opentelemetry)
expected_otel_prior_35.config.header_type = "preserve"
expected_otel_prior_35.config.sampling_rate = nil
do_assert(utils.uuid(), "3.4.0", expected_otel_prior_35)

-- cleanup
Expand All @@ -231,6 +232,7 @@ describe("CP/DP config compat transformations #" .. strategy, function()

local expected_otel_prior_34 = utils.cycle_aware_deep_copy(opentelemetry)
expected_otel_prior_34.config.header_type = "preserve"
expected_otel_prior_34.config.sampling_rate = nil
do_assert(utils.uuid(), "3.3.0", expected_otel_prior_34)

-- cleanup
Expand Down
Loading

0 comments on commit 753e369

Please sign in to comment.