diff --git a/changelog/unreleased/kong/tracing-sampling-rate-scope.yml b/changelog/unreleased/kong/tracing-sampling-rate-scope.yml new file mode 100644 index 000000000000..96cde17f1ff8 --- /dev/null +++ b/changelog/unreleased/kong/tracing-sampling-rate-scope.yml @@ -0,0 +1,5 @@ +message: > + Tracing Sampling Rate can now be set via the `config.sampling_rate` property + of the OpenTelemetry plugin instead of it just being a global setting for the gateway. +type: feature +scope: Plugin diff --git a/kong/clustering/compat/removed_fields.lua b/kong/clustering/compat/removed_fields.lua index 7a0eb3c768f4..e0083de8a9b1 100644 --- a/kong/clustering/compat/removed_fields.lua +++ b/kong/clustering/compat/removed_fields.lua @@ -109,4 +109,11 @@ return { "read_body_for_logout", }, }, + + -- Any dataplane older than 3.6.0 + [3006000000] = { + opentelemetry = { + "sampling_rate", + }, + }, } diff --git a/kong/pdk/tracing.lua b/kong/pdk/tracing.lua index c41500d50196..a2074888a6b3 100644 --- a/kong/pdk/tracing.lua +++ b/kong/pdk/tracing.lua @@ -11,6 +11,7 @@ local tablepool = require "tablepool" local new_tab = require "table.new" local utils = require "kong.tools.utils" local phase_checker = require "kong.pdk.private.phases" +local tracing_context = require "kong.tracing.tracing_context" local ngx = ngx local type = type @@ -63,34 +64,29 @@ local function generate_span_id() return rand_bytes(8) end ---- Build-in sampler -local function always_on_sampler() - return true -end - -local function always_off_sampler() - return false -end - -- Fractions >= 1 will always sample. Fractions < 0 are treated as zero. -- spec: https://github.com/c24t/opentelemetry-specification/blob/3b3d321865cf46364bdfb292c179b6444dc96bf9/specification/sdk-tracing.md#probability-sampler-algorithm -local function get_trace_id_based_sampler(rate) - if type(rate) ~= "number" then - error("invalid fraction", 2) - end +local function get_trace_id_based_sampler(options_sampling_rate) + return function(trace_id, sampling_rate) + sampling_rate = sampling_rate or options_sampling_rate - if rate >= 1 then - return always_on_sampler - end + if type(sampling_rate) ~= "number" then + error("invalid fraction", 2) + end - if rate <= 0 then - return always_off_sampler - end + -- always on sampler + if sampling_rate >= 1 then + return true + end + + -- always off sampler + if sampling_rate <= 0 then + return false + end - local bound = rate * BOUND_MAX + -- probability sampler + local bound = sampling_rate * BOUND_MAX - -- TODO: is this a sound method to sample? - return function(trace_id) if #trace_id < SAMPLING_BYTE then error(TOO_SHORT_MESSAGE, 2) end @@ -200,6 +196,10 @@ local function create_span(tracer, options) span.span_id = generate_span_id() span.trace_id = trace_id span.kind = options.span_kind or SPAN_KIND.INTERNAL + -- get_sampling_decision() can be used to dynamically run the sampler's logic + -- and obtain the sampling decision for the span. This way plugins can apply + -- their configured sampling rate dynamically. The sampled flag can then be + -- overwritten by set_should_sample. span.should_sample = sampled setmetatable(span, span_mt) @@ -207,10 +207,6 @@ local function create_span(tracer, options) end local function link_span(tracer, span, name, options) - if not span.should_sample then - kong.log.debug("skipping non-sampled span") - return - end if tracer and type(tracer) ~= "table" then error("invalid tracer", 2) end @@ -270,8 +266,8 @@ end -- local time = ngx.now() -- span:finish(time * 100000000) function span_mt:finish(end_time_ns) - if self.end_time_ns ~= nil or not self.should_sample then - -- span is finished, and already processed or not sampled + if self.end_time_ns ~= nil then + -- span is finished, and already processed return end @@ -426,6 +422,7 @@ noop_tracer.active_span = NOOP noop_tracer.set_active_span = NOOP noop_tracer.process_span = NOOP noop_tracer.set_should_sample = NOOP +noop_tracer.get_sampling_decision = NOOP local VALID_TRACING_PHASES = { rewrite = true, @@ -554,6 +551,51 @@ local function new_tracer(name, options) end end + --- Get the sampling decision result + -- + -- Uses a parent-based sampler when the parent has sampled flag == false + -- to inherit the non-recording decision from the parent span, or when + -- trace_id is not available. + -- + -- Else, apply the probability-based should_sample decision. + -- + -- @function kong.tracing:get_sampling_decision + -- @tparam bool parent_should_sample value of the parent span sampled flag + -- extracted from the incoming tracing headers + -- @tparam number sampling_rate the sampling rate to apply for the + -- probability sampler + -- @treturn bool sampled value of sampled for this trace + function self:get_sampling_decision(parent_should_sample, sampling_rate) + local ctx = ngx.ctx + + local sampled + local root_span = ctx.KONG_SPANS and ctx.KONG_SPANS[1] + local trace_id = tracing_context.get_raw_trace_id(ctx) + + if not root_span or root_span.attributes["kong.propagation_only"] then + -- should not sample if there is no root span or if the root span is + -- a dummy created only to propagate headers + sampled = false + + elseif parent_should_sample == false or not trace_id then + -- trace_id can be nil when tracing instrumentations are disabled + -- and Kong is configured to only do headers propagation + sampled = parent_should_sample + + elseif not sampling_rate then + -- no custom sampling_rate was passed: + -- reuse the sampling result of the root_span + sampled = root_span.should_sample == true + + else + -- use probability-based sampler + sampled = self.sampler(trace_id, sampling_rate) + end + + -- enforce boolean + return not not sampled + end + tracer_memo[name] = setmetatable(self, tracer_mt) return tracer_memo[name] end diff --git a/kong/plugins/opentelemetry/handler.lua b/kong/plugins/opentelemetry/handler.lua index db296fe045b0..71be03634f00 100644 --- a/kong/plugins/opentelemetry/handler.lua +++ b/kong/plugins/opentelemetry/handler.lua @@ -94,26 +94,25 @@ end function OpenTelemetryHandler:access(conf) local headers = ngx_get_headers() local root_span = ngx.ctx.KONG_SPANS and ngx.ctx.KONG_SPANS[1] - local tracer = kong.tracing.new("otel") - -- make propagation running with tracing instrumetation not enabled + -- get the global tracer when available, or instantiate a new one + local tracer = kong.tracing.name == "noop" and kong.tracing.new("otel") + or kong.tracing + + -- make propagation work with tracing disabled if not root_span then root_span = tracer.start_span("root") + root_span:set_attribute("kong.propagation_only", true) - -- the span created only for the propagation and will be bypassed to the exporter + -- since tracing is disabled, turn off sampling entirely for this trace kong.ctx.plugin.should_sample = false end local injected_parent_span = tracing_context.get_unlinked_span("balancer") or root_span + local header_type, trace_id, span_id, parent_id, parent_sampled, _ = propagation_parse(headers, conf.header_type) - local header_type, trace_id, span_id, parent_id, should_sample, _ = propagation_parse(headers, conf.header_type) - if should_sample == false then - tracer:set_should_sample(should_sample) - injected_parent_span.should_sample = should_sample - end - - -- overwrite trace id - -- as we are in a chain of existing trace + -- Overwrite trace ids + -- with the value extracted from incoming tracing headers if trace_id then -- to propagate the correct trace ID we have to set it here -- before passing this span to propagation.set() @@ -121,7 +120,6 @@ function OpenTelemetryHandler:access(conf) -- update the Tracing Context with the trace ID extracted from headers tracing_context.set_raw_trace_id(trace_id) end - -- overwrite root span's parent_id if span_id then root_span.parent_id = span_id @@ -130,6 +128,25 @@ function OpenTelemetryHandler:access(conf) root_span.parent_id = parent_id end + -- Configure the sampled flags + local sampled + if kong.ctx.plugin.should_sample == false then + sampled = false + + else + -- Sampling decision for the current trace. + local err + -- get_sampling_decision() depends on the value of the trace id: call it + -- after the trace_id is updated + sampled, err = tracer:get_sampling_decision(parent_sampled, conf.sampling_rate) + if err then + ngx_log(ngx_ERR, _log_prefix, "sampler failure: ", err) + end + end + tracer:set_should_sample(sampled) + -- Set the sampled flag for the outgoing header's span + injected_parent_span.should_sample = sampled + propagation_set(conf.header_type, header_type, injected_parent_span, "w3c") end diff --git a/kong/plugins/opentelemetry/schema.lua b/kong/plugins/opentelemetry/schema.lua index afeae44008be..4601703163dd 100644 --- a/kong/plugins/opentelemetry/schema.lua +++ b/kong/plugins/opentelemetry/schema.lua @@ -59,6 +59,13 @@ return { required = false, default = "preserve", one_of = { "preserve", "ignore", "b3", "b3-single", "w3c", "jaeger", "ot", "aws", "gcp" } } }, + { sampling_rate = { + description = "Tracing sampling rate for configuring the probability-based sampler. When set, this value supersedes the global `tracing_sampling_rate` setting from kong.conf.", + type = "number", + between = {0, 1}, + required = false, + default = nil, + } }, }, entity_checks = { { custom_entity_check = { diff --git a/spec/02-integration/09-hybrid_mode/09-config-compat_spec.lua b/spec/02-integration/09-hybrid_mode/09-config-compat_spec.lua index ce941e445abd..e3fe12f9bb54 100644 --- a/spec/02-integration/09-hybrid_mode/09-config-compat_spec.lua +++ b/spec/02-integration/09-hybrid_mode/09-config-compat_spec.lua @@ -212,6 +212,7 @@ describe("CP/DP config compat transformations #" .. strategy, function() local expected_otel_prior_35 = utils.cycle_aware_deep_copy(opentelemetry) expected_otel_prior_35.config.header_type = "preserve" + expected_otel_prior_35.config.sampling_rate = nil do_assert(utils.uuid(), "3.4.0", expected_otel_prior_35) -- cleanup @@ -231,6 +232,7 @@ describe("CP/DP config compat transformations #" .. strategy, function() local expected_otel_prior_34 = utils.cycle_aware_deep_copy(opentelemetry) expected_otel_prior_34.config.header_type = "preserve" + expected_otel_prior_34.config.sampling_rate = nil do_assert(utils.uuid(), "3.3.0", expected_otel_prior_34) -- cleanup diff --git a/spec/03-plugins/37-opentelemetry/03-propagation_spec.lua b/spec/03-plugins/37-opentelemetry/03-propagation_spec.lua index daf0a6ee2d84..e1d029df92d1 100644 --- a/spec/03-plugins/37-opentelemetry/03-propagation_spec.lua +++ b/spec/03-plugins/37-opentelemetry/03-propagation_spec.lua @@ -57,10 +57,22 @@ local function assert_correct_trace_hierarchy(spans, incoming_span_id) end for _, strategy in helpers.each_strategy() do -describe("propagation tests #" .. strategy, function() +for _, instrumentations in ipairs({"all", "off"}) do +for _, sampling_rate in ipairs({1, 0}) do +describe("propagation tests #" .. strategy .. " instrumentations: " .. instrumentations .. " sampling_rate: " .. sampling_rate, function() local service local proxy_client + local sampled_flag_w3c + local sampled_flag_b3 + if instrumentations == "all" and sampling_rate == 1 then + sampled_flag_w3c = "01" + sampled_flag_b3 = "1" + else + sampled_flag_w3c = "00" + sampled_flag_b3 = "0" + end + lazy_setup(function() local bp = helpers.get_db_utils(strategy, { "services", "routes", "plugins" }, { "trace-propagator" }) @@ -127,6 +139,8 @@ describe("propagation tests #" .. strategy, function() database = strategy, plugins = "bundled, trace-propagator", nginx_conf = "spec/fixtures/custom_nginx.template", + tracing_instrumentations = instrumentations, + tracing_sampling_rate = sampling_rate, }) proxy_client = helpers.proxy_client() @@ -144,8 +158,7 @@ describe("propagation tests #" .. strategy, function() }) local body = assert.response(r).has.status(200) local json = cjson.decode(body) - - assert.matches("00%-%x+-%x+-01", json.headers.traceparent) + assert.matches("00%-%x+-%x+-" .. sampled_flag_w3c, json.headers.traceparent) end) it("propagates tracing headers (b3 request)", function() @@ -176,7 +189,7 @@ describe("propagation tests #" .. strategy, function() }) local body = assert.response(r).has.status(200) local json = cjson.decode(body) - assert.matches(trace_id .. "%-%x+%-1%-%x+", json.headers.b3) + assert.matches(trace_id .. "%-%x+%-" .. sampled_flag_b3 .. "%-%x+", json.headers.b3) end) it("without parent_id", function() @@ -191,10 +204,10 @@ describe("propagation tests #" .. strategy, function() }) local body = assert.response(r).has.status(200) local json = cjson.decode(body) - assert.matches(trace_id .. "%-%x+%-1", json.headers.b3) + assert.matches(trace_id .. "%-%x+%-" .. sampled_flag_b3, json.headers.b3) end) - it("with disabled sampling", function() + it("reflects the disabled sampled flag of the incoming tracing header", function() local trace_id = gen_trace_id() local span_id = gen_span_id() @@ -206,6 +219,8 @@ describe("propagation tests #" .. strategy, function() }) local body = assert.response(r).has.status(200) local json = cjson.decode(body) + -- incoming header has sampled=0: always disabled by + -- parent-based sampler assert.matches(trace_id .. "%-%x+%-0", json.headers.b3) end) end) @@ -222,7 +237,7 @@ describe("propagation tests #" .. strategy, function() }) local body = assert.response(r).has.status(200) local json = cjson.decode(body) - assert.matches("00%-" .. trace_id .. "%-%x+-01", json.headers.traceparent) + assert.matches("00%-" .. trace_id .. "%-%x+-" .. sampled_flag_w3c, json.headers.traceparent) end) it("defaults to w3c without propagating when header_type set to ignore and w3c headers sent", function() @@ -239,7 +254,7 @@ describe("propagation tests #" .. strategy, function() local json = cjson.decode(body) assert.is_not_nil(json.headers.traceparent) -- incoming trace id is ignored - assert.not_matches("00%-" .. trace_id .. "%-%x+-01", json.headers.traceparent) + assert.not_matches("00%-" .. trace_id .. "%-%x+-" .. sampled_flag_w3c, json.headers.traceparent) end) it("defaults to w3c without propagating when header_type set to ignore and b3 headers sent", function() @@ -255,7 +270,7 @@ describe("propagation tests #" .. strategy, function() local json = cjson.decode(body) assert.is_not_nil(json.headers.traceparent) -- incoming trace id is ignored - assert.not_matches("00%-" .. trace_id .. "%-%x+-01", json.headers.traceparent) + assert.not_matches("00%-" .. trace_id .. "%-%x+-" .. sampled_flag_w3c, json.headers.traceparent) end) it("propagates w3c tracing headers when header_type set to w3c", function() @@ -270,7 +285,7 @@ describe("propagation tests #" .. strategy, function() }) local body = assert.response(r).has.status(200) local json = cjson.decode(body) - assert.matches("00%-" .. trace_id .. "%-%x+-01", json.headers.traceparent) + assert.matches("00%-" .. trace_id .. "%-%x+-" .. sampled_flag_w3c, json.headers.traceparent) end) it("propagates jaeger tracing headers", function() @@ -287,7 +302,7 @@ describe("propagation tests #" .. strategy, function() local body = assert.response(r).has.status(200) local json = cjson.decode(body) -- Trace ID is left padded with 0 for assert - assert.matches( ('0'):rep(32-#trace_id) .. trace_id .. ":%x+:%x+:01", json.headers["uber-trace-id"]) + assert.matches( ('0'):rep(32-#trace_id) .. trace_id .. ":%x+:%x+:" .. sampled_flag_w3c, json.headers["uber-trace-id"]) end) it("propagates ot headers", function() @@ -322,10 +337,10 @@ describe("propagation tests #" .. strategy, function() assert.same(32, #m[1]) assert.same(16, #m[2]) - assert.same("01", m[3]) + assert.same(sampled_flag_w3c, m[3]) end) - it("reuses span propagated by another plugin", function() + it("with multiple plugins, propagates the correct header", function() local trace_id = gen_trace_id() local r = proxy_client:get("/", { @@ -337,13 +352,11 @@ describe("propagation tests #" .. strategy, function() }) local body = assert.response(r).has.status(200) local json = cjson.decode(body) - - -- trace-propagator parses incoming b3 headers, generates a span and - -- propagates it as b3. Opentelemetry ignores incoming type, reuses span - -- generated by the other plugin and propagates it as w3c. - assert.matches("00%-%x+-" .. json.headers["x-b3-spanid"] .. "%-01", json.headers.traceparent) + assert.matches("00%-%x+-" .. json.headers["x-b3-spanid"] .. "%-" .. sampled_flag_w3c, json.headers.traceparent) end) end) +end +end for _, instrumentation in ipairs({ "request", "request,balancer", "all" }) do describe("propagation tests with enabled " .. instrumentation .. " instrumentation (issue #11294) #" .. strategy, function() diff --git a/spec/03-plugins/37-opentelemetry/04-exporter_spec.lua b/spec/03-plugins/37-opentelemetry/04-exporter_spec.lua index 55e057d09776..9eb5a71996ff 100644 --- a/spec/03-plugins/37-opentelemetry/04-exporter_spec.lua +++ b/spec/03-plugins/37-opentelemetry/04-exporter_spec.lua @@ -46,7 +46,7 @@ for _, strategy in helpers.each_strategy() do end) -- helpers - local function setup_instrumentations(types, config, fixtures, router_scoped, service_scoped, another_global) + local function setup_instrumentations(types, config, fixtures, router_scoped, service_scoped, another_global, global_sampling_rate) local http_srv = assert(bp.services:insert { name = "mock-service", host = helpers.mock_upstream_host, @@ -93,7 +93,7 @@ for _, strategy in helpers.each_strategy() do nginx_conf = "spec/fixtures/custom_nginx.template", plugins = "opentelemetry", tracing_instrumentations = types, - tracing_sampling_rate = 1, + tracing_sampling_rate = global_sampling_rate or 1, }, nil, nil, fixtures)) end @@ -131,7 +131,6 @@ for _, strategy in helpers.each_strategy() do }) assert.res_status(200, r) - -- close client connection cli:close() local lines @@ -165,6 +164,85 @@ for _, strategy in helpers.each_strategy() do end) end) + -- this test is not meant to check that the sampling rate is applied + -- precisely (we have unit tests for that), but rather that the config + -- option is properly handled by the plugin and has an effect on the + -- sampling decision. + for _, global_sampling_rate in ipairs{ 0, 0.001, 1} do + describe("With config.sampling_rate set, using global sampling rate: " .. global_sampling_rate, function () + local mock + local sampling_rate = 0.5 + -- this trace_id is always sampled with 0.5 rate + local sampled_trace_id = "92a54b3e1a7c4f2da9e44b8a6f3e1dab" + -- this trace_id is never sampled with 0.5 rate + local non_sampled_trace_id = "4bf92f3577b34da6a3ce929d0e0e4736" + + lazy_setup(function() + bp, _ = assert(helpers.get_db_utils(strategy, { + "services", + "routes", + "plugins", + }, { "opentelemetry" })) + + setup_instrumentations("all", { + sampling_rate = sampling_rate, + }, nil, nil, nil, nil, global_sampling_rate) + mock = helpers.http_mock(HTTP_SERVER_PORT, { timeout = HTTP_MOCK_TIMEOUT }) + end) + + lazy_teardown(function() + helpers.stop_kong() + if mock then + mock("close", true) + end + end) + + it("does not sample spans when trace_id == non_sampled_trace_id", function() + local cli = helpers.proxy_client(7000, PROXY_PORT) + local r = assert(cli:send { + method = "GET", + path = "/", + headers = { + traceparent = "00-" .. non_sampled_trace_id .. "-0123456789abcdef-01" + } + }) + assert.res_status(200, r) + + cli:close() + + ngx.sleep(2) + local lines = mock() + assert.is_falsy(lines) + end) + + it("samples spans when trace_id == sampled_trace_id", function () + local body + helpers.wait_until(function() + local cli = helpers.proxy_client(7000, PROXY_PORT) + local r = assert(cli:send { + method = "GET", + path = "/", + headers = { + traceparent = "00-" .. sampled_trace_id .. "-0123456789abcdef-01" + } + }) + assert.res_status(200, r) + + cli:close() + + local lines + lines, body = mock() + return lines + end, 10) + + local decoded = assert(pb.decode("opentelemetry.proto.collector.trace.v1.ExportTraceServiceRequest", body)) + assert.not_nil(decoded) + local scope_spans = decoded.resource_spans[1].scope_spans + assert.is_true(#scope_spans > 0, scope_spans) + end) + end) + end + for _, case in ipairs{ {true, true, true}, {true, true, nil}, @@ -208,7 +286,6 @@ for _, strategy in helpers.each_strategy() do }) assert.res_status(200, r) - -- close client connection cli:close() local lines, err = mock() @@ -259,7 +336,6 @@ for _, strategy in helpers.each_strategy() do }) assert.res_status(200, r) - -- close client connection cli:close() local lines @@ -357,7 +433,6 @@ for _, strategy in helpers.each_strategy() do }) assert.res_status(200, r) - -- close client connection cli:close() helpers.wait_until(function() @@ -428,7 +503,6 @@ for _, strategy in helpers.each_strategy() do }) assert.res_status(200, r) - -- close client connection cli:close() local lines @@ -510,7 +584,6 @@ for _, strategy in helpers.each_strategy() do }) assert.res_status(200, r) - -- close client connection cli:close() local lines diff --git a/spec/fixtures/custom_plugins/kong/plugins/trace-propagator/handler.lua b/spec/fixtures/custom_plugins/kong/plugins/trace-propagator/handler.lua index 909a11f093ba..5b61cbcd3f4b 100644 --- a/spec/fixtures/custom_plugins/kong/plugins/trace-propagator/handler.lua +++ b/spec/fixtures/custom_plugins/kong/plugins/trace-propagator/handler.lua @@ -14,31 +14,41 @@ local _M = { function _M:access(conf) local headers = ngx.req.get_headers() - local tracer = kong.tracing.new("trace-propagator") + local tracer = kong.tracing.name == "noop" and kong.tracing.new("otel") + or kong.tracing local root_span = ngx.ctx.KONG_SPANS and ngx.ctx.KONG_SPANS[1] if not root_span then root_span = tracer.start_span("root") + root_span:set_attribute("kong.propagation_only", true) + kong.ctx.plugin.should_sample = false end - local injected_parent_span = tracing_context.get_unlinked_span("balancer") or root_span - local header_type, trace_id, span_id, parent_id, should_sample = propagation_parse(headers) + local injected_parent_span = tracing_context.get_unlinked_span("balancer") or root_span - if should_sample == false then - tracer:set_should_sample(should_sample) - injected_parent_span.should_sample = should_sample - end + local header_type, trace_id, span_id, parent_id, parent_sampled = propagation_parse(headers) + -- overwrite trace ids + -- with the value extracted from incoming tracing headers if trace_id then injected_parent_span.trace_id = trace_id + tracing_context.set_raw_trace_id(trace_id) end - if span_id then root_span.parent_id = span_id - elseif parent_id then root_span.parent_id = parent_id end + -- Set the sampled flag for the outgoing header's span + local sampled + if kong.ctx.plugin.should_sample == false then + sampled = false + else + sampled = tracer:get_sampling_decision(parent_sampled, conf.sampling_rate) + tracer:set_should_sample(sampled) + end + injected_parent_span.should_sample = sampled + local type = header_type and "preserve" or "w3c" propagation_set(type, header_type, injected_parent_span, "w3c") end