From 4d3cd10d67e0ad2fa81ce0ca375716a752ab643d Mon Sep 17 00:00:00 2001 From: Aapo Talvensaari Date: Tue, 20 Feb 2024 11:31:54 +0200 Subject: [PATCH] fix(vault): postpone vault reference resolving on init_worker (#12554) ### Summary It was reported on KAG-2907 that existing LMDB database with secrets can lead to an error when resolving secrets on init worker: ``` resty/http.lua:74: API disabled in the context of init_worker_by_lua* stack traceback: [C]: in function 'co_create' ``` This fixes the issue. Signed-off-by: Aapo Talvensaari (cherry picked from commit 9a7498cda2b01f020a0b7fabd41dcd62c83c8dfb) --- .../unreleased/kong/fix-vault-init-worker.yml | 3 + kong/db/schema/init.lua | 6 +- kong/pdk/vault.lua | 119 ++++++- .../02-cmd/02-start_stop_spec.lua | 327 +++++++++++++++++- 4 files changed, 431 insertions(+), 24 deletions(-) create mode 100644 changelog/unreleased/kong/fix-vault-init-worker.yml diff --git a/changelog/unreleased/kong/fix-vault-init-worker.yml b/changelog/unreleased/kong/fix-vault-init-worker.yml new file mode 100644 index 000000000000..d5315d0d7c28 --- /dev/null +++ b/changelog/unreleased/kong/fix-vault-init-worker.yml @@ -0,0 +1,3 @@ +message: fix vault initialization by postponing vault reference resolving on init_worker +type: bugfix +scope: Core diff --git a/kong/db/schema/init.lua b/kong/db/schema/init.lua index 0a3db763ad6d..6bc02d0fca31 100644 --- a/kong/db/schema/init.lua +++ b/kong/db/schema/init.lua @@ -1769,7 +1769,7 @@ function Schema:process_auto_fields(data, context, nulls, opts) if err then kong.log.warn("unable to resolve reference ", value, " (", err, ")") else - kong.log.warn("unable to resolve reference ", value) + kong.log.notice("unable to resolve reference ", value) end value = "" @@ -1808,7 +1808,7 @@ function Schema:process_auto_fields(data, context, nulls, opts) if err then kong.log.warn("unable to resolve reference ", value[i], " (", err, ")") else - kong.log.warn("unable to resolve reference ", value[i]) + kong.log.notice("unable to resolve reference ", value[i]) end value[i] = "" @@ -1854,7 +1854,7 @@ function Schema:process_auto_fields(data, context, nulls, opts) if err then kong.log.warn("unable to resolve reference ", v, " (", err, ")") else - kong.log.warn("unable to resolve reference ", v) + kong.log.notice("unable to resolve reference ", v) end value[k] = "" diff --git a/kong/pdk/vault.lua b/kong/pdk/vault.lua index 9473ae9a2e09..222614f5b40c 100644 --- a/kong/pdk/vault.lua +++ b/kong/pdk/vault.lua @@ -197,6 +197,8 @@ local function new(self) local SECRETS_CACHE = ngx.shared.kong_secrets local SECRETS_CACHE_MIN_TTL = ROTATION_INTERVAL * 2 + local INIT_SECRETS = {} + local INIT_WORKER_SECRETS = {} local STRATEGIES = {} local SCHEMAS = {} local CONFIGS = {} @@ -616,7 +618,7 @@ local function new(self) if not vault then if err then - self.log.notice("could not find vault (", prefix, "): ", err) + return nil, fmt("could not find vault (%s): %s", prefix, err) end return nil, fmt("could not find vault (%s)", prefix) @@ -821,10 +823,15 @@ local function new(self) -- If the value is not found in these caches and `cache_only` is not `truthy`, -- it attempts to retrieve the value from a vault. -- + -- On init worker phase the resolving of the secrets is postponed to a timer, + -- and in this case the function returns `""` when it fails to find a value + -- in a cache. This is because of current limitations in platform that disallows + -- using cosockets/coroutines in that phase. + -- -- @local -- @function get -- @tparam string reference the reference key to lookup - -- @tparam boolean cache_only optional boolean flag (if set to `true`, + -- @tparam[opt] boolean cache_only optional boolean flag (if set to `true`, -- the function will not attempt to retrieve the value from the vault) -- @treturn string the retrieved value corresponding to the provided reference, -- or `nil` (when found negatively cached, or in case of an error) @@ -841,19 +848,40 @@ local function new(self) local strategy, err, config, cache_key, parsed_reference = get_strategy(reference) if not strategy then + -- this can fail on init as the lmdb cannot be accessed and secondly, + -- because the data is not yet inserted into LMDB when using KONG_DECLARATIVE_CONFIG. + if get_phase() == "init" then + if not INIT_SECRETS[cache_key] then + INIT_SECRETS[reference] = true + INIT_SECRETS[#INIT_SECRETS + 1] = reference + end + + return "" + end + return nil, err end value = SECRETS_CACHE:get(cache_key) - if cache_only and not value then - return nil, "could not find cached value" - end - if value == NEGATIVELY_CACHED_VALUE then return nil end if not value then + if cache_only then + return nil, "could not find cached value" + end + + -- this can fail on init worker as there is no cosockets / coroutines available + if get_phase() == "init_worker" then + if not INIT_WORKER_SECRETS[cache_key] then + INIT_WORKER_SECRETS[cache_key] = true + INIT_WORKER_SECRETS[#INIT_WORKER_SECRETS + 1] = cache_key + end + + return "" + end + return get_from_vault(reference, strategy, config, cache_key, parsed_reference) end @@ -883,7 +911,7 @@ local function new(self) -- update_from_cache("{vault://env/example}", record, "field" }) local function update_from_cache(reference, record, field) local value, err = get(reference, true) - if not value then + if err then self.log.warn("error updating secret reference ", reference, ": ", err) end @@ -1236,19 +1264,20 @@ local function new(self) --- - -- Function `rotate_secrets` rotates the secrets in the shared dictionary cache (SHDICT). + -- Function `rotate_secrets` rotates the secrets. -- - -- It iterates over all keys in the SHDICT and, if a key corresponds to a reference and the + -- It iterates over all keys in the secrets and, if a key corresponds to a reference and the -- ttl of the key is less than or equal to the resurrection period, it refreshes the value -- associated with the reference. -- -- @local -- @function rotate_secrets - -- @treturn boolean `true` after it has finished iterating over all keys in the SHDICT - local function rotate_secrets() + -- @tparam table secrets the secrets to rotate + -- @treturn boolean `true` after it has finished iterating over all keys in the secrets + local function rotate_secrets(secrets) local phase = get_phase() local caching_strategy = get_caching_strategy() - for _, cache_key in ipairs(SECRETS_CACHE:get_keys(0)) do + for _, cache_key in ipairs(secrets) do yield(true, phase) local ok, err = rotate_secret(cache_key, caching_strategy) @@ -1262,20 +1291,69 @@ local function new(self) --- - -- A recurring secrets rotation timer handler. + -- Function `rotate_secrets_cache` rotates the secrets in the shared dictionary cache. + -- + -- @local + -- @function rotate_secrets_cache + -- @treturn boolean `true` after it has finished iterating over all keys in the shared dictionary cache + local function rotate_secrets_cache() + return rotate_secrets(SECRETS_CACHE:get_keys(0)) + end + + + --- + -- Function `rotate_secrets_init_worker` rotates the secrets in init worker cache + -- + -- On init worker the secret resolving is postponed to a timer because init worker + -- cannot cosockets / coroutines, and there is no other workaround currently. + -- + -- @local + -- @function rotate_secrets_init_worker + -- @treturn boolean `true` after it has finished iterating over all keys in the init worker cache + local function rotate_secrets_init_worker() + local _, err, err2 + if INIT_SECRETS then + _, err = rotate_references(INIT_SECRETS) + end + + if INIT_WORKER_SECRETS then + _, err2 = rotate_secrets(INIT_WORKER_SECRETS) + end + + if err or err2 then + return nil, err or err2 + end + + return true + end + + + --- + -- A secrets rotation timer handler. + -- + -- Uses a node-level mutex to prevent multiple threads/workers running it the same time. -- -- @local -- @function rotate_secrets_timer - -- @tparam boolean premature `true` if server is shutting down. - local function rotate_secrets_timer(premature) + -- @tparam boolean premature `true` if server is shutting down + -- @tparam[opt] boolean init `true` when this is a one of init_worker timer run + -- By default rotates the secrets in shared dictionary cache. + local function rotate_secrets_timer(premature, init) if premature then - return + return true end - local ok, err = concurrency.with_worker_mutex(ROTATION_MUTEX_OPTS, rotate_secrets) + local ok, err = concurrency.with_worker_mutex(ROTATION_MUTEX_OPTS, init and rotate_secrets_init_worker or rotate_secrets_cache) if not ok and err ~= "timeout" then self.log.err("rotating secrets failed (", err, ")") end + + if init then + INIT_SECRETS = nil + INIT_WORKER_SECRETS = nil + end + + return true end @@ -1314,7 +1392,7 @@ local function new(self) -- refresh all the secrets local _, err = self.timer:named_at("secret-rotation-on-crud-event", 0, rotate_secrets_timer) if err then - self.log.err("could not schedule timer to rotate vault secret references: ", err) + self.log.err("could not schedule timer to rotate vault secret references on crud event: ", err) end end @@ -1343,6 +1421,11 @@ local function new(self) if err then self.log.err("could not schedule timer to rotate vault secret references: ", err) end + + local _, err = self.timer:named_at("secret-rotation-on-init", 0, rotate_secrets_timer, true) + if err then + self.log.err("could not schedule timer to rotate vault secret references on init: ", err) + end end diff --git a/spec/02-integration/02-cmd/02-start_stop_spec.lua b/spec/02-integration/02-cmd/02-start_stop_spec.lua index 2c831503a7ec..48d0554acbae 100644 --- a/spec/02-integration/02-cmd/02-start_stop_spec.lua +++ b/spec/02-integration/02-cmd/02-start_stop_spec.lua @@ -130,6 +130,7 @@ describe("kong start/stop #" .. strategy, function() end) it("resolves referenced secrets", function() + helpers.clean_logfile() helpers.setenv("PG_PASSWORD", "dummy") local _, stderr, stdout = assert(kong_exec("start", { @@ -169,7 +170,7 @@ describe("kong start/stop #" .. strategy, function() assert(kong_exec("stop", { prefix = PREFIX })) end) - it("start/stop stops without error when references cannot be resolved #test", function() + it("start/stop stops without error when references cannot be resolved", function() helpers.setenv("PG_PASSWORD", "dummy") local _, stderr, stdout = assert(kong_exec("start", { @@ -226,6 +227,7 @@ describe("kong start/stop #" .. strategy, function() end) it("should not add [emerg], [alert], [crit], [error] or [warn] lines to error log", function() + helpers.clean_logfile() assert(helpers.kong_exec("start ", { prefix = helpers.test_conf.prefix, stream_listen = "127.0.0.1:9022", @@ -634,6 +636,8 @@ describe("kong start/stop #" .. strategy, function() if strategy == "off" then it("does not start with an invalid declarative config file", function() + helpers.clean_logfile() + local yaml_file = helpers.make_yaml_file [[ _format_version: "1.1" services: @@ -665,6 +669,9 @@ describe("kong start/stop #" .. strategy, function() end) it("dbless can reference secrets in declarative configuration", function() + helpers.clean_logfile() + helpers.setenv("SESSION_SECRET", "top-secret-value") + local yaml_file = helpers.make_yaml_file [[ _format_version: "3.0" _transform: true @@ -672,10 +679,11 @@ describe("kong start/stop #" .. strategy, function() - name: session instance_name: session config: - secret: "{vault://mocksocket/test}" + secret: "{vault://mocksocket/session-secret}" ]] finally(function() + helpers.unsetenv("SESSION_SECRET") os.remove(yaml_file) end) @@ -692,12 +700,325 @@ describe("kong start/stop #" .. strategy, function() database = "off", declarative_config = yaml_file, vaults = "mocksocket", - plugins = "session" + plugins = "session", }) + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + assert.truthy(ok) assert.not_matches("error", err) assert.logfile().has.no.line("[error]", true, 0) + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.no.line(" {vault://mocksocket/session-secret}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + assert(helpers.restart_kong({ + database = "off", + vaults = "mocksocket", + plugins = "session", + declarative_config = "", + })) + + assert.logfile().has.no.line("[error]", true, 0) + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.no.line(" {vault://mocksocket/session-secret}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + + assert(helpers.reload_kong("off", "reload --prefix " .. helpers.test_conf.prefix, { + database = "off", + vaults = "mocksocket", + plugins = "session", + declarative_config = "", + })) + + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.no.line(" {vault://mocksocket/session-secret}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + + end) + + it("dbless does not fail fatally when referencing secrets doesn't work in declarative configuration", function() + helpers.clean_logfile() + + local yaml_file = helpers.make_yaml_file [[ + _format_version: "3.0" + _transform: true + plugins: + - name: session + instance_name: session + config: + secret: "{vault://mocksocket/session-secret-unknown}" + ]] + + finally(function() + os.remove(yaml_file) + end) + + helpers.setenv("KONG_LUA_PATH_OVERRIDE", "./spec/fixtures/custom_vaults/?.lua;./spec/fixtures/custom_vaults/?/init.lua;;") + helpers.get_db_utils(strategy, { + "vaults", + }, { + "session" + }, { + "mocksocket" + }) + + local ok, err = helpers.start_kong({ + database = "off", + declarative_config = yaml_file, + vaults = "mocksocket", + plugins = "session", + }) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + + assert.truthy(ok) + assert.not_matches("error", err) + assert.logfile().has.no.line("[error]", true, 0) + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.line(" {vault://mocksocket/session-secret-unknown}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + assert(helpers.restart_kong({ + database = "off", + vaults = "mocksocket", + plugins = "session", + declarative_config = "", + })) + + assert.logfile().has.no.line("[error]", true, 0) + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.line(" {vault://mocksocket/session-secret-unknown}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + + assert(helpers.reload_kong("off", "reload --prefix " .. helpers.test_conf.prefix, { + database = "off", + vaults = "mocksocket", + plugins = "session", + declarative_config = "", + })) + + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.line(" {vault://mocksocket/session-secret-unknown}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + end) + + it("dbless can reference secrets in declarative configuration using vault entities", function() + helpers.clean_logfile() + helpers.setenv("SESSION_SECRET_AGAIN", "top-secret-value") + + local yaml_file = helpers.make_yaml_file [[ + _format_version: "3.0" + _transform: true + plugins: + - name: session + instance_name: session + config: + secret: "{vault://mock/session-secret-again}" + vaults: + - description: my vault + name: mocksocket + prefix: mock + ]] + + finally(function() + helpers.unsetenv("SESSION_SECRET_AGAIN") + os.remove(yaml_file) + end) + + helpers.setenv("KONG_LUA_PATH_OVERRIDE", "./spec/fixtures/custom_vaults/?.lua;./spec/fixtures/custom_vaults/?/init.lua;;") + helpers.get_db_utils(strategy, { + "vaults", + }, { + "session" + }, { + "mocksocket" + }) + + local ok, err = helpers.start_kong({ + database = "off", + declarative_config = yaml_file, + vaults = "mocksocket", + plugins = "session", + }) + + assert.truthy(ok) + assert.not_matches("error", err) + assert.logfile().has.no.line("[error]", true, 0) + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.no.line(" {vault://mock/session-secret-again}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + + assert(helpers.restart_kong({ + database = "off", + vaults = "mocksocket", + plugins = "session", + declarative_config = "", + })) + + assert.logfile().has.no.line("[error]", true, 0) + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.no.line(" {vault://mock/session-secret-again}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + + assert(helpers.reload_kong("off", "reload --prefix " .. helpers.test_conf.prefix, { + database = "off", + vaults = "mocksocket", + plugins = "session", + declarative_config = "", + })) + + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.no.line(" {vault://mock/session-secret-again}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + end) + + it("dbless does not fail fatally when referencing secrets doesn't work in declarative configuration using vault entities", function() + helpers.clean_logfile() + + local yaml_file = helpers.make_yaml_file [[ + _format_version: "3.0" + _transform: true + plugins: + - name: session + instance_name: session + config: + secret: "{vault://mock/session-secret-unknown-again}" + vaults: + - description: my vault + name: mocksocket + prefix: mock + ]] + + finally(function() + os.remove(yaml_file) + end) + + helpers.setenv("KONG_LUA_PATH_OVERRIDE", "./spec/fixtures/custom_vaults/?.lua;./spec/fixtures/custom_vaults/?/init.lua;;") + helpers.get_db_utils(strategy, { + "vaults", + }, { + "session" + }, { + "mocksocket" + }) + + local ok, err = helpers.start_kong({ + database = "off", + declarative_config = yaml_file, + vaults = "mocksocket", + plugins = "session", + }) + + assert.truthy(ok) + assert.not_matches("error", err) + assert.logfile().has.no.line("[error]", true, 0) + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.line(" {vault://mock/session-secret-unknown-again}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + + assert(helpers.restart_kong({ + database = "off", + vaults = "mocksocket", + plugins = "session", + declarative_config = "", + })) + + assert.logfile().has.no.line("[error]", true, 0) + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.line(" {vault://mock/session-secret-unknown-again}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) + + assert(helpers.reload_kong("off", "reload --prefix " .. helpers.test_conf.prefix, { + database = "off", + vaults = "mocksocket", + plugins = "session", + declarative_config = "", + })) + + assert.logfile().has.no.line("traceback", true, 0) + assert.logfile().has.line(" {vault://mock/session-secret-unknown-again}", true, 0) + assert.logfile().has.no.line("could not find vault", true, 0) + + proxy_client = helpers.proxy_client() + + local res = proxy_client:get("/") + assert.res_status(404, res) + local body = assert.response(res).has.jsonbody() + assert.equal("no Route matched with those values", body.message) end) end end)