Skip to content

Commit

Permalink
fix(vault): postpone vault reference resolving on init_worker (#12554)
Browse files Browse the repository at this point in the history
### Summary

It was reported on KAG-2907 that existing LMDB database with secrets
can lead to an error when resolving secrets on init worker:

```
resty/http.lua:74: API disabled in the context of init_worker_by_lua*
stack traceback:
	[C]: in function 'co_create'
```

This fixes the issue.

Signed-off-by: Aapo Talvensaari <[email protected]>
(cherry picked from commit 9a7498c)
  • Loading branch information
bungle authored and windmgc committed Mar 5, 2024
1 parent 1984174 commit 4d3cd10
Show file tree
Hide file tree
Showing 4 changed files with 431 additions and 24 deletions.
3 changes: 3 additions & 0 deletions changelog/unreleased/kong/fix-vault-init-worker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
message: fix vault initialization by postponing vault reference resolving on init_worker
type: bugfix
scope: Core
6 changes: 3 additions & 3 deletions kong/db/schema/init.lua
Original file line number Diff line number Diff line change
Expand Up @@ -1769,7 +1769,7 @@ function Schema:process_auto_fields(data, context, nulls, opts)
if err then
kong.log.warn("unable to resolve reference ", value, " (", err, ")")
else
kong.log.warn("unable to resolve reference ", value)
kong.log.notice("unable to resolve reference ", value)
end

value = ""
Expand Down Expand Up @@ -1808,7 +1808,7 @@ function Schema:process_auto_fields(data, context, nulls, opts)
if err then
kong.log.warn("unable to resolve reference ", value[i], " (", err, ")")
else
kong.log.warn("unable to resolve reference ", value[i])
kong.log.notice("unable to resolve reference ", value[i])
end

value[i] = ""
Expand Down Expand Up @@ -1854,7 +1854,7 @@ function Schema:process_auto_fields(data, context, nulls, opts)
if err then
kong.log.warn("unable to resolve reference ", v, " (", err, ")")
else
kong.log.warn("unable to resolve reference ", v)
kong.log.notice("unable to resolve reference ", v)
end

value[k] = ""
Expand Down
119 changes: 101 additions & 18 deletions kong/pdk/vault.lua
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ local function new(self)
local SECRETS_CACHE = ngx.shared.kong_secrets
local SECRETS_CACHE_MIN_TTL = ROTATION_INTERVAL * 2

local INIT_SECRETS = {}
local INIT_WORKER_SECRETS = {}
local STRATEGIES = {}
local SCHEMAS = {}
local CONFIGS = {}
Expand Down Expand Up @@ -616,7 +618,7 @@ local function new(self)

if not vault then
if err then
self.log.notice("could not find vault (", prefix, "): ", err)
return nil, fmt("could not find vault (%s): %s", prefix, err)
end

return nil, fmt("could not find vault (%s)", prefix)
Expand Down Expand Up @@ -821,10 +823,15 @@ local function new(self)
-- If the value is not found in these caches and `cache_only` is not `truthy`,
-- it attempts to retrieve the value from a vault.
--
-- On init worker phase the resolving of the secrets is postponed to a timer,
-- and in this case the function returns `""` when it fails to find a value
-- in a cache. This is because of current limitations in platform that disallows
-- using cosockets/coroutines in that phase.
--
-- @local
-- @function get
-- @tparam string reference the reference key to lookup
-- @tparam boolean cache_only optional boolean flag (if set to `true`,
-- @tparam[opt] boolean cache_only optional boolean flag (if set to `true`,
-- the function will not attempt to retrieve the value from the vault)
-- @treturn string the retrieved value corresponding to the provided reference,
-- or `nil` (when found negatively cached, or in case of an error)
Expand All @@ -841,19 +848,40 @@ local function new(self)

local strategy, err, config, cache_key, parsed_reference = get_strategy(reference)
if not strategy then
-- this can fail on init as the lmdb cannot be accessed and secondly,
-- because the data is not yet inserted into LMDB when using KONG_DECLARATIVE_CONFIG.
if get_phase() == "init" then
if not INIT_SECRETS[cache_key] then
INIT_SECRETS[reference] = true
INIT_SECRETS[#INIT_SECRETS + 1] = reference
end

return ""
end

return nil, err
end

value = SECRETS_CACHE:get(cache_key)
if cache_only and not value then
return nil, "could not find cached value"
end

if value == NEGATIVELY_CACHED_VALUE then
return nil
end

if not value then
if cache_only then
return nil, "could not find cached value"
end

-- this can fail on init worker as there is no cosockets / coroutines available
if get_phase() == "init_worker" then
if not INIT_WORKER_SECRETS[cache_key] then
INIT_WORKER_SECRETS[cache_key] = true
INIT_WORKER_SECRETS[#INIT_WORKER_SECRETS + 1] = cache_key
end

return ""
end

return get_from_vault(reference, strategy, config, cache_key, parsed_reference)
end

Expand Down Expand Up @@ -883,7 +911,7 @@ local function new(self)
-- update_from_cache("{vault://env/example}", record, "field" })
local function update_from_cache(reference, record, field)
local value, err = get(reference, true)
if not value then
if err then
self.log.warn("error updating secret reference ", reference, ": ", err)
end

Expand Down Expand Up @@ -1236,19 +1264,20 @@ local function new(self)


---
-- Function `rotate_secrets` rotates the secrets in the shared dictionary cache (SHDICT).
-- Function `rotate_secrets` rotates the secrets.
--
-- It iterates over all keys in the SHDICT and, if a key corresponds to a reference and the
-- It iterates over all keys in the secrets and, if a key corresponds to a reference and the
-- ttl of the key is less than or equal to the resurrection period, it refreshes the value
-- associated with the reference.
--
-- @local
-- @function rotate_secrets
-- @treturn boolean `true` after it has finished iterating over all keys in the SHDICT
local function rotate_secrets()
-- @tparam table secrets the secrets to rotate
-- @treturn boolean `true` after it has finished iterating over all keys in the secrets
local function rotate_secrets(secrets)
local phase = get_phase()
local caching_strategy = get_caching_strategy()
for _, cache_key in ipairs(SECRETS_CACHE:get_keys(0)) do
for _, cache_key in ipairs(secrets) do
yield(true, phase)

local ok, err = rotate_secret(cache_key, caching_strategy)
Expand All @@ -1262,20 +1291,69 @@ local function new(self)


---
-- A recurring secrets rotation timer handler.
-- Function `rotate_secrets_cache` rotates the secrets in the shared dictionary cache.
--
-- @local
-- @function rotate_secrets_cache
-- @treturn boolean `true` after it has finished iterating over all keys in the shared dictionary cache
local function rotate_secrets_cache()
return rotate_secrets(SECRETS_CACHE:get_keys(0))
end


---
-- Function `rotate_secrets_init_worker` rotates the secrets in init worker cache
--
-- On init worker the secret resolving is postponed to a timer because init worker
-- cannot cosockets / coroutines, and there is no other workaround currently.
--
-- @local
-- @function rotate_secrets_init_worker
-- @treturn boolean `true` after it has finished iterating over all keys in the init worker cache
local function rotate_secrets_init_worker()
local _, err, err2
if INIT_SECRETS then
_, err = rotate_references(INIT_SECRETS)
end

if INIT_WORKER_SECRETS then
_, err2 = rotate_secrets(INIT_WORKER_SECRETS)
end

if err or err2 then
return nil, err or err2
end

return true
end


---
-- A secrets rotation timer handler.
--
-- Uses a node-level mutex to prevent multiple threads/workers running it the same time.
--
-- @local
-- @function rotate_secrets_timer
-- @tparam boolean premature `true` if server is shutting down.
local function rotate_secrets_timer(premature)
-- @tparam boolean premature `true` if server is shutting down
-- @tparam[opt] boolean init `true` when this is a one of init_worker timer run
-- By default rotates the secrets in shared dictionary cache.
local function rotate_secrets_timer(premature, init)
if premature then
return
return true
end

local ok, err = concurrency.with_worker_mutex(ROTATION_MUTEX_OPTS, rotate_secrets)
local ok, err = concurrency.with_worker_mutex(ROTATION_MUTEX_OPTS, init and rotate_secrets_init_worker or rotate_secrets_cache)
if not ok and err ~= "timeout" then
self.log.err("rotating secrets failed (", err, ")")
end

if init then
INIT_SECRETS = nil
INIT_WORKER_SECRETS = nil
end

return true
end


Expand Down Expand Up @@ -1314,7 +1392,7 @@ local function new(self)
-- refresh all the secrets
local _, err = self.timer:named_at("secret-rotation-on-crud-event", 0, rotate_secrets_timer)
if err then
self.log.err("could not schedule timer to rotate vault secret references: ", err)
self.log.err("could not schedule timer to rotate vault secret references on crud event: ", err)
end
end

Expand Down Expand Up @@ -1343,6 +1421,11 @@ local function new(self)
if err then
self.log.err("could not schedule timer to rotate vault secret references: ", err)
end

local _, err = self.timer:named_at("secret-rotation-on-init", 0, rotate_secrets_timer, true)
if err then
self.log.err("could not schedule timer to rotate vault secret references on init: ", err)
end
end


Expand Down
Loading

0 comments on commit 4d3cd10

Please sign in to comment.