Skip to content

Commit

Permalink
fix(dns): fix retry and timeout handling (#11386)
Browse files Browse the repository at this point in the history
- Stop retrying in dns/client.lua, let the resolver handle this.  This
   change also makes it possible to disable retries, which previously
   was not possible
 - Be more faithful to the timeouts set by the user.  Previously, the
   timeout configured was used only for the ultimate request sent to
   the DNS server, but asynchronous requests allowed longer timeouts
   which was not transparent.
 - When the DNS server fails, stop trying other query types.  Previously,
   the behavior was such that after an (intermediate) failure to query
   for one record type (say "SRV"), the client would try the next record
   type (say "A") and succeed with that.  It would then return the
   contents of the "A" record even if the "SRV" record pointed to a
   different address.
 - Change domain names used for testing the DNS client into the
   kong-gateway-testing.link zone, which is controlled by the Kong Gateway
   team.

Fixes #10182
KAG-2300

Signed-off-by: Xiaochen Wang <[email protected]>
  • Loading branch information
hanshuebner authored and dndx committed Nov 24, 2023
1 parent 1a97e63 commit 7315817
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 82 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
message: Update the DNS client to follow configured timeouts in a more predictable manner
type: bugfix
scope: Core
95 changes: 39 additions & 56 deletions kong/resty/dns/client.lua
Original file line number Diff line number Diff line change
Expand Up @@ -360,9 +360,9 @@ end
-- @param self the try_list to add to
-- @param status string with current status, added to the list for the current try
-- @return the try_list
local function try_status(self, status)
local status_list = self[#self].msg
status_list[#status_list + 1] = status
local function add_status_to_try_list(self, status)
local try_list = self[#self].msg
try_list[#try_list + 1] = status
return self
end

Expand All @@ -383,8 +383,7 @@ end
-- @section resolving


local poolMaxWait
local poolMaxRetry
local resolve_max_wait

--- Initialize the client. Can be called multiple times. When called again it
-- will clear the cache.
Expand Down Expand Up @@ -638,8 +637,7 @@ _M.init = function(options)

config = options -- store it in our module level global

poolMaxRetry = 1 -- do one retry, dns resolver is already doing 'retrans' number of retries on top
poolMaxWait = options.timeout / 1000 * options.retrans -- default is to wait for the dns resolver to hit its timeouts
resolve_max_wait = options.timeout / 1000 * options.retrans -- maximum time to wait for the dns resolver to hit its timeouts

return true
end
Expand Down Expand Up @@ -672,7 +670,7 @@ local function parseAnswer(qname, qtype, answers, try_list)

if (answer.type ~= qtype) or (answer.name ~= check_qname) then
local key = answer.type..":"..answer.name
try_status(try_list, key .. " removed")
add_status_to_try_list(try_list, key .. " removed")
local lst = others[key]
if not lst then
lst = {}
Expand Down Expand Up @@ -710,7 +708,7 @@ local function individualQuery(qname, r_opts, try_list)
return r, "failed to create a resolver: " .. err, try_list
end

try_status(try_list, "querying")
add_status_to_try_list(try_list, "querying")

local result
result, err = r:query(qname, r_opts)
Expand All @@ -737,7 +735,7 @@ local function executeQuery(premature, item)
--[[
log(DEBUG, PREFIX, "Query executing: ", item.qname, ":", item.r_opts.qtype, " ", fquery(item))
--]]
try_status(item.try_list, "querying")
add_status_to_try_list(item.try_list, "querying")
item.result, item.err = r:query(item.qname, item.r_opts)
if item.result then
--[[
Expand Down Expand Up @@ -779,7 +777,7 @@ local function asyncQuery(qname, r_opts, try_list)
--[[
log(DEBUG, PREFIX, "Query async (exists): ", key, " ", fquery(item))
--]]
try_status(try_list, "in progress (async)")
add_status_to_try_list(try_list, "in progress (async)")
return item -- already in progress, return existing query
end

Expand All @@ -801,41 +799,32 @@ local function asyncQuery(qname, r_opts, try_list)
--[[
log(DEBUG, PREFIX, "Query async (scheduled): ", key, " ", fquery(item))
--]]
try_status(try_list, "scheduled")
add_status_to_try_list(try_list, "scheduled")

return item
end


-- schedules a sync query.
-- This will be synchronized, so multiple calls (sync or async) might result in 1 query.
-- The `poolMaxWait` is how long a thread waits for another to complete the query.
-- The `poolMaxRetry` is how often we wait for another query to complete.
-- The maximum delay would be `poolMaxWait * poolMaxRetry`.
-- The maximum delay would be `options.timeout * options.retrans`.
-- @param qname the name to query for
-- @param r_opts a table with the query options
-- @param try_list the try_list object to add to
-- @return `result + nil + try_list`, or `nil + err + try_list` in case of errors
local function syncQuery(qname, r_opts, try_list, count)
local function syncQuery(qname, r_opts, try_list)
local key = qname..":"..r_opts.qtype
local item = queue[key]
count = count or 1

-- if nothing is in progress, we start a new async query
if not item then
local err
item, err = asyncQuery(qname, r_opts, try_list)
--[[
log(DEBUG, PREFIX, "Query sync (new): ", key, " ", fquery(item)," count=", count)
--]]
if not item then
return item, err, try_list
end
else
--[[
log(DEBUG, PREFIX, "Query sync (exists): ", key, " ", fquery(item)," count=", count)
--]]
try_status(try_list, "in progress (sync)")
add_status_to_try_list(try_list, "in progress (sync)")
end

local supported_semaphore_wait_phases = {
Expand All @@ -860,7 +849,7 @@ local function syncQuery(qname, r_opts, try_list, count)
end

-- block and wait for the async query to complete
local ok, err = item.semaphore:wait(poolMaxWait)
local ok, err = item.semaphore:wait(resolve_max_wait)
if ok and item.result then
-- we were released, and have a query result from the
-- other thread, so all is well, return it
Expand All @@ -871,25 +860,16 @@ local function syncQuery(qname, r_opts, try_list, count)
return item.result, item.err, try_list
end

-- there was an error, either a semaphore timeout, or a lookup error
-- go retry
try_status(try_list, "try "..count.." error: "..(item.err or err or "unknown"))
if count > poolMaxRetry then
--[[
log(DEBUG, PREFIX, "Query sync (fail): ", key, " ", fquery(item)," retries exceeded. count=", count)
--]]
return nil, "dns lookup pool exceeded retries (" ..
tostring(poolMaxRetry) .. "): "..tostring(item.err or err),
try_list
end
err = err or item.err or "unknown"
add_status_to_try_list(try_list, "error: "..err)

-- don't block on the same thread again, so remove it from the queue
if queue[key] == item then queue[key] = nil end
if queue[key] == item then
queue[key] = nil
end

--[[
log(DEBUG, PREFIX, "Query sync (fail): ", key, " ", fquery(item)," retrying. count=", count)
--]]
return syncQuery(qname, r_opts, try_list, count + 1)
-- there was an error, either a semaphore timeout, or a lookup error
return nil, err
end

-- will lookup a name in the cache, or alternatively query the nameservers.
Expand Down Expand Up @@ -928,7 +908,7 @@ local function lookup(qname, r_opts, dnsCacheOnly, try_list)
try_list = try_add(try_list, qname, r_opts.qtype, "cache-hit")
if entry.expired then
-- the cached record is stale but usable, so we do a refresh query in the background
try_status(try_list, "stale")
add_status_to_try_list(try_list, "stale")
asyncQuery(qname, r_opts, try_list)
end

Expand All @@ -946,7 +926,7 @@ local function check_ipv6(qname, qtype, try_list)

local record = cachelookup(qname, qtype)
if record then
try_status(try_list, "cached")
add_status_to_try_list(try_list, "cached")
return record, nil, try_list
end

Expand All @@ -966,7 +946,7 @@ local function check_ipv6(qname, qtype, try_list)
end
if qtype == _M.TYPE_AAAA and
check:match("^%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?$") then
try_status(try_list, "validated")
add_status_to_try_list(try_list, "validated")
record = {{
address = qname,
type = _M.TYPE_AAAA,
Expand All @@ -978,7 +958,7 @@ local function check_ipv6(qname, qtype, try_list)
else
-- not a valid IPv6 address, or a bad type (non ipv6)
-- return a "server error"
try_status(try_list, "bad IPv6")
add_status_to_try_list(try_list, "bad IPv6")
record = {
errcode = 3,
errstr = "name error",
Expand All @@ -999,12 +979,12 @@ local function check_ipv4(qname, qtype, try_list)

local record = cachelookup(qname, qtype)
if record then
try_status(try_list, "cached")
add_status_to_try_list(try_list, "cached")
return record, nil, try_list
end

if qtype == _M.TYPE_A then
try_status(try_list, "validated")
add_status_to_try_list(try_list, "validated")
record = {{
address = qname,
type = _M.TYPE_A,
Expand All @@ -1016,7 +996,7 @@ local function check_ipv4(qname, qtype, try_list)
else
-- bad query type for this ipv4 address
-- return a "server error"
try_status(try_list, "bad IPv4")
add_status_to_try_list(try_list, "bad IPv4")
record = {
errcode = 3,
errstr = "name error",
Expand Down Expand Up @@ -1160,7 +1140,7 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list)
records = nil
-- luacheck: pop
err = "recursion detected"
try_status(try_list, "recursion detected")
add_status_to_try_list(try_list, "recursion detected")
return nil, err, try_list
end
end
Expand All @@ -1172,14 +1152,14 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list)
-- luacheck: push no unused
records = nil
-- luacheck: pop
try_list = try_status(try_list, "stale")
try_list = add_status_to_try_list(try_list, "stale")

else
-- a valid non-stale record
-- check for CNAME records, and dereferencing the CNAME
if (records[1] or EMPTY).type == _M.TYPE_CNAME and qtype ~= _M.TYPE_CNAME then
opts.qtype = nil
try_status(try_list, "dereferencing")
add_status_to_try_list(try_list, "dereferencing")
return resolve(records[1].cname, opts, dnsCacheOnly, try_list)
end

Expand Down Expand Up @@ -1227,8 +1207,10 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list)
end

if not records then -- luacheck: ignore
-- nothing to do, an error
-- fall through to the next entry in our search sequence
-- An error has occurred, terminate the lookup process. We don't want to try other record types because
-- that would potentially cause us to respond with wrong answers (i.e. the contents of an A record if the
-- query for the SRV record failed due to a network error).
goto failed

elseif records.errcode then
-- dns error: fall through to the next entry in our search sequence
Expand Down Expand Up @@ -1287,7 +1269,7 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list)
if records[1].type == _M.TYPE_CNAME and qtype ~= _M.TYPE_CNAME then
-- dereference CNAME
opts.qtype = nil
try_status(try_list, "dereferencing")
add_status_to_try_list(try_list, "dereferencing")
return resolve(records[1].cname, opts, dnsCacheOnly, try_list)
end

Expand All @@ -1296,8 +1278,9 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list)
end

-- we had some error, record it in the status list
try_status(try_list, err)
add_status_to_try_list(try_list, err)
end
::failed::

-- we failed, clear cache and return last error
if not dnsCacheOnly then
Expand Down Expand Up @@ -1507,7 +1490,7 @@ local function toip(qname, port, dnsCacheOnly, try_list)
local entry = rec[roundRobinW(rec)]
-- our SRV entry might still contain a hostname, so recurse, with found port number
local srvport = (entry.port ~= 0 and entry.port) or port -- discard port if it is 0
try_status(try_list, "dereferencing SRV")
add_status_to_try_list(try_list, "dereferencing SRV")
return toip(entry.target, srvport, dnsCacheOnly, try_list)
else
-- must be A or AAAA
Expand Down
Loading

0 comments on commit 7315817

Please sign in to comment.