Skip to content

Commit

Permalink
fix(bk-cache-fallback/init.lua): if acquire lock timeout, try to get … (
Browse files Browse the repository at this point in the history
  • Loading branch information
wklken authored Nov 13, 2024
1 parent 2c82c0e commit 55eb657
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 9 deletions.
29 changes: 24 additions & 5 deletions src/apisix/plugins/bk-cache-fallback/init.lua
Original file line number Diff line number Diff line change
Expand Up @@ -134,13 +134,32 @@ function _M.get_with_fallback(self, ctx, key, version, create_obj_func, ...)
local key_s = cache_key
log.info("try to lock with key ", key_s)

-- FIXME: possible problem here, if high concurrent, all requests may wait here except one
-- and at that time, process one by one after the retrieve finished
-- maybe some requests will timeout?
-- check the sentry and error log after this version online
-- NOTE: possible problem here, if high concurrent, all requests may wait here except one
-- and at that time, process one by one after the retrieve finished, some requests will timeout?
local elapsed, lock_err = lock:lock(key_s)
if not elapsed then
return nil, "failed to acquire the bk-cache-fallback lock, err: " .. lock_err
-- FIXME: if lock err not timeout, should we use the cache data in shared_dict too? (Very rarely)
if lock_err ~= "timeout" then
return nil, "failed to acquire the bk-cache-fallback lock, key: " .. key_s .. ", err: " .. lock_err
end

-- NOTE: 2024-11-11 we met some timeout here, in the same apisix pod, the same cache_key,
-- the lock aquire timeout, then cause all responses fail at that time!
-- So: we should try to use the fallback shared_dict data here
local shared_data_dict = ngx_shared[self.plugin_shared_dict_name]
if shared_data_dict ~= nil then
local sd = shared_data_dict:get(cache_key)
if sd ~= nil then
local obj_decoded, json_err = core.json.decode(sd)
if json_err == nil then
log.error("failed to acquire the bk-cache-fallback lock, key: " .. key_s ..
", fallback to get the data from shared_dict")
return obj_decoded, nil
end
end
end

return nil, "failed to acquire the bk-cache-fallback lock, key: " .. key_s .. ", error: timeout."
end

-- TODO: 函数过长, 需要考虑拆分, 特别是 unlock 特别多, 也容易出问题
Expand Down
2 changes: 1 addition & 1 deletion src/apisix/plugins/bk-permission.lua
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ function _M.init()
lrucache_max_items = 10240, -- key is gateway+resource+app_code, so maybe a lot of keys
lrucache_ttl = 60,
lrucache_short_ttl = 2, -- cache failed response for 2 seconds
fallback_cache_ttl = 60 * 60 * 12, -- if core-api is down, use the fallback cache data generated in 12 hours
fallback_cache_ttl = 60 * 60 * 24, -- if core-api is down, use the fallback cache data generated in 24 hours
}, plugin_name
)
end
Expand Down
119 changes: 119 additions & 0 deletions src/apisix/tests/bk-cache-fallback/test-init.lua
Original file line number Diff line number Diff line change
Expand Up @@ -313,5 +313,124 @@ describe(
end
)

-- a new case here, to test the lock:lock timeout and other errors
describe("lock:lock timeout and other errors", function()
local key
local f

before_each(function ()
key = "foo"

f = {
create_obj_func_ok = function() end,
create_obj_func_fail = function() end,
}

stub(f, "create_obj_func_ok", function ()
return {
["hello"] = "world",
}, nil
end)
stub(f, "create_obj_func_fail", function ()
return nil, "error"
end)

end)

after_each(function ()
f.create_obj_func_ok:revert()
f.create_obj_func_fail:revert()
end)

it("is not timeout error, should return the error", function ()
local ctx = {
conf_id = 789,
conf_type = "test",
}
local cache = cache_fallback.new(conf, "bk-cache-fallback-lock-error")
local cache_key = cache.key_prefix .. key

local shared_data_dict = ngx_shared[cache.plugin_shared_dict_name]
assert.is_not_nil(shared_data_dict)

-- Stub lock:lock to simulate an error (not timeout)
local lock = require("resty.lock")
stub(lock, "lock", function(self, key_s)
if key_s == cache_key then
return nil, "some error"
else
return 0, nil
end
end)

local obj, err = cache:get_with_fallback(ctx, key, nil, f.create_obj_func_fail)
assert.is_nil(obj)
assert.is_not_nil(err)
assert.equal("failed to acquire the bk-cache-fallback lock, key: bk-cache-fallback-lock-error:foo, err: some error", err)

lock.lock:revert()
end)

it("is timeout, no data in shared_dict, return the error", function ()
local ctx = {
conf_id = 789,
conf_type = "test",
}
local cache = cache_fallback.new(conf, "bk-cache-fallback-lock-timeout")
local cache_key = cache.key_prefix .. key

local shared_data_dict = ngx_shared[cache.plugin_shared_dict_name]
assert.is_not_nil(shared_data_dict)

-- Stub lock:lock to simulate a timeout
local lock = require("resty.lock")
stub(lock, "lock", function(self, key_s)
if key_s == cache_key then
return nil, "timeout"
else
return 0, nil
end
end)

local obj, err = cache:get_with_fallback(ctx, key, nil, f.create_obj_func_fail)
assert.is_nil(obj)
assert.is_not_nil(err)
assert.equal("failed to acquire the bk-cache-fallback lock, key: bk-cache-fallback-lock-timeout:foo, error: timeout.", err)

lock.lock:revert()
end)

it("is timeout, got the data in shared_dict, return data,nil", function ()
local ctx = {
conf_id = 789,
conf_type = "test",
}
local cache = cache_fallback.new(conf, "bk-cache-fallback-lock-timeout-data")
local cache_key = cache.key_prefix .. key

local shared_data_dict = ngx_shared[cache.plugin_shared_dict_name]
assert.is_not_nil(shared_data_dict)

-- set data in shared_dict
shared_data_dict:set(cache_key, '{"hello":"abc"}', 60 * 60)

-- Stub lock:lock to simulate a timeout
local lock = require("resty.lock")
stub(lock, "lock", function(self, key_s)
if key_s == cache_key then
return nil, "timeout"
else
return 0, nil
end
end)

local obj, err = cache:get_with_fallback(ctx, key, nil, f.create_obj_func_ok)
assert.is_not_nil(obj)
assert.is_nil(err)
assert.equal("abc", obj["hello"])

lock.lock:revert()
end)
end)
end
)
7 changes: 5 additions & 2 deletions src/apisix/tests/conf/nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ lua_shared_dict prometheus-metrics 1m;
lua_shared_dict plugin-bk-permission 1m;
lua_shared_dict plugin-bk-cache-fallback 1m;
lua_shared_dict plugin-bk-cache-fallback-lock 1m;
# for unittest, bk-cache-fallback/init.lua, case ok/fail/ok->fail
# for unittest, bk-cache-fallback/init.lua, case ok/fail/ok->fail/lock fail
lua_shared_dict plugin-bk-cache-fallback-ok 1m;
lua_shared_dict plugin-bk-cache-fallback-fail 1m;
lua_shared_dict plugin-bk-cache-fallback-ok-fail 1m;
lua_shared_dict plugin-bk-cache-fallback-lock-error 1m;
lua_shared_dict plugin-bk-cache-fallback-lock-timeout 1m;
lua_shared_dict plugin-bk-cache-fallback-lock-timeout-data 1m;

error_log /bkgateway/logs/error.log;
error_log /bkgateway/logs/error.log;
2 changes: 1 addition & 1 deletion src/apisix/tests/test-bk-permission.lua
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ describe(
assert.equal(10240, cache.lrucache_max_items)
assert.equal(60, cache.lrucache_ttl)
assert.equal(2, cache.lrucache_short_ttl)
assert.equal(43200, cache.fallback_cache_ttl)
assert.equal(86400, cache.fallback_cache_ttl)

end
)
Expand Down

0 comments on commit 55eb657

Please sign in to comment.