From 55eb657f4696409c5a1d09c830d12563834b3bfe Mon Sep 17 00:00:00 2001 From: wklken Date: Wed, 13 Nov 2024 14:29:37 +0800 Subject: [PATCH] =?UTF-8?q?fix(bk-cache-fallback/init.lua):=20if=20acquire?= =?UTF-8?q?=20lock=20timeout,=20try=20to=20get=20=E2=80=A6=20(#87)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/apisix/plugins/bk-cache-fallback/init.lua | 29 ++++- src/apisix/plugins/bk-permission.lua | 2 +- .../tests/bk-cache-fallback/test-init.lua | 119 ++++++++++++++++++ src/apisix/tests/conf/nginx.conf | 7 +- src/apisix/tests/test-bk-permission.lua | 2 +- 5 files changed, 150 insertions(+), 9 deletions(-) diff --git a/src/apisix/plugins/bk-cache-fallback/init.lua b/src/apisix/plugins/bk-cache-fallback/init.lua index 7ac5a84..209522b 100644 --- a/src/apisix/plugins/bk-cache-fallback/init.lua +++ b/src/apisix/plugins/bk-cache-fallback/init.lua @@ -134,13 +134,32 @@ function _M.get_with_fallback(self, ctx, key, version, create_obj_func, ...) local key_s = cache_key log.info("try to lock with key ", key_s) - -- FIXME: possible problem here, if high concurrent, all requests may wait here except one - -- and at that time, process one by one after the retrieve finished - -- maybe some requests will timeout? - -- check the sentry and error log after this version online + -- NOTE: possible problem here, if high concurrent, all requests may wait here except one + -- and at that time, process one by one after the retrieve finished, some requests will timeout? local elapsed, lock_err = lock:lock(key_s) if not elapsed then - return nil, "failed to acquire the bk-cache-fallback lock, err: " .. lock_err + -- FIXME: if lock err not timeout, should we use the cache data in shared_dict too? (Very rarely) + if lock_err ~= "timeout" then + return nil, "failed to acquire the bk-cache-fallback lock, key: " .. key_s .. ", err: " .. lock_err + end + + -- NOTE: 2024-11-11 we met some timeout here, in the same apisix pod, the same cache_key, + -- the lock aquire timeout, then cause all responses fail at that time! + -- So: we should try to use the fallback shared_dict data here + local shared_data_dict = ngx_shared[self.plugin_shared_dict_name] + if shared_data_dict ~= nil then + local sd = shared_data_dict:get(cache_key) + if sd ~= nil then + local obj_decoded, json_err = core.json.decode(sd) + if json_err == nil then + log.error("failed to acquire the bk-cache-fallback lock, key: " .. key_s .. + ", fallback to get the data from shared_dict") + return obj_decoded, nil + end + end + end + + return nil, "failed to acquire the bk-cache-fallback lock, key: " .. key_s .. ", error: timeout." end -- TODO: 函数过长, 需要考虑拆分, 特别是 unlock 特别多, 也容易出问题 diff --git a/src/apisix/plugins/bk-permission.lua b/src/apisix/plugins/bk-permission.lua index a9eb6b1..db1f048 100644 --- a/src/apisix/plugins/bk-permission.lua +++ b/src/apisix/plugins/bk-permission.lua @@ -80,7 +80,7 @@ function _M.init() lrucache_max_items = 10240, -- key is gateway+resource+app_code, so maybe a lot of keys lrucache_ttl = 60, lrucache_short_ttl = 2, -- cache failed response for 2 seconds - fallback_cache_ttl = 60 * 60 * 12, -- if core-api is down, use the fallback cache data generated in 12 hours + fallback_cache_ttl = 60 * 60 * 24, -- if core-api is down, use the fallback cache data generated in 24 hours }, plugin_name ) end diff --git a/src/apisix/tests/bk-cache-fallback/test-init.lua b/src/apisix/tests/bk-cache-fallback/test-init.lua index 429010f..8eb4d12 100644 --- a/src/apisix/tests/bk-cache-fallback/test-init.lua +++ b/src/apisix/tests/bk-cache-fallback/test-init.lua @@ -313,5 +313,124 @@ describe( end ) + -- a new case here, to test the lock:lock timeout and other errors + describe("lock:lock timeout and other errors", function() + local key + local f + + before_each(function () + key = "foo" + + f = { + create_obj_func_ok = function() end, + create_obj_func_fail = function() end, + } + + stub(f, "create_obj_func_ok", function () + return { + ["hello"] = "world", + }, nil + end) + stub(f, "create_obj_func_fail", function () + return nil, "error" + end) + + end) + + after_each(function () + f.create_obj_func_ok:revert() + f.create_obj_func_fail:revert() + end) + + it("is not timeout error, should return the error", function () + local ctx = { + conf_id = 789, + conf_type = "test", + } + local cache = cache_fallback.new(conf, "bk-cache-fallback-lock-error") + local cache_key = cache.key_prefix .. key + + local shared_data_dict = ngx_shared[cache.plugin_shared_dict_name] + assert.is_not_nil(shared_data_dict) + + -- Stub lock:lock to simulate an error (not timeout) + local lock = require("resty.lock") + stub(lock, "lock", function(self, key_s) + if key_s == cache_key then + return nil, "some error" + else + return 0, nil + end + end) + + local obj, err = cache:get_with_fallback(ctx, key, nil, f.create_obj_func_fail) + assert.is_nil(obj) + assert.is_not_nil(err) + assert.equal("failed to acquire the bk-cache-fallback lock, key: bk-cache-fallback-lock-error:foo, err: some error", err) + + lock.lock:revert() + end) + + it("is timeout, no data in shared_dict, return the error", function () + local ctx = { + conf_id = 789, + conf_type = "test", + } + local cache = cache_fallback.new(conf, "bk-cache-fallback-lock-timeout") + local cache_key = cache.key_prefix .. key + + local shared_data_dict = ngx_shared[cache.plugin_shared_dict_name] + assert.is_not_nil(shared_data_dict) + + -- Stub lock:lock to simulate a timeout + local lock = require("resty.lock") + stub(lock, "lock", function(self, key_s) + if key_s == cache_key then + return nil, "timeout" + else + return 0, nil + end + end) + + local obj, err = cache:get_with_fallback(ctx, key, nil, f.create_obj_func_fail) + assert.is_nil(obj) + assert.is_not_nil(err) + assert.equal("failed to acquire the bk-cache-fallback lock, key: bk-cache-fallback-lock-timeout:foo, error: timeout.", err) + + lock.lock:revert() + end) + + it("is timeout, got the data in shared_dict, return data,nil", function () + local ctx = { + conf_id = 789, + conf_type = "test", + } + local cache = cache_fallback.new(conf, "bk-cache-fallback-lock-timeout-data") + local cache_key = cache.key_prefix .. key + + local shared_data_dict = ngx_shared[cache.plugin_shared_dict_name] + assert.is_not_nil(shared_data_dict) + + -- set data in shared_dict + shared_data_dict:set(cache_key, '{"hello":"abc"}', 60 * 60) + + -- Stub lock:lock to simulate a timeout + local lock = require("resty.lock") + stub(lock, "lock", function(self, key_s) + if key_s == cache_key then + return nil, "timeout" + else + return 0, nil + end + end) + + local obj, err = cache:get_with_fallback(ctx, key, nil, f.create_obj_func_ok) + assert.is_not_nil(obj) + assert.is_nil(err) + assert.equal("abc", obj["hello"]) + + lock.lock:revert() + end) + end) end ) diff --git a/src/apisix/tests/conf/nginx.conf b/src/apisix/tests/conf/nginx.conf index ff4f749..ea3dee6 100644 --- a/src/apisix/tests/conf/nginx.conf +++ b/src/apisix/tests/conf/nginx.conf @@ -4,9 +4,12 @@ lua_shared_dict prometheus-metrics 1m; lua_shared_dict plugin-bk-permission 1m; lua_shared_dict plugin-bk-cache-fallback 1m; lua_shared_dict plugin-bk-cache-fallback-lock 1m; -# for unittest, bk-cache-fallback/init.lua, case ok/fail/ok->fail +# for unittest, bk-cache-fallback/init.lua, case ok/fail/ok->fail/lock fail lua_shared_dict plugin-bk-cache-fallback-ok 1m; lua_shared_dict plugin-bk-cache-fallback-fail 1m; lua_shared_dict plugin-bk-cache-fallback-ok-fail 1m; +lua_shared_dict plugin-bk-cache-fallback-lock-error 1m; +lua_shared_dict plugin-bk-cache-fallback-lock-timeout 1m; +lua_shared_dict plugin-bk-cache-fallback-lock-timeout-data 1m; -error_log /bkgateway/logs/error.log; \ No newline at end of file +error_log /bkgateway/logs/error.log; diff --git a/src/apisix/tests/test-bk-permission.lua b/src/apisix/tests/test-bk-permission.lua index 332816a..c3cff88 100644 --- a/src/apisix/tests/test-bk-permission.lua +++ b/src/apisix/tests/test-bk-permission.lua @@ -75,7 +75,7 @@ describe( assert.equal(10240, cache.lrucache_max_items) assert.equal(60, cache.lrucache_ttl) assert.equal(2, cache.lrucache_short_ttl) - assert.equal(43200, cache.fallback_cache_ttl) + assert.equal(86400, cache.fallback_cache_ttl) end )