diff --git a/kong/llm/drivers/bedrock.lua b/kong/llm/drivers/bedrock.lua index a32ad5120e6..5f7ddce5119 100644 --- a/kong/llm/drivers/bedrock.lua +++ b/kong/llm/drivers/bedrock.lua @@ -9,6 +9,7 @@ local string_gsub = string.gsub local table_insert = table.insert local string_lower = string.lower local signer = require("resty.aws.request.sign") +local llm_state = require("kong.llm.state") -- -- globals @@ -381,7 +382,7 @@ end -- returns err or nil function _M.configure_request(conf, aws_sdk) - local operation = kong.ctx.shared.ai_proxy_streaming_mode and "converse-stream" + local operation = llm_state.is_streaming_mode() and "converse-stream" or "converse" local f_url = conf.model.options and conf.model.options.upstream_url diff --git a/kong/llm/drivers/gemini.lua b/kong/llm/drivers/gemini.lua index 0a68a0af8e1..d386961997f 100644 --- a/kong/llm/drivers/gemini.lua +++ b/kong/llm/drivers/gemini.lua @@ -9,6 +9,7 @@ local string_gsub = string.gsub local buffer = require("string.buffer") local table_insert = table.insert local string_lower = string.lower +local llm_state = require("kong.llm.state") -- -- globals @@ -338,7 +339,7 @@ end -- returns err or nil function _M.configure_request(conf, identity_interface) local parsed_url - local operation = kong.ctx.shared.ai_proxy_streaming_mode and "streamGenerateContent" + local operation = llm_state.is_streaming_mode() and "streamGenerateContent" or "generateContent" local f_url = conf.model.options and conf.model.options.upstream_url diff --git a/kong/llm/drivers/shared.lua b/kong/llm/drivers/shared.lua index 15d9ce7e62f..b9fa994934b 100644 --- a/kong/llm/drivers/shared.lua +++ b/kong/llm/drivers/shared.lua @@ -612,12 +612,12 @@ function _M.post_request(conf, response_object) if kong.ctx.plugin[start_time_key] then local llm_latency = math.floor((ngx.now() - kong.ctx.plugin[start_time_key]) * 1000) request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.LLM_LATENCY] = llm_latency - kong.ctx.shared.ai_request_latency = llm_latency + llm_state.set_metrics("e2e_latency", llm_latency) if response_object.usage and response_object.usage.completion_tokens then local time_per_token = math.floor(llm_latency / response_object.usage.completion_tokens) request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TIME_PER_TOKEN] = time_per_token - kong.ctx.shared.ai_request_time_per_token = time_per_token + llm_state.set_metrics("tpot_latency", time_per_token) end end diff --git a/kong/llm/state.lua b/kong/llm/state.lua index fa2c29edf8c..1ba0eb52e74 100644 --- a/kong/llm/state.lua +++ b/kong/llm/state.lua @@ -94,4 +94,14 @@ function _M.get_response_tokens_count() return kong.ctx.shared.llm_response_tokens_count end -return _M \ No newline at end of file +function _M.set_metrics(key, value) + local m = kong.ctx.shared.llm_metrics or {} + m[key] = value + kong.ctx.shared.llm_metrics = m +end + +function _M.get_metrics(key) + return (kong.ctx.shared.llm_metrics or {})[key] +end + +return _M