
Commit

feat(ai-plugins): retrieve latency data and push it to logs and metri…
AntoineJac authored Jul 30, 2024
1 parent cb8d046 commit 9821437
Showing 7 changed files with 84 additions and 9 deletions.
3 changes: 3 additions & 0 deletions changelog/unreleased/kong/add-ai-data-latency.yml
@@ -0,0 +1,3 @@
message: "AI plugins: retrieved latency data and pushed it to logs and metrics."
type: feature
scope: "Plugin"
30 changes: 25 additions & 5 deletions kong/llm/drivers/shared.lua
@@ -40,11 +40,13 @@ local log_entry_keys = {
PROVIDER_NAME = "provider_name",
REQUEST_MODEL = "request_model",
RESPONSE_MODEL = "response_model",
LLM_LATENCY = "llm_latency",

-- usage keys
PROMPT_TOKENS = "prompt_tokens",
COMPLETION_TOKENS = "completion_tokens",
TOTAL_TOKENS = "total_tokens",
TIME_PER_TOKEN = "time_per_token",
COST = "cost",

-- cache keys
@@ -527,13 +529,14 @@ function _M.pre_request(conf, request_table)
request_table[auth_param_name] = auth_param_value
end

-- retrieve the plugin name
local plugin_name = conf.__key__:match('plugins:(.-):')
if not plugin_name or plugin_name == "" then
return nil, "no plugin name is being passed by the plugin"
end

-- if enabled AND request type is compatible, capture the input for analytics
if conf.logging and conf.logging.log_payloads then
local plugin_name = conf.__key__:match('plugins:(.-):')
if not plugin_name or plugin_name == "" then
return nil, "no plugin name is being passed by the plugin"
end

kong.log.set_serialize_value(fmt("ai.%s.%s.%s", plugin_name, log_entry_keys.PAYLOAD_CONTAINER, log_entry_keys.REQUEST_BODY), kong.request.get_raw_body())
end

@@ -547,6 +550,9 @@ function _M.pre_request(conf, request_table)
kong.ctx.shared.ai_prompt_tokens = (kong.ctx.shared.ai_prompt_tokens or 0) + prompt_tokens
end

local start_time_key = "ai_request_start_time_" .. plugin_name
kong.ctx.plugin[start_time_key] = ngx.now()

return true, nil
end

@@ -600,6 +606,20 @@ function _M.post_request(conf, response_object)
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.REQUEST_MODEL] = kong.ctx.plugin.llm_model_requested or conf.model.name
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.RESPONSE_MODEL] = response_object.model or conf.model.name

-- Set the llm latency meta, and time per token usage
local start_time_key = "ai_request_start_time_" .. plugin_name
if kong.ctx.plugin[start_time_key] then
local llm_latency = math.floor((ngx.now() - kong.ctx.plugin[start_time_key]) * 1000)
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.LLM_LATENCY] = llm_latency
kong.ctx.shared.ai_request_latency = llm_latency

if response_object.usage and response_object.usage.completion_tokens then
local time_per_token = math.floor(llm_latency / response_object.usage.completion_tokens)
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TIME_PER_TOKEN] = time_per_token
kong.ctx.shared.ai_request_time_per_token = time_per_token
end
end

-- set extra per-provider meta
if kong.ctx.plugin.ai_extra_meta and type(kong.ctx.plugin.ai_extra_meta) == "table" then
for k, v in pairs(kong.ctx.plugin.ai_extra_meta) do
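For orientation, the new fields surface in the per-plugin analytics container that shared.lua serializes onto the log entry. A minimal sketch of the resulting shape, assuming an ai-proxy request — the field names come from the diff above, the numeric values are purely illustrative:

-- illustrative only: llm_latency and time_per_token are computed per request,
-- so the numbers below are assumptions, not fixture values
local ai_log_entry = {
  ["ai-proxy"] = {                        -- keyed by the plugin name parsed from conf.__key__
    meta = {
      provider_name  = "openai",
      request_model  = "gpt-3.5-turbo",
      response_model = "gpt-3.5-turbo-0613",
      llm_latency    = 480,               -- ms: math.floor((ngx.now() - start_time) * 1000)
    },
    usage = {
      prompt_tokens     = 25,
      completion_tokens = 12,
      total_tokens      = 37,
      time_per_token    = 40,             -- math.floor(480 / 12)
      cost              = 0.00037,
    },
    cache = {},
  },
}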
14 changes: 12 additions & 2 deletions kong/plugins/prometheus/exporter.lua
@@ -17,8 +17,9 @@ local stream_available, stream_api = pcall(require, "kong.tools.stream_api")

local role = kong.configuration.role

local KONG_LATENCY_BUCKETS = { 1, 2, 5, 7, 10, 15, 20, 30, 50, 75, 100, 200, 500, 750, 1000}
local UPSTREAM_LATENCY_BUCKETS = {25, 50, 80, 100, 250, 400, 700, 1000, 2000, 5000, 10000, 30000, 60000 }
local KONG_LATENCY_BUCKETS = { 1, 2, 5, 7, 10, 15, 20, 30, 50, 75, 100, 200, 500, 750, 1000 }
local UPSTREAM_LATENCY_BUCKETS = { 25, 50, 80, 100, 250, 400, 700, 1000, 2000, 5000, 10000, 30000, 60000 }
local AI_LLM_PROVIDER_LATENCY_BUCKETS = { 250, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 10000, 30000, 60000 }
local IS_PROMETHEUS_ENABLED


@@ -157,6 +158,11 @@ local function init()
"AI requests cost per ai_provider/cache in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "token_type", "workspace"})

metrics.ai_llm_provider_latency = prometheus:histogram("ai_llm_provider_latency_ms",
"LLM response Latency for each AI plugins per ai_provider in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "workspace"},
AI_LLM_PROVIDER_LATENCY_BUCKETS)

-- Hybrid mode status
if role == "control_plane" then
metrics.data_plane_last_seen = prometheus:gauge("data_plane_last_seen",
@@ -349,6 +355,10 @@ local function log(message, serialized)
metrics.ai_llm_cost:inc(ai_plugin.usage.cost, labels_table_ai_llm_status)
end

if ai_plugin.meta.llm_latency and ai_plugin.meta.llm_latency > 0 then
metrics.ai_llm_provider_latency:observe(ai_plugin.meta.llm_latency, labels_table_ai_llm_status)
end

labels_table_ai_llm_tokens[1] = ai_plugin.meta.provider_name
labels_table_ai_llm_tokens[2] = ai_plugin.meta.request_model
labels_table_ai_llm_tokens[3] = cache_status
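A condensed sketch of the observation path added above, assuming the same prometheus client API this exporter already uses (the real code reuses labels_table_ai_llm_status, which also carries cache, vector DB, and embeddings labels; the reduced label set here is a simplification):

-- sketch of registration plus per-request observation with a trimmed label set
local AI_LLM_PROVIDER_LATENCY_BUCKETS = { 250, 500, 1000, 2000, 5000, 10000, 30000, 60000 }

local ai_llm_provider_latency = prometheus:histogram("ai_llm_provider_latency_ms",
  "LLM response latency per ai_provider in Kong",
  { "ai_provider", "ai_model", "workspace" },
  AI_LLM_PROVIDER_LATENCY_BUCKETS)

-- called from log() for each entry in the serialized message.ai table
local function observe_ai_latency(ai_plugin, workspace)
  if ai_plugin.meta.llm_latency and ai_plugin.meta.llm_latency > 0 then
    ai_llm_provider_latency:observe(ai_plugin.meta.llm_latency,
      { ai_plugin.meta.provider_name, ai_plugin.meta.response_model, workspace })
  end
end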
6 changes: 6 additions & 0 deletions spec/03-plugins/26-prometheus/02-access_spec.lua
@@ -768,6 +768,7 @@ describe("Plugin: prometheus (access) AI metrics", function()
assert.not_match('ai_llm_requests_total', body, nil, true)
assert.not_match('ai_llm_cost_total', body, nil, true)
assert.not_match('ai_llm_tokens_total', body, nil, true)
assert.not_match('ai_llm_provider_latency_ms_bucket', body, nil, true)
end)

it("update prometheus plugin config", function()
@@ -829,6 +830,8 @@

assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00037', body, nil, true)

assert.matches('ai_llm_provider_latency_ms_bucket{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default",le="+Inf"} 1', body, nil, true)

assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="completion_tokens",workspace="default"} 12', body, nil, true)
assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="prompt_tokens",workspace="default"} 25', body, nil, true)
assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="total_tokens",workspace="default"} 37', body, nil, true)
@@ -865,6 +868,8 @@

assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00074', body, nil, true)

assert.matches('ai_llm_provider_latency_ms_bucket{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default",le="+Inf"} 2', body, nil, true)

assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="completion_tokens",workspace="default"} 24', body, nil, true)
assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="prompt_tokens",workspace="default"} 50', body, nil, true)
assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="total_tokens",workspace="default"} 74', body, nil, true)
@@ -895,5 +900,6 @@

assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 2', body, nil, true)
assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00074', body, nil, true)
assert.matches('ai_llm_provider_latency_ms_bucket{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default",le="+Inf"} 2', body, nil, true)
end)
end)
12 changes: 12 additions & 0 deletions spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
@@ -44,11 +44,13 @@ local _EXPECTED_CHAT_STATS = {
provider_name = 'openai',
request_model = 'gpt-3.5-turbo',
response_model = 'gpt-3.5-turbo-0613',
llm_latency = 1
},
usage = {
prompt_tokens = 25,
completion_tokens = 12,
total_tokens = 37,
time_per_token = 1,
cost = 0.00037,
},
cache = {}
@@ -713,7 +715,17 @@ for _, strategy in helpers.all_strategies() do if strategy ~= "cassandra" then
assert.is_number(log_message.response.size)

-- test ai-proxy stats
local actual_chat_stats = log_message.ai
local actual_llm_latency = actual_chat_stats["ai-proxy"].meta.llm_latency
local actual_time_per_token = actual_chat_stats["ai-proxy"].usage.time_per_token
local time_per_token = math.floor(actual_llm_latency / actual_chat_stats["ai-proxy"].usage.completion_tokens)

log_message.ai["ai-proxy"].meta.llm_latency = 1
log_message.ai["ai-proxy"].usage.time_per_token = 1

assert.same(_EXPECTED_CHAT_STATS, log_message.ai)
assert.is_true(actual_llm_latency > 0)
assert.same(actual_time_per_token, time_per_token)
end)

it("does not log statistics", function()
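Because llm_latency and time_per_token vary from run to run, the spec above captures the real values, overwrites them with the constant 1 used in _EXPECTED_CHAT_STATS, and then asserts them separately. A hedged worked example of the relationship being checked, using the fixture's completion_tokens = 12 and an assumed 480 ms provider latency:

-- 480 ms is an assumed latency; 12 completion tokens comes from the test fixture
local llm_latency = 480
local completion_tokens = 12
local time_per_token = math.floor(llm_latency / completion_tokens)
assert(llm_latency > 0)
assert(time_per_token == 40)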
@@ -124,11 +124,13 @@ local _EXPECTED_CHAT_STATS = {
provider_name = 'openai',
request_model = 'gpt-4',
response_model = 'gpt-3.5-turbo-0613',
llm_latency = 1
},
usage = {
prompt_tokens = 25,
completion_tokens = 12,
total_tokens = 37,
time_per_token = 1,
cost = 0.00037,
},
cache = {}
@@ -295,8 +297,18 @@ for _, strategy in helpers.all_strategies() do if strategy ~= "cassandra" then
assert.is_number(log_message.request.size)
assert.is_number(log_message.response.size)

-- test ai-proxy stats
-- test ai-request-transformer stats
local actual_chat_stats = log_message.ai
local actual_llm_latency = actual_chat_stats["ai-request-transformer"].meta.llm_latency
local actual_time_per_token = actual_chat_stats["ai-request-transformer"].usage.time_per_token
local time_per_token = math.floor(actual_llm_latency / actual_chat_stats["ai-request-transformer"].usage.completion_tokens)

log_message.ai["ai-request-transformer"].meta.llm_latency = 1
log_message.ai["ai-request-transformer"].usage.time_per_token = 1

assert.same(_EXPECTED_CHAT_STATS, log_message.ai)
assert.is_true(actual_llm_latency > 0)
assert.same(actual_time_per_token, time_per_token)
end)

it("bad request from LLM", function()
@@ -181,11 +181,13 @@ local _EXPECTED_CHAT_STATS = {
provider_name = 'openai',
request_model = 'gpt-4',
response_model = 'gpt-3.5-turbo-0613',
llm_latency = 1
},
usage = {
prompt_tokens = 25,
completion_tokens = 12,
total_tokens = 37,
time_per_token = 1,
cost = 0.00037,
},
cache = {}
@@ -424,8 +426,18 @@ for _, strategy in helpers.all_strategies() do if strategy ~= "cassandra" then
assert.is_number(log_message.request.size)
assert.is_number(log_message.response.size)

-- test ai-proxy stats
-- test ai-response-transformer stats
local actual_chat_stats = log_message.ai
local actual_llm_latency = actual_chat_stats["ai-response-transformer"].meta.llm_latency
local actual_time_per_token = actual_chat_stats["ai-response-transformer"].usage.time_per_token
local time_per_token = math.floor(actual_llm_latency / actual_chat_stats["ai-response-transformer"].usage.completion_tokens)

log_message.ai["ai-response-transformer"].meta.llm_latency = 1
log_message.ai["ai-response-transformer"].usage.time_per_token = 1

assert.same(_EXPECTED_CHAT_STATS, log_message.ai)
assert.is_true(actual_llm_latency > 0)
assert.same(actual_time_per_token, time_per_token)
end)

it("fails properly when json instructions are bad", function()

1 comment on commit 9821437

@github-actions


Bazel Build

Docker image available kong/kong:982143769aa5bd271307fa70840051da1ca32a08
Artifacts available https://github.com/Kong/kong/actions/runs/10159248407
