
Commit

feat(ai-plugins): retrieve latency data and push it to logs and metri…
AntoineJac authored Jul 30, 2024
1 parent cb8d046 commit 9821437
Showing 7 changed files with 84 additions and 9 deletions.
3 changes: 3 additions & 0 deletions changelog/unreleased/kong/add-ai-data-latency.yml
@@ -0,0 +1,3 @@
message: "AI plugins: retrieved latency data and pushed it to logs and metrics."
type: feature
scope: "Plugin"
30 changes: 25 additions & 5 deletions kong/llm/drivers/shared.lua
@@ -40,11 +40,13 @@ local log_entry_keys = {
PROVIDER_NAME = "provider_name",
REQUEST_MODEL = "request_model",
RESPONSE_MODEL = "response_model",
LLM_LATENCY = "llm_latency",

-- usage keys
PROMPT_TOKENS = "prompt_tokens",
COMPLETION_TOKENS = "completion_tokens",
TOTAL_TOKENS = "total_tokens",
TIME_PER_TOKEN = "time_per_token",
COST = "cost",

-- cache keys
@@ -527,13 +529,14 @@ function _M.pre_request(conf, request_table)
request_table[auth_param_name] = auth_param_value
end

-- retrieve the plugin name
local plugin_name = conf.__key__:match('plugins:(.-):')
if not plugin_name or plugin_name == "" then
return nil, "no plugin name is being passed by the plugin"
end

-- if enabled AND request type is compatible, capture the input for analytics
if conf.logging and conf.logging.log_payloads then
local plugin_name = conf.__key__:match('plugins:(.-):')
if not plugin_name or plugin_name == "" then
return nil, "no plugin name is being passed by the plugin"
end

kong.log.set_serialize_value(fmt("ai.%s.%s.%s", plugin_name, log_entry_keys.PAYLOAD_CONTAINER, log_entry_keys.REQUEST_BODY), kong.request.get_raw_body())
end

@@ -547,6 +550,9 @@ function _M.pre_request(conf, request_table)
kong.ctx.shared.ai_prompt_tokens = (kong.ctx.shared.ai_prompt_tokens or 0) + prompt_tokens
end

local start_time_key = "ai_request_start_time_" .. plugin_name
kong.ctx.plugin[start_time_key] = ngx.now()

return true, nil
end

@@ -600,6 +606,20 @@ function _M.post_request(conf, response_object)
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.REQUEST_MODEL] = kong.ctx.plugin.llm_model_requested or conf.model.name
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.RESPONSE_MODEL] = response_object.model or conf.model.name

-- Set the llm latency meta, and time per token usage
local start_time_key = "ai_request_start_time_" .. plugin_name
if kong.ctx.plugin[start_time_key] then
local llm_latency = math.floor((ngx.now() - kong.ctx.plugin[start_time_key]) * 1000)
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.LLM_LATENCY] = llm_latency
kong.ctx.shared.ai_request_latency = llm_latency

if response_object.usage and response_object.usage.completion_tokens then
local time_per_token = math.floor(llm_latency / response_object.usage.completion_tokens)
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TIME_PER_TOKEN] = time_per_token
kong.ctx.shared.ai_request_time_per_token = time_per_token
end
end

-- set extra per-provider meta
if kong.ctx.plugin.ai_extra_meta and type(kong.ctx.plugin.ai_extra_meta) == "table" then
for k, v in pairs(kong.ctx.plugin.ai_extra_meta) do
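For orientation, the new fields surface in the per-plugin analytics container that shared.lua serializes onto the log entry. A minimal sketch of the resulting shape, assuming an ai-proxy request — the field names come from the diff above, the numeric values are purely illustrative:

-- illustrative only: llm_latency and time_per_token are computed per request,
-- so the numbers below are assumptions, not fixture values
local ai_log_entry = {
  ["ai-proxy"] = {                        -- keyed by the plugin name parsed from conf.__key__
    meta = {
      provider_name  = "openai",
      request_model  = "gpt-3.5-turbo",
      response_model = "gpt-3.5-turbo-0613",
      llm_latency    = 480,               -- ms: math.floor((ngx.now() - start_time) * 1000)
    },
    usage = {
      prompt_tokens     = 25,
      completion_tokens = 12,
      total_tokens      = 37,
      time_per_token    = 40,             -- math.floor(480 / 12)
      cost              = 0.00037,
    },
    cache = {},
  },
}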
14 changes: 12 additions & 2 deletions kong/plugins/prometheus/exporter.lua
@@ -17,8 +17,9 @@ local stream_available, stream_api = pcall(require, "kong.tools.stream_api")

local role = kong.configuration.role

local KONG_LATENCY_BUCKETS = { 1, 2, 5, 7, 10, 15, 20, 30, 50, 75, 100, 200, 500, 750, 1000}
local UPSTREAM_LATENCY_BUCKETS = {25, 50, 80, 100, 250, 400, 700, 1000, 2000, 5000, 10000, 30000, 60000 }
local KONG_LATENCY_BUCKETS = { 1, 2, 5, 7, 10, 15, 20, 30, 50, 75, 100, 200, 500, 750, 1000 }
local UPSTREAM_LATENCY_BUCKETS = { 25, 50, 80, 100, 250, 400, 700, 1000, 2000, 5000, 10000, 30000, 60000 }
local AI_LLM_PROVIDER_LATENCY_BUCKETS = { 250, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 10000, 30000, 60000 }
local IS_PROMETHEUS_ENABLED


@@ -157,6 +158,11 @@ local function init()
"AI requests cost per ai_provider/cache in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "token_type", "workspace"})

metrics.ai_llm_provider_latency = prometheus:histogram("ai_llm_provider_latency_ms",
"LLM response Latency for each AI plugins per ai_provider in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "workspace"},
AI_LLM_PROVIDER_LATENCY_BUCKETS)

-- Hybrid mode status
if role == "control_plane" then
metrics.data_plane_last_seen = prometheus:gauge("data_plane_last_seen",
@@ -349,6 +355,10 @@ local function log(message, serialized)
metrics.ai_llm_cost:inc(ai_plugin.usage.cost, labels_table_ai_llm_status)
end

if ai_plugin.meta.llm_latency and ai_plugin.meta.llm_latency > 0 then
metrics.ai_llm_provider_latency:observe(ai_plugin.meta.llm_latency, labels_table_ai_llm_status)
end

labels_table_ai_llm_tokens[1] = ai_plugin.meta.provider_name
labels_table_ai_llm_tokens[2] = ai_plugin.meta.request_model
labels_table_ai_llm_tokens[3] = cache_status
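A condensed sketch of the observation path added above, assuming the same prometheus client API this exporter already uses (the real code reuses labels_table_ai_llm_status, which also carries cache, vector DB, and embeddings labels; the reduced label set here is a simplification):

-- sketch of registration plus per-request observation with a trimmed label set
local AI_LLM_PROVIDER_LATENCY_BUCKETS = { 250, 500, 1000, 2000, 5000, 10000, 30000, 60000 }

local ai_llm_provider_latency = prometheus:histogram("ai_llm_provider_latency_ms",
  "LLM response latency per ai_provider in Kong",
  { "ai_provider", "ai_model", "workspace" },
  AI_LLM_PROVIDER_LATENCY_BUCKETS)

-- called from log() for each entry in the serialized message.ai table
local function observe_ai_latency(ai_plugin, workspace)
  if ai_plugin.meta.llm_latency and ai_plugin.meta.llm_latency > 0 then
    ai_llm_provider_latency:observe(ai_plugin.meta.llm_latency,
      { ai_plugin.meta.provider_name, ai_plugin.meta.response_model, workspace })
  end
end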
6 changes: 6 additions & 0 deletions spec/03-plugins/26-prometheus/02-access_spec.lua
@@ -768,6 +768,7 @@ describe("Plugin: prometheus (access) AI metrics", function()
assert.not_match('ai_llm_requests_total', body, nil, true)
assert.not_match('ai_llm_cost_total', body, nil, true)
assert.not_match('ai_llm_tokens_total', body, nil, true)
assert.not_match('ai_llm_provider_latency_ms_bucket', body, nil, true)
end)

it("update prometheus plugin config", function()
@@ -829,6 +830,8 @@

assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00037', body, nil, true)

assert.matches('ai_llm_provider_latency_ms_bucket{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default",le="+Inf"} 1', body, nil, true)

assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="completion_tokens",workspace="default"} 12', body, nil, true)
assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="prompt_tokens",workspace="default"} 25', body, nil, true)
assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="total_tokens",workspace="default"} 37', body, nil, true)
@@ -865,6 +868,8 @@

assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00074', body, nil, true)

assert.matches('ai_llm_provider_latency_ms_bucket{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default",le="+Inf"} 2', body, nil, true)

assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="completion_tokens",workspace="default"} 24', body, nil, true)
assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="prompt_tokens",workspace="default"} 50', body, nil, true)
assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="total_tokens",workspace="default"} 74', body, nil, true)
@@ -895,5 +900,6 @@

assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 2', body, nil, true)
assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00074', body, nil, true)
assert.matches('ai_llm_provider_latency_ms_bucket{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default",le="+Inf"} 2', body, nil, true)
end)
end)
12 changes: 12 additions & 0 deletions spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
@@ -44,11 +44,13 @@ local _EXPECTED_CHAT_STATS = {
provider_name = 'openai',
request_model = 'gpt-3.5-turbo',
response_model = 'gpt-3.5-turbo-0613',
llm_latency = 1
},
usage = {
prompt_tokens = 25,
completion_tokens = 12,
total_tokens = 37,
time_per_token = 1,
cost = 0.00037,
},
cache = {}
@@ -713,7 +715,17 @@ for _, strategy in helpers.all_strategies() do if strategy ~= "cassandra" then
assert.is_number(log_message.response.size)

-- test ai-proxy stats
local actual_chat_stats = log_message.ai
local actual_llm_latency = actual_chat_stats["ai-proxy"].meta.llm_latency
local actual_time_per_token = actual_chat_stats["ai-proxy"].usage.time_per_token
local time_per_token = math.floor(actual_llm_latency / actual_chat_stats["ai-proxy"].usage.completion_tokens)

log_message.ai["ai-proxy"].meta.llm_latency = 1
log_message.ai["ai-proxy"].usage.time_per_token = 1

assert.same(_EXPECTED_CHAT_STATS, log_message.ai)
assert.is_true(actual_llm_latency > 0)
assert.same(actual_time_per_token, time_per_token)
end)

it("does not log statistics", function()
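Because llm_latency and time_per_token vary from run to run, the spec above captures the real values, overwrites them with the constant 1 used in _EXPECTED_CHAT_STATS, and then asserts them separately. A hedged worked example of the relationship being checked, using the fixture's completion_tokens = 12 and an assumed 480 ms provider latency:

-- 480 ms is an assumed latency; 12 completion tokens comes from the test fixture
local llm_latency = 480
local completion_tokens = 12
local time_per_token = math.floor(llm_latency / completion_tokens)
assert(llm_latency > 0)
assert(time_per_token == 40)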
@@ -124,11 +124,13 @@ local _EXPECTED_CHAT_STATS = {
provider_name = 'openai',
request_model = 'gpt-4',
response_model = 'gpt-3.5-turbo-0613',
llm_latency = 1
},
usage = {
prompt_tokens = 25,
completion_tokens = 12,
total_tokens = 37,
time_per_token = 1,
cost = 0.00037,
},
cache = {}
@@ -295,8 +297,18 @@ for _, strategy in helpers.all_strategies() do if strategy ~= "cassandra" then
assert.is_number(log_message.request.size)
assert.is_number(log_message.response.size)

-- test ai-proxy stats
-- test ai-request-transformer stats
local actual_chat_stats = log_message.ai
local actual_llm_latency = actual_chat_stats["ai-request-transformer"].meta.llm_latency
local actual_time_per_token = actual_chat_stats["ai-request-transformer"].usage.time_per_token
local time_per_token = math.floor(actual_llm_latency / actual_chat_stats["ai-request-transformer"].usage.completion_tokens)

log_message.ai["ai-request-transformer"].meta.llm_latency = 1
log_message.ai["ai-request-transformer"].usage.time_per_token = 1

assert.same(_EXPECTED_CHAT_STATS, log_message.ai)
assert.is_true(actual_llm_latency > 0)
assert.same(actual_time_per_token, time_per_token)
end)

it("bad request from LLM", function()
@@ -181,11 +181,13 @@ local _EXPECTED_CHAT_STATS = {
provider_name = 'openai',
request_model = 'gpt-4',
response_model = 'gpt-3.5-turbo-0613',
llm_latency = 1
},
usage = {
prompt_tokens = 25,
completion_tokens = 12,
total_tokens = 37,
time_per_token = 1,
cost = 0.00037,
},
cache = {}
@@ -424,8 +426,18 @@ for _, strategy in helpers.all_strategies() do if strategy ~= "cassandra" then
assert.is_number(log_message.request.size)
assert.is_number(log_message.response.size)

-- test ai-proxy stats
-- test ai-response-transformer stats
local actual_chat_stats = log_message.ai
local actual_llm_latency = actual_chat_stats["ai-response-transformer"].meta.llm_latency
local actual_time_per_token = actual_chat_stats["ai-response-transformer"].usage.time_per_token
local time_per_token = math.floor(actual_llm_latency / actual_chat_stats["ai-response-transformer"].usage.completion_tokens)

log_message.ai["ai-response-transformer"].meta.llm_latency = 1
log_message.ai["ai-response-transformer"].usage.time_per_token = 1

assert.same(_EXPECTED_CHAT_STATS, log_message.ai)
assert.is_true(actual_llm_latency > 0)
assert.same(actual_time_per_token, time_per_token)
end)

it("fails properly when json instructions are bad", function()

1 comment on commit 9821437

@github-actions


Bazel Build

Docker image available kong/kong:982143769aa5bd271307fa70840051da1ca32a08
Artifacts available https://github.com/Kong/kong/actions/runs/10159248407
