feat(prometheus): add AI metrics (#13148)
Also fix a regression from #13148

AG-41

(cherry picked from commit 68925dd)
AntoineJac authored and jschmid1 committed Jun 27, 2024
1 parent b8c4e59 commit 15a6fec
Showing 10 changed files with 426 additions and 40 deletions.
3 changes: 3 additions & 0 deletions changelog/unreleased/kong/add-ai-data-prometheus.yml
@@ -0,0 +1,3 @@
"message": "**prometheus**: Added `ai_requests_total`, `ai_cost_total` and `ai_tokens_total` metrics in the Prometheus plugin to start counting AI usage."
"type": feature
"scope": Core
70 changes: 40 additions & 30 deletions kong/llm/drivers/shared.lua
@@ -28,25 +28,32 @@ end
--

local log_entry_keys = {
TOKENS_CONTAINER = "usage",
USAGE_CONTAINER = "usage",
META_CONTAINER = "meta",
PAYLOAD_CONTAINER = "payload",
CACHE_CONTAINER = "cache",

-- payload keys
REQUEST_BODY = "request",
RESPONSE_BODY = "response",

-- meta keys
PLUGIN_ID = "plugin_id",
PROVIDER_NAME = "provider_name",
REQUEST_MODEL = "request_model",
RESPONSE_MODEL = "response_model",
PROVIDER_NAME = "provider_name",
PLUGIN_ID = "plugin_id",

-- usage keys
PROCESSING_TIME = "processing_time",
PROMPT_TOKEN = "prompt_token",
COMPLETION_TOKEN = "completion_token",
PROMPT_TOKENS = "prompt_tokens",
COMPLETION_TOKENS = "completion_tokens",
TOTAL_TOKENS = "total_tokens",
COST = "cost",

-- cache keys
VECTOR_DB = "vector_db",
EMBEDDINGS_PROVIDER = "embeddings_provider",
EMBEDDINGS_MODEL = "embeddings_model",
CACHE_STATUS = "cache_status",
}

local openai_override = os.getenv("OPENAI_TEST_PORT")
@@ -494,26 +501,18 @@ function _M.post_request(conf, response_object)
request_analytics = {}
end

-- check if we already have analytics for this provider
local request_analytics_plugin = request_analytics[plugin_name]

-- create a new structure if not
if not request_analytics_plugin then
request_analytics_plugin = {
[log_entry_keys.META_CONTAINER] = {},
[log_entry_keys.TOKENS_CONTAINER] = {
[log_entry_keys.PROMPT_TOKEN] = 0,
[log_entry_keys.COMPLETION_TOKEN] = 0,
[log_entry_keys.TOTAL_TOKENS] = 0,
},
}
end
-- create a new analytics structure for this plugin
local request_analytics_plugin = {
[log_entry_keys.META_CONTAINER] = {},
[log_entry_keys.USAGE_CONTAINER] = {},
[log_entry_keys.CACHE_CONTAINER] = {},
}

-- Set the model, response, and provider names in the current try context
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PLUGIN_ID] = conf.__plugin_id
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PROVIDER_NAME] = provider_name
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.REQUEST_MODEL] = kong.ctx.plugin.llm_model_requested or conf.model.name
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.RESPONSE_MODEL] = response_object.model or conf.model.name
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PROVIDER_NAME] = provider_name
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PLUGIN_ID] = conf.__plugin_id

-- set extra per-provider meta
if kong.ctx.plugin.ai_extra_meta and type(kong.ctx.plugin.ai_extra_meta) == "table" then
@@ -525,13 +524,20 @@ function _M.post_request(conf, response_object)
-- Capture openai-format usage stats from the transformed response body
if response_object.usage then
if response_object.usage.prompt_tokens then
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.PROMPT_TOKEN] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.PROMPT_TOKEN] + response_object.usage.prompt_tokens
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.PROMPT_TOKENS] = response_object.usage.prompt_tokens
end
if response_object.usage.completion_tokens then
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COMPLETION_TOKEN] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COMPLETION_TOKEN] + response_object.usage.completion_tokens
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.COMPLETION_TOKENS] = response_object.usage.completion_tokens
end
if response_object.usage.total_tokens then
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.TOTAL_TOKENS] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.TOTAL_TOKENS] + response_object.usage.total_tokens
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TOTAL_TOKENS] = response_object.usage.total_tokens
end

if response_object.usage.prompt_tokens and response_object.usage.completion_tokens
and conf.model.options.input_cost and conf.model.options.output_cost then
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.COST] =
(response_object.usage.prompt_tokens * conf.model.options.input_cost
+ response_object.usage.completion_tokens * conf.model.options.output_cost) / 1000000 -- 1 million
end
end
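
A worked, purely hypothetical example of the cost calculation above — the token counts and per-1M-token prices are invented for illustration:

    -- sketch only: mirrors the formula in post_request(), values are made up
    local prompt_tokens     = 1200
    local completion_tokens = 300
    local input_cost        = 10.0   -- cost per 1M prompt tokens
    local output_cost       = 30.0   -- cost per 1M completion tokens

    local cost = (prompt_tokens * input_cost
                + completion_tokens * output_cost) / 1000000
    -- (1200 * 10.0 + 300 * 30.0) / 1e6 = 0.021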

@@ -548,13 +554,17 @@ function _M.post_request(conf, response_object)
kong.ctx.shared.analytics = request_analytics

if conf.logging and conf.logging.log_statistics then
-- Log analytics data
kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.TOKENS_CONTAINER),
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER])

-- Log meta
-- Log meta data
kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.META_CONTAINER),
request_analytics_plugin[log_entry_keys.META_CONTAINER])

-- Log usage data
kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.USAGE_CONTAINER),
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER])

-- Log cache data
kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.CACHE_CONTAINER),
request_analytics_plugin[log_entry_keys.CACHE_CONTAINER])
end

-- log tokens response for reports and billing
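For orientation, this is roughly the shape that ends up under the `ai.<plugin_name>.*` serializer keys after this change; the key names come from log_entry_keys above, while the concrete values are invented:

    -- hypothetical example of one plugin's logged analytics entry (values are made up)
    local ai_log_entry = {
      meta = {
        plugin_id      = "3a2f6f5c-0000-0000-0000-000000000000",  -- conf.__plugin_id
        provider_name  = "openai",
        request_model  = "gpt-4",
        response_model = "gpt-4-0613",
      },
      usage = {
        prompt_tokens     = 1200,
        completion_tokens = 300,
        total_tokens      = 1500,
        cost              = 0.021,
      },
      cache = {},  -- cache_status / vector_db / embeddings_* when a caching plugin fills them
    }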
10 changes: 10 additions & 0 deletions kong/llm/schemas/init.lua
@@ -96,6 +96,16 @@ local model_options_schema = {
description = "Defines the max_tokens, if using chat or completion models.",
required = false,
default = 256 }},
{ input_cost = {
type = "number",
description = "Defines the cost per 1M tokens in your prompt.",
required = false,
gt = 0}},
{ output_cost = {
type = "number",
description = "Defines the cost per 1M tokens in the output of the AI.",
required = false,
gt = 0}},
{ temperature = {
type = "number",
description = "Defines the matching temperature, if using chat or completion models.",
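A rough sketch of how the two new options might be supplied in an AI plugin's model configuration; apart from input_cost and output_cost (and max_tokens, shown above), the surrounding fields are assumptions for illustration:

    -- hypothetical config fragment; only input_cost/output_cost come from the schema addition
    local model = {
      name     = "gpt-4",
      provider = "openai",
      options  = {
        max_tokens  = 256,
        input_cost  = 10.0,   -- cost per 1M prompt tokens
        output_cost = 30.0,   -- cost per 1M completion tokens
      },
    }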
62 changes: 61 additions & 1 deletion kong/plugins/prometheus/exporter.lua
@@ -43,7 +43,6 @@ local enterprise = require("kong.plugins.prometheus.enterprise.exporter")
local kong_subsystem = ngx.config.subsystem
local http_subsystem = kong_subsystem == "http"


local function init()
local shm = "prometheus_metrics"
if not ngx.shared[shm] then
@@ -157,6 +156,19 @@ local function init()
-- XXX EE
enterprise.init(prometheus)

-- AI metrics
metrics.ai_llm_requests = prometheus:counter("ai_llm_requests_total",
"AI requests total per ai_provider in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "workspace"})

metrics.ai_llm_cost = prometheus:counter("ai_llm_cost_total",
"AI requests cost per ai_provider/cache in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "workspace"})

metrics.ai_llm_tokens = prometheus:counter("ai_llm_tokens_total",
"AI tokens total per ai_provider/cache in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "token_type", "workspace"})

-- Hybrid mode status
if role == "control_plane" then
metrics.data_plane_last_seen = prometheus:gauge("data_plane_last_seen",
@@ -222,6 +234,9 @@ local upstream_target_addr_health_table = {
{ value = 0, labels = { 0, 0, 0, "unhealthy", ngx.config.subsystem } },
{ value = 0, labels = { 0, 0, 0, "dns_error", ngx.config.subsystem } },
}
-- ai
local labels_table_ai_llm_status = {0, 0, 0, 0, 0, 0, 0}
local labels_table_ai_llm_tokens = {0, 0, 0, 0, 0, 0, 0, 0}

local function set_healthiness_metrics(table, upstream, target, address, status, metrics_bucket)
for i = 1, #table do
@@ -328,6 +343,51 @@ local function log(message, serialized)
metrics.kong_latency:observe(kong_proxy_latency, labels_table_latency)
end
end

if serialized.ai_metrics then
for _, ai_plugin in pairs(serialized.ai_metrics) do
local cache_status = ai_plugin.cache.cache_status or ""
local vector_db = ai_plugin.cache.vector_db or ""
local embeddings_provider = ai_plugin.cache.embeddings_provider or ""
local embeddings_model = ai_plugin.cache.embeddings_model or ""

labels_table_ai_llm_status[1] = ai_plugin.meta.provider_name
labels_table_ai_llm_status[2] = ai_plugin.meta.request_model
labels_table_ai_llm_status[3] = cache_status
labels_table_ai_llm_status[4] = vector_db
labels_table_ai_llm_status[5] = embeddings_provider
labels_table_ai_llm_status[6] = embeddings_model
labels_table_ai_llm_status[7] = workspace
metrics.ai_llm_requests:inc(1, labels_table_ai_llm_status)

if ai_plugin.usage.cost and ai_plugin.usage.cost > 0 then
metrics.ai_llm_cost:inc(ai_plugin.usage.cost, labels_table_ai_llm_status)
end

labels_table_ai_llm_tokens[1] = ai_plugin.meta.provider_name
labels_table_ai_llm_tokens[2] = ai_plugin.meta.request_model
labels_table_ai_llm_tokens[3] = cache_status
labels_table_ai_llm_tokens[4] = vector_db
labels_table_ai_llm_tokens[5] = embeddings_provider
labels_table_ai_llm_tokens[6] = embeddings_model
labels_table_ai_llm_tokens[8] = workspace

if ai_plugin.usage.prompt_tokens and ai_plugin.usage.prompt_tokens > 0 then
labels_table_ai_llm_tokens[7] = "prompt_tokens"
metrics.ai_llm_tokens:inc(ai_plugin.usage.prompt_tokens, labels_table_ai_llm_tokens)
end

if ai_plugin.usage.completion_tokens and ai_plugin.usage.completion_tokens > 0 then
labels_table_ai_llm_tokens[7] = "completion_tokens"
metrics.ai_llm_tokens:inc(ai_plugin.usage.completion_tokens, labels_table_ai_llm_tokens)
end

if ai_plugin.usage.total_tokens and ai_plugin.usage.total_tokens > 0 then
labels_table_ai_llm_tokens[7] = "total_tokens"
metrics.ai_llm_tokens:inc(ai_plugin.usage.total_tokens, labels_table_ai_llm_tokens)
end
end
end
end

-- The upstream health metrics is turned on if at least one of
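To illustrate how the reused label tables line up with the counters declared in init() — the value order must match the declared label order, which is why slot 7 of labels_table_ai_llm_tokens is the token_type — here is a hypothetical increment and the series it would roughly produce:

    -- sketch only: one request's prompt tokens recorded on ai_llm_tokens_total
    local labels = {
      "openai",        -- ai_provider
      "gpt-4",         -- ai_model
      "",              -- cache_status
      "",              -- vector_db
      "",              -- embeddings_provider
      "",              -- embeddings_model
      "prompt_tokens", -- token_type
      "default",       -- workspace
    }
    metrics.ai_llm_tokens:inc(1200, labels)
    -- approximate exposition output:
    -- ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-4",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="prompt_tokens",workspace="default"} 1200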
4 changes: 4 additions & 0 deletions kong/plugins/prometheus/handler.lua
@@ -61,6 +61,10 @@ function PrometheusHandler:log(conf)
serialized.latencies = message.latencies
end

if conf.ai_metrics then
serialized.ai_metrics = message.ai
end

if conf.upstream_health_metrics then
exporter.set_export_upstream_health_metrics(true)
else
1 change: 1 addition & 0 deletions kong/plugins/prometheus/schema.lua
@@ -25,6 +25,7 @@ return {
fields = {
{ per_consumer = { description = "A boolean value that determines if per-consumer metrics should be collected. If enabled, the `kong_http_requests_total` and `kong_bandwidth_bytes` metrics fill in the consumer label when available.", type = "boolean", default = false }, },
{ status_code_metrics = { description = "A boolean value that determines if status code metrics should be collected. If enabled, `http_requests_total`, `stream_sessions_total` metrics will be exported.", type = "boolean", default = false }, },
{ ai_metrics = { description = "A boolean value that determines if ai metrics should be collected. If enabled, the `ai_llm_requests_total`, `ai_llm_cost_total` and `ai_llm_tokens_total` metrics will be exported.", type = "boolean", default = false }, },
{ latency_metrics = { description = "A boolean value that determines if latency metrics should be collected. If enabled, `kong_latency_ms`, `upstream_latency_ms` and `request_latency_ms` metrics will be exported.", type = "boolean", default = false }, },
{ bandwidth_metrics = { description = "A boolean value that determines if bandwidth metrics should be collected. If enabled, `bandwidth_bytes` and `stream_sessions_total` metrics will be exported.", type = "boolean", default = false }, },
{ upstream_health_metrics = { description = "A boolean value that determines if upstream metrics should be collected. If enabled, `upstream_target_health` metric will be exported.", type = "boolean", default = false }, },
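None of the new series are emitted unless the ai_metrics flag is enabled on the Prometheus plugin. A minimal sketch of such a configuration as a Lua table (how it is passed — Admin API, declarative config, or a test blueprint — is outside this diff):

    -- hypothetical Prometheus plugin entry with the new flag turned on
    local prometheus_plugin = {
      name   = "prometheus",
      config = {
        status_code_metrics = true,
        ai_metrics          = true,  -- added in this commit, defaults to false
      },
    }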