feat(prometheus): add AI metrics and fix #9320 #13148

Merged Jun 26, 2024

Changes from 17 commits

Commits (29)
25610e9
prometheus + fix
AntoineJac Jun 3, 2024
556f74c
prometheus + fix
AntoineJac Jun 3, 2024
b08ee83
fix
AntoineJac Jun 3, 2024
92ae81c
add test details
AntoineJac Jun 3, 2024
30ed8cb
add test
AntoineJac Jun 3, 2024
d284742
add changelogs
AntoineJac Jun 3, 2024
008c3bb
rename with llm prefix
AntoineJac Jun 7, 2024
c1a1960
Merge branch 'master' into feat/FTI-5861-AI-Metrics-Prometheus
AntoineJac Jun 7, 2024
81d6814
Update init.lua
AntoineJac Jun 7, 2024
a7f3f78
Update init.lua
AntoineJac Jun 7, 2024
ce19b11
Update changelog/unreleased/kong/add-ai-data-prometheus.yml
AntoineJac Jun 17, 2024
e88453b
new version
AntoineJac Jun 17, 2024
1e6d3e2
new version
AntoineJac Jun 17, 2024
a4db9df
Merge branch 'master' into feat/FTI-5861-AI-Metrics-Prometheus
AntoineJac Jun 17, 2024
7a80dd6
fix test
AntoineJac Jun 17, 2024
06bbcbe
test
AntoineJac Jun 17, 2024
cd85513
edit cost name
AntoineJac Jun 17, 2024
7c7afb7
remove bool
AntoineJac Jun 18, 2024
3872f5e
Update kong/plugins/prometheus/exporter.lua
AntoineJac Jun 18, 2024
17ebe7a
Update kong/plugins/prometheus/handler.lua
AntoineJac Jun 18, 2024
7c79cf9
Update kong/plugins/prometheus/schema.lua
AntoineJac Jun 18, 2024
3e7a964
d
AntoineJac Jun 18, 2024
8785af5
rename containers
AntoineJac Jun 18, 2024
2124a2b
fix order
AntoineJac Jun 18, 2024
f71ef89
remove unneeded info
AntoineJac Jun 18, 2024
763cb6a
remove unneeded info
AntoineJac Jun 18, 2024
ff63000
prepare to merge
AntoineJac Jun 20, 2024
e6f6d60
Merge branch 'master' into feat/FTI-5861-AI-Metrics-Prometheus
AntoineJac Jun 20, 2024
486f6eb
Update changelog/unreleased/kong/add-ai-data-prometheus.yml
AntoineJac Jun 25, 2024
3 changes: 3 additions & 0 deletions changelog/unreleased/kong/add-ai-data-prometheus.yml
@@ -0,0 +1,3 @@
"message": "**prometheus**: Added `ai_requests_total`, `ai_cost_total` and `ai_tokens_total` metrics in the Prometheus plugin to start counting AI usage"
"type": feature
"scope": Core
59 changes: 40 additions & 19 deletions kong/llm/drivers/shared.lua
@@ -24,6 +24,7 @@ local log_entry_keys = {
TOKENS_CONTAINER = "usage",
META_CONTAINER = "meta",
PAYLOAD_CONTAINER = "payload",
CACHE_CONTAINER = "cache",

-- payload keys
REQUEST_BODY = "request",
@@ -37,9 +38,16 @@ local log_entry_keys = {

-- usage keys
PROCESSING_TIME = "processing_time",
PROMPT_TOKEN = "prompt_token",
COMPLETION_TOKEN = "completion_token",
PROMPT_TOKENS = "prompt_tokens",
COMPLETION_TOKENS = "completion_tokens",
TOTAL_TOKENS = "total_tokens",
COST = "cost",

-- cache keys
VECTOR_DB = "vector_db",
EMBEDDINGS_PROVIDER = "embeddings_provider",
EMBEDDINGS_MODEL = "embeddings_model",
CACHE_STATUS = "cache_status",
}

local openai_override = os.getenv("OPENAI_TEST_PORT")
@@ -471,20 +479,22 @@ function _M.post_request(conf, response_object)
request_analytics = {}
end

-- check if we already have analytics for this provider
local request_analytics_plugin = request_analytics[plugin_name]

-- create a new structure if not
if not request_analytics_plugin then
request_analytics_plugin = {
[log_entry_keys.META_CONTAINER] = {},
[log_entry_keys.TOKENS_CONTAINER] = {
[log_entry_keys.PROMPT_TOKEN] = 0,
[log_entry_keys.COMPLETION_TOKEN] = 0,
[log_entry_keys.TOTAL_TOKENS] = 0,
},
}
end
-- create a new analytics structure for this plugin
local request_analytics_plugin = {
[log_entry_keys.META_CONTAINER] = {},
[log_entry_keys.TOKENS_CONTAINER] = {
[log_entry_keys.PROMPT_TOKENS] = 0,
[log_entry_keys.COMPLETION_TOKENS] = 0,
[log_entry_keys.TOTAL_TOKENS] = 0,
[log_entry_keys.COST] = 0,
},
[log_entry_keys.CACHE_CONTAINER] = {
[log_entry_keys.VECTOR_DB] = "",
[log_entry_keys.EMBEDDINGS_PROVIDER] = "",
[log_entry_keys.EMBEDDINGS_MODEL] = "",
[log_entry_keys.CACHE_STATUS] = "",
},
}

-- Set the model, response, and provider names in the current try context
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.REQUEST_MODEL] = kong.ctx.plugin.llm_model_requested or conf.model.name
@@ -502,13 +512,20 @@ function _M.post_request(conf, response_object)
-- Capture openai-format usage stats from the transformed response body
if response_object.usage then
if response_object.usage.prompt_tokens then
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.PROMPT_TOKEN] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.PROMPT_TOKEN] + response_object.usage.prompt_tokens
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.PROMPT_TOKENS] = response_object.usage.prompt_tokens
end
if response_object.usage.completion_tokens then
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COMPLETION_TOKEN] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COMPLETION_TOKEN] + response_object.usage.completion_tokens
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COMPLETION_TOKENS] = response_object.usage.completion_tokens
end
if response_object.usage.total_tokens then
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.TOTAL_TOKENS] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.TOTAL_TOKENS] + response_object.usage.total_tokens
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.TOTAL_TOKENS] = response_object.usage.total_tokens
end

if response_object.usage.prompt_tokens and response_object.usage.completion_tokens
and conf.model.options.input_cost and conf.model.options.output_cost then
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COST] =
(response_object.usage.prompt_tokens * conf.model.options.input_cost
+ response_object.usage.completion_tokens * conf.model.options.output_cost) / 1000000 -- 1 million
end
end

@@ -532,6 +549,10 @@ function _M.post_request(conf, response_object)
-- Log meta
kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.META_CONTAINER),
request_analytics_plugin[log_entry_keys.META_CONTAINER])

-- Log cache
kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.CACHE_CONTAINER),
request_analytics_plugin[log_entry_keys.CACHE_CONTAINER])
end

-- log tokens response for reports and billing
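The cost captured above is derived from the token counts and the per-1M-token prices configured on the model, hence the division by one million. A minimal standalone sketch of that arithmetic (the function name and sample values are hypothetical, not part of this PR):

-- Sketch of the cost formula used in post_request above.
-- input_cost and output_cost are prices per 1M tokens.
local function llm_cost(prompt_tokens, completion_tokens, input_cost, output_cost)
  return (prompt_tokens * input_cost + completion_tokens * output_cost) / 1000000
end

-- Hypothetical request: 1,000 prompt tokens priced at 10/1M and 500 completion tokens at 30/1M.
print(llm_cost(1000, 500, 10, 30))  -- 0.025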
10 changes: 10 additions & 0 deletions kong/llm/schemas/init.lua
@@ -49,6 +49,16 @@ local model_options_schema = {
description = "Defines the max_tokens, if using chat or completion models.",
required = false,
default = 256 }},
{ input_cost = {
type = "number",
description = "Defines the cost per 1M tokens in your prompt.",
required = false,
gt = 0}},
{ output_cost = {
type = "number",
description = "Defines the cost per 1M tokens in the output of the AI.",
required = false,
gt = 0}},
{ temperature = {
type = "number",
description = "Defines the matching temperature, if using chat or completion models.",
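For context, these two fields sit alongside the other model options shared by the AI plugins. A hypothetical conf.model fragment showing where the new prices would appear (values are illustrative only):

-- Hypothetical conf.model table; input_cost and output_cost are per 1M tokens.
local model = {
  name = "gpt-4",
  provider = "openai",
  options = {
    max_tokens = 256,
    input_cost = 10.0,   -- cost per 1M prompt tokens
    output_cost = 30.0,  -- cost per 1M completion tokens
  },
}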
80 changes: 80 additions & 0 deletions kong/plugins/prometheus/exporter.lua
@@ -34,6 +34,8 @@ package.loaded['prometheus_resty_counter'] = require("resty.counter")
local kong_subsystem = ngx.config.subsystem
local http_subsystem = kong_subsystem == "http"

-- AI metrics
local ai_request = true

local function init()
local shm = "prometheus_metrics"
@@ -145,6 +147,21 @@ local function init()
{"service", "route", "direction", "workspace"})
end

-- AI mode
if ai_request then
metrics.ai_llm_requests = prometheus:counter("ai_llm_requests_total",
"AI requests total per ai_provider in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "workspace"})

metrics.ai_llm_cost = prometheus:counter("ai_llm_cost_total",
"AI requests cost per ai_provider/cache in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "workspace"})

metrics.ai_llm_tokens = prometheus:counter("ai_llm_tokens_total",
"AI requests cost per ai_provider/cache in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "token_type", "workspace"})
end

-- Hybrid mode status
if role == "control_plane" then
metrics.data_plane_last_seen = prometheus:gauge("data_plane_last_seen",
@@ -207,6 +224,9 @@ local upstream_target_addr_health_table = {
{ value = 0, labels = { 0, 0, 0, "unhealthy", ngx.config.subsystem } },
{ value = 0, labels = { 0, 0, 0, "dns_error", ngx.config.subsystem } },
}
-- ai
local labels_table_ai_llm_status = {0, 0, 0, 0, 0, 0, 0}
local labels_table_ai_llm_tokens = {0, 0, 0, 0, 0, 0, 0, 0}

local function set_healthiness_metrics(table, upstream, target, address, status, metrics_bucket)
for i = 1, #table do
@@ -313,6 +333,66 @@ local function log(message, serialized)
metrics.kong_latency:observe(kong_proxy_latency, labels_table_latency)
end
end

if serialized.ai_metrics then
for _, ai_plugin in pairs(serialized.ai_metrics) do
local cache_status
if ai_plugin.cache and ai_plugin.cache.cache_status then
cache_status = ai_plugin.cache.cache_status
end

local vector_db, embeddings_provider, embeddings_model
if ai_plugin.cache then
if ai_plugin.cache.vector_db then
vector_db = ai_plugin.cache.vector_db
end

if ai_plugin.cache.embeddings_provider then
embeddings_provider = ai_plugin.cache.embeddings_provider
end

if ai_plugin.cache.embeddings_model then
embeddings_model = ai_plugin.cache.embeddings_model
end
end

labels_table_ai_llm_status[1] = ai_plugin.meta.provider_name
labels_table_ai_llm_status[2] = ai_plugin.meta.request_model
labels_table_ai_llm_status[3] = cache_status
labels_table_ai_llm_status[4] = vector_db
labels_table_ai_llm_status[5] = embeddings_provider
labels_table_ai_llm_status[6] = embeddings_model
labels_table_ai_llm_status[7] = workspace
metrics.ai_llm_requests:inc(1, labels_table_ai_llm_status)

if ai_plugin.usage.cost and ai_plugin.usage.cost > 0 then
metrics.ai_llm_cost:inc(ai_plugin.usage.cost, labels_table_ai_llm_status)
end

labels_table_ai_llm_tokens[1] = ai_plugin.meta.provider_name
labels_table_ai_llm_tokens[2] = ai_plugin.meta.request_model
labels_table_ai_llm_tokens[3] = cache_status
labels_table_ai_llm_tokens[4] = vector_db
labels_table_ai_llm_tokens[5] = embeddings_provider
labels_table_ai_llm_tokens[6] = embeddings_model
labels_table_ai_llm_tokens[8] = workspace

if ai_plugin.usage.prompt_tokens and ai_plugin.usage.prompt_tokens > 0 then
labels_table_ai_llm_tokens[7] = "prompt_tokens"
metrics.ai_llm_tokens:inc(ai_plugin.usage.prompt_tokens, labels_table_ai_llm_tokens)
end

if ai_plugin.usage.completion_tokens and ai_plugin.usage.completion_tokens > 0 then
labels_table_ai_llm_tokens[7] = "completion_tokens"
metrics.ai_llm_tokens:inc(ai_plugin.usage.completion_tokens, labels_table_ai_llm_tokens)
end

if ai_plugin.usage.total_tokens and ai_plugin.usage.total_tokens > 0 then
labels_table_ai_llm_tokens[7] = "total_tokens"
metrics.ai_llm_tokens:inc(ai_plugin.usage.total_tokens, labels_table_ai_llm_tokens)
end
end
end
end

-- The upstream health metrics is turned on if at least one of
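The loop above reads one entry per AI plugin from serialized.ai_metrics, using the meta, usage and cache containers populated in shared.lua. A hypothetical entry shaped to match those keys (the plugin name and values are illustrative):

-- Hypothetical serialized.ai_metrics payload consumed by the log() branch above.
local ai_metrics = {
  ["ai-proxy"] = {
    meta = { provider_name = "openai", request_model = "gpt-4" },
    usage = { prompt_tokens = 1000, completion_tokens = 500, total_tokens = 1500, cost = 0.025 },
    cache = { cache_status = "", vector_db = "", embeddings_provider = "", embeddings_model = "" },
  },
}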
4 changes: 4 additions & 0 deletions kong/plugins/prometheus/handler.lua
@@ -54,6 +54,10 @@ function PrometheusHandler:log(conf)
serialized.latencies = message.latencies
end

if conf.ai_metrics and message.ai then
serialized.ai_metrics = message.ai
end

if conf.upstream_health_metrics then
exporter.set_export_upstream_health_metrics(true)
else
1 change: 1 addition & 0 deletions kong/plugins/prometheus/schema.lua
@@ -18,6 +18,7 @@ return {
fields = {
{ per_consumer = { description = "A boolean value that determines if per-consumer metrics should be collected. If enabled, the `kong_http_requests_total` and `kong_bandwidth_bytes` metrics fill in the consumer label when available.", type = "boolean", default = false }, },
{ status_code_metrics = { description = "A boolean value that determines if status code metrics should be collected. If enabled, `http_requests_total`, `stream_sessions_total` metrics will be exported.", type = "boolean", default = false }, },
{ ai_metrics = { description = "A boolean value that determines if AI metrics should be collected. If enabled, the `kong_ai_llm_requests_total`, `kong_ai_llm_cost_total` and `kong_ai_llm_tokens_total` metrics will be exported.", type = "boolean", default = false }, },
{ latency_metrics = { description = "A boolean value that determines if latency metrics should be collected. If enabled, `kong_latency_ms`, `upstream_latency_ms` and `request_latency_ms` metrics will be exported.", type = "boolean", default = false }, },
{ bandwidth_metrics = { description = "A boolean value that determines if bandwidth metrics should be collected. If enabled, `bandwidth_bytes` and `stream_sessions_total` metrics will be exported.", type = "boolean", default = false }, },
{ upstream_health_metrics = { description = "A boolean value that determines if upstream metrics should be collected. If enabled, `upstream_target_health` metric will be exported.", type = "boolean", default = false }, },
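Exporting the new series only requires flipping the new flag on the plugin. A hypothetical plugin configuration fragment (expressed as a Lua table; a declarative-config YAML entry would set the same keys):

-- Hypothetical prometheus plugin config with the new ai_metrics toggle enabled.
local prometheus_plugin = {
  name = "prometheus",
  config = {
    status_code_metrics = true,
    latency_metrics = true,
    ai_metrics = true,  -- added by this PR; defaults to false
  },
}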