feat(prometheus): add AI metrics (#13148)
Also fix a regression from #13148

AG-41

(cherry picked from commit 68925dd)
AntoineJac authored and jschmid1 committed Jun 27, 2024
1 parent b8c4e59 commit 15a6fec
Showing 10 changed files with 426 additions and 40 deletions.
3 changes: 3 additions & 0 deletions changelog/unreleased/kong/add-ai-data-prometheus.yml
@@ -0,0 +1,3 @@
"message": "**prometheus**: Added `ai_requests_total`, `ai_cost_total` and `ai_tokens_total` metrics in the Prometheus plugin to start counting AI usage."
"type": feature
"scope": Core
70 changes: 40 additions & 30 deletions kong/llm/drivers/shared.lua
@@ -28,25 +28,32 @@ end
--

local log_entry_keys = {
TOKENS_CONTAINER = "usage",
USAGE_CONTAINER = "usage",
META_CONTAINER = "meta",
PAYLOAD_CONTAINER = "payload",
CACHE_CONTAINER = "cache",

-- payload keys
REQUEST_BODY = "request",
RESPONSE_BODY = "response",

-- meta keys
PLUGIN_ID = "plugin_id",
PROVIDER_NAME = "provider_name",
REQUEST_MODEL = "request_model",
RESPONSE_MODEL = "response_model",
PROVIDER_NAME = "provider_name",
PLUGIN_ID = "plugin_id",

-- usage keys
PROCESSING_TIME = "processing_time",
PROMPT_TOKEN = "prompt_token",
COMPLETION_TOKEN = "completion_token",
PROMPT_TOKENS = "prompt_tokens",
COMPLETION_TOKENS = "completion_tokens",
TOTAL_TOKENS = "total_tokens",
COST = "cost",

-- cache keys
VECTOR_DB = "vector_db",
EMBEDDINGS_PROVIDER = "embeddings_provider",
EMBEDDINGS_MODEL = "embeddings_model",
CACHE_STATUS = "cache_status",
}

local openai_override = os.getenv("OPENAI_TEST_PORT")
@@ -494,26 +501,18 @@ function _M.post_request(conf, response_object)
request_analytics = {}
end

-- check if we already have analytics for this provider
local request_analytics_plugin = request_analytics[plugin_name]

-- create a new structure if not
if not request_analytics_plugin then
request_analytics_plugin = {
[log_entry_keys.META_CONTAINER] = {},
[log_entry_keys.TOKENS_CONTAINER] = {
[log_entry_keys.PROMPT_TOKEN] = 0,
[log_entry_keys.COMPLETION_TOKEN] = 0,
[log_entry_keys.TOTAL_TOKENS] = 0,
},
}
end
-- create a new analytics structure for this plugin
local request_analytics_plugin = {
[log_entry_keys.META_CONTAINER] = {},
[log_entry_keys.USAGE_CONTAINER] = {},
[log_entry_keys.CACHE_CONTAINER] = {},
}

-- Set the model, response, and provider names in the current try context
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PLUGIN_ID] = conf.__plugin_id
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PROVIDER_NAME] = provider_name
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.REQUEST_MODEL] = kong.ctx.plugin.llm_model_requested or conf.model.name
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.RESPONSE_MODEL] = response_object.model or conf.model.name
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PROVIDER_NAME] = provider_name
request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PLUGIN_ID] = conf.__plugin_id

-- set extra per-provider meta
if kong.ctx.plugin.ai_extra_meta and type(kong.ctx.plugin.ai_extra_meta) == "table" then
@@ -525,13 +524,20 @@ function _M.post_request(conf, response_object)
-- Capture openai-format usage stats from the transformed response body
if response_object.usage then
if response_object.usage.prompt_tokens then
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.PROMPT_TOKEN] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.PROMPT_TOKEN] + response_object.usage.prompt_tokens
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.PROMPT_TOKENS] = response_object.usage.prompt_tokens
end
if response_object.usage.completion_tokens then
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COMPLETION_TOKEN] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COMPLETION_TOKEN] + response_object.usage.completion_tokens
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.COMPLETION_TOKENS] = response_object.usage.completion_tokens
end
if response_object.usage.total_tokens then
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.TOTAL_TOKENS] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.TOTAL_TOKENS] + response_object.usage.total_tokens
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TOTAL_TOKENS] = response_object.usage.total_tokens
end

if response_object.usage.prompt_tokens and response_object.usage.completion_tokens
and conf.model.options.input_cost and conf.model.options.output_cost then
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.COST] =
(response_object.usage.prompt_tokens * conf.model.options.input_cost
+ response_object.usage.completion_tokens * conf.model.options.output_cost) / 1000000 -- 1 million
end
end
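
A worked, purely hypothetical example of the cost calculation above — the token counts and per-1M-token prices are invented for illustration:

    -- sketch only: mirrors the formula in post_request(), values are made up
    local prompt_tokens     = 1200
    local completion_tokens = 300
    local input_cost        = 10.0   -- cost per 1M prompt tokens
    local output_cost       = 30.0   -- cost per 1M completion tokens

    local cost = (prompt_tokens * input_cost
                + completion_tokens * output_cost) / 1000000
    -- (1200 * 10.0 + 300 * 30.0) / 1e6 = 0.021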

@@ -548,13 +554,17 @@ function _M.post_request(conf, response_object)
kong.ctx.shared.analytics = request_analytics

if conf.logging and conf.logging.log_statistics then
-- Log analytics data
kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.TOKENS_CONTAINER),
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER])

-- Log meta
-- Log meta data
kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.META_CONTAINER),
request_analytics_plugin[log_entry_keys.META_CONTAINER])

-- Log usage data
kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.USAGE_CONTAINER),
request_analytics_plugin[log_entry_keys.USAGE_CONTAINER])

-- Log cache data
kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.CACHE_CONTAINER),
request_analytics_plugin[log_entry_keys.CACHE_CONTAINER])
end

-- log tokens response for reports and billing
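For orientation, this is roughly the shape that ends up under the `ai.<plugin_name>.*` serializer keys after this change; the key names come from log_entry_keys above, while the concrete values are invented:

    -- hypothetical example of one plugin's logged analytics entry (values are made up)
    local ai_log_entry = {
      meta = {
        plugin_id      = "3a2f6f5c-0000-0000-0000-000000000000",  -- conf.__plugin_id
        provider_name  = "openai",
        request_model  = "gpt-4",
        response_model = "gpt-4-0613",
      },
      usage = {
        prompt_tokens     = 1200,
        completion_tokens = 300,
        total_tokens      = 1500,
        cost              = 0.021,
      },
      cache = {},  -- cache_status / vector_db / embeddings_* when a caching plugin fills them
    }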
10 changes: 10 additions & 0 deletions kong/llm/schemas/init.lua
@@ -96,6 +96,16 @@ local model_options_schema = {
description = "Defines the max_tokens, if using chat or completion models.",
required = false,
default = 256 }},
{ input_cost = {
type = "number",
description = "Defines the cost per 1M tokens in your prompt.",
required = false,
gt = 0}},
{ output_cost = {
type = "number",
description = "Defines the cost per 1M tokens in the output of the AI.",
required = false,
gt = 0}},
{ temperature = {
type = "number",
description = "Defines the matching temperature, if using chat or completion models.",
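A rough sketch of how the two new options might be supplied in an AI plugin's model configuration; apart from input_cost and output_cost (and max_tokens, shown above), the surrounding fields are assumptions for illustration:

    -- hypothetical config fragment; only input_cost/output_cost come from the schema addition
    local model = {
      name     = "gpt-4",
      provider = "openai",
      options  = {
        max_tokens  = 256,
        input_cost  = 10.0,   -- cost per 1M prompt tokens
        output_cost = 30.0,   -- cost per 1M completion tokens
      },
    }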
62 changes: 61 additions & 1 deletion kong/plugins/prometheus/exporter.lua
@@ -43,7 +43,6 @@ local enterprise = require("kong.plugins.prometheus.enterprise.exporter")
local kong_subsystem = ngx.config.subsystem
local http_subsystem = kong_subsystem == "http"


local function init()
local shm = "prometheus_metrics"
if not ngx.shared[shm] then
@@ -157,6 +156,19 @@ local function init()
-- XXX EE
enterprise.init(prometheus)

-- AI metrics
metrics.ai_llm_requests = prometheus:counter("ai_llm_requests_total",
"AI requests total per ai_provider in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "workspace"})

metrics.ai_llm_cost = prometheus:counter("ai_llm_cost_total",
"AI requests cost per ai_provider/cache in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "workspace"})

metrics.ai_llm_tokens = prometheus:counter("ai_llm_tokens_total",
"AI tokens total per ai_provider/cache in Kong",
{"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "token_type", "workspace"})

-- Hybrid mode status
if role == "control_plane" then
metrics.data_plane_last_seen = prometheus:gauge("data_plane_last_seen",
@@ -222,6 +234,9 @@ local upstream_target_addr_health_table = {
{ value = 0, labels = { 0, 0, 0, "unhealthy", ngx.config.subsystem } },
{ value = 0, labels = { 0, 0, 0, "dns_error", ngx.config.subsystem } },
}
-- ai
local labels_table_ai_llm_status = {0, 0, 0, 0, 0, 0, 0}
local labels_table_ai_llm_tokens = {0, 0, 0, 0, 0, 0, 0, 0}

local function set_healthiness_metrics(table, upstream, target, address, status, metrics_bucket)
for i = 1, #table do
@@ -328,6 +343,51 @@ local function log(message, serialized)
metrics.kong_latency:observe(kong_proxy_latency, labels_table_latency)
end
end

if serialized.ai_metrics then
for _, ai_plugin in pairs(serialized.ai_metrics) do
local cache_status = ai_plugin.cache.cache_status or ""
local vector_db = ai_plugin.cache.vector_db or ""
local embeddings_provider = ai_plugin.cache.embeddings_provider or ""
local embeddings_model = ai_plugin.cache.embeddings_model or ""

labels_table_ai_llm_status[1] = ai_plugin.meta.provider_name
labels_table_ai_llm_status[2] = ai_plugin.meta.request_model
labels_table_ai_llm_status[3] = cache_status
labels_table_ai_llm_status[4] = vector_db
labels_table_ai_llm_status[5] = embeddings_provider
labels_table_ai_llm_status[6] = embeddings_model
labels_table_ai_llm_status[7] = workspace
metrics.ai_llm_requests:inc(1, labels_table_ai_llm_status)

if ai_plugin.usage.cost and ai_plugin.usage.cost > 0 then
metrics.ai_llm_cost:inc(ai_plugin.usage.cost, labels_table_ai_llm_status)
end

labels_table_ai_llm_tokens[1] = ai_plugin.meta.provider_name
labels_table_ai_llm_tokens[2] = ai_plugin.meta.request_model
labels_table_ai_llm_tokens[3] = cache_status
labels_table_ai_llm_tokens[4] = vector_db
labels_table_ai_llm_tokens[5] = embeddings_provider
labels_table_ai_llm_tokens[6] = embeddings_model
labels_table_ai_llm_tokens[8] = workspace

if ai_plugin.usage.prompt_tokens and ai_plugin.usage.prompt_tokens > 0 then
labels_table_ai_llm_tokens[7] = "prompt_tokens"
metrics.ai_llm_tokens:inc(ai_plugin.usage.prompt_tokens, labels_table_ai_llm_tokens)
end

if ai_plugin.usage.completion_tokens and ai_plugin.usage.completion_tokens > 0 then
labels_table_ai_llm_tokens[7] = "completion_tokens"
metrics.ai_llm_tokens:inc(ai_plugin.usage.completion_tokens, labels_table_ai_llm_tokens)
end

if ai_plugin.usage.total_tokens and ai_plugin.usage.total_tokens > 0 then
labels_table_ai_llm_tokens[7] = "total_tokens"
metrics.ai_llm_tokens:inc(ai_plugin.usage.total_tokens, labels_table_ai_llm_tokens)
end
end
end
end

-- The upstream health metrics is turned on if at least one of
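To illustrate how the reused label tables line up with the counters declared in init() — the value order must match the declared label order, which is why slot 7 of labels_table_ai_llm_tokens is the token_type — here is a hypothetical increment and the series it would roughly produce:

    -- sketch only: one request's prompt tokens recorded on ai_llm_tokens_total
    local labels = {
      "openai",        -- ai_provider
      "gpt-4",         -- ai_model
      "",              -- cache_status
      "",              -- vector_db
      "",              -- embeddings_provider
      "",              -- embeddings_model
      "prompt_tokens", -- token_type
      "default",       -- workspace
    }
    metrics.ai_llm_tokens:inc(1200, labels)
    -- approximate exposition output:
    -- ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-4",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="prompt_tokens",workspace="default"} 1200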
4 changes: 4 additions & 0 deletions kong/plugins/prometheus/handler.lua
@@ -61,6 +61,10 @@ function PrometheusHandler:log(conf)
serialized.latencies = message.latencies
end

if conf.ai_metrics then
serialized.ai_metrics = message.ai
end

if conf.upstream_health_metrics then
exporter.set_export_upstream_health_metrics(true)
else
1 change: 1 addition & 0 deletions kong/plugins/prometheus/schema.lua
@@ -25,6 +25,7 @@ return {
fields = {
{ per_consumer = { description = "A boolean value that determines if per-consumer metrics should be collected. If enabled, the `kong_http_requests_total` and `kong_bandwidth_bytes` metrics fill in the consumer label when available.", type = "boolean", default = false }, },
{ status_code_metrics = { description = "A boolean value that determines if status code metrics should be collected. If enabled, `http_requests_total`, `stream_sessions_total` metrics will be exported.", type = "boolean", default = false }, },
{ ai_metrics = { description = "A boolean value that determines if ai metrics should be collected. If enabled, the `ai_llm_requests_total`, `ai_llm_cost_total` and `ai_llm_tokens_total` metrics will be exported.", type = "boolean", default = false }, },
{ latency_metrics = { description = "A boolean value that determines if latency metrics should be collected. If enabled, `kong_latency_ms`, `upstream_latency_ms` and `request_latency_ms` metrics will be exported.", type = "boolean", default = false }, },
{ bandwidth_metrics = { description = "A boolean value that determines if bandwidth metrics should be collected. If enabled, `bandwidth_bytes` and `stream_sessions_total` metrics will be exported.", type = "boolean", default = false }, },
{ upstream_health_metrics = { description = "A boolean value that determines if upstream metrics should be collected. If enabled, `upstream_target_health` metric will be exported.", type = "boolean", default = false }, },
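None of the new series are emitted unless the ai_metrics flag is enabled on the Prometheus plugin. A minimal sketch of such a configuration as a Lua table (how it is passed — Admin API, declarative config, or a test blueprint — is outside this diff):

    -- hypothetical Prometheus plugin entry with the new flag turned on
    local prometheus_plugin = {
      name   = "prometheus",
      config = {
        status_code_metrics = true,
        ai_metrics          = true,  -- added in this commit, defaults to false
      },
    }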