diff --git a/changelog/unreleased/kong/add-ai-data-prometheus.yml b/changelog/unreleased/kong/add-ai-data-prometheus.yml new file mode 100644 index 000000000000..284c4fd933ce --- /dev/null +++ b/changelog/unreleased/kong/add-ai-data-prometheus.yml @@ -0,0 +1,3 @@ +"message": "**prometheus**: Added `ai_llm_requests_total`, `ai_llm_cost_total` and `ai_llm_tokens_total` metrics in the Prometheus plugin to start counting AI usage." +"type": feature +"scope": Core diff --git a/kong/llm/drivers/shared.lua b/kong/llm/drivers/shared.lua index a41a6e664c7f..9d62998c34cd 100644 --- a/kong/llm/drivers/shared.lua +++ b/kong/llm/drivers/shared.lua @@ -21,25 +21,32 @@ end -- local log_entry_keys = { - TOKENS_CONTAINER = "usage", + USAGE_CONTAINER = "usage", META_CONTAINER = "meta", PAYLOAD_CONTAINER = "payload", + CACHE_CONTAINER = "cache", -- payload keys REQUEST_BODY = "request", RESPONSE_BODY = "response", -- meta keys + PLUGIN_ID = "plugin_id", + PROVIDER_NAME = "provider_name", REQUEST_MODEL = "request_model", RESPONSE_MODEL = "response_model", - PROVIDER_NAME = "provider_name", - PLUGIN_ID = "plugin_id", -- usage keys - PROCESSING_TIME = "processing_time", - PROMPT_TOKEN = "prompt_token", - COMPLETION_TOKEN = "completion_token", + PROMPT_TOKENS = "prompt_tokens", + COMPLETION_TOKENS = "completion_tokens", TOTAL_TOKENS = "total_tokens", + COST = "cost", + + -- cache keys + VECTOR_DB = "vector_db", + EMBEDDINGS_PROVIDER = "embeddings_provider", + EMBEDDINGS_MODEL = "embeddings_model", + CACHE_STATUS = "cache_status", } local openai_override = os.getenv("OPENAI_TEST_PORT") @@ -487,26 +494,18 @@ function _M.post_request(conf, response_object) request_analytics = {} end - -- check if we already have analytics for this provider - local request_analytics_plugin = request_analytics[plugin_name] - - -- create a new structure if not - if not request_analytics_plugin then - request_analytics_plugin = { - [log_entry_keys.META_CONTAINER] = {}, - [log_entry_keys.TOKENS_CONTAINER] = { - [log_entry_keys.PROMPT_TOKEN] = 0, - [log_entry_keys.COMPLETION_TOKEN] = 0, - [log_entry_keys.TOTAL_TOKENS] = 0, - }, - } - end + -- create a new analytics structure for this plugin + local request_analytics_plugin = { + [log_entry_keys.META_CONTAINER] = {}, + [log_entry_keys.USAGE_CONTAINER] = {}, + [log_entry_keys.CACHE_CONTAINER] = {}, + } -- Set the model, response, and provider names in the current try context + request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PLUGIN_ID] = conf.__plugin_id + request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PROVIDER_NAME] = provider_name request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.REQUEST_MODEL] = kong.ctx.plugin.llm_model_requested or conf.model.name request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.RESPONSE_MODEL] = response_object.model or conf.model.name - request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PROVIDER_NAME] = provider_name - request_analytics_plugin[log_entry_keys.META_CONTAINER][log_entry_keys.PLUGIN_ID] = conf.__plugin_id -- set extra per-provider meta if kong.ctx.plugin.ai_extra_meta and type(kong.ctx.plugin.ai_extra_meta) == "table" then @@ -518,13 +517,20 @@ function _M.post_request(conf, response_object) -- Capture openai-format usage stats from the transformed response body if response_object.usage then if response_object.usage.prompt_tokens then - request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.PROMPT_TOKEN] = 
request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.PROMPT_TOKEN] + response_object.usage.prompt_tokens + request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.PROMPT_TOKENS] = response_object.usage.prompt_tokens end if response_object.usage.completion_tokens then - request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COMPLETION_TOKEN] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COMPLETION_TOKEN] + response_object.usage.completion_tokens + request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.COMPLETION_TOKENS] = response_object.usage.completion_tokens end if response_object.usage.total_tokens then - request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.TOTAL_TOKENS] = request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.TOTAL_TOKENS] + response_object.usage.total_tokens + request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TOTAL_TOKENS] = response_object.usage.total_tokens + end + + if response_object.usage.prompt_tokens and response_object.usage.completion_tokens + and conf.model.options.input_cost and conf.model.options.output_cost then + request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.COST] = + (response_object.usage.prompt_tokens * conf.model.options.input_cost + + response_object.usage.completion_tokens * conf.model.options.output_cost) / 1000000 -- 1 million end end @@ -541,13 +547,17 @@ function _M.post_request(conf, response_object) kong.ctx.shared.analytics = request_analytics if conf.logging and conf.logging.log_statistics then - -- Log analytics data - kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.TOKENS_CONTAINER), - request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER]) - - -- Log meta + -- Log meta data kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.META_CONTAINER), request_analytics_plugin[log_entry_keys.META_CONTAINER]) + + -- Log usage data + kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.USAGE_CONTAINER), + request_analytics_plugin[log_entry_keys.USAGE_CONTAINER]) + + -- Log cache data + kong.log.set_serialize_value(fmt("ai.%s.%s", plugin_name, log_entry_keys.CACHE_CONTAINER), + request_analytics_plugin[log_entry_keys.CACHE_CONTAINER]) end -- log tokens response for reports and billing diff --git a/kong/llm/schemas/init.lua b/kong/llm/schemas/init.lua index 37b5aaf34761..15ce1a2a1ef0 100644 --- a/kong/llm/schemas/init.lua +++ b/kong/llm/schemas/init.lua @@ -49,6 +49,16 @@ local model_options_schema = { description = "Defines the max_tokens, if using chat or completion models.", required = false, default = 256 }}, + { input_cost = { + type = "number", + description = "Defines the cost per 1M tokens in your prompt.", + required = false, + gt = 0}}, + { output_cost = { + type = "number", + description = "Defines the cost per 1M tokens in the output of the AI.", + required = false, + gt = 0}}, { temperature = { type = "number", description = "Defines the matching temperature, if using chat or completion models.", diff --git a/kong/plugins/prometheus/exporter.lua b/kong/plugins/prometheus/exporter.lua index d94d9a08e14b..2a94ebac272c 100644 --- a/kong/plugins/prometheus/exporter.lua +++ b/kong/plugins/prometheus/exporter.lua @@ -34,7 +34,6 @@ package.loaded['prometheus_resty_counter'] = require("resty.counter") local kong_subsystem = ngx.config.subsystem local http_subsystem = kong_subsystem == "http" - 
local function init() local shm = "prometheus_metrics" if not ngx.shared[shm] then @@ -145,6 +144,19 @@ local function init() {"service", "route", "direction", "workspace"}) end + -- AI mode + metrics.ai_llm_requests = prometheus:counter("ai_llm_requests_total", + "AI requests total per ai_provider in Kong", + {"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "workspace"}) + + metrics.ai_llm_cost = prometheus:counter("ai_llm_cost_total", + "AI requests cost per ai_provider/cache in Kong", + {"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "workspace"}) + + metrics.ai_llm_tokens = prometheus:counter("ai_llm_tokens_total", + "AI tokens total per ai_provider/cache in Kong", + {"ai_provider", "ai_model", "cache_status", "vector_db", "embeddings_provider", "embeddings_model", "token_type", "workspace"}) + -- Hybrid mode status if role == "control_plane" then metrics.data_plane_last_seen = prometheus:gauge("data_plane_last_seen", @@ -207,6 +219,9 @@ local upstream_target_addr_health_table = { { value = 0, labels = { 0, 0, 0, "unhealthy", ngx.config.subsystem } }, { value = 0, labels = { 0, 0, 0, "dns_error", ngx.config.subsystem } }, } +-- ai +local labels_table_ai_llm_status = {0, 0, 0, 0, 0, 0, 0} +local labels_table_ai_llm_tokens = {0, 0, 0, 0, 0, 0, 0, 0} local function set_healthiness_metrics(table, upstream, target, address, status, metrics_bucket) for i = 1, #table do @@ -313,6 +328,51 @@ local function log(message, serialized) metrics.kong_latency:observe(kong_proxy_latency, labels_table_latency) end end + + if serialized.ai_metrics then + for _, ai_plugin in pairs(serialized.ai_metrics) do + local cache_status = ai_plugin.cache.cache_status or "" + local vector_db = ai_plugin.cache.vector_db or "" + local embeddings_provider = ai_plugin.cache.embeddings_provider or "" + local embeddings_model = ai_plugin.cache.embeddings_model or "" + + labels_table_ai_llm_status[1] = ai_plugin.meta.provider_name + labels_table_ai_llm_status[2] = ai_plugin.meta.request_model + labels_table_ai_llm_status[3] = cache_status + labels_table_ai_llm_status[4] = vector_db + labels_table_ai_llm_status[5] = embeddings_provider + labels_table_ai_llm_status[6] = embeddings_model + labels_table_ai_llm_status[7] = workspace + metrics.ai_llm_requests:inc(1, labels_table_ai_llm_status) + + if ai_plugin.usage.cost and ai_plugin.usage.cost > 0 then + metrics.ai_llm_cost:inc(ai_plugin.usage.cost, labels_table_ai_llm_status) + end + + labels_table_ai_llm_tokens[1] = ai_plugin.meta.provider_name + labels_table_ai_llm_tokens[2] = ai_plugin.meta.request_model + labels_table_ai_llm_tokens[3] = cache_status + labels_table_ai_llm_tokens[4] = vector_db + labels_table_ai_llm_tokens[5] = embeddings_provider + labels_table_ai_llm_tokens[6] = embeddings_model + labels_table_ai_llm_tokens[8] = workspace + + if ai_plugin.usage.prompt_tokens and ai_plugin.usage.prompt_tokens > 0 then + labels_table_ai_llm_tokens[7] = "prompt_tokens" + metrics.ai_llm_tokens:inc(ai_plugin.usage.prompt_tokens, labels_table_ai_llm_tokens) + end + + if ai_plugin.usage.completion_tokens and ai_plugin.usage.completion_tokens > 0 then + labels_table_ai_llm_tokens[7] = "completion_tokens" + metrics.ai_llm_tokens:inc(ai_plugin.usage.completion_tokens, labels_table_ai_llm_tokens) + end + + if ai_plugin.usage.total_tokens and ai_plugin.usage.total_tokens > 0 then + labels_table_ai_llm_tokens[7] = "total_tokens" + 
metrics.ai_llm_tokens:inc(ai_plugin.usage.total_tokens, labels_table_ai_llm_tokens) + end + end + end end -- The upstream health metrics is turned on if at least one of diff --git a/kong/plugins/prometheus/handler.lua b/kong/plugins/prometheus/handler.lua index d7bce154eb74..3666b406f009 100644 --- a/kong/plugins/prometheus/handler.lua +++ b/kong/plugins/prometheus/handler.lua @@ -54,6 +54,10 @@ function PrometheusHandler:log(conf) serialized.latencies = message.latencies end + if conf.ai_metrics then + serialized.ai_metrics = message.ai + end + if conf.upstream_health_metrics then exporter.set_export_upstream_health_metrics(true) else diff --git a/kong/plugins/prometheus/schema.lua b/kong/plugins/prometheus/schema.lua index 9b067e3bf877..a23e3b3fc5ed 100644 --- a/kong/plugins/prometheus/schema.lua +++ b/kong/plugins/prometheus/schema.lua @@ -18,6 +18,7 @@ return { fields = { { per_consumer = { description = "A boolean value that determines if per-consumer metrics should be collected. If enabled, the `kong_http_requests_total` and `kong_bandwidth_bytes` metrics fill in the consumer label when available.", type = "boolean", default = false }, }, { status_code_metrics = { description = "A boolean value that determines if status code metrics should be collected. If enabled, `http_requests_total`, `stream_sessions_total` metrics will be exported.", type = "boolean", default = false }, }, + { ai_metrics = { description = "A boolean value that determines if ai metrics should be collected. If enabled, the `ai_llm_requests_total`, `ai_llm_cost_total` and `ai_llm_tokens_total` metrics will be exported.", type = "boolean", default = false }, }, { latency_metrics = { description = "A boolean value that determines if latency metrics should be collected. If enabled, `kong_latency_ms`, `upstream_latency_ms` and `request_latency_ms` metrics will be exported.", type = "boolean", default = false }, }, { bandwidth_metrics = { description = "A boolean value that determines if bandwidth metrics should be collected. If enabled, `bandwidth_bytes` and `stream_sessions_total` metrics will be exported.", type = "boolean", default = false }, }, { upstream_health_metrics = { description = "A boolean value that determines if upstream metrics should be collected. 
If enabled, `upstream_target_health` metric will be exported.", type = "boolean", default = false }, }, diff --git a/spec/03-plugins/26-prometheus/02-access_spec.lua b/spec/03-plugins/26-prometheus/02-access_spec.lua index f1478b558383..9138637d2f27 100644 --- a/spec/03-plugins/26-prometheus/02-access_spec.lua +++ b/spec/03-plugins/26-prometheus/02-access_spec.lua @@ -1,8 +1,10 @@ local helpers = require "spec.helpers" local shell = require "resty.shell" +local pl_file = require "pl.file" local tcp_service_port = helpers.get_available_port() local tcp_proxy_port = helpers.get_available_port() +local MOCK_PORT = helpers.get_available_port() local UUID_PATTERN = "%x%x%x%x%x%x%x%x%-%x%x%x%x%-%x%x%x%x%-%x%x%x%x%-%x%x%x%x%x%x%x%x%x%x%x%x" describe("Plugin: prometheus (access)", function() @@ -611,3 +613,287 @@ describe("Plugin: prometheus (access) granular metrics switch", function() end) end + +describe("Plugin: prometheus (access) AI metrics", function() + local proxy_client + local admin_client + local prometheus_plugin + + setup(function() + local bp = helpers.get_db_utils() + + local fixtures = { + http_mock = {}, + } + + fixtures.http_mock.openai = [[ + server { + server_name openai; + listen ]]..MOCK_PORT..[[; + + default_type 'application/json'; + + + location = "/llm/v1/chat/good" { + content_by_lua_block { + local pl_file = require "pl.file" + local json = require("cjson.safe") + + ngx.req.read_body() + local body, err = ngx.req.get_body_data() + body, err = json.decode(body) + + local token = ngx.req.get_headers()["authorization"] + local token_query = ngx.req.get_uri_args()["apikey"] + + if token == "Bearer openai-key" or token_query == "openai-key" or body.apikey == "openai-key" then + ngx.req.read_body() + local body, err = ngx.req.get_body_data() + body, err = json.decode(body) + + if err or (body.messages == ngx.null) then + ngx.status = 400 + ngx.print(pl_file.read("spec/fixtures/ai-proxy/openai/llm-v1-chat/responses/bad_request.json")) + else + ngx.status = 200 + ngx.print(pl_file.read("spec/fixtures/ai-proxy/openai/llm-v1-chat/responses/good.json")) + end + else + ngx.status = 401 + ngx.print(pl_file.read("spec/fixtures/ai-proxy/openai/llm-v1-chat/responses/unauthorized.json")) + end + } + } + } + ]] + + local empty_service = assert(bp.services:insert { + name = "empty_service", + host = "localhost", --helpers.mock_upstream_host, + port = 8080, --MOCK_PORT, + path = "/", + }) + + -- 200 chat good with one option + local chat_good = assert(bp.routes:insert { + service = empty_service, + name = "http-route", + protocols = { "http" }, + strip_path = true, + paths = { "/" } + }) + + bp.plugins:insert { + name = "ai-proxy", + route = { id = chat_good.id }, + config = { + route_type = "llm/v1/chat", + logging = { + log_payloads = false, + log_statistics = true, + }, + auth = { + header_name = "Authorization", + header_value = "Bearer openai-key", + }, + model = { + name = "gpt-3.5-turbo", + provider = "openai", + options = { + max_tokens = 256, + temperature = 1.0, + upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/llm/v1/chat/good", + input_cost = 10.0, + output_cost = 10.0, + }, + }, + }, + } + + prometheus_plugin = assert(bp.plugins:insert { + protocols = { "http", "https", "grpc", "grpcs", "tcp", "tls" }, + name = "prometheus", + config = { + -- ai_metrics = true, + status_code_metrics = true, + }, + }) + + assert(helpers.start_kong ({ + nginx_conf = "spec/fixtures/custom_nginx.template", + plugins = "bundled, prometheus", + }, nil, nil, fixtures)) + 
proxy_client = helpers.proxy_client() + admin_client = helpers.admin_client() + end) + + teardown(function() + if proxy_client then + proxy_client:close() + end + if admin_client then + admin_client:close() + end + + helpers.stop_kong() + end) + + it("no AI metrics when not enabled in Prometheus plugin", function() + local res = assert(proxy_client:send { + method = "GET", + path = "/status/200", + headers = { + host = helpers.mock_upstream_host, + authorization = 'Bearer openai-key', + ["content-type"] = 'application/json', + accept = 'application/json', + }, + body = pl_file.read("spec/fixtures/ai-proxy/openai/llm-v1-chat/requests/good.json"), + }) + assert.res_status(200, res) + + local body + helpers.wait_until(function() + local res = assert(admin_client:send { + method = "GET", + path = "/metrics", + }) + body = assert.res_status(200, res) + return res.status == 200 + end) + + assert.matches('kong_nginx_metric_errors_total 0', body, nil, true) + assert.matches('http_requests_total{service="empty_service",route="http-route",code="200",source="service",workspace="default",consumer=""} 1', body, nil, true) + + assert.not_match('ai_llm_requests_total', body, nil, true) + assert.not_match('ai_llm_cost_total', body, nil, true) + assert.not_match('ai_llm_tokens_total', body, nil, true) + end) + + it("update prometheus plugin config", function() + local body + helpers.wait_until(function() + local res = assert(admin_client:send { + method = "PATCH", + path = "/plugins/" .. prometheus_plugin.id, + body = { + name = "prometheus", + config = { + status_code_metrics = true, + ai_metrics = true, + } + }, + headers = { + ["Content-Type"] = "application/json" + } + }) + body = assert.res_status(200, res) + return res.status == 200 + end) + + local cjson = require "cjson" + local json = cjson.decode(body) + assert.equal(true, json.config.ai_metrics) + + ngx.sleep(2) + end) + + it("adds the count for proxied AI requests", function() + local res = assert(proxy_client:send { + method = "GET", + path = "/status/200", + headers = { + host = helpers.mock_upstream_host, + authorization = 'Bearer openai-key', + ["content-type"] = 'application/json', + accept = 'application/json', + }, + body = pl_file.read("spec/fixtures/ai-proxy/openai/llm-v1-chat/requests/good.json"), + }) + assert.res_status(200, res) + + local body + helpers.wait_until(function() + local res = assert(admin_client:send { + method = "GET", + path = "/metrics", + }) + body = assert.res_status(200, res) + return res.status == 200 + end) + + assert.matches('kong_nginx_metric_errors_total 0', body, nil, true) + assert.matches('http_requests_total{service="empty_service",route="http-route",code="200",source="service",workspace="default",consumer=""} 2', body, nil, true) + + assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 1', body, nil, true) + + assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00037', body, nil, true) + + assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="completion_tokens",workspace="default"} 12', body, nil, true) + 
assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="prompt_tokens",workspace="default"} 25', body, nil, true) + assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="total_tokens",workspace="default"} 37', body, nil, true) + end) + + it("increments the count for proxied AI requests", function() + local res = assert(proxy_client:send { + method = "GET", + path = "/status/200", + headers = { + host = helpers.mock_upstream_host, + authorization = 'Bearer openai-key', + ["content-type"] = 'application/json', + accept = 'application/json', + }, + body = pl_file.read("spec/fixtures/ai-proxy/openai/llm-v1-chat/requests/good.json"), + }) + assert.res_status(200, res) + + local body + helpers.wait_until(function() + local res = assert(admin_client:send { + method = "GET", + path = "/metrics", + }) + body = assert.res_status(200, res) + return res.status == 200 + end) + + assert.matches('kong_nginx_metric_errors_total 0', body, nil, true) + assert.matches('http_requests_total{service="empty_service",route="http-route",code="200",source="service",workspace="default",consumer=""} 3', body, nil, true) + + assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 2', body, nil, true) + + assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00074', body, nil, true) + + assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="completion_tokens",workspace="default"} 24', body, nil, true) + assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="prompt_tokens",workspace="default"} 50', body, nil, true) + assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",token_type="total_tokens",workspace="default"} 74', body, nil, true) + end) + + it("behave correctly if AI metrics are not found", function() + local res = assert(proxy_client:send { + method = "GET", + path = "/status/400", + headers = { + host = helpers.mock_upstream_host, + } + }) + assert.res_status(400, res) + + local body + helpers.wait_until(function() + local res = assert(admin_client:send { + method = "GET", + path = "/metrics", + }) + body = assert.res_status(200, res) + return res.status == 200 + end) + + assert.matches('http_requests_total{service="empty_service",route="http-route",code="400",source="kong",workspace="default",consumer=""} 1', body, nil, true) + assert.matches('kong_nginx_metric_errors_total 0', body, nil, true) + + assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 2', body, nil, true) + assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache_status="",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00074', body, nil, true) + end) +end) \ No newline at end of file diff 
--git a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua index b0c6e4ee7ef6..b67d815fa07e 100644 --- a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua @@ -46,10 +46,12 @@ local _EXPECTED_CHAT_STATS = { response_model = 'gpt-3.5-turbo-0613', }, usage = { - completion_token = 12, - prompt_token = 25, + prompt_tokens = 25, + completion_tokens = 12, total_tokens = 37, + cost = 0.00037, }, + cache = {} }, } @@ -250,7 +252,9 @@ for _, strategy in helpers.all_strategies() do if strategy ~= "cassandra" then options = { max_tokens = 256, temperature = 1.0, - upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/llm/v1/chat/good" + upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/llm/v1/chat/good", + input_cost = 10.0, + output_cost = 10.0, }, }, }, diff --git a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua index 25351787ec2c..0e8014dc5fee 100644 --- a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua +++ b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua @@ -43,7 +43,9 @@ local OPENAI_FLAT_RESPONSE = { options = { max_tokens = 512, temperature = 0.5, - upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/flat" + upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/flat", + input_cost = 10.0, + output_cost = 10.0, }, }, auth = { @@ -124,10 +126,12 @@ local _EXPECTED_CHAT_STATS = { response_model = 'gpt-3.5-turbo-0613', }, usage = { - completion_token = 12, - prompt_token = 25, + prompt_tokens = 25, + completion_tokens = 12, total_tokens = 37, + cost = 0.00037, }, + cache = {} }, } diff --git a/spec/03-plugins/40-ai-response-transformer/02-integration_spec.lua b/spec/03-plugins/40-ai-response-transformer/02-integration_spec.lua index 47072bb39a06..34f5afab3b6c 100644 --- a/spec/03-plugins/40-ai-response-transformer/02-integration_spec.lua +++ b/spec/03-plugins/40-ai-response-transformer/02-integration_spec.lua @@ -60,7 +60,9 @@ local OPENAI_FLAT_RESPONSE = { options = { max_tokens = 512, temperature = 0.5, - upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/flat" + upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/flat", + input_cost = 10.0, + output_cost = 10.0, }, }, auth = { @@ -181,10 +183,12 @@ local _EXPECTED_CHAT_STATS = { response_model = 'gpt-3.5-turbo-0613', }, usage = { - completion_token = 12, - prompt_token = 25, + prompt_tokens = 25, + completion_tokens = 12, total_tokens = 37, + cost = 0.00037, }, + cache = {} }, }
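Note on the expected cost values: the `0.00037` per request (and `0.00074` after two requests) asserted in the specs above follows directly from the pricing fields this change introduces. `input_cost` and `output_cost` are prices per 1M tokens, so the driver multiplies them by the prompt and completion token counts and divides by one million. A minimal standalone sketch of that arithmetic, assuming a plain Lua 5.1+ interpreter; `llm_cost` is an illustrative helper for this note, not a function defined in the patch:

-- Sketch of the cost formula added in kong/llm/drivers/shared.lua:
-- input_cost/output_cost are priced per 1M tokens.
local function llm_cost(usage, options)
  -- the patch only records a cost when both token counts and both prices are set
  if not (usage.prompt_tokens and usage.completion_tokens
          and options.input_cost and options.output_cost) then
    return nil
  end
  return (usage.prompt_tokens * options.input_cost
          + usage.completion_tokens * options.output_cost) / 1000000
end

-- Values used by the specs above: 25 prompt + 12 completion tokens,
-- both priced at 10.0 per 1M tokens -> (25*10 + 12*10) / 1e6 = 0.00037.
local cost = llm_cost({ prompt_tokens = 25, completion_tokens = 12 },
                      { input_cost = 10.0, output_cost = 10.0 })
assert(math.abs(cost - 0.00037) < 1e-12)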