diff --git a/kong/llm/drivers/shared.lua b/kong/llm/drivers/shared.lua
index 6504a171dbbb..79393d491c72 100644
--- a/kong/llm/drivers/shared.lua
+++ b/kong/llm/drivers/shared.lua
@@ -44,8 +44,10 @@ local log_entry_keys = {
   COST_REQUEST = "cost_request",
 
   -- cache keys
-  DB_NAME = "db_name",
-  CACHE_TYPE = "cache_type",
+  VECTOR_DB = "vector_db",
+  EMBEDDINGS_PROVIDER = "embeddings_provider",
+  EMBEDDINGS_MODEL = "embeddings_model",
+  CACHE_STATUS = "cache_status",
 }
 
 local openai_override = os.getenv("OPENAI_TEST_PORT")
@@ -487,8 +489,10 @@ function _M.post_request(conf, response_object)
       [log_entry_keys.COST_REQUEST] = 0,
     },
     [log_entry_keys.CACHE_CONTAINER] = {
-      [log_entry_keys.DB_NAME] = "",
-      [log_entry_keys.CACHE_TYPE] = "not_cached",
+      [log_entry_keys.VECTOR_DB] = "",
+      [log_entry_keys.EMBEDDINGS_PROVIDER] = "",
+      [log_entry_keys.EMBEDDINGS_MODEL] = "",
+      [log_entry_keys.CACHE_STATUS] = "",
     },
   }
 
@@ -521,7 +525,7 @@ function _M.post_request(conf, response_object)
        and conf.model.options.input_cost
        and conf.model.options.output_cost then
      request_analytics_plugin[log_entry_keys.TOKENS_CONTAINER][log_entry_keys.COST_REQUEST] =
          (response_object.usage.prompt_tokens * conf.model.options.input_cost
-          + response_object.usage.completion_tokens * conf.model.options.output_cost) / 1000
+          + response_object.usage.completion_tokens * conf.model.options.output_cost) / 1000000 -- 1 million
    end
  end
diff --git a/kong/llm/schemas/init.lua b/kong/llm/schemas/init.lua
index d1f0e590360d..15ce1a2a1ef0 100644
--- a/kong/llm/schemas/init.lua
+++ b/kong/llm/schemas/init.lua
@@ -51,14 +51,14 @@ local model_options_schema = {
           default = 256 }},
       { input_cost = {
           type = "number",
-          description = "Defines the cost per 1000 tokens in your prompt.",
+          description = "Defines the cost per 1M tokens in your prompt.",
           required = false,
-          between = { 0.0, 0.1 }}},
+          gt = 0}},
       { output_cost = {
           type = "number",
-          description = "Defines the cost per 1000 tokens in the output of the AI.",
+          description = "Defines the cost per 1M tokens in the output of the AI.",
           required = false,
-          between = { 0.0, 0.1 }}},
+          gt = 0}},
       { temperature = {
           type = "number",
           description = "Defines the matching temperature, if using chat or completion models.",
diff --git a/kong/plugins/prometheus/exporter.lua b/kong/plugins/prometheus/exporter.lua
index 91feeaa8d4d3..4fef171f2ab7 100644
--- a/kong/plugins/prometheus/exporter.lua
+++ b/kong/plugins/prometheus/exporter.lua
@@ -151,15 +151,15 @@ local function init()
   if ai_request then
     metrics.ai_llm_requests = prometheus:counter("ai_llm_requests_total",
         "AI requests total per ai_provider in Kong",
-        {"ai_provider", "ai_model", "cache", "db_name", "workspace"})
+        {"ai_provider", "ai_model", "cache", "vector_db", "embeddings_provider", "embeddings_model", "workspace"})
 
     metrics.ai_llm_cost = prometheus:counter("ai_llm_cost_total",
         "AI requests cost per ai_provider/cache in Kong",
-        {"ai_provider", "ai_model", "cache", "db_name", "workspace"})
+        {"ai_provider", "ai_model", "cache", "vector_db", "embeddings_provider", "embeddings_model", "workspace"})
 
     metrics.ai_llm_tokens = prometheus:counter("ai_llm_tokens_total",
         "AI requests cost per ai_provider/cache in Kong",
-        {"ai_provider", "ai_model", "cache", "db_name", "token_type", "workspace"})
+        {"ai_provider", "ai_model", "cache", "vector_db", "embeddings_provider", "embeddings_model", "token_type", "workspace"})
   end
 
   -- Hybrid mode status
@@ -225,8 +225,8 @@ local upstream_target_addr_health_table = {
   { value = 0, labels = { 0, 0, 0, "dns_error", ngx.config.subsystem } },
 }
 -- ai
-local labels_table_ai_llm_status = {0, 0, 0, 0, 0}
-local labels_table_ai_llm_tokens = {0, 0, 0, 0, 0, 0}
+local labels_table_ai_llm_status = {0, 0, 0, 0, 0, 0, 0}
+local labels_table_ai_llm_tokens = {0, 0, 0, 0, 0, 0, 0, 0}
 
 local function set_healthiness_metrics(table, upstream, target, address, status, metrics_bucket)
   for i = 1, #table do
@@ -336,21 +336,33 @@ local function log(message, serialized)
 
   if serialized.ai_metrics then
     for _, ai_plugin in pairs(serialized.ai_metrics) do
-      local cache_type
-      if ai_plugin.cache and ai_plugin.cache.cache_type then
-        cache_type = ai_plugin.cache.cache_type
+      local cache_status
+      if ai_plugin.cache and ai_plugin.cache.cache_status then
+        cache_status = ai_plugin.cache.cache_status
       end
 
-      local db_name
-      if ai_plugin.cache and ai_plugin.cache.db_name then
-        db_name = ai_plugin.cache.db_name
+      local vector_db, embeddings_provider, embeddings_model
+      if ai_plugin.cache then
+        if ai_plugin.cache.vector_db then
+          vector_db = ai_plugin.cache.vector_db
+        end
+
+        if ai_plugin.cache.embeddings_provider then
+          embeddings_provider = ai_plugin.cache.embeddings_provider
+        end
+
+        if ai_plugin.cache.embeddings_model then
+          embeddings_model = ai_plugin.cache.embeddings_model
+        end
       end
 
       labels_table_ai_llm_status[1] = ai_plugin.meta.provider_name
       labels_table_ai_llm_status[2] = ai_plugin.meta.request_model
-      labels_table_ai_llm_status[3] = cache_type
-      labels_table_ai_llm_status[4] = db_name
-      labels_table_ai_llm_status[5] = workspace
+      labels_table_ai_llm_status[3] = cache_status
+      labels_table_ai_llm_status[4] = vector_db
+      labels_table_ai_llm_status[5] = embeddings_provider
+      labels_table_ai_llm_status[6] = embeddings_model
+      labels_table_ai_llm_status[7] = workspace
       metrics.ai_llm_requests:inc(1, labels_table_ai_llm_status)
 
       if ai_plugin.usage.cost_request and ai_plugin.usage.cost_request > 0 then
@@ -359,12 +371,14 @@ local function log(message, serialized)
 
       labels_table_ai_llm_tokens[1] = ai_plugin.meta.provider_name
       labels_table_ai_llm_tokens[2] = ai_plugin.meta.request_model
-      labels_table_ai_llm_tokens[3] = cache_type
-      labels_table_ai_llm_tokens[4] = db_name
-      labels_table_ai_llm_tokens[6] = workspace
+      labels_table_ai_llm_tokens[3] = cache_status
+      labels_table_ai_llm_tokens[4] = vector_db
+      labels_table_ai_llm_tokens[5] = embeddings_provider
+      labels_table_ai_llm_tokens[6] = embeddings_model
+      labels_table_ai_llm_tokens[8] = workspace
 
       if ai_plugin.usage.prompt_tokens and ai_plugin.usage.prompt_tokens > 0 then
-        labels_table_ai_llm_tokens[5] = "prompt_tokens"
+        labels_table_ai_llm_tokens[7] = "prompt_tokens"
         metrics.ai_llm_tokens:inc(ai_plugin.usage.prompt_tokens, labels_table_ai_llm_tokens)
       end
 
diff --git a/spec/03-plugins/26-prometheus/02-access_spec.lua b/spec/03-plugins/26-prometheus/02-access_spec.lua
index e17c3f2c6d2a..f614e9b87a00 100644
--- a/spec/03-plugins/26-prometheus/02-access_spec.lua
+++ b/spec/03-plugins/26-prometheus/02-access_spec.lua
@@ -702,8 +702,8 @@ describe("Plugin: prometheus (access) AI metrics", function()
             max_tokens = 256,
             temperature = 1.0,
             upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/llm/v1/chat/good",
-            input_cost = 0.01,
-            output_cost = 0.01,
+            input_cost = 10.0,
+            output_cost = 10.0,
           },
         },
       },
@@ -764,13 +764,13 @@ describe("Plugin: prometheus (access) AI metrics", function()
      assert.matches('kong_nginx_metric_errors_total 0', body, nil, true)
      assert.matches('http_requests_total{service="empty_service",route="http-route",code="200",source="service",workspace="default",consumer=""} 1', body, nil, true)
 
-      assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",workspace="default"} 1', body, nil, true)
+      assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 1', body, nil, true)
 
-      assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",workspace="default"} 0.00037', body, nil, true)
+      assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00037', body, nil, true)
 
-      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",token_type="completion_tokens",workspace="default"} 12', body, nil, true)
-      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",token_type="prompt_tokens",workspace="default"} 25', body, nil, true)
-      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",token_type="total_tokens",workspace="default"} 37', body, nil, true)
+      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",token_type="completion_tokens",workspace="default"} 12', body, nil, true)
+      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",token_type="prompt_tokens",workspace="default"} 25', body, nil, true)
+      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",token_type="total_tokens",workspace="default"} 37', body, nil, true)
    end)
 
    it("increments the count for proxied AI requests", function()
@@ -800,13 +800,13 @@ describe("Plugin: prometheus (access) AI metrics", function()
      assert.matches('kong_nginx_metric_errors_total 0', body, nil, true)
      assert.matches('http_requests_total{service="empty_service",route="http-route",code="200",source="service",workspace="default",consumer=""} 2', body, nil, true)
 
-      assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",workspace="default"} 2', body, nil, true)
+      assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 2', body, nil, true)
 
-      assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",workspace="default"} 0.00074', body, nil, true)
+      assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00074', body, nil, true)
 
-      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",token_type="completion_tokens",workspace="default"} 24', body, nil, true)
-      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",token_type="prompt_tokens",workspace="default"} 50', body, nil, true)
-      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",token_type="total_tokens",workspace="default"} 74', body, nil, true)
+      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",token_type="completion_tokens",workspace="default"} 24', body, nil, true)
+      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",token_type="prompt_tokens",workspace="default"} 50', body, nil, true)
+      assert.matches('ai_llm_tokens_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",token_type="total_tokens",workspace="default"} 74', body, nil, true)
    end)
 
    it("behave correctly if AI metrics are not found", function()
@@ -832,7 +832,7 @@ describe("Plugin: prometheus (access) AI metrics", function()
      assert.matches('http_requests_total{service="empty_service",route="http-route",code="400",source="kong",workspace="default",consumer=""} 1', body, nil, true)
      assert.matches('kong_nginx_metric_errors_total 0', body, nil, true)
 
-      assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",workspace="default"} 2', body, nil, true)
-      assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",db_name="",workspace="default"} 0.00074', body, nil, true)
+      assert.matches('ai_llm_requests_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 2', body, nil, true)
+      assert.matches('ai_llm_cost_total{ai_provider="openai",ai_model="gpt-3.5-turbo",cache="not_cached",vector_db="",embeddings_provider="",embeddings_model="",workspace="default"} 0.00074', body, nil, true)
    end)
 end)
\ No newline at end of file
diff --git a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
index 547ea3fb8c25..7fc751546d11 100644
--- a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
+++ b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
@@ -49,8 +49,10 @@ local _EXPECTED_CHAT_STATS = {
       total_tokens = 37,
     },
     cache = {
-      cache_type = 'not_cached',
-      db_name = '',
+      cache_status = '',
+      vector_db = '',
+      embeddings_provider = '',
+      embeddings_model = '',
     }
   },
 }
@@ -253,8 +255,8 @@ for _, strategy in helpers.all_strategies() do if strategy ~= "cassandra" then
               max_tokens = 256,
               temperature = 1.0,
               upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/llm/v1/chat/good",
-              input_cost = 0.01,
-              output_cost = 0.01,
+              input_cost = 10.0,
+              output_cost = 10.0,
             },
           },
         },
diff --git a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua
index 0003b4c3fb41..e96e570f3887 100644
--- a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua
+++ b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua
@@ -43,8 +43,8 @@ local OPENAI_FLAT_RESPONSE = {
       max_tokens = 512,
       temperature = 0.5,
       upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/flat",
-      input_cost = 0.01,
-      output_cost = 0.01,
+      input_cost = 10.0,
+      output_cost = 10.0,
     },
   },
   auth = {
@@ -131,8 +131,10 @@ local _EXPECTED_CHAT_STATS = {
       total_tokens = 37,
     },
     cache = {
-      cache_type = 'not_cached',
-      db_name = '',
+      cache_status = '',
+      vector_db = '',
+      embeddings_provider = '',
+      embeddings_model = '',
     }
   },
 }
diff --git a/spec/03-plugins/40-ai-response-transformer/02-integration_spec.lua b/spec/03-plugins/40-ai-response-transformer/02-integration_spec.lua
index 4f2122182db5..806c824ace0f 100644
--- a/spec/03-plugins/40-ai-response-transformer/02-integration_spec.lua
+++ b/spec/03-plugins/40-ai-response-transformer/02-integration_spec.lua
@@ -60,8 +60,8 @@ local OPENAI_FLAT_RESPONSE = {
       max_tokens = 512,
       temperature = 0.5,
       upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/flat",
-      input_cost = 0.01,
-      output_cost = 0.01,
+      input_cost = 10.0,
+      output_cost = 10.0,
     },
   },
   auth = {
@@ -188,8 +188,10 @@ local _EXPECTED_CHAT_STATS = {
       total_tokens = 37,
     },
     cache = {
-      cache_type = 'not_cached',
-      db_name = '',
+      cache_status = '',
+      vector_db = '',
+      embeddings_provider = '',
+      embeddings_model = '',
     }
   },
 }
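
Note on the pricing change above: input_cost and output_cost are now interpreted as cost per 1M tokens, so the driver divides the summed token cost by 1,000,000 instead of 1,000. A minimal standalone sketch of that arithmetic, using the fixture values from the updated specs (the helper function name below is illustrative and not part of this diff):

-- Mirrors the cost formula in kong/llm/drivers/shared.lua after this change:
-- prices are per 1M tokens, so the summed token cost is divided by 1e6.
local function estimate_cost(usage, input_cost, output_cost)
  return (usage.prompt_tokens * input_cost
          + usage.completion_tokens * output_cost) / 1000000
end

-- With input_cost = output_cost = 10.0, 25 prompt tokens and 12 completion
-- tokens: (25 * 10.0 + 12 * 10.0) / 1000000 = 0.00037, matching the
-- ai_llm_cost_total assertion in 02-access_spec.lua.
print(estimate_cost({ prompt_tokens = 25, completion_tokens = 12 }, 10.0, 10.0))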