diff --git a/kong/llm/plugin/base.lua b/kong/llm/plugin/base.lua
index 3a23c78a8732..0daca7a29419 100644
--- a/kong/llm/plugin/base.lua
+++ b/kong/llm/plugin/base.lua
@@ -19,6 +19,13 @@ local STAGES = {
   RES_POST_PROCESSING = 7,
 }
 
+-- Filters in these stages are allowed to execute more than once in a request
+-- TODO: implement singleton support so that within one iteration of body_filter each
+-- filter runs only once. This is not an issue today as they are only used in one plugin.
+local REPEATED_PHASES = {
+  [STAGES.STREAMING] = true,
+}
+
 local MetaPlugin = {}
 
 local all_filters = {}
@@ -38,7 +45,7 @@ local function run_stage(stage, sub_plugin, conf)
     if not f then
       kong.log.err("no filter named '" .. name .. "' registered")
 
-    elseif not ai_executed_filters[name] then
+    elseif not ai_executed_filters[name] or REPEATED_PHASES[stage] then
       ai_executed_filters[name] = true
 
       kong.log.debug("executing filter ", name)
diff --git a/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua b/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
index 4684f8d081ec..d97d5c59b949 100644
--- a/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
+++ b/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
@@ -83,6 +83,8 @@ local function handle_streaming_frame(conf, chunk, finished)
 
   for _, event in ipairs(events) do
+    -- TODO: currently only a subset of drivers follow the body, err, metadata pattern;
+    -- unify this so that metadata is always extracted from the body
     local formatted, _, metadata = ai_driver.from_format(event, conf.model, "stream/" .. conf.route_type)
 
     if formatted then
@@ -106,12 +108,6 @@ local function handle_streaming_frame(conf, chunk, finished)
         if body_buffer then
           body_buffer:put(token_t)
         end
-
-        -- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
-        -- but this is all we can do until OpenAI fixes this...
-        --
-        -- essentially, every 4 characters is a token, with minimum of 1*4 per event
-        ai_plugin_o11y.metrics_add("llm_completion_tokens_count", math.ceil(#strip(token_t) / 4))
       end
     end
@@ -141,6 +137,19 @@
     local prompt_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_tokens_count")
     local completion_tokens_count = ai_plugin_o11y.metrics_get("llm_completion_tokens_count")
 
+
+    if conf.logging and conf.logging.log_statistics then
+      -- no metadata populated in the event stream, so fall back to our own estimation
+      if completion_tokens_count == 0 then
+        -- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+        -- but this is all we can do until OpenAI fixes this...
+        --
+        -- essentially, every 4 characters is a token, with minimum of 1*4 per event
+        completion_tokens_count = math.ceil(#strip(response) / 4)
+        ai_plugin_o11y.metrics_set("llm_completion_tokens_count", completion_tokens_count)
+      end
+    end
+
     -- populate cost
     if conf.model.options and conf.model.options.input_cost and conf.model.options.output_cost then
       local cost = (prompt_tokens_count * conf.model.options.input_cost +
diff --git a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
index f4dbf536ab70..9831385efe95 100644
--- a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
+++ b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
@@ -871,14 +871,14 @@ for _, strategy in helpers.all_strategies() do
         local _, first_got = next(log_message.ai)
         local actual_llm_latency = first_got.meta.llm_latency
         local actual_time_per_token = first_got.usage.time_per_token
-        local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens)
+        local time_per_token = actual_llm_latency / first_got.usage.completion_tokens
 
         first_got.meta.llm_latency = 1
         first_got.usage.time_per_token = 1
 
         assert.same(first_expected, first_got)
         assert.is_true(actual_llm_latency >= 0)
-        assert.same(actual_time_per_token, time_per_token)
+        assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token)))
         assert.same(first_got.meta.request_model, "gpt-3.5-turbo")
       end)
@@ -1529,14 +1529,14 @@ for _, strategy in helpers.all_strategies() do
         local actual_llm_latency = first_got.meta.llm_latency
         local actual_time_per_token = first_got.usage.time_per_token
-        local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens)
+        local time_per_token = actual_llm_latency / first_got.usage.completion_tokens
 
         first_got.meta.llm_latency = 1
         first_got.usage.time_per_token = 1
 
         assert.same(first_expected, first_got)
         assert.is_true(actual_llm_latency >= 0)
-        assert.same(actual_time_per_token, time_per_token)
+        assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token)))
       end)
 
       it("logs payloads", function()
diff --git a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
index 5cb8a94ed531..6ba0d7bdf97d 100644
--- a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
+++ b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
@@ -20,10 +20,10 @@ local _EXPECTED_CHAT_STATS = {
   },
   usage = {
     prompt_tokens = 18,
-    completion_tokens = 7,
-    total_tokens = 25,
+    completion_tokens = 13, -- this comes from the estimation fallback
+    total_tokens = 31,
     time_per_token = 1,
-    cost = 0.00037,
+    cost = 0.00031,
   },
 }
@@ -377,7 +377,9 @@ for _, strategy in helpers.all_strategies() do
         options = {
           max_tokens = 256,
           temperature = 1.0,
-          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good"
+          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
@@ -418,7 +420,9 @@ for _, strategy in helpers.all_strategies() do
         options = {
           max_tokens = 256,
           temperature = 1.0,
-          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial"
+          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
@@ -454,7 +458,9 @@ for _, strategy in helpers.all_strategies() do
         options = {
           max_tokens = 256,
           temperature = 1.0,
-          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good"
+          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
@@ -492,6 +498,8 @@ for _, strategy in helpers.all_strategies() do
           temperature = 1.0,
           upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/anthropic/llm/v1/chat/good",
           anthropic_version = "2023-06-01",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
@@ -527,7 +535,9 @@ for _, strategy in helpers.all_strategies() do
         options = {
           max_tokens = 256,
           temperature = 1.0,
-          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad"
+          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
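
Note on the numbers behind the updated streaming expectations: the sketch below illustrates the token-estimation fallback and the cost arithmetic. It is a standalone illustration, not the plugin code itself; the local strip helper stands in for Kong's whitespace-trimming utility, and the per-million-token divisor is an assumption inferred from the truncated "local cost = (..." line in the hunk above.

-- standalone sketch, runnable with plain Lua
local function strip(s)
  -- stand-in for Kong's string trim helper (assumption)
  return (s:gsub("^%s+", ""):gsub("%s+$", ""))
end

-- "every 4 characters is a token" heuristic, now applied once to the whole
-- accumulated response body instead of once per SSE event
local function estimate_completion_tokens(response)
  return math.ceil(#strip(response) / 4)
end

-- cost check against the updated expectations in 09-streaming_integration_spec.lua:
-- prompt_tokens = 18, input_cost = 10.0, output_cost = 10.0
local completion = estimate_completion_tokens(string.rep("x", 52))  -- 52 chars -> 13 tokens
local cost = (18 * 10.0 + completion * 10.0) / 1000000               -- divisor assumed
print(completion, cost)  -- 13    0.00031

Because the estimate now runs once over the full accumulated response, and only when the stream carried no usage metadata, the expected completion_tokens, total_tokens, and cost in the streaming spec change to 13, 31, and 0.00031 respectively.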