fix(llm): fix streaming sse filter not ran twice and prompt token count and missing metadata

(cherry picked from commit e053601)
tysoekong authored and fffonion committed Nov 29, 2024
1 parent 9ecbc3e commit 2a032d0
Showing 4 changed files with 44 additions and 18 deletions.
9 changes: 8 additions & 1 deletion kong/llm/plugin/base.lua
@@ -19,6 +19,13 @@ local STAGES = {
   RES_POST_PROCESSING = 7,
 }

+-- Filters in these stages are allowed to execute more than once per request.
+-- TODO: implement singleton support so that, within one iteration of body_filter, each filter
+-- runs only once. This is not an issue today as these stages are only used in one plugin.
+local REPEATED_PHASES = {
+  [STAGES.STREAMING] = true,
+}
+
 local MetaPlugin = {}

 local all_filters = {}
@@ -38,7 +45,7 @@ local function run_stage(stage, sub_plugin, conf)
     if not f then
       kong.log.err("no filter named '" .. name .. "' registered")

-    elseif not ai_executed_filters[name] then
+    elseif not ai_executed_filters[name] or REPEATED_PHASES[stage] then
       ai_executed_filters[name] = true

       kong.log.debug("executing filter ", name)
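
Aside (not part of the commit): a minimal, self-contained sketch of what the REPEATED_PHASES gate above does. A filter registered for a repeated stage such as STREAMING re-runs on every call into run_stage (one call per streamed body chunk), while every other filter still runs at most once per request. The names and the stage value below are hypothetical; this models only the guard, not the real plugin machinery:

  local STAGES = { STREAMING = 5 }                      -- numeric value is arbitrary here
  local REPEATED_PHASES = { [STAGES.STREAMING] = true }

  local ai_executed_filters = {}

  local function run_filter(stage, name, filter)
    if not ai_executed_filters[name] or REPEATED_PHASES[stage] then
      ai_executed_filters[name] = true
      filter()
    end
  end

  -- a streaming filter runs once per chunk; a one-shot filter is skipped on re-entry
  run_filter(STAGES.STREAMING, "normalize-sse-chunk", function() print("chunk 1 handled") end)
  run_filter(STAGES.STREAMING, "normalize-sse-chunk", function() print("chunk 2 handled") end)
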
21 changes: 15 additions & 6 deletions kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
@@ -83,6 +83,8 @@ local function handle_streaming_frame(conf, chunk, finished)


   for _, event in ipairs(events) do
+    -- TODO: currently only a subset of drivers follow the body, err, metadata pattern;
+    -- unify this so that metadata is always extracted from the body
     local formatted, _, metadata = ai_driver.from_format(event, conf.model, "stream/" .. conf.route_type)

     if formatted then
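
Aside (not part of the commit): the "body, err, metadata" pattern the TODO refers to means a driver's from_format() returns the normalized event body, an error, and any usage metadata it found. A hypothetical driver stub, only to illustrate the shape (field names are assumptions):

  local cjson = require("cjson.safe")

  -- hypothetical driver: parse one SSE event and return body, err, metadata
  local function from_format(event)
    local body = cjson.decode(event.data)
    if not body then
      return nil, "failed to decode event frame"
    end

    -- usage metadata, when the provider sends it, usually rides on the final chunk
    local metadata
    if body.usage then
      metadata = {
        prompt_tokens = body.usage.prompt_tokens,
        completion_tokens = body.usage.completion_tokens,
      }
    end

    return body, nil, metadata
  end
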
@@ -106,12 +108,6 @@ local function handle_streaming_frame(conf, chunk, finished)
         if body_buffer then
           body_buffer:put(token_t)
         end
-
-        -- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
-        -- but this is all we can do until OpenAI fixes this...
-        --
-        -- essentially, every 4 characters is a token, with minimum of 1*4 per event
-        ai_plugin_o11y.metrics_add("llm_completion_tokens_count", math.ceil(#strip(token_t) / 4))
       end
     end

@@ -141,6 +137,19 @@ local function handle_streaming_frame(conf, chunk, finished)

     local prompt_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_tokens_count")
     local completion_tokens_count = ai_plugin_o11y.metrics_get("llm_completion_tokens_count")
+
+    if conf.logging and conf.logging.log_statistics then
+      -- no metadata populated in the event stream, so fall back to our own estimate
+      if completion_tokens_count == 0 then
+        -- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+        -- but this is all we can do until OpenAI fixes this...
+        --
+        -- essentially, every 4 characters is a token, with a minimum of 1*4 per event
+        completion_tokens_count = math.ceil(#strip(response) / 4)
+        ai_plugin_o11y.metrics_set("llm_completion_tokens_count", completion_tokens_count)
+      end
+    end
+
     -- populate cost
     if conf.model.options and conf.model.options.input_cost and conf.model.options.output_cost then
       local cost = (prompt_tokens_count * conf.model.options.input_cost +
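
Aside (not part of the commit): with the hunk above, the per-event estimate that used to be accumulated chunk by chunk is replaced by a single fallback over the whole assembled response, applied only when the stream carried no usage metadata (completion_tokens_count == 0). A rough sketch of the arithmetic involved; the helper names are hypothetical, and the per-million-token pricing is an assumption inferred from the expected values in the streaming spec below (18 prompt + 13 completion tokens at input_cost/output_cost = 10.0 giving cost = 0.00031):

  -- ~4 characters per token, used only when the provider sent no usage metadata
  local function estimate_completion_tokens(response_text)
    return math.ceil(#response_text / 4)
  end

  -- cost from per-million-token prices (assumption inferred from the spec values)
  local function estimate_cost(prompt_tokens, completion_tokens, input_cost, output_cost)
    return (prompt_tokens * input_cost + completion_tokens * output_cost) / 1000000
  end

  print(estimate_completion_tokens("The answer is 42, of course!"))  -- 7
  print(estimate_cost(18, 13, 10.0, 10.0))                           -- 0.00031
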
8 changes: 4 additions & 4 deletions spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
@@ -871,14 +871,14 @@ for _, strategy in helpers.all_strategies() do
         local _, first_got = next(log_message.ai)
         local actual_llm_latency = first_got.meta.llm_latency
         local actual_time_per_token = first_got.usage.time_per_token
-        local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens)
+        local time_per_token = actual_llm_latency / first_got.usage.completion_tokens

         first_got.meta.llm_latency = 1
         first_got.usage.time_per_token = 1

         assert.same(first_expected, first_got)
         assert.is_true(actual_llm_latency >= 0)
-        assert.same(actual_time_per_token, time_per_token)
+        assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token)))
         assert.same(first_got.meta.request_model, "gpt-3.5-turbo")
       end)

@@ -1529,14 +1529,14 @@ for _, strategy in helpers.all_strategies() do

         local actual_llm_latency = first_got.meta.llm_latency
         local actual_time_per_token = first_got.usage.time_per_token
-        local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens)
+        local time_per_token = actual_llm_latency / first_got.usage.completion_tokens

         first_got.meta.llm_latency = 1
         first_got.usage.time_per_token = 1

         assert.same(first_expected, first_got)
         assert.is_true(actual_llm_latency >= 0)
-        assert.same(actual_time_per_token, time_per_token)
+        assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token)))
       end)

       it("logs payloads", function()
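
Aside (not part of the commit): time_per_token is no longer floored to an integer, so the spec now compares the plugin's reported value against its own latency/tokens division at three decimal places by round-tripping both through string.format. A standalone illustration with made-up numbers:

  local llm_latency = 123.4567                             -- hypothetical latency
  local completion_tokens = 13
  local time_per_token = llm_latency / completion_tokens   -- 9.4966...

  local reported_time_per_token = 9.4967                   -- hypothetical value logged by the plugin

  -- both sides round to 9.497, so the comparison passes despite float noise
  assert(tonumber(string.format("%.3f", reported_time_per_token))
         == tonumber(string.format("%.3f", time_per_token)))
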
24 changes: 17 additions & 7 deletions spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
@@ -20,10 +20,10 @@ local _EXPECTED_CHAT_STATS = {
   },
   usage = {
     prompt_tokens = 18,
-    completion_tokens = 7,
-    total_tokens = 25,
+    completion_tokens = 13, -- this value comes from the estimation
+    total_tokens = 31,
     time_per_token = 1,
-    cost = 0.00037,
+    cost = 0.00031,
   },
 }
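
Quick arithmetic check (not part of the commit) of the updated expectations: 13 completion tokens from the ceil(#response / 4) estimate implies the mock stream's assembled response is between 49 and 52 characters long, and the new total and cost follow directly, assuming the per-million-token pricing inferred above:

  local prompt_tokens, completion_tokens = 18, 13
  assert(prompt_tokens + completion_tokens == 31)                       -- total_tokens
  local cost = (prompt_tokens * 10.0 + completion_tokens * 10.0) / 1e6  -- input_cost = output_cost = 10.0
  assert(math.abs(cost - 0.00031) < 1e-12)                              -- cost
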

@@ -377,7 +377,9 @@ for _, strategy in helpers.all_strategies() do
             options = {
               max_tokens = 256,
               temperature = 1.0,
-              upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good"
+              upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good",
+              input_cost = 10.0,
+              output_cost = 10.0,
             },
           },
         },
@@ -418,7 +420,9 @@ for _, strategy in helpers.all_strategies() do
             options = {
               max_tokens = 256,
               temperature = 1.0,
-              upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial"
+              upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial",
+              input_cost = 10.0,
+              output_cost = 10.0,
             },
           },
         },
@@ -454,7 +458,9 @@ for _, strategy in helpers.all_strategies() do
             options = {
               max_tokens = 256,
               temperature = 1.0,
-              upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good"
+              upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good",
+              input_cost = 10.0,
+              output_cost = 10.0,
             },
           },
         },
@@ -492,6 +498,8 @@ for _, strategy in helpers.all_strategies() do
               temperature = 1.0,
               upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/anthropic/llm/v1/chat/good",
               anthropic_version = "2023-06-01",
+              input_cost = 10.0,
+              output_cost = 10.0,
             },
           },
         },
@@ -527,7 +535,9 @@ for _, strategy in helpers.all_strategies() do
             options = {
               max_tokens = 256,
               temperature = 1.0,
-              upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad"
+              upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad",
+              input_cost = 10.0,
+              output_cost = 10.0,
             },
           },
         },