Merge branch 'release/3.7.x' into release/3.7.1-fixup
kikito authored Jun 21, 2024
2 parents e89feb5 + 23055b3 commit 93929aa
Showing 32 changed files with 876 additions and 540 deletions.
2 changes: 1 addition & 1 deletion .github/matrix-commitly.yml
@@ -1,7 +1,7 @@
# please see matrix-full.yml for meaning of each field
build-packages:
- label: ubuntu-22.04
os: ubuntu-22.04
image: ubuntu:22.04
package: deb
check-manifest-suite: ubuntu-22.04-amd64

2 changes: 2 additions & 0 deletions .github/matrix-full.yml
@@ -12,9 +12,11 @@ build-packages:
package: deb
check-manifest-suite: ubuntu-20.04-amd64
- label: ubuntu-22.04
image: ubuntu:22.04
package: deb
check-manifest-suite: ubuntu-22.04-amd64
- label: ubuntu-22.04-arm64
image: ubuntu:22.04
package: deb
bazel-args: --platforms=//:generic-crossbuild-aarch64
check-manifest-suite: ubuntu-22.04-arm64
3 changes: 3 additions & 0 deletions changelog/unreleased/fix_hash.yml
@@ -0,0 +1,3 @@
message: Fixed an inefficiency issue in the LuaJIT hashing algorithm
type: performance
scope: Performance
5 changes: 5 additions & 0 deletions changelog/unreleased/kong/ai-proxy-azure-streaming.yml
@@ -0,0 +1,5 @@
message: |
**AI-proxy-plugin**: Fixed a bug where certain Azure models would return partial tokens/words
when in response-streaming mode.
scope: Plugin
type: bugfix
5 changes: 5 additions & 0 deletions changelog/unreleased/kong/ai-proxy-fix-model-parameter.yml
@@ -0,0 +1,5 @@
message: |
**AI-proxy-plugin**: Fixed a bug where the Cohere and Anthropic providers did not read the `model` parameter properly
from the caller's request body.
scope: Plugin
type: bugfix
@@ -0,0 +1,5 @@
message: |
**AI-proxy-plugin**: Fixed a bug where "OpenAI Function" inference requests would log a
request error and then hang until timeout.
scope: Plugin
type: bugfix
5 changes: 5 additions & 0 deletions changelog/unreleased/kong/ai-proxy-fix-sending-own-model.yml
@@ -0,0 +1,5 @@
message: |
**AI-proxy-plugin**: Fixed a bug where AI Proxy would still allow callers to specify their own model,
ignoring the plugin-configured model name.
scope: Plugin
type: bugfix
@@ -0,0 +1,5 @@
message: |
**AI-proxy-plugin**: Fixed a bug where AI Proxy would not give the plugin's configured
model tuning options precedence over those in the user's LLM request.
scope: Plugin
type: bugfix
@@ -0,0 +1,5 @@
message: |
**AI-proxy-plugin**: Fixed a bug where setting the OpenAI SDK model parameter to "null" caused analytics
not to be written to the logging plugin(s).
scope: Plugin
type: bugfix
3 changes: 3 additions & 0 deletions changelog/unreleased/kong/fix-ai-proxy-shared-state.yml
@@ -0,0 +1,3 @@
message: "**AI-Proxy**: Resolved a bug where the object constructor would set data on the class instead of the instance"
type: bugfix
scope: Plugin
@@ -0,0 +1,3 @@
message: "**Basic-Auth**: Fix an issue of realm field not recognized for older kong versions (before 3.6)"
type: bugfix
scope: Plugin
@@ -0,0 +1,3 @@
message: "**Key-Auth**: Fix an issue of realm field not recognized for older kong versions (before 3.7)"
type: bugfix
scope: Plugin
1 change: 1 addition & 0 deletions kong-3.7.1-0.rockspec
@@ -591,6 +591,7 @@ build = {
["kong.plugins.ai-response-transformer.schema"] = "kong/plugins/ai-response-transformer/schema.lua",

["kong.llm"] = "kong/llm/init.lua",
["kong.llm.schemas"] = "kong/llm/schemas/init.lua",
["kong.llm.drivers.shared"] = "kong/llm/drivers/shared.lua",
["kong.llm.drivers.openai"] = "kong/llm/drivers/openai.lua",
["kong.llm.drivers.azure"] = "kong/llm/drivers/azure.lua",
6 changes: 6 additions & 0 deletions kong/clustering/compat/removed_fields.lua
@@ -115,6 +115,9 @@ return {
opentelemetry = {
"sampling_rate",
},
basic_auth = {
"realm"
}
},

-- Any dataplane older than 3.7.0
@@ -135,5 +138,8 @@ return {
ai_response_transformer = {
"llm.model.options.upstream_path",
},
key_auth = {
"realm"
}
},
}
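The two new compatibility entries make sure the `realm` field, which `basic_auth` gained in 3.6 and `key_auth` gained in 3.7, is stripped from configuration pushed to older data planes (matching the Basic-Auth and Key-Auth changelog entries above). A simplified, self-contained sketch of how such a removed-fields map can be applied; `strip_removed_fields` and the sample config are illustrative stand-ins, not Kong's actual compat code:

-- Illustrative only: a flattened version of the removed-fields idea, without
-- the per-version grouping used in the real file.
local removed_fields = {
  basic_auth = { "realm" },  -- unknown to data planes older than 3.6.0
  key_auth   = { "realm" },  -- unknown to data planes older than 3.7.0
}

local function strip_removed_fields(plugin_name, config)
  for _, field in ipairs(removed_fields[plugin_name] or {}) do
    config[field] = nil
  end
  return config
end

-- hypothetical plugin config about to be sent to an older data plane
local conf = strip_removed_fields("key_auth", { key_names = { "apikey" }, realm = "service" })
assert(conf.realm == nil and conf.key_names[1] == "apikey")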
33 changes: 11 additions & 22 deletions kong/llm/drivers/anthropic.lua
@@ -93,8 +93,8 @@ local transformers_to = {
return nil, nil, err
end

messages.temperature = request_table.temperature or (model.options and model.options.temperature) or nil
messages.max_tokens = request_table.max_tokens or (model.options and model.options.max_tokens) or nil
messages.temperature = (model.options and model.options.temperature) or request_table.temperature
messages.max_tokens = (model.options and model.options.max_tokens) or request_table.max_tokens
messages.model = model.name or request_table.model
messages.stream = request_table.stream or false -- explicitly set this if nil

@@ -110,9 +110,8 @@ local transformers_to = {
return nil, nil, err
end

prompt.temperature = request_table.temperature or (model.options and model.options.temperature) or nil
prompt.max_tokens_to_sample = request_table.max_tokens or (model.options and model.options.max_tokens) or nil
prompt.model = model.name
prompt.temperature = (model.options and model.options.temperature) or request_table.temperature
prompt.max_tokens_to_sample = (model.options and model.options.max_tokens) or request_table.max_tokens
prompt.model = model.name or request_table.model
prompt.stream = request_table.stream or false -- explicitly set this if nil

@@ -152,11 +151,9 @@ local function start_to_event(event_data, model_info)

local metadata = {
prompt_tokens = meta.usage
and meta.usage.input_tokens
or nil,
and meta.usage.input_tokens,
completion_tokens = meta.usage
and meta.usage.output_tokens
or nil,
and meta.usage.output_tokens,
model = meta.model,
stop_reason = meta.stop_reason,
stop_sequence = meta.stop_sequence,
@@ -209,14 +206,11 @@ local function handle_stream_event(event_t, model_info, route_type)
and event_data.usage then
return nil, nil, {
prompt_tokens = nil,
completion_tokens = event_data.usage.output_tokens
or nil,
completion_tokens = event_data.usage.output_tokens,
stop_reason = event_data.delta
and event_data.delta.stop_reason
or nil,
and event_data.delta.stop_reason,
stop_sequence = event_data.delta
and event_data.delta.stop_sequence
or nil,
and event_data.delta.stop_sequence,
}
else
return nil, "message_delta is missing the metadata block", nil
@@ -267,7 +261,7 @@ local transformers_from = {
prompt_tokens = usage.input_tokens,
completion_tokens = usage.output_tokens,
total_tokens = usage.input_tokens and usage.output_tokens and
usage.input_tokens + usage.output_tokens or nil,
usage.input_tokens + usage.output_tokens,
}

else
@@ -442,12 +436,7 @@ function _M.post_request(conf)
end

function _M.pre_request(conf, body)
-- check for user trying to bring own model
if body and body.model then
return nil, "cannot use own model for this instance"
end

return true, nil
return true
end

-- returns err or nil
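Besides swapping the precedence so that `model.options` values win over the caller's request values, most of the anthropic.lua hunks above (and the cohere.lua ones below) simply drop a trailing `or nil` from expressions such as `meta.usage and meta.usage.input_tokens or nil`. In Lua, `a and b` already evaluates to nil when `a` is nil, so the suffix is redundant for these nil-or-table fields. A standalone check of that equivalence, using made-up values:

-- Standalone illustration (not Kong code): "x and y or nil" behaves the same
-- as "x and y" when x is either nil or a table, as with the usage/meta
-- tables handled above.
local cases = {
  { usage = nil },
  { usage = { input_tokens = 42 } },
}

for _, meta in ipairs(cases) do
  local with_suffix    = meta.usage and meta.usage.input_tokens or nil
  local without_suffix = meta.usage and meta.usage.input_tokens
  assert(with_suffix == without_suffix)
  print(tostring(without_suffix))  -- "nil", then "42"
end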
46 changes: 15 additions & 31 deletions kong/llm/drivers/cohere.lua
@@ -219,18 +219,15 @@ local transformers_from = {
local stats = {
completion_tokens = response_table.meta
and response_table.meta.billed_units
and response_table.meta.billed_units.output_tokens
or nil,
and response_table.meta.billed_units.output_tokens,

prompt_tokens = response_table.meta
and response_table.meta.billed_units
and response_table.meta.billed_units.input_tokens
or nil,
and response_table.meta.billed_units.input_tokens,

total_tokens = response_table.meta
and response_table.meta.billed_units
and (response_table.meta.billed_units.output_tokens + response_table.meta.billed_units.input_tokens)
or nil,
and (response_table.meta.billed_units.output_tokens + response_table.meta.billed_units.input_tokens),
}
messages.usage = stats

@@ -252,26 +249,23 @@ local transformers_from = {
local stats = {
completion_tokens = response_table.meta
and response_table.meta.billed_units
and response_table.meta.billed_units.output_tokens
or nil,
and response_table.meta.billed_units.output_tokens,

prompt_tokens = response_table.meta
and response_table.meta.billed_units
and response_table.meta.billed_units.input_tokens
or nil,
and response_table.meta.billed_units.input_tokens,

total_tokens = response_table.meta
and response_table.meta.billed_units
and (response_table.meta.billed_units.output_tokens + response_table.meta.billed_units.input_tokens)
or nil,
and (response_table.meta.billed_units.output_tokens + response_table.meta.billed_units.input_tokens),
}
messages.usage = stats

else -- probably a fault
return nil, "'text' or 'generations' missing from cohere response body"

end

return cjson.encode(messages)
end,

@@ -299,11 +293,10 @@ local transformers_from = {
prompt.id = response_table.id

local stats = {
completion_tokens = response_table.meta and response_table.meta.billed_units.output_tokens or nil,
prompt_tokens = response_table.meta and response_table.meta.billed_units.input_tokens or nil,
completion_tokens = response_table.meta and response_table.meta.billed_units.output_tokens,
prompt_tokens = response_table.meta and response_table.meta.billed_units.input_tokens,
total_tokens = response_table.meta
and (response_table.meta.billed_units.output_tokens + response_table.meta.billed_units.input_tokens)
or nil,
and (response_table.meta.billed_units.output_tokens + response_table.meta.billed_units.input_tokens),
}
prompt.usage = stats

@@ -323,9 +316,9 @@ local transformers_from = {
prompt.id = response_table.generation_id

local stats = {
completion_tokens = response_table.token_count and response_table.token_count.response_tokens or nil,
prompt_tokens = response_table.token_count and response_table.token_count.prompt_tokens or nil,
total_tokens = response_table.token_count and response_table.token_count.total_tokens or nil,
completion_tokens = response_table.token_count and response_table.token_count.response_tokens,
prompt_tokens = response_table.token_count and response_table.token_count.prompt_tokens,
total_tokens = response_table.token_count and response_table.token_count.total_tokens,
}
prompt.usage = stats

@@ -400,12 +393,7 @@ function _M.post_request(conf)
end

function _M.pre_request(conf, body)
-- check for user trying to bring own model
if body and body.model then
return false, "cannot use own model for this instance"
end

return true, nil
return true
end

function _M.subrequest(body, conf, http_opts, return_res_table)
@@ -467,7 +455,7 @@ end
function _M.configure_request(conf)
local parsed_url

if conf.model.options.upstream_url then
if conf.model.options and conf.model.options.upstream_url then
parsed_url = socket_url.parse(conf.model.options.upstream_url)
else
parsed_url = socket_url.parse(ai_shared.upstream_url_format[DRIVER_NAME])
@@ -476,10 +464,6 @@ function _M.configure_request(conf)
or ai_shared.operation_map[DRIVER_NAME][conf.route_type]
and ai_shared.operation_map[DRIVER_NAME][conf.route_type].path
or "/"

if not parsed_url.path then
return false, fmt("operation %s is not supported for cohere provider", conf.route_type)
end
end

-- if the path is read from a URL capture, ensure that it is valid
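The `configure_request` change adds a nil guard before reading `conf.model.options.upstream_url`; without it, a plugin configured with no `options` table raises an index error instead of falling back to the provider's default upstream URL format. A standalone illustration with a made-up `conf` table:

-- Standalone illustration (not Kong code): why the extra "and" guard matters.
local conf = { model = { name = "command" } }  -- no options table configured

-- unguarded access raises "attempt to index a nil value"
local ok = pcall(function() return conf.model.options.upstream_url end)
print(ok)  -- false

-- the guarded form yields nil, so the driver can fall back to
-- ai_shared.upstream_url_format for the provider
local upstream = conf.model.options and conf.model.options.upstream_url
print(upstream)  -- nil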
4 changes: 2 additions & 2 deletions kong/llm/drivers/openai.lua
@@ -18,15 +18,15 @@ end

local transformers_to = {
["llm/v1/chat"] = function(request_table, model_info, route_type)
request_table.model = request_table.model or model_info.name
request_table.model = model_info.name or request_table.model
request_table.stream = request_table.stream or false -- explicitly set this
request_table.top_k = nil -- explicitly remove unsupported default

return request_table, "application/json", nil
end,

["llm/v1/completions"] = function(request_table, model_info, route_type)
request_table.model = model_info.name
request_table.model = model_info.name or request_table.model
request_table.stream = request_table.stream or false -- explicitly set this
request_table.top_k = nil -- explicitly remove unsupported default

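Both one-line openai.lua changes tie back to the "bring your own model" and "model parameter" changelog entries: the plugin-configured model name now takes precedence when it is set, and the caller's `model` field is only a fallback. A toy sketch of the swapped `or`, with hypothetical model names:

-- Illustrative only: which model ends up in the outbound request.
local function pick_model(model_info, request_table)
  return model_info.name or request_table.model
end

-- plugin pins a model: the caller's value is ignored
print(pick_model({ name = "gpt-4o" }, { model = "gpt-3.5-turbo" }))  --> gpt-4o

-- plugin leaves the model unset: the caller's value is used as a fallback
print(pick_model({}, { model = "gpt-3.5-turbo" }))                   --> gpt-3.5-turbo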
40 changes: 29 additions & 11 deletions kong/llm/drivers/shared.lua
@@ -131,10 +131,10 @@ _M.clear_response_headers = {
-- @return {string} error if any is thrown - request should definitely be terminated if this is not nil
function _M.merge_config_defaults(request, options, request_format)
if options then
request.temperature = request.temperature or options.temperature
request.max_tokens = request.max_tokens or options.max_tokens
request.top_p = request.top_p or options.top_p
request.top_k = request.top_k or options.top_k
request.temperature = options.temperature or request.temperature
request.max_tokens = options.max_tokens or request.max_tokens
request.top_p = options.top_p or request.top_p
request.top_k = options.top_k or request.top_k
end

return request, nil
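With the operands swapped, tuning values configured on the plugin (`options`) now override whatever the caller sent, rather than only filling in values the caller omitted. A self-contained sketch of the same merge behavior with hypothetical numbers:

-- Illustrative only: plugin-configured options now win over request values.
local function merge(request, options)
  request.temperature = options.temperature or request.temperature
  request.max_tokens  = options.max_tokens  or request.max_tokens
  return request
end

local merged = merge({ temperature = 1.0, max_tokens = 512 },
                     { temperature = 0.2 })
print(merged.temperature, merged.max_tokens)  --> 0.2   512 (max_tokens falls back to the request)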
@@ -197,28 +197,44 @@ end
function _M.frame_to_events(frame)
local events = {}

-- todo check if it's raw json and
-- Cohere / Other flat-JSON format parser
-- just return the split up data frame
if string.sub(str_ltrim(frame), 1, 1) == "{" then
if (not kong or not kong.ctx.plugin.truncated_frame) and string.sub(str_ltrim(frame), 1, 1) == "{" then
for event in frame:gmatch("[^\r\n]+") do
events[#events + 1] = {
data = event,
}
end
else
-- standard SSE parser
local event_lines = split(frame, "\n")
local struct = { event = nil, id = nil, data = nil }

for _, dat in ipairs(event_lines) do
for i, dat in ipairs(event_lines) do
if #dat < 1 then
events[#events + 1] = struct
struct = { event = nil, id = nil, data = nil }
end

-- test for truncated chunk on the last line (no trailing \r\n\r\n)
if #dat > 0 and #event_lines == i then
ngx.log(ngx.DEBUG, "[ai-proxy] truncated sse frame head")
kong.ctx.plugin.truncated_frame = dat
break -- stop parsing immediately, server has done something wrong
end

-- test for abnormal start-of-frame (truncation tail)
if kong and kong.ctx.plugin.truncated_frame then
-- this is the tail of a previous incomplete chunk
ngx.log(ngx.DEBUG, "[ai-proxy] truncated sse frame tail")
dat = fmt("%s%s", kong.ctx.plugin.truncated_frame, dat)
kong.ctx.plugin.truncated_frame = nil
end

local s1, _ = str_find(dat, ":") -- find where the cut point is

if s1 and s1 ~= 1 then
local field = str_sub(dat, 1, s1-1) -- returns "data " from data: hello world
local field = str_sub(dat, 1, s1-1) -- returns "data" from data: hello world
local value = str_ltrim(str_sub(dat, s1+1)) -- returns "hello world" from data: hello world

-- for now not checking if the value is already been set
@@ -249,7 +265,7 @@ function _M.to_ollama(request_table, model)

-- common parameters
input.stream = request_table.stream or false -- for future capability
input.model = model.name
input.model = model.name or request_table.name

if model.options then
input.options = {}
@@ -603,8 +619,10 @@ end
-- Function to count the number of words in a string
local function count_words(str)
local count = 0
for word in str:gmatch("%S+") do
count = count + 1
if type(str) == "string" then
for word in str:gmatch("%S+") do
count = count + 1
end
end
return count
end
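The larger `frame_to_events` change above handles SSE frames that arrive split across proxy buffer chunks: a dangling final line (one with no terminating newline) is stashed in `kong.ctx.plugin.truncated_frame` and glued onto the first line of the next chunk. A standalone sketch of the same buffering idea, using a plain table in place of the Kong plugin context; the parsing and the `data:` handling here are simplified and not Kong's API:

-- Illustrative only: reassembling an SSE "data:" line that was split across
-- two chunks. `state.carry` stands in for kong.ctx.plugin.truncated_frame.
local state = { carry = nil }

local function parse_chunk(chunk)
  local events, lines = {}, {}
  for line in (chunk .. "\n"):gmatch("(.-)\n") do
    lines[#lines + 1] = line
  end

  for i, line in ipairs(lines) do
    if state.carry then
      line = state.carry .. line            -- tail of the previous chunk
      state.carry = nil
    end
    if i == #lines and line ~= "" then
      state.carry = line                    -- truncated head: keep for next chunk
    elseif line:sub(1, 6) == "data: " then
      events[#events + 1] = line:sub(7)     -- keep the JSON payload only
    end
  end
  return events
end

-- a frame split mid-line across two chunks
local first  = parse_chunk('data: {"choices":[{"delta":{"content":"Hel')
local second = parse_chunk('lo"}}]}\n\n')
print(#first, second[1])  --> 0   {"choices":[{"delta":{"content":"Hello"}}]}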