diff --git a/changelog/unreleased/kong/ai-proxy-google-gemini.yml b/changelog/unreleased/kong/ai-proxy-google-gemini.yml new file mode 100644 index 000000000000..bc4fb06b21c4 --- /dev/null +++ b/changelog/unreleased/kong/ai-proxy-google-gemini.yml @@ -0,0 +1,5 @@ +message: | + Kong AI Gateway (AI Proxy and associated plugin family) now supports + the Google Gemini "chat" (generateContent) interface. +type: feature +scope: Plugin diff --git a/kong-3.8.0-0.rockspec b/kong-3.8.0-0.rockspec index f0c4dbe8ce12..ce680566797a 100644 --- a/kong-3.8.0-0.rockspec +++ b/kong-3.8.0-0.rockspec @@ -35,6 +35,7 @@ dependencies = { "lua-messagepack == 0.5.4", "lua-resty-aws == 1.5.0", "lua-resty-openssl == 1.4.0", + "lua-resty-gcp == 0.0.13", "lua-resty-counter == 0.2.1", "lua-resty-ipmatcher == 0.6.1", "lua-resty-acme == 0.14.0", @@ -607,6 +608,8 @@ build = { ["kong.llm.drivers.mistral"] = "kong/llm/drivers/mistral.lua", ["kong.llm.drivers.llama2"] = "kong/llm/drivers/llama2.lua", + ["kong.llm.drivers.gemini"] = "kong/llm/drivers/gemini.lua", + ["kong.plugins.ai-prompt-decorator.handler"] = "kong/plugins/ai-prompt-decorator/handler.lua", ["kong.plugins.ai-prompt-decorator.schema"] = "kong/plugins/ai-prompt-decorator/schema.lua", diff --git a/kong/llm/drivers/gemini.lua b/kong/llm/drivers/gemini.lua new file mode 100644 index 000000000000..59296ee9160b --- /dev/null +++ b/kong/llm/drivers/gemini.lua @@ -0,0 +1,433 @@ +local _M = {} + +-- imports +local cjson = require("cjson.safe") +local fmt = string.format +local ai_shared = require("kong.llm.drivers.shared") +local socket_url = require("socket.url") +local string_gsub = string.gsub +local buffer = require("string.buffer") +local table_insert = table.insert +local string_lower = string.lower +-- + +-- globals +local DRIVER_NAME = "gemini" +-- + +local _OPENAI_ROLE_MAPPING = { + ["system"] = "system", + ["user"] = "user", + ["assistant"] = "model", +} + +local function to_gemini_generation_config(request_table) + return { + ["maxOutputTokens"] = request_table.max_tokens, + ["stopSequences"] = request_table.stop, + ["temperature"] = request_table.temperature, + ["topK"] = request_table.top_k, + ["topP"] = request_table.top_p, + } +end + +local function is_response_content(content) + return content + and content.candidates + and #content.candidates > 0 + and content.candidates[1].content + and content.candidates[1].content.parts + and #content.candidates[1].content.parts > 0 + and content.candidates[1].content.parts[1].text +end + +local function is_response_finished(content) + return content + and content.candidates + and #content.candidates > 0 + and content.candidates[1].finishReason +end + +local function handle_stream_event(event_t, model_info, route_type) + -- discard empty frames, it should either be a random new line, or comment + if (not event_t.data) or (#event_t.data < 1) then + return + end + + local event, err = cjson.decode(event_t.data) + if err then + ngx.log(ngx.WARN, "failed to decode stream event frame from gemini: " .. err) + return nil, "failed to decode stream event frame from gemini", nil + end + + local new_event + local metadata = nil + + if is_response_content(event) then + new_event = { + choices = { + [1] = { + delta = { + content = event.candidates[1].content.parts[1].text or "", + role = "assistant", + }, + index = 0, + }, + }, + } + end + + if is_response_finished(event) then + metadata = metadata or {} + metadata.finished_reason = event.candidates[1].finishReason + new_event = "[DONE]" + end + + if event.usageMetadata then + metadata = metadata or {} + metadata.completion_tokens = event.usageMetadata.candidatesTokenCount or 0 + metadata.prompt_tokens = event.usageMetadata.promptTokenCount or 0 + end + + if new_event then + if new_event ~= "[DONE]" then + new_event = cjson.encode(new_event) + end + + return new_event, nil, metadata + else + return nil, nil, metadata -- caller code will handle "unrecognised" event types + end +end + +local function to_gemini_chat_openai(request_table, model_info, route_type) + if request_table then -- try-catch type mechanism + local new_r = {} + + if request_table.messages and #request_table.messages > 0 then + local system_prompt + + for i, v in ipairs(request_table.messages) do + + -- for 'system', we just concat them all into one Gemini instruction + if v.role and v.role == "system" then + system_prompt = system_prompt or buffer.new() + system_prompt:put(v.content or "") + else + -- for any other role, just construct the chat history as 'parts.text' type + new_r.contents = new_r.contents or {} + table_insert(new_r.contents, { + role = _OPENAI_ROLE_MAPPING[v.role or "user"], -- default to 'user' + parts = { + { + text = v.content or "" + }, + }, + }) + end + end + + -- This was only added in Gemini 1.5 + if system_prompt and model_info.name:sub(1, 10) == "gemini-1.0" then + return nil, nil, "system prompts aren't supported on gemini-1.0 models" + + elseif system_prompt then + new_r.systemInstruction = { + parts = { + { + text = system_prompt:get(), + }, + }, + } + end + end + + new_r.generationConfig = to_gemini_generation_config(request_table) + + return new_r, "application/json", nil + end + + local new_r = {} + + if request_table.messages and #request_table.messages > 0 then + local system_prompt + + for i, v in ipairs(request_table.messages) do + + -- for 'system', we just concat them all into one Gemini instruction + if v.role and v.role == "system" then + system_prompt = system_prompt or buffer.new() + system_prompt:put(v.content or "") + else + -- for any other role, just construct the chat history as 'parts.text' type + new_r.contents = new_r.contents or {} + table_insert(new_r.contents, { + role = _OPENAI_ROLE_MAPPING[v.role or "user"], -- default to 'user' + parts = { + { + text = v.content or "" + }, + }, + }) + end + end + end + + new_r.generationConfig = to_gemini_generation_config(request_table) + + return new_r, "application/json", nil +end + +local function from_gemini_chat_openai(response, model_info, route_type) + local response, err = cjson.decode(response) + + if err then + local err_client = "failed to decode response from Gemini" + ngx.log(ngx.ERR, fmt("%s: %s", err_client, err)) + return nil, err_client + end + + -- messages/choices table is only 1 size, so don't need to static allocate + local messages = {} + messages.choices = {} + + if response.candidates + and #response.candidates > 0 + and is_response_content(response) then + + messages.choices[1] = { + index = 0, + message = { + role = "assistant", + content = response.candidates[1].content.parts[1].text, + }, + finish_reason = string_lower(response.candidates[1].finishReason), + } + messages.object = "chat.completion" + messages.model = model_info.name + + else -- probably a server fault or other unexpected response + local err = "no generation candidates received from Gemini, or max_tokens too short" + ngx.log(ngx.ERR, err) + return nil, err + end + + return cjson.encode(messages) +end + +local transformers_to = { + ["llm/v1/chat"] = to_gemini_chat_openai, +} + +local transformers_from = { + ["llm/v1/chat"] = from_gemini_chat_openai, + ["stream/llm/v1/chat"] = handle_stream_event, +} + +function _M.from_format(response_string, model_info, route_type) + ngx.log(ngx.DEBUG, "converting from ", model_info.provider, "://", route_type, " type to kong") + + -- MUST return a string, to set as the response body + if not transformers_from[route_type] then + return nil, fmt("no transformer available from format %s://%s", model_info.provider, route_type) + end + + local ok, response_string, err, metadata = pcall(transformers_from[route_type], response_string, model_info, route_type) + if not ok or err then + return nil, fmt("transformation failed from type %s://%s: %s", + model_info.provider, + route_type, + err or "unexpected_error" + ) + end + + return response_string, nil, metadata +end + +function _M.to_format(request_table, model_info, route_type) + ngx.log(ngx.DEBUG, "converting from kong type to ", model_info.provider, "/", route_type) + + if route_type == "preserve" then + -- do nothing + return request_table, nil, nil + end + + if not transformers_to[route_type] then + return nil, nil, fmt("no transformer for %s://%s", model_info.provider, route_type) + end + + request_table = ai_shared.merge_config_defaults(request_table, model_info.options, model_info.route_type) + + local ok, response_object, content_type, err = pcall( + transformers_to[route_type], + request_table, + model_info + ) + if err or (not ok) then + return nil, nil, fmt("error transforming to %s://%s: %s", model_info.provider, route_type, err) + end + + return response_object, content_type, nil +end + +function _M.subrequest(body, conf, http_opts, return_res_table) + -- use shared/standard subrequest routine + local body_string, err + + if type(body) == "table" then + body_string, err = cjson.encode(body) + if err then + return nil, nil, "failed to parse body to json: " .. err + end + elseif type(body) == "string" then + body_string = body + else + return nil, nil, "body must be table or string" + end + + -- may be overridden + local url = (conf.model.options and conf.model.options.upstream_url) + or fmt( + "%s%s", + ai_shared.upstream_url_format[DRIVER_NAME], + ai_shared.operation_map[DRIVER_NAME][conf.route_type].path + ) + + local method = ai_shared.operation_map[DRIVER_NAME][conf.route_type].method + + local headers = { + ["Accept"] = "application/json", + ["Content-Type"] = "application/json", + } + + if conf.auth and conf.auth.header_name then + headers[conf.auth.header_name] = conf.auth.header_value + end + + local res, err, httpc = ai_shared.http_request(url, body_string, method, headers, http_opts, return_res_table) + if err then + return nil, nil, "request to ai service failed: " .. err + end + + if return_res_table then + return res, res.status, nil, httpc + else + -- At this point, the entire request / response is complete and the connection + -- will be closed or back on the connection pool. + local status = res.status + local body = res.body + + if status > 299 then + return body, res.status, "status code " .. status + end + + return body, res.status, nil + end +end + +function _M.header_filter_hooks(body) + -- nothing to parse in header_filter phase +end + +function _M.post_request(conf) + if ai_shared.clear_response_headers[DRIVER_NAME] then + for i, v in ipairs(ai_shared.clear_response_headers[DRIVER_NAME]) do + kong.response.clear_header(v) + end + end +end + +function _M.pre_request(conf, body) + -- disable gzip for gemini because it breaks streaming + kong.service.request.set_header("Accept-Encoding", "identity") + + return true, nil +end + +-- returns err or nil +function _M.configure_request(conf, identity_interface) + local parsed_url + local operation = kong.ctx.shared.ai_proxy_streaming_mode and "streamGenerateContent" + or "generateContent" + local f_url = conf.model.options and conf.model.options.upstream_url + + if not f_url then -- upstream_url override is not set + -- check if this is "public" or "vertex" gemini deployment + if conf.model.options + and conf.model.options.gemini + and conf.model.options.gemini.api_endpoint + and conf.model.options.gemini.project_id + and conf.model.options.gemini.location_id + then + -- vertex mode + f_url = fmt(ai_shared.upstream_url_format["gemini_vertex"], + conf.model.options.gemini.api_endpoint) .. + fmt(ai_shared.operation_map["gemini_vertex"][conf.route_type].path, + conf.model.options.gemini.project_id, + conf.model.options.gemini.location_id, + conf.model.name, + operation) + else + -- public mode + f_url = ai_shared.upstream_url_format["gemini"] .. + fmt(ai_shared.operation_map["gemini"][conf.route_type].path, + conf.model.name, + operation) + end + end + + parsed_url = socket_url.parse(f_url) + + if conf.model.options and conf.model.options.upstream_path then + -- upstream path override is set (or templated from request params) + parsed_url.path = conf.model.options.upstream_path + end + + -- if the path is read from a URL capture, ensure that it is valid + parsed_url.path = string_gsub(parsed_url.path, "^/*", "/") + + kong.service.request.set_path(parsed_url.path) + kong.service.request.set_scheme(parsed_url.scheme) + kong.service.set_target(parsed_url.host, (tonumber(parsed_url.port) or 443)) + + local auth_header_name = conf.auth and conf.auth.header_name + local auth_header_value = conf.auth and conf.auth.header_value + local auth_param_name = conf.auth and conf.auth.param_name + local auth_param_value = conf.auth and conf.auth.param_value + local auth_param_location = conf.auth and conf.auth.param_location + + -- DBO restrictions makes sure that only one of these auth blocks runs in one plugin config + if auth_header_name and auth_header_value then + kong.service.request.set_header(auth_header_name, auth_header_value) + end + + if auth_param_name and auth_param_value and auth_param_location == "query" then + local query_table = kong.request.get_query() + query_table[auth_param_name] = auth_param_value + kong.service.request.set_query(query_table) + end + -- if auth_param_location is "form", it will have already been set in a global pre-request hook + + -- if we're passed a GCP SDK, for cloud identity / SSO, use it appropriately + if identity_interface then + if identity_interface:needsRefresh() then + -- HACK: A bug in lua-resty-gcp tries to re-load the environment + -- variable every time, which fails in nginx + -- Create a whole new interface instead. + -- Memory leaks are mega unlikely because this should only + -- happen about once an hour, and the old one will be + -- cleaned up anyway. + local service_account_json = identity_interface.service_account_json + local identity_interface_new = identity_interface:new(service_account_json) + identity_interface.token = identity_interface_new.token + + kong.log.notice("gcp identity token for ", kong.plugin.get_id(), " has been refreshed") + end + + kong.service.request.set_header("Authorization", "Bearer " .. identity_interface.token) + end + + return true +end + +return _M diff --git a/kong/llm/drivers/shared.lua b/kong/llm/drivers/shared.lua index 9d62998c34cd..0e1d0d18a962 100644 --- a/kong/llm/drivers/shared.lua +++ b/kong/llm/drivers/shared.lua @@ -16,7 +16,7 @@ local split = require("kong.tools.string").split local cycle_aware_deep_copy = require("kong.tools.table").cycle_aware_deep_copy local function str_ltrim(s) -- remove leading whitespace from string. - return (s:gsub("^%s*", "")) + return type(s) == "string" and s:gsub("^%s*", "") end -- @@ -55,6 +55,7 @@ _M.streaming_has_token_counts = { ["cohere"] = true, ["llama2"] = true, ["anthropic"] = true, + ["gemini"] = true, } _M.upstream_url_format = { @@ -62,6 +63,8 @@ _M.upstream_url_format = { anthropic = "https://api.anthropic.com:443", cohere = "https://api.cohere.com:443", azure = "https://%s.openai.azure.com:443/openai/deployments/%s", + gemini = "https://generativelanguage.googleapis.com", + gemini_vertex = "https://%s", } _M.operation_map = { @@ -105,6 +108,18 @@ _M.operation_map = { method = "POST", }, }, + gemini = { + ["llm/v1/chat"] = { + path = "/v1beta/models/%s:%s", + method = "POST", + }, + }, + gemini_vertex = { + ["llm/v1/chat"] = { + path = "/v1/projects/%s/locations/%s/publishers/google/models/%s:%s", + method = "POST", + }, + }, } _M.clear_response_headers = { @@ -120,6 +135,9 @@ _M.clear_response_headers = { mistral = { "Set-Cookie", }, + gemini = { + "Set-Cookie", + }, } --- @@ -199,21 +217,44 @@ end -- as if it were an SSE message. -- -- @param {string} frame input string to format into SSE events --- @param {string} delimiter delimeter (can be complex string) to split by +-- @param {boolean} raw_json sets application/json byte-parser mode -- @return {table} n number of split SSE messages, or empty table -function _M.frame_to_events(frame) +function _M.frame_to_events(frame, raw_json_mode) local events = {} + if (not frame) or (#frame < 1) or (type(frame)) ~= "string" then + return + end + + -- some new LLMs return the JSON object-by-object, + -- because that totally makes sense to parse?! + if raw_json_mode then + -- if this is the first frame, it will begin with array opener '[' + frame = (string.sub(str_ltrim(frame), 1, 1) == "[" and string.sub(str_ltrim(frame), 2)) or frame + + -- it may start with ',' which is the start of the new frame + frame = (string.sub(str_ltrim(frame), 1, 1) == "," and string.sub(str_ltrim(frame), 2)) or frame + + -- finally, it may end with the array terminator ']' indicating the finished stream + frame = (string.sub(str_ltrim(frame), -1) == "]" and string.sub(str_ltrim(frame), 1, -2)) or frame + + -- for multiple events that arrive in the same frame, split by top-level comma + for _, v in ipairs(split(frame, "\n,")) do + events[#events+1] = { data = v } + end + + -- check if it's raw json and just return the split up data frame -- Cohere / Other flat-JSON format parser -- just return the split up data frame - if (not kong or not kong.ctx.plugin.truncated_frame) and string.sub(str_ltrim(frame), 1, 1) == "{" then + elseif (not kong or not kong.ctx.plugin.truncated_frame) and string.sub(str_ltrim(frame), 1, 1) == "{" then for event in frame:gmatch("[^\r\n]+") do events[#events + 1] = { data = event, } end + + -- standard SSE parser else - -- standard SSE parser local event_lines = split(frame, "\n") local struct = { event = nil, id = nil, data = nil } @@ -226,7 +267,10 @@ function _M.frame_to_events(frame) -- test for truncated chunk on the last line (no trailing \r\n\r\n) if #dat > 0 and #event_lines == i then ngx.log(ngx.DEBUG, "[ai-proxy] truncated sse frame head") - kong.ctx.plugin.truncated_frame = dat + if kong then + kong.ctx.plugin.truncated_frame = dat + end + break -- stop parsing immediately, server has done something wrong end @@ -404,24 +448,26 @@ function _M.resolve_plugin_conf(kong_request, conf) -- handle all other options for k, v in pairs(conf.model.options or {}) do - local prop_m = string_match(v or "", '%$%((.-)%)') - if prop_m then - local splitted = split(prop_m, '.') - if #splitted ~= 2 then - return nil, "cannot parse expression for field '" .. v .. "'" - end - - -- find the request parameter, with the configured name - prop_m, err = _M.conf_from_request(kong_request, splitted[1], splitted[2]) - if err then - return nil, err - end - if not prop_m then - return nil, splitted[1] .. " key " .. splitted[2] .. " was not provided" - end + if type(v) == "string" then + local prop_m = string_match(v or "", '%$%((.-)%)') + if prop_m then + local splitted = split(prop_m, '.') + if #splitted ~= 2 then + return nil, "cannot parse expression for field '" .. v .. "'" + end + + -- find the request parameter, with the configured name + prop_m, err = _M.conf_from_request(kong_request, splitted[1], splitted[2]) + if err then + return nil, err + end + if not prop_m then + return nil, splitted[1] .. " key " .. splitted[2] .. " was not provided" + end - -- replace the value - conf_m.model.options[k] = prop_m + -- replace the value + conf_m.model.options[k] = prop_m + end end end diff --git a/kong/llm/init.lua b/kong/llm/init.lua index aaf3af08a790..85802e54b9c7 100644 --- a/kong/llm/init.lua +++ b/kong/llm/init.lua @@ -10,8 +10,6 @@ local _M = { config_schema = require "kong.llm.schemas", } - - do -- formats_compatible is a map of formats that are compatible with each other. local formats_compatible = { diff --git a/kong/llm/schemas/init.lua b/kong/llm/schemas/init.lua index 15ce1a2a1ef0..9dc68f16db8a 100644 --- a/kong/llm/schemas/init.lua +++ b/kong/llm/schemas/init.lua @@ -2,6 +2,28 @@ local typedefs = require("kong.db.schema.typedefs") local fmt = string.format +local gemini_options_schema = { + type = "record", + required = false, + fields = { + { api_endpoint = { + type = "string", + description = "If running Gemini on Vertex, specify the regional API endpoint (hostname only).", + required = false }}, + { project_id = { + type = "string", + description = "If running Gemini on Vertex, specify the project ID.", + required = false }}, + { location_id = { + type = "string", + description = "If running Gemini on Vertex, specify the location ID.", + required = false }}, + }, + entity_checks = { + { mutually_required = { "api_endpoint", "project_id", "location_id" }, }, + }, +} + local auth_schema = { type = "record", @@ -34,11 +56,22 @@ local auth_schema = { description = "Specify whether the 'param_name' and 'param_value' options go in a query string, or the POST form/JSON body.", required = false, one_of = { "query", "body" } }}, + { gcp_use_service_account = { + type = "boolean", + description = "Use service account auth for GCP-based providers and models.", + required = false, + default = false }}, + { gcp_service_account_json = { + type = "string", + description = "Set this field to the full JSON of the GCP service account to authenticate, if required. " .. + "If null (and gcp_use_service_account is true), Kong will attempt to read from " .. + "environment variable `GCP_SERVICE_ACCOUNT`.", + required = false, + referenceable = true }}, } } - local model_options_schema = { description = "Key/value settings for the model", type = "record", @@ -110,6 +143,7 @@ local model_options_schema = { .. "used when e.g. using the 'preserve' route_type.", type = "string", required = false }}, + { gemini = gemini_options_schema }, } } @@ -123,7 +157,7 @@ local model_schema = { type = "string", description = "AI provider request format - Kong translates " .. "requests to and from the specified backend compatible formats.", required = true, - one_of = { "openai", "azure", "anthropic", "cohere", "mistral", "llama2" }}}, + one_of = { "openai", "azure", "anthropic", "cohere", "mistral", "llama2", "gemini" }}}, { name = { type = "string", description = "Model name to execute.", diff --git a/kong/plugins/ai-proxy/handler.lua b/kong/plugins/ai-proxy/handler.lua index 35e13fbe8d9e..8e661e89317e 100644 --- a/kong/plugins/ai-proxy/handler.lua +++ b/kong/plugins/ai-proxy/handler.lua @@ -6,6 +6,14 @@ local kong_meta = require("kong.meta") local buffer = require "string.buffer" local strip = require("kong.tools.utils").strip +-- cloud auth/sdk providers +local GCP_SERVICE_ACCOUNT do + GCP_SERVICE_ACCOUNT = os.getenv("GCP_SERVICE_ACCOUNT") +end + +local GCP = require("resty.gcp.request.credentials.accesstoken") +-- + local EMPTY = {} @@ -16,16 +24,41 @@ local _M = { } +-- static messages +local ERROR__NOT_SET = 'data: {"error": true, "message": "empty or unsupported transformer response"}' + + +local _KEYBASTION = setmetatable({}, { + __mode = "k", + __index = function(this_cache, plugin_config) + if plugin_config.model.provider == "gemini" and + plugin_config.auth and + plugin_config.auth.gcp_use_service_account then + + ngx.log(ngx.NOTICE, "loading gcp sdk for plugin ", kong.plugin.get_id()) + + local service_account_json = (plugin_config.auth and plugin_config.auth.gcp_service_account_json) or GCP_SERVICE_ACCOUNT + + local ok, gcp_auth = pcall(GCP.new, nil, service_account_json) + if ok and gcp_auth then + -- store our item for the next time we need it + gcp_auth.service_account_json = service_account_json + this_cache[plugin_config] = { interface = gcp_auth, error = nil } + return this_cache[plugin_config] + end + + return { interface = nil, error = "cloud-authentication with GCP failed" } + end + end, +}) + ---- Return a 400 response with a JSON body. This function is used to --- return errors to the client while also logging the error. local function bad_request(msg) kong.log.info(msg) return kong.response.exit(400, { error = { message = msg } }) end - -- get the token text from an event frame local function get_token_text(event_t) -- get: event_t.choices[1] @@ -63,10 +96,43 @@ local function handle_streaming_frame(conf) -- because we have already 200 OK'd the client by now if (not finished) and (is_gzip) then - chunk = kong_utils.inflate_gzip(chunk) + chunk = kong_utils.inflate_gzip(ngx.arg[1]) + end + + local is_raw_json = conf.model.provider == "gemini" + local events = ai_shared.frame_to_events(chunk, is_raw_json ) + + if not events then + -- usually a not-supported-transformer or empty frames. + -- header_filter has already run, so all we can do is log it, + -- and then send the client a readable error in a single chunk + local response = ERROR__NOT_SET + + if is_gzip then + response = kong_utils.deflate_gzip(response) + end + + ngx.arg[1] = response + ngx.arg[2] = true + + return end - local events = ai_shared.frame_to_events(chunk) + if not events then + -- usually a not-supported-transformer or empty frames. + -- header_filter has already run, so all we can do is log it, + -- and then send the client a readable error in a single chunk + local response = ERROR__NOT_SET + + if is_gzip then + response = kong_utils.deflate_gzip(response) + end + + ngx.arg[1] = response + ngx.arg[2] = true + + return + end for _, event in ipairs(events) do local formatted, _, metadata = ai_driver.from_format(event, conf.model, "stream/" .. conf.route_type) @@ -320,7 +386,7 @@ function _M:access(conf) if not request_table then if not string.find(content_type, "multipart/form-data", nil, true) then - return bad_request("content-type header does not match request body") + return bad_request("content-type header does not match request body, or bad JSON formatting") end multipart = true -- this may be a large file upload, so we have to proxy it directly @@ -365,13 +431,6 @@ function _M:access(conf) end end - -- check the incoming format is the same as the configured LLM format - local compatible, err = llm.is_compatible(request_table, route_type) - if not compatible then - kong_ctx_shared.skip_response_transformer = true - return bad_request(err) - end - -- check if the user has asked for a stream, and/or if -- we are forcing all requests to be of streaming type if request_table and request_table.stream or @@ -384,8 +443,9 @@ function _M:access(conf) return bad_request("response streaming is not enabled for this LLM") end - -- store token cost estimate, on first pass - if not kong_ctx_plugin.ai_stream_prompt_tokens then + -- store token cost estimate, on first pass, if the + -- provider doesn't reply with a prompt token count + if (not kong.ctx.plugin.ai_stream_prompt_tokens) and (not ai_shared.streaming_has_token_counts[conf_m.model.provider]) then local prompt_tokens, err = ai_shared.calculate_cost(request_table or {}, {}, 1.8) if err then kong.log.err("unable to estimate request token cost: ", err) @@ -431,8 +491,17 @@ function _M:access(conf) kong.service.request.set_body(parsed_request_body, content_type) end + -- get the provider's cached identity interface - nil may come back, which is fine + local identity_interface = _KEYBASTION[conf] + if identity_interface and identity_interface.error then + kong.ctx.shared.skip_response_transformer = true + kong.log.err("error authenticating with cloud-provider, ", identity_interface.error) + return kong.response.exit(500, "LLM request failed before proxying") + end + -- now re-configure the request for this operation type - local ok, err = ai_driver.configure_request(conf_m) + local ok, err = ai_driver.configure_request(conf_m, + identity_interface and identity_interface.interface) if not ok then kong_ctx_shared.skip_response_transformer = true kong.log.err("failed to configure request for AI service: ", err) diff --git a/spec/03-plugins/38-ai-proxy/01-unit_spec.lua b/spec/03-plugins/38-ai-proxy/01-unit_spec.lua index c1dfadfb4aca..aeb42600d639 100644 --- a/spec/03-plugins/38-ai-proxy/01-unit_spec.lua +++ b/spec/03-plugins/38-ai-proxy/01-unit_spec.lua @@ -223,6 +223,20 @@ local FORMATS = { }, }, }, + gemini = { + ["llm/v1/chat"] = { + config = { + name = "gemini-pro", + provider = "gemini", + options = { + max_tokens = 8192, + temperature = 0.8, + top_k = 1, + top_p = 0.6, + }, + }, + }, + }, } local STREAMS = { @@ -646,5 +660,48 @@ describe(PLUGIN_NAME .. ": (unit)", function() }, formatted) end) + describe("streaming transformer tests", function() + + it("transforms truncated-json type (beginning of stream)", function() + local input = pl_file.read(fmt("spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-beginning/input.bin")) + local events = ai_shared.frame_to_events(input, true) + + local expected = pl_file.read(fmt("spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-beginning/expected-output.json")) + local expected_events = cjson.decode(expected) + + assert.same(events, expected_events, true) + end) + + it("transforms truncated-json type (end of stream)", function() + local input = pl_file.read(fmt("spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-end/input.bin")) + local events = ai_shared.frame_to_events(input, true) + + local expected = pl_file.read(fmt("spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-end/expected-output.json")) + local expected_events = cjson.decode(expected) + + assert.same(events, expected_events, true) + end) + + it("transforms complete-json type", function() + local input = pl_file.read(fmt("spec/fixtures/ai-proxy/unit/streaming-chunk-formats/complete-json/input.bin")) + local events = ai_shared.frame_to_events(input, false) -- not "truncated json mode" like Gemini + + local expected = pl_file.read(fmt("spec/fixtures/ai-proxy/unit/streaming-chunk-formats/complete-json/expected-output.json")) + local expected_events = cjson.decode(expected) + + assert.same(events, expected_events) + end) + + it("transforms text/event-stream type", function() + local input = pl_file.read(fmt("spec/fixtures/ai-proxy/unit/streaming-chunk-formats/text-event-stream/input.bin")) + local events = ai_shared.frame_to_events(input, false) -- not "truncated json mode" like Gemini + + local expected = pl_file.read(fmt("spec/fixtures/ai-proxy/unit/streaming-chunk-formats/text-event-stream/expected-output.json")) + local expected_events = cjson.decode(expected) + + assert.same(events, expected_events) + end) + + end) end) diff --git a/spec/fixtures/ai-proxy/unit/expected-requests/gemini/llm-v1-chat.json b/spec/fixtures/ai-proxy/unit/expected-requests/gemini/llm-v1-chat.json new file mode 100644 index 000000000000..f236df678a4d --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/expected-requests/gemini/llm-v1-chat.json @@ -0,0 +1,57 @@ +{ + "contents": [ + { + "role": "user", + "parts": [ + { + "text": "What is 1 + 2?" + } + ] + }, + { + "role": "model", + "parts": [ + { + "text": "The sum of 1 + 2 is 3. If you have any more math questions or if there's anything else I can help you with, feel free to ask!" + } + ] + }, + { + "role": "user", + "parts": [ + { + "text": "Multiply that by 2" + } + ] + }, + { + "role": "model", + "parts": [ + { + "text": "Certainly! If you multiply 3 by 2, the result is 6. If you have any more questions or if there's anything else I can help you with, feel free to ask!" + } + ] + }, + { + "role": "user", + "parts": [ + { + "text": "Why can't you divide by zero?" + } + ] + } + ], + "generationConfig": { + "temperature": 0.8, + "topK": 1, + "topP": 0.6, + "maxOutputTokens": 8192 + }, + "systemInstruction": { + "parts": [ + { + "text": "You are a mathematician." + } + ] + } +} \ No newline at end of file diff --git a/spec/fixtures/ai-proxy/unit/expected-responses/gemini/llm-v1-chat.json b/spec/fixtures/ai-proxy/unit/expected-responses/gemini/llm-v1-chat.json new file mode 100644 index 000000000000..90a1656d2a37 --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/expected-responses/gemini/llm-v1-chat.json @@ -0,0 +1,14 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "Ah, vous voulez savoir le double de ce résultat ? Eh bien, le double de 2 est **4**. \n", + "role": "assistant" + } + } + ], + "model": "gemini-pro", + "object": "chat.completion" +} \ No newline at end of file diff --git a/spec/fixtures/ai-proxy/unit/real-responses/gemini/llm-v1-chat.json b/spec/fixtures/ai-proxy/unit/real-responses/gemini/llm-v1-chat.json new file mode 100644 index 000000000000..80781b6eb72a --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/real-responses/gemini/llm-v1-chat.json @@ -0,0 +1,34 @@ +{ + "candidates": [ + { + "content": { + "parts": [ + { + "text": "Ah, vous voulez savoir le double de ce résultat ? Eh bien, le double de 2 est **4**. \n" + } + ], + "role": "model" + }, + "finishReason": "STOP", + "index": 0, + "safetyRatings": [ + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "probability": "NEGLIGIBLE" + } + ] + } + ] + } \ No newline at end of file diff --git a/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/complete-json/expected-output.json b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/complete-json/expected-output.json new file mode 100644 index 000000000000..b08549afbf4e --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/complete-json/expected-output.json @@ -0,0 +1,8 @@ +[ + { + "data": "{\"is_finished\":false,\"event_type\":\"stream-start\",\"generation_id\":\"10f31c2f-1a4c-48cf-b500-dc8141a25ae5\"}" + }, + { + "data": "{\"is_finished\":false,\"event_type\":\"text-generation\",\"text\":\"2\"}" + } +] \ No newline at end of file diff --git a/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/complete-json/input.bin b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/complete-json/input.bin new file mode 100644 index 000000000000..af13220a423a --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/complete-json/input.bin @@ -0,0 +1,2 @@ +{"is_finished":false,"event_type":"stream-start","generation_id":"10f31c2f-1a4c-48cf-b500-dc8141a25ae5"} +{"is_finished":false,"event_type":"text-generation","text":"2"} \ No newline at end of file diff --git a/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-beginning/expected-output.json b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-beginning/expected-output.json new file mode 100644 index 000000000000..5f3b0afa51d4 --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-beginning/expected-output.json @@ -0,0 +1,14 @@ +[ + { + "data": "{\n \"candidates\": [\n {\n \"content\": {\n \"parts\": [\n {\n \"text\": \"The\"\n }\n ],\n \"role\": \"model\"\n },\n \"finishReason\": \"STOP\",\n \"index\": 0\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\": 6,\n \"candidatesTokenCount\": 1,\n \"totalTokenCount\": 7\n }\n}" + }, + { + "data": "\n{\n \"candidates\": [\n {\n \"content\": {\n \"parts\": [\n {\n \"text\": \" theory of relativity is actually two theories by Albert Einstein: **special relativity** and\"\n }\n ],\n \"role\": \"model\"\n },\n \"finishReason\": \"STOP\",\n \"index\": 0,\n \"safetyRatings\": [\n {\n \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HARASSMENT\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n \"probability\": \"NEGLIGIBLE\"\n }\n ]\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\": 6,\n \"candidatesTokenCount\": 17,\n \"totalTokenCount\": 23\n }\n}" + }, + { + "data": "\n{\n \"candidates\": [\n {\n \"content\": {\n \"parts\": [\n {\n \"text\": \" **general relativity**. Here's a simplified breakdown:\\n\\n**Special Relativity (\"\n }\n ],\n \"role\": \"model\"\n },\n \"finishReason\": \"STOP\",\n \"index\": 0,\n \"safetyRatings\": [\n {\n \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HARASSMENT\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n \"probability\": \"NEGLIGIBLE\"\n }\n ]\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\": 6,\n \"candidatesTokenCount\": 33,\n \"totalTokenCount\": 39\n }\n}" + }, + { + "data": "\n{\n \"candidates\": [\n {\n \"content\": {\n \"parts\": [\n {\n \"text\": \"1905):**\\n\\n* **Focus:** The relationship between space and time.\\n* **Key ideas:**\\n * **Speed of light\"\n }\n ],\n \"role\": \"model\"\n },\n \"finishReason\": \"STOP\",\n \"index\": 0,\n \"safetyRatings\": [\n {\n \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HARASSMENT\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n \"probability\": \"NEGLIGIBLE\"\n }\n ]\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\": 6,\n \"candidatesTokenCount\": 65,\n \"totalTokenCount\": 71\n }\n}\n" + } +] \ No newline at end of file diff --git a/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-beginning/input.bin b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-beginning/input.bin new file mode 100644 index 000000000000..8cef2a01fa8d --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-beginning/input.bin @@ -0,0 +1,141 @@ +[{ + "candidates": [ + { + "content": { + "parts": [ + { + "text": "The" + } + ], + "role": "model" + }, + "finishReason": "STOP", + "index": 0 + } + ], + "usageMetadata": { + "promptTokenCount": 6, + "candidatesTokenCount": 1, + "totalTokenCount": 7 + } +} +, +{ + "candidates": [ + { + "content": { + "parts": [ + { + "text": " theory of relativity is actually two theories by Albert Einstein: **special relativity** and" + } + ], + "role": "model" + }, + "finishReason": "STOP", + "index": 0, + "safetyRatings": [ + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "probability": "NEGLIGIBLE" + } + ] + } + ], + "usageMetadata": { + "promptTokenCount": 6, + "candidatesTokenCount": 17, + "totalTokenCount": 23 + } +} +, +{ + "candidates": [ + { + "content": { + "parts": [ + { + "text": " **general relativity**. Here's a simplified breakdown:\n\n**Special Relativity (" + } + ], + "role": "model" + }, + "finishReason": "STOP", + "index": 0, + "safetyRatings": [ + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "probability": "NEGLIGIBLE" + } + ] + } + ], + "usageMetadata": { + "promptTokenCount": 6, + "candidatesTokenCount": 33, + "totalTokenCount": 39 + } +} +, +{ + "candidates": [ + { + "content": { + "parts": [ + { + "text": "1905):**\n\n* **Focus:** The relationship between space and time.\n* **Key ideas:**\n * **Speed of light" + } + ], + "role": "model" + }, + "finishReason": "STOP", + "index": 0, + "safetyRatings": [ + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "probability": "NEGLIGIBLE" + } + ] + } + ], + "usageMetadata": { + "promptTokenCount": 6, + "candidatesTokenCount": 65, + "totalTokenCount": 71 + } +} diff --git a/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-end/expected-output.json b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-end/expected-output.json new file mode 100644 index 000000000000..ba6a64384d95 --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-end/expected-output.json @@ -0,0 +1,8 @@ +[ + { + "data": "{\n \"candidates\": [\n {\n \"content\": {\n \"parts\": [\n {\n \"text\": \" is constant:** No matter how fast you are moving, light always travels at the same speed (approximately 299,792,458\"\n }\n ],\n \"role\": \"model\"\n },\n \"finishReason\": \"STOP\",\n \"index\": 0,\n \"safetyRatings\": [\n {\n \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HARASSMENT\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n \"probability\": \"NEGLIGIBLE\"\n }\n ]\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\": 6,\n \"candidatesTokenCount\": 97,\n \"totalTokenCount\": 103\n }\n}" + }, + { + "data": "\n{\n \"candidates\": [\n {\n \"content\": {\n \"parts\": [\n {\n \"text\": \" not a limit.\\n\\nIf you're interested in learning more about relativity, I encourage you to explore further resources online or in books. There are many excellent introductory materials available. \\n\"\n }\n ],\n \"role\": \"model\"\n },\n \"finishReason\": \"STOP\",\n \"index\": 0,\n \"safetyRatings\": [\n {\n \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_HARASSMENT\",\n \"probability\": \"NEGLIGIBLE\"\n },\n {\n \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n \"probability\": \"NEGLIGIBLE\"\n }\n ]\n }\n ],\n \"usageMetadata\": {\n \"promptTokenCount\": 6,\n \"candidatesTokenCount\": 547,\n \"totalTokenCount\": 553\n }\n}\n" + } +] \ No newline at end of file diff --git a/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-end/input.bin b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-end/input.bin new file mode 100644 index 000000000000..d6489e74d19d --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/partial-json-end/input.bin @@ -0,0 +1,80 @@ +,{ + "candidates": [ + { + "content": { + "parts": [ + { + "text": " is constant:** No matter how fast you are moving, light always travels at the same speed (approximately 299,792,458" + } + ], + "role": "model" + }, + "finishReason": "STOP", + "index": 0, + "safetyRatings": [ + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "probability": "NEGLIGIBLE" + } + ] + } + ], + "usageMetadata": { + "promptTokenCount": 6, + "candidatesTokenCount": 97, + "totalTokenCount": 103 + } +} +, +{ + "candidates": [ + { + "content": { + "parts": [ + { + "text": " not a limit.\n\nIf you're interested in learning more about relativity, I encourage you to explore further resources online or in books. There are many excellent introductory materials available. \n" + } + ], + "role": "model" + }, + "finishReason": "STOP", + "index": 0, + "safetyRatings": [ + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "probability": "NEGLIGIBLE" + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "probability": "NEGLIGIBLE" + } + ] + } + ], + "usageMetadata": { + "promptTokenCount": 6, + "candidatesTokenCount": 547, + "totalTokenCount": 553 + } +} +] \ No newline at end of file diff --git a/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/text-event-stream/expected-output.json b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/text-event-stream/expected-output.json new file mode 100644 index 000000000000..f515516c7ec8 --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/text-event-stream/expected-output.json @@ -0,0 +1,11 @@ +[ + { + "data": "{ \"choices\": [ { \"delta\": { \"content\": \"\", \"role\": \"assistant\" }, \"finish_reason\": null, \"index\": 0, \"logprobs\": null } ], \"created\": 1720136012, \"id\": \"chatcmpl-9hQFArK1oMZcRwaIa86RGwrjVNmeY\", \"model\": \"gpt-4-0613\", \"object\": \"chat.completion.chunk\", \"system_fingerprint\": null}" + }, + { + "data": "{ \"choices\": [ { \"delta\": { \"content\": \"2\" }, \"finish_reason\": null, \"index\": 0, \"logprobs\": null } ], \"created\": 1720136012, \"id\": \"chatcmpl-9hQFArK1oMZcRwaIa86RGwrjVNmeY\", \"model\": \"gpt-4-0613\", \"object\": \"chat.completion.chunk\", \"system_fingerprint\": null}" + }, + { + "data": "{ \"choices\": [ { \"delta\": {}, \"finish_reason\": \"stop\", \"index\": 0, \"logprobs\": null } ], \"created\": 1720136012, \"id\": \"chatcmpl-9hQFArK1oMZcRwaIa86RGwrjVNmeY\", \"model\": \"gpt-4-0613\", \"object\": \"chat.completion.chunk\", \"system_fingerprint\": null}" + } +] \ No newline at end of file diff --git a/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/text-event-stream/input.bin b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/text-event-stream/input.bin new file mode 100644 index 000000000000..efe2ad50c657 --- /dev/null +++ b/spec/fixtures/ai-proxy/unit/streaming-chunk-formats/text-event-stream/input.bin @@ -0,0 +1,7 @@ +data: { "choices": [ { "delta": { "content": "", "role": "assistant" }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 1720136012, "id": "chatcmpl-9hQFArK1oMZcRwaIa86RGwrjVNmeY", "model": "gpt-4-0613", "object": "chat.completion.chunk", "system_fingerprint": null} + +data: { "choices": [ { "delta": { "content": "2" }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 1720136012, "id": "chatcmpl-9hQFArK1oMZcRwaIa86RGwrjVNmeY", "model": "gpt-4-0613", "object": "chat.completion.chunk", "system_fingerprint": null} + +data: { "choices": [ { "delta": {}, "finish_reason": "stop", "index": 0, "logprobs": null } ], "created": 1720136012, "id": "chatcmpl-9hQFArK1oMZcRwaIa86RGwrjVNmeY", "model": "gpt-4-0613", "object": "chat.completion.chunk", "system_fingerprint": null} + +data: [DONE] \ No newline at end of file