From 7dfddc7036e8eebce0fd0619ec510c4e38a5b14d Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Wed, 16 Oct 2024 18:50:58 -0400 Subject: [PATCH] centralized request middleware Signed-off-by: Dave Lee --- Makefile | 2 +- backend/python/autogptq/requirements.txt | 2 +- backend/python/bark/requirements.txt | 2 +- .../python/common/template/requirements.txt | 2 +- backend/python/coqui/requirements.txt | 2 +- backend/python/diffusers/requirements.txt | 2 +- backend/python/exllama2/requirements.txt | 2 +- backend/python/mamba/requirements.txt | 2 +- .../python/openvoice/requirements-intel.txt | 2 +- backend/python/openvoice/requirements.txt | 2 +- backend/python/parler-tts/requirements.txt | 2 +- backend/python/rerankers/requirements.txt | 2 +- .../sentencetransformers/requirements.txt | 2 +- .../transformers-musicgen/requirements.txt | 2 +- backend/python/transformers/requirements.txt | 2 +- backend/python/vall-e-x/requirements.txt | 2 +- backend/python/vllm/install.sh | 2 +- backend/python/vllm/requirements.txt | 2 +- core/backend/llm.go | 6 +- core/backend/tokenize.go | 4 +- core/config/backend_config.go | 31 +- core/config/backend_config_loader.go | 12 +- core/config/guesser.go | 9 +- core/http/app.go | 12 +- core/http/ctx/fiber.go | 47 -- .../endpoints/elevenlabs/soundgeneration.go | 42 +- core/http/endpoints/elevenlabs/tts.go | 36 +- core/http/endpoints/jina/rerank.go | 48 +- .../endpoints/localai/get_token_metrics.go | 15 +- core/http/endpoints/localai/tokenize.go | 43 +- core/http/endpoints/localai/tts.go | 50 +- core/http/endpoints/openai/chat.go | 26 +- core/http/endpoints/openai/completion.go | 25 +- core/http/endpoints/openai/edit.go | 18 +- core/http/endpoints/openai/embeddings.go | 14 +- core/http/endpoints/openai/image.go | 21 +- core/http/endpoints/openai/inference.go | 2 +- core/http/endpoints/openai/transcription.go | 16 +- .../openai => middleware}/request.go | 753 ++++++++++-------- core/http/routes/elevenlabs.go | 13 +- core/http/routes/jina.go | 8 +- core/http/routes/localai.go | 16 +- core/http/routes/openai.go | 73 +- core/schema/elevenlabs.go | 14 + core/schema/jina.go | 3 +- core/schema/localai.go | 14 +- core/schema/prediction.go | 2 +- core/schema/request.go | 22 + core/schema/tokenize.go | 2 +- core/services/list_models.go | 12 + core/startup/startup.go | 8 +- gallery/index.yaml | 124 +++ pkg/model/initializers.go | 2 +- tests/e2e-aio/e2e_test.go | 4 +- 54 files changed, 904 insertions(+), 679 deletions(-) delete mode 100644 core/http/ctx/fiber.go rename core/http/{endpoints/openai => middleware}/request.go (51%) create mode 100644 core/schema/request.go diff --git a/Makefile b/Makefile index 154ae5587e6f..aea5e1572542 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=a89f75e1b7b90cb2d4d4c52ca53ef9e9b466aa45 +CPPLLAMA_VERSION?=9e041024481f6b249ab8918e18b9477f873b5a5e # go-rwkv version RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp diff --git a/backend/python/autogptq/requirements.txt b/backend/python/autogptq/requirements.txt index 9cb6ce94f589..7e66f084958e 100644 --- a/backend/python/autogptq/requirements.txt +++ b/backend/python/autogptq/requirements.txt @@ -1,6 +1,6 @@ accelerate auto-gptq==0.7.1 -grpcio==1.66.2 +grpcio==1.67.0 protobuf certifi transformers \ No newline at end of file diff --git a/backend/python/bark/requirements.txt b/backend/python/bark/requirements.txt index 
6e46924ae77b..d1a90719f9f4 100644 --- a/backend/python/bark/requirements.txt +++ b/backend/python/bark/requirements.txt @@ -1,4 +1,4 @@ bark==0.1.5 -grpcio==1.66.2 +grpcio==1.67.0 protobuf certifi \ No newline at end of file diff --git a/backend/python/common/template/requirements.txt b/backend/python/common/template/requirements.txt index 540c0eb56cf6..16716764c30a 100644 --- a/backend/python/common/template/requirements.txt +++ b/backend/python/common/template/requirements.txt @@ -1,2 +1,2 @@ -grpcio==1.66.2 +grpcio==1.67.0 protobuf \ No newline at end of file diff --git a/backend/python/coqui/requirements.txt b/backend/python/coqui/requirements.txt index 29484f7d26f6..0ad62f7027e9 100644 --- a/backend/python/coqui/requirements.txt +++ b/backend/python/coqui/requirements.txt @@ -1,4 +1,4 @@ coqui-tts -grpcio==1.66.2 +grpcio==1.67.0 protobuf certifi \ No newline at end of file diff --git a/backend/python/diffusers/requirements.txt b/backend/python/diffusers/requirements.txt index 730e316f68dd..624b048e7848 100644 --- a/backend/python/diffusers/requirements.txt +++ b/backend/python/diffusers/requirements.txt @@ -1,5 +1,5 @@ setuptools -grpcio==1.66.2 +grpcio==1.67.0 pillow protobuf certifi diff --git a/backend/python/exllama2/requirements.txt b/backend/python/exllama2/requirements.txt index e3db2b2f2016..8a0d9a17d563 100644 --- a/backend/python/exllama2/requirements.txt +++ b/backend/python/exllama2/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.66.2 +grpcio==1.67.0 protobuf certifi wheel diff --git a/backend/python/mamba/requirements.txt b/backend/python/mamba/requirements.txt index 83ae4279db55..6be5d8ac6b89 100644 --- a/backend/python/mamba/requirements.txt +++ b/backend/python/mamba/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.66.2 +grpcio==1.67.0 protobuf certifi \ No newline at end of file diff --git a/backend/python/openvoice/requirements-intel.txt b/backend/python/openvoice/requirements-intel.txt index 687efe78b52b..b446386f9761 100644 --- a/backend/python/openvoice/requirements-intel.txt +++ b/backend/python/openvoice/requirements-intel.txt @@ -2,7 +2,7 @@ intel-extension-for-pytorch torch optimum[openvino] -grpcio==1.66.2 +grpcio==1.67.0 protobuf librosa==0.9.1 faster-whisper==1.0.3 diff --git a/backend/python/openvoice/requirements.txt b/backend/python/openvoice/requirements.txt index 6ee29ce4d185..fd1268d0c215 100644 --- a/backend/python/openvoice/requirements.txt +++ b/backend/python/openvoice/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.66.2 +grpcio==1.67.0 protobuf librosa faster-whisper diff --git a/backend/python/parler-tts/requirements.txt b/backend/python/parler-tts/requirements.txt index d7f36feb3837..ff9adca994b2 100644 --- a/backend/python/parler-tts/requirements.txt +++ b/backend/python/parler-tts/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.66.2 +grpcio==1.67.0 protobuf certifi llvmlite==0.43.0 \ No newline at end of file diff --git a/backend/python/rerankers/requirements.txt b/backend/python/rerankers/requirements.txt index 83ae4279db55..6be5d8ac6b89 100644 --- a/backend/python/rerankers/requirements.txt +++ b/backend/python/rerankers/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.66.2 +grpcio==1.67.0 protobuf certifi \ No newline at end of file diff --git a/backend/python/sentencetransformers/requirements.txt b/backend/python/sentencetransformers/requirements.txt index 40a387f12c20..36ce8a6f30e5 100644 --- a/backend/python/sentencetransformers/requirements.txt +++ b/backend/python/sentencetransformers/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.66.2 +grpcio==1.67.0 
protobuf certifi datasets diff --git a/backend/python/transformers-musicgen/requirements.txt b/backend/python/transformers-musicgen/requirements.txt index a3f666512907..ea0e3fa9632e 100644 --- a/backend/python/transformers-musicgen/requirements.txt +++ b/backend/python/transformers-musicgen/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.66.2 +grpcio==1.67.0 protobuf scipy==1.14.0 certifi \ No newline at end of file diff --git a/backend/python/transformers/requirements.txt b/backend/python/transformers/requirements.txt index 084cc034e7b8..d006cf0e1e44 100644 --- a/backend/python/transformers/requirements.txt +++ b/backend/python/transformers/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.66.2 +grpcio==1.67.0 protobuf certifi setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file diff --git a/backend/python/vall-e-x/requirements.txt b/backend/python/vall-e-x/requirements.txt index 83ae4279db55..6be5d8ac6b89 100644 --- a/backend/python/vall-e-x/requirements.txt +++ b/backend/python/vall-e-x/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.66.2 +grpcio==1.67.0 protobuf certifi \ No newline at end of file diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index 9078b81b84a1..69e74a0624be 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -22,7 +22,7 @@ if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then git clone https://github.com/vllm-project/vllm fi pushd vllm - uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes + uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.67.0 protobuf bitsandbytes uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu VLLM_TARGET_DEVICE=cpu python setup.py install popd diff --git a/backend/python/vllm/requirements.txt b/backend/python/vllm/requirements.txt index 8fb8a4185fe7..95447f746da3 100644 --- a/backend/python/vllm/requirements.txt +++ b/backend/python/vllm/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.66.2 +grpcio==1.67.0 protobuf certifi setuptools \ No newline at end of file diff --git a/core/backend/llm.go b/core/backend/llm.go index d946d3f88170..31f5a7da756e 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -31,13 +31,13 @@ type TokenUsage struct { Completion int } -func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) { +func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c *config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) { modelFile := c.Model var inferenceModel grpc.Backend var err error - opts := ModelOptions(c, o, []model.Option{}) + opts := ModelOptions(*c, o, []model.Option{}) if c.Backend != "" { opts = append(opts, model.WithBackendString(c.Backend)) @@ -85,7 +85,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im // in GRPC, the backend is supposed to answer to 1 single token if stream is not supported fn := func() (LLMResponse, error) { - opts := gRPCPredictOpts(c, loader.ModelPath) + opts := gRPCPredictOpts(*c, 
loader.ModelPath) opts.Prompt = s opts.Messages = protoMessages opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate diff --git a/core/backend/tokenize.go b/core/backend/tokenize.go index c8ec8d1cb260..57fef1b4fa83 100644 --- a/core/backend/tokenize.go +++ b/core/backend/tokenize.go @@ -7,9 +7,7 @@ import ( model "github.com/mudler/LocalAI/pkg/model" ) -func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) { - - modelFile := backendConfig.Model +func ModelTokenize(s string, modelFile string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) { var inferenceModel grpc.Backend var err error diff --git a/core/config/backend_config.go b/core/config/backend_config.go index 79e134d8b93c..ab4d5fdd82ec 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -433,19 +433,20 @@ func (c *BackendConfig) HasTemplate() bool { type BackendConfigUsecases int const ( - FLAG_ANY BackendConfigUsecases = 0b000000000 - FLAG_CHAT BackendConfigUsecases = 0b000000001 - FLAG_COMPLETION BackendConfigUsecases = 0b000000010 - FLAG_EDIT BackendConfigUsecases = 0b000000100 - FLAG_EMBEDDINGS BackendConfigUsecases = 0b000001000 - FLAG_RERANK BackendConfigUsecases = 0b000010000 - FLAG_IMAGE BackendConfigUsecases = 0b000100000 - FLAG_TRANSCRIPT BackendConfigUsecases = 0b001000000 - FLAG_TTS BackendConfigUsecases = 0b010000000 - FLAG_SOUND_GENERATION BackendConfigUsecases = 0b100000000 + FLAG_ANY BackendConfigUsecases = 0b0000000000 + FLAG_CHAT BackendConfigUsecases = 0b0000000001 + FLAG_COMPLETION BackendConfigUsecases = 0b0000000010 + FLAG_EDIT BackendConfigUsecases = 0b0000000100 + FLAG_EMBEDDINGS BackendConfigUsecases = 0b0000001000 + FLAG_RERANK BackendConfigUsecases = 0b0000010000 + FLAG_IMAGE BackendConfigUsecases = 0b0000100000 + FLAG_TRANSCRIPT BackendConfigUsecases = 0b0001000000 + FLAG_TTS BackendConfigUsecases = 0b0010000000 + FLAG_SOUND_GENERATION BackendConfigUsecases = 0b0100000000 + FLAG_TOKENIZE BackendConfigUsecases = 0b1000000000 // Common Subsets - FLAG_LLM BackendConfigUsecases = FLAG_CHAT & FLAG_COMPLETION & FLAG_EDIT + FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT ) func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases { @@ -460,6 +461,7 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases { "FLAG_TRANSCRIPT": FLAG_TRANSCRIPT, "FLAG_TTS": FLAG_TTS, "FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION, + "FLAG_TOKENIZE": FLAG_TOKENIZE, "FLAG_LLM": FLAG_LLM, } } @@ -545,5 +547,12 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool { } } + if (u & FLAG_TOKENIZE) == FLAG_TOKENIZE { + tokenizeCapableBackends := []string{"llama.cpp", "rwkv"} + if !slices.Contains(tokenizeCapableBackends, c.Backend) { + return false + } + } + return true } diff --git a/core/config/backend_config_loader.go b/core/config/backend_config_loader.go index 7fe49bab322a..8244672277ba 100644 --- a/core/config/backend_config_loader.go +++ b/core/config/backend_config_loader.go @@ -117,7 +117,9 @@ func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath // Load a config file if present after the model name cfg := &BackendConfig{ PredictionOptions: schema.PredictionOptions{ - Model: modelName, + BasicModelRequest: schema.BasicModelRequest{ + Model: modelName, + }, }, } @@ -145,6 +147,14 @@ func (bcl 
*BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath return cfg, nil } +func (bcl *BackendConfigLoader) LoadBackendConfigFileByNameDefaultOptions(modelName string, appConfig *ApplicationConfig) (*BackendConfig, error) { + return bcl.LoadBackendConfigFileByName(modelName, appConfig.ModelPath, + LoadOptionDebug(appConfig.Debug), + LoadOptionThreads(appConfig.Threads), + LoadOptionContextSize(appConfig.ContextSize), + LoadOptionF16(appConfig.F16)) +} + // This format is currently only used when reading a single file at startup, passed in via ApplicationConfig.ConfigFile func (bcl *BackendConfigLoader) LoadMultipleBackendConfigsSingleFile(file string, opts ...ConfigLoaderOption) error { bcl.Lock() diff --git a/core/config/guesser.go b/core/config/guesser.go index b63dd051a32a..3dea311ffe23 100644 --- a/core/config/guesser.go +++ b/core/config/guesser.go @@ -26,14 +26,14 @@ const ( type settingsConfig struct { StopWords []string TemplateConfig TemplateConfig - RepeatPenalty float64 + RepeatPenalty float64 } // default settings to adopt with a given model family var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{ Gemma: { RepeatPenalty: 1.0, - StopWords: []string{"<|im_end|>", "", ""}, + StopWords: []string{"<|im_end|>", "", ""}, TemplateConfig: TemplateConfig{ Chat: "{{.Input }}\nmodel\n", ChatMessage: "{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}", @@ -161,10 +161,11 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) { } // We try to guess only if we don't have a template defined already - f, err := gguf.ParseGGUFFile(filepath.Join(modelPath, cfg.ModelFileName())) + guessPath := filepath.Join(modelPath, cfg.ModelFileName()) + f, err := gguf.ParseGGUFFile(guessPath) if err != nil { // Only valid for gguf files - log.Debug().Msgf("guessDefaultsFromFile: %s", "not a GGUF file") + log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file") return } diff --git a/core/http/app.go b/core/http/app.go index 2cf0ad17f26c..3384eb39b5db 100644 --- a/core/http/app.go +++ b/core/http/app.go @@ -121,7 +121,7 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi }) } - // Health Checks should always be exempt from auth, so register these first + // Health Checks should always be exempt from auth, so register these first routes.HealthRoutes(app) kaConfig, err := middleware.GetKeyAuthConfig(appConfig) @@ -156,13 +156,15 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi galleryService := services.NewGalleryService(appConfig) galleryService.Start(appConfig.Context, cl) - routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig) - routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService) - routes.RegisterOpenAIRoutes(app, cl, ml, appConfig) + requestExtractor := middleware.NewRequestExtractor(cl, ml, appConfig) + + routes.RegisterElevenLabsRoutes(app, requestExtractor, cl, ml, appConfig) + routes.RegisterLocalAIRoutes(app, requestExtractor, cl, ml, appConfig, galleryService) + routes.RegisterOpenAIRoutes(app, requestExtractor, cl, ml, appConfig) if !appConfig.DisableWebUI { routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService) } - routes.RegisterJINARoutes(app, cl, ml, appConfig) + routes.RegisterJINARoutes(app, requestExtractor, cl, ml, appConfig) httpFS := http.FS(embedDirStatic) diff --git a/core/http/ctx/fiber.go b/core/http/ctx/fiber.go deleted file mode 100644 
index 254f070400b7..000000000000 --- a/core/http/ctx/fiber.go +++ /dev/null @@ -1,47 +0,0 @@ -package fiberContext - -import ( - "fmt" - "strings" - - "github.com/gofiber/fiber/v2" - "github.com/mudler/LocalAI/core/config" - "github.com/mudler/LocalAI/core/services" - "github.com/mudler/LocalAI/pkg/model" - "github.com/rs/zerolog/log" -) - -// ModelFromContext returns the model from the context -// If no model is specified, it will take the first available -// Takes a model string as input which should be the one received from the user request. -// It returns the model name resolved from the context and an error if any. -func ModelFromContext(ctx *fiber.Ctx, cl *config.BackendConfigLoader, loader *model.ModelLoader, modelInput string, firstModel bool) (string, error) { - if ctx.Params("model") != "" { - modelInput = ctx.Params("model") - } - if ctx.Query("model") != "" { - modelInput = ctx.Query("model") - } - // Set model from bearer token, if available - bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // Reduced duplicate characters of Bearer - bearerExists := bearer != "" && loader.ExistsInModelPath(bearer) - - // If no model was specified, take the first available - if modelInput == "" && !bearerExists && firstModel { - models, _ := services.ListModels(cl, loader, config.NoFilterFn, services.SKIP_IF_CONFIGURED) - if len(models) > 0 { - modelInput = models[0] - log.Debug().Msgf("No model specified, using: %s", modelInput) - } else { - log.Debug().Msgf("No model specified, returning error") - return "", fmt.Errorf("no model specified") - } - } - - // If a model is found in bearer token takes precedence - if bearerExists { - log.Debug().Msgf("Using model from bearer token: %s", bearer) - modelInput = bearer - } - return modelInput, nil -} diff --git a/core/http/endpoints/elevenlabs/soundgeneration.go b/core/http/endpoints/elevenlabs/soundgeneration.go index 345df35b8a2b..a1318825f68f 100644 --- a/core/http/endpoints/elevenlabs/soundgeneration.go +++ b/core/http/endpoints/elevenlabs/soundgeneration.go @@ -4,7 +4,7 @@ import ( "github.com/gofiber/fiber/v2" "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" - fiberContext "github.com/mudler/LocalAI/core/http/ctx" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/model" "github.com/rs/zerolog/log" @@ -17,45 +17,21 @@ import ( // @Router /v1/sound-generation [post] func SoundGenerationEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { return func(c *fiber.Ctx) error { - input := new(schema.ElevenLabsSoundGenerationRequest) - // Get input data from the request body - if err := c.BodyParser(input); err != nil { - return err - } - modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.ModelID, false) - if err != nil { - modelFile = input.ModelID - log.Warn().Str("ModelID", input.ModelID).Msg("Model not found in context") + input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.ElevenLabsSoundGenerationRequest) + if !ok || input.ModelID == "" { + return fiber.ErrBadRequest } - cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath, - config.LoadOptionDebug(appConfig.Debug), - config.LoadOptionThreads(appConfig.Threads), - config.LoadOptionContextSize(appConfig.ContextSize), - config.LoadOptionF16(appConfig.F16), - ) - if err != nil { - modelFile = input.ModelID - log.Warn().Str("Request ModelID", 
input.ModelID).Err(err).Msg("error during LoadBackendConfigFileByName, using request ModelID") - } else { - if input.ModelID != "" { - modelFile = input.ModelID - } else { - modelFile = cfg.Model - } + cfg, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || cfg == nil { + return fiber.ErrBadRequest } - log.Debug().Str("modelFile", "modelFile").Str("backend", cfg.Backend).Msg("Sound Generation Request about to be sent to backend") - if input.Duration != nil { - log.Debug().Float32("duration", *input.Duration).Msg("duration set") - } - if input.Temperature != nil { - log.Debug().Float32("temperature", *input.Temperature).Msg("temperature set") - } + log.Debug().Str("modelFile", "modelFile").Str("backend", cfg.Backend).Msg("Sound Generation Request about to be sent to backend") // TODO: Support uploading files? - filePath, _, err := backend.SoundGeneration(modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg) + filePath, _, err := backend.SoundGeneration(input.ModelID, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg) if err != nil { return err } diff --git a/core/http/endpoints/elevenlabs/tts.go b/core/http/endpoints/elevenlabs/tts.go index bb6901be8878..f207ce5229cf 100644 --- a/core/http/endpoints/elevenlabs/tts.go +++ b/core/http/endpoints/elevenlabs/tts.go @@ -3,7 +3,7 @@ package elevenlabs import ( "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" - fiberContext "github.com/mudler/LocalAI/core/http/ctx" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/pkg/model" "github.com/gofiber/fiber/v2" @@ -20,39 +20,21 @@ import ( func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { return func(c *fiber.Ctx) error { - input := new(schema.ElevenLabsTTSRequest) voiceID := c.Params("voice-id") - // Get input data from the request body - if err := c.BodyParser(input); err != nil { - return err + input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.ElevenLabsTTSRequest) + if !ok || input.ModelID == "" { + return fiber.ErrBadRequest } - modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.ModelID, false) - if err != nil { - modelFile = input.ModelID - log.Warn().Msgf("Model not found in context: %s", input.ModelID) + cfg, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || cfg == nil { + return fiber.ErrBadRequest } - cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath, - config.LoadOptionDebug(appConfig.Debug), - config.LoadOptionThreads(appConfig.Threads), - config.LoadOptionContextSize(appConfig.ContextSize), - config.LoadOptionF16(appConfig.F16), - ) - if err != nil { - modelFile = input.ModelID - log.Warn().Msgf("Model not found in context: %s", input.ModelID) - } else { - if input.ModelID != "" { - modelFile = input.ModelID - } else { - modelFile = cfg.Model - } - } - log.Debug().Msgf("Request for model: %s", modelFile) + log.Debug().Str("modelName", input.ModelID).Msg("elevenlabs TTS request received") - filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg) + filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, input.ModelID, "", voiceID, ml, appConfig, *cfg) if err != nil { return err } diff --git a/core/http/endpoints/jina/rerank.go
b/core/http/endpoints/jina/rerank.go index 58c3972d6555..26af3c8bb9b0 100644 --- a/core/http/endpoints/jina/rerank.go +++ b/core/http/endpoints/jina/rerank.go @@ -3,9 +3,9 @@ package jina import ( "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/gofiber/fiber/v2" - fiberContext "github.com/mudler/LocalAI/core/http/ctx" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/grpc/proto" "github.com/mudler/LocalAI/pkg/model" @@ -19,58 +19,36 @@ import ( // @Router /v1/rerank [post] func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { return func(c *fiber.Ctx) error { - req := new(schema.JINARerankRequest) - if err := c.BodyParser(req); err != nil { - return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{ - "error": "Cannot parse JSON", - }) - } - - input := new(schema.TTSRequest) - - // Get input data from the request body - if err := c.BodyParser(input); err != nil { - return err - } - modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false) - if err != nil { - modelFile = input.Model - log.Warn().Msgf("Model not found in context: %s", input.Model) + input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.JINARerankRequest) + if !ok || input.Model == "" { + return fiber.ErrBadRequest } - cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath, - config.LoadOptionDebug(appConfig.Debug), - config.LoadOptionThreads(appConfig.Threads), - config.LoadOptionContextSize(appConfig.ContextSize), - config.LoadOptionF16(appConfig.F16), - ) - if err != nil { - modelFile = input.Model - log.Warn().Msgf("Model not found in context: %s", input.Model) - } else { - modelFile = cfg.Model + cfg, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || cfg == nil { + return fiber.ErrBadRequest } - log.Debug().Msgf("Request for model: %s", modelFile) + log.Debug().Str("model", input.Model).Msg("JINA Rerank Request received") if input.Backend != "" { cfg.Backend = input.Backend } request := &proto.RerankRequest{ - Query: req.Query, - TopN: int32(req.TopN), - Documents: req.Documents, + Query: input.Query, + TopN: int32(input.TopN), + Documents: input.Documents, } - results, err := backend.Rerank(modelFile, request, ml, appConfig, *cfg) + results, err := backend.Rerank(input.Model, request, ml, appConfig, *cfg) if err != nil { return err } response := &schema.JINARerankResponse{ - Model: req.Model, + Model: input.Model, } for _, r := range results.Results { diff --git a/core/http/endpoints/localai/get_token_metrics.go b/core/http/endpoints/localai/get_token_metrics.go index e0e6943f1295..30de2cdd5f20 100644 --- a/core/http/endpoints/localai/get_token_metrics.go +++ b/core/http/endpoints/localai/get_token_metrics.go @@ -4,13 +4,15 @@ import ( "github.com/gofiber/fiber/v2" "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" - fiberContext "github.com/mudler/LocalAI/core/http/ctx" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/schema" "github.com/rs/zerolog/log" "github.com/mudler/LocalAI/pkg/model" ) +// TODO: This is not yet in use. Needs middleware rework, since it is not referenced. + // TokenMetricsEndpoint is an endpoint to get TokensProcessed Per Second for Active SlotID // // @Summary Get TokenMetrics for Active Slot.
@@ -29,18 +31,13 @@ func TokenMetricsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, return err } - modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false) - if err != nil { + modelFile, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_NAME).(string) + if !ok || modelFile == "" { modelFile = input.Model log.Warn().Msgf("Model not found in context: %s", input.Model) } - cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath, - config.LoadOptionDebug(appConfig.Debug), - config.LoadOptionThreads(appConfig.Threads), - config.LoadOptionContextSize(appConfig.ContextSize), - config.LoadOptionF16(appConfig.F16), - ) + cfg, err := cl.LoadBackendConfigFileByNameDefaultOptions(modelFile, appConfig) if err != nil { log.Err(err) diff --git a/core/http/endpoints/localai/tokenize.go b/core/http/endpoints/localai/tokenize.go index da110bf864e0..69783ae3cfe1 100644 --- a/core/http/endpoints/localai/tokenize.go +++ b/core/http/endpoints/localai/tokenize.go @@ -4,10 +4,9 @@ import ( "github.com/gofiber/fiber/v2" "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" - fiberContext "github.com/mudler/LocalAI/core/http/ctx" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/model" - "github.com/rs/zerolog/log" ) // TokenizeEndpoint exposes a REST API to tokenize the content // @Success 200 {object} schema.TokenizeResponse "Response" // @Router /v1/tokenize [post] func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { - return func(c *fiber.Ctx) error { - - input := new(schema.TokenizeRequest) - - // Get input data from the request body - if err := c.BodyParser(input); err != nil { - return err + return func(ctx *fiber.Ctx) error { + input, ok := ctx.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.TokenizeRequest) + if !ok || input.Model == "" { + return fiber.ErrBadRequest } - modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false) - if err != nil { - modelFile = input.Model - log.Warn().Msgf("Model not found in context: %s", input.Model) + cfg, ok := ctx.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || cfg == nil { + return fiber.ErrBadRequest } - cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath, - config.LoadOptionDebug(appConfig.Debug), - config.LoadOptionThreads(appConfig.Threads), - config.LoadOptionContextSize(appConfig.ContextSize), - config.LoadOptionF16(appConfig.F16), - ) - - if err != nil { - log.Err(err) - modelFile = input.Model - log.Warn().Msgf("Model not found in context: %s", input.Model) - } else { - modelFile = cfg.Model - } - log.Debug().Msgf("Request for model: %s", modelFile) - - tokenResponse, err := backend.ModelTokenize(input.Content, ml, *cfg, appConfig) + tokenResponse, err := backend.ModelTokenize(input.Content, input.Model, ml, *cfg, appConfig) if err != nil { return err } - c.JSON(tokenResponse) - return nil - + return ctx.JSON(tokenResponse) } } diff --git a/core/http/endpoints/localai/tts.go b/core/http/endpoints/localai/tts.go index ca3f58bd9e28..e23c5aea4215 100644 --- a/core/http/endpoints/localai/tts.go +++ b/core/http/endpoints/localai/tts.go @@ -3,7 +3,7 @@ package localai import ( "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" - fiberContext
"github.com/mudler/LocalAI/core/http/ctx" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/pkg/model" "github.com/gofiber/fiber/v2" @@ -12,44 +12,28 @@ import ( ) // TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech -// @Summary Generates audio from the input text. -// @Accept json -// @Produce audio/x-wav -// @Param request body schema.TTSRequest true "query params" -// @Success 200 {string} binary "generated audio/wav file" -// @Router /v1/audio/speech [post] -// @Router /tts [post] +// +// @Summary Generates audio from the input text. +// @Accept json +// @Produce audio/x-wav +// @Param request body schema.TTSRequest true "query params" +// @Success 200 {string} binary "generated audio/wav file" +// @Router /v1/audio/speech [post] +// @Router /tts [post] func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { return func(c *fiber.Ctx) error { - input := new(schema.TTSRequest) - - // Get input data from the request body - if err := c.BodyParser(input); err != nil { - return err + input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.TTSRequest) + if !ok || input.Model == "" { + return fiber.ErrBadRequest } - modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false) - if err != nil { - modelFile = input.Model - log.Warn().Msgf("Model not found in context: %s", input.Model) + cfg, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || cfg == nil { + return fiber.ErrBadRequest } - cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath, - config.LoadOptionDebug(appConfig.Debug), - config.LoadOptionThreads(appConfig.Threads), - config.LoadOptionContextSize(appConfig.ContextSize), - config.LoadOptionF16(appConfig.F16), - ) - - if err != nil { - log.Err(err) - modelFile = input.Model - log.Warn().Msgf("Model not found in context: %s", input.Model) - } else { - modelFile = cfg.Model - } - log.Debug().Msgf("Request for model: %s", modelFile) + log.Debug().Str("model", input.Model).Msg("LocalAI TTS Request received") if input.Backend != "" { cfg.Backend = input.Backend @@ -63,7 +47,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi cfg.Voice = input.Voice } - filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg) + filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, input.Model, cfg.Voice, cfg.Language, ml, appConfig, *cfg) if err != nil { return err } diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 1ac1387eed3e..8657a2390774 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -12,9 +12,10 @@ import ( "github.com/google/uuid" "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/functions" - model "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/model" "github.com/rs/zerolog/log" "github.com/valyala/fasthttp" ) @@ -161,23 +162,18 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup textContentToReturn = "" id = uuid.New().String() created = int(time.Now().Unix()) - // Set CorrelationID - correlationID := c.Get("X-Correlation-ID") - if
len(strings.TrimSpace(correlationID)) == 0 { - correlationID = id - } - c.Set("X-Correlation-ID", correlationID) - modelFile, input, err := readRequest(c, cl, ml, startupOptions, true) - if err != nil { - return fmt.Errorf("failed reading parameters from request:%w", err) + input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest) + if !ok || input.Model == "" { + return fiber.ErrBadRequest } - config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, startupOptions.Debug, startupOptions.Threads, startupOptions.ContextSize, startupOptions.F16) - if err != nil { - return fmt.Errorf("failed reading parameters from request:%w", err) + config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || config == nil { + return fiber.ErrBadRequest } - log.Debug().Msgf("Configuration read: %+v", config) + + log.Debug().Msgf("Chat endpoint configuration read: %+v", config) funcs := input.Functions shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions() @@ -656,7 +652,7 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m audios = append(audios, m.StringAudios...) } - predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, *config, o, nil) + predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, o, nil) if err != nil { log.Error().Err(err).Msg("model inference failed") return "", err diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index e5de1b3f0296..3cbf9d186477 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -10,6 +10,7 @@ import ( "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/gofiber/fiber/v2" "github.com/google/uuid" @@ -26,10 +27,9 @@ import ( // @Success 200 {object} schema.OpenAIResponse "Response" // @Router /v1/completions [post] func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { - id := uuid.New().String() created := int(time.Now().Unix()) - process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) { + process := func(id string, s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) { ComputeChoices(req, s, config, appConfig, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { resp := schema.OpenAIResponse{ ID: id, @@ -57,18 +57,17 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a } return func(c *fiber.Ctx) error { - // Add Correlation - c.Set("X-Correlation-ID", id) - modelFile, input, err := readRequest(c, cl, ml, appConfig, true) - if err != nil { - return fmt.Errorf("failed reading parameters from request:%w", err) - } + // Handle Correlation + id := c.Get("X-Correlation-ID", uuid.New().String()) - log.Debug().Msgf("`input`: %+v", input) + input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest) + if !ok || input.Model == "" { + return fiber.ErrBadRequest + } - config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, appConfig.Debug, appConfig.Threads, 
appConfig.ContextSize, appConfig.F16) - if err != nil { - return fmt.Errorf("failed reading parameters from request:%w", err) + config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || config == nil { + return fiber.ErrBadRequest } if config.ResponseFormatMap != nil { @@ -125,7 +124,7 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a responses := make(chan schema.OpenAIResponse) - go process(predInput, input, config, ml, responses) + go process(id, predInput, input, config, ml, responses) c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) { diff --git a/core/http/endpoints/openai/edit.go b/core/http/endpoints/openai/edit.go index 12fb40352552..58cc06b8d66e 100644 --- a/core/http/endpoints/openai/edit.go +++ b/core/http/endpoints/openai/edit.go @@ -7,11 +7,12 @@ import ( "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/gofiber/fiber/v2" "github.com/google/uuid" "github.com/mudler/LocalAI/core/schema" - model "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/model" "github.com/rs/zerolog/log" ) @@ -23,17 +24,18 @@ import ( // @Router /v1/edits [post] func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { return func(c *fiber.Ctx) error { - modelFile, input, err := readRequest(c, cl, ml, appConfig, true) - if err != nil { - return fmt.Errorf("failed reading parameters from request:%w", err) + input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest) + if !ok || input.Model == "" { + return fiber.ErrBadRequest } - config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16) - if err != nil { - return fmt.Errorf("failed reading parameters from request:%w", err) + config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || config == nil { + return fiber.ErrBadRequest } - log.Debug().Msgf("Parameter Config: %+v", config) + log.Debug().Msgf("Edit Endpoint Input : %+v", input) + log.Debug().Msgf("Edit Endpoint Config: %+v", *config) templateFile := "" diff --git a/core/http/endpoints/openai/embeddings.go b/core/http/endpoints/openai/embeddings.go index e247d84e332b..9cbbe189457d 100644 --- a/core/http/endpoints/openai/embeddings.go +++ b/core/http/endpoints/openai/embeddings.go @@ -2,11 +2,11 @@ package openai import ( "encoding/json" - "fmt" "time" "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/pkg/model" "github.com/google/uuid" @@ -23,14 +23,14 @@ import ( // @Router /v1/embeddings [post] func EmbeddingsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { return func(c *fiber.Ctx) error { - model, input, err := readRequest(c, cl, ml, appConfig, true) - if err != nil { - return fmt.Errorf("failed reading parameters from request:%w", err) + input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest) + if !ok || input.Model == "" { + return fiber.ErrBadRequest } - config, input, err := mergeRequestWithConfig(model, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16) - if err != nil { - 
return fmt.Errorf("failed reading parameters from request:%w", err) + config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || config == nil { + return fiber.ErrBadRequest } log.Debug().Msgf("Parameter Config: %+v", config) diff --git a/core/http/endpoints/openai/image.go b/core/http/endpoints/openai/image.go index 6c76ba843275..3e89ee0e0714 100644 --- a/core/http/endpoints/openai/image.go +++ b/core/http/endpoints/openai/image.go @@ -15,6 +15,7 @@ import ( "github.com/google/uuid" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/backend" @@ -66,25 +67,23 @@ func downloadFile(url string) (string, error) { // @Router /v1/images/generations [post] func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { return func(c *fiber.Ctx) error { - m, input, err := readRequest(c, cl, ml, appConfig, false) - if err != nil { - return fmt.Errorf("failed reading parameters from request:%w", err) - } - - if m == "" { - m = model.StableDiffusionBackend + input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest) + if !ok || input.Model == "" { + log.Error().Msg("Image Endpoint - Invalid Input") + return fiber.ErrBadRequest } - log.Debug().Msgf("Loading model: %+v", m) - config, input, err := mergeRequestWithConfig(m, input, cl, ml, appConfig.Debug, 0, 0, false) - if err != nil { - return fmt.Errorf("failed reading parameters from request:%w", err) + config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || config == nil { + log.Error().Msg("Image Endpoint - Invalid Config") + return fiber.ErrBadRequest } src := "" if input.File != "" { fileData := []byte{} + var err error // check if input.File is an URL, if so download it and save it // to a temporary file if strings.HasPrefix(input.File, "http://") || strings.HasPrefix(input.File, "https://") { diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go index da75d3a1ea5b..fd85bb74d1de 100644 --- a/core/http/endpoints/openai/inference.go +++ b/core/http/endpoints/openai/inference.go @@ -37,7 +37,7 @@ func ComputeChoices( } // get the model function to call for the result - predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, *config, o, tokenCallback) + predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, config, o, tokenCallback) if err != nil { return result, backend.TokenUsage{}, err } diff --git a/core/http/endpoints/openai/transcription.go b/core/http/endpoints/openai/transcription.go index 4e23f8046c6a..b10e06ef6963 100644 --- a/core/http/endpoints/openai/transcription.go +++ b/core/http/endpoints/openai/transcription.go @@ -1,7 +1,6 @@ package openai import ( - "fmt" "io" "net/http" "os" @@ -10,6 +9,8 @@ import ( "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" model "github.com/mudler/LocalAI/pkg/model" "github.com/gofiber/fiber/v2" @@ -25,15 +26,16 @@ import ( // @Router /v1/audio/transcriptions [post] func TranscriptEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { return func(c 
*fiber.Ctx) error { - m, input, err := readRequest(c, cl, ml, appConfig, false) - if err != nil { - return fmt.Errorf("failed reading parameters from request:%w", err) + input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest) + if !ok || input.Model == "" { + return fiber.ErrBadRequest } - config, input, err := mergeRequestWithConfig(m, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16) - if err != nil { - return fmt.Errorf("failed reading parameters from request: %w", err) + config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || config == nil { + return fiber.ErrBadRequest } + // retrieve the file data from the request file, err := c.FormFile("file") if err != nil { diff --git a/core/http/endpoints/openai/request.go b/core/http/middleware/request.go similarity index 51% rename from core/http/endpoints/openai/request.go rename to core/http/middleware/request.go index a418433ed959..e15de7910ab3 100644 --- a/core/http/endpoints/openai/request.go +++ b/core/http/middleware/request.go @@ -1,315 +1,438 @@ -package openai - -import ( - "context" - "encoding/json" - "fmt" - - "github.com/gofiber/fiber/v2" - "github.com/google/uuid" - "github.com/mudler/LocalAI/core/config" - fiberContext "github.com/mudler/LocalAI/core/http/ctx" - "github.com/mudler/LocalAI/core/schema" - "github.com/mudler/LocalAI/pkg/functions" - "github.com/mudler/LocalAI/pkg/model" - "github.com/mudler/LocalAI/pkg/templates" - "github.com/mudler/LocalAI/pkg/utils" - "github.com/rs/zerolog/log" -) - -type correlationIDKeyType string - -// CorrelationIDKey to track request across process boundary -const CorrelationIDKey correlationIDKeyType = "correlationID" - -func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLoader, o *config.ApplicationConfig, firstModel bool) (string, *schema.OpenAIRequest, error) { - input := new(schema.OpenAIRequest) - - // Get input data from the request body - if err := c.BodyParser(input); err != nil { - return "", nil, fmt.Errorf("failed parsing request body: %w", err) - } - - received, _ := json.Marshal(input) - // Extract or generate the correlation ID - correlationID := c.Get("X-Correlation-ID", uuid.New().String()) - - ctx, cancel := context.WithCancel(o.Context) - // Add the correlation ID to the new context - ctxWithCorrelationID := context.WithValue(ctx, CorrelationIDKey, correlationID) - - input.Context = ctxWithCorrelationID - input.Cancel = cancel - - log.Debug().Msgf("Request received: %s", string(received)) - - modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, firstModel) - - return modelFile, input, err -} - -func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIRequest) { - if input.Echo { - config.Echo = input.Echo - } - if input.TopK != nil { - config.TopK = input.TopK - } - if input.TopP != nil { - config.TopP = input.TopP - } - - if input.Backend != "" { - config.Backend = input.Backend - } - - if input.ClipSkip != 0 { - config.Diffusers.ClipSkip = input.ClipSkip - } - - if input.ModelBaseName != "" { - config.AutoGPTQ.ModelBaseName = input.ModelBaseName - } - - if input.NegativePromptScale != 0 { - config.NegativePromptScale = input.NegativePromptScale - } - - if input.UseFastTokenizer { - config.UseFastTokenizer = input.UseFastTokenizer - } - - if input.NegativePrompt != "" { - config.NegativePrompt = input.NegativePrompt - } - - if input.RopeFreqBase != 0 { - config.RopeFreqBase = 
input.RopeFreqBase - } - - if input.RopeFreqScale != 0 { - config.RopeFreqScale = input.RopeFreqScale - } - - if input.Grammar != "" { - config.Grammar = input.Grammar - } - - if input.Temperature != nil { - config.Temperature = input.Temperature - } - - if input.Maxtokens != nil { - config.Maxtokens = input.Maxtokens - } - - if input.ResponseFormat != nil { - switch responseFormat := input.ResponseFormat.(type) { - case string: - config.ResponseFormat = responseFormat - case map[string]interface{}: - config.ResponseFormatMap = responseFormat - } - } - - switch stop := input.Stop.(type) { - case string: - if stop != "" { - config.StopWords = append(config.StopWords, stop) - } - case []interface{}: - for _, pp := range stop { - if s, ok := pp.(string); ok { - config.StopWords = append(config.StopWords, s) - } - } - } - - if len(input.Tools) > 0 { - for _, tool := range input.Tools { - input.Functions = append(input.Functions, tool.Function) - } - } - - if input.ToolsChoice != nil { - var toolChoice functions.Tool - - switch content := input.ToolsChoice.(type) { - case string: - _ = json.Unmarshal([]byte(content), &toolChoice) - case map[string]interface{}: - dat, _ := json.Marshal(content) - _ = json.Unmarshal(dat, &toolChoice) - } - input.FunctionCall = map[string]interface{}{ - "name": toolChoice.Function.Name, - } - } - - // Decode each request's message content - imgIndex, vidIndex, audioIndex := 0, 0, 0 - for i, m := range input.Messages { - switch content := m.Content.(type) { - case string: - input.Messages[i].StringContent = content - case []interface{}: - dat, _ := json.Marshal(content) - c := []schema.Content{} - json.Unmarshal(dat, &c) - CONTENT: - for _, pp := range c { - switch pp.Type { - case "text": - input.Messages[i].StringContent = pp.Text - case "video", "video_url": - // Decode content as base64 either if it's an URL or base64 text - base64, err := utils.GetContentURIAsBase64(pp.VideoURL.URL) - if err != nil { - log.Error().Msgf("Failed encoding video: %s", err) - continue CONTENT - } - input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff - - t := "[vid-{{.ID}}]{{.Text}}" - if config.TemplateConfig.Video != "" { - t = config.TemplateConfig.Video - } - // set a placeholder for each image - input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, vidIndex, input.Messages[i].StringContent) - vidIndex++ - case "audio_url", "audio": - // Decode content as base64 either if it's an URL or base64 text - base64, err := utils.GetContentURIAsBase64(pp.AudioURL.URL) - if err != nil { - log.Error().Msgf("Failed encoding image: %s", err) - continue CONTENT - } - input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff - // set a placeholder for each image - t := "[audio-{{.ID}}]{{.Text}}" - if config.TemplateConfig.Audio != "" { - t = config.TemplateConfig.Audio - } - input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, audioIndex, input.Messages[i].StringContent) - audioIndex++ - case "image_url", "image": - // Decode content as base64 either if it's an URL or base64 text - base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL) - if err != nil { - log.Error().Msgf("Failed encoding image: %s", err) - continue CONTENT - } - - t := "[img-{{.ID}}]{{.Text}}" - if config.TemplateConfig.Image != "" { - t = config.TemplateConfig.Image - } - input.Messages[i].StringImages = 
append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff - // set a placeholder for each image - input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, imgIndex, input.Messages[i].StringContent) - imgIndex++ - } - } - } - } - - if input.RepeatPenalty != 0 { - config.RepeatPenalty = input.RepeatPenalty - } - - if input.FrequencyPenalty != 0 { - config.FrequencyPenalty = input.FrequencyPenalty - } - - if input.PresencePenalty != 0 { - config.PresencePenalty = input.PresencePenalty - } - - if input.Keep != 0 { - config.Keep = input.Keep - } - - if input.Batch != 0 { - config.Batch = input.Batch - } - - if input.IgnoreEOS { - config.IgnoreEOS = input.IgnoreEOS - } - - if input.Seed != nil { - config.Seed = input.Seed - } - - if input.TypicalP != nil { - config.TypicalP = input.TypicalP - } - - switch inputs := input.Input.(type) { - case string: - if inputs != "" { - config.InputStrings = append(config.InputStrings, inputs) - } - case []interface{}: - for _, pp := range inputs { - switch i := pp.(type) { - case string: - config.InputStrings = append(config.InputStrings, i) - case []interface{}: - tokens := []int{} - for _, ii := range i { - tokens = append(tokens, int(ii.(float64))) - } - config.InputToken = append(config.InputToken, tokens) - } - } - } - - // Can be either a string or an object - switch fnc := input.FunctionCall.(type) { - case string: - if fnc != "" { - config.SetFunctionCallString(fnc) - } - case map[string]interface{}: - var name string - n, exists := fnc["name"] - if exists { - nn, e := n.(string) - if e { - name = nn - } - } - config.SetFunctionCallNameString(name) - } - - switch p := input.Prompt.(type) { - case string: - config.PromptStrings = append(config.PromptStrings, p) - case []interface{}: - for _, pp := range p { - if s, ok := pp.(string); ok { - config.PromptStrings = append(config.PromptStrings, s) - } - } - } -} - -func mergeRequestWithConfig(modelFile string, input *schema.OpenAIRequest, cm *config.BackendConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.BackendConfig, *schema.OpenAIRequest, error) { - cfg, err := cm.LoadBackendConfigFileByName(modelFile, loader.ModelPath, - config.LoadOptionDebug(debug), - config.LoadOptionThreads(threads), - config.LoadOptionContextSize(ctx), - config.LoadOptionF16(f16), - config.ModelPath(loader.ModelPath), - ) - - // Set the parameters for the language model prediction - updateRequestConfig(cfg, input) - - if !cfg.Validate() { - return nil, nil, fmt.Errorf("failed to validate config") - } - - return cfg, input, err -} +package middleware + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + "github.com/google/uuid" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/core/services" + "github.com/mudler/LocalAI/pkg/functions" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/templates" + "github.com/mudler/LocalAI/pkg/utils" + + "github.com/gofiber/fiber/v2" + "github.com/rs/zerolog/log" +) + +type correlationIDKeyType string + +// CorrelationIDKey to track request across process boundary +const CorrelationIDKey correlationIDKeyType = "correlationID" + +type RequestExtractor struct { + backendConfigLoader *config.BackendConfigLoader + modelLoader *model.ModelLoader + applicationConfig *config.ApplicationConfig +} + +func NewRequestExtractor(backendConfigLoader *config.BackendConfigLoader, modelLoader *model.ModelLoader, 
applicationConfig *config.ApplicationConfig) *RequestExtractor { + return &RequestExtractor{ + backendConfigLoader: backendConfigLoader, + modelLoader: modelLoader, + applicationConfig: applicationConfig, + } +} + +const CONTEXT_LOCALS_KEY_MODEL_NAME = "MODEL_NAME" +const CONTEXT_LOCALS_KEY_LOCALAI_REQUEST = "LOCALAI_REQUEST" +const CONTEXT_LOCALS_KEY_MODEL_CONFIG = "MODEL_CONFIG" + +// TODO: Refactor to not return error if unchanged +func (re *RequestExtractor) setModelNameFromRequest(ctx *fiber.Ctx) { + model, ok := ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME).(string) + if ok && model != "" { + return + } + model = ctx.Params("model") + + if (model == "") && ctx.Query("model") != "" { + model = ctx.Query("model") + } + + if model == "" { + // Set model from bearer token, if available + bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // "Bearer " => "Bear" to please go-staticcheck. It looks dumb but we might as well take free performance on something called for nearly every request. + if bearer != "" { + exists, err := services.CheckIfModelExists(re.backendConfigLoader, re.modelLoader, bearer, services.ALWAYS_INCLUDE) + if err == nil && exists { + model = bearer + } + } + } + + ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME, model) +} + +func (re *RequestExtractor) BuildConstantDefaultModelNameMiddleware(defaultModelName string) fiber.Handler { + return func(ctx *fiber.Ctx) error { + re.setModelNameFromRequest(ctx) + localModelName, ok := ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME).(string) + if !ok || localModelName == "" { + ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME, defaultModelName) + log.Debug().Str("defaultModelName", defaultModelName).Msg("context local model name not found, setting to default") + } + return ctx.Next() + } +} + +func (re *RequestExtractor) BuildFilteredFirstAvailableDefaultModel(filterFn config.BackendConfigFilterFn) fiber.Handler { + return func(ctx *fiber.Ctx) error { + re.setModelNameFromRequest(ctx) + localModelName := ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME).(string) + if localModelName != "" { // Don't overwrite existing values + return ctx.Next() + } + + modelNames, err := services.ListModels(re.backendConfigLoader, re.modelLoader, filterFn, services.SKIP_IF_CONFIGURED) + if err != nil { + log.Error().Err(err).Msg("non-fatal error calling ListModels during SetDefaultModelNameToFirstAvailable()") + return ctx.Next() + } + + if len(modelNames) == 0 { + log.Warn().Msg("SetDefaultModelNameToFirstAvailable used with no matching models installed") + // This is non-fatal - making it so was breaking the case of direct installation of raw models + // return errors.New("this endpoint requires at least one model to be installed") + return ctx.Next() + } + + ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME, modelNames[0]) + log.Debug().Str("first model name", modelNames[0]).Msg("context local model name not found, setting to the first model") + return ctx.Next() + } +} + +// TODO: If context and cancel above belong on all methods, move that part of above into here! 
+// Otherwise, it's in its own method below for now +func (re *RequestExtractor) SetModelAndConfig(initializer func() schema.LocalAIRequest) fiber.Handler { + return func(ctx *fiber.Ctx) error { + input := initializer() + if input == nil { + return fmt.Errorf("unable to initialize body") + } + if err := ctx.BodyParser(input); err != nil { + return fmt.Errorf("failed parsing request body: %w", err) + } + + // If this request doesn't have an associated model name, fetch it from earlier in the middleware chain + if input.ModelName(nil) == "" { + localModelName, ok := ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME).(string) + if ok && localModelName != "" { + log.Debug().Str("context localModelName", localModelName).Msg("overriding empty model name in request body with value found earlier in middleware chain") + input.ModelName(&localModelName) + } + } + + cfg, err := re.backendConfigLoader.LoadBackendConfigFileByNameDefaultOptions(input.ModelName(nil), re.applicationConfig) + + if err != nil { + log.Err(err) + log.Warn().Msgf("Model Configuration File not found for %q", input.ModelName(nil)) + } else if cfg.Model != "" { + log.Debug().Str("BEFORE ModelName", input.ModelName(nil)).Str("cfg.Model", cfg.Model).Msg("REPLACING ModelName from CONFIG SOURCE") + input.ModelName(&cfg.Model) + } + + ctx.Locals(CONTEXT_LOCALS_KEY_LOCALAI_REQUEST, input) + ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_CONFIG, cfg) + + return ctx.Next() + } +} + +func (re *RequestExtractor) SetOpenAIRequest(ctx *fiber.Ctx) error { + input, ok := ctx.Locals(CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest) + if !ok || input.Model == "" { + return fiber.ErrBadRequest + } + + cfg, ok := ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig) + if !ok || cfg == nil { + return fiber.ErrBadRequest + } + + // Extract or generate the correlation ID + correlationID := ctx.Get("X-Correlation-ID", uuid.New().String()) + ctx.Set("X-Correlation-ID", correlationID) + + c1, cancel := context.WithCancel(re.applicationConfig.Context) + // Add the correlation ID to the new context + context := context.WithValue(c1, CorrelationIDKey, correlationID) + + input.Context = context + input.Cancel = cancel + + err := mergeOpenAIRequestAndBackendConfig(cfg, input) + if err != nil { + return err + } + + if cfg.Model == "" { + log.Debug().Str("input.Model", input.Model).Msg("replacing empty cfg.Model with input value") + cfg.Model = input.Model + } + + ctx.Locals(CONTEXT_LOCALS_KEY_LOCALAI_REQUEST, input) + ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_CONFIG, cfg) + + return ctx.Next() +} + +func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *schema.OpenAIRequest) error { + if input.Echo { + config.Echo = input.Echo + } + if input.TopK != nil { + config.TopK = input.TopK + } + if input.TopP != nil { + config.TopP = input.TopP + } + + if input.Backend != "" { + config.Backend = input.Backend + } + + if input.ClipSkip != 0 { + config.Diffusers.ClipSkip = input.ClipSkip + } + + if input.ModelBaseName != "" { + config.AutoGPTQ.ModelBaseName = input.ModelBaseName + } + + if input.NegativePromptScale != 0 { + config.NegativePromptScale = input.NegativePromptScale + } + + if input.UseFastTokenizer { + config.UseFastTokenizer = input.UseFastTokenizer + } + + if input.NegativePrompt != "" { + config.NegativePrompt = input.NegativePrompt + } + + if input.RopeFreqBase != 0 { + config.RopeFreqBase = input.RopeFreqBase + } + + if input.RopeFreqScale != 0 { + config.RopeFreqScale = input.RopeFreqScale + } + + if input.Grammar != "" { + 
config.Grammar = input.Grammar + } + + if input.Temperature != nil { + config.Temperature = input.Temperature + } + + if input.Maxtokens != nil { + config.Maxtokens = input.Maxtokens + } + + if input.ResponseFormat != nil { + switch responseFormat := input.ResponseFormat.(type) { + case string: + config.ResponseFormat = responseFormat + case map[string]interface{}: + config.ResponseFormatMap = responseFormat + } + } + + switch stop := input.Stop.(type) { + case string: + if stop != "" { + config.StopWords = append(config.StopWords, stop) + } + case []interface{}: + for _, pp := range stop { + if s, ok := pp.(string); ok { + config.StopWords = append(config.StopWords, s) + } + } + } + + if len(input.Tools) > 0 { + for _, tool := range input.Tools { + input.Functions = append(input.Functions, tool.Function) + } + } + + if input.ToolsChoice != nil { + var toolChoice functions.Tool + + switch content := input.ToolsChoice.(type) { + case string: + _ = json.Unmarshal([]byte(content), &toolChoice) + case map[string]interface{}: + dat, _ := json.Marshal(content) + _ = json.Unmarshal(dat, &toolChoice) + } + input.FunctionCall = map[string]interface{}{ + "name": toolChoice.Function.Name, + } + } + + // Decode each request's message content + imgIndex, vidIndex, audioIndex := 0, 0, 0 + for i, m := range input.Messages { + switch content := m.Content.(type) { + case string: + input.Messages[i].StringContent = content + case []interface{}: + dat, _ := json.Marshal(content) + c := []schema.Content{} + json.Unmarshal(dat, &c) + CONTENT: + for _, pp := range c { + switch pp.Type { + case "text": + input.Messages[i].StringContent = pp.Text + case "video", "video_url": + // Decode content as base64 either if it's an URL or base64 text + base64, err := utils.GetContentURIAsBase64(pp.VideoURL.URL) + if err != nil { + log.Error().Msgf("Failed encoding video: %s", err) + continue CONTENT + } + input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff + + t := "[vid-{{.ID}}]{{.Text}}" + if config.TemplateConfig.Video != "" { + t = config.TemplateConfig.Video + } + // set a placeholder for each image + input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, vidIndex, input.Messages[i].StringContent) + vidIndex++ + case "audio_url", "audio": + // Decode content as base64 either if it's an URL or base64 text + base64, err := utils.GetContentURIAsBase64(pp.AudioURL.URL) + if err != nil { + log.Error().Msgf("Failed encoding image: %s", err) + continue CONTENT + } + input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff + // set a placeholder for each image + t := "[audio-{{.ID}}]{{.Text}}" + if config.TemplateConfig.Audio != "" { + t = config.TemplateConfig.Audio + } + input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, audioIndex, input.Messages[i].StringContent) + audioIndex++ + case "image_url", "image": + // Decode content as base64 either if it's an URL or base64 text + base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL) + if err != nil { + log.Error().Msgf("Failed encoding image: %s", err) + continue CONTENT + } + + t := "[img-{{.ID}}]{{.Text}}" + if config.TemplateConfig.Image != "" { + t = config.TemplateConfig.Image + } + input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff + // set a placeholder for each image + input.Messages[i].StringContent, 
_ = templates.TemplateMultiModal(t, imgIndex, input.Messages[i].StringContent) + imgIndex++ + } + } + } + } + + if input.RepeatPenalty != 0 { + config.RepeatPenalty = input.RepeatPenalty + } + + if input.FrequencyPenalty != 0 { + config.FrequencyPenalty = input.FrequencyPenalty + } + + if input.PresencePenalty != 0 { + config.PresencePenalty = input.PresencePenalty + } + + if input.Keep != 0 { + config.Keep = input.Keep + } + + if input.Batch != 0 { + config.Batch = input.Batch + } + + if input.IgnoreEOS { + config.IgnoreEOS = input.IgnoreEOS + } + + if input.Seed != nil { + config.Seed = input.Seed + } + + if input.TypicalP != nil { + config.TypicalP = input.TypicalP + } + + log.Debug().Str("input.Input", fmt.Sprintf("%+v", input.Input)) + + switch inputs := input.Input.(type) { + case string: + if inputs != "" { + config.InputStrings = append(config.InputStrings, inputs) + } + case []interface{}: + for _, pp := range inputs { + switch i := pp.(type) { + case string: + config.InputStrings = append(config.InputStrings, i) + case []interface{}: + tokens := []int{} + for _, ii := range i { + tokens = append(tokens, int(ii.(float64))) + } + config.InputToken = append(config.InputToken, tokens) + } + } + } + + // Can be either a string or an object + switch fnc := input.FunctionCall.(type) { + case string: + if fnc != "" { + config.SetFunctionCallString(fnc) + } + case map[string]interface{}: + var name string + n, exists := fnc["name"] + if exists { + nn, e := n.(string) + if e { + name = nn + } + } + config.SetFunctionCallNameString(name) + } + + switch p := input.Prompt.(type) { + case string: + config.PromptStrings = append(config.PromptStrings, p) + case []interface{}: + for _, pp := range p { + if s, ok := pp.(string); ok { + config.PromptStrings = append(config.PromptStrings, s) + } + } + } + + if config.Validate() { + return nil + } + return fmt.Errorf("unable to validate configuration after merging") +} diff --git a/core/http/routes/elevenlabs.go b/core/http/routes/elevenlabs.go index 73387c7bb763..9e735bb11b6c 100644 --- a/core/http/routes/elevenlabs.go +++ b/core/http/routes/elevenlabs.go @@ -4,17 +4,26 @@ import ( "github.com/gofiber/fiber/v2" "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/http/endpoints/elevenlabs" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/model" ) func RegisterElevenLabsRoutes(app *fiber.App, + re *middleware.RequestExtractor, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) { // Elevenlabs - app.Post("/v1/text-to-speech/:voice-id", elevenlabs.TTSEndpoint(cl, ml, appConfig)) + app.Post("/v1/text-to-speech/:voice-id", + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.ElevenLabsTTSRequest) }), + elevenlabs.TTSEndpoint(cl, ml, appConfig)) - app.Post("/v1/sound-generation", elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig)) + app.Post("/v1/sound-generation", + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_SOUND_GENERATION)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.ElevenLabsSoundGenerationRequest) }), + elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig)) } diff --git a/core/http/routes/jina.go b/core/http/routes/jina.go index 93125e6cb911..1f7a1a7c3e40 100644 --- a/core/http/routes/jina.go +++ b/core/http/routes/jina.go @@ -3,16 
+3,22 @@ package routes import ( "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/http/endpoints/jina" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" "github.com/gofiber/fiber/v2" "github.com/mudler/LocalAI/pkg/model" ) func RegisterJINARoutes(app *fiber.App, + re *middleware.RequestExtractor, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) { // POST endpoint to mimic the reranking - app.Post("/v1/rerank", jina.JINARerankEndpoint(cl, ml, appConfig)) + app.Post("/v1/rerank", + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_RERANK)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.JINARerankRequest) }), + jina.JINARerankEndpoint(cl, ml, appConfig)) } diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go index f2f0dfa487dc..2434e0649461 100644 --- a/core/http/routes/localai.go +++ b/core/http/routes/localai.go @@ -5,13 +5,16 @@ import ( "github.com/gofiber/swagger" "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/http/endpoints/localai" + "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/p2p" + "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/services" "github.com/mudler/LocalAI/internal" "github.com/mudler/LocalAI/pkg/model" ) func RegisterLocalAIRoutes(app *fiber.App, + requestExtractor *middleware.RequestExtractor, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, @@ -33,7 +36,10 @@ func RegisterLocalAIRoutes(app *fiber.App, app.Get("/models/jobs", modelGalleryEndpointService.GetAllStatusEndpoint()) } - app.Post("/tts", localai.TTSEndpoint(cl, ml, appConfig)) + app.Post("/tts", + requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)), + requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.TTSRequest) }), + localai.TTSEndpoint(cl, ml, appConfig)) // Stores sl := model.NewModelLoader("") @@ -44,7 +50,8 @@ func RegisterLocalAIRoutes(app *fiber.App, app.Get("/metrics", localai.LocalAIMetricsEndpoint()) - // Experimental Backend Statistics Module + // Backend Statistics Module + // TODO: Should these use standard middlewares? Refactor later, they are extremely simple. 
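// One possible shape for that refactor (an illustrative sketch, not part of this patch): since
// BackendMonitorRequest embeds BasicModelRequest elsewhere in this change, it already satisfies
// schema.LocalAIRequest, so the registration could become
//
//	app.Post("/backend/shutdown",
//		requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.BackendMonitorRequest) }),
//		localai.BackendShutdownEndpoint(backendMonitorService))
//
// with the endpoint then reading the parsed request back out of the fiber Locals instead of
// parsing the body itself.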
backendMonitorService := services.NewBackendMonitorService(ml, cl, appConfig) // Split out for now app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService)) app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService)) @@ -64,6 +71,9 @@ func RegisterLocalAIRoutes(app *fiber.App, app.Get("/system", localai.SystemInformations(ml, appConfig)) // misc - app.Post("/v1/tokenize", localai.TokenizeEndpoint(cl, ml, appConfig)) + app.Post("/v1/tokenize", + requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TOKENIZE)), + requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.TokenizeRequest) }), + localai.TokenizeEndpoint(cl, ml, appConfig)) } diff --git a/core/http/routes/openai.go b/core/http/routes/openai.go index 081daf70d80c..91aff3adb10c 100644 --- a/core/http/routes/openai.go +++ b/core/http/routes/openai.go @@ -5,22 +5,50 @@ import ( "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/http/endpoints/localai" "github.com/mudler/LocalAI/core/http/endpoints/openai" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/model" ) func RegisterOpenAIRoutes(app *fiber.App, + re *middleware.RequestExtractor, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) { // openAI compatible API endpoint // chat - app.Post("/v1/chat/completions", openai.ChatEndpoint(cl, ml, appConfig)) - app.Post("/chat/completions", openai.ChatEndpoint(cl, ml, appConfig)) + chatChain := []fiber.Handler{ + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }), + re.SetOpenAIRequest, + openai.ChatEndpoint(cl, ml, appConfig), + } + app.Post("/v1/chat/completions", chatChain...) + app.Post("/chat/completions", chatChain...) // edit - app.Post("/v1/edits", openai.EditEndpoint(cl, ml, appConfig)) - app.Post("/edits", openai.EditEndpoint(cl, ml, appConfig)) + editChain := []fiber.Handler{ + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_EDIT)), + re.BuildConstantDefaultModelNameMiddleware("gpt-4o"), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }), + re.SetOpenAIRequest, + openai.EditEndpoint(cl, ml, appConfig), + } + app.Post("/v1/edits", editChain...) + app.Post("/edits", editChain...) + + // completion + completionChain := []fiber.Handler{ + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_COMPLETION)), + re.BuildConstantDefaultModelNameMiddleware("gpt-4o"), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }), + re.SetOpenAIRequest, + openai.CompletionEndpoint(cl, ml, appConfig), + } + app.Post("/v1/completions", completionChain...) + app.Post("/completions", completionChain...) + app.Post("/v1/engines/:model/completions", completionChain...) 
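With these chains in place, the handlers at the end of each chain no longer parse or merge the request themselves: SetModelAndConfig and SetOpenAIRequest leave the prepared request and backend config in the fiber Locals under the exported keys. The sketch below shows how a handler behind such a chain reads them back; the package and handler names are hypothetical and only illustrate the contract (the real handlers live under core/http/endpoints).

package example

import (
	"github.com/gofiber/fiber/v2"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/core/http/middleware"
	"github.com/mudler/LocalAI/core/schema"
)

// exampleChatHandler is a hypothetical endpoint placed after SetModelAndConfig and SetOpenAIRequest.
func exampleChatHandler(ctx *fiber.Ctx) error {
	// The request body was already parsed and the model name resolved earlier in the middleware chain.
	input, ok := ctx.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
	if !ok || input == nil {
		return fiber.ErrBadRequest
	}
	// The backend config was loaded for the resolved model and merged with the request parameters.
	cfg, ok := ctx.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
	if !ok || cfg == nil {
		return fiber.ErrBadRequest
	}
	// input.Context carries the correlation ID set by SetOpenAIRequest; input.Cancel aborts the request.
	_ = input.Context
	// ... run inference with cfg and input here ...
	return ctx.JSON(fiber.Map{"model": cfg.Model})
}

Because every request type implements schema.LocalAIRequest through the embedded BasicModelRequest, the same pattern applies to the non-OpenAI endpoints (TTS, rerank, tokenize), each with its own request struct in place of schema.OpenAIRequest.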
// assistant app.Get("/v1/assistants", openai.ListAssistantsEndpoint(cl, ml, appConfig)) @@ -54,22 +82,37 @@ func RegisterOpenAIRoutes(app *fiber.App, app.Get("/v1/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig)) app.Get("/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig)) - // completion - app.Post("/v1/completions", openai.CompletionEndpoint(cl, ml, appConfig)) - app.Post("/completions", openai.CompletionEndpoint(cl, ml, appConfig)) - app.Post("/v1/engines/:model/completions", openai.CompletionEndpoint(cl, ml, appConfig)) - // embeddings - app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig)) - app.Post("/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig)) - app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig)) + embeddingChain := []fiber.Handler{ + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_EMBEDDINGS)), + re.BuildConstantDefaultModelNameMiddleware("gpt-4o"), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }), + re.SetOpenAIRequest, + openai.EmbeddingsEndpoint(cl, ml, appConfig), + } + app.Post("/v1/embeddings", embeddingChain...) + app.Post("/embeddings", embeddingChain...) + app.Post("/v1/engines/:model/embeddings", embeddingChain...) // audio - app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(cl, ml, appConfig)) - app.Post("/v1/audio/speech", localai.TTSEndpoint(cl, ml, appConfig)) + app.Post("/v1/audio/transcriptions", + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }), + re.SetOpenAIRequest, + openai.TranscriptEndpoint(cl, ml, appConfig), + ) + + app.Post("/v1/audio/speech", + re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.TTSRequest) }), + localai.TTSEndpoint(cl, ml, appConfig)) // images - app.Post("/v1/images/generations", openai.ImageEndpoint(cl, ml, appConfig)) + app.Post("/v1/images/generations", + re.BuildConstantDefaultModelNameMiddleware(model.StableDiffusionBackend), + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }), + re.SetOpenAIRequest, + openai.ImageEndpoint(cl, ml, appConfig)) if appConfig.ImageDir != "" { app.Static("/generated-images", appConfig.ImageDir) diff --git a/core/schema/elevenlabs.go b/core/schema/elevenlabs.go index 119e0a588860..93bbe70143c5 100644 --- a/core/schema/elevenlabs.go +++ b/core/schema/elevenlabs.go @@ -12,3 +12,17 @@ type ElevenLabsSoundGenerationRequest struct { Temperature *float32 `json:"prompt_influence,omitempty" yaml:"prompt_influence,omitempty"` DoSample *bool `json:"do_sample,omitempty" yaml:"do_sample,omitempty"` } + +func (elttsr *ElevenLabsTTSRequest) ModelName(s *string) string { + if s != nil { + elttsr.ModelID = *s + } + return elttsr.ModelID +} + +func (elsgr *ElevenLabsSoundGenerationRequest) ModelName(s *string) string { + if s != nil { + elsgr.ModelID = *s + } + return elsgr.ModelID +} diff --git a/core/schema/jina.go b/core/schema/jina.go index 7f80689cb2e6..63d24556fe97 100644 --- a/core/schema/jina.go +++ b/core/schema/jina.go @@ -2,10 +2,11 @@ package schema // RerankRequest defines the structure of the request payload type JINARerankRequest struct { - Model string `json:"model"` + BasicModelRequest Query string `json:"query"` Documents 
[]string `json:"documents"` TopN int `json:"top_n"` + Backend string `json:"backend"` } // DocumentResult represents a single document result diff --git a/core/schema/localai.go b/core/schema/localai.go index cdc3e5b0784f..7cb4a0cb971a 100644 --- a/core/schema/localai.go +++ b/core/schema/localai.go @@ -7,11 +7,11 @@ import ( ) type BackendMonitorRequest struct { - Model string `json:"model" yaml:"model"` + BasicModelRequest } type TokenMetricsRequest struct { - Model string `json:"model" yaml:"model"` + BasicModelRequest } type BackendMonitorResponse struct { @@ -27,11 +27,11 @@ type GalleryResponse struct { // @Description TTS request body type TTSRequest struct { - Model string `json:"model" yaml:"model"` // model name or full path - Input string `json:"input" yaml:"input"` // text input - Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id - Backend string `json:"backend" yaml:"backend"` - Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model + BasicModelRequest // model name or full path + Input string `json:"input" yaml:"input"` // text input + Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id + Backend string `json:"backend" yaml:"backend"` + Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model } type StoresSet struct { diff --git a/core/schema/prediction.go b/core/schema/prediction.go index 18d2782bb4bc..15785f1916df 100644 --- a/core/schema/prediction.go +++ b/core/schema/prediction.go @@ -3,7 +3,7 @@ package schema type PredictionOptions struct { // Also part of the OpenAI official spec - Model string `json:"model" yaml:"model"` + BasicModelRequest `yaml:",inline"` // Also part of the OpenAI official spec Language string `json:"language"` diff --git a/core/schema/request.go b/core/schema/request.go new file mode 100644 index 000000000000..f55f39200ca4 --- /dev/null +++ b/core/schema/request.go @@ -0,0 +1,22 @@ +package schema + +// This file and type represent a generic request to LocalAI - as opposed to requests to LocalAI-specific endpoints, which live in localai.go +type LocalAIRequest interface { + ModelName(*string) string +} + +type BasicModelRequest struct { + Model string `json:"model" yaml:"model"` + // TODO: Should this also include the following fields from the OpenAI side of the world? 
+ // If so, changes should be made to core/http/middleware/request.go to match + + // Context context.Context `json:"-"` + // Cancel context.CancelFunc `json:"-"` +} + +func (bmr *BasicModelRequest) ModelName(s *string) string { + if s != nil { + bmr.Model = *s + } + return bmr.Model +} diff --git a/core/schema/tokenize.go b/core/schema/tokenize.go index 3770cc5affd7..e481f186333f 100644 --- a/core/schema/tokenize.go +++ b/core/schema/tokenize.go @@ -1,8 +1,8 @@ package schema type TokenizeRequest struct { + BasicModelRequest Content string `json:"content"` - Model string `json:"model"` } type TokenizeResponse struct { diff --git a/core/services/list_models.go b/core/services/list_models.go index ef555d221962..45c05f5828c4 100644 --- a/core/services/list_models.go +++ b/core/services/list_models.go @@ -49,3 +49,15 @@ func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter c return dataModels, nil } + +func CheckIfModelExists(bcl *config.BackendConfigLoader, ml *model.ModelLoader, modelName string, looseFilePolicy LooseFilePolicy) (bool, error) { + filter, err := config.BuildNameFilterFn(modelName) + if err != nil { + return false, err + } + models, err := ListModels(bcl, ml, filter, looseFilePolicy) + if err != nil { + return false, err + } + return (len(models) > 0), nil +} diff --git a/core/startup/startup.go b/core/startup/startup.go index 17e54bc0603b..60c2e04caed4 100644 --- a/core/startup/startup.go +++ b/core/startup/startup.go @@ -147,13 +147,7 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode if options.LoadToMemory != nil { for _, m := range options.LoadToMemory { - cfg, err := cl.LoadBackendConfigFileByName(m, options.ModelPath, - config.LoadOptionDebug(options.Debug), - config.LoadOptionThreads(options.Threads), - config.LoadOptionContextSize(options.ContextSize), - config.LoadOptionF16(options.F16), - config.ModelPath(options.ModelPath), - ) + cfg, err := cl.LoadBackendConfigFileByNameDefaultOptions(m, options) if err != nil { return nil, nil, nil, err } diff --git a/gallery/index.yaml b/gallery/index.yaml index 99d941a9c600..b1dbf20b3175 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -182,6 +182,34 @@ - filename: Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf sha256: 7f45fa79bc6c9847ef9fbad08c3bb5a0f2dbb56d2e2200a5d37b260a57274e55 uri: huggingface://QuantFactory/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO-GGUF/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf +- !!merge <<: *llama32 + name: "llama-3.2-chibi-3b" + icon: https://huggingface.co/AELLM/Llama-3.2-Chibi-3B/resolve/main/chibi.jpg + urls: + - https://huggingface.co/AELLM/Llama-3.2-Chibi-3B + - https://huggingface.co/mradermacher/Llama-3.2-Chibi-3B-GGUF + description: | + Small parameter LLMs are ideal for navigating the complexities of the Japanese language, which involves multiple character systems like kanji, hiragana, and katakana, along with subtle social cues. Despite their smaller size, these models are capable of delivering highly accurate and context-aware results, making them perfect for use in environments where resources are constrained. Whether deployed on mobile devices with limited processing power or in edge computing scenarios where fast, real-time responses are needed, these models strike the perfect balance between performance and efficiency, without sacrificing quality or speed. 
+ overrides: + parameters: + model: Llama-3.2-Chibi-3B.Q4_K_M.gguf + files: + - filename: Llama-3.2-Chibi-3B.Q4_K_M.gguf + sha256: 4b594cd5f66181202713f1cf97ce2f86d0acfa1b862a64930d5f512c45640a2f + uri: huggingface://mradermacher/Llama-3.2-Chibi-3B-GGUF/Llama-3.2-Chibi-3B.Q4_K_M.gguf +- !!merge <<: *llama32 + name: "llama-3.2-3b-reasoning-time" + urls: + - https://huggingface.co/mradermacher/Llama-3.2-3B-Reasoning-Time-GGUF + description: | + Lyte/Llama-3.2-3B-Reasoning-Time is a large language model with 3.2 billion parameters, designed for reasoning and time-based tasks in English. It is based on the Llama architecture and has been quantized using the GGUF format by mradermacher. + overrides: + parameters: + model: Llama-3.2-3B-Reasoning-Time.Q4_K_M.gguf + files: + - filename: Llama-3.2-3B-Reasoning-Time.Q4_K_M.gguf + sha256: 80b10e1a5c6e27f6d8cf08c3472af2b15a9f63ebf8385eedfe8615f85116c73f + uri: huggingface://mradermacher/Llama-3.2-3B-Reasoning-Time-GGUF/Llama-3.2-3B-Reasoning-Time.Q4_K_M.gguf - &qwen25 ## Qwen2.5 name: "qwen2.5-14b-instruct" @@ -544,6 +572,26 @@ - filename: EdgeRunner-Command-Nested.i1-Q4_K_M.gguf sha256: a1cc4d2b601dc20e58cbb549bd3e9bc460995840c0aaf1cd3c1cb5414c900ac7 uri: huggingface://mradermacher/EdgeRunner-Command-Nested-i1-GGUF/EdgeRunner-Command-Nested.i1-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "tsunami-0.5x-7b-instruct-i1" + icon: https://huggingface.co/Tsunami-th/Tsunami-0.5x-7B-Instruct/resolve/main/Tsunami.webp + urls: + - https://huggingface.co/Tsunami-th/Tsunami-0.5x-7B-Instruct + - https://huggingface.co/mradermacher/Tsunami-0.5x-7B-Instruct-i1-GGUF + description: | + TSUNAMI: Transformative Semantic Understanding and Natural Augmentation Model for Intelligence. + + TSUNAMI full name was created by ChatGPT. + infomation + + Tsunami-0.5x-7B-Instruct is Thai Large Language Model that fine-tuned from Qwen2.5-7B around 100,000 rows in Thai dataset. + overrides: + parameters: + model: Tsunami-0.5x-7B-Instruct.i1-Q4_K_M.gguf + files: + - filename: Tsunami-0.5x-7B-Instruct.i1-Q4_K_M.gguf + sha256: 22e2003ecec7f1e91f2e9aaec334613c0f37fb3000d0e628b5a9980e53322fa7 + uri: huggingface://mradermacher/Tsunami-0.5x-7B-Instruct-i1-GGUF/Tsunami-0.5x-7B-Instruct.i1-Q4_K_M.gguf - &archfunct license: apache-2.0 tags: @@ -1569,6 +1617,61 @@ - filename: hermes-3-llama-3.1-8b-lorablated.Q4_K_M.gguf sha256: 8cff9d399a0583616fe1f290da6daa091ab5c5493d0e173a8fffb45202d79417 uri: huggingface://mlabonne/Hermes-3-Llama-3.1-8B-lorablated-GGUF/hermes-3-llama-3.1-8b-lorablated.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "doctoraifinetune-3.1-8b-i1" + urls: + - https://huggingface.co/huzaifa525/Doctoraifinetune-3.1-8B + - https://huggingface.co/mradermacher/Doctoraifinetune-3.1-8B-i1-GGUF + description: | + This is a fine-tuned version of the Meta-Llama-3.1-8B-bnb-4bit model, specifically adapted for the medical field. It has been trained using a dataset that provides extensive information on diseases, symptoms, and treatments, making it ideal for AI-powered healthcare tools such as medical chatbots, virtual assistants, and diagnostic support systems. + Key Features + + Disease Diagnosis: Accurately identifies diseases based on symptoms provided by the user. + Symptom Analysis: Breaks down and interprets symptoms to provide a comprehensive medical overview. + Treatment Recommendations: Suggests treatments and remedies according to medical conditions. + + Dataset + + The model is fine-tuned on 2000 rows from a dataset consisting of 272k rows. 
This dataset includes rich information about diseases, symptoms, and their corresponding treatments. The model is continuously being updated and will be further trained on the remaining data in future releases to improve accuracy and capabilities. + overrides: + parameters: + model: Doctoraifinetune-3.1-8B.i1-Q4_K_M.gguf + files: + - filename: Doctoraifinetune-3.1-8B.i1-Q4_K_M.gguf + sha256: 282456efcb6c7e54d34ac25ae7fc022a94152ed77281ae4625b9628091e0a3d6 + uri: huggingface://mradermacher/Doctoraifinetune-3.1-8B-i1-GGUF/Doctoraifinetune-3.1-8B.i1-Q4_K_M.gguf +- !!merge <<: *llama31 + name: "astral-fusion-neural-happy-l3.1-8b" + urls: + - https://huggingface.co/ZeroXClem/Astral-Fusion-Neural-Happy-L3.1-8B + - https://huggingface.co/mradermacher/Astral-Fusion-Neural-Happy-L3.1-8B-GGUF + description: | + Astral-Fusion-Neural-Happy-L3.1-8B is a celestial blend of magic, creativity, and dynamic storytelling. Designed to excel in instruction-following, immersive roleplaying, and magical narrative generation, this model is a fusion of the finest qualities from Astral-Fusion, NIHAPPY, and NeuralMahou. ✨🚀 + + This model is perfect for anyone seeking a cosmic narrative experience, with the ability to generate both precise instructional content and fantastical stories in one cohesive framework. Whether you're crafting immersive stories, creating AI roleplaying characters, or working on interactive storytelling, this model brings out the magic. 🌟 + overrides: + parameters: + model: Astral-Fusion-Neural-Happy-L3.1-8B.Q4_K_M.gguf + files: + - filename: Astral-Fusion-Neural-Happy-L3.1-8B.Q4_K_M.gguf + sha256: 14a3b07c1723ef1ca24f99382254b1227d95974541e23792a4e7ff621896055d + uri: huggingface://mradermacher/Astral-Fusion-Neural-Happy-L3.1-8B-GGUF/Astral-Fusion-Neural-Happy-L3.1-8B.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "mahou-1.5-llama3.1-70b-i1" + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + icon: https://huggingface.co/flammenai/Mahou-1.0-mistral-7B/resolve/main/mahou1.png + urls: + - https://huggingface.co/flammenai/Mahou-1.5-llama3.1-70B + - https://huggingface.co/mradermacher/Mahou-1.5-llama3.1-70B-i1-GGUF + description: | + Mahou is designed to provide short messages in a conversational context. It is capable of casual conversation and character roleplay. + overrides: + parameters: + model: Mahou-1.5-llama3.1-70B.i1-Q4_K_M.gguf + files: + - filename: Mahou-1.5-llama3.1-70B.i1-Q4_K_M.gguf + sha256: c2711c4c9c8d011edbeaa391b4418d433e273a318d1de3dbdda9b85baf4996f2 + uri: huggingface://mradermacher/Mahou-1.5-llama3.1-70B-i1-GGUF/Mahou-1.5-llama3.1-70B.i1-Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" @@ -2193,6 +2296,27 @@ - filename: MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf sha256: cea68768dff58b553974b755bb40ef790ab8b86866d9b5c46bc2e6c3311b876a uri: huggingface://Lewdiculous/MN-BackyardAI-Party-12B-v1-GGUF-IQ-ARM-Imatrix/MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf +- !!merge <<: *mistral03 + name: "ml-ms-etheris-123b" + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/ieEjL3TxpDM3WAZQcya6E.png + urls: + - https://huggingface.co/Steelskull/ML-MS-Etheris-123B + - https://huggingface.co/mradermacher/ML-MS-Etheris-123B-GGUF + description: | + This model merges the robust storytelling of mutiple models while attempting to maintain intelligence. The final model was merged after Model Soup with DELLA to add some specal sause. 
+ - model: NeverSleep/Lumimaid-v0.2-123B + - model: TheDrummer/Behemoth-123B-v1 + - model: migtissera/Tess-3-Mistral-Large-2-123B + - model: anthracite-org/magnum-v2-123b + Use Mistral, ChatML, or Meth Format + overrides: + parameters: + model: ML-MS-Etheris-123B.Q2_K.gguf + files: + - filename: ML-MS-Etheris-123B.Q2_K.gguf + sha256: a17c5615413b5c9c8d01cf55386573d0acd00e01f6e2bcdf492624c73c593fc3 + uri: huggingface://mradermacher/ML-MS-Etheris-123B-GGUF/ML-MS-Etheris-123B.Q2_K.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index bd668ec25a2a..4988de257228 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -457,7 +457,7 @@ func (ml *ModelLoader) ListAvailableBackends(assetdir string) ([]string, error) func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err error) { o := NewOptions(opts...) - log.Info().Msgf("Loading model '%s' with backend %s", o.modelID, o.backendString) + log.Info().Str("modelID", o.modelID).Str("backend", o.backendString).Str("o.model", o.model).Msg("BackendLoader starting") backend := strings.ToLower(o.backendString) if realBackend, exists := Aliases[backend]; exists { diff --git a/tests/e2e-aio/e2e_test.go b/tests/e2e-aio/e2e_test.go index a9c5549742a5..c498ca2b9e52 100644 --- a/tests/e2e-aio/e2e_test.go +++ b/tests/e2e-aio/e2e_test.go @@ -235,7 +235,9 @@ var _ = Describe("E2E test", func() { modelName := "jina-reranker-v1-base-en" req := schema.JINARerankRequest{ - Model: modelName, + BasicModelRequest: schema.BasicModelRequest{ + Model: modelName, + }, Query: "Organic skincare products for sensitive skin", Documents: []string{ "Eco-friendly kitchenware for modern homes",