centralized request middleware
Signed-off-by: Dave Lee <[email protected]>
dave-gray101 committed Oct 16, 2024
1 parent 5f130fe commit 7dfddc7
Showing 54 changed files with 904 additions and 679 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=a89f75e1b7b90cb2d4d4c52ca53ef9e9b466aa45
+CPPLLAMA_VERSION?=9e041024481f6b249ab8918e18b9477f873b5a5e
 
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
2 changes: 1 addition & 1 deletion backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
 transformers
2 changes: 1 addition & 1 deletion backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
2 changes: 1 addition & 1 deletion backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
2 changes: 1 addition & 1 deletion backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
 coqui-tts
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
2 changes: 1 addition & 1 deletion backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.66.2
+grpcio==1.67.0
 pillow
 protobuf
 certifi
2 changes: 1 addition & 1 deletion backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
 wheel
2 changes: 1 addition & 1 deletion backend/python/mamba/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
2 changes: 1 addition & 1 deletion backend/python/openvoice/requirements-intel.txt
@@ -2,7 +2,7 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 librosa==0.9.1
 faster-whisper==1.0.3
2 changes: 1 addition & 1 deletion backend/python/openvoice/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 librosa
 faster-whisper
2 changes: 1 addition & 1 deletion backend/python/parler-tts/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
 llvmlite==0.43.0
2 changes: 1 addition & 1 deletion backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
2 changes: 1 addition & 1 deletion backend/python/sentencetransformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
 datasets
2 changes: 1 addition & 1 deletion backend/python/transformers-musicgen/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 scipy==1.14.0
 certifi
2 changes: 1 addition & 1 deletion backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
2 changes: 1 addition & 1 deletion backend/python/vall-e-x/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
2 changes: 1 addition & 1 deletion backend/python/vllm/install.sh
@@ -22,7 +22,7 @@ if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then
 git clone https://github.com/vllm-project/vllm
 fi
 pushd vllm
-uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
+uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.67.0 protobuf bitsandbytes
 uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 VLLM_TARGET_DEVICE=cpu python setup.py install
 popd
2 changes: 1 addition & 1 deletion backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.2
+grpcio==1.67.0
 protobuf
 certifi
 setuptools
6 changes: 3 additions & 3 deletions core/backend/llm.go
@@ -31,13 +31,13 @@ type TokenUsage struct {
     Completion int
 }
 
-func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c *config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
     modelFile := c.Model
 
     var inferenceModel grpc.Backend
     var err error
 
-    opts := ModelOptions(c, o, []model.Option{})
+    opts := ModelOptions(*c, o, []model.Option{})
 
     if c.Backend != "" {
         opts = append(opts, model.WithBackendString(c.Backend))
@@ -85,7 +85,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 
     // in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
     fn := func() (LLMResponse, error) {
-        opts := gRPCPredictOpts(c, loader.ModelPath)
+        opts := gRPCPredictOpts(*c, loader.ModelPath)
         opts.Prompt = s
         opts.Messages = protoMessages
         opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
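Note: ModelInference now takes c *config.BackendConfig rather than a struct copy, so a config resolved once per request (by the new request-extractor middleware) is shared down the call chain; helpers that still accept a value, such as ModelOptions and gRPCPredictOpts, are called with *c. A minimal sketch of why the pointer matters, using illustrative stand-in types rather than the real LocalAI structs:

package main

import "fmt"

// Illustrative stand-in for config.BackendConfig; the real struct is far
// larger, which also makes copying it on every call needlessly expensive.
type BackendConfig struct {
    Model   string
    Backend string
}

func tweakByValue(c BackendConfig)    { c.Backend = "llama.cpp" } // mutates a private copy
func tweakByPointer(c *BackendConfig) { c.Backend = "llama.cpp" } // mutates the shared instance

func main() {
    cfg := BackendConfig{Model: "my-model"}
    tweakByValue(cfg)
    fmt.Println(cfg.Backend == "") // true: the copy's change was discarded
    tweakByPointer(&cfg)
    fmt.Println(cfg.Backend) // llama.cpp
}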
4 changes: 1 addition & 3 deletions core/backend/tokenize.go
@@ -7,9 +7,7 @@ import (
     model "github.com/mudler/LocalAI/pkg/model"
 )
 
-func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {
-
-    modelFile := backendConfig.Model
+func ModelTokenize(s string, modelFile string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {
 
     var inferenceModel grpc.Backend
     var err error
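Note: ModelTokenize now receives the model file explicitly instead of re-deriving it from the config, which lets the caller (for example, the new middleware) decide what to pass. A sketch of a call site under the new signature; the wrapper function and import paths are assumptions based on the repository layout, and only the ModelTokenize call itself is grounded in this hunk:

package example

import (
    "github.com/mudler/LocalAI/core/backend"
    "github.com/mudler/LocalAI/core/config"
    "github.com/mudler/LocalAI/core/schema"
    "github.com/mudler/LocalAI/pkg/model"
)

// tokenize passes cfg.Model explicitly; under the old signature the same
// value was read from backendConfig inside ModelTokenize itself.
func tokenize(input string, cfg *config.BackendConfig, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {
    return backend.ModelTokenize(input, cfg.Model, ml, *cfg, appConfig)
}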
31 changes: 20 additions & 11 deletions core/config/backend_config.go
@@ -433,19 +433,20 @@ func (c *BackendConfig) HasTemplate() bool {
 type BackendConfigUsecases int
 
 const (
-    FLAG_ANY BackendConfigUsecases = 0b000000000
-    FLAG_CHAT BackendConfigUsecases = 0b000000001
-    FLAG_COMPLETION BackendConfigUsecases = 0b000000010
-    FLAG_EDIT BackendConfigUsecases = 0b000000100
-    FLAG_EMBEDDINGS BackendConfigUsecases = 0b000001000
-    FLAG_RERANK BackendConfigUsecases = 0b000010000
-    FLAG_IMAGE BackendConfigUsecases = 0b000100000
-    FLAG_TRANSCRIPT BackendConfigUsecases = 0b001000000
-    FLAG_TTS BackendConfigUsecases = 0b010000000
-    FLAG_SOUND_GENERATION BackendConfigUsecases = 0b100000000
+    FLAG_ANY BackendConfigUsecases = 0b0000000000
+    FLAG_CHAT BackendConfigUsecases = 0b0000000001
+    FLAG_COMPLETION BackendConfigUsecases = 0b0000000010
+    FLAG_EDIT BackendConfigUsecases = 0b0000000100
+    FLAG_EMBEDDINGS BackendConfigUsecases = 0b0000001000
+    FLAG_RERANK BackendConfigUsecases = 0b0000010000
+    FLAG_IMAGE BackendConfigUsecases = 0b0000100000
+    FLAG_TRANSCRIPT BackendConfigUsecases = 0b0001000000
+    FLAG_TTS BackendConfigUsecases = 0b0010000000
+    FLAG_SOUND_GENERATION BackendConfigUsecases = 0b0100000000
+    FLAG_TOKENIZE BackendConfigUsecases = 0b1000000000
 
     // Common Subsets
-    FLAG_LLM BackendConfigUsecases = FLAG_CHAT & FLAG_COMPLETION & FLAG_EDIT
+    FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
 )
 
 func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
@@ -460,6 +461,7 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
     "FLAG_TRANSCRIPT": FLAG_TRANSCRIPT,
     "FLAG_TTS": FLAG_TTS,
     "FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
+    "FLAG_TOKENIZE": FLAG_TOKENIZE,
     "FLAG_LLM": FLAG_LLM,
     }
 }
@@ -545,5 +547,12 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
         }
     }
 
+    if (u & FLAG_TOKENIZE) == FLAG_TOKENIZE {
+        tokenizeCapableBackends := []string{"llama.cpp", "rwkv"}
+        if !slices.Contains(tokenizeCapableBackends, c.Backend) {
+            return false
+        }
+    }
+
     return true
 }
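Note: the FLAG_LLM change is a behavioral fix, not cosmetics. The usecase flags are disjoint bits, so AND-ing them always yields zero, which made the old FLAG_LLM equal to FLAG_ANY and match every config; OR builds the intended subset mask, and (u & FLAG_X) == FLAG_X checks such as the new FLAG_TOKENIZE gate then behave as expected. A self-contained reproduction with illustrative names, not the LocalAI source:

package main

import "fmt"

type Usecases int

const (
    FlagChat       Usecases = 0b001
    FlagCompletion Usecases = 0b010
    FlagEdit       Usecases = 0b100

    // Before: AND of disjoint bits collapses to 0, i.e. "any usecase".
    FlagLLMBroken Usecases = FlagChat & FlagCompletion & FlagEdit // 0b000
    // After: OR builds the union of the three usecases.
    FlagLLM Usecases = FlagChat | FlagCompletion | FlagEdit // 0b111
)

// hasAll mirrors the (u & FLAG_X) == FLAG_X checks in GuessUsecases.
func hasAll(u, flag Usecases) bool { return u&flag == flag }

func main() {
    fmt.Println(FlagLLMBroken)                   // 0
    fmt.Println(hasAll(FlagChat, FlagLLMBroken)) // true: the broken mask matched everything
    fmt.Println(hasAll(FlagChat, FlagLLM))       // false: chat alone is not the full LLM subset
    fmt.Println(hasAll(FlagLLM, FlagChat))       // true
}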
12 changes: 11 additions & 1 deletion core/config/backend_config_loader.go
@@ -117,7 +117,9 @@ func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath
     // Load a config file if present after the model name
     cfg := &BackendConfig{
         PredictionOptions: schema.PredictionOptions{
-            Model: modelName,
+            BasicModelRequest: schema.BasicModelRequest{
+                Model: modelName,
+            },
         },
     }
 
@@ -145,6 +147,14 @@
     return cfg, nil
 }
 
+func (bcl *BackendConfigLoader) LoadBackendConfigFileByNameDefaultOptions(modelName string, appConfig *ApplicationConfig) (*BackendConfig, error) {
+    return bcl.LoadBackendConfigFileByName(modelName, appConfig.ModelPath,
+        LoadOptionDebug(appConfig.Debug),
+        LoadOptionThreads(appConfig.Threads),
+        LoadOptionContextSize(appConfig.ContextSize),
+        LoadOptionF16(appConfig.F16))
+}
+
 // This format is currently only used when reading a single file at startup, passed in via ApplicationConfig.ConfigFile
 func (bcl *BackendConfigLoader) LoadMultipleBackendConfigsSingleFile(file string, opts ...ConfigLoaderOption) error {
     bcl.Lock()
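Note: the new LoadBackendConfigFileByNameDefaultOptions wraps the four ApplicationConfig-derived load options that call sites previously had to pass by hand. A sketch of what a typical call site reduces to; the surrounding function is illustrative:

package example

import "github.com/mudler/LocalAI/core/config"

func resolve(bcl *config.BackendConfigLoader, modelName string, appConfig *config.ApplicationConfig) (*config.BackendConfig, error) {
    // Before, each caller repeated the option plumbing:
    //   bcl.LoadBackendConfigFileByName(modelName, appConfig.ModelPath,
    //       config.LoadOptionDebug(appConfig.Debug),
    //       config.LoadOptionThreads(appConfig.Threads),
    //       config.LoadOptionContextSize(appConfig.ContextSize),
    //       config.LoadOptionF16(appConfig.F16))
    // After, the helper applies those defaults itself:
    return bcl.LoadBackendConfigFileByNameDefaultOptions(modelName, appConfig)
}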
9 changes: 5 additions & 4 deletions core/config/guesser.go
@@ -26,14 +26,14 @@
 type settingsConfig struct {
     StopWords []string
     TemplateConfig TemplateConfig
-    RepeatPenalty float64
+    RepeatPenalty float64
 }
 
 // default settings to adopt with a given model family
 var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
     Gemma: {
         RepeatPenalty: 1.0,
-        StopWords: []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
+        StopWords: []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
         TemplateConfig: TemplateConfig{
             Chat: "{{.Input }}\n<start_of_turn>model\n",
             ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
@@ -161,10 +161,11 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
     }
 
     // We try to guess only if we don't have a template defined already
-    f, err := gguf.ParseGGUFFile(filepath.Join(modelPath, cfg.ModelFileName()))
+    guessPath := filepath.Join(modelPath, cfg.ModelFileName())
+    f, err := gguf.ParseGGUFFile(guessPath)
     if err != nil {
         // Only valid for gguf files
-        log.Debug().Msgf("guessDefaultsFromFile: %s", "not a GGUF file")
+        log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
         return
     }
 
12 changes: 7 additions & 5 deletions core/http/app.go
@@ -121,7 +121,7 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
         })
     }
 
-    // Health Checks should always be exempt from auth, so register these first
+    // Health Checks should always be exempt from auth, so register these first
     routes.HealthRoutes(app)
 
     kaConfig, err := middleware.GetKeyAuthConfig(appConfig)
@@ -156,13 +156,15 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
     galleryService := services.NewGalleryService(appConfig)
     galleryService.Start(appConfig.Context, cl)
 
-    routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig)
-    routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService)
-    routes.RegisterOpenAIRoutes(app, cl, ml, appConfig)
+    requestExtractor := middleware.NewRequestExtractor(cl, ml, appConfig)
+
+    routes.RegisterElevenLabsRoutes(app, requestExtractor, cl, ml, appConfig)
+    routes.RegisterLocalAIRoutes(app, requestExtractor, cl, ml, appConfig, galleryService)
+    routes.RegisterOpenAIRoutes(app, requestExtractor, cl, ml, appConfig)
     if !appConfig.DisableWebUI {
         routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService)
     }
-    routes.RegisterJINARoutes(app, cl, ml, appConfig)
+    routes.RegisterJINARoutes(app, requestExtractor, cl, ml, appConfig)
 
     httpFS := http.FS(embedDirStatic)
 
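Note: requestExtractor is the centralized middleware this commit is named for; it supersedes the per-handler model resolution that lived in the now-deleted core/http/ctx/fiber.go. Only the NewRequestExtractor(cl, ml, appConfig) constructor and the extra route-registration parameter are visible in this diff, so the sketch below shows the general shape such a Fiber middleware takes; the SetModelAndConfig method name, route parameter, and Locals key are hypothetical:

package middleware

import (
    "github.com/gofiber/fiber/v2"

    "github.com/mudler/LocalAI/core/config"
    "github.com/mudler/LocalAI/pkg/model"
)

// RequestExtractor resolves the requested model and its BackendConfig once,
// up front, instead of in every endpoint handler.
type RequestExtractor struct {
    cl        *config.BackendConfigLoader
    ml        *model.ModelLoader
    appConfig *config.ApplicationConfig
}

func NewRequestExtractor(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) *RequestExtractor {
    return &RequestExtractor{cl: cl, ml: ml, appConfig: appConfig}
}

// SetModelAndConfig loads the BackendConfig for the requested model and
// stashes it in the request context for downstream handlers.
func (re *RequestExtractor) SetModelAndConfig() fiber.Handler {
    return func(c *fiber.Ctx) error {
        modelName := c.Params("model") // however the route exposes the model name
        cfg, err := re.cl.LoadBackendConfigFileByNameDefaultOptions(modelName, re.appConfig)
        if err != nil {
            return fiber.ErrBadRequest
        }
        c.Locals("backend_config", cfg) // a shared *config.BackendConfig
        return c.Next()
    }
}

A downstream handler would recover the shared config with c.Locals("backend_config").(*config.BackendConfig), which lines up with ModelInference now accepting a pointer instead of a copy.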
47 changes: 0 additions & 47 deletions core/http/ctx/fiber.go

This file was deleted.

(remaining changed files not shown)

