Merge branch 'master' into feat-request-middleware

mudler · Nov 15, 2024 · 51f861e · 51f861e
2 parents 990e752 + 65bcc01
commit 51f861e
Show file tree

Hide file tree

Showing 6 changed files with 297 additions and 42 deletions.
diff --git a/Makefile b/Makefile
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=fb4a0ec0833c71cff5a1a367ba375447ce6106eb
+CPPLLAMA_VERSION?=ae8de6d50a09d49545e0afab2e50cc4acfb280e2
 
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -138,10 +138,10 @@ ifeq ($(BUILD_TYPE),hipblas)
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
-	export GGML_HIPBLAS=1
+	export GGML_HIP=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif
 
@@ -761,7 +761,7 @@ backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/ll
 	cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
 # TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
+	cp backend/cpp/llama-fallback/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
 endif
 
 backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp

diff --git a/backend/cpp/llama/Makefile b/backend/cpp/llama/Makefile
@@ -22,7 +22,7 @@ else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+	CMAKE_ARGS+=-DGGML_HIP=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)

diff --git a/gallery/index.yaml b/gallery/index.yaml
@@ -1357,6 +1357,60 @@
     - filename: Qwen2.5-32B-ArliAI-RPMax-v1.3-Q4_K_M.gguf
       sha256: 51b369068b124165b1b8c253371b88b573af9dd350e331ce93d7e47b6b710003
       uri: huggingface://bartowski/Qwen2.5-32B-ArliAI-RPMax-v1.3-GGUF/Qwen2.5-32B-ArliAI-RPMax-v1.3-Q4_K_M.gguf
+- !!merge <<: *qwen25
+  name: "q2.5-ms-mistoria-72b-i1"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/5LOvUFYiMMw6pcEsOhmo2.webp
+  urls:
+    - https://huggingface.co/Steelskull/Q2.5-MS-Mistoria-72b
+    - https://huggingface.co/mradermacher/Q2.5-MS-Mistoria-72b-i1-GGUF
+  description: |
+    This model is my first attempt at a 72b model; as usual, my goal is to merge the robust storytelling of multiple models while attempting to maintain intelligence.
+    Merge of:
+      - model: EVA-UNIT-01/EVA-Qwen2.5-72B-v0.1
+      - model: ZeusLabs/Chronos-Platinum-72B
+      - model: shuttleai/shuttle-3
+  overrides:
+    parameters:
+      model: Q2.5-MS-Mistoria-72b.i1-Q4_K_M.gguf
+  files:
+    - filename: Q2.5-MS-Mistoria-72b.i1-Q4_K_M.gguf
+      sha256: f51ac3db855259c0132070e7bb9f58b67538103ffb3c716880ceef3bb09d43d9
+      uri: huggingface://mradermacher/Q2.5-MS-Mistoria-72b-i1-GGUF/Q2.5-MS-Mistoria-72b.i1-Q4_K_M.gguf
+- !!merge <<: *qwen25
+  name: "athene-v2-agent"
+  icon: https://huggingface.co/Nexusflow/Athene-V2-Agent/resolve/main/agent.png
+  urls:
+    - https://huggingface.co/Nexusflow/Athene-V2-Agent
+    - https://huggingface.co/bartowski/Athene-V2-Agent-GGUF
+  description: |
+    Athene-V2-Agent is an open-source Agent LLM that surpasses the state-of-the-art in function calling and agentic capabilities.
+
+    💪 Versatile Agent Capability: Athene-V2-Agent is an agent model, capable of operating in environments with deeply nested dependencies with the environment. It is capable of reasoning and doing planning for trajectories with many tool calls necessary to answer a single query.
+
+    📊 Performance Highlights: Athene-V2-Agent surpasses GPT-4o in single FC tasks by 18% in function calling success rates, and by 17% in Agentic success rates.
+
+    🔧 Generalization to the Unseen: Athene-V2-Agent has never been trained on the functions or agentic settings used in evaluation.
+  overrides:
+    parameters:
+      model: Athene-V2-Agent-Q4_K_M.gguf
+  files:
+    - filename: Athene-V2-Agent-Q4_K_M.gguf
+      sha256: 2829d205519da34852c374286d42a4403f3be012ea56424e88ebcb8dc89676ad
+      uri: huggingface://bartowski/Athene-V2-Agent-GGUF/Athene-V2-Agent-Q4_K_M.gguf
+- !!merge <<: *qwen25
+  name: "athene-v2-chat"
+  urls:
+    - https://huggingface.co/Nexusflow/Athene-V2-Chat
+    - https://huggingface.co/bartowski/Athene-V2-Chat-GGUF
+  description: |
+    We introduce Athene-V2-Chat-72B, an open-weights LLM on-par with GPT-4o across benchmarks. It is trained through RLHF with Qwen-2.5-72B-Instruct as base model. Athene-V2-Chat-72B excels in chat, math, and coding. Its sister model, Athene-V2-Agent-72B, surpasses GPT-4o in complex function calling and agentic applications.
+  overrides:
+    parameters:
+      model: Athene-V2-Chat-Q4_K_M.gguf
+  files:
+    - filename: Athene-V2-Chat-Q4_K_M.gguf
+      sha256: bda8b784ad55982891e5aa69b08ce4030c91a2e28ad9c4c35284d45d3c7aeb16
+      uri: huggingface://bartowski/Athene-V2-Chat-GGUF/Athene-V2-Chat-Q4_K_M.gguf
 - &archfunct
   license: apache-2.0
   tags:
@@ -2753,6 +2807,44 @@
     - filename: L3.1-8B-Slush.i1-Q4_K_M.gguf
       sha256: 98c53cd1ec0e2b00400c5968cd076a589d0c889bca13ec52abfe4456cfa039be
       uri: huggingface://mradermacher/L3.1-8B-Slush-i1-GGUF/L3.1-8B-Slush.i1-Q4_K_M.gguf
+- !!merge <<: *llama31
+  icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/C-ndfxAGdf21DjchZcf2p.png
+  name: "l3.1-ms-astoria-70b-v2"
+  urls:
+    - https://huggingface.co/Steelskull/L3.1-MS-Astoria-70b-v2
+    - https://huggingface.co/bartowski/L3.1-MS-Astoria-70b-v2-GGUF
+  description: |
+    This model is a remake of the original astoria with modern models and context sizes; its goal is to merge the robust storytelling of multiple models while attempting to maintain intelligence.
+
+    Use Llama 3 Format or meth format (llama 3 refuses to work with stepped thinking but meth works)
+      - model: migtissera/Tess-3-Llama-3.1-70B
+      - model: NeverSleep/Lumimaid-v0.2-70B
+      - model: Sao10K/L3.1-70B-Euryale-v2.2
+      - model: ArliAI/Llama-3.1-70B-ArliAI-RPMax-v1.2
+      - model: nbeerbower/Llama3.1-Gutenberg-Doppel-70B
+  overrides:
+    parameters:
+      model: L3.1-MS-Astoria-70b-v2-Q4_K_M.gguf
+  files:
+    - filename: L3.1-MS-Astoria-70b-v2-Q4_K_M.gguf
+      sha256: c02658ead1ecdc25c7218b8d9d11786f19c16d64f0d453082998e313edb0d4a6
+      uri: huggingface://bartowski/L3.1-MS-Astoria-70b-v2-GGUF/L3.1-MS-Astoria-70b-v2-Q4_K_M.gguf
+- !!merge <<: *llama31
+  name: "magnum-v2-4b-i1"
+  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/658a46cbfb9c2bdfae75b3a6/9JwXZze4tHRGpc_RzE2AU.png
+  urls:
+    - https://huggingface.co/anthracite-org/magnum-v2-4b
+    - https://huggingface.co/mradermacher/magnum-v2-4b-i1-GGUF
+  description: |
+    This is the eighth in a series of models designed to replicate the prose quality of the Claude 3 models, specifically Sonnet and Opus. This model is fine-tuned on top of IntervitensInc/Llama-3.1-Minitron-4B-Width-Base-chatml.
+  overrides:
+    parameters:
+      model: magnum-v2-4b.i1-Q4_K_M.gguf
+  files:
+    - filename: magnum-v2-4b.i1-Q4_K_M.gguf
+      sha256: 692618059fee8870759d67d275ebc59bc0474b18ae3571b3ebdec8f9da786a64
+      uri: huggingface://mradermacher/magnum-v2-4b-i1-GGUF/magnum-v2-4b.i1-Q4_K_M.gguf
 - &deepseek
   ## Deepseek
   url: "github:mudler/LocalAI/gallery/deepseek.yaml@master"

diff --git a/swagger/docs.go b/swagger/docs.go
@@ -279,6 +279,25 @@ const docTemplate = `{
                 }
             }
         },
+        "/tokenMetrics": {
+            "get": {
+                "consumes": [
+                    "application/json"
+                ],
+                "produces": [
+                    "audio/x-wav"
+                ],
+                "summary": "Get TokenMetrics for Active Slot.",
+                "responses": {
+                    "200": {
+                        "description": "generated audio/wav file",
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                }
+            }
+        },
         "/tts": {
             "post": {
                 "consumes": [
@@ -723,6 +742,38 @@ const docTemplate = `{
                     }
                 }
             }
+        },
+        "/v1/tokenMetrics": {
+            "get": {
+                "consumes": [
+                    "application/json"
+                ],
+                "produces": [
+                    "audio/x-wav"
+                ],
+                "summary": "Get TokenMetrics for Active Slot.",
+                "responses": {
+                    "200": {
+                        "description": "generated audio/wav file",
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                }
+            }
+        },
+        "/v1/tokenize": {
+            "post": {
+                "summary": "Tokenize the input.",
+                "responses": {
+                    "200": {
+                        "description": "Response",
+                        "schema": {
+                            "$ref": "#/definitions/schema.TokenizeResponse"
+                        }
+                    }
+                }
+            }
         }
     },
     "definitions": {
@@ -972,14 +1023,6 @@ const docTemplate = `{
                 }
             }
         },
-        "model.Model": {
-            "type": "object",
-            "properties": {
-                "id": {
-                    "type": "string"
-                }
-            }
-        },
         "openai.Assistant": {
             "type": "object",
             "properties": {
@@ -1682,6 +1725,14 @@ const docTemplate = `{
                 }
             }
         },
+        "schema.SysInfoModel": {
+            "type": "object",
+            "properties": {
+                "id": {
+                    "type": "string"
+                }
+            }
+        },
         "schema.SystemInformationResponse": {
             "type": "object",
             "properties": {
@@ -1694,7 +1745,7 @@ const docTemplate = `{
                 "loaded_models": {
                     "type": "array",
                     "items": {
-                        "$ref": "#/definitions/model.Model"
+                        "$ref": "#/definitions/schema.SysInfoModel"
                     }
                 }
             }
@@ -1718,14 +1769,25 @@ const docTemplate = `{
                     "description": "model name or full path",
                     "type": "string"
                 },
-                "voice": {
-                    "description": "voice audio file or speaker id",
+                "response_format": {
+                    "description": "(optional) output format",
                     "type": "string"
                 },
-		"response_format": {
-                    "description": "(optional) output format of generated audio file, defaults to wav, accept wav, mp3, flac, aac, opus",
+                "voice": {
+                    "description": "voice audio file or speaker id",
                     "type": "string"
-                },
+                }
+            }
+        },
+        "schema.TokenizeResponse": {
+            "type": "object",
+            "properties": {
+                "tokens": {
+                    "type": "array",
+                    "items": {
+                        "type": "integer"
+                    }
+                }
             }
         },
         "schema.ToolCall": {