From 46cceda440783f61fc8a4a6791936015cb819daf Mon Sep 17 00:00:00 2001
From: francis2tm
Date: Fri, 20 Dec 2024 19:49:32 +0000
Subject: [PATCH] feat: increase vllm customization

---
 .env.example        | 3 ++-
 docker-compose.yaml | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.env.example b/.env.example
index c76727a4..7c5ea468 100644
--- a/.env.example
+++ b/.env.example
@@ -52,7 +52,8 @@ CHAT_COMPLETIONS_MODEL=meta-llama/Llama-3.1-70B-Instruct
 CHAT_COMPLETIONS_MAX_MODEL_LEN=4096 # context length
 
 # vllm backend
-VLLM_TENSOR_PARALLEL_SIZE=1 # should be equal to GPU_COUNT
+# Learn more about vllm engine arguments here: https://docs.vllm.ai/en/latest/usage/engine_args.html
+VLLM_ENGINE_ARGS=--model ${CHAT_COMPLETIONS_MODEL} --max-model-len ${CHAT_COMPLETIONS_MAX_MODEL_LEN}
 
 # ----------------------------------------------------------------------------------
 # embeddings server
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 197a06aa..481b7918 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -131,7 +131,7 @@ services:
     ports:
       - "${CHAT_COMPLETIONS_SERVER_PORT}:8000"
     ipc: host
-    command: --model ${CHAT_COMPLETIONS_MODEL} --max-model-len ${CHAT_COMPLETIONS_MAX_MODEL_LEN} --tensor-parallel-size ${VLLM_TENSOR_PARALLEL_SIZE}
+    command: ${VLLM_ENGINE_ARGS}
 
   vllm-cpu:
     <<: *inference-service-cpu
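
Reviewer note: this patch removes the dedicated VLLM_TENSOR_PARALLEL_SIZE variable, so tensor parallelism (and any other engine flag) is now passed through VLLM_ENGINE_ARGS, which docker-compose injects verbatim as the container command. A minimal sketch of an .env entry for a multi-GPU host, assuming 4 GPUs (the value 4 is illustrative; per the removed comment, --tensor-parallel-size should equal GPU_COUNT):

    VLLM_ENGINE_ARGS=--model ${CHAT_COMPLETIONS_MODEL} --max-model-len ${CHAT_COMPLETIONS_MAX_MODEL_LEN} --tensor-parallel-size 4

Any other engine argument from the linked vLLM docs can be appended to the same variable in the same way.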