From 46cceda440783f61fc8a4a6791936015cb819daf Mon Sep 17 00:00:00 2001
From: francis2tm
Date: Fri, 20 Dec 2024 19:49:32 +0000
Subject: [PATCH] feat: increase vllm customization

---
 .env.example        | 3 ++-
 docker-compose.yaml | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.env.example b/.env.example
index c76727a4..7c5ea468 100644
--- a/.env.example
+++ b/.env.example
@@ -52,7 +52,8 @@ CHAT_COMPLETIONS_MODEL=meta-llama/Llama-3.1-70B-Instruct
 CHAT_COMPLETIONS_MAX_MODEL_LEN=4096 # context length
 
 # vllm backend
-VLLM_TENSOR_PARALLEL_SIZE=1 # should be equal to GPU_COUNT
+# Learn more about vllm engine arguments here: https://docs.vllm.ai/en/latest/usage/engine_args.html
+VLLM_ENGINE_ARGS=--model ${CHAT_COMPLETIONS_MODEL} --max-model-len ${CHAT_COMPLETIONS_MAX_MODEL_LEN}
 
 # ----------------------------------------------------------------------------------
 # embeddings server
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 197a06aa..481b7918 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -131,7 +131,7 @@ services:
     ports:
       - "${CHAT_COMPLETIONS_SERVER_PORT}:8000"
     ipc: host
-    command: --model ${CHAT_COMPLETIONS_MODEL} --max-model-len ${CHAT_COMPLETIONS_MAX_MODEL_LEN} --tensor-parallel-size ${VLLM_TENSOR_PARALLEL_SIZE}
+    command: ${VLLM_ENGINE_ARGS}
 
   vllm-cpu:
     <<: *inference-service-cpu
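
Reviewer note: this patch removes the dedicated VLLM_TENSOR_PARALLEL_SIZE variable, so tensor parallelism (and any other engine flag) is now passed through VLLM_ENGINE_ARGS, which docker-compose injects verbatim as the container command. A minimal sketch of an .env entry for a multi-GPU host, assuming 4 GPUs (the value 4 is illustrative; per the removed comment, --tensor-parallel-size should equal GPU_COUNT):

    VLLM_ENGINE_ARGS=--model ${CHAT_COMPLETIONS_MODEL} --max-model-len ${CHAT_COMPLETIONS_MAX_MODEL_LEN} --tensor-parallel-size 4

Any other engine argument from the linked vLLM docs can be appended to the same variable in the same way.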