feat: vllm increased customization
francis2tm committed Dec 20, 2024
1 parent bdd7d84 commit 46cceda
Showing 2 changed files with 3 additions and 2 deletions.
.env.example: 3 changes (2 additions, 1 deletion)
@@ -52,7 +52,8 @@ CHAT_COMPLETIONS_MODEL=meta-llama/Llama-3.1-70B-Instruct
 CHAT_COMPLETIONS_MAX_MODEL_LEN=4096 # context length
 
 # vllm backend
-VLLM_TENSOR_PARALLEL_SIZE=1 # should be equal to GPU_COUNT
+# Know more about vllm engine arguments here: https://docs.vllm.ai/en/latest/usage/engine_args.html
+VLLM_ENGINE_ARGS=--model ${CHAT_COMPLETIONS_MODEL} --max-model-len ${CHAT_COMPLETIONS_MAX_MODEL_LEN}
 
 # ----------------------------------------------------------------------------------
 # embeddings server
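With the engine arguments consolidated into a single variable, additional flags can now be set in the .env file without editing the compose file. As a hypothetical example (not part of this commit), a multi-GPU deployment could restore the tensor parallelism previously controlled by VLLM_TENSOR_PARALLEL_SIZE; --tensor-parallel-size is a documented vLLM engine argument, and GPU_COUNT is assumed here to be defined elsewhere in the .env:

# Hypothetical .env customization, not part of this commit:
VLLM_ENGINE_ARGS=--model ${CHAT_COMPLETIONS_MODEL} --max-model-len ${CHAT_COMPLETIONS_MAX_MODEL_LEN} --tensor-parallel-size ${GPU_COUNT}
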
docker-compose.yaml: 2 changes (1 addition, 1 deletion)
@@ -131,7 +131,7 @@ services:
     ports:
       - "${CHAT_COMPLETIONS_SERVER_PORT}:8000"
     ipc: host
-    command: --model ${CHAT_COMPLETIONS_MODEL} --max-model-len ${CHAT_COMPLETIONS_MAX_MODEL_LEN} --tensor-parallel-size ${VLLM_TENSOR_PARALLEL_SIZE}
+    command: ${VLLM_ENGINE_ARGS}
 
   vllm-cpu:
     <<: *inference-service-cpu
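Docker Compose substitutes ${VLLM_ENGINE_ARGS} from the .env file before the container starts, so the one-variable command is equivalent to the previous hardcoded flags. As a sketch, assuming the service runs the official vllm/vllm-openai image (whose entrypoint is vLLM's OpenAI-compatible API server), the effective invocation with the example .env values would be roughly:

# Hypothetical effective command after variable substitution:
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --max-model-len 4096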
