From 4dc87cdf4368db71c537274bb46d98795a1ff9ea Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Thu, 31 Oct 2024 15:36:06 +0000
Subject: [PATCH] update Dockerfile version in docs

---
 vllm-tt-metal-llama3-70b/README.md           | 12 ++++++------
 vllm-tt-metal-llama3-70b/docs/development.md |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3-70b/README.md
index fc5c620..19358c0 100644
--- a/vllm-tt-metal-llama3-70b/README.md
+++ b/vllm-tt-metal-llama3-70b/README.md
@@ -22,7 +22,7 @@ If first run setup has already been completed, start here. If first run setup ha
 ### Docker Run - vLLM llama3 inference server
 
-Container will run `gunicorn --config gunicorn.conf.py` and start the inference server and model backend.
+Container will run with uvicorn and start the inference server and model backend.
 
 ```bash
 cd tt-inference-server
 # make sure if you already set up the model weights and cache you use the correct persistent volume
@@ -37,11 +37,11 @@ docker run \
   --volume ${PERSISTENT_VOLUME?ERROR env var PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \
   --shm-size 32G \
   --publish 7000:7000 \
-  ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.53.0-rc16-ebdffa93d911
+  ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-685ef1303b5a-54b9157d852b
 ```
 
 ```bash
-# run server
+# run server manually
 python examples/offline_inference_tt.py
 ```
 
@@ -82,12 +82,12 @@ sudo cpupower frequency-set -g performance
 
 ```bash
 # pull image from GHCR
-docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.53.0-rc16-ebdffa93d911
+docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-685ef1303b5a-54b9157d852b
 ```
 
 ### 5. Automated Setup: environment variables and weights files
 
-The script `tt-metal-llama3-70b/setup.sh` automates:
+The script `vllm-tt-metal-llama3-70b/setup.sh` automates:
 
 1. interactively creating the .env file,
 2. downloading the Llama model weights,
@@ -95,7 +95,7 @@
 4. creating the default persistent storage directory structure and permissions.
 
 ```bash
-cd tt-inference-server/tt-metal-llama3-70b
+cd tt-inference-server/vllm-tt-metal-llama3-70b
 chmod +x setup.sh
 ./setup.sh llama-3.1-70b-instruct
 ```
diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3-70b/docs/development.md
index 5448239..082379f 100644
--- a/vllm-tt-metal-llama3-70b/docs/development.md
+++ b/vllm-tt-metal-llama3-70b/docs/development.md
@@ -14,7 +14,7 @@ When building, update the commit SHA and get correct SHA from model developers o
 export TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc27
 export TT_METAL_COMMIT_SHA_OR_TAG=685ef1303b5abdfda63183fdd4fd6ed51b496833
 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12}
-export TT_VLLM_COMMIT_SHA_OR_TAG=582c05ecaa37a7d03224a26f52df5af067d3311f
+export TT_VLLM_COMMIT_SHA_OR_TAG=54b9157d852b0fa219613c00abbaa5a35f221049
 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12}
 docker build \
   -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \
@@ -70,7 +70,7 @@ Already built into Docker image, continue to run vLLM.
 # option 2: install from github
 cd /home/user/vllm
 git fetch
-# git checkout
+git checkout
 git pull
 pip install -e .
 echo "done vllm install."