evals and benchmarking structure #21

Merged: 29 commits, Nov 5, 2024
Commits
b687ace
adding vLLM dockerfile
tstescoTT Oct 15, 2024
ed34e60
adding label GHCR and sed import edit for vllm server_example_tt.py
tstescoTT Oct 22, 2024
fd0c87d
adding evals instructions and run script
tstescoTT Oct 22, 2024
4d0cb6b
move vllm llama 3.1 70b implementation to top level model impl dir
tstescoTT Oct 23, 2024
a793d35
update eval instructions
tstescoTT Oct 23, 2024
7ed83b1
adding llama 3.1 70b benchmarking instructions
tstescoTT Oct 23, 2024
0e19de6
adding dir for locust
tstescoTT Oct 23, 2024
4f7a328
add doc link for vllm setup
tstescoTT Oct 23, 2024
b40c536
adding pre-commit with ruff linting and formatting
tstescoTT Oct 23, 2024
488c19c
add pre-commit instructions
tstescoTT Oct 23, 2024
ee888d9
add GHCR repo connection label to Dockerfile
tstescoTT Oct 23, 2024
6c35b17
adding requirements-dev.txt
tstescoTT Oct 25, 2024
022cd77
adding EOF newlines
tstescoTT Oct 25, 2024
f17a922
add ifeval to run_evals.sh, set to stream=False
tstescoTT Oct 25, 2024
443e930
pin dev dependency versions
tstescoTT Oct 25, 2024
ed15e1c
update vllm eval docs and development docs
tstescoTT Oct 25, 2024
42bd5a6
remove fork branch for tstescoTT/lm-evaluation-harness, can now use main
tstescoTT Oct 28, 2024
f2ac1cc
remove fork branch for tstescoTT/lm-evaluation-harness, can now use main
tstescoTT Oct 28, 2024
b7a9406
adding TT_VLLM_COMMIT_SHA_OR_TAG to Dockerfile
tstescoTT Oct 28, 2024
77cb4ba
uplift tt-metal and vllm commit SHA
tstescoTT Oct 28, 2024
7de2b00
adding TT_VLLM_COMMIT_DOCKER_TAG in vllm dev docker build instructions
tstescoTT Oct 30, 2024
f1ad0e1
adding comment header for ifeval
tstescoTT Oct 30, 2024
cbbc80e
fix comment
tstescoTT Oct 30, 2024
dc27981
update vllm commit sha
tstescoTT Oct 30, 2024
43dff21
adding additonal tools, removing symlink for tt-metal models
tstescoTT Oct 30, 2024
213d7ef
update Dockerfile version in docs
tstescoTT Oct 31, 2024
fb12460
remove debug tools
tstescoTT Oct 31, 2024
ceb7516
update Docker Run vllm doc
tstescoTT Nov 4, 2024
19ef67a
adding updated commit shas
tstescoTT Nov 4, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -39,3 +39,4 @@ db.sqlite3

# unignore
!requirements.txt
!requirements-dev.txt
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,9 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.7.0
hooks:
# Run the linter.
- id: ruff
# Run the formatter.
- id: ruff-format
38 changes: 38 additions & 0 deletions benchmarking/README.md
@@ -0,0 +1,38 @@
# Benchmarking

## Llama 3.1 70B Instruct

### vLLM offline benchmarking

The vLLM benchmarking script is https://github.com/tenstorrent/vllm/blob/dev/examples/offline_inference_tt.py

It is recommended to run the vLLM model implementation via `docker run`, following the instructions in tt-inference-server/vllm-tt-metal-llama3-70b/README.md.

To measure performance for a single batch (with the default prompt length of 128 tokens):
```bash
export WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml
python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 --perf_prompt_len 128 --max_tokens 128
# for example, changing to input 2048, output 2048
python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 --perf_prompt_len 2048 --max_tokens 2048
```

#### Command Line Arguments

- `--prompts_json` (default: `"tt_metal/prompts.json"`):
- **Path to prompts JSON file** used for inference. Prompts should be in a list format. This will not be used if `measure_perf` is set.

- `--measure_perf`:
- **Measure model performance** using synthetic inputs. If enabled, any provided `prompts_json` is ignored, and dummy prompts are used instead for benchmarking.

- `--perf_prompt_len` (default: `128`):
- **Length of dummy prompts** (in tokens) for benchmarking. Used only when `--measure_perf` is provided.

- `--max_tokens` (default: `128`):
- **Maximum output length** (in tokens) generated by the model for each prompt.

- `--greedy_sampling`:
- **Use greedy decoding** instead of probabilistic sampling (top-k/top-p). Greedy sampling always selects the token with the highest probability, leading to more deterministic output.

- `--max_seqs_in_batch` (default: `32`):
- **Maximum batch size** for inference, determining the number of prompts processed in parallel.
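As a sketch of how these flags compose, the following hypothetical invocation runs real prompts from a JSON file with greedy decoding; the prompts path is the documented default and is illustrative:
```bash
# run real prompts from a JSON file with deterministic (greedy) decoding;
# --measure_perf is omitted so the prompts in --prompts_json are used
python examples/offline_inference_tt.py \
  --prompts_json tt_metal/prompts.json \
  --greedy_sampling \
  --max_seqs_in_batch 32 \
  --max_tokens 128
```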

26 changes: 26 additions & 0 deletions docs/development.md
@@ -0,0 +1,26 @@
# Development

Set up linting and formatting for development:
```bash
# [optional] step 1: use venv
python3 -m venv .venv
source .venv/bin/activate

# step 2: install
pip install --upgrade pip setuptools wheel
pip install -r requirements-dev.txt

# step 3: pre-commit
pre-commit install
```

Run pre-commit:
```bash
# pre-commit behaviour is defined in .pre-commit-config.yaml
# by default it runs only on git staged files
pre-commit run
# run on all files in repo
pre-commit run --all-files
# or point to specific files
pre-commit run --files path/to/file
```
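Ruff can also be invoked directly, outside of pre-commit, via its standard CLI (optional; assumes the pinned `ruff` from requirements-dev.txt is installed):
```bash
# lint with autofix, then format, across the whole repo
ruff check --fix .
ruff format .
```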
170 changes: 170 additions & 0 deletions evals/README.md
@@ -0,0 +1,170 @@
# Running LM evals with vLLM

Source code:
- tt-metal and vLLM are under active development in lock-step: https://github.com/tenstorrent/vllm/tree/dev/tt_metal
- lm-evaluation-harness fork: https://github.com/tstescoTT/lm-evaluation-harness
- llama-recipes fork: https://github.com/tstescoTT/llama-recipes

## Step 1: Pull Docker image

Docker images are published to: https://ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm
```bash
# commit SHAs
export TT_METAL_COMMIT_SHA_OR_TAG=685ef1303b5abdfda63183fdd4fd6ed51b496833
export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12}
export TT_VLLM_COMMIT_SHA_OR_TAG=54b9157d852b0fa219613c00abbaa5a35f221049
export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12}
# pull image
docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG}
```

For instructions on building the Docker image see: [Development](../vllm-tt-metal-llama3-70b/docs/development.md)

## Step 2: Run Docker container for LM evals development

Note: this requires having run `setup.sh` to set up the weights for a particular model, in this example `llama-3.1-70b-instruct`.

```bash
cd tt-inference-server
export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b-instructv0.0.1/
docker run \
--rm \
-it \
--env-file tt-metal-llama3-70b/.env \
--cap-add ALL \
--device /dev/tenstorrent:/dev/tenstorrent \
--volume /dev/hugepages-1G:/dev/hugepages-1G:rw \
--volume ${PERSISTENT_VOLUME?ERROR env var PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \
--shm-size 32G \
ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} bash
```

## Step 3: Inside the container, set up and run vLLM

#### Install vLLM - Option 1: use default installation in Docker image

vLLM is already installed in the Docker image; no further setup is needed.

#### Install vLLM - Option 2: install vLLM from GitHub

```bash
# option 2: install from github
cd /home/user/vllm
git fetch
git checkout <branch>
git pull
pip install -e .
echo "done vllm install."
```
#### Install vLLM - Option 3: editable install (for development) from mounted volume

```bash
# option 3: editable install (for development) - vllm source mounted from outside the container
cd /home/user/vllm
pip install -e .
echo "done vllm install."
```

#### Run the vLLM OpenAI-compatible API server

```bash
# run vllm serving
cd /home/user/vllm
python examples/server_example_tt.py
```
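To sanity-check that the server is up, a minimal request can be sent to the OpenAI-compatible completions endpoint from another shell (the port and model name below follow the eval examples later in this document and may differ for your server):
```bash
# assumes the server exposes /v1/completions on port 8000 with this model name
curl http://127.0.0.1:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Meta-Llama-3.1-70B",
    "prompt": "Hello, my name is",
    "max_tokens": 16
  }'
```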

## Step 4: Inside the container, set up the LM evaluation harness

Enter a new bash shell in the running container (the command below attaches to the most recently started container):
```bash
docker exec -it $(docker ps -q | head -n1) bash
```

Now, inside the container:
```bash
# option 1: install from github: https://github.com/tstescoTT/lm-evaluation-harness
pip install git+https://github.com/tstescoTT/lm-evaluation-harness.git#egg=lm-eval[ifeval]
# option 2: editable install (for development) - source mounted into the container
cd ~/lm-evaluation-harness
pip install -e .[ifeval]
```
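To confirm the install, the harness can list its registered tasks (`lm_eval --tasks list` is standard lm-evaluation-harness CLI behavior):
```bash
# confirm lm_eval is on PATH and that the built-in ifeval task is registered
lm_eval --tasks list | grep -i ifeval
```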

## Step 5: Inside the container, set up llama-recipes LM evaluation harness templates


Following Meta's LM eval reproduction documentation: https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness/meta_eval

To access Meta Llama 3.1 evals, you must:

1. Log in to the Hugging Face website (https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f), open the Llama 3.1 evals dataset pages, and agree to the terms.
2. Follow the [Hugging Face authentication instructions](https://huggingface.co/docs/huggingface_hub/en/quick-start#authentication) to gain read access for your machine.

#### Hugging Face authentication - option 1: HF_TOKEN (if not already passed into Docker container)
```bash
# set up HF Token, needed for IFEval dataset
# echo "hf_<token>" > ${HF_HOME}/token
export PYTHONPATH=${PYTHONPATH}:$PWD
```

#### Hugging Face authentication - option 2: huggingface_hub login
```python
from huggingface_hub import notebook_login
notebook_login()
```
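Either way, authentication can be verified with `huggingface_hub`'s `whoami` helper (assumes `huggingface_hub` is installed in the environment):
```bash
# prints your Hugging Face username if the token is picked up correctly
python -c "from huggingface_hub import whoami; print(whoami()['name'])"
```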

Finally, build the llama-recipes lm-evaluation-harness templates:
```bash
git clone https://github.com/tstescoTT/llama-recipes.git
cd llama-recipes/tools/benchmarks/llm_eval_harness/meta_eval
python prepare_meta_eval.py --config_path ./eval_config.yaml
cp -rf work_dir/ ~/lm-evaluation-harness/
```
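As a quick check that the templates landed where `--include_path ./work_dir` expects them (paths follow the commands above):
```bash
# the meta_* task configs should now be under work_dir in the harness checkout
ls ~/lm-evaluation-harness/work_dir
```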

## Step 6: Inside container run LM evals

`run_evals.sh` can be run from anywhere the `lm_eval` CLI is available:
```bash
cd ~/lm-evaluation-harness
run_evals.sh
```

For example, running IFEval manually:

The model name in `--model_args` (`Meta-Llama-3.1-70B` below) only needs to match the model name the running vLLM server was configured with; it does not need to point at the actual weights.
```bash
lm_eval \
--model local-completions \
--model_args model=meta-llama/Meta-Llama-3.1-70B,base_url=http://127.0.0.1:8000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True \
--gen_kwargs model=meta-llama/Meta-Llama-3.1-70B,stop="<|eot_id|>",stream=False \
--tasks meta_ifeval \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
--include_path ./work_dir \
--seed 42 \
--log_samples
```

## Notes

### Chat templating

As mentioned in: https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness/meta_eval#run-eval-tasks

“As for add_bos_token=True, since our prompts in the evals dataset has already included all the special tokens required by instruct model, such as <|start_header_id|>user<|end_header_id|>, we will not use --apply_chat_template argument for instruct models anymore. However, we need to use add_bos_token=True flag to add the BOS_token back during VLLM inference, as the BOS_token is removed by default in this PR.”

The chat template can be manually added via the `lm_eval` runtime argument:
```bash
--apply_chat_template chat_template.jinja
```
`chat_template.jinja` is a plain-text Jinja template for Llama 3.1 Instruct:
```
<|begin_of_text|>
{% for message in chat_history %}
<|start_header_id|>{{ message['role'] }}<|end_header_id|>
\n\n{{ message['content'] }}{% if not loop.last %}<|eot_id|>{% endif %}
{% endfor %}
<|start_header_id|>assistant<|end_header_id|>\n\n
```

The instruct chat template could also be applied on the vLLM server side, but applying it on the client side gives the caller of vLLM more flexibility.

28 changes: 28 additions & 0 deletions evals/run_evals.sh
@@ -0,0 +1,28 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
#
# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC

# GPQA
lm_eval \
--model local-completions \
--model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:8000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True \
--gen_kwargs model=meta-llama/Llama-3.1-70B-Instruct,stop="<|eot_id|>",stream=False \
--tasks meta_gpqa \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
--include_path ./work_dir \
--seed 42 \
--log_samples

# IFEval
lm_eval \
--model local-completions \
--model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:8000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True \
--gen_kwargs model=meta-llama/Llama-3.1-70B-Instruct,stop="<|eot_id|>",stream=False \
--tasks meta_ifeval \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
--include_path ./work_dir \
--seed 42 \
--log_samples
1 change: 1 addition & 0 deletions locust/README.md
@@ -0,0 +1 @@
# Locust server stress testing
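No locustfile is included in this change; a minimal sketch of driving Locust against the vLLM server could look like the following (the locustfile name, host, and load parameters are assumptions):
```bash
# headless run against a local vLLM server; assumes a locustfile.py in this directory
locust -f locustfile.py --host http://127.0.0.1:8000 \
  --users 32 --spawn-rate 4 --run-time 5m --headless
```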
2 changes: 2 additions & 0 deletions requirements-dev.txt
@@ -0,0 +1,2 @@
pre-commit==3.5.0
ruff==0.7.0
@@ -2,13 +2,16 @@
#
# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC


# default base image, override with --build-arg TT_METAL_DOCKERFILE_VERSION=<version>
ARG TT_METAL_DOCKERFILE_VERSION=v0.51.0-rc31

FROM ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:$TT_METAL_DOCKERFILE_VERSION-dev

# Build stage
LABEL maintainer="Tom Stesco <[email protected]>"
# connect Github repo with package
LABEL org.opencontainers.image.source https://github.com/tenstorrent/tt-inference-server

ARG DEBIAN_FRONTEND=noninteractive
# default commit sha, override with --build-arg TT_METAL_COMMIT_SHA_OR_TAG=<sha>