From 4e6814600bb8d37505ea8329e01e0d16c90affc2 Mon Sep 17 00:00:00 2001
From: vbedida79
Date: Thu, 31 Oct 2024 09:01:42 -0700
Subject: [PATCH] tests_gaudi: Added L2 vllm workload

vllm gaudi ubi image based on PR https://github.com/HabanaAI/vllm-fork/pull/602

Signed-off-by: vbedida79
---
 tests/gaudi/l2/README.md             | 101 ++++++++++++++++++
 tests/gaudi/l2/vllm_buildconfig.yaml | 153 +++++++++++++++++++++++++++
 tests/gaudi/l2/vllm_deployment.yaml  |  66 ++++++++++++
 tests/gaudi/l2/vllm_hf_secret.yaml   |  10 ++
 4 files changed, 330 insertions(+)
 create mode 100644 tests/gaudi/l2/vllm_buildconfig.yaml
 create mode 100644 tests/gaudi/l2/vllm_deployment.yaml
 create mode 100644 tests/gaudi/l2/vllm_hf_secret.yaml

diff --git a/tests/gaudi/l2/README.md b/tests/gaudi/l2/README.md
index 07f14bf5..a6c8a267 100644
--- a/tests/gaudi/l2/README.md
+++ b/tests/gaudi/l2/README.md
@@ -75,3 +75,104 @@ Welcome to HCCL demo
 [BENCHMARK] Algo Bandwidth : 147.548069 GB/s
 ####################################################################################################
 ```
+
+## vLLM
+vLLM is a serving engine for LLMs. The following workload deploys a vLLM server with an LLM using Intel Gaudi. Refer to the [Intel Gaudi vLLM fork](https://github.com/HabanaAI/vllm-fork.git) for more details.
+
+Build the workload container image:
+```
+$ git clone https://github.com/HabanaAI/vllm-fork.git --branch v1.18.0
+
+$ cd vllm-fork/
+
+$ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/gaudi/l2/vllm_buildconfig.yaml
+
+$ oc start-build vllm-workload --from-dir=./ --follow
+```
+Check if the build has completed:
+```
+$ oc get builds
+NAMESPACE          NAME              TYPE     FROM         STATUS     STARTED         DURATION
+gaudi-validation   vllm-workload-1   Docker   Dockerfile   Complete   7 minutes ago   4m58s
+
+```
+
+Deploy the workload:
+* Update the Hugging Face token in the ```vllm_hf_secret.yaml``` file; refer to the [Hugging Face token documentation](https://huggingface.co/docs/hub/en/security-tokens) for more details.
+```
+$ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/gaudi/l2/vllm_hf_secret.yaml
+```
+The meta-llama/Llama-3.1-8B model is used in this deployment, and the Hugging Face token is required to access such gated models.
+* For the PersistentVolume (PV) setup with NFS, refer to the [documentation](https://docs.openshift.com/container-platform/4.17/storage/persistent_storage/persistent-storage-nfs.html). A sample NFS-backed PV is sketched below.
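+
+The PVC defined in ```vllm_deployment.yaml``` expects a pre-provisioned volume. As a minimal sketch only (the PV name, NFS server address, export path and storage class name are placeholders, not values from this repository), an NFS-backed PV could look like:
+```
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: vllm-workload-pv # Placeholder name
+spec:
+  capacity:
+    storage: 60Gi # Must cover the 60Gi requested by the PVC
+  accessModes:
+    - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain
+  storageClassName: "" # Match the storage class set in vllm_deployment.yaml
+  nfs:
+    server: nfs.example.com # Placeholder NFS server
+    path: /exports/vllm # Placeholder export path
+```
+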
+```
+$ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/gaudi/l2/vllm_deployment.yaml
+```
+Create the vLLM service:
+```
+$ oc expose deploy/vllm-workload
+```
+Verify the output:
+```
+$ oc get pods
+NAME                             READY   STATUS      RESTARTS   AGE
+vllm-workload-1-build            0/1     Completed   0          19m
+vllm-workload-55f7c6cb7b-cwj2b   1/1     Running     0          8m36s
+
+$ oc get svc
+NAME            TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)    AGE
+vllm-workload   ClusterIP   1.2.3.4      <none>        8000/TCP   114s
+```
+Check the vLLM server logs:
+```
+$ oc logs vllm-workload-55f7c6cb7b-cwj2b
+
+INFO 10-30 19:35:53 habana_model_runner.py:95] VLLM_DECODE_BS_BUCKET_MIN=32 (default:min)
+INFO 10-30 19:35:53 habana_model_runner.py:95] VLLM_DECODE_BS_BUCKET_STEP=32 (default:step)
+INFO 10-30 19:35:53 habana_model_runner.py:95] VLLM_DECODE_BS_BUCKET_MAX=256 (default:max)
+INFO 10-30 19:35:53 habana_model_runner.py:95] VLLM_PROMPT_SEQ_BUCKET_MIN=128 (default:min)
+INFO 10-30 19:35:53 habana_model_runner.py:95] VLLM_PROMPT_SEQ_BUCKET_STEP=128 (default:step)
+INFO 10-30 19:35:53 habana_model_runner.py:95] VLLM_PROMPT_SEQ_BUCKET_MAX=1024 (default:max)
+INFO 10-30 19:35:53 habana_model_runner.py:95] VLLM_DECODE_BLOCK_BUCKET_MIN=128 (default:min)
+INFO 10-30 19:35:53 habana_model_runner.py:95] VLLM_DECODE_BLOCK_BUCKET_STEP=128 (default:step)
+INFO 10-30 19:35:53 habana_model_runner.py:95] VLLM_DECODE_BLOCK_BUCKET_MAX=4096 (default:max)
+INFO 10-30 19:35:53 habana_model_runner.py:691] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 64], seq:[128, 128, 1024]
+INFO 10-30 19:35:53 habana_model_runner.py:696] Decode bucket config (min, step, max_warmup) bs:[32, 32, 256], block:[128, 128, 4096]
+============================= HABANA PT BRIDGE CONFIGURATION ===========================
+PT_HPU_LAZY_MODE = 1
+PT_RECIPE_CACHE_PATH =
+PT_CACHE_FOLDER_DELETE = 0
+PT_HPU_RECIPE_CACHE_CONFIG =
+PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
+PT_HPU_LAZY_ACC_PAR_MODE = 1
+PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
+PT_HPU_EAGER_PIPELINE_ENABLE = 1
+PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE = 1
+---------------------------: System Configuration :---------------------------
+Num CPU Cores : 160
+CPU RAM : 1056371848 KB
+------------------------------------------------------------------------------
+INFO 10-30 19:35:56 selector.py:85] Using HabanaAttention backend.
+INFO 10-30 19:35:56 loader.py:284] Loading weights on hpu ...
+INFO 10-30 19:35:56 weight_utils.py:224] Using model weights format ['*.safetensors', '*.bin', '*.pt']
+Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
+```

diff --git a/tests/gaudi/l2/vllm_buildconfig.yaml b/tests/gaudi/l2/vllm_buildconfig.yaml
new file mode 100644
index 00000000..dff37f79
--- /dev/null
+++ b/tests/gaudi/l2/vllm_buildconfig.yaml
@@ -0,0 +1,153 @@
+# Copyright (c) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: image.openshift.io/v1
+kind: ImageStream
+metadata:
+  name: vllm-workload
+  namespace: gaudi-validation
+spec: {}
+---
+apiVersion: build.openshift.io/v1
+kind: BuildConfig
+metadata:
+  name: vllm-workload
+  namespace: gaudi-validation
+spec:
+  triggers:
+    - type: "ConfigChange"
+    - type: "ImageChange"
+  runPolicy: "Serial"
+  source:
+    type: Dockerfile
+    dockerfile: |
+      ARG BASE_IMAGE=vault.habana.ai/gaudi-docker/1.18.0/rhel9.4/habanalabs/pytorch-installer-2.4.0:1.18.0-524
+      FROM ${BASE_IMAGE} as habana-base
+
+      USER root
+
+      ENV VLLM_TARGET_DEVICE="hpu"
+      ENV HABANA_SOFTWARE_VERSION="1.18.0-524"
+
+      RUN dnf -y update --best --allowerasing --skip-broken && dnf clean all
+
+      WORKDIR /workspace
+
+      ## Python Installer #################################################################
+      FROM habana-base as python-install
+
+      ARG PYTHON_VERSION=3.11
+
+      ENV VIRTUAL_ENV=/opt/vllm
+      ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+      RUN dnf install -y --setopt=install_weak_deps=0 --nodocs \
+          python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \
+          python${PYTHON_VERSION} -m venv $VIRTUAL_ENV --system-site-packages && pip install --no-cache -U pip wheel && dnf clean all
+
+      ## Python Habana base #################################################################
+      FROM python-install as python-habana-base
+
+      ENV VIRTUAL_ENV=/opt/vllm
+      ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+      # install Habana Software and common dependencies
+      RUN --mount=type=cache,target=/root/.cache/pip \
+          --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+          --mount=type=bind,source=requirements-hpu.txt,target=requirements-hpu.txt \
+          pip install \
+          -r requirements-hpu.txt
+
+      ## Builder #####################################################################
+      FROM python-habana-base AS build
+
+      # install build dependencies
+
+      # copy input files
+      COPY csrc csrc
+      COPY setup.py setup.py
+      COPY cmake cmake
+      COPY CMakeLists.txt CMakeLists.txt
+      COPY requirements-common.txt requirements-common.txt
+      COPY requirements-hpu.txt requirements-hpu.txt
+      COPY pyproject.toml pyproject.toml
+
+      # max jobs used by Ninja to build extensions
+      ARG max_jobs=2
+      ENV MAX_JOBS=${max_jobs}
+      # # make sure punica kernels are built (for LoRA)
+      # HPU currently doesn't support LoRA
+      # ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+      # Copy the entire directory before building wheel
+      COPY vllm vllm
+
+      ENV CCACHE_DIR=/root/.cache/ccache
+      RUN --mount=type=cache,target=/root/.cache/ccache \
+          --mount=type=cache,target=/root/.cache/pip \
+          --mount=type=bind,src=.git,target=/workspace/.git \
+          env CFLAGS="-march=haswell" \
+          CXXFLAGS="$CFLAGS $CXXFLAGS" \
+          CMAKE_BUILD_TYPE=Release \
+          python3 setup.py bdist_wheel --dist-dir=dist
+
+      ## Release #####################################################################
+      FROM python-install AS vllm-openai
+
+      WORKDIR /workspace
+
+      ENV VIRTUAL_ENV=/opt/vllm
+      ENV PATH=$VIRTUAL_ENV/bin/:$PATH
+
+      # Triton needs a CC compiler
+      RUN dnf install -y --setopt=install_weak_deps=0 --nodocs gcc \
+          && dnf clean all
+
+      # install vllm wheel first, so that torch etc will be installed
+      RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+          --mount=type=cache,target=/root/.cache/pip \
+          pip install $(echo dist/*.whl)'[tensorizer]' --verbose
+
+      ENV HF_HUB_OFFLINE=1 \
+          PORT=8000 \
+          HOME=/home/vllm \
+          VLLM_USAGE_SOURCE=production-docker-image
+
+      # setup non-root user for OpenShift
+      RUN umask 002 \
+          && useradd --uid 2000 --gid 0 vllm \
+          && chmod g+rwx $HOME /usr/src /workspace
+
+      COPY LICENSE /licenses/vllm.md
+
+      USER 2000
+      ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+
+      FROM vllm-openai as vllm-grpc-adapter
+
+      USER root
+
+      RUN --mount=type=cache,target=/root/.cache/pip \
+          --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+          pip install $(echo dist/*.whl)'[tensorizer]' vllm-tgis-adapter==0.2.3
+
+      ENV GRPC_PORT=8033 \
+          PORT=8000 \
+          # As an optimization, vLLM disables logprobs when using spec decoding by
+          # default, but this would be unexpected to users of a hosted model that
+          # happens to have spec decoding
+          # see: https://github.com/vllm-project/vllm/pull/6485
+          DISABLE_LOGPROBS_DURING_SPEC_DECODING=false
+
+      USER 2000
+      ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]
+  strategy:
+    type: Docker
+    noCache: true
+    dockerStrategy:
+      buildArgs:
+        - name: "BASE_IMAGE"
+          value: "vault.habana.ai/gaudi-docker/1.18.0/rhel9.4/habanalabs/pytorch-installer-2.4.0:1.18.0-524"
+  output:
+    to:
+      kind: ImageStreamTag
+      name: vllm-workload:latest
\ No newline at end of file
diff --git a/tests/gaudi/l2/vllm_deployment.yaml b/tests/gaudi/l2/vllm_deployment.yaml
new file mode 100644
index 00000000..fa38b4d2
--- /dev/null
+++ b/tests/gaudi/l2/vllm_deployment.yaml
@@ -0,0 +1,66 @@
+---
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: vllm-workload-pvc
+  namespace: gaudi-validation
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 60Gi
+  storageClassName: "" # Add your storage class
+  volumeMode: Filesystem
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-workload
+  namespace: gaudi-validation
+  labels:
+    app: vllm-workload
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-workload
+  template:
+    metadata:
+      labels:
+        app: vllm-workload
+    spec:
+      containers:
+        - name: vllm-container
+          image: image-registry.openshift-image-registry.svc:5000/gaudi-validation/vllm-workload:latest
+          command: [ "/bin/bash", "-c", "--" ]
+          args: ["vllm serve meta-llama/Llama-3.1-8B"] # Add the model
+          ports:
+            - containerPort: 8000
+          resources:
+            limits:
+              habana.ai/gaudi: 1
+          env:
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token
+                  key: hf-token
+            - name: HF_HOME
+              value: /home/vllm/.cache/huggingface
+            - name: HF_HUB_OFFLINE
+              value: "0"
+          imagePullPolicy: Always
+          volumeMounts:
+            - name: hf-cache
+              mountPath: /home/vllm/.cache
+            - name: shm
+              mountPath: /dev/shm
+      volumes:
+        - name: hf-cache
+          persistentVolumeClaim:
+            claimName: vllm-workload-pvc
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: "2Gi"
\ No newline at end of file
diff --git a/tests/gaudi/l2/vllm_hf_secret.yaml b/tests/gaudi/l2/vllm_hf_secret.yaml
new file mode 100644
index 00000000..f239d912
--- /dev/null
+++ b/tests/gaudi/l2/vllm_hf_secret.yaml
@@ -0,0 +1,10 @@
+# Copyright (c) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token
+  namespace: gaudi-validation
+type: Opaque
+data:
+  hf-token: # Add your base64-encoded token
\ No newline at end of file
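
Note: the ```data``` field of a Kubernetes Secret holds base64-encoded values. As a convenience sketch (the token value below is a placeholder), the same secret can instead be created from the CLI, which handles the encoding automatically:
```
$ oc create secret generic hf-token \
    --from-literal=hf-token=<your_hugging_face_token> \
    -n gaudi-validation
```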