diff --git a/ci/L0_backend_vllm/vllm_backend_test.py b/ci/L0_backend_vllm/vllm_backend_test.py
index e1839947..29a62650 100755
--- a/ci/L0_backend_vllm/vllm_backend_test.py
+++ b/ci/L0_backend_vllm/vllm_backend_test.py
@@ -107,7 +107,7 @@ def _test_vllm_model(self, send_parameters_as_tensor):
             result = user_data._completed_requests.get()
             self.assertIsNot(type(result), InferenceServerException)
 
-            output = result.as_numpy("TEXT")
+            output = result.as_numpy("text_output")
             self.assertIsNotNone(output)
 
         self.triton_client.stop_stream()
@@ -150,21 +150,21 @@ def _create_vllm_request_data(
         inputs = []
 
         prompt_data = np.array([prompt.encode("utf-8")], dtype=np.object_)
-        inputs.append(grpcclient.InferInput("PROMPT", [1], "BYTES"))
+        inputs.append(grpcclient.InferInput("text_input", [1], "BYTES"))
         inputs[-1].set_data_from_numpy(prompt_data)
 
         stream_data = np.array([stream], dtype=bool)
-        inputs.append(grpcclient.InferInput("STREAM", [1], "BOOL"))
+        inputs.append(grpcclient.InferInput("stream", [1], "BOOL"))
         inputs[-1].set_data_from_numpy(stream_data)
 
         if send_parameters_as_tensor:
             sampling_parameters_data = np.array(
                 [json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_
             )
-            inputs.append(grpcclient.InferInput("SAMPLING_PARAMETERS", [1], "BYTES"))
+            inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES"))
             inputs[-1].set_data_from_numpy(sampling_parameters_data)
 
-        outputs = [grpcclient.InferRequestedOutput("TEXT")]
+        outputs = [grpcclient.InferRequestedOutput("text_output")]
 
         return inputs, outputs
 
diff --git a/ci/qa_models/vllm_opt/config.pbtxt b/ci/qa_models/vllm_opt/config.pbtxt
index 0f4cedb8..3da80f1f 100644
--- a/ci/qa_models/vllm_opt/config.pbtxt
+++ b/ci/qa_models/vllm_opt/config.pbtxt
@@ -34,17 +34,18 @@ model_transaction_policy {
 
 input [
   {
-    name: "PROMPT"
+    name: "text_input"
     data_type: TYPE_STRING
     dims: [ 1 ]
   },
   {
-    name: "STREAM"
+    name: "stream"
     data_type: TYPE_BOOL
     dims: [ 1 ]
+    optional: true
   },
   {
-    name: "SAMPLING_PARAMETERS"
+    name: "sampling_parameters"
     data_type: TYPE_STRING
     dims: [ 1 ]
     optional: true
@@ -53,7 +54,7 @@ input [
 
 output [
   {
-    name: "TEXT"
+    name: "text_output"
     data_type: TYPE_STRING
     dims: [ -1 ]
   }
diff --git a/src/model.py b/src/model.py
index 0313da9d..de03fa36 100644
--- a/src/model.py
+++ b/src/model.py
@@ -69,7 +69,7 @@ def initialize(self, args):
             AsyncEngineArgs(**vllm_engine_config)
         )
 
-        output_config = pb_utils.get_output_config_by_name(self.model_config, "TEXT")
+        output_config = pb_utils.get_output_config_by_name(self.model_config, "text_output")
         self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
 
         # Counter to keep track of ongoing request counts
@@ -160,7 +160,7 @@ def create_response(self, vllm_output):
             (prompt + output.text).encode("utf-8") for output in vllm_output.outputs
         ]
         triton_output_tensor = pb_utils.Tensor(
-            "TEXT", np.asarray(text_outputs, dtype=self.output_dtype)
+            "text_output", np.asarray(text_outputs, dtype=self.output_dtype)
         )
 
         return pb_utils.InferenceResponse(output_tensors=[triton_output_tensor])
@@ -172,17 +172,17 @@ async def generate(self, request):
         self.ongoing_request_count += 1
         try:
             request_id = random_uuid()
-            prompt = pb_utils.get_input_tensor_by_name(request, "PROMPT").as_numpy()[0]
+            prompt = pb_utils.get_input_tensor_by_name(request, "text_input").as_numpy()[0]
             if isinstance(prompt, bytes):
                 prompt = prompt.decode("utf-8")
-            stream = pb_utils.get_input_tensor_by_name(request, "STREAM").as_numpy()[0]
+            stream = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy()[0]
 
             # Request parameters are not yet supported via
             # BLS. Provide an optional mechanism to receive serialized
             # parameters as an input tensor until support is added
             parameters_input_tensor = pb_utils.get_input_tensor_by_name(
-                request, "SAMPLING_PARAMETERS"
+                request, "sampling_parameters"
             )
             if parameters_input_tensor:
                 parameters = parameters_input_tensor.as_numpy()[0].decode("utf-8")
@@ -211,7 +211,7 @@ async def generate(self, request):
             self.logger.log_info(f"Error generating stream: {e}")
             error = pb_utils.TritonError(f"Error generating stream: {e}")
             triton_output_tensor = pb_utils.Tensor(
-                "TEXT", np.asarray(["N/A"], dtype=self.output_dtype)
+                "text_output", np.asarray(["N/A"], dtype=self.output_dtype)
             )
             response = pb_utils.InferenceResponse(
                 output_tensors=[triton_output_tensor], error=error
diff --git a/tools/environment.yml b/tools/environment.yml
deleted file mode 100644
index 5426a643..00000000
--- a/tools/environment.yml
+++ /dev/null
@@ -1,151 +0,0 @@
-name: vllm_env
-channels:
-  - nvidia/label/cuda-11.8.0
-  - conda-forge
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=5.1=1_gnu
-  - bzip2=1.0.8=h7b6447c_0
-  - ca-certificates=2023.7.22=hbcca054_0
-  - conda-pack=0.7.1=pyhd8ed1ab_0
-  - cuda-cccl=11.8.89=0
-  - cuda-command-line-tools=11.8.0=0
-  - cuda-compiler=11.8.0=0
-  - cuda-cudart=11.8.89=0
-  - cuda-cudart-dev=11.8.89=0
-  - cuda-cuobjdump=11.8.86=0
-  - cuda-cupti=11.8.87=0
-  - cuda-cuxxfilt=11.8.86=0
-  - cuda-documentation=11.8.86=0
-  - cuda-driver-dev=11.8.89=0
-  - cuda-gdb=11.8.86=0
-  - cuda-libraries=11.8.0=0
-  - cuda-libraries-dev=11.8.0=0
-  - cuda-memcheck=11.8.86=0
-  - cuda-nsight=11.8.86=0
-  - cuda-nsight-compute=11.8.0=0
-  - cuda-nvcc=11.8.89=0
-  - cuda-nvdisasm=11.8.86=0
-  - cuda-nvml-dev=11.8.86=0
-  - cuda-nvprof=11.8.87=0
-  - cuda-nvprune=11.8.86=0
-  - cuda-nvrtc=11.8.89=0
-  - cuda-nvrtc-dev=11.8.89=0
-  - cuda-nvtx=11.8.86=0
-  - cuda-nvvp=11.8.87=0
-  - cuda-profiler-api=11.8.86=0
-  - cuda-sanitizer-api=11.8.86=0
-  - cuda-toolkit=11.8.0=0
-  - cuda-tools=11.8.0=0
-  - cuda-visual-tools=11.8.0=0
-  - gds-tools=1.4.0.31=0
-  - ld_impl_linux-64=2.38=h1181459_1
-  - libcublas=11.11.3.6=0
-  - libcublas-dev=11.11.3.6=0
-  - libcufft=10.9.0.58=0
-  - libcufft-dev=10.9.0.58=0
-  - libcufile=1.4.0.31=0
-  - libcufile-dev=1.4.0.31=0
-  - libcurand=10.3.0.86=0
-  - libcurand-dev=10.3.0.86=0
-  - libcusolver=11.4.1.48=0
-  - libcusolver-dev=11.4.1.48=0
-  - libcusparse=11.7.5.86=0
-  - libcusparse-dev=11.7.5.86=0
-  - libffi=3.4.4=h6a678d5_0
-  - libgcc-ng=11.2.0=h1234567_1
-  - libgomp=11.2.0=h1234567_1
-  - libnpp=11.8.0.86=0
-  - libnpp-dev=11.8.0.86=0
-  - libnvjpeg=11.9.0.86=0
-  - libnvjpeg-dev=11.9.0.86=0
-  - libstdcxx-ng=12.3.0=h0f45ef3_2
-  - libuuid=1.41.5=h5eee18b_0
-  - ncurses=6.4=h6a678d5_0
-  - nsight-compute=2022.3.0.22=0
-  - openssl=3.0.11=h7f8727e_2
-  - pip=23.2.1=py310h06a4308_0
-  - python=3.10.13=h955ad1f_0
-  - readline=8.2=h5eee18b_0
-  - setuptools=68.0.0=py310h06a4308_0
-  - sqlite=3.41.2=h5eee18b_0
-  - tk=8.6.12=h1ccaba5_0
-  - wheel=0.41.2=py310h06a4308_0
-  - xz=5.4.2=h5eee18b_0
-  - zlib=1.2.13=h5eee18b_0
-  - pip:
-    - aiosignal==1.3.1
-    - anyio==3.7.1
-    - attrs==23.1.0
-    - certifi==2023.7.22
-    - charset-normalizer==3.3.0
-    - click==8.1.7
-    - cmake==3.27.6
-    - exceptiongroup==1.1.3
-    - fastapi==0.103.2
-    - filelock==3.12.4
-    - frozenlist==1.4.0
-    - fsspec==2023.9.2
-    - h11==0.14.0
-    - httptools==0.6.0
-    - huggingface-hub==0.16.4
-    - idna==3.4
-    - jinja2==3.1.2
-    - jsonschema==4.19.1
-    - jsonschema-specifications==2023.7.1
-    - lit==17.0.2
-    - markupsafe==2.1.3
-    - mpmath==1.3.0
-    - msgpack==1.0.7
-    - networkx==3.1
-    - ninja==1.11.1
-    - numpy==1.26.0
-    - nvidia-cublas-cu11==11.10.3.66
-    - nvidia-cuda-cupti-cu11==11.7.101
-    - nvidia-cuda-nvrtc-cu11==11.7.99
-    - nvidia-cuda-runtime-cu11==11.7.99
-    - nvidia-cudnn-cu11==8.5.0.96
-    - nvidia-cufft-cu11==10.9.0.58
-    - nvidia-curand-cu11==10.2.10.91
-    - nvidia-cusolver-cu11==11.4.0.1
-    - nvidia-cusparse-cu11==11.7.4.91
-    - nvidia-nccl-cu11==2.14.3
-    - nvidia-nvtx-cu11==11.7.91
-    - packaging==23.2
-    - pandas==2.1.1
-    - protobuf==4.24.3
-    - psutil==5.9.5
-    - pyarrow==13.0.0
-    - pydantic==1.10.13
-    - python-dateutil==2.8.2
-    - python-dotenv==1.0.0
-    - pytz==2023.3.post1
-    - pyyaml==6.0.1
-    - ray==2.7.0
-    - referencing==0.30.2
-    - regex==2023.10.3
-    - requests==2.31.0
-    - rpds-py==0.10.3
-    - safetensors==0.3.3
-    - sentencepiece==0.1.99
-    - six==1.16.0
-    - sniffio==1.3.0
-    - starlette==0.27.0
-    - sympy==1.12
-    - tokenizers==0.14.0
-    - torch==2.0.1
-    - tqdm==4.66.1
-    - transformers==4.34.0
-    - triton==2.0.0
-    - typing-extensions==4.8.0
-    - tzdata==2023.3
-    - urllib3==2.0.6
-    - uvicorn==0.23.2
-    - uvloop==0.17.0
-    - vllm==0.2.0
-    - watchfiles==0.20.0
-    - websockets==11.0.3
-    - xformers==0.0.22
-variables:
-  PYTHONNOUSERSITE: True
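
For reference, here is a minimal client sketch against the renamed tensors, mirroring the test code above. It assumes a Triton server on localhost:8001 serving the vllm_opt model from ci/qa_models; the URL, model name, prompt, and sampling parameter values are illustrative, not part of this change.

    import json
    import queue
    from functools import partial

    import numpy as np
    import tritonclient.grpc as grpcclient

    def callback(results, result, error):
        # Queue either the error or the streamed result for the main thread.
        results.put(error if error is not None else result)

    results = queue.Queue()
    client = grpcclient.InferenceServerClient(url="localhost:8001")
    client.start_stream(callback=partial(callback, results))

    # "text_input" replaces the old "PROMPT" tensor.
    inputs = [grpcclient.InferInput("text_input", [1], "BYTES")]
    inputs[-1].set_data_from_numpy(
        np.array(["Hello, my name is".encode("utf-8")], dtype=np.object_)
    )
    # "stream" replaces "STREAM" and is now optional per the config change.
    inputs.append(grpcclient.InferInput("stream", [1], "BOOL"))
    inputs[-1].set_data_from_numpy(np.array([False], dtype=bool))
    # "sampling_parameters" replaces "SAMPLING_PARAMETERS"; send it only to
    # override the engine defaults (values here are placeholders).
    inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES"))
    inputs[-1].set_data_from_numpy(
        np.array(
            [json.dumps({"temperature": 0.7, "top_p": 0.9}).encode("utf-8")],
            dtype=np.object_,
        )
    )

    client.async_stream_infer(
        model_name="vllm_opt",
        inputs=inputs,
        # "text_output" replaces the old "TEXT" output tensor.
        outputs=[grpcclient.InferRequestedOutput("text_output")],
    )
    print(results.get().as_numpy("text_output"))
    client.stop_stream()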