diff --git a/.gitignore b/.gitignore index 9d4769c9..6b696074 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ coverage.xml .hypothesis/ .pytest_cache/ cover/ +*.out # Translations *.mo diff --git a/README.md b/README.md index 887fe44e..69945106 100644 --- a/README.md +++ b/README.md @@ -114,10 +114,12 @@ cd server --upstream-container-version=${TRITON_CONTAINER_VERSION} --backend=python:r${TRITON_CONTAINER_VERSION} --backend=vllm:r${TRITON_CONTAINER_VERSION} + --backend=ensemble --vllm-version=${VLLM_VERSION} # Build Triton Server cd build bash -x ./docker_build + ``` ### Option 3. Add the vLLM Backend to the Default Triton Container @@ -129,7 +131,8 @@ container with the following commands: ``` mkdir -p /opt/tritonserver/backends/vllm -wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/main/src/model.py +git clone https://github.com/triton-inference-server/vllm_backend.git /tmp/vllm_backend +cp -r /tmp/vllm_backend/src/* /opt/tritonserver/backends/vllm ``` ## Using the vLLM Backend @@ -212,7 +215,6 @@ starting from 23.10 release. You can use `pip install ...` within the container to upgrade vLLM version. - ## Running Multiple Instances of Triton Server If you are running multiple instances of Triton server with a Python-based backend, @@ -220,6 +222,114 @@ you need to specify a different `shm-region-prefix-name` for each server. See [here](https://github.com/triton-inference-server/python_backend#running-multiple-instances-of-triton-server) for more information. +## Triton Metrics +Starting with the 24.08 release of Triton, users can now obtain specific +vLLM metrics by querying the Triton metrics endpoint (see the complete vLLM metrics +[here](https://docs.vllm.ai/en/latest/serving/metrics.html)). This can be +accomplished by launching a Triton server in any of the ways described above +(ensuring the build code / container is 24.08 or later). Once the server is +running, you can query the metrics endpoint by entering +the following: +```bash +curl localhost:8002/metrics +``` +vLLM stats are reported by the metrics endpoint in fields that are prefixed with +`vllm:`. Triton currently supports reporting of the following metrics from vLLM. +```bash +# Number of prefill tokens processed. +counter_prompt_tokens +# Number of generation tokens processed. +counter_generation_tokens +# Histogram of time to first token in seconds. +histogram_time_to_first_token +# Histogram of time per output token in seconds. +histogram_time_per_output_token +# Histogram of end to end request latency in seconds. +histogram_e2e_time_request +# Number of prefill tokens processed. +histogram_num_prompt_tokens_request +# Number of generation tokens processed. +histogram_num_generation_tokens_request +# Histogram of the best_of request parameter. +histogram_best_of_request +# Histogram of the n request parameter. +histogram_n_request +``` +Your output for these fields should look similar to the following: +```bash +# HELP vllm:prompt_tokens_total Number of prefill tokens processed. +# TYPE vllm:prompt_tokens_total counter +vllm:prompt_tokens_total{model="vllm_model",version="1"} 10 +# HELP vllm:generation_tokens_total Number of generation tokens processed. +# TYPE vllm:generation_tokens_total counter +vllm:generation_tokens_total{model="vllm_model",version="1"} 16 +# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE vllm:time_to_first_token_seconds histogram +vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1 +vllm:time_to_first_token_seconds_sum{model="vllm_model",version="1"} 0.03233122825622559 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.001"} 0 +... +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds. +# TYPE vllm:time_per_output_token_seconds histogram +vllm:time_per_output_token_seconds_count{model="vllm_model",version="1"} 15 +vllm:time_per_output_token_seconds_sum{model="vllm_model",version="1"} 0.04501533508300781 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 14 +... +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 15 +# HELP vllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds. +# TYPE vllm:e2e_request_latency_seconds histogram +vllm:e2e_request_latency_seconds_count{model="vllm_model",version="1"} 1 +vllm:e2e_request_latency_seconds_sum{model="vllm_model",version="1"} 0.08686184883117676 +vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_prompt_tokens Number of prefill tokens processed. +# TYPE vllm:request_prompt_tokens histogram +vllm:request_prompt_tokens_count{model="vllm_model",version="1"} 1 +vllm:request_prompt_tokens_sum{model="vllm_model",version="1"} 10 +vllm:request_prompt_tokens_bucket{model="vllm_model",version="1",le="1"} 0 +... +vllm:request_prompt_tokens_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_generation_tokens Number of generation tokens processed. +# TYPE vllm:request_generation_tokens histogram +vllm:request_generation_tokens_count{model="vllm_model",version="1"} 1 +vllm:request_generation_tokens_sum{model="vllm_model",version="1"} 16 +vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="1"} 0 +... +vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_params_best_of Histogram of the best_of request parameter. +# TYPE vllm:request_params_best_of histogram +vllm:request_params_best_of_count{model="vllm_model",version="1"} 1 +vllm:request_params_best_of_sum{model="vllm_model",version="1"} 1 +vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:request_params_n Histogram of the n request parameter. +# TYPE vllm:request_params_n histogram +vllm:request_params_n_count{model="vllm_model",version="1"} 1 +vllm:request_params_n_sum{model="vllm_model",version="1"} 1 +vllm:request_params_n_bucket{model="vllm_model",version="1",le="1"} 1 +... +vllm:request_params_n_bucket{model="vllm_model",version="1",le="+Inf"} 1 +``` +To enable vLLM engine metrics collection, the "disable_log_stats" option needs to be either set to false +or left empty (false by default) in [model.json](https://github.com/triton-inference-server/vllm_backend/blob/main/samples/model_repository/vllm_model/1/model.json). +```bash +"disable_log_stats": false +``` +*Note:* vLLM metrics are not reported to the Triton metrics server by default +due to potential performance slowdowns. To enable metrics reporting for a vLLM model, +please also add the following lines to its config.pbtxt.
+```bash +parameters: { + key: "REPORT_CUSTOM_METRICS" + value: { + string_value:"yes" + } +} +``` + ## Referencing the Tutorial You can read further in the diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh new file mode 100755 index 00000000..fd976d4a --- /dev/null +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +source ../../common/util.sh + +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends +SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_opt --log-verbose=1" +SERVER_LOG="./vllm_metrics_server.log" +CLIENT_LOG="./vllm_metrics_client.log" +TEST_RESULT_FILE='test_results.txt' +CLIENT_PY="./vllm_metrics_test.py" +SAMPLE_MODELS_REPO="../../../samples/model_repository" +EXPECTED_NUM_TESTS=1 + +# Helpers ======================================= +function copy_model_repository { + rm -rf models && mkdir -p models + cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt + # `vllm_opt` model will be loaded on server start and stay loaded throughout + # unittesting. To ensure that vllm's memory profiler will not error out + # on `vllm_load_test` load, we reduce "gpu_memory_utilization" for `vllm_opt`, + # so that at least 60% of GPU memory was available for other models. + sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json +} + +run_test() { + local TEST_CASE=$1 + + run_server + if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 + fi + + set +e + python3 $CLIENT_PY $TEST_CASE -v > $CLIENT_LOG 2>&1 + + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Running $CLIENT_PY $TEST_CASE FAILED. \n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification FAILED.\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +} + +RET=0 + +# Test disabling vLLM metrics reporting without parameter "REPORT_CUSTOM_METRICS" in config.pbtxt +copy_model_repository +run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled + +# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "no" in config.pbtxt +copy_model_repository +echo -e " +parameters: { + key: \"REPORT_CUSTOM_METRICS\" + value: { + string_value:\"no\" + } +} +" >> models/vllm_opt/config.pbtxt +run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled + +# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "yes" in config.pbtxt +copy_model_repository +cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt +echo -e " +parameters: { + key: \"REPORT_CUSTOM_METRICS\" + value: { + string_value:\"yes\" + } +} +" >> models/vllm_opt/config.pbtxt +run_test VLLMTritonMetricsTest.test_vllm_metrics + +# Test vLLM metrics custom sampling parameters +# Custom sampling parameters may result in different vLLM output depending +# on the platform. Therefore, these metrics are tests separately. +copy_model_repository +cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt +echo -e " +parameters: { + key: \"REPORT_CUSTOM_METRICS\" + value: { + string_value:\"yes\" + } +} +" >> models/vllm_opt/config.pbtxt +run_test VLLMTritonMetricsTest.test_custom_sampling_params + +# Test enabling vLLM metrics reporting in config.pbtxt but disabling in model.json +copy_model_repository +jq '. += {"disable_log_stats" : true}' models/vllm_opt/1/model.json > "temp.json" +mv temp.json models/vllm_opt/1/model.json +echo -e " +parameters: { + key: \"REPORT_CUSTOM_METRICS\" + value: { + string_value:\"yes\" + } +} +" >> models/vllm_opt/config.pbtxt +run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled + +# Test enabling vLLM metrics reporting in config.pbtxt while disabling in server option +copy_model_repository +echo -e " +parameters: { + key: \"REPORT_CUSTOM_METRICS\" + value: { + string_value:\"yes\" + } +} +" >> models/vllm_opt/config.pbtxt +SERVER_ARGS="${SERVER_ARGS} --allow-metrics=false" +run_test VLLMTritonMetricsTest.test_vllm_metrics_refused + +rm -rf "./models" "temp.json" + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** vLLM test FAILED. \n***" +else + echo -e "\n***\n*** vLLM test PASSED. \n***" +fi + +collect_artifacts_from_subdir +exit $RET diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py new file mode 100644 index 00000000..6bef1746 --- /dev/null +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -0,0 +1,235 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import re +import sys +import unittest +from functools import partial + +import requests +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + +sys.path.append("../../common") +from test_util import TestResultCollector, UserData, callback, create_vllm_request + + +class VLLMTritonMetricsTest(TestResultCollector): + def setUp(self): + self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001") + self.tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + self.vllm_model_name = "vllm_opt" + self.prompts = [ + "The most dangerous animal is", + "The capital of France is", + "The future of AI is", + ] + self.sampling_parameters = {"temperature": "0", "top_p": "1"} + + def parse_vllm_metrics(self): + """ + Store vllm metrics in a dictionary. + """ + r = requests.get(f"http://{self.tritonserver_ipaddr}:8002/metrics") + r.raise_for_status() + + # Regular expression to match the pattern + pattern = r"^(vllm:[^ {]+)(?:{.*})? ([0-9.-]+)$" + vllm_dict = {} + + # Find all matches in the text + matches = re.findall(pattern, r.text, re.MULTILINE) + + for match in matches: + key, value = match + vllm_dict[key] = float(value) if "." in value else int(value) + + return vllm_dict + + def vllm_infer( + self, + prompts, + sampling_parameters, + model_name, + ): + """ + Helper function to send async stream infer requests to vLLM. 
+ """ + user_data = UserData() + number_of_vllm_reqs = len(prompts) + + self.triton_client.start_stream(callback=partial(callback, user_data)) + for i in range(number_of_vllm_reqs): + request_data = create_vllm_request( + prompts[i], + i, + False, + sampling_parameters, + model_name, + True, + ) + self.triton_client.async_stream_infer( + model_name=model_name, + inputs=request_data["inputs"], + request_id=request_data["request_id"], + outputs=request_data["outputs"], + parameters=sampling_parameters, + ) + + for _ in range(number_of_vllm_reqs): + result = user_data._completed_requests.get() + if type(result) is InferenceServerException: + print(result.message()) + self.assertIsNot(type(result), InferenceServerException, str(result)) + + output = result.as_numpy("text_output") + self.assertIsNotNone(output, "`text_output` should not be None") + + self.triton_client.stop_stream() + + def test_vllm_metrics(self): + # Test vLLM metrics + self.vllm_infer( + prompts=self.prompts, + sampling_parameters=self.sampling_parameters, + model_name=self.vllm_model_name, + ) + metrics_dict = self.parse_vllm_metrics() + total_prompts = len(self.prompts) + + # vllm:prompt_tokens_total + # (2, 133, 144, 2702, 3477, 16) + # (2, 133, 812, 9, 1470, 16) + # (2, 133, 499, 9, 4687, 16) + self.assertEqual(metrics_dict["vllm:prompt_tokens_total"], 18) + # vllm:generation_tokens_total + # (5, 65, 14, 16, 144, 533, 7, 28, 848, 30, 10, 512, 4, 50118, 100, 437) + # (5, 812, 9, 5, 1515, 3497, 4, 50118, 50118, 133, 812, 9, 1470, 16, 5, 812) + # (11, 5, 1420, 9, 5, 82, 4, 50118, 50118, 133, 499, 9, 4687, 16, 11, 5) + self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48) + # vllm:time_to_first_token_seconds + self.assertEqual( + metrics_dict["vllm:time_to_first_token_seconds_count"], total_prompts + ) + self.assertGreater(metrics_dict["vllm:time_to_first_token_seconds_sum"], 0) + self.assertEqual( + metrics_dict["vllm:time_to_first_token_seconds_bucket"], total_prompts + ) + # vllm:time_per_output_token_seconds + self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45) + self.assertGreater(metrics_dict["vllm:time_per_output_token_seconds_sum"], 0) + self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_bucket"], 45) + # vllm:e2e_request_latency_seconds + self.assertEqual( + metrics_dict["vllm:e2e_request_latency_seconds_count"], total_prompts + ) + self.assertGreater(metrics_dict["vllm:e2e_request_latency_seconds_sum"], 0) + self.assertEqual( + metrics_dict["vllm:e2e_request_latency_seconds_bucket"], total_prompts + ) + # vllm:request_prompt_tokens + self.assertEqual( + metrics_dict["vllm:request_prompt_tokens_count"], total_prompts + ) + self.assertEqual(metrics_dict["vllm:request_prompt_tokens_sum"], 18) + self.assertEqual( + metrics_dict["vllm:request_prompt_tokens_bucket"], total_prompts + ) + # vllm:request_generation_tokens + self.assertEqual( + metrics_dict["vllm:request_generation_tokens_count"], + total_prompts, + ) + self.assertEqual(metrics_dict["vllm:request_generation_tokens_sum"], 48) + self.assertEqual( + metrics_dict["vllm:request_generation_tokens_bucket"], + total_prompts, + ) + + def test_custom_sampling_params(self): + # Adding sampling parameters for testing metrics. + # Definitions can be found here https://docs.vllm.ai/en/latest/dev/sampling_params.html + n, best_of = 2, 4 + custom_sampling_parameters = self.sampling_parameters.copy() + # Changing "temperature" because "best_of" must be 1 when using greedy + # sampling, i.e. "temperature": "0". 
+ custom_sampling_parameters.update( + {"n": str(n), "best_of": str(best_of), "temperature": "1"} + ) + + # Test vLLM metrics + self.vllm_infer( + prompts=self.prompts, + sampling_parameters=custom_sampling_parameters, + model_name=self.vllm_model_name, + ) + metrics_dict = self.parse_vllm_metrics() + total_prompts = len(self.prompts) + + # vllm:request_params_best_of + self.assertEqual( + metrics_dict["vllm:request_params_best_of_count"], total_prompts + ) + self.assertEqual( + metrics_dict["vllm:request_params_best_of_sum"], best_of * total_prompts + ) + self.assertEqual( + metrics_dict["vllm:request_params_best_of_bucket"], total_prompts + ) + # vllm:request_params_n + self.assertEqual(metrics_dict["vllm:request_params_n_count"], total_prompts) + self.assertEqual(metrics_dict["vllm:request_params_n_sum"], n * total_prompts) + self.assertEqual(metrics_dict["vllm:request_params_n_bucket"], total_prompts) + + def test_vllm_metrics_disabled(self): + # Test vLLM metrics + self.vllm_infer( + prompts=self.prompts, + sampling_parameters=self.sampling_parameters, + model_name=self.vllm_model_name, + ) + metrics_dict = self.parse_vllm_metrics() + + # No vLLM metric found + self.assertEqual(len(metrics_dict), 0) + + def test_vllm_metrics_refused(self): + # Test vLLM metrics + self.vllm_infer( + prompts=self.prompts, + sampling_parameters=self.sampling_parameters, + model_name=self.vllm_model_name, + ) + with self.assertRaises(requests.exceptions.ConnectionError): + self.parse_vllm_metrics() + + def tearDown(self): + self.triton_client.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/ci/L0_backend_vllm/test.sh b/ci/L0_backend_vllm/test.sh index 93d065c8..a9f89894 100755 --- a/ci/L0_backend_vllm/test.sh +++ b/ci/L0_backend_vllm/test.sh @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RET=0 -SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend" +SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend metrics_test" python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc] diff --git a/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt b/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt new file mode 100644 index 00000000..07977d0d --- /dev/null +++ b/ci/L0_backend_vllm/vllm_backend/ensemble_config.pbtxt @@ -0,0 +1,59 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "ensemble_model" +platform: "ensemble" +max_batch_size: 1 +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ -1 ] + } +] +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ -1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "vllm_opt" + model_version: -1 + input_map { + key: "text_input" + value: "text_input" + } + output_map { + key: "text_output" + value: "text_output" + } + } + ] +} \ No newline at end of file diff --git a/ci/L0_backend_vllm/vllm_backend/test.sh b/ci/L0_backend_vllm/vllm_backend/test.sh index a6dd0aa7..87e04b21 100755 --- a/ci/L0_backend_vllm/vllm_backend/test.sh +++ b/ci/L0_backend_vllm/vllm_backend/test.sh @@ -50,7 +50,7 @@ function assert_curl_success { rm -rf models && mkdir -p models cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt -# `vllm_opt`` model will be loaded on server start and stay loaded throughout +# `vllm_opt` model will be loaded on server start and stay loaded throughout # unittesting. To test vllm model load/unload we use a dedicated # `vllm_load_test`. To ensure that vllm's memory profiler will not error out # on `vllm_load_test` load, we reduce "gpu_memory_utilization" for `vllm_opt`, @@ -70,6 +70,11 @@ sed -i 's/"disable_log_requests"/"invalid_attribute"/' models/vllm_invalid_1/1/m cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_2/ sed -i 's/"facebook\/opt-125m"/"invalid_model"/' models/vllm_invalid_2/1/model.json + +# Sanity check ensembles are enabled and can successfully be loaded +mkdir -p models/ensemble_model/1 +cp -r ensemble_config.pbtxt models/ensemble_model/config.pbtxt + RET=0 run_server @@ -166,4 +171,4 @@ fi collect_artifacts_from_subdir -exit $RET +exit $RET \ No newline at end of file diff --git a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py index 8ca206f0..c53c391a 100644 --- a/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py +++ b/ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py @@ -48,6 +48,7 @@ def setUp(self): self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001") self.vllm_model_name = "vllm_opt" self.python_model_name = "add_sub" + self.ensemble_model_name = "ensemble_model" self.vllm_load_test = "vllm_load_test" def test_vllm_triton_backend(self): @@ -57,6 +58,13 @@ def test_vllm_triton_backend(self): self.triton_client.load_model(self.python_model_name) self.assertTrue(self.triton_client.is_model_ready(self.python_model_name)) + # Test to ensure that ensemble models are supported in vllm container. + # If ensemble support not present, triton will error out at model loading stage. 
+ # Ensemble Model is a pipeline consisting of 1 model (vllm_opt) + self.triton_client.load_model(self.ensemble_model_name) + self.assertTrue(self.triton_client.is_model_ready(self.ensemble_model_name)) + self.triton_client.unload_model(self.ensemble_model_name) + # Unload vllm model and test add_sub model self.triton_client.unload_model(self.vllm_load_test) self.assertFalse(self.triton_client.is_model_ready(self.vllm_load_test)) diff --git a/ci/L0_multi_gpu/multi_lora/test.sh b/ci/L0_multi_gpu/multi_lora/test.sh index a500958f..b561a2d0 100755 --- a/ci/L0_multi_gpu/multi_lora/test.sh +++ b/ci/L0_multi_gpu/multi_lora/test.sh @@ -52,17 +52,19 @@ cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_llama_multi_lora export SERVER_ENABLE_LORA=true +# Check boolean flag value for `enable_lora` model_json=$(cat < models/vllm_llama_multi_lora/1/model.json + +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +set +e +python3 $CLIENT_PY -v > $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification FAILED.\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# disable lora +export SERVER_ENABLE_LORA=false +# check bool flag value for `enable_lora` +model_json=$(cat < models/vllm_llama_multi_lora/1/model.json + +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +set +e +python3 $CLIENT_PY -v >> $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification FAILED.\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + # disable lora export SERVER_ENABLE_LORA=false +# check string flag value for `enable_lora` model_json=$(cat < "temp.json" + mv temp.json "${TEST_MODEL_VLLM_CONFIG}" + fi # Assert the correct kind is set in case the template config changes in the future validate_file_contains "${KIND}" "${TEST_MODEL_TRITON_CONFIG}" @@ -119,10 +124,11 @@ RET=0 KINDS="KIND_MODEL KIND_GPU" TPS="1 2" INSTANCE_COUNTS="1 2" +DISTRIBUTED_EXECUTOR_BACKEND="ray" for kind in ${KINDS}; do for tp in ${TPS}; do for count in ${INSTANCE_COUNTS}; do - run_multi_gpu_test "${kind}" "${tp}" "${count}" + run_multi_gpu_test "${kind}" "${tp}" "${count}" "${DISTRIBUTED_EXECUTOR_BACKEND}" done done done diff --git a/samples/model_repository/vllm_model/1/model.json b/samples/model_repository/vllm_model/1/model.json index 6eb5e070..8a32050d 100644 --- a/samples/model_repository/vllm_model/1/model.json +++ b/samples/model_repository/vllm_model/1/model.json @@ -1,6 +1,6 @@ { "model":"facebook/opt-125m", - "disable_log_requests": "true", + "disable_log_requests": true, "gpu_memory_utilization": 0.5, - "enforce_eager": "true" + "enforce_eager": true } diff --git a/src/model.py b/src/model.py index 3fe7cd1e..3f6e23bb 100644 --- a/src/model.py +++ b/src/model.py @@ -25,8 +25,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import asyncio +import gc import json import os +import queue import threading from typing import Dict, List @@ -39,6 +41,8 @@ from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid +from utils.metrics import VllmStatLogger + _VLLM_ENGINE_ARGS_FILENAME = "model.json" _MULTI_LORA_ARGS_FILENAME = "multi_lora.json" @@ -113,13 +117,19 @@ def initialize(self, args): # Counter to keep track of ongoing request counts self.ongoing_request_count = 0 + # Starting the response thread. It allows vLLM to keep making progress while + # response sender(s) are sending responses to server frontend. + self._response_queue = queue.Queue() + self._response_thread = threading.Thread(target=self.response_loop) + self._response_thread.start() + # Starting asyncio event loop to process the received requests asynchronously. self._loop = asyncio.get_event_loop() - self._loop_thread = threading.Thread( + self._event_thread = threading.Thread( target=self.engine_loop, args=(self._loop,) ) self._shutdown_event = asyncio.Event() - self._loop_thread.start() + self._event_thread.start() def init_engine(self): # Currently, Triton needs to use decoupled policy for asynchronously @@ -147,16 +157,43 @@ def init_engine(self): self.setup_lora() # Create an AsyncLLMEngine from the config from JSON - self.llm_engine = AsyncLLMEngine.from_engine_args( - AsyncEngineArgs(**self.vllm_engine_config) - ) + aync_engine_args = AsyncEngineArgs(**self.vllm_engine_config) + self.llm_engine = AsyncLLMEngine.from_engine_args(aync_engine_args) + + # Create vLLM custom metrics + self.vllm_metrics = None + if ( + "REPORT_CUSTOM_METRICS" in self.model_config["parameters"] + and self.model_config["parameters"]["REPORT_CUSTOM_METRICS"]["string_value"] + == "yes" + and not aync_engine_args.disable_log_stats + ): + try: + labels = { + "model": self.args["model_name"], + "version": self.args["model_version"], + } + # Add vLLM custom metrics + engine_config = self.llm_engine.engine.model_config + self.vllm_metrics = VllmStatLogger( + labels, engine_config.max_model_len, self.logger + ) + self.llm_engine.add_logger("triton", self.vllm_metrics) + except pb_utils.TritonModelException as e: + if "metrics not supported" in str(e): + # Metrics are disabled at the server + self.logger.log_info("[vllm] Metrics not supported") + else: + raise e def setup_lora(self): self.enable_lora = False + # Check if `enable_lora` field is in the `model.json`, + # and if it is, read its contents, which can be string or bool. if ( "enable_lora" in self.vllm_engine_config.keys() - and self.vllm_engine_config["enable_lora"].lower() == "true" + and str(self.vllm_engine_config["enable_lora"]).lower() == "true" ): # create Triton LoRA weights repository multi_lora_args_filepath = os.path.join( @@ -273,6 +310,27 @@ def get_sampling_params_dict(self, params_json): return params_dict + def response_loop(self): + while True: + item = self._response_queue.get() + # To signal shutdown a None item will be added to the queue. + if item is None: + break + response_state, response, response_flag = item + response_sender = response_state["response_sender"] + try: + response_sender.send(response, response_flag) + # Stop checking for cancellation if the last response is generated. 
+ if not response_state["last_response_generated"]: + response_state["is_cancelled"] = response_sender.is_cancelled() + except Exception as e: + self.logger.log_error( + f"An error occurred while sending a response: {e}" + ) + finally: + if response_flag == pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL: + self.ongoing_request_count -= 1 + def create_response(self, vllm_output, prepend_input): """ Parses the output from the vLLM engine into Triton @@ -313,7 +371,13 @@ async def generate(self, request): Forwards single request to LLM engine and returns responses. """ response_sender = request.get_response_sender() + response_state = { + "response_sender": response_sender, + "is_cancelled": False, + "last_response_generated": False, # last response ready but not yet sent + } self.ongoing_request_count += 1 + decrement_ongoing_request_count = True try: request_id = random_uuid() prompt = pb_utils.get_input_tensor_by_name( @@ -368,13 +432,31 @@ async def generate(self, request): lora_local_path = self.lora_repository[lora_name] lora_request = LoRARequest(lora_id, lora_int_id, lora_local_path) - async for output in self.llm_engine.generate( - prompt, sampling_params, request_id, lora_request=lora_request - ): - if response_sender.is_cancelled(): + response_iterator = await self.llm_engine.add_request( + request_id, prompt, sampling_params, lora_request=lora_request + ) + + async for output in response_iterator: + is_cancelled = response_state["is_cancelled"] + if not stream: + is_cancelled = response_sender.is_cancelled() + if is_cancelled: self.logger.log_info("[vllm] Cancelling the request") await self.llm_engine.abort(request_id) self.logger.log_info("[vllm] Successfully cancelled the request") + if stream: + response_state["last_response_generated"] = True + response = pb_utils.InferenceResponse( + error=pb_utils.TritonError( + message="Request was cancelled", + code=pb_utils.TritonError.CANCELLED, + ) + ) + flags = pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + decrement_ongoing_request_count = False + self._response_queue.put_nowait( + (response_state, response, flags) + ) break if stream: prev_outputs_lengths = None @@ -383,15 +465,13 @@ async def generate(self, request): len(prev_output.text) for prev_output in prev_outputs.outputs ] + response = self.create_stream_response(output, prev_outputs_lengths) + flags = 0 if output.finished: - response_sender.send( - self.create_stream_response(output, prev_outputs_lengths), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, - ) - else: - response_sender.send( - self.create_stream_response(output, prev_outputs_lengths) - ) + response_state["last_response_generated"] = True + flags = pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + decrement_ongoing_request_count = False + self._response_queue.put_nowait((response_state, response, flags)) prev_outputs = output last_output = output @@ -403,7 +483,7 @@ async def generate(self, request): ) except Exception as e: - self.logger.log_info(f"[vllm] Error generating stream: {e}") + self.logger.log_error(f"[vllm] Error generating stream: {e}") error = pb_utils.TritonError(f"Error generating stream: {e}") triton_output_tensor = pb_utils.Tensor( "text_output", np.asarray(["N/A"], dtype=self.output_dtype) @@ -416,7 +496,8 @@ async def generate(self, request): ) raise e finally: - self.ongoing_request_count -= 1 + if decrement_ongoing_request_count: + self.ongoing_request_count -= 1 def verify_loras(self, request): # We will check if the requested lora exists here, if not we will send a @@ -483,6 +564,24 
@@ def finalize(self): """ self.logger.log_info("[vllm] Issuing finalize to vllm backend") self._shutdown_event.set() - if self._loop_thread is not None: - self._loop_thread.join() - self._loop_thread = None + + # Shutdown the event thread. + if self._event_thread is not None: + self._event_thread.join() + self._event_thread = None + + # Shutdown the response thread. + self._response_queue.put(None) + if self._response_thread is not None: + self._response_thread.join() + self._response_thread = None + + # Shutdown the logger thread. + if self.vllm_metrics is not None: + self.vllm_metrics.finalize() + + # When using parallel tensors, the stub process may not shutdown due to + # unreleased references, so manually run the garbage collector once. + self.logger.log_info("[vllm] Running Garbage Collector on finalize...") + gc.collect() + self.logger.log_info("[vllm] Garbage Collector on finalize... done") diff --git a/src/utils/metrics.py b/src/utils/metrics.py new file mode 100644 index 00000000..75c097dc --- /dev/null +++ b/src/utils/metrics.py @@ -0,0 +1,278 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import queue +import threading +from typing import Dict, List, Union + +import triton_python_backend_utils as pb_utils +from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase +from vllm.engine.metrics import Stats as VllmStats +from vllm.engine.metrics import SupportsMetricsInfo, build_1_2_5_buckets + + +class TritonMetrics: + def __init__(self, labels: List[str], max_model_len: int): + # Initialize metric families + # Iteration stats + self.counter_prompt_tokens_family = pb_utils.MetricFamily( + name="vllm:prompt_tokens_total", + description="Number of prefill tokens processed.", + kind=pb_utils.MetricFamily.COUNTER, + ) + self.counter_generation_tokens_family = pb_utils.MetricFamily( + name="vllm:generation_tokens_total", + description="Number of generation tokens processed.", + kind=pb_utils.MetricFamily.COUNTER, + ) + self.histogram_time_to_first_token_family = pb_utils.MetricFamily( + name="vllm:time_to_first_token_seconds", + description="Histogram of time to first token in seconds.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_time_per_output_token_family = pb_utils.MetricFamily( + name="vllm:time_per_output_token_seconds", + description="Histogram of time per output token in seconds.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + # Request stats + # Latency + self.histogram_e2e_time_request_family = pb_utils.MetricFamily( + name="vllm:e2e_request_latency_seconds", + description="Histogram of end to end request latency in seconds.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + # Metadata + self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily( + name="vllm:request_prompt_tokens", + description="Number of prefill tokens processed.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_num_generation_tokens_request_family = pb_utils.MetricFamily( + name="vllm:request_generation_tokens", + description="Number of generation tokens processed.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_best_of_request_family = pb_utils.MetricFamily( + name="vllm:request_params_best_of", + description="Histogram of the best_of request parameter.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_n_request_family = pb_utils.MetricFamily( + name="vllm:request_params_n", + description="Histogram of the n request parameter.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + # Initialize metrics + # Iteration stats + self.counter_prompt_tokens = self.counter_prompt_tokens_family.Metric( + labels=labels + ) + self.counter_generation_tokens = self.counter_generation_tokens_family.Metric( + labels=labels + ) + # Use the same bucket boundaries from vLLM sample metrics as an example. 
+ # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96 + self.histogram_time_to_first_token = ( + self.histogram_time_to_first_token_family.Metric( + labels=labels, + buckets=[ + 0.001, + 0.005, + 0.01, + 0.02, + 0.04, + 0.06, + 0.08, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + ], + ) + ) + self.histogram_time_per_output_token = ( + self.histogram_time_per_output_token_family.Metric( + labels=labels, + buckets=[ + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.15, + 0.2, + 0.3, + 0.4, + 0.5, + 0.75, + 1.0, + 2.5, + ], + ) + ) + # Request stats + # Latency + self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric( + labels=labels, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + ) + # Metadata + self.histogram_num_prompt_tokens_request = ( + self.histogram_num_prompt_tokens_request_family.Metric( + labels=labels, + buckets=build_1_2_5_buckets(max_model_len), + ) + ) + self.histogram_num_generation_tokens_request = ( + self.histogram_num_generation_tokens_request_family.Metric( + labels=labels, + buckets=build_1_2_5_buckets(max_model_len), + ) + ) + self.histogram_best_of_request = self.histogram_best_of_request_family.Metric( + labels=labels, + buckets=[1, 2, 5, 10, 20], + ) + self.histogram_n_request = self.histogram_n_request_family.Metric( + labels=labels, + buckets=[1, 2, 5, 10, 20], + ) + + +class VllmStatLogger(VllmStatLoggerBase): + """StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider.""" + + def __init__(self, labels: Dict, max_model_len: int, log_logger) -> None: + # Tracked stats over current local logging interval. + # local_interval not used here. It's for vLLM logs to stdout. + super().__init__(local_interval=0) + self.metrics = TritonMetrics(labels, max_model_len) + self.log_logger = log_logger + + # Starting the metrics thread. It allows vLLM to keep making progress + # while reporting metrics to triton metrics service. + self._logger_queue = queue.Queue() + self._logger_thread = threading.Thread(target=self.logger_loop) + self._logger_thread.start() + + def info(self, type: str, obj: SupportsMetricsInfo) -> None: + pass + + def _log_counter(self, counter, data: Union[int, float]) -> None: + """Convenience function for logging to counter. + + Args: + counter: A counter metric instance. + data: An int or float to increment the count metric. + + Returns: + None + """ + if data != 0: + self._logger_queue.put_nowait((counter, "increment", data)) + + def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None: + """Convenience function for logging list to histogram. + + Args: + histogram: A histogram metric instance. + data: A list of int or float data to observe into the histogram metric. + + Returns: + None + """ + for datum in data: + self._logger_queue.put_nowait((histogram, "observe", datum)) + + def log(self, stats: VllmStats) -> None: + """Report stats to Triton metrics server. + + Args: + stats: Created by LLMEngine for use by VllmStatLogger. + + Returns: + None + """ + # The list of vLLM metrics reporting to Triton is also documented here. 
+ # https://github.com/triton-inference-server/vllm_backend/blob/main/README.md#triton-metrics + counter_metrics = [ + (self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter), + (self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter), + ] + histogram_metrics = [ + ( + self.metrics.histogram_time_to_first_token, + stats.time_to_first_tokens_iter, + ), + ( + self.metrics.histogram_time_per_output_token, + stats.time_per_output_tokens_iter, + ), + (self.metrics.histogram_e2e_time_request, stats.time_e2e_requests), + ( + self.metrics.histogram_num_prompt_tokens_request, + stats.num_prompt_tokens_requests, + ), + ( + self.metrics.histogram_num_generation_tokens_request, + stats.num_generation_tokens_requests, + ), + (self.metrics.histogram_best_of_request, stats.best_of_requests), + (self.metrics.histogram_n_request, stats.n_requests), + ] + + for metric, data in counter_metrics: + self._log_counter(metric, data) + for metric, data in histogram_metrics: + self._log_histogram(metric, data) + + def logger_loop(self): + while True: + item = self._logger_queue.get() + # To signal shutdown a None item will be added to the queue. + if item is None: + break + metric, command, data = item + if command == "increment": + metric.increment(data) + elif command == "observe": + metric.observe(data) + else: + self.log_logger.log_error(f"Undefined command name: {command}") + + def finalize(self): + # Shutdown the logger thread. + self._logger_queue.put(None) + if self._logger_thread is not None: + self._logger_thread.join() + self._logger_thread = None
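For readers who want to consume these metrics programmatically rather than with `curl`, the following is a minimal sketch (not part of this change) of scraping the `vllm:`-prefixed metrics from Triton's metrics endpoint. It assumes a server built from this branch is running locally with the metrics service on the default port 8002 and a vLLM model configured with `REPORT_CUSTOM_METRICS` set to `"yes"`; the regular expression mirrors the one used in `vllm_metrics_test.py` above, and the helper name `scrape_vllm_metrics` is purely illustrative.

```python
# Minimal sketch: scrape Triton's Prometheus-style metrics endpoint and keep
# only the vLLM metrics. Assumes the metrics service listens on port 8002.
import re

import requests

# Same pattern as parse_vllm_metrics() in vllm_metrics_test.py: capture the
# metric name (without labels) and its numeric value.
VLLM_METRIC_PATTERN = r"^(vllm:[^ {]+)(?:{.*})? ([0-9.-]+)$"


def scrape_vllm_metrics(url: str = "http://localhost:8002/metrics") -> dict:
    """Return a {metric_name: value} dict for all vllm:* metrics."""
    response = requests.get(url, timeout=5)
    response.raise_for_status()
    metrics = {}
    for name, value in re.findall(VLLM_METRIC_PATTERN, response.text, re.MULTILINE):
        # Repeated series (e.g. histogram buckets) collapse to the last value,
        # which is sufficient for a quick sanity check.
        metrics[name] = float(value)
    return metrics


if __name__ == "__main__":
    for name, value in sorted(scrape_vllm_metrics().items()):
        print(f"{name} = {value}")
```

A dictionary view like this is what the new `vllm_metrics_test.py` asserts against; in a production deployment you would more likely point Prometheus or another scraper directly at the same endpoint.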