From 4b6bb09fcb0643241098d3a5475b02c25fbe0f14 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Tue, 26 Sep 2023 15:53:16 -0700
Subject: [PATCH 01/12] Add cancelled response status

---
 README.md      | 1 +
 src/pb_stub.cc | 4 ++++
 2 files changed, 5 insertions(+)
diff --git a/README.md b/README.md
index 517a9b64..6ad95407 100644
--- a/README.md
+++ b/README.md
@@ -502,6 +502,7 @@ Supported error codes:
 * `pb_utils.TritonError.UNAVAILABLE`
 * `pb_utils.TritonError.UNSUPPORTED`
 * `pb_utils.TritonError.ALREADY_EXISTS`
+* `pb_utils.TritonError.CANCELLED` (since 23.10)
 
 #### Decoupled mode
 
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index 37c9a5b5..cd9fe799 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -1364,6 +1364,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
       .value(
           "ALREADY_EXISTS",
           TRITONSERVER_Error_Code::TRITONSERVER_ERROR_ALREADY_EXISTS)
+      .value("CANCELLED", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_CANCELLED)
       .export_values();
   triton_error.def_property_readonly_static(
       "UNKNOWN",
@@ -1386,6 +1387,9 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
   triton_error.def_property_readonly_static(
       "ALREADY_EXISTS",
       [](py::object /* self */) { return TRITONSERVER_ERROR_ALREADY_EXISTS; });
+  triton_error.def_property_readonly_static(
+      "CANCELLED",
+      [](py::object /* self */) { return TRITONSERVER_ERROR_CANCELLED; });
   triton_error.def(
       py::init<const std::string&, TRITONSERVER_Error_Code>(),
       py::arg("message").none(false),

From af0f29b0ea6b0a08f4900fe27a50b01b03b388bd Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Wed, 27 Sep 2023 18:58:25 -0700
Subject: [PATCH 02/12] Add request cancellation

---
 CMakeLists.txt       |  2 ++
 README.md            | 32 +++++++++++++++++++
 src/infer_request.cc | 17 +++++++++++
 src/infer_request.h  |  1 +
 src/ipc_message.h    |  3 +-
 src/pb_cancel.cc     | 73 ++++++++++++++++++++++++++++++++++++++++++++
 src/pb_cancel.h      | 62 +++++++++++++++++++++++++++++++++++++
 src/pb_stub.cc       | 45 ++++++++++++++++++++++++++-
 src/pb_stub.h        |  7 +++++
 src/pb_utils.h       |  5 +++
 src/python_be.cc     | 28 +++++++++++++++++
 src/python_be.h      |  3 ++
 12 files changed, 276 insertions(+), 2 deletions(-)
 create mode 100644 src/pb_cancel.cc
 create mode 100644 src/pb_cancel.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93a7ae60..057797dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,6 +150,8 @@ set(
   src/pb_error.h
   src/pb_log.cc
   src/pb_log.h
+  src/pb_cancel.cc
+  src/pb_cancel.h
   src/pb_memory.cc
   src/pb_memory.h
   src/pb_tensor.cc
diff --git a/README.md b/README.md
index 6ad95407..aa4e5cc2 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,7 @@ any C++ code.
     - [`execute`](#execute)
       - [Default Mode](#default-mode)
       - [Error Handling](#error-handling)
+      - [Request Cancellation](#request-cancellation)
       - [Decoupled mode](#decoupled-mode)
         - [Use Cases](#use-cases)
         - [Known Issues](#known-issues)
@@ -504,6 +505,37 @@ Supported error codes:
 * `pb_utils.TritonError.ALREADY_EXISTS`
 * `pb_utils.TritonError.CANCELLED` (since 23.10)
 
+#### Request Cancellation
+
+One or more requests may be cancelled during execution, for example, cancelled
+by the user. Starting from 23.10, `request.is_cancelled()` returns up-to-date
+`True` or `False` on whether the request is cancelled. If a request is
+cancelled, the model should respond `pb_utils.TritonError.CANCELLED` in place of
+the normal output tensors on the request. For example:
+
+```python
+import triton_python_backend_utils as pb_utils
+
+class TritonPythonModel:
+    ...
+
+    def execute(self, requests):
+        responses = []
+
+        for request in requests:
+            if request.is_cancelled():
+                responses.append(pb_utils.InferenceResponse(
+                    error=pb_utils.TritonError("Message", pb_utils.TritonError.CANCELLED)))
+            else:
+                ...
+
+        return responses
+```
+
+Although checking for request cancellation is optional, it is recommended to
+check for cancellation at strategic stages of request execution so that
+execution can terminate early when the response is no longer needed.
+
 #### Decoupled mode
 
 This mode allows user to send multiple responses for a request or
diff --git a/src/infer_request.cc b/src/infer_request.cc
index 5fdae669..270c7bb3 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -400,6 +400,23 @@ InferRequest::DeleteResponseFactory()
 #endif
 
 #ifdef TRITON_PB_STUB
+bool
+InferRequest::IsCancelled()
+{
+  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  if (!stub->StubToParentServiceActive()) {
+    LOG_ERROR << "Cannot communicate with parent service";
+    return false;
+  }
+  if (request_address_ == 0) {
+    LOG_ERROR << "Request address not provided (default initialized?)";
+    return false;
+  }
+  std::unique_ptr<PbCancel> pb_cancel(new PbCancel(request_address_));
+  stub->EnqueueIsCancelled(pb_cancel);
+  return pb_cancel->IsCancelled();
+}
+
 std::shared_ptr<ResponseSender>
 InferRequest::GetResponseSender()
 {
diff --git a/src/infer_request.h b/src/infer_request.h
index 6652b2fb..a96545a3 100644
--- a/src/infer_request.h
+++ b/src/infer_request.h
@@ -107,6 +107,7 @@ class InferRequest {
 #ifdef TRITON_PB_STUB
   std::shared_ptr<InferResponse> Exec(const bool is_decoupled);
   std::shared_ptr<ResponseSender> GetResponseSender();
+  bool IsCancelled();
 #endif
 
   /// Save an Inference Request to shared memory.
diff --git a/src/ipc_message.h b/src/ipc_message.h
index 7040f2b4..14d3dc5f 100644
--- a/src/ipc_message.h
+++ b/src/ipc_message.h
@@ -62,7 +62,8 @@ typedef enum PYTHONSTUB_commandtype_enum {
   PYTHONSTUB_MetricRequestSet,
   PYTHONSTUB_LoadModelRequest,
   PYTHONSTUB_UnloadModelRequest,
-  PYTHONSTUB_ModelReadinessRequest
+  PYTHONSTUB_ModelReadinessRequest,
+  PYTHONSTUB_IsRequestCancelled
 } PYTHONSTUB_CommandType;
 
 ///
diff --git a/src/pb_cancel.cc b/src/pb_cancel.cc
new file mode 100644
index 00000000..9add280b
--- /dev/null
+++ b/src/pb_cancel.cc
@@ -0,0 +1,73 @@
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "pb_cancel.h"
+
+namespace triton { namespace backend { namespace python {
+
+void
+PbCancel::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
+{
+  cancel_shm_ = shm_pool->Construct<IsCancelledMessage>();
+  new (&(cancel_shm_.data_->mu)) bi::interprocess_mutex;
+  new (&(cancel_shm_.data_->cv)) bi::interprocess_condition;
+  cancel_shm_.data_->waiting_on_stub = false;
+  cancel_shm_.data_->request_address = request_address_;
+  cancel_shm_.data_->is_cancelled = is_cancelled_;
+}
+
+bi::managed_external_buffer::handle_t
+PbCancel::ShmHandle()
+{
+  return cancel_shm_.handle_;
+}
+
+IsCancelledMessage*
+PbCancel::ShmPayload()
+{
+  return cancel_shm_.data_.get();
+}
+
+bool
+PbCancel::IsCancelled()
+{
+  std::unique_lock<std::mutex> lk(mu_);
+  cv_.wait(lk, [this] { return updated_; });
+  return is_cancelled_;
+}
+
+void
+PbCancel::ReportIsCancelled(bool is_cancelled)
+{
+  {
+    std::lock_guard<std::mutex> lk(mu_);
+    is_cancelled_ = is_cancelled;
+    updated_ = true;
+  }
+  cv_.notify_all();
+}
+
+}}}  // namespace triton::backend::python
diff --git a/src/pb_cancel.h b/src/pb_cancel.h
new file mode 100644
index 00000000..904e98c0
--- /dev/null
+++ b/src/pb_cancel.h
@@ -0,0 +1,62 @@
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+
+#include "pb_utils.h"
+
+namespace triton { namespace backend { namespace python {
+
+class PbCancel {
+ public:
+  PbCancel(intptr_t request_address)
+      : updated_(false), request_address_(request_address), is_cancelled_(false)
+  {
+  }
+  DISALLOW_COPY_AND_ASSIGN(PbCancel);
+
+  void SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool);
+  bi::managed_external_buffer::handle_t ShmHandle();
+  IsCancelledMessage* ShmPayload();
+
+  bool IsCancelled();
+  void ReportIsCancelled(bool is_cancelled);
+
+ private:
+  AllocatedSharedMemory<IsCancelledMessage> cancel_shm_;
+
+  std::mutex mu_;
+  std::condition_variable cv_;
+  bool updated_;
+
+  intptr_t request_address_;
+  bool is_cancelled_;
+};
+
+}}};  // namespace triton::backend::python
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index cd9fe799..5a5094e0 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -945,6 +945,9 @@ Stub::ServiceStubToParentRequests()
         SendLogMessage(utils_msg_payload);
       } else if (utils_msg_payload->command_type == PYTHONSTUB_CleanupRequest) {
         SendCleanupId(utils_msg_payload);
+      } else if (
+          utils_msg_payload->command_type == PYTHONSTUB_IsRequestCancelled) {
+        SendIsCancelled(utils_msg_payload);
       } else {
         std::cerr << "Error when sending message via stub_to_parent message "
                      "buffer - unknown command\n";
@@ -1028,6 +1031,45 @@ Stub::EnqueueCleanupId(void* id)
   }
 }
 
+void
+Stub::EnqueueIsCancelled(const std::unique_ptr<PbCancel>& pb_cancel)
+{
+  std::unique_ptr<UtilsMessagePayload> utils_msg_payload =
+      std::make_unique<UtilsMessagePayload>(
+          PYTHONSTUB_IsRequestCancelled,
+          reinterpret_cast<void*>(pb_cancel.get()));
+  EnqueueUtilsMessage(std::move(utils_msg_payload));
+}
+
+void
+Stub::SendIsCancelled(std::unique_ptr<UtilsMessagePayload>& utils_msg_payload)
+{
+  PbCancel* pb_cancel =
+      reinterpret_cast<PbCancel*>(utils_msg_payload->utils_message_ptr);
+  pb_cancel->SaveToSharedMemory(shm_pool_);
+
+  IsCancelledMessage* message_payload = pb_cancel->ShmPayload();
+  std::unique_ptr<IPCMessage> ipc_message =
+      IPCMessage::Create(shm_pool_, false /* inline_response */);
+  ipc_message->Command() = utils_msg_payload->command_type;
+  ipc_message->Args() = pb_cancel->ShmHandle();
+
+  bool is_cancelled = false;
+  {
+    bi::scoped_lock<bi::interprocess_mutex> lk(message_payload->mu);
+
+    SendIPCUtilsMessage(ipc_message);
+    while (!message_payload->waiting_on_stub) {
+      message_payload->cv.wait(lk);
+    }
+
+    is_cancelled = message_payload->is_cancelled;
+    message_payload->waiting_on_stub = false;
+    message_payload->cv.notify_all();
+  }
+  pb_cancel->ReportIsCancelled(is_cancelled);
+}
+
 bool
 Stub::StubToParentServiceActive()
 {
@@ -1505,7 +1547,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
       .def(
           "requested_output_names", &InferRequest::RequestedOutputNames,
           py::return_value_policy::reference_internal)
-      .def("get_response_sender", &InferRequest::GetResponseSender);
+      .def("get_response_sender", &InferRequest::GetResponseSender)
+      .def("is_cancelled", &InferRequest::IsCancelled);
 
   py::class_<PbTensor, std::shared_ptr<PbTensor>>(module, "Tensor")
       .def(py::init(&PbTensor::FromNumpy))
diff --git a/src/pb_stub.h b/src/pb_stub.h
index 6d047d29..530dca2c 100644
--- a/src/pb_stub.h
+++ b/src/pb_stub.h
@@ -49,6 +49,7 @@
 #include "message_queue.h"
 #include "metric.h"
 #include "metric_family.h"
+#include "pb_cancel.h"
 #include "pb_log.h"
 #include "pb_response_iterator.h"
 #include "pb_utils.h"
@@ -308,6 +309,12 @@ class Stub {
   /// Add cleanup id to queue
   void EnqueueCleanupId(void* id);
 
+  /// Add request cancellation query to queue
+  void EnqueueIsCancelled(const std::unique_ptr<PbCancel>& pb_cancel);
+
+  /// Send request cancellation query to python backend
+  void SendIsCancelled(std::unique_ptr<UtilsMessagePayload>& utils_msg_payload);
+
   /// Is the stub initialized
   bool IsInitialized();
 
diff --git a/src/pb_utils.h b/src/pb_utils.h
index 1d651f3f..71a70272 100644
--- a/src/pb_utils.h
+++ b/src/pb_utils.h
@@ -182,6 +182,11 @@ struct CleanupMessage : SendMessageBase {
   void* id;
 };
 
+struct IsCancelledMessage : SendMessageBase {
+  intptr_t request_address;
+  bool is_cancelled;
+};
+
 struct CustomMetricsMessage : SendMessageBase {
   bi::managed_external_buffer::handle_t message;
   bool has_error;
diff --git a/src/python_be.cc b/src/python_be.cc
index b196cfab..b9835221 100644
--- a/src/python_be.cc
+++ b/src/python_be.cc
@@ -817,6 +817,10 @@ ModelInstanceState::StubToParentMQMonitor()
         ProcessBLSCleanupRequest(message);
         break;
       }
+      case PYTHONSTUB_IsRequestCancelled: {
+        ProcessIsRequestCancelled(message);
+        break;
+      }
       case PYTHONSTUB_MetricFamilyRequestNew:
       case PYTHONSTUB_MetricFamilyRequestDelete: {
         ProcessMetricFamilyRequest(message);
@@ -918,6 +922,30 @@ ModelInstanceState::ProcessBLSCleanupRequest(
   }
 }
 
+void
+ModelInstanceState::ProcessIsRequestCancelled(
+    const std::unique_ptr<IPCMessage>& message)
+{
+  AllocatedSharedMemory<IsCancelledMessage> message_shm =
+      Stub()->ShmPool()->Load<IsCancelledMessage>(message->Args());
+  IsCancelledMessage* message_payload =
+      reinterpret_cast<IsCancelledMessage*>(message_shm.data_.get());
+
+  {
+    bi::scoped_lock<bi::interprocess_mutex> lk{message_payload->mu};
+
+    TRITONBACKEND_Request* request = reinterpret_cast<TRITONBACKEND_Request*>(
+        message_payload->request_address);
+    TRITONBACKEND_RequestIsCancelled(request, &message_payload->is_cancelled);
+
+    message_payload->waiting_on_stub = true;
+    message_payload->cv.notify_all();
+    while (message_payload->waiting_on_stub) {
+      message_payload->cv.wait(lk);
+    }
+  }
+}
+
 template <typename T, typename MessageType>
 void
 ModelInstanceState::ProcessMessage(
diff --git a/src/python_be.h b/src/python_be.h
index 825c45de..4c8d702f 100644
--- a/src/python_be.h
+++ b/src/python_be.h
@@ -394,6 +394,9 @@ class ModelInstanceState : public BackendModelInstance {
   // Process the bls decoupled cleanup request
   void ProcessBLSCleanupRequest(const std::unique_ptr<IPCMessage>& message);
 
+  // Process request cancellation query
+  void ProcessIsRequestCancelled(const std::unique_ptr<IPCMessage>& message);
+
   // Process a message. The function 'request_handler' is invoked
   // to handle the request. T should be either 'MetricFamily', 'Metric' or
   // 'ModelLoader', and MessageType should be either 'MetricFamilyMessage',

From 3d87786a0ef7c16c963eeb6724b7cc74a04c3416 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Fri, 29 Sep 2023 11:17:30 -0700
Subject: [PATCH 03/12] Check cancellation on response factory if available

---
 src/infer_request.cc |  7 ++-----
 src/pb_cancel.cc     |  1 +
 src/pb_cancel.h      |  6 ++++--
 src/pb_utils.h       |  1 +
 src/python_be.cc     | 18 +++++++++++++++---
 5 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/infer_request.cc b/src/infer_request.cc
index 270c7bb3..1410fb46 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -408,11 +408,8 @@ InferRequest::IsCancelled()
     LOG_ERROR << "Cannot communicate with parent service";
     return false;
   }
-  if (request_address_ == 0) {
-    LOG_ERROR << "Request address not provided (default initialized?)";
-    return false;
-  }
-  std::unique_ptr<PbCancel> pb_cancel(new PbCancel(request_address_));
+  std::unique_ptr<PbCancel> pb_cancel(
+      new PbCancel(response_factory_address_, request_address_));
   stub->EnqueueIsCancelled(pb_cancel);
   return pb_cancel->IsCancelled();
 }
diff --git a/src/pb_cancel.cc b/src/pb_cancel.cc
index 9add280b..272babf0 100644
--- a/src/pb_cancel.cc
+++ b/src/pb_cancel.cc
@@ -35,6 +35,7 @@ PbCancel::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
   new (&(cancel_shm_.data_->mu)) bi::interprocess_mutex;
   new (&(cancel_shm_.data_->cv)) bi::interprocess_condition;
   cancel_shm_.data_->waiting_on_stub = false;
+  cancel_shm_.data_->response_factory_address = response_factory_address_;
   cancel_shm_.data_->request_address = request_address_;
   cancel_shm_.data_->is_cancelled = is_cancelled_;
 }
diff --git a/src/pb_cancel.h b/src/pb_cancel.h
index 904e98c0..ec4954b3 100644
--- a/src/pb_cancel.h
+++ b/src/pb_cancel.h
@@ -35,8 +35,9 @@ namespace triton { namespace backend { namespace python {
 
 class PbCancel {
  public:
-  PbCancel(intptr_t request_address)
-      : updated_(false), request_address_(request_address), is_cancelled_(false)
+  PbCancel(intptr_t response_factory_address, intptr_t request_address)
+      : updated_(false), response_factory_address_(response_factory_address),
+        request_address_(request_address), is_cancelled_(false)
   {
   }
   DISALLOW_COPY_AND_ASSIGN(PbCancel);
@@ -55,6 +56,7 @@ class PbCancel {
   std::condition_variable cv_;
   bool updated_;
 
+  intptr_t response_factory_address_;
   intptr_t request_address_;
   bool is_cancelled_;
 };
diff --git a/src/pb_utils.h b/src/pb_utils.h
index 71a70272..612c46a4 100644
--- a/src/pb_utils.h
+++ b/src/pb_utils.h
@@ -183,6 +183,7 @@ struct CleanupMessage : SendMessageBase {
 };
 
 struct IsCancelledMessage : SendMessageBase {
+  intptr_t response_factory_address;
   intptr_t request_address;
   bool is_cancelled;
 };
diff --git a/src/python_be.cc b/src/python_be.cc
index b9835221..f70a01a3 100644
--- a/src/python_be.cc
+++ b/src/python_be.cc
@@ -934,9 +934,21 @@ ModelInstanceState::ProcessIsRequestCancelled(
   {
     bi::scoped_lock<bi::interprocess_mutex> lk{message_payload->mu};
 
-    TRITONBACKEND_Request* request = reinterpret_cast<TRITONBACKEND_Request*>(
-        message_payload->request_address);
-    TRITONBACKEND_RequestIsCancelled(request, &message_payload->is_cancelled);
+    if (message_payload->response_factory_address != 0) {
+      TRITONBACKEND_ResponseFactory* response_factory =
+          reinterpret_cast<TRITONBACKEND_ResponseFactory*>(
+              message_payload->response_factory_address);
+      TRITONBACKEND_ResponseFactoryIsCancelled(
+          response_factory, &message_payload->is_cancelled);
+    } else if (message_payload->request_address != 0) {
+      TRITONBACKEND_Request* request = reinterpret_cast<TRITONBACKEND_Request*>(
+          message_payload->request_address);
+      TRITONBACKEND_RequestIsCancelled(request, &message_payload->is_cancelled);
+    } else {
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_ERROR, "Cannot determine request cancellation");
+      message_payload->is_cancelled = false;
+    }
 
     message_payload->waiting_on_stub = true;
     message_payload->cv.notify_all();

From 2319355a742f9a9f7c5d6d37b9a485330ace7a9b Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Fri, 29 Sep 2023 12:08:15 -0700
Subject: [PATCH 04/12] Remove unnecessary wrapping

---
 src/infer_request.cc | 7 +++----
 src/pb_stub.cc       | 5 ++---
 src/pb_stub.h        | 2 +-
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/infer_request.cc b/src/infer_request.cc
index 1410fb46..8e753bb1 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -408,10 +408,9 @@ InferRequest::IsCancelled()
     LOG_ERROR << "Cannot communicate with parent service";
     return false;
   }
-  std::unique_ptr<PbCancel> pb_cancel(
-      new PbCancel(response_factory_address_, request_address_));
-  stub->EnqueueIsCancelled(pb_cancel);
-  return pb_cancel->IsCancelled();
+  PbCancel pb_cancel(response_factory_address_, request_address_);
+  stub->EnqueueIsCancelled(&pb_cancel);
+  return pb_cancel.IsCancelled();
 }
 
 std::shared_ptr<ResponseSender>
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index 5a5094e0..2eb8b08c 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -1032,12 +1032,11 @@ Stub::EnqueueCleanupId(void* id)
 }
 
 void
-Stub::EnqueueIsCancelled(const std::unique_ptr<PbCancel>& pb_cancel)
+Stub::EnqueueIsCancelled(PbCancel* pb_cancel)
 {
   std::unique_ptr<UtilsMessagePayload> utils_msg_payload =
       std::make_unique<UtilsMessagePayload>(
-          PYTHONSTUB_IsRequestCancelled,
-          reinterpret_cast<void*>(pb_cancel.get()));
+          PYTHONSTUB_IsRequestCancelled, reinterpret_cast<void*>(pb_cancel));
   EnqueueUtilsMessage(std::move(utils_msg_payload));
 }
 
diff --git a/src/pb_stub.h b/src/pb_stub.h
index 530dca2c..d52196e1 100644
--- a/src/pb_stub.h
+++ b/src/pb_stub.h
@@ -310,7 +310,7 @@ class Stub {
   void EnqueueCleanupId(void* id);
 
   /// Add request cancellation query to queue
-  void EnqueueIsCancelled(const std::unique_ptr<PbCancel>& pb_cancel);
+  void EnqueueIsCancelled(PbCancel* pb_cancel);
 
   /// Send request cancellation query to python backend
   void SendIsCancelled(std::unique_ptr<UtilsMessagePayload>& utils_msg_payload);

From 9262f7c11d6d5cabd55726fa34509dd8d599acae Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Fri, 29 Sep 2023 18:01:16 -0700
Subject: [PATCH 05/12] Throw error instead of log error

---
 src/python_be.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/python_be.cc b/src/python_be.cc
index f70a01a3..7f46d473 100644
--- a/src/python_be.cc
+++ b/src/python_be.cc
@@ -945,9 +945,7 @@ ModelInstanceState::ProcessIsRequestCancelled(
           message_payload->request_address);
       TRITONBACKEND_RequestIsCancelled(request, &message_payload->is_cancelled);
     } else {
-      LOG_MESSAGE(
-          TRITONSERVER_LOG_ERROR, "Cannot determine request cancellation");
-      message_payload->is_cancelled = false;
+      throw PythonBackendException("Cannot determine request cancellation");
     }
 
     message_payload->waiting_on_stub = true;

From 913ab0a30c922d0bbd2d39e705463d427a8d9fae Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Mon, 2 Oct 2023 14:31:12 -0700
Subject: [PATCH 06/12] Add is cancelled check at response sender

---
 src/pb_stub.cc         |  3 ++-
 src/response_sender.cc | 14 ++++++++++++++
 src/response_sender.h  |  1 +
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index 2eb8b08c..87abe583 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -1585,7 +1585,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
       module, "InferenceResponseSender")
       .def(
           "send", &ResponseSender::Send, py::arg("response") = nullptr,
-          py::arg("flags") = 0);
+          py::arg("flags") = 0)
+      .def("is_cancelled", &ResponseSender::IsCancelled);
 
   py::class_<ResponseIterator, std::shared_ptr<ResponseIterator>>(
       module, "ResponseIterator")
diff --git a/src/response_sender.cc b/src/response_sender.cc
index a74459f6..9390aa15 100644
--- a/src/response_sender.cc
+++ b/src/response_sender.cc
@@ -184,4 +184,18 @@ ResponseSender::Send(
     }
   }
 }
+
+bool
+ResponseSender::IsCancelled()
+{
+  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+  if (!stub->StubToParentServiceActive()) {
+    LOG_ERROR << "Cannot communicate with parent service";
+    return false;
+  }
+  PbCancel pb_cancel(response_factory_address_, request_address_);
+  stub->EnqueueIsCancelled(&pb_cancel);
+  return pb_cancel.IsCancelled();
+}
+
 }}}  // namespace triton::backend::python
diff --git a/src/response_sender.h b/src/response_sender.h
index 114f22c0..9b3509f1 100644
--- a/src/response_sender.h
+++ b/src/response_sender.h
@@ -37,6 +37,7 @@ class ResponseSender {
       intptr_t request_address, intptr_t response_factory_address,
       std::unique_ptr<SharedMemoryManager>& shm_pool);
   void Send(std::shared_ptr<InferResponse> response, const uint32_t flags);
+  bool IsCancelled();
 
  private:
   intptr_t request_address_;

From aee842e150b7555efef28b9ff89b3663f1130a4f Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Tue, 3 Oct 2023 16:03:38 -0700
Subject: [PATCH 07/12] Enable more reuse on request cancellation and improve
 model interface

---
 CMakeLists.txt         |  4 ++--
 README.md              | 25 ++++++++++++++++---------
 src/infer_request.cc   | 23 +++++++++++++----------
 src/infer_request.h    |  3 +++
 src/pb_cancel.cc       | 26 ++++++++++++++++++++++++--
 src/pb_cancel.h        |  6 ++++--
 src/pb_stub.cc         | 13 ++++++++++---
 src/response_sender.cc | 16 +++++++---------
 src/response_sender.h  |  5 ++++-
 9 files changed, 83 insertions(+), 38 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 057797dd..3f20bbc3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,8 +150,6 @@ set(
   src/pb_error.h
   src/pb_log.cc
   src/pb_log.h
-  src/pb_cancel.cc
-  src/pb_cancel.h
   src/pb_memory.cc
   src/pb_memory.h
   src/pb_tensor.cc
@@ -210,6 +208,8 @@ set(
   src/pb_stub.cc
   src/pb_response_iterator.h
   src/pb_response_iterator.cc
+  src/pb_cancel.cc
+  src/pb_cancel.h
 )
 
 list(APPEND
diff --git a/README.md b/README.md
index aa4e5cc2..da7dcb2e 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ any C++ code.
     - [`execute`](#execute)
       - [Default Mode](#default-mode)
       - [Error Handling](#error-handling)
-      - [Request Cancellation](#request-cancellation)
+      - [Request Cancellation Handling](#request-cancellation-handling)
       - [Decoupled mode](#decoupled-mode)
         - [Use Cases](#use-cases)
         - [Known Issues](#known-issues)
@@ -505,13 +505,13 @@ Supported error codes:
 * `pb_utils.TritonError.ALREADY_EXISTS`
 * `pb_utils.TritonError.CANCELLED` (since 23.10)
 
-#### Request Cancellation
+#### Request Cancellation Handling
 
-One or more requests may be cancelled during execution, for example, cancelled
-by the user. Starting from 23.10, `request.is_cancelled()` returns up-to-date
-`True` or `False` on whether the request is cancelled. If a request is
-cancelled, the model should respond `pb_utils.TritonError.CANCELLED` in place of
-the normal output tensors on the request. For example:
+One or more requests may be cancelled by the client during execution. Starting
+from 23.10, `request.is_cancelled()` returns up-to-date `True` or `False` on
+whether the request is cancelled. If a request is cancelled, the model may
+respond with any dummy object in place of the normal output tensors on the
+request. For example:
 
 ```python
 import triton_python_backend_utils as pb_utils
@@ -524,8 +524,7 @@ class TritonPythonModel:
 
         for request in requests:
             if request.is_cancelled():
-                responses.append(pb_utils.InferenceResponse(
-                    error=pb_utils.TritonError("Message", pb_utils.TritonError.CANCELLED)))
+                responses.append(None)
             else:
                 ...
 
@@ -576,6 +575,12 @@ request. After setting errors for an pb_utils.InferenceResponse
 object, use InferenceResponseSender.send() to send response with the
 error back to the user.
 
+Starting from 23.10, request cancellation can be checked directly on the
+`InferenceResponseSender` object, for example `response_sender.is_cancelled()`,
+even after the request has gone out-of-scope. If
+`response_sender.is_cancelled()` returned `True`, the
+TRITONSERVER_RESPONSE_COMPLETE_FINAL flag is sent automatically.
+
 ##### Use Cases
 
 The decoupled mode is powerful and supports various other use cases:
@@ -598,6 +603,8 @@ full power of what can be achieved from decoupled API. Read
 [Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md)
 for more details on how to host a decoupled model.
 
+#####
+
 ##### Known Issues
 
 * Currently, decoupled Python models can not make async infer requests.
diff --git a/src/infer_request.cc b/src/infer_request.cc
index 8e753bb1..e148b062 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -71,9 +71,11 @@ InferRequest::InferRequest(
   inputs_ = inputs;
   requested_output_names_ = requested_output_names;
 #ifdef TRITON_PB_STUB
+  pb_cancel_ =
+      std::make_shared<PbCancel>(response_factory_address_, request_address_);
   response_sender_ = std::make_shared<ResponseSender>(
       request_address_, response_factory_address_,
-      Stub::GetOrCreateInstance()->SharedMemory());
+      Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_);
 #endif
 }
 
@@ -379,9 +381,11 @@ InferRequest::InferRequest(
   trace_ = infer_request_shm_ptr_->trace;
 
 #ifdef TRITON_PB_STUB
+  pb_cancel_ =
+      std::make_shared<PbCancel>(response_factory_address_, request_address_);
   response_sender_ = std::make_shared<ResponseSender>(
       request_address_, response_factory_address_,
-      Stub::GetOrCreateInstance()->SharedMemory());
+      Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_);
 #endif
 }
 
@@ -403,14 +407,13 @@ InferRequest::DeleteResponseFactory()
 bool
 InferRequest::IsCancelled()
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
-  if (!stub->StubToParentServiceActive()) {
-    LOG_ERROR << "Cannot communicate with parent service";
-    return false;
-  }
-  PbCancel pb_cancel(response_factory_address_, request_address_);
-  stub->EnqueueIsCancelled(&pb_cancel);
-  return pb_cancel.IsCancelled();
+  return pb_cancel_->IsCancelled();
+}
+
+bool
+InferRequest::IsCancelledLastResponse()
+{
+  return pb_cancel_->IsCancelledInternalFlag();
 }
 
 std::shared_ptr<ResponseSender>
diff --git a/src/infer_request.h b/src/infer_request.h
index a96545a3..926e50b9 100644
--- a/src/infer_request.h
+++ b/src/infer_request.h
@@ -34,6 +34,7 @@
 #include "pb_tensor.h"
 
 #ifdef TRITON_PB_STUB
+#include "pb_cancel.h"
 #include "response_sender.h"
 #endif
 
@@ -108,6 +109,7 @@ class InferRequest {
   std::shared_ptr<InferResponse> Exec(const bool is_decoupled);
   std::shared_ptr<ResponseSender> GetResponseSender();
   bool IsCancelled();
+  bool IsCancelledLastResponse();
 #endif
 
   /// Save an Inference Request to shared memory.
@@ -174,6 +176,7 @@ class InferRequest {
   std::unique_ptr<PbString> parameters_shm_;
 
 #ifdef TRITON_PB_STUB
+  std::shared_ptr<PbCancel> pb_cancel_;
   std::shared_ptr<ResponseSender> response_sender_;
 #endif
 };
diff --git a/src/pb_cancel.cc b/src/pb_cancel.cc
index 272babf0..4fdcda81 100644
--- a/src/pb_cancel.cc
+++ b/src/pb_cancel.cc
@@ -26,6 +26,8 @@
 
 #include "pb_cancel.h"
 
+#include "pb_stub.h"
+
 namespace triton { namespace backend { namespace python {
 
 void
@@ -52,11 +54,31 @@ PbCancel::ShmPayload()
   return cancel_shm_.data_.get();
 }
 
+bool
+PbCancel::IsCancelledInternalFlag()
+{
+  return is_cancelled_;
+}
+
 bool
 PbCancel::IsCancelled()
 {
   std::unique_lock<std::mutex> lk(mu_);
-  cv_.wait(lk, [this] { return updated_; });
+  // The cancelled flag can only move from false to true, not the other way, so
+  // it is checked on each query until cancelled and then implicitly cached.
+  if (is_cancelled_) {
+    return is_cancelled_;
+  }
+  if (!updating_) {
+    std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
+    if (!stub->StubToParentServiceActive()) {
+      LOG_ERROR << "Cannot communicate with parent service";
+      return false;
+    }
+    stub->EnqueueIsCancelled(this);
+    updating_ = true;
+  }
+  cv_.wait(lk, [this] { return !updating_; });
   return is_cancelled_;
 }
 
@@ -66,7 +88,7 @@ PbCancel::ReportIsCancelled(bool is_cancelled)
   {
     std::lock_guard<std::mutex> lk(mu_);
     is_cancelled_ = is_cancelled;
-    updated_ = true;
+    updating_ = false;
   }
   cv_.notify_all();
 }
diff --git a/src/pb_cancel.h b/src/pb_cancel.h
index ec4954b3..4eb4a8ff 100644
--- a/src/pb_cancel.h
+++ b/src/pb_cancel.h
@@ -36,7 +36,7 @@ namespace triton { namespace backend { namespace python {
 class PbCancel {
  public:
   PbCancel(intptr_t response_factory_address, intptr_t request_address)
-      : updated_(false), response_factory_address_(response_factory_address),
+      : updating_(false), response_factory_address_(response_factory_address),
         request_address_(request_address), is_cancelled_(false)
   {
   }
@@ -46,6 +46,8 @@ class PbCancel {
   bi::managed_external_buffer::handle_t ShmHandle();
   IsCancelledMessage* ShmPayload();
 
+  bool IsCancelledInternalFlag();
+
   bool IsCancelled();
   void ReportIsCancelled(bool is_cancelled);
 
@@ -54,7 +56,7 @@ class PbCancel {
 
   std::mutex mu_;
   std::condition_variable cv_;
-  bool updated_;
+  bool updating_;
 
   intptr_t response_factory_address_;
   intptr_t request_address_;
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index 87abe583..c379998d 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -771,10 +771,17 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr)
           std::to_string(response_size) + "\n";
       throw PythonBackendException(err);
     }
-    for (auto& response : responses) {
+    for (size_t i = 0; i < response_size; i++) {
+      // If the model has checked for cancellation and the request is cancelled,
+      // replace returned type with a cancelled response.
+      if (py_request_list[i].cast<InferRequest*>()->IsCancelledLastResponse()) {
+        responses[i] = std::make_shared<InferResponse>(
+            std::vector<std::shared_ptr<PbTensor>>{},
+            std::make_shared<PbError>("", TRITONSERVER_ERROR_CANCELLED));
+      }
       // Check the return type of execute function.
-      if (!py::isinstance<InferResponse>(response)) {
-        std::string str = py::str(response.get_type());
+      else if (!py::isinstance<InferResponse>(responses[i])) {
+        std::string str = py::str(responses[i].get_type());
         throw PythonBackendException(
             std::string("Expected an 'InferenceResponse' object in the execute "
                         "function return list, found type '") +
diff --git a/src/response_sender.cc b/src/response_sender.cc
index 9390aa15..bd10d271 100644
--- a/src/response_sender.cc
+++ b/src/response_sender.cc
@@ -37,10 +37,11 @@ namespace triton { namespace backend { namespace python {
 
 ResponseSender::ResponseSender(
     intptr_t request_address, intptr_t response_factory_address,
-    std::unique_ptr<SharedMemoryManager>& shm_pool)
+    std::unique_ptr<SharedMemoryManager>& shm_pool,
+    const std::shared_ptr<PbCancel>& pb_cancel)
     : request_address_(request_address),
       response_factory_address_(response_factory_address), shm_pool_(shm_pool),
-      closed_(false)
+      closed_(false), pb_cancel_(pb_cancel)
 {
 }
 
@@ -188,14 +189,11 @@ ResponseSender::Send(
 bool
 ResponseSender::IsCancelled()
 {
-  std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
-  if (!stub->StubToParentServiceActive()) {
-    LOG_ERROR << "Cannot communicate with parent service";
-    return false;
+  bool is_cancelled = pb_cancel_->IsCancelled();
+  if (is_cancelled && !closed_) {
+    Send(nullptr, TRITONSERVER_RESPONSE_COMPLETE_FINAL);
   }
-  PbCancel pb_cancel(response_factory_address_, request_address_);
-  stub->EnqueueIsCancelled(&pb_cancel);
-  return pb_cancel.IsCancelled();
+  return is_cancelled;
 }
 
 }}}  // namespace triton::backend::python
diff --git a/src/response_sender.h b/src/response_sender.h
index 9b3509f1..1d12765a 100644
--- a/src/response_sender.h
+++ b/src/response_sender.h
@@ -27,6 +27,7 @@
 #pragma once
 
 #include "infer_response.h"
+#include "pb_cancel.h"
 #include "shm_manager.h"
 
 namespace triton { namespace backend { namespace python {
@@ -35,7 +36,8 @@ class ResponseSender {
  public:
   ResponseSender(
       intptr_t request_address, intptr_t response_factory_address,
-      std::unique_ptr<SharedMemoryManager>& shm_pool);
+      std::unique_ptr<SharedMemoryManager>& shm_pool,
+      const std::shared_ptr<PbCancel>& pb_cancel);
   void Send(std::shared_ptr<InferResponse> response, const uint32_t flags);
   bool IsCancelled();
 
@@ -44,5 +46,6 @@ class ResponseSender {
   intptr_t response_factory_address_;
   std::unique_ptr<SharedMemoryManager>& shm_pool_;
   bool closed_;
+  std::shared_ptr<PbCancel> pb_cancel_;
 };
 }}}  // namespace triton::backend::python

From 3a43ef3a9d0550935ae9857f3c3d54e73987ab35 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Tue, 3 Oct 2023 18:29:21 -0700
Subject: [PATCH 08/12] Documentation wording updates

---
 README.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index da7dcb2e..8bc057c1 100644
--- a/README.md
+++ b/README.md
@@ -508,10 +508,10 @@ Supported error codes:
 #### Request Cancellation Handling
 
 One or more requests may be cancelled by the client during execution. Starting
-from 23.10, `request.is_cancelled()` returns up-to-date `True` or `False` on
-whether the request is cancelled. If a request is cancelled, the model may
-respond with any dummy object in place of the normal output tensors on the
-request. For example:
+from 23.10, `request.is_cancelled()` returns whether the request is cancelled.
+
+If a request is cancelled, the model may respond with any dummy object in place
+of the normal output tensors on the request. For example:
 
 ```python
 import triton_python_backend_utils as pb_utils
@@ -576,10 +576,9 @@ object, use InferenceResponseSender.send() to send response with the
 error back to the user.
 
 Starting from 23.10, request cancellation can be checked directly on the
-`InferenceResponseSender` object, for example `response_sender.is_cancelled()`,
-even after the request has gone out-of-scope. If
-`response_sender.is_cancelled()` returned `True`, the
-TRITONSERVER_RESPONSE_COMPLETE_FINAL flag is sent automatically.
+`InferenceResponseSender` object using `response_sender.is_cancelled()`. If
+`response_sender.is_cancelled()` returned `True`, then no further steps are
+needed to be performed on this object.
 
 ##### Use Cases
 

From e3c476e01b28cba3b8c774a755d15e82d6aee3b2 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Wed, 4 Oct 2023 11:54:18 -0700
Subject: [PATCH 09/12] Copyright year update

---
 src/response_sender.cc | 2 +-
 src/response_sender.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/response_sender.cc b/src/response_sender.cc
index bd10d271..952c6af7 100644
--- a/src/response_sender.cc
+++ b/src/response_sender.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
diff --git a/src/response_sender.h b/src/response_sender.h
index 1d12765a..fda0d5d3 100644
--- a/src/response_sender.h
+++ b/src/response_sender.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions

From a5892b6984d7b8cf467a764e05b40df72df1eed2 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 5 Oct 2023 00:08:44 -0700
Subject: [PATCH 10/12] Rollback response sender auto close on cancel

---
 README.md              | 4 +---
 src/response_sender.cc | 6 +-----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 8bc057c1..86abb10d 100644
--- a/README.md
+++ b/README.md
@@ -576,9 +576,7 @@ object, use InferenceResponseSender.send() to send response with the
 error back to the user.
 
 Starting from 23.10, request cancellation can be checked directly on the
-`InferenceResponseSender` object using `response_sender.is_cancelled()`. If
-`response_sender.is_cancelled()` returned `True`, then no further steps are
-needed to be performed on this object.
+`InferenceResponseSender` object using `response_sender.is_cancelled()`.
 
 ##### Use Cases
 
diff --git a/src/response_sender.cc b/src/response_sender.cc
index 952c6af7..1e2e9b50 100644
--- a/src/response_sender.cc
+++ b/src/response_sender.cc
@@ -189,11 +189,7 @@ ResponseSender::Send(
 bool
 ResponseSender::IsCancelled()
 {
-  bool is_cancelled = pb_cancel_->IsCancelled();
-  if (is_cancelled && !closed_) {
-    Send(nullptr, TRITONSERVER_RESPONSE_COMPLETE_FINAL);
-  }
-  return is_cancelled;
+  return pb_cancel_->IsCancelled();
 }
 
 }}}  // namespace triton::backend::python

From 3de5922c48ece05b5d55862063e67ab3a22c97f5 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 5 Oct 2023 14:30:30 -0700
Subject: [PATCH 11/12] Rollback non-decoupled any response on cancel

---
 README.md            | 11 ++++-------
 src/infer_request.cc |  6 ------
 src/infer_request.h  |  1 -
 src/pb_cancel.cc     |  6 ------
 src/pb_cancel.h      |  2 --
 src/pb_stub.cc       | 13 +++----------
 6 files changed, 7 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 86abb10d..6a45a619 100644
--- a/README.md
+++ b/README.md
@@ -508,10 +508,8 @@ Supported error codes:
 #### Request Cancellation Handling
 
 One or more requests may be cancelled by the client during execution. Starting
-from 23.10, `request.is_cancelled()` returns whether the request is cancelled.
-
-If a request is cancelled, the model may respond with any dummy object in place
-of the normal output tensors on the request. For example:
+from 23.10, `request.is_cancelled()` returns whether the request is cancelled or
+not. For example:
 
 ```python
 import triton_python_backend_utils as pb_utils
@@ -524,7 +522,8 @@ class TritonPythonModel:
 
         for request in requests:
             if request.is_cancelled():
-                responses.append(None)
+                responses.append(pb_utils.InferenceResponse(
+                    error=pb_utils.TritonError("Message", pb_utils.TritonError.CANCELLED)))
             else:
                 ...
 
@@ -600,8 +599,6 @@ full power of what can be achieved from decoupled API. Read
 [Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md)
 for more details on how to host a decoupled model.
 
-#####
-
 ##### Known Issues
 
 * Currently, decoupled Python models can not make async infer requests.
diff --git a/src/infer_request.cc b/src/infer_request.cc
index e148b062..e9d243f1 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -410,12 +410,6 @@ InferRequest::IsCancelled()
   return pb_cancel_->IsCancelled();
 }
 
-bool
-InferRequest::IsCancelledLastResponse()
-{
-  return pb_cancel_->IsCancelledInternalFlag();
-}
-
 std::shared_ptr<ResponseSender>
 InferRequest::GetResponseSender()
 {
diff --git a/src/infer_request.h b/src/infer_request.h
index 926e50b9..bc6a2acf 100644
--- a/src/infer_request.h
+++ b/src/infer_request.h
@@ -109,7 +109,6 @@ class InferRequest {
   std::shared_ptr<InferResponse> Exec(const bool is_decoupled);
   std::shared_ptr<ResponseSender> GetResponseSender();
   bool IsCancelled();
-  bool IsCancelledLastResponse();
 #endif
 
   /// Save an Inference Request to shared memory.
diff --git a/src/pb_cancel.cc b/src/pb_cancel.cc
index 4fdcda81..4c9b926b 100644
--- a/src/pb_cancel.cc
+++ b/src/pb_cancel.cc
@@ -54,12 +54,6 @@ PbCancel::ShmPayload()
   return cancel_shm_.data_.get();
 }
 
-bool
-PbCancel::IsCancelledInternalFlag()
-{
-  return is_cancelled_;
-}
-
 bool
 PbCancel::IsCancelled()
 {
diff --git a/src/pb_cancel.h b/src/pb_cancel.h
index 4eb4a8ff..3ebf07b5 100644
--- a/src/pb_cancel.h
+++ b/src/pb_cancel.h
@@ -46,8 +46,6 @@ class PbCancel {
   bi::managed_external_buffer::handle_t ShmHandle();
   IsCancelledMessage* ShmPayload();
 
-  bool IsCancelledInternalFlag();
-
   bool IsCancelled();
   void ReportIsCancelled(bool is_cancelled);
 
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index c379998d..87abe583 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -771,17 +771,10 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr)
           std::to_string(response_size) + "\n";
       throw PythonBackendException(err);
     }
-    for (size_t i = 0; i < response_size; i++) {
-      // If the model has checked for cancellation and the request is cancelled,
-      // replace returned type with a cancelled response.
-      if (py_request_list[i].cast<InferRequest*>()->IsCancelledLastResponse()) {
-        responses[i] = std::make_shared<InferResponse>(
-            std::vector<std::shared_ptr<PbTensor>>{},
-            std::make_shared<PbError>("", TRITONSERVER_ERROR_CANCELLED));
-      }
+    for (auto& response : responses) {
       // Check the return type of execute function.
-      else if (!py::isinstance<InferResponse>(responses[i])) {
-        std::string str = py::str(responses[i].get_type());
+      if (!py::isinstance<InferResponse>(response)) {
+        std::string str = py::str(response.get_type());
         throw PythonBackendException(
             std::string("Expected an 'InferenceResponse' object in the execute "
                         "function return list, found type '") +

From 09e358adab39322dbf807d00a9c993f16c375edf Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 5 Oct 2023 16:29:49 -0700
Subject: [PATCH 12/12] Decoupled final flag docs update

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6a45a619..4cb9a960 100644
--- a/README.md
+++ b/README.md
@@ -575,7 +575,9 @@ object, use InferenceResponseSender.send() to send response with the
 error back to the user.
 
 Starting from 23.10, request cancellation can be checked directly on the
-`InferenceResponseSender` object using `response_sender.is_cancelled()`.
+`InferenceResponseSender` object using `response_sender.is_cancelled()`. Sending
+the TRITONSERVER_RESPONSE_COMPLETE_FINAL flag at the end of the response is
+still needed even if the request is cancelled.
 
 ##### Use Cases