Support inplace update for PythonOp/Grad (#17687)
### Support inplace update for PythonOp/Grad

This PR is based on the #17685 branch, to make it easier to review.

With PR #17685, all PythonOp inputs/outputs are by default assumed not
to be updated in place. If an in-place update is detected at run time
(by comparing each output's data address with the data addresses of all
inputs), a clone is inserted before the tensor is set as a
PythonOp/Grad output. Results are correct in this case, but the
implicit copies introduce overhead.

This PR allows users to define an output-to-input reuse map, so ORT
knows how outputs reuse input buffers and can avoid such unnecessary
copies.
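
For illustration, here is a minimal sketch of the kind of alias
function this feature consumes (the function name is hypothetical; the
signature follows the one documented in torch_proxy.h in this diff):

```python
# Sketch only: an input-alias function returning the forward and backward
# output-to-input reuse maps for a PythonOp. Each value is a 0-based input
# index; -1 means that output does not reuse any input's buffer.
def alias_input(node_proto_str: str):
    fw_alias_map = [1, -1, -1]  # forward output 0 reuses forward input 1
    bw_alias_map = [-1, 0]      # backward output 1 reuses backward (grad) input 0
    return fw_alias_map, bw_alias_map
```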
pengwa authored Oct 11, 2023
1 parent 54b7503 commit 0e27824
Showing 21 changed files with 590 additions and 78 deletions.
3 changes: 3 additions & 0 deletions cmake/onnxruntime_optimizer.cmake
@@ -109,6 +109,9 @@ onnxruntime_add_include_to_target(onnxruntime_optimizer onnxruntime_common onnxr
target_include_directories(onnxruntime_optimizer PRIVATE ${ONNXRUNTIME_ROOT})
if (onnxruntime_ENABLE_TRAINING)
target_include_directories(onnxruntime_optimizer PRIVATE ${ORTTRAINING_ROOT})
if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
onnxruntime_add_include_to_target(onnxruntime_optimizer Python::Module)
endif()
endif()
if (onnxruntime_ENABLE_TRITON)
target_link_libraries(onnxruntime_optimizer PRIVATE nlohmann_json::nlohmann_json)
32 changes: 30 additions & 2 deletions onnxruntime/core/framework/allocation_planner.cc
@@ -320,7 +320,7 @@ class PlannerImpl {
return false;
}

const auto& alias_map = ci.kernel_def->Alias();
const auto alias_map = GetAliasMap(node, ci);
auto input_args = node.InputDefs();
for (auto& pair : alias_map) {
if (pair.second == output_arg_num) {
@@ -829,6 +829,34 @@ class PlannerImpl {
return p_provider->GetOrtDeviceByMemType(utils::IsInputOnCpu(node, &kernel_create_info, input_index) ? OrtMemTypeCPUInput : OrtMemTypeDefault);
}

std::vector<std::pair<int, int>> GetAliasMap(const Node& node, const KernelCreateInfo& kernel_create_info) {
ORT_ENFORCE(kernel_create_info.kernel_def != nullptr, "KernelDef is null for node: ", node.Name());
#ifdef ENABLE_TRAINING_TORCH_INTEROP
if ((node.OpType().compare("PythonOp") == 0 || node.OpType().compare("PythonOpGrad") == 0) &&
node.Domain() == kMSDomain) {
const auto& attrs = node.GetAttributes();
auto attr_it = attrs.find("tensor_reuse_map");
if (attr_it != attrs.end()) {
const auto& inplace_map = attr_it->second.ints();
std::vector<std::pair<int, int>> alias_map;
alias_map.reserve(inplace_map.size());
for (int i = 0; i < inplace_map.size(); ++i) {
int output_index = i;
int input_index = inplace_map[i];
if (input_index == -1) {
// skip because no reuse for this output
continue;
}
alias_map.emplace_back(std::make_pair(input_index, output_index));
}
return alias_map;
}
}
#endif

return kernel_create_info.kernel_def->Alias();
}

void GeneratePlanForWeightsHelper(const GraphViewer& graph_viewer,
const InitializedTensorSet& weights,
const KernelCreateInfoMap& kernel_create_info_map,
@@ -1084,7 +1112,7 @@ class PlannerImpl {
}

bool found_reusable = false;
const auto& alias_map = ci.kernel_def->Alias();
const auto alias_map = GetAliasMap(*node, ci);
auto input_args = node->InputDefs();
for (auto* input_arg : input_args) {
OrtValueIndex input_idx_global{};
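
As a side note, the conversion GetAliasMap performs above can be
summarized by this small Python analogue (illustrative only, not part
of the change):

```python
# Illustrative analogue of GetAliasMap: the "tensor_reuse_map" attribute is
# indexed by output and holds a 0-based input index per entry (-1 = no reuse);
# the planner consumes (input_index, output_index) pairs instead.
def to_alias_pairs(tensor_reuse_map):
    return [
        (input_index, output_index)
        for output_index, input_index in enumerate(tensor_reuse_map)
        if input_index != -1
    ]

assert to_alias_pairs([2, -1, 0]) == [(2, 0), (0, 2)]
```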
4 changes: 2 additions & 2 deletions onnxruntime/python/tools/symbolic_shape_infer.py
@@ -2385,10 +2385,10 @@ def _infer_PythonOp(self, node): # noqa: N802
output_tensor_ranks = get_attribute(node, "output_tensor_ranks")
assert output_tensor_ranks

from onnxruntime.training.ortmodule._custom_autograd_function_exporter import PythonOpShapeInferStore
from onnxruntime.capi._pybind_state import get_shape_inference_function

func_name = get_attribute(node, "func_name").decode()
shape_inferer = PythonOpShapeInferStore.get_shape_infer(func_name)
shape_inferer = get_shape_inference_function(func_name)

# Set the context output separately.
# The first output is torch.autograd.Function's context.
@@ -95,6 +95,16 @@ void OrtTorchFunctionPool::RegisterTorchAutogradFunction(
RegisterEntry(mutex_, key, backward.get(), backward_core_pool_);
}

void OrtTorchFunctionPool::RegisterShapeInferenceFunction(const std::string& key,
PyObject* obj) {
RegisterEntry(mutex_, key, obj, shape_inference_function_pool_);
}

void OrtTorchFunctionPool::RegisterInputAliasFunction(const std::string& key,
PyObject* obj) {
RegisterEntry(mutex_, key, obj, input_alias_function_pool_);
}

static void RegisterEntry(
std::mutex& mutex,
PyObject* obj,
@@ -153,6 +163,26 @@ PyObject* OrtTorchFunctionPool::GetBackwardCore(const std::string& key) {
return iter->second.get();
}

std::optional<PyObject*> OrtTorchFunctionPool::TryGettingShapeInferenceFunction(const std::string& key) {
ORT_ENFORCE(!key.empty(), "Cannot be empty string.");
std::lock_guard<std::mutex> lock(mutex_);
auto iter = shape_inference_function_pool_.find(key);
if (iter != shape_inference_function_pool_.end()) {
return iter->second.get();
}
return std::nullopt;
}

std::optional<PyObject*> OrtTorchFunctionPool::TryGettingInputAliasFunction(const std::string& key) {
ORT_ENFORCE(!key.empty(), "Cannot be empty string.");
std::lock_guard<std::mutex> lock(mutex_);
auto iter = input_alias_function_pool_.find(key);
if (iter != input_alias_function_pool_.end()) {
return iter->second.get();
}
return std::nullopt;
}

void OrtTorchFunctionPool::RegisterMiscellaneousConstInput(PyObject* obj) {
ORT_ENFORCE(obj, "Cannot register NULL reference input.");
const void* address = static_cast<const void*>(obj);
@@ -205,6 +235,8 @@ void OrtTorchFunctionPool::UnRegisterGlobalFunctions() {
void OrtTorchFunctionPool::UnRegisterModelSpecificFunctions() {
forward_core_pool_.clear();
backward_core_pool_.clear();
shape_inference_function_pool_.clear();
input_alias_function_pool_.clear();
miscellaneous_const_input_pool_.clear();
}

@@ -34,6 +34,16 @@ class OrtTorchFunctionPool final {
// 2. Caller of GetBackwardCore should not decrease the reference count of the returned object.
PyObject* GetBackwardCore(const std::string& key); // The "key" is the "name" attribute in PythonOpGrad.

// Shape inference function is used to infer output shape of a PythonOp.
void RegisterShapeInferenceFunction(const std::string& key, PyObject* obj);
// Return a borrowed reference to the stored Python function if it exists; otherwise, return std::nullopt.
std::optional<PyObject*> TryGettingShapeInferenceFunction(const std::string& key);

// Input alias function is used to infer memory reuse map of a PythonOp.
void RegisterInputAliasFunction(const std::string& key, PyObject* obj);
// Return a borrowed reference to the stored Python function if it exists; otherwise, return std::nullopt.
std::optional<PyObject*> TryGettingInputAliasFunction(const std::string& key);

// An autograd function may take inputs of "non-tensor && non-int/float && non-int/float-tuple" types.
// PythonOp execution requires those inputs to be present, otherwise kernel execution will fail.
// So during model export, we need to register those inputs with this API; each ref count is then increased by 1,
@@ -92,6 +102,9 @@ class OrtTorchFunctionPool final {

std::unordered_map<std::string, PythonObjectPtr> forward_core_pool_;
std::unordered_map<std::string, PythonObjectPtr> backward_core_pool_;
std::unordered_map<std::string, PythonObjectPtr> shape_inference_function_pool_;
std::unordered_map<std::string, PythonObjectPtr> input_alias_function_pool_;

std::unordered_map<std::string, PythonObjectPtr> miscellaneous_const_input_pool_;
std::unordered_map<int64_t, PythonObjectPtr> func_context_pool_;

50 changes: 50 additions & 0 deletions orttraining/orttraining/core/framework/torch/torch_proxy.cc
@@ -372,4 +372,54 @@ void TorchProxy::Backward(
returned_ortvalues);
}

void TorchProxy::RunInputAliasFunction(
void* input_alias_function,
const std::string& node_proto_str,
std::vector<int64_t>& fw_output_to_input_alias_map,
std::vector<int64_t>& bw_output_to_input_alias_map) {
PyObject* input_alias_func = reinterpret_cast<PyObject*>(input_alias_function);
ORT_ENFORCE(PyCallable_Check(input_alias_func), "input_alias_func is not callable.");

// All arguments created for the Python call will be destroyed along with PythonObjectPtr.
PythonObjectPtr args(Ort_PyTuple_New(1, "input_alias_func_arguments_tuple"), PythonObjectDeleter);
PyObject* node_proto_ptr_arg = PyBytes_FromStringAndSize(node_proto_str.c_str(), node_proto_str.size());
Ort_PyTuple_SetItem_NoIncref(args.get(), 0, node_proto_ptr_arg, "node_proto_ptr_arg");

PythonObjectPtr result_ptr(PyObject_CallObject(input_alias_func, args.get()), PythonObjectDeleter);
if (PyErr_Occurred()) {
PyErr_Print();
ORT_THROW("Python function execution fails with the above information.");
}

bool is_tuple = PyTuple_Check(result_ptr.get());
bool is_list = PyList_Check(result_ptr.get());
ORT_ENFORCE(is_tuple || is_list, "Python function must return a tuple or a list. is_tuple: ",
is_tuple, ", is_list: ", is_list);
Py_ssize_t ret_tuple_size =
is_tuple ? PyTuple_Size(result_ptr.get()) : PyList_Size(result_ptr.get());
ORT_ENFORCE(ret_tuple_size == 2, "Input alias function must return a tuple/list of size 2.");

for (Py_ssize_t tuple_index = 0; tuple_index < ret_tuple_size; ++tuple_index) {
PyObject* alias_map = is_tuple ? PyTuple_GetItem(result_ptr.get(), tuple_index)
: PyList_GetItem(result_ptr.get(), tuple_index);

std::vector<int64_t>& output_to_input_alias_map =
tuple_index == 0 ? fw_output_to_input_alias_map : bw_output_to_input_alias_map;

bool is_elem_tuple = PyTuple_Check(alias_map);
bool is_elem_list = PyList_Check(alias_map);

ORT_ENFORCE(is_elem_tuple || is_elem_list, "Input alias map must be a tuple or a list. is_elem_list: ",
is_elem_list, ", is_elem_tuple: ", is_elem_tuple);
Py_ssize_t output_count = is_elem_tuple ? PyTuple_Size(alias_map) : PyList_Size(alias_map);
for (Py_ssize_t output_index = 0; output_index < output_count; ++output_index) {
PyObject* input_index =
is_elem_tuple ? PyTuple_GetItem(alias_map, output_index) : PyList_GetItem(alias_map, output_index);
ORT_ENFORCE(PyLong_Check(input_index), "Alias input index must be an integer.");
int64_t alias_index_int = PyLong_AsLongLong(input_index);
output_to_input_alias_map.push_back(alias_index_int);
}
}
}

} // namespace onnxruntime::language_interop_ops::torch
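
For reference, the parsing loop above accepts tuples and lists
interchangeably at both levels; a sketch of two equivalent return
shapes (function names are illustrative):

```python
# Both return shapes satisfy RunInputAliasFunction's checks: a 2-element
# tuple or list whose elements are tuples or lists of integers
# (forward map first, backward map second).
def alias_input_as_lists(node_proto_str: str):
    return [[1, -1, -1], [-1, 0]]

def alias_input_as_tuples(node_proto_str: str):
    return ((1, -1, -1), (-1, 0))
```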
31 changes: 31 additions & 0 deletions orttraining/orttraining/core/framework/torch/torch_proxy.h
@@ -2,8 +2,11 @@
// Licensed under the MIT License.

#pragma once

#include <mutex>
#include <optional>
#include <string>
#include <vector>
#include "orttraining/core/framework/torch/python_common.h"

#ifndef SHARED_PROVIDER
@@ -61,6 +64,34 @@ class TorchProxy {
const std::string& invoke_id,
std::vector<OrtValue>& return_args);

/**
* @brief Run the given function to get the output-to-input reuse maps.
*
* @param input_alias_func Python function to run.
* The function takes a serialized PythonOp NodeProto string as input and returns a tuple of two lists.
* The signature of the function should be:
* def alias_input(node_proto_str: str):
* fw_alias_map = [1, -1, -1]
* bw_alias_map = [-1, 0]
* return fw_alias_map, bw_alias_map
* @param node_proto_str The serialized PythonOp NodeProto string.
* @param fw_output_to_input_alias_map Output parameter; returns the output-to-input alias map for the forward pass.
* For example, if the inputs of the torch.autograd.Function are [non_tensor_a, tensor_b],
* the outputs are [tensor_x, tensor_y, tensor_z], and the alias map is [1, -1, -1], then
* tensor_x reuses the input tensor_b, while tensor_y and tensor_z do not reuse any input.
* Each alias map value is a 0-based input index; -1 means the output does not reuse any input.
* @param bw_output_to_input_alias_map Output parameter; returns the output-to-input alias map for the backward pass.
* For example, if the inputs of the backward function are [tensor_x_grad, None, None]
* and the outputs are [None, tensor_b_grad], then the alias map [-1, 0] means
* tensor_b_grad reuses the input tensor_x_grad.
* Each alias map value is a 0-based grad input index; -1 means the output does not reuse any input.
*/
void RunInputAliasFunction(
void* input_alias_func,
const std::string& node_proto_str,
std::vector<int64_t>& fw_output_to_input_alias_map,
std::vector<int64_t>& bw_output_to_input_alias_map);

private:
TorchProxy(){};
~TorchProxy(){};
8 changes: 8 additions & 0 deletions orttraining/orttraining/core/graph/gradient_builder.cc
@@ -1848,6 +1848,14 @@ IMPLEMENT_GRADIENT_BUILDER(GetPythonOpGradient) {
"PythonOpGrad requiring gradient output count mismatch.");
attrs.push_back(MakeAttribute("output_tensor_requires_grads", bw_tensor_output_requires_grads));

// Copy the bw_tensor_reuse_map attribute from PythonOp to PythonOpGrad (as tensor_reuse_map) if it is present.
auto attr_it = src_attrs.find("bw_tensor_reuse_map");
if (attr_it != src_attrs.end()) {
std::vector<int64_t> tensor_output_to_tensor_input_reuse_map(attr_it->second.ints().begin(),
attr_it->second.ints().end());
attrs.push_back(MakeAttribute("tensor_reuse_map", tensor_output_to_tensor_input_reuse_map));
}

if (src_attrs.find("comment") != src_attrs.end() && utils::HasString(src_attrs.at("comment"))) {
attrs.push_back(MakeAttribute("comment", src_attrs.at("comment").s()));
}
11 changes: 11 additions & 0 deletions orttraining/orttraining/core/graph/training_op_defs.cc
@@ -3918,6 +3918,17 @@ Return true if all elements are true and false otherwise.
"- the output 2 reuses the input 0.",
AttributeProto::INTS,
false)
.Attr(
"bw_tensor_reuse_map",
"Used for backward op only."
"A int array indicating whether output at each index is reusing specific input or now."
"If the given index is -1, it means the output is not reusing any input."
"For example, there are 3 inputs (including ctx) and 2 outputs, tensor_reuse_map = [2, 1] means"
"- the output 0 reuses the input 2."
"- the output 1 reuses the input 1."
"Be noted: the input 0 is ctx.",
AttributeProto::INTS,
false)
.Attr(
"training_mode",
"Indicate if the model is exported in training_mode, by default, False.",
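
A short sketch of how the ctx offset in bw_tensor_reuse_map plays out
(input names are hypothetical):

```python
# Hypothetical PythonOpGrad with 3 inputs, where input 0 is the autograd ctx,
# and 2 outputs, matching the attribute description above.
bw_inputs = ["ctx", "grad_y", "grad_z"]
bw_tensor_reuse_map = [2, 1]  # output 0 reuses input 2; output 1 reuses input 1

for out_idx, in_idx in enumerate(bw_tensor_reuse_map):
    if in_idx != -1:
        print(f"backward output {out_idx} reuses the buffer of {bw_inputs[in_idx]}")
```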
@@ -68,6 +68,9 @@
#include "core/optimizer/pre_shape_node_elimination.h"
#include "orttraining/core/optimizer/compute_optimizer/padding_elimination.h"
#include "orttraining/core/optimizer/compute_optimizer/sceloss_compute_optimization.h"
#ifdef ENABLE_TRAINING_TORCH_INTEROP
#include "orttraining/core/optimizer/pythonop_rewriter.h"
#endif

namespace onnxruntime {
namespace training {
@@ -106,6 +109,9 @@ std::vector<std::unique_ptr<GraphTransformer>> GeneratePreTrainingTransformers(
ORT_THROW_IF_ERROR(rule_transformer->Register(std::make_unique<InsertSoftmaxCrossEntropyLossOutput>()));
ORT_THROW_IF_ERROR(rule_transformer->Register(std::make_unique<LSTMReplacement>()));
ORT_THROW_IF_ERROR(rule_transformer->Register(std::make_unique<GRUReplacement>()));
#ifdef ENABLE_TRAINING_TORCH_INTEROP
ORT_THROW_IF_ERROR(rule_transformer->Register(std::make_unique<PythonOpRewriter>()));
#endif

// Put ConstantSharing before CommonSubexpressionElimination by intention as it can create more opportunities for
// CSE. For example, if A and B nodes both do Add operation with a same value but different initializers, by