From 8894077d07af28afaf28eb63057cad8e1f350d52 Mon Sep 17 00:00:00 2001
From: Ashwini Khade
Date: Tue, 3 Oct 2023 20:38:12 -0700
Subject: [PATCH] plus pybind compile fixes

---
 .../python/orttraining_pybind_state.cc        | 426 ++++++++++++++++++
 ...orttraining-py-packaging-pipeline-cuda.yml |  13 -
 2 files changed, 426 insertions(+), 13 deletions(-)

diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc
index 5ada1cf824548..ff86ece910dfd 100644
--- a/orttraining/orttraining/python/orttraining_pybind_state.cc
+++ b/orttraining/orttraining/python/orttraining_pybind_state.cc
@@ -18,6 +18,7 @@
 #include "core/session/environment.h"
 #include "core/session/custom_ops.h"
 #include "core/dlpack/dlpack_converter.h"
+#include "orttraining/core/session/training_session.h"
 #include "orttraining/core/agent/training_agent.h"
 #include "orttraining/core/graph/gradient_config.h"
 #include "orttraining/core/graph/optimizer_config.h"
@@ -108,6 +109,66 @@ GetExecutionProvidersForTrainingApis(OrtDevice device) {
 }  // namespace
 #endif

+struct TrainingParameters {
+  std::string loss_output_name;
+  std::unordered_set<std::string> weights_to_train;
+  std::unordered_set<std::string> weights_not_to_train;
+
+  // onnxruntime::training::TrainingSession::ImmutableWeights immutable_weights;
+
+  // optimizer
+  std::string training_optimizer_name;
+  std::string lr_params_feed_name = "Learning_Rate";
+  std::unordered_map<std::string, std::unordered_map<std::string, float>> optimizer_attributes_map;
+  std::unordered_map<std::string, std::unordered_map<std::string, int64_t>> optimizer_int_attributes_map;
+  // onnxruntime::training::TrainingSession::OptimizerState optimizer_initial_state;
+  std::unordered_map<std::string, std::vector<int>> sliced_schema;
+  std::unordered_map<std::string, int> sliced_axes;
+  std::vector<std::string> sliced_tensor_names;
+  bool use_fp16_moments = false;
+
+  bool use_mixed_precision = false;
+  bool allreduce_post_accumulation = false;
+  float loss_scale = 0.0f;
+  int world_rank = 0;
+  int world_size = 1;
+  int local_rank = 0;
+  int local_size = 1;
+  int gradient_accumulation_steps = 1;
+  int data_parallel_size = 1;
+  int horizontal_parallel_size = 1;
+  int pipeline_parallel_size = 1;
+  int num_pipeline_micro_batches = 1;
+  int deepspeed_zero_stage = 0;
+  bool enable_grad_norm_clip = true;
+  bool set_gradients_as_graph_outputs = false;
+  bool use_memory_efficient_gradient = false;
+
+  std::string pipeline_cut_info_string = {};
+
+  // recompute
+  bool attn_dropout_recompute = false;
+  bool gelu_recompute = false;
+  bool transformer_layer_recompute = false;
+  int number_recompute_layers = 0;
+  bool enable_adasum = false;
+
+  // transformation
+  int propagate_cast_ops_level = 1;
+  std::vector<std::string> propagate_cast_ops_allow;
+  GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy propagate_cast_ops_strategy =
+      GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy::FloodFill;
+
+  // graph dumping
+  std::string model_after_graph_transforms_path;
+  std::string model_with_gradient_graph_path;
+  std::string model_with_training_graph_path;
+};
+
+struct TrainingConfigurationResult {
+  optional<std::string> loss_scale_input_name;
+};
+
 #ifdef ENABLE_TRAINING_APIS
 // Thin wrapper over internal C++ Optimizer
 struct PyOptimizer {
@@ -146,6 +207,185 @@ struct PyGradientGraphBuilderContext {
         local_registries_(local_registries) {}
 };

+// // TODO: this method does not handle parallel optimization.
+// TrainingConfigurationResult ConfigureSessionForTraining( +// training::PipelineTrainingSession* sess, TrainingParameters& parameters) { +// // TODO tix, refactor the mpi related code to populate all fields correctly by default. +// ORT_ENFORCE(parameters.data_parallel_size <= parameters.world_size, "data_parallel_size: ", parameters.data_parallel_size, ", world_size: ", parameters.world_size); +// ORT_ENFORCE(parameters.horizontal_parallel_size <= parameters.world_size, "horizontal_parallel_size: ", parameters.horizontal_parallel_size, ", world_size: ", parameters.world_size); +// ORT_ENFORCE(parameters.pipeline_parallel_size <= parameters.world_size, "pipeline_parallel_size: ", parameters.pipeline_parallel_size, ", world_size: ", parameters.world_size); + +// // When DxHxP != the total number of ranks, we try adjusting D so that DxHxP == the total number of ranks. +// if (parameters.world_size != parameters.data_parallel_size * parameters.horizontal_parallel_size * parameters.pipeline_parallel_size) { +// ORT_ENFORCE(parameters.world_size % parameters.horizontal_parallel_size * parameters.pipeline_parallel_size == 0, +// "D, H, P sizes are incorrect. To enable automatic correction, total number of ranks must be a divisible by HxP."); + +// const auto new_data_parallel_size = parameters.world_size / (parameters.horizontal_parallel_size * parameters.pipeline_parallel_size); +// parameters.data_parallel_size = new_data_parallel_size; + +// const std::string msg = "Cannot distribute " + std::to_string(parameters.world_size) + " ranks for distributed computation with D=" + std::to_string(parameters.data_parallel_size) + +// ", H=" + std::to_string(parameters.horizontal_parallel_size) + ", P=" + std::to_string(parameters.pipeline_parallel_size) + ", so D is automatically changed to " + std::to_string(new_data_parallel_size); +// LOGS(*(sess->GetLogger()), WARNING) << msg; +// } + +// training::PipelineTrainingSession::TrainingConfiguration config{}; +// config.weight_names_to_train = parameters.weights_to_train; +// config.weight_names_to_not_train = parameters.weights_not_to_train; +// config.immutable_weights = parameters.immutable_weights; +// config.gradient_accumulation_steps = parameters.gradient_accumulation_steps; + +// config.distributed_config.world_rank = parameters.world_rank; +// config.distributed_config.world_size = parameters.world_size; +// config.distributed_config.local_rank = parameters.local_rank; +// config.distributed_config.local_size = parameters.local_size; +// config.distributed_config.data_parallel_size = parameters.data_parallel_size; +// config.distributed_config.horizontal_parallel_size = parameters.horizontal_parallel_size; +// config.distributed_config.pipeline_parallel_size = parameters.pipeline_parallel_size; +// config.distributed_config.num_pipeline_micro_batches = parameters.num_pipeline_micro_batches; +// config.distributed_config.sliced_schema = parameters.sliced_schema; +// config.distributed_config.sliced_axes = parameters.sliced_axes; +// config.distributed_config.sliced_tensor_names = parameters.sliced_tensor_names; + +// if (parameters.use_mixed_precision) { +// training::PipelineTrainingSession::TrainingConfiguration::MixedPrecisionConfiguration mp{}; +// mp.use_mixed_precision_initializers = true; + +// config.mixed_precision_config = mp; +// } + +// if (config.distributed_config.pipeline_parallel_size > 1) { +// training::PipelineTrainingSession::TrainingConfiguration::PipelineConfiguration pipeline_config; + +// // Currently don't support 
auto-partition. User needs to pass in cut information for pipeline +// pipeline_config.do_partition = true; +// assert(!parameters.pipeline_cut_info_string.empty()); + +// auto process_with_delimiter = [](std::string& input_str, const std::string& delimiter) { +// std::vector result; +// size_t pos = 0; +// while ((pos = input_str.find(delimiter)) != std::string::npos) { +// std::string token = input_str.substr(0, pos); +// result.emplace_back(token); +// input_str.erase(0, pos + delimiter.length()); +// } +// // push the last split of substring into result. +// result.emplace_back(input_str); +// return result; +// }; + +// auto process_cut_info = [&](std::string& cut_info_string) { +// std::vector cut_list; +// const std::string group_delimiter = ","; +// const std::string edge_delimiter = ":"; +// const std::string consumer_delimiter = "/"; +// const std::string producer_consumer_delimiter = "-"; + +// auto cut_info_groups = process_with_delimiter(cut_info_string, group_delimiter); +// for (auto& cut_info_group : cut_info_groups) { +// PipelineTrainingSession::TrainingConfiguration::CutInfo cut_info; +// auto cut_edges = process_with_delimiter(cut_info_group, edge_delimiter); +// for (auto& cut_edge : cut_edges) { +// auto process_edge = process_with_delimiter(cut_edge, producer_consumer_delimiter); +// if (process_edge.size() == 1) { +// PipelineTrainingSession::TrainingConfiguration::CutEdge edge{process_edge[0]}; +// cut_info.emplace_back(edge); +// } else { +// ORT_ENFORCE(process_edge.size() == 2); +// auto consumer_list = process_with_delimiter(process_edge[1], consumer_delimiter); + +// PipelineTrainingSession::TrainingConfiguration::CutEdge edge{process_edge[0], consumer_list}; +// cut_info.emplace_back(edge); +// } +// } +// cut_list.emplace_back(cut_info); +// } +// return cut_list; +// }; + +// pipeline_config.cut_list = process_cut_info(parameters.pipeline_cut_info_string); +// config.pipeline_config = pipeline_config; +// } +// config.loss_name = parameters.loss_output_name; + +// if (!parameters.training_optimizer_name.empty()) { +// training::PipelineTrainingSession::TrainingConfiguration::OptimizerConfiguration opt{}; +// opt.name = parameters.training_optimizer_name; +// opt.learning_rate_input_name = parameters.lr_params_feed_name; +// opt.weight_attributes_generator = [¶meters](const std::string& weight_name) { +// const auto it = parameters.optimizer_attributes_map.find(weight_name); +// ORT_ENFORCE( +// it != parameters.optimizer_attributes_map.end(), +// "Failed to find attribute map for weight ", weight_name); +// return it->second; +// }; +// opt.weight_int_attributes_generator = [¶meters](const std::string& weight_name) { +// const auto it = parameters.optimizer_int_attributes_map.find(weight_name); +// ORT_ENFORCE( +// it != parameters.optimizer_int_attributes_map.end(), +// "Failed to find int attribute map for weight ", weight_name); +// return it->second; +// }; +// opt.use_mixed_precision_moments = parameters.use_fp16_moments; +// opt.do_all_reduce_in_mixed_precision_type = true; +// // TODO: this mapping is temporary. +// // For now, nccl allreduce kernel only implements for allreduce_post_accumulation +// // hovorod allreduce kernel only implements for not allreduce_post_accumulation. +// // eventually we will have one all reduce kernel and let opt to have +// // an allreduce_post_accumulation option and remove the use_nccl option. 
+// opt.use_nccl = parameters.allreduce_post_accumulation;
+// opt.deepspeed_zero = onnxruntime::training::ZeROConfig(parameters.deepspeed_zero_stage);
+// opt.enable_grad_norm_clip = parameters.enable_grad_norm_clip;
+
+// // TODO reduction types
+// if (parameters.enable_adasum) {
+// #ifdef USE_CUDA
+// opt.adasum_reduction_type = training::AdasumReductionType::GpuHierarchicalReduction;
+// #else
+// opt.adasum_reduction_type = training::AdasumReductionType::CpuReduction;
+// #endif
+// }
+
+// config.optimizer_config = opt;
+// }
+
+// if (!parameters.optimizer_initial_state.empty()) {
+// config.init_optimizer_states = parameters.optimizer_initial_state;
+// }
+
+// config.gradient_graph_config.use_memory_efficient_gradient = parameters.use_memory_efficient_gradient;
+// config.gradient_graph_config.set_gradients_as_graph_outputs = parameters.set_gradients_as_graph_outputs;
+
+// config.graph_transformer_config.attn_dropout_recompute = parameters.attn_dropout_recompute;
+// config.graph_transformer_config.gelu_recompute = parameters.gelu_recompute;
+// config.graph_transformer_config.transformer_layer_recompute = parameters.transformer_layer_recompute;
+// config.graph_transformer_config.number_recompute_layers = parameters.number_recompute_layers;
+// config.graph_transformer_config.propagate_cast_ops_config.strategy = parameters.propagate_cast_ops_strategy;
+// config.graph_transformer_config.propagate_cast_ops_config.level = parameters.propagate_cast_ops_level;
+// config.graph_transformer_config.propagate_cast_ops_config.allow = parameters.propagate_cast_ops_allow;
+
+// if (!parameters.model_after_graph_transforms_path.empty()) {
+// config.model_after_graph_transforms_path = ToPathString(parameters.model_after_graph_transforms_path);
+// }
+// if (!parameters.model_with_gradient_graph_path.empty()) {
+// config.model_with_gradient_graph_path = ToPathString(parameters.model_with_gradient_graph_path);
+// }
+// if (!parameters.model_with_training_graph_path.empty()) {
+// config.model_with_training_graph_path = ToPathString(parameters.model_with_training_graph_path);
+// }
+
+// training::PipelineTrainingSession::TrainingConfigurationResult config_result{};
+
+// OrtPybindThrowIfError(sess->ConfigureForTraining(config, config_result));
+
+// TrainingConfigurationResult python_config_result{};
+// if (config_result.mixed_precision_config_result.has_value()) {
+// const auto& mp_config_result = config_result.mixed_precision_config_result.value();
+// python_config_result.loss_scale_input_name = mp_config_result.loss_scale_input_name;
+// }
+
+// return python_config_result;
+// }
+
 #if defined(USE_MPI)
 void CopyMPIContextToTrainingParameters(TrainingParameters& parameters, const logging::Logger* logger) {
   LOGS(*logger, INFO) << "MPIContext::GetInstance().GetWorldRank(): " << MPIContext::GetInstance().GetWorldRank();
@@ -209,6 +449,68 @@ void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn
       ORT_ENFORCE(num_entries_erased == 1, "NodeArg not found in cache: ", node_arg_name);
     });

+  py::class_<TrainingParameters> parameters(m, "TrainingParameters", R"pbdoc(Configuration information for training.)pbdoc");
+  parameters.def(py::init())
+      .def_readwrite("loss_output_name", &TrainingParameters::loss_output_name)
+      //.def_readwrite("immutable_weights", &TrainingParameters::immutable_weights)
+      .def_readwrite("weights_not_to_train", &TrainingParameters::weights_not_to_train)
+      .def_readwrite("weights_to_train", &TrainingParameters::weights_to_train)
.def_readwrite("sliced_tensor_names", &TrainingParameters::sliced_tensor_names) + .def_readwrite("training_optimizer_name", &TrainingParameters::training_optimizer_name) + .def_readwrite("lr_params_feed_name", &TrainingParameters::lr_params_feed_name) + .def_readwrite("optimizer_attributes_map", &TrainingParameters::optimizer_attributes_map) + .def_readwrite("optimizer_int_attributes_map", &TrainingParameters::optimizer_int_attributes_map) + .def_readwrite("sliced_schema", &TrainingParameters::sliced_schema) + .def_readwrite("sliced_axes", &TrainingParameters::sliced_axes) + .def_readwrite("use_fp16_moments", &TrainingParameters::use_fp16_moments) + .def_readwrite("use_mixed_precision", &TrainingParameters::use_mixed_precision) + .def_readwrite("allreduce_post_accumulation", &TrainingParameters::allreduce_post_accumulation) + .def_readwrite("loss_scale", &TrainingParameters::loss_scale) + .def_readwrite("world_rank", &TrainingParameters::world_rank) + .def_readwrite("world_size", &TrainingParameters::world_size) + .def_readwrite("data_parallel_size", &TrainingParameters::data_parallel_size) + .def_readwrite("horizontal_parallel_size", &TrainingParameters::horizontal_parallel_size) + .def_readwrite("pipeline_parallel_size", &TrainingParameters::pipeline_parallel_size) + .def_readwrite("pipeline_cut_info_string", &TrainingParameters::pipeline_cut_info_string) + .def_readwrite("num_pipeline_micro_batches", &TrainingParameters::num_pipeline_micro_batches) + .def_readwrite("gradient_accumulation_steps", &TrainingParameters::gradient_accumulation_steps) + .def_readwrite("deepspeed_zero_stage", &TrainingParameters::deepspeed_zero_stage) + .def_readwrite("enable_grad_norm_clip", &TrainingParameters::enable_grad_norm_clip) + .def_readwrite("set_gradients_as_graph_outputs", &TrainingParameters::set_gradients_as_graph_outputs) + .def_readwrite("use_memory_efficient_gradient", &TrainingParameters::use_memory_efficient_gradient) + .def_readwrite("attn_dropout_recompute", &TrainingParameters::attn_dropout_recompute) + .def_readwrite("gelu_recompute", &TrainingParameters::gelu_recompute) + .def_readwrite("transformer_layer_recompute", &TrainingParameters::transformer_layer_recompute) + .def_readwrite("number_recompute_layers", &TrainingParameters::number_recompute_layers) + .def_readwrite("data_parallel_size", &TrainingParameters::data_parallel_size) + .def_readwrite("horizontal_parallel_size", &TrainingParameters::horizontal_parallel_size) + .def_readwrite("pipeline_parallel_size", &TrainingParameters::pipeline_parallel_size) + // .def("set_optimizer_initial_state", + // [](TrainingParameters& parameters, const std::unordered_map>& py_state) -> void { + // onnxruntime::training::TrainingSession::OptimizerState optim_state; + // for (const auto& weight_it : py_state) { + // auto state = weight_it.second; + // NameMLValMap state_tensors; + // for (auto& initializer : state) { + // OrtValue ml_value; + + // // InputDeflist is null because parameters havent been tied to session yet + // // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) + // CreateGenericMLValue(nullptr, GetAllocator(), "", initializer.second, &ml_value, true); + // ThrowIfPyErrOccured(); + // state_tensors.emplace(initializer.first, ml_value); + // } + // optim_state.emplace(weight_it.first, state_tensors); + // } + // parameters.optimizer_initial_state = optim_state; + // }) + .def_readwrite("model_after_graph_transforms_path", &TrainingParameters::model_after_graph_transforms_path) 
+ .def_readwrite("model_with_gradient_graph_path", &TrainingParameters::model_with_gradient_graph_path) + .def_readwrite("model_with_training_graph_path", &TrainingParameters::model_with_training_graph_path) + .def_readwrite("enable_adasum", &TrainingParameters::enable_adasum) + .def_readwrite("propagate_cast_ops_level", &TrainingParameters::propagate_cast_ops_level) + .def_readwrite("propagate_cast_ops_allow", &TrainingParameters::propagate_cast_ops_allow); + #if defined(USE_MPI) m.def("get_mpi_context_local_rank", []() -> int { return MPIContext::GetInstance().GetLocalRank(); }); m.def("get_mpi_context_local_size", []() -> int { return MPIContext::GetInstance().GetLocalSize(); }); @@ -278,6 +580,130 @@ void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn }); #endif + // py::class_ config_result(m, "TrainingConfigurationResult", "pbdoc(Configuration result for training.)pbdoc"); + // config_result.def(py::init()) + // .def_property_readonly("loss_scale_input_name", [](const TrainingConfigurationResult& result) -> py::object { + // if (result.loss_scale_input_name.has_value()) { + // return py::str{result.loss_scale_input_name.value()}; + // } + // return py::none(); + // }); + + // // Thin wrapper over internal C++ InferenceSession to accommodate custom op library management for the Python user + // struct PyTrainingSession : public PyInferenceSession { + // PyTrainingSession(std::shared_ptr env, const PySessionOptions& so) + // : PyInferenceSession(env, std::make_unique(so.value, *env)) { + // } + // ~PyTrainingSession() = default; + // }; + + // py::class_ training_session(m, "TrainingSession"); + // training_session + // .def(py::init([](const PySessionOptions& so) { + // auto& training_env = GetTrainingEnv(); + // return std::make_unique(training_env.GetORTEnv(), so); + // })) + // .def(py::init([]() { + // auto& training_env = GetTrainingEnv(); + // return std::make_unique(training_env.GetORTEnv(), GetDefaultCPUSessionOptions()); + // })) + // .def("finalize", [](py::object) { + // #if defined(USE_MPI) + // #ifdef _WIN32 + // // https://docs.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-best-practices + // // shutdown_mpi() is not called within MPIContext destructor because of DllMain's restriction + // // call shutdown_mpi() here instead. 
+ // MPIContext::shutdown_mpi(); + // #endif + // #endif + // }) + // .def("load_model", [ep_registration_fn](PyTrainingSession* sess, const std::string& path, TrainingParameters& parameters, const std::vector& provider_types, const ProviderOptionsVector& provider_options) { + // OrtPybindThrowIfError(sess->GetSessionHandle()->Load(path)); + + // #if defined(USE_MPI) + // bool use_nccl = parameters.allreduce_post_accumulation; + // if (!use_nccl && parameters.world_size > 1) + // CopyMPIContextToTrainingParameters(parameters, sess->GetSessionHandle()->GetLogger()); + // #endif + // const auto config_result = ConfigureSessionForTraining(static_cast(sess->GetSessionHandle()), parameters); + + // ProviderOptionsVector merged_options; + // ResolveExtraProviderOptions(provider_types, provider_options, merged_options); + + // InitializeSession(sess->GetSessionHandle(), ep_registration_fn, provider_types, merged_options); + + // return config_result; + // }) + // .def("read_bytes", [ep_registration_fn](PyTrainingSession* sess, const py::bytes& serialized_model, TrainingParameters& parameters, const std::vector& provider_types, const ProviderOptionsVector& provider_options) { + // std::istringstream buffer(serialized_model); + // OrtPybindThrowIfError(sess->GetSessionHandle()->Load(buffer)); + + // #if defined(USE_MPI) + // bool use_nccl = parameters.allreduce_post_accumulation; + // if (!use_nccl && parameters.world_size > 1) + // CopyMPIContextToTrainingParameters(parameters, sess->GetSessionHandle()->GetLogger()); + // #endif + // const auto config_result = ConfigureSessionForTraining(static_cast(sess->GetSessionHandle()), parameters); + // ProviderOptionsVector merged_options; + // ResolveExtraProviderOptions(provider_types, provider_options, merged_options); + + // InitializeSession(sess->GetSessionHandle(), ep_registration_fn, provider_types, merged_options); + + // return config_result; + // }) + // .def("get_state", [](PyTrainingSession* sess) { + // NameMLValMap state_tensors; + // ORT_THROW_IF_ERROR(static_cast(sess->GetSessionHandle())->GetStateTensors(state_tensors)); + // auto& data_transfer_manager = sess->GetSessionHandle()->GetDataTransferManager(); + // // convert to numpy array + // std::map rmap; + // for (auto& kv : state_tensors) { + // if (kv.second.IsTensor()) { + // py::object obj; + // const Tensor& rtensor = kv.second.Get(); + // GetPyObjFromTensor(rtensor, obj, &data_transfer_manager); + // rmap.insert({kv.first, obj}); + // } else { + // throw std::runtime_error("Non tensor type in session state tensors is not expected."); + // } + // } + // return rmap; + // }) + // .def("get_model_state", [](PyTrainingSession* sess, bool include_mixed_precision_weights) { + // std::unordered_map model_state_tensors; + // ORT_THROW_IF_ERROR(static_cast(sess->GetSessionHandle())->GetModelState(model_state_tensors, include_mixed_precision_weights)); + // auto& data_transfer_manager = sess->GetSessionHandle()->GetDataTransferManager(); + // return ConvertORTTensorMapToNumpy(model_state_tensors, data_transfer_manager); + // }) + // .def("get_optimizer_state", [](PyTrainingSession* sess) { + // std::unordered_map opt_state_tensors; + // ORT_THROW_IF_ERROR(static_cast(sess->GetSessionHandle())->GetOptimizerState(opt_state_tensors)); + // auto& data_transfer_manager = sess->GetSessionHandle()->GetDataTransferManager(); + // return ConvertORTTensorMapToNumpy(opt_state_tensors, data_transfer_manager); + // }) + // .def("get_partition_info_map", [](PyTrainingSession* sess) { + // 
std::unordered_map>> part_info_map; + // ORT_THROW_IF_ERROR(static_cast(sess->GetSessionHandle())->GetPartitionInfoMap(part_info_map)); + // return part_info_map; + // }) + // .def("load_state", [](PyTrainingSession* sess, std::unordered_map& state, bool strict) { + // NameMLValMap state_tensors; + // for (auto initializer : state) { + // OrtValue ml_value; + // auto px = sess->GetSessionHandle()->GetModelInputs(); + // if (!px.first.IsOK() || !px.second) { + // throw std::runtime_error("Either failed to get model inputs from the session object or the input def list was null"); + // } + // CreateGenericMLValue(px.second, GetAllocator(), initializer.first, initializer.second, &ml_value); + // ThrowIfPyErrOccured(); + // state_tensors.insert(std::make_pair(initializer.first, ml_value)); + // } + // ORT_THROW_IF_ERROR(static_cast(sess->GetSessionHandle())->SetStateTensors(state_tensors, strict)); + // }) + // .def("is_output_fp32_node", [](PyTrainingSession* sess, const std::string& output_name) { + // return static_cast(sess->GetSessionHandle())->IsGraphOutputFp32Node(output_name); + // }); + py::class_(m, "PartialGraphExecutionState") .def(py::init([]() { return std::make_unique(); diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml index b8dfb7f3c90a2..f244851f8cc37 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml @@ -20,16 +20,3 @@ stages: agent_pool: Onnxruntime-Linux-GPU upload_wheel: 'yes' debug_build: false - -# Added for triton compiler team. Can be potentially removed. -- template: templates/py-packaging-training-cuda-stage.yml - parameters: - build_py_parameters: --enable_training --update --build - torch_version: '2.0.0' - opset_version: '15' - cuda_version: '11.8' - cmake_cuda_architectures: 70;75;80;86 - docker_file: Dockerfile.manylinux2_28_training_cuda11_8 - agent_pool: Onnxruntime-Linux-GPU - upload_wheel: 'no' - debug_build: true
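
Appendix (reviewer note): below is a minimal sketch of exercising the re-enabled `TrainingParameters` binding from Python. It assumes the training wheel exposes the pybind module as `onnxruntime.capi._pybind_state` (its usual location); the weight names, optimizer attributes, and values are made up for illustration, and the `TrainingSession` wrapper that would consume these parameters remains commented out in this patch.

```python
# Sketch only: assumes an onnxruntime-training build in which
# addObjectMethodsForTraining has registered TrainingParameters; the
# import path may differ depending on packaging.
from onnxruntime.capi import _pybind_state as C

params = C.TrainingParameters()
params.loss_output_name = "loss"                      # graph output holding the loss
params.weights_to_train = {"fc1.weight", "fc1.bias"}  # hypothetical initializer names
params.training_optimizer_name = "AdamOptimizer"
params.lr_params_feed_name = "Learning_Rate"

# Per-weight float and int64 attributes, mirroring the two unordered_map bindings.
params.optimizer_attributes_map = {
    "fc1.weight": {"alpha": 0.9, "beta": 0.999, "epsilon": 1e-8},
    "fc1.bias": {"alpha": 0.9, "beta": 0.999, "epsilon": 1e-8},
}
params.optimizer_int_attributes_map = {
    "fc1.weight": {"do_bias_correction": 1},
    "fc1.bias": {"do_bias_correction": 1},
}

params.gradient_accumulation_steps = 4
params.use_mixed_precision = False
```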
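
The `pipeline_cut_info_string` field bound above is consumed by the (still commented-out) `process_cut_info` lambda, which splits on `,` between cut groups, `:` between edges within a group, `-` between a producer and its consumer list, and `/` between consumers. The Python sketch below mirrors that decomposition so a cut string can be checked before it reaches the C++ parser; the tensor and node names are hypothetical.

```python
# Mirrors the delimiter handling of the commented-out C++ parser:
#   ","  separates cut groups, ":" separates edges within a group,
#   "-"  separates a producer tensor from its consumer list, "/" separates consumers.
def parse_cut_info(cut_info_string: str):
    cut_list = []
    for group in cut_info_string.split(","):
        cut_info = []
        for edge in group.split(":"):
            producer, _, consumers = edge.partition("-")
            cut_info.append((producer, consumers.split("/") if consumers else None))
        cut_list.append(cut_info)
    return cut_list

# Hypothetical example: two pipeline cuts, the second restricted to two consumers.
print(parse_cut_info("layer1_out,layer2_out-MatMul_5/Add_6:layer2_mask"))
# [[('layer1_out', None)],
#  [('layer2_out', ['MatMul_5', 'Add_6']), ('layer2_mask', None)]]
```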