diff --git a/docs/Memory_Optimizer.md b/docs/Memory_Optimizer.md
index 1f188252bf96e..4b0b088701a1d 100644
--- a/docs/Memory_Optimizer.md
+++ b/docs/Memory_Optimizer.md
@@ -30,16 +30,16 @@ Integrate models using `ORTModule`.
 ```

 There are two modes to enable the memory optimizations:
-- Aggressively Recompute All, enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=1`. This will recompute all detected subgraphs. It is easy to enable, but be noted this recompute plan may NOT be the best one. In this mode `ORTMODULE_MEMORY_OPT_CONFIG` env values passed by users are not respected.
+- Aggressively Recompute All for Transformer Models, enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=1`. This will recompute all detected subgraphs within each Transformer Attention or MLP layer. It is easy to enable, but note that this recompute plan may NOT be the best one. In this mode, `ORTMODULE_MEMORY_OPT_CONFIG` env values passed by users are not respected.
 - User Specified Subgraph Recompute, enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=0` and `export ORTMODULE_MEMORY_OPT_CONFIG=<config1>,<config2>,...`. This is an advanced usage, allows users to find the most suitable graphs to recompute, at the cost of overhead to look for the best plans.

 ### Mode 1 - Simple Usage (Aggressively Recompute All)

-1. Set memory optimization level to be AGGRESSIVE_FULL_RECOMPUTE, by `export ORTMODULE_MEMORY_OPT_LEVEL=1`
+1. Set memory optimization level to be TRANSFORMER_LAYERWISE_RECOMPUTE, by `export ORTMODULE_MEMORY_OPT_LEVEL=1`
 2. Run the training as usual; check the logs, you could find something like this:
 	```
-	Memory Optimizer     :   ON   :  Memory Optimization Level: [AGGRESSIVE_FULL_RECOMPUTE], Optimization Config: [Reshape+Where+:1:-1,BiasSoftmax+:1:-1,Cast+:1:-1,BiasGelu+:1:-1,FusedMatMul+:1:-1,Add+:1:-1,Reshape+Unsqueeze+Unsqueeze+Cast+Sub+Mul+Cast+:1:-1], Probe Level: [1]
+	Memory Optimizer     :   ON   :  Memory Optimization Level: [TRANSFORMER_LAYERWISE_RECOMPUTE], Optimization Config: [Reshape+Where+:1:-1,BiasSoftmax+:1:-1,Cast+:1:-1,BiasGelu+:1:-1,FusedMatMul+:1:-1,Add+:1:-1,Reshape+Unsqueeze+Unsqueeze+Cast+Sub+Mul+Cast+:1:-1]
 	Configs                                               Freq  Max Saving(Bytes)  Saving Symbolic(Bytes)
 	- Plan 1 :  ON  : Reshape+Where+:1:-1                 1     134,217,728        128.0*inputs_input_ids_dim0*inputs_input_ids_dim1**2
 	- Plan 2 :  ON  : BiasSoftmax+:1:-1                   1     134,086,656        128.0*inputs_input_ids_dim0*inputs_input_ids_dim1*(inputs_input_ids_dim1 - 1)
@@ -82,7 +82,7 @@ There are two modes to enable the memory optimizations:
 	```
 5. Then run the training again, and you will see logs like this:
 	```
-	Memory Optimizer     :   ON   :  Memory Optimization Level: [USER_SPECIFIED], Optimization Config: [BiasGelu+:1:-1], Probe Level: [1]
+	Memory Optimizer     :   ON   :  Memory Optimization Level: [USER_SPECIFIED], Optimization Config: [BiasGelu+:1:-1]
 	Configs                                               Freq  Max Saving(Bytes)  Saving Symbolic(Bytes)
 	- Plan 1 :  OFF : Reshape+Where+:1:-1                 1     134,217,728        128.0*inputs_input_ids_dim0*inputs_input_ids_dim1**2
 	- Plan 2 :  OFF : BiasSoftmax+:1:-1                   1     134,086,656        128.0*inputs_input_ids_dim0*inputs_input_ids_dim1*(inputs_input_ids_dim1 - 1)
diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc
index 60f62a9881ef4..b63406116a156 100644
--- a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc
+++ b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc
@@ -15,6 +15,7 @@
 #include "orttraining/core/optimizer/memory_optimizer/optimization_planner.h"
 #include "orttraining/core/optimizer/memory_optimizer/recompute_analysis.h"
 #include "orttraining/core/optimizer/memory_optimizer/memory_insight.h"
+#include "orttraining/core/optimizer/memory_optimizer/transformer_specific.h"

 namespace onnxruntime::optimizer::memory_optimizer {

@@ -209,6 +210,9 @@ Status FindORTModuleMemoryOpportunity(const GraphViewer& graph_viewer,
                                         is_forward_nodes,
                                         logger));

+  InlinedHashSet<const Node*> layer_boundary_ln_nodes;
+  FindLayerBoundaryLayerNormNodes(graph_viewer, logger, layer_boundary_ln_nodes);
+
   // The first pass - find the candidate subgraphs.
   for (int i = static_cast<int>(node_ids.size()) - 1; i >= 0; --i) {
     const Node* p_node = graph_viewer.GetNode(node_ids[i]);
@@ -222,11 +226,13 @@ Status FindORTModuleMemoryOpportunity(const GraphViewer& graph_viewer,
     bool can_compromise_stashed_activation = false;

     std::unique_ptr<NodeRecomputePlan> recompute_plan =
-        CheckNodeForRecompute(*p_node,
+        CheckNodeForRecompute(graph_viewer,
+                              *p_node,
                               probe_level,
                               fw_op_output_arg_used_map,
                               node_index_to_its_order_in_topological_sort_map,
                               candidate_output_args_map,
+                              layer_boundary_ln_nodes,
                               logger, false,
                               can_compromise_stashed_activation);
     if (recompute_plan != nullptr) {
@@ -239,9 +245,10 @@ Status FindORTModuleMemoryOpportunity(const GraphViewer& graph_viewer,
     // If the subgraph recompute can save memory by comprising the assumption - recompute graphs' input must exist
     // during backward pass, then we can consider to recompute them.
     std::unique_ptr<NodeRecomputePlan> recompute_with_compromise_plan =
-        CheckNodeForRecompute(*p_node, probe_level, fw_op_output_arg_used_map,
+        CheckNodeForRecompute(graph_viewer, *p_node, probe_level, fw_op_output_arg_used_map,
                               node_index_to_its_order_in_topological_sort_map,
                               candidate_output_args_map,
+                              layer_boundary_ln_nodes,
                               logger, true,
                               can_compromise_stashed_activation);
     if (recompute_with_compromise_plan != nullptr) {
@@ -710,7 +717,7 @@ std::string GetSerializedORTModuleMemoryStat(const GraphViewer& graph_viewer,
     ORT_ENFORCE(probe_level_int < static_cast<int>(ProbeLevel::LevelMax) && probe_level_int >= 0,
                 "Invalid probe level specified: ", recompute_probe_level);
-    probe_level = static_cast<ProbeLevel>(probe_level);
+    probe_level = static_cast<ProbeLevel>(probe_level_int);
   }

   ptrdiff_t yield_op_order_in_topological_sort;
diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc
index 0782cbdae2eec..cf18942f94a02 100644
--- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc
+++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc
@@ -9,8 +9,10 @@
 #include

 #include "orttraining/core/optimizer/memory_optimizer/common.h"
+#include "orttraining/core/optimizer/memory_optimizer/transformer_specific.h"
 #include "orttraining/core/optimizer/memory_optimizer/recompute_analysis.h"
 #include "core/framework/data_types.h"
+#include "core/optimizer/utils.h"

 namespace onnxruntime::optimizer::memory_optimizer {

@@ -53,7 +55,7 @@ struct AllowedRecomputeNodeConfig {
   InlinedVector<int> input_arg_indices;  // input index to iterate further (bottom up)
 };

-// The op types that are supported predefined.
+// The op types that are supported are predefined.
 const InlinedHashMap<std::string, AllowedRecomputeNodeConfig>& GetAllowedRecomputeOps(int probe_op_level) {
   static InlinedHashMap<int, InlinedHashMap<std::string, AllowedRecomputeNodeConfig>> recomputable_op_table_map;
@@ -131,7 +133,7 @@ bool IsRecomputable(const Node& node, ProbeLevel probe_level) {
  * @param compromise_stashed_activation Whether to compromise stashed activation, e.g. if we cannot find a
  * recomputable subgraph to save a stashed activation, we can compromise to find a recomputable subgraph to reduce the
  * size of stashed activation.
- * @param can_compromise_stashed_activation A bool return value, to indicate there is opportunaties for finding a
+ * @param can_compromise_stashed_activation A bool return value, to indicate there are opportunities for finding a
  * compromised subgraph.
 * @param save_ratio The ratio of memory saving if we can find a recomputable subgraph.
 * @return Status
@@ -335,13 +337,15 @@ void NodesInTopoOrderToString(gsl::span<const Node* const> nodes_in_topological_
 }  // namespace

-std::unique_ptr<NodeRecomputePlan> CheckNodeForRecompute(const Node& node,
+std::unique_ptr<NodeRecomputePlan> CheckNodeForRecompute(const GraphViewer& graph_viewer,
+                                                         const Node& node,
                                                          const ProbeLevel probe_level,
                                                          const ActivationUsedMap& fw_op_output_arg_used_map,
                                                          const InlinedHashMap<NodeIndex, ptrdiff_t>&
                                                              node_index_to_its_order_in_topological_sort_map,
                                                          const InlinedHashMap<const Node*, InlinedVector<size_t>>&
                                                              candidate_output_args_map,
+                                                         const InlinedHashSet<const Node*>& layer_boundary_ln_nodes,
                                                          const logging::Logger& logger,
                                                          bool compromise_stashed_activation,
                                                          bool& can_compromise_stashed_activation) {
@@ -349,6 +353,32 @@ std::unique_ptr<NodeRecomputePlan> CheckNodeForRecompute(const Node& node,
     return nullptr;
   }

+  // Check whether the node's stashed activation outputs are used by LayerNormalization's inputs.
+  // If yes, for Transformers, we don't need to recompute the node, because we treat
+  // LayerNormalization as the boundary for subgraph searching.
+  // Note that a Transformer layer contains an attention sublayer and an MLP sublayer, and each
+  // sublayer has its own LayerNormalization; each such LayerNormalization is a boundary for subgraph searching.
+  if (probe_level == ProbeLevel::Transformers) {
+    // Check whether at least one of the stashed activation outputs is used as the 1st input
+    // of a LayerNormalization, e.g. it will be used as an input of LayerNormalizationGrad.
+    for (auto& output_index : candidate_output_args_map.at(&node)) {
+      auto output_name = node.OutputDefs()[output_index]->Name();
+      auto consumers = graph_viewer.GetConsumerNodes(output_name);
+      for (auto& consumer : consumers) {
+        if (layer_boundary_ln_nodes.find(consumer) != layer_boundary_ln_nodes.end()) {
+          int dest_in_index = optimizer_utils::IndexOfNodeInput(*consumer, *node.OutputDefs()[output_index]);
+          if (dest_in_index == 0) {
+            LOGS(logger, WARNING) << "Node " << node.Name() << "(" << node.OpType()
+                                  << ") is an Attention+MLP layer boundary node; "
+                                  << "its stashed activation outputs are used by LayerNormalization's inputs, "
+                                  << "so we don't need to recompute it.";
+            return nullptr;
+          }
+        }
+      }
+    }
+  }
+
   InlinedVector<const Node*> nodes_in_topological_order;
   float save_ratio = 1.f;
   ORT_ENFORCE(SelectRecomputeSubgraph(node,
diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h
index 9211e5044cd86..1c03a87a7c187 100644
--- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h
+++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h
@@ -19,7 +19,8 @@ namespace onnxruntime::optimizer::memory_optimizer {
 enum class ProbeLevel {
   Basic = 0,
   Advanced = 1,
-  LevelMax = 2,
+  Transformers = 2,  // On top of Advanced, use LayerNorm as the boundary for subgraph searching.
+  LevelMax = 3,
 };

 /**
@@ -75,6 +76,7 @@ class NodeRecomputePlan : public NodeOptimizationPlanBase {
 /**
  * @brief For the node producing stashed activation, check whether a recomputable subgraph can be found or not.
  *
+ * @param graph_viewer The graph viewer to get node information.
  * @param node The entry node to start the subgraph matching (bottom-up), usually the last node of found subgraphs.
  * @param probe_level The level to control allowed operations during subgraph detecting.
  * @param fw_op_output_arg_used_map The activation usage (in fw and bw) mapping.
@@ -82,6 +84,7 @@ class NodeRecomputePlan : public NodeOptimizationPlanBase {
 *      Used to re-order the collected subgraph nodes.
 * @param candidate_output_args_map A map from node to its candidate activations, which are consumed by both fw and
 *      bw ops.
+ * @param layer_boundary_ln_nodes A set of LayerNormalization nodes, which are used as the boundary for subgraph searching.
 * @param subgraph_stores A store to maintain all found subgraphs.
 * @param logger Logger.
 * @param compromise_stashed_activation Whether to compromise stashed activation, e.g. if we cannot find a
@@ -90,13 +93,15 @@ class NodeRecomputePlan : public NodeOptimizationPlanBase {
 * @param can_compromise_stashed_activation A bool return value, to indicate there is opportunaties for finding a
 * compromised subgraph.
 */
-std::unique_ptr<NodeRecomputePlan> CheckNodeForRecompute(const Node& node,
+std::unique_ptr<NodeRecomputePlan> CheckNodeForRecompute(const GraphViewer& graph_viewer,
+                                                         const Node& node,
                                                          const ProbeLevel probe_level,
                                                          const ActivationUsedMap& fw_op_output_arg_used_map,
                                                          const InlinedHashMap<NodeIndex, ptrdiff_t>&
                                                              node_index_to_its_order_in_topological_sort_map,
                                                          const InlinedHashMap<const Node*, InlinedVector<size_t>>&
                                                              candidate_output_args_map,
+                                                         const InlinedHashSet<const Node*>& layer_boundary_ln_nodes,
                                                          const logging::Logger& logger,
                                                          bool compromise_stashed_activation,
                                                          bool& can_compromise_stashed_activation);
diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc
new file mode 100644
index 0000000000000..57e76e87bf870
--- /dev/null
+++ b/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc
@@ -0,0 +1,68 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <deque>
+#include <set>
+#include <string_view>
+
+#include "orttraining/core/optimizer/memory_optimizer/common.h"
+#include "core/graph/graph_utils.h"
+#include "core/optimizer/utils.h"
+#include "core/graph/graph_viewer.h"
+#include "core/framework/tensorprotoutils.h"
+
+#include "core/common/string_utils.h"
+
+namespace onnxruntime::optimizer::memory_optimizer {
+
+void FindLayerBoundaryLayerNormNodes(
+    const GraphViewer& graph_viewer,
+    const logging::Logger&,
+    InlinedHashSet<const Node*>& layer_boundary_ln_nodes) {
+  // Loop over all nodes to find LayerNormalization nodes.
+  // For each LayerNormalization node, keep checking its output nodes
+  // until we find a node that is Softmax, BiasSoftmax or another LayerNormalization.
+  // If the found node is Softmax or BiasSoftmax, the LayerNormalization is the boundary of an ATTENTION sublayer.
+  // If the found node is another LayerNormalization, the LayerNormalization is the boundary of an MLP sublayer.
+  const InlinedHashSet<std::string_view> softmax_ops{"Softmax", "BiasSoftmax"};
+  const InlinedHashSet<std::string_view> layernorm_ops{"LayerNormalization", "SkipLayerNormalization"};
+
+  layer_boundary_ln_nodes.clear();
+  const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
+  for (auto node_index : node_topology_list) {
+    auto& node = *graph_viewer.GetNode(node_index);
+
+    if (layernorm_ops.find(node.OpType()) == layernorm_ops.end()) {
+      continue;
+    }
+
+    std::deque<const Node*> nodes_to_check;
+    std::set<const Node*> visited_nodes;
+    for (auto node_it = node.OutputNodesBegin(); node_it != node.OutputNodesEnd(); ++node_it) {
+      nodes_to_check.push_back(&(*node_it));
+    }
+
+    while (!nodes_to_check.empty()) {
+      const Node* next_node = nodes_to_check.front();
+      nodes_to_check.pop_front();
+
+      if (visited_nodes.find(next_node) != visited_nodes.end()) {
+        continue;
+      }
+
+      visited_nodes.insert(next_node);
+      if (softmax_ops.find(next_node->OpType()) != softmax_ops.end()) {
+        layer_boundary_ln_nodes.insert(&node);
+        break;
+      } else if (layernorm_ops.find(next_node->OpType()) != layernorm_ops.end()) {
+        break;
+      } else {
+        for (auto node_it = next_node->OutputNodesBegin(); node_it != next_node->OutputNodesEnd(); ++node_it) {
+          nodes_to_check.push_back(&(*node_it));
+        }
+      }
+    }
+  }
+}
+
+}  // namespace onnxruntime::optimizer::memory_optimizer
diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.h b/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.h
new file mode 100644
index 0000000000000..62b092e69bb83
--- /dev/null
+++ b/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.h
@@ -0,0 +1,25 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "core/common/common.h"
+#include "core/common/logging/logging.h"
+#include "core/common/inlined_containers_fwd.h"
+#include "core/graph/basic_types.h"
+#include "core/framework/data_types.h"
+#include "core/graph/graph_viewer.h"
+#include "orttraining/core/optimizer/memory_optimizer/common.h"
+
+namespace onnxruntime::optimizer::memory_optimizer {
+
+void FindLayerBoundaryLayerNormNodes(const GraphViewer& graph_viewer,
+                                     const logging::Logger& logger,
+                                     InlinedHashSet<const Node*>& layer_boundary_ln_nodes);
+
+}  // namespace onnxruntime::optimizer::memory_optimizer
diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
index 357304bfb45b3..0a03e29880011 100755
--- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
@@ -652,8 +652,8 @@ def _add_record(tbl, columns):
         )

         opt_config_to_display = self._runtime_options.memory_optimizer_config
-        if self._runtime_options.memory_optimization_level == _MemoryOptimizationLevel.AGGRESSIVE_FULL_RECOMPUTE:
-            opt_config_to_display = "ALL_RECOMPUTE_CONFIGS"
+        if self._runtime_options.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE:
+            opt_config_to_display = "ALL_RECOMPUTE_FOR_EACH_LAYER"

         mem_row = _add_record(
             tbl,
             [
@@ -661,8 +661,7 @@ def _add_record(tbl, columns):
                 "Memory Optimization",
                 len(self._runtime_options.memory_optimizer_config) > 0,
                 (
                     f"Memory Optimization Level: [{_MemoryOptimizationLevel.to_string(self._runtime_options.memory_optimization_level)}], "
-                    f"Optimization Config: [{opt_config_to_display}], "
-                    f"Probe Level: [{self._runtime_options.probe_level}]"
+                    f"Optimization Config: [{opt_config_to_display}]"
                     if len(self._runtime_options.memory_optimizer_config) > 0
                     else "Enable with env ORTMODULE_MEMORY_OPT_LEVEL=1 or ORTMODULE_MEMORY_OPT_CONFIG=<config1>,<config2>,..."
                 ),
diff --git a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py
index 6bdd61fe7fa40..2b650a6ed53e7 100644
--- a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py
+++ b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py
@@ -530,7 +530,7 @@ def find_memory_optimization_opportunity(self, execution_agent: TrainingAgent, r
         # If memory optimization level is aggressive, we will first collect all
         # recompute subgraph by passing empty memory_optimizer_config to get_serialized_ortmodule_memory_stat.
-        if runtime_options.memory_optimization_level == _MemoryOptimizationLevel.AGGRESSIVE_FULL_RECOMPUTE:
+        if runtime_options.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE:
             memory_optimizer_config = ""

         (
@@ -566,7 +566,7 @@ def find_memory_optimization_opportunity(self, execution_agent: TrainingAgent, r
                 self.cluster_id_combination_to_saving_symbolics_map[cluster_id] = values

         # For aggressive memory optimization, we update the memory_optimizer_config using all.
-        if runtime_options.memory_optimization_level == _MemoryOptimizationLevel.AGGRESSIVE_FULL_RECOMPUTE:
+        if runtime_options.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE:
             recompute_configs = []
             for cluster_id in self.cluster_id_combination_to_saving_symbolics_map:
                 config_values = cluster_id.split(":")
diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py
index d6b98730a5e62..a9c832ad7eee9 100644
--- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py
@@ -438,7 +438,8 @@ def _create_execution_agent(self):
         # when we have an allocation plan in place, and reuse information is available.
         if self._runtime_inspector.memory_ob.is_enabled() and (
             self._debug_options.log_level <= LogLevel.INFO
-            or self._runtime_options.memory_optimization_level == _MemoryOptimizationLevel.AGGRESSIVE_FULL_RECOMPUTE
+            or self._runtime_options.memory_optimization_level
+            == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE
         ):
             # Create a training agent without enabling memory optimization.
             execution_agent = TrainingAgent(
diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py
index 80e6cbe586e0e..c9bfa9d90f4ec 100644
--- a/orttraining/orttraining/python/training/ortmodule/options.py
+++ b/orttraining/orttraining/python/training/ortmodule/options.py
@@ -196,15 +196,15 @@ class _MemoryOptimizationLevel(IntFlag):
     """Enumeration to specify memory optimization level"""

     USER_SPECIFIED = 0  # Fully respect user-specified config
-    AGGRESSIVE_FULL_RECOMPUTE = 1  # Enable all recomputable subgraphs
+    TRANSFORMER_LAYERWISE_RECOMPUTE = 1  # Enable all recomputable subgraphs per layer

     @staticmethod
     def to_string(memory_optimization_level):
         if memory_optimization_level == _MemoryOptimizationLevel.USER_SPECIFIED:
             return "USER_SPECIFIED"

-        if memory_optimization_level == _MemoryOptimizationLevel.AGGRESSIVE_FULL_RECOMPUTE:
-            return "AGGRESSIVE_FULL_RECOMPUTE"
+        if memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE:
+            return "TRANSFORMER_LAYERWISE_RECOMPUTE"

         return ""
@@ -336,7 +336,8 @@ def _override_from_env_vars(self):
         # Configuration for memory optimization.
         self.memory_optimization_level = int(os.getenv("ORTMODULE_MEMORY_OPT_LEVEL", self.memory_optimization_level))
         self.memory_optimizer_config = os.getenv("ORTMODULE_MEMORY_OPT_CONFIG", self.memory_optimizer_config)
-        self.probe_level = os.getenv("ORTMODULE_MEMORY_OPT_PROBE_RECOMPUTE_LEVEL", self.probe_level)
+        if self.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE:
+            self.probe_level = "2"  # For transformer layer-wise recompute, we need to probe more aggressively.

         # Configuration for dev tools.
         if "ORTMODULE_PRINT_INPUT_DENSITY" in os.environ:
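For reference, a minimal sketch of how the two modes documented above are driven from user code. Only the environment variables come from this change; the toy `torch.nn.Linear` model and the `BiasGelu+:1:-1` config value are placeholders for illustration:

```python
import os

# Mode 1: transformer layer-wise recompute; ORTMODULE_MEMORY_OPT_CONFIG is ignored here.
os.environ["ORTMODULE_MEMORY_OPT_LEVEL"] = "1"
# Mode 2 (alternative): user-specified subgraph recompute, e.g.
# os.environ["ORTMODULE_MEMORY_OPT_LEVEL"] = "0"
# os.environ["ORTMODULE_MEMORY_OPT_CONFIG"] = "BiasGelu+:1:-1"

import torch
from onnxruntime.training.ortmodule import ORTModule

# Placeholder model; any PyTorch transformer model is wrapped the same way.
model = ORTModule(torch.nn.Linear(16, 16))
output = model(torch.randn(4, 16))
output.sum().backward()  # the memory optimizer summary is logged during the first training steps
```

The environment variables must be set before `ORTModule` wraps the model, since the runtime options are read from the environment when the execution manager is initialized.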
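And a Python paraphrase of the layer-boundary detection added in `transformer_specific.cc`, for readers who want the algorithm without the C++ plumbing. The `graph.nodes` / `graph.consumers(...)` / `op_type` accessors are hypothetical stand-ins for the `GraphViewer` API, not something this change adds:

```python
from collections import deque

SOFTMAX_OPS = {"Softmax", "BiasSoftmax"}
LAYERNORM_OPS = {"LayerNormalization", "SkipLayerNormalization"}

def find_layer_boundary_layernorm_nodes(graph):
    """Mirror of FindLayerBoundaryLayerNormNodes: a LayerNorm whose downstream
    walk reaches a (Bias)Softmax before any other LayerNorm starts an Attention
    sublayer and is recorded as a layer boundary."""
    boundary_nodes = set()
    for node in graph.nodes:  # iterate in topological order
        if node.op_type not in LAYERNORM_OPS:
            continue
        to_visit = deque(graph.consumers(node))  # direct downstream consumers
        visited = set()
        while to_visit:
            nxt = to_visit.popleft()
            if nxt in visited:
                continue
            visited.add(nxt)
            if nxt.op_type in SOFTMAX_OPS:
                boundary_nodes.add(node)  # boundary of an Attention sublayer
                break
            if nxt.op_type in LAYERNORM_OPS:
                break  # reached the next sublayer's LayerNorm; stop the walk
            to_visit.extend(graph.consumers(nxt))
    return boundary_nodes
```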