diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu b/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu index c8c0191967d40..282ba2403b135 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu @@ -125,42 +125,31 @@ Status PrepareQkv_Attention(contrib::AttentionParameters& parameters, bool use_fused_kernel = (nullptr != fused_runner && !parameters.is_unidirectional); bool use_fused_causal = (nullptr != fused_runner && parameters.is_unidirectional); - if (data.bias == nullptr) { - assert(nullptr == fused_runner); - // For quantized attention, bias has been added so only need transpose here. - // gemm_buffer should be BxSx3xNxH => qkv: 3xBxNxSxH - assert(qk_head_size == v_head_size); - int matrix_to_trans = (past_present_share_buffer ? 1 : 3); - ORT_RETURN_IF_ERROR(LaunchTransQkv(stream, matrix_to_trans, sequence_length, batch_size, qk_head_size, num_heads, - max_threads_per_block, false, data.gemm_buffer, qkv, 3)); - data.qkv_format = AttentionQkvFormat::Q_K_V_BNSH; - } else { - // For fused TRT attention, transpose qkv to BxSxNx3xH (format 2) - // For flash or memory efficient attention, transpose to 3xBxSxNxH (format 3) - // For unfused kernel, transpose to 3xBxNxSxH (format 1) - // For fused causal kernel, use format 1 since we need have K and V to update present state, - // at the same time, we update gemm_buffer BxSx3xNxH with bias which is used as input for fused causal kernel. - const int format = (use_fused_kernel ? 2 : (use_flash_or_efficient_attention ? 3 : 1)); - data.qkv_format = use_fused_kernel - ? AttentionQkvFormat::QKV_BSN3H - : (use_flash_or_efficient_attention - ? AttentionQkvFormat::Q_K_V_BSNH - : (use_fused_causal - ? AttentionQkvFormat::Q_K_V_BNSH_QKV_BS3NH - : AttentionQkvFormat::Q_K_V_BNSH)); - - // For fused causal, we will update gemm_buffer with bias directly. - T* qkv_add_bias = use_fused_causal ? data.gemm_buffer : nullptr; - - int matrix_to_transpose = ((format == AttentionQkvFormat::Q_K_V_BNSH && past_present_share_buffer) ? 1 : 3); - // format 1: BxSx(NH + NH + NH_v) => BxNxSxH + BxNxSxH + BxNxSxH_v - // format 2: BxSx(NH + NH + NH) => BxSxNx(H + H + H) - LaunchAddBiasTranspose(stream, matrix_to_transpose, format, max_threads_per_block, - batch_size, sequence_length, num_heads, qk_head_size, - data.gemm_buffer, data.bias, qkv, true, v_head_size, qkv_add_bias, - 3, parameters.do_rotary, parameters.rotary_embedding, - parameters.past_sequence_length); - } + // For fused TRT attention, transpose qkv to BxSxNx3xH (format 2) + // For flash or memory efficient attention, transpose to 3xBxSxNxH (format 3) + // For unfused kernel, transpose to 3xBxNxSxH (format 1) + // For fused causal kernel, use format 1 since we need have K and V to update present state, + // at the same time, we update gemm_buffer BxSx3xNxH with bias which is used as input for fused causal kernel. + const int format = (use_fused_kernel ? 2 : (use_flash_or_efficient_attention ? 3 : 1)); + data.qkv_format = use_fused_kernel + ? AttentionQkvFormat::QKV_BSN3H + : (use_flash_or_efficient_attention + ? AttentionQkvFormat::Q_K_V_BSNH + : (use_fused_causal + ? AttentionQkvFormat::Q_K_V_BNSH_QKV_BS3NH + : AttentionQkvFormat::Q_K_V_BNSH)); + + // For fused causal, we will update gemm_buffer with bias directly. + T* qkv_add_bias = use_fused_causal ? data.gemm_buffer : nullptr; + + int matrix_to_transpose = ((format == AttentionQkvFormat::Q_K_V_BNSH && past_present_share_buffer) ? 
1 : 3); + // format 1: BxSx(NH + NH + NH_v) => BxNxSxH + BxNxSxH + BxNxSxH_v + // format 2: BxSx(NH + NH + NH) => BxSxNx(H + H + H) + LaunchAddBiasTranspose(stream, matrix_to_transpose, format, max_threads_per_block, + batch_size, sequence_length, num_heads, qk_head_size, + data.gemm_buffer, data.bias, qkv, true, v_head_size, qkv_add_bias, + 3, parameters.do_rotary, parameters.rotary_embedding, + parameters.past_sequence_length); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc index 3299bc2cb11de..91e8577df487b 100644 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc @@ -101,6 +101,7 @@ Status SkipLayerNorm::ComputeInternal(OpKernelContext* ctx) const (double)epsilon_, // epsilon reinterpret_cast(gamma->Data()), // gamma (beta != nullptr) ? reinterpret_cast(beta->Data()) : nullptr, // beta + 0, // broadcast stride for gamma/beta reinterpret_cast(skip->Data()), // skip or residual to add (bias != nullptr) ? reinterpret_cast(bias->Data()) : nullptr, // bias to add sum_output != nullptr ? reinterpret_cast(sum_output->MutableData()) : nullptr); diff --git a/onnxruntime/core/framework/print_tensor_statistics_utils.h b/onnxruntime/core/framework/print_tensor_statistics_utils.h index 65360674e88d0..e4bb8b917d2b4 100644 --- a/onnxruntime/core/framework/print_tensor_statistics_utils.h +++ b/onnxruntime/core/framework/print_tensor_statistics_utils.h @@ -30,7 +30,7 @@ void PrintFloatStats(const T* data, size_t count) { size_t zero = 0; size_t subnormal = 0; for (size_t i = 0; i < count; i++) { - switch (my_fpclassify(*data)) { + switch (my_fpclassify(data[i])) { case FP_INFINITE: inf++; break; diff --git a/onnxruntime/core/providers/cuda/nn/layer_norm.cc b/onnxruntime/core/providers/cuda/nn/layer_norm.cc index 7dd10f9c2960c..c430ffe5aa97d 100644 --- a/onnxruntime/core/providers/cuda/nn/layer_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/layer_norm.cc @@ -44,19 +44,36 @@ Status LayerNorm::ComputeInternal(OpKernelContext* ctx) con auto bias_data = (simplified || (nullptr == bias)) ? nullptr : reinterpret_cast(bias->Data()); const TensorShape& x_shape = X->Shape(); - const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions()); + auto x_num_dims = x_shape.NumDimensions(); + const int64_t axis = HandleNegativeAxis(axis_, x_num_dims); int n1 = gsl::narrow(x_shape.SizeToDimension(axis)); int n2 = gsl::narrow(x_shape.SizeFromDimension(axis)); const auto scale_size = scale->Shape().Size(); const auto bias_size = (bias_data) ? bias->Shape().Size() : 0; + + int broadcast = 0; if (n2 == 1 || scale_size != n2 || (bias_data && bias_size != n2)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Size of X.shape()[axis:] == ", n2, - ". Size of scale and bias (if provided) must match this " - "and the size must not be 1. Got scale size of ", - scale_size, " and bias size of ", bias_size); + // Handle a special case for MMDit where scale and bias need broadcast. + // X shape is (B, S, D), scale and bias shape is (B, 1, D), and we store S as broadcast stride. 
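For reference, a minimal NumPy sketch (not the CUDA kernel) of what this broadcast stride means: with X of shape (B, S, D) there are n1 = B*S normalized rows, and row i1 reads gamma/beta row i1 // S, which is the `(i1 / broadcast) * n2 + i` indexing used in the layer_norm_impl.cu change further down in this diff.

```python
import numpy as np

def layernorm_bsd_with_broadcast(x, gamma, beta, eps=1e-5):
    """LayerNorm over the last axis of x (B, S, D) with gamma/beta of shape
    (B, 1, D); broadcast stride = S, mirroring the kernel's row indexing."""
    b, s, d = x.shape                 # n1 = b * s rows, n2 = d
    broadcast = s
    x2 = x.reshape(b * s, d)
    g2, b2 = gamma.reshape(b, d), beta.reshape(b, d)
    y = np.empty_like(x2)
    for i1 in range(b * s):
        row = x2[i1]
        idx = i1 // broadcast         # same row selection as (i1 / broadcast) in the kernel
        y[i1] = (row - row.mean()) / np.sqrt(row.var() + eps) * g2[idx] + b2[idx]
    return y.reshape(b, s, d)

x = np.random.rand(2, 3, 4).astype(np.float32)
gamma = np.random.rand(2, 1, 4).astype(np.float32)
beta = np.random.rand(2, 1, 4).astype(np.float32)
ref = (x - x.mean(-1, keepdims=True)) / np.sqrt(x.var(-1, keepdims=True) + 1e-5) * gamma + beta
assert np.allclose(layernorm_bsd_with_broadcast(x, gamma, beta), ref, atol=1e-5)
```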
+ if (x_num_dims == 3 && axis == 2 && n2 > 1 && + scale->Shape().NumDimensions() == x_num_dims && + scale->Shape().GetDims()[0] == x_shape.GetDims()[0] && + scale->Shape().GetDims()[1] == 1 && + scale->Shape().GetDims()[2] == x_shape.GetDims()[2] && + bias->Shape().NumDimensions() == x_num_dims && + bias->Shape().GetDims()[0] == x_shape.GetDims()[0] && + bias->Shape().GetDims()[1] == 1 && + bias->Shape().GetDims()[2] == x_shape.GetDims()[2]) { + broadcast = static_cast(x_shape.GetDims()[1]); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Size of X.shape()[axis:] == ", n2, + ". Size of scale and bias (if provided) must match this " + "and the size must not be 1. Got scale size of ", + scale_size, " and bias size of ", bias_size); + } } // Outputs @@ -65,7 +82,7 @@ Status LayerNorm::ComputeInternal(OpKernelContext* ctx) con // Mean and variance std::vector mean_inv_std_var_dim; - for (int i = 0; i < static_cast(x_shape.NumDimensions()); ++i) { + for (int i = 0; i < static_cast(x_num_dims); ++i) { if (i < axis) { mean_inv_std_var_dim.emplace_back(x_shape.GetDims()[i]); } else { @@ -94,7 +111,7 @@ Status LayerNorm::ComputeInternal(OpKernelContext* ctx) con } HostApplyLayerNorm(GetDeviceProp(), Stream(ctx), Y_data, mean_data, inv_var_data, - X_data, n1, n2, epsilon_, scale_data, bias_data); + X_data, n1, n2, epsilon_, scale_data, bias_data, broadcast); CUDA_RETURN_IF_ERROR(cudaGetLastError()); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu b/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu index b9e8b45307079..c21943649775b 100644 --- a/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu @@ -334,6 +334,7 @@ __global__ void cuApplyLayerNorm( const U epsilon, const V* __restrict__ gamma, const V* __restrict__ beta, + int broadcast, const T* __restrict__ skip, const T* __restrict__ bias, T* __restrict__ skip_input_bias_add_output) { @@ -366,8 +367,13 @@ __global__ void cuApplyLayerNorm( curr += static_cast(skip_vals[i]); } - U gamma_i = (gamma != nullptr) ? (U)gamma[i] : (U)1; - U beta_i = (beta != nullptr) ? (U)beta[i] : (U)0; + // onnx operator LayerNormalization support broadcast. + // gamma and beta should be unidirectional broadcastable to tensor x. + // Here we support a special case for transformer models that x is (B, S, D) and gamma/beta is (B, 1, D) + int index = (broadcast > 0) ? ((i1 / broadcast) * n2 + i) : i; + U gamma_i = (gamma != nullptr) ? (U)gamma[index] : (U)1; + U beta_i = (beta != nullptr) ? 
(U)beta[index] : (U)0; + if (simplified) { ovals[i] = static_cast(gamma_i * c_inv_std_dev * curr); } else { @@ -409,6 +415,7 @@ void HostApplyLayerNorm( double epsilon, const V* gamma, const V* beta, + int broadcast, const T* skip, const T* bias, T* skip_input_bias_add_output) { @@ -442,15 +449,15 @@ void HostApplyLayerNorm( input, n1, n2, U(epsilon), - gamma, beta, + gamma, beta, broadcast, skip, bias, skip_input_bias_add_output); } #define LAYERNORM_LINEAR_IMPL(T, U, V, simplified) \ template void HostApplyLayerNorm(const cudaDeviceProp& prop, cudaStream_t stream, V* output, \ U* mean, U* inv_std_dev, const T* input, int n1, int n2, \ - double epsilon, const V* gamma, const V* beta, const T* skip, \ - const T* bias, T* skip_input_bias_add_output); + double epsilon, const V* gamma, const V* beta, int broadcast, \ + const T* skip, const T* bias, T* skip_input_bias_add_output); LAYERNORM_LINEAR_IMPL(float, float, float, true) LAYERNORM_LINEAR_IMPL(half, float, half, true) diff --git a/onnxruntime/core/providers/cuda/nn/layer_norm_impl.h b/onnxruntime/core/providers/cuda/nn/layer_norm_impl.h index e3952eefae35d..3ba895e8829b6 100644 --- a/onnxruntime/core/providers/cuda/nn/layer_norm_impl.h +++ b/onnxruntime/core/providers/cuda/nn/layer_norm_impl.h @@ -41,6 +41,7 @@ void HostApplyLayerNorm( double epsilon, const V* gamma, const V* beta, + int broadcast = 0, // broadcast stride for gamma/beta const T* skip = nullptr, const T* bias = nullptr, T* skip_input_bias_add_output = nullptr); diff --git a/onnxruntime/python/tools/transformers/compare_bert_results.py b/onnxruntime/python/tools/transformers/compare_bert_results.py index 0c5125e74c8a4..ed60af6383273 100644 --- a/onnxruntime/python/tools/transformers/compare_bert_results.py +++ b/onnxruntime/python/tools/transformers/compare_bert_results.py @@ -37,16 +37,23 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-1, atol=1e-3): # Validate the output of baseline and treatment, to make sure the results are similar. 
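The extra statistic added to compare() below (maximum percentage of elements exceeding atol) boils down to the following; the helper name is just for illustration.

```python
import numpy as np

def pct_exceeding_atol(baseline: np.ndarray, treatment: np.ndarray, atol: float = 1e-3) -> float:
    """Percentage of elements whose absolute difference exceeds atol."""
    abs_diff = np.abs(treatment - baseline)
    return float(np.sum(abs_diff > atol)) / abs_diff.size * 100.0
```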
diff_count = 0 max_abs_diff = 0 + max_diff_percentage = 0 + case_passed = True for test_case_id, results in enumerate(baseline_results): - case_passed = True for i in range(len(results)): treatment_output = treatment_results[test_case_id][i] - abs_diff = np.amax(np.abs(treatment_output - results[i])) + abs_diff_tensor = np.abs(treatment_output - results[i]) + abs_diff = np.amax(abs_diff_tensor) if verbose and abs_diff > atol: print("abs_diff", abs_diff) print("treatment", treatment_output) print("baseline", results[i]) + count_exceeding = np.sum(abs_diff_tensor > atol) + total_elements = abs_diff_tensor.size + percentage_exceeding = (count_exceeding / total_elements) * 100 + max_diff_percentage = max(max_diff_percentage, percentage_exceeding) + max_abs_diff = max(max_abs_diff, abs_diff) if not np.allclose(results[i].tolist(), treatment_output.tolist(), rtol=rtol, atol=atol): if case_passed: @@ -66,6 +73,7 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-1, atol=1e-3): ) print(f"maximum absolute difference={max_abs_diff}") + print(f"maximum percentage of elements that exceeds atol={atol} is {max_diff_percentage:.3f}%") return max_abs_diff, case_passed diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py index 2398bb9d6031b..306d89b430277 100644 --- a/onnxruntime/python/tools/transformers/float16.py +++ b/onnxruntime/python/tools/transformers/float16.py @@ -143,6 +143,8 @@ def make_value_info_from_tensor(tensor): "Upsample", ] +# Some operators do not support bfloat16 in CUDA. This is not a full list, just some common operators in transformers. +BF16_OP_BLACK_LIST = ["SkipSimplifiedLayerNormalization", "Attention", "MultiHeadAttention"] # Some operators has data type fixed as float for some inputs. Key is op_type, value is list of input indices # Note that DirectML allows float16 gamma and beta in GroupNorm. Use force_fp16_inputs parameter could overwrite this. @@ -154,14 +156,19 @@ class InitializerTracker: def __init__(self, initializer: TensorProto): self.initializer = initializer + self.bf16_nodes = [] self.fp32_nodes = [] self.fp16_nodes = [] - def add_node(self, node: NodeProto, is_node_blocked): - if is_node_blocked: + def add_node(self, node: NodeProto, dtype: int): + if dtype == TensorProto.FLOAT: self.fp32_nodes.append(node) - else: + elif dtype == TensorProto.BFLOAT16: + self.bf16_nodes.append(node) + elif dtype == TensorProto.FLOAT16: self.fp16_nodes.append(node) + else: + raise ValueError("Invalid dtype") def convert_float_to_float16( @@ -194,6 +201,9 @@ def convert_float_to_float16( Default to false, which will convert only the one needed to avoid precision loss. force_fp16_inputs(Dict[str, List[int]]): Force the conversion of the inputs of some operators to float16, even if this script's preference it to keep them in float32. + use_bfloat16_as_blocked_nodes_dtype(bool): use bfloat16 as the data type for blocked nodes. Default to False. + If the node does not support bfloat16, it will remain in float. + Raises: ValueError: input type is not ModelProto. 
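A hedged usage sketch of how the new option is meant to be driven; it assumes the existing keyword arguments of convert_float_to_float16 in this file (such as keep_io_types and op_block_list), so verify names against your revision.

```python
import onnx
from onnxruntime.transformers.float16 import convert_float_to_float16

model = onnx.load("model_fp32.onnx")
# Nodes whose op type is in op_block_list stay in higher precision. With the new
# flag they run in bfloat16 instead of float32, unless listed in BF16_OP_BLACK_LIST.
model_fp16 = convert_float_to_float16(
    model,
    keep_io_types=True,                    # keep graph inputs/outputs as float32
    op_block_list=["Pow", "ReduceMean"],   # example ops with fp16 overflow risk
    use_bfloat16_as_blocked_nodes_dtype=True,
)
onnx.save(model_fp16, "model_fp16_bf16_blocked.onnx")
```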
@@ -333,11 +343,19 @@ def convert_float_to_float16( for i, input_name in enumerate(n.input): if input_name in fp32_initializers: # For Resize/GroupNorm, only the first input can be float16 - use_fp32_weight = is_node_blocked or ( - i in ALWAYS_FLOAT_INPUTS.get(n.op_type, []) - and i not in force_fp16_inputs_dict.get(n.op_type, []) - ) - fp32_initializers[input_name].add_node(n, use_fp32_weight) + if i in ALWAYS_FLOAT_INPUTS.get(n.op_type, []) and i not in force_fp16_inputs_dict.get( + n.op_type, [] + ): + dtype = TensorProto.FLOAT + elif is_node_blocked: + dtype = ( + TensorProto.BFLOAT16 + if (use_bfloat16_as_blocked_nodes_dtype and n.op_type not in BF16_OP_BLACK_LIST) + else TensorProto.FLOAT + ) + else: + dtype = TensorProto.FLOAT16 + fp32_initializers[input_name].add_node(n, dtype) if is_node_blocked: node_list.append(n) @@ -404,15 +422,21 @@ def convert_float_to_float16( queue = next_level + initializers_to_be_casted_to_bf16: Dict[str, TensorProto] = {} for value in fp32_initializers.values(): # By default, to avoid precision loss, do not convert an initializer to fp16 when it is used only by fp32 nodes. if force_fp16_initializers or value.fp16_nodes: value.initializer = convert_tensor_float_to_float16(value.initializer, min_positive_val, max_finite_val) value_info_list.append(make_value_info_from_tensor(value.initializer)) - if value.fp32_nodes and not force_fp16_initializers: + if (value.fp32_nodes or value.bf16_nodes) and not force_fp16_initializers: logger.info( - f"initializer is used by both fp32 and fp16 nodes. Consider add these nodes to block list:{value.fp16_nodes}" + f"initializer is used by both fp32/bf16 and fp16 nodes. Consider add these nodes to block list:{value.fp16_nodes}" ) + elif value.bf16_nodes: + # If float initializer is only used by bfloat16 nodes, need to convert it to bfloat16. + # However, numpy does not support bfloat16, so we will add a Cast node to conver it later. + initializers_to_be_casted_to_bf16[value.initializer.name] = value.initializer + continue # Some operators have data type fixed as float for some input. Add a float16 to float cast for those inputs. 
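Before the cast-insertion loop that follows, the three-way dtype decision made in the hunk above can be restated as a small sketch; the parameter names here are invented for illustration (the real code reads ALWAYS_FLOAT_INPUTS, force_fp16_inputs_dict and BF16_OP_BLACK_LIST directly).

```python
from onnx import TensorProto

def pick_initializer_dtype(op_type, input_index, is_node_blocked,
                           always_float_inputs, force_fp16_inputs,
                           use_bf16_for_blocked, bf16_black_list):
    """Restates the dtype chosen for an fp32 initializer feeding one node input."""
    if (input_index in always_float_inputs.get(op_type, [])
            and input_index not in force_fp16_inputs.get(op_type, [])):
        return TensorProto.FLOAT         # this input must stay float (e.g. Resize scales)
    if is_node_blocked:
        if use_bf16_for_blocked and op_type not in bf16_black_list:
            return TensorProto.BFLOAT16  # blocked node that can run in bfloat16
        return TensorProto.FLOAT         # blocked node stays in float32
    return TensorProto.FLOAT16           # everything else is converted to fp16
```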
for node in mixed_float_type_node_list: @@ -435,14 +459,16 @@ def convert_float_to_float16( node.input[i] = output_name break - accuracy_type = TensorProto.BFLOAT16 if use_bfloat16_as_blocked_nodes_dtype else TensorProto.FLOAT # process the nodes in block list that doesn't support tensor(float16) for node in node_list: # if input's name is in the value_info_list meaning input is tensor(float16) type, - # insert a float16 to float Cast node before the node, + # insert a float16 to target type (float or bfloat16) Cast node before the node, # change current node's input name and create new value_info for the new name + use_bf16 = use_bfloat16_as_blocked_nodes_dtype and node.op_type not in BF16_OP_BLACK_LIST + accuracy_type = TensorProto.BFLOAT16 if use_bf16 else TensorProto.FLOAT for i in range(len(node.input)): input_name = node.input[i] + is_input_converted = False for value_info in value_info_list: if input_name == value_info.name: # create new value_info for current node's new input name @@ -457,9 +483,24 @@ def convert_float_to_float16( model.graph.node.extend(new_node) # change current node's input name node.input[i] = output_name + is_input_converted = True break - # if output's name is in the value_info_list meaning output is tensor(float16) type, insert a float to - # float16 Cast node after the node, change current node's output name and create new value_info for the new name + + # For bfloat16 nodes, we need to convert float initializers to bfloat16. + if (not is_input_converted) and use_bf16 and (input_name in initializers_to_be_casted_to_bf16): + output_name = node.name + "_input_cast_" + str(i) + value_info = helper.make_tensor_value_info( + name=output_name, elem_type=accuracy_type, shape=initializers_to_be_casted_to_bf16[input_name].dims + ) + new_value_info = model.graph.value_info.add() + new_value_info.CopyFrom(value_info) + node_name = node.name + "_input_cast" + str(i) + new_node = [helper.make_node("Cast", [input_name], [output_name], to=accuracy_type, name=node_name)] + model.graph.node.extend(new_node) + node.input[i] = output_name + + # if output's name is in the value_info_list meaning output is tensor(float16) type, insert a Cast (to float16) + # node after it, change current node's output name and create new value_info for the new name. 
for i in range(len(node.output)): output = node.output[i] for value_info in value_info_list: diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index a9ff623fb6967..efc0441b0cf4a 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -399,45 +399,6 @@ def split_kv(self, present_k_name: str, present_v_name: str, kv_node: str): self.node_name_to_graph_name[gather_k_name] = self.this_graph_name self.node_name_to_graph_name[gather_v_name] = self.this_graph_name - def transpose_kv(self, past_k: str, past_v: str): - """Transpose past_k and past_v from (B,N,P,H) to (B,P,N,H) - - Args: - past_k (str): name of past K value of shape (B,N,P,H) - past_v (str): name of past V value of shape (B,N,P,H) - - Returns: - past_k_transpose (str): name of past K value of shape (B,P,N,H) - past_v_transpose (str): name of past V value of shape (B,P,N,H) - """ - past_k_transpose = (past_k + "_transposed").replace(".", "_") - past_v_transpose = (past_v + "_transposed").replace(".", "_") - transpose_k_name = self.model.create_node_name("Transpose") - transpose_v_name = self.model.create_node_name("Transpose") - - transpose_k = helper.make_node( - "Transpose", - inputs=[past_k], - outputs=[past_k_transpose], - name=transpose_k_name, - perm=[0, 2, 1, 3], - ) - transpose_v = helper.make_node( - "Transpose", - inputs=[past_v], - outputs=[past_v_transpose], - name=transpose_v_name, - perm=[0, 2, 1, 3], - ) - - # Add reshape nodes to graph - self.nodes_to_add.append(transpose_k) - self.nodes_to_add.append(transpose_v) - self.node_name_to_graph_name[transpose_k_name] = self.this_graph_name - self.node_name_to_graph_name[transpose_v_name] = self.this_graph_name - - return past_k_transpose, past_v_transpose - def create_combined_qkv_bias( self, q_add: NodeProto, diff --git a/onnxruntime/python/tools/transformers/fusion_fastgelu.py b/onnxruntime/python/tools/transformers/fusion_fastgelu.py index a9f46585faad7..e2bb8027c8608 100644 --- a/onnxruntime/python/tools/transformers/fusion_fastgelu.py +++ b/onnxruntime/python/tools/transformers/fusion_fastgelu.py @@ -26,6 +26,9 @@ def fuse(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict): if self.fuse_3(tanh_node, input_name_to_nodes, output_name_to_node): return + if self.fuse_4(tanh_node, input_name_to_nodes, output_name_to_node): + return + def fuse_1(self, tanh_node, input_name_to_nodes, output_name_to_node) -> Optional[bool]: """ Fuse Gelu with tanh into one node: @@ -358,3 +361,122 @@ def fuse_3(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict self.nodes_to_add.append(fused_node) self.node_name_to_graph_name[fused_node.name] = self.this_graph_name return True + + def fuse_4(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: + """ + This pattern is from stable diffusion 3.5 model. + Fuse Gelu with tanh into one node: + +-----------------+------------------+ + | | | + | v v + [root] ==> Mul --> Mul --> Mul -----> Add --> Mul --> Tanh --> Add -----> Mul --> Mul --> + | (A=0.0447) (A=0.7978) (A=1) ^ (A=0.5) + | | + +-------------------------------------------------------------------------+ + Note that constant input for Add and Mul could be first or second input. 
+ """ + if tanh_node.output[0] not in input_name_to_nodes: + return + + children = input_name_to_nodes[tanh_node.output[0]] + if len(children) != 1 or children[0].op_type != "Add": + return + add_after_tanh = children[0] + + if not self.model.has_constant_input(add_after_tanh, 1.0): + return + + if add_after_tanh.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[add_after_tanh.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_after_tanh = children[0] + + if mul_after_tanh.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[mul_after_tanh.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_half = children[0] + if not self.model.has_constant_input(mul_half, 0.5): + return + + root_input = mul_after_tanh.input[0 if mul_after_tanh.input[1] == add_after_tanh.output[0] else 1] + + mul_before_tanh = self.model.match_parent(tanh_node, "Mul", 0, output_name_to_node) + if mul_before_tanh is None: + return + + i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) + if i < 0: + return + + add_before_tanh = self.model.match_parent(mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node) + if add_before_tanh is None: + return + + if add_before_tanh.input[0] == root_input: + another = 1 + elif add_before_tanh.input[1] == root_input: + another = 0 + else: + return + + mul_after_pow = self.model.match_parent(add_before_tanh, "Mul", another, output_name_to_node) + if mul_after_pow is None: + return + + i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) + if i < 0: + return + + mul = self.model.match_parent(mul_after_pow, "Mul", 0 if i == 1 else 1, output_name_to_node) + if mul is None: + return + + if mul.input[0] == root_input: + another = 1 + elif mul.input[1] == root_input: + another = 0 + else: + return + + mul2 = self.model.match_parent(mul, "Mul", another, output_name_to_node) + if mul2 is None: + return + + if mul2.input[0] != root_input or mul2.input[1] != root_input: + return + + subgraph_nodes = [ + mul2, + mul, + mul_after_pow, + add_before_tanh, + mul_before_tanh, + tanh_node, + add_after_tanh, + mul_after_tanh, + mul_half, + ] + + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + [mul_half.output[0]], + input_name_to_nodes, + output_name_to_node, + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + fused_node = helper.make_node( + "FastGelu", + inputs=[root_input], + outputs=mul_half.output, + name=self.model.create_node_name("FastGelu"), + ) + fused_node.domain = "com.microsoft" + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + return True diff --git a/onnxruntime/python/tools/transformers/fusion_layernorm.py b/onnxruntime/python/tools/transformers/fusion_layernorm.py index aac05a7f01325..277bd0799cf16 100644 --- a/onnxruntime/python/tools/transformers/fusion_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_layernorm.py @@ -13,8 +13,10 @@ class FusionLayerNormalization(Fusion): - def __init__(self, model: OnnxModel): + def __init__(self, model: OnnxModel, check_constant_and_dimension: bool = True, force: bool = False): super().__init__(model, "LayerNormalization", "ReduceMean") + self.check_constant_and_dimension = check_constant_and_dimension + self.force = force def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): """ @@ -23,9 +25,9 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: 
Dict): | | | v [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add - (axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^ - | | - +-----------------------------------------------+ + (axis=2 or -1) | (Y=2) (axis=2 or -1) (B=E-6 or E-12) ^ + | | + +-------------------------------------------------+ It also handles cases of duplicated sub nodes exported from older version of PyTorch: +----------------------+ @@ -56,18 +58,20 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): for child in children: # Check if Sub --> Div exists div_node_1 = self.model.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False) - - # Check if Sub --> Cast --> Div - div_node_2 = self.model.match_child_path(child, ["Cast", "Div"], exclude=[]) - if div_node_1 is not None: div_node = div_node_1 - elif div_node_2 is not None: - div_node = div_node_2[-1] + break + else: + # Check if Sub --> Cast --> Div + div_node_2 = self.model.match_child_path(child, ["Cast", "Div"]) + if div_node_2 is not None: + div_node = div_node_2[-1] + break + if div_node is None: return - path_id, parent_nodes, _ = self.model.match_parent_paths( + _path_id, parent_nodes, _ = self.model.match_parent_paths( div_node, [ (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]), @@ -75,72 +79,93 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): ], output_name_to_node, ) - if path_id < 0: + if parent_nodes is None: return sub_node = parent_nodes[-1] if sub_node not in children: return - second_add_node = parent_nodes[1] - i, add_weight = self.model.get_constant_input(second_add_node) - if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: - logger.debug(f"skip SkipLayerNormalization fusion since epsilon value is not expected: {add_weight}") + add_eps_node = parent_nodes[1] + i, epsilon = self.model.get_constant_input(add_eps_node) + if epsilon is None or epsilon <= 0 or epsilon > 1.0e-4: + logger.debug(f"skip SkipLayerNormalization fusion since epsilon value is not expected: {epsilon}") return pow_node = parent_nodes[3] if self.model.find_constant_input(pow_node, 2.0) != 1: return - temp_node = input_name_to_nodes[div_node.output[0]][0] - if temp_node.op_type == "Cast": - # Div --> Cast --> Mul - subgraph_nodes.append(temp_node) # add Cast node to list of subgraph nodes - mul_node = input_name_to_nodes[temp_node.output[0]][0] - else: - # Div --> Mul - mul_node = temp_node - if mul_node.op_type != "Mul": - return - - last_add_node = input_name_to_nodes[mul_node.output[0]][0] - if last_add_node.op_type != "Add": - return - - subgraph_nodes.append(node) - subgraph_nodes.extend(children) - subgraph_nodes.extend(parent_nodes[:-1]) - - subgraph_nodes.extend([last_add_node, mul_node, div_node]) - if not self.model.is_safe_to_fuse_nodes( - subgraph_nodes, - last_add_node.output, - input_name_to_nodes, - output_name_to_node, - ): - logger.debug("It is not safe to fuse LayerNormalization node. 
Skip") - return - - node_before_weight = div_node if temp_node.op_type != "Cast" else temp_node - weight_input = mul_node.input[1 - self.model.input_index(node_before_weight.output[0], mul_node)] - if not self.model.is_constant_with_specified_dimension(weight_input, 1, "layernorm weight"): - return - - bias_input = last_add_node.input[1 - self.model.input_index(mul_node.output[0], last_add_node)] - if not self.model.is_constant_with_specified_dimension(bias_input, 1, "layernorm bias"): - return - - self.nodes_to_remove.extend(subgraph_nodes) - - normalize_node = helper.make_node( - "LayerNormalization", - inputs=[node.input[0], weight_input, bias_input], - outputs=[last_add_node.output[0]], - name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"), - ) - normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))]) - self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name + if div_node.output[0] not in input_name_to_nodes: + return + + # In MMDit model, Div might have two Mul+Add children paths. + div_children = input_name_to_nodes[div_node.output[0]] + for temp_node in div_children: + if temp_node.op_type == "Cast": + # Div --> Cast --> Mul + subgraph_nodes.append(temp_node) # add Cast node to list of subgraph nodes + if temp_node.output[0] not in input_name_to_nodes: + continue + mul_node = input_name_to_nodes[temp_node.output[0]][0] + else: + # Div --> Mul + mul_node = temp_node + if mul_node.op_type != "Mul": + continue + + if mul_node.output[0] not in input_name_to_nodes: + continue + last_add_node = input_name_to_nodes[mul_node.output[0]][0] + if last_add_node.op_type != "Add": + continue + + subgraph_nodes.append(node) + subgraph_nodes.extend(children) + subgraph_nodes.extend(parent_nodes[:-1]) + + subgraph_nodes.extend([last_add_node, mul_node, div_node]) + + node_before_weight = div_node if temp_node.op_type != "Cast" else temp_node + weight_input = mul_node.input[1 - self.model.input_index(node_before_weight.output[0], mul_node)] + if self.check_constant_and_dimension and not self.model.is_constant_with_specified_dimension( + weight_input, 1, "layernorm weight" + ): + continue + + bias_input = last_add_node.input[1 - self.model.input_index(mul_node.output[0], last_add_node)] + if self.check_constant_and_dimension and not self.model.is_constant_with_specified_dimension( + bias_input, 1, "layernorm bias" + ): + continue + + layer_norm_output = last_add_node.output[0] + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + last_add_node.output, + input_name_to_nodes, + output_name_to_node, + ): + # If it is not safe to fuse, somce computation may be duplicated if we force to fuse it. + # It it unknown that force fusion might bring performance gain/loss. + # User need test performance impact to see whether forcing fusion can help. + if self.force: + self.prune_graph = True + else: + logger.debug("It is not safe to fuse LayerNormalization node. 
Skip") + continue + else: + self.nodes_to_remove.extend(subgraph_nodes) + + normalize_node = helper.make_node( + "LayerNormalization", + inputs=[node.input[0], weight_input, bias_input], + outputs=[layer_norm_output], + name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"), + ) + normalize_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))]) + self.nodes_to_add.append(normalize_node) + self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name class FusionLayerNormalizationNCHW(Fusion): @@ -218,9 +243,9 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): if sub != sub_node: return - i, add_weight = self.model.get_constant_input(second_add_node) - if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: - logger.debug(f"skip SkipLayerNormalization fusion since epsilon value is not expected: {add_weight}") + i, epsilon = self.model.get_constant_input(second_add_node) + if epsilon is None or epsilon <= 0 or epsilon > 1.0e-4: + logger.debug(f"skip SkipLayerNormalization fusion since epsilon value is not expected: {epsilon}") return axes = OnnxModel.get_node_attribute(reduce_mean_node, "axes") @@ -286,7 +311,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): outputs=[layernorm_node_name + "_out_nhwc"], name=layernorm_node_name, ) - normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))]) + normalize_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))]) self.nodes_to_add.append(transpose_input) self.nodes_to_add.append(normalize_node) diff --git a/onnxruntime/python/tools/transformers/fusion_mha_mmdit.py b/onnxruntime/python/tools/transformers/fusion_mha_mmdit.py new file mode 100644 index 0000000000000..dcad55c13eb49 --- /dev/null +++ b/onnxruntime/python/tools/transformers/fusion_mha_mmdit.py @@ -0,0 +1,668 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger +from typing import Dict, Optional + +import numpy as np +from fusion_base import Fusion +from fusion_utils import FusionUtils +from onnx import NodeProto, TensorProto, helper, numpy_helper +from onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionMultiHeadAttentionMMDit(Fusion): + """ + Fuse MultiHeadAttention for Multimodal Diffusion Transformer (MMDiT). + """ + + def __init__(self, model: OnnxModel): + super().__init__(model, fused_op_type="MultiHeadAttention", search_op_types=["Softmax"]) + self.unsqueeze_update_map = {} + + def get_num_heads(self, start_node: NodeProto, output_name_to_node, input_index=0) -> int: + """ + Detect num_heads from Reshape & Transpose of q/k/v for both Stable Diffusion 3.x and Flux 1.x: + + MatMul .. [-1] [24] .. 
+ | | | / / + Add Concat(axis=0) + | / + Reshape + | + Transpose(perm=0,1,3,2) + | + (start_node) + """ + nodes = self.model.match_parent_path( + start_node, ["Transpose", "Reshape", "Concat"], [input_index, 0, 1], output_name_to_node=output_name_to_node + ) + if nodes is None: + return 0 + + concat_shape = nodes[-1] + if len(concat_shape.input) != 4: + return 0 + + value = self.model.get_constant_value(concat_shape.input[2]) + if value is None: + return 0 + + if len(value.shape) != 1: + return 0 + + return int(value[0]) + + def get_num_heads_from_k(self, transpose_k: NodeProto, output_name_to_node, concat_before_transpose: bool) -> int: + """ + Detect num_heads from subgraph like the following (num_heads=24 in this example): + MatMu .. [-1] [24] .. + | | | / / + Add Concat + | / + Reshape + | + Transpose(perm=0,2,1,3) + | + SimplifiedLayerNormalization + | + Transpose(perm=0,1,3,2) + + Another variant is to an extra Concat node to join two symmetrical subgraphs: + + | | + MatMul MatMul .. [-1] [24] .. + | | | | / / + Add Concat Add Concat + | / | / + Reshape Reshape + | | + Transpose Transpose(perm=0,2,1,3) + | | + SimplifiedLayerNormalization SimplifiedLayerNormalization + | / + Concat + | + Transpose(perm=0,1,3,2) + + Both patterns are used in stable diffusion 3.5 model. + """ + if concat_before_transpose: + nodes = self.model.match_parent_path( + transpose_k, ["Concat", "SimplifiedLayerNormalization"], [0, 1], output_name_to_node=output_name_to_node + ) + if nodes: + return self.get_num_heads(nodes[1], output_name_to_node) + else: + nodes = self.model.match_parent_path( + transpose_k, ["SimplifiedLayerNormalization"], [0], output_name_to_node=output_name_to_node + ) + if nodes: + return self.get_num_heads(nodes[0], output_name_to_node) + + return 0 + + def reshape_to_3d(self, input_name: str, output_name: str) -> str: + """Add a Reshape node to convert 4D BxSxNxH to 3D BxSxD. + + Args: + input_name (str): input name for the 4D tensor of shape BxSxNxH. + output_name (str): output name for the 3D tensor of shape BxSxD, where D = N * H. + + Returns: + str: the output name + """ + + new_dims_name = "bsnh_to_bsd_reshape_dims" + new_dims = self.model.get_initializer(new_dims_name) + if new_dims is None: + new_dims = numpy_helper.from_array(np.array([0, 0, -1], dtype="int64"), name=new_dims_name) + self.model.add_initializer(new_dims, self.this_graph_name) + reshape_q = helper.make_node( + "Reshape", + inputs=[input_name, new_dims_name], + outputs=[output_name], + name=self.model.create_node_name("Reshape"), + ) + self.nodes_to_add.append(reshape_q) + self.node_name_to_graph_name[reshape_q.name] = self.this_graph_name + return reshape_q.output[0] + + def adjust_query_from_bnsh_to_bsd_no_concat(self, mul_q: NodeProto, output_name_to_node) -> Optional[str]: + """ + MultiHeadAttenion requires query in BSD format. This function adjusts query from BNSH to BSD format. 
+ + Before: + MatMul + | + Add Concat + | / + Reshape + | + Transpose(perm=0,2,1,3) + | + SimplifiedLayerNorm + | + Mul + + After: + MatMul + | + Add Concat + | / + Reshape + | + SimplifiedLayerNorm + | + Reshape (shape=[0, 0, -1]) + """ + + path = self.model.match_parent_path( + mul_q, + ["SimplifiedLayerNormalization", "Transpose"], + [0, 0], + ) + if path is None: + return None + sln_a, transpose_a = path + + if not FusionUtils.check_node_attribute(transpose_a, "perm", [0, 2, 1, 3]): + return None + + # Update the graph + sln_a.input[0] = transpose_a.input[0] + sln_output = sln_a.output[0] + sln_a.output[0] = sln_output + "_BSNH" + + return self.reshape_to_3d(sln_a.output[0], sln_output + "_BSD") + + def adjust_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_name_to_node) -> Optional[str]: + """ + MultiHeadAttenion requires query in BSD format. This function adjusts query from BNSH to BSD format. + + Before: + MatMul MatMul + | | + Add Concat Add Concat + | / | / + Reshape Reshape + | | + Transpose(perm=0,2,1,3) Transpose(perm=0,2,1,3) + | | + SimplifiedLayerNorm SimplifiedLayerNorm + | / + Concat(axis=2) + | + Mul + + After: + MatMul MatMul + | | + Add Concat Add Concat + | / | / + Reshape Reshape + | | + SimplifiedLayerNorm SimplifiedLayerNorm + | / + Concat(axis=1) + | + Reshape (shape=[0, 0, -1]) + """ + + path = self.model.match_parent_path( + mul_q, + ["Concat", "SimplifiedLayerNormalization", "Transpose"], + [0, 0, 0], + ) + if path is None: + return None + concat, sln_a, transpose_a = path + + if len(concat.input) != 2: + return None + + path = self.model.match_parent_path( + concat, + ["SimplifiedLayerNormalization", "Transpose"], + [1, 0], + ) + if path is None: + return None + sln_b, transpose_b = path + + if not FusionUtils.check_node_attribute(transpose_a, "perm", [0, 2, 1, 3]): + return None + + if not FusionUtils.check_node_attribute(transpose_b, "perm", [0, 2, 1, 3]): + return None + + if not FusionUtils.check_node_attribute(concat, "axis", 2): + return None + + # Update the graph + sln_a.input[0] = transpose_a.input[0] + sln_b.input[0] = transpose_b.input[0] + + new_concat_node = helper.make_node( + "Concat", + inputs=[sln_a.output[0], sln_b.output[0]], + outputs=[concat.output[0] + "_BSNH"], + name=self.model.create_node_name("Concat"), + axis=1, + ) + self.nodes_to_add.append(new_concat_node) + self.node_name_to_graph_name[new_concat_node.name] = self.this_graph_name + + return self.reshape_to_3d(new_concat_node.output[0], concat.output[0] + "_BSD") + + def update_unsqueeze_axes_1_to_2(self, unsqueeze: NodeProto) -> str: + updated_unsqueeze_output = self.unsqueeze_update_map.get(unsqueeze.name) + if updated_unsqueeze_output is None: + if len(unsqueeze.input) == 1: + new_node = helper.make_node( + "Unsqueeze", + inputs=unsqueeze.input, + outputs=[unsqueeze.output[0] + "_BSNH"], + name=self.model.create_node_name("Unsqueeze"), + axes=[2], + ) + else: + initializer_name = "unsqueeze_axes_2" + if self.model.get_initializer(initializer_name) is None: + unsqueeze_axes_2 = helper.make_tensor( + name=initializer_name, + data_type=TensorProto.INT64, + dims=[1], # Shape of the tensor + vals=[2], # Tensor values + ) + self.model.add_initializer(unsqueeze_axes_2, self.this_graph_name) + + new_node = helper.make_node( + "Unsqueeze", + inputs=[unsqueeze.input[0], initializer_name], + outputs=[unsqueeze.output[0] + "_BSNH"], + name=self.model.create_node_name("Unsqueeze"), + ) + + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = 
self.this_graph_name + updated_unsqueeze_output = new_node.output[0] + self.unsqueeze_update_map[unsqueeze.name] = updated_unsqueeze_output + + return updated_unsqueeze_output + + def update_unsqueeze_axes(self, add: NodeProto, output_name_to_node: Dict[str, NodeProto]) -> bool: + """ + Update axes of Unsqueeze from [1] to [2] in the following pattern: + Unsqueeze Unsqueeze + (axes=[0]) (axes=[0]) + | | + Unsqueeze Unsqueeze + ... (axes=[1]) ... (axes=[1]) + | / | / + Mul Mul + | / + Add + Args: + add (NodeProto): the Add node + output_name_to_node (Dict[str, NodeProto]): mapping from output name to node + + Returns: + bool: True if the pattern is matched and updated successfully, False otherwise. + """ + if len(add.input) != 2: + return False + + # Check axes of Unsqueeze nodes are [0] and [1], and change to [0] and [2] respectively. + nodes_b = self.model.match_parent_path(add, ["Mul", "Unsqueeze", "Unsqueeze"], [1, 1, 0], output_name_to_node) + if nodes_b is None: + return False + + fusion_utils = FusionUtils(self.model) + axes_1 = fusion_utils.get_squeeze_or_unsqueeze_axes(nodes_b[1]) + if axes_1 is None or axes_1 != [1]: + return False + + axes_0 = fusion_utils.get_squeeze_or_unsqueeze_axes(nodes_b[2]) + if axes_0 is None or axes_0 != [0]: + return False + + # Check axes of Unsqueeze nodes are [0] and [1], and change to [0] and [2] respectively. + nodes_a = self.model.match_parent_path(add, ["Mul", "Unsqueeze", "Unsqueeze"], [0, 1, 0], output_name_to_node) + if nodes_a is None: + return False + + axes_1 = fusion_utils.get_squeeze_or_unsqueeze_axes(nodes_a[1]) + if axes_1 is None or axes_1 != [1]: + return False + + axes_0 = fusion_utils.get_squeeze_or_unsqueeze_axes(nodes_a[2]) + if axes_0 is None or axes_0 != [0]: + return False + + nodes_a[0].input[1] = self.update_unsqueeze_axes_1_to_2(nodes_a[1]) + nodes_b[0].input[1] = self.update_unsqueeze_axes_1_to_2(nodes_b[1]) + return True + + def adjust_flux_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_name_to_node) -> Optional[str]: + """ + Adjust graph to change query format from BNSH to BSD for Flux model. + Note that the graph pattern is complex, and we only do a shallow match here. + + Before: + | | + Transpose(perm=0,2,1,3) Transpose(perm=0,2,1,3) + | | + SimplifiedLayerNorm SimplifiedLayerNorm + | / + Concat(axis=2) + | + Mul Mul + | / + Add + | + Mul + + After (Transpose nods are removed, and a Reshape is added): + + | | + SimplifiedLayerNorm SimplifiedLayerNorm + | / + Concat(axis=1) + | + Mul Mul + | / + Add + | + Reshape (shape=[0, 0, -1]) + """ + + path = self.model.match_parent_path( + mul_q, + ["Add", "Mul", "Concat", "SimplifiedLayerNormalization", "Transpose"], + [0, 0, 0, 0, 0], + ) + if path is None: + return None + add, _mul_a, concat, sln_a, transpose_a = path + + if len(concat.input) != 2: + return None + + path = self.model.match_parent_path( + concat, + ["SimplifiedLayerNormalization", "Transpose"], + [1, 0], + ) + if path is None: + return None + sln_b, transpose_b = path + + if not FusionUtils.check_node_attribute(transpose_a, "perm", [0, 2, 1, 3]): + return None + + if not FusionUtils.check_node_attribute(transpose_b, "perm", [0, 2, 1, 3]): + return None + + if not FusionUtils.check_node_attribute(concat, "axis", 2): + return None + + # Need adjust axes of Unsqueeze nodes from [1] to [2] so that the tensors to Mul nodes are BSNH instead of BNSH. 
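Why the Unsqueeze axes move from [1] to [2]: after the Transpose removal the tensor feeding the Mul nodes is laid out as BSNH instead of BNSH, so the (presumably rotary) factors must insert their singleton dimension one axis later to keep broadcasting correct. A small NumPy illustration with hypothetical shapes:

```python
import numpy as np

factor = np.random.rand(5, 8)                                # e.g. (S, H); exact meaning is an assumption

bnsh_factor = np.expand_dims(np.expand_dims(factor, 0), 1)   # axes [0] then [1] -> (1, 1, S, H)
bsnh_factor = np.expand_dims(np.expand_dims(factor, 0), 2)   # axes [0] then [2] -> (1, S, 1, H)

q_bnsh = np.random.rand(2, 3, 5, 8)                          # (B, N, S, H)
q_bsnh = q_bnsh.transpose(0, 2, 1, 3)                        # (B, S, N, H) after removing the Transpose

assert (q_bnsh * bnsh_factor).shape == (2, 3, 5, 8)          # broadcasts against BNSH
assert (q_bsnh * bsnh_factor).shape == (2, 5, 3, 8)          # broadcasts against BSNH
```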
+ if not self.update_unsqueeze_axes(add, output_name_to_node): + return None + + # Update the graph + sln_a.input[0] = transpose_a.input[0] + sln_b.input[0] = transpose_b.input[0] + + new_concat_node = helper.make_node( + "Concat", + inputs=[sln_a.output[0], sln_b.output[0]], + outputs=[concat.output[0] + "_BSNH"], + name=self.model.create_node_name("Concat"), + axis=1, + ) + self.nodes_to_add.append(new_concat_node) + self.node_name_to_graph_name[new_concat_node.name] = self.this_graph_name + self.model.replace_input_of_all_nodes(concat.output[0], new_concat_node.output[0]) + + return self.reshape_to_3d(add.output[0], add.output[0] + "_BSD") + + def adjust_flux_single_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_name_to_node) -> Optional[str]: + """ + Adjust graph to change query format from BNSH to BSD for Flux model. + Note that the graph pattern is complex, and we only do a shallow match here. + + Before: + | + Transpose(perm=0,2,1,3) + | + SimplifiedLayerNorm + | + Mul Mul + | / + Add + | + Mul + + After (Transpose is removed, and a Reshape is added): + + | + SimplifiedLayerNorm + | + Mul Mul + | / + Add + | + Reshape (shape=[0, 0, -1]) + """ + + path = self.model.match_parent_path( + mul_q, + ["Add", "Mul", "SimplifiedLayerNormalization", "Transpose"], + [0, 0, 0, 0], + ) + if path is None: + return None + add, _mul_a, sln_a, transpose_a = path + + if not FusionUtils.check_node_attribute(transpose_a, "perm", [0, 2, 1, 3]): + return None + + # Need adjust axes of Unsqueeze nodes from [1] to [2] so that the tensors to Mul nodes are BSNH instead of BNSH. + if not self.update_unsqueeze_axes(add, output_name_to_node): + return None + + # Update the graph + sln_a.input[0] = transpose_a.input[0] + add.output[0] = add.output[0] + "_BSNH" + + return self.reshape_to_3d(add.output[0], add.output[0] + "_BSD") + + def transpose_reshape_bnsh_to_bsd(self, q: str, output_name_to_node) -> Optional[str]: + transpose_q = helper.make_node( + "Transpose", + [q], + [q + "_BSNH"], + name=self.model.create_node_name("Transpose", name_prefix="Transpose_BNSH_to_BSNH"), + perm=[0, 2, 1, 3], + ) + self.nodes_to_add.append(transpose_q) + self.node_name_to_graph_name[transpose_q.name] = self.this_graph_name + + return self.reshape_to_3d(q + "_BSNH", q + "_BSD") + + def create_multihead_attention_node( + self, + q: str, + k: str, + v: str, + output: str, + num_heads: int, + ) -> NodeProto: + """ + Create a MultiHeadAttention node. + + Args: + q (str): name of q + k (str): name of k + v (str): name of v + output (str): output name of MHA + num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. + + Returns: + NodeProto: the node created. + """ + + assert num_heads > 0 + + # Add inputs for MHA: Query, Key, Value (Proj_Bias, Mask, Attention_Bias, Past_K, Past_V are optional) + mha_inputs = [q, k, v] + + # Add outputs for MHA (Present_K, Present_V are optional) + mha_outputs = [output] + + mha_node = helper.make_node( + "MultiHeadAttention", + inputs=mha_inputs, + outputs=mha_outputs, + name=self.model.create_node_name("MultiHeadAttention"), + ) + + mha_node.domain = "com.microsoft" + mha_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + + # No mask is used in MMDit model, so we need not set the optional mask_filter_value attribute. + return mha_node + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + assert node.op_type == "Softmax" + softmax = node + + # Softmax output shall not be graph output. 
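For orientation before the pattern matching starts: the subgraph this pass looks for computes scaled dot-product attention in BNSH layout (Q and K each pre-multiplied by the square root of the softmax scale), followed by Transpose and Reshape back to BSD. A NumPy reference of those numerics; the scaling and shapes are illustrative only.

```python
import numpy as np

def attention_reference(q_bnsh, k_bnsh, v_bnsh):
    """Softmax((Q*s) @ (K*s)^T) @ V in BNSH, then Transpose + Reshape to BSD."""
    b, n, s, h = q_bnsh.shape
    sqrt_scale = np.sqrt(1.0 / np.sqrt(h))           # each side carries sqrt of the 1/sqrt(h) scale
    scores = (q_bnsh * sqrt_scale) @ (k_bnsh.transpose(0, 1, 3, 2) * sqrt_scale)
    probs = np.exp(scores - scores.max(-1, keepdims=True))
    probs /= probs.sum(-1, keepdims=True)            # Softmax over the last axis
    out_bnsh = probs @ v_bnsh                        # the MatMul matched right after Softmax
    return out_bnsh.transpose(0, 2, 1, 3).reshape(b, s, n * h)  # Transpose(0,2,1,3) + Reshape to BSD
```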
+ if self.model.find_graph_output(softmax.output[0]): + return + + nodes = self.model.match_child_path( + softmax, ["MatMul", "Transpose", "Reshape"], [(0, 0), (0, 0), (0, 0)], input_name_to_nodes + ) + if nodes is None: + return + + matmul_s_v, transpose_out, reshape_out = nodes + if not FusionUtils.check_node_attribute(transpose_out, "perm", [0, 2, 1, 3]): + return + + q_nodes = self.model.match_parent_path( + softmax, + ["MatMul", "Mul", "Sqrt", "Div", "Sqrt", "Cast", "Slice", "Shape"], + [0, 0, 1, 0, 1, 0, 0, 0], + ) + + if q_nodes is None: + return + + matmul_qk, mul_q, sqrt_q_2, div_q, sqrt_q, _, _, shape_q = q_nodes + + q_bnsh = mul_q.input[0] + if q_bnsh != shape_q.input[0]: + return + + k_nodes = self.model.match_parent_path(matmul_qk, ["Mul", "Transpose"], [1, 0]) + if k_nodes is None: + return + + mul_k, transpose_k = k_nodes + k = transpose_k.input[0] + if not FusionUtils.check_node_attribute(transpose_k, "perm", [0, 1, 3, 2]): + return + + k_scale_nodes = self.model.match_parent_path(mul_k, ["Sqrt", "Div"], [1, 0]) + if k_scale_nodes is None: + return + if k_scale_nodes[0].input[0] != sqrt_q_2.input[0]: + return + + v = matmul_s_v.input[1] + + # Here we sanity check the v path to make sure it is in the expected BNSH format. + concat_v = self.model.match_parent(matmul_s_v, "Concat", input_index=1, output_name_to_node=output_name_to_node) + if concat_v is not None: + # Match v path like: + # -- Transpose (perm=[0,2,1,3]) ----+ + # | + # v + # -- Transpose (perm=[0,2,1,3]) -> Concat -> (v) + transpose_1 = self.model.match_parent( + concat_v, "Transpose", input_index=0, output_name_to_node=output_name_to_node + ) + if transpose_1 is None: + return + if not FusionUtils.check_node_attribute(transpose_1, "perm", [0, 2, 1, 3]): + return + + transpose_2 = self.model.match_parent( + concat_v, "Transpose", input_index=1, output_name_to_node=output_name_to_node + ) + if transpose_2 is None: + return + if not FusionUtils.check_node_attribute(transpose_2, "perm", [0, 2, 1, 3]): + return + else: + # Match v path like: + # -- Transpose (perm=[0,2,1,3]) -> (v) + transpose_1 = self.model.match_parent( + matmul_s_v, "Transpose", input_index=1, output_name_to_node=output_name_to_node + ) + if transpose_1 is None: + return + if not FusionUtils.check_node_attribute(transpose_1, "perm", [0, 2, 1, 3]): + return + + # Match patterns for Flux. + num_heads = ( + self.get_num_heads(concat_v, output_name_to_node) + if concat_v + else self.get_num_heads(matmul_s_v, output_name_to_node, input_index=1) + ) + + if num_heads == 0: + # Match patterns for Stable Diffusion 3.5. + num_heads = self.get_num_heads_from_k(transpose_k, output_name_to_node, concat_v is not None) + if num_heads <= 0: + return + + # Q is in BNSH format, we need to adjust it to BSD format due to limitation of MHA op. + # TODO: MHA op support BNSH format to reduce the effort in fusion. + if concat_v is not None: + query = self.adjust_query_from_bnsh_to_bsd(mul_q, output_name_to_node) + else: + query = self.adjust_query_from_bnsh_to_bsd_no_concat(mul_q, output_name_to_node) + + if query is None: + query = self.adjust_flux_query_from_bnsh_to_bsd(mul_q, output_name_to_node) + if query is None: + query = self.adjust_flux_single_query_from_bnsh_to_bsd(mul_q, output_name_to_node) + if query is None: + # fallback to use Transpose and Add to adjust query from BNSH to BSD + # This is more general approach. + # However, it might be slower if the extra Transpose node cannot be removed by ORT optimizer. 
+ query = self.transpose_reshape_bnsh_to_bsd(q_bnsh, output_name_to_node) + + new_node = self.create_multihead_attention_node( + q=query, + k=k, + v=v, + output=reshape_out.output[0], + num_heads=num_heads, + ) + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + + self.nodes_to_remove.extend([matmul_s_v, transpose_out, reshape_out]) + + # Use prune graph to remove nodes + self.prune_graph = True diff --git a/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py b/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py index a872b8c2075bc..ca7ff6462b9ff 100644 --- a/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py @@ -18,134 +18,113 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): return sim_ln_nodes = None - # SimplifiedLayerNorm calculation (notation from https://onnx.ai/onnx/operators/onnx__LayerNormalization.html#summary): - # DD = Pow(D, 2) - # Var = ReduceMean(DD) - # VarEps = Add(Var, epsilon) - # StdDev = Sqrt(VarEps) - # InvStdDev = Div(1, StdDev) - # Normalized = Mul(D, InvStdDev) - # NormalizedScaled = Mul(Normalized, Scale) - - # SimplifiedLayerNorm - # +-------------------------------------------------------+ - # | | - # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul - # | - # node - sim_ln_nodes_1 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], - [1, 1, 1, 0, 0, 0, 0], - ) - # SimplifiedLayerNorm - # +-------------------------------------------------------+ - # | | - # Gather --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul - # | - # node - sim_ln_nodes_2 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Gather"], - [1, 1, 1, 0, 0, 0, 0], - ) - - # For LLaMA from Microsoft custom export: - # sim_ln_nodes_3 uses a different start parent index than sim_ln_nodes_1 + # RMSNorm formula: + # S = Pow(X, 2) or S = Mul(X, X) + # MS = ReduceMean(S) + # MSEps = Add(MS, epsilon) + # RMS = Sqrt(MSEps) + # InvRMS = Div(1, RMS) or InvRMS = Reciprocal(RMS) + # Normalized = Mul(D, InvRMS) + # Y = Mul(Normalized, Scale) # - # SimplifiedLayerNorm - # +-------------------------------------------------------+ - # | | - # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul - # | - # node - sim_ln_nodes_3 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], - [0, 1, 1, 0, 0, 0, 0], - ) - - # sim_ln_nodes_4 starts with a graph input instead of an Add node like sim_ln_nodes_3 + # (root_input) ----------------------------------------+ + # | | + # v v + # Pow --> ReduceMean --> Add ---> Sqrt --> Div --> Mul --> Mul (node) + # (B=2) (A/B=eps) (A=1) (A/B=scale) # - # SimplifiedLayerNorm - # +-----------------------------------------------+ - # | | - # graph_input --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul - # | - # node - sim_ln_nodes_4 = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow"], - [0, 1, 1, 0, 0, 0], - ) - - # For Gemma from Microsoft custom export, which has a Multiply after the Gather: + # (root_input) ----------------------------------------+ + # | | | + # v v v + # Mul --> ReduceMean --> Add ---> Sqrt --> Div --> Mul --> Mul (node) + # (B=2) (A/B=eps) (A=1) (A/B=scale) # - # SimplifiedLayerNorm - # 
+-------------------------------------------------------+ - # | | - # Mul --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul - # | - # node - sim_ln_nodes_5 = self.model.match_parent_path( + return_indice = [] + sim_ln_nodes = self.model.match_parent_path( node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Mul"], - [1, 1, 1, 0, 0, 0, 0], + ["Mul", "Div", "Sqrt", "Add", "ReduceMean"], + [None, 1, 1, 0, None], + output_name_to_node=output_name_to_node, + return_indice=return_indice, ) - add_node, pow_node = None, None - if sim_ln_nodes_1 is not None: - sim_ln_nodes = sim_ln_nodes_1 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-2] - elif sim_ln_nodes_2 is not None: - sim_ln_nodes = sim_ln_nodes_2 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-2] - elif sim_ln_nodes_3 is not None: - sim_ln_nodes = sim_ln_nodes_3 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-2] - elif sim_ln_nodes_4 is not None: - sim_ln_nodes = sim_ln_nodes_4 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-1] - # Verify that parent input to Pow node is graph_input - if pow_node.input[0] not in self.model.get_graphs_input_names(): + if sim_ln_nodes: + mul_node, div_node, _sqrt_node, add_node, reduce_mean_node = sim_ln_nodes + if not self.model.has_constant_input(div_node, 1.0): return - elif sim_ln_nodes_5 is not None: - sim_ln_nodes = sim_ln_nodes_5 - add_node = sim_ln_nodes[3] - pow_node = sim_ln_nodes[-2] else: + # Div(1, RMS) can also be represented as Reciprocal(RMS) like + # + # (root_input) -----------------------------------------------+ + # | | + # v v + # Pow --> ReduceMean --> Add ---> Sqrt --> Reciprocal --> Mul --> Mul (node) + # (B=2) (A/B=eps) (A/B=scale) + # + # (root_input) -----------------------------------------------+ + # | | | + # v v v + # Mul --> ReduceMean --> Add ---> Sqrt --> Reciprocal --> Mul --> Mul (node) + # (B=2) (A/B=eps) (A/B=scale) + # + sim_ln_nodes = self.model.match_parent_path( + node, + ["Mul", "Reciprocal", "Sqrt", "Add", "ReduceMean"], + [None, 1, 0, 0, None], + output_name_to_node=output_name_to_node, + return_indice=return_indice, + ) + if sim_ln_nodes is None: + return + mul_node, _reciprocal_node, _sqrt_node, add_node, reduce_mean_node = sim_ln_nodes + + pow_or_mul_node = self.model.get_parent(reduce_mean_node, 0, output_name_to_node) + if pow_or_mul_node is None or pow_or_mul_node.op_type not in ["Pow", "Mul"]: return - layernorm_weight_index = 1 if sim_ln_nodes in (sim_ln_nodes_3, sim_ln_nodes_4) else 0 - starts_with_graph_input = sim_ln_nodes == sim_ln_nodes_4 + if pow_or_mul_node.op_type == "Pow": + if self.model.find_constant_input(pow_or_mul_node, 2.0) != 1: + return + else: + assert pow_or_mul_node.op_type == "Mul" + if pow_or_mul_node[0] != pow_or_mul_node[1]: + return + + root_input = pow_or_mul_node.input[0] + if root_input != mul_node.input[0]: + return - if self.model.find_constant_input(pow_node, 2.0) != 1: + _i, epsilon = self.model.get_constant_input(add_node) + if epsilon is None or epsilon <= 0 or epsilon > 1.0e-4: + logger.warning(f"epsilon value is not expected: {epsilon}") return - root_input = pow_node.input[0] - if root_input != sim_ln_nodes[0].input[0]: + # ReduceMean must have keepdims == 1 + keepdims = self.model.get_node_attribute(reduce_mean_node, "keepdims") + if not keepdims: return - i, add_weight = self.model.get_constant_input(add_node) - if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: - logger.warning(f"epsilon value is not expected: {add_weight}") + # ReduceMean axes must 
refer only to the last dimension. + # Axes became an input in opset 18. Before then, axes was an attribute. + axes = self.model.get_node_attribute(reduce_mean_node, "axes") + if (not axes) and len(reduce_mean_node.input) > 1: + axes = self.model.get_constant_value(reduce_mean_node.input[1]) + # Make sure only one axis as required by SimplifiedLayerNormalization spec. + if not axes or len(axes) != 1: return - self.nodes_to_remove.extend(sim_ln_nodes[:-1] if not starts_with_graph_input else sim_ln_nodes) + self.nodes_to_remove.extend(sim_ln_nodes) + self.nodes_to_remove.append(pow_or_mul_node) self.nodes_to_remove.append(node) normalize_node = helper.make_node( "SimplifiedLayerNormalization", - inputs=[root_input, node.input[layernorm_weight_index]], + inputs=[root_input, node.input[1 - return_indice[0]]], outputs=[node.output[0]], - name=self.model.create_node_name("SimplifiedLayerNormalization", name_prefix="LayerNorm"), + name=self.model.create_node_name("SimplifiedLayerNormalization", name_prefix="RMSNorm"), ) - normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))]) - normalize_node.attribute.extend([helper.make_attribute("axis", -1)]) + normalize_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))]) + normalize_node.attribute.extend([helper.make_attribute("axis", axes[0])]) normalize_node.attribute.extend([helper.make_attribute("stash_type", 1)]) self.nodes_to_add.append(normalize_node) self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py index dbd9e828198ca..3084b84278994 100644 --- a/onnxruntime/python/tools/transformers/fusion_utils.py +++ b/onnxruntime/python/tools/transformers/fusion_utils.py @@ -127,6 +127,19 @@ def skip_parent(model: OnnxModel, node, parent_node, input_name_to_nodes, node_i return parent_can_be_removed + def get_squeeze_or_unsqueeze_axes(self, node: NodeProto) -> Optional[ndarray]: + assert node.op_type in ["Squeeze", "Unsqueeze"] + + # For opset >= 13, axes is an input instead of an attribute. + if len(node.input) > 1: + return self.model.get_constant_value(node.input[1]) + + axes = None + for attr in node.attribute: + if attr.name == "axes": + axes = helper.get_attribute_value(attr) + return axes + @staticmethod def check_node_attribute(node, attribute_name: str, expected_value, default_value=None): """Verify that a node has expected value for an attribute. 
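For reference, a minimal NumPy sketch of the RMSNorm computation that the fusion above collapses into a single SimplifiedLayerNormalization node; the helper name and the default epsilon here are illustrative, not part of the change:

```
import numpy as np

def rms_norm_reference(x: np.ndarray, scale: np.ndarray, epsilon: float = 1e-6, axis: int = -1) -> np.ndarray:
    # S = Pow(X, 2) or Mul(X, X); MS = ReduceMean(S) with keepdims=1 over the last axis
    mean_square = np.mean(np.power(x, 2.0), axis=axis, keepdims=True)
    # InvRMS = Div(1, Sqrt(MS + epsilon)), or equivalently Reciprocal(Sqrt(MS + epsilon))
    inv_rms = 1.0 / np.sqrt(mean_square + epsilon)
    # Y = Mul(Mul(X, InvRMS), scale)
    return (x * inv_rms) * scale
```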
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md index edef0d3ee5453..dc83f4dc220f0 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md @@ -203,35 +203,60 @@ This step will export stable diffusion 1.5 to ONNX model in float32 using script ``` curl https://raw.githubusercontent.com/huggingface/diffusers/v0.15.1/scripts/convert_stable_diffusion_checkpoint_to_onnx.py > convert_sd_onnx.py -python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path ./sd_v1_5/fp32 +python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path ./sd1.5_onnx/fp32 ``` For SDXL, use optimum to export the model: ``` pip install optimum diffusers onnx onnxruntime-gpu -optimum-cli export onnx --model stabilityai/stable-diffusion-xl-base-1.0 --task stable-diffusion-xl ./sd_xl_base_onnx +optimum-cli export onnx --model stabilityai/stable-diffusion-xl-base-1.0 --task stable-diffusion-xl ./sdxl_onnx/fp32 +``` + +#### Stable Diffusion 3.x and Flux 1.0 + +Stable Diffusion 3.x and Flux 1.0 require transformers >= 4.45 and optimum > 1.23.3. +The default opset version for T5 is 12, which does not support bfloat16. To support bfloat16, set the opset version explicitly as in the example below. + +``` +git clone https://github.com/huggingface/optimum +cd optimum +pip install -e . + +optimum-cli export onnx --model stabilityai/stable-diffusion-3-medium-diffusers ./sd3_onnx/fp32 --opset 15 +optimum-cli export onnx --model stabilityai/stable-diffusion-3.5-medium ./sd3.5_medium_onnx/fp32 --opset 15 +optimum-cli export onnx --model stabilityai/stable-diffusion-3.5-large ./sd3.5_large_onnx/fp32 --opset 15 +optimum-cli export onnx --model black-forest-labs/FLUX.1-schnell ./flux1_schnell_onnx/fp32 --opset 15 +optimum-cli export onnx --model black-forest-labs/FLUX.1-dev ./flux1_dev_onnx/fp32 --opset 15 ``` ### Optimize ONNX Pipeline -Example to optimize the exported float32 ONNX models, and save to float16 models: +Example to optimize the exported float32 ONNX models, then save them as float16 models: ``` -python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i ./sd_v1_5/fp32 -o ./sd_v1_5/fp16 --float16 +python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i ./sd1.5_onnx/fp32 -o ./sd1.5_onnx/fp16 --float16 ``` -In all examples below, we run the scripts in source code directory. You can get source code like the following: +You can also run the script from the source code directory like the following: ``` git clone https://github.com/microsoft/onnxruntime cd onnxruntime/onnxruntime/python/tools/transformers/models/stable_diffusion + +python optimize_pipeline.py -i ./sdxl_onnx/fp32 -o ./sdxl_onnx/fp16 --float16 +python optimize_pipeline.py -i ./sd3_onnx/fp32 -o ./sd3_onnx/fp16 --float16 +python optimize_pipeline.py -i ./sd3.5_medium_onnx/fp32 -o ./sd3.5_medium_onnx/fp16 --float16 +python optimize_pipeline.py -i ./sd3.5_large_onnx/fp32 -o ./sd3.5_large_onnx/fp16 --float16 +python optimize_pipeline.py -i ./flux1_schnell_onnx/fp32 -o ./flux1_schnell_onnx/fp16 --float16 --bfloat16 +python optimize_pipeline.py -i ./flux1_dev_onnx/fp32 -o ./flux1_dev_onnx/fp16 --float16 --bfloat16 ``` +When converting a model to float16, some nodes have overflow risk, and we can force those nodes to run in either float32 or bfloat16. 
+Option `--bfloat16` enables the latter. If an operator does not support bfloat16, it will fall back to float32. For SDXL model, it is recommended to use a machine with 48 GB or more memory to optimize. -``` -python optimize_pipeline.py -i ./sd_xl_base_onnx -o ./sd_xl_base_fp16 --float16 -``` ### Run Benchmark +#### Run Benchmark with Optimum + The benchmark.py script will run a warm-up prompt twice, and measure the peak GPU memory usage in these two runs, then record them as first_run_memory_MB and second_run_memory_MB. Then it will run 5 runs to get average latency (in seconds), and output the results to benchmark_result.csv. Note that the first run might need more time and memory: For example, cuDNN convolution algorithm search or model compile happens in the first run. @@ -245,15 +270,15 @@ Before running benchmark on PyTorch, you need to be logged in via `huggingface-c Example to benchmark the optimized pipeline of stable diffusion 1.5 with batch size 1 on CUDA EP: ``` -python benchmark.py -p ./sd_v1_5/fp16 -b 1 -v 1.5 +python benchmark.py -p ./sd1.5_onnx/fp16 -b 1 -v 1.5 python benchmark.py -b 1 -v 1.5 ``` For the first command, '-p' specifies a directory of optimized ONNX pipeline as generated by optimize_pipeline.py. -For the second command without '-p', we will use OnnxruntimeCudaStableDiffusionPipeline to export and optimize ONNX models for clip, unet and vae decoder. +For the second command without '-p', we will use ORTPipelineForText2Image to export and optimize ONNX models for clip, unet and vae decoder. On ROCm EP, use the following command instead: ``` -python benchmark.py -p ./sd_v1_5/fp16 -b 1 --tuning --provider rocm -v 1.5 +python benchmark.py -p ./sd1.5_onnx/fp16 -b 1 --tuning --provider rocm -v 1.5 ``` For ROCm EP, you can substitute `python benchmark.py` with `python -m onnxruntime.transformers.models.stable_diffusion.benchmark` since @@ -263,6 +288,22 @@ For ROCm EP, the `--tuning` is mandatory because we heavily rely on tuning to fi The default parameters are stable diffusion version=1.5, height=512, width=512, steps=50, batch_count=5. Run `python benchmark.py --help` for more information. 
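+For example, the following command overrides the default image size and step count when benchmarking the optimized pipeline (the values here are only illustrative; run `python benchmark.py --help` for the full list of options):
+```
+python benchmark.py -e optimum --height 768 --width 768 --steps 30 -b 1 -v 1.5 -p ./sd1.5_onnx/fp16
+```
+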
+#### Stable Diffusion 3.x and Flux 1.0 +Example of benchmark with optimum using CUDA provider on stable diffusion 3.5 medium and Flux 1.0: +``` +python benchmark.py -e optimum --height 1024 --width 1024 --steps 30 -b 1 -v 3.0M -p sd3_onnx/fp32 +python benchmark.py -e optimum --height 1024 --width 1024 --steps 30 -b 1 -v 3.5M -p sd3.5_medium_onnx/fp16 +python benchmark.py -e optimum --height 1024 --width 1024 --steps 30 -b 1 -v 3.5L -p sd3.5_large_onnx/fp16 +python benchmark.py -e optimum --height 1024 --width 1024 --steps 4 -b 1 -v Flux.1S -p flux1_schnell_onnx/fp16 +python benchmark.py -e optimum --height 1024 --width 1024 --steps 30 -b 1 -v Flux.1D -p flux1_dev_onnx/fp16 +``` + +Benchmark PyTorch eager mode performance: +``` +python benchmark.py -e torch --height 1024 --width 1024 --steps 30 -b 1 -v 3.5L +python benchmark.py -e torch --height 1024 --width 1024 --steps 30 -b 1 -v Flux.1D +``` + ### Run Benchmark with xFormers Run PyTorch 1.13.1+cu117 with xFormers like the following diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py index 0708d57f040f8..0452cff235c11 100755 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py @@ -9,6 +9,7 @@ import statistics import sys import time +from pathlib import Path import __init__ # noqa: F401. Walk-around to run this script directly import coloredlogs @@ -22,6 +23,11 @@ "2.0": "stabilityai/stable-diffusion-2", "2.1": "stabilityai/stable-diffusion-2-1", "xl-1.0": "stabilityai/stable-diffusion-xl-refiner-1.0", + "3.0M": "stabilityai/stable-diffusion-3-medium-diffusers", + "3.5M": "stabilityai/stable-diffusion-3.5-medium", + "3.5L": "stabilityai/stable-diffusion-3.5-large", + "Flux.1S": "black-forest-labs/FLUX.1-schnell", + "Flux.1D": "black-forest-labs/FLUX.1-dev", } PROVIDERS = { @@ -90,6 +96,24 @@ def get_ort_pipeline(model_name: str, directory: str, provider, disable_safety_c def get_torch_pipeline(model_name: str, disable_safety_checker: bool, enable_torch_compile: bool, use_xformers: bool): + if "FLUX" in model_name: + from diffusers import FluxPipeline + + pipe = FluxPipeline.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda") + if enable_torch_compile: + pipe.transformer.to(memory_format=torch.channels_last) + pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True) + return pipe + + if "stable-diffusion-3" in model_name: + from diffusers import StableDiffusion3Pipeline + + pipe = StableDiffusion3Pipeline.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda") + if enable_torch_compile: + pipe.transformer.to(memory_format=torch.channels_last) + pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True) + return pipe + from diffusers import DDIMScheduler, StableDiffusionPipeline from torch import channels_last, float16 @@ -116,9 +140,9 @@ def get_torch_pipeline(model_name: str, disable_safety_checker: bool, enable_tor return pipe -def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, disable_safety_checker: bool): +def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, steps: int, disable_safety_checker: bool): short_model_name = model_name.split("/")[-1].replace("stable-diffusion-", "sd") - return f"{engine}_{short_model_name}_b{batch_size}" + ("" if disable_safety_checker else "_safe") + 
return f"{engine}_{short_model_name}_b{batch_size}_s{steps}" + ("" if disable_safety_checker else "_safe") def run_ort_pipeline( @@ -193,6 +217,25 @@ def warmup(): } +def get_negative_prompt_kwargs(negative_prompt, use_num_images_per_prompt, is_flux, batch_size) -> dict: + # Flux does not support negative prompt + kwargs = ( + ( + {"negative_prompt": negative_prompt} + if use_num_images_per_prompt + else {"negative_prompt": [negative_prompt] * batch_size} + ) + if not is_flux + else {} + ) + + # Fix the random seed so that we can inspect the output quality easily. + if torch.cuda.is_available(): + kwargs["generator"] = torch.Generator(device="cuda").manual_seed(123) + + return kwargs + + def run_torch_pipeline( pipe, batch_size: int, @@ -207,16 +250,14 @@ def run_torch_pipeline( ): prompts, negative_prompt = example_prompts() - # total 2 runs of warm up, and measure GPU memory for CUDA EP + import diffusers + + is_flux = isinstance(pipe, diffusers.FluxPipeline) + def warmup(): prompt, negative = warmup_prompts() - pipe( - prompt=[prompt] * batch_size, - height=height, - width=width, - num_inference_steps=steps, - negative_prompt=[negative] * batch_size, - ) + extra_kwargs = get_negative_prompt_kwargs(negative, False, is_flux, batch_size) + pipe(prompt=[prompt] * batch_size, height=height, width=width, num_inference_steps=steps, **extra_kwargs) # Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory) first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory) @@ -232,13 +273,13 @@ def warmup(): break torch.cuda.synchronize() inference_start = time.time() + extra_kwargs = get_negative_prompt_kwargs(negative_prompt, False, is_flux, batch_size) images = pipe( prompt=[prompt] * batch_size, height=height, width=width, num_inference_steps=steps, - negative_prompt=[negative_prompt] * batch_size, - generator=None, # torch.Generator + **extra_kwargs, ).images torch.cuda.synchronize() @@ -289,7 +330,7 @@ def run_ort( load_end = time.time() print(f"Model loading took {load_end - load_start} seconds") - image_filename_prefix = get_image_filename_prefix("ort", model_name, batch_size, disable_safety_checker) + image_filename_prefix = get_image_filename_prefix("ort", model_name, batch_size, steps, disable_safety_checker) result = run_ort_pipeline( pipe, batch_size, @@ -322,33 +363,12 @@ def get_optimum_ort_pipeline( disable_safety_checker: bool = True, use_io_binding: bool = False, ): - from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline + from optimum.onnxruntime import ORTPipelineForText2Image if directory is not None and os.path.exists(directory): - if "xl" in model_name: - pipeline = ORTStableDiffusionXLPipeline.from_pretrained( - directory, - provider=provider, - session_options=None, - use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification. - ) - else: - pipeline = ORTStableDiffusionPipeline.from_pretrained( - directory, - provider=provider, - use_io_binding=use_io_binding, - ) - elif "xl" in model_name: - pipeline = ORTStableDiffusionXLPipeline.from_pretrained( - model_name, - export=True, - provider=provider, - session_options=None, - use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification. 
- ) - pipeline.save_pretrained(directory) + pipeline = ORTPipelineForText2Image.from_pretrained(directory, provider=provider, use_io_binding=use_io_binding) else: - pipeline = ORTStableDiffusionPipeline.from_pretrained( + pipeline = ORTPipelineForText2Image.from_pretrained( model_name, export=True, provider=provider, @@ -376,31 +396,27 @@ def run_optimum_ort_pipeline( memory_monitor_type, use_num_images_per_prompt=False, ): - from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline + print("Pipeline type", type(pipe)) + from optimum.onnxruntime.modeling_diffusion import ORTFluxPipeline - assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline)) + is_flux = isinstance(pipe, ORTFluxPipeline) prompts, negative_prompt = example_prompts() def warmup(): prompt, negative = warmup_prompts() + extra_kwargs = get_negative_prompt_kwargs(negative, use_num_images_per_prompt, is_flux, batch_size) if use_num_images_per_prompt: pipe( prompt=prompt, height=height, width=width, num_inference_steps=steps, - negative_prompt=negative, num_images_per_prompt=batch_count, + **extra_kwargs, ) else: - pipe( - prompt=[prompt] * batch_size, - height=height, - width=width, - num_inference_steps=steps, - negative_prompt=[negative] * batch_size, - ) + pipe(prompt=[prompt] * batch_size, height=height, width=width, num_inference_steps=steps, **extra_kwargs) # Run warm up, and measure GPU memory of two runs. # The first run has algo search for cuDNN/MIOpen, so it might need more memory. @@ -409,6 +425,8 @@ def warmup(): warmup() + extra_kwargs = get_negative_prompt_kwargs(negative_prompt, use_num_images_per_prompt, is_flux, batch_size) + latency_list = [] for i, prompt in enumerate(prompts): if i >= num_prompts: @@ -420,16 +438,12 @@ def warmup(): height=height, width=width, num_inference_steps=steps, - negative_prompt=negative_prompt, num_images_per_prompt=batch_size, + **extra_kwargs, ).images else: images = pipe( - prompt=[prompt] * batch_size, - height=height, - width=width, - num_inference_steps=steps, - negative_prompt=[negative_prompt] * batch_size, + prompt=[prompt] * batch_size, height=height, width=width, num_inference_steps=steps, **extra_kwargs ).images inference_end = time.time() latency = inference_end - inference_start @@ -478,7 +492,10 @@ def run_optimum_ort( load_end = time.time() print(f"Model loading took {load_end - load_start} seconds") - image_filename_prefix = get_image_filename_prefix("optimum", model_name, batch_size, disable_safety_checker) + full_model_name = model_name + "_" + Path(directory).name if directory else model_name + image_filename_prefix = get_image_filename_prefix( + "optimum", full_model_name, batch_size, steps, disable_safety_checker + ) result = run_optimum_ort_pipeline( pipe, batch_size, @@ -583,7 +600,7 @@ def warmup(): warmup() - image_filename_prefix = get_image_filename_prefix("ort_trt", short_name, batch_size, disable_safety_checker) + image_filename_prefix = get_image_filename_prefix("ort_trt", short_name, batch_size, steps, disable_safety_checker) latency_list = [] prompts, negative_prompt = example_prompts() @@ -722,7 +739,7 @@ def warmup(): warmup() - image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, disable_safety_checker) + image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, steps, disable_safety_checker) latency_list = [] prompts, negative_prompt = example_prompts() @@ -877,7 +894,7 @@ def warmup(): warmup() model_name = pipeline_info.name() - 
image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, disable_safety_checker) + image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, steps, disable_safety_checker) latency_list = [] prompts, negative_prompt = example_prompts() @@ -972,7 +989,7 @@ def warmup(): warmup() model_name = pipeline.pipeline_info.name() - image_filename_prefix = get_image_filename_prefix("ort_trt", model_name, batch_size, disable_safety_checker) + image_filename_prefix = get_image_filename_prefix("ort_trt", model_name, batch_size, steps, disable_safety_checker) latency_list = [] prompts, negative_prompt = example_prompts() @@ -1040,7 +1057,7 @@ def run_torch( load_end = time.time() print(f"Model loading took {load_end - load_start} seconds") - image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, disable_safety_checker) + image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, steps, disable_safety_checker) if not enable_torch_compile: with torch.inference_mode(): diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark_flux.sh b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark_flux.sh new file mode 100644 index 0000000000000..82a0b4f0746a5 --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark_flux.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------- + +set -euo pipefail + +# Script to benchmark Flux models with ONNX and PyTorch +# Usage: bash benchmark_flux.sh + +# Validate inputs and environment +command -v python3 &>/dev/null || { echo "Python3 is required but not installed."; exit 1; } +command -v wget &>/dev/null || { echo "wget is required but not installed."; exit 1; } + +# Input arguments with defaults +install_dir="${1:-$HOME}" +onnx_dir="${2:-onnx_models}" + +# GPU settings +export CUDA_VISIBLE_DEVICES=0 + +# Function to log messages +log() { + echo -e "\033[1;32m[INFO]\033[0m $1" +} + +# Function to install CUDA 12.6 +install_cuda_12() { + log "Installing CUDA 12.6" + pushd "$install_dir" + wget -q https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run + sh cuda_12.6.2_560.35.03_linux.run --toolkit --toolkitpath="$install_dir/cuda12.6" --silent --override --no-man-page + export PATH="$install_dir/cuda12.6/bin:$PATH" + export LD_LIBRARY_PATH="$install_dir/cuda12.6/lib64:$LD_LIBRARY_PATH" + popd +} + +# Function to install cuDNN 9.6 +install_cudnn_9() { + log "Installing cuDNN 9.6" + pushd "$install_dir" + wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.6.0.74_cuda12-archive.tar.xz + mkdir -p "$install_dir/cudnn9.6" + tar -Jxvf cudnn-linux-x86_64-9.6.0.74_cuda12-archive.tar.xz -C "$install_dir/cudnn9.6" --strip=1 + export LD_LIBRARY_PATH="$install_dir/cudnn9.6/lib:$LD_LIBRARY_PATH" + popd +} + +# Function to install optimum +install_optimum() { + log "Installing Optimum" + optimum_dir="$install_dir/optimum" + if [ ! -d "$optimum_dir" ]; then + git clone https://github.com/huggingface/optimum "$optimum_dir" + fi + pushd "$optimum_dir" + pip show optimum &>/dev/null || pip install -e . 
+ popd +} + +# Function to build and install ONNX Runtime +install_onnxruntime() { + log "Building ONNX Runtime" + pushd "$install_dir" + if [ ! -d onnxruntime ]; then + git clone https://github.com/microsoft/onnxruntime + fi + pushd onnxruntime + CUDA_ARCH=$(python3 -c "import torch; cc = torch.cuda.get_device_capability(); print(f'{cc[0]}{cc[1]}')") + if [ -z "$CUDA_ARCH" ]; then + echo "No CUDA device found." + exit 1 + fi + pip install --upgrade pip cmake psutil setuptools wheel packaging ninja numpy==2.2 + sh build.sh --config Release --build_dir build/cuda12 --parallel \ + --use_cuda --cuda_version 12.6 --cuda_home "$install_dir/cuda12.6" \ + --cudnn_home "$install_dir/cudnn9.6" \ + --build_wheel --skip_tests \ + --cmake_generator Ninja \ + --compile_no_warning_as_error \ + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF CMAKE_CUDA_ARCHITECTURES="$CUDA_ARCH" + + log "Installing ONNX Runtime" + pip install build/cuda12/Release/dist/onnxruntime_gpu-*-linux_x86_64.whl + popd + popd +} + +# Function to install GPU dependencies +install_gpu() { + log "Installing GPU dependencies" + [ ! -d "$install_dir/cuda12.6" ] && install_cuda_12 + [ ! -d "$install_dir/cudnn9.6" ] && install_cudnn_9 + pip install torch torchvision --index-url https://download.pytorch.org/whl/cu124 + pip install diffusers==0.32.0 transformers==4.46.3 onnx==1.17.0 protobuf==5.29.2 py3nvml + install_onnxruntime + install_optimum +} + +# Function to run benchmarks +run_benchmark() { + local model=$1 + local dir=$2 + local version=$3 + local steps=$4 + local batch=$5 + + log "Running benchmark for model: $model" + mkdir -p "$dir" + [ ! -d "$dir/fp32" ] && optimum-cli export onnx --model "$model" "$dir/fp32" --opset 15 --task text-to-image + [ ! -d "$dir/fp16_fp32" ] && python optimize_pipeline.py -i "$dir/fp32" -o "$dir/fp16_fp32" --float16 + [ ! -d "$dir/fp16_bf16" ] && python optimize_pipeline.py -i "$dir/fp32" -o "$dir/fp16_bf16" --float16 --bfloat16 + python benchmark.py -e optimum --height 1024 --width 1024 --steps "$steps" -b "$batch" -v "$version" -p "$dir/fp16_fp32" + python benchmark.py -e optimum --height 1024 --width 1024 --steps "$steps" -b "$batch" -v "$version" -p "$dir/fp16_bf16" + python benchmark.py -e torch --height 1024 --width 1024 --steps "$steps" -b "$batch" -v "$version" + python benchmark.py -e torch --height 1024 --width 1024 --steps "$steps" -b "$batch" -v "$version" --enable_torch_compile +} + +# Main script execution +install_gpu + +log "Creating ONNX model directory: $onnx_dir" +mkdir -p "$onnx_dir" + +run_benchmark black-forest-labs/FLUX.1-schnell "$onnx_dir/flux1_schnell" Flux.1S 4 1 > "$onnx_dir/flux1_schnell_s4_b1.log" +run_benchmark black-forest-labs/FLUX.1-dev "$onnx_dir/flux1_dev" Flux.1D 50 1 > "$onnx_dir/flux1_dev_s50_b1.log" +run_benchmark stabilityai/stable-diffusion-3.5-large "$onnx_dir/sd3.5_large" 3.5L 50 1 > "$onnx_dir/sd3.5_large_s50_b1.log" +run_benchmark stabilityai/stable-diffusion-3.5-medium "$onnx_dir/sd3.5_medium" 3.5M 50 1 > "$onnx_dir/sd3.5_medium_s50_b1.log" + +log "Benchmark completed." 
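A usage sketch for the new script follows; both positional arguments are optional and default to `$HOME` and `onnx_models` as set at the top of the script, and the paths shown are illustrative. Each `run_benchmark` call redirects its console output to a per-model log file under the chosen ONNX model directory.

```
bash benchmark_flux.sh /data/flux_install /data/flux_onnx_models
```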
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py index ffcfd6d9fd7e0..cdb6518be000f 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py @@ -28,6 +28,8 @@ import onnx from fusion_options import FusionOptions from onnx_model_clip import ClipOnnxModel +from onnx_model_mmdit import MmditOnnxModel +from onnx_model_t5 import T5OnnxModel from onnx_model_unet import UnetOnnxModel from onnx_model_vae import VaeOnnxModel from optimizer import optimize_by_onnxruntime, optimize_model @@ -46,11 +48,63 @@ def has_external_data(onnx_model_path): return False +def is_sd_3(source_dir: Path): + return (source_dir / "text_encoder_3").exists() + + +def is_sdxl(source_dir: Path): + return ( + (source_dir / "text_encoder_2").exists() + and not (source_dir / "text_encoder_3").exists() + and not (source_dir / "transformer").exists() + ) + + +def is_flux(source_dir: Path): + return ( + (source_dir / "text_encoder_2").exists() + and not (source_dir / "text_encoder_3").exists() + and (source_dir / "transformer").exists() + ) + + +def _classify_pipeline_type(source_dir: Path): + # May also check _class_name in model_index.json like `StableDiffusion3Pipeline` or `FluxPipeline` etc to classify. + if is_sd_3(source_dir): + return "sd3" + + if is_flux(source_dir): + return "flux" + + if is_sdxl(source_dir): + return "sdxl" + + # sd 1.x and 2.x + return "sd" + + +def _get_model_list(pipeline_type: str): + if pipeline_type == "sd3": + return ["text_encoder", "text_encoder_2", "text_encoder_3", "transformer", "vae_encoder", "vae_decoder"] + + if pipeline_type == "flux": + return ["text_encoder", "text_encoder_2", "transformer", "vae_encoder", "vae_decoder"] + + if pipeline_type == "sdxl": + return ["text_encoder", "text_encoder_2", "unet", "vae_encoder", "vae_decoder"] + + assert pipeline_type == "sd" + return ["text_encoder", "unet", "vae_encoder", "vae_decoder"] + + def _optimize_sd_pipeline( source_dir: Path, target_dir: Path, + pipeline_type: str, + model_list: List[str], use_external_data_format: Optional[bool], float16: bool, + bfloat16: bool, force_fp32_ops: List[str], enable_runtime_optimization: bool, args, @@ -60,8 +114,10 @@ def _optimize_sd_pipeline( Args: source_dir (Path): Root of input directory of stable diffusion onnx pipeline with float32 models. target_dir (Path): Root of output directory of stable diffusion onnx pipeline with optimized models. + model_list (List[str]): list of directory names with onnx model. use_external_data_format (Optional[bool]): use external data format. float16 (bool): use half precision + bfloat16 (bool): use bfloat16 as fallback if float16 is also provided. force_fp32_ops(List[str]): operators that are forced to run in float32. enable_runtime_optimization(bool): run graph optimization using Onnx Runtime. @@ -69,12 +125,15 @@ def _optimize_sd_pipeline( RuntimeError: input onnx model does not exist RuntimeError: output onnx model path existed """ + is_flux_pipeline = pipeline_type == "flux" model_type_mapping = { + "transformer": "mmdit", "unet": "unet", "vae_encoder": "vae", "vae_decoder": "vae", "text_encoder": "clip", - "text_encoder_2": "clip", + "text_encoder_2": "t5" if is_flux_pipeline else "clip", + "text_encoder_3": "t5", # t5-v1_1-xxl is used in SD 3.x text_encoder_3 and Flux text_encoder_2. 
"safety_checker": "unet", } @@ -82,6 +141,8 @@ def _optimize_sd_pipeline( "unet": UnetOnnxModel, "vae": VaeOnnxModel, "clip": ClipOnnxModel, + "t5": T5OnnxModel, + "mmdit": MmditOnnxModel, } force_fp32_operators = { @@ -91,9 +152,140 @@ def _optimize_sd_pipeline( "text_encoder": [], "text_encoder_2": [], "safety_checker": [], + "text_encoder_3": [], + "transformer": [], + } + + # The node block list is generated by running the fp32 model and get statistics of node inputs and outputs. + # Nodes with any input or output of float or double data type, but value ouf of range of float16 are candidates. + # python optimize_pipeline.py -i ./flux1_schnell_onnx/fp32 -o ./flux1_schnell_onnx/fp32_opt + # export ORT_DEBUG_NODE_IO_DUMP_STATISTICS_DATA=1 + # export ORT_DEBUG_NODE_IO_DUMP_INPUT_DATA=1 + # export ORT_DEBUG_NODE_IO_DUMP_OUTPUT_DATA=1 + # python benchmark.py --height 1024 --width 1024 --steps 4 -b 1 -v Flux.1S -p flux1_schnell_onnx/fp32_opt -e optimum >stdout.txt 2>stderr.txt + # Warning: The node name might change in different export settings. See benchmark_flux.sh for the settings. + flux_node_block_list = { + "text_encoder_2": [ + "/encoder/block.10/layer.1/DenseReluDense/wo/MatMul", + "SkipLayerNorm_20", + "SkipLayerNorm_21", + "SkipLayerNorm_22", + "SkipLayerNorm_23", + "SkipLayerNorm_24", + "SkipLayerNorm_25", + "SkipLayerNorm_26", + "SkipLayerNorm_27", + "SkipLayerNorm_28", + "SkipLayerNorm_29", + "SkipLayerNorm_30", + "SkipLayerNorm_31", + "SkipLayerNorm_32", + "SkipLayerNorm_33", + "SkipLayerNorm_34", + "SkipLayerNorm_35", + "SkipLayerNorm_36", + "SkipLayerNorm_37", + "SkipLayerNorm_38", + "SkipLayerNorm_39", + "SkipLayerNorm_40", + "SkipLayerNorm_41", + "SkipLayerNorm_42", + "SkipLayerNorm_43", + "SkipLayerNorm_44", + "SkipLayerNorm_45", + "/encoder/block.23/layer.1/DenseReluDense/wo/MatMul", + "SkipLayerNorm_46", + ], + "vae_decoder": [ + "/decoder/mid_block/attentions.0/MatMul", + "/decoder/mid_block/attentions.0/Softmax", + ], + "transformer": [ + "/transformer_blocks.18/Mul_5", + "/transformer_blocks.18/Add_7", + "/Concat_1", + "LayerNorm_76", + "/single_transformer_blocks.0/Add", + "LayerNorm_77", + "/single_transformer_blocks.1/Add", + "LayerNorm_78", + "/single_transformer_blocks.2/Add", + "LayerNorm_79", + "/single_transformer_blocks.3/Add", + "LayerNorm_80", + "/single_transformer_blocks.4/Add", + "LayerNorm_81", + "/single_transformer_blocks.5/Add", + "LayerNorm_82", + "/single_transformer_blocks.6/Add", + "LayerNorm_83", + "/single_transformer_blocks.7/Add", + "LayerNorm_84", + "/single_transformer_blocks.8/Add", + "LayerNorm_85", + "/single_transformer_blocks.9/Add", + "LayerNorm_86", + "/single_transformer_blocks.10/Add", + "LayerNorm_87", + "/single_transformer_blocks.11/Add", + "LayerNorm_88", + "/single_transformer_blocks.12/Add", + "LayerNorm_89", + "/single_transformer_blocks.13/Add", + "LayerNorm_90", + "/single_transformer_blocks.14/Add", + "LayerNorm_91", + "/single_transformer_blocks.15/Add", + "LayerNorm_92", + "/single_transformer_blocks.16/Add", + "LayerNorm_93", + "/single_transformer_blocks.17/Add", + "LayerNorm_94", + "/single_transformer_blocks.18/Add", + "LayerNorm_95", + "/single_transformer_blocks.19/Add", + "LayerNorm_96", + "/single_transformer_blocks.20/Add", + "LayerNorm_97", + "/single_transformer_blocks.21/Add", + "LayerNorm_98", + "/single_transformer_blocks.22/Add", + "LayerNorm_99", + "/single_transformer_blocks.23/Add", + "LayerNorm_100", + "/single_transformer_blocks.24/Add", + "LayerNorm_101", + "/single_transformer_blocks.25/Add", + 
"LayerNorm_102", + "/single_transformer_blocks.26/Add", + "LayerNorm_103", + "/single_transformer_blocks.27/Add", + "LayerNorm_104", + "/single_transformer_blocks.28/Add", + "LayerNorm_105", + "/single_transformer_blocks.29/Add", + "LayerNorm_106", + "/single_transformer_blocks.30/Add", + "LayerNorm_107", + "/single_transformer_blocks.31/Add", + "LayerNorm_108", + "/single_transformer_blocks.32/Add", + "LayerNorm_109", + "/single_transformer_blocks.33/Add", + "LayerNorm_110", + "/single_transformer_blocks.34/Add", + "LayerNorm_111", + "/single_transformer_blocks.35/Add", + "LayerNorm_112", + "/single_transformer_blocks.36/Add", + "LayerNorm_113", + "/single_transformer_blocks.37/Add", + "/Shape", + "/Slice", + ], } - is_xl = (source_dir / "text_encoder_2").exists() + sd3_node_block_list = {"text_encoder_3": flux_node_block_list["text_encoder_2"]} if force_fp32_ops: for fp32_operator in force_fp32_ops: @@ -108,13 +300,17 @@ def _optimize_sd_pipeline( for name, model_type in model_type_mapping.items(): onnx_model_path = source_dir / name / "model.onnx" if not os.path.exists(onnx_model_path): - if name != "safety_checker": - logger.info("input onnx model does not exist: %s", onnx_model_path) + if name != "safety_checker" and name in model_list: + logger.warning("input onnx model does not exist: %s", onnx_model_path) # some model are optional so we do not raise error here. continue # Prepare output directory optimized_model_path = target_dir / name / "model.onnx" + if os.path.exists(optimized_model_path): + if not args.overwrite: + logger.warning("Skipped optimization since the target file existed: %s", optimized_model_path) + continue output_dir = optimized_model_path.parent output_dir.mkdir(parents=True, exist_ok=True) @@ -122,7 +318,7 @@ def _optimize_sd_pipeline( use_external_data_format = has_external_data(onnx_model_path) # Graph fusion before fp16 conversion, otherwise they cannot be fused later. - logger.info(f"Optimize {onnx_model_path}...") + logger.info("Optimize %s ...", onnx_model_path) args.model_type = model_type fusion_options = FusionOptions.parse(args) @@ -146,8 +342,28 @@ def _optimize_sd_pipeline( ) if float16: + model_node_block_list = ( + flux_node_block_list if is_flux_pipeline else sd3_node_block_list if pipeline_type == "sd3" else {} + ) + if name in model_node_block_list: + # Opset 12 does not support bfloat16. + # By default, optimum exports T5 model with opset 12. So we need to check the opset version. + use_bfloat16 = bfloat16 + if use_bfloat16: + for opset in m.model.opset_import: + if opset.domain in ["", "ai.onnx"] and opset.version < 13: + logger.warning( + "onnx model requires opset 13 or higher to use bfloat16. Fall back to float32." + ) + use_bfloat16 = False + + m.convert_float_to_float16( + keep_io_types=False, + node_block_list=model_node_block_list[name], + use_bfloat16_as_blocked_nodes_dtype=use_bfloat16, + ) # For SD-XL, use FP16 in VAE decoder will cause NaN and black image so we keep it in FP32. 
- if is_xl and name == "vae_decoder": + elif pipeline_type in ["sdxl"] and name in ["vae_decoder"]: logger.info("Skip converting %s to float16 to avoid NaN", name) else: logger.info("Convert %s to float16 ...", name) @@ -181,17 +397,18 @@ def _optimize_sd_pipeline( logger.info("*" * 20) -def _copy_extra_directory(source_dir: Path, target_dir: Path): +def _copy_extra_directory(source_dir: Path, target_dir: Path, model_list: List[str]): """Copy extra directory that does not have onnx model Args: source_dir (Path): source directory target_dir (Path): target directory + model_list (List[str]): list of directory names with onnx model. Raises: RuntimeError: source path does not exist """ - extra_dirs = ["scheduler", "tokenizer", "tokenizer_2", "feature_extractor"] + extra_dirs = ["scheduler", "tokenizer", "tokenizer_2", "tokenizer_3", "feature_extractor"] for name in extra_dirs: source_path = source_dir / name @@ -199,6 +416,8 @@ def _copy_extra_directory(source_dir: Path, target_dir: Path): continue target_path = target_dir / name + if target_path.exists(): + shutil.rmtree(target_path) shutil.copytree(source_path, target_path) logger.info("%s => %s", source_path, target_path) @@ -213,8 +432,7 @@ def _copy_extra_directory(source_dir: Path, target_dir: Path): logger.info("%s => %s", source_path, target_path) # Some directory are optional - onnx_model_dirs = ["text_encoder", "text_encoder_2", "unet", "vae_encoder", "vae_decoder", "safety_checker"] - for onnx_model_dir in onnx_model_dirs: + for onnx_model_dir in model_list: source_path = source_dir / onnx_model_dir / "config.json" target_path = target_dir / onnx_model_dir / "config.json" if source_path.exists(): @@ -235,20 +453,24 @@ def optimize_stable_diffusion_pipeline( if os.path.exists(output_dir): if overwrite: shutil.rmtree(output_dir, ignore_errors=True) - else: - raise RuntimeError("output directory existed:{output_dir}. 
Add --overwrite to empty the directory.") source_dir = Path(input_dir) target_dir = Path(output_dir) target_dir.mkdir(parents=True, exist_ok=True) - _copy_extra_directory(source_dir, target_dir) + pipeline_type = _classify_pipeline_type(source_dir) + model_list = _get_model_list(pipeline_type) + + _copy_extra_directory(source_dir, target_dir, model_list) _optimize_sd_pipeline( source_dir, target_dir, + pipeline_type, + model_list, use_external_data_format, float16, + args.bfloat16, args.force_fp32_ops, enable_runtime_optimization, args, @@ -283,10 +505,18 @@ def parse_arguments(argv: Optional[List[str]] = None): "--float16", required=False, action="store_true", - help="Output models of half or mixed precision.", + help="Output models of float16, except some nodes falls back to float32 or bfloat16 to avoid overflow.", ) parser.set_defaults(float16=False) + parser.add_argument( + "--bfloat16", + required=False, + action="store_true", + help="Allow bfloat16 as fallback if --float16 is also provided.", + ) + parser.set_defaults(bfloat16=False) + parser.add_argument( "--force_fp32_ops", required=False, @@ -339,6 +569,7 @@ def parse_arguments(argv: Optional[List[str]] = None): def main(argv: Optional[List[str]] = None): args = parse_arguments(argv) + logger.info("Arguments: %s", str(args)) optimize_stable_diffusion_pipeline( args.input, args.output, args.overwrite, args.use_external_data_format, args.float16, args.inspect, args diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index fe80a08829263..2a6f9c3d758db 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -233,15 +233,21 @@ def get_nodes_by_op_type(self, op_type): nodes.append(node) return nodes - def get_children(self, node, input_name_to_nodes=None): + def get_children(self, node, input_name_to_nodes=None, output_index=None): if input_name_to_nodes is None: input_name_to_nodes = self.input_name_to_nodes() children = [] - for output in node.output: - if output in input_name_to_nodes: - for node in input_name_to_nodes[output]: - children.append(node) # noqa: PERF402 + if output_index is not None: + if output_index < len(node.output): + output = node.output[output_index] + if output in input_name_to_nodes: + children = list(input_name_to_nodes[output]) + else: + for output in node.output: + if output in input_name_to_nodes: + children.extend(input_name_to_nodes[output]) + return children def get_parents(self, node, output_name_to_node=None): @@ -436,48 +442,63 @@ def match_child_path( self, node, child_op_types, - child_output_index=None, - return_indice=None, + edges: Optional[List[Tuple[int, int]]] = None, + input_name_to_nodes=None, exclude=[], # noqa: B006 ): """ Find a sequence of input edges based on constraints on parent op_type and index. - When input_index is None, we will find the first parent node based on constraints, - and return_indice will be appended the corresponding input index. + Note that we use greedy approach and only consider the first matched child, so it has chance to miss matching. Args: node (str): current node name. child_op_types (str): constraint of child node op_type of each input edge. - child_output_index (list): constraint of input index of each input edge. None means no constraint. - return_indice (list): a list to append the input index - When there is no constraint on input index of an edge. 
+ edges (list): each edge is represented by two integers: output index of parent node, input index of child node. + None means no constraint. + exclude(list): list of nodes that are excluded (not allowed to match as child). Returns: children: a list of matched children node. """ - if child_output_index is not None: - assert len(child_output_index) == len(child_op_types) + if edges is not None: + assert len(edges) == len(child_op_types) + for edge in edges: + assert ( + isinstance(edge, tuple) and len(edge) == 2 and isinstance(edge[0], int) and isinstance(edge[1], int) + ) + + if input_name_to_nodes is None: + input_name_to_nodes = self.input_name_to_nodes() current_node = node matched_children = [] for i, op_type in enumerate(child_op_types): matched_child = None - node_children = self.get_children(current_node) - for child_i, child in enumerate(node_children): + + if edges is None: + children_nodes = self.get_children(current_node, input_name_to_nodes=input_name_to_nodes) + else: + children_nodes = self.get_children( + current_node, input_name_to_nodes=input_name_to_nodes, output_index=edges[i][0] + ) + + for child in children_nodes: if child.op_type == op_type and child not in exclude: - if child_output_index is not None and child_output_index[i] != child_i: - logger.debug( - f"Failed to match index={i} child_output_index={child_output_index[i]} op_type={op_type}", - stack_info=True, - ) - return None + if edges is not None and child.input[edges[i][1]] != current_node.output[edges[i][0]]: + continue + + # Here we use greedy approach and only consider the first matched child. + # TODO: match recursively if we encounter cases that the correct child is not the first matched. matched_child = child + break + if matched_child is None: - logger.debug(f"Failed to match child op_type={op_type}", stack_info=True) + logger.debug(f"Failed to match child {i} op_type={op_type}", stack_info=True) return None matched_children.append(matched_child) current_node = matched_child + return matched_children def find_first_parent_by_type(self, node, parent_type, output_name_to_node=None, recursive=True): diff --git a/onnxruntime/python/tools/transformers/onnx_model_mmdit.py b/onnxruntime/python/tools/transformers/onnx_model_mmdit.py new file mode 100644 index 0000000000000..80d408e671979 --- /dev/null +++ b/onnxruntime/python/tools/transformers/onnx_model_mmdit.py @@ -0,0 +1,122 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +import logging +from typing import Optional + +from fusion_layernorm import FusionLayerNormalization +from fusion_mha_mmdit import FusionMultiHeadAttentionMMDit +from fusion_options import FusionOptions +from import_utils import is_installed +from onnx import ModelProto +from onnx_model_bert import BertOnnxModel + +logger = logging.getLogger(__name__) + + +class MmditOnnxModel(BertOnnxModel): + def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): + """Initialize Multimodal Diffusion Transformer (MMDiT) ONNX Model. + + Args: + model (ModelProto): the ONNX model + num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). + hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). 
+ """ + assert (num_heads == 0 and hidden_size == 0) or (num_heads > 0 and hidden_size % num_heads == 0) + super().__init__(model, num_heads=num_heads, hidden_size=hidden_size) + + def postprocess(self): + self.prune_graph() + self.remove_unused_constant() + + def fuse_layer_norm(self): + layernorm_support_broadcast = True + logger.warning( + "The optimized model requires LayerNormalization with broadcast support. " + "Please use onnxruntime-gpu>=1.21 for inference." + ) + fusion = FusionLayerNormalization( + self, check_constant_and_dimension=not layernorm_support_broadcast, force=True + ) + fusion.apply() + + def fuse_multi_head_attention(self): + fusion = FusionMultiHeadAttentionMMDit(self) + fusion.apply() + + def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False): + assert not add_dynamic_axes + + if is_installed("tqdm"): + import tqdm + from tqdm.contrib.logging import logging_redirect_tqdm + + with logging_redirect_tqdm(): + steps = 5 + progress_bar = tqdm.tqdm(range(steps), initial=0, desc="fusion") + self._optimize(options, progress_bar) + else: + logger.info("tqdm is not installed. Run optimization without progress bar") + self._optimize(options, None) + + def _optimize(self, options: Optional[FusionOptions] = None, progress_bar=None): + if (options is not None) and not options.enable_shape_inference: + self.disable_shape_inference() + + # Remove cast nodes that having same data type of input and output based on symbolic shape inference. + self.utils.remove_useless_cast_nodes() + if progress_bar: + progress_bar.update(1) + + if (options is None) or options.enable_layer_norm: + self.fuse_layer_norm() + self.fuse_simplified_layer_norm() + if progress_bar: + progress_bar.update(1) + + if (options is None) or options.enable_gelu: + self.fuse_gelu() + if progress_bar: + progress_bar.update(1) + + if (options is None) or options.enable_attention: + self.fuse_multi_head_attention() + if progress_bar: + progress_bar.update(1) + + # TODO: SkipLayerNormalization does not support broadcast yet. + # if (options is None) or options.enable_skip_layer_norm: + # self.fuse_skip_simplified_layer_norm() + # self.fuse_skip_layer_norm() + # if (options is None) or options.enable_bias_skip_layer_norm: + # # Fuse SkipLayerNormalization and Add Bias before it. + # self.fuse_add_bias_skip_layer_norm() + + self.postprocess() + if progress_bar: + progress_bar.update(1) + + logger.info(f"opset version: {self.get_opset_version()}") + + def get_fused_operator_statistics(self): + """ + Returns node count of fused operators. 
+ """ + op_count = {} + ops = [ + "FastGelu", + "MultiHeadAttention", + "LayerNormalization", + # "SkipLayerNormalization", + "SimplifiedLayerNormalization", + ] + + for op in ops: + nodes = self.get_nodes_by_op_type(op) + op_count[op] = len(nodes) + + logger.info(f"Optimized operators:{op_count}") + return op_count diff --git a/onnxruntime/python/tools/transformers/onnx_model_t5.py b/onnxruntime/python/tools/transformers/onnx_model_t5.py index 9cc4878e8022d..70742bb5f52e3 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_t5.py +++ b/onnxruntime/python/tools/transformers/onnx_model_t5.py @@ -75,9 +75,10 @@ def create_attention_node( k_weight = self.model.get_initializer(k_matmul.input[1]) v_weight = self.model.get_initializer(v_matmul.input[1]) - if q_weight is None: + if q_weight is None or k_weight is None or v_weight is None: + matmul = q_matmul if q_weight is None else k_matmul if k_weight is None else v_matmul print( - f"{q_matmul.input[1]} is not an initializer. " + f"{matmul.input[1]} is not an initializer. " "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion" ) return None @@ -222,9 +223,7 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no return qkv_nodes = self.model.match_parent_path( - normalize_node, - ["MatMul", "Reshape", "Transpose", "MatMul"], - [1, 0, 0, 0], + normalize_node, ["MatMul", "Reshape", "Transpose", "MatMul"], [1, 0, 0, 0], output_name_to_node ) if qkv_nodes is None: return @@ -235,6 +234,7 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no reshape_qkv, ["Concat", "Unsqueeze", "Gather", "Shape"], [1, 0, 0, 0], + output_name_to_node, ) if qkv_shape_nodes is None: return @@ -244,6 +244,7 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no matmul_qkv, ["Transpose", "Reshape", "MatMul"], [1, 0, 0], + output_name_to_node, ) if v_nodes is None: return @@ -254,28 +255,64 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no matmul_qkv, ["Softmax", "Add", "MatMul"], [0, 0, 0], + output_name_to_node, ) if qk_nodes is None: return _, add_qk, matmul_qk = qk_nodes - mask_index = None mask_nodes = self.model.match_parent_path( add_qk, ["Add", "Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [1, 1, 0, 1, 0, 0], + output_name_to_node, ) + + is_pattern_for_one_graph_input = mask_nodes is None if mask_nodes is None: - return - mul_node = mask_nodes[1] - if mask_nodes[1].op_type != "Mul": - return + # Pattern for SD3 and Flux. + mask_nodes = self.model.match_parent_path( + add_qk, + ["Add", "Slice", "Mul", "Sub", "Unsqueeze", "Unsqueeze"], + [1, 1, 0, 0, 1, 0], + output_name_to_node, + ) + if mask_nodes is None: + return + mul_node = mask_nodes[2] + else: + mul_node = mask_nodes[1] _, mul_val = self.model.get_constant_input(mul_node) - if mul_val != -10000: - self.mask_filter_value = mul_val + if mul_val is None: + return - mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) + if mul_val != -10000: + self.mask_filter_value = float(mul_val) + + # If the mask is derived from shape of input_ids, it means there is no padding mask. 
+ mask_nodes_2 = self.model.match_parent_path( + mask_nodes[-1], + ["ConstantOfShape", "Concat", "Unsqueeze", "Gather", "Shape"], + [0, 0, 0, 0, 0], + output_name_to_node, + ) + mask_nodes_3 = self.model.match_parent_path( + mask_nodes[-1], + ["ConstantOfShape", "Concat", "Unsqueeze", "Gather", "Shape"], + [0, 0, 1, 0, 0], + output_name_to_node, + ) + if ( + mask_nodes_2 is not None + and any(input.name == mask_nodes_2[-1].input[0] for input in self.model.graph().input) + and mask_nodes_3 is not None + and mask_nodes_2[-1].input[0] == mask_nodes_3[-1].input[0] + and len(mask_nodes_2[1].input) == 2 + ): + mask_index = "" + else: + mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) res_pos_bias = None rpb_nodes = self.model.match_parent_path( @@ -283,10 +320,17 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no ["Add", "RelativePositionBias"], [1, 0], ) + if rpb_nodes is None and is_pattern_for_one_graph_input: + # Pattern for SD3 and Flux. + rpb_nodes = self.model.match_parent_path( + add_qk, + ["Add", "Slice", "RelativePositionBias"], + [1, 0, 0], + ) if rpb_nodes is None: return - rpb_add_node = rpb_nodes[0] - res_pos_bias = rpb_add_node.input[0] + + res_pos_bias = rpb_nodes[-1].output[0] k_nodes = self.model.match_parent_path( matmul_qk, @@ -332,13 +376,7 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no self.nodes_to_add.append(new_node) self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.nodes_to_remove.extend(qkv_nodes[1:]) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(k_nodes[:-1]) - if v_nodes is not None: - self.nodes_to_remove.extend(v_nodes[:-1]) - self.nodes_to_remove.extend(q_nodes[:-1]) - + self.nodes_to_remove.append(reshape_qkv) self.prune_graph = True def fuse_t5_decoder(self, normalize_node, input_name_to_nodes, output_name_to_node): @@ -591,12 +629,7 @@ def fuse_t5_decoder(self, normalize_node, input_name_to_nodes, output_name_to_no self.nodes_to_add.append(new_node) self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.nodes_to_remove.extend(qkv_nodes[1:]) - self.nodes_to_remove.extend(qk_nodes) - self.nodes_to_remove.extend(k_nodes[:-1]) - if v_nodes is not None: - self.nodes_to_remove.extend(v_nodes[:-1]) - self.nodes_to_remove.extend(q_nodes[:-1]) + self.nodes_to_remove.append(reshape_qkv) self.prune_graph = True @@ -605,7 +638,6 @@ class FusionRelativePositionBiasBlock(Fusion): def __init__(self, model: OnnxModel, max_distance: int): super().__init__(model, "RelativePositionBias", ["Add", "Slice"]) self.max_distance = max_distance - # bidirectional=(not self.is_decoder) self.is_bidirectional = False def fuse(self, node, input_name_to_nodes, output_name_to_node): @@ -615,11 +647,11 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): return compute_bias_nodes = self.model.match_parent_path( - node, ["Unsqueeze", "Transpose", "Gather", "Where"], [0, 0, 0, 1] + node, ["Unsqueeze", "Transpose", "Gather", "Where"], [0, 0, 0, 1], output_name_to_node ) if compute_bias_nodes is None: compute_bias_nodes = self.model.match_parent_path( - node, ["Unsqueeze", "Transpose", "Gather", "Add", "Where"], [0, 0, 0, 1, 1] + node, ["Unsqueeze", "Transpose", "Gather", "Add", "Where"], [0, 0, 0, 1, 1], output_name_to_node ) if compute_bias_nodes is None: return @@ -632,20 +664,29 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): where, ["Min", "ConstantOfShape", "Shape", "Add", "Cast", "Mul", "Div", "Log", 
"Div"], [2, 1, 0, 0, 0, 0, 0, 0, 0], + output_name_to_node, ) if compute_buckets_nodes is None: return + # It is possible to deduce max_distance from a Div node: + # The value of self.model.get_constant_value(compute_buckets_nodes[-3].input[1]) is close to + # math.log(max_distance / (relative_attention_num_buckets // (4 if is_bidirectional else 2))) + # See https://github.com/huggingface/transformers/blob/608e163b527eaee41e650ffb9eb4c422d2679902/src/transformers/models/t5/modeling_t5.py#L397. + # Most t5 models use max_distance=128, so we hardcode it unitl we see a model with different value. + # TODO: maybe add a sanity check here. + div = compute_buckets_nodes[-1] range_nodes = self.model.match_parent_path( div, ["Cast", "Neg", "Min", "ConstantOfShape", "Shape", "Sub", "Unsqueeze", "Range"], [0, 0, 0, 1, 0, 0, 0, 0], + output_name_to_node, ) if range_nodes is None: range_nodes = self.model.match_parent_path( - div, ["Cast", "Abs", "Sub", "Unsqueeze", "Range"], [0, 0, 0, 0, 0] + div, ["Cast", "Abs", "Sub", "Unsqueeze", "Range"], [0, 0, 0, 0, 0], output_name_to_node ) self.is_bidirectional = True if range_nodes is None: @@ -653,17 +694,20 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): range_node = range_nodes[-1] - self.nodes_to_remove.extend(compute_bias_nodes) - self.nodes_to_remove.extend(compute_buckets_nodes) - self.nodes_to_remove.extend(range_nodes) + self.nodes_to_remove.append(unsqueeze) + self.prune_graph = True - node_name_prefix = "encoder" if self.is_bidirectional else "decoder" + node_name = self.model.create_node_name( + "RelativePositionBias", name_prefix="RelPosBias_" + ("encoder" if self.is_bidirectional else "decoder") + ) table_weight_i = self.model.get_initializer(gather.input[0]) + if table_weight_i is None: + return table_weight = NumpyHelper.to_array(table_weight_i) table_weight_t = np.transpose(table_weight) bias_table = helper.make_tensor( - name=self.model.create_node_name("bias_table_weight", name_prefix=node_name_prefix), + name=node_name + "_bias_table_weight", data_type=TensorProto.FLOAT, dims=[np.shape(table_weight)[0], np.shape(table_weight)[1]], vals=table_weight_t.tobytes(), @@ -677,7 +721,7 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): "RelativePositionBias", inputs=inputs, outputs=outputs, - name=self.model.create_node_name("RelativePositionBias", name_prefix=node_name_prefix), + name=node_name, ) rpb_node.domain = "com.microsoft" rpb_node.attribute.extend([helper.make_attribute("max_distance", self.max_distance)]) @@ -688,14 +732,19 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): class T5OnnxModel(BertOnnxModel): - def __init__(self, model, num_heads, hidden_size): + def __init__(self, model, num_heads: int = 0, hidden_size: int = 0): super().__init__(model, num_heads, hidden_size) self.attention_mask = AttentionMask(self) + + # When the model has only one input (input_ids), there is no padding mask. + if len(self.model.graph.input) == 1: + from fusion_options import AttentionMaskFormat + + self.attention_mask.mask_format = AttentionMaskFormat.NoMask + self.attention_fusion = FusionT5Attention(self, self.hidden_size, self.num_heads, self.attention_mask) self.layer_norm_fusion = FusionSimplifiedLayerNormalization(self) self.skip_layer_norm_fusion = FusionSkipSimplifiedLayerNormalization(self) - # TODO: consider retrieve max_distance from model. 
- # math.log(max_distance / (num_buckets // 2)) self.rpb_fusion = FusionRelativePositionBiasBlock(self, 128) def fuse_attention(self): @@ -704,9 +753,65 @@ def fuse_attention(self): def fuse_layer_norm(self): self.layer_norm_fusion.apply() - def fuse_skip_layer_norm(self): + def fuse_skip_layer_norm(self, shape_infer=True): self.skip_layer_norm_fusion.apply() + def adjust_rel_pos_bis_length_input(self): + # For T5 encoder, it uses complex logic to compute the query and key length when there is only one graph input (input_ids) + # We can directly get the length from shape (the 2nd dimension) of input_ids. + for node in self.nodes(): + if node.op_type == "RelativePositionBias": + nodes = self.match_parent_path( + node, + [ + "Gather", + "Shape", + "Transpose", + "Reshape", + "Concat", + "Unsqueeze", + "Gather", + "Shape", + "SimplifiedLayerNormalization", + "Gather", + ], + [1, 0, 0, 0, 1, 0, 0, 0, 0, 0], + ) + # TODO: more validation on node attributes + if nodes is not None: + graph_input_names = [input.name for input in self.model.graph.input] + if nodes[-1].input[1] in graph_input_names: + node_name = self.create_node_name("Shape", name_prefix="Added_Shape_") + shape_node = helper.make_node( + "Shape", + inputs=[nodes[-1].input[1]], + outputs=[node_name + "_Output"], + name=node_name, + ) + + indices_1 = helper.make_tensor( + name="Constant_Index_1", + data_type=TensorProto.INT64, + dims=[1], # Shape of the tensor + vals=[1], # Tensor values + ) + self.add_initializer(indices_1) + + gather = helper.make_node( + "Gather", + inputs=[node_name + "_Output", "Constant_Index_1"], + outputs=[node_name + "_Output_Gather_1"], + name=self.create_node_name("Gather", name_prefix="Added_Gather_"), + axis=0, + ) + + self.add_node(shape_node) + self.add_node(gather) + node.input[1] = node_name + "_Output_Gather_1" + node.input[2] = node_name + "_Output_Gather_1" + + break + # Remove get_extended_attention_mask() since it generates all zeros. def remove_extended_mask_decoder_init(self): nodes_to_remove = [] @@ -787,5 +892,6 @@ def postprocess(self): # remove get_extended_attention_mask() since it generates all zeros. self.remove_extended_mask_decoder_init() self.remove_extended_mask_decoder() + self.adjust_rel_pos_bis_length_input() self.prune_graph() diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index 933bd785dc00d..a83c54e345d7d 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -35,6 +35,7 @@ from onnx_model_clip import ClipOnnxModel from onnx_model_conformer import ConformerOnnxModel from onnx_model_gpt2 import Gpt2OnnxModel +from onnx_model_mmdit import MmditOnnxModel from onnx_model_phi import PhiOnnxModel from onnx_model_sam2 import Sam2OnnxModel from onnx_model_t5 import T5OnnxModel @@ -66,6 +67,7 @@ "unet": (UnetOnnxModel, "pytorch", 1), # UNet in Stable Diffusion "vae": (VaeOnnxModel, "pytorch", 1), # UAE in Stable Diffusion "vit": (BertOnnxModel, "pytorch", 1), + "mmdit": (MmditOnnxModel, "pytorch", 1), } @@ -237,7 +239,9 @@ def optimize_by_fusion( Returns: object of an optimizer class. 
""" - if model_type not in ["bert", "swin", "unet", "vae", "clip", "sam2"] and (num_heads == 0 or hidden_size == 0): + if model_type not in ["bert", "t5", "swin", "unet", "vae", "clip", "sam2", "mmdit"] and ( + num_heads == 0 or hidden_size == 0 + ): logger.warning(f"Please specify parameters of num_heads and hidden_size for model_type {model_type}") if model_type not in MODEL_TYPES: