diff --git a/src/plugins/intel_gpu/src/graph/gemm.cpp b/src/plugins/intel_gpu/src/graph/gemm.cpp
index 25007cb93b18d5..8bd31ba282db22 100644
--- a/src/plugins/intel_gpu/src/graph/gemm.cpp
+++ b/src/plugins/intel_gpu/src/graph/gemm.cpp
@@ -176,6 +176,27 @@ std::vector<layout> gemm_inst::transform_input_layouts(const std::shared_ptr<const gemm> primitive,
+    auto get_input_padding = [](const layout& layout, size_t input_rank, size_t output_rank) {
+        const auto& pad = layout.data_padding;
+        std::vector<tensor::value_type> pad_lower, pad_upper, pad_mask;
+        for (size_t i = 0; i < input_rank; i++) {
+            pad_lower.push_back(pad._lower_size[i]);
+            pad_upper.push_back(pad._upper_size[i]);
+            pad_mask.push_back(pad._dynamic_dims_mask[i]);
+        }
+
+        size_t ones_to_add = std::max(output_rank, static_cast<size_t>(4)) - input_rank;
+        pad_lower.insert(pad_lower.begin(), ones_to_add, 0);
+        pad_upper.insert(pad_upper.begin(), ones_to_add, 0);
+        pad_mask.insert(pad_mask.begin(), ones_to_add, 0);
+
+        padding::DynamicDimsMask updated_pad_mask;
+        for (size_t i = 0; i < pad_mask.size(); i++) {
+            updated_pad_mask[i] = pad_mask[i];
+        }
+        return padding(pad_lower, pad_upper, updated_pad_mask);
+    };
+
     auto input0_pshape = input_layouts[0].get_partial_shape();
     auto input1_pshape = input_layouts[1].get_partial_shape();
 
@@ -190,6 +211,10 @@ std::vector<layout> gemm_inst::transform_input_layouts(const std::shared_ptr<const gemm> primitive,
     std::vector<layout> layouts = input_layouts;
     layouts[0].set_partial_shape(transposed_input0_pshape);
     layouts[1].set_partial_shape(transposed_input1_pshape);
+    if (layouts[0].data_padding)
+        layouts[0].data_padding = get_input_padding(layouts[0], input_rank, output_rank);
+    if (layouts[1].data_padding)
+        layouts[1].data_padding = get_input_padding(layouts[1], weight_rank, output_rank);
 
     if (primitive->input_size() == 3) {
         auto bias_pshape = input_layouts[2].get_partial_shape();
diff --git a/src/plugins/intel_gpu/src/graph/include/reshape_inst.h b/src/plugins/intel_gpu/src/graph/include/reshape_inst.h
index d6a71c20fcac8d..8c7a6a5dd7a501 100644
--- a/src/plugins/intel_gpu/src/graph/include/reshape_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/reshape_inst.h
@@ -6,7 +6,9 @@
 #include "intel_gpu/primitives/reshape.hpp"
 #include "intel_gpu/runtime/tensor_accessor.hpp"
 #include "openvino/core/partial_shape.hpp"
+#include "concatenation_inst.h"
 #include "crop_inst.h"
+#include "kv_cache_inst.h"
 #include "rope_inst.h"
 #include "mvn_inst.h"
 #include "primitive_inst.h"
@@ -50,6 +52,9 @@ struct typed_program_node<reshape> : public typed_program_node_base<reshape> {
             return true;
         }
 
+        if (batch_can_be_squeezed())
+            return true;
+
         // TODO: This function is to limit condition to a specific case (crop + reshape) among cases for the base mode
         if (!input().is_type<crop>())
             return false;
@@ -91,6 +96,25 @@ struct typed_program_node<reshape> : public typed_program_node_base<reshape> {
         return true;
     }
 
+    bool batch_can_be_squeezed() const {
+        auto prim = typed_desc();
+        if (prim->mode == reshape::reshape_mode::base) {
+            if (!input().is_type<kv_cache>() || !prim->output_pattern.empty() || !get_dependency(1).is_type<concatenation>())
+                return false;
+
+            const auto& kv_cache_ps = input().get_output_layout(false).get_partial_shape();
+            const auto& concat_ps = get_dependency(1).get_output_layout(false).get_partial_shape();
+            if (concat_ps.size() != 1 || concat_ps[0].is_dynamic())
+                return false;
+
+            if (kv_cache_ps.size() - 1 != static_cast<size_t>(concat_ps[0].get_length()))
+                return false;
+
+            return true;
+        }
+        return false;
+    }
+
     bool has_padding() const {
         return (this->get_output_layout().data_padding
             || input().get_output_layout(false).data_padding
@@ -144,7 +168,7 @@ struct typed_program_node<reshape> : public typed_program_node_base<reshape> {
         if (input_layout.data_padding.is_dynamic()) {
             auto prim = typed_desc();
             // TODO: If outer padding exists, ouput padding propagation is not supported in the base mode
-            if (prim->mode == reshape::reshape_mode::base)
+            if (prim->mode == reshape::reshape_mode::base && !batch_can_be_squeezed())
                 return;
             ov::PartialShape pattern_shape = { static_cast<int64_t>(prim->output_pattern.size()) };
diff --git a/src/plugins/intel_gpu/src/graph/reshape.cpp b/src/plugins/intel_gpu/src/graph/reshape.cpp
index e5e33f4ad87b14..128cfaba8a8534 100644
--- a/src/plugins/intel_gpu/src/graph/reshape.cpp
+++ b/src/plugins/intel_gpu/src/graph/reshape.cpp
@@ -20,19 +20,19 @@ namespace cldnn {
 GPU_DEFINE_PRIMITIVE_TYPE_ID(reshape)
 
 padding propagate_padding(const layout& in_layout, const ov::PartialShape& out_shape, reshape::reshape_mode mode, const ov::ITensorAccessor& ta) {
-    if (mode == reshape::reshape_mode::base)
-        return padding();
-
     auto in_pad = in_layout.data_padding;
     if (!in_pad.is_dynamic()) {
         return padding();
     }
 
     std::vector<int64_t> axes;
-    if (auto t = ta(1)) {
-        axes = ov::get_tensor_data_as<std::vector<int64_t>>(t);
-    } else {
-        OPENVINO_THROW("[GPU] Can't propagate padding for reshape op as axes data is not available");
+    // axes data is only needed when reshape mode is unsqueeze or squeeze
+    if (mode != reshape::reshape_mode::base) {
+        if (auto t = ta(1)) {
+            axes = ov::get_tensor_data_as<std::vector<int64_t>>(t);
+        } else {
+            OPENVINO_THROW("[GPU] Can't propagate padding for reshape op as axes data is not available");
+        }
     }
 
     auto rank = in_layout.get_partial_shape().size();
@@ -76,7 +76,7 @@ padding propagate_padding(const layout& in_layout, const ov::PartialShape& out_shape, reshape::reshape_mode mode, const ov::ITensorAccessor& ta) {
                 update_pad_mask.push_back(0);
             }
         }
-    } else {
+    } else if (mode == reshape::reshape_mode::squeeze) {
         std::unordered_set<int64_t> unique_axes;
         std::transform(axes.begin(), axes.end(), std::inserter(unique_axes, unique_axes.end()), [=](int64_t axis) {
             return ov::util::normalize(axis, rank);
@@ -96,6 +96,11 @@ padding propagate_padding(const layout& in_layout, const ov::PartialShape& out_shape, reshape::reshape_mode mode, const ov::ITensorAccessor& ta) {
                 return padding();
             }
         }
+    } else {
+        // padding propagation is allowed only if the batch dimension can be squeezed
+        update_pad_lower = std::vector(pad_lower.begin() + 1, pad_lower.end());
+        update_pad_upper = std::vector(pad_upper.begin() + 1, pad_upper.end());
+        update_pad_mask = std::vector(pad_mask.begin() + 1, pad_mask.end());
     }
 
     // TODO: rework this method
@@ -189,10 +194,14 @@ std::vector<layout> reshape_inst::calc_output_layouts(reshape_node const& node, kernel_impl_params const& impl_param) {
             op.set_special_zero(prim->special_zero);
             op.set_friendly_name(prim->id.c_str());
             output_shapes = ov::op::v1::shape_infer(&op, input_shapes, ta);
-            // If the reshape is base mode, it is currently not set as can_be_optimized at prepare_buffer_fusing.
-            // So we can just run the reshape kernel
+            // If the reshape is base mode, it is currently not set as can_be_optimized at prepare_buffer_fusing
+            // On the other hand, it is only allowed if the batch dimension can be squeezed
+            // In other cases, we can just run the reshape kernel
             // TODO: allow propagatable reshapes
-            out_pad = padding();
+            if (node.batch_can_be_squeezed())
+                out_pad = propagate_padding(input_layout, output_shapes[0], prim->mode, ta);
+            else
+                out_pad = padding();
             break;
         }
         case reshape::reshape_mode::squeeze: {
diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index 5adc1e691b82a7..c204ddbd3dec16 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -15,6 +15,8 @@
 #include "convolution_inst.h"
 #include "gather_inst.h"
 #include "gemm_inst.h"
+#include "kv_cache_inst.h"
+#include "read_value_inst.h"
 #include "reshape_inst.h"
 #include "fully_connected_inst.h"
 #include "permute_inst.h"
@@ -1494,3 +1496,62 @@ TEST(prepare_buffer_fusing, inner_axis_data_offset_with_gemm_user) {
     auto& crop_node = prog->get_node("crop2").as<crop>();
     ASSERT_FALSE(crop_node.can_be_optimized());
 }
+
+TEST(prepare_buffer_fusing, skip_reshape_batch_can_be_squeezed) {
+    auto& engine = get_test_engine();
+
+    auto input_beam_idx_lay = layout{ov::PartialShape{-1}, data_types::i32, format::bfyx};
+    auto input_present_lay = layout{ov::PartialShape{-1, 8, -1, 64}, data_types::f32, format::bfyx};
+    auto input_param_lay = layout{ov::PartialShape{1}, data_types::f32, format::bfyx};
+    auto gemm_input_lay = layout{ov::PartialShape{-1, -1, -1}, data_types::f32, format::bfyx};
+
+    ov::op::util::VariableInfo info{ov::PartialShape{-1, 8, -1, 64}, data_types::f32, "v0"};
+    auto input_kv_lay = layout{info.data_shape, info.data_type, format::bfyx};
+    topology topology(input_layout("beam_idx", input_beam_idx_lay),
+                      input_layout("present", input_present_lay),
+                      input_layout("param1", input_param_lay),
+                      input_layout("param2", input_param_lay),
+                      input_layout("param3", input_param_lay),
+                      input_layout("gemm_input", gemm_input_lay),
+                      read_value("kv_cache", std::vector<input_info>{}, info.variable_id, {input_kv_lay}),
+                      gather("gather",
+                             input_info("kv_cache"),
+                             input_info("beam_idx"),
+                             0,                                       // axis
+                             input_kv_lay.get_partial_shape().size(), // input rank
+                             ov::Shape{},                             // output shape
+                             0,                                       // batch_dim
+                             true),                                   // support_neg_ind
+                      kv_cache("concat1", {input_info("gather"), input_info("present")}, info, 2, 0, false),
+                      concatenation("concat2", {input_info("param1"), input_info("param2"), input_info("param3")}, 0),
+                      reshape("reshape", input_info("concat1"), input_info("concat2"), false, ov::PartialShape{-1, -1, 64}, cldnn::reshape::reshape_mode::base),
+                      gemm("gemm",
+                           {input_info("gemm_input"), input_info("reshape")},
+                           data_types::f32,
+                           std::vector<int64_t>{ 0, 1, 2 },
+                           std::vector<int64_t>{ 0, 1, 2 },
+                           std::vector<int64_t>{ 0, 1, 2 },
+                           1.0f,
+                           0.0f),
+                      reorder("reorder", input_info("gemm"), format::bfyx, data_types::f32));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+
+    network network(engine, topology, config);
+    auto reshape_inst = network.get_primitive("reshape");
+
+    ASSERT_EQ(reshape_inst->get_node().can_be_optimized(), true);
+    ASSERT_EQ(reshape_inst->can_be_optimized(), true);
+
+    auto pad = tensor(0);
+    pad.feature[0] = 1;
+    {
+        std::vector<int32_t> dynamic_pad_mask;
+        const auto& dynamic_pad_dims = reshape_inst->get_output_layout(0).data_padding._dynamic_dims_mask;
+        for (size_t i = 0; i < dynamic_pad_dims.size(); i++)
+            dynamic_pad_mask.push_back(dynamic_pad_dims[i]);
+        ASSERT_EQ(tensor(dynamic_pad_mask, 0), pad);
+    }
+}