[GPU] Allow base mode reshape to be optimized for case where batch can be squeezed #27830

Open · wants to merge 3 commits into master
25 changes: 25 additions & 0 deletions src/plugins/intel_gpu/src/graph/gemm.cpp
@@ -176,6 +176,27 @@ std::vector<layout> gemm_inst::transform_input_layouts(const std::shared_ptr<con
return transposed_input_pshape;
};

auto get_input_padding = [&](const layout& layout, size_t input_rank, size_t output_rank) {
auto pad = layout.data_padding;
std::vector<tensor::value_type> pad_lower, pad_upper, pad_mask;
for (size_t i = 0; i < input_rank; i++) {
pad_lower.push_back(pad._lower_size[i]);
pad_upper.push_back(pad._upper_size[i]);
pad_mask.push_back(pad._dynamic_dims_mask[i]);
}

size_t ones_to_add = std::max(output_rank, static_cast<size_t>(4)) - input_rank;
pad_lower.insert(pad_lower.begin(), ones_to_add, 0);
pad_upper.insert(pad_upper.begin(), ones_to_add, 0);
pad_mask.insert(pad_mask.begin(), ones_to_add, 0);

padding::DynamicDimsMask updated_pad_mask;
for (size_t i = 0; i < pad_mask.size(); i++) {
updated_pad_mask[i] = pad_mask[i];
}
return padding(pad_lower, pad_upper, updated_pad_mask);
};

auto input0_pshape = input_layouts[0].get_partial_shape();
auto input1_pshape = input_layouts[1].get_partial_shape();

@@ -190,6 +211,10 @@ std::vector<layout> gemm_inst::transform_input_layouts(const std::shared_ptr<con
std::vector<layout> layouts = input_layouts;
layouts[0].set_partial_shape(transposed_input0_pshape);
layouts[1].set_partial_shape(transposed_input1_pshape);
if (layouts[0].data_padding)
layouts[0].data_padding = get_input_padding(layouts[0], input_rank, output_rank);
if (layouts[1].data_padding)
layouts[1].data_padding = get_input_padding(layouts[1], weight_rank, output_rank);

if (primitive->input_size() == 3) {
auto bias_pshape = input_layouts[2].get_partial_shape();
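For illustration, the rank-extension step above can be sketched in isolation. A minimal standalone analogue (plain std::vector in place of cldnn's padding/tensor types; extend_pad is a hypothetical helper, not plugin code):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical standalone analogue of get_input_padding's core step: when a
// gemm input of rank input_rank is unsqueezed to max(output_rank, 4), the new
// leading dims are broadcast dims and carry zero padding, so the existing
// per-dim pads simply shift right.
std::vector<int> extend_pad(std::vector<int> pad, size_t input_rank, size_t output_rank) {
    const size_t ones_to_add = std::max(output_rank, static_cast<size_t>(4)) - input_rank;
    pad.insert(pad.begin(), ones_to_add, 0);
    return pad;
}

int main() {
    // A rank-3 input padded on its middle dim, extended to a rank-4 output:
    for (int p : extend_pad({0, 2, 0}, 3, 4))
        std::cout << p << ' ';  // prints: 0 0 2 0
}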
26 changes: 25 additions & 1 deletion src/plugins/intel_gpu/src/graph/include/reshape_inst.h
@@ -6,7 +6,9 @@
#include "intel_gpu/primitives/reshape.hpp"
#include "intel_gpu/runtime/tensor_accessor.hpp"
#include "openvino/core/partial_shape.hpp"
#include "concatenation_inst.h"
#include "crop_inst.h"
#include "kv_cache_inst.h"
#include "rope_inst.h"
#include "mvn_inst.h"
#include "primitive_inst.h"
@@ -50,6 +52,9 @@ struct typed_program_node<reshape> : public typed_program_node_base<reshape> {
return true;
}

if (batch_can_be_squeezed())
return true;

// TODO: This function limits the base-mode condition to a specific case (crop + reshape)
if (!input().is_type<crop>())
return false;
@@ -91,6 +96,25 @@ struct typed_program_node<reshape> : public typed_program_node_base<reshape> {
return true;
}

bool batch_can_be_squeezed() const {
auto prim = typed_desc();
if (prim->mode == reshape::reshape_mode::base) {
if (!input().is_type<kv_cache>() || !prim->output_pattern.empty() || !get_dependency(1).is_type<concatenation>())
return false;

const auto& kv_cache_ps = input().get_output_layout(false).get_partial_shape();
const auto& concat_ps = get_dependency(1).get_output_layout(false).get_partial_shape();
if (concat_ps.size() != 1 || concat_ps[0].is_dynamic())
return false;

if (kv_cache_ps.size() - 1 != static_cast<size_t>(concat_ps[0].get_length()))
return false;

return true;
}
return false;
}

bool has_padding() const {
return (this->get_output_layout().data_padding
|| input().get_output_layout(false).data_padding
@@ -144,7 +168,7 @@ struct typed_program_node<reshape> : public typed_program_node_base<reshape> {
if (input_layout.data_padding.is_dynamic()) {
auto prim = typed_desc();
// TODO: If outer padding exists, output padding propagation is not supported in the base mode
if (prim->mode == reshape::reshape_mode::base)
if (prim->mode == reshape::reshape_mode::base && !batch_can_be_squeezed())
return;

ov::PartialShape pattern_shape = { static_cast<int64_t>(prim->output_pattern.size()) };
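For intuition, the gating condition in batch_can_be_squeezed() reduces to a rank check between the kv_cache output and the runtime reshape pattern. A hedged standalone distillation (batch_squeeze_applies is a hypothetical name; shapes are taken from the test below):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical distillation of batch_can_be_squeezed(): a base-mode reshape
// fed by a kv_cache qualifies only when its runtime pattern is a 1D tensor of
// static length equal to input_rank - 1, i.e. the reshape folds away exactly
// one dimension (the leading batch dim).
bool batch_squeeze_applies(std::size_t kv_cache_rank, std::int64_t pattern_length) {
    return kv_cache_rank - 1 == static_cast<std::size_t>(pattern_length);
}

int main() {
    // kv_cache output [-1, 8, -1, 64] (rank 4) with reshape pattern {-1, -1, 64}:
    std::printf("%d\n", batch_squeeze_applies(4, 3) ? 1 : 0);  // prints: 1
}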
31 changes: 20 additions & 11 deletions src/plugins/intel_gpu/src/graph/reshape.cpp
@@ -20,19 +20,19 @@ namespace cldnn {
GPU_DEFINE_PRIMITIVE_TYPE_ID(reshape)

padding propagate_padding(const layout& in_layout, const ov::PartialShape& out_shape, reshape::reshape_mode mode, const ov::ITensorAccessor& ta) {
if (mode == reshape::reshape_mode::base)
return padding();

auto in_pad = in_layout.data_padding;
if (!in_pad.is_dynamic()) {
return padding();
}

std::vector<int64_t> axes;
if (auto t = ta(1)) {
axes = ov::get_tensor_data_as<int64_t, std::vector<int64_t>>(t);
} else {
OPENVINO_THROW("[GPU] Can't propagate padding for reshape op as axes data is not available");
// axes data is only needed when the reshape mode is unsqueeze or squeeze
if (mode != reshape::reshape_mode::base) {
if (auto t = ta(1)) {
axes = ov::get_tensor_data_as<int64_t, std::vector<int64_t>>(t);
} else {
OPENVINO_THROW("[GPU] Can't propagate padding for reshape op as axes data is not available");
}
}

auto rank = in_layout.get_partial_shape().size();
@@ -76,7 +76,7 @@ padding propagate_padding(const layout& in_layout, const ov::PartialShape& out_s
update_pad_mask.push_back(0);
}
}
} else {
} else if (mode == reshape::reshape_mode::squeeze) {
std::unordered_set<int64_t> unique_axes;
std::transform(axes.begin(), axes.end(), std::inserter(unique_axes, unique_axes.end()), [=](int64_t axis) {
return ov::util::normalize(axis, rank);
@@ -96,6 +96,11 @@ padding propagate_padding(const layout& in_layout, const ov::PartialShape& out_s
return padding();
}
}
} else {
// padding propagation is allowed only if the batch dimension can be squeezed
update_pad_lower = std::vector<int32_t>(pad_lower.begin() + 1, pad_lower.end());
update_pad_upper = std::vector<int32_t>(pad_upper.begin() + 1, pad_upper.end());
update_pad_mask = std::vector<int32_t>(pad_mask.begin() + 1, pad_mask.end());
}

// TODO: rework this method
@@ -189,10 +194,14 @@ std::vector<layout> reshape_inst::calc_output_layouts(reshape_node const& node,
op.set_special_zero(prim->special_zero);
op.set_friendly_name(prim->id.c_str());
output_shapes = ov::op::v1::shape_infer(&op, input_shapes, ta);
// If the reshape is base mode, it is currently not set as can_be_optimized at prepare_buffer_fusing.
// So we can just run the reshape kernel
// If the reshape is base mode, it is currently not set as can_be_optimized at prepare_buffer_fusing.
// The optimization is allowed only when the batch dimension can be squeezed;
// in all other cases we simply run the reshape kernel
// TODO: allow propagatable reshapes
out_pad = padding();
if (node.batch_can_be_squeezed())
out_pad = propagate_padding(input_layout, output_shapes[0], prim->mode, ta);
else
out_pad = padding();
break;
}
case reshape::reshape_mode::squeeze: {
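The new base-mode branch amounts to dropping the leading entry of each pad vector. A minimal sketch under that assumption (standalone; squeeze_batch_pad is a hypothetical name, not plugin code):

#include <cstdint>
#include <vector>

// Hypothetical analogue of the new base-mode branch in propagate_padding():
// once the batch dim is known to be squeezable, the output pads are the input
// pads with the leading (batch) entry dropped; inner-dim pads survive, which
// is what lets the reshape alias its input buffer instead of copying.
std::vector<std::int32_t> squeeze_batch_pad(const std::vector<std::int32_t>& pad) {
    return {pad.begin() + 1, pad.end()};
}

// e.g. input pad {0, 0, 3, 0} on [-1, 8, -1, 64] -> output pad {0, 3, 0} on
// the squeezed shape [-1, -1, 64]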
@@ -15,6 +15,8 @@
#include "convolution_inst.h"
#include "gather_inst.h"
#include "gemm_inst.h"
#include "kv_cache_inst.h"
#include "read_value_inst.h"
#include "reshape_inst.h"
#include "fully_connected_inst.h"
#include "permute_inst.h"
@@ -1494,3 +1496,62 @@ TEST(prepare_buffer_fusing, inner_axis_data_offset_with_gemm_user) {
auto& crop_node = prog->get_node("crop2").as<crop>();
ASSERT_FALSE(crop_node.can_be_optimized());
}

TEST(prepare_buffer_fusing, skip_reshape_batch_can_be_squeezed) {
auto& engine = get_test_engine();

auto input_beam_idx_lay = layout{ov::PartialShape{-1}, data_types::i32, format::bfyx};
auto input_present_lay = layout{ov::PartialShape{-1, 8, -1, 64}, data_types::f32, format::bfyx};
auto input_param_lay = layout{ov::PartialShape{1}, data_types::f32, format::bfyx};
auto gemm_input_lay = layout{ov::PartialShape{-1, -1, -1}, data_types::f32, format::bfyx};

ov::op::util::VariableInfo info{ov::PartialShape{-1, 8, -1, 64}, data_types::f32, "v0"};
auto input_kv_lay = layout{info.data_shape, info.data_type, format::bfyx};
topology topology(input_layout("beam_idx", input_beam_idx_lay),
input_layout("present", input_present_lay),
input_layout("param1", input_param_lay),
input_layout("param2", input_param_lay),
input_layout("param3", input_param_lay),
input_layout("gemm_input", gemm_input_lay),
read_value("kv_cache", std::vector<input_info>{}, info.variable_id, {input_kv_lay}),
gather("gather",
input_info("kv_cache"),
input_info("beam_idx"),
0, // axis
input_kv_lay.get_partial_shape().size(), // input rank
ov::Shape{}, // output shape
0, // batch_dim
true), // support_neg_ind
kv_cache("concat1", {input_info("gather"), input_info("present")}, info, 2, 0, false),
concatenation("concat2", {input_info("param1"), input_info("param2"), input_info("param3")}, 0),
reshape("reshape", input_info("concat1"), input_info("concat2"), false, ov::PartialShape{-1, -1, 64}, cldnn::reshape::reshape_mode::base),
gemm("gemm",
{input_info("gemm_input"), input_info("reshape")},
data_types::f32,
std::vector<int64_t>{ 0, 1, 2 },
std::vector<int64_t>{ 0, 1, 2 },
std::vector<int64_t>{ 0, 1, 2 },
1.0f,
0.0f),
reorder("reorder", input_info("gemm"), format::bfyx, data_types::f32));

ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));

network network(engine, topology, config);
auto reshape_inst = network.get_primitive("reshape");

ASSERT_EQ(reshape_inst->get_node().can_be_optimized(), true);
ASSERT_EQ(reshape_inst->can_be_optimized(), true);

auto pad = tensor(0);
pad.feature[0] = 1;
{
std::vector<tensor::value_type> dynamic_pad_mask;
const auto& dynamic_pad_dims = reshape_inst->get_output_layout(0).data_padding._dynamic_dims_mask;
for (size_t i = 0; i < dynamic_pad_dims.size(); i++)
dynamic_pad_mask.push_back(dynamic_pad_dims[i]);
ASSERT_EQ(tensor(dynamic_pad_mask, 0), pad);
}
}
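Assuming a standard local build of the GPU plugin tests, this case can presumably be exercised in isolation via the unit-test binary, e.g. ov_gpu_unit_tests --gtest_filter=prepare_buffer_fusing.skip_reshape_batch_can_be_squeezed (binary name and path may vary by build configuration).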