POC: special case of removing uneffective zeropoint Subtract OP #27943

Draft · wants to merge 4 commits into base: master
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -169,6 +169,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
rewr.add_matcher<ov::npuw::patterns::opt::DQLiftGatherAsymCW>();
rewr.add_matcher<ov::npuw::patterns::opt::DQLiftGatherSymCW>();
rewr.add_matcher<ov::npuw::patterns::opt::DQLiftGatherSymGQ>();
// TODO: POC: remove zero points that prevent the following matcher from working
rewr.add_matcher<ov::npuw::patterns::opt::UneffectiveZP>();
rewr.run_on_model(model);
}

@@ -21,6 +21,7 @@
#include "openvino/util/xml_parse_utils.hpp"
#include "patterns/dcoff.hpp"
#include "patterns/opt.hpp"
#include "openvino/op/ops.hpp"

namespace ov {
namespace npuw {
@@ -1516,9 +1517,39 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) {
LOG_DEBUG("Handling a Constant input " << prod_output);
LOG_BLOCK();

auto new_param = std::make_shared<ov::op::v0::Parameter>(prod_output.get_element_type(),
prod_output.get_partial_shape());
input_desc.replace_source_output(new_param); // (n)/1/i/a
// TODO: tricky part: when a 4D Constant becomes a Parameter it is no longer batch-friendly,
// so let's squeeze this shape
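// e.g. a {1, 11008, 1, 1} Constant becomes a {11008} Parameter and, when the consumer still
// expects the original rank, a Reshape back to {1, 11008, 1, 1} is inserted below (illustrative shape)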
auto partial_sh = prod_output.get_partial_shape();
std::shared_ptr<ov::op::v0::Parameter> new_param;
std::shared_ptr<ov::Node> new_param_or_reshape;

if (partial_sh.all_non_negative()) {
auto static_shape = prod_output.get_shape();
std::vector<size_t> dims;
bool needReshape = false;
for (auto s : static_shape) {
if (s != 1) {
dims.push_back(s);
needReshape = true;
}
}
new_param = std::make_shared<ov::op::v0::Parameter>(prod_output.get_element_type(), ov::Shape{dims});
// don't need two reshapes
if (needReshape && !ov::as_type<ov::op::v1::Reshape>(input_desc.get_node())) {
auto new_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{static_shape.size()}, static_shape);
new_param_or_reshape = std::make_shared<ov::op::v1::Reshape>(new_param, new_const, false);
} else {
new_param_or_reshape = new_param;
}
LOG_DEBUG("PARTITIONER: a new Constant shape: input " << new_param);
LOG_DEBUG("PARTITIONER: a new reshape inserted: " << new_param_or_reshape);

} else {
new_param = std::make_shared<ov::op::v0::Parameter>(prod_output.get_element_type(),
prod_output.get_partial_shape());
new_param_or_reshape = new_param;
}
input_desc.replace_source_output(new_param_or_reshape); // (n)/1/i/a
function._model->add_parameters({std::move(new_param)});
LOG_DEBUG("Register Parameter[" << new_param_idx << "] as input to " << iport.first << " / "
<< iport.second);
173 changes: 155 additions & 18 deletions src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
@@ -12,6 +12,7 @@
#include "openvino/pass/pattern/op/optional.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "openvino/util/common_util.hpp"
#include "transformations/utils/utils.hpp"

namespace ov {
namespace npuw {
@@ -159,14 +160,21 @@ DQMatMulCWi::DQMatMulCWi(Context::Ref ctx) {

auto qcoeff_shape = matched_node_qcoeff->output(0).get_shape();

LOG_DEBUG("DQMatMulCWi matched_qweight->get_element_type(): " << matched_qweight->get_element_type());
LOG_DEBUG("DQMatMulCWi matched_node_qcoeff: " << matched_node_qcoeff->get_friendly_name());
LOG_DEBUG("DQMatMulCWi qcoeff_shape: " << qcoeff_shape);
LOG_DEBUG("DQMatMulCWi matched_matmul->get_transpose_a(): " << matched_matmul->get_transpose_a());
LOG_DEBUG("DQMatMulCWi matched_matmul->get_transpose_b(): " << matched_matmul->get_transpose_b());
LOG_DEBUG("DQMatMulCWi ctx.get().mm_dq_full: " << ctx.get().mm_dq_full);

if ((ov::element::i4 == matched_qweight->get_element_type() ||
ov::element::i8 == matched_qweight->get_element_type()) &&
(ov::op::util::is_parameter(matched_node_qcoeff) || ov::op::util::is_constant(matched_node_qcoeff)) &&
qcoeff_shape[1] == 1 && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
(qcoeff_shape.size() == 1 || qcoeff_shape[1] == 1) && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
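// Note: rank-1 scale shapes (e.g. a {4096} coefficient) are accepted here in addition to {N, 1} ones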
auto matched_node_cvtw = node_to_output.at(qcvtw).get_node_shared_ptr();
auto matched_node_muls = node_to_output.at(qmuls).get_node_shared_ptr();
auto matched_node_mmi = node_to_output.at(qmmi).get_node_shared_ptr();
auto& matched_node_qcoeff_out = uat::_(node_to_output).at_or_at_or_at(qcvtc, reshapec, qcoeff);
auto& matched_node_qcoeff_out = uat::_(node_to_output).at_or_at(qcvtc, qcoeff);
auto& matched_node_muls_out = uat::_(node_to_output).at_or_at(qcvtm, qmuls);

if (!ctx.get().mm_dq_full) {
@@ -188,7 +196,7 @@ DQMatMulCWi::DQMatMulCWi(Context::Ref ctx) {
auto mm_readers = matched_matmul->output(0).get_target_inputs();

// Introduce a Reshape to alter Scale factor's shape
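// e.g. a {4096, 1} (or rank-1 {4096}) scale factor becomes {1, 4096} (illustrative shape)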
auto new_dims = std::vector<std::size_t>{qcoeff_shape[1], qcoeff_shape[0]};
auto new_dims = std::vector<std::size_t>{1, qcoeff_shape[0]};
auto new_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, new_dims);
auto new_reshape = std::make_shared<ov::op::v1::Reshape>(matched_node_qcoeff_out, new_const, false);

@@ -1582,14 +1590,18 @@ SliceLastMatmulMultiply::SliceLastMatmulMultiply() {

ConvToMatmul::ConvToMatmul(Context::Ref ctx) {
auto param = opp::wrap_type<ov::op::v0::Parameter>();
auto convert = opp::wrap_type<ov::op::v0::Convert>({param->output(0)});
auto param1_reshape = opp::optional<ov::op::v1::Reshape>({param, opp::any_input()});
auto convert = opp::wrap_type<ov::op::v0::Convert>({param1_reshape->output(0)});
auto param2 = opp::any_input();
auto convert2 = opp::optional<ov::op::v0::Convert>({param2->output(0)});
auto param2_reshape = opp::optional<ov::op::v1::Reshape>({param2, opp::any_input()});
auto convert2 = opp::optional<ov::op::v0::Convert>({param2_reshape->output(0)});
auto multiply = opp::wrap_type<ov::op::v1::Multiply>({convert, convert2});
auto tr_input = opp::any_input();
auto transpose_in = opp::wrap_type<ov::op::v1::Transpose>({tr_input, opp::any_input()});
auto transpose_in = opp::optional<ov::op::v1::Transpose>({tr_input, opp::any_input()});
auto conv = opp::wrap_type<ov::op::v1::Convolution>({transpose_in, multiply});
auto transpose_out = opp::wrap_type<ov::op::v1::Transpose>({conv, opp::any_input()});

// Since this Transpose is optional, the match may be rooted at the Convolution and still fully cover the transpose-less case.
auto transpose_out = opp::optional<ov::op::v1::Transpose>({conv, opp::any_input()});

// Note: Use [=] to make sure the above objects stay alive in the callback
auto callback = [=](ov::pass::pattern::Matcher& m) {
@@ -1599,33 +1611,94 @@ ConvToMatmul::ConvToMatmul(Context::Ref ctx) {
auto matched_node_param2 = node_to_output.at(param2).get_node_shared_ptr();
auto matched_node_convert = node_to_output.at(convert).get_node_shared_ptr();
auto matched_node_tr_input = node_to_output.at(tr_input);
auto matched_node_transpose_in = node_to_output.at(transpose_in).get_node_shared_ptr();
auto matched_node_transpose_out = node_to_output.at(transpose_out).get_node_shared_ptr();

const auto has_input_tr = node_to_output.count(transpose_in) != 0;
auto has_output_tr = node_to_output.count(transpose_out) != 0;

// Handle the case where the matcher was rooted at the optional output Transpose but did not capture it:
auto conv_node = node_to_output.at(conv).get_node_shared_ptr();
std::shared_ptr<const Node> transpose_out_node;
if (!has_output_tr) {
for (auto&& n : conv_node->output(0).get_target_inputs()) {
transpose_out_node = ov::as_type_ptr<ov::op::v1::Transpose>(
n.get_node()->shared_from_this());

if (transpose_out_node) {
LOG_DEBUG("ConvToMatmul: output Transpose found by walking the Convolution's consumers: "
<< transpose_out_node->get_friendly_name());
has_output_tr = true;
break;
} else {
LOG_DEBUG("ConvToMatmul: output of conv: "
<< n.get_node()->get_friendly_name());
}
}
} else {
transpose_out_node = node_to_output.at(transpose_out).get_node_shared_ptr();
}

// When a Transpose was not matched, check the tensor dimensions: in some cases Reshapes are enough, in others a pair of Transposes is required.
const auto& matched_node_transpose_in = uat::_(node_to_output).at_or_at(transpose_in, tr_input).get_node_shared_ptr();
const auto& matched_node_transpose_out = has_output_tr ? transpose_out_node : conv_node;

auto matched_node_multiply = node_to_output.at(multiply).get_node_shared_ptr();
const auto& cvt2_or_multiply = uat::_(node_to_output).at_or_at(convert2, multiply);

const auto& shape = matched_node_param->get_shape();
const auto& shape2 = matched_node_param2->get_shape();
const auto& tr_in_shape = matched_node_transpose_in->input(0).get_shape();
const auto shape = uat::_(node_to_output).at_or_at(param1_reshape, param).get_shape();
const auto shape2 = uat::_(node_to_output).at_or_at(param2_reshape, param2).get_shape();

const auto& tr_in_shape = has_input_tr ?
matched_node_transpose_in->input(0).get_shape() :
matched_node_transpose_in->output(0).get_shape();

const auto& tr_out_shape = matched_node_transpose_out->output(0).get_shape();

auto check_shape = [](const ov::Shape& shape) {
// last 2 dims are 1
return shape.size() == 4 && shape[2] == 1 && shape[3] == 1;
};

auto check_transpose_shape = [](const ov::Shape& shape) {
// first 2 dims are 1
return shape.size() == 4 && shape[0] == 1 && shape[1] == 1;
};
auto check_conv_shape_1D = [](const ov::Shape& shape) {
// in case of missed transpose also check the reshape possibility
// first 2 dims and 3rd dim are 1
return shape.size() == 4 && shape[0] == 1 && shape[2] == 1 && shape[3] == 1;
};
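// Illustrative shapes: check_shape({4096, 11008, 1, 1}) and check_transpose_shape({1, 1, 7, 4096}) hold,
// while check_conv_shape_1D({1, 4096, 1, 1}) covers the transpose-less case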

bool conv_in_shape = has_input_tr ? check_transpose_shape(tr_in_shape) : check_conv_shape_1D(tr_in_shape);
bool conv_out_shape = has_output_tr ? check_transpose_shape(tr_out_shape) : check_conv_shape_1D(tr_out_shape);


LOG_DEBUG("ConvToMatmull: conv_input_shape " << conv_in_shape);
LOG_DEBUG("ConvToMatmull: conv_out_shape " << conv_out_shape);
LOG_DEBUG("ConvToMatmull: matched_node_transpose_in: " << matched_node_transpose_in->get_friendly_name());
LOG_DEBUG("ConvToMatmull: matched_node_transpose_out: " << matched_node_transpose_out->get_friendly_name());

LOG_DEBUG("ConvToMatmull: matched_node_transpose_in shape: " << matched_node_transpose_in->get_shape());
LOG_DEBUG("ConvToMatmull: matched_node_transpose_out shape: " << matched_node_transpose_out->get_shape());

LOG_DEBUG("ConvToMatmull: matched_node_param->get_element_type(): " << matched_node_param->get_element_type());
LOG_DEBUG("ConvToMatmull: matched_node_param2->get_element_type(): " << matched_node_param2->get_element_type());
LOG_DEBUG("ConvToMatmull: matched_node_param2: " << matched_node_param2->get_friendly_name());
LOG_DEBUG("ConvToMatmull: check_shape(shape): " << check_shape(shape));
LOG_DEBUG("ConvToMatmull: check_shape(shape2): " << check_shape(shape2));

// if there is no transpose input - convolution input is fine, but for matmul substitution we might need to add reshape
LOG_DEBUG("ConvToMatmull: check_transpose_shape(tr_in_shape): " << check_transpose_shape(tr_in_shape));

// if there is no transpose input - convolution input is fine, but for matmul substitution we might need to add reshape
LOG_DEBUG("ConvToMatmull: check_transpose_shape(tr_out_shape): " << check_transpose_shape(tr_out_shape));

if ((matched_node_param->get_element_type() == ov::element::i4 ||
matched_node_param->get_element_type() == ov::element::i8) &&
(matched_node_param2->get_element_type() == ov::element::f32 ||
matched_node_param2->get_element_type() == ov::element::f16) &&
(ov::op::util::is_parameter(matched_node_param2) || ov::op::util::is_constant(matched_node_param2)) &&
check_shape(shape) && check_shape(shape2) && check_transpose_shape(tr_in_shape) &&
check_transpose_shape(tr_out_shape)) {
check_shape(shape) && check_shape(shape2) &&
// check_transpose_shape(tr_in_shape) && check_transpose_shape(tr_out_shape) &&
conv_in_shape && conv_out_shape) {
// Add Reshape before Params/Const
auto new_dims = std::vector<std::size_t>{shape[0], shape[1]};
auto new_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, new_dims);
@@ -1647,20 +1720,84 @@ ConvToMatmul::ConvToMatmul(Context::Ref ctx) {
matched_node_multiply->validate_and_infer_types();
}

// Get rid of Transposes
// Get rid of input Transpose
// TODO: change transpose in case of 2D prefill
auto new_transpose_input = matched_node_tr_input;
if (!has_input_tr) {
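// perm {0, 3, 2, 1} swaps dims 1 and 3, e.g. {1, 4096, 1, 1} -> {1, 1, 1, 4096} (illustrative shape)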
std::vector<size_t> perm = {0, 3, 2, 1};
auto shPattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, perm);
new_transpose_input = std::make_shared<ov::op::v1::Transpose>(matched_node_tr_input, shPattern);
}
auto matmul =
std::make_shared<ov::op::v0::MatMul>(matched_node_tr_input, matched_node_multiply, false, true);
std::make_shared<ov::op::v0::MatMul>(new_traspose_input, matched_node_multiply, false, true);

// Get rid of output Transpose
// TODO: change reshape to transpose in case of prefill
std::shared_ptr<Node> new_traspose_output = matmul;
if (!has_output_tr) {
std::vector<size_t> perm = {0, 3, 2, 1};
auto shPattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, perm);
new_transpose_output = std::make_shared<ov::op::v1::Transpose>(matmul, shPattern);
}

for (auto&& r : matched_node_transpose_out->output(0).get_target_inputs()) {
r.replace_source_output(matmul);
r.replace_source_output(new_transpose_output);
}
matmul->validate_and_infer_types();

return true; // root has changed
}
return false; // root hasn't changed
};
register_matcher(std::make_shared<opp::Matcher>(transpose_out, "ConvToMatmul"), std::move(callback));
}

// Remove an ineffective zero-point Subtract; the matched structure is fixed for POC purposes only,
// later any ineffective Subtract can be optimized out.
// Before: any_input[2D], "ineffective zp subgraph"[1D] -> subtract (broadcast), any_input -> multiply, transpose -> convolution
// After:  any_input[2D], any_input -> multiply, transpose -> convolution
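// A zero-valued zero point makes the Subtract an identity: Multiply(Subtract(w, 0), s) == Multiply(w, s),
// so when every zero-point element equals 0, the Subtract and its Convert/Reshape feeders can be dropped.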

UneffectiveZP::UneffectiveZP() {
// Constant / param
auto zero_point_tensor = opp::any_input();
auto sub_bias_convert = opp::optional<ov::op::v0::Convert>({zero_point_tensor->outputs()});
auto sub_bias_reshape = opp::optional<ov::op::v1::Reshape>({sub_bias_convert, opp::any_input()});
auto sub_weights = opp::any_input();
auto subtract = opp::wrap_type<ov::op::v1::Subtract>({sub_weights, sub_bias_reshape});
auto multiply_bias = opp::any_input();
auto multiply = opp::wrap_type<ov::op::v1::Multiply>({subtract, multiply_bias});
auto conv_any_bias = opp::any_input();
auto conv = opp::wrap_type<ov::op::v1::Convolution>({conv_any_bias, multiply});

auto callback = [=](ov::pass::pattern::Matcher& m) -> bool {
LOG_DEBUG("UneffectiveZP subgraph candidate detected");
auto& node_to_output = m.get_pattern_value_map();
auto zero_point_tensor_node = node_to_output.at(zero_point_tensor).get_node_shared_ptr();

if (op::util::is_constant_and_all_values_equal_int(zero_point_tensor_node, 0)) {
LOG_DEBUG("UneffectiveZP subgraph detected with NOP zp tensor: "
<< zero_point_tensor_node->get_friendly_name() << " shape=" << zero_point_tensor_node->get_shape());
// removing the whole subgraph with the Subtract + zero points
auto multiply_node = node_to_output.at(multiply).get_node_shared_ptr();
auto sub_weights_node = node_to_output.at(sub_weights).get_node_shared_ptr();

// TODO: verify that input 0 is the Subtract
LOG_DEBUG("UneffectiveZP reconnecting " << sub_weights_node->get_friendly_name() << " -> " << multiply_node->get_friendly_name());
multiply_node->input(0).replace_source_output(sub_weights_node);
multiply_node->validate_and_infer_types();
Contributor comment on multiply_node->validate_and_infer_types(): "this line is probably not required"
} else {
// TODO: log zero points not equal to int4(0)
LOG_DEBUG("UneffectiveZP: not all zero points are identity");
return false; // root has not changed
}

return true; // root has changed
};

LOG_DEBUG("UneffectiveZP is registered ");
register_matcher(std::make_shared<opp::Matcher>(conv, "UneffectiveZP"), std::move(callback));
}

} // namespace opt
} // namespace patterns
} // namespace npuw
@@ -177,6 +177,11 @@ class ConvToMatmul : public ov::pass::MatcherPass {
ConvToMatmul(Context::Ref ctx);
};

class UneffectiveZP : public ov::pass::MatcherPass {
public:
UneffectiveZP();
};

} // namespace opt
} // namespace patterns
} // namespace npuw