diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index aa02ca8681e80f..f0bfaab035a11e 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -169,6 +169,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
         rewr.add_matcher();
         rewr.add_matcher();
         rewr.add_matcher();
+        // TODO: POC: Remove zero points that prevent the following matcher from working
+        rewr.add_matcher<ov::npuw::patterns::opt::UneffectiveZP>();
         rewr.run_on_model(model);
     }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index b4c101d2bae06d..554588ec989236 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -21,6 +21,7 @@
 #include "openvino/util/xml_parse_utils.hpp"
 #include "patterns/dcoff.hpp"
 #include "patterns/opt.hpp"
+#include "openvino/op/ops.hpp"
 
 namespace ov {
 namespace npuw {
@@ -1516,9 +1517,39 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) {
             LOG_DEBUG("Handling a Constant input " << prod_output);
             LOG_BLOCK();
 
-            auto new_param = std::make_shared<ov::op::v0::Parameter>(prod_output.get_element_type(),
-                                                                     prod_output.get_partial_shape());
-            input_desc.replace_source_output(new_param);  // (n)/1/i/a
+            // TODO: Tricky part: when a 4D Constant becomes a Parameter, it is no longer batch-friendly,
+            // so squeeze the shape here and restore it with a Reshape where needed.
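+            // For example (illustrative shapes, not taken from this patch): a Constant of shape
+            // [1, 4096, 1, 1] becomes a Parameter of shape [4096]; unless its consumer is already a
+            // Reshape, a Reshape back to [1, 4096, 1, 1] is inserted so the function body keeps
+            // seeing the original 4D shape.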
+            auto partial_sh = prod_output.get_partial_shape();
+            std::shared_ptr<ov::op::v0::Parameter> new_param;
+            std::shared_ptr<ov::Node> new_param_or_reshape;
+
+            if (partial_sh.all_non_negative()) {
+                auto static_shape = prod_output.get_shape();
+                std::vector<std::size_t> dims;
+                bool needReshape = false;
+                for (auto s : static_shape) {
+                    if (s != 1) {
+                        dims.push_back(s);
+                        needReshape = true;
+                    }
+                }
+                new_param = std::make_shared<ov::op::v0::Parameter>(prod_output.get_element_type(), ov::Shape{dims});
+                // No need for two consecutive Reshapes
+                if (needReshape && !ov::as_type<ov::op::v1::Reshape>(input_desc.get_node())) {
+                    auto new_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32,
+                                                                            ov::Shape{static_shape.size()},
+                                                                            static_shape);
+                    new_param_or_reshape = std::make_shared<ov::op::v1::Reshape>(new_param, new_const, false);
+                } else {
+                    new_param_or_reshape = new_param;
+                }
+                LOG_DEBUG("PARTITIONER: a new Parameter with a squeezed shape: " << new_param);
+                LOG_DEBUG("PARTITIONER: a new Reshape inserted: " << new_param_or_reshape);
+            } else {
+                new_param = std::make_shared<ov::op::v0::Parameter>(prod_output.get_element_type(),
+                                                                    prod_output.get_partial_shape());
+                new_param_or_reshape = new_param;
+            }
+            input_desc.replace_source_output(new_param_or_reshape);  // (n)/1/i/a
             function._model->add_parameters({std::move(new_param)});
             LOG_DEBUG("Register Parameter[" << new_param_idx << "] as input to " << iport.first << " / "
                       << iport.second);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
index db9666b9485546..ba409d747f1361 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
@@ -12,6 +12,7 @@
 #include "openvino/pass/pattern/op/optional.hpp"
 #include "openvino/pass/pattern/op/wrap_type.hpp"
 #include "openvino/util/common_util.hpp"
+#include "transformations/utils/utils.hpp"
 
 namespace ov {
 namespace npuw {
@@ -159,14 +160,21 @@ DQMatMulCWi::DQMatMulCWi(Context::Ref ctx) {
 
         auto qcoeff_shape = matched_node_qcoeff->output(0).get_shape();
 
+        LOG_DEBUG("DQMatMulCWi matched_qweight->get_element_type(): " << matched_qweight->get_element_type());
+        LOG_DEBUG("DQMatMulCWi matched_node_qcoeff: " << matched_node_qcoeff->get_friendly_name());
+        LOG_DEBUG("DQMatMulCWi qcoeff_shape: " << qcoeff_shape);
+        LOG_DEBUG("DQMatMulCWi matched_matmul->get_transpose_a(): " << matched_matmul->get_transpose_a());
+        LOG_DEBUG("DQMatMulCWi matched_matmul->get_transpose_b(): " << matched_matmul->get_transpose_b());
+        LOG_DEBUG("DQMatMulCWi ctx.get().mm_dq_full: " << ctx.get().mm_dq_full);
+
         if ((ov::element::i4 == matched_qweight->get_element_type() ||
              ov::element::i8 == matched_qweight->get_element_type()) &&
             (ov::op::util::is_parameter(matched_node_qcoeff) || ov::op::util::is_constant(matched_node_qcoeff)) &&
-            qcoeff_shape[1] == 1 && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
+            (qcoeff_shape.size() == 1 || qcoeff_shape[1] == 1) && !matched_matmul->get_transpose_a() &&
+            matched_matmul->get_transpose_b()) {
             auto matched_node_cvtw = node_to_output.at(qcvtw).get_node_shared_ptr();
             auto matched_node_muls = node_to_output.at(qmuls).get_node_shared_ptr();
             auto matched_node_mmi = node_to_output.at(qmmi).get_node_shared_ptr();
 
-            auto& matched_node_qcoeff_out = uat::_(node_to_output).at_or_at_or_at(qcvtc, reshapec, qcoeff);
+            auto& matched_node_qcoeff_out = uat::_(node_to_output).at_or_at(qcvtc, qcoeff);
             auto& matched_node_muls_out = uat::_(node_to_output).at_or_at(qcvtm, qmuls);
 
             if (!ctx.get().mm_dq_full) {
@@ -188,7 +196,7 @@ DQMatMulCWi::DQMatMulCWi(Context::Ref ctx) {
             auto mm_readers = matched_matmul->output(0).get_target_inputs();
 
             // Introduce a Reshape to alter Scale factor's shape
-            auto new_dims = std::vector<std::size_t>{qcoeff_shape[1], qcoeff_shape[0]};
+            auto new_dims = std::vector<std::size_t>{1, qcoeff_shape[0]};
             auto new_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, new_dims);
             auto new_reshape = std::make_shared<ov::op::v1::Reshape>(matched_node_qcoeff_out, new_const, false);
@@ -1582,14 +1590,18 @@ SliceLastMatmulMultiply::SliceLastMatmulMultiply() {
 
 ConvToMatmul::ConvToMatmul(Context::Ref ctx) {
     auto param = opp::wrap_type();
-    auto convert = opp::wrap_type<ov::op::v0::Convert>({param->output(0)});
+    auto param1_reshape = opp::optional<ov::op::v1::Reshape>({param, opp::any_input()});
+    auto convert = opp::wrap_type<ov::op::v0::Convert>({param1_reshape->output(0)});
     auto param2 = opp::any_input();
-    auto convert2 = opp::optional<ov::op::v0::Convert>({param2->output(0)});
+    auto param2_reshape = opp::optional<ov::op::v1::Reshape>({param2, opp::any_input()});
+    auto convert2 = opp::optional<ov::op::v0::Convert>({param2_reshape->output(0)});
     auto multiply = opp::wrap_type<ov::op::v1::Multiply>({convert, convert2});
     auto tr_input = opp::any_input();
-    auto transpose_in = opp::wrap_type<ov::op::v1::Transpose>({tr_input, opp::any_input()});
+    auto transpose_in = opp::optional<ov::op::v1::Transpose>({tr_input, opp::any_input()});
     auto conv = opp::wrap_type<ov::op::v1::Convolution>({transpose_in, multiply});
-    auto transpose_out = opp::wrap_type<ov::op::v1::Transpose>({conv, opp::any_input()});
+
+    // Since this Transpose is optional, the match may start right at the Convolution and still fully cover the case where there is no output Transpose.
+    auto transpose_out = opp::optional<ov::op::v1::Transpose>({conv, opp::any_input()});
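+
+    // With both Transposes optional, the same matcher now covers two topologies: the original
+    // (Transpose -> Convolution -> Transpose) one and a bare Convolution without the surrounding
+    // Transposes; the callback below decides per match whether Reshapes/Transposes must be re-added.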
 
     // Note: Use [=] to make sure the above objects stay alive in the callback
     auto callback = [=](ov::pass::pattern::Matcher& m) {
@@ -1599,33 +1611,94 @@ ConvToMatmul::ConvToMatmul(Context::Ref ctx) {
         auto matched_node_param2 = node_to_output.at(param2).get_node_shared_ptr();
         auto matched_node_convert = node_to_output.at(convert).get_node_shared_ptr();
         auto matched_node_tr_input = node_to_output.at(tr_input);
-        auto matched_node_transpose_in = node_to_output.at(transpose_in).get_node_shared_ptr();
-        auto matched_node_transpose_out = node_to_output.at(transpose_out).get_node_shared_ptr();
+
+        const auto has_input_tr = node_to_output.count(transpose_in) != 0;
+        auto has_output_tr = node_to_output.count(transpose_out) != 0;
+
+        // Check the extension mode: the matcher started from the optional output Transpose but,
+        // for some reason, did not capture it.
+        auto conv_node = node_to_output.at(conv).get_node_shared_ptr();
+        std::shared_ptr<ov::Node> transpose_out_node;
+        if (!has_output_tr) {
+            for (auto n : conv_node->output(0).get_target_inputs()) {
+                transpose_out_node = ov::as_type_ptr<ov::op::v1::Transpose>(n.get_node()->shared_from_this());
+                if (transpose_out_node) {
+                    LOG_DEBUG("ConvToMatmul: output Transpose matched using the expanding algorithm: "
+                              << transpose_out_node->get_friendly_name());
+                    has_output_tr = true;
+                    break;
+                } else {
+                    LOG_DEBUG("ConvToMatmul: output of conv: " << n.get_node()->get_friendly_name());
+                }
+            }
+        } else {
+            transpose_out_node = node_to_output.at(transpose_out).get_node_shared_ptr();
+        }
+
+        // If a Transpose was not matched, the tensor dimensions need to be checked: in some cases
+        // Reshapes are enough, in others a pair of Transposes is required.
+        const auto& matched_node_transpose_in =
+            uat::_(node_to_output).at_or_at(transpose_in, tr_input).get_node_shared_ptr();
+        const auto& matched_node_transpose_out = has_output_tr ? transpose_out_node : conv_node;
+
         auto matched_node_multiply = node_to_output.at(multiply).get_node_shared_ptr();
         const auto& cvt2_or_multiply = uat::_(node_to_output).at_or_at(convert2, multiply);
 
-        const auto& shape = matched_node_param->get_shape();
-        const auto& shape2 = matched_node_param2->get_shape();
-        const auto& tr_in_shape = matched_node_transpose_in->input(0).get_shape();
+        const auto shape = uat::_(node_to_output).at_or_at(param1_reshape, param).get_shape();
+        const auto shape2 = uat::_(node_to_output).at_or_at(param2_reshape, param2).get_shape();
+
+        const auto& tr_in_shape = has_input_tr ? matched_node_transpose_in->input(0).get_shape()
+                                               : matched_node_transpose_in->output(0).get_shape();
+
         const auto& tr_out_shape = matched_node_transpose_out->output(0).get_shape();
 
         auto check_shape = [](const ov::Shape& shape) {
             // last 2 dims are 1
             return shape.size() == 4 && shape[2] == 1 && shape[3] == 1;
         };
-
         auto check_transpose_shape = [](const ov::Shape& shape) {
             // first 2 dims are 1
             return shape.size() == 4 && shape[0] == 1 && shape[1] == 1;
         };
 
+        auto check_conv_shape_1D = [](const ov::Shape& shape) {
+            // In case of a missing Transpose, also check whether a Reshape alone would do:
+            // all dims except the 2nd one are 1
+            return shape.size() == 4 && shape[0] == 1 && shape[2] == 1 && shape[3] == 1;
+        };
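+
+        // Rationale (illustrative shapes): a [1, 4096, 1, 1] tensor holds exactly the same data as a
+        // [1, 1, 1, 4096] one, so when every dimension but the 2nd is 1, a plain Reshape (rather than
+        // an extra pair of Transposes) is enough to get the layout the MatMul replacement expects.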
+        bool conv_in_shape = has_input_tr ? check_transpose_shape(tr_in_shape) : check_conv_shape_1D(tr_in_shape);
+        bool conv_out_shape = has_output_tr ? check_transpose_shape(tr_out_shape) : check_conv_shape_1D(tr_out_shape);
+
+        LOG_DEBUG("ConvToMatmul: conv_in_shape " << conv_in_shape);
+        LOG_DEBUG("ConvToMatmul: conv_out_shape " << conv_out_shape);
+        LOG_DEBUG("ConvToMatmul: matched_node_transpose_in: " << matched_node_transpose_in->get_friendly_name());
+        LOG_DEBUG("ConvToMatmul: matched_node_transpose_out: " << matched_node_transpose_out->get_friendly_name());
+        LOG_DEBUG("ConvToMatmul: matched_node_transpose_in shape: " << matched_node_transpose_in->get_shape());
+        LOG_DEBUG("ConvToMatmul: matched_node_transpose_out shape: " << matched_node_transpose_out->get_shape());
+        LOG_DEBUG("ConvToMatmul: matched_node_param->get_element_type(): " << matched_node_param->get_element_type());
+        LOG_DEBUG("ConvToMatmul: matched_node_param2->get_element_type(): " << matched_node_param2->get_element_type());
+        LOG_DEBUG("ConvToMatmul: matched_node_param2: " << matched_node_param2->get_friendly_name());
+        LOG_DEBUG("ConvToMatmul: check_shape(shape): " << check_shape(shape));
+        LOG_DEBUG("ConvToMatmul: check_shape(shape2): " << check_shape(shape2));
+
+        // If there is no input/output Transpose, the Convolution shapes are fine as-is, but the MatMul
+        // substitution may need an extra Reshape or Transpose (see below).
+        LOG_DEBUG("ConvToMatmul: check_transpose_shape(tr_in_shape): " << check_transpose_shape(tr_in_shape));
+        LOG_DEBUG("ConvToMatmul: check_transpose_shape(tr_out_shape): " << check_transpose_shape(tr_out_shape));
 
         if ((matched_node_param->get_element_type() == ov::element::i4 ||
              matched_node_param->get_element_type() == ov::element::i8) &&
            (matched_node_param2->get_element_type() == ov::element::f32 ||
             matched_node_param2->get_element_type() == ov::element::f16) &&
            (ov::op::util::is_parameter(matched_node_param2) || ov::op::util::is_constant(matched_node_param2)) &&
-            check_shape(shape) && check_shape(shape2) && check_transpose_shape(tr_in_shape) &&
-            check_transpose_shape(tr_out_shape)) {
+            check_shape(shape) && check_shape(shape2) && conv_in_shape && conv_out_shape) {
             // Add Reshape before Params/Const
             auto new_dims = std::vector<std::size_t>{shape[0], shape[1]};
             auto new_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, new_dims);
@@ -1647,13 +1720,31 @@ ConvToMatmul::ConvToMatmul(Context::Ref ctx) {
                 matched_node_multiply->validate_and_infer_types();
             }
 
-            // Get rid of Transposes
+            // Get rid of the input Transpose
+            // TODO: change the Transpose in case of 2D prefill
+            auto new_transpose_input = matched_node_tr_input;
+            if (!has_input_tr) {
+                std::vector<int64_t> perm = {0, 3, 2, 1};
+                auto shPattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, perm);
+                new_transpose_input = std::make_shared<ov::op::v1::Transpose>(matched_node_tr_input, shPattern);
+            }
             auto matmul =
-                std::make_shared<ov::op::v0::MatMul>(matched_node_tr_input, matched_node_multiply, false, true);
+                std::make_shared<ov::op::v0::MatMul>(new_transpose_input, matched_node_multiply, false, true);
+
+            // Get rid of the output Transpose
+            // TODO: change the Reshape to a Transpose in case of prefill
+            std::shared_ptr<ov::Node> new_transpose_output = matmul;
+            if (!has_output_tr) {
+                std::vector<int64_t> perm = {0, 3, 2, 1};
+                auto shPattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, perm);
+                new_transpose_output = std::make_shared<ov::op::v1::Transpose>(matmul, shPattern);
+            }
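+
+            // Note: given the shapes admitted above (every dim but the 2nd equal to 1), the {0, 3, 2, 1}
+            // permutation merely swaps the 2nd and the last axes: on the input side it brings the
+            // activation into the layout the MatMul expects, and on the output side it brings the MatMul
+            // result back to the layout the former Convolution consumers see.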
 
             for (auto&& r : matched_node_transpose_out->output(0).get_target_inputs()) {
-                r.replace_source_output(matmul);
+                r.replace_source_output(new_transpose_output);
             }
+            matmul->validate_and_infer_types();
+
             return true;  // root has changed
         }
         return false;  // root hasn't changed
@@ -1661,6 +1752,52 @@ ConvToMatmul::ConvToMatmul(Context::Ref ctx) {
     register_matcher(std::make_shared<opp::Matcher>(transpose_out, "ConvToMatmul"), std::move(callback));
 }
 
+// Remove an ineffective zero-point layer. The matched structure is fixed for POC purposes only;
+// later, any ineffective Subtract can be optimized out:
+//
+//   any_input[2D], "ineffective zp subgraph"[1D] -> Subtract (broadcast), any_input -> Multiply, Transpose -> Convolution
+// becomes
+//   any_input[2D], any_input -> Multiply, Transpose -> Convolution
+
+UneffectiveZP::UneffectiveZP() {
+    // Constant / Parameter
+    auto zero_point_tensor = opp::any_input();
+    auto sub_bias_convert = opp::optional<ov::op::v0::Convert>({zero_point_tensor->outputs()});
+    auto sub_bias_reshape = opp::optional<ov::op::v1::Reshape>({sub_bias_convert, opp::any_input()});
+    auto sub_weights = opp::any_input();
+    auto subtract = opp::wrap_type<ov::op::v1::Subtract>({sub_weights, sub_bias_reshape});
+    auto multiply_bias = opp::any_input();
+    auto multiply = opp::wrap_type<ov::op::v1::Multiply>({subtract, multiply_bias});
+    auto conv_any_bias = opp::any_input();
+    auto conv = opp::wrap_type<ov::op::v1::Convolution>({conv_any_bias, multiply});
+
+    auto callback = [=](ov::pass::pattern::Matcher& m) -> bool {
+        LOG_DEBUG("UneffectiveZP subgraph candidate detected");
+        auto& node_to_output = m.get_pattern_value_map();
+        auto zero_point_tensor_node = node_to_output.at(zero_point_tensor).get_node_shared_ptr();
+
+        if (op::util::is_constant_and_all_values_equal_int(zero_point_tensor_node, 0)) {
+            LOG_DEBUG("UneffectiveZP subgraph detected with a NOP zp tensor: "
+                      << zero_point_tensor_node->get_friendly_name()
+                      << " shape=" << zero_point_tensor_node->get_shape());
+            // Remove the whole subtract-with-zero-points subgraph
+            auto multiply_node = node_to_output.at(multiply).get_node_shared_ptr();
+            auto sub_weights_node = node_to_output.at(sub_weights).get_node_shared_ptr();
+
+            // TODO: verify that input 0 of the Multiply is the Subtract
+            LOG_DEBUG("UneffectiveZP reconnecting " << sub_weights_node->get_friendly_name() << " -> "
+                                                    << multiply_node->get_friendly_name());
+            multiply_node->input(0).replace_source_output(sub_weights_node);
+            multiply_node->validate_and_infer_types();
+        } else {
+            // TODO: log the zero points which are not equal to int4(0)
+            LOG_DEBUG("UneffectiveZP: not all zero points are identity");
+            return false;  // root hasn't changed
+        }
+
+        return true;  // root has changed
+    };
+
+    LOG_DEBUG("UneffectiveZP is registered");
+    register_matcher(std::make_shared<opp::Matcher>(conv, "UneffectiveZP"), std::move(callback));
+}
+
 }  // namespace opt
 }  // namespace patterns
 }  // namespace npuw
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
index 904ce88039d2eb..aca32d93af4bee 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
@@ -177,6 +177,11 @@ class ConvToMatmul : public ov::pass::MatcherPass {
     ConvToMatmul(Context::Ref ctx);
 };
 
+class UneffectiveZP : public ov::pass::MatcherPass {
+public:
+    UneffectiveZP();
+};
+
 }  // namespace opt
 }  // namespace patterns
 }  // namespace npuw
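For reference, a minimal sketch of how the new UneffectiveZP matcher could be exercised in isolation. The include paths, shapes and values below are illustrative assumptions and not part of this patch:

    #include "openvino/op/ops.hpp"
    #include "openvino/pass/graph_rewrite.hpp"
    #include "partitioning/patterns/opt.hpp"  // plugin-internal header; path is illustrative

    // Build a tiny Multiply(Subtract(weights, zp), scale) -> Convolution graph with an all-zero
    // zero point, then let the matcher drop the Subtract.
    static std::shared_ptr<ov::Model> make_toy_model() {
        auto act = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1, 8, 16, 16});
        auto weights = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{8, 8, 1, 1},
                                                    std::vector<float>(64, 1.0f));
        auto zp = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {0});
        auto zp_cvt = std::make_shared<ov::op::v0::Convert>(zp, ov::element::f32);
        auto sub = std::make_shared<ov::op::v1::Subtract>(weights, zp_cvt);
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {0.5f});
        auto mul = std::make_shared<ov::op::v1::Multiply>(sub, scale);
        auto conv = std::make_shared<ov::op::v1::Convolution>(act,
                                                              mul,
                                                              ov::Strides{1, 1},
                                                              ov::CoordinateDiff{0, 0},
                                                              ov::CoordinateDiff{0, 0},
                                                              ov::Strides{1, 1});
        return std::make_shared<ov::Model>(ov::OutputVector{conv}, ov::ParameterVector{act});
    }

    int main() {
        auto model = make_toy_model();
        ov::pass::GraphRewrite rewr;
        rewr.add_matcher<ov::npuw::patterns::opt::UneffectiveZP>();
        rewr.run_on_model(model);
        // Expectation: the Multiply now reads the weights directly; the zero-point Subtract is gone.
        return 0;
    }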