From 28d8089c1380c49f61e2a5cb105d1a80c6cf8b70 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Fri, 26 Jul 2024 16:58:22 +0200
Subject: [PATCH 01/34] [one-optimize] Fuse Mul with FullyConnected layer

This commit introduces the fuse_mul_with_fully_connected pass, which folds
a Mul with a constant operand into the preceding FullyConnected node.

ONE-DCO-1.0-Signed-off-by: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
---
 compiler/circle2circle/src/Circle2Circle.cpp  |   4 +
 .../luci/pass/include/luci/CircleOptimizer.h  |   1 +
 .../luci/Pass/FuseMulWithFullyConnectedPass.h |  37 ++++
 compiler/luci/pass/src/CircleOptimizer.cpp    |   5 +
 .../src/FuseMulWithFullyConnectedPass.cpp     | 183 ++++++++++++++++++
 compiler/one-cmds/how-to-use-one-commands.txt |   1 +
 compiler/one-cmds/onelib/constant.py          |   2 +
 7 files changed, 233 insertions(+)
 create mode 100644 compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h
 create mode 100644 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index 757c368f31d..50ad26764a5 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -112,6 +112,8 @@ int entry(int argc, char **argv)
   add_switch(arser, "--fuse_mean_with_mean",
              "This will fuse two Mean operations when they follow one by one. This will fold them "
              "into one operation and merge reduction indices.");
+  add_switch(arser, "--fuse_mul_with_fully_connected",
+             "This will fuse Mul operator to FullyConnected operator");
   add_switch(arser, "--fuse_mul_to_fullyconnected_weights",
              "This will fuse Mul to following FullyConnected weights");
   add_switch(arser, "--fuse_mul_with_conv",
@@ -312,6 +314,8 @@ int entry(int argc, char **argv)
     options->enable(Algorithms::FuseBatchNormWithDwConv);
   if (arser.get<bool>("--fuse_batchnorm_with_tconv"))
     options->enable(Algorithms::FuseBatchNormWithTConv);
+  if (arser.get<bool>("--fuse_mul_with_fully_connected"))
+    options->enable(Algorithms::FuseMulWithFullyConnected);
   if (arser.get<bool>("--fuse_mul_to_fullyconnected_weights"))
     options->enable(Algorithms::FuseMulToFullyConnectedWeights);
   if (arser.get<bool>("--fuse_slice_with_tconv"))
diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h
index bdae7d57e41..ae4706e5394 100644
--- a/compiler/luci/pass/include/luci/CircleOptimizer.h
+++ b/compiler/luci/pass/include/luci/CircleOptimizer.h
@@ -41,6 +41,7 @@ class CircleOptimizer final
       FuseBatchNormWithConv,
       FuseBatchNormWithDwConv,
       FuseBatchNormWithTConv,
+      FuseMulWithFullyConnected,
       FuseMulToFullyConnectedWeights,
       FuseSliceWithTConv,
       FuseBCQ,
diff --git a/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h b/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h
new file mode 100644
index 00000000000..5b7f3450069
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FUSE_MUL_WITH_FULLY_CONNECTED_PASS_H__
+#define __LUCI_FUSE_MUL_WITH_FULLY_CONNECTED_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to fuse Mul into CircleFullyConnected
+ */
+struct FuseMulWithFullyConnectedPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::FuseMulWithFullyConnectedPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FUSE_MUL_WITH_FULLY_CONNECTED_PASS_H__
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index aa98fb38606..a9a0b002fbe 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -42,6 +42,7 @@
 #include "luci/Pass/FuseBatchNormWithDwConvPass.h"
 #include "luci/Pass/FuseBatchNormWithTConvPass.h"
 #include "luci/Pass/FuseBCQPass.h"
+#include "luci/Pass/FuseMulWithFullyConnectedPass.h"
 #include "luci/Pass/FuseMulToFullyConnectedWeightsPass.h"
 #include "luci/Pass/FuseInstanceNormPass.h"
 #include "luci/Pass/FuseMeanWithMeanPass.h"
@@ -260,6 +261,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
   phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
 
+  if (_options->query(Options::Algorithm::FuseMulWithFullyConnected))
+  {
+    phase.emplace_back(std::make_unique<FuseMulWithFullyConnectedPass>());
+  }
   if (_options->query(Options::Algorithm::CommonSubExpressionElimination))
   {
     phase.emplace_back(std::make_unique<luci::CommonSubExpressionEliminationPass>());
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
new file mode 100644
index 00000000000..432ec2ca143
--- /dev/null
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseMulWithFullyConnectedPass.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/Nodes/CircleConst.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+#include <cmath>
+
+namespace
+{
+
+#define RETURN_FALSE_UNLESS(cond) \
+  if (not(cond))                  \
+    return false;
+
+inline void update_values(luci::CircleConst *fused_node, luci::CircleConst *multiplication)
+{
+  auto node_size = fused_node->size<loco::DataType::FLOAT32>();
+  // Scalar:
+  if (multiplication->rank() == 1 ||
+      multiplication->rank() == 0 && multiplication->size<loco::DataType::FLOAT32>() == 1)
+  {
+    for (uint32_t i = 0; i < node_size; i++)
+      fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(0);
+  }
+  // N-size:
+  else
+  {
+    for (uint32_t i = 0; i < node_size; i++)
+      fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(i);
+  }
+}
+
+/**
+ *  Fuse Mul to FullyConnected if the multiplied value is a channel(last dimension)-wise constant
+ *
+ *  BEFORE
+ *                |
+ *      [CircleFullyConnected]
+ *                |
+ *           [CircleMul]
+ *                |
+ *
+ *  AFTER
+ *                |
+ *       [CircleFullyConnected]   [CircleMul] (dead)
+ *                |
+ *
+ */
+bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
+{
+  // Sanity check:
+  RETURN_FALSE_UNLESS(fc);
+  // Allow only FLOAT32 data type:
+  RETURN_FALSE_UNLESS(fc->dtype() == loco::DataType::FLOAT32);
+  // Allow only without activation functions as values are going to
+  // be multiplied before activation function.
+  RETURN_FALSE_UNLESS(fc->fusedActivationFunction() == luci::FusedActFunc::NONE);
+  // Check for weights being Constant:
+  auto weights = dynamic_cast<luci::CircleConst *>(fc->weights());
+  RETURN_FALSE_UNLESS(weights);
+  // Get Mul node:
+  auto fc_output = loco::succs(fc);
+  // Make sure that FullyConnected has only one child:
+  RETURN_FALSE_UNLESS(fc_output.size() == 1);
+  auto mul = dynamic_cast<luci::CircleMul *>(*fc_output.begin());
+  RETURN_FALSE_UNLESS(mul);
+  // Allow Mul node only with FLOAT32 data type:
+  RETURN_FALSE_UNLESS(mul->dtype() == loco::DataType::FLOAT32);
+  // Get multiplication Constant (here: the second input besides weights):
+  auto multiplication = mul->x() == fc ? dynamic_cast<luci::CircleConst *>(mul->y())
+                                       : dynamic_cast<luci::CircleConst *>(mul->x());
+  RETURN_FALSE_UNLESS(multiplication);
+  // Get rank of multiplication:
+  auto rank = multiplication->rank();
+  RETURN_FALSE_UNLESS(rank != 0);
+  // Check that all dimensions are ones, checks broadcast capabilites.
+  // Last dimesion of multiplication must be compatible with FC.
+  // N-D case (N>1):
+  if (multiplication->rank() > 1)
+  {
+    // Check channel-wise broadcasting:
+    for (uint32_t i = 0; i < rank - 1; i++)
+      RETURN_FALSE_UNLESS(multiplication->dim(i).value() == 1);
+  }
+  // Scalar case:
+  else if (multiplication->rank() == 1 || multiplication->rank() == 0)
+  {
+    RETURN_FALSE_UNLESS(multiplication->size<loco::DataType::FLOAT32>() != 0);
+  }
+
+  // Update weights accordingly.
+  RETURN_FALSE_UNLESS(weights->opcode() == luci::CircleOpcode::CIRCLECONST or
+                      weights->opcode() == luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE)
+  // Create new weights to be updated with values:
+  auto fused_weights = luci::clone(weights);
+  RETURN_FALSE_UNLESS(fused_weights->size<loco::DataType::FLOAT32>() ==
+                      weights->size<loco::DataType::FLOAT32>());
+
+  update_values(fused_weights, multiplication);
+
+  fc->weights(fused_weights);
+
+  // Update bias accordingly.
+  // Only supports:
+  // (1) constant bias
+  // (2) no bias
+  auto bias = loco::must_cast<luci::CircleNode *>(fc->bias());
+  RETURN_FALSE_UNLESS(bias->opcode() == luci::CircleOpcode::CIRCLECONST or
+                      bias->opcode() == luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE)
+  // Create new bias to be updated with values:
+  auto const_bias = dynamic_cast<luci::CircleConst *>(fc->bias());
+  RETURN_FALSE_UNLESS(const_bias)
+  RETURN_FALSE_UNLESS(const_bias->dtype() == loco::DataType::FLOAT32);
+
+  auto fused_bias = luci::clone(const_bias);
+  RETURN_FALSE_UNLESS(fused_bias->size<loco::DataType::FLOAT32>() ==
+                      const_bias->size<loco::DataType::FLOAT32>());
+
+  update_values(fused_bias, multiplication);
+
+  // Here fused_bias's shape is either [1, 1, ..., N] or [N]
+  // where N is weights->dim(0).
+  // The shape is normalized to [N] to become the bias of FullyConected.
+  fused_bias->rank(1);
+  fused_bias->dim(0) = weights->dim(0);
+
+  fc->bias(fused_bias);
+
+  // Set origin and copy Activation Function if exisitng:
+  fc->fusedActivationFunction(mul->fusedActivationFunction());
+  luci::add_origin(fc, luci::get_origin(mul));
+
+  replace(mul).with(fc);
+
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool FuseMulWithFullyConnectedPass::run(loco::Graph *g)
+{
+  bool changed = false;
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto fc = dynamic_cast<luci::CircleFullyConnected *>(node);
+    if (not fc)
+      continue;
+
+    switch (fc->dtype())
+    {
+      case loco::DataType::FLOAT32:
+        if (fuse_mul_with_fc(fc))
+          changed = true;
+        break;
+      default:
+        break;
+    }
+  }
+
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt
index fefbabf9a17..397632aef87 100644
--- a/compiler/one-cmds/how-to-use-one-commands.txt
+++ b/compiler/one-cmds/how-to-use-one-commands.txt
@@ -171,6 +171,7 @@ Current transformation options are
 - fuse_batchnorm_with_conv : This fuses BatchNorm operator to convolution operator
 - fuse_batchnorm_with_dwconv : This fuses BatchNorm operator to depthwise convolution operator
 - fuse_batchnorm_with_tconv : This fuses BatchNorm operator to transpose convolution operator
+- fuse_mul_with_fully_connected: This fuses Mul operator with the preceding FullyConnected operator if possible
 - fuse_mul_to_fullyconnected_weights : This fuses Mul operator to following FullyConnected operator weights
 - fuse_mul_with_conv: This fuses Mul with a preceding Convolution op if possible.
 - fuse_mul_with_div: This fuses Mul and Div op as Div.
diff --git a/compiler/one-cmds/onelib/constant.py b/compiler/one-cmds/onelib/constant.py
index 8c5de1b646d..deb7362f3d9 100644
--- a/compiler/one-cmds/onelib/constant.py
+++ b/compiler/one-cmds/onelib/constant.py
@@ -44,6 +44,7 @@ class CONSTANT:
         'fuse_batchnorm_with_dwconv',
         'fuse_batchnorm_with_tconv',
         'fuse_activation_function',
+        'fuse_mul_with_fully_connected',
         'fuse_mul_to_fullyconnected_weights',
         'fuse_instnorm',
         'fuse_prelu',
@@ -122,6 +123,7 @@ class CONSTANT:
         ('fuse_batchnorm_with_conv', 'fuse BatchNorm op to Convolution op'),
         ('fuse_batchnorm_with_dwconv', 'fuse BatchNorm op to Depthwise Convolution op'),
         ('fuse_batchnorm_with_tconv', 'fuse BatchNorm op to Transposed Convolution op'),
+        ('fuse_mul_with_fully_connected', 'fuse Mul op to FullyConnected op'),
         ('fuse_mul_to_fullyconnected_weights',
          'fuse Mul op to following FullyConnected op weights'),
         ('fuse_slice_with_tconv', 'fuse Slice op to Transposed Convolution op'),

From 87774c5b67ca1e8a580f1e38c8051b4cf2a66344 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 29 Jul 2024 16:26:47 +0200
Subject: [PATCH 02/34] Move mul_with_fully_connected option, enum and
 include after mul_with_div

---
 compiler/circle2circle/src/Circle2Circle.cpp      | 8 ++++----
 compiler/luci/pass/include/luci/CircleOptimizer.h | 2 +-
 compiler/luci/pass/src/CircleOptimizer.cpp        | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index 50ad26764a5..ac2fc3c6316 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -112,14 +112,14 @@ int entry(int argc, char **argv)
   add_switch(arser, "--fuse_mean_with_mean",
              "This will fuse two Mean operations when they follow one by one. This will fold them "
              "into one operation and merge reduction indices.");
-  add_switch(arser, "--fuse_mul_with_fully_connected",
-             "This will fuse Mul operator to FullyConnected operator");
   add_switch(arser, "--fuse_mul_to_fullyconnected_weights",
              "This will fuse Mul to following FullyConnected weights");
   add_switch(arser, "--fuse_mul_with_conv",
              "This will fuse Mul operation with a preceding Conv if possible.");
   add_switch(arser, "--fuse_mul_with_div",
              "This will fuse Mul operation with a Div operation whose numerator is const.");
+  add_switch(arser, "--fuse_mul_with_fully_connected",
+             "This will fuse Mul operator with a preceding FullyConnected operator.");
   add_switch(arser, "--fuse_slice_with_tconv",
              "This will fuse Slice operation with a preceding TConv if possible.");
   add_switch(arser, "--fuse_transpose_with_mean",
@@ -314,8 +314,6 @@ int entry(int argc, char **argv)
     options->enable(Algorithms::FuseBatchNormWithDwConv);
   if (arser.get<bool>("--fuse_batchnorm_with_tconv"))
     options->enable(Algorithms::FuseBatchNormWithTConv);
-  if (arser.get<bool>("--fuse_mul_with_fully_connected"))
-    options->enable(Algorithms::FuseMulWithFullyConnected);
   if (arser.get<bool>("--fuse_mul_to_fullyconnected_weights"))
     options->enable(Algorithms::FuseMulToFullyConnectedWeights);
   if (arser.get<bool>("--fuse_slice_with_tconv"))
@@ -330,6 +328,8 @@ int entry(int argc, char **argv)
     options->enable(Algorithms::FuseMulWithConv);
   if (arser.get<bool>("--fuse_mul_with_div"))
     options->enable(Algorithms::FuseMulWithDiv);
+  if (arser.get<bool>("--fuse_mul_with_fully_connected"))
+    options->enable(Algorithms::FuseMulWithFullyConnected);
   if (arser.get<bool>("--make_batchnorm_gamma_positive"))
     options->enable(Algorithms::MakeBatchNormGammaPositive);
   if (arser.get<bool>("--fuse_preactivation_batchnorm"))
diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h
index ae4706e5394..0246e2f4f28 100644
--- a/compiler/luci/pass/include/luci/CircleOptimizer.h
+++ b/compiler/luci/pass/include/luci/CircleOptimizer.h
@@ -41,7 +41,6 @@ class CircleOptimizer final
       FuseBatchNormWithConv,
       FuseBatchNormWithDwConv,
       FuseBatchNormWithTConv,
-      FuseMulWithFullyConnected,
       FuseMulToFullyConnectedWeights,
       FuseSliceWithTConv,
       FuseBCQ,
@@ -50,6 +49,7 @@ class CircleOptimizer final
       FuseMeanWithMean,
       FuseMulWithConv,
       FuseMulWithDiv,
+      FuseMulWithFullyConnected,
       FuseTransposeWithMean,
       ResolveCustomOpAdd,
       ResolveCustomOpBatchMatMul,
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index a9a0b002fbe..500fc7efcb7 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -42,12 +42,12 @@
 #include "luci/Pass/FuseBatchNormWithDwConvPass.h"
 #include "luci/Pass/FuseBatchNormWithTConvPass.h"
 #include "luci/Pass/FuseBCQPass.h"
-#include "luci/Pass/FuseMulWithFullyConnectedPass.h"
 #include "luci/Pass/FuseMulToFullyConnectedWeightsPass.h"
 #include "luci/Pass/FuseInstanceNormPass.h"
 #include "luci/Pass/FuseMeanWithMeanPass.h"
 #include "luci/Pass/FuseMulWithConvPass.h"
 #include "luci/Pass/FuseMulWithDivPass.h"
+#include "luci/Pass/FuseMulWithFullyConnectedPass.h"
 #include "luci/Pass/FusePreActivationBatchNormPass.h"
 #include "luci/Pass/FusePReluPass.h"
 #include "luci/Pass/FuseGeluPass.h"

From 4c7f5a9e49824a06ae2e52997737b41019f2d259 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 29 Jul 2024 16:28:53 +0200
Subject: [PATCH 03/34] Remove weights constant check

---
 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index 432ec2ca143..2e91acfe9c9 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -106,8 +106,6 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
   }
 
   // Update weights accordingly.
-  RETURN_FALSE_UNLESS(weights->opcode() == luci::CircleOpcode::CIRCLECONST or
-                      weights->opcode() == luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE)
   // Create new weights to be updated with values:
   auto fused_weights = luci::clone(weights);
   RETURN_FALSE_UNLESS(fused_weights->size<loco::DataType::FLOAT32>() ==

From cfcb68b42c78006a530370515435803e3f558357 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 29 Jul 2024 16:31:41 +0200
Subject: [PATCH 04/34] Update bias before weights so the costlier update
 runs last

---
 .../src/FuseMulWithFullyConnectedPass.cpp     | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index 2e91acfe9c9..7b4e2cfa652 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -105,17 +105,6 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
     RETURN_FALSE_UNLESS(multiplication->size<loco::DataType::FLOAT32>() != 0);
   }
 
-  // Update weights accordingly.
-  // Create new weights to be updated with values:
-  auto fused_weights = luci::clone(weights);
-  RETURN_FALSE_UNLESS(fused_weights->size<loco::DataType::FLOAT32>() ==
-                      weights->size<loco::DataType::FLOAT32>());
-
-  update_values(fused_weights, multiplication);
-
-  fc->weights(fused_weights);
-
-  // Update bias accordingly.
   // Only supports:
   // (1) constant bias
   // (2) no bias
@@ -131,14 +120,23 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
   RETURN_FALSE_UNLESS(fused_bias->size<loco::DataType::FLOAT32>() ==
                       const_bias->size<loco::DataType::FLOAT32>());
 
-  update_values(fused_bias, multiplication);
+  // Create new weights to be updated with values:
+  auto fused_weights = luci::clone(weights);
+  RETURN_FALSE_UNLESS(fused_weights->size<loco::DataType::FLOAT32>() ==
+                      weights->size<loco::DataType::FLOAT32>());
 
+  // Update bias accordingly:
+  update_values(fused_bias, multiplication);
   // Here fused_bias's shape is either [1, 1, ..., N] or [N]
   // where N is weights->dim(0).
   // The shape is normalized to [N] to become the bias of FullyConected.
   fused_bias->rank(1);
   fused_bias->dim(0) = weights->dim(0);
+  // Update weights accordingly:
+  update_values(fused_weights, multiplication);
 
+  // Replace weights and bias:
+  fc->weights(fused_weights);
   fc->bias(fused_bias);
 
   // Set origin and copy Activation Function if exisitng:

From e8b06b572afc2db0943a8fe8c6e5b825501d9e1c Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Thu, 1 Aug 2024 13:06:32 +0200
Subject: [PATCH 05/34] Fix values updating and add luci tests

---
 .../src/FuseMulWithFullyConnectedPass.cpp     |  40 ++--
 .../FuseMulWithFullyConnectedPass.test.cpp    | 218 ++++++++++++++++++
 2 files changed, 245 insertions(+), 13 deletions(-)
 create mode 100644 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index 7b4e2cfa652..b1ffbf94ff2 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -29,21 +29,38 @@ namespace
   if (not(cond))                  \
     return false;
 
-inline void update_values(luci::CircleConst *fused_node, luci::CircleConst *multiplication)
+inline void update_values(luci::CircleConst *fused_node, luci::CircleConst *multiplication, bool is_weights)
 {
   auto node_size = fused_node->size<loco::DataType::FLOAT32>();
-  // Scalar:
+  auto mul_size = multiplication->size<loco::DataType::FLOAT32>();
+  // Scalar multiplication:
   if (multiplication->rank() == 1 ||
-      multiplication->rank() == 0 && multiplication->size<loco::DataType::FLOAT32>() == 1)
+      multiplication->rank() == 0 && mul_size == 1)
   {
     for (uint32_t i = 0; i < node_size; i++)
       fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(0);
   }
-  // N-size:
+  // N-size multiplication:
   else
   {
-    for (uint32_t i = 0; i < node_size; i++)
-      fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(i);
+    // Go along channels, multiplication size is ensured to be compatible with channels.
+    if (is_weights)  // weights 2-D
+    {
+      auto count = fused_node->dim(0).value();
+      auto size = fused_node->dim(fused_node->rank() - 1).value();
+      float val;
+      for (uint32_t c = 0; c < count; c++) {
+        val = multiplication->at<loco::DataType::FLOAT32>(c);
+        for (uint32_t i = 0; i < size; i++) {
+          fused_node->at<loco::DataType::FLOAT32>(c * size + i) *= val;
+        }
+      }
+    }
+    else  // bias 1-D
+    {
+      for (uint32_t i = 0; i < node_size; i++)
+        fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(i);
+    }
   }
 }
 
@@ -98,6 +115,8 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
     // Check channel-wise broadcasting:
     for (uint32_t i = 0; i < rank - 1; i++)
       RETURN_FALSE_UNLESS(multiplication->dim(i).value() == 1);
+    // Check the last dimesion of Mul is the same with the first dimension of FullyConnected
+    RETURN_FALSE_UNLESS(multiplication->dim(rank - 1) == weights->dim(0));
   }
   // Scalar case:
   else if (multiplication->rank() == 1 || multiplication->rank() == 0)
@@ -126,14 +145,9 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
                       weights->size<loco::DataType::FLOAT32>());
 
   // Update bias accordingly:
-  update_values(fused_bias, multiplication);
-  // Here fused_bias's shape is either [1, 1, ..., N] or [N]
-  // where N is weights->dim(0).
-  // The shape is normalized to [N] to become the bias of FullyConected.
-  fused_bias->rank(1);
-  fused_bias->dim(0) = weights->dim(0);
+  update_values(fused_bias, multiplication, false);
   // Update weights accordingly:
-  update_values(fused_weights, multiplication);
+  update_values(fused_weights, multiplication, true);
 
   // Replace weights and bias:
   fc->weights(fused_weights);
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
new file mode 100644
index 00000000000..db267354e67
--- /dev/null
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseMulWithFullyConnectedPass.h"
+#include "helpers/CreateCircleConst.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/test/TestIOGraph.h>
+
+#include <gtest/gtest.h>
+
+#define DIM_ONE 8
+#define DIM_TWO 4
+#define MUL_VAL 2.0f
+
+namespace
+{
+
+using namespace luci::test;
+
+/**
+ *  Graph for this test
+ *
+ *  BEFORE
+ *
+ *         [FC]
+ *           |
+ *     [Mul w/ Relu]
+ *
+ *  AFTER
+ *
+ *      [FC w/ Relu] (weights and bias updated)
+ *
+ */
+class FCMulGraphlet
+{
+public:
+  FCMulGraphlet() = default;
+
+  void init(loco::Graph *g, luci::FusedActFunc fc_activation, bool is_mul_scalar)
+  {
+    std::vector<float> weights_val(DIM_ONE * DIM_TWO);
+    for (uint32_t i = 0; i < DIM_ONE * DIM_TWO; i++)
+      weights_val.at(i) = i;
+
+    _fc_f = luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE, DIM_TWO}, weights_val);
+
+    std::vector<float> bias_val(DIM_ONE);
+    for (uint32_t i = 0; i < DIM_ONE; i++)
+      bias_val.at(i) = i;
+
+    _fc_b = luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE}, bias_val);
+
+    _fc = g->nodes()->create<luci::CircleFullyConnected>();
+    _fc->weights(_fc_f);
+    _fc->bias(_fc_b);
+    _fc->fusedActivationFunction(fc_activation);
+    _fc->dtype(loco::DataType::FLOAT32);
+    _fc->shape({1, DIM_ONE});
+    _fc->name("fc");
+
+    std::vector<float> mul_values;
+
+    if(is_mul_scalar) {
+      mul_values.push_back(static_cast<float>(MUL_VAL));
+      _mul_c = luci::create_const_node(g, loco::DataType::FLOAT32, {1}, mul_values);
+    }
+    else {
+      for (uint32_t i = 0; i < DIM_ONE; i++) {
+        mul_values.push_back(static_cast<float>(i));
+      }
+      _mul_c = luci::create_const_node(g, loco::DataType::FLOAT32, {1, 1, 1, DIM_ONE}, mul_values);
+    }
+
+    _mul = g->nodes()->create<luci::CircleMul>();
+    _mul->x(_fc);
+    _mul->y(_mul_c);
+    _mul->fusedActivationFunction(luci::FusedActFunc::RELU);
+    _mul->dtype(loco::DataType::FLOAT32);
+    if(is_mul_scalar) {
+      _mul->shape({1});
+    }
+    else {
+      _mul->shape({1, DIM_ONE});
+    }
+    _mul->name("mul");
+  }
+
+public:
+  luci::CircleFullyConnected *fc() { return _fc; }
+
+  void to_fm_bias(void)
+  {
+    assert(_fc != nullptr);
+
+    auto new_fc = _fc->graph()->nodes()->create<luci::CircleFullyConnected>();
+    _fc->bias(new_fc);
+  }
+
+protected:
+  luci::CircleFullyConnected *_fc = nullptr;
+  luci::CircleMul *_mul = nullptr;
+  luci::CircleConst *_fc_f = nullptr;
+  luci::CircleConst *_fc_b = nullptr;
+  luci::CircleConst *_mul_c = nullptr;
+};
+
+class FuseAddWithFCTestGraph : public TestIOGraph, public FCMulGraphlet
+{
+public:
+  FuseAddWithFCTestGraph() = default;
+
+  void init(luci::FusedActFunc fc_activation = luci::FusedActFunc::NONE, bool is_mul_scalar = false)
+  {
+    TestIOGraph::init({1, DIM_TWO}, {1, DIM_ONE});
+    FCMulGraphlet::init(g(), fc_activation, is_mul_scalar);
+
+    _fc->input(input());
+
+    output()->from(_mul);
+  }
+};
+
+class FuseMulWithFullyConnectedPassTest : public ::testing::Test
+{
+public:
+  FuseAddWithFCTestGraph g;
+  luci::FuseMulWithFullyConnectedPass pass;
+};
+
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_not_scalar)
+{
+  g.init(luci::FusedActFunc::NONE, false);
+
+  EXPECT_EQ(true, pass.run(g.g()));
+
+  auto fc = dynamic_cast<luci::CircleFullyConnected *>(g.output()->from());
+  EXPECT_NE(nullptr, fc);
+
+  auto weights = loco::must_cast<luci::CircleConst *>(g.fc()->weights());
+  auto weights_n =  weights->dim(0).value();
+  auto weights_m =  weights->dim(1).value();
+  uint32_t offset = 0;
+  for (uint32_t i = 0; i < weights_n; i++)
+  {
+    for (uint32_t j = 0; j < weights_m; j++)
+    {
+      offset = i * weights_m + j;
+      EXPECT_EQ(i * offset, weights->at<loco::DataType::FLOAT32>(offset));
+    }
+  }
+
+  auto bias = loco::must_cast<luci::CircleConst *>(g.fc()->bias());
+  for (uint32_t i = 0; i < bias->size<loco::DataType::FLOAT32>(); i++)
+  {
+    EXPECT_EQ(i * i, bias->at<loco::DataType::FLOAT32>(i));
+  }
+}
+
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_is_scalar)
+{
+  g.init(luci::FusedActFunc::NONE, true);
+
+  EXPECT_EQ(true, pass.run(g.g()));
+
+  auto fc = dynamic_cast<luci::CircleFullyConnected *>(g.output()->from());
+  EXPECT_NE(nullptr, fc);
+
+  auto weights = loco::must_cast<luci::CircleConst *>(g.fc()->weights());
+  auto weights_n =  weights->dim(0).value();
+  auto weights_m =  weights->dim(1).value();
+  uint32_t offset = 0;
+  for (uint32_t i = 0; i < weights_n; i++)
+  {
+    for (uint32_t j = 0; j < weights_m; j++)
+    {
+      offset = i * weights_m + j;
+      EXPECT_EQ(MUL_VAL * offset, weights->at<loco::DataType::FLOAT32>(offset));
+    }
+  }
+
+  auto bias = loco::must_cast<luci::CircleConst *>(g.fc()->bias());
+  for (uint32_t i = 0; i < bias->size<loco::DataType::FLOAT32>(); i++)
+  {
+    EXPECT_EQ(MUL_VAL * i, bias->at<loco::DataType::FLOAT32>(i));
+  }
+}
+
+TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
+{
+  g.init();
+
+  // Bias cannot be fused as it's passed as feature map.
+  g.to_fm_bias();
+
+  EXPECT_EQ(false, pass.run(g.g()));
+}
+
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG)
+{
+  g.init(luci::FusedActFunc::RELU);
+
+  EXPECT_EQ(false, pass.run(g.g()));
+}
+}

From 3bf26492a565166f6c8030b8ebc60d00246668d1 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Fri, 2 Aug 2024 19:09:30 +0200
Subject: [PATCH 06/34] Fix codestyle

---
 .../src/FuseMulWithFullyConnectedPass.cpp     | 19 ++++++++------
 .../FuseMulWithFullyConnectedPass.test.cpp    | 25 +++++++++++--------
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index b1ffbf94ff2..1be57695efa 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -29,13 +29,13 @@ namespace
   if (not(cond))                  \
     return false;
 
-inline void update_values(luci::CircleConst *fused_node, luci::CircleConst *multiplication, bool is_weights)
+inline void update_values(luci::CircleConst *fused_node, luci::CircleConst *multiplication,
+                          bool is_weights)
 {
   auto node_size = fused_node->size<loco::DataType::FLOAT32>();
   auto mul_size = multiplication->size<loco::DataType::FLOAT32>();
   // Scalar multiplication:
-  if (multiplication->rank() == 1 ||
-      multiplication->rank() == 0 && mul_size == 1)
+  if (multiplication->rank() == 1 || multiplication->rank() == 0 && mul_size == 1)
   {
     for (uint32_t i = 0; i < node_size; i++)
       fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(0);
@@ -44,22 +44,25 @@ inline void update_values(luci::CircleConst *fused_node, luci::CircleConst *mult
   else
   {
     // Go along channels, multiplication size is ensured to be compatible with channels.
-    if (is_weights)  // weights 2-D
+    if (is_weights) // weights 2-D
     {
       auto count = fused_node->dim(0).value();
       auto size = fused_node->dim(fused_node->rank() - 1).value();
       float val;
-      for (uint32_t c = 0; c < count; c++) {
+      for (uint32_t c = 0; c < count; c++)
+      {
         val = multiplication->at<loco::DataType::FLOAT32>(c);
-        for (uint32_t i = 0; i < size; i++) {
+        for (uint32_t i = 0; i < size; i++)
+        {
           fused_node->at<loco::DataType::FLOAT32>(c * size + i) *= val;
         }
       }
     }
-    else  // bias 1-D
+    else // bias 1-D
     {
       for (uint32_t i = 0; i < node_size; i++)
-        fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(i);
+        fused_node->at<loco::DataType::FLOAT32>(i) *=
+          multiplication->at<loco::DataType::FLOAT32>(i);
     }
   }
 }
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
index db267354e67..0043b44bfb9 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -74,12 +74,15 @@ class FCMulGraphlet
 
     std::vector<float> mul_values;
 
-    if(is_mul_scalar) {
+    if (is_mul_scalar)
+    {
       mul_values.push_back(static_cast<float>(MUL_VAL));
       _mul_c = luci::create_const_node(g, loco::DataType::FLOAT32, {1}, mul_values);
     }
-    else {
-      for (uint32_t i = 0; i < DIM_ONE; i++) {
+    else
+    {
+      for (uint32_t i = 0; i < DIM_ONE; i++)
+      {
         mul_values.push_back(static_cast<float>(i));
       }
       _mul_c = luci::create_const_node(g, loco::DataType::FLOAT32, {1, 1, 1, DIM_ONE}, mul_values);
@@ -90,10 +93,12 @@ class FCMulGraphlet
     _mul->y(_mul_c);
     _mul->fusedActivationFunction(luci::FusedActFunc::RELU);
     _mul->dtype(loco::DataType::FLOAT32);
-    if(is_mul_scalar) {
+    if (is_mul_scalar)
+    {
       _mul->shape({1});
     }
-    else {
+    else
+    {
       _mul->shape({1, DIM_ONE});
     }
     _mul->name("mul");
@@ -151,8 +156,8 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_not_scalar)
   EXPECT_NE(nullptr, fc);
 
   auto weights = loco::must_cast<luci::CircleConst *>(g.fc()->weights());
-  auto weights_n =  weights->dim(0).value();
-  auto weights_m =  weights->dim(1).value();
+  auto weights_n = weights->dim(0).value();
+  auto weights_m = weights->dim(1).value();
   uint32_t offset = 0;
   for (uint32_t i = 0; i < weights_n; i++)
   {
@@ -180,8 +185,8 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_is_scalar)
   EXPECT_NE(nullptr, fc);
 
   auto weights = loco::must_cast<luci::CircleConst *>(g.fc()->weights());
-  auto weights_n =  weights->dim(0).value();
-  auto weights_m =  weights->dim(1).value();
+  auto weights_n = weights->dim(0).value();
+  auto weights_m = weights->dim(1).value();
   uint32_t offset = 0;
   for (uint32_t i = 0; i < weights_n; i++)
   {
@@ -215,4 +220,4 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG)
 
   EXPECT_EQ(false, pass.run(g.g()));
 }
-}
+} // namespace

From d38654162c2aa13c4b1679da714a72ab4d3e8578 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 5 Aug 2024 11:33:13 +0200
Subject: [PATCH 07/34] Rename pass

---
 compiler/circle2circle/src/Circle2Circle.cpp                | 4 ++--
 .../pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h  | 6 +++---
 compiler/one-cmds/how-to-use-one-commands.txt               | 2 +-
 compiler/one-cmds/onelib/constant.py                        | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index ac2fc3c6316..ccea8c65788 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -118,7 +118,7 @@ int entry(int argc, char **argv)
              "This will fuse Mul operation with a preceding Conv if possible.");
   add_switch(arser, "--fuse_mul_with_div",
              "This will fuse Mul operation with a Div operation whose numerator is const.");
-  add_switch(arser, "--fuse_mul_with_fully_connected",
+  add_switch(arser, "--fuse_mul_with_fullyconnected",
              "This will fuse Mul operator with a preceding FullyConnected operator.");
   add_switch(arser, "--fuse_slice_with_tconv",
              "This will fuse Slice operation with a preceding TConv if possible.");
@@ -328,7 +328,7 @@ int entry(int argc, char **argv)
     options->enable(Algorithms::FuseMulWithConv);
   if (arser.get<bool>("--fuse_mul_with_div"))
     options->enable(Algorithms::FuseMulWithDiv);
-  if (arser.get<bool>("--fuse_mul_with_fully_connected"))
+  if (arser.get<bool>("--fuse_mul_with_fullyconnected"))
     options->enable(Algorithms::FuseMulWithFullyConnected);
   if (arser.get<bool>("--make_batchnorm_gamma_positive"))
     options->enable(Algorithms::MakeBatchNormGammaPositive);
diff --git a/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h b/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h
index 5b7f3450069..718039f1c69 100644
--- a/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h
+++ b/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef __LUCI_FUSE_MUL_WITH_FULLY_CONNECTED_PASS_H__
-#define __LUCI_FUSE_MUL_WITH_FULLY_CONNECTED_PASS_H__
+#ifndef __LUCI_FUSE_MUL_WITH_FULLYCONNECTED_PASS_H__
+#define __LUCI_FUSE_MUL_WITH_FULLYCONNECTED_PASS_H__
 
 #include <logo/Pass.h>
 
@@ -34,4 +34,4 @@ struct FuseMulWithFullyConnectedPass final : public logo::Pass
 
 } // namespace luci
 
-#endif // __LUCI_FUSE_MUL_WITH_FULLY_CONNECTED_PASS_H__
+#endif // __LUCI_FUSE_MUL_WITH_FULLYCONNECTED_PASS_H__
diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt
index 397632aef87..af731943bf3 100644
--- a/compiler/one-cmds/how-to-use-one-commands.txt
+++ b/compiler/one-cmds/how-to-use-one-commands.txt
@@ -171,7 +171,7 @@ Current transformation options are
 - fuse_batchnorm_with_conv : This fuses BatchNorm operator to convolution operator
 - fuse_batchnorm_with_dwconv : This fuses BatchNorm operator to depthwise convolution operator
 - fuse_batchnorm_with_tconv : This fuses BatchNorm operator to transpose convolution operator
-- fuse_mul_with_fully_connected: This fuses Mul operator with the preceding FullyConnected operator if possible
+- fuse_mul_with_fullyconnected: This fuses Mul operator with the preceding FullyConnected operator if possible
 - fuse_mul_to_fullyconnected_weights : This fuses Mul operator to following FullyConnected operator weights
 - fuse_mul_with_conv: This fuses Mul with a preceding Convolution op if possible.
 - fuse_mul_with_div: This fuses Mul and Div op as Div.
diff --git a/compiler/one-cmds/onelib/constant.py b/compiler/one-cmds/onelib/constant.py
index deb7362f3d9..192b258973f 100644
--- a/compiler/one-cmds/onelib/constant.py
+++ b/compiler/one-cmds/onelib/constant.py
@@ -44,7 +44,7 @@ class CONSTANT:
         'fuse_batchnorm_with_dwconv',
         'fuse_batchnorm_with_tconv',
         'fuse_activation_function',
-        'fuse_mul_with_fully_connected',
+        'fuse_mul_with_fullyconnected',
         'fuse_mul_to_fullyconnected_weights',
         'fuse_instnorm',
         'fuse_prelu',
@@ -123,7 +123,7 @@ class CONSTANT:
         ('fuse_batchnorm_with_conv', 'fuse BatchNorm op to Convolution op'),
         ('fuse_batchnorm_with_dwconv', 'fuse BatchNorm op to Depthwise Convolution op'),
         ('fuse_batchnorm_with_tconv', 'fuse BatchNorm op to Transposed Convolution op'),
-        ('fuse_mul_with_fully_connected', 'fuse Mul op to FullyConnected op'),
+        ('fuse_mul_with_fullyconnected', 'fuse Mul op to FullyConnected op'),
         ('fuse_mul_to_fullyconnected_weights',
          'fuse Mul op to following FullyConnected op weights'),
         ('fuse_slice_with_tconv', 'fuse Slice op to Transposed Convolution op'),

From 4180734d7678155d31801c59aa0ac126234986c0 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 5 Aug 2024 15:18:58 +0200
Subject: [PATCH 08/34] Add luci tests with models

---
 .../circle2circle-dredd-recipe-test/test.lst  |  3 +
 compiler/luci-pass-value-py-test/test.lst     |  3 +
 .../Net_FullyConnected_Mul_000/test.recipe    | 67 +++++++++++++++++++
 .../Net_FullyConnected_Mul_000/test.rule      | 12 ++++
 .../Net_FullyConnected_Mul_001/test.recipe    | 67 +++++++++++++++++++
 .../Net_FullyConnected_Mul_001/test.rule      | 12 ++++
 .../Net_FullyConnected_Mul_002/test.recipe    | 66 ++++++++++++++++++
 .../Net_FullyConnected_Mul_002/test.rule      | 12 ++++
 8 files changed, 242 insertions(+)
 create mode 100644 res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.recipe
 create mode 100644 res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.rule
 create mode 100644 res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.recipe
 create mode 100644 res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.rule
 create mode 100644 res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.recipe
 create mode 100644 res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.rule

diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index 4bf6a80d65a..b33aa6fd0cf 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -48,6 +48,9 @@ Add(Net_DwConv_BN_000 PASS fuse_batchnorm_with_dwconv)
 Add(Net_DwConv_BN_001 PASS fuse_batchnorm_with_dwconv)
 Add(Net_FC_Gelu_FC_000 PASS replace_with_fc_gelu_fc)
 Add(Net_FullyConnected_Add_000 PASS fold_fully_connected)
+Add(Net_FullyConnected_Mul_000 PASS fuse_mul_with_fullyconnected)
+Add(Net_FullyConnected_Mul_001 PASS fuse_mul_with_fullyconnected)
+Add(Net_FullyConnected_Mul_002 PASS fuse_mul_with_fullyconnected)
 Add(Net_Gelu_000 PASS fuse_gelu)
 Add(Net_Gelu_001 PASS fuse_gelu)
 Add(Net_Horizontal_FullyConnected_Add_000 PASS fuse_horizontal_fc_layers)
diff --git a/compiler/luci-pass-value-py-test/test.lst b/compiler/luci-pass-value-py-test/test.lst
index 287ddcb94d3..47b5a50acda 100644
--- a/compiler/luci-pass-value-py-test/test.lst
+++ b/compiler/luci-pass-value-py-test/test.lst
@@ -33,6 +33,9 @@ eval(Net_Dequantize_Add_000 fold_dequantize)
 eval(Net_DwConv_BN_000 fuse_batchnorm_with_dwconv)
 eval(Net_DwConv_BN_001 fuse_batchnorm_with_dwconv)
 eval(Net_FullyConnected_Add_000 fold_fully_connected)
+eval(Net_FullyConnected_Mul_000 fuse_mul_with_fullyconnected)
+eval(Net_FullyConnected_Mul_001 fuse_mul_with_fullyconnected)
+eval(Net_FullyConnected_Mul_002 fuse_mul_with_fullyconnected)
 eval(Net_Horizontal_FullyConnected_Add_000 fuse_horizontal_fc_layers)
 eval(Net_InstanceNorm_001 fuse_instnorm)
 eval(Net_InstanceNorm_002 fuse_instnorm)
diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.recipe
new file mode 100644
index 00000000000..84203a12d04
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.recipe
@@ -0,0 +1,67 @@
+operand {
+    name: "ifm"
+    type: FLOAT32
+    shape { dim: 1 dim: 1 dim: 6 }
+}
+operand {
+    name: "fc_wgt"
+    type: FLOAT32
+    shape { dim: 6 dim: 6 }
+    filler {
+        tag: "gaussian"
+        arg: "0.0"
+        arg: "1.0"
+    }
+}
+operand {
+    name: "fc_bias"
+    type: FLOAT32
+    shape { dim: 6 }
+    filler {
+        tag: "gaussian"
+        arg: "0.0"
+        arg: "1.0"
+    }
+}
+operand {
+    name: "B"
+    type: FLOAT32
+    shape { dim: 1, dim: 1, dim: 6 }
+    filler {
+        tag: "gaussian"
+        arg: "0.0"
+        arg: "1.0" 
+    }
+}
+operand {
+    name: "fc_out"
+    type: FLOAT32
+    shape: { dim: 1 dim: 1 dim: 6 }
+}
+operand {
+    name: "mul_out"
+    type: FLOAT32
+    shape: { dim: 1 dim: 1 dim: 6 }
+}
+operation {
+    type: "FullyConnected"
+    fullyconnected_options {
+        activation: NONE
+        keep_num_dims: true
+    }
+    input: "ifm"
+    input: "fc_wgt"
+    input: "fc_bias"
+    output: "fc_out"
+}
+operation {
+    type: "Mul"
+    mul_options {
+        activation: NONE
+    }
+    input: "fc_out"
+    input: "B"
+    output: "mul_out"
+}
+input: "ifm"
+output: "mul_out"
diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.rule b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.rule
new file mode 100644
index 00000000000..c1f2a827884
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.rule
@@ -0,0 +1,12 @@
+# This checks if:
+#   Mul(FC(input, weights, bias), other)
+# is converted to:
+#   FC(input, Mul(weights, other), Mul(bias, other))
+# and then Mul is fused to:
+#   FC(input, weights', bias')
+# Here Mul is in shape of (1, 1, X).
+
+RULE    "VERIFY_FILE_FORMAT"      $(verify_file_format) '=' 1
+
+RULE    "NO_MUL"                  $(op_count MUL) '=' 0
+RULE    "FC_EXIST"                $(op_count FULLY_CONNECTED) '=' 1
diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.recipe
new file mode 100644
index 00000000000..d446424c238
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.recipe
@@ -0,0 +1,67 @@
+operand {
+    name: "ifm"
+    type: FLOAT32
+    shape { dim: 3 dim: 1 dim: 4 }
+}
+operand {
+    name: "fc_wgt"
+    type: FLOAT32
+    shape { dim: 6 dim: 4 }
+    filler {
+        tag: "gaussian"
+        arg: "0.0"
+        arg: "1.0"
+    }
+}
+operand {
+    name: "fc_bias"
+    type: FLOAT32
+    shape { dim: 6 }
+    filler {
+        tag: "gaussian"
+        arg: "0.0"
+        arg: "1.0"
+    }
+}
+operand {
+    name: "B"
+    type: FLOAT32
+    shape { dim: 6 }
+    filler {
+        tag: "gaussian"
+        arg: "0.0"
+        arg: "1.0"
+    }
+}
+operand {
+    name: "fc_out"
+    type: FLOAT32
+    shape: { dim: 3 dim: 1 dim: 6 }
+}
+operand {
+    name: "mul_out"
+    type: FLOAT32
+    shape: { dim: 3 dim: 1 dim: 6 }
+}
+operation {
+    type: "FullyConnected"
+    fullyconnected_options {
+        activation: NONE
+        keep_num_dims: true
+    }
+    input: "ifm"
+    input: "fc_wgt"
+    input: "fc_bias"
+    output: "fc_out"
+}
+operation {
+    type: "Mul"
+    mul_options {
+        activation: RELU
+    }
+    input: "fc_out"
+    input: "B"
+    output: "mul_out"
+}
+input: "ifm"
+output: "mul_out"
diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.rule b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.rule
new file mode 100644
index 00000000000..acdd2d6a96b
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.rule
@@ -0,0 +1,12 @@
+# This checks if:
+#   Mul(FC(input, weights, bias), other)
+# is converted to:
+#   FC(input, Mul(weights, other), Mul(bias, other))
+# and then Mul is fused to:
+#   FC(input, weights', bias')
+# Here Mul is in shape of (X).
+
+RULE    "VERIFY_FILE_FORMAT"      $(verify_file_format) '=' 1
+
+RULE    "NO_MUL"                  $(op_count MUL) '=' 0
+RULE    "FC_EXIST"                $(op_count FULLY_CONNECTED) '=' 1
diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.recipe
new file mode 100644
index 00000000000..34e3cde4839
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.recipe
@@ -0,0 +1,66 @@
+operand {
+    name: "ifm"
+    type: FLOAT32
+    shape { dim: 1 dim: 16 }
+}
+operand {
+    name: "fc_wgt"
+    type: FLOAT32
+    shape { dim: 4 dim: 16 }
+    filler {
+        tag: "gaussian"
+        arg: "0.0"
+        arg: "1.0"
+    }
+}
+operand {
+    name: "fc_bias"
+    type: FLOAT32
+    shape { dim: 4 }
+    filler {
+        tag: "gaussian"
+        arg: "0.0"
+        arg: "1.0"
+    }
+}
+operand {
+    name: "B"
+    type: FLOAT32
+    shape { dim: 1 }
+    filler {
+        tag: "constant"
+        arg: "2.0"
+    }
+}
+operand {
+    name: "fc_out"
+    type: FLOAT32
+    shape: { dim: 1 dim: 4 }
+}
+operand {
+    name: "mul_out"
+    type: FLOAT32
+    shape: { dim: 1 dim: 4 }
+}
+operation {
+    type: "FullyConnected"
+    fullyconnected_options {
+        activation: NONE
+        keep_num_dims: true
+    }
+    input: "ifm"
+    input: "fc_wgt"
+    input: "fc_bias"
+    output: "fc_out"
+}
+operation {
+    type: "Mul"
+    mul_options {
+        activation: NONE
+    }
+    input: "fc_out"
+    input: "B"
+    output: "mul_out"
+}
+input: "ifm"
+output: "mul_out"
diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.rule b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.rule
new file mode 100644
index 00000000000..9cc8d5fd0a7
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.rule
@@ -0,0 +1,12 @@
+# This checks if:
+#   Mul(FC(input, weights, bias), other)
+# is converted to:
+#   FC(input, Mul(weights, other), Mul(bias, other))
+# and then Mul is fused to:
+#   FC(input, weights', bias')
+# Here Mul is in shape of (1), it's a scalar.
+
+RULE    "VERIFY_FILE_FORMAT"      $(verify_file_format) '=' 1
+
+RULE    "NO_MUL"                  $(op_count MUL) '=' 0
+RULE    "FC_EXIST"                $(op_count FULLY_CONNECTED) '=' 1

From 761d3034c12d6aa67bdf271cdf65a5f28fb08ab7 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 5 Aug 2024 15:20:49 +0200
Subject: [PATCH 09/34] Fix scalar vs multi-dim case

---
 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index 1be57695efa..03f065ec632 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -35,7 +35,7 @@ inline void update_values(luci::CircleConst *fused_node, luci::CircleConst *mult
   auto node_size = fused_node->size<loco::DataType::FLOAT32>();
   auto mul_size = multiplication->size<loco::DataType::FLOAT32>();
   // Scalar multiplication:
-  if (multiplication->rank() == 1 || multiplication->rank() == 0 && mul_size == 1)
+  if ((multiplication->rank() == 1 || multiplication->rank() == 0) && mul_size == 1)
   {
     for (uint32_t i = 0; i < node_size; i++)
       fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(0);

From fa4733b232b2d1a4cad3d14f903e64fcef9b0fa8 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Tue, 6 Aug 2024 16:11:10 +0200
Subject: [PATCH 10/34] Separate bias and weights updating, remove checks

---
 .../src/FuseMulWithFullyConnectedPass.cpp     | 69 +++++++++++--------
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index 03f065ec632..388c6359138 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -29,40 +29,58 @@ namespace
   if (not(cond))                  \
     return false;
 
-inline void update_values(luci::CircleConst *fused_node, luci::CircleConst *multiplication,
-                          bool is_weights)
+inline bool is_scalar(luci::CircleConst *node)
+{
+  return ((node->rank() == 1 || node->rank() == 0) && node->size<loco::DataType::FLOAT32>() == 1);
+}
+
+inline void update_with_scalar(luci::CircleConst *fused_node, luci::CircleConst *multiplication)
+{
+  for (uint32_t i = 0; i < fused_node->size<loco::DataType::FLOAT32>(); i++)
+  {
+    fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(0);
+  }
+}
+
+inline void update_weights(luci::CircleConst *weights, luci::CircleConst *multiplication)
 {
-  auto node_size = fused_node->size<loco::DataType::FLOAT32>();
-  auto mul_size = multiplication->size<loco::DataType::FLOAT32>();
   // Scalar multiplication:
-  if ((multiplication->rank() == 1 || multiplication->rank() == 0) && mul_size == 1)
+  if (is_scalar(multiplication))
   {
-    for (uint32_t i = 0; i < node_size; i++)
-      fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(0);
+    update_with_scalar(weights, multiplication);
   }
   // N-size multiplication:
   else
   {
     // Go along channels, multiplication size is ensured to be compatible with channels.
-    if (is_weights) // weights 2-D
+    auto count = weights->dim(0).value();
+    auto size = weights->dim(weights->rank() - 1).value();
+    float val;
+    for (uint32_t c = 0; c < count; c++)
     {
-      auto count = fused_node->dim(0).value();
-      auto size = fused_node->dim(fused_node->rank() - 1).value();
-      float val;
-      for (uint32_t c = 0; c < count; c++)
+      val = multiplication->at<loco::DataType::FLOAT32>(c);
+      for (uint32_t i = 0; i < size; i++)
       {
-        val = multiplication->at<loco::DataType::FLOAT32>(c);
-        for (uint32_t i = 0; i < size; i++)
-        {
-          fused_node->at<loco::DataType::FLOAT32>(c * size + i) *= val;
-        }
+        weights->at<loco::DataType::FLOAT32>(c * size + i) *= val;
       }
     }
-    else // bias 1-D
+  }
+}
+
+inline void update_bias(luci::CircleConst *bias, luci::CircleConst *multiplication)
+{
+  // Scalar multiplication:
+  if (is_scalar(multiplication))
+  {
+    update_with_scalar(bias, multiplication);
+  }
+  // N-size multiplication:
+  else
+  {
+    // Go along channels, multiplication size is ensured to be compatible with channels.
+    for (uint32_t i = 0; i < bias->size<loco::DataType::FLOAT32>(); i++)
     {
-      for (uint32_t i = 0; i < node_size; i++)
-        fused_node->at<loco::DataType::FLOAT32>(i) *=
-          multiplication->at<loco::DataType::FLOAT32>(i);
+      bias->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(i);
     }
   }
 }
@@ -139,18 +157,13 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
   RETURN_FALSE_UNLESS(const_bias->dtype() == loco::DataType::FLOAT32);
 
   auto fused_bias = luci::clone(const_bias);
-  RETURN_FALSE_UNLESS(fused_bias->size<loco::DataType::FLOAT32>() ==
-                      const_bias->size<loco::DataType::FLOAT32>());
-
   // Create new weights to be updated with values:
   auto fused_weights = luci::clone(weights);
-  RETURN_FALSE_UNLESS(fused_weights->size<loco::DataType::FLOAT32>() ==
-                      weights->size<loco::DataType::FLOAT32>());
 
   // Update bias accordingly:
-  update_values(fused_bias, multiplication, false);
+  update_bias(fused_bias, multiplication);
   // Update weights accordingly:
-  update_values(fused_weights, multiplication, true);
+  update_weights(fused_weights, multiplication);
 
   // Replace weights and bias:
   fc->weights(fused_weights);

From 40dcacf41b541c93f84916a9cb0f75a0a9a59904 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Tue, 6 Aug 2024 17:38:56 +0200
Subject: [PATCH 11/34] [luci/pass] Introduce FuseMulWithFullyConnectedPass

This commit introduces FuseMulWithFullyConnectedPass which will fuse Mul to the previous FullyConnected if possible.

ONE-DCO-1.0-Signed-off-by: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
---
 .../luci/pass/include/luci/CircleOptimizer.h  |   1 +
 .../luci/Pass/FuseMulWithFullyConnectedPass.h |  37 +++
 compiler/luci/pass/src/CircleOptimizer.cpp    |   5 +
 .../src/FuseMulWithFullyConnectedPass.cpp     | 209 ++++++++++++++++
 .../FuseMulWithFullyConnectedPass.test.cpp    | 223 ++++++++++++++++++
 5 files changed, 475 insertions(+)
 create mode 100644 compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h
 create mode 100644 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
 create mode 100644 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp

diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h
index 9cbd26f0da5..8a1eb6d4f78 100644
--- a/compiler/luci/pass/include/luci/CircleOptimizer.h
+++ b/compiler/luci/pass/include/luci/CircleOptimizer.h
@@ -49,6 +49,7 @@ class CircleOptimizer final
       FuseMeanWithMean,
       FuseMulWithConv,
       FuseMulWithDiv,
+      FuseMulWithFullyConnected,
       FuseTransposeWithMean,
       ResolveCustomOpAdd,
       ResolveCustomOpBatchMatMul,
diff --git a/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h b/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h
new file mode 100644
index 00000000000..718039f1c69
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FUSE_MUL_WITH_FULLYCONNECTED_PASS_H__
+#define __LUCI_FUSE_MUL_WITH_FULLYCONNECTED_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to fuse Mul into CircleFullyConnected
+ */
+struct FuseMulWithFullyConnectedPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::FuseMulWithFullyConnectedPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FUSE_MUL_WITH_FULLYCONNECTED_PASS_H__
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index 840c8dd25dd..27cf27e63fd 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -48,6 +48,7 @@
 #include "luci/Pass/FuseMeanWithMeanPass.h"
 #include "luci/Pass/FuseMulWithConvPass.h"
 #include "luci/Pass/FuseMulWithDivPass.h"
+#include "luci/Pass/FuseMulWithFullyConnectedPass.h"
 #include "luci/Pass/FusePreActivationBatchNormPass.h"
 #include "luci/Pass/FusePReluPass.h"
 #include "luci/Pass/FuseGeluPass.h"
@@ -278,6 +279,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
   phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
 
+  if (_options->query(Options::Algorithm::FuseMulWithFullyConnected))
+  {
+    phase.emplace_back(std::make_unique<FuseMulWithFullyConnectedPass>());
+  }
   if (_options->query(Options::Algorithm::CommonSubExpressionElimination))
   {
     phase.emplace_back(std::make_unique<luci::CommonSubExpressionEliminationPass>());
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
new file mode 100644
index 00000000000..388c6359138
--- /dev/null
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseMulWithFullyConnectedPass.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/Nodes/CircleConst.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+#include <cmath>
+
+namespace
+{
+
+#define RETURN_FALSE_UNLESS(cond) \
+  if (not(cond))                  \
+    return false;
+
+inline bool is_scalar(luci::CircleConst *node)
+{
+  return ((node->rank() == 1 || node->rank() == 0) && node->size<loco::DataType::FLOAT32>() == 1);
+}
+
+inline void update_with_scalar(luci::CircleConst *fused_node, luci::CircleConst *multiplication)
+{
+  for (uint32_t i = 0; i < fused_node->size<loco::DataType::FLOAT32>(); i++)
+  {
+    fused_node->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(0);
+  }
+}
+
+inline void update_weights(luci::CircleConst *weights, luci::CircleConst *multiplication)
+{
+  // Scalar multiplication:
+  if (is_scalar(multiplication))
+  {
+    update_with_scalar(weights, multiplication);
+  }
+  // N-size multiplication:
+  else
+  {
+    // Go along channels, multiplication size is ensured to be compatible with channels.
+    auto count = weights->dim(0).value();
+    auto size = weights->dim(weights->rank() - 1).value();
+    float val;
+    for (uint32_t c = 0; c < count; c++)
+    {
+      val = multiplication->at<loco::DataType::FLOAT32>(c);
+      for (uint32_t i = 0; i < size; i++)
+      {
+        weights->at<loco::DataType::FLOAT32>(c * size + i) *= val;
+      }
+    }
+  }
+}
+
+inline void update_bias(luci::CircleConst *bias, luci::CircleConst *multiplication)
+{
+  // Scalar multiplication:
+  if (is_scalar(multiplication))
+  {
+    update_with_scalar(bias, multiplication);
+  }
+  // N-size multiplication:
+  else
+  {
+    // Go along channels, multiplication size is ensured to be compatible with channels.
+    for (uint32_t i = 0; i < bias->size<loco::DataType::FLOAT32>(); i++)
+    {
+      bias->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(i);
+    }
+  }
+}
+
+/**
+ *  Fuse Mul to FullyConnected if the multiplied value is a channel(last dimension)-wise constant
+ *
+ *  BEFORE
+ *                |
+ *      [CircleFullyConnected]
+ *                |
+ *           [CircleMul]
+ *                |
+ *
+ *  AFTER
+ *                |
+ *       [CircleFullyConnected]   [CircleMul] (dead)
+ *                |
+ *
+ */
+bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
+{
+  // Sanity check:
+  RETURN_FALSE_UNLESS(fc);
+  // Allow only FLOAT32 data type:
+  RETURN_FALSE_UNLESS(fc->dtype() == loco::DataType::FLOAT32);
+  // Allow only without activation functions as values are going to
+  // be multiplied before activation function.
+  RETURN_FALSE_UNLESS(fc->fusedActivationFunction() == luci::FusedActFunc::NONE);
+  // Check for weights being Constant:
+  auto weights = dynamic_cast<luci::CircleConst *>(fc->weights());
+  RETURN_FALSE_UNLESS(weights);
+  // Get Mul node:
+  auto fc_output = loco::succs(fc);
+  // Make sure that FullyConnected has only one child:
+  RETURN_FALSE_UNLESS(fc_output.size() == 1);
+  auto mul = dynamic_cast<luci::CircleMul *>(*fc_output.begin());
+  RETURN_FALSE_UNLESS(mul);
+  // Allow Mul node only with FLOAT32 data type:
+  RETURN_FALSE_UNLESS(mul->dtype() == loco::DataType::FLOAT32);
+  // Get multiplication Constant (the Mul input other than the FullyConnected output):
+  auto multiplication = mul->x() == fc ? dynamic_cast<luci::CircleConst *>(mul->y())
+                                       : dynamic_cast<luci::CircleConst *>(mul->x());
+  RETURN_FALSE_UNLESS(multiplication);
+  // Get rank of multiplication:
+  auto rank = multiplication->rank();
+  RETURN_FALSE_UNLESS(rank != 0);
+  // Check that all leading dimensions are ones; this checks broadcast capabilities.
+  // Last dimension of multiplication must be compatible with FC.
+  // N-D case (N>1):
+  if (multiplication->rank() > 1)
+  {
+    // Check channel-wise broadcasting:
+    for (uint32_t i = 0; i < rank - 1; i++)
+      RETURN_FALSE_UNLESS(multiplication->dim(i).value() == 1);
+    // Check the last dimension of Mul is the same as the first dimension of FC weights
+    RETURN_FALSE_UNLESS(multiplication->dim(rank - 1) == weights->dim(0));
+  }
+  // Scalar case:
+  else if (multiplication->rank() == 1 || multiplication->rank() == 0)
+  {
+    RETURN_FALSE_UNLESS(multiplication->size<loco::DataType::FLOAT32>() != 0);
+  }
+
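+  // Bias handling:
+  // (1) opcode must be constant bias or no bias (CIRCLEOUTPUTEXCLUDE)
+  // (2) the CircleConst cast that follows then requires a constant FLOAT32 bias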
+  auto bias = loco::must_cast<luci::CircleNode *>(fc->bias());
+  RETURN_FALSE_UNLESS(bias->opcode() == luci::CircleOpcode::CIRCLECONST or
+                      bias->opcode() == luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE)
+  // Create new bias to be updated with values:
+  auto const_bias = dynamic_cast<luci::CircleConst *>(fc->bias());
+  RETURN_FALSE_UNLESS(const_bias)
+  RETURN_FALSE_UNLESS(const_bias->dtype() == loco::DataType::FLOAT32);
+
+  auto fused_bias = luci::clone(const_bias);
+  // Create new weights to be updated with values:
+  auto fused_weights = luci::clone(weights);
+
+  // Update bias accordingly:
+  update_bias(fused_bias, multiplication);
+  // Update weights accordingly:
+  update_weights(fused_weights, multiplication);
+
+  // Replace weights and bias:
+  fc->weights(fused_weights);
+  fc->bias(fused_bias);
+
+  // Set origin and copy Activation Function if existing:
+  fc->fusedActivationFunction(mul->fusedActivationFunction());
+  luci::add_origin(fc, luci::get_origin(mul));
+
+  replace(mul).with(fc);
+
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool FuseMulWithFullyConnectedPass::run(loco::Graph *g)
+{
+  bool changed = false;
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto fc = dynamic_cast<luci::CircleFullyConnected *>(node);
+    if (not fc)
+      continue;
+
+    switch (fc->dtype())
+    {
+      case loco::DataType::FLOAT32:
+        if (fuse_mul_with_fc(fc))
+          changed = true;
+        break;
+      default:
+        break;
+    }
+  }
+
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
new file mode 100644
index 00000000000..0043b44bfb9
--- /dev/null
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseMulWithFullyConnectedPass.h"
+#include "helpers/CreateCircleConst.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/test/TestIOGraph.h>
+
+#include <gtest/gtest.h>
+
+#define DIM_ONE 8
+#define DIM_TWO 4
+#define MUL_VAL 2.0f
+
+namespace
+{
+
+using namespace luci::test;
+
+/**
+ *  Graph for this test
+ *
+ *  BEFORE
+ *
+ *         [FC]
+ *           |
+ *     [Mul w/ Relu]
+ *
+ *  AFTER
+ *
+ *      [FC w/ Relu] (weights and bias updated)
+ *
+ */
+class FCMulGraphlet
+{
+public:
+  FCMulGraphlet() = default;
+
+  void init(loco::Graph *g, luci::FusedActFunc fc_activation, bool is_mul_scalar)
+  {
+    std::vector<float> weights_val(DIM_ONE * DIM_TWO);
+    for (uint32_t i = 0; i < DIM_ONE * DIM_TWO; i++)
+      weights_val.at(i) = i;
+
+    _fc_f = luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE, DIM_TWO}, weights_val);
+
+    std::vector<float> bias_val(DIM_ONE);
+    for (uint32_t i = 0; i < DIM_ONE; i++)
+      bias_val.at(i) = i;
+
+    _fc_b = luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE}, bias_val);
+
+    _fc = g->nodes()->create<luci::CircleFullyConnected>();
+    _fc->weights(_fc_f);
+    _fc->bias(_fc_b);
+    _fc->fusedActivationFunction(fc_activation);
+    _fc->dtype(loco::DataType::FLOAT32);
+    _fc->shape({1, DIM_ONE});
+    _fc->name("fc");
+
+    std::vector<float> mul_values;
+
+    if (is_mul_scalar)
+    {
+      mul_values.push_back(static_cast<float>(MUL_VAL));
+      _mul_c = luci::create_const_node(g, loco::DataType::FLOAT32, {1}, mul_values);
+    }
+    else
+    {
+      for (uint32_t i = 0; i < DIM_ONE; i++)
+      {
+        mul_values.push_back(static_cast<float>(i));
+      }
+      _mul_c = luci::create_const_node(g, loco::DataType::FLOAT32, {1, 1, 1, DIM_ONE}, mul_values);
+    }
+
+    _mul = g->nodes()->create<luci::CircleMul>();
+    _mul->x(_fc);
+    _mul->y(_mul_c);
+    _mul->fusedActivationFunction(luci::FusedActFunc::RELU);
+    _mul->dtype(loco::DataType::FLOAT32);
+    if (is_mul_scalar)
+    {
+      _mul->shape({1});
+    }
+    else
+    {
+      _mul->shape({1, DIM_ONE});
+    }
+    _mul->name("mul");
+  }
+
+public:
+  luci::CircleFullyConnected *fc() { return _fc; }
+
+  void to_fm_bias(void)
+  {
+    assert(_fc != nullptr);
+
+    auto new_fc = _fc->graph()->nodes()->create<luci::CircleFullyConnected>();
+    _fc->bias(new_fc);
+  }
+
+protected:
+  luci::CircleFullyConnected *_fc = nullptr;
+  luci::CircleMul *_mul = nullptr;
+  luci::CircleConst *_fc_f = nullptr;
+  luci::CircleConst *_fc_b = nullptr;
+  luci::CircleConst *_mul_c = nullptr;
+};
+
+class FuseAddWithFCTestGraph : public TestIOGraph, public FCMulGraphlet
+{
+public:
+  FuseAddWithFCTestGraph() = default;
+
+  void init(luci::FusedActFunc fc_activation = luci::FusedActFunc::NONE, bool is_mul_scalar = false)
+  {
+    TestIOGraph::init({1, DIM_TWO}, {1, DIM_ONE});
+    FCMulGraphlet::init(g(), fc_activation, is_mul_scalar);
+
+    _fc->input(input());
+
+    output()->from(_mul);
+  }
+};
+
+class FuseMulWithFullyConnectedPassTest : public ::testing::Test
+{
+public:
+  FuseAddWithFCTestGraph g;
+  luci::FuseMulWithFullyConnectedPass pass;
+};
+
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_not_scalar)
+{
+  g.init(luci::FusedActFunc::NONE, false);
+
+  EXPECT_EQ(true, pass.run(g.g()));
+
+  auto fc = dynamic_cast<luci::CircleFullyConnected *>(g.output()->from());
+  EXPECT_NE(nullptr, fc);
+
+  auto weights = loco::must_cast<luci::CircleConst *>(g.fc()->weights());
+  auto weights_n = weights->dim(0).value();
+  auto weights_m = weights->dim(1).value();
+  uint32_t offset = 0;
+  for (uint32_t i = 0; i < weights_n; i++)
+  {
+    for (uint32_t j = 0; j < weights_m; j++)
+    {
+      offset = i * weights_m + j;
+      EXPECT_EQ(i * offset, weights->at<loco::DataType::FLOAT32>(offset));
+    }
+  }
+
+  auto bias = loco::must_cast<luci::CircleConst *>(g.fc()->bias());
+  for (uint32_t i = 0; i < bias->size<loco::DataType::FLOAT32>(); i++)
+  {
+    EXPECT_EQ(i * i, bias->at<loco::DataType::FLOAT32>(i));
+  }
+}
+
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_is_scalar)
+{
+  g.init(luci::FusedActFunc::NONE, true);
+
+  EXPECT_EQ(true, pass.run(g.g()));
+
+  auto fc = dynamic_cast<luci::CircleFullyConnected *>(g.output()->from());
+  EXPECT_NE(nullptr, fc);
+
+  auto weights = loco::must_cast<luci::CircleConst *>(g.fc()->weights());
+  auto weights_n = weights->dim(0).value();
+  auto weights_m = weights->dim(1).value();
+  uint32_t offset = 0;
+  for (uint32_t i = 0; i < weights_n; i++)
+  {
+    for (uint32_t j = 0; j < weights_m; j++)
+    {
+      offset = i * weights_m + j;
+      EXPECT_EQ(MUL_VAL * offset, weights->at<loco::DataType::FLOAT32>(offset));
+    }
+  }
+
+  auto bias = loco::must_cast<luci::CircleConst *>(g.fc()->bias());
+  for (uint32_t i = 0; i < bias->size<loco::DataType::FLOAT32>(); i++)
+  {
+    EXPECT_EQ(MUL_VAL * i, bias->at<loco::DataType::FLOAT32>(i));
+  }
+}
+
+TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
+{
+  g.init();
+
+  // Bias cannot be fused as it's passed as feature map.
+  g.to_fm_bias();
+
+  EXPECT_EQ(false, pass.run(g.g()));
+}
+
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG)
+{
+  g.init(luci::FusedActFunc::RELU);
+
+  EXPECT_EQ(false, pass.run(g.g()));
+}
+} // namespace

From b7172340be986496c3ec07facad08ff5c6b44d40 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Tue, 6 Aug 2024 17:42:21 +0200
Subject: [PATCH 12/34] [one-cmds] Add an option for
 FuseMulWithFullyConnectedPass

This commit adds a one-cmds option for FuseMulWithFullyConnectedPass.

ONE-DCO-1.0-Signed-off-by: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
---
 compiler/one-cmds/how-to-use-one-commands.txt | 1 +
 compiler/one-cmds/onelib/constant.py          | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt
index fefbabf9a17..af731943bf3 100644
--- a/compiler/one-cmds/how-to-use-one-commands.txt
+++ b/compiler/one-cmds/how-to-use-one-commands.txt
@@ -171,6 +171,7 @@ Current transformation options are
 - fuse_batchnorm_with_conv : This fuses BatchNorm operator to convolution operator
 - fuse_batchnorm_with_dwconv : This fuses BatchNorm operator to depthwise convolution operator
 - fuse_batchnorm_with_tconv : This fuses BatchNorm operator to transpose convolution operator
+- fuse_mul_with_fullyconnected: This fuses Mul operator with the preceding FullyConnected operator if possible
 - fuse_mul_to_fullyconnected_weights : This fuses Mul operator to following FullyConnected operator weights
 - fuse_mul_with_conv: This fuses Mul with a preceding Convolution op if possible.
 - fuse_mul_with_div: This fuses Mul and Div op as Div.
diff --git a/compiler/one-cmds/onelib/constant.py b/compiler/one-cmds/onelib/constant.py
index 8c5de1b646d..192b258973f 100644
--- a/compiler/one-cmds/onelib/constant.py
+++ b/compiler/one-cmds/onelib/constant.py
@@ -44,6 +44,7 @@ class CONSTANT:
         'fuse_batchnorm_with_dwconv',
         'fuse_batchnorm_with_tconv',
         'fuse_activation_function',
+        'fuse_mul_with_fullyconnected',
         'fuse_mul_to_fullyconnected_weights',
         'fuse_instnorm',
         'fuse_prelu',
@@ -122,6 +123,7 @@ class CONSTANT:
         ('fuse_batchnorm_with_conv', 'fuse BatchNorm op to Convolution op'),
         ('fuse_batchnorm_with_dwconv', 'fuse BatchNorm op to Depthwise Convolution op'),
         ('fuse_batchnorm_with_tconv', 'fuse BatchNorm op to Transposed Convolution op'),
+        ('fuse_mul_with_fullyconnected', 'fuse Mul op to FullyConnected op'),
         ('fuse_mul_to_fullyconnected_weights',
          'fuse Mul op to following FullyConnected op weights'),
         ('fuse_slice_with_tconv', 'fuse Slice op to Transposed Convolution op'),

From a568d25f1245dfe2ba0c5753e989b55b0eeadefc Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Wed, 7 Aug 2024 10:30:17 +0200
Subject: [PATCH 13/34] [circle2circle] Dredd test for
 FuseMulWithFullyConnectedPass

This commit adds circle2circle dredd tests for FC + Mul fusion.

ONE-DCO-1.0-Signed-off-by: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
---
 compiler/circle2circle-dredd-recipe-test/test.lst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index 4bf6a80d65a..b33aa6fd0cf 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -48,6 +48,9 @@ Add(Net_DwConv_BN_000 PASS fuse_batchnorm_with_dwconv)
 Add(Net_DwConv_BN_001 PASS fuse_batchnorm_with_dwconv)
 Add(Net_FC_Gelu_FC_000 PASS replace_with_fc_gelu_fc)
 Add(Net_FullyConnected_Add_000 PASS fold_fully_connected)
+Add(Net_FullyConnected_Mul_000 PASS fuse_mul_with_fullyconnected)
+Add(Net_FullyConnected_Mul_001 PASS fuse_mul_with_fullyconnected)
+Add(Net_FullyConnected_Mul_002 PASS fuse_mul_with_fullyconnected)
 Add(Net_Gelu_000 PASS fuse_gelu)
 Add(Net_Gelu_001 PASS fuse_gelu)
 Add(Net_Horizontal_FullyConnected_Add_000 PASS fuse_horizontal_fc_layers)

From d3246e3d91388e7ca7d4f713e3e9745d945b16f3 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Wed, 7 Aug 2024 10:35:06 +0200
Subject: [PATCH 14/34] [luci/pass] Value test for
 FuseMulWithFullyConnectedPass

This commit adds value tests for FuseMulWithFullyConnectedPass.

ONE-DCO-1.0-Signed-off-by: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
---
 compiler/luci-pass-value-py-test/test.lst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler/luci-pass-value-py-test/test.lst b/compiler/luci-pass-value-py-test/test.lst
index e2c37517486..91a77e4ef7f 100644
--- a/compiler/luci-pass-value-py-test/test.lst
+++ b/compiler/luci-pass-value-py-test/test.lst
@@ -33,6 +33,9 @@ eval(Net_Dequantize_Add_000 fold_dequantize)
 eval(Net_DwConv_BN_000 fuse_batchnorm_with_dwconv)
 eval(Net_DwConv_BN_001 fuse_batchnorm_with_dwconv)
 eval(Net_FullyConnected_Add_000 fold_fully_connected)
+eval(Net_FullyConnected_Mul_000 fuse_mul_with_fullyconnected)
+eval(Net_FullyConnected_Mul_001 fuse_mul_with_fullyconnected)
+eval(Net_FullyConnected_Mul_002 fuse_mul_with_fullyconnected)
 eval(Net_Horizontal_FullyConnected_Add_000 fuse_horizontal_fc_layers)
 eval(Net_InstanceNorm_001 fuse_instnorm)
 eval(Net_InstanceNorm_002 fuse_instnorm)

From f661561a5a361124fe17f9856841bbad3123109a Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Wed, 7 Aug 2024 16:27:48 +0200
Subject: [PATCH 15/34] Change constness of args, move tests and move
 FuseMulWithFC after FuseMulWithDiv

---
 compiler/luci/pass/src/CircleOptimizer.cpp                | 8 ++++----
 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp  | 8 ++++----
 .../luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp  | 3 ++-
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index 27cf27e63fd..246d4f36e78 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -279,10 +279,6 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
   phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
 
-  if (_options->query(Options::Algorithm::FuseMulWithFullyConnected))
-  {
-    phase.emplace_back(std::make_unique<FuseMulWithFullyConnectedPass>());
-  }
   if (_options->query(Options::Algorithm::CommonSubExpressionElimination))
   {
     phase.emplace_back(std::make_unique<luci::CommonSubExpressionEliminationPass>());
@@ -315,6 +311,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   {
     phase.emplace_back(std::make_unique<FuseMulWithDivPass>());
   }
+  if (_options->query(Options::Algorithm::FuseMulWithFullyConnected))
+  {
+    phase.emplace_back(std::make_unique<FuseMulWithFullyConnectedPass>());
+  }
   if (_options->query(Options::Algorithm::ResolveCustomOpMaxPoolWithArgmax))
   {
     phase.emplace_back(std::make_unique<luci::ResolveCustomOpMaxPoolWithArgmaxPass>());
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index 388c6359138..795a8af0237 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -29,12 +29,12 @@ namespace
   if (not(cond))                  \
     return false;
 
-inline bool is_scalar(luci::CircleConst *node)
+inline bool is_scalar(const luci::CircleConst *node)
 {
   return ((node->rank() == 1 || node->rank() == 0) && node->size<loco::DataType::FLOAT32>() == 1);
 }
 
-inline void update_with_scalar(luci::CircleConst *fused_node, luci::CircleConst *multiplication)
+inline void update_with_scalar(luci::CircleConst *fused_node, const luci::CircleConst *multiplication)
 {
   for (uint32_t i = 0; i < fused_node->size<loco::DataType::FLOAT32>(); i++)
   {
@@ -42,7 +42,7 @@ inline void update_with_scalar(luci::CircleConst *fused_node, luci::CircleConst
   }
 }
 
-inline void update_weights(luci::CircleConst *weights, luci::CircleConst *multiplication)
+inline void update_weights(luci::CircleConst *weights, const luci::CircleConst *multiplication)
 {
   // Scalar multiplication:
   if (is_scalar(multiplication))
@@ -67,7 +67,7 @@ inline void update_weights(luci::CircleConst *weights, luci::CircleConst *multip
   }
 }
 
-inline void update_bias(luci::CircleConst *bias, luci::CircleConst *multiplication)
+inline void update_bias(luci::CircleConst *bias, const luci::CircleConst *multiplication)
 {
   // Scalar multiplication:
   if (is_scalar(multiplication))
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
index 0043b44bfb9..7db58b69909 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -146,6 +146,8 @@ class FuseMulWithFullyConnectedPassTest : public ::testing::Test
   luci::FuseMulWithFullyConnectedPass pass;
 };
 
+} // namespace
+
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_not_scalar)
 {
   g.init(luci::FusedActFunc::NONE, false);
@@ -220,4 +222,3 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG)
 
   EXPECT_EQ(false, pass.run(g.g()));
 }
-} // namespace

From 85d9783685973a647cd9d71407940284695f5fc7 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Wed, 7 Aug 2024 16:29:46 +0200
Subject: [PATCH 16/34] Fix codestyle

---
 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index 795a8af0237..7c6f60ed7c9 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -34,7 +34,8 @@ inline bool is_scalar(const luci::CircleConst *node)
   return ((node->rank() == 1 || node->rank() == 0) && node->size<loco::DataType::FLOAT32>() == 1);
 }
 
-inline void update_with_scalar(luci::CircleConst *fused_node, const luci::CircleConst *multiplication)
+inline void update_with_scalar(luci::CircleConst *fused_node,
+                               const luci::CircleConst *multiplication)
 {
   for (uint32_t i = 0; i < fused_node->size<loco::DataType::FLOAT32>(); i++)
   {

From 51dd43c1df044bc66ccdac8ac6387cd7dab00789 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Wed, 7 Aug 2024 16:34:29 +0200
Subject: [PATCH 17/34] Fix order of cmds

---
 compiler/one-cmds/how-to-use-one-commands.txt | 2 +-
 compiler/one-cmds/onelib/constant.py          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt
index af731943bf3..d6656545ff8 100644
--- a/compiler/one-cmds/how-to-use-one-commands.txt
+++ b/compiler/one-cmds/how-to-use-one-commands.txt
@@ -171,10 +171,10 @@ Current transformation options are
 - fuse_batchnorm_with_conv : This fuses BatchNorm operator to convolution operator
 - fuse_batchnorm_with_dwconv : This fuses BatchNorm operator to depthwise convolution operator
 - fuse_batchnorm_with_tconv : This fuses BatchNorm operator to transpose convolution operator
-- fuse_mul_with_fullyconnected: This fuses Mul operator with the preceding FullyConnected operator if possible
 - fuse_mul_to_fullyconnected_weights : This fuses Mul operator to following FullyConnected operator weights
 - fuse_mul_with_conv: This fuses Mul with a preceding Convolution op if possible.
 - fuse_mul_with_div: This fuses Mul and Div op as Div.
+- fuse_mul_with_fullyconnected: This fuses Mul operator with the preceding FullyConnected operator if possible.
 - fuse_slice_with_tconv: This fuses Slice with a preceding TConv if possible.
 - fuse_bcq: This enables Binary-Coded-bases Quantized DNNs
    - read https://arxiv.org/abs/2005.09904 for detailed information
diff --git a/compiler/one-cmds/onelib/constant.py b/compiler/one-cmds/onelib/constant.py
index 192b258973f..a8dabf139d0 100644
--- a/compiler/one-cmds/onelib/constant.py
+++ b/compiler/one-cmds/onelib/constant.py
@@ -44,7 +44,6 @@ class CONSTANT:
         'fuse_batchnorm_with_dwconv',
         'fuse_batchnorm_with_tconv',
         'fuse_activation_function',
-        'fuse_mul_with_fullyconnected',
         'fuse_mul_to_fullyconnected_weights',
         'fuse_instnorm',
         'fuse_prelu',
@@ -53,6 +52,7 @@ class CONSTANT:
         'fuse_mean_with_mean',
         'fuse_mul_with_conv',
         'fuse_mul_with_div',
+        'fuse_mul_with_fullyconnected',
         'fuse_transpose_with_mean',
         'fuse_slice_with_tconv',
         'fuse_horizontal_fc_layers',
@@ -123,7 +123,6 @@ class CONSTANT:
         ('fuse_batchnorm_with_conv', 'fuse BatchNorm op to Convolution op'),
         ('fuse_batchnorm_with_dwconv', 'fuse BatchNorm op to Depthwise Convolution op'),
         ('fuse_batchnorm_with_tconv', 'fuse BatchNorm op to Transposed Convolution op'),
-        ('fuse_mul_with_fullyconnected', 'fuse Mul op to FullyConnected op'),
         ('fuse_mul_to_fullyconnected_weights',
          'fuse Mul op to following FullyConnected op weights'),
         ('fuse_slice_with_tconv', 'fuse Slice op to Transposed Convolution op'),
@@ -133,6 +132,7 @@ class CONSTANT:
         ('fuse_mean_with_mean', 'fuse two consecutive Mean ops'),
         ('fuse_mul_with_conv', 'fuse Mul op to Convolution op'),
         ('fuse_mul_with_div', 'fuse Mul with Div as Div'),
+        ('fuse_mul_with_fullyconnected', 'fuse Mul op to FullyConnected op'),
         ('fuse_transpose_with_mean',
          'fuse Mean with a preceding Transpose under certain conditions'),
         ('fuse_horizontal_fc_layers',

From e3b354e36b7b747c9918d69cf26ceec2bd976fe2 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Thu, 8 Aug 2024 15:10:05 +0200
Subject: [PATCH 18/34] Remove default arguments

---
 .../src/FuseMulWithFullyConnectedPass.test.cpp     | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
index 7db58b69909..681e2d5aa8d 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -48,8 +48,6 @@ using namespace luci::test;
 class FCMulGraphlet
 {
 public:
-  FCMulGraphlet() = default;
-
   void init(loco::Graph *g, luci::FusedActFunc fc_activation, bool is_mul_scalar)
   {
     std::vector<float> weights_val(DIM_ONE * DIM_TWO);
@@ -123,12 +121,10 @@ class FCMulGraphlet
   luci::CircleConst *_mul_c = nullptr;
 };
 
-class FuseAddWithFCTestGraph : public TestIOGraph, public FCMulGraphlet
+class FuseMulWithFCTestGraph : public TestIOGraph, public FCMulGraphlet
 {
 public:
-  FuseAddWithFCTestGraph() = default;
-
-  void init(luci::FusedActFunc fc_activation = luci::FusedActFunc::NONE, bool is_mul_scalar = false)
+  void init(luci::FusedActFunc fc_activation, bool is_mul_scalar)
   {
     TestIOGraph::init({1, DIM_TWO}, {1, DIM_ONE});
     FCMulGraphlet::init(g(), fc_activation, is_mul_scalar);
@@ -142,7 +138,7 @@ class FuseAddWithFCTestGraph : public TestIOGraph, public FCMulGraphlet
 class FuseMulWithFullyConnectedPassTest : public ::testing::Test
 {
 public:
-  FuseAddWithFCTestGraph g;
+  FuseMulWithFCTestGraph g;
   luci::FuseMulWithFullyConnectedPass pass;
 };
 
@@ -208,7 +204,7 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_is_scalar)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
 {
-  g.init();
+  g.init(luci::FusedActFunc::NONE, false);
 
   // Bias cannot be fused as it's passed as feature map.
   g.to_fm_bias();
@@ -218,7 +214,7 @@ TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG)
 {
-  g.init(luci::FusedActFunc::RELU);
+  g.init(luci::FusedActFunc::RELU, false);
 
   EXPECT_EQ(false, pass.run(g.g()));
 }

From ffc36e996deba96ae7a2cdd8577d47ea70f2da03 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Thu, 8 Aug 2024 15:15:09 +0200
Subject: [PATCH 19/34] Remove default args

---
 .../src/FuseMulWithFullyConnectedPass.test.cpp  | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
index 0043b44bfb9..681e2d5aa8d 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -48,8 +48,6 @@ using namespace luci::test;
 class FCMulGraphlet
 {
 public:
-  FCMulGraphlet() = default;
-
   void init(loco::Graph *g, luci::FusedActFunc fc_activation, bool is_mul_scalar)
   {
     std::vector<float> weights_val(DIM_ONE * DIM_TWO);
@@ -123,12 +121,10 @@ class FCMulGraphlet
   luci::CircleConst *_mul_c = nullptr;
 };
 
-class FuseAddWithFCTestGraph : public TestIOGraph, public FCMulGraphlet
+class FuseMulWithFCTestGraph : public TestIOGraph, public FCMulGraphlet
 {
 public:
-  FuseAddWithFCTestGraph() = default;
-
-  void init(luci::FusedActFunc fc_activation = luci::FusedActFunc::NONE, bool is_mul_scalar = false)
+  void init(luci::FusedActFunc fc_activation, bool is_mul_scalar)
   {
     TestIOGraph::init({1, DIM_TWO}, {1, DIM_ONE});
     FCMulGraphlet::init(g(), fc_activation, is_mul_scalar);
@@ -142,10 +138,12 @@ class FuseAddWithFCTestGraph : public TestIOGraph, public FCMulGraphlet
 class FuseMulWithFullyConnectedPassTest : public ::testing::Test
 {
 public:
-  FuseAddWithFCTestGraph g;
+  FuseMulWithFCTestGraph g;
   luci::FuseMulWithFullyConnectedPass pass;
 };
 
+} // namespace
+
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_not_scalar)
 {
   g.init(luci::FusedActFunc::NONE, false);
@@ -206,7 +204,7 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_is_scalar)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
 {
-  g.init();
+  g.init(luci::FusedActFunc::NONE, false);
 
   // Bias cannot be fused as it's passed as feature map.
   g.to_fm_bias();
@@ -216,8 +214,7 @@ TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG)
 {
-  g.init(luci::FusedActFunc::RELU);
+  g.init(luci::FusedActFunc::RELU, false);
 
   EXPECT_EQ(false, pass.run(g.g()));
 }
-} // namespace

From 31e25edc0339e00aa356e54e8643f953ba0c98bd Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Fri, 9 Aug 2024 16:16:15 +0200
Subject: [PATCH 20/34] Refactor solution and apply comments

---
 .../src/FuseMulWithFullyConnectedPass.cpp     | 35 +++++++++----------
 .../FuseMulWithFullyConnectedPass.test.cpp    |  2 +-
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index 7c6f60ed7c9..ef96635579c 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -43,47 +43,52 @@ inline void update_with_scalar(luci::CircleConst *fused_node,
   }
 }
 
-inline void update_weights(luci::CircleConst *weights, const luci::CircleConst *multiplication)
+luci::CircleConst *gen_fused_weights(luci::CircleConst *weights,
+                                     const luci::CircleConst *multiplication)
 {
+  auto fused_weights = luci::clone(weights);
   // Scalar multiplication:
   if (is_scalar(multiplication))
   {
-    update_with_scalar(weights, multiplication);
+    update_with_scalar(fused_weights, multiplication);
   }
   // N-size multiplication:
   else
   {
     // Go along channels, multiplication size is ensured to be compatible with channels.
-    auto count = weights->dim(0).value();
-    auto size = weights->dim(weights->rank() - 1).value();
+    auto count = fused_weights->dim(0).value();
+    auto size = fused_weights->dim(fused_weights->rank() - 1).value();
     float val;
     for (uint32_t c = 0; c < count; c++)
     {
       val = multiplication->at<loco::DataType::FLOAT32>(c);
       for (uint32_t i = 0; i < size; i++)
       {
-        weights->at<loco::DataType::FLOAT32>(c * size + i) *= val;
+        fused_weights->at<loco::DataType::FLOAT32>(c * size + i) *= val;
       }
     }
   }
+  return fused_weights;
 }
 
-inline void update_bias(luci::CircleConst *bias, const luci::CircleConst *multiplication)
+luci::CircleConst *gen_fused_bias(luci::CircleConst *bias, const luci::CircleConst *multiplication)
 {
+  auto fused_bias = luci::clone(bias);
   // Scalar multiplication:
   if (is_scalar(multiplication))
   {
-    update_with_scalar(bias, multiplication);
+    update_with_scalar(fused_bias, multiplication);
   }
   // N-size multiplication:
   else
   {
     // Go along channels, multiplication size is ensured to be compatible with channels.
-    for (uint32_t i = 0; i < bias->size<loco::DataType::FLOAT32>(); i++)
+    for (uint32_t i = 0; i < fused_bias->size<loco::DataType::FLOAT32>(); i++)
     {
-      bias->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(i);
+      fused_bias->at<loco::DataType::FLOAT32>(i) *= multiplication->at<loco::DataType::FLOAT32>(i);
     }
   }
+  return fused_bias;
 }
 
 /**
@@ -128,7 +133,6 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
   RETURN_FALSE_UNLESS(multiplication);
   // Get rank of multiplication:
   auto rank = multiplication->rank();
-  RETURN_FALSE_UNLESS(rank != 0);
   // Check that all dimensions are ones, checks broadcast capabilites.
   // Last dimesion of multiplication must be compatible with FC.
   // N-D case (N>1):
@@ -157,14 +161,9 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
   RETURN_FALSE_UNLESS(const_bias)
   RETURN_FALSE_UNLESS(const_bias->dtype() == loco::DataType::FLOAT32);
 
-  auto fused_bias = luci::clone(const_bias);
-  // Create new weights to be updated with values:
-  auto fused_weights = luci::clone(weights);
-
-  // Update bias accordingly:
-  update_bias(fused_bias, multiplication);
-  // Update weights accordingly:
-  update_weights(fused_weights, multiplication);
+  // Create new weights and bias with updated values:
+  auto fused_bias = gen_fused_bias(const_bias, multiplication);
+  auto fused_weights = gen_fused_weights(weights, multiplication);
 
   // Replace weights and bias:
   fc->weights(fused_weights);
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
index 681e2d5aa8d..ed38f2db380 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -75,7 +75,7 @@ class FCMulGraphlet
     if (is_mul_scalar)
     {
       mul_values.push_back(static_cast<float>(MUL_VAL));
-      _mul_c = luci::create_const_node(g, loco::DataType::FLOAT32, {1}, mul_values);
+      _mul_c = luci::create_const_node(g, loco::DataType::FLOAT32, {}, mul_values);
     }
     else
     {

From 8b17f47a439719f6aa5a6522f405308972f7a5bf Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Fri, 9 Aug 2024 17:03:58 +0200
Subject: [PATCH 21/34] Add handling of no bias case to pass

---
 .../src/FuseMulWithFullyConnectedPass.cpp     | 29 +++++-----
 .../FuseMulWithFullyConnectedPass.test.cpp    | 56 ++++++++++++++-----
 2 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index ef96635579c..19171707f6d 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -153,21 +153,24 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
   // Only supports:
   // (1) constant bias
   // (2) no bias
-  auto bias = loco::must_cast<luci::CircleNode *>(fc->bias());
-  RETURN_FALSE_UNLESS(bias->opcode() == luci::CircleOpcode::CIRCLECONST or
-                      bias->opcode() == luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE)
-  // Create new bias to be updated with values:
-  auto const_bias = dynamic_cast<luci::CircleConst *>(fc->bias());
-  RETURN_FALSE_UNLESS(const_bias)
-  RETURN_FALSE_UNLESS(const_bias->dtype() == loco::DataType::FLOAT32);
-
-  // Create new weights and bias with updated values:
-  auto fused_bias = gen_fused_bias(const_bias, multiplication);
-  auto fused_weights = gen_fused_weights(weights, multiplication);
+  auto bias = dynamic_cast<luci::CircleNode *>(fc->bias());
+  if (bias != nullptr)
+  {
+    RETURN_FALSE_UNLESS(bias->opcode() == luci::CircleOpcode::CIRCLECONST or
+                        bias->opcode() == luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE)
+    // Create new bias to be updated with values:
+    auto const_bias = dynamic_cast<luci::CircleConst *>(fc->bias());
+    RETURN_FALSE_UNLESS(const_bias)
+    RETURN_FALSE_UNLESS(const_bias->dtype() == loco::DataType::FLOAT32);
+
+    // Create new bias with updated values and replace:
+    auto fused_bias = gen_fused_bias(const_bias, multiplication);
+    fc->bias(fused_bias);
+  }
 
-  // Replace weights and bias:
+  // Create new weights with updated values and replace:
+  auto fused_weights = gen_fused_weights(weights, multiplication);
   fc->weights(fused_weights);
-  fc->bias(fused_bias);
 
   // Set origin and copy Activation Function if exisitng:
   fc->fusedActivationFunction(mul->fusedActivationFunction());
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
index ed38f2db380..a05b4f80ee8 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -48,23 +48,27 @@ using namespace luci::test;
 class FCMulGraphlet
 {
 public:
-  void init(loco::Graph *g, luci::FusedActFunc fc_activation, bool is_mul_scalar)
+  void init(loco::Graph *g, luci::FusedActFunc fc_activation, bool is_mul_scalar, bool use_bias)
   {
+    _fc = g->nodes()->create<luci::CircleFullyConnected>();
+
     std::vector<float> weights_val(DIM_ONE * DIM_TWO);
     for (uint32_t i = 0; i < DIM_ONE * DIM_TWO; i++)
       weights_val.at(i) = i;
 
     _fc_f = luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE, DIM_TWO}, weights_val);
+    _fc->weights(_fc_f);
 
-    std::vector<float> bias_val(DIM_ONE);
-    for (uint32_t i = 0; i < DIM_ONE; i++)
-      bias_val.at(i) = i;
+    if (use_bias)
+    {
+      std::vector<float> bias_val(DIM_ONE);
+      for (uint32_t i = 0; i < DIM_ONE; i++)
+        bias_val.at(i) = i;
 
-    _fc_b = luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE}, bias_val);
+      _fc_b = luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE}, bias_val);
+      _fc->bias(_fc_b);
+    }
 
-    _fc = g->nodes()->create<luci::CircleFullyConnected>();
-    _fc->weights(_fc_f);
-    _fc->bias(_fc_b);
     _fc->fusedActivationFunction(fc_activation);
     _fc->dtype(loco::DataType::FLOAT32);
     _fc->shape({1, DIM_ONE});
@@ -124,10 +128,10 @@ class FCMulGraphlet
 class FuseMulWithFCTestGraph : public TestIOGraph, public FCMulGraphlet
 {
 public:
-  void init(luci::FusedActFunc fc_activation, bool is_mul_scalar)
+  void init(luci::FusedActFunc fc_activation, bool is_mul_scalar, bool use_bias)
   {
     TestIOGraph::init({1, DIM_TWO}, {1, DIM_ONE});
-    FCMulGraphlet::init(g(), fc_activation, is_mul_scalar);
+    FCMulGraphlet::init(g(), fc_activation, is_mul_scalar, use_bias);
 
     _fc->input(input());
 
@@ -146,7 +150,7 @@ class FuseMulWithFullyConnectedPassTest : public ::testing::Test
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_not_scalar)
 {
-  g.init(luci::FusedActFunc::NONE, false);
+  g.init(luci::FusedActFunc::NONE, false, true);
 
   EXPECT_EQ(true, pass.run(g.g()));
 
@@ -175,7 +179,7 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_not_scalar)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_is_scalar)
 {
-  g.init(luci::FusedActFunc::NONE, true);
+  g.init(luci::FusedActFunc::NONE, true, true);
 
   EXPECT_EQ(true, pass.run(g.g()));
 
@@ -202,9 +206,33 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_is_scalar)
   }
 }
 
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_no_bias)
+{
+  g.init(luci::FusedActFunc::NONE, false, false);
+
+  EXPECT_EQ(true, pass.run(g.g()));
+
+  auto fc = dynamic_cast<luci::CircleFullyConnected *>(g.output()->from());
+  EXPECT_NE(nullptr, fc);
+  EXPECT_EQ(nullptr, fc->bias());
+
+  auto weights = loco::must_cast<luci::CircleConst *>(g.fc()->weights());
+  auto weights_n = weights->dim(0).value();
+  auto weights_m = weights->dim(1).value();
+  uint32_t offset = 0;
+  for (uint32_t i = 0; i < weights_n; i++)
+  {
+    for (uint32_t j = 0; j < weights_m; j++)
+    {
+      offset = i * weights_m + j;
+      EXPECT_EQ(i * offset, weights->at<loco::DataType::FLOAT32>(offset));
+    }
+  }
+}
+
 TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
 {
-  g.init(luci::FusedActFunc::NONE, false);
+  g.init(luci::FusedActFunc::NONE, false, true);
 
   // Bias cannot be fused as it's passed as feature map.
   g.to_fm_bias();
@@ -214,7 +242,7 @@ TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG)
 {
-  g.init(luci::FusedActFunc::RELU, false);
+  g.init(luci::FusedActFunc::RELU, false, true);
 
   EXPECT_EQ(false, pass.run(g.g()));
 }

From 715cdf7d0b18cdf96bf43141e99645dda44be06b Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Fri, 9 Aug 2024 17:18:31 +0200
Subject: [PATCH 22/34] Remove random newline

---
 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index c4fd6438468..19171707f6d 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -29,7 +29,6 @@ namespace
   if (not(cond))                  \
     return false;
 
-
 inline bool is_scalar(const luci::CircleConst *node)
 {
   return ((node->rank() == 1 || node->rank() == 0) && node->size<loco::DataType::FLOAT32>() == 1);

From 9e22b260a8e06cc1a624b764199c2e5bcc785d81 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 12 Aug 2024 13:12:43 +0200
Subject: [PATCH 23/34] Apply comments, refactor tests and add proper handling
 of OUTPUTEXCLUDE

---
 .../src/FuseMulWithFullyConnectedPass.cpp     | 17 ++++++-----
 .../FuseMulWithFullyConnectedPass.test.cpp    | 30 +++++++++++--------
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index 19171707f6d..d12d129fef3 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -136,7 +136,7 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
   // Check that all dimensions are ones, checks broadcast capabilites.
   // Last dimesion of multiplication must be compatible with FC.
   // N-D case (N>1):
-  if (multiplication->rank() > 1)
+  if (multiplication->rank() >= 1)
   {
     // Check channel-wise broadcasting:
     for (uint32_t i = 0; i < rank - 1; i++)
@@ -145,28 +145,29 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
     RETURN_FALSE_UNLESS(multiplication->dim(rank - 1) == weights->dim(0));
   }
   // Scalar case:
-  else if (multiplication->rank() == 1 || multiplication->rank() == 0)
+  else if (multiplication->rank() == 0)
   {
-    RETURN_FALSE_UNLESS(multiplication->size<loco::DataType::FLOAT32>() != 0);
+    RETURN_FALSE_UNLESS(multiplication->size<loco::DataType::FLOAT32>() == 1);
   }
 
   // Only supports:
   // (1) constant bias
   // (2) no bias
-  auto bias = dynamic_cast<luci::CircleNode *>(fc->bias());
-  if (bias != nullptr)
+  auto bias = loco::must_cast<luci::CircleNode *>(fc->bias());
+  if (bias->opcode() == luci::CircleOpcode::CIRCLECONST)
   {
-    RETURN_FALSE_UNLESS(bias->opcode() == luci::CircleOpcode::CIRCLECONST or
-                        bias->opcode() == luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE)
     // Create new bias to be updated with values:
     auto const_bias = dynamic_cast<luci::CircleConst *>(fc->bias());
     RETURN_FALSE_UNLESS(const_bias)
     RETURN_FALSE_UNLESS(const_bias->dtype() == loco::DataType::FLOAT32);
-
     // Create new bias with updated values and replace:
     auto fused_bias = gen_fused_bias(const_bias, multiplication);
     fc->bias(fused_bias);
   }
+  else if (bias->opcode() != luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE)
+  {
+    return false;
+  }
 
   // Create new weights with updated values and replace:
   auto fused_weights = gen_fused_weights(weights, multiplication);
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
index a05b4f80ee8..527fb7d9531 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -66,8 +66,13 @@ class FCMulGraphlet
         bias_val.at(i) = i;
 
       _fc_b = luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE}, bias_val);
-      _fc->bias(_fc_b);
     }
+    else
+    {
+      // Create CircleOutputExclude -- no bias
+      _fc_b = g->nodes()->create<luci::CircleOutputExclude>();
+    }
+    _fc->bias(_fc_b);
 
     _fc->fusedActivationFunction(fc_activation);
     _fc->dtype(loco::DataType::FLOAT32);
@@ -101,7 +106,7 @@ class FCMulGraphlet
     }
     else
     {
-      _mul->shape({1, DIM_ONE});
+      _mul->shape({1, 1, 1, DIM_ONE});
     }
     _mul->name("mul");
   }
@@ -121,7 +126,7 @@ class FCMulGraphlet
   luci::CircleFullyConnected *_fc = nullptr;
   luci::CircleMul *_mul = nullptr;
   luci::CircleConst *_fc_f = nullptr;
-  luci::CircleConst *_fc_b = nullptr;
+  luci::CircleNode *_fc_b = nullptr;
   luci::CircleConst *_mul_c = nullptr;
 };
 
@@ -148,9 +153,9 @@ class FuseMulWithFullyConnectedPassTest : public ::testing::Test
 
 } // namespace
 
-TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_not_scalar)
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_mul_tensor)
 {
-  g.init(luci::FusedActFunc::NONE, false, true);
+  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */);
 
   EXPECT_EQ(true, pass.run(g.g()));
 
@@ -177,9 +182,9 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_not_scalar)
   }
 }
 
-TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_is_scalar)
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_mul_scalar)
 {
-  g.init(luci::FusedActFunc::NONE, true, true);
+  g.init(luci::FusedActFunc::NONE, true /* is_mul_scalar */, true /* use_bias */);
 
   EXPECT_EQ(true, pass.run(g.g()));
 
@@ -206,15 +211,16 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_is_scalar)
   }
 }
 
-TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_no_bias)
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_no_bias)
 {
-  g.init(luci::FusedActFunc::NONE, false, false);
+  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, false /* use_bias */);
 
   EXPECT_EQ(true, pass.run(g.g()));
 
   auto fc = dynamic_cast<luci::CircleFullyConnected *>(g.output()->from());
   EXPECT_NE(nullptr, fc);
-  EXPECT_EQ(nullptr, fc->bias());
+  auto no_bias = dynamic_cast<luci::CircleOutputExclude *>(fc->bias());
+  ASSERT_NE(nullptr, no_bias);
 
   auto weights = loco::must_cast<luci::CircleConst *>(g.fc()->weights());
   auto weights_n = weights->dim(0).value();
@@ -232,7 +238,7 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_without_activation_mul_no_bias)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
 {
-  g.init(luci::FusedActFunc::NONE, false, true);
+  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */);
 
   // Bias cannot be fused as it's passed as feature map.
   g.to_fm_bias();
@@ -242,7 +248,7 @@ TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG)
 {
-  g.init(luci::FusedActFunc::RELU, false, true);
+  g.init(luci::FusedActFunc::RELU, false /* is_mul_scalar */, true /* use_bias */);
 
   EXPECT_EQ(false, pass.run(g.g()));
 }

From 1b6c71f2c4a5076bc6fab54b8790c277f50e7efe Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 12 Aug 2024 13:36:20 +0200
Subject: [PATCH 24/34] Resolve one-cmds duplication

---
 compiler/one-cmds/how-to-use-one-commands.txt | 1 -
 compiler/one-cmds/onelib/constant.py          | 2 --
 2 files changed, 3 deletions(-)

diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt
index dc138d4c295..d6656545ff8 100644
--- a/compiler/one-cmds/how-to-use-one-commands.txt
+++ b/compiler/one-cmds/how-to-use-one-commands.txt
@@ -171,7 +171,6 @@ Current transformation options are
 - fuse_batchnorm_with_conv : This fuses BatchNorm operator to convolution operator
 - fuse_batchnorm_with_dwconv : This fuses BatchNorm operator to depthwise convolution operator
 - fuse_batchnorm_with_tconv : This fuses BatchNorm operator to transpose convolution operator
-- fuse_mul_with_fullyconnected: This fuses Mul operator with the preceding FullyConnected operator if possible
 - fuse_mul_to_fullyconnected_weights : This fuses Mul operator to following FullyConnected operator weights
 - fuse_mul_with_conv: This fuses Mul with a preceding Convolution op if possible.
 - fuse_mul_with_div: This fuses Mul and Div op as Div.
diff --git a/compiler/one-cmds/onelib/constant.py b/compiler/one-cmds/onelib/constant.py
index e9630131876..a8dabf139d0 100644
--- a/compiler/one-cmds/onelib/constant.py
+++ b/compiler/one-cmds/onelib/constant.py
@@ -44,7 +44,6 @@ class CONSTANT:
         'fuse_batchnorm_with_dwconv',
         'fuse_batchnorm_with_tconv',
         'fuse_activation_function',
-        'fuse_mul_with_fullyconnected',
         'fuse_mul_to_fullyconnected_weights',
         'fuse_instnorm',
         'fuse_prelu',
@@ -124,7 +123,6 @@ class CONSTANT:
         ('fuse_batchnorm_with_conv', 'fuse BatchNorm op to Convolution op'),
         ('fuse_batchnorm_with_dwconv', 'fuse BatchNorm op to Depthwise Convolution op'),
         ('fuse_batchnorm_with_tconv', 'fuse BatchNorm op to Transposed Convolution op'),
-        ('fuse_mul_with_fullyconnected', 'fuse Mul op to FullyConnected op'),
         ('fuse_mul_to_fullyconnected_weights',
          'fuse Mul op to following FullyConnected op weights'),
         ('fuse_slice_with_tconv', 'fuse Slice op to Transposed Convolution op'),

From 8977ef9620ada75fe696bf7e18201d4f9a24a99e Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 12 Aug 2024 13:37:24 +0200
Subject: [PATCH 25/34] Handle rank 0 and 1

---
 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index d12d129fef3..bceb86e2f5e 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -136,7 +136,7 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
   // Check that all dimensions are ones, checks broadcast capabilites.
   // Last dimesion of multiplication must be compatible with FC.
   // N-D case (N>1):
-  if (multiplication->rank() >= 1)
+  if (multiplication->rank() > 1)
   {
     // Check channel-wise broadcasting:
     for (uint32_t i = 0; i < rank - 1; i++)
@@ -144,7 +144,12 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
     // Check the last dimesion of Mul is the same with the first dimension of FullyConnected
     RETURN_FALSE_UNLESS(multiplication->dim(rank - 1) == weights->dim(0));
   }
-  // Scalar case:
+  // 1-D or scalar case:
+  else if (multiplication->rank() == 1)
+  {
+    RETURN_FALSE_UNLESS(multiplication->size<loco::DataType::FLOAT32>() == 1 ||
+                        multiplication->size<loco::DataType::FLOAT32>() == weights->dim(0));
+  }
   else if (multiplication->rank() == 0)
   {
     RETURN_FALSE_UNLESS(multiplication->size<loco::DataType::FLOAT32>() == 1);

From 53aa943893946d1d07fde7ff18e7957c7099488d Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 12 Aug 2024 15:40:56 +0200
Subject: [PATCH 26/34] Add new testcase

---
 compiler/luci-pass-value-py-test/test.lst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/compiler/luci-pass-value-py-test/test.lst b/compiler/luci-pass-value-py-test/test.lst
index 91a77e4ef7f..d610f980bf2 100644
--- a/compiler/luci-pass-value-py-test/test.lst
+++ b/compiler/luci-pass-value-py-test/test.lst
@@ -36,6 +36,7 @@ eval(Net_FullyConnected_Add_000 fold_fully_connected)
 eval(Net_FullyConnected_Mul_000 fuse_mul_with_fullyconnected)
 eval(Net_FullyConnected_Mul_001 fuse_mul_with_fullyconnected)
 eval(Net_FullyConnected_Mul_002 fuse_mul_with_fullyconnected)
+eval(Net_FullyConnected_Mul_003 fuse_mul_with_fullyconnected)
 eval(Net_Horizontal_FullyConnected_Add_000 fuse_horizontal_fc_layers)
 eval(Net_InstanceNorm_001 fuse_instnorm)
 eval(Net_InstanceNorm_002 fuse_instnorm)

From 0c2bb718cd3f9c168ed27e0fc0c8e6211c8aad55 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 12 Aug 2024 15:41:50 +0200
Subject: [PATCH 27/34] Add new testcase

---
 compiler/circle2circle-dredd-recipe-test/test.lst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index b33aa6fd0cf..e6ee1dc8de1 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -51,6 +51,7 @@ Add(Net_FullyConnected_Add_000 PASS fold_fully_connected)
 Add(Net_FullyConnected_Mul_000 PASS fuse_mul_with_fullyconnected)
 Add(Net_FullyConnected_Mul_001 PASS fuse_mul_with_fullyconnected)
 Add(Net_FullyConnected_Mul_002 PASS fuse_mul_with_fullyconnected)
+Add(Net_FullyConnected_Mul_003 PASS fuse_mul_with_fullyconnected)
 Add(Net_Gelu_000 PASS fuse_gelu)
 Add(Net_Gelu_001 PASS fuse_gelu)
 Add(Net_Horizontal_FullyConnected_Add_000 PASS fuse_horizontal_fc_layers)

From e3ff5176161a34d81ff3fff5697930a8b4e7087c Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 12 Aug 2024 15:44:22 +0200
Subject: [PATCH 28/34] [res/tfl_recipes] Add new Net_FullyConnected_Mul

This commit extends the Net_FullyConnected_Mul tflite recipes with a 'no bias' case.

ONE-DCO-1.0-Signed-off-by: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
---
 .../Net_FullyConnected_Mul_003/test.recipe    | 57 +++++++++++++++++++
 .../Net_FullyConnected_Mul_003/test.rule      | 13 +++++
 2 files changed, 70 insertions(+)
 create mode 100644 res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe
 create mode 100644 res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.rule

diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe
new file mode 100644
index 00000000000..4ac85574c64
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe
@@ -0,0 +1,57 @@
+operand {
+    name: "ifm"
+    type: FLOAT32
+    shape { dim: 3 dim: 1 dim: 4 }
+}
+operand {
+    name: "fc_wgt"
+    type: FLOAT32
+    shape { dim: 6 dim: 4 }
+    filler {
+        tag: "gaussian"
+        arg: "0.0"
+        arg: "1.0"
+    }
+}
+operand {
+    name: "B"
+    type: FLOAT32
+    shape { dim: 6 }
+    filler {
+        tag: "gaussian"
+        arg: "0.0"
+        arg: "1.0"
+    }
+}
+operand {
+    name: "fc_out"
+    type: FLOAT32
+    shape: { dim: 3 dim: 1 dim: 6 }
+}
+operand {
+    name: "mul_out"
+    type: FLOAT32
+    shape: { dim: 3 dim: 1 dim: 6 }
+}
+operation {
+    type: "FullyConnected"
+    fullyconnected_options {
+        activation: NONE
+        keep_num_dims: true
+    }
+    input: "ifm"
+    input: "fc_wgt"
+    input: ""
+    output: "fc_out"
+}
+operation {
+    type: "Mul"
+    mul_options {
+        activation: RELU
+    }
+    input: "fc_out"
+    input: "B"
+    output: "mul_out"
+}
+input: "ifm"
+output: "mul_out"
diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.rule b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.rule
new file mode 100644
index 00000000000..16bb2ff2788
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.rule
@@ -0,0 +1,13 @@
+# This checks if:
+#   Mul(FC(input, weights, _), other)
+# is converted to:
+#   FC(input, Mul(weights, other), _)
+# and then Mul is fused to:
+#   FC(input, weights', _)
+# Here the bias is empty/excluded "_".
+# Thus Mul is only fused with weights.
+
+RULE    "VERIFY_FILE_FORMAT"      $(verify_file_format) '=' 1
+
+RULE    "NO_MUL"                  $(op_count MUL) '=' 0
+RULE    "FC_EXIST"                $(op_count FULLY_CONNECTED) '=' 1

From 678869c471693244d7f0a27620d888901e5d4d0c Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Tue, 13 Aug 2024 13:22:34 +0200
Subject: [PATCH 29/34] Change name of operand from B to scale

---
 .../Net_FullyConnected_Mul_003/test.recipe                    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe
index 4ac85574c64..2883ebabdf0 100644
--- a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe
+++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe
@@ -14,7 +14,7 @@ operand {
     }
 }
 operand {
-    name: "B"
+    name: "scale"
     type: FLOAT32
     shape { dim: 6 }
     filler {
@@ -50,7 +50,7 @@ operation {
         activation: RELU
     }
     input: "fc_out"
-    input: "B"
+    input: "scale"
     output: "mul_out"
 }
 input: "ifm"

From b0851814408fbddf8b5fdb38a23339a29b19aa83 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Tue, 13 Aug 2024 13:52:27 +0200
Subject: [PATCH 30/34] Update names from scalar to single element

---
 .../pass/src/FuseMulWithFullyConnectedPass.cpp   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index bceb86e2f5e..c2951d7fda2 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -29,12 +29,12 @@ namespace
   if (not(cond))                  \
     return false;
 
-inline bool is_scalar(const luci::CircleConst *node)
+inline bool is_single_element(const luci::CircleConst *node)
 {
   return ((node->rank() == 1 || node->rank() == 0) && node->size<loco::DataType::FLOAT32>() == 1);
 }
 
-inline void update_with_scalar(luci::CircleConst *fused_node,
+inline void update_with_single_element(luci::CircleConst *fused_node,
                                const luci::CircleConst *multiplication)
 {
   for (uint32_t i = 0; i < fused_node->size<loco::DataType::FLOAT32>(); i++)
@@ -47,10 +47,10 @@ luci::CircleConst *gen_fused_weights(luci::CircleConst *weights,
                                      const luci::CircleConst *multiplication)
 {
   auto fused_weights = luci::clone(weights);
-  // Scalar multiplication:
-  if (is_scalar(multiplication))
+  // Single element multiplication:
+  if (is_single_element(multiplication))
   {
-    update_with_scalar(fused_weights, multiplication);
+    update_with_single_element(fused_weights, multiplication);
   }
   // N-size multiplication:
   else
@@ -74,10 +74,10 @@ luci::CircleConst *gen_fused_weights(luci::CircleConst *weights,
 luci::CircleConst *gen_fused_bias(luci::CircleConst *bias, const luci::CircleConst *multiplication)
 {
   auto fused_bias = luci::clone(bias);
-  // Scalar multiplication:
-  if (is_scalar(multiplication))
+  // Single element multiplication:
+  if (is_single_element(multiplication))
   {
-    update_with_scalar(fused_bias, multiplication);
+    update_with_single_element(fused_bias, multiplication);
   }
   // N-size multiplication:
   else

From 1aa79ccb6192741c573fb85fbf18177479ec6d9e Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Tue, 13 Aug 2024 13:52:45 +0200
Subject: [PATCH 31/34] Update tests

---
 .../pass/src/FuseMulWithFullyConnectedPass.test.cpp   | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
index 527fb7d9531..821f4ff3d5c 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -102,7 +102,7 @@ class FCMulGraphlet
     _mul->dtype(loco::DataType::FLOAT32);
     if (is_mul_scalar)
     {
-      _mul->shape({1});
+      _mul->shape({1, DIM_ONE});
     }
     else
     {
@@ -252,3 +252,12 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG)
 
   EXPECT_EQ(false, pass.run(g.g()));
 }
+
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_null_weights_NEG)
+{
+  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */);
+
+  g.fc()->weights(nullptr);
+
+  EXPECT_EQ(false, pass.run(g.g()));
+}

From 1bb278d49f59912206f82e8037c29c7a3b87edd3 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Tue, 13 Aug 2024 15:44:32 +0200
Subject: [PATCH 32/34] Fix codestyle

---
 compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index c2951d7fda2..cd9face54d6 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -35,7 +35,7 @@ inline bool is_single_element(const luci::CircleConst *node)
 }
 
 inline void update_with_single_element(luci::CircleConst *fused_node,
-                               const luci::CircleConst *multiplication)
+                                       const luci::CircleConst *multiplication)
 {
   for (uint32_t i = 0; i < fused_node->size<loco::DataType::FLOAT32>(); i++)
   {

From 79a2213d063bbf85e0209a5ec06dc5030cc7c80c Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Wed, 14 Aug 2024 13:48:58 +0200
Subject: [PATCH 33/34] Search from mul, update tests

---
 .../src/FuseMulWithFullyConnectedPass.cpp     | 37 +++++------
 .../FuseMulWithFullyConnectedPass.test.cpp    | 65 +++++++++++++++----
 2 files changed, 71 insertions(+), 31 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index cd9face54d6..c724f832b94 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -16,12 +16,12 @@
 
 #include "luci/Pass/FuseMulWithFullyConnectedPass.h"
 
+#include "helpers/NodeFiller.h"
+
 #include <luci/IR/CircleNodes.h>
 #include <luci/Service/Nodes/CircleConst.h>
 #include <luci/Profile/CircleNodeOrigin.h>
 
-#include <cmath>
-
 namespace
 {
 
@@ -107,10 +107,19 @@ luci::CircleConst *gen_fused_bias(luci::CircleConst *bias, const luci::CircleCon
  *                |
  *
  */
-bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
+bool fuse_mul_with_fc(luci::CircleMul *mul)
 {
   // Sanity check:
-  RETURN_FALSE_UNLESS(fc);
+  RETURN_FALSE_UNLESS(mul);
+  // Allow Mul node only with FLOAT32 data type:
+  RETURN_FALSE_UNLESS(mul->dtype() == loco::DataType::FLOAT32);
+  // Check if any FC node connects to Mul.
+  // Find the pattern of Mul(FC, CircleConst):
+  luci::CircleFullyConnected *fc = nullptr;
+  luci::CircleConst *multiplication = nullptr;
+  RETURN_FALSE_UNLESS(luci::fill(&fc, &multiplication).with_commutative_args_of(mul));
+  // Make sure that FullyConnected has only one successor:
+  RETURN_FALSE_UNLESS(loco::succs(fc).size() == 1);
   // Allow only FLOAT32 data type:
   RETURN_FALSE_UNLESS(fc->dtype() == loco::DataType::FLOAT32);
   // Allow only without activation functions as values are going to
@@ -119,18 +128,6 @@ bool fuse_mul_with_fc(luci::CircleFullyConnected *fc)
   // Check for weights being Constant:
   auto weights = dynamic_cast<luci::CircleConst *>(fc->weights());
   RETURN_FALSE_UNLESS(weights);
-  // Get Mul node:
-  auto fc_output = loco::succs(fc);
-  // Make sure that FullyConnected has only one child:
-  RETURN_FALSE_UNLESS(fc_output.size() == 1);
-  auto mul = dynamic_cast<luci::CircleMul *>(*fc_output.begin());
-  RETURN_FALSE_UNLESS(mul);
-  // Allow Mul node only with FLOAT32 data type:
-  RETURN_FALSE_UNLESS(mul->dtype() == loco::DataType::FLOAT32);
-  // Get multiplication Constant (here: the second input besides weights):
-  auto multiplication = mul->x() == fc ? dynamic_cast<luci::CircleConst *>(mul->y())
-                                       : dynamic_cast<luci::CircleConst *>(mul->x());
-  RETURN_FALSE_UNLESS(multiplication);
   // Get rank of multiplication:
   auto rank = multiplication->rank();
   // Check that all dimensions are ones, checks broadcast capabilites.
@@ -197,14 +194,14 @@ bool FuseMulWithFullyConnectedPass::run(loco::Graph *g)
   bool changed = false;
   for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
-    auto fc = dynamic_cast<luci::CircleFullyConnected *>(node);
-    if (not fc)
+    auto mul = dynamic_cast<luci::CircleMul *>(node);
+    if (not mul)
       continue;
 
-    switch (fc->dtype())
+    switch (mul->dtype())
     {
       case loco::DataType::FLOAT32:
-        if (fuse_mul_with_fc(fc))
+        if (fuse_mul_with_fc(mul))
           changed = true;
         break;
       default:
diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
index 821f4ff3d5c..a4f9d6bf087 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp
@@ -34,13 +34,22 @@ using namespace luci::test;
 /**
  *  Graph for this test
  *
- *  BEFORE
+ *  BEFORE (without extra_fc_successor)
  *
  *         [FC]
  *           |
  *     [Mul w/ Relu]
  *
- *  AFTER
+ *  BEFORE (with extra_fc_successor)
+ *
+ *         [FC]
+ *           |
+ *           |-------------------
+ *           |                  |
+ *           |                  |
+ *     [Mul w/ Relu]       [other FC]
+ *
+ *  AFTER (if pass applied)
  *
  *      [FC w/ Relu] (weights and bias updated)
  *
@@ -48,7 +57,8 @@ using namespace luci::test;
 class FCMulGraphlet
 {
 public:
-  void init(loco::Graph *g, luci::FusedActFunc fc_activation, bool is_mul_scalar, bool use_bias)
+  void init(loco::Graph *g, luci::FusedActFunc fc_activation, bool is_mul_scalar, bool use_bias,
+            bool extra_successor)
   {
     _fc = g->nodes()->create<luci::CircleFullyConnected>();
 
@@ -79,6 +89,22 @@ class FCMulGraphlet
     _fc->shape({1, DIM_ONE});
     _fc->name("fc");
 
+    if (extra_successor)
+    {
+      _extra_succ = g->nodes()->create<luci::CircleFullyConnected>();
+      // Set previous FC as input to bump number of successors for it:
+      _extra_succ->input(_fc);
+      std::vector<float> weights_val(DIM_ONE * DIM_TWO);
+      _extra_f =
+        luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE, DIM_TWO}, weights_val);
+      _extra_succ->weights(_extra_f);
+      _extra_succ->bias(nullptr);
+      _extra_succ->fusedActivationFunction(luci::FusedActFunc::NONE);
+      _extra_succ->dtype(loco::DataType::FLOAT32);
+      _extra_succ->shape({1, DIM_ONE});
+      _extra_succ->name("extra_fc");
+    }
+
     std::vector<float> mul_values;
 
     if (is_mul_scalar)
@@ -128,15 +154,18 @@ class FCMulGraphlet
   luci::CircleConst *_fc_f = nullptr;
   luci::CircleNode *_fc_b = nullptr;
   luci::CircleConst *_mul_c = nullptr;
+  luci::CircleFullyConnected *_extra_succ = nullptr;
+  luci::CircleConst *_extra_f = nullptr;
 };
 
 class FuseMulWithFCTestGraph : public TestIOGraph, public FCMulGraphlet
 {
 public:
-  void init(luci::FusedActFunc fc_activation, bool is_mul_scalar, bool use_bias)
+  void init(luci::FusedActFunc fc_activation, bool is_mul_scalar, bool use_bias,
+            bool extra_successor)
   {
     TestIOGraph::init({1, DIM_TWO}, {1, DIM_ONE});
-    FCMulGraphlet::init(g(), fc_activation, is_mul_scalar, use_bias);
+    FCMulGraphlet::init(g(), fc_activation, is_mul_scalar, use_bias, extra_successor);
 
     _fc->input(input());
 
@@ -155,7 +184,8 @@ class FuseMulWithFullyConnectedPassTest : public ::testing::Test
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_mul_tensor)
 {
-  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */);
+  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */,
+         false /* extra_successor */);
 
   EXPECT_EQ(true, pass.run(g.g()));
 
@@ -184,7 +214,8 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_mul_tensor)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_mul_scalar)
 {
-  g.init(luci::FusedActFunc::NONE, true /* is_mul_scalar */, true /* use_bias */);
+  g.init(luci::FusedActFunc::NONE, true /* is_mul_scalar */, true /* use_bias */,
+         false /* extra_successor */);
 
   EXPECT_EQ(true, pass.run(g.g()));
 
@@ -213,7 +244,8 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_mul_scalar)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_no_bias)
 {
-  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, false /* use_bias */);
+  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, false /* use_bias */,
+         false /* extra_successor */);
 
   EXPECT_EQ(true, pass.run(g.g()));
 
@@ -238,7 +270,8 @@ TEST_F(FuseMulWithFullyConnectedPassTest, fc_no_bias)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
 {
-  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */);
+  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */,
+         false /* extra_successor */);
 
   // Bias cannot be fused as it's passed as feature map.
   g.to_fm_bias();
@@ -248,16 +281,26 @@ TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG)
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG)
 {
-  g.init(luci::FusedActFunc::RELU, false /* is_mul_scalar */, true /* use_bias */);
+  g.init(luci::FusedActFunc::RELU, false /* is_mul_scalar */, true /* use_bias */,
+         false /* extra_successor */);
 
   EXPECT_EQ(false, pass.run(g.g()));
 }
 
 TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_null_weights_NEG)
 {
-  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */);
+  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */,
+         false /* extra_successor */);
 
   g.fc()->weights(nullptr);
 
   EXPECT_EQ(false, pass.run(g.g()));
 }
+
+TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_extra_successor_NEG)
+{
+  g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */,
+         true /* extra_successor */);
+
+  EXPECT_EQ(false, pass.run(g.g()));
+}

From 550e798f3f88c2a2edc9043e50765411280a6a30 Mon Sep 17 00:00:00 2001
From: Jan Iwaszkiewicz <j.iwaszkiewi@samsung.com>
Date: Mon, 19 Aug 2024 09:49:39 +0200
Subject: [PATCH 34/34] Annotate requirement of one successor and refactor
 checks

---
 .../src/FuseMulWithFullyConnectedPass.cpp     | 48 ++++++++++++++-----
 1 file changed, 36 insertions(+), 12 deletions(-)

diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
index c724f832b94..d4fb75953ed 100644
--- a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
+++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp
@@ -118,7 +118,39 @@ bool fuse_mul_with_fc(luci::CircleMul *mul)
   luci::CircleFullyConnected *fc = nullptr;
   luci::CircleConst *multiplication = nullptr;
   RETURN_FALSE_UNLESS(luci::fill(&fc, &multiplication).with_commutative_args_of(mul));
-  // Make sure that FullyConnected has only one successor:
+  /**
+   *  Make sure that FullyConnected has only one successor.
+   *
+   *  If the FullyConnected output is consumed by more than one node,
+   *  the original FullyConnected must be kept for the other successors,
+   *  so a successful fusion would merely replace Mul with an additional,
+   *  fused FullyConnected without reducing the overall number of nodes.
+   *  That tends to increase the model's size and degrade its performance.
+   *  Thus exactly one successor is required to benefit from this pass.
+   *
+   *  Example graph that illustrates the described scenario:
+   *
+   *  BEFORE
+   *                |
+   *      [CircleFullyConnected]
+   *                |
+   *        +-------+----------------+
+   *        |                        |
+   *        |                        |
+   *  [Other Node]              [CircleMul]
+   *        |                        |
+   *
+   *  AFTER
+   *                |
+   *      [CircleFullyConnected]
+   *                |
+   *        +-------+-----------------------+
+   *        |                               |
+   *        |                               |
+   *  [Other Node]       [New CircleFullyConnected Fused with Mul]
+   *        |                               |
+   *
+   */
   RETURN_FALSE_UNLESS(loco::succs(fc).size() == 1);
   // Allow only FLOAT32 data type:
   RETURN_FALSE_UNLESS(fc->dtype() == loco::DataType::FLOAT32);
@@ -194,18 +226,10 @@ bool FuseMulWithFullyConnectedPass::run(loco::Graph *g)
   bool changed = false;
   for (auto node : loco::active_nodes(loco::output_nodes(g)))
   {
-    auto mul = dynamic_cast<luci::CircleMul *>(node);
-    if (not mul)
-      continue;
-
-    switch (mul->dtype())
+    if (auto mul = dynamic_cast<luci::CircleMul *>(node))
     {
-      case loco::DataType::FLOAT32:
-        if (fuse_mul_with_fc(mul))
-          changed = true;
-        break;
-      default:
-        break;
+      if (fuse_mul_with_fc(mul))
+        changed = true;
     }
   }