From 4f5a31cb6d289c421e3e7273c4dc4f5aa03c2ac8 Mon Sep 17 00:00:00 2001 From: mouliraj-mcw Date: Mon, 11 Nov 2024 09:43:54 +0000 Subject: [PATCH 1/5] Add forward support for prelu --- .../eltwise/test_binary_composite.py | 22 ++++++++++ .../metal/llk_api/llk_math_unary_sfpu_api.h | 1 + .../llk_api/llk_sfpu/ckernel_sfpu_prelu.h | 38 ++++++++++++++++ .../llk_math_eltwise_unary_sfpu_prelu.h | 29 +++++++++++++ .../metal/llk_api/llk_sfpu_types.h | 1 + .../compute_kernel_api/eltwise_unary/prelu.h | 43 +++++++++++++++++++ .../eltwise_unary/sfpu_split_includes.h | 4 ++ .../eltwise/binary/binary_composite.hpp | 11 +++++ .../eltwise/binary/binary_pybind.hpp | 11 ++++- .../binary/device/binary_composite_op.cpp | 9 ++++ .../eltwise/unary/common/unary_op_types.hpp | 3 +- .../eltwise/unary/common/unary_op_utils.cpp | 10 +++++ .../eltwise/unary/common/unary_op_utils.hpp | 1 + .../ttnn/operations/eltwise/unary/unary.cpp | 27 ++++++++++++ .../ttnn/operations/eltwise/unary/unary.hpp | 17 ++++++++ ttnn/ttnn/__init__.py | 4 -- ttnn/ttnn/operations/unary.py | 11 ++++- 17 files changed, 234 insertions(+), 8 deletions(-) create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_prelu.h create mode 100644 tt_metal/include/compute_kernel_api/eltwise_unary/prelu.h diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py index 037a6adee65..b540073527e 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py @@ -959,3 +959,25 @@ def test_binary_lcm_ttnn(input_shapes, device): comp_pass = compare_pcc([output_tensor], [golden_tensor]) assert comp_pass + + +@pytest.mark.parametrize( + "input_shapes", + ( + (torch.Size([1, 3, 32, 32])), + (torch.Size([1, 6, 32, 32])), + (torch.Size([1, 7, 320, 384])), + (torch.Size([1, 4, 320, 384])), + ), +) +def test_binary_prelu_ttnn(input_shapes, device): + in_data1, input_tensor1 = data_gen_with_range(input_shapes, -100, 100, device) + channels = input_shapes[1] + in_data2 = torch.rand((channels,), dtype=torch.bfloat16) * 200 - 100 + input_tensor2 = ttnn.from_torch(in_data2, layout=ttnn.TILE_LAYOUT, device=device) + output_tensor = ttnn.prelu(input_tensor1, input_tensor2) + golden_function = ttnn.get_golden_function(ttnn.prelu) + golden_tensor = golden_function(in_data1, in_data2) + + comp_pass = compare_pcc([output_tensor], [golden_tensor]) + assert comp_pass diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h index f337c6a0c6d..78104b0ce32 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h @@ -36,3 +36,4 @@ #include "llk_math_eltwise_unary_sfpu_right_shift.h" #include "llk_math_eltwise_unary_sfpu_left_shift.h" #include "llk_math_eltwise_unary_sfpu_fill.h" +#include "llk_math_eltwise_unary_sfpu_prelu.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h new file mode 100644 index 00000000000..a50eec93d28 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h @@ -0,0 +1,38 @@ +// 
SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "noc_nonblocking_api.h"
+#include "ckernel_sfpu_converter.h"
+
+
+using namespace sfpi;
+
+namespace ckernel {
+namespace sfpu {
+
+
+template <bool APPROXIMATION_MODE, int ITERATIONS = 8>
+inline void calculate_prelu(uint value) {
+    // SFPU microcode
+    Converter c_value;
+    c_value.u = value;
+    vFloat init = c_value.f;
+
+    for (int d = 0; d < 8; d++)
+    {
+        vFloat a = dst_reg[0];
+        v_if(a < 0.0f) {
+            a = a * init;
+        }
+        v_endif;
+        dst_reg[0] = a;
+        dst_reg++;
+    }
+}
+} // namespace sfpu
+} // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_prelu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_prelu.h
new file mode 100644
index 00000000000..8bedfdbca2f
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_prelu.h
@@ -0,0 +1,29 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel_sfpu_prelu.h"
+#include "llk_math_eltwise_unary_sfpu_params.h"
+#include "llk_math_eltwise_unary_sfpu_init.h"
+
+namespace ckernel {
+
+// New LLK SFPU APIs
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_prelu_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::prelu, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_prelu(uint dst_index, uint param0, int vector_mode = (int)VectorMode::RC) {
+    llk_math_eltwise_unary_sfpu_params<APPROXIMATE>(
+        ckernel::sfpu::calculate_prelu<APPROXIMATE>,
+        dst_index,
+        vector_mode,
+        param0);
+}
+
+} // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h
index e20018b4102..7b3789c3743 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h
@@ -14,6 +14,7 @@ enum SfpuType {
     reciprocal,
     sqrt,
     lrelu,
+    prelu,
     power,
     square,
     tanh_derivative,
diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/prelu.h b/tt_metal/include/compute_kernel_api/eltwise_unary/prelu.h
new file mode 100644
index 00000000000..f691dfd8ad6
--- /dev/null
+++ b/tt_metal/include/compute_kernel_api/eltwise_unary/prelu.h
@@ -0,0 +1,43 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+
+#include "compute_kernel_api/common_globals.h"
+#ifdef TRISC_MATH
+#include "llk_math_eltwise_unary_sfpu_prelu.h"
+#define MAIN math_main()
+#define MATH(x) x
+#else
+#define MATH(x)
+#endif
+
+
+
+namespace ckernel {
+
+/**
+ * Performs an element-wise PReLU operation: elements below zero are multiplied by the slope given as param0, all other elements pass through unchanged. The DST register buffer must be in
+ * acquired state via *acquire_dst* call. This call is blocking and is only
+ * available on the compute engine.
+ *
+ * Return value: None
+ *
+ * | Argument        | Description                                                                 | Type     | Valid Range                                            | Required |
+ * |-----------------|-----------------------------------------------------------------------------|----------|--------------------------------------------------------|----------|
+ * | idst            | The index of the tile in DST register buffer to perform the computation on  | uint32_t | Must be less than the size of the DST register buffer  | True     |
+ * | param0          | The slope value, multiplied with the input wherever the input is less than 0 | uint32_t |                                                        | True     |
+ */
+ALWI void prelu_tile(uint32_t idst, uint32_t param0) {
+    MATH((llk_math_eltwise_unary_sfpu_prelu<APPROX>(idst, param0)));
+}
+
+/**
+ * Please refer to documentation for any_init.
+ */
+ALWI void prelu_tile_init() { MATH((llk_math_eltwise_unary_sfpu_prelu_init<APPROX>())); }
+
+
+} // namespace ckernel
diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h b/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h
index 0f087ba9bc0..204a1559546 100644
--- a/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h
+++ b/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h
@@ -116,6 +116,10 @@
 #include "compute_kernel_api/eltwise_unary/softplus.h"
 #endif
 
+#if SFPU_OP_PRELU_INCLUDE
+#include "compute_kernel_api/eltwise_unary/prelu.h"
+#endif
+
 #if SFPU_OP_DROPOUT_INCLUDE
 #include "compute_kernel_api/eltwise_unary/dropout.h"
 #endif
diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp
index db1aaa16993..559fd2e445f 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp
@@ -243,6 +243,14 @@ struct ExecuteMinimum
 };
 
+struct ExecutePrelu
+{
+    static Tensor invoke(
+        const Tensor& input_tensor_a,
+        const Tensor& input_tensor_b,
+        const std::optional& memory_config = std::nullopt);
+};
+
 } // namespace binary
 } // namespace operations
@@ -306,5 +314,8 @@ constexpr auto gcd = ttnn::register_operation_with_auto_launch_op<
 constexpr auto lcm = ttnn::register_operation_with_auto_launch_op<
     "ttnn::lcm",
     operations::binary::ExecuteLCM>();
+constexpr auto prelu = ttnn::register_operation_with_auto_launch_op<
+    "ttnn::prelu",
+    operations::binary::ExecutePrelu>();
 
 } // namespace ttnn
diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp
index 5033425fd38..8ebaf601a71 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp
@@ -186,7 +186,7 @@ void bind_binary_operation(py::module& module, const binary_operation_t& operati
 }
 
 template <typename binary_operation_t>
-void bind_binary_composite(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& supported_dtype = "BFLAOT16", const std::string& note="") {
+void bind_binary_composite(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& supported_dtype = "BFLOAT16", const std::string& note="") {
     auto doc = fmt::format(
         R"doc(
         {2}
@@ -590,6 +590,7 @@ void bind_binary_overload_operation(py::module& module, const binary_operation_t
         operation,
         doc,
+        //tensor and scalar
         ttnn::pybind_overload_t{
             [](const binary_operation_t& self,
@@ -934,6 +935,14 @@ void py_module(py::module& module) {
         R"doc(\mathrm{output\_tensor}_i = \sqrt{(\mathrm{input\_tensor\_a}_i^2 + \mathrm{input\_tensor\_b}_i^2)}
        )doc");
 
+    detail::bind_binary_composite(
+        module,
+        ttnn::prelu,
+        R"doc(Perform an eltwise-prelu operation. Formula : a - a.div(b, rounding_mode=trunc) * b .
+        PReLU supports the case where the size of input_tensor_b matches the number of channels in input_tensor_a.)doc",
+        R"doc(\mathrm{{output\_tensor}} = \verb|PReLU|(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",
+        R"doc(BFLOAT16, BFLOAT8_B)doc");
+
     detail::bind_binary_composite(
         module,
         ttnn::xlogy,
diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp
index 3821e67304f..750ef0e8060 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp
@@ -264,6 +264,15 @@ Tensor _div_no_nan(const Tensor& input_a, const Tensor& input_b, const std::opti
     return ttnn::where(ttnn::eqz(input_b, output_mem_config), 0, div_result);
 }
 
+Tensor ExecutePrelu::invoke(const Tensor& input_a, const Tensor& input_b, const std::optional& output_mem_config) {
+    const tt::tt_metal::LegacyShape s_a = input_a.get_legacy_shape();
+    auto volume = input_b.get_logical_volume();
+    // If volume = 1 Support for a single-value tensor yet to be handled. #14933
+    TT_FATAL(s_a[1] == volume, "Mismatch of parameter numbers and input channel size");
+    Tensor b = ttnn::reshape(input_b, ttnn::SimpleShape{std::array{1, s_a[1], 1, 1}});
+    Tensor result = ttnn::where(ttnn::ltz(input_a, output_mem_config), ttnn::multiply(input_a, b), input_a);
+    return result;
+}
 // Binary remainder will be overloaded by unary remainder in another PR
 Tensor ExecuteBinaryRemainder::invoke(const Tensor& input_a, const Tensor& input_b, const std::optional& output_mem_config) {
     auto arch = input_a.device()->arch();
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp
index c133a83c0a8..6bc52c98f92 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp
@@ -84,7 +84,8 @@ enum class UnaryOpType {
     REMAINDER,
     FMOD,
     DROPOUT,
-    FILL
+    FILL,
+    PRELU_SFPU,
 };
 
 struct UnaryWithParam {
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp
index be612eac166..bed48d6a143 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp
@@ -62,6 +62,7 @@ void update_macro_defines(UnaryOpType op_type, std::map<std::string, std::string>& defines) {
@@ ... @@ get_op_init_and_func_parameterized(
                 Converter::to_hex(param1))};
             break;
         }
+        case UnaryOpType::PRELU_SFPU: {
+            op_init_and_name = {
+                "prelu_tile_init();",
+                fmt::format(
+                    "prelu_tile({}, {}u);",
+                    idst,
+                    Converter::to_hex(param0))};
+            break;
+        }
         case UnaryOpType::TYPECAST:
             TT_ASSERT(params.size() == 2, "Expected eltwise_typecast to take 2 parameters");
             op_init_and_name = {
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.hpp
index 9f27ebefac2..6abda5c7178 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.hpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.hpp
@@ -60,6 +60,7 @@ bool is_parametrized_type(T val) {
         case UnaryOpType::REMAINDER:
         case
UnaryOpType::DROPOUT: case UnaryOpType::FILL: + case UnaryOpType::PRELU_SFPU: case UnaryOpType::FMOD: return true; default: return false; } diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp index e68ec9535d6..20c80ec2217 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp @@ -281,6 +281,33 @@ Tensor Softplus::invoke( optional_output_tensor); } +Tensor Prelu::invoke( + uint8_t queue_id, + const Tensor& input, + float value, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + return detail::unary_impl( + queue_id, + input, + {UnaryWithParam{UnaryOpType::PRELU_SFPU, value}}, + memory_config, + optional_output_tensor); +} + +Tensor Prelu::invoke( + const Tensor& input, + float value, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + return detail::unary_impl( + DefaultQueueId, + input, + {UnaryWithParam{UnaryOpType::PRELU_SFPU, value}}, + memory_config, + optional_output_tensor); +} + Tensor Identity::invoke( uint8_t queue_id, const Tensor& input_tensor, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp index b75e30ab478..13d40a05628 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp @@ -122,6 +122,21 @@ struct Softplus { const std::optional& optional_output_tensor = std::nullopt); }; +struct Prelu { + static Tensor invoke( + uint8_t queue_id, + const Tensor& input, + float value, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + const Tensor& input, + float value, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); +}; + struct Identity { static Tensor invoke( uint8_t queue_id, @@ -340,6 +355,8 @@ REGISTER_UNARY_OPERATION_WITH_INTEGER_PARAMETER(bitwise_xor, BITWISE_XOR, int32_ constexpr auto dropout = ttnn::register_operation_with_auto_launch_op<"ttnn::dropout", ttnn::operations::unary::Dropout>(); constexpr auto identity = ttnn::register_operation_with_auto_launch_op<"ttnn::identity", ttnn::operations::unary::Identity>(); constexpr auto softplus = ttnn::register_operation_with_auto_launch_op<"ttnn::softplus", ttnn::operations::unary::Softplus>(); +constexpr auto prelu_sfpu = ttnn::register_operation_with_auto_launch_op<"ttnn::prelu_sfpu", ttnn::operations::unary::Prelu>(); + constexpr auto sigmoid_accurate = ttnn::register_operation_with_auto_launch_op<"ttnn::sigmoid_accurate", ttnn::operations::unary::Sigmoid_accurate>(); constexpr auto unary_chain = ttnn::register_operation_with_auto_launch_op<"ttnn::unary_chain", ttnn::operations::unary::Unary_chain>(); diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index fba363c3971..c5c6f1a63f3 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -243,10 +243,6 @@ def auto_register_ttnn_cpp_operations(module): mul_ = ttnn.multiply_ -def prelu(*args, **kwargs): # Alias for leaky_relu. 
TODO(#8544): implement PReLU properly - return ttnn.leaky_relu(*args, **kwargs) - - # TODO: pybind the overloaded operators below ttnn.Tensor.__add__ = lambda self, *args, **kwargs: ttnn.add(self, *args, **kwargs) ttnn.Tensor.__radd__ = lambda self, *args, **kwargs: ttnn.add(self, *args, **kwargs) diff --git a/ttnn/ttnn/operations/unary.py b/ttnn/ttnn/operations/unary.py index 1f34e1fec2a..032b7d9308d 100644 --- a/ttnn/ttnn/operations/unary.py +++ b/ttnn/ttnn/operations/unary.py @@ -66,7 +66,6 @@ def _golden_function(input_tensor: ttnn.Tensor, **_): "gelu": torch.nn.functional.gelu, "rsqrt": torch.rsqrt, # Unaries with float parameter - # "prelu": torch_prelu, # Alias for leaky_relu. TODO(#8544): implement PReLU properly # Other unaries (composite operations) "softplus": torch.nn.functional.softplus, "sigmoid_accurate": torch.sigmoid, @@ -148,7 +147,6 @@ def _golden_function(input_tensor: ttnn.Tensor, **_): ttnn.gelu, ttnn.rsqrt, # Unaries with float parameter - # ttnn.prelu, # Alias for leaky_relu. TODO(#8544): implement PReLU properly # Unaries using op_chain ttnn.log_sigmoid, ttnn.softplus, @@ -239,6 +237,15 @@ def _golden_function_elu(input_tensor_a, *args, alpha=1.0, **kwargs): ttnn.attach_golden_function(ttnn.elu, golden_function=_golden_function_elu) +def _golden_function_prelu(input_tensor_a, input_tensor_b, *args, **kwargs): + import torch + + return torch.nn.functional.prelu(input_tensor_a, weight=input_tensor_b) + + +ttnn.attach_golden_function(ttnn.prelu, golden_function=_golden_function_prelu) + + def _golden_function_hardtanh(input_tensor_a, min_val=-1.0, max_val=1.0, *args, **kwargs): import torch From 727a7ee31a3662db6da51e2465d0b09d3a64c8d5 Mon Sep 17 00:00:00 2001 From: mouliraj-mcw Date: Tue, 12 Nov 2024 09:51:44 +0000 Subject: [PATCH 2/5] Add scalar support for prelu --- .../eltwise/test_binary_composite.py | 23 +++++++++++++++++++ .../eltwise/binary/binary_composite.hpp | 6 +++++ .../eltwise/binary/binary_pybind.hpp | 14 ++++------- .../binary/device/binary_composite_op.cpp | 6 ++++- ttnn/ttnn/operations/binary.py | 12 ++++++++++ ttnn/ttnn/operations/unary.py | 9 -------- 6 files changed, 51 insertions(+), 19 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py index b540073527e..96050f92511 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py @@ -981,3 +981,26 @@ def test_binary_prelu_ttnn(input_shapes, device): comp_pass = compare_pcc([output_tensor], [golden_tensor]) assert comp_pass + + +@pytest.mark.parametrize( + "input_shapes", + ( + (torch.Size([1, 3, 32, 32])), + (torch.Size([1, 6, 32, 32])), + (torch.Size([1, 7, 320, 384])), + (torch.Size([1, 4, 320, 384])), + ), +) +@pytest.mark.parametrize( + "scalar", + {-0.25, -2.7, 0.45, 6.4}, +) +def test_binary_prelu_scalar_ttnn(input_shapes, scalar, device): + in_data1, input_tensor1 = data_gen_with_range(input_shapes, -100, 100, device) + output_tensor = ttnn.prelu(input_tensor1, scalar) + golden_function = ttnn.get_golden_function(ttnn.prelu) + golden_tensor = golden_function(in_data1, scalar) + + comp_pass = compare_pcc([output_tensor], [golden_tensor]) + assert comp_pass diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp index 559fd2e445f..77d2928b427 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp +++ 
b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp @@ -249,6 +249,12 @@ struct ExecutePrelu const Tensor& input_tensor_a, const Tensor& input_tensor_b, const std::optional& memory_config = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor, + float scalar, + const std::optional& memory_config = std::nullopt); + }; } // namespace binary diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp index 8ebaf601a71..8a42c230115 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp @@ -590,7 +590,6 @@ void bind_binary_overload_operation(py::module& module, const binary_operation_t operation, doc, - //tensor and scalar ttnn::pybind_overload_t{ [](const binary_operation_t& self, @@ -935,14 +934,6 @@ void py_module(py::module& module) { R"doc(\mathrm{output\_tensor}_i = \sqrt{(\mathrm{input\_tensor\_a}_i^2 + \mathrm{input\_tensor\_b}_i^2)} )doc"); - detail::bind_binary_composite( - module, - ttnn::prelu, - R"doc(Perform an eltwise-prelu operation. Formula : a - a.div(b, rounding_mode=trunc) * b . - PReLU supports the case where the size of input_tensor_b matches the number of channels in input_tensor_a.)doc", - R"doc(\mathrm{{output\_tensor}} = \verb|PReLU|(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc", - R"doc(BFLOAT16, BFLOAT8_B)doc"); - detail::bind_binary_composite( module, ttnn::xlogy, @@ -1053,6 +1044,11 @@ void py_module(py::module& module) { ttnn::maximum, R"doc(Compute maximum :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc"); + detail::bind_binary_composite_overload( + module, + ttnn::prelu, + R"doc(Perform an eltwise-prelu operation. PReLU supports the case where the size of input_tensor_b matches the number of channels in input_tensor_a.)doc"); + detail::bind_binary_composite( module, ttnn::scatter, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp index 750ef0e8060..5933cc63db9 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp @@ -264,10 +264,14 @@ Tensor _div_no_nan(const Tensor& input_a, const Tensor& input_b, const std::opti return ttnn::where(ttnn::eqz(input_b, output_mem_config), 0, div_result); } +Tensor ExecutePrelu::invoke(const Tensor& input, float scalar, const std::optional& output_mem_config) { + return ttnn::prelu_sfpu(input, scalar); +} + Tensor ExecutePrelu::invoke(const Tensor& input_a, const Tensor& input_b, const std::optional& output_mem_config) { const tt::tt_metal::LegacyShape s_a = input_a.get_legacy_shape(); auto volume = input_b.get_logical_volume(); - // If volume = 1 Support for a single-value tensor yet to be handled. #14933 + // If volume = 1 Support for a single-value tensor yet to be handled. 
TODO(#14933) TT_FATAL(s_a[1] == volume, "Mismatch of parameter numbers and input channel size"); Tensor b = ttnn::reshape(input_b, ttnn::SimpleShape{std::array{1, s_a[1], 1, 1}}); Tensor result = ttnn::where(ttnn::ltz(input_a, output_mem_config), ttnn::multiply(input_a, b), input_a); diff --git a/ttnn/ttnn/operations/binary.py b/ttnn/ttnn/operations/binary.py index ce8b02488f0..cc60e3a7a63 100644 --- a/ttnn/ttnn/operations/binary.py +++ b/ttnn/ttnn/operations/binary.py @@ -455,4 +455,16 @@ def _golden_function_lcm(input_tensor_a, input_tensor_b, *args, **kwargs): ttnn.attach_golden_function(ttnn.lcm, golden_function=_golden_function_lcm) +def _golden_function_prelu(input_tensor_a, input_tensor_b, *args, **kwargs): + import torch + + if not torch.is_tensor(input_tensor_b): + input_tensor_b = torch.tensor(input_tensor_b, dtype=input_tensor_a.dtype) + + return torch.nn.functional.prelu(input_tensor_a, weight=input_tensor_b) + + +ttnn.attach_golden_function(ttnn.prelu, golden_function=_golden_function_prelu) + + __all__ = [] diff --git a/ttnn/ttnn/operations/unary.py b/ttnn/ttnn/operations/unary.py index 032b7d9308d..5a73f32d59f 100644 --- a/ttnn/ttnn/operations/unary.py +++ b/ttnn/ttnn/operations/unary.py @@ -237,15 +237,6 @@ def _golden_function_elu(input_tensor_a, *args, alpha=1.0, **kwargs): ttnn.attach_golden_function(ttnn.elu, golden_function=_golden_function_elu) -def _golden_function_prelu(input_tensor_a, input_tensor_b, *args, **kwargs): - import torch - - return torch.nn.functional.prelu(input_tensor_a, weight=input_tensor_b) - - -ttnn.attach_golden_function(ttnn.prelu, golden_function=_golden_function_prelu) - - def _golden_function_hardtanh(input_tensor_a, min_val=-1.0, max_val=1.0, *args, **kwargs): import torch From 087877dc6148356dc6e3dcc709a10ea664b64db5 Mon Sep 17 00:00:00 2001 From: mouliraj-mcw Date: Wed, 13 Nov 2024 06:55:37 +0000 Subject: [PATCH 3/5] Update Test Files --- .../sweeps/eltwise/unary/prelu/prelu.py | 17 +++-------------- .../pytests/tt_dnn/test_eltwise_unary.py | 1 + .../operations/eltwise/test_activation.py | 1 + .../operations/eltwise/test_binary_composite.py | 1 + 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/tests/sweep_framework/sweeps/eltwise/unary/prelu/prelu.py b/tests/sweep_framework/sweeps/eltwise/unary/prelu/prelu.py index 134825bdf04..b2940fcf817 100644 --- a/tests/sweep_framework/sweeps/eltwise/unary/prelu/prelu.py +++ b/tests/sweep_framework/sweeps/eltwise/unary/prelu/prelu.py @@ -14,11 +14,6 @@ from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time from models.utility_functions import torch_random -# Override the default timeout in seconds for hang detection. -TIMEOUT = 30 - -random.seed(0) - # Parameters provided to the test vector generator are defined here. # They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. @@ -45,12 +40,6 @@ def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]: return False, None -def torch_prelu(x, *args, **kwargs): - weight = kwargs.pop("scalar") - result = torch.nn.functional.prelu(x, torch.tensor(weight, dtype=x.dtype)) - return result - - # This is the run instructions for the test, defined by the developer. # The run function must take the above-defined parameters as inputs. # The runner will call this run function with each test vector, and the returned results from this function will be stored. 
@@ -65,14 +54,14 @@ def run( *, device, ) -> list: - data_seed = random.randint(0, 20000000) - torch.manual_seed(data_seed) + torch.manual_seed(0) torch_input_tensor_a = gen_func_with_cast_tt( partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype )(input_shape) - torch_output_tensor = torch_prelu(torch_input_tensor_a, scalar=weight) + golden_function = ttnn.get_golden_function(ttnn.prelu) + torch_output_tensor = golden_function(torch_input_tensor_a, weight) input_tensor_a = ttnn.from_torch( torch_input_tensor_a, diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py index ea9029a7cf6..1db66f53ced 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py @@ -871,6 +871,7 @@ def test_run_eltwise_leaky_relu_op( ) @pytest.mark.parametrize("weight", [-0.5, 1.0, 0.5]) + @skip_for_grayskull() def test_run_eltwise_prelu( self, input_shapes, diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_activation.py b/tests/ttnn/unit_tests/operations/eltwise/test_activation.py index 57f8ecf4284..4407dd2a306 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_activation.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_activation.py @@ -307,6 +307,7 @@ def test_scalarB_leaky_relu(device, h, w, scalar): run_activation_test_leaky_relu(device, h, w, scalar, ttnn.leaky_relu) +@skip_for_grayskull() @pytest.mark.parametrize("weight", [-0.5, 1.0, 0.5]) @pytest.mark.parametrize("h", [64]) @pytest.mark.parametrize("w", [128]) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py index 96050f92511..6f5ff325837 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py @@ -996,6 +996,7 @@ def test_binary_prelu_ttnn(input_shapes, device): "scalar", {-0.25, -2.7, 0.45, 6.4}, ) +@skip_for_grayskull() def test_binary_prelu_scalar_ttnn(input_shapes, scalar, device): in_data1, input_tensor1 = data_gen_with_range(input_shapes, -100, 100, device) output_tensor = ttnn.prelu(input_tensor1, scalar) From 6f0cf195d16c7f2a4462e76757c0185493a98312 Mon Sep 17 00:00:00 2001 From: mouliraj-mcw Date: Thu, 14 Nov 2024 10:08:43 +0000 Subject: [PATCH 4/5] Update the logic --- .../eltwise/test_binary_composite.py | 43 +++++++++++++------ .../eltwise/binary/binary_pybind.hpp | 23 ++++++++-- .../binary/device/binary_composite_op.cpp | 15 ++++--- 3 files changed, 59 insertions(+), 22 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py index 6f5ff325837..497c2d194a6 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py @@ -9,9 +9,11 @@ from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import ( data_gen_with_range, data_gen_with_range_int, + data_gen_with_val, compare_pcc, compare_equal, ) +from tests.ttnn.utils_for_testing import assert_with_pcc from models.utility_functions import is_grayskull, skip_for_grayskull @@ -964,32 +966,43 @@ def test_binary_lcm_ttnn(input_shapes, device): @pytest.mark.parametrize( "input_shapes", ( - (torch.Size([1, 3, 32, 
32])), - (torch.Size([1, 6, 32, 32])), - (torch.Size([1, 7, 320, 384])), - (torch.Size([1, 4, 320, 384])), + (torch.Size([1, 2, 32, 64, 64])), + (torch.Size([1, 3, 7, 29, 127])), + (torch.Size([1, 3, 2, 32])), + (torch.Size([1, 6, 49, 97])), + (torch.Size([1, 7, 320])), + (torch.Size([1, 49, 321])), + (torch.Size([4, 32])), + (torch.Size([49, 321])), ), ) def test_binary_prelu_ttnn(input_shapes, device): - in_data1, input_tensor1 = data_gen_with_range(input_shapes, -100, 100, device) + in_data1 = torch.rand(input_shapes, dtype=torch.bfloat16) * 200 - 100 channels = input_shapes[1] in_data2 = torch.rand((channels,), dtype=torch.bfloat16) * 200 - 100 + + input_tensor1 = ttnn.from_torch(in_data1, layout=ttnn.TILE_LAYOUT, device=device) input_tensor2 = ttnn.from_torch(in_data2, layout=ttnn.TILE_LAYOUT, device=device) + output_tensor = ttnn.prelu(input_tensor1, input_tensor2) + output_tensor = ttnn.to_torch(output_tensor) golden_function = ttnn.get_golden_function(ttnn.prelu) golden_tensor = golden_function(in_data1, in_data2) - comp_pass = compare_pcc([output_tensor], [golden_tensor]) - assert comp_pass + assert_with_pcc(golden_tensor, output_tensor, 0.999) @pytest.mark.parametrize( "input_shapes", ( - (torch.Size([1, 3, 32, 32])), - (torch.Size([1, 6, 32, 32])), - (torch.Size([1, 7, 320, 384])), - (torch.Size([1, 4, 320, 384])), + (torch.Size([1, 2, 32, 64, 64])), + (torch.Size([1, 3, 7, 29, 127])), + (torch.Size([1, 3, 2, 32])), + (torch.Size([1, 6, 49, 97])), + (torch.Size([1, 7, 320])), + (torch.Size([1, 49, 321])), + (torch.Size([4, 32])), + (torch.Size([49, 321])), ), ) @pytest.mark.parametrize( @@ -998,10 +1011,12 @@ def test_binary_prelu_ttnn(input_shapes, device): ) @skip_for_grayskull() def test_binary_prelu_scalar_ttnn(input_shapes, scalar, device): - in_data1, input_tensor1 = data_gen_with_range(input_shapes, -100, 100, device) + in_data1 = torch.rand(input_shapes, dtype=torch.bfloat16) * 200 - 100 + input_tensor1 = ttnn.from_torch(in_data1, layout=ttnn.TILE_LAYOUT, device=device) + output_tensor = ttnn.prelu(input_tensor1, scalar) + output_tensor = ttnn.to_torch(output_tensor) golden_function = ttnn.get_golden_function(ttnn.prelu) golden_tensor = golden_function(in_data1, scalar) - comp_pass = compare_pcc([output_tensor], [golden_tensor]) - assert comp_pass + assert_with_pcc(golden_tensor, output_tensor, 0.999) diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp index 8a42c230115..9ca6a02f9a3 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp @@ -358,7 +358,7 @@ void bind_binary_composite_with_rtol_atol(py::module& module, const binary_opera } template -void bind_binary_composite_overload(py::module& module, const binary_operation_t& operation, const std::string& description) { +void bind_binary_composite_overload(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& supported_dtype="BFLOAT16", const std::string& supported_rank= "2, 3, 4") { auto doc = fmt::format( R"doc( {2} @@ -373,6 +373,19 @@ void bind_binary_composite_overload(py::module& module, const binary_operation_t Keyword Args: memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`. + Note: + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - {4} + Returns: ttnn.Tensor: the output tensor. @@ -384,7 +397,9 @@ void bind_binary_composite_overload(py::module& module, const binary_operation_t )doc", operation.base_name(), operation.python_fully_qualified_name(), - description); + description, + supported_dtype, + supported_rank); bind_registered_operation( module, @@ -1047,7 +1062,9 @@ void py_module(py::module& module) { detail::bind_binary_composite_overload( module, ttnn::prelu, - R"doc(Perform an eltwise-prelu operation. PReLU supports the case where the size of input_tensor_b matches the number of channels in input_tensor_a.)doc"); + R"doc(Perform an eltwise-prelu operation. PReLU supports the case where the size of input_tensor_b matches the number of channels in input_tensor_a.)doc", + R"doc(BFLOAT16, BFLOAT8_B)doc", + R"doc(2, 3, 4, 5)doc"); detail::bind_binary_composite( module, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp index 5933cc63db9..e46b86fa072 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp @@ -269,11 +269,16 @@ Tensor ExecutePrelu::invoke(const Tensor& input, float scalar, const std::option } Tensor ExecutePrelu::invoke(const Tensor& input_a, const Tensor& input_b, const std::optional& output_mem_config) { - const tt::tt_metal::LegacyShape s_a = input_a.get_legacy_shape(); - auto volume = input_b.get_logical_volume(); - // If volume = 1 Support for a single-value tensor yet to be handled. TODO(#14933) - TT_FATAL(s_a[1] == volume, "Mismatch of parameter numbers and input channel size"); - Tensor b = ttnn::reshape(input_b, ttnn::SimpleShape{std::array{1, s_a[1], 1, 1}}); + const auto s_a = input_a.get_shape(); + const auto volume = input_b.get_logical_volume(); + + TT_FATAL(s_a[1] == volume, "Mismatch of parameter numbers and input channel size. 
Found parameter numbers = {} and channel size = {}.", volume, s_a[1]);
+    Tensor b = input_b;
+    if (s_a.rank() > 2) {
+        SmallVector<uint32_t> reshape(s_a.rank(), 1);
+        reshape[1] = s_a[1];
+        b = ttnn::reshape(input_b, ttnn::Shape(reshape));
+    }
     Tensor result = ttnn::where(ttnn::ltz(input_a, output_mem_config), ttnn::multiply(input_a, b), input_a);
     return result;
 }

From 34c469deb9b6919c8206c7e836cdb87157adfa68 Mon Sep 17 00:00:00 2001
From: mouliraj-mcw
Date: Fri, 15 Nov 2024 06:44:00 +0000
Subject: [PATCH 5/5] Add prelu support for blackhole

---
 .../metal/llk_api/llk_math_unary_sfpu_api.h   |  1 +
 .../llk_api/llk_sfpu/ckernel_sfpu_prelu.h     | 39 +++++++++++++++++++
 .../llk_math_eltwise_unary_sfpu_prelu.h       | 29 ++++++++++++++
 .../blackhole/metal/llk_api/llk_sfpu_types.h  |  3 +-
 .../llk_api/llk_sfpu/ckernel_sfpu_prelu.h     |  5 ++-
 5 files changed, 74 insertions(+), 3 deletions(-)
 create mode 100644 tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h
 create mode 100644 tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_prelu.h

diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_math_unary_sfpu_api.h
index 5324157d4a4..91a4c684384 100644
--- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_math_unary_sfpu_api.h
+++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_math_unary_sfpu_api.h
@@ -25,3 +25,4 @@
 #include "llk_math_eltwise_unary_sfpu_trigonometry.h"
 #include "llk_math_eltwise_unary_sfpu_unary_comp.h"
 #include "llk_math_eltwise_unary_sfpu_fill.h"
+#include "llk_math_eltwise_unary_sfpu_prelu.h"
diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h
new file mode 100644
index 00000000000..dc6e8e1e727
--- /dev/null
+++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "noc_nonblocking_api.h"
+#include "ckernel_sfpu_converter.h"
+
+
+using namespace sfpi;
+
+namespace ckernel {
+namespace sfpu {
+
+template <bool APPROXIMATION_MODE, int ITERATIONS = 8>
+inline void calculate_prelu(const uint value) {
+
+    // SFPU microcode
+    Converter c_value;
+    c_value.u = value;
+    vFloat init = c_value.f;
+
+    #pragma GCC unroll 8
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat a = dst_reg[0];
+        v_if(a < 0.0f) {
+            a = a * init;
+        }
+        v_endif;
+        dst_reg[0] = a;
+        dst_reg++;
+    }
+}
+} // namespace sfpu
+} // namespace ckernel
diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_prelu.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_prelu.h
new file mode 100644
index 00000000000..8bedfdbca2f
--- /dev/null
+++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_prelu.h
@@ -0,0 +1,29 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel_sfpu_prelu.h"
+#include "llk_math_eltwise_unary_sfpu_params.h"
+#include "llk_math_eltwise_unary_sfpu_init.h"
+
+namespace ckernel {
+
+// New LLK SFPU APIs
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_prelu_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::prelu, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_prelu(uint dst_index, uint param0, int vector_mode = (int)VectorMode::RC) {
+    llk_math_eltwise_unary_sfpu_params<APPROXIMATE>(
+        ckernel::sfpu::calculate_prelu<APPROXIMATE>,
+        dst_index,
+        vector_mode,
+        param0);
+}
+
+} // namespace ckernel
diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu_types.h
index f55b01c24ab..8a7616784ed 100644
--- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu_types.h
+++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu_types.h
@@ -87,5 +87,6 @@ enum SfpuType {
     ceil,
     unused,
     cumsum,
-    fill
+    fill,
+    prelu,
 };
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h
index a50eec93d28..22c28881261 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_prelu.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -23,7 +23,8 @@ inline void calculate_prelu(uint value) {
     c_value.u = value;
     vFloat init = c_value.f;
 
-    for (int d = 0; d < 8; d++)
+    #pragma GCC unroll 8
+    for (int d = 0; d < ITERATIONS; d++)
     {
         vFloat a = dst_reg[0];
         v_if(a < 0.0f) {
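
As a sanity reference for the semantics the series implements: the tensor-weight composite computes where(ltz(a), a * b, a) with b reshaped to [1, C, 1, ..., 1] so it broadcasts over dim 1, and the scalar overload lowers to the PRELU_SFPU unary, whose SFPU loop scales only the negative lanes by the slope. The torch-only sketch below mirrors that behavior; prelu_reference is an illustrative helper, not code from these patches, and is checked against torch.nn.functional.prelu, the golden function attached above:

    import torch

    def prelu_reference(x: torch.Tensor, weight) -> torch.Tensor:
        # Scalar slope: mirrors the PRELU_SFPU unary path (one slope for every element).
        if not torch.is_tensor(weight):
            weight = torch.tensor([weight], dtype=x.dtype)
        # Per-channel slopes: mirror the composite's reshape of b to [1, C, 1, ..., 1]
        # so the multiply broadcasts over dim 1.
        shape = [1] * x.dim()
        if weight.numel() > 1:
            shape[1] = weight.numel()
        # where(x < 0, x * w, x): negative elements are scaled, the rest pass through.
        return torch.where(x < 0, x * weight.reshape(shape), x)

    x = torch.randn(1, 3, 32, 32, dtype=torch.bfloat16)
    w = torch.randn(3, dtype=torch.bfloat16)
    assert torch.allclose(prelu_reference(x, w), torch.nn.functional.prelu(x, w))
    assert torch.allclose(prelu_reference(x, -0.25),
                          torch.nn.functional.prelu(x, torch.tensor([-0.25], dtype=x.dtype)))

The same reshape-and-broadcast rule explains the channel-size TT_FATAL in ExecutePrelu: a weight of C elements is only meaningful when it can be placed on dim 1 of the input.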