From 364fb1d9098fc1e9c5f5ec44b3c0bb41df7d878c Mon Sep 17 00:00:00 2001
From: Reem Tawfik <rtawfik@tenstorrent.com>
Date: Wed, 6 Dec 2023 18:07:26 +0000
Subject: [PATCH] #0: Add empty llk api files for grayskull to fix compile

---
 .../grayskull/common/inc/chlkc_list.h         |   4 +-
 .../ckernels/grayskull/common/inc/ckernel.h   |  18 -
 .../grayskull/common/inc/ckernel_globals.h    |  59 --
 .../grayskull/common/inc/ckernel_template.h   | 217 +++++
 .../grayskull/common/inc/cpack_common.h       |  10 -
 .../grayskull/common/inc/cunpack_common.h     |   5 -
 .../grayskull/common/src/ckernel_template.cc  | 229 -----
 .../hw/ckernels/grayskull/llk_lib/llk_defs.h  |  60 --
 ..._math_eltwise_unary_sfpu_common_includes.h | 155 +++-
 .../hw/ckernels/grayskull/llk_lib/llk_pack.h  |   2 +-
 .../ckernels/grayskull/llk_lib/llk_unpack_A.h |   3 +
 .../grayskull/llk_lib/llk_unpack_AB.h         |   3 +
 .../grayskull/llk_lib/llk_unpack_AB_matmul.h  |   3 +
 .../grayskull/llk_lib/llk_unpack_common.h     |   3 +
 .../metal/common/metal_ckernel_globals.h      |  63 ++
 .../metal/llk_api/llk_math_binary_api.h       |  86 ++
 .../metal/llk_api/llk_math_binary_sfpu_api.h  |  70 ++
 .../metal/llk_api/llk_math_common_api.h       | 108 +++
 .../metal/llk_api/llk_math_matmul_api.h       |  68 ++
 .../metal/llk_api/llk_math_reduce_api.h       |  28 +
 .../llk_api/llk_math_unary_datacopy_api.h     |  36 +
 .../metal/llk_api/llk_math_unary_sfpu_api.h   | 293 +++++++
 .../grayskull/metal/llk_api/llk_op_info_api.h |  23 +
 .../grayskull/metal/llk_api/llk_pack_api.h    | 308 +++++++
 .../llk_api}/llk_param_structs.h              |   0
 ..._math_eltwise_unary_sfpu_common_includes.h | 171 ++++
 .../llk_api/llk_sfpu/metal_ckernel_sfpu.h     | 780 ++++++++++++++++++
 .../grayskull/metal/llk_api/llk_sfpu_types.h  |  63 ++
 .../metal/llk_api/llk_unpack_AB_api.h         |  85 ++
 .../metal/llk_api/llk_unpack_AB_matmul_api.h  | 136 +++
 .../metal/llk_api/llk_unpack_A_api.h          |  89 ++
 .../metal/llk_api/llk_unpack_common_api.h     | 137 +++
 .../metal/llk_api/llk_unpack_reduce_api.h     |  94 +++
 .../metal/llk_api/llk_unpack_tilize_api.h     |  99 +++
 .../metal/llk_api/llk_unpack_untilize_api.h   |  96 +++
 .../ckernels/grayskull/metal/llk_io/llk_io.h  |  10 +
 .../grayskull/metal/llk_io/llk_operands.h     |  53 ++
 .../grayskull/metal/llk_io/llk_outputs.h      |  61 ++
 .../wormhole_b0/common/inc/ckernel_globals.h  |   1 -
 .../metal/common/metal_ckernel_globals.h      |   2 +
 .../wormhole_b0/metal/llk_io/llk_operands.h   |   3 +-
 tt_metal/include/compute_kernel_api/unpack.h  | 114 +--
 42 files changed, 3402 insertions(+), 446 deletions(-)
 delete mode 100644 tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api}/llk_param_structs.h (100%)
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h

diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h b/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h
index 3d02d79f908..0a30e5f179b 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h
@@ -14,20 +14,18 @@ using namespace ckernel;
 
 
 #ifdef UCK_CHLKC_MATH
-// #include "chlkc_math_llk_args.h"
+#include "chlkc_unpack_data_format.h"
 #include "chlkc_math_fidelity.h"
 #include "chlkc_math_approx_mode.h"
 #include "chlkc_math.cpp"
 #endif
 
 #ifdef UCK_CHLKC_PACK
-// #include "chlkc_pack_llk_args.h"
 #include "chlkc_pack_data_format.h"
 #include "chlkc_pack.cpp"
 #endif
 
 #ifdef UCK_CHLKC_UNPACK
-// #include "chlkc_unpack_llk_args.h"
 #include "chlkc_unpack_data_format.h"
 #include "chlkc_unpack.cpp"
 #endif
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
index b2de68e862a..88bb41af795 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
@@ -62,8 +62,6 @@ extern uint32_t dest_offset_id;
 extern uint32_t dbg_event_index;
 extern uint32_t dbg_event_end;
 
-extern uint32_t op_info_offset;
-
 // Internal scope to namespace methods only (C++ does not allow namespace private ownership)
 namespace internal {
 }
@@ -281,22 +279,6 @@ inline void debug_dump(uint8_t *data, uint32_t byte_size) {
     // TODO(pk) re-implement
 }
 
-inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) {
-
-    uint32_t* op_info_ptr = reinterpret_cast<uint32_t*>(OP_INFO_BASE_ADDR + op_info_offset);
-    static constexpr uint32_t op_info_num_items = 7;
-
-    volatile tt_l1_ptr uint32_t* op_info_struct_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(&op_info_struct);
-    for (uint32_t i = 0; i < op_info_num_items; i++) {
-        op_info_struct_ptr[i] = op_info_ptr[i];
-    }
-    op_info_offset += 28;
-
-    if (op_info_offset == OP_INFO_SIZE) {
-        op_info_offset = 0; // In case we go out of bounds
-    }
-}
-
 inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b)
 {
   unsigned int r = 0;
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h
index a98ae7577ae..445c77d1e0b 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h
@@ -7,8 +7,6 @@
 #include <cstdint>
 #include "ckernel_structs.h"
 #include "risc_attribs.h"
-#include "tensix_functions.h"
-#include "hostdevcommon/common_runtime_address_map.h"
 
 extern uint32_t cfg_state_id;
 extern uint32_t unp_cfg_context;
@@ -16,62 +14,5 @@ extern uint32_t gl_alu_format_spec_reg;
 
 extern volatile uint32_t l1_buffer[16];
 
-//extern const int32_t unpack_src_format[24];
-//extern const int32_t unpack_dst_format[24];
-//extern const int32_t pack_src_format[16];
-//extern const int32_t pack_dst_format[16];
-
 extern uint32_t pack_sync_tile_dst_ptr;
 extern uint32_t math_sync_tile_dst_index;
-
-extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS];
-
-extern uint32_t __ldm_bss_start[];
-extern uint32_t __ldm_bss_end[];
-extern uint32_t __ldm_data_start[];
-extern uint32_t __ldm_data_end[];
-extern void (* __init_array_start[])();
-extern void (* __init_array_end[])();
-extern uint32_t __firmware_start[];
-
-extern void kernel_init();
-extern void kernel_launch();
-
-inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
-    // Cover L1 load latency of 6 cycles for the bulk of the copy
-    int32_t n = 0;
-    while (n < len - 5) {
-        uint32_t v0 = l1_addr[n + 0];
-        uint32_t v1 = l1_addr[n + 1];
-        uint32_t v2 = l1_addr[n + 2];
-        uint32_t v3 = l1_addr[n + 3];
-        uint32_t v4 = l1_addr[n + 4];
-        uint32_t v5 = l1_addr[n + 5];
-        local_mem_addr[n + 0] = v0;
-        local_mem_addr[n + 1] = v1;
-        local_mem_addr[n + 2] = v2;
-        local_mem_addr[n + 3] = v3;
-        local_mem_addr[n + 4] = v4;
-        local_mem_addr[n + 5] = v5;
-        n += 6;
-    }
-    // Could optimize this further (eg, loop of 2 or 4), probably not worth it
-    while (n < len) {
-        local_mem_addr[n] = l1_addr[n];
-        n++;
-    }
-}
-
-inline void firmware_kernel_common_init(void *init_local_l1_base) {
-
-    // Handle stuff typically done in crt0 in asm.  Easier to do in C
-    wzerorange(__ldm_bss_start, __ldm_bss_end);
-
-    int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
-    uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE;
-    l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words);
-
-    for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
-        (**fptr)();
-    }
-}
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h
index c8968d06577..ba1c08033b1 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h
@@ -237,4 +237,252 @@ class ckernel_unpack_template
     void program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask = 0); // calls program, then run
 };
 
+    // Every definition below lives in a header, so each one is marked `inline`: a non-inline
+    // function defined in a header becomes a multiple-definition (ODR) error as soon as two
+    // translation units include it.
+
+    // Single-instruction loop body: loop_op fills loop slot 0 and both "last instruction" slots;
+    // loop slot 1, both end ops and the start op are TT_OP_NOP.
+    inline ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op)
+        : m_outer_loop_len(outer_loop_len)
+        , m_inner_loop_len(inner_loop_len)
+        , m_loop_op0(loop_op)
+        , m_loop_op1(TT_OP_NOP)
+        , m_end_op0(TT_OP_NOP)
+        , m_end_op1(TT_OP_NOP)
+        , m_start_op0(TT_OP_NOP)
+    {
+        m_loop0_last_instr = loop_op;
+        m_loop1_last_instr = loop_op;
+    }
+
+    // Two-instruction loop body: both "last instruction" slots start out as loop_op1.
+    inline ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1)
+        : m_outer_loop_len(outer_loop_len)
+        , m_inner_loop_len(inner_loop_len)
+        , m_loop_op0(loop_op0)
+        , m_loop_op1(loop_op1)
+        , m_end_op0(TT_OP_NOP)
+        , m_end_op1(TT_OP_NOP)
+        , m_start_op0(TT_OP_NOP)
+    {
+        m_loop0_last_instr = loop_op1;
+        m_loop1_last_instr = loop_op1;
+    }
+
+    // Overwrites the stored loop op 0 (m_loop_op0).
+    inline void ckernel_template::set_loop_op0(uint loop_op)
+    {
+        m_loop_op0 = loop_op;
+    }
+
+    // Overwrites the stored loop op 1 (m_loop_op1).
+    inline void ckernel_template::set_loop_op1(uint loop_op)
+    {
+        m_loop_op1 = loop_op;
+    }
+
+    // Stores both end ops (m_end_op0 and m_end_op1).
+    inline void ckernel_template::set_end_ops(uint end_op0, uint end_op1)
+    {
+        m_end_op0 = end_op0;
+        m_end_op1 = end_op1;
+    }
+
+    // Stores end op 0 and resets end op 1 to TT_OP_NOP.
+    inline void ckernel_template::set_end_op(uint end_op0)
+    {
+        set_end_ops(end_op0, TT_OP_NOP);
+    }
+
+    // Stores the start op (m_start_op0).
+    inline void ckernel_template::set_start_op(uint start_op0)
+    {
+        m_start_op0 = start_op0;
+    }
+
+    // Overrides m_loop1_last_instr, which the constructors initialise to the last loop op.
+    inline void ckernel_template::set_last_inner_loop_instr(uint op)
+    {
+        m_loop1_last_instr = op;
+    }
+
+    // Overrides m_loop0_last_instr, which the constructors initialise to the last loop op.
+    inline void ckernel_template::set_last_outer_loop_instr(uint op)
+    {
+        m_loop0_last_instr = op;
+    }
+
+    // Convenience wrapper: program() followed by run().
+    inline void ckernel_template::program_and_run(volatile uint *instrn_buffer)
+    {
+        program(instrn_buffer);
+        run(instrn_buffer);
+    }
+
+    // Issues TTI_MOP(1, 0, 0); the template must already have been written by program().
+    inline void ckernel_template::run(volatile uint *instrn_buffer)
+    {
+        TTI_MOP(1, 0, 0); // run the double-loop template
+    }
+
+    // Calls mop_sync(), then writes the nine template words starting at TENSIX_MOP_CFG_BASE.
+    inline void ckernel_template::program(volatile uint *instrn_buffer)
+    {
+        volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
+
+        mop_sync(); // wait until previous mops have completed
+
+        mop_cfg[0] = m_outer_loop_len;
+        mop_cfg[1] = m_inner_loop_len;
+        mop_cfg[2] = m_start_op0;
+        mop_cfg[3] = m_end_op0;
+        mop_cfg[4] = m_end_op1;
+        mop_cfg[5] = m_loop_op0;
+        mop_cfg[6] = m_loop_op1;
+        mop_cfg[7] = m_loop0_last_instr;
+        mop_cfg[8] = m_loop1_last_instr;
+    }
+
+    // Convenience wrapper: program() followed by run() with the given count and zmask.
+    inline void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
+    {
+        program(instrn_buffer);
+        run(instrn_buffer, count, zmask);
+    }
+
+    // Issues TT_MOP with count - 1; the upper 16 bits of zmask go through TT_MOP_CFG, the lower 16 through TT_MOP.
+    inline void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
+    {
+        FWASSERT("Unpack template only supports loops up to 128", count <= 128);
+        TT_MOP_CFG(zmask >> 16);              // Set the top 16 bits of zmask - we could skip this for count <= 16
+        TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template
+    }
+
+    // Version without zmask, should be slightly faster by eliminating one instruction.
+    inline void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count)
+    {
+        FWASSERT("Unpack template only supports loops up to 128", count <= 128);
+        TT_MOP(0, count - 1, 0); // Run the template
+    }
+
+    // Calls mop_sync(), then writes words 1..8 at TENSIX_MOP_CFG_BASE; word 0 is not written here.
+    inline void ckernel_unpack_template::program(volatile uint *instrn_buffer) const
+    {
+        volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
+
+        mop_sync(); // wait until previous mops have completed
+
+        mop_cfg[1] = m_unpackB | (m_unpack_halo << 1);
+        mop_cfg[2] = m_B_instr;
+        mop_cfg[3] = m_A0_instr;
+        mop_cfg[4] = m_A1_instr;
+        mop_cfg[5] = m_A2_instr;
+        mop_cfg[6] = m_A3_instr;
+        mop_cfg[7] = m_skipA_instr;
+        mop_cfg[8] = m_skipB_instr;
+    }
+
+    // Factory with the "src B" and halo flags off: A_instr and skipA_instr are the 3rd and 7th constructor arguments, the rest are 0.
+    inline ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr)
+    {
+        return ckernel_unpack_template(false, // src B
+            false,                            // halo
+            A_instr, 0, 0, 0, skipA_instr, 0, 0);
+    }
+
+    // NOTE(review): same argument layout as lA() - B_instr and skipB_instr land in the 3rd and 7th constructor
+    // arguments and the "src B" flag stays false; confirm against the constructor that this is intended.
+    inline ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr)
+    {
+        return ckernel_unpack_template(false, // src B
+            false,                            // halo
+            B_instr, 0, 0, 0, skipB_instr, 0, 0);
+    }
+
+    // Factory with halo on: passes DEF_NINFSRCA (neginf) or DEF_ZEROSRCA first, then A_instr, then two DEF_UNPACR_NOPs.
+    inline ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr)
+    {
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0);
+    }
+
+    // Halo factory with "src B" off: for each of the low four halo_mask bits, picks the DEF_A<n> instruction
+    // (the *_last_* variant when that bit is also set in last_mask) or the matching SKIP_A<n> instruction.
+    inline ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask)
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0);
+    }
+
+    // Same per-bit selection as lhA(), using the *_fconv_* instructions and TT_OP_NOP in place of DEF_SKIP_A.
+    inline ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask)
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0);
+    }
+
+    // Same per-bit selection as lhA() with "src B" on; rarefy chooses between the two DEF_B_*cntx_ovrd instructions.
+    inline ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy)
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+
+        return ckernel_unpack_template(true, // src B
+            true,                            // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B);
+    }
+
+    // Same per-bit selection as flhA() with "src B" on, passing DEF_B_cntx_ovrd_no_z_inc_instr and DEF_SKIP_B.
+    inline ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask)
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+
+        return ckernel_unpack_template(true, // src B
+            true,                            // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B);
+    }
+
+    // Factory with "src B" on and halo off: one A instruction and one B instruction, each with its skip op.
+    inline ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr,
+
+        uint B_instr, uint skipB_instr)
+    {
+        return ckernel_unpack_template(true, // src B
+            false,                           // halo
+            A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr);
+    }
+
+    // One-instruction loop: forwards to lA().
+    inline ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){
+        return ckernel_unpack_template::lA(instr0, skip0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){
+        // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA.
+        return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1);
+    }
+
 } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h b/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h
index b3c32f94d0e..6d3a1b6fa7b 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h
@@ -416,14 +416,4 @@ namespace ckernel::packer
    {
        dest_offset_id = 0;
    }
-
-   inline uint32_t get_output_id(uint32_t output)
-   {
-      return ((output) - OUTPUT_BASE);
-   }
-
-   inline constexpr uint32_t get_output_base_id()
-   {
-      return (OUTPUT_BASE_ID);
-   }
 }
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h b/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h
index 99e4a2c892f..33a3c7f0a58 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h
@@ -335,9 +335,4 @@ namespace ckernel::unpacker
       // Clear context ID
       //reset_config_context();
     }
-
-   inline uint32_t get_operand_id(uint32_t operand)
-   {
-      return operand;
-   }
 }
diff --git a/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc b/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc
deleted file mode 100644
index 238301e0566..00000000000
--- a/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc
+++ /dev/null
@@ -1,229 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include "ckernel_template.h"
-#include "debug/fw_debug.h"
-
-namespace ckernel
-{
-extern volatile uint *cfg_regs;
-
-ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op)
-    : m_outer_loop_len(outer_loop_len)
-    , m_inner_loop_len(inner_loop_len)
-    , m_loop_op0(loop_op)
-    , m_loop_op1(TT_OP_NOP)
-    , m_end_op0(TT_OP_NOP)
-    , m_end_op1(TT_OP_NOP)
-    , m_start_op0(TT_OP_NOP)
-{
-    m_loop0_last_instr = loop_op;
-    m_loop1_last_instr = loop_op;
-}
-
-ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1)
-    : m_outer_loop_len(outer_loop_len)
-    , m_inner_loop_len(inner_loop_len)
-    , m_loop_op0(loop_op0)
-    , m_loop_op1(loop_op1)
-    , m_end_op0(TT_OP_NOP)
-    , m_end_op1(TT_OP_NOP)
-    , m_start_op0(TT_OP_NOP)
-{
-    m_loop0_last_instr = loop_op1;
-    m_loop1_last_instr = loop_op1;
-}
-
-void ckernel_template::set_loop_op0(uint loop_op)
-{
-    m_loop_op0 = loop_op;
-}
-
-void ckernel_template::set_loop_op1(uint loop_op)
-{
-    m_loop_op1 = loop_op;
-}
-
-void ckernel_template::set_end_ops(uint end_op0, uint end_op1)
-{
-    m_end_op0 = end_op0;
-    m_end_op1 = end_op1;
-}
-
-void ckernel_template::set_end_op(uint end_op0)
-{
-    set_end_ops(end_op0, TT_OP_NOP);
-}
-
-void ckernel_template::set_start_op(uint start_op0)
-{
-    m_start_op0 = start_op0;
-}
-
-void ckernel_template::set_last_inner_loop_instr(uint op)
-{
-    m_loop1_last_instr = op;
-}
-
-void ckernel_template::set_last_outer_loop_instr(uint op)
-{
-    m_loop0_last_instr = op;
-}
-
-void ckernel_template::program_and_run(volatile uint *instrn_buffer)
-{
-    program(instrn_buffer);
-    run(instrn_buffer);
-}
-
-void ckernel_template::run(volatile uint *instrn_buffer)
-{
-    TTI_MOP(1, 0, 0); // run the double-loop template
-}
-
-void ckernel_template::program(volatile uint *instrn_buffer)
-{
-    volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
-
-    mop_sync(); // wait until previous mops have completed
-
-    mop_cfg[0] = m_outer_loop_len;
-    mop_cfg[1] = m_inner_loop_len;
-    mop_cfg[2] = m_start_op0;
-    mop_cfg[3] = m_end_op0;
-    mop_cfg[4] = m_end_op1;
-    mop_cfg[5] = m_loop_op0;
-    mop_cfg[6] = m_loop_op1;
-    mop_cfg[7] = m_loop0_last_instr;
-    mop_cfg[8] = m_loop1_last_instr;
-}
-
-void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
-{
-    program(instrn_buffer);
-    run(instrn_buffer, count, zmask);
-}
-
-void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
-{
-    FWASSERT("Unpack template only supports loops up to 128", count <= 128);
-    TT_MOP_CFG(zmask >> 16);              // Set the top 16 bits of zmask - we could skip this for count <= 16
-    TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template
-}
-
-// Version without zmask, should be slightly faster by eliminating one instruction.
-void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count)
-{
-    FWASSERT("Unpack template only supports loops up to 128", count <= 128);
-    TT_MOP(0, count - 1, 0); // Run the template
-}
-
-void ckernel_unpack_template::program(volatile uint *instrn_buffer) const
-{
-    volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
-
-    mop_sync(); // wait until previous mops have completed
-
-    mop_cfg[1] = m_unpackB | (m_unpack_halo << 1);
-    mop_cfg[2] = m_B_instr;
-    mop_cfg[3] = m_A0_instr;
-    mop_cfg[4] = m_A1_instr;
-    mop_cfg[5] = m_A2_instr;
-    mop_cfg[6] = m_A3_instr;
-    mop_cfg[7] = m_skipA_instr;
-    mop_cfg[8] = m_skipB_instr;
-}
-
-ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr)
-{
-    return ckernel_unpack_template(false, // src B
-        false,                            // halo
-        A_instr, 0, 0, 0, skipA_instr, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr)
-{
-    return ckernel_unpack_template(false, // src B
-        false,                            // halo
-        B_instr, 0, 0, 0, skipB_instr, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr)
-{
-    return ckernel_unpack_template(false, // src B
-        true,                             // halo
-        neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(false, // src B
-        true,                             // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(false, // src B
-        true,                             // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(true, // src B
-        true,                            // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B);
-}
-
-ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(true, // src B
-        true,                            // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr,
-
-    uint B_instr, uint skipB_instr)
-{
-    return ckernel_unpack_template(true, // src B
-        false,                           // halo
-        A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr);
-}
-
-ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){
-    return ckernel_unpack_template::lA(instr0, skip0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){
-    // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA.
-    return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1);
-}
-
-} // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h
index 4736137bb9e..2c28acf94e0 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h
@@ -90,66 +90,6 @@ enum ReluType {
   MAX_THRESHOLD_RELU
 };
 
-enum SfpuType
-{
-  tanh,
-  hardtanh,
-  gelu,
-  exponential,
-  exp_with_base,
-  sigmoid,
-  sigmoid_appx,
-  reciprocal,
-  sqrt,
-  rsqrt,
-  lrelu,
-  power,
-  square,
-  tanh_derivative,
-  log,
-  log_with_base,
-  equal_zero,
-  not_equal_zero,
-  less_than_zero,
-  greater_than_equal_zero,
-  less_than_equal_zero,
-  greater_than_zero,
-  clamp,
-  gelu_derivative,
-  dropout,
-  abs,
-  sign,
-  max,
-  min,
-  sine,
-  cosine,
-  tan,
-  relu_min,
-  relu_max,
-  elu,
-  exp2,
-  heaviside,
-  expm1,
-  signbit,
-  asin,
-  acos,
-  atan,
-  erf,
-  erfc,
-  isfinite,
-  isinf,
-  isposinf,
-  isneginf,
-  isnan,
-  logical_not_unary,
-  erfinv,
-  i0,
-  silu,
-  mask,
-  unused
-};
-
-
 enum SfpiTestType
 {
   logical_not,
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h
index 822699707d1..9ee2ba08a21 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h
@@ -5,12 +5,165 @@
 #pragma once
 #include <type_traits>
 
+#include "llk_sfpu_types.h"
 #include "ckernel_globals.h"
 #include "ckernel_include.h"
 #include "ckernel_template.h"
+#include "metal_ckernel_sfpu.h"
 #include "cmath_common.h"
 #include "llk_format_conversions.h"
 #include "llk_math_common.h"
 #include "llk_param_structs.h"
+#include "llk_math_eltwise_unary_sfpu.h"
 
-using namespace ckernel;
+// using namespace ckernel;
+// using namespace ckernel::sfpu;
+// namespace ckernel {
+
+// /*************************************************************************
+//  * LLK ELTWISE UNARY SFPU
+//  *************************************************************************/
+
+// template <
+//     SfpuType operation,
+//     bool APPROXIMATION_MODE,
+//     int SfpuType_PARAM = 0,
+//     int ITERATIONS = 8,
+//     bool IS_INT_SFPU_EN = false>
+// inline void llk_math_calculate_sfpu(
+//     const int iterations = ITERATIONS,
+//     uint param0 = 0,
+//     uint param1 = 0,
+//     uint param2 = 0,
+//     uint param3 = 0,
+//     uint param4 = 0,
+//     uint param5 = 0) {
+//     if constexpr (operation == SfpuType::exp_with_base) {
+//         constexpr bool zero_negative = true;
+//         _calculate_exponential_<APPROXIMATION_MODE, zero_negative, true, ITERATIONS>(iterations, param0);
+//     } else if constexpr (operation == SfpuType::tanh) {
+//         _calculate_tanh_<APPROXIMATION_MODE, ITERATIONS>(iterations);
+//     } else if constexpr (operation == SfpuType::hardtanh) {
+//         _calculate_hardtanh_<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1, param2);
+//     } else if constexpr (operation == SfpuType::rsqrt) {
+//         // param0 = true -> approximate fast mode
+//         //          false -> high precision mode
+//         //  The algorithm uses Newton's method; running more iterations yields a better approximation
+//         if (param0) {
+//             calculate_rsqrt<true, ITERATIONS, 10>();
+//         } else {
+//             calculate_rsqrt<false, ITERATIONS, 25>();
+//         }
+//     } else if constexpr (operation == SfpuType::sigmoid) {
+//         calculate_sigmoid<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::sigmoid_appx) {
+//         calculate_sigmoid_appx<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::tanh_derivative) {
+//         calculate_tanh_derivative<APPROXIMATION_MODE, SfpuType_PARAM, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::dropout) {
+//         calculate_dropout<APPROXIMATION_MODE, ITERATIONS>(param0, param1);
+//     } else if constexpr (operation == SfpuType::power) {
+//         calculate_power_iterative<APPROXIMATION_MODE, ITERATIONS>(param0);
+//     } else if constexpr (operation == SfpuType::square) {
+//         calculate_square<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::log) {
+//         calculate_log<APPROXIMATION_MODE, false, ITERATIONS>(param0);
+//     } else if constexpr (operation == SfpuType::log_with_base) {
+//         calculate_log<APPROXIMATION_MODE, true, ITERATIONS>(param0);
+//     } else if constexpr (
+//         (operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) ||
+//         (operation == SfpuType::less_than_zero) || (operation == SfpuType::greater_than_equal_zero) ||
+//         (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) {
+//         calculate_comp<APPROXIMATION_MODE, operation, ITERATIONS>(8);  // BFLOAT16 - exp
+//     } else if constexpr (operation == SfpuType::clamp) {
+//         calculate_clamp<APPROXIMATION_MODE, ITERATIONS>(param0, param1, param2);
+//     } else if constexpr (operation == SfpuType::abs) {
+//         calculate_abs<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::sign) {
+//         calculate_sign<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::max) {
+//         calculate_max<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::min) {
+//         calculate_min<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::exp2) {
+//         calculate_exp2<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::heaviside) {
+//         calculate_heaviside<APPROXIMATION_MODE, ITERATIONS>(param0);
+//     } else if constexpr (operation == SfpuType::expm1) {
+//         calculate_expm1<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::asin) {
+//         calculate_asin<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::acos) {
+//         calculate_acos<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::atan) {
+//         calculate_atan<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::signbit) {
+//         calculate_signbit<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::silu) {
+//         calculate_silu<APPROXIMATION_MODE, ITERATIONS>();
+//     }
+// }
+
+// template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull, bool IS_INT_SFPU_EN = false>
+// inline void llk_math_eltwise_unary_sfpu(
+//     uint dst_index,
+//     int vector_mode = (int)Dim::RC,
+//     uint param0 = 0,
+//     uint param1 = 0,
+//     uint param2 = 0,
+//     uint param3 = 0,
+//     uint param4 = 0,
+//     uint param5 = 0) {
+//     const std::uint32_t operand_id = get_operand_id(0); // Fix to operand 0. assume no tiny-tile support
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+
+//     constexpr int ITERATIONS = 8;
+
+//     _llk_math_eltwise_unary_sfpu_start_<Dst>(dst_index);
+
+//     if (vector_mode == (int)Dim::R) {
+//         // Do a row vector, Face0 + Face1 -- first iteration (first row)
+//         const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 2 : face_r_dim / 2)
+//                                                : 2;  // At least 2 iterations for odd and even columns
+// #pragma GCC unroll 0
+//         for (int face = 0; face < 2; face++) {
+//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+//                 iterations, param0, param1, param2, param3, param4, param5);
+//             // Move to the next face
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         }
+//         // Skip next two faces
+//         _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//     } else if (vector_mode == (int)Dim::C) {
+//         // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tile is 32x16 -- All iterations for
+//         // full face
+// #pragma GCC unroll 0
+//         for (int face = 0; face < 2; face++) {
+//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+//                 ITERATIONS, param0, param1, param2, param3, param4, param5);
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//             if (num_faces > 2) {  // Skip next face if tile is 32x32
+//                 _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//             }
+//         }
+//         if (num_faces <= 2) {
+//             // Skip next two faces
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         }
+//     } else {
+//         // Do all four faces, and iterate through all 4 blocks of 4 rows each
+// #pragma GCC unroll 0
+//         for (int face = 0; face < 4; face++) {
+//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+//                 ITERATIONS, param0, param1, param2, param3, param4, param5);
+//             // Move to the next face
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         }
+//     }
+//     _llk_math_eltwise_unary_sfpu_done_();
+// }
+
+// }  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h
index a6f0e32f133..97c0b3d4909 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h
@@ -149,7 +149,7 @@ inline void llk_pack_init() {
 template <bool out_of_order_output = false, DstSync Dst = SyncFull, bool untilize = false>
 inline void llk_matmul_pack(std::uint32_t start_tile_index, std::uint32_t output, uint32_t ntiles, std::uint32_t output_tile_index = 0) {
     std::uint8_t output_id = get_output_id(output);
-    constexpr std::uint8_t OUTPUT_BASE_ID = (std::uint8_t) get_output_base_id();
+    const std::uint8_t OUTPUT_BASE_ID = (std::uint8_t) get_output_base_id();
 
     static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!");
 
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h
index 3f1e76ea0d1..03848f843e5 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h
@@ -6,6 +6,9 @@
 #include "llk_io_unpack.h"
 #include "llk_param_structs.h"
 
+//TODO: Remove with GS uplift
+#include "llk_operands.h"
+
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h
index ee9127628ce..307e94b25ef 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h
@@ -6,6 +6,9 @@
 #include "llk_io_unpack.h"
 #include "llk_param_structs.h"
 
+//TODO: Remove with GS uplift
+#include "llk_operands.h"
+
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h
index b438a9715c5..78a28594917 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h
@@ -7,6 +7,9 @@
 #include "llk_io_unpack.h"
 #include "llk_param_structs.h"
 
+//TODO: Remove with GS uplift
+#include "llk_operands.h"
+
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h
index 3269aab1192..2c46633312f 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h
@@ -11,6 +11,9 @@
 #include "llk_param_structs.h"
 #include "llk_io_unpack.h"
 
+//TODO: Remove with GS uplift
+#include "llk_operands.h"
+
 #ifdef PERF_DUMP
 #include "ckernel_perf_api.h"
 #endif
diff --git a/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h
new file mode 100644
index 00000000000..7800a9934d7
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+//TODO: Delete this file once the redefinition errors are fixed;
+// its functions should then be moved to ckernel_globals.h
+#pragma once
+
+#include <cstdint>
+#include "ckernel_structs.h"
+#include "risc_attribs.h"
+#include "tensix_functions.h"
+#include "hostdevcommon/common_runtime_address_map.h"
+
+extern uint32_t __ldm_bss_start[];
+extern uint32_t __ldm_bss_end[];
+extern uint32_t __ldm_data_start[];
+extern uint32_t __ldm_data_end[];
+extern void (* __init_array_start[])();
+extern void (* __init_array_end[])();
+extern uint32_t __firmware_start[];
+
+extern void kernel_init();
+extern void kernel_launch();
+
+inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
+    // Copies len 32-bit words from l1_addr to local_mem_addr; the bulk loop reads six words into locals before storing them, to cover the 6-cycle L1 load latency
+    int32_t n = 0;  // index of the next word to copy
+    while (n < len - 5) {  // taken only while at least six words remain
+        uint32_t v0 = l1_addr[n + 0];
+        uint32_t v1 = l1_addr[n + 1];
+        uint32_t v2 = l1_addr[n + 2];
+        uint32_t v3 = l1_addr[n + 3];
+        uint32_t v4 = l1_addr[n + 4];
+        uint32_t v5 = l1_addr[n + 5];
+        local_mem_addr[n + 0] = v0;
+        local_mem_addr[n + 1] = v1;
+        local_mem_addr[n + 2] = v2;
+        local_mem_addr[n + 3] = v3;
+        local_mem_addr[n + 4] = v4;
+        local_mem_addr[n + 5] = v5;
+        n += 6;
+    }
+    // Tail: copy the remaining (fewer than six) words one at a time. Could be optimized further (e.g. a loop of 2 or 4), but probably not worth it
+    while (n < len) {
+        local_mem_addr[n] = l1_addr[n];
+        n++;
+    }
+}
+
+inline void firmware_kernel_common_init(void *init_local_l1_base) {
+    // Start-up work typically done in crt0 assembly, written in C because that is easier. Steps, in order:
+    // 1) hand the [__ldm_bss_start, __ldm_bss_end) range to wzerorange (presumably zero-fills it -- confirm against wzerorange's definition)
+    wzerorange(__ldm_bss_start, __ldm_bss_end);
+    // 2) copy the words between __ldm_data_start and __ldm_data_end from the image based at init_local_l1_base
+    int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;  // byte span >> 2 = count of 4-byte words
+    uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE;  // byte offset of __ldm_data_start from MEM_LOCAL_BASE
+    l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words);
+    // 3) call every function pointer in [__init_array_start, __init_array_end), in array order
+    for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
+        (**fptr)();
+    }
+}
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h
new file mode 100644
index 00000000000..317c14707ca
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h
@@ -0,0 +1,86 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_eltwise_binary.h"
+
+// /*************************************************************************
+//  * LLK ELTWISE BINARY
+//  *************************************************************************/
+
+// // Version with no operand
+// template <
+//     EltwiseBinaryType eltwise_binary_type,
+//     BroadcastType src_b_bcast_type,
+//     int NUM_FIDELITY_PHASES = 0,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
+// inline void llk_math_eltwise_binary_init(const std::uint32_t transpose = 0, const std::uint32_t acc_to_dest = 0) {
+//     const std::uint32_t num_faces = 4;
+
+//     _llk_math_eltwise_binary_init_<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(
+//         num_faces, transpose, acc_to_dest);
+// }
+
+// // Version with operands
+// template <
+//     EltwiseBinaryType eltwise_binary_type,
+//     BroadcastType src_b_bcast_type,
+//     int NUM_FIDELITY_PHASES = 0,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
+// inline void llk_math_eltwise_binary_init_with_operands(
+//     const std::uint32_t operand_A,
+//     const std::uint32_t operand_B,
+//     const std::uint32_t transpose = 0,
+//     const std::uint32_t acc_to_dest = 0) {
+//     const std::uint32_t operand_id =
+//         get_operand_id(operand_A);  // operand_id is used to extract tile dim data which is the same for both operands
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+
+//     _llk_math_eltwise_binary_init_<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(
+//         num_faces, transpose, acc_to_dest);
+// }
+
+// template <
+//     EltwiseBinaryType eltwise_binary_type,
+//     BroadcastType src_b_bcast_type,
+//     DstSync Dst = DstSync::SyncFull,
+//     int NUM_FIDELITY_PHASES = 0,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+//     bool is_fp32_dest_acc_en = false>
+// inline void llk_math_eltwise_binary(uint dst_index, const bool clear_fp32_dst_acc = true) {
+//     const std::uint32_t num_faces = 4;
+
+//     _llk_math_eltwise_binary_<
+//         eltwise_binary_type,
+//         src_b_bcast_type,
+//         Dst,
+//         NUM_FIDELITY_PHASES,
+//         binary_reuse_dest,
+//         is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc);
+// }
+
+// template <
+//     EltwiseBinaryType eltwise_binary_type,
+//     BroadcastType src_b_bcast_type,
+//     DstSync Dst = DstSync::SyncFull,
+//     int NUM_FIDELITY_PHASES = 0,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+//     bool is_fp32_dest_acc_en = false>
+// inline void llk_math_eltwise_binary(
+//     const std::uint32_t operand_A,
+//     const std::uint32_t operand_B,
+//     uint dst_index,
+//     const bool clear_fp32_dst_acc = true) {
+//     const std::uint32_t operand_id = get_operand_id(operand_A);  // both operands must have same number of faces
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+
+//     _llk_math_eltwise_binary_<
+//         eltwise_binary_type,
+//         src_b_bcast_type,
+//         Dst,
+//         NUM_FIDELITY_PHASES,
+//         binary_reuse_dest,
+//         is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h
new file mode 100644
index 00000000000..21c3e8ae428
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h
@@ -0,0 +1,70 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_eltwise_binary_sfpu.h"
+
+// /*************************************************************************
+//  * LLK ELTWISE BINARY SFPU
+//  *************************************************************************/
+
+// template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
+// inline void llk_math_eltwise_binary_sfpu(
+//     const uint operand,
+//     uint dst_index_a,
+//     uint dst_index_b,
+//     int vector_mode = (int)Dim::RC,
+//     uint param0 = 0,
+//     uint param1 = 0,
+//     uint param2 = 0,
+//     uint param3 = 0,
+//     uint param4 = 0,
+//     uint param5 = 0) {
+//     const std::uint32_t operand_id = get_operand_id(0);
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+
+//     _llk_math_eltwise_binary_sfpu_<sfpu_op, APPROXIMATE, Dst>(
+//         face_r_dim, num_faces, dst_index_a, dst_index_b, vector_mode, param0, param1, param2, param3, param4, param5);
+// }
+
+// template <SfpuType sfpu_op, bool APPROXIMATE>
+// inline void llk_math_eltwise_binary_sfpu_init(
+//     uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) {
+//     _llk_math_eltwise_binary_sfpu_init_<sfpu_op, APPROXIMATE>(param0, param1, param2, param3, param4, param5);
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_binary_sfpu_quant_int32(
+//     uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+//     llk_math_eltwise_binary_sfpu<SfpuType::quant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_binary_sfpu_quant_int32_init(const uint zero_point) {
+//     llk_math_eltwise_binary_sfpu_init<SfpuType::quant_int32, APPROXIMATE>(zero_point);
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_binary_sfpu_requant_int32(
+//     uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+//     llk_math_eltwise_binary_sfpu<SfpuType::requant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_binary_sfpu_requant_int32_init(const uint zero_point) {
+//     llk_math_eltwise_binary_sfpu_init<SfpuType::requant_int32, APPROXIMATE>(zero_point);
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_binary_sfpu_dequant_int32(
+//     uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+//     llk_math_eltwise_binary_sfpu<SfpuType::dequant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_binary_sfpu_dequant_int32_init(const uint zero_point) {
+//     llk_math_eltwise_binary_sfpu_init<SfpuType::dequant_int32, APPROXIMATE>(zero_point);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h
new file mode 100644
index 00000000000..3da220f0cba
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h
@@ -0,0 +1,108 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "ckernel_globals.h"
+#include "ckernel_template.h"
+#include "cmath_common.h"
+#include "llk_defs.h"
+#include "llk_io.h"
+#include "llk_math_common.h"
+#include "llk_operands.h"
+#include "llk_param_structs.h"
+
+// // Need to revisit why we even need this
+// #define EPS 1.19209e-07  // std::numeric_limits::epsilon() for FP32
+
+// /*************************************************************************
+//  * LLK MATH COMMON
+//  *************************************************************************/
+
+// template <DstSync Dst>
+// inline void llk_math_wait_for_dest_available() {
+//     _llk_math_wait_for_dest_available_<Dst>();
+// }
+
+// template <DstSync Dst = SyncFull, bool is_fp32_dest_acc_en = false>
+// inline void llk_math_dest_section_done() {
+//     _llk_math_dest_section_done_<Dst, is_fp32_dest_acc_en>();
+// }
+
+// template <DstSync Dst, bool is_fp32_dest_acc_en = false>
+// inline void llk_math_pack_sync_init() {
+//     _llk_math_pack_sync_init_<Dst, is_fp32_dest_acc_en>();
+// }
+
+// template <bool mail2math = true, bool mail2pack = true>
+// inline void llk_math_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) {
+//     _llk_math_get_tile_<mail2math, mail2pack>(tile_index, p_tile);
+// }
+
+// template <bool mail2math = true, bool mail2pack = true>
+// inline void llk_math_release_tile(std::uint32_t operand) {
+//     _llk_math_release_tile_<mail2math, mail2pack>();
+// }
+
+// inline void llk_math_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { _llk_math_debug_dump_(data, byte_size); }
+
+// inline void llk_math_debug_dump_seek(std::uint8_t offset) { _llk_math_debug_dump_seek_(offset); }
+
+// inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_new_operand) {
+//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
+//     _llk_math_reconfig_data_format_srca_(unpack_dst_format[new_srca_operand_id]);
+// }
+
+// inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) {
+//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+//     _llk_math_reconfig_data_format_srcb_(unpack_dst_format[new_srcb_operand_id]);
+// }
+
+// inline void llk_math_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) {
+//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
+//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+
+//     _llk_math_reconfig_data_format_(unpack_dst_format[new_srca_operand_id], unpack_dst_format[new_srcb_operand_id]);
+// }
+
+// inline void llk_math_reconfig_data_format(
+//     const std::uint32_t srca_old_operand,
+//     const std::uint32_t srca_new_operand,
+//     const std::uint32_t srcb_old_operand,
+//     const std::uint32_t srcb_new_operand) {
+//     std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
+//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
+//     std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
+//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+
+//     if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id]) &&
+//         (unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) {
+//         llk_math_reconfig_data_format(srca_new_operand, srcb_new_operand);
+//     } else if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) {
+//         llk_math_reconfig_data_format_srca(srca_new_operand);
+//     } else if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) {
+//         llk_math_reconfig_data_format_srcb(srcb_new_operand);
+//     }
+// }
+
+// inline void llk_math_reconfig_data_format_srca(
+//     const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) {
+//     std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
+//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
+
+//     if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) {
+//         llk_math_reconfig_data_format_srca(srca_new_operand);
+//     }
+// }
+
+// inline void llk_math_reconfig_data_format_srcb(
+//     const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
+//     std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
+//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+
+//     if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) {
+//         llk_math_reconfig_data_format_srcb(srcb_new_operand);
+//     }
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h
new file mode 100644
index 00000000000..a12bcca1ef4
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h
@@ -0,0 +1,68 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_matmul.h"
+
+// /*************************************************************************
+//  * LLK MATMUL
+//  *************************************************************************/
+
+// template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout = DstTileFaceLayout::ColMajor>
+// inline void llk_math_matmul_init(
+//     const std::uint32_t operandA,
+//     const std::uint32_t operandB,
+//     const std::uint32_t transpose = 0,
+//     const std::uint32_t ct_dim = 1,
+//     const std::uint32_t rt_dim = 1,
+//     const std::uint32_t kt_dim = 1) {
+//     const std::uint32_t in0_id = get_operand_id(operandA);
+//     const std::uint32_t in1_id = get_operand_id(operandB);
+
+//     const bool partial_face = get_operand_partial_face(in0_id);
+
+//     const std::uint32_t in0_tile_r_dim = get_operand_tile_r_dim(in0_id);
+//     const std::uint32_t in0_tile_c_dim = get_operand_tile_c_dim(in0_id);
+//     const std::uint32_t in1_tile_r_dim = get_operand_tile_r_dim(in1_id);
+//     const std::uint32_t in1_tile_c_dim = get_operand_tile_c_dim(in1_id);
+
+// #ifdef ARCH_GRAYSKULL
+//     _llk_math_matmul_init_<NUM_FIDELITY_PHASES, FaceLayout>(
+//         in0_tile_r_dim,
+//         in0_tile_c_dim,
+//         in1_tile_r_dim,
+//         in1_tile_c_dim,
+//         partial_face,
+//         transpose,
+//         ct_dim,
+//         rt_dim,
+//         kt_dim);
+// #else
+//     _llk_math_matmul_init_<NUM_FIDELITY_PHASES, DstTileFaceLayout::RowMajor>(
+//         in0_tile_r_dim,
+//         in0_tile_c_dim,
+//         in1_tile_r_dim,
+//         in1_tile_c_dim,
+//         partial_face,
+//         transpose,
+//         ct_dim,
+//         rt_dim,
+//         kt_dim);
+// #endif
+// }
+
+// template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout = DstTileFaceLayout::ColMajor>
+// inline void llk_math_matmul(
+//     uint dst_index,
+//     const bool transpose = false,
+//     const std::uint32_t ct_dim = 1,
+//     const std::uint32_t rt_dim = 1,
+//     const std::uint32_t kt_dim = 1) {
+// #ifdef ARCH_GRAYSKULL
+//     _llk_math_matmul_<NUM_FIDELITY_PHASES, FaceLayout>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
+// #else
+//     _llk_math_matmul_<NUM_FIDELITY_PHASES, DstTileFaceLayout::RowMajor>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
+// #endif
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h
new file mode 100644
index 00000000000..c5f11d005f2
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h
@@ -0,0 +1,28 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_reduce.h"
+
+// /*************************************************************************
+//  * LLK REDUCE
+//  *************************************************************************/
+
+// template <
+//     PoolType type,
+//     ReduceDim dim,
+//     int num_fidelity_phases = 0,
+//     bool is_fp32_dest_acc_en = false,
+//     bool is_int_fpu_en = false>
+// inline void llk_math_reduce(const uint dst_index) {
+//     _llk_math_reduce_<type, dim, num_fidelity_phases, is_fp32_dest_acc_en, is_int_fpu_en>(dst_index);
+// }
+
+// template <PoolType type, ReduceDim dim, int num_fidelity_phases = 0>
+// inline void llk_math_reduce_init(
+//     const std::uint32_t within_face_16x16_transpose =
+//         0) {  // within_face_16x16_transpose used for unpack, ignored by math
+//     _llk_math_reduce_init_<type, dim, num_fidelity_phases>(within_face_16x16_transpose);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h
new file mode 100644
index 00000000000..ca2a5d39e40
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h
@@ -0,0 +1,36 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "llk_math_common_api.h"
+#include "llk_math_eltwise_unary_datacopy.h"
+
+// /*************************************************************************
+//  * LLK ELTWISE UNARY DATACOPY
+//  *************************************************************************/
+
+// template <
+//     DataCopyType type,
+//     BroadcastType src_b_bcast_type = BroadcastType::NONE,
+//     DstSync Dst = DstSync::SyncFull,
+//     bool is_fp32_dest_acc_en = false,
+//     bool unpack_to_dest = false>
+// inline void llk_math_eltwise_unary_datacopy(uint dst_index, uint operand = 0) {
+//     const std::uint32_t operand_id = get_operand_id(0);
+//     _llk_math_eltwise_unary_datacopy_<type, src_b_bcast_type, Dst, is_fp32_dest_acc_en, unpack_to_dest>(
+//         dst_index, unpack_src_format[operand_id], unpack_dst_format[operand_id]);
+// }
+
+// template <DataCopyType type, BroadcastType src_b_bcast_type = BroadcastType::NONE>
+// // within_face_16x16_transpose is used by unpacker, math does not transpose
+// inline void llk_math_eltwise_unary_datacopy_init(
+//     const std::uint32_t transpose_of_faces = 0 /*unused*/,
+//     const std::uint32_t within_face_16x16_transpose = 0 /* unused */,
+//     const std::uint32_t operand = 0) {
+//     const std::uint32_t operand_id = get_operand_id(0);
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     _llk_math_eltwise_unary_datacopy_init_<type, src_b_bcast_type>(
+//         transpose_of_faces, within_face_16x16_transpose, num_faces);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h
new file mode 100644
index 00000000000..53b9d1afe8b
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h
@@ -0,0 +1,293 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "metal_ckernel_sfpu.h"
+#include "llk_math_eltwise_unary_sfpu_init.h"
+
+// namespace ckernel {
+
+// /*************************************************************************
+// * LLK ELTWISE UNARY SFPU
+// *************************************************************************/
+
+// // New LLK SFPU APIs
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::rsqrt, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_rsqrt_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::rsqrt, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::log, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_log_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::log, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_log_with_base(uint dst_index,uint base_scale) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::log_with_base, APPROXIMATE, dst_sync>(dst_index,base_scale);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_log_with_base_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::log_with_base, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::tanh, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_signbit(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::signbit, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_signbit_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::signbit, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_tanh_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::tanh, APPROXIMATE>();
+// }
+
+// //sign
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_sign(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::sign, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_sign_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::sign, APPROXIMATE>();
+// }
+// template <DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_dropout(uint dst_index, int vector_mode, int integer_dropout, int scale_factor) {
+//     constexpr bool dont_care = false;
+//     llk_math_eltwise_unary_sfpu<SfpuType::dropout, dont_care, dst_sync>(dst_index, vector_mode, integer_dropout, scale_factor);
+// }
+
+// inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) {
+//     constexpr bool dont_care = false;
+//     constexpr uint dont_care_param = 0;
+
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::dropout, dont_care>(dont_care_param, dont_care_param, seed);
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::sigmoid, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_sigmoid_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::sigmoid, APPROXIMATE>();
+// }
+
+// //EQZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_eqz(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::equal_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_eqz_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::equal_zero, APPROXIMATE>();
+// }
+
+// //NEZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_nez(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::not_equal_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_nez_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::not_equal_zero, APPROXIMATE>();
+// }
+
+// //LTZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_ltz(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::less_than_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_ltz_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::less_than_zero, APPROXIMATE>();
+// }
+
+// //GTZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_gtz(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::greater_than_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_gtz_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::greater_than_zero, APPROXIMATE>();
+// }
+
+// //LEZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_lez(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::less_than_equal_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_lez_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::less_than_equal_zero, APPROXIMATE>();
+// }
+
+// //GEZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_gez(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::greater_than_equal_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_gez_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::greater_than_equal_zero, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::max, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_max_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::max, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::square, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_square_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::square, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::power, APPROXIMATE, dst_sync>(dst_index, vector_mode, pow);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_power_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::power, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::abs, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_abs_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::abs, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::cast_fp32_to_fp16a, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::cast_fp32_to_fp16a, APPROXIMATE>();
+// }
+
+// //EXP2
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_exp2(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::exp2, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_exp2_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::exp2, APPROXIMATE>();
+// }
+
+// //heaviside
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::heaviside, APPROXIMATE, dst_sync>(dst_index,vector_mode,param0);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_heaviside_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::heaviside, APPROXIMATE>();
+// }
+
+// //EXPM1
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_expm1(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::expm1, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_expm1_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::expm1, APPROXIMATE>();
+// }
+
+// //Asin
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_asin(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::asin, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_asin_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::asin, APPROXIMATE>();
+// }
+
+// //Atan
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_atan(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::atan, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_atan_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::atan, APPROXIMATE>();
+// }
+
+// //Acos
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_acos(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::acos, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_acos_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::acos, APPROXIMATE>();
+// }
+
+// //silu
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_silu(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::silu, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_silu_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::silu, APPROXIMATE>();
+// }
+
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h
new file mode 100644
index 00000000000..ca7e298a7c2
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+extern uint32_t op_info_offset;  // Defined elsewhere; offset from OP_INFO_BASE_ADDR of the next op_info entry, advanced and wrapped by llk_get_next_op_info().
+
+// Copies the next op_info entry (op_info_num_items 32-bit words read from OP_INFO_BASE_ADDR + op_info_offset)
+// into op_info_struct word by word, then advances op_info_offset by one entry, wrapping it back to 0.
+inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) {
+    static constexpr uint32_t op_info_num_items = 7;
+    uint32_t* op_info_ptr = reinterpret_cast<uint32_t*>(OP_INFO_BASE_ADDR + op_info_offset);
+    volatile tt_l1_ptr uint32_t* op_info_struct_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(&op_info_struct);
+    for (uint32_t i = 0; i < op_info_num_items; i++) {
+        op_info_struct_ptr[i] = op_info_ptr[i];
+    }
+    op_info_offset += op_info_num_items * sizeof(uint32_t);  // one entry = 7 words = 28, derived instead of hard-coded
+    // Use '>=' rather than '==': if OP_INFO_SIZE is not an exact multiple of the entry size the offset would
+    // otherwise step over it and never be reset, so later calls would read out of bounds.
+    if (op_info_offset >= OP_INFO_SIZE) {
+        op_info_offset = 0;
+    }
+}
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h
new file mode 100644
index 00000000000..37ee8a0fe56
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h
@@ -0,0 +1,308 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "ckernel_template.h"
+#include "cpack_common.h"
+#include "ckernel_globals.h"
+#include "circular_buffer.h"
+
+#include "llk_io.h"
+#include "llk_defs.h"
+#include "llk_outputs.h"
+#include "llk_param_structs.h"
+#include "llk_pack.h"
+#include "llk_pack_common.h"
+
+/*************************************************************************
+* LLK PACK
+*************************************************************************/
+
+// template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
+// inline void llk_pack_mop_config(const uint32_t output) {
+
+//     const std::uint32_t output_id = get_output_id(output);
+//     const std::uint32_t num_faces = get_output_num_faces(output_id);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const bool partial_face = get_output_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     _llk_pack_mop_config_<untilize, zero_output, FaceLayout, false>(
+//         pack_dst_format[output_id],
+//         face_r_dim,
+//         num_faces,
+//         partial_face,
+//         narrow_tile
+//     );
+// }
+
+// template <bool untilize = false, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack_hw_configure(const llk_pack_params_t *pack_params) {
+
+//     const std::uint32_t output_id = get_output_id(pack_params->pack_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const std::uint32_t num_faces = get_output_num_faces(output_id);
+//     const bool partial_face = get_output_partial_face(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size;
+
+//     _llk_pack_hw_configure_<untilize, is_fp32_dest_acc_en>(
+//         pack_src_format[output_id],
+//         pack_dst_format[output_id],
+//         tile_size,
+//         face_r_dim,
+//         num_faces,
+//         partial_face,
+//         narrow_tile,
+//         pack_params->relu_config.val
+//     );
+// }
+
+// template <bool untilize = false, bool is_fp32_dest_acc_en = false, ReluType relu_type = ReluType::NO_RELU, std::uint32_t relu_threshold = 0>
+// inline void llk_pack_hw_configure_disaggregated(std::uint32_t pack_output) {
+//     llk_pack_params_t llk_pack_params = {
+//         .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold,}}};
+//     llk_pack_hw_configure<untilize, is_fp32_dest_acc_en>(&llk_pack_params);
+// }
+
+// template <bool untilize = false, PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) {
+//     const std::uint32_t output_id = get_output_id(pack_params->pack_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const std::uint32_t num_faces = get_output_num_faces(output_id);
+//     const bool partial_face = get_output_partial_face(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size;
+
+//     _llk_pack_reduce_hw_configure_<untilize, type, dim, is_fp32_dest_acc_en>(
+//         pack_src_format[output_id],
+//         pack_dst_format[output_id],
+//         tile_size,
+//         face_r_dim,
+//         num_faces,
+//         partial_face,
+//         narrow_tile,
+//         pack_params->relu_config.val
+//     );
+// }
+
+// template <bool untilize = false, PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, ReluType relu_type = ReluType::NO_RELU, std::uint32_t relu_threshold = 0>
+// inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output) {
+//     llk_pack_params_t llk_pack_params = {
+//         .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}};
+//     llk_pack_reduce_hw_configure<untilize, type, dim, is_fp32_dest_acc_en>(&llk_pack_params);
+// }
+
+// template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
+// inline void llk_pack_init(const std::uint32_t pack_output = 16) {
+
+//     const std::uint32_t output_id = get_output_id(pack_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const std::uint32_t num_faces = get_output_num_faces(output_id);
+//     const bool partial_face = get_output_partial_face(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     _llk_pack_init_<untilize, zero_output, FaceLayout, false>(
+//         pack_dst_format[output_id],
+//         face_r_dim,
+//         num_faces,
+//         partial_face,
+//         narrow_tile
+//     );
+// }
+
+// template <bool out_of_order_output, bool untilize>
+// inline std::uint32_t get_output_tile_address(std::uint8_t output_id, std::uint32_t output_tile_index) {
+
+//     std::uint32_t pack_tile_addr;
+//     if constexpr (out_of_order_output) {
+//         pack_tile_addr = cb_interface[output_id].fifo_wr_ptr +
+//                         (std::uint32_t)(cb_interface[output_id].fifo_page_size)*output_tile_index - 1;
+//     } else {
+//         if constexpr (untilize) {
+//             // FIXME: Need to support pack-untilize?
+//             // std::uint16_t out_tile_index = (cb_interface[output_id].ublock_tile_cnt/cb_interface[output_id].ublock_ct)*cb_interface[output_id].row_tile_dim +
+//             //                                 cb_interface[output_id].ublock_tile_cnt%cb_interface[output_id].ublock_ct; //FIXME: optimize perf
+//             // pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1;
+//             // pack_tile_addr += out_tile_index*(std::uint32_t)(cb_interface[output_id].fifo_page_size);
+
+//             // cb_interface[output_id].ublock_tile_cnt++;
+
+//             // if (cb_interface[output_id].ublock_tile_cnt == cb_interface[output_id].ublock_tile_dim) {
+//             //    cb_interface[output_id].ublock_tile_cnt=0;
+//             //    cb_interface[output_id].fifo_wr_tile_ptr += (std::uint32_t)(cb_interface[output_id].fifo_page_size)*cb_interface[output_id].ublock_ct;
+//             // }
+//         } else {
+//             pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1;
+//             cb_interface[output_id].fifo_wr_tile_ptr += cb_interface[output_id].fifo_page_size;
+//         }
+//     }
+//     return pack_tile_addr;
+// }
+
+// template <bool out_of_order_output = false, DstSync Dst = SyncFull, bool untilize = false, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0) {
+//     std::uint8_t output_id = get_output_id(output);
+
+//     static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!");
+
+//     std::uint32_t pack_tile_addr = get_output_tile_address<out_of_order_output, untilize>(output_id, output_tile_index);
+
+//     _llk_pack_<Dst, untilize, is_fp32_dest_acc_en>(
+//         tile_index,
+//         pack_tile_addr
+//     );
+// }
+
+// /*************************************************************************
+// * LLK PACK COMMON
+// *************************************************************************/
+
+
+// inline void llk_packer_wait_for_math_done() {
+//     _llk_packer_wait_for_math_done_();
+// }
+
+// template <uint WaitRes = p_stall::NONE>
+// inline void llk_packer_set_math_semaphore() {
+//     _llk_packer_set_math_semaphore_<WaitRes>();
+// }
+
+// template <DstSync Dst, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack_dest_section_done() {
+//     _llk_pack_dest_section_done_<Dst, is_fp32_dest_acc_en>();
+// }
+
+// template <DstSync Dst, DstTileFaceLayout FaceLayout, bool untilize = false>
+// inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 16) {
+//     const std::uint32_t output_id = get_output_id(pack_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     _llk_init_packer_dest_offset_registers_<Dst, FaceLayout, untilize>(
+//         face_r_dim,
+//         narrow_tile
+//     );
+// }
+
+// template <DstSync Dst, DstTileFaceLayout FaceLayout = RowMajor, bool untilize = false, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack_dest_init(const std::uint32_t pack_output = 16) {
+
+//     const std::uint32_t output_id = get_output_id(pack_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     _llk_pack_dest_init_<Dst, FaceLayout, untilize, is_fp32_dest_acc_en>(
+//         face_r_dim,
+//         narrow_tile
+//     );
+// }
+
+// template <bool mail2math=true, bool mail2pack=true>
+// inline void llk_pack_get_tile(std::uint32_t output, std::uint32_t tile_index, std::uint32_t *p_tile) {
+//     _llk_pack_get_tile_<mail2math, mail2pack>(tile_index, p_tile);
+// }
+
+// template <bool mail2math=true, bool mail2pack=true>
+// inline void llk_pack_release_tile(std::uint32_t output) {
+//     _llk_pack_release_tile_<mail2math, mail2pack>();
+// }
+
+// inline void llk_pack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {
+//     _llk_pack_debug_dump_(data, byte_size);
+// }
+
+// inline void llk_pack_debug_dump_seek(std::uint8_t offset) {
+//     _llk_pack_debug_dump_seek_(offset);
+// }
+
+// template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
+// inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) {
+
+//     const std::uint32_t output_id = get_output_id(new_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const std::uint32_t num_faces = get_output_num_faces(output_id);
+//     const bool partial_face = get_output_partial_face(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     _llk_pack_reconfig_data_format_<is_fp32_dest_acc_en, is_tile_dim_reconfig_en, FaceLayout>(
+//         pack_src_format[output_id],
+//         pack_dst_format[output_id],
+//         cb_interface[output_id].fifo_page_size,
+//         face_r_dim,
+//         num_faces,
+//         partial_face,
+//         narrow_tile
+//     );
+// }
+
+// template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
+// inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) {
+//     std::uint32_t old_output_id = get_output_id(old_output);
+//     std::uint32_t new_output_id = get_output_id(new_output);
+
+//     if((pack_dst_format[old_output_id] != pack_dst_format[new_output_id])
+//        && (pack_dst_format[old_output_id] != (uint)DataFormat::Invalid)
+//        && (pack_dst_format[new_output_id] != (uint)DataFormat::Invalid)) {
+//         llk_pack_reconfig_data_format<is_fp32_dest_acc_en, is_tile_dim_reconfig_en, FaceLayout>(new_output);
+//     } else if constexpr (is_tile_dim_reconfig_en) {
+//         // Same format but different tile dims
+//         llk_pack_mop_config<false, false, FaceLayout, false>(new_output);
+//     }
+// }
+
+// TT_ALWAYS_INLINE void llk_pack_relu_config(const std::uint32_t config) {
+//     _llk_pack_relu_config_(config);
+// }
+
+// inline void llk_pack_reconfig_l1_acc(const std::uint32_t enable) {
+//     _llk_pack_reconfig_l1_acc_(enable);
+// }
+
+// template <bool untilize = false, ReduceDim dim>
+// inline void llk_pack_reduce_mask_config() {
+//     _llk_pack_reduce_mask_config_<untilize, dim>();
+// }
+
+// inline void llk_pack_reduce_mask_clear() {
+//     _llk_pack_reduce_mask_clear_();
+// }
+
+// // FIXME-WH-UPLIFT
+// template <ReduceDim dim, bool at_kernel_start = false, bool revert=false, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack_reduce_config_v2(uint32_t icb_out) {
+
+//     const bool untilize = false;
+//     if constexpr (at_kernel_start) {
+
+//         const std::uint32_t output_id = get_output_id(icb_out);
+//         const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//         const std::uint32_t num_faces = get_output_num_faces(output_id);
+//         const bool partial_face = get_output_partial_face(output_id);
+//         const bool narrow_tile = get_output_narrow_tile(output_id);
+//         const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size;
+//         const llk_relu_config_u relu_config = {.f = {.ApplyRelu = (std::uint32_t)ReluType::NO_RELU, .Threshold = 0,}};
+
+//         _llk_pack_hw_configure_<untilize, is_fp32_dest_acc_en>(
+//             pack_src_format[output_id],
+//             pack_dst_format[output_id],
+//             tile_size,
+//             face_r_dim,
+//             num_faces,
+//             partial_face,
+//             narrow_tile,
+//             relu_config.val
+//         );
+//     }
+
+//     if constexpr (revert) {
+//         _llk_pack_reduce_mask_clear_();
+//     } else {
+//         _llk_pack_reduce_mask_config_<untilize, dim>();
+//     }
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_param_structs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_param_structs.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
new file mode 100644
index 00000000000..83a5fdcca92
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
@@ -0,0 +1,171 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <type_traits>
+
+#include "llk_sfpu_types.h"
+#include "ckernel_globals.h"
+#include "ckernel_include.h"
+#include "ckernel_template.h"
+#include "metal_ckernel_sfpu.h"
+#include "cmath_common.h"
+#include "llk_format_conversions.h"
+#include "llk_math_common.h"
+#include "llk_param_structs.h"
+#include "llk_math_eltwise_unary_sfpu.h"
+
+//TODO: Fix for GS uplift
+
+// using namespace ckernel;
+// using namespace ckernel::sfpu;
+// namespace ckernel {
+
+// /*************************************************************************
+//  * LLK ELTWISE UNARY SFPU
+//  *************************************************************************/
+
+// template <
+//     SfpuType operation,
+//     bool APPROXIMATION_MODE,
+//     int SfpuType_PARAM = 0,
+//     int ITERATIONS = 8,
+//     bool IS_INT_SFPU_EN = false>
+// inline void llk_math_calculate_sfpu(
+//     const int iterations = ITERATIONS,
+//     uint param0 = 0,
+//     uint param1 = 0,
+//     uint param2 = 0,
+//     uint param3 = 0,
+//     uint param4 = 0,
+//     uint param5 = 0) {
+//     if constexpr (operation == SfpuType::exp_with_base) {
+//         constexpr bool zero_negative = true;
+//         _calculate_exponential_<APPROXIMATION_MODE, zero_negative, true, ITERATIONS>(iterations, param0);
+//     } else if constexpr (operation == SfpuType::tanh) {
+//         _calculate_tanh_<APPROXIMATION_MODE, ITERATIONS>(iterations);
+//     } else if constexpr (operation == SfpuType::hardtanh) {
+//         _calculate_hardtanh_<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1, param2);
+//     } else if constexpr (operation == SfpuType::rsqrt) {
+//         // param0 = true -> approximate fast mode
+//         //          false -> high precision mode
+//         //  The algorithm uses Newton's method; more iterations yield a better approximation
+//         if (param0) {
+//             calculate_rsqrt<true, ITERATIONS, 10>();
+//         } else {
+//             calculate_rsqrt<false, ITERATIONS, 25>();
+//         }
+//     } else if constexpr (operation == SfpuType::sigmoid) {
+//         calculate_sigmoid<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::sigmoid_appx) {
+//         calculate_sigmoid_appx<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::tanh_derivative) {
+//         calculate_tanh_derivative<APPROXIMATION_MODE, SfpuType_PARAM, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::dropout) {
+//         calculate_dropout<APPROXIMATION_MODE, ITERATIONS>(param0, param1);
+//     } else if constexpr (operation == SfpuType::power) {
+//         calculate_power_iterative<APPROXIMATION_MODE, ITERATIONS>(param0);
+//     } else if constexpr (operation == SfpuType::square) {
+//         calculate_square<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::log) {
+//         calculate_log<APPROXIMATION_MODE, false, ITERATIONS>(param0);
+//     } else if constexpr (operation == SfpuType::log_with_base) {
+//         calculate_log<APPROXIMATION_MODE, true, ITERATIONS>(param0);
+//     } else if constexpr (
+//         (operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) ||
+//         (operation == SfpuType::less_than_zero) || (operation == SfpuType::greater_than_equal_zero) ||
+//         (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) {
+//         calculate_comp<APPROXIMATION_MODE, operation, ITERATIONS>(8);  // BFLOAT16 - exp
+//     } else if constexpr (operation == SfpuType::clamp) {
+//         calculate_clamp<APPROXIMATION_MODE, ITERATIONS>(param0, param1, param2);
+//     } else if constexpr (operation == SfpuType::abs) {
+//         calculate_abs<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::sign) {
+//         calculate_sign<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::max) {
+//         calculate_max<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::min) {
+//         calculate_min<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::exp2) {
+//         calculate_exp2<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::heaviside) {
+//         calculate_heaviside<APPROXIMATION_MODE, ITERATIONS>(param0);
+//     } else if constexpr (operation == SfpuType::expm1) {
+//         calculate_expm1<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::asin) {
+//         calculate_asin<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::acos) {
+//         calculate_acos<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::atan) {
+//         calculate_atan<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::signbit) {
+//         calculate_signbit<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::silu) {
+//         calculate_silu<APPROXIMATION_MODE, ITERATIONS>();
+//     }
+// }
+
+// template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull, bool IS_INT_SFPU_EN = false>
+// inline void llk_math_eltwise_unary_sfpu(
+//     uint dst_index,
+//     int vector_mode = (int)Dim::RC,
+//     uint param0 = 0,
+//     uint param1 = 0,
+//     uint param2 = 0,
+//     uint param3 = 0,
+//     uint param4 = 0,
+//     uint param5 = 0) {
+//     const std::uint32_t operand_id = get_operand_id(0); // Fix to operand 0. assume no tiny-tile support
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+
+//     constexpr int ITERATIONS = 8;
+
+//     _llk_math_eltwise_unary_sfpu_start_<Dst>(dst_index);
+
+//     if (vector_mode == (int)Dim::R) {
+//         // Do a row vector, Face0 + Face1 -- first iteration (first row)
+//         const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 2 : face_r_dim / 2)
+//                                                : 2;  // At least 2 iterations for odd and even columns
+// #pragma GCC unroll 0
+//         for (int face = 0; face < 2; face++) {
+//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+//                 iterations, param0, param1, param2, param3, param4, param5);
+//             // Move to the next face
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         }
+//         // Skip next two faces
+//         _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//     } else if (vector_mode == (int)Dim::C) {
+//         // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0 + Face1 if tile is 32x16 -- All iterations for
+//         // full face
+// #pragma GCC unroll 0
+//         for (int face = 0; face < 2; face++) {
+//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+//                 ITERATIONS, param0, param1, param2, param3, param4, param5);
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//             if (num_faces > 2) {  // Skip next face if tile is 32x32
+//                 _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//             }
+//         }
+//         if (num_faces <= 2) {
+//             // Skip next two faces
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         }
+//     } else {
+//         // Do all four faces, and iterate through all 4 blocks of 4 rows each
+// #pragma GCC unroll 0
+//         for (int face = 0; face < 4; face++) {
+//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+//                 ITERATIONS, param0, param1, param2, param3, param4, param5);
+//             // Move to the next face
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         }
+//     }
+//     _llk_math_eltwise_unary_sfpu_done_();
+// }
+
+// }  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
new file mode 100644
index 00000000000..50018e399c3
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
@@ -0,0 +1,780 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "llk_sfpu_types.h"
+#include "ckernel_defs.h"
+#include "ckernel_sfpu.h"
+#include "ckernel.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+#include "ckernel_sfpu_cdf.h"
+#include "ckernel_sfpu_exp.h"
+#include "ckernel_sfpu_recip.h"
+#include "ckernel_sfpu_converter.h"
+
+//TODO: Delete this file once GS uplift is done
+// using namespace sfpi;
+
+// namespace ckernel
+// {
+// namespace sfpu
+// {
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS, int RECIPROCAL_ITERATIONS>
+// inline void calculate_rsqrt()
+// {
+
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+
+//         vFloat in = dst_reg[0];
+//         v_if(dst_reg[0] == 0.0f){
+//             dst_reg[0] = std::numeric_limits<float>::infinity();
+//         }v_else{
+//             vFloat result = 1.0f;
+//             v_if(dst_reg[0] > 1.0f){
+//                 result = sfpu_reciprocal(in);
+//             }v_endif;
+
+//             for (int r = 0; r < RECIPROCAL_ITERATIONS; r++)
+//             {
+//                 // y = y * (1.5 - 0.5 * x * y * y) Newton's method iteration.
+//                 result = result * (1.5F - 0.5F  * dst_reg[0] * result * result);
+//             }
+//             dst_reg[0] = result;
+//         }v_endif;
+
+//         dst_reg++;
+
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_sigmoid_appx()
+// {
+//     vUInt l0 = l_reg[LRegs::LReg0];
+//     vUInt l1 = l_reg[LRegs::LReg1];
+//     vUInt l2 = l_reg[LRegs::LReg2];
+
+//     #pragma GCC unroll 8
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+
+//         dst_reg[0] = lut(val, l0, l1, l2) + 0.5f;
+
+//         dst_reg++;
+//     }
+
+//     l_reg[LRegs::LReg0] = l0;
+//     l_reg[LRegs::LReg1] = l1;
+//     l_reg[LRegs::LReg2] = l2;
+// }
+
+// // TODO: Implement using bitwise comparison
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_signbit()
+// {
+
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         v_if (val <= -0.0f) {
+//             val = 1.0f;
+//         } v_elseif (val >= 0.0f) {
+//             val = 0.0f;
+//         }
+//         v_endif;
+//         dst_reg[0] = val;
+
+//        dst_reg++;
+//     }
+
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_tanh()
+// {
+//     // SFPU microcode
+//     vUInt l0 = l_reg[LRegs::LReg0];
+//     vUInt l1 = l_reg[LRegs::LReg1];
+//     vUInt l2 = l_reg[LRegs::LReg2];
+
+//     #pragma GCC unroll 8
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         val = lut(val, l0, l1, l2);
+//         dst_reg[0] = val;
+
+//         dst_reg++;
+//     }
+
+//     l_reg[LRegs::LReg0] = l0;
+//     l_reg[LRegs::LReg1] = l1;
+//     l_reg[LRegs::LReg2] = l2;
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_hardtanh(uint param0, uint param1, uint param2)
+// {
+//     // All params are in FP16_B format
+//     // param0 = -(neg_threshold)
+//     // param1 = -(pos_threshold - neg_threshold)
+//     // param2 = -(pos_threshold)
+
+//     vFloat p0 = s2vFloat16(param0);
+//     vFloat p1 = s2vFloat16(param1);
+//     vFloat p2 = s2vFloat16(param2);
+//     // SFPU microcode
+//     #pragma GCC unroll 0
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+
+//         val += p0;// 12 bits
+//         v_if (val < 0.0f) {
+//             val = 0.0f;
+//         }
+//         v_endif;
+
+//         val += p1;// 12 bits
+//         v_if (val >= 0.0f) {
+//             val = 0.0f;
+//         }
+//         v_endif;
+
+//         val += p2;// 12 bits
+
+//         dst_reg[0] = val;
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int WITH_PRECOMPUTED_TANH, int ITERATIONS>
+// inline void calculate_tanh_derivative()
+// {
+//     vUInt l0 = l_reg[LRegs::LReg0];
+//     vUInt l1 = l_reg[LRegs::LReg1];
+//     vUInt l2 = l_reg[LRegs::LReg2];
+
+//     // tanh'(x) = 1 - (tanh(x))^2
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+
+//         if constexpr (!WITH_PRECOMPUTED_TANH) {
+//             val = lut(val, l0, l1, l2);
+//         }
+
+//         val = val * (-val) + vConst1;
+//         dst_reg[0] = val;
+
+//         dst_reg++;
+//     }
+
+//     l_reg[LRegs::LReg0] = l0;
+//     l_reg[LRegs::LReg1] = l1;
+//     l_reg[LRegs::LReg2] = l2;
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_dropout(uint prob, uint scale)
+// {
+//     // SFPU microcode
+
+//     vUInt rand = l_reg[LRegs::LReg3];
+
+//     #pragma GCC unroll 0
+//     for (int d = 0; d < ITERATIONS; d++) {
+//         ////////////////////////
+//         // Scale samples
+//         ///////////////////////
+//         dst_reg[0] = dst_reg[0] * s2vFloat16b(scale);
+
+//         ////////////////////////
+//         // Drop samples
+//         ///////////////////////
+//         v_if (rand < prob) {
+//             dst_reg[0] = vConst0;
+//         }
+//         v_endif;
+
+//         ////////////////////////
+//         // 16-bit PRNG update
+//         ///////////////////////
+//         vUInt lfsr = vConstIntPrgm1;
+//         vUInt tmp = lfsr & rand;
+//         rand = rand >> 1;
+//         v_if (tmp != 0) {
+//             vUInt mask = vConstIntPrgm0;
+//             rand ^= mask;
+//         }
+//         v_endif;
+
+//         dst_reg++;
+//     }
+
+//     l_reg[LRegs::LReg3] = rand;
+// }
+
+// template <bool APPROXIMATION_MODE,int ITERATIONS>
+// inline void calculate_power_iterative(const uint exponent)
+// {
+//     #pragma GCC unroll 8
+//     for (int d = 0; d < 8; d++)
+//     {
+//         vFloat in = dst_reg[0];
+//         vFloat result = 1.0f;
+//         for (uint i = 0; i < exponent; i++) {
+//             result *= in;
+//         }
+// 	dst_reg[0]=result;
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_square()
+// {
+//     #pragma GCC unroll 8
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat in = dst_reg[0];
+//         vFloat result = in * in;
+
+//         dst_reg[0] = result;
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool HAS_BASE_SCALING>
+// sfpi_inline void calculate_log_body(const uint log_base_scale_factor)
+// {
+//     ////////////////////////////
+//     // Load From dest + "normalize to calculation range"
+//     ////////////////////////////
+//     vFloat in = dst_reg[0];
+//     vFloat x = setexp(in, 127);    // set exp to exp bias (put in range of 1-2)
+
+//     // XXXXXX ask Namal? if we can derive the coefficients below to higher precision
+//     ////////////////////////////
+//     // Calculate Cheby Approximation using Horner Form Multiplication: 3rd Order
+//     // x* ( x* (A*x + B) + C) + D
+//     // A :0.1058, B: -0.3942, C: 0.9813, D: 0.006
+//     // Run above on (x-1) so x is in ln(x+1), plug (x-1) into the equation above to
+//     // save the subtract and get A',B',C',D'):
+//     // A' = A
+//     // B' = -3A + B
+//     // C' = 3A - 2B + C
+//     // D' = -A + B - C + D
+//     // A':0.1058, B':-0.7116, C':2.0871, D':-1.4753
+//     ////////////////////////////
+//     vFloat a = vConstFloatPrgm1;
+//     vFloat b = vConstFloatPrgm2;
+//     // XXXXX try variants of the below: B'=.7122, C'=2.0869
+//     vFloat series_result = x * (x * (x * a + b) + 2.0871) + -1.4753f;
+
+//     ////////////////////////////
+//     // Convert exponent to float
+//     ////////////////////////////
+//     vInt exp = exexp(in);
+//     v_if (exp < 0) {
+//         exp = setsgn(~exp + 1, 1);
+//     }
+//     v_endif;
+
+//     vFloat expf = int32_to_float(exp, 0);
+//     vFloat vConstLn2 = vConstFloatPrgm0;
+//     vFloat result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2)
+
+//     if constexpr (HAS_BASE_SCALING) {
+//         result *= s2vFloat16a(log_base_scale_factor);
+//     }
+
+//     ////////////////////////////
+//     // Base case when input is 0. ln(0) = -inf
+//     ////////////////////////////
+//     v_if (in == 0.0F) { // Reload for register pressure
+//         result = -std::numeric_limits<float>::infinity();
+//     }
+//     v_endif;
+
+//     dst_reg[0] = result;
+// }
+
+// template <bool APPROXIMATION_MODE, bool HAS_BASE_SCALING, int ITERATIONS>
+// inline void calculate_log(uint log_base_scale_factor)
+// {
+//     #pragma GCC unroll 8
+//     for(int d = 0; d < ITERATIONS; d++){
+//         calculate_log_body<HAS_BASE_SCALING>(log_base_scale_factor);
+//         dst_reg++;
+//     }
+// }
+
+// sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& flag2, float init)
+// {
+//     flag1 = init;
+//     if (check) {
+//         flag2 = init;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, SfpuType COMP_MODE, int ITERATIONS>
+// inline void calculate_comp(uint exponent_size_8)
+// {
+//    const vFloat zero = 0.0f;
+//    const vFloat one = 1.0f;
+//    for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         vFloat flag1, flag2;
+
+// 	//a[i] == 0
+// 	if constexpr(COMP_MODE == SfpuType::equal_zero) {
+// 	    v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
+// 	      v = one;
+// 	    } v_else {
+// 	      v = zero;
+// 	    }
+// 	    v_endif;
+// 	  }
+
+// 	//a[i] != 0
+// 	if constexpr(COMP_MODE == SfpuType::not_equal_zero) {
+// 	    v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
+// 	      v = zero;
+// 	    } v_else {
+// 	      v = one;
+// 	    }
+// 	    v_endif;
+//         }
+
+// 	//a[i] < 0
+// 	if constexpr(COMP_MODE == SfpuType::less_than_zero) {
+// 	    v_if (v >= 0.0f) {
+// 	      v = zero;
+// 	    } v_else {
+// 	      v = one;
+// 	    }
+// 	    v_endif;
+//         }
+
+// 	//a[i] >= 0
+// 	if constexpr(COMP_MODE == SfpuType::greater_than_equal_zero) {
+// 	    v_if (v >= 0.0f) {
+// 	      v = one;
+// 	    } v_else {
+// 	      v = zero;
+// 	    }
+// 	    v_endif;
+//         }
+
+// 	//a[i] > 0
+// 	if constexpr(COMP_MODE == SfpuType::greater_than_zero) {
+// 	    v_if (v > 0.0f) {
+// 	      v = one;
+// 	    } v_else {
+// 	      v = zero;
+// 	    }
+// 	    v_endif;
+//         }
+
+// 	//a[i] <= 0
+// 	if constexpr(COMP_MODE == SfpuType::less_than_equal_zero) {
+// 	    v_if (v > 0.0f) {
+// 	      v = zero;
+// 	    } v_else {
+// 	      v = one;
+// 	    }
+// 	    v_endif;
+//         }
+
+// 	dst_reg[0] = v;
+// 	dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_clamp(uint param0, uint param1, uint param2)
+// {
+//     // All params are in FP16 format
+//     // param0 = min
+//     // param1 = max
+
+//     //uint format = (param0 >> 16)&0x1;
+//     s2vFloat16::Format format = s2vFloat16::fp16a;
+
+//     // SFPU microcode
+//     vFloat min = s2vFloat16(param0, format);
+//     vFloat max = s2vFloat16(param1, format);
+//     #pragma GCC unroll 0
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+
+//         v_if (val < min) {
+//             val = s2vFloat16(param0, format);
+//         } v_elseif (val >= max) {
+//             val = s2vFloat16(param1, format);
+//         }
+//         v_endif;
+
+//         dst_reg[0] = val + s2vFloat16b(param2); // 12 bits
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_abs()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         dst_reg[0] = sfpi::abs(v);
+//         dst_reg++;
+//     }
+// }
+
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_exp2()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         // log(2) = 0.6931471805;
+//         v = v * 0.6931471805f;
+// 	    // exp = e^(v)
+// 	    vFloat exp = calculate_exponential_body_improved<APPROXIMATION_MODE, true>(v);
+// 	    dst_reg[0] = exp;
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_sign()
+// {
+//     // All params are in FP16 format
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+// 	vFloat result = vConst1;
+//         v_if (v < 0.0f) {
+//            result = vConstNeg1;
+//         } v_elseif(v > 0.0f) {
+// 	  result = vConst1;
+// 	} v_else {
+// 	  result = vConst0;
+//         }
+//         v_endif;
+
+// 	dst_reg[0] = result;
+//         dst_reg++;
+//     }
+// }
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_max()
+// {
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat a = dst_reg[0];
+//         vFloat b = dst_reg[32];
+//         v_if(a < b) {
+//             dst_reg[0] = b;
+//         }
+//         v_endif;
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_min()
+// {
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat a = dst_reg[0];
+//         vFloat b = dst_reg[32];
+//         v_if(a > b) {
+//             dst_reg[0] = b;
+//         }
+//         v_endif;
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_expm1()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         v = calculate_exponential_body_improved<APPROXIMATION_MODE, true>(v);
+//         dst_reg[0] = v - 1.0f;
+//         dst_reg++;
+//     }
+// }
+
+
+// #define POLYVAL6(coef5, coef4, coef3, coef2, coef1, coef0, t4)  (t4 * (t4 * (t4 * (t4 * (coef5 * t4 + coef4) + coef3) + coef2) + coef1) + coef0)
+
+// template <bool APPROXIMATION_MODE>
+// sfpi_inline vFloat sfpu_atan_maclaurin_series(vFloat val)
+// {
+//     v_if(1 > sfpi::abs(val)){
+//         dst_reg[0] = sfpi::abs(val)  ;
+//     }
+//     v_else{
+//         dst_reg[0] =  sfpu_reciprocal(sfpi::abs(val));
+//     }
+//     v_endif;
+
+//     vFloat t1 = dst_reg[0] * dst_reg[0];
+
+//     t1 = POLYVAL6(-0.013480470f, 0.057477314f, -0.121239071f, 0.195635925f, -0.332994597f, 0.999995630f, t1);
+
+//     t1 = t1 * dst_reg[0];
+
+//     v_if (sfpi::abs(val) > 1){
+//         t1 = 1.570796327f - t1;
+//     }
+//     v_endif;
+
+//     v_if(val < 0 ){
+//         t1 = -t1;
+//     }
+//     v_endif;
+
+//     return t1;
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_atan()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         val = sfpu_atan_maclaurin_series<APPROXIMATION_MODE>(val);
+//         dst_reg[0] = val;
+//         dst_reg++;
+//     }
+// }
+
+
+// template <bool APPROXIMATION_MODE>
+// sfpi_inline vFloat sfpu_asine_maclaurin_series(vFloat val)
+// {
+//     // input for [-1:1]
+//     // Maclaurin series
+//     // arcsin(x) = x + [(1/2) *x^3/3] + [(1 * 3) / (2 * 4) * x^5 / 5] + [(1 * 3 * 5) / (2 * 4 * 6) * x^7 / 7 ] + ...
+//     // arcsin(x) ≈ x + (1/6) * x^3 + (3/40) * x^5 + (5/112) * x^7 + (35/1152) * x^9 + (63/2816) * x^11
+
+//     vFloat tmp = val;
+//     vFloat val_square = val * val;
+//     // x
+//     vFloat output = tmp;
+//     // (1/6) * x^3
+//     tmp = tmp * val_square;
+//     output += 0.166666666 * tmp;
+//     // (3/40) * x^5
+//     tmp = tmp * val_square;
+//     output +=  0.075 * tmp;
+
+//     //(5/112) * x^7
+//     tmp = tmp * val_square;
+//     output += 0.044642857 * tmp;
+
+//     // (35/1152) *x^9
+//     tmp = tmp * val_square;
+//     output += 0.03038194 * tmp;
+
+//     //(63/2816) * x^11
+//     tmp = tmp * val_square;
+//     output += 0.02237216 * tmp;
+
+//     // Write out output
+//     return output;
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_asin()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         v = sfpu_asine_maclaurin_series<APPROXIMATION_MODE>(v);
+//         dst_reg[0] = v;
+//         dst_reg++;
+//     }
+// }
+
+
+// #define PI_2 (1.570796326794)
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_acos()
+// {
+//     // SFPU microcode
+//     // acos = (pi/2 - asin)
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         v = sfpu_asine_maclaurin_series<APPROXIMATION_MODE>(v);
+//         v = PI_2 - v;
+//         dst_reg[0] = v;
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void cast_fp32_to_fp16a()
+// {
+//     #pragma GCC unroll 8
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         //vFloat val = dst_reg[0];
+//         //dst_reg[0] = float_to_fp16a(val, 0);
+//         TTI_SFPLOAD(0, 0, 3, 0);
+//         TTI_SFP_STOCH_RND(0,0,0,0,0,8);
+//         TTI_SFPSTORE(0,1,3,0);
+//         dst_reg++;
+//     }
+// }
+
+
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_negative()
+// {
+
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         dst_reg[0] = -val;
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_add1()
+// {
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         dst_reg[0] = 1.0f + val;
+//         dst_reg++;
+//     }
+// }
+
+// inline
+// vFloat sigmoid_piecewise_linear_positive(vFloat val) {
+//         vFloat result = 0.0f;
+// 	v_if ( val >= +5.0f)  {
+// 	  result = 1.0f;
+// 	} v_elseif ( val > 1.0f && val < 5.0f ) {
+// 	  result = POLYVAL5(0.00144462f, -0.01055479f, -0.01203685f,  0.24300185f,  0.50437757f,val);
+// 	} v_else {
+// 	  result = 0.229f*val + 0.5f; // linear appx as y = 0.229x + 0.5
+// 	}
+// 	v_endif;
+// 	return result;
+// }
+
+// //sigmoid is anti-symmetric and offset by 1
+// //sigmoid[-x] = 1 - sigmoid[x]
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_sigmoid()
+// {
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         vFloat result = 0.0f;
+
+//         v_if ( val < 0.0f ) {
+//   	   val = -val;
+//         }
+//         v_endif;
+
+// 	result = sigmoid_piecewise_linear_positive(val);
+
+// 	val = dst_reg[0];
+//         v_if ( val < 0.0f ) {
+//             result = 1.0f - result;
+//         }
+//         v_endif;
+
+//         dst_reg[0] = result;
+//         dst_reg++;
+//     }
+
+//     return;
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_heaviside(uint value)
+// {
+//     // SFPU microcode
+//     Converter c_value;
+//     c_value.u = value;
+//     vFloat s = c_value.f;
+
+//     #pragma GCC unroll 0
+//     for (int d = 0; d < ITERATIONS; d++) {
+//         vFloat v = dst_reg[0];
+
+//         v_if (v < 0.0f) {
+//             v = 0.0f;
+//         }v_elseif (v > 0.0f) {
+//             v = 1.0f;
+//         }v_else {
+//             v = s;
+//         }
+//         v_endif;
+
+//        dst_reg[0] = v;
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_silu()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++) {
+//         vFloat val = dst_reg[0];
+//         v_if ( val < 0.0f ) {
+//             val = -val;
+//         }
+//         v_endif;
+
+// 	    vFloat result = sigmoid_piecewise_linear_positive(val);
+
+// 	    val = dst_reg[0];
+//         v_if ( val < 0.0f ) {
+//             result = 1.0f - result;
+//         }
+//         v_endif;
+//         result = val * result;
+//         dst_reg[0] = result;
+//         dst_reg++;
+//     }
+// }
+
+// } // namespace sfpu
+// } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h
new file mode 100644
index 00000000000..dd04b59a63d
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+enum SfpuType {  // Unscoped enum of SFPU operation selectors; no explicit values, so tanh == 0 and declaration order defines the numbering.
+  tanh,
+  hardtanh,
+  gelu,
+  exponential,
+  exp_with_base,
+  sigmoid,
+  sigmoid_appx,
+  reciprocal,
+  sqrt,
+  rsqrt,
+  lrelu,
+  power,
+  square,
+  tanh_derivative,
+  log,
+  log_with_base,
+  equal_zero,  // equal_zero .. greater_than_zero: the six compare-with-zero modes (routed together to calculate_comp in the commented-out GS dispatch).
+  not_equal_zero,
+  less_than_zero,
+  greater_than_equal_zero,
+  less_than_equal_zero,
+  greater_than_zero,
+  clamp,
+  gelu_derivative,
+  dropout,
+  abs,
+  sign,
+  max,
+  min,
+  sine,
+  cosine,
+  tan,
+  relu_min,
+  relu_max,
+  elu,
+  exp2,
+  heaviside,
+  expm1,
+  signbit,
+  asin,
+  acos,
+  atan,
+  erf,
+  erfc,
+  isfinite,
+  isinf,
+  isposinf,
+  isneginf,
+  isnan,
+  logical_not_unary,
+  erfinv,
+  i0,
+  silu,
+  mask,
+  unused,  // Last enumerator; presumably a placeholder/sentinel -- TODO confirm against callers.
+};  // NOTE: inserting or reordering entries above changes every later enumerator's numeric value.
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
new file mode 100644
index 00000000000..642fbb1591e
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
@@ -0,0 +1,85 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_AB.h"
+#include "llk_unpack_common_api.h"
+
+// /*************************************************************************
+//  * LLK UNPACK AB
+//  *************************************************************************/
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_AB_hw_configure(
+//     const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) {
+//     // In0 -> unpA
+//     // In1 -> unpB
+//     const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpA_operand);
+//     const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpB_operand);
+
+//     // unpA -> srcA
+//     // unpB -> srcB
+//     const uint32_t num_faces = get_operand_num_faces(unpA_operand_id);  // num faces in unpA and unpB are the same
+
+//     const uint32_t face_r_dim = get_operand_face_r_dim(unpA_operand_id);  // face r dim in unpA and unpB are the same
+
+//     _llk_unpack_AB_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpack_src_format[unpB_operand_id],
+//         unpack_dst_format[unpA_operand_id],
+//         unpack_dst_format[unpB_operand_id],
+//         face_r_dim,
+//         within_face_16x16_transpose,
+//         num_faces);
+// }
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_AB_hw_configure_disaggregated(
+//     const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0) {
+//     const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand};
+
+//     llk_unpack_AB_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_AB_params, within_face_16x16_transpose);
+// }
+
+// template <BroadcastType BType = BroadcastType::NONE>
+// inline void llk_unpack_AB_mop_config(const bool transpose_of_faces = false, const std::uint32_t operand_id = 0) {
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     const bool narrow_tile = get_operand_narrow_tile(operand_id);  // if narrow tile read face 0 twice for row broadcast
+//                                                                    // or read face 0 and 1 for col broadcast
+//     _llk_unpack_AB_mop_config_<BType>(transpose_of_faces, num_faces, narrow_tile);
+// }
+
+// template <BroadcastType BType = BroadcastType::NONE>
+// inline void llk_unpack_AB_init(
+//     const std::uint32_t operandA,
+//     const std::uint32_t operandB,
+//     const std::uint32_t transpose = 0,
+//     const std::uint32_t acc_to_dest = 0) {
+//     const std::uint32_t operandA_id = get_operand_id(operandA);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operandA_id);  // face r dim in unpA and unpB are the same
+//     const std::uint32_t num_faces = get_operand_num_faces(operandA_id);
+//     const bool narrow_tile =
+//         get_operand_narrow_tile(operandA_id);  // if narrow tile read face 0 twice for row broadcast
+
+//     _llk_unpack_AB_init_<BType>(face_r_dim, num_faces, narrow_tile, transpose, acc_to_dest);
+// }
+
+// template <BroadcastType BType = BroadcastType::NONE>
+// inline void llk_unpack_AB(
+//     const std::uint32_t operandA,
+//     const std::uint32_t operandB,
+//     const std::uint32_t tile_index_a,
+//     const std::uint32_t tile_index_b,
+//     const bool transpose_of_faces = 0 /*not used*/) {
+//     std::uint32_t operandA_id = get_operand_id(operandA);
+//     std::uint32_t operandB_id = get_operand_id(operandB);
+//     std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1;
+//     std::uint32_t offset_address_a = cb_interface[operandA_id].fifo_page_size * tile_index_a;
+//     std::uint32_t address_a = base_address_a + offset_address_a;
+//     std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1;
+//     std::uint32_t offset_address_b = cb_interface[operandB_id].fifo_page_size * tile_index_b;
+//     std::uint32_t address_b = base_address_b + offset_address_b;
+
+//     _llk_unpack_AB_<BType>(address_a, address_b, transpose_of_faces > 0);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h
new file mode 100644
index 00000000000..f4aee2da6bd
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h
@@ -0,0 +1,136 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_AB_matmul.h"
+#include "llk_unpack_common_api.h"
+
+// /*************************************************************************
+//  * LLK UNPACK AB MATMUL
+//  *************************************************************************/
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) {
+//     const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca;
+
+//     // In0 -> unpB
+//     // In1 -> unpA
+//     const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpB_operand);
+//     const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpA_operand);
+
+//     // unpA -> srcA
+//     // unpB -> srcB
+//     const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
+//     const uint32_t unpB_num_faces = get_operand_num_faces(unpB_operand_id);
+
+//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
+//     const uint32_t unpB_face_r_dim = get_operand_face_r_dim(unpB_operand_id);
+
+//     _llk_unpack_AB_matmul_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpack_src_format[unpB_operand_id],
+//         unpack_dst_format[unpA_operand_id],
+//         unpack_dst_format[unpB_operand_id],
+//         unpA_face_r_dim,
+//         unpB_face_r_dim,
+//         transpose_xy_srca,
+//         unpA_num_faces,
+//         unpB_num_faces,
+//         cb_interface[unpA_operand_id].fifo_page_size,
+//         cb_interface[unpB_operand_id].fifo_page_size);
+// }
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_AB_matmul_hw_configure_disaggregated(
+//     const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) {
+//     const llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = {
+//         .unpA_operand = unpA_operand, .unpB_operand = unpB_operand, .transpose_xy_srca = transpose_xy_srca};
+//     llk_unpack_AB_matmul_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_AB_matmul_params);
+// }
+
+// inline void llk_unpack_AB_matmul_mop_config(
+//     const bool transpose,
+//     const std::uint32_t ct_dim,
+//     const std::uint32_t rt_dim,
+//     const std::uint32_t kt_dim,
+//     const bool partial_face) {
+//     // in0 - loaded to SrcB
+//     // in1 - loaded to SrcA
+//     _llk_unpack_AB_matmul_mop_config_(transpose, ct_dim, rt_dim, kt_dim, partial_face);
+// }
+
+// __attribute__((always_inline)) inline void llk_unpack_AB_matmul_init(
+//     const std::uint32_t operandA,
+//     const std::uint32_t operandB,
+//     const std::uint32_t transpose = 0,
+//     const std::uint32_t ct_dim = 1,
+//     const std::uint32_t rt_dim = 1,
+//     const std::uint32_t kt_dim = 1) {
+//     // In0 -> srcB (supports partial face)
+//     // In1 -> srcA
+//     const uint32_t operandA_id = get_operand_id(operandB);
+//     const uint32_t operandB_id = get_operand_id(operandA);
+
+//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandA_id);
+//     const uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandB_id);
+
+//     const bool reuse_a = ct_dim >= rt_dim;
+//     const bool partial_face = get_operand_partial_face(operandB_id);
+
+//     const uint32_t unpA_num_faces = get_operand_num_faces(operandA_id);
+//     const uint32_t unpB_num_faces =
+//         partial_face ? 1 : get_operand_num_faces(operandB_id);  // if partial face -> unpack face by face
+
+//     _llk_unpack_AB_matmul_init_(
+//         transpose,
+//         ct_dim,
+//         rt_dim,
+//         kt_dim,
+//         unpA_face_r_dim,
+//         unpB_face_r_dim,
+//         unpA_num_faces,
+//         unpB_num_faces,
+//         partial_face);
+// }
+
+// inline void llk_unpack_AB_matmul(
+//     const std::uint32_t operandA,
+//     const std::uint32_t operandB,
+//     const std::uint32_t tile_index_a,
+//     const std::uint32_t tile_index_b,
+//     const std::uint32_t ct_dim = 1,
+//     const std::uint32_t rt_dim = 1,
+//     const std::uint32_t kt_dim = 1) {
+//     // In0/InA -> srcB (supports partial face)
+//     // In1/InB -> srcA
+
+//     volatile uint *cfg = get_cfg_pointer();  // get pointer to registers for current state ID
+
+//     const std::uint32_t operandA_id = get_operand_id(operandA);
+//     const std::uint32_t operandB_id = get_operand_id(operandB);
+//     const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandB_id);  // In1/InB -> srcA
+//     const std::uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandA_id);  // In0/InA -> srcB
+
+//     const bool partial_face = get_operand_partial_face(operandA_id);
+
+//     std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1;
+//     std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1;
+
+//     std::uint32_t tile_size_a = cb_interface[operandA_id].fifo_page_size;
+//     std::uint32_t tile_size_b = cb_interface[operandB_id].fifo_page_size;
+
+//     _llk_unpack_AB_matmul_(
+//         base_address_a,
+//         base_address_b,
+//         tile_index_a,
+//         tile_index_b,
+//         tile_size_a,
+//         tile_size_b,
+//         unpA_face_r_dim,
+//         unpB_face_r_dim,
+//         partial_face,
+//         ct_dim,
+//         rt_dim,
+//         kt_dim);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h
new file mode 100644
index 00000000000..ca39397653c
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h
@@ -0,0 +1,89 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_A.h"
+#include "llk_unpack_common_api.h"
+
+// /*************************************************************************
+//  * LLK UNPACK A
+//  *************************************************************************/
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_A_hw_configure(
+//     const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) {
+//     const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand);
+//     const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
+//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
+
+//     _llk_unpack_A_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpack_dst_format[unpA_operand_id],
+//         unpA_face_r_dim,
+//         within_face_16x16_transpose,
+//         unpA_num_faces);
+// }
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_A_hw_configure_disaggregated(
+//     const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) {
+//     const llk_unpack_A_params_t unpack_A_params = {.unpA_operand = unpA_operand};
+//     llk_unpack_A_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_A_params, within_face_16x16_transpose);
+// }
+
+// template <
+//     BroadcastType BType = BroadcastType::NONE,
+//     bool acc_to_dest = false,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+//     bool unpack_to_dest = false>
+// inline void llk_unpack_A_mop_config(
+//     const bool transpose_of_faces,
+//     const std::uint32_t operand_id,
+//     const std::uint32_t unpack_src_format = 0,
+//     std::uint32_t unpack_dst_format = 0) {
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+
+//     _llk_unpack_A_mop_config_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
+//         transpose_of_faces > 0, num_faces, unpack_src_format, unpack_dst_format);
+// }
+
+// template <
+//     BroadcastType BType = BroadcastType::NONE,
+//     bool acc_to_dest = false,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+//     bool unpack_to_dest = false>
+// inline void llk_unpack_A_init(
+//     const std::uint32_t transpose_of_faces = 0,
+//     const std::uint32_t within_face_16x16_transpose = 0,
+//     const std::uint32_t operand = 0) {
+//     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(within_face_16x16_transpose);
+
+//     const std::uint32_t operand_id = get_operand_id(operand);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+
+//     _llk_unpack_A_init_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
+//         transpose_of_faces,
+//         within_face_16x16_transpose,
+//         face_r_dim,
+//         num_faces,
+//         unpack_src_format[operand_id],
+//         unpack_dst_format[operand_id]);
+// }
+
+// template <
+//     BroadcastType BType = BroadcastType::NONE,
+//     bool acc_to_dest = false,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+//     bool unpack_to_dest = false>
+// inline void llk_unpack_A(
+//     const std::uint32_t operand, const std::uint32_t tile_index, const bool transpose_of_faces = 0) {
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
+//     std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index;
+//     std::uint32_t address = base_address + offset_address;
+
+//     _llk_unpack_A_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
+//         address, transpose_of_faces > 0, unpack_src_format[operand_id], unpack_dst_format[operand_id]);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h
new file mode 100644
index 00000000000..a2f5d8c675f
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h
@@ -0,0 +1,137 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "circular_buffer.h"
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "ckernel_globals.h"
+#include "ckernel_template.h"
+#include "cunpack_common.h"
+#include "llk_defs.h"
+#include "llk_io.h"
+#include "llk_operands.h"
+#include "llk_param_structs.h"
+#include "llk_unpack_common.h"
+
+// /*************************************************************************
+//  * LLK UNPACK COMMON
+//  *************************************************************************/
+
+// void llk_zero_operand(std::uint32_t operand) {
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     std::uint32_t fifo_base_addr = (cb_interface[operand_id].fifo_limit + 1) - cb_interface[operand_id].fifo_size;
+//     std::uint32_t size = cb_interface[operand_id].fifo_size;
+//     _llk_zero_buffer_(fifo_base_addr, size);
+// }
+
+// template <bool mail2math = true, bool mail2pack = true>
+// inline void llk_unpack_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) {
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
+//     std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index;
+//     std::uint32_t address = base_address + offset_address;
+//     _llk_unpack_get_tile_<mail2math, mail2pack>(address, p_tile);
+// }
+
+// template <bool mail2math = true, bool mail2pack = true>
+// inline void llk_unpack_release_tile(std::uint32_t operand) {
+//     _llk_unpack_release_tile_<mail2math, mail2pack>();
+// }
+
+// inline void llk_unpack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {
+//     _llk_unpack_debug_dump_(data, byte_size);
+// }
+
+// inline void llk_unpack_debug_dump_seek(std::uint8_t offset) { _llk_unpack_debug_dump_seek_(offset); }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_operand) {
+//     const std::uint32_t srca_operand_id = get_operand_id(srca_new_operand);
+//     const std::uint32_t num_faces = get_operand_num_faces(srca_operand_id);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(srca_operand_id);
+//     _llk_unpack_reconfig_data_format_srca_impl_(
+//         unpack_src_format[srca_operand_id],
+//         unpack_dst_format[srca_operand_id],
+//         cb_interface[srca_operand_id].fifo_page_size);
+// }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) {
+//     std::uint32_t srcb_operand_id = get_operand_id(srcb_new_operand);
+//     const std::uint32_t num_faces = get_operand_num_faces(srcb_operand_id);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(srcb_operand_id);
+//     _llk_unpack_reconfig_data_format_srcb_impl_(
+//         unpack_src_format[srcb_operand_id],
+//         unpack_dst_format[srcb_operand_id],
+//         cb_interface[srcb_operand_id].fifo_page_size);
+// }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format_srca(
+//     const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) {
+//     std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
+//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
+
+//     if ((unpack_src_format[old_srca_operand_id] != unpack_src_format[new_srca_operand_id])) {
+//         llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
+//     } else if constexpr (is_tile_dim_reconfig_en) {
+//         llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
+//     }
+// }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format_srcb(
+//     const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
+//     std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
+//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+
+//     if ((unpack_src_format[old_srcb_operand_id] != unpack_src_format[new_srcb_operand_id])) {
+//         llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
+//     } else if constexpr (is_tile_dim_reconfig_en) {
+//         llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
+//     }
+// }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format(
+//     const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) {
+//     llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
+//     llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
+// }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format(
+//     const std::uint32_t srca_old_operand,
+//     const std::uint32_t srca_new_operand,
+//     const std::uint32_t srcb_old_operand,
+//     const std::uint32_t srcb_new_operand) {
+//     llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_old_operand, srca_new_operand);
+//     llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_old_operand, srcb_new_operand);
+// }
+
+// inline void llk_unpack_dbg_feature_disable() { _llk_unpack_dbg_feature_disable_(); }
+
+// inline void llk_enable_int8_fpu_math() { _llk_enable_int8_fpu_math_(); }
+
+// // All TILE_SIZE related functions were deprecated in BBE for WH.  The following is needed for pack_shifted so just
+// // keeping here.
+// // FIXME: Need to review and adjust accordingly
+// constexpr static std::int32_t MUL_HEADERLESS_TILE_SIZE_AND_INDEX(uint format, uint index) {
+//     switch (format & 0x1F) {
+//         case ((uint8_t)DataFormat::Float32): return ((index << 8));
+//         case ((uint8_t)DataFormat::Float16):
+//         case ((uint8_t)DataFormat::Float16_b): return ((index << 7));
+//         case ((uint8_t)DataFormat::Bfp8):
+//         case ((uint8_t)DataFormat::Bfp8_b): return ((index << 6) + (index << 2));
+//         case ((uint8_t)DataFormat::Bfp4):
+//         case ((uint8_t)DataFormat::Bfp4_b): return ((index << 5) + (index << 2));
+//         case ((uint8_t)DataFormat::Bfp2):
+//         case ((uint8_t)DataFormat::Bfp2_b): return ((index << 4) + (index << 2));
+//         case ((uint8_t)DataFormat::Int8):
+//         case ((uint8_t)DataFormat::Lf8): return ((index << 6));
+//         // Keep default as Bfp8?
+//         default: return ((index << 6) + (index << 2));
+//     };
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h
new file mode 100644
index 00000000000..01a12122375
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h
@@ -0,0 +1,94 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_reduce.h"
+#include "llk_unpack_common_api.h"
+
+/*************************************************************************
+* LLK UNPACK REDUCE
+*************************************************************************/
+
+// template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_reduce_hw_configure(
+//     const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) {
+
+//     constexpr bool within_face_16x16_transpose  = (ReduceDim::REDUCE_ROW == dim);
+
+//     const std::uint32_t unpA_operand_id = get_operand_id(unpack_reduce_params->unpA_operand);
+//     const std::uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
+//     const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
+
+//     constexpr std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32;
+//     const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a
+//                                ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? (std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16);
+
+//     _llk_unpack_reduce_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpB_src_format,
+//         unpack_dst_format[unpA_operand_id],
+//         unpB_dst_format,
+//         unpA_face_r_dim,
+//         unpA_face_r_dim,
+//         within_face_16x16_transpose,
+//         unpA_num_faces,
+//         unpA_num_faces
+//     );
+
+//     if constexpr (type != PoolType::MAX) {
+//         union {
+//             float f;
+//             uint32_t u;
+//         } f2u = {.f = const_mult};
+
+//         for (uint i = 0; i < 16; i++) l1_buffer[i] = f2u.u;  // Load const into L1 buffer
+//     }
+// }
+
+// template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en=false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) {
+//     const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand};
+//     llk_unpack_reduce_hw_configure<type, dim, is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_reduce_params, mult);
+// }
+
+// template <PoolType type, ReduceDim dim>
+// inline void llk_unpack_reduce_mop_config() {
+//     _llk_unpack_reduce_mop_config_<type, dim>();
+// }
+
+// template <PoolType type, ReduceDim dim>
+// inline void llk_unpack_reduce_init(const std::uint32_t within_face_16x16_transpose=0) {
+
+//     constexpr std::uint32_t unpA_operand_id = 0;
+
+//     const std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32;
+//     const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a
+//                                ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? (std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16);
+
+//     cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG1_SrcB_RMW>(unpB_dst_format);
+
+//     cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 0, 0xf>(unpB_src_format);
+//     cfg_reg_rmw_tensix<THCON_SEC1_REG2_Out_data_format_RMW>(unpB_dst_format);
+
+//     TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_address_ADDR32);
+//     TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_cntx1_address_ADDR32);
+//     TTI_NOP; TTI_NOP;
+
+//     _llk_unpack_reduce_init_<type, dim>(
+//         within_face_16x16_transpose
+//     );
+// }
+
+// template <PoolType type, ReduceDim dim>
+// inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t tile_index) {
+
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
+//     std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index;
+//     std::uint32_t address = base_address + offset_address;
+
+//     _llk_unpack_reduce_<type, dim>(
+//         address
+//     );
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h
new file mode 100644
index 00000000000..59ede271732
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h
@@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_tilize.h"
+#include "llk_unpack_common_api.h"
+
+/*************************************************************************
+* LLK UNPACK TILIZE
+*************************************************************************/
+
+// template <bool is_fp32_dest_acc_en = false>
+// inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) {
+
+//     constexpr bool  within_face_16x16_transpose = false;
+//     constexpr StochRndType stoch_rnd_mode = StochRndType::None;
+
+//     const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand);
+//     const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
+//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
+
+//     _llk_unpack_tilize_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpack_dst_format[unpA_operand_id],
+//         unpA_face_r_dim,
+//         within_face_16x16_transpose,
+//         unpA_num_faces
+//     );
+// }
+
+
+// template <bool is_fp32_dest_acc_en = false>
+// inline void llk_unpack_tilize_hw_configure_disaggregated(
+//     const std::uint32_t unpA_operand) {
+//     const llk_unpack_A_params_t unpack_tilize_params = {
+//         .unpA_operand = unpA_operand
+//     };
+//     llk_unpack_tilize_hw_configure<is_fp32_dest_acc_en>(&unpack_tilize_params);
+// }
+
+// inline void llk_unpack_tilize_mop_config(const std::uint32_t operand) {
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     const bool narrow_tile = get_operand_narrow_tile(operand_id);
+//     _llk_unpack_tilize_mop_config_(narrow_tile);
+// }
+
+// inline void llk_unpack_tilize_init(const std::uint32_t operand = 0, const std::uint32_t ct_dim = 0) {
+//     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(0);
+
+//     const std::uint32_t operand_id = get_operand_id(operand);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+//     const bool narrow_tile = get_operand_narrow_tile(operand_id);
+
+//     // Save state of unpacker config for quick restore
+//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0, THCON_SEC0_REG2_Out_data_format_ADDR32); // Save unpack config[0]
+//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context
+
+//     _llk_unpack_tilize_init_(
+//         unpack_src_format[operand_id],
+//         unpack_dst_format[operand_id],
+//         ct_dim,
+//         face_r_dim,
+//         narrow_tile
+//     );
+
+// }
+
+// inline void llk_unpack_tilize_uninit(const std::uint32_t face_r_dim = FACE_R_DIM) {
+//     TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0);
+//     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0); // Restore unpack config[0]
+//     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32,  p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1); // Restore tile x dim per context
+// }
+
+// inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t block_ct_dim) {
+
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     const bool narrow_tile = get_operand_narrow_tile(operand_id);
+
+//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;  // Remove header size added by descriptor
+
+//     _llk_unpack_tilize_(
+//         base_address,
+//         tile_index,
+//         unpack_src_format[operand_id],
+//         block_ct_dim,
+//         face_r_dim,
+//         num_faces,
+//         narrow_tile
+//     );
+// }
+
+// inline void llk_unpack_tilize_block(std::uint32_t operand, std::uint32_t block_c_tiles) {
+//     for (std::uint32_t tile_index = 0; tile_index < block_c_tiles; tile_index++) {
+//         llk_unpack_tilize(operand, tile_index, block_c_tiles);
+//     }
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h
new file mode 100644
index 00000000000..dded559e94d
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h
@@ -0,0 +1,96 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_untilize.h"
+#include "llk_unpack_common_api.h"
+
+// /*************************************************************************
+// * LLK UNPACK UNTILIZE
+// *************************************************************************/
+// template <bool is_fp32_dest_acc_en = false>
+// inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) {
+//     constexpr bool is_row_pool = false;
+//     constexpr bool within_face_16x16_transpose = false;
+//     constexpr StochRndType stoch_rnd_mode = StochRndType::None;
+
+//     const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand);
+//     const uint32_t unpA_num_faces = 4;
+//     const uint32_t unpA_face_r_dim = FACE_R_DIM;
+
+//     _llk_unpack_untilize_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpack_dst_format[unpA_operand_id],
+//         unpA_face_r_dim,
+//         within_face_16x16_transpose,
+//         unpA_num_faces
+//     );
+// }
+
+// inline void llk_unpack_untilize_hw_configure_disaggregated(const std::uint32_t unpA_operand) {
+//     const llk_unpack_A_params_t unpack_untilize_params = {
+//         .unpA_operand = unpA_operand,
+//     };
+//     llk_unpack_untilize_hw_configure(&unpack_untilize_params);
+// }
+
+// inline void llk_unpack_untilize_mop_config() {
+//     _llk_unpack_untilize_mop_config_();
+// }
+
+// inline void llk_unpack_untilize_init(std::uint32_t operand = 0) {
+//     const std::uint32_t operand_id = get_operand_id(operand);
+//     const std::uint32_t face_r_dim = 1;
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+
+//     // Save state of unpacker config for quick restore
+//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_0, UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32); // Save unpack stride config
+//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context
+//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_2, THCON_SEC0_REG0_TileDescriptor_ADDR32+1); // Save descriptor 1
+
+//     _llk_unpack_untilize_init_(
+//         unpack_dst_format[operand_id],
+//         cb_interface[operand_id].fifo_page_size,
+//         face_r_dim,
+//         num_faces
+//     );
+// }
+
+// inline void llk_unpack_untilize_uninit(const std::uint32_t operand, const std::uint32_t face_r_dim = FACE_R_DIM) {
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1;
+//     std::uint32_t unpA_ch1_y_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride;
+
+//     // Check that unpacker is done (all contexts freed up) before starting hw configuration
+//     wait_for_idle();
+
+//     // Reset address counters
+//     unpacker_addr_counter_init();
+
+//     // Wait for cfg to be free to edit
+//     TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK);
+
+//     // Reset the values to default in unpack AB common.
+//     TT_SETADCXX(p_setadc::UNP_A, FACE_R_DIM*FACE_C_DIM-1, 0x0);
+//     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16);
+//     cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32+1, 0, 0xFFFF>(1);
+//     cfg_reg_rmw_tensix<UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32, UNP0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT, UNP0_ADDR_CTRL_XY_REG_1_Ystride_MASK>(unpA_ch1_y_stride);
+//     TTI_NOP; TTI_NOP; // Do we need this for WH?
+// }
+
+// template <bool first_pass = true>
+// inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_tile_cols) {
+//     const std::uint32_t operand_id = get_operand_id(operand);
+//     const std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
+
+//     _llk_unpack_untilize_pass_<first_pass>(
+//         base_address,
+//         block_tile_cols
+//     );
+// }
+
+// inline void llk_unpack_untilize(std::uint32_t operand, std::uint32_t block_c_tiles) {
+//     llk_unpack_untilize_pass<true>(operand, block_c_tiles);
+//     llk_unpack_untilize_pass<false>(operand, block_c_tiles);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h
new file mode 100644
index 00000000000..37e018dc6b8
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h
@@ -0,0 +1,10 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <cstdint>
+
+#include "circular_buffer.h"
+
+extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS];  // declaration only; the array is defined elsewhere
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h
new file mode 100644
index 00000000000..1569b4cdcd1
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <cstdint>
+#include <vector>
+
+// Maps an operand to its operand id; the mapping is the identity.
+inline uint32_t get_operand_id(uint32_t operand) {
+   return operand;
+}
+
+// Looks up unpack_src_format[operand_id]; operand_id is not bounds-checked.
+inline uint32_t get_operand_src_format(const std::uint32_t operand_id) {
+   return unpack_src_format[operand_id];
+}
+
+// Looks up unpack_dst_format[operand_id]; operand_id is not bounds-checked.
+inline uint32_t get_operand_dst_format(const std::uint32_t operand_id) {
+   return unpack_dst_format[operand_id];
+}
+
+// TODO: Do we need the tile dimension functions below for Grayskull (GS)?
+// Returns the operand's face count, hard-coded to 4; operand_id is ignored.
+inline uint32_t get_operand_num_faces(const std::uint32_t operand_id) {
+   return 4;
+}
+
+// Returns the operand's partial-face setting, hard-coded to 0; operand_id is ignored.
+inline uint32_t get_operand_partial_face(const std::uint32_t operand_id) {
+   return 0;
+}
+
+// Returns the operand's face_r_dim, hard-coded to 16; operand_id is ignored.
+inline uint32_t get_operand_face_r_dim(const std::uint32_t operand_id) {
+   return 16;
+}
+
+// Returns the operand's narrow-tile setting, hard-coded to 0; operand_id is ignored.
+inline uint32_t get_operand_narrow_tile(const std::uint32_t operand_id) {
+   return 0;
+}
+
+// Returns the operand's tile_r_dim, hard-coded to 32; operand_id is ignored.
+inline uint32_t get_operand_tile_r_dim(const std::uint32_t operand_id) {
+   return 32;
+}
+
+// Returns the operand's tile_c_dim, hard-coded to 32; operand_id is ignored.
+inline uint32_t get_operand_tile_c_dim(const std::uint32_t operand_id) {
+   return 32;
+}
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h
new file mode 100644
index 00000000000..bd010082bbd
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h
@@ -0,0 +1,61 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <cstdint>
+#include <vector>
+
+// Metal-specific overrides: partial tiles are not supported, so tile sizes are hard-coded to a fixed 32x32.
+// Converts an output index to an output id by subtracting a base offset, which is 0 here.
+inline uint32_t get_output_id(uint32_t output) {
+   constexpr uint32_t output_base = 0;
+   return output - output_base;
+}
+
+// Returns the base output id, hard-coded to 16.
+inline uint32_t get_output_base_id() {
+   constexpr uint32_t output_base_id = 16;
+   return output_base_id;
+}
+
+// Looks up pack_src_format[output_id]; output_id is not bounds-checked.
+inline uint32_t get_output_src_format(const std::uint32_t output_id) {
+   return pack_src_format[output_id];
+}
+
+// Looks up pack_dst_format[output_id]; output_id is not bounds-checked.
+inline uint32_t get_output_dst_format(const std::uint32_t output_id) {
+   return pack_dst_format[output_id];
+}
+
+// TODO: Do we need the tile dimension functions below for Grayskull (GS)?
+// Returns the output's face count, hard-coded to 4; output_id is ignored.
+inline uint32_t get_output_num_faces(const std::uint32_t output_id) {
+   return 4;
+}
+
+// Returns the output's partial-face setting, hard-coded to 0; output_id is ignored.
+inline uint32_t get_output_partial_face(const std::uint32_t output_id) {
+   return 0;
+}
+
+// Returns the output's face_r_dim, hard-coded to 16; output_id is ignored.
+inline uint32_t get_output_face_r_dim(const std::uint32_t output_id) {
+   return 16;
+}
+
+// Returns the output's narrow-tile setting, hard-coded to 0; output_id is ignored.
+inline uint32_t get_output_narrow_tile(const std::uint32_t output_id) {
+   return 0;
+}
+
+// Returns the output's tile_r_dim, hard-coded to 32; output_id is ignored.
+inline uint32_t get_output_tile_r_dim(const std::uint32_t output_id) {
+   return 32;
+}
+
+// Returns the output's tile_c_dim, hard-coded to 32; output_id is ignored.
+inline uint32_t get_output_tile_c_dim(const std::uint32_t output_id) {
+   return 32;
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
index f9359469e33..3dd7dbe114c 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
@@ -19,4 +19,3 @@ extern uint32_t math_sync_tile_dst_index;
 
 extern uint32_t __local_mem_rodata_start_addr[];
 extern uint32_t __local_mem_rodata_end_addr[];
-extern uint32_t __firmware_start[];
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
index 29a2dbf9cfe..cf08580ad69 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+// TODO: Delete this file once the redefinition errors are fixed;
+// its functions should be moved to ckernel_globals.h.
 #pragma once
 
 #include <cstdint>
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
index 2b94607012d..ea113ce5fa0 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
@@ -8,8 +8,7 @@
 
 inline uint32_t get_operand_id(uint32_t operand)
 {
-    const int OPERAND_BASE_ID = 0;
-    return (operand);
+   return (operand);  // identity mapping: the operand value is used directly as its id
 }
 
 inline const uint32_t get_operand_src_format(const std::uint32_t operand_id)
diff --git a/tt_metal/include/compute_kernel_api/unpack.h b/tt_metal/include/compute_kernel_api/unpack.h
index 2aaefe1d9d4..c52dc248fea 100644
--- a/tt_metal/include/compute_kernel_api/unpack.h
+++ b/tt_metal/include/compute_kernel_api/unpack.h
@@ -7,67 +7,67 @@
 
 #include "common_globals.h"
 
+// TODO: Is this file needed? Every helper below is commented out for now.
+// namespace ckernel {
 
-namespace ckernel {
+// /**
+//  * Helper function to reconfigure unpacker srca and srcb input data formats.
+//  */
+// ALWI void unpack_reconfig_data_format(const uint32_t srca_new_operand, const uint32_t srcb_new_operand) {
+//     #ifdef ARCH_GRAYSKULL
+//         UNPACK(( llk_unpack_reconfig_data_format(srca_new_operand, srcb_new_operand) ));
+//     #endif
+//     // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
+// }
 
-/**
- * Helper function to reconfigure unpacker srca and srcb input data formats.
- */
-ALWI void unpack_reconfig_data_format(const uint32_t srca_new_operand, const uint32_t srcb_new_operand) {
-    #ifdef ARCH_GRAYSKULL
-        UNPACK(( llk_unpack_reconfig_data_format(srca_new_operand, srcb_new_operand) ));
-    #endif
-    // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
-}
+// /**
+//  * Helper function to reconfigure srca/srcb input data formats, only if they differ from existing formats.
+// */
+// ALWI void unpack_reconfig_data_format(const uint32_t srca_old_operand, const uint32_t srca_new_operand, const uint32_t srcb_old_operand, const uint32_t srcb_new_operand) {
+//     #ifdef ARCH_GRAYSKULL
+//         UNPACK(( llk_unpack_reconfig_data_format(srca_old_operand, srca_new_operand, srcb_old_operand, srcb_new_operand) ));
+//     #endif
+//     // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
+// }
 
-/**
- * Helper function to reconfigure srca/srcb input data formats, only if they differ from existing formats.
-*/
-ALWI void unpack_reconfig_data_format(const uint32_t srca_old_operand, const uint32_t srca_new_operand, const uint32_t srcb_old_operand, const uint32_t srcb_new_operand) {
-    #ifdef ARCH_GRAYSKULL
-        UNPACK(( llk_unpack_reconfig_data_format(srca_old_operand, srca_new_operand, srcb_old_operand, srcb_new_operand) ));
-    #endif
-    // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
-}
+// /**
+//  * Helper function to reconfigure unpacker srca input data format.
+//  */
+// ALWI void unpack_reconfig_data_format_srca(const uint32_t srca_new_operand) {
+//     #ifdef ARCH_GRAYSKULL
+//         UNPACK(( llk_unpack_reconfig_data_format_srca(srca_new_operand) ));
+//     #endif
+//     // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
+// }
 
-/**
- * Helper function to reconfigure unpacker srca input data format.
- */
-ALWI void unpack_reconfig_data_format_srca(const uint32_t srca_new_operand) {
-    #ifdef ARCH_GRAYSKULL
-        UNPACK(( llk_unpack_reconfig_data_format_srca(srca_new_operand) ));
-    #endif
-    // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
-}
+// /**
+//  * Helper function to reconfigure unpacker srca input data format, only if it differs from existing format.
+//  */
+// ALWI void unpack_reconfig_data_format_srca(const uint32_t srca_old_operand, const uint32_t srca_new_operand) {
+//     #ifdef ARCH_GRAYSKULL
+//         UNPACK(( llk_unpack_reconfig_data_format_srca(srca_old_operand, srca_new_operand) ));
+//     #endif
+//     // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
+// }
 
-/**
- * Helper function to reconfigure unpacker srca input data format, only if it differs from existing format.
- */
-ALWI void unpack_reconfig_data_format_srca(const uint32_t srca_old_operand, const uint32_t srca_new_operand) {
-    #ifdef ARCH_GRAYSKULL
-        UNPACK(( llk_unpack_reconfig_data_format_srca(srca_old_operand, srca_new_operand) ));
-    #endif
-    // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
-}
+// /**
+//  * Helper function to reconfigure unpacker srcb input data format.
+//  */
+// ALWI void unpack_reconfig_data_format_srcb(const uint32_t srcb_new_operand) {
+//     #ifdef ARCH_GRAYSKULL
+//         UNPACK(( llk_unpack_reconfig_data_format_srcb(srcb_new_operand) ));
+//     #endif
+//     // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
+// }
 
-/**
- * Helper function to reconfigure unpacker srcb input data format.
- */
-ALWI void unpack_reconfig_data_format_srcb(const uint32_t srcb_new_operand) {
-    #ifdef ARCH_GRAYSKULL
-        UNPACK(( llk_unpack_reconfig_data_format_srcb(srcb_new_operand) ));
-    #endif
-    // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
-}
+// /**
+//  * Helper function to reconfigure unpacker srcb input data format, only if it differs from existing format.
+//  */
+// ALWI void unpack_reconfig_data_format_srcb(const uint32_t srcb_old_operand, const uint32_t srcb_new_operand) {
+//     #ifdef ARCH_GRAYSKULL
+//         UNPACK(( llk_unpack_reconfig_data_format_srcb(srcb_old_operand, srcb_new_operand) ));
+//     #endif
+//     // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
+// }
 
-/**
- * Helper function to reconfigure unpacker srcb input data format, only if it differs from existing format.
- */
-ALWI void unpack_reconfig_data_format_srcb(const uint32_t srcb_old_operand, const uint32_t srcb_new_operand) {
-    #ifdef ARCH_GRAYSKULL
-        UNPACK(( llk_unpack_reconfig_data_format_srcb(srcb_old_operand, srcb_new_operand) ));
-    #endif
-    // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip.
-}
-
-}
+// }