From 9ab8177b4a786b8b974ab2db9cde7d469a03f336 Mon Sep 17 00:00:00 2001
From: Kei-Ming Kwong <kkwong@tenstorrent.com>
Date: Sat, 18 Nov 2023 22:47:00 +0000
Subject: [PATCH 01/16] #3908: [llk] Uplift and move metal specialties to API
 layer

---
 .../3T/matmul_large_block_zm/zm_3m_math.cpp   |    6 +-
 .../3T/matmul_large_block_zm/zm_3m_unpack.cpp |   10 +-
 .../chlkc_math.cpp                            |    4 +-
 .../chlkc_unpack.cpp                          |    6 +-
 .../test_kernels/compute/eltwise_copy_3m.cpp  |    2 +-
 .../test_kernels/compute/eltwise_sfpi.cpp     |    2 -
 .../test_kernels/compute/untilA_elwbin_3m.cpp |   10 +-
 .../ckernels/grayskull/common/inc/ckernel.h   |    8 +-
 .../grayskull/common/inc/ckernel_reverseops.h |    1 -
 .../ckernels/wormhole_b0/common/inc/ckernel.h |  352 +++--
 .../wormhole_b0/common/inc/ckernel_defs.h     |   79 +-
 .../wormhole_b0/common/inc/ckernel_globals.h  |   59 +-
 .../wormhole_b0/common/inc/ckernel_gpr_map.h  |    8 +-
 .../wormhole_b0/common/inc/ckernel_noc.h      |  317 -----
 .../wormhole_b0/common/inc/ckernel_perf_api.h |  158 +++
 .../common/inc/ckernel_perf_include.h         |   28 +
 .../common/inc/ckernel_perf_math.h            |  279 ++++
 .../common/inc/ckernel_perf_unpack_pack.h     |  158 +++
 .../wormhole_b0/common/inc/ckernel_sfpi.h     |   24 -
 .../wormhole_b0/common/inc/ckernel_sfpu.h     | 1261 +++++++++++------
 .../wormhole_b0/common/inc/ckernel_structs.h  |   81 +-
 .../wormhole_b0/common/inc/cllk_io_headers.h  |   58 -
 .../common/inc/cllk_math_headers.h            |   56 -
 .../common/inc/cllk_packer_headers.h          |   41 -
 .../common/inc/cllk_unpack_headers.h          |   26 -
 .../wormhole_b0/common/inc/cmath_common.h     |   67 +-
 .../wormhole_b0/common/inc/cpack_common.h     |  242 +---
 .../wormhole_b0/common/inc/cpriority_queue.h  |  137 --
 .../wormhole_b0/common/inc/cunpack_common.h   |  148 +-
 .../wormhole_b0/common/src/ckernel.cc         |  221 +++
 .../wormhole_b0/common/src/ckernel_main.cc    |   21 +
 .../common/src/ckernel_perf_unpack_pack.cc    |  301 ++++
 .../common/src/ckernel_template.cc            |    9 +-
 .../wormhole_b0/common/src/ckernel_unity.cc   |   10 +
 .../wormhole_b0/common/src/fwlog_list         |    2 +
 .../hw/ckernels/wormhole_b0/llk_lib/llk_3c.h  |    0
 .../ckernels/wormhole_b0/llk_lib/llk_defs.h   |   25 +-
 .../wormhole_b0/llk_lib/llk_math_common.h     |   80 +-
 .../llk_lib/llk_math_eltwise_binary.h         |   95 +-
 .../llk_lib/llk_math_eltwise_binary_sfpu.h    |  119 ++
 .../llk_lib/llk_math_eltwise_unary_datacopy.h |   65 +-
 .../llk_lib/llk_math_eltwise_unary_sfpi.h     |   25 -
 .../llk_lib/llk_math_eltwise_unary_sfpu.h     |  343 +----
 .../wormhole_b0/llk_lib/llk_math_matmul.h     |  160 ++-
 .../wormhole_b0/llk_lib/llk_math_reduce.h     |   36 +-
 .../ckernels/wormhole_b0/llk_lib/llk_pack.h   |  231 ++-
 .../wormhole_b0/llk_lib/llk_pack_common.h     |  183 +--
 .../wormhole_b0/llk_lib/llk_unpack_A.h        |   93 +-
 .../wormhole_b0/llk_lib/llk_unpack_AB.h       |   69 +-
 .../llk_lib/llk_unpack_AB_matmul.h            |  107 +-
 .../wormhole_b0/llk_lib/llk_unpack_common.h   |  100 +-
 .../wormhole_b0/llk_lib/llk_unpack_reduce.h   |   79 +-
 .../wormhole_b0/llk_lib/llk_unpack_tilize.h   |   80 +-
 .../wormhole_b0/llk_lib/llk_unpack_untilize.h |   97 +-
 .../wormhole_b0/llk_ops/tilize/chlkc_math.cpp |   33 -
 .../llk_ops/tilize/chlkc_math_fidelity.h      |    5 -
 .../wormhole_b0/llk_ops/tilize/chlkc_pack.cpp |   37 -
 .../llk_ops/tilize/chlkc_pack_data_format.h   |   10 -
 .../llk_ops/tilize/chlkc_unpack.cpp           |   36 -
 .../llk_ops/tilize/chlkc_unpack_data_format.h |   10 -
 .../llk_ops/tilize/hlk_args_struct_init.h     |   11 -
 .../wormhole_b0/llk_ops/tilize/loop_count.h   |    5 -
 .../llk_ops/untilize/chlkc_math.cpp           |   33 -
 .../llk_ops/untilize/chlkc_math_fidelity.h    |    5 -
 .../llk_ops/untilize/chlkc_pack.cpp           |   37 -
 .../llk_ops/untilize/chlkc_pack_data_format.h |   10 -
 .../llk_ops/untilize/chlkc_unpack.cpp         |   37 -
 .../untilize/chlkc_unpack_data_format.h       |   10 -
 .../llk_ops/untilize/hlk_args_struct_init.h   |   12 -
 .../wormhole_b0/llk_ops/untilize/loop_count.h |    5 -
 .../{common/inc => metal/common}/chlkc_list.h |    1 +
 .../metal/common/metal_ckernel_globals.h      |   62 +
 .../metal/common/metal_compile_time_args.h    |    9 +
 .../metal/common/metal_mod_div_lib.h          |   92 ++
 .../wormhole_b0/metal/common/tt_log.h         |   16 +
 .../metal/llk_api/llk_math_binary_api.h       |   86 ++
 .../metal/llk_api/llk_math_binary_sfpu_api.h  |   70 +
 .../metal/llk_api/llk_math_common_api.h       |  108 ++
 .../metal/llk_api/llk_math_matmul_api.h       |   69 +
 .../metal/llk_api/llk_math_reduce_api.h       |   28 +
 .../llk_api/llk_math_unary_datacopy_api.h     |   36 +
 .../metal/llk_api/llk_math_unary_sfpu_api.h   |  345 +++++
 .../metal/llk_api/llk_op_info_api.h           |   23 +
 .../wormhole_b0/metal/llk_api/llk_pack_api.h  |  270 ++++
 .../llk_api}/llk_param_structs.h              |    0
 .../llk_api/llk_sfpu}/ckernel_reverseops.h    |    0
 .../llk_api/llk_sfpu}/ckernel_sfpu_cdf.h      |    0
 .../llk_sfpu}/ckernel_sfpu_converter.h        |    0
 .../llk_api/llk_sfpu}/ckernel_sfpu_elu.h      |    0
 .../llk_api/llk_sfpu}/ckernel_sfpu_erf_erfc.h |    0
 .../llk_api/llk_sfpu}/ckernel_sfpu_erfinv.h   |    0
 .../llk_api/llk_sfpu}/ckernel_sfpu_exp.h      |    0
 .../llk_api/llk_sfpu}/ckernel_sfpu_gelu.h     |    0
 .../llk_api/llk_sfpu}/ckernel_sfpu_i0.h       |    0
 .../llk_sfpu}/ckernel_sfpu_isinf_isnan.h      |    0
 .../llk_sfpu}/ckernel_sfpu_logical_not_noti.h |    0
 .../llk_api/llk_sfpu}/ckernel_sfpu_recip.h    |    0
 .../llk_api/llk_sfpu}/ckernel_sfpu_relu.h     |    0
 .../llk_api/llk_sfpu}/ckernel_sfpu_sqrt.h     |    0
 .../llk_sfpu}/ckernel_sfpu_trigonometry.h     |    0
 .../llk_math_eltwise_unary_sfpu_0_param.h     |    0
 .../llk_math_eltwise_unary_sfpu_1_param.h     |    0
 ..._math_eltwise_unary_sfpu_common_includes.h |    2 +
 .../llk_math_eltwise_unary_sfpu_elu.h         |    0
 .../llk_math_eltwise_unary_sfpu_erf_erfc.h    |    0
 .../llk_math_eltwise_unary_sfpu_erfinv.h      |    0
 .../llk_math_eltwise_unary_sfpu_exp.h         |    0
 .../llk_math_eltwise_unary_sfpu_gelu.h        |    0
 .../llk_math_eltwise_unary_sfpu_i0.h          |    0
 .../llk_math_eltwise_unary_sfpu_init.h        |   14 +
 .../llk_math_eltwise_unary_sfpu_isinf_isnan.h |    0
 ...math_eltwise_unary_sfpu_logical_not_noti.h |    0
 .../llk_math_eltwise_unary_sfpu_recip.h       |    0
 .../llk_math_eltwise_unary_sfpu_relu.h        |    0
 .../llk_math_eltwise_unary_sfpu_reverseops.h  |    0
 .../llk_math_eltwise_unary_sfpu_sqrt.h        |    0
 ...llk_math_eltwise_unary_sfpu_trigonometry.h |    0
 .../llk_api/llk_sfpu/metal_ckernel_sfpu.h     |  778 ++++++++++
 .../metal/llk_api/llk_unpack_AB_api.h         |   85 ++
 .../metal/llk_api/llk_unpack_AB_matmul_api.h  |  136 ++
 .../metal/llk_api/llk_unpack_A_api.h          |   89 ++
 .../metal/llk_api/llk_unpack_common_api.h     |  141 ++
 .../metal/llk_api/llk_unpack_reduce_api.h     |   94 ++
 .../metal/llk_api/llk_unpack_tilize_api.h     |   93 ++
 .../metal/llk_api/llk_unpack_untilize_api.h   |   96 ++
 .../wormhole_b0/metal/llk_io/llk_io.cc        |    3 +
 .../wormhole_b0/metal/llk_io/llk_io.h         |    6 +
 .../{llk_lib => metal/llk_io}/llk_io_pack.h   |    9 +-
 .../{llk_lib => metal/llk_io}/llk_io_unpack.h |   12 +-
 .../wormhole_b0/metal/llk_io/llk_operands.h   |   46 +
 .../wormhole_b0/metal/llk_io/llk_outputs.h    |   55 +
 tt_metal/hw/firmware/src/brisc.cc             |    2 +-
 tt_metal/hw/firmware/src/brisck.cc            |    2 +-
 tt_metal/hw/firmware/src/ncrisc.cc            |    2 +-
 tt_metal/hw/firmware/src/ncrisck.cc           |    2 +-
 tt_metal/hw/firmware/src/trisc.cc             |   11 +-
 tt_metal/hw/firmware/src/trisck.cc            |    8 +-
 tt_metal/hw/inc/debug/fw_debug.h              |    3 +
 tt_metal/include/compute_kernel_api.h         |   27 +-
 tt_metal/include/compute_kernel_api/bcast.h   |    8 +-
 tt_metal/include/compute_kernel_api/cb_api.h  |    8 +
 .../compute_kernel_api/common_globals.h       |    8 +-
 .../compute_kernel_api/eltwise_binary.h       |    4 +-
 .../eltwise_unary/eltwise_unary.h             |    4 +-
 tt_metal/include/compute_kernel_api/matmul.h  |   13 +-
 tt_metal/include/compute_kernel_api/reduce.h  |    6 +-
 .../compute_kernel_api/tile_move_copy.h       |    4 +-
 tt_metal/include/compute_kernel_api/tilize.h  |    4 +-
 .../include/compute_kernel_api/transpose_wh.h |    4 +-
 .../include/compute_kernel_api/untilize.h     |    4 +-
 150 files changed, 6226 insertions(+), 3247 deletions(-)
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_noc.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_io_headers.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_math_headers.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_packer_headers.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_unpack_headers.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/cpriority_queue.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_3c.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math.cpp
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math_fidelity.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack.cpp
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack_data_format.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack.cpp
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack_data_format.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/hlk_args_struct_init.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/loop_count.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math.cpp
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math_fidelity.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack.cpp
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack_data_format.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack.cpp
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack_data_format.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/hlk_args_struct_init.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/loop_count.h
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/common}/chlkc_list.h (96%)
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/common/tt_log.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_common_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_reduce_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_datacopy_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_op_info_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api}/llk_param_structs.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_reverseops.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_cdf.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_converter.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_elu.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_erf_erfc.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_erfinv.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_exp.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_gelu.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_i0.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_isinf_isnan.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_logical_not_noti.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_recip.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_relu.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_sqrt.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_trigonometry.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_0_param.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_1_param.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_common_includes.h (83%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_elu.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_erf_erfc.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_erfinv.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_exp.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_gelu.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_i0.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_init.h (66%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_isinf_isnan.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_logical_not_noti.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_recip.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_relu.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_reverseops.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_sqrt.h (100%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_trigonometry.h (100%)
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.cc
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_io}/llk_io_pack.h (98%)
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_io}/llk_io_unpack.h (97%)
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h

diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp
index c68f206eb31..7717eb5676c 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp
@@ -4,9 +4,9 @@
 
 #include <cstdint>
 #include "llk_math_common.h"
-#include "llk_math_eltwise_unary_datacopy.h"
-#include "llk_math_eltwise_unary_datacopy.h"
-#include "llk_math_matmul.h"
+#include "llk_math_unary_datacopy_api.h"
+#include "llk_math_unary_datacopy_api.h"
+#include "llk_math_matmul_api.h"
 namespace NAMESPACE
 {
 
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp
index 7f6b7684c68..751693a6217 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp
@@ -3,11 +3,11 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include <cstdint>
-#include "llk_unpack_common.h"
-#include "llk_unpack_tilize.h"
-#include "llk_unpack_untilize.h"
-#include "llk_unpack_A.h"
-#include "llk_unpack_AB_matmul.h"
+#include "llk_unpack_common_api.h"
+#include "llk_unpack_tilize_api.h"
+#include "llk_unpack_untilize_api.h"
+#include "llk_unpack_A_api.h"
+#include "llk_unpack_AB_matmul_api.h"
 namespace NAMESPACE
 {
 
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp
index bdc0507c5ce..0665298d117 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp
@@ -4,8 +4,8 @@
 
 #include <cstdint>
 #include "llk_math_common.h"
-#include "llk_math_eltwise_binary.h"
-#include "llk_math_eltwise_unary_datacopy.h"
+#include "llk_math_binary_api.h"
+#include "llk_math_unary_datacopy_api.h"
 
 namespace NAMESPACE
 {
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp
index 7f1e967ac54..d9d15e7a1fa 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp
@@ -3,9 +3,9 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include <cstdint>
-#include "llk_unpack_common.h"
-#include "llk_unpack_AB.h"
-#include "llk_unpack_untilize.h"
+#include "llk_unpack_common_api.h"
+#include "llk_unpack_AB_api.h"
+#include "llk_unpack_untilize_api.h"
 
 namespace NAMESPACE
 {
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp
index a47d5e02e24..10ede233bbd 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp
@@ -14,7 +14,7 @@ namespace NAMESPACE {
 
 #ifdef TRISC_MATH
 #include "llk_math_common.h"
-#include "llk_math_eltwise_unary_datacopy.h"
+#include "llk_math_unary_datacopy_api.h"
 
 void math_main()
 {
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp
index 51b21cff002..2dced1a4c0d 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp
@@ -4,8 +4,6 @@
 
 #include <cstdint>
 
-#include "llk_3c.h"
-
 namespace NAMESPACE {
 void MAIN {
     // expands to hlk_relu_config(nullptr, 1); for relu only
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp
index 56be069bed3..f4feda0dbae 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp
@@ -16,8 +16,8 @@ namespace NAMESPACE {
 #ifdef TRISC_MATH
 #include <cstdint>
 #include "llk_math_common.h"
-#include "llk_math_eltwise_binary.h"
-#include "llk_math_eltwise_unary_datacopy.h"
+#include "llk_math_binary_api.h"
+#include "llk_math_unary_datacopy_api.h"
 
 void math_main()
 {
@@ -49,9 +49,9 @@ void math_main()
 
 #ifdef TRISC_UNPACK
 #include <cstdint>
-#include "llk_unpack_common.h"
-#include "llk_unpack_AB.h"
-#include "llk_unpack_untilize.h"
+#include "llk_unpack_common_api.h"
+#include "llk_unpack_AB_api.h"
+#include "llk_unpack_untilize_api.h"
 
 void unpack_main()
 {
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
index 5c5489622ec..b2de68e862a 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
@@ -49,11 +49,11 @@ constexpr uint PACK_FLUSH_COUNTERS = // counters flush
     (1 << PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_SHAMT) |
     (1 << PACK_COUNTERS_SEC2_pack_xys_per_tile_SHAMT);
 
-extern volatile uint * const reg_base;
-extern volatile uint * const pc_buf_base;
-extern volatile uint * const regfile;
+extern volatile uint * reg_base;
+extern volatile uint * pc_buf_base;
+extern volatile uint * regfile;
 extern uint *regmem;
-extern volatile uint * const instrn_buffer;
+extern volatile uint * instrn_buffer;
 extern volatile uint *dbg_event_scratch;
 extern volatile uint local_mem_barrier;
 
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h
index 7d1974639fe..e17a51820e1 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h
@@ -6,7 +6,6 @@
 
 #include "ckernel_defs.h"
 #include "ckernel.h"
-#include "noc_nonblocking_api.h"
 
 #include "sfpi.h"
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h
index a0548a80deb..2f72476ade2 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h
@@ -2,7 +2,6 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-
 #pragma once
 
 #include "risc_attribs.h"
@@ -27,49 +26,95 @@
 #define GPR_DEBUG_REGFILE 0
 #endif
 
+#ifdef PERF_DUMP
+#define DECOUPLINGS_EN (SKIP_UNP || MATH_PACK_DECOUPLE)
+#else
+#define SKIP_UNP 0
+#define MATH_PACK_DECOUPLE 0
+#define DECOUPLINGS_EN 0
+#define OVERLAY_DECOUPLE 0
+#endif
+
+
+#ifndef INSERT_UNPACK_DELAY
+#define INSERT_UNPACK_DELAY 0
+#endif
+
+#ifndef INSERT_MATH_DELAY
+#define INSERT_MATH_DELAY 0
+#endif
+
+#ifndef INSERT_PACK_DELAY
+#define INSERT_PACK_DELAY 0
+#endif
+
+#define DELAY_EN (INSERT_UNPACK_DELAY || INSERT_PACK_DELAY || INSERT_MATH_DELAY)
+
+#define TT_ALWAYS_INLINE inline __attribute__ ((always_inline))
+
 #include <cstdint>
 
 #include "ckernel_include.h"
-#include "debug/fw_debug.h"
 #include "tensix.h"
+#include "debug/fw_debug.h"
 #include "eth_l1_address_map.h"
-#include "noc_overlay_parameters.h"
-#include "stream_io_map.h"
 #include "hostdevcommon/common_runtime_address_map.h"
-#include "limits.h"
 // #include <cstring>
-//#include "perf_lib/scratch_api.h" // not used unless perf dump enabled?
-
 
 namespace ckernel
 {
 
-#define get_compile_time_arg_val(arg_idx) KERNEL_COMPILE_TIME_ARG_ ## arg_idx
-
 constexpr uint PACK_FLUSH_COUNTERS = // counters flush
     (1 << PACK_COUNTERS_SEC2_pack_per_xy_plane_SHAMT) |
     (1 << PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_SHAMT) |
     (1 << PACK_COUNTERS_SEC2_pack_xys_per_tile_SHAMT);
 
-extern volatile uint tt_reg_ptr * const reg_base;
-extern volatile uint tt_reg_ptr * const pc_buf_base;
-extern volatile uint tt_reg_ptr * const regfile;
-extern uint tt_reg_ptr * regmem;
-extern volatile uint tt_reg_ptr * const instrn_buffer;
+constexpr uint RESET_VAL = 0;
+constexpr uint KERNEL_IN_PROGRESS = 15;
+constexpr uint KERNEL_COMPLETE = 1;
+
+extern volatile uint tt_reg_ptr * reg_base;
+extern volatile uint tt_reg_ptr * pc_buf_base;
+extern volatile uint tt_reg_ptr * regfile;
+extern volatile uint tt_reg_ptr * instrn_buffer;
+extern volatile uint tt_reg_ptr *mailbox_base[4];
 extern volatile uint tt_reg_ptr *dbg_event_scratch;
+extern volatile uint tt_reg_ptr *trisc_l1_mailbox;
+extern volatile uint8_t tt_l1_ptr *debug_buffer;
 
 extern uint32_t cfg_state_id;
 extern uint32_t dest_offset_id;
 extern uint32_t dbg_event_index;
 extern uint32_t dbg_event_end;
 
-extern uint32_t op_info_offset;
+extern volatile uint16_t tt_reg_ptr *debug_mailbox_base;
+extern uint8_t mailbox_index;
+const extern uint8_t mailbox_end;
 // Internal scope to namespace methods only (C++ does not allow namespace private ownership)
 namespace internal {
 }
 
-void tensix_sync();
-void mop_sync();
+inline void tensix_sync()
+{
+    volatile uint foo = 0;
+    volatile uint *fooptr = &foo;
+    // Write to pc buffer to push all writes ahead of us; otherwise, the pc buffer read can bypass older writes
+    pc_buf_base[1] = foo;
+
+    // Now read -- this read will block until we're idle
+    *fooptr = pc_buf_base[1];
+}
+
+inline void mop_sync()
+{
+    volatile uint foo = 0;
+    volatile uint *fooptr = &foo;
+    // Write to pc buffer to push all writes ahead of us; otherwise, the pc buffer read can bypass older writes
+    pc_buf_base[2] = foo;
+
+    // Now read -- this read will block until mops are done
+    *fooptr = pc_buf_base[2];
+}
 
 inline void sync_regfile_write(const uint index);
 
@@ -84,6 +129,7 @@ static constexpr bool is_valid(const T val, const uint8_t wid)
 inline void mmio_register_write(register_space_e space, uint addr, uint data)
 {
     const uint regaddr = (space << 6) | (addr & 0x3F);
+    //FWLOG2("Regaddr: 0x%x, data: 0x%x", regaddr, data);
     reg_base[regaddr] = data;
 }
 
@@ -122,6 +168,17 @@ inline void t6_semaphore_get(const uint8_t index)
     TTI_SEMGET(semaphore::t6_sem(index));
 }
 
+template <uint WaitRes>
+inline void t6_semaphore_wait_on_max(const uint8_t index)
+{
+    TTI_SEMWAIT(WaitRes, semaphore::t6_sem(index), p_stall::STALL_ON_MAX);
+}
+template <uint WaitRes>
+inline void t6_semaphore_wait_on_zero(const uint8_t index)
+{
+    TTI_SEMWAIT(WaitRes, semaphore::t6_sem(index), p_stall::STALL_ON_ZERO);
+}
+
 // Tensix thread semaphore get optionally stalled
 inline void t6_semaphore_init(const uint8_t index, const uint8_t min_value, const uint8_t max_value)
 {
@@ -154,7 +211,7 @@ inline void cfg_write(uint cfg_addr32, uint data)
 inline uint cfg_read(uint cfg_addr32)
 {
     // Declared here instead of globally to prevent direct access, which might ignore current state ID
-    volatile uint32_t tt_reg_ptr *cfg_regs = reinterpret_cast<volatile uint32_t tt_reg_ptr *>(TENSIX_CFG_BASE);
+    volatile uint *cfg_regs = reinterpret_cast<volatile uint *>(TENSIX_CFG_BASE);
     return cfg_regs[cfg_addr(cfg_addr32)];
 }
 
@@ -199,7 +256,11 @@ inline void mop_run(const uint8_t type, const uint8_t count)
     TTI_MOP(type, count - 1, 0); // Run the MOP
 }
 
-inline __attribute__((always_inline)) uint32_t reg_read(uint32_t addr)
+// Register read. The workaround for bug
+// https://yyz-gitlab.local.tenstorrent.com/tenstorrent/tensix/issues/976
+// is now handled by the compiler;
+// the workaround is needed only for GS.
+inline uint reg_read(uint32_t addr)
 {
     volatile uint tt_reg_ptr *p_reg = reinterpret_cast<volatile uint tt_reg_ptr *> (addr);
     return p_reg[0];
@@ -310,146 +371,169 @@ inline void cfg_reg_rmw_tensix(uint32_t val)
     }
 }
 
-template <class T>
-inline std::uint32_t memory_cast(T *object_ptr)
+inline void mailbox_write(const uint8_t thread, const uint32_t data)
 {
-    return reinterpret_cast<uint32_t>(object_ptr);
+    mailbox_base[thread + 1][0] = data;
 }
 
-inline uint64_t read_wall_clock()
+// Blocking read
+inline uint32_t mailbox_read(const uint8_t thread)
 {
-   uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
-   uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
-   return ((uint64_t)timestamp_high << 32) | timestamp_low;
+    return mailbox_base[thread + 1][0];
 }
 
-void debug_dump(const uint8_t *data, uint32_t byte_size);
-void debug_dump_seek(uint8_t offset);
+inline bool mailbox_not_empty(const uint8_t thread)
+{
+    return mailbox_base[thread + 1][1] > 0;
+}
 
+inline void mailbox_write_full(const uint8_t thread, const uint32_t data)
+{
+    mailbox_base[thread][0] = data;
+}
 
-inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b)
+// Blocking read
+inline uint32_t mailbox_read_full(const uint8_t thread)
 {
-  unsigned int r = 0;
-  while (a)
-    {
-      if (a & 1)
-        r += b;
-      a >>= 1;
-      b <<= 1;
-    }
-  return r;
-}
-
-inline __attribute__((always_inline)) uint32_t fast_udiv_12(uint32_t n)
-{
-    // Uses embedding style magic number
-    // * fixed point 1/12 then shifting.
-    // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm
-    return (((uint64_t) n * 0xAAAAAAAB) >> 32) >> 3;
-}
-
-inline __attribute__((always_inline)) uint32_t fast_udiv_94(uint32_t n)
-{
-    // Uses embedding style magic number
-    // * fixed point 1/12 then shifting.
-    // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm
-    return (((uint64_t) n * 0xAE4C415D) >> 32) >> 6;
-}
-
-template <uint32_t d>
-inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n)
-{
-    if constexpr (d == 12) {
-        // fast divide for 12 divisor
-        return fast_udiv_12(n);
-    } else if constexpr (d == 94) {
-        // fast divide for 94 divisor. Handles Banked L1 address generation for E75
-        return fast_udiv_94(n);
-    } else {
-        // generic divide from llvm
-        const unsigned n_uword_bits = sizeof(uint32_t) * CHAR_BIT;
-        unsigned int q;
-        unsigned int r;
-        unsigned sr;
-        /* special cases */
-        if (d == 0)
-            return 0; /* ?! */
-        if (n == 0)
-            return 0;
-        sr = __builtin_clz(d) - __builtin_clz(n);
-        /* 0 <= sr <= n_uword_bits - 1 or sr large */
-        if (sr > n_uword_bits - 1)  /* d > r */
-            return 0;
-        if (sr == n_uword_bits - 1)  /* d == 1 */
-            return n;
-        ++sr;
-        /* 1 <= sr <= n_uword_bits - 1 */
-        /* Not a special case */
-        q = n << (n_uword_bits - sr);
-        r = n >> sr;
-        unsigned int  carry = 0;
-        for (; sr > 0; --sr)
-        {
-            /* r:q = ((r:q)  << 1) | carry */
-            r = (r << 1) | (q >> (n_uword_bits - 1));
-            q = (q << 1) | carry;
-            /* carry = 0;
-             * if (r.all >= d.all)
-             * {
-             *      r.all -= d.all;
-             *      carry = 1;
-             * }
-             */
-            const int s = (unsigned int)(d - r - 1) >> (n_uword_bits - 1);
-            carry = s & 1;
-            r -= d & s;
-        }
-        q = (q << 1) | carry;
-        return q;
-    }
+    return mailbox_base[thread][0];
 }
-template <uint32_t d>
-inline __attribute__((always_inline)) uint32_t umodsi3_const_divisor(uint32_t a)
+
+inline bool mailbox_not_empty_full(const uint8_t thread)
 {
-    return a - udivsi3_const_divisor<d>(a) * d;
+    return mailbox_base[thread][1] > 0;
 }
 
-inline void tensix_sync()
+inline void trisc_l1_mailbox_write(const uint data)
 {
-    volatile uint foo = 0x0;
-    volatile uint *fooptr = &foo;
-    // Write to pc buffer to push all writes ahead of us.. otherwise, the pc buffer read can bypass older writes
-    pc_buf_base[1] = foo;
+    trisc_l1_mailbox[0] = data;
+}
 
-    // Now read -- this read will block until we're idle
-    *fooptr = pc_buf_base[1];
+inline uint trisc_l1_mailbox_read()
+{
+    return trisc_l1_mailbox[0];
 }
 
-inline void mop_sync()
+template <class T>
+inline std::uint32_t memory_cast(T *object_ptr)
 {
-    volatile uint foo = 0x0;
-    volatile uint *fooptr = &foo;
-    // Write to pc buffer to push all writes ahead of us.. otherwise, the pc buffer read can bypass older writes
-    pc_buf_base[2] = foo;
+    return reinterpret_cast<uint32_t>(object_ptr);
+}
 
-    // Now read -- this read will block until mops are done
-    *fooptr = pc_buf_base[2];
+inline void record_mailbox_value(uint16_t event_value) {
+  if (mailbox_index < mailbox_end) {
+    debug_mailbox_base[mailbox_index] = event_value;
+    mailbox_index++;
+  }
 }
 
-inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) {
+inline void record_mailbox_value_with_index(uint8_t index, uint16_t event_value) {
+  if (index < mailbox_end) {
+    debug_mailbox_base[index] = event_value;
+  }
+}
 
-    uint32_t* op_info_ptr = reinterpret_cast<uint32_t*>(OP_INFO_BASE_ADDR + op_info_offset);
-    static constexpr uint32_t op_info_num_items = 7;
+// Initialize debug scratch mailbox values and range
+inline void clear_mailbox_values(uint16_t value = 0) {
+  for (int i = 0; i < mailbox_end; i++)
+    debug_mailbox_base[i] = value;
+}
 
-    volatile uint32_t* op_info_struct_ptr = reinterpret_cast<volatile uint32_t*>(&op_info_struct);
-    for (uint32_t i = 0; i < op_info_num_items; i++) {
-        op_info_struct_ptr[i] = op_info_ptr[i];
-    }
-    op_info_offset += 28;
+inline uint64_t read_wall_clock()
+{
+   uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
+   uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
+   return ((uint64_t)timestamp_high << 32) | timestamp_low;
+}
+
+inline void record_kernel_runtime(uint64_t kernel_runtime) {
+    debug_mailbox_base[mailbox_end - 4] = kernel_runtime & 0xffff;
+    debug_mailbox_base[mailbox_end - 3] = (kernel_runtime >> 16) & 0xffff;
+    debug_mailbox_base[mailbox_end - 2] = (kernel_runtime >> 32) & 0xffff;
+    debug_mailbox_base[mailbox_end - 1] = (kernel_runtime >> 48) & 0xffff;
+}
+
+void debug_dump(const uint8_t *data, uint32_t byte_size);
+void debug_dump_seek(uint8_t offset);
 
-    if (op_info_offset == OP_INFO_SIZE) {
-        op_info_offset = 0; // In case we go out of bounds
+inline void stall_kernel(uint32_t num_cycles) {
+#if DELAY_EN > 0
+    TT_LLK_DUMP("stall_kernel({})", num_cycles);
+    uint32_t start_clk_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
+    uint32_t elapsed_time = 0;
+    while (elapsed_time <= num_cycles) {
+        uint32_t current_clk_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
+        if (current_clk_l >= start_clk_l) {
+            elapsed_time = current_clk_l - start_clk_l;
+        } else {
+            elapsed_time = 0xffffffff - (start_clk_l - current_clk_l);
+        }
     }
+#endif
 }
 
+#if defined(PERF_DUMP) || DELAY_EN > 0
+extern bool record_perf_events;
+#endif
+
+// This API is inserted at the beginning of each input loop.
+// Wait for all instructions of the previous loop to finish before starting the next loop.
+// If PERF_DUMP is enabled, always wait, but only for the inputs that perf dump is enabled for.
+// If PERF_DUMP is enabled and delay is not, there is no need to insert these APIs for unpack and math.
+template<int thread_id>
+inline void serialize_input_loop_start() {
+    #if defined(PERF_DUMP) || DELAY_EN > 0
+        TT_LLK_DUMP("serialize_input_loop_start<{}>()", thread_id);
+        if constexpr (thread_id == 0) {
+    #if DELAY_EN > 0
+            t6_semaphore_post(semaphore::UNPACK_MATH_DONE);
+            while (semaphore_read(semaphore::UNPACK_MATH_DONE) == 0) {}
+    #endif
+
+        } else if (thread_id == 1) {
+    #if DELAY_EN > 0
+            t6_semaphore_post(semaphore::UNPACK_MATH_DONE);
+            while (semaphore_read(semaphore::UNPACK_MATH_DONE) == 0) {}
+    #endif
+
+        } else if (thread_id == 2) {
+    #if DELAY_EN == 0
+            if (record_perf_events) {
+    #endif
+            t6_semaphore_post(semaphore::PACK_DONE);
+            while (semaphore_read(semaphore::PACK_DONE) == 0) {}
+    #if DELAY_EN == 0
+            }
+    #endif
+        }
+    #endif
+}
+
+template<int thread_id>
+inline void serialize_input_loop_end() {
+    #if defined(PERF_DUMP) || DELAY_EN > 0
+        TT_LLK_DUMP("serialize_input_loop_end<{}>()", thread_id);
+        if constexpr (thread_id == 0) {
+        #if DELAY_EN > 0
+                t6_semaphore_get<p_stall::UNPACK>(semaphore::UNPACK_MATH_DONE);
+                while (semaphore_read(semaphore::UNPACK_MATH_DONE) > 0) {}
+        #endif
+
+            } else if (thread_id == 1) {
+        #if DELAY_EN > 0
+                t6_semaphore_get<p_stall::MATH>(semaphore::UNPACK_MATH_DONE);
+                while (semaphore_read(semaphore::UNPACK_MATH_DONE) > 0) {}
+        #endif
+
+            } else if (thread_id == 2) {
+        #if DELAY_EN == 0
+                if (record_perf_events) {
+        #endif
+                t6_semaphore_get<p_stall::PACK>(semaphore::PACK_DONE);
+                while (semaphore_read(semaphore::PACK_DONE) > 0) {}
+        #if DELAY_EN == 0
+                }
+        #endif
+            }
+    #endif
+    }
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h
index 46b5b775903..ffd8ad6dae9 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h
@@ -2,10 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-
 #pragma once
 
-#include "llk_defs.h"
 #include "ckernel_ops.h"
 #include "tensix_types.h"
 
@@ -68,12 +66,30 @@ enum PackSelMask
     PACK_23=0xC
 };
 
+/*
+Stochastic rounding modes:
+    None: No stochastic rounding enabled, default rounding is round to nearest even.
+    Fpu: Enables stochastic rounding for every accumulation in the fpu
+    Pack: Enables stochastic rounding in both gasket and packer. Gasket rounding is in
+    data format conversion stage from dest format to pack_src_format. Packer rounding
+    is in data format conversion stage from pack_src_format to pack_dst_format.
+    All: Enables fpu, pack and gasket rounding.
+*/
+enum class StochRndMode : std::uint8_t
+{
+    None    = 0,
+    Fpu     = 1,
+    Pack    = 2,
+    All     = 0xf,
+    Invalid = 0xff,
+};
+
 constexpr std::uint32_t FACE_HEIGHT = 16;
 constexpr std::uint32_t FACE_WIDTH  = 16;
 constexpr std::uint32_t TILE_HEIGHT = 32;
 constexpr std::uint32_t TILE_WIDTH  = 32;
 constexpr std::uint32_t DATUMS_PER_ROW = 16;
-constexpr std::uint32_t TILE_HEADER_SIZE = 0;
+constexpr std::uint32_t TILE_HEADER_SIZE = 1;
 
 constexpr std::uint32_t FACE_R_DIM = FACE_HEIGHT;
 constexpr std::uint32_t FACE_C_DIM = FACE_WIDTH ;
@@ -92,29 +108,27 @@ static_assert((DEST_NUM_TILES_FP16 & (DEST_NUM_TILES_FP16 - 1)) == 0);
 #define HI_16(REG) (2 * (REG) + 1)
 
 
-// all sizes are headerless
-// in 16B words, in L1/DRAM, headerless
+/*
 constexpr static std::int32_t MUL_TILE_SIZE_AND_INDEX(uint format, uint index) {
-    switch (format&0x1F) {
-        case ((uint8_t)DataFormat::Float32): return ((index<<8));
+    switch (format&0xF) {
+        case ((uint8_t)DataFormat::Float32): return ((index<<8)+(index<<1));
         case ((uint8_t)DataFormat::Float16):
-        case ((uint8_t)DataFormat::Float16_b): return ((index<<7));
+        case ((uint8_t)DataFormat::Float16_b): return ((index<<7)+(index<<1));
         case ((uint8_t)DataFormat::Bfp8):
-        case ((uint8_t)DataFormat::Bfp8_b): return ((index<<6)+(index<<2));
+        case ((uint8_t)DataFormat::Bfp8_b): return ((index<<6)+(index<<2)+(index<<1));
         case ((uint8_t)DataFormat::Bfp4):
-        case ((uint8_t)DataFormat::Bfp4_b): return ((index<<5)+(index<<2));
+        case ((uint8_t)DataFormat::Bfp4_b): return ((index<<5)+(index<<2)+(index<<1));
         case ((uint8_t)DataFormat::Bfp2):
-        case ((uint8_t)DataFormat::Bfp2_b): return ((index<<4)+(index<<2));
+        case ((uint8_t)DataFormat::Bfp2_b): return ((index<<4)+(index<<2)+(index<<1));
         case ((uint8_t)DataFormat::Int8):
-        case ((uint8_t)DataFormat::Lf8): return ((index<<6));
+        case ((uint8_t)DataFormat::Lf8): return ((index<<6)+(index<<1));
         //Keep default as Bfp8?
-        default: return ((index<<6)+(index<<2));
+        default: return ((index<<6)+(index<<2)+(index<<1));
     };
 }
 
-// in Bytes, in DST REG, headerless
 constexpr static std::int32_t MUL_DEST_TILE_SIZE_AND_INDEX(uint format, uint index) {
-    switch (format&0x1F) {
+    switch (format&0xF) {
         case ((uint8_t)DataFormat::Float32): return (index<<12);
         case ((uint8_t)DataFormat::Float16):
         case ((uint8_t)DataFormat::Float16_b): return (index<<11);
@@ -130,27 +144,25 @@ constexpr static std::int32_t MUL_DEST_TILE_SIZE_AND_INDEX(uint format, uint ind
     };
 }
 
-// 16B words, L1/dram headerless!
 constexpr static std::int32_t GET_L1_TILE_SIZE(uint format) {
-    switch (format&0x1F) {
-        case ((uint8_t)DataFormat::Float32): return ((4096>>4));
+    switch (format&0xF) {
+        case ((uint8_t)DataFormat::Float32): return ((4096>>4)+(32>>4));
         case ((uint8_t)DataFormat::Float16):
-        case ((uint8_t)DataFormat::Float16_b): return ((2048>>4));
+        case ((uint8_t)DataFormat::Float16_b): return ((2048>>4)+(32>>4));
         case ((uint8_t)DataFormat::Bfp8):
-        case ((uint8_t)DataFormat::Bfp8_b): return ((1024>>4)+(64>>4));
+        case ((uint8_t)DataFormat::Bfp8_b): return ((1024>>4)+(64>>4)+(32>>4));
         case ((uint8_t)DataFormat::Bfp4):
-        case ((uint8_t)DataFormat::Bfp4_b): return ((512>>4)+(64>>4));
+        case ((uint8_t)DataFormat::Bfp4_b): return ((512>>4)+(64>>4)+(32>>4));
         case ((uint8_t)DataFormat::Bfp2):
-        case ((uint8_t)DataFormat::Bfp2_b): return ((256>>4)+(64>>4));
+        case ((uint8_t)DataFormat::Bfp2_b): return ((256>>4)+(64>>4)+(32>>4));
         case ((uint8_t)DataFormat::Int8):
-        case ((uint8_t)DataFormat::Lf8): return ((1024>>4));
-        default: return ((1024>>4)+(64>>4));
+        case ((uint8_t)DataFormat::Lf8): return ((1024>>4)+(32>>4));
+        default: return ((1024>>4)+(64>>4)+(32>>4));
     };
 }
 
-
 constexpr static std::int32_t GET_DEST_TILE_BYTE_SIZE(uint format) {
-    switch (format&0x1F) {
+    switch (format&0xF) {
         case ((uint8_t)DataFormat::Float32): return 4096;
         case ((uint8_t)DataFormat::Float16):
         case ((uint8_t)DataFormat::Float16_b): return 2048;
@@ -165,9 +177,11 @@ constexpr static std::int32_t GET_DEST_TILE_BYTE_SIZE(uint format) {
         default: return 1024;
     };
 }
+*/
 
 constexpr static std::uint32_t GET_L1_HEADERLESS_TILE_SIZE(uint format) {
     switch (format&0xF) {
+        case ((uint8_t)DataFormat::Int32):
         case ((uint8_t)DataFormat::Float32): return (4096>>4);
         case ((uint8_t)DataFormat::Float16):
         case ((uint8_t)DataFormat::Float16_b): return (2048>>4);
@@ -204,8 +218,20 @@ constexpr static bool IS_BFP_A_FORMAT(uint format) {
     };
 }
 
+constexpr static bool IS_A_FORMAT(uint format) {
+    switch (format&0xF) {
+        case ((uint8_t)DataFormat::Lf8):
+        case ((uint8_t)DataFormat::Float16):
+        case ((uint8_t)DataFormat::Bfp8):
+        case ((uint8_t)DataFormat::Bfp4):
+        case ((uint8_t)DataFormat::Bfp2): return true;
+        default: return false;
+    };
+}
+
 constexpr static std::uint32_t SCALE_DATUM_SIZE(uint format, uint datum_count) {
     switch (format&0xF) {
+        case ((uint8_t)DataFormat::Int32):
         case ((uint8_t)DataFormat::Float32): return (datum_count<<2);
         case ((uint8_t)DataFormat::Float16):
         case ((uint8_t)DataFormat::Float16_b): return (datum_count<<1);
@@ -217,4 +243,5 @@ constexpr static std::uint32_t SCALE_DATUM_SIZE(uint format, uint datum_count) {
 #define UPPER_HALFWORD(x) ((x) >> 16)
 
 constexpr int WHB0_ITERATIONS = 8;
+
 } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
index d4ad75f5e7f..90ac67944f5 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
@@ -7,70 +7,15 @@
 #include <cstdint>
 #include "ckernel_structs.h"
 #include "risc_attribs.h"
-#include "tensix_functions.h"
-#include "hostdevcommon/common_runtime_address_map.h"
 
 extern uint32_t cfg_state_id;
 extern uint32_t unp_cfg_context;
 
 extern uint32_t volatile tt_l1_ptr l1_buffer[16];
 
-//extern const int32_t unpack_src_format[24];
-//extern const int32_t unpack_dst_format[24];
-//extern const int32_t pack_src_format[16];
-//extern const int32_t pack_dst_format[16];
-
 extern uint32_t pack_sync_tile_dst_ptr;
 extern uint32_t math_sync_tile_dst_index;
 
-extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS];
-
-extern uint32_t __ldm_bss_start[];
-extern uint32_t __ldm_bss_end[];
-extern uint32_t __ldm_data_start[];
-extern uint32_t __ldm_data_end[];
-extern void (* __init_array_start[])();
-extern void (* __init_array_end[])();
+extern uint32_t __local_mem_rodata_start_addr[];
+extern uint32_t __local_mem_rodata_end_addr[];
 extern uint32_t __firmware_start[];
-
-extern void kernel_init();
-extern void kernel_launch();
-
-inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
-    // Cover L1 load latency of 6 cycles for the bulk of the copy
-    int32_t n = 0;
-    while (n < len - 5) {
-        uint32_t v0 = l1_addr[n + 0];
-        uint32_t v1 = l1_addr[n + 1];
-        uint32_t v2 = l1_addr[n + 2];
-        uint32_t v3 = l1_addr[n + 3];
-        uint32_t v4 = l1_addr[n + 4];
-        uint32_t v5 = l1_addr[n + 5];
-        local_mem_addr[n + 0] = v0;
-        local_mem_addr[n + 1] = v1;
-        local_mem_addr[n + 2] = v2;
-        local_mem_addr[n + 3] = v3;
-        local_mem_addr[n + 4] = v4;
-        local_mem_addr[n + 5] = v5;
-        n += 6;
-    }
-    // Could optimize this further (eg, loop of 2 or 4), probably not worth it
-    while (n < len) {
-        local_mem_addr[n] = l1_addr[n];
-        n++;
-    }
-}
-
-inline void firmware_kernel_common_init(void *init_local_l1_base) {
-
-    // Handle stuff typically done in crt0 in asm.  Easier to do in C
-    wzerorange(__ldm_bss_start, __ldm_bss_end);
-
-    int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
-    uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE;
-    l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words);
-
-    for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
-        (**fptr)();
-    }
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h
index 9b3f032e624..822704cc9e1 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h
@@ -43,13 +43,19 @@ struct p_gpr_unpack
     constexpr static uint FACE_DIM_4x16     = 42;   // Holds face dimension (4x16)
     constexpr static uint FACE_DIM_2x16     = 43;   // Holds face dimension (2x16)
     constexpr static uint FACE_DIM_1x16     = 44;   // Holds face dimension (1x16)
+    constexpr static uint PERF_UNPACK_NUM_TILES_0 = 45;   // num tiles for input operands 0-1
+    constexpr static uint PERF_UNPACK_NUM_TILES_1 = 46;   // num tiles for input operands 2-3
+    constexpr static uint PERF_UNPACK_NUM_TILES_2 = 47;   // num tiles for input operands 4-5
+    constexpr static uint PERF_UNPACK_NUM_TILES_3 = 48;   // num tiles for input operands 6-7
+    constexpr static uint UNPACK_STRIDE     = 52;   // Used to save/restore unpack A stride (UNP0_ADDR_CTRL_ZW_REG_1_Zstride register)
+                                                    // before/after unpacking directly to dest
     constexpr static uint SR_UNPACK_TILIZER_STATE_0 = 54;  // Save unpack state before tilizer is enabled for quick restore
     constexpr static uint SR_UNPACK_TILIZER_STATE_1 = 55;
     constexpr static uint SR_UNPACK_UNTILIZER_STATE_0 = 56;  // Save unpack state before tilizer is enabled for quick restore
     constexpr static uint SR_UNPACK_UNTILIZER_STATE_1 = 57;
     constexpr static uint SR_UNPACK_UNTILIZER_STATE_2 = 58;
     constexpr static uint SR_UNPACK_UNTILIZER_STATE_3 = 59;
-    constexpr static uint SR_UNPACK_UNTILIZER_STATE_4 = 59;
+
 };
 
 // Math GPR thread
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_noc.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_noc.h
deleted file mode 100644
index 18c150d6388..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_noc.h
+++ /dev/null
@@ -1,317 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "debug/fw_debug.h"
-
-#include "noc_overlay_parameters.h"
-
-struct stream_tile_info_t
-{
-    uint32_t base_address;
-    TileHeader tile_header;
-};
-// Functions for accessing NOC overlay registers
-namespace ckernel
-{
-
-typedef volatile uint32_t tt_reg_ptr *regp;
-
-// Only perform the calculation once, as it's expensive to multiply numbers
-inline regp get_stream_reg(uint32_t stream_id)
-{
-    constexpr uint32_t NOC_REGISTER_MMIO_BASE = 0xFFB40000;
-    constexpr uint32_t PER_STREAM_REG_SIZE = 0x1000;
-    return (regp) (NOC_REGISTER_MMIO_BASE + PER_STREAM_REG_SIZE * stream_id);
-}
-
-inline uint32_t get_stream_reg_addr(uint32_t stream_id, uint32_t index)
-{
-    constexpr uint32_t NOC_REGISTER_MMIO_BASE = 0xFFB40000;
-    constexpr uint32_t PER_STREAM_REG_SIZE = 0x1000;
-    return (NOC_REGISTER_MMIO_BASE + PER_STREAM_REG_SIZE * stream_id + (index << 2));
-}
-
-inline void write_stream_register(regp p_stream_reg, uint32_t index, uint32_t value)
-{
-    p_stream_reg[index] = value;
-}
-
-inline uint32_t read_stream_register(const regp p_stream_reg, uint32_t index)
-{
-    return p_stream_reg[index];
-}
-
-inline uint32_t read_stream_register_field(const regp p_stream_reg, uint32_t index, uint32_t shift, uint32_t width)
-{
-    return (read_stream_register(p_stream_reg, index) >> shift) & ((1 << width) - 1);
-}
-
-// Wait until stream has at least 'count' tiles ready
-inline void wait_for_stream_messages(const regp p_stream_reg, const uint count)
-{
-    uint c = 0;
-    do
-    {
-        c = read_stream_register(p_stream_reg, STREAM_NUM_MSGS_RECEIVED_REG_INDEX);
-    } while (c < count);
-}
-
-inline void wait_for_N_stream_messages(const regp p_stream_reg, const uint num_messages) {
-
-    uint c = 0;
-    do {
-        uint32_t msg_info_wr = read_stream_register(p_stream_reg, STREAM_MSG_INFO_WR_PTR_REG_INDEX);
-        uint32_t msg_info = read_stream_register(p_stream_reg, STREAM_MSG_INFO_PTR_REG_INDEX);
-        uint32_t num_msg = read_stream_register(p_stream_reg, STREAM_NUM_MSGS_RECEIVED_REG_INDEX);
-        c = num_msg + (msg_info_wr - msg_info);
-        // wait while we receive all the tiles from this stream
-    } while (c < num_messages);
-}
-
-inline void wait_for_stream_phase(const regp p_stream_reg, const uint phase_id)
-{
-    if (phase_id == 0)
-    {
-        return;
-    }
-    uint p = 0;
-    do
-    {
-        p = read_stream_register(p_stream_reg, STREAM_CURR_PHASE_REG_INDEX);
-    } while (p != phase_id);
-}
-
-inline void update_stream_read_pointer(regp p_stream_reg, const uint amount)
-{
-    write_stream_register(p_stream_reg, STREAM_MSG_INFO_CLEAR_REG_INDEX, amount);
-}
-
-inline uint read_stream_base_address(const regp p_stream_reg, const uint tile_n)
-{
-    return read_stream_register(p_stream_reg, STREAM_RECEIVER_MSG_INFO_REG_INDEX + tile_n * 6 + 0); //-> activations base address for tile n
-}
-
-inline uint read_stream_zero_mask(const regp p_stream_reg, const uint tile_n)
-{
-    return read_stream_register(p_stream_reg, STREAM_RECEIVER_MSG_INFO_REG_INDEX + tile_n * 6 + 4); //-> 32-bit zero mask
-}
-
-// Read tile info from a stream
-inline stream_tile_info_t read_stream_info(const uint tile_index, const regp p_stream_reg)
-{
-    const uint n = tile_index;
-    const uint base_address = read_stream_base_address(p_stream_reg, n);
-
-    TileHeader_u header;
-
-    header.val[0] = read_stream_register(p_stream_reg, STREAM_RECEIVER_MSG_INFO_REG_INDEX + n * 6 + 2); //-> tile n size including header and tile id (15:0 size, 31:16 tile id)
-    FWASSERT("Tile size must be != 0", (header.val[0] & 0xFFFF) != 0);
-    header.val[1] = read_stream_register(p_stream_reg, STREAM_RECEIVER_MSG_INFO_REG_INDEX + n * 6 + 3); //-> tile n meta data size and format
-    header.val[2] = read_stream_zero_mask(p_stream_reg, n); //-> 32-bit zero mask
-    //read_stream_register(p_stream_reg, STREAM_RECEIVER_MSG_INFO_REG_INDEX + n * 6 + 5); //-> Reserved
-
-    return stream_tile_info_t{base_address, header.header};
-}
-
-inline uint read_dis_zero_compress_group_info(const regp p_stream_reg)
-{
-    return read_stream_register(p_stream_reg, STREAM_MSG_GROUP_COMPRESS_REG_INDEX);
-}
-
-// Return the offset of a tile given it's tile id and table address
-inline uint32_t get_indexed_offset(const uint tile_id, const uint weights_offset, const uint table_addr)
-{
-    const uint16_t *weight_offset_table = reinterpret_cast<uint16_t *>(table_addr << 4);
-    uint weight_offset = weight_offset_table[tile_id + weights_offset];
-    return weight_offset;
-}
-
-inline void unpacker_config(const regp p_stream_reg, const uint unpacker_id, const uint fifo_size_factor = 1)
-{
-    uint fifo_base_addr = read_stream_register(p_stream_reg, STREAM_BUF_START_REG_INDEX);
-    uint fifo_size = fifo_size_factor * read_stream_register(p_stream_reg, STREAM_BUF_SIZE_REG_INDEX);
-    cfg_write(unpacker_id ? THCON_SEC1_REG2_Unpack_limit_address_ADDR32 : THCON_SEC0_REG2_Unpack_limit_address_ADDR32,
-        (fifo_base_addr + fifo_size - 1) | (fifo_size << THCON_SEC0_REG2_Unpack_fifo_size_SHAMT));
-}
-
-// Optimized function that reads base addresses and programs registers for one context
-inline void program_halo_strips_cntx0(
-    volatile uint *cfg, const regp p_stream_reg, const uint first_active_tile, const uint unpack_halo_mask, uint *group_dis_zero_compress)
-{
-    //const uint strip1_addr = read_stream_base_address(p_stream_reg, 1);
-    //const uint strip2_addr = read_stream_base_address(p_stream_reg, 2);
-    //const uint strip3_addr = read_stream_base_address(p_stream_reg, 3);
-    //cfg[THCON_SEC0_REG3_Base_cntx1_address_ADDR32] = strip1_addr;
-    //cfg[THCON_SEC0_REG3_Base_cntx2_address_ADDR32] = strip2_addr;
-    //cfg[THCON_SEC0_REG3_Base_cntx3_address_ADDR32] = strip3_addr;
-    uint dis_zero_compress_group_info = read_dis_zero_compress_group_info(p_stream_reg);       // Get uncompress flag for all 4 tiles
-    uint dis_zero_compress_mask = ((dis_zero_compress_group_info & 0x1) << first_active_tile); // Get mask for first active tile
-
-    uint index = 0;
-    uint tile = 0;
-    ;
-    for (uint i = 1; i <= 3; i++)
-    {
-        if (i == first_active_tile)
-            continue;
-
-        if ((unpack_halo_mask >> i) & 0x1)
-        {
-            index++;
-            tile++;
-            const uint strip_addr = read_stream_base_address(p_stream_reg, index);
-            switch (i)
-            {
-            case 1: cfg[THCON_SEC0_REG3_Base_cntx1_address_ADDR32] = strip_addr; break;
-            case 2: cfg[THCON_SEC0_REG3_Base_cntx2_address_ADDR32] = strip_addr; break;
-            case 3: cfg[THCON_SEC0_REG3_Base_cntx3_address_ADDR32] = strip_addr; break;
-            }
-            dis_zero_compress_mask |= (((dis_zero_compress_group_info >> tile) & 0x1) << i);
-        }
-    }
-    *group_dis_zero_compress &= (~(0xf)); // Clear 4 uncompress flags for context 0
-    *group_dis_zero_compress |= dis_zero_compress_mask;
-}
-
-// Optimized function that reads base addresses and programs registers for one context
-// FIXME: this is probably pretty slow.... need to evaluate, and maybe make a separate one for the 'common' case
-// where the unpack halo mask is 0xF
-inline void program_halo_strips_cntx1(
-    volatile uint *cfg, const regp p_stream_reg, const uint first_active_tile, const uint unpack_halo_mask, uint *group_dis_zero_compress)
-{
-    uint dis_zero_compress_group_info = read_dis_zero_compress_group_info(p_stream_reg);       // Get uncompress flag for all 4 tiles
-    uint dis_zero_compress_mask = ((dis_zero_compress_group_info & 0x1) << first_active_tile); // Get mask for first active tile
-
-    uint index = 0;
-    uint tile = 0;
-    for (uint i = 1; i <= 3; i++)
-    {
-        if (i == first_active_tile)
-            continue;
-
-        if ((unpack_halo_mask >> i) & 0x1)
-        {
-            index++;
-            tile++;
-            const uint strip_addr = read_stream_base_address(p_stream_reg, index);
-            switch (i)
-            {
-            case 1: cfg[THCON_SEC0_REG4_Base_cntx5_address_ADDR32] = strip_addr; break;
-            case 2: cfg[THCON_SEC0_REG4_Base_cntx6_address_ADDR32] = strip_addr; break;
-            case 3: cfg[THCON_SEC0_REG4_Base_cntx7_address_ADDR32] = strip_addr; break;
-            }
-            dis_zero_compress_mask |= (((dis_zero_compress_group_info >> tile) & 0x1) << i);
-        }
-    }
-    *group_dis_zero_compress &= (~(0xf0000)); // Clear 4 uncompress flags for context 1
-    *group_dis_zero_compress |= (dis_zero_compress_mask << 16);
-}
-} // namespace ckernel
-
-namespace ckernel::stream
-{
-    // Only perform the calculation once, as it's expensive to multiply numbers
-    inline regp get_reg(uint32_t stream_id)
-    {
-        return ckernel::get_stream_reg(stream_id);
-    }
-
-    inline void wait_for_phase(const regp stream_reg, const uint phase_id)
-    {
-        ckernel::wait_for_stream_phase(stream_reg, phase_id);
-    }
-
-    // Wait until stream has at least 'count' tiles ready
-    inline void wait_for_messages(const regp stream_reg, const uint count)
-    {
-        uint c = 0;
-        do
-        {
-            c = read_stream_register(stream_reg, STREAM_NUM_MSGS_RECEIVED_REG_INDEX);
-        } while (c < count);
-    }
-
-    // Wait until stream has any messages ready
-    template <bool FastPop = false>
-    inline void wait_for_token(const regp stream_reg)
-    {
-        wait_for_messages(stream_reg, 1);
-
-        if constexpr (FastPop) {
-            write_stream_register(stream_reg, STREAM_MSG_INFO_CLEAR_REG_INDEX, 1);
-        }
-    }
-
-    // Wait for a tile for streaming unpacker. Make sure to get address before updating pointer.
-    inline uint32_t wait_for_tile(const regp stream_reg)
-    {
-        constexpr auto tile_count = 1;
-        stream::wait_for_messages(stream_reg, tile_count);
-        auto tile_l1_addr = read_stream_base_address(stream_reg, 0);
-        update_stream_read_pointer(stream_reg, tile_count);
-        return tile_l1_addr;
-    }
-
-    inline void pop_messages(const regp stream_reg, const uint count) {
-        for (uint j = 0; j < count; j++) {
-            // TODO: Change to do 2 or 4 (only for stream 4/5) pops at each instruction?
-            uint32_t num_msgs = 1;
-            // Wait for stream to load tiles into the msg info fifo so that we can pop them
-            while (read_stream_register(stream_reg, STREAM_NUM_MSGS_RECEIVED_REG_INDEX) == 0) {}
-            write_stream_register(stream_reg, STREAM_MSG_INFO_CLEAR_REG_INDEX, num_msgs);
-            write_stream_register(stream_reg, STREAM_MSG_DATA_CLEAR_REG_INDEX, num_msgs);
-        }
-    }
-
-    inline void release_token(const regp stream_reg)
-    {
-        write_stream_register(stream_reg, STREAM_MSG_INFO_CLEAR_REG_INDEX, 1);
-        write_stream_register(stream_reg, STREAM_MSG_DATA_CLEAR_REG_INDEX, 1);
-    }
-
-    // Wait until specific stream register index contains specific value.
-    inline void wait_for_reg_value(const regp p_stream_reg, const uint reg_index, const uint reg_value)
-    {
-        uint rd_value = reg_value - 1; // Initial non matching value
-        do
-        {
-            rd_value = read_stream_register(p_stream_reg, reg_index);
-        } while (rd_value != reg_value);
-    }
-
-    inline std::uint8_t* get_stream_buf_base_ptr(const regp stream_reg) {
-      auto base_addr = read_stream_register(stream_reg, STREAM_BUF_START_REG_INDEX) << 4;
-      return reinterpret_cast<std::uint8_t*>(base_addr);
-    }
-
-    inline std::uint8_t* get_stream_msg_info_wr_ptr(const regp stream_reg) {
-      auto base_addr = read_stream_register(stream_reg, STREAM_MSG_INFO_WR_PTR_REG_INDEX) << 4;
-      return reinterpret_cast<std::uint8_t*>(base_addr);
-    }
-
-    inline std::uint8_t* get_stream_buf_limit_ptr(const regp stream_reg) {
-      auto base_addr = read_stream_register(stream_reg, STREAM_BUF_START_REG_INDEX) << 4;;
-      auto size = read_stream_register(stream_reg, STREAM_BUF_SIZE_REG_INDEX) << 4;
-      auto limit_addr = base_addr + size;
-      return reinterpret_cast<std::uint8_t*>(limit_addr);
-    }
-
-    inline std::uint8_t* get_stream_msg_ptr(const regp stream_reg) {
-      auto base_addr = read_stream_register(stream_reg, STREAM_BUF_START_REG_INDEX) << 4;
-      auto rdptr = read_stream_register(stream_reg, STREAM_RD_PTR_REG_INDEX) << 4;
-      auto tile_addr = base_addr + rdptr;
-      return reinterpret_cast<std::uint8_t*>(tile_addr);
-    }
-
-    inline std::uint8_t* get_stream_msg_wr_ptr(const regp stream_reg) {
-      auto base_addr = read_stream_register(stream_reg, STREAM_BUF_START_REG_INDEX) << 4;
-      auto wrptr = read_stream_register(stream_reg, STREAM_WR_PTR_REG_INDEX) << 4;
-      auto tile_addr = base_addr + wrptr;
-      return reinterpret_cast<std::uint8_t*>(tile_addr);
-    }
-
-
-} // namespace ckernel::stream
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h
new file mode 100644
index 00000000000..0e0c729f4b2
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h
@@ -0,0 +1,158 @@
+#pragma once
+
+#include <cstdint>
+#include <l1_address_map.h>
+#include "ckernel_include.h"
+#include "ckernel_globals.h"
+#include "ckernel.h"
+#include "tensix.h"
+#include "fw_debug.h"
+#include "epoch.h"
+
+#ifdef PERF_DUMP
+#include "perf_lib/scratch_api.h"
+#include "perf_res_decouple.h"
+#include "ckernel_perf_math.h"
+#include "ckernel_perf_unpack_pack.h"
+#endif
+
+#ifndef INTERMED_DUMP
+#define INTERMED_DUMP 0
+#endif
+
+#pragma GCC diagnostic ignored "-Wunused-function"
+
+// Comment in/out to enable perf scratch event logging
+
+namespace ckernel
+{
+extern uint32_t perf_index;
+extern uint32_t perf_end;
+// The perf buffer is double-buffered for spill_to_dram:
+// ncrisc moves one half to DRAM while trisc populates the other half.
+// When INTERMED_DUMP == 0, we only dump into perf_buf_base[0].
+extern volatile uint32_t *perf_buf_base[2];
+// Selects the half of perf_buffer that trisc is currently writing into.
+extern uint8_t perf_buf_base_id;
+extern bool record_perf_events;
+extern uint32_t perf_events_target_idx;
+extern uint16_t current_outer_loop_iter;
+extern uint8_t thread_id;
+extern bool first_unpack_recorded;
+
+inline void set_perf_dump_flag_for_input(int input_idx) { // Turns event recording on/off for input_idx; the body is empty unless PERF_DUMP is defined.
+   #ifdef PERF_DUMP
+      TT_LLK_DUMP("set_perf_dump_flag_for_input({})", input_idx);
+      if (perf_events_target_inputs[perf_events_target_idx] == input_idx) { // input_idx is the next input selected for recording
+         record_perf_events = true;
+         perf_events_target_idx++; // NOTE(review): not bounds-checked here; presumably the target list ends with a sentinel -- confirm
+         if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) {
+            if (thread_id == 0 || thread_id == 2) {
+                  perf_end += num_events_per_input; // Extend the recording window by one input's worth of events.
+                  // The buffer size available for each thread after double buffering is (l1_mem::address_map::TRISC_PERF_BUF_SIZE)/2.
+                  // Max number of events we can record in each half of the buffer will be that size divided by 4, since each event will be 4 bytes.
+                  if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) { // Clamp to TRISC_PERF_BUF_SIZE / 4 entries. NOTE(review): comment above talks about half the buffer -- confirm
+                     perf_end = TRISC_PERF_BUF_SIZE >> 2;
+                  }
+            }
+         }
+         current_outer_loop_iter = input_idx; // Passed as the last argument of perf::get_event_id by the record_* helpers.
+      } else {
+         record_perf_events = false;
+      }
+      first_unpack_recorded = false; // Reset for every input, whether or not it is recorded.
+   #endif
+}
+
+inline void record_pack_input_init_timestamp() { // Records a PACK_EACH_INPUT timestamp for the current input; empty unless PERF_DUMP is defined.
+   #ifdef PERF_DUMP
+      TT_LLK_DUMP("record_pack_input_init_timestamp()");
+      if (record_perf_events) {
+         uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::PACK_EACH_INPUT, current_outer_loop_iter); // same event id as record_pack_input_end_timestamp
+         record_timestamp_64b(event_id);
+      }
+   #endif
+}
+
+inline void record_pack_input_end_timestamp() { // Records the closing PACK_EACH_INPUT timestamp. `inline` because this definition lives in a header.
+   #ifdef PERF_DUMP
+      TT_LLK_DUMP("record_pack_input_end_timestamp()");
+      if (record_perf_events) {
+         uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::PACK_EACH_INPUT, current_outer_loop_iter); // same event id as the init timestamp
+         record_timestamp_64b(event_id);
+         if (perf_events_target_idx == 1) { // Only while the first matched target input is the current one.
+            uint32_t event_id_num_tiles_pack = perf::get_event_id(0, 0, perf::EventType::NUM_TILES_PACK, current_outer_loop_iter);
+            uint16_t num_tiles = regfile[p_gpr_pack::PERF_PACK_NUM_TILES] & 0xffff; // low 16 bits of the GPR
+            record_perf_value_and_check_overflow(event_id_num_tiles_pack, num_tiles, 0);
+         }
+      }
+   #endif
+}
+
+inline void perf_math_counter_start() { // Re-programs the FPU counter dump window and starts the counters; empty unless PERF_DUMP is defined.
+   #ifdef PERF_DUMP
+      TT_LLK_DUMP("perf_math_counter_start()");
+      if constexpr(SKIP_UNP) { // NOTE(review): presumably marks SrcA/SrcB valid because no unpacker will do so -- confirm
+         TTI_SETDVALID(p_setrwc::SET_A);
+         TTI_SETDVALID(p_setrwc::SET_B);
+      }
+      if (record_perf_events) {
+         // Due to a race condition that corrupts the write address of the fpu counters, reprogram them for every input
+         dbg_enable_dump_to_mem((uint32_t)&perf_buf_base[perf_buf_base_id][perf_index], (uint32_t)&perf_buf_base[perf_buf_base_id][perf_end]); // window = entries [perf_index, perf_end) of the active half
+         start_fpu_perf_cnt<true>();
+      }
+   #endif
+}
+
+inline void record_perf_math_counter() { // Stops the FPU perf counters for the current input; empty unless PERF_DUMP is defined.
+   #ifdef PERF_DUMP
+      TT_LLK_DUMP("record_perf_math_counter()");
+      if constexpr(SKIP_UNP) { // Counterpart of the SETDVALID pair issued in perf_math_counter_start.
+         TTI_CLEARDVALID(0x1, 0);
+         TTI_CLEARDVALID(0x2, 0);
+      }
+      if (record_perf_events) {
+         stop_fpu_perf_cnt<true, true>(); // use_tensix = true, stall_on_math = true
+         // record_fpu_perf_cnt_value();
+      }
+   #endif
+}
+
+inline void record_unpack_num_tiles() { // Logs one NUM_TILES_UNPACK value per operand. `inline` because this definition lives in a header.
+   #ifdef PERF_DUMP
+      if (perf_events_target_idx == 1) { // Only while the first matched target input is the current one.
+         for (uint8_t operand = 0; operand < PERF_MAX_NUM_INPUTS; operand++) {
+            uint regfile_base_idx = p_gpr_unpack::PERF_UNPACK_NUM_TILES_0;
+            regfile_base_idx += (operand >> 1); // two 16-bit counts are packed into each GPR
+            bool upper = operand & 0b1; // odd operands use the upper half-word
+            uint16_t num_tiles;
+            if (upper) {
+               num_tiles = (regfile[regfile_base_idx] >> 16) & 0xffff;
+            } else {
+               num_tiles = regfile[regfile_base_idx] & 0xffff;
+            }
+            if (num_tiles != 0) { // operands with a zero count are not logged
+               uint32_t event_id_num_tiles_unpack = perf::get_event_id(operand, 0, perf::EventType::NUM_TILES_UNPACK, current_outer_loop_iter);
+               record_perf_value_and_check_overflow(event_id_num_tiles_unpack, num_tiles, 0);
+            }
+         }
+      }
+   #endif
+}
+
+inline void record_unpack_first_instruction_timestamp() { // Logs the timestamp saved in the PERF_FIRST_UNP_* GPRs. `inline` because this definition lives in a header.
+   #ifdef PERF_DUMP
+      TT_LLK_DUMP("record_unpack_first_instruction_timestamp()");
+      if (record_perf_events) {
+         uint32_t clock_lo = regfile[p_gpr_unpack::PERF_FIRST_UNP_LO]; // written by record_latest_wait_for_tile
+         uint32_t clock_hi = regfile[p_gpr_unpack::PERF_FIRST_UNP_HI];
+         uint32_t event_id_last_wait_tile = perf::get_event_id(0, 0, perf::EventType::UNPACK_FIRST_INSTRUCTION, current_outer_loop_iter);
+         record_perf_value_and_check_overflow(event_id_last_wait_tile, clock_lo, clock_hi);
+         if (perf_events_target_idx == 1) { // record_unpack_num_tiles repeats this check internally
+            record_unpack_num_tiles();
+         }
+      }
+   #endif
+}
+
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h
new file mode 100644
index 00000000000..50b9ed3f7cc
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#ifdef PERF_DUMP
+#include <l1_address_map.h>
+
+#include "perf_events_target_inputs.h"
+#include "perf_lib/scratch_api.h"
+
+#ifndef INTERMED_DUMP
+#define INTERMED_DUMP 0
+#endif
+
+#ifndef PERF_DUMP_CONCURRENT
+#define PERF_DUMP_CONCURRENT 0
+#endif
+
+#pragma GCC diagnostic ignored "-Wunused-function"
+
+static constexpr uint32_t PERF_DUMP_END_SIGNAL = 0xbeeff00d; // Marker word written after the last recorded event
+static constexpr uint32_t PERF_CNT_DUMP_ENTRY_SIZE = 16; // Entry size in bytes
+
+#if PERF_DUMP_LEVEL == 0
+static constexpr int32_t TRISC_PERF_BUF_SIZE = l1_mem::address_map::UNPACK_PACK_PERF_BUF_SIZE_LEVEL_0; // also taken when PERF_DUMP_LEVEL is undefined (an undefined macro is 0 in #if)
+#else
+static constexpr int32_t TRISC_PERF_BUF_SIZE = l1_mem::address_map::UNPACK_PACK_PERF_BUF_SIZE_LEVEL_1; // any non-zero PERF_DUMP_LEVEL
+#endif
+
+#endif
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h
new file mode 100644
index 00000000000..a5df5a61f62
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h
@@ -0,0 +1,279 @@
+#pragma once
+
+#include <cstdint>
+#include <l1_address_map.h>
+#include "ckernel_include.h"
+#include "ckernel_globals.h"
+#include "ckernel.h"
+#include "tensix.h"
+#include "fw_debug.h"
+#include "epoch.h"
+
+#include "ckernel_perf_include.h"
+
+#ifndef INTERMED_DUMP
+#define INTERMED_DUMP 0
+#endif
+
+#pragma GCC diagnostic ignored "-Wunused-function"
+
+// Comment in/out to enable perf scratch event logging
+
+namespace ckernel
+{
+extern uint32_t perf_index;
+extern uint32_t perf_end;
+// The perf buffer is double-buffered for spill_to_dram:
+// ncrisc moves one half to DRAM while trisc populates the other half.
+// When INTERMED_DUMP == 0, we only dump into perf_buf_base[0].
+extern volatile uint32_t *perf_buf_base[2];
+// Selects the half of perf_buffer that trisc is currently writing into.
+extern uint8_t perf_buf_base_id;
+extern uint16_t current_outer_loop_iter;
+extern uint8_t thread_id;
+extern uint32_t perf_events_target_idx;
+
+// In math thread, THCON dumps perf buffers in l1.
+// Therefore, incrementing the ncrisc perf_dram_buffer_req must be done by THCON as well.
+// Flipping the l1 perf start address must also be done by THCON for math thread.
+// Following variable keeps track of latest value of perf_dram_copy_req[1] from trisc perspective.
+// The actual value might be different, because the queued THCON updates for perf_dram_copy_req[1] might not have been executed yet.
+// We read this value initially for all threads to reduce the l1-reads.
+extern int32_t dram_dump_req_local;
+
+struct cperf_cnt_mode // Named values for the `mode` argument of set_perf_cnt_params.
+{
+    constexpr static uint32_t PERF_CNT_MODE_FREE = 0; // Free running period counter
+    constexpr static uint32_t PERF_CNT_MODE_STOP = 1; // Stop counter
+    constexpr static uint32_t PERF_CNT_MODE_WRAP = 2; // Wrap period counter
+};
+
+struct cperf_cnt_block_sel // Named values for the `block_sel` argument of set_perf_cnt_params / start_perf_cnt / stop_perf_cnt.
+{
+    constexpr static uint32_t PERF_CNT_INSTR_THREAD = 0; // Select all instruction thread perf counters(includes TDMA)
+    constexpr static uint32_t PERF_CNT_FPU = 1; // Select FPU perf counters
+    constexpr static uint32_t PERF_CNT_L1  = 2; // Select L1 perf counters
+    constexpr static uint32_t PERF_CNT_ALL = 3; // Select all perf counters
+};
+
+struct cperf_dbg_daisy_id // Values written to the dbg_daisy_sel field of the debug bus control register (see sel_fpu_perf_cnt).
+{
+    constexpr static uint32_t DEBUG_DAISY_INSTRN_THREAD = 1; // Thread specific perf counters
+    constexpr static uint32_t DEBUG_DAISY_INSTRN_ISSUE_0 = 4; // TDMA+math
+    constexpr static uint32_t DEBUG_DAISY_INSTRN_ISSUE_1 = 5; // math+instruction issue
+    constexpr static uint32_t DEBUG_DAISY_TENSIX  = 7; // FPU and L1 perf counters
+};
+
+struct cperf_dbg_dump_to_mem_mode // Values for the mem_dump_mode field of riscv_debug_reg_dbg_l1_mem_reg2_u.
+{
+    constexpr static uint32_t DEBUG_MEM_MODE_MANUAL_WR = 0;
+    constexpr static uint32_t DEBUG_MEM_MODE_AUTO_WR = 1; // the only mode this header programs
+    constexpr static uint32_t DEBUG_MEM_MODE_MANUAL_RD = 2;
+    constexpr static uint32_t DEBUG_MEM_MODE_AUTO_RD = 3;
+};
+
+inline void set_perf_cnt_params(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU, uint32_t ref_period=0xffffffff, uint32_t mode=cperf_cnt_mode::PERF_CNT_MODE_FREE) { // Writes ref_period and a fixed control word to the selected counter block.
+  uint32_t perf_cnt_ref_period_reg;
+  switch (block_sel) {
+     case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD:     perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD0; break;
+     case cperf_cnt_block_sel::PERF_CNT_L1:              perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_L1_0; break;
+     default: perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_FPU0; // PERF_CNT_FPU, PERF_CNT_ALL and any other value land here
+  }
+  reg_write(perf_cnt_ref_period_reg, ref_period);
+  reg_write(perf_cnt_ref_period_reg+4, 0x00010100); // NOTE(review): `mode` is never read; this word is hard-coded -- confirm it encodes the intended mode
+}
+
+inline void stop_perf_cnt(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU) { // Writes 0x2 then 0x0 to the control register of the selected counter block.
+  uint32_t perf_cnt_cntl_reg;
+  switch (block_sel) {
+     case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD2; break;
+     case cperf_cnt_block_sel::PERF_CNT_L1:           perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_L1_2; break;
+     case cperf_cnt_block_sel::PERF_CNT_ALL:          perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_ALL; break;
+     default: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_FPU2; // PERF_CNT_FPU and any unlisted value
+  }
+  reg_write(perf_cnt_cntl_reg, 0x00000002); // same value setup_fpu_perf_cnt stores in the PERF_CNT_STOP GPR
+  reg_write(perf_cnt_cntl_reg, 0x00000000); // clear the control word again
+}
+
+template <bool use_tensix=true, bool stall_on_math=false> // use_tensix: issue Tensix STOREREG instructions instead of direct reg_write calls
+inline void stop_fpu_perf_cnt() { // Stops the FPU perf counters; does nothing once perf_events_target_idx exceeds 1.
+   if (perf_events_target_idx <= 1) {
+      if constexpr (use_tensix) {
+            if constexpr (stall_on_math) {
+               TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH); // NOTE(review): presumably holds THCON until math is done -- confirm against the ISA
+            }
+            TTI_STOREREG(p_gpr_math::PERF_CNT_STOP, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff); // GPR preloaded with 0x2 by setup_fpu_perf_cnt
+            TTI_STOREREG(p_gpr::ZERO, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff);
+      } else {
+         reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000002); // same 0x2 / 0x0 sequence, written directly
+         reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000000);
+      }
+   }
+}
+
+inline void start_perf_cnt(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU) { // Writes 0x1 then 0x0 to the control register of the selected counter block.
+  uint32_t perf_cnt_cntl_reg;
+  switch (block_sel) {
+     case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD2; break;
+     case cperf_cnt_block_sel::PERF_CNT_L1:           perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_L1_2; break;
+     case cperf_cnt_block_sel::PERF_CNT_ALL:          perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_ALL; break;
+     default: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_FPU2; // PERF_CNT_FPU and any unlisted value
+  }
+  reg_write(perf_cnt_cntl_reg, 0x00000001); // same value setup_fpu_perf_cnt stores in the PERF_CNT_START GPR
+  reg_write(perf_cnt_cntl_reg, 0x00000000); // clear the control word again
+}
+
+template <bool use_tensix=true> // use_tensix: issue Tensix STOREREG instructions instead of direct reg_write calls
+inline void start_fpu_perf_cnt() { // Starts the FPU perf counters; does nothing once perf_events_target_idx exceeds 1.
+   if (perf_events_target_idx <= 1) {
+      if constexpr (use_tensix) {
+            TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH); // NOTE(review): presumably orders the start after pending math/THCON work -- confirm against the ISA
+            TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::THCON);
+            TTI_STOREREG(p_gpr_math::PERF_CNT_START, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff); // GPR preloaded with 0x1 by setup_fpu_perf_cnt
+            TTI_STOREREG(p_gpr::ZERO, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff);
+      } else {
+         reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000001); // same 0x1 / 0x0 sequence, written directly
+         reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000000);
+      }
+   }
+}
+
+
+inline void sel_fpu_perf_cnt(uint32_t cnt_id) { // Read-modify-write of the debug bus control register to select counter cnt_id on the Tensix daisy chain.
+   riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl;
+   dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG); // keep the fields that are not set below
+   dbg_bus_cntl.f.dbg_daisy_sel = cperf_dbg_daisy_id::DEBUG_DAISY_TENSIX;
+   dbg_bus_cntl.f.dbg_sig_sel   = 0x0;
+   dbg_bus_cntl.f.dbg_rd_sel    = cnt_id<<1; //rd_sel is aligned to 16-bit
+   reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val);
+}
+
+// Returns the word read from RISCV_DEBUG_REG_DBG_RD_DATA, i.e. the value of the currently selected perf counter.
+inline uint32_t get_perf_cnt() {
+   return reg_read(RISCV_DEBUG_REG_DBG_RD_DATA);
+}
+
+template <bool use_tensix=true> // use_tensix: issue Tensix STOREREG instructions instead of direct reg_write calls
+inline void dump_perf_cnt_to_mem() { // Sets and then clears mem_write in DBG_L1_MEM_REG2 (mode AUTO_WR).
+   if constexpr (use_tensix) {
+      TTI_STOREREG(p_gpr_math::PERF_MEM_DUMP_CNTL_SET,   (RISCV_DEBUG_REG_DBG_L1_MEM_REG2>>2)&0x3ffff); // GPR holds AUTO_WR with mem_write = 1 (see setup_fpu_perf_cnt)
+      TTI_STOREREG(p_gpr_math::PERF_MEM_DUMP_CNTL_CLEAR, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2>>2)&0x3ffff); // GPR holds AUTO_WR with mem_write = 0
+   } else {
+      riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2;
+      dbg_l1_mem_reg2.val = 0;
+      dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR;
+      dbg_l1_mem_reg2.f.mem_write = 1;
+      reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val); // first write: mem_write set
+      dbg_l1_mem_reg2.f.mem_write = 0;
+      reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val); // second write: mem_write cleared
+   }
+}
+
+inline void dbg_daisy_enable() { // Sets dbg_reg_ovrd_en and dbg_daisy_en in the debug bus control register.
+   riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl;
+   dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG); // read-modify-write: other fields are preserved
+   dbg_bus_cntl.f.dbg_reg_ovrd_en = 0x1;
+   dbg_bus_cntl.f.dbg_daisy_en = 0x1;
+   reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val);
+}
+
+inline void dbg_daisy_disable() { // Clears dbg_reg_ovrd_en and dbg_daisy_en in the debug bus control register.
+   riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl;
+   dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG); // read-modify-write: other fields are preserved
+   dbg_bus_cntl.f.dbg_reg_ovrd_en = 0x0;
+   dbg_bus_cntl.f.dbg_daisy_en = 0x0;
+   reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val);
+}
+
+inline void dbg_enable_dump_to_mem(uint32_t start_addr, uint32_t end_addr) { // Programs DBG_L1_MEM_REG0/1/2 through Tensix instructions using GPR TMP0 as scratch.
+
+   TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH);
+   uint32_t start_addr_lo = (start_addr >> 4) & 0xffff; // address is programmed in 16-byte units, split into two 16-bit halves
+   uint32_t start_addr_hi = (start_addr >> 4) >> 16;
+   TT_SETDMAREG(0, start_addr_lo, 0, LO_16(p_gpr_math::TMP0));
+   TT_SETDMAREG(0, start_addr_hi, 0, HI_16(p_gpr_math::TMP0));
+   TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG0 >> 2) & 0x3ffff); // REG0 <- start address
+
+   uint32_t end_addr_lo = (end_addr >> 4) & 0xffff; // same encoding as the start address
+   uint32_t end_addr_hi = (end_addr >> 4) >> 16;
+   TT_SETDMAREG(0, end_addr_lo, 0, LO_16(p_gpr_math::TMP0));
+   TT_SETDMAREG(0, end_addr_hi, 0, HI_16(p_gpr_math::TMP0));
+   TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG1 >> 2) & 0x3ffff); // REG1 <- end address
+
+   // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG0, start_addr>>4);
+   // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG1, end_addr>>4);
+   riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2;
+   dbg_l1_mem_reg2.val = 0;
+   dbg_l1_mem_reg2.f.mem_dump_mode = 0xf; //invalid and overridden below to trigger pulse needed to latch start address
+   dbg_l1_mem_reg2.f.skip_cycles = 0;
+
+   uint32_t debug_l1_reg2_lo = dbg_l1_mem_reg2.val & 0xffff;
+   uint32_t debug_l1_reg2_hi = (dbg_l1_mem_reg2.val >> 16) & 0xffff;
+   TT_SETDMAREG(0, debug_l1_reg2_lo, 0, LO_16(p_gpr_math::TMP0));
+   TT_SETDMAREG(0, debug_l1_reg2_hi, 0, HI_16(p_gpr_math::TMP0));
+   TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2 >> 2) & 0x3ffff); // REG2 <- placeholder mode 0xf (first of two writes)
+
+
+   // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val);
+   dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR; // This value must change in order to latch new start address!!!
+   // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val);
+
+   debug_l1_reg2_lo = dbg_l1_mem_reg2.val & 0xffff;
+   debug_l1_reg2_hi = (dbg_l1_mem_reg2.val >> 16) & 0xffff;
+   TT_SETDMAREG(0, debug_l1_reg2_lo, 0, LO_16(p_gpr_math::TMP0));
+   TT_SETDMAREG(0, debug_l1_reg2_hi, 0, HI_16(p_gpr_math::TMP0));
+   TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2 >> 2) & 0x3ffff); // REG2 <- AUTO_WR (second write; the mode change latches the address)
+
+   TTI_STALLWAIT(p_stall::STALL_MATH, p_stall::THCON); // NOTE(review): presumably holds math until the THCON stores above complete -- confirm
+}
+
+template <bool use_tensix=true> // unused while the body stays commented out
+inline void record_fpu_perf_cnt_value() { // Currently a no-op: the whole implementation below is commented out.
+   // if (perf_events_target_idx <= 1) {
+   //    // In l1 mode always reserve last event for PERF_DUMP_END_SIGNAL.
+   //    uint32_t reserve_space_for_trisc_end_signal = 1;
+   //    if (perf_index + 3 <= perf_end-reserve_space_for_trisc_end_signal) { // Last event is always set to a default.
+   //       //perf_buf_base[perf_index] = get_perf_cnt();
+   //       //perf_buf_base[perf_index + 1] = get_perf_cnt();
+   //       dump_perf_cnt_to_mem<use_tensix>(); //Dump 16B to L1
+   //       perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t));
+   //    }
+   // }
+}
+
+// Dump a dummy (all-zero, 16-byte) math event to get the initial fpu counter value. Only acts on the math thread (trisc1).
+inline void record_dummy_math_event() {
+   if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) {
+      uint32_t reserve_space_for_trisc_end_signal = 1;
+      if (perf_index + 3 + reserve_space_for_trisc_end_signal <= perf_end) { // Last event is always set to a default. Reserve is added on the left so perf_end == 0 cannot wrap around.
+         perf_buf_base[perf_buf_base_id][perf_index] = 0;
+         perf_buf_base[perf_buf_base_id][perf_index+1] = 0;
+         perf_buf_base[perf_buf_base_id][perf_index+2] = 0;
+         perf_buf_base[perf_buf_base_id][perf_index+3] = 0;
+         perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)); // advance by one 16-byte entry (4 words)
+      }
+   }
+}
+
+inline void setup_fpu_perf_cnt() { // One-time FPU perf counter setup: counter params, debug bus, dump window and the GPR constants used by the start/stop/dump helpers.
+   // Only program perf counters for math thread (trisc1)
+   if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) {
+      set_perf_cnt_params(cperf_cnt_block_sel::PERF_CNT_FPU,0xffffffff,cperf_cnt_mode::PERF_CNT_MODE_FREE);
+      sel_fpu_perf_cnt(0);
+      dbg_daisy_enable();
+      dbg_enable_dump_to_mem((uint32_t)&perf_buf_base[0][PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)], (uint32_t)&perf_buf_base[0][perf_end]); // window starts one 16-byte entry past the base; presumably that slot is for record_dummy_math_event -- confirm
+
+      riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2;
+      dbg_l1_mem_reg2.val = 0;
+      dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR;
+      dbg_l1_mem_reg2.f.mem_write = 0;
+      regfile[p_gpr_math::PERF_MEM_DUMP_CNTL_CLEAR]=dbg_l1_mem_reg2.val; // AUTO_WR with mem_write = 0
+      dbg_l1_mem_reg2.f.mem_write = 1;
+      regfile[p_gpr_math::PERF_MEM_DUMP_CNTL_SET]=dbg_l1_mem_reg2.val; // AUTO_WR with mem_write = 1
+
+      regfile[p_gpr_math::PERF_CNT_START]=0x1; // value stored by start_fpu_perf_cnt
+      regfile[p_gpr_math::PERF_CNT_STOP]=0x2; // value stored by stop_fpu_perf_cnt
+      sync_regfile_write(p_gpr_math::PERF_CNT_STOP); // issued for the last GPR written above
+   }
+}
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h
new file mode 100644
index 00000000000..aaa854ebc2f
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h
@@ -0,0 +1,158 @@
+#pragma once
+
+#include <cstdint>
+#include <l1_address_map.h>
+#include "ckernel_include.h"
+#include "ckernel_globals.h"
+#include "ckernel.h"
+#include "tensix.h"
+#include "fw_debug.h"
+#include "epoch.h"
+
+#include "ckernel_perf_include.h"
+
+#pragma GCC diagnostic ignored "-Wunused-function"
+
+// Comment in/out to enable perf scratch event logging
+
+namespace ckernel
+{
+extern uint32_t perf_index;
+extern uint32_t perf_end;
+// The perf buffer is double-buffered for spill_to_dram:
+// ncrisc moves one half to DRAM while trisc populates the other half.
+// When INTERMED_DUMP == 0, we only dump into perf_buf_base[0].
+extern volatile uint32_t *perf_buf_base[2];
+// Selects the half of perf_buffer that trisc is currently writing into.
+extern uint8_t perf_buf_base_id;
+extern uint8_t thread_id;
+
+// In math thread, THCON dumps perf buffers in l1.
+// Therefore, incrementing the ncrisc perf_dram_buffer_req must be done by THCON as well.
+// Flipping the l1 perf start address must also be done by THCON for math thread.
+// Following variable keeps track of latest value of perf_dram_copy_req[1] from trisc perspective.
+// The actual value might be different, because the queued THCON updates for perf_dram_copy_req[1] might not have been executed yet.
+// We read this value initially for all threads to reduce the l1-reads.
+extern int32_t dram_dump_req_local;
+extern bool record_perf_events;
+extern uint32_t perf_events_target_idx;
+extern bool first_unpack_recorded;
+extern volatile uint * ncrisc_ack_addr;
+extern uint16_t current_outer_loop_iter;
+#if OVERLAY_DECOUPLE == 1
+extern uint8_t overlay_output_decouple_mask;
+#endif
+
+void allocate_perf_buffer();
+
+// This function gets called when half-perf-buffer is full and need to switch.
+// Only used for threads 0 and 2.
+// For thread 1 a different function is used: switch_perf_buffers_for_math_thread
+// If ncrisc has not yet finished dumping the next half of perf-buffer, trisc will stall.
+// If is_perf_end_signal is true, we just need to write the PERF_DUMP_END_SIGNAL.
+// This function should only get executed in INTERMED_DUMP mode.
+void switch_perf_buffers();
+void last_trisc_perf_dump_to_dram();
+
+// The two following functions are separated to avoid inline recursive function calls.
+// TODO: Check the behaviour of the compiler if the two following functions were merged into a template function.
+inline void record_perf_value(uint32_t event_id, uint32_t event_value_lo_32b, uint32_t event_value_hi_32b) { // Appends one 3-word event to the active buffer half; performs no bounds check itself.
+   perf_buf_base[perf_buf_base_id][perf_index] = event_id;
+   perf_buf_base[perf_buf_base_id][perf_index + 1] = event_value_hi_32b; // high word is stored before the low word
+   perf_buf_base[perf_buf_base_id][perf_index + 2] = event_value_lo_32b;
+   perf_index += 3;
+}
+
+inline void record_perf_dump_end() { // Writes PERF_DUMP_END_SIGNAL at perf_index if there is room, and advances perf_index by one.
+   if (perf_index < perf_end) {
+      perf_buf_base[perf_buf_base_id][perf_index] = PERF_DUMP_END_SIGNAL;
+      perf_index += 1;
+   }
+#if PERF_DUMP_CONCURRENT == 1
+   if (perf_index < perf_end) { // evaluated after the increment above
+      perf_buf_base[perf_buf_base_id][perf_end - 1] = PERF_DUMP_END_SIGNAL; // also mark the last slot of the window
+   }
+#endif
+}
+
+inline void record_perf_value_and_check_overflow(uint32_t event_id, uint32_t event_value_lo_32b, uint32_t event_value_hi_32b, uint32_t leave_space = 0) { // Records one 3-word event; leave_space = extra words to keep free (ignored in dump-to-DRAM modes).
+   // In l1 mode always reserve the last event for PERF_DUMP_END_SIGNAL.
+   int reserve_space_for_trisc_end_signal = 1;
+
+#if (INTERMED_DUMP == 1) || (PERF_DUMP_CONCURRENT == 1)
+   leave_space = 0;
+   reserve_space_for_trisc_end_signal = 0;
+   if (perf_index + 2 >= perf_end - reserve_space_for_trisc_end_signal - leave_space) { // current half is full: switch halves before writing
+      switch_perf_buffers();
+   }
+   record_perf_value(event_id, event_value_lo_32b, event_value_hi_32b);
+#else
+   if (perf_index + 2 + reserve_space_for_trisc_end_signal + leave_space < perf_end) { // reserved words are added on the left so a small perf_end cannot wrap around; the event is dropped when it does not fit
+      record_perf_value(event_id, event_value_lo_32b, event_value_hi_32b);
+   }
+#endif
+}
+
+inline void record_timestamp_64b(uint event_id, uint leave_space = 0) { // Records the 64-bit wall clock as one event when record_perf_events is set.
+   if (record_perf_events) {
+      uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); // NOTE(review): low word is read first; presumably this latches the high word -- confirm
+      uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
+      record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, leave_space);
+   }
+}
+
+inline void record_perf_dump_end_and_check_overflow() { // Writes a final 16-byte entry on thread 1, then the end-of-dump marker.
+   if (thread_id == 1) {
+      uint32_t reserve_space_for_trisc_end_signal = 1;
+      if (perf_index + 3 + reserve_space_for_trisc_end_signal <= perf_end) { // Last event is always set to a default. Reserve is added on the left so perf_end == 0 cannot wrap around.
+         perf_buf_base[perf_buf_base_id][perf_index] = reg_read(0xFFB12000 + 0x120); // NOTE(review): raw register addresses; meaning not visible here -- confirm and name them
+         perf_buf_base[perf_buf_base_id][perf_index+1] = reg_read(0xFFB12000 + 0x124);
+         perf_buf_base[perf_buf_base_id][perf_index+2] = 0;
+         perf_buf_base[perf_buf_base_id][perf_index+3] = 0;
+         perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)); // advance by one 16-byte entry (4 words)
+      }
+   }
+
+#if (INTERMED_DUMP == 1) || (PERF_DUMP_CONCURRENT == 1)
+   if (perf_index >= perf_end) { // no room left in this half: switch before writing the marker
+      switch_perf_buffers();
+   }
+   record_perf_dump_end();
+#else
+   if (perf_index < perf_end) { // record_perf_dump_end repeats this check
+      record_perf_dump_end();
+   }
+#endif
+}
+
+inline void record_latest_wait_for_tile() { // Saves the current wall clock into the PERF_FIRST_UNP_LO/HI GPRs while first_unpack_recorded is false.
+#if defined(PERF_DUMP)
+   if (!first_unpack_recorded) { // this function does not set the flag itself
+      uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
+      uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
+      regfile[p_gpr_unpack::PERF_FIRST_UNP_LO] = timestamp_low & 0xffffffff; // the mask is a no-op on a uint32_t
+      sync_regfile_write(p_gpr_unpack::PERF_FIRST_UNP_LO);
+      regfile[p_gpr_unpack::PERF_FIRST_UNP_HI] = timestamp_high & 0xffffffff;
+      sync_regfile_write(p_gpr_unpack::PERF_FIRST_UNP_HI);
+   }
+#endif
+}
+
+void increment_unpack_tiles(uint operand_idx, uint num_tiles);
+void increment_pack_tiles(uint num_tiles);
+#if OVERLAY_DECOUPLE == 1
+inline uint32_t get_active_stream_idx(uint32_t stream_id) { // Index of stream_id within EPOCH_INFO_PTR->active_streams; 0 when no entry matches.
+    std::uint32_t active_stream_idx = 0; // initialized so a miss no longer returns an indeterminate value
+    for (uint32_t active_streams_idx = 0; active_streams_idx < NOC_NUM_STREAMS; active_streams_idx++) {
+      if (stream_id == EPOCH_INFO_PTR->active_streams[active_streams_idx]->stream_id) {
+        active_stream_idx = active_streams_idx;
+        break; // first match wins
+      }
+    }
+    return active_stream_idx;
+}
+
+void llk_push_all_packer_tiles_for_decoupling();
+#endif
+
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h
index 8b2df1139ff..0dd06c65dc8 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h
@@ -2874,25 +2874,6 @@ void test17()
     copy_result_to_dreg0(17);
 }
 
-inline void calculate_logical_not()
-{
-  vUInt v(dst_reg[0].get());
- const vUInt vZero(0), vOne(1);
- v_if(v == 0) {
-   dst_reg[0] = vOne;
- } v_else {
-   dst_reg[0] = vZero;
- }
- v_endif;
-}
-
-inline void calculate_bitwise_complement()
-{
-  vUInt v( dst_reg[0].get() );
-  vUInt v_comp = ~v;
-  dst_reg[0] = v_comp;
-}
-
 //////////////////////////////////////////////////////////////////////////////
 // These tests are designed to be incremental so that if a test fails the
 // earlier tests should be examined/fixed prior to the latter tests.
@@ -2934,12 +2915,7 @@ inline void calculate_sfpi(uint param0 = 0, uint param1 = 0, uint param2 = 0, ui
         test16();
     } else if constexpr (operation == SfpiTestType::test17) {
         test17();
-    } else if constexpr (operation == SfpiTestType::logical_not) {
-	calculate_logical_not();
-    } else if constexpr (operation == SfpiTestType::bitwise_complement) {
-	calculate_bitwise_complement();
     }
-
 }
 
 } // NAMESPACE
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h
index e63621398e4..e7b80e9cabf 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h
@@ -5,16 +5,13 @@
 #pragma once
 
 #include "ckernel_defs.h"
-#include "ckernel.h"
 #include "noc_nonblocking_api.h"
+#include "ckernel.h"
+#include "llk_defs.h"
+#include <limits>
 
 #include "sfpi.h"
 
-#include "ckernel_sfpu_cdf.h"
-#include "ckernel_sfpu_exp.h"
-#include "ckernel_sfpu_recip.h"
-#include "ckernel_sfpu_converter.h"
-
 using namespace sfpi;
 
 namespace ckernel
@@ -54,7 +51,71 @@ sfpi_inline vInt sfpu_is_fp16_zero(const vFloat& v, uint exponent_size_8)
     }
 }
 
+sfpi_inline vFloat _sfpu_exp_(vFloat val) // Per-lane exponential approximation: 2nd-order polynomial on a range-reduced input, then repeated squaring.
+{
+    // If exponent is > -1 extract it and replace with -1
+    vInt exp = exexp(val);
+    v_if (exp >= 0) {
+        val = setexp(val, 126); // 126 is the biased encoding of exponent -1
+    }
+    v_endif;
+
+    // Run series in Horner form
+    vFloat tmp = val * vConst0p8373 + s2vFloat16b(0.863281);
+    val = val * tmp + vConst1; // val = 1 + val * (0.863281 + 0.8373 * val)
+
+    v_if (exp >= 0) {
+        val = val * val; // first squaring, applied to every lane with exp >= 0
+        for (int s_iter = 0; s_iter < 7; s_iter++) {
+            exp = exp - 1;
+            // Narrow predication on each loop
+            v_and(exp >= 0);
+            val = val * val; // a lane is squared exp + 1 times in total, at most 8
+        }
+    }
+    v_endif;
+
+    return val;
+}
+
+template <int max_iter = 3> // max_iter: total number of refinement steps (the first is outside the loop)
+sfpi_inline vFloat _sfpu_reciprocal_(const vFloat in) // Iterative reciprocal on the mantissa with the exponent fixed up afterwards; reads vConstFloatPrgm0/1.
+{
+    // Force sign to 1 (make number negative)
+    vFloat val = setsgn(in, 1); // NOTE(review): the sign of `in` is not re-applied below; presumably callers restore it -- confirm
+
+    val = setexp(val, 126); // Set exponent to 126 to make the number in 0.5-1
+    // Use 1.44 as first guess at x, ideal value would be 1.33, but we happen to have 1.44 available, so use that to avoid a load
+    vFloat vConstLn2Recip = vConstFloatPrgm0; // must have been programmed beforehand (configure_programmable_constants sets 1.442695f for reciprocal)
+    vFloat two = vConstFloatPrgm1; // must have been programmed beforehand (2.0f for reciprocal)
+    vFloat result = vConstLn2Recip * (val * vConstLn2Recip + two); // x1 = x0 * (2 + val * x0), with val negative
+
+    for (int s_iter = 0; s_iter < (max_iter-1); s_iter++) {
+        result = result * (val * result + two); // same update step, repeated max_iter - 1 times
+    }
+
+    vInt orig_exp = exexp(in);
+    vInt new_exp = exexp(result);
+
+    // "Subtract" exponents, and re-bias.
+    // Execute: -1 - exp, then exp += 127
+    new_exp -= orig_exp;
+    new_exp += 126; // NOTE(review): the comment above says 127 while the code adds 126 -- confirm which is intended
+
+    v_if (new_exp < 0) {
+        // If rebiased exponent is negative, we need to saturate at 0.
+        // This means the initial number was too big so reciprocal result should be 0
+        result = 0.0F;
+        new_exp = 0;
+    }
+    v_endif;
+
+    // Set newly denormalized exponent to result exponent field
+    return setexp(result, new_exp);
+}
+
 inline void init_dropout_seed(uint16_t p2){
+
     uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(0, 0, NOC_NODE_ID);
 
     uint16_t my_x = noc_id_reg & NOC_NODE_ID_MASK;
@@ -75,23 +136,29 @@ template <bool APPROXIMATION_MODE>
 inline void configure_programmable_constants(SfpuType operation)
 {
     switch (operation) {
-    case SfpuType::expm1:
-    case SfpuType::exp2:
+    case SfpuType::gelu:
+        vConstFloatPrgm0 = 0.5f;
+        break;
+    case SfpuType::exponential:
         if (APPROXIMATION_MODE) {
             vConstFloatPrgm0 = 1.442695f; // ln2_recip
             vConstFloatPrgm1 = s2vFloat16b(p_exp::C23_73);
             vConstFloatPrgm2 = s2vFloat16b(p_exp::ADJ_EXP);
             break;
         }
+
+
+
+        // Fall through
+    case SfpuType::gelu_derivative:
         vConstFloatPrgm2 = 0.863281f;
 
         // Fall through
-    case SfpuType::rsqrt:
-    case SfpuType::atan:
+    case SfpuType::reciprocal:
         vConstFloatPrgm0 = 1.442695f; // ln2_recip
         vConstFloatPrgm1 = 2.0f;
         break;
-    case SfpuType::log_with_base:
+
     case SfpuType::log:
         // ln2
         vConstFloatPrgm0 = 0.692871f; // ln2
@@ -100,6 +167,15 @@ inline void configure_programmable_constants(SfpuType operation)
         vConstFloatPrgm1 = 0.1058f;
         vConstFloatPrgm2 = -0.7166f;
         break;
+
+    case SfpuType::sqrt:
+        if (APPROXIMATION_MODE) {
+            vConstFloatPrgm0 = s2vFloat16b(127 << 7);
+        } else {
+            vConstFloatPrgm0 = s2vFloat16b(0x5f37);
+        }
+        break;
+
     case SfpuType::dropout:
         vConstIntPrgm0 = 0xb400;
         vConstIntPrgm1 = 0x1; // binary 0b1 - used to extract LSB
@@ -140,25 +216,173 @@ inline void sfpu_init(SfpuType operation, uint param0 = 0)
         TTI_SFPLOADI(1, 2, imm1);
         TTI_SFPLOADI(2, 2, imm2);
         break;
-    case SfpuType::sigmoid_appx:
-        imm0 = 0x3DFF;
-        imm1 = 0x21D8;
-        imm2 = 0xFF10;
-        TTI_SFPLOADI(0, 2, imm0);
-        TTI_SFPLOADI(1, 2, imm1);
-        TTI_SFPLOADI(2, 2, imm2);
+    case SfpuType::sigmoid:
+        // imm0 = 0x3DFF;
+        // imm1 = 0x21D8;
+        // imm2 = 0xFF10;
+        // TTI_SFPLOADI(0, 2, imm0);
+        // TTI_SFPLOADI(1, 2, imm1);
+        // TTI_SFPLOADI(2, 2, imm2);
+        // Using a 6 piece LUT to model sigmoid(x) - 0.5; calculate_sigmoid adds the 0.5 back after the lookup
+        // x <= 0.5 --> 0.2452x + (-0.0004997)
+        // x <= 1.0 --> 0.2173x + 0.0152
+        // x <= 1.5 --> 0.1731x + 0.05988
+        // x <= 2.0 --> 0.1262x + 0.1298
+        // x <= 4.0 --> 0.0485x + 0.2998
+        // x >  4.0 --> 0.4998
+
+        // imm0[15:0] = A0=0.2452 = 0x33D9 -- imm0[31:16] = A1=0.2173 = 0x32F4
+        sfpu_load_imm32(0,0x32F433D9);
+        // imm4[15:0] = B0= -0.0004997  = 0x9018 -- imm4[31:16] = B1= 0.0152 = 0x23c8
+        sfpu_load_imm32(4,0x23C89018);
+
+        // imm1[15:0] = A2=0.1731 = 0x318a -- imm1[31:16] = A3=0.1262 = 0x300a
+        sfpu_load_imm32(1,0x300A318A);
+        // imm5[15:0] = B2=0.05988 = 0x2BAA -- imm5[31:16] = B3=0.1298 = 0x3027
+        sfpu_load_imm32(5,0x30272BAA);
+
+        // imm2[15:0] = A4=0.0485 = 0x2A35 -- imm2[31:16] = A5=0.0 = 0x7C00
+        sfpu_load_imm32(2,0x7C002A35);
+        // imm6[15:0] = B4=0.2998 = 0x34CC -- imm6[31:16] = B5=0.4998 = 0x37ff
+        sfpu_load_imm32(6,0x37ff34CC);
+
+        break;
+    case SfpuType::gelu_derivative:
+        if constexpr (APPROXIMATION_MODE) {
+            // Using a 6 piece LUT to calculate and model gelu_derivative directly
+            // x <= 0.5 --> 0.8x + 0.5
+            // x <= 1.0 --> 0.4x + 0.7
+            // x <= 1.5 --> 0.1x + 0.99
+            // x <= 2.0 --> -0.09x + 1.27
+            // x <= 3.0 --> -0.075x + 1.235
+            // x >  3.0 --> 1.0
+            // imm0[15:0] = A0=0.8    = 0x3A66 -- imm0[31:16] = A1=0.4   = 0x3666
+            imm0_high = 0x3666;
+            imm0_low  = 0x3A66;
+            // imm1[15:0] = A2=0.1    = 0x2E66 -- imm1[31:16] = A3=-0.09 = 0xADC3
+            imm1_high = 0xADC3;
+            imm1_low  = 0x2E66;
+            // imm2[15:0] = A4=-0.075 = 0xACCD -- imm2[31:16] = A5=0     = 0x7C00
+            imm2_high = 0x7C00;
+            imm2_low  = 0xACCD;
+            // imm3[15:0] = B0=0.5    = 0x3800 -- imm3[31:16] = B1=0.7   = 0x399A  (imm3/imm4/imm5 are loaded into LReg4/5/6 below)
+            imm3_high = 0x399A;
+            imm3_low  = 0x3800;
+            // imm4[15:0] = B2=0.99   = 0x3BEC -- imm4[31:16] = B3=1.27  = 0x3D14
+            imm4_high = 0x3D14;
+            imm4_low  = 0x3BEC;
+            // imm5[15:0] = B4=1.235  = 0x3CF1 -- imm5[31:16] = B5=1.0   = 0x3C00
+            imm5_high = 0x3C00;
+            imm5_low  = 0x3CF1;
+            TTI_SFPLOADI(0, 10, imm0_low);
+            TTI_SFPLOADI(0,  8, imm0_high);
+            TTI_SFPLOADI(1, 10, imm1_low);
+            TTI_SFPLOADI(1,  8, imm1_high);
+            TTI_SFPLOADI(2, 10, imm2_low);
+            TTI_SFPLOADI(2,  8, imm2_high);
+            TTI_SFPLOADI(4, 10, imm3_low);
+            TTI_SFPLOADI(4,  8, imm3_high);
+            TTI_SFPLOADI(5, 10, imm4_low);
+            TTI_SFPLOADI(5,  8, imm4_high);
+            TTI_SFPLOADI(6, 10, imm5_low);
+            TTI_SFPLOADI(6,  8, imm5_high);
+        } else {
+            imm0 = 0x28FF;
+            imm1 = 0x3020;
+            TTI_SFPLOADI(0, 2, imm0);
+            TTI_SFPLOADI(1, 2, imm1);
+        }
+        break;
+    case SfpuType::gelu:
+        // //SG: FIXME
+        // imm0 = 0x18FF;
+        // imm1 = (APPROXIMATION_MODE)? 0x212C : 0x2010;
+        // imm2 = 0xFF00;
+        // TTI_SFPLOADI(0, 2, imm0);
+        // TTI_SFPLOADI(1, 2, imm1);
+        // TTI_SFPLOADI(2, 2, imm2);
+
+        // // >= 3.0f
+        // lreg2_hi=0.50;//3800
+        // lreg6_hi=0.0f;//7c00
+        // // 2.0f -> 3.0f
+        // lreg2_lo= 0.5402f;//3852
+        // lreg6_lo= -0.1194f;//AFA4
+        // // 1.5f -> 2.0f
+        // lreg1_hi= .6099f; //38E1
+        // lreg5_hi= -.2635f; //B437
+        // // 1.0f -> 1.5f
+        // lreg1_lo=0.6189;//38F3
+        // lreg5_lo=-.2797;//B479
+        // // 0.5f -> 1.0f
+        // lreg0_hi=.4939f;//37E7
+        // lreg4_hi=-.1605f;//B122
+        // // 0.0f -> 0.5f
+        // lreg0_lo=0.1928f;//322B
+        // lreg4_lo=-0.0150f;//A3AE
+        sfpu_load_imm32(0,0x37E7322B);
+        //sfpu_load_imm32(4,0xB122A3AE);
+        sfpu_load_imm32(4,0xB12286D8);
+
+
+        sfpu_load_imm32(1,0x38E138F3);
+        sfpu_load_imm32(5,0xB437B479);
+
+        sfpu_load_imm32(2,0x38003852);
+        sfpu_load_imm32(6,0x7c00afa4);
+
         break;
     case SfpuType::dropout:
         init_dropout_seed(param0);
         break;
-    case SfpuType::sigmoid:
-      break;
+    case SfpuType::quant_int32:
+    case SfpuType::requant_int32:
+    case SfpuType::dequant_int32:
+        sfpu_load_imm32(2,param0);
+        break;
     default:
         // Should result in compile time error??
         break;
     }
 }
 
+template <bool APPROXIMATION_MODE>
+sfpi_inline vFloat _calculate_exponential_body_(vFloat in)
+{
+    vFloat out;
+    // Per-lane exponential of `in`: APPROXIMATION_MODE picks the bit-manipulation path, otherwise _sfpu_exp_ is used.
+    if constexpr (APPROXIMATION_MODE)
+    {
+        constexpr int FRAC_BITS = 3;
+        constexpr uint SP_BIAS = 127 << FRAC_BITS;
+
+        // Multiply by vConstFloatPrgm0 (treated as 1/ln2) as the first step toward 7.3 FxP format
+        vFloat vConstLn2Recip = vConstFloatPrgm0;
+        vFloat conv = in * vConstLn2Recip;
+
+        // Subtract C23_73 from the raw bit pattern to clear the exponent bits
+        vInt c23_73 = p_exp::C23_73;
+        vInt tmp = reinterpret<vInt>(conv) - c23_73;
+
+        // Add the exponent bias (127), pre-shifted by FRAC_BITS
+        tmp += SP_BIAS;
+
+        // Shift left so the integer bits land in the exponent field
+        out = reinterpret<vFloat>(tmp << (10 - FRAC_BITS));
+    }
+    else
+    {
+        // Run _sfpu_exp_ on |in|: the sign bit is forced to 0 first
+        out = _sfpu_exp_(setsgn(in, 0));
+        // Negative inputs: e^in = 1 / e^|in|, so take the reciprocal on those lanes
+        v_if (in < 0) {
+            out = _sfpu_reciprocal_(out);
+        }
+        v_endif;
+    }
+
+    return out;
+}
 
 /*
 template <bool APPROXIMATION_MODE, bool ZERO_NEGATIVE, bool SCALE_EN>
@@ -178,81 +402,158 @@ void calculate_cube(uint16_t exp_base_scale_factor = 0)
 }
 */
 
-
-template <bool APPROXIMATION_MODE, int ITERATIONS, int RECIPROCAL_ITERATIONS>
-inline void calculate_rsqrt()
+template <bool APPROXIMATION_MODE, bool SCALE_EN, int ITERATIONS>
+void calculate_exponential(const int iterations, uint16_t exp_base_scale_factor = 0)
 {
-
-    for (int d = 0; d < ITERATIONS; d++)
+    // Unroll 8 best for approx, unroll 0 for precise, compiler figures this out
+    for (int d = 0; d < iterations; d++)
     {
+        vFloat val = dst_reg[0];
+        if constexpr(SCALE_EN){
+            val = val * s2vFloat16a(exp_base_scale_factor);
+        }
+        if constexpr (APPROXIMATION_MODE)
+        {
+            v_if (val>=89){
+                vFloat val_inf = std::numeric_limits<float>::infinity();
+                dst_reg[0] = val_inf;
+            } v_elseif(val<-42){
+                    dst_reg[0] = 0.0f;
+            } v_else {
+                // Multiply by 1/ln2 and add c23_73 to convert to 7.3 FxP format
+                vFloat vConstLn2Recip = vConstFloatPrgm0;
+                vFloat c23_73 = vConstFloatPrgm1;
+                vInt adj_exp = vConstIntPrgm2;
+                val = val * vConstLn2Recip + c23_73;
+
+                // Remove Exponent of 7 and bias the Mantissa to 127.
+                vInt val_short = adj_exp + reinterpret<vInt>(val);
+
+                // SHL to move integer bits to exponent
+                val_short <<= 10 - p_exp::FRAC_BITS;
+                dst_reg[0] = reinterpret<vFloat>(val_short);
+            }
+            v_endif;
+        }
+        else
+        {
+            // Force sign to 0 (make number positive)
+            vFloat result = _sfpu_exp_(setsgn(val, 0));
 
-        vFloat in = dst_reg[0];
-        v_if(dst_reg[0] == 0.0f){
-            dst_reg[0] = std::numeric_limits<float>::infinity();
-        }v_else{
-            vFloat result = 1.0f;
-            v_if(dst_reg[0] > 1.0f){
-                result = sfpu_reciprocal(in);
-            }v_endif;
-
-            for (int r = 0; r < RECIPROCAL_ITERATIONS; r++)
-            {
-                // y = y * (1.5 - 0.5 * x * y * y) Newton's method iteration.
-                result = result * (1.5F - 0.5F  * dst_reg[0] * result * result);
+            v_if (val < 0) {
+                result = _sfpu_reciprocal_(result);
             }
+            v_endif;
+
             dst_reg[0] = result;
-        }v_endif;
+        }
 
         dst_reg++;
-
     }
 }
+template <bool APPROXIMATION_MODE>
+inline vFloat _calculate_gelu_core_(vFloat in)
+{
+    // Computes only the argument for a later lookup step; no erf/tanh is evaluated here:
+    // result = (APPROXIMATION_MODE)
+    //   ? x                                  (intended target: 1 + erf(x/sqrt(2)))
+    //   : 0.79788 * (x + 0.044715*x^3)       (intended target: 1 + tanh of this; 0.79788 ~= sqrt(2/pi))
+    vFloat result;
+    if constexpr (APPROXIMATION_MODE) {
+        result = in;
+    } else {
+        // f = (0.044715*x^3 + x)
+        result = (in * in) * (in * s2vFloat16b(0.044715f)) + in;
+        result *= s2vFloat16b(0.79788f);
+    }
+
+    return result;
+}
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_sigmoid_appx()
+inline void calculate_gelu(const int iterations)
 {
+
     vUInt l0 = l_reg[LRegs::LReg0];
     vUInt l1 = l_reg[LRegs::LReg1];
     vUInt l2 = l_reg[LRegs::LReg2];
+    vUInt l4 = l_reg[LRegs::LReg4];
+    vUInt l5 = l_reg[LRegs::LReg5];
+    vUInt l6 = l_reg[LRegs::LReg6];
 
     #pragma GCC unroll 8
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
-        vFloat val = dst_reg[0];
+        // vFloat in = dst_reg[0];
+        // vFloat result = _calculate_gelu_core_<APPROXIMATION_MODE>(in);
 
-        dst_reg[0] = lut(val, l0, l1, l2) + 0.5f;
+        // vFloat half_in = in * half;
+        // result = lut(result, l0, l1, l2);
+        // result = half_in * result + half_in;
+
+        //dst_reg[0] = result;
+
+        vFloat in = dst_reg[0];
+        vFloat half = vConstFloatPrgm0;
+        vFloat half_in = in * half;
+        vFloat result = lut2_sign(in, l0, l1, l2, l4, l5, l6);
+        result = half_in + result;
+
+        dst_reg[0] = result;
 
         dst_reg++;
+
+        // dst_reg++;
+        //TTI_SFPLOAD(3, 0, 1/*load addr mode*/,0);    // load from dest
+        ////TTI_SFPMUL(3,11,9,7,0);           // lreg7 = 0.5*lreg3
+        //TTI_SFPLUTFP32(7, 2);                // lreg7= LUT(3)
+        //TTI_SFPMAD(3,12,7,3,0);            // lreg3 = 0.5*lreg3+lregm7
+        //TTI_SFPSTORE(3, 0, 3/*store_addr_mod3*/, 0);   // and INCRWC by 4 using mode 3
     }
 
     l_reg[LRegs::LReg0] = l0;
     l_reg[LRegs::LReg1] = l1;
     l_reg[LRegs::LReg2] = l2;
+    l_reg[LRegs::LReg4] = l4;
+    l_reg[LRegs::LReg5] = l5;
+    l_reg[LRegs::LReg6] = l6;
+
+
 }
 
-// TODO: Implement using bitwise comparision
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_signbit()
+inline void calculate_sigmoid(const int iterations)
 {
+    constexpr int lut_mode = 0; // SFPLUTFP32_MOD0_FP16_6ENTRY_TABLE1
+    vUInt l0 = l_reg[LRegs::LReg0];
+    vUInt l1 = l_reg[LRegs::LReg1];
+    vUInt l2 = l_reg[LRegs::LReg2];
+    vUInt l4 = l_reg[LRegs::LReg4];
+    vUInt l5 = l_reg[LRegs::LReg5];
+    vUInt l6 = l_reg[LRegs::LReg6];
 
-    for (int d = 0; d < ITERATIONS; d++)
+
+    #pragma GCC unroll 8
+    for (int d = 0; d < iterations; d++)
     {
         vFloat val = dst_reg[0];
-        v_if (val <= -0.0f) {
-            val = 1.0f;
-        } v_elseif (val >= 0.0f) {
-            val = 0.0f;
-        }
-        v_endif;
-        dst_reg[0] = val;
 
-       dst_reg++;
+        dst_reg[0] = lut2(val, l0, l1, l2, l4, l5, l6, lut_mode) + 0.5f;
+
+        dst_reg++;
     }
 
+    l_reg[LRegs::LReg0] = l0;
+    l_reg[LRegs::LReg1] = l1;
+    l_reg[LRegs::LReg2] = l2;
+    l_reg[LRegs::LReg4] = l4;
+    l_reg[LRegs::LReg5] = l5;
+    l_reg[LRegs::LReg6] = l6;
+
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_tanh()
+inline void calculate_tanh(const int iterations)
 {
     // SFPU microcode
     vUInt l0 = l_reg[LRegs::LReg0];
@@ -260,7 +561,7 @@ inline void calculate_tanh()
     vUInt l2 = l_reg[LRegs::LReg2];
 
     #pragma GCC unroll 8
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
         vFloat val = dst_reg[0];
         val = lut(val, l0, l1, l2);
@@ -275,7 +576,7 @@ inline void calculate_tanh()
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_hardtanh(uint param0, uint param1, uint param2)
+inline void calculate_hardtanh(const int iterations, uint param0, uint param1, uint param2)
 {
     // All params are in FP16_B format
     // param0 = -(neg_threshold)
@@ -287,7 +588,7 @@ inline void calculate_hardtanh(uint param0, uint param1, uint param2)
     vFloat p2 = s2vFloat16(param2);
     // SFPU microcode
     #pragma GCC unroll 0
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
         vFloat val = dst_reg[0];
 
@@ -312,14 +613,14 @@ inline void calculate_hardtanh(uint param0, uint param1, uint param2)
 }
 
 template <bool APPROXIMATION_MODE, int WITH_PRECOMPUTED_TANH, int ITERATIONS>
-inline void calculate_tanh_derivative()
+inline void calculate_tanh_derivative(const int iterations)
 {
     vUInt l0 = l_reg[LRegs::LReg0];
     vUInt l1 = l_reg[LRegs::LReg1];
     vUInt l2 = l_reg[LRegs::LReg2];
 
     // tanh'(x) = 1 - (tanh(x))^2
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
         vFloat val = dst_reg[0];
 
@@ -339,14 +640,146 @@ inline void calculate_tanh_derivative()
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_dropout(uint prob, uint scale)
+inline void calculate_gelu_derivative(const int iterations)
 {
-    // SFPU microcode
+    if constexpr (APPROXIMATION_MODE) {
+        constexpr int lut_mode = 1; // SFPLUTFP32_MOD0_FP16_6ENTRY_TABLE1 -- NOTE(review): calculate_sigmoid tags lut_mode = 0 with this same name; confirm which is right
+
+        vUInt l0 = l_reg[LRegs::LReg0];
+        vUInt l1 = l_reg[LRegs::LReg1];
+        vUInt l2 = l_reg[LRegs::LReg2];
+        vUInt l4 = l_reg[LRegs::LReg4];
+        vUInt l5 = l_reg[LRegs::LReg5];
+        vUInt l6 = l_reg[LRegs::LReg6];
+
+        // SFPU microcode:
+        #pragma GCC unroll 0
+        for (int d = 0; d < iterations; d++)
+        {
+            vFloat val = dst_reg[0];
+            val = lut2(val, l0, l1, l2, l4, l5, l6, lut_mode);
+            v_if (val < 0.0F) {
+                val = val + 1.0f;
+            }
+            v_endif;
+            dst_reg[0] = val;
+            dst_reg++;
+
+        }
+
+        l_reg[LRegs::LReg0] = l0;
+        l_reg[LRegs::LReg1] = l1;
+        l_reg[LRegs::LReg2] = l2;
+        l_reg[LRegs::LReg4] = l4;
+        l_reg[LRegs::LReg5] = l5;
+        l_reg[LRegs::LReg6] = l6;
+    } else {
+        constexpr uint imm2 = 0xFF10;
+
+        vUInt l0 = l_reg[LRegs::LReg0];
+        vUInt l1 = l_reg[LRegs::LReg1];
 
+        // SFPU microcode:
+        #pragma GCC unroll 0
+        for (int d = 0; d < iterations; d++)
+        {
+            vFloat in = dst_reg[0];
+            vFloat neg_half_sq_in = in * in * -0.5f;
+
+            // exp = e^(-0.5 * in^2)
+            vFloat exp = _calculate_exponential_body_<false>(neg_half_sq_in);
+
+            // partial = in * exp * 1/sqrt(2*pi)  (0.3989423 ~= 1/sqrt(2*pi))
+            vFloat partial = exp * in * s2vFloat16b(0.3989423F);
+
+            vFloat result = _calculate_gelu_core_<true>(in);
+
+            result = lut(result, l0, l1, imm2);
+
+            dst_reg[0] = partial + result + 0.5f;
+            dst_reg++;
+        }
+
+        l_reg[LRegs::LReg0] = l0;
+        l_reg[LRegs::LReg1] = l1;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_reciprocal(const int iterations)
+{
+    #pragma GCC unroll 8
+    for (int d = 0; d < iterations; d++)
+    {
+        vFloat in = dst_reg[0];
+        vFloat out = _sfpu_reciprocal_<APPROXIMATION_MODE ? 2 : 3>(in);
+        // NOTE(review): the template argument above is 2 (approx) or 3 (precise) -- presumably a refinement count; confirm
+        v_if (in < 0.0F) {
+            // Negative input lanes: flip the sign of the computed value
+            out = -out;
+        }
+        v_endif;
+
+        dst_reg[0] = out;
+        // Advance dst_reg for the next pass; the trip count is the runtime `iterations` (ITERATIONS is not referenced here)
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS, int RECIPROCAL_ITERATIONS>
+inline void calculate_sqrt(const int iterations)
+{
+    #pragma GCC unroll 8
+    for (int d = 0; d < iterations; d++)
+    {
+        vFloat val = dst_reg[0];
+        // Two paths: an integer bit-pattern estimate, or a refined reciprocal-root estimate multiplied by val
+        if constexpr (APPROXIMATION_MODE)
+        {
+            vUInt magic = vConstIntPrgm0;
+
+            // sqrt initial approximation, done on the raw bit pattern:
+            // add the programmable integer constant (bias adjustment) ...
+            vUInt val_s = magic + reinterpret<vUInt>(val);
+
+            // ... then halve the biased bit pattern to approximate the square root
+            val_s >>= 1;
+            dst_reg[0] = reinterpret<vFloat>(val_s);
+        }
+        else
+        {
+            // Reciprocal-root method: estimate 1/sqrt(val), refine it, then multiply by val
+            //// Initial estimate, scalar form:
+            //u.i = SQRT_MAGIC_F - (u.i >> 1);
+            v_if (val != 0.0f)
+            {
+                vUInt magic = vConstIntPrgm0;
+                vFloat approx = reinterpret<vFloat>(magic - (reinterpret<vUInt>(val) >> 1));
+
+                // Reciprocal-root refinement, RECIPROCAL_ITERATIONS passes
+                for (int r = 0; r < RECIPROCAL_ITERATIONS; r++)
+                {
+                    // r = r*(1.5f - xhalf*r*r); the multiply by x is applied once, after the loop
+                    approx = ((approx * approx) * (val * -0.5f) + 1.5f) * approx;
+                }
+                // sqrt(x) = x * (1/sqrt(x)); lanes failing the v_if (val == 0) are not stored here
+                dst_reg[0] = approx * val;
+            }
+            v_endif;
+        }
+
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_dropout(const int iterations, uint prob, uint scale)
+{
+    // SFPU microcode
     vUInt rand = l_reg[LRegs::LReg3];
 
     #pragma GCC unroll 0
-    for (int d = 0; d < ITERATIONS; d++) {
+    for (int d = 0; d < iterations; d++) {
         ////////////////////////
         // Scale samples
         ///////////////////////
@@ -378,27 +811,49 @@ inline void calculate_dropout(uint prob, uint scale)
     l_reg[LRegs::LReg3] = rand;
 }
 
-template <bool APPROXIMATION_MODE,int ITERATIONS>
-inline void calculate_power_iterative(const uint exponent)
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_lrelu(const int iterations, uint slope)
 {
-    #pragma GCC unroll 8
-    for (int d = 0; d < 8; d++)
+    // SFPU microcode
+    vFloat s = s2vFloat16b(slope);
+
+    #pragma GCC unroll 0
+    for (int d = 0; d < iterations; d++) {
+        vFloat v = dst_reg[0];
+
+        v_if (v < 0.0f) {
+            v *= s;
+        }
+        v_endif;
+
+        dst_reg[0] = v;
+
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_power(const int iterations, uint exponent)
+{
+    for (int d = 0; d < iterations; d++)
     {
         vFloat in = dst_reg[0];
-        vFloat result = 1.0f;
-        for (uint i = 0; i < exponent; i++) {
+        vFloat result = in * in;
+        for (uint i = 2; i < exponent; i++) {
             result *= in;
         }
-	dst_reg[0]=result;
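+        // NOTE(review): result starts at in*in, so exponent 0 or 1 also yields in^2 -- confirm callers pass exponent >= 2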
+        dst_reg[0] = result;
+
         dst_reg++;
     }
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_square()
+inline void calculate_square(const int iterations)
 {
     #pragma GCC unroll 8
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
         vFloat in = dst_reg[0];
         vFloat result = in * in;
@@ -410,7 +865,7 @@ inline void calculate_square()
 }
 
 template <bool HAS_BASE_SCALING>
-sfpi_inline void calculate_log_body(const uint log_base_scale_factor)
+sfpi_inline void _calculate_log_body_(const uint log_base_scale_factor)
 {
     ////////////////////////////
     // Load From dest + "normalize to calculation range"
@@ -465,16 +920,16 @@ sfpi_inline void calculate_log_body(const uint log_base_scale_factor)
 }
 
 template <bool APPROXIMATION_MODE, bool HAS_BASE_SCALING, int ITERATIONS>
-inline void calculate_log(uint log_base_scale_factor)
+inline void calculate_log(const int iterations, uint log_base_scale_factor)
 {
     #pragma GCC unroll 8
-    for(int d = 0; d < ITERATIONS; d++){
-        calculate_log_body<HAS_BASE_SCALING>(log_base_scale_factor);
+    for(int d = 0; d < iterations; d++){
+        _calculate_log_body_<HAS_BASE_SCALING>(log_base_scale_factor);
         dst_reg++;
     }
 }
 
-sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& flag2, float init)
+sfpi_inline void _calculate_comp_init_flag_(bool check, vFloat& flag1, vFloat& flag2, float init)
 {
     flag1 = init;
     if (check) {
@@ -483,82 +938,82 @@ sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& fla
 }
 
 template <bool APPROXIMATION_MODE, SfpuType COMP_MODE, int ITERATIONS>
-inline void calculate_comp(uint exponent_size_8)
+inline void calculate_comp(const int iterations, uint exponent_size_8)
 {
-   const vFloat zero = 0.0f;
-   const vFloat one = 1.0f;
-   for (int d = 0; d < ITERATIONS; d++)
+    //invert output and use same comparison check
+    constexpr bool invert_output = ((COMP_MODE == SfpuType::greater_than_equal_zero) ||
+                                    (COMP_MODE == SfpuType::not_equal_zero) ||
+                                    (COMP_MODE == SfpuType::greater_than_zero));
+
+    // output_0 and output_1 hold the outputs to use when the zero or negative check is true/false, respectively.
+    // False = 0.0 = kCONST_0 (5/8-bit exponent format)
+    // True  = 1.0 = kCONST_1_FP16B (8-bit exponent format)
+    // SFPU uses 8-bit exponent in operations so loading these constants in 8-bit exponent format.
+    // Although a command flag can tell SFPU to re-bias a 5-bit exponent to 8-bit, we are loading 8-bit
+    // exponent and telling SFPU to not add any bias to these constants.
+    constexpr float output_0 = invert_output ? 0.0f : 1.0f;
+    constexpr float output_1 = invert_output ? 1.0f : 0.0f;
+
+    constexpr bool check_zero = (COMP_MODE == SfpuType::equal_zero) || (COMP_MODE == SfpuType::not_equal_zero);
+    constexpr bool second_check = (COMP_MODE == SfpuType::less_than_equal_zero) || (COMP_MODE == SfpuType::greater_than_zero);
+
+    for (int d = 0; d < iterations; d++)
     {
         vFloat v = dst_reg[0];
         vFloat flag1, flag2;
-
-	//a[i] == 0
-	if constexpr(COMP_MODE == SfpuType::equal_zero) {
-	    v_if (sfpu_is_fp16_zero(v, exponent_size_8)) {
-	      v = one;
-	    } v_else {
-	      v = zero;
-	    }
-	    v_endif;
-	  }
-
-	//a[i] != 0
-	if constexpr(COMP_MODE == SfpuType::not_equal_zero) {
-	    v_if (sfpu_is_fp16_zero(v, exponent_size_8)) {
-	      v = zero;
-	    } v_else {
-	      v = one;
-	    }
-	    v_endif;
-        }
-
-	//a[i] < 0
-	if constexpr(COMP_MODE == SfpuType::less_than_zero) {
-	    v_if (v >= 0.0f) {
-	      v = zero;
-	    } v_else {
-	      v = one;
-	    }
-	    v_endif;
+        if constexpr(check_zero)
+        {
+            v_if (sfpu_is_fp16_zero(v, exponent_size_8)) {
+                _calculate_comp_init_flag_(second_check, flag1, flag2, output_0);
+            } v_else {
+                _calculate_comp_init_flag_(second_check, flag1, flag2, output_1);
+            }
+            v_endif;
         }
-
-	//a[i] >= 0
-	if constexpr(COMP_MODE == SfpuType::greater_than_equal_zero) {
-	    v_if (v >= 0.0f) {
-	      v = one;
-	    } v_else {
-	      v = zero;
-	    }
-	    v_endif;
+        else
+        {
+            v_if (v < 0.0F) {
+                _calculate_comp_init_flag_(second_check, flag1, flag2, output_0);
+            } v_else {
+                _calculate_comp_init_flag_(second_check, flag1, flag2, output_1);
+            }
+            v_endif;
         }
 
-	//a[i] > 0
-	if constexpr(COMP_MODE == SfpuType::greater_than_zero) {
-	    v_if (v > 0.0f) {
-	      v = one;
-	    } v_else {
-	      v = zero;
-	    }
-	    v_endif;
+        vFloat result;
+        if constexpr (second_check)
+        {
+            // SfpuType::less_than_equal_zero
+            // flag1 = 0x3F80(1.0) if DST < 0 else 0
+            // flag2 = 0x3F80(1.0) if DST == 0 else 0
+            // Do a bitwise Or (flag1 | flag2) to get <= condition.
+            // flag1 (DST < 0) OR flag2 (DST == 0) => DST is Less than or Equal to zero.
+            // Result will be either 0x0000(0.0) or 0x3F80(1.0)
+            if constexpr (COMP_MODE == SfpuType::less_than_equal_zero) {
+                result = reinterpret<vFloat>(reinterpret<vUInt>(flag1) | reinterpret<vUInt>(flag2));
+            }
+            else
+            {
+                // SfpuType::greater_than_zero
+                // flag1 = 0x3F80(1.0) if DST >= 0 else 0
+                // flag2 = 0x3F80(1.0) if DST != 0 else 0
+                // Do a bitwise And (flag1 & flag2) to get > condition.
+                // flag1 (DST >= 0) AND flag2 (DST != 0) => DST is Greater than zero
+                // Result will be either 0x0000(0.0) or 0x3F80(1.0)
+                result = reinterpret<vFloat>(reinterpret<vUInt>(flag1) & reinterpret<vUInt>(flag2));
+            }
+        } else {
+            result = flag1;
         }
 
-	//a[i] <= 0
-	if constexpr(COMP_MODE == SfpuType::less_than_equal_zero) {
-	    v_if (v > 0.0f) {
-	      v = zero;
-	    } v_else {
-	      v = one;
-	    }
-	    v_endif;
-        }
+        dst_reg[0] = result;
 
-	dst_reg[0] = v;
-	dst_reg++;
+        dst_reg++;
     }
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_clamp(uint param0, uint param1, uint param2)
+inline void calculate_clamp(const int iterations, uint param0, uint param1, uint param2)
 {
     // All params are in FP16 format
     // param0 = min
@@ -571,7 +1026,7 @@ inline void calculate_clamp(uint param0, uint param1, uint param2)
     vFloat min = s2vFloat16(param0, format);
     vFloat max = s2vFloat16(param1, format);
     #pragma GCC unroll 0
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
         vFloat val = dst_reg[0];
 
@@ -589,10 +1044,10 @@ inline void calculate_clamp(uint param0, uint param1, uint param2)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_abs()
+inline void calculate_abs(const int iterations)
 {
     // SFPU microcode
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
         vFloat v = dst_reg[0];
         dst_reg[0] = sfpi::abs(v);
@@ -600,53 +1055,25 @@ inline void calculate_abs()
     }
 }
 
-
-template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_exp2()
-{
-    // SFPU microcode
-    for (int d = 0; d < ITERATIONS; d++)
-    {
-        vFloat v = dst_reg[0];
-        // log(2) = 0.6931471805;
-        v = v * 0.6931471805f;
-	    // exp = e^(v)
-	    vFloat exp = calculate_exponential_body_improved<APPROXIMATION_MODE, true>(v);
-	    dst_reg[0] = exp;
-        dst_reg++;
-    }
-}
-
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_sign()
+inline void calculate_sign(const int iterations, uint exponent_size_8)
 {
     // All params are in FP16 format
-    for (int d = 0; d < ITERATIONS; d++)
+    // uint format = 1;
+    #pragma GCC unroll 0
+    for (int d = 0; d < iterations; d++)
     {
         vFloat v = dst_reg[0];
-	vFloat result = vConst1;
-        v_if (v < 0.0f) {
-           result = vConstNeg1;
-        } v_elseif(v > 0.0f) {
-	  result = vConst1;
-	} v_else {
-	  result = vConst0;
+        dst_reg[0] = vConst1;
+        v_if (v < 0.0F) {
+            dst_reg[0] = vConstNeg1;
         }
         v_endif;
 
-	dst_reg[0] = result;
-        dst_reg++;
-    }
-}
-template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_max()
-{
-    for (int d = 0; d < ITERATIONS; d++)
-    {
-        vFloat a = dst_reg[0];
-        vFloat b = dst_reg[32];
-        v_if(a < b) {
-            dst_reg[0] = b;
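+        //NOTE(review): this function has no param0; the flag given to sfpu_is_fp16_zero is exponent_size_8 -- confirm the mapping below.
+        //Original note: param0 == 0 is Bfp8 (no bias removal); param0 != 0 is Float16 (exp bias removed for the zero check).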
+        v_if (sfpu_is_fp16_zero(v, exponent_size_8)) {
+            dst_reg[0] = vConst0;
         }
         v_endif;
 
@@ -655,13 +1082,13 @@ inline void calculate_max()
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_min()
+inline void calculate_max(const int iterations)
 {
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
         vFloat a = dst_reg[0];
         vFloat b = dst_reg[32];
-        v_if(a > b) {
+        v_if(a < b) {
             dst_reg[0] = b;
         }
         v_endif;
@@ -671,261 +1098,248 @@ inline void calculate_min()
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_expm1()
+inline void calculate_max_int32(const int iterations)
 {
-    // SFPU microcode
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
-        vFloat v = dst_reg[0];
-        v = calculate_exponential_body_improved<APPROXIMATION_MODE, true>(v);
-        dst_reg[0] = v - 1.0f;
+        TTI_SFPLOAD(2, 12, 3, 0);
+        TTI_SFPLOAD(0, 12, 3, 64);
+        TTI_SFPMOV(0, 0, 1, 0);
+        TTI_SFPIADD(0, 2, 1, 2);
+        TTI_SFPSTORE(0, 12, 3, 0);
+        TTI_SFPENCC(0x003, 0, 0, 10);
         dst_reg++;
     }
 }
 
-
-#define POLYVAL6(coef5, coef4, coef3, coef2, coef1, coef0, t4)  (t4 * (t4 * (t4 * (t4 * (coef5 * t4 + coef4) + coef3) + coef2) + coef1) + coef0)
-
 template <bool APPROXIMATION_MODE>
-sfpi_inline vFloat sfpu_atan_maclaurin_series(vFloat val)
+sfpi_inline vFloat sfpu_sine_maclaurin_series(vFloat val)
 {
-    v_if(1 > sfpi::abs(val)){
-        dst_reg[0] = sfpi::abs(val)  ;
-    }
-    v_else{
-        dst_reg[0] =  sfpu_reciprocal(sfpi::abs(val));
-    }
-    v_endif;
-
-    vFloat t1 = dst_reg[0] * dst_reg[0];
-
-    t1 = POLYVAL6(-0.013480470f, 0.057477314f, -0.121239071f, 0.195635925f, -0.332994597f, 0.999995630f, t1);
-
-    t1 = t1 * dst_reg[0];
-
-    v_if (sfpi::abs(val) > 1){
-        t1 = 1.570796327f - t1;
+    // Good for [-pi:pi]
+    // Maclaurin series = x - x^3/3! + x^5/5! - x^7/7! + x^9/9! - x^11/11!
+    vFloat tmp = val;
+    // x
+    vFloat output = tmp;
+    // x^3/3!
+    tmp = tmp*val*val;
+    output += -0.166666666*tmp;
+    // x^5/5!
+    tmp = tmp*val*val;
+    output +=  0.0083333333*tmp;
+    // x^7/7!
+    tmp = tmp*val*val;
+    output += -0.0001984126*tmp;
+    if constexpr (not APPROXIMATION_MODE) {
+        // x^9/9!
+        tmp = tmp*val*val;
+        output +=  0.0000027557*tmp;
+        // x^11/11!
+        tmp = tmp*val*val;
+        output += -0.00000002505*tmp;
     }
-    v_endif;
 
-    v_if(val < 0 ){
-        t1 = -t1;
-    }
-    v_endif;
-
-    return t1;
-}
-
-template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_atan()
-{
-    // SFPU microcode
-    for (int d = 0; d < ITERATIONS; d++)
-    {
-        vFloat val = dst_reg[0];
-        val = sfpu_atan_maclaurin_series<APPROXIMATION_MODE>(val);
-        dst_reg[0] = val;
-        dst_reg++;
-    }
+    // Write out output
+    return output;
 }
-
-
 template <bool APPROXIMATION_MODE>
-sfpi_inline vFloat sfpu_asine_maclaurin_series(vFloat val)
+sfpi_inline vFloat sfpu_cosine_maclaurin_series(vFloat val)
 {
-    // input for [-1:1]
-    // Mclauren series
-    // arcsin(x) = x + [(1/2) *x^3/3] + [(1 * 3) / (2 * 4) * x^5 / 5] + [(1 * 3 * 5) / (2 * 4 * 6) * x^7 / 7 ] + ...
-    // arcsin(x) ≈ x + (1/6) * x^3 + (3/40) * x^5 + (5/112) * x^7 + (35/1152) * x^9 + (63/2816) * x^11a
-
-    vFloat tmp = val;
-    vFloat val_square = val * val;
-    // x
-    vFloat output = tmp;
-    // (1/6) * x^3
-    tmp = tmp * val_square;
-    output += 0.166666666 * tmp;
-    // (3/40) * x^5
-    tmp = tmp * val_square;
-    output +=  0.075 * tmp;
-
-    //(5/112) * x^7
-    tmp = tmp * val_square;
-    output += 0.044642857 * tmp;
-
-    // (35/1152) *x^9
-    tmp = tmp * val_square;
-    output += 0.03038194 * tmp;
-
-    //(63/2816) * x^11
-    tmp = tmp * val_square;
-    output += 0.02237216 * tmp;
+    // Good for [-pi:pi]
+    // Maclaurin series = 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8! - x^10/10!
+    // 1
+    vFloat output = 1.0f;
+    // x^2/2!
+    vFloat tmp = val*val;
+    output += -0.5*tmp;
+    // x^4/4!
+    tmp = tmp*val*val;
+    output +=  0.0416666666*tmp;
+    // x^6/6!
+    tmp = tmp*val*val;
+    output += -0.0013888888*tmp;
+    if constexpr (not APPROXIMATION_MODE) {
+        // x^8/8!
+        tmp = tmp*val*val;
+        output +=  0.0000248015*tmp;
+        // x^10/10!
+        tmp = tmp*val*val;
+        output += -0.0000002755*tmp;
+    }
 
     // Write out output
     return output;
 }
-
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_asin()
+inline void calculate_sine(const int iterations)
 {
     // SFPU microcode
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
         vFloat v = dst_reg[0];
-        v = sfpu_asine_maclaurin_series<APPROXIMATION_MODE>(v);
+        v = 0.318309886183791f*v; // *1/pi to get number of pi rads.
+        vInt whole_v = float_to_int16(v);
+        vFloat whole_v_float = int32_to_float(whole_v, 0);
+        v = v - whole_v_float;
+        v *= 3.141592653589793f; // fractional * pi to get it in [-pi:pi]
+        v = sfpu_sine_maclaurin_series<APPROXIMATION_MODE>(v);
+        whole_v = whole_v & 0x1;
+        v_if(whole_v != 0) {
+            // odd so flip the sign
+            v *= -1;
+        }
+        v_endif;
         dst_reg[0] = v;
         dst_reg++;
     }
 }
-
-
-#define PI_2 (1.570796326794)
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_acos()
+inline void calculate_cosine(const int iterations)
 {
     // SFPU microcode
-    // acos = (pi/2 - asin)
-    for (int d = 0; d < ITERATIONS; d++)
+    for (int d = 0; d < iterations; d++)
     {
         vFloat v = dst_reg[0];
-        v = sfpu_asine_maclaurin_series<APPROXIMATION_MODE>(v);
-        v = PI_2 - v;
+        v = 0.318309886183791f*v; // *1/pi to get number of pi rads.
+        vInt whole_v = float_to_int16(v);
+        vFloat whole_v_float = int32_to_float(whole_v, 0);
+        v = v - whole_v_float;
+        v *= 3.141592653589793f; // fractional * pi to get it in [-pi:pi]
+        v = sfpu_cosine_maclaurin_series<APPROXIMATION_MODE>(v);
+        whole_v = whole_v & 0x1;
+        v_if(whole_v != 0) {
+            // odd so flip the sign
+            v *= -1;
+        }
+        v_endif;
         dst_reg[0] = v;
         dst_reg++;
     }
 }
-
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void cast_fp32_to_fp16a()
+inline void relu_max(const int iterations, uint uint_threshold)
 {
-    #pragma GCC unroll 8
-    for (int d = 0; d < ITERATIONS; d++)
+    vFloat threshold = s2vFloat16(uint_threshold, s2vFloat16::fp16a);
+    for (int d = 0; d < iterations; d++)
     {
-        //vFloat val = dst_reg[0];
-        //dst_reg[0] = float_to_fp16a(val, 0);
-        TTI_SFPLOAD(0, 0, 3, 0);
-        TTI_SFP_STOCH_RND(0,0,0,0,0,8);
-        TTI_SFPSTORE(0,1,3,0);
+        vFloat a = dst_reg[0];
+        v_if(a > threshold) {
+            a = threshold;
+        }
+        v_endif;
+        v_if(a < 0.0f) {
+            a = 0.0f;
+        }
+        v_endif;
+        dst_reg[0] = a;
         dst_reg++;
     }
 }
-
-
-
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_negative()
+inline void relu_min(const int iterations, uint uint_threshold)
 {
-
-    for (int d = 0; d < ITERATIONS; d++)
+    vFloat threshold = s2vFloat16(uint_threshold, s2vFloat16::fp16a);
+    for (int d = 0; d < iterations; d++)
     {
-        vFloat val = dst_reg[0];
-        dst_reg[0] = -val;
+        vFloat a = dst_reg[0];
+        v_if(a < threshold) {
+            a = 0.0f;
+        }
+        v_endif;
+        dst_reg[0] = a;
         dst_reg++;
     }
 }
-
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_add1()
+inline void cast_fp32_to_fp16a(const int iterations)
 {
-    for (int d = 0; d < ITERATIONS; d++)
+    #pragma GCC unroll 8
+    for (int d = 0; d < iterations; d++)
     {
-        vFloat val = dst_reg[0];
-        dst_reg[0] = 1.0f + val;
+        //vFloat val = dst_reg[0];
+        //dst_reg[0] = float_to_fp16a(val, 0);
+        TTI_SFPLOAD(0, 0, 3, 0);
+        TTI_SFP_STOCH_RND(0,0,0,0,0,8);
+        TTI_SFPSTORE(0,1,3,0);
         dst_reg++;
     }
 }
 
-inline
-vFloat sigmoid_piecewise_linear_positive(vFloat val) {
-        vFloat result = 0.0f;
-	v_if ( val >= +5.0f)  {
-	  result = 1.0f;
-	} v_elseif ( val > 1.0f && val < 5.0f ) {
-	  result = POLYVAL5(0.00144462f, -0.01055479f, -0.01203685f,  0.24300185f,  0.50437757f,val);
-	} v_else {
-	  result = 0.229f*val + 0.5f; // linear appx as y = 0.229x + 0.5
-	}
-	v_endif;
-	return result;
-}
-
-//sigmoid is anti-symmetric and offset by 1
-//sigmoid[-x] = 1 - sigmoid[x]
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_sigmoid()
+inline void quant_int32(const int iterations, const uint dst_offset)
 {
-    for (int d = 0; d < ITERATIONS; d++)
-    {
-        vFloat val = dst_reg[0];
-        vFloat result = 0.0f;
-
-        v_if ( val < 0.0f ) {
-  	   val = -val;
-        }
-        v_endif;
-
-	result = sigmoid_piecewise_linear_positive(val);
-
-	val = dst_reg[0];
-        v_if ( val < 0.0f ) {
-            result = 1.0f - result;
-        }
-        v_endif;
-
-        dst_reg[0] = result;
+    // Operand A is input (fp32)
+    // Operand B is scaling factor (fp32), loaded with address argument dst_offset * 64
+    // Operand C is zero-point constant (fp32), consumed from LREG[2] by the SFPMAD below
+    // Output is int32 scaled to int8 range; the loop runs `iterations` times (runtime count)
+    #pragma GCC unroll 8
+    for (int d = 0; d < iterations; d++) { // honor the caller's count, not the template default
+        // operand A - fp32
+        TTI_SFPLOAD(0, 3, 3, 0);
+        // operand B - fp32 scaler
+        TT_SFPLOAD(1, 3, 3, dst_offset * 64);
+        // D(A) = A*B+C, LREG[2] = zero_point
+        TTI_SFPMAD(0, 1, 2, 0, 0);
+        // MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result
+        TTI_NOP;
+        // fp32->int8, descale value is zero (LREG_9)
+        TTI_SFP_STOCH_RND(0,0,9,0,0,3);
+        // LREG_0 -> dest as int32
+        TTI_SFPSTORE(0,4,3,0);
         dst_reg++;
     }
-
-    return;
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_heaviside(uint value)
+inline void requant_int32(const int iterations, const uint dst_offset)
 {
-    // SFPU microcode
-    Converter c_value;
-    c_value.u = value;
-    vFloat s = c_value.f;
-
-    #pragma GCC unroll 0
-    for (int d = 0; d < ITERATIONS; d++) {
-        vFloat v = dst_reg[0];
-
-        v_if (v < 0.0f) {
-            v = 0.0f;
-        }v_elseif (v > 0.0f) {
-            v = 1.0f;
-        }v_else {
-            v = s;
-        }
-        v_endif;
-
-       dst_reg[0] = v;
-
+    // Operand A is input to requant (int32)
+    // Operand B is scaling factor (fp32), loaded with address argument dst_offset * 64
+    // Operand C is zero-point constant (fp32), consumed from LREG[2] by the SFPMAD below
+    // Output is int32 scaled to int8 range; the loop runs `iterations` times (runtime count)
+    #pragma GCC unroll 8
+    for (int d = 0; d < iterations; d++) // honor the caller's count, not the template default
+    {
+        // operand A - int32
+        TTI_SFPLOAD(0, 4, 3, 0);
+        // operand B - fp32 scaler
+        TT_SFPLOAD(1, 3, 3, dst_offset*64);
+        // cast int32->fp32
+        TTI_SFPCAST(0, 0, 0);
+        // D(A) = A*B+C, LREG[2] = zero_point
+        TTI_SFPMAD(0, 1, 2, 0, 0);
+        // MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result
+        TTI_NOP;
+        // fp32->int8, descale value is zero (LREG_9)
+        TTI_SFP_STOCH_RND(0,0,9,0,0,3);
+        // LREG_0 -> dest as int32
+        TTI_SFPSTORE(0,4,3,0);
         dst_reg++;
     }
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_silu()
+inline void dequant_int32(const int iterations, const uint dst_offset)
 {
-    // SFPU microcode
+    // Operand A[LREG0] is input to dequant (int32)
+    // Operand B[LREG1] is scaling factor (fp32), loaded with address argument dst_offset * 64
+    // Operand C[LREG2] is zero-point constant (fp32)
+    // Output = (A + (-C)) * B (fp32); the loop runs `iterations` times (runtime count)
+    #pragma GCC unroll 8
-    for (int d = 0; d < ITERATIONS; d++) {
+    for (int d = 0; d < iterations; d++) { // honor the caller's count, not the template default
-        vFloat val = dst_reg[0];
-        v_if ( val < 0.0f ) {
-            val = -val;
-        }
-        v_endif;
-
-	    vFloat result = sigmoid_piecewise_linear_positive(val);
-
-	    val = dst_reg[0];
-        v_if ( val < 0.0f ) {
-            result = 1.0f - result;
-        }
-        v_endif;
-        result = val * result;
-        dst_reg[0] = result;
+        // operand A - int32
+        TTI_SFPLOAD(0, 4, 3, 0);
+        // operand B - fp32 scaler
+        TT_SFPLOAD(1, 3, 3, dst_offset*64);
+        // cast int32->fp32
+        TTI_SFPCAST(0, 0, 0);
+        // D(A) = A+(-C), LREG[10] is 1, SFPADD = LREG_A*LREG_B+LREG_C
+        TTI_SFPADD(0,10,2,0,0);
+        TTI_NOP;
+        // D(A) = (A+(-C))*B, LREG[9] is zero
+        TTI_SFPMUL(0,1,9,0,0);
+        TTI_NOP;
+        // LREG_0 -> dest as fp32
+        TTI_SFPSTORE(0,3,3,0);
         dst_reg++;
     }
 }
@@ -945,53 +1359,56 @@ inline void calculate_mask()
     }
 }
 
-
-template <SfpuType operation, bool APPROXIMATION_MODE, int SfpuType_PARAM=0, int ITERATIONS=8>
-inline void calculate_sfpu(uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0)
+template <SfpuType operation, bool APPROXIMATION_MODE, int SfpuType_PARAM=0, int ITERATIONS=8, bool IS_INT_SFPU_EN = false>
+inline void calculate_sfpu(const int iterations = ITERATIONS, uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0)
 {
-    if constexpr (operation == SfpuType::exp_with_base) {
-	    constexpr bool zero_negative = true;
-        calculate_exponential<APPROXIMATION_MODE, zero_negative, true, ITERATIONS>(param0);
+    if constexpr (operation == SfpuType::exponential) {
+        calculate_exponential<APPROXIMATION_MODE, false, ITERATIONS>(iterations, param0);
+    }
+    else if constexpr (operation == SfpuType::exp_with_base) {
+        calculate_exponential<APPROXIMATION_MODE, true, ITERATIONS>(iterations, param0);
     }
     else if constexpr (operation == SfpuType::tanh) {
-        calculate_tanh<APPROXIMATION_MODE, ITERATIONS>();
+        calculate_tanh<APPROXIMATION_MODE, ITERATIONS>(iterations);
     }
     else if constexpr (operation == SfpuType::hardtanh) {
-        calculate_hardtanh<APPROXIMATION_MODE, ITERATIONS>(param0, param1, param2);
-    }
-    else if constexpr (operation == SfpuType::rsqrt) {
-	//param0 = true -> approximate fast mode
-	//         false -> high precision mode
-    // The algorithm uses Newton's method based on no.of iteration better approximation can be calculated
-	if ( param0 ) {
-	    calculate_rsqrt<true, ITERATIONS, 10>();
-	} else {
-	    calculate_rsqrt<false, ITERATIONS, 25>();
-	}
+        calculate_hardtanh<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1, param2);
+    }
+    else if constexpr (operation == SfpuType::gelu) {
+        calculate_gelu<APPROXIMATION_MODE, ITERATIONS>(iterations);
+    }
+    else if constexpr (operation == SfpuType::reciprocal) {
+        calculate_reciprocal<APPROXIMATION_MODE, ITERATIONS>(iterations);
     }
     else if constexpr (operation == SfpuType::sigmoid) {
-        calculate_sigmoid<APPROXIMATION_MODE, ITERATIONS>();
+        calculate_sigmoid<APPROXIMATION_MODE, ITERATIONS>(iterations);
     }
-    else if constexpr (operation == SfpuType::sigmoid_appx) {
-        calculate_sigmoid_appx<APPROXIMATION_MODE, ITERATIONS>();
+    else if constexpr (operation == SfpuType::sqrt) {
+        calculate_sqrt<APPROXIMATION_MODE, ITERATIONS, 2>(iterations);
     }
     else if constexpr (operation == SfpuType::tanh_derivative) {
-        calculate_tanh_derivative<APPROXIMATION_MODE, SfpuType_PARAM, ITERATIONS>();
+        calculate_tanh_derivative<APPROXIMATION_MODE, SfpuType_PARAM, ITERATIONS>(iterations);
+    }
+    else if constexpr (operation == SfpuType::lrelu) {
+        calculate_lrelu<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
     }
     else if constexpr (operation == SfpuType::dropout) {
-        calculate_dropout<APPROXIMATION_MODE, ITERATIONS>(param0, param1);
+        calculate_dropout<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1);
     }
     else if constexpr (operation == SfpuType::power) {
-	calculate_power_iterative<APPROXIMATION_MODE, ITERATIONS>(param0);
+        calculate_power<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
     }
     else if constexpr (operation == SfpuType::square) {
-        calculate_square<APPROXIMATION_MODE, ITERATIONS>();
+        calculate_square<APPROXIMATION_MODE, ITERATIONS>(iterations);
     }
     else if constexpr (operation == SfpuType::log) {
-        calculate_log<APPROXIMATION_MODE, false, ITERATIONS>(param0);
+        calculate_log<APPROXIMATION_MODE, false, ITERATIONS>(iterations, param0);
     }
     else if constexpr (operation == SfpuType::log_with_base) {
-        calculate_log<APPROXIMATION_MODE, true, ITERATIONS>(param0);
+        calculate_log<APPROXIMATION_MODE, true, ITERATIONS>(iterations, param0);
+    }
+    else if constexpr (operation == SfpuType::gelu_derivative) {
+        calculate_gelu_derivative<APPROXIMATION_MODE, ITERATIONS>(iterations);
     }
     else if constexpr ((operation == SfpuType::equal_zero) ||
                        (operation == SfpuType::not_equal_zero) ||
@@ -999,46 +1416,46 @@ inline void calculate_sfpu(uint param0 = 0, uint param1 = 0, uint param2 = 0, ui
                        (operation == SfpuType::greater_than_equal_zero) ||
                        (operation == SfpuType::less_than_equal_zero) ||
                        (operation == SfpuType::greater_than_zero)) {
-        calculate_comp<APPROXIMATION_MODE, operation, ITERATIONS>(8); //BFLOAT16 - exp
+        calculate_comp<APPROXIMATION_MODE, operation, ITERATIONS>(iterations, param5);
     }
     else if constexpr (operation == SfpuType::clamp) {
-        calculate_clamp<APPROXIMATION_MODE, ITERATIONS>(param0, param1, param2);
+        calculate_clamp<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1, param2);
     }
     else if constexpr (operation == SfpuType::abs) {
-        calculate_abs<APPROXIMATION_MODE, ITERATIONS>();
+        calculate_abs<APPROXIMATION_MODE, ITERATIONS>(iterations);
     }
     else if constexpr (operation == SfpuType::sign) {
-        calculate_sign<APPROXIMATION_MODE, ITERATIONS>();
+        calculate_sign<APPROXIMATION_MODE, ITERATIONS>(iterations, param5);
     }
     else if constexpr (operation == SfpuType::max) {
-        calculate_max<APPROXIMATION_MODE, ITERATIONS>();
-    }
-    else if constexpr (operation == SfpuType::min) {
-        calculate_min<APPROXIMATION_MODE, ITERATIONS>();
+        if constexpr (IS_INT_SFPU_EN)
+            calculate_max_int32<APPROXIMATION_MODE, ITERATIONS>(iterations);
+        else
+            calculate_max<APPROXIMATION_MODE, ITERATIONS>(iterations);
     }
-    else if constexpr (operation == SfpuType::exp2) {
-        calculate_exp2<APPROXIMATION_MODE, ITERATIONS>();
+    else if constexpr (operation == SfpuType::sine) {
+        calculate_sine<APPROXIMATION_MODE, ITERATIONS>(iterations);
     }
-    else if constexpr (operation == SfpuType::heaviside) {
-        calculate_heaviside<APPROXIMATION_MODE, ITERATIONS>(param0);
+    else if constexpr (operation == SfpuType::cosine) {
+        calculate_cosine<APPROXIMATION_MODE, ITERATIONS>(iterations);
     }
-    else if constexpr (operation == SfpuType::expm1) {
-        calculate_expm1<APPROXIMATION_MODE, ITERATIONS>();
+    else if constexpr (operation == SfpuType::relu_min) {
+        relu_min<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
     }
-    else if constexpr (operation == SfpuType::asin) {
-        calculate_asin<APPROXIMATION_MODE, ITERATIONS>();
+    else if constexpr (operation == SfpuType::relu_max) {
+        relu_max<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
     }
-    else if constexpr (operation == SfpuType::acos) {
-        calculate_acos<APPROXIMATION_MODE, ITERATIONS>();
+    else if constexpr (operation == SfpuType::cast_fp32_to_fp16a) {
+        cast_fp32_to_fp16a<APPROXIMATION_MODE, ITERATIONS>(iterations);
     }
-    else if constexpr (operation == SfpuType::atan) {
-        calculate_atan<APPROXIMATION_MODE, ITERATIONS>();
+    else if constexpr (operation == SfpuType::quant_int32) {
+        quant_int32<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
     }
-    else if constexpr (operation == SfpuType::signbit) {
-        calculate_signbit<APPROXIMATION_MODE, ITERATIONS>();
+    else if constexpr (operation == SfpuType::requant_int32) {
+        requant_int32<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
     }
-    else if constexpr (operation == SfpuType::silu) {
-        calculate_silu<APPROXIMATION_MODE, ITERATIONS>();
+    else if constexpr (operation == SfpuType::dequant_int32) {
+        dequant_int32<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
     }
     else if constexpr (operation == SfpuType::mask) {
         calculate_mask<APPROXIMATION_MODE, ITERATIONS>();
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h
index 58ef2c3bb6a..a8134eb8d47 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h
@@ -4,9 +4,6 @@
 
 #pragma once
 
-#include "circular_buffer.h"
-#include "hostdevcommon/kernel_structs.h"
-
 namespace ckernel
 {
 
@@ -14,13 +11,14 @@ namespace ckernel
 struct semaphore
 {
     constexpr static uint32_t MATH_PACK = 1;   // math <-> pack sync on dest register
-    constexpr static uint32_t UNPACK_PACK = 2; // pack <-> unpack sync on scratch buffer
+    constexpr static uint32_t UNPACK_TO_DEST = 2; // unpack <-> math sync on unpack to dest
     constexpr static uint32_t UNPACK_OPERAND_SYNC = 3; // unpack <-> pack, math sync on operand get/release
     constexpr static uint32_t PACK_DONE = 4; // Wait for beinning and end of each pack-iteration. For recording perf events and inserting delay.
     constexpr static uint32_t UNPACK_SYNC = 5; // trisc <-> unpack sync on hw kernel
     // Wait for beinning and end of each unpack or math iteration. For recording perf events and inserting delay.
     // This semaphore should only be used for either unpack or math. Not both at the same time.
     constexpr static uint32_t UNPACK_MATH_DONE = 6;
+    constexpr static uint32_t MATH_DONE = 7; // wait for math to finish when unpacking to dest
 
     constexpr static uint16_t t6_sem(const uint8_t sem_index)
     {
@@ -46,79 +44,4 @@ enum firmware_msg_e
     SET_PERF_SCRATCH = 4
 };
 
-constexpr uint8_t OPERAND_BASE_REG = 16; // base register used for operand storage
-constexpr uint8_t OUTPUT_BASE_REG = 16; // base register used for output storage
-
-typedef struct {
-   uint32_t fifo_rd_ptr;
-   uint32_t fifo_limit;
-   uint16_t tiles_acked;
-   uint16_t accumulation_buffer;
-   uint32_t words_acked;
-   uint32_t fifo_size;
-   uint16_t blocks_per_iter; // total number of ublocks popped from interm buffer per input
-   uint16_t curr_block; // current number of ublocks popped per input
-   uint16_t num_iter;  // total number of passes through the interm buffer per input
-   uint16_t curr_iter;  // current numer of passes through the interm buffer per input
-   uint32_t fifo_rd_base_ptr;
-   uint32_t tile_size_words;
-} operand_t;
-
-static_assert(sizeof(operand_t) == (sizeof(uint32_t) * 9));
-
-typedef union {
-   operand_t f;
-   uint32_t val[9];
-} operand_u;
-
-typedef struct {
-   uint32_t fifo_wr_ptr;
-   uint32_t fifo_limit;
-   uint32_t fifo_size;
-   uint32_t fifo_num_pages;
-   uint32_t fifo_wr_base_ptr;
-   uint16_t fifo_wr_tile_ptr;
-   uint16_t tiles_received;
-   uint32_t dram_output_no_push;
-   uint16_t tile_size_words;
-   bool     legacy_pack;
-   uint8_t  fork;
-   uint8_t  num_fork_streams;
-   bool     shared_buffer;  // interm buffer is shared with output
-   uint8_t  shared_buffer_operand; //shared buffer output operand
-   bool     accumulation_buffer;  // interm buffer used for accumulation
-   uint8_t  fork_stream_ids[16];
-   union {
-      uint16_t ublock_ct;       //ublock ct dim in tiles
-      uint16_t out_tile_dim;   //output block dim in tiles
-   };
-   union {
-      uint16_t ublock_tile_dim; //number of tiles in ublock for untilized output
-      uint16_t blocks_per_iter; //total number of ublocks pushed to interm buffer per input
-   };
-   union {
-      uint16_t row_tile_dim;    //one row of tiles
-   };
-   union {
-      uint16_t block_tile_dim;  //one row of ublocks for untilized output
-      uint16_t num_iter; //total number of passes through the interm buffer per input
-   };
-   union {
-      uint16_t ublock_tile_cnt;
-      uint16_t curr_block;  //current number of ublocks pushed to interm buffer per input
-   };
-   union {
-      uint16_t block_tile_cnt;  //current number of packed tiles for untilized output
-      uint16_t curr_iter;  // current numer of passes through the interm buffer per input
-   };
-} output_t;
-
-static_assert(sizeof(output_t) == (sizeof(uint32_t) * 16));
-
-typedef union {
-   output_t f;
-   uint32_t val[16];
-} output_u;
-
-
 } // namespace ckelimitrnel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_io_headers.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_io_headers.h
deleted file mode 100644
index b5bb5b1cbcd..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_io_headers.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-
-#include "ckernel_defs.h"
-
-//
-// Receiving from a stream input
-//
-
-// Setup pipe receiving data over stream
-inline void llk_setup_input_operand(src_op_id_e operand);
-
-// Wait for N tiles available in the incoming stream
-inline void llk_wait_tiles(src_op_id_e operand, std::uint32_t num_tiles);
-
-// Pop N tiles from the incoming stream
-inline void llk_pop_tiles(src_op_id_e operand, std::uint32_t num_tiles);
-
-//
-// Receiving from a local buffer
-//
-
-// Setup pipe for receiving data over local buffer
-inline void llk_setup_local_operand(src_op_id_e operand);
-
-// Wait for N tiles available in the local buffer
-inline void llk_wait_local_tiles(src_op_id_e operand, std::uint32_t num_tiles);
-
-// Pop N tiles from the incoming stream
-inline void llk_pop_local_tiles(src_op_id_e operand, std::uint32_t num_tiles);
-
-//
-// Write to stream output
-//
-
-// Setup pipe for writing output data to stream buffer
-inline void llk_setup_output(out_op_id_e output);
-
-// Blockig call to wait for free space needed to pack N tiles
-inline void llk_wait_for_free_tiles(out_op_id_e output, std::uint32_t num_tiles);
-
-// Push N tiles to stream buffer (increment write pointer)
-inline void llk_push_tiles(out_op_id_e output, std::uint32_t num_tiles);
-
-//
-// Write to local output
-//
-
-// Setup pipe for writing output data to local output
-inline void llk_setup_local_output(out_op_id_e output);
-
-// Blockig call to wait for free space needed to pack N tiles
-inline void llk_wait_for_free_tiles(out_op_id_e output, std::uint32_t num_tiles);
-
-// Push N tiles to stream buffer (increment write pointer)
-inline void llk_push_tiles(out_op_id_e output, std::uint32_t num_tiles);
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_math_headers.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_math_headers.h
deleted file mode 100644
index 6782ceb8cb4..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_math_headers.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-
-#include "ckernel_defs.h"
-#include "tensix_types.h"
-
-
-//
-// LLK math common
-//
-
-template <DstSync Dst>
-inline void llk_math_wait_for_dest_available();
-
-inline void llk_math_dest_section_done();
-
-template <DstClear Dst>
-inline void llk_math_clear_dst<Dst>(uint tile_index);
-
-template <DstStart Dst>
-inline void llk_math_set_dest_section_base();
-
-template <DstStart Dst>
-inline void llk_math_set_dest_section_flip();
-
-//
-// LLK matrix multiplication
-//
-inline void llk_math_mmul_init();
-inline void llk_math_mmul();
-
-//
-// LLK Eltwise binary
-//
-template <EltwiseBinaryType eltwise_binary_type, BroadcastType src_b_broadcast_type>
-inline void llk_math_eltwise_binary();
-template <EltwiseBinaryType eltwise_binary_type, BroadcastType src_b_broadcast_type>
-inline void llk_math_eltwise_binary_init();
-
-//
-// LLK Eltwise unary sfpu
-//
-template <SfpuType sfpu_type, bool approximation_mode>
-inline void llk_math_eltwise_unary_sfpu();
-template <SfpuType sfpu_type, bool approximation_mode>
-inline void llk_math_eltwise_unary_sfpu_init();
-
-//
-// LLK Eltwise unary datacopy
-//
-template <DataCopyType datacopy_type>
-inline void llk_math_eltwise_unary_datacopy();
-template <DataCopyType datacopy_type>
-inline void llk_math_eltwise_unary_datacopy_init();
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_packer_headers.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_packer_headers.h
deleted file mode 100644
index a2adf15f705..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_packer_headers.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-
-#include "ckernel_defs.h"
-#include "tensix_types.h"
-
-// MT: is it only formats for packer ??
-inline void llk_pack_hw_configure(DataFormat unpack_src, DataFormat unpack_dst);
-
-template <DestSyncSections Dst>
-inline void llk_pack_wait_for_dest_available<Dst>();
-
-template <DestSyncSections Dst>
-inline void llk_pack_set_dest_base<Dst>();
-
-template <DestSyncSections Dst>
-inline void llk_pack_dest_section_done<Dst>();
-
-template <DstClearSections Dst>
-inline void llk_math_clear_dst<Dst>();
-
-//
-// LLK pack tile to output stream - using row tables
-//
-inline void llk_pack_stream_row_tables_init();
-inline void llk_pack_stream_row_tables(std::uint32_t dst_tile_index);
-
-//
-// LLK pack tile to output stream - using tile tables
-//
-inline void llk_pack_stream_tile_tables_init();
-inline void llk_pack_stream_tile_tables(std::uint32_t dst_tile_index);
-
-
-//
-// LLK pack tile to local L1 buffer - using row tables
-//
-inline void llk_pack_local_row_tables_init();
-inline void llk_pack_local_row_tables(std::uint32_t dst_tile_index);
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_unpack_headers.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_unpack_headers.h
deleted file mode 100644
index fef79486088..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_unpack_headers.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-
-
-//
-// LLK unpack tile A
-//
-inline void llk_unpack_A_hw_config(llk_unpack_A_params_t params);
-inline void llk_unpack_A_init();
-inline void llk_unpack_A(std::uint32_t tile_index);
-
-//
-// LLK unpack tile B
-//
-inline void llk_unpack_B_hw_config(llk_unpack_B_params_t params);
-inline void llk_unpack_B_init();
-inline void llk_unpack_B(std::uint32_t tile_index);
-
-//
-// LLK unpack tiles AB
-//
-inline void llk_unpack_AB_hw_config(llk_unpack_AB_params_t params);
-inline void llk_unpack_AB_init();
-inline void llk_unpack_AB(std::uint32_t tile_index_a, std::uint32_t tile_index_b);
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h
index aaa08f8eed0..fa97031b17a 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h
@@ -7,11 +7,8 @@
 //#include "kernel_types.h"
 #include "ckernel.h"
 #include "ckernel_template.h"
-#include "ckernel_sfpu.h"
 #include "ckernel_globals.h"
-
-#include "debug/fw_debug.h"
-#include "debug/status.h"
+#include "llk_defs.h"
 
 #ifndef SFPU_OP_PARAM
 #define SFPU_OP_PARAM 0
@@ -21,8 +18,6 @@
 #define FUSE_SQRT_RECIP 0
 #endif
 
-#define EPS 1.19209e-07 //std::numeric_limits::epsilon() for FP32
-
 using namespace ckernel;
 
 namespace ckernel::math
@@ -145,9 +140,7 @@ inline uint32_t get_dest_buffer_base()
 inline void wait_math_semaphores()
 {
     // wait while math semaphore is on max, no room to write math results
-    DEBUG_STATUS('W', 'M', 'S', 'W');
     TTI_SEMWAIT(p_stall::STALL_MATH|p_stall::STALL_SFPU, semaphore::t6_sem(semaphore::MATH_PACK), p_stall::STALL_ON_MAX);
-    DEBUG_STATUS('W', 'M', 'S', 'D');
 }
 
 inline void set_math_semaphores()
@@ -156,13 +149,31 @@ inline void set_math_semaphores()
     t6_semaphore_post<p_stall::MATH|p_stall::WAIT_SFPU>(semaphore::MATH_PACK);
 }
 
-template <DstTileLayout layout, DstTileShape tile_shape>
+inline void math_unpack_to_dest_math_ready() // Handshake on the MATH_DONE semaphore: wait, post, poll, get
+{
+    t6_semaphore_wait_on_max<p_stall::STALL_SYNC>(semaphore::MATH_DONE); // presumably stalls while MATH_DONE is at its max value (per helper name) - TODO confirm
+    t6_semaphore_post<p_stall::MATH|p_stall::WAIT_SFPU>(semaphore::MATH_DONE); // post is issued with the p_stall MATH|WAIT_SFPU flags
+    while (semaphore_read(semaphore::MATH_DONE) == 0) {} // busy-wait until MATH_DONE reads back non-zero
+    semaphore_get(semaphore::MATH_DONE); // NOTE(review): assumed to consume the count posted above - confirm against semaphore_get
+}
+
+inline void math_unpack_to_dest_tile_ready() // Wait on, then get, the UNPACK_TO_DEST semaphore
+{
+    t6_semaphore_wait_on_zero<p_stall::STALL_SYNC>(semaphore::UNPACK_TO_DEST); // presumably stalls while UNPACK_TO_DEST is zero (per helper name) - TODO confirm
+    t6_semaphore_get<p_stall::MATH|p_stall::WAIT_SFPU>(semaphore::UNPACK_TO_DEST); // get is issued with the p_stall MATH|WAIT_SFPU flags
+}
+
+template <DstTileLayout layout, DstTileShape tile_shape, bool unpack_to_dest = false>
 inline void set_dst_write_addr(uint32_t tile_index)
 {
     if constexpr (layout == DstTileLayout::Default) {
         uint dst_index = tile_index << DstTileSizeLog2[tile_shape];
         dst_index = dst_index + get_dest_buffer_base();
-        TT_SETC16(DEST_TARGET_REG_CFG_MATH_Offset_ADDR32, dst_index);
+        if constexpr (unpack_to_dest) {
+            mailbox_write(ThreadId::UnpackThreadId, dst_index); // Send to unpacker
+        } else {
+            TT_SETC16(DEST_TARGET_REG_CFG_MATH_Offset_ADDR32, dst_index);
+        }
     } else {
         // FIXME MT: add this mapping for other layout
     }
@@ -188,9 +199,8 @@ inline void clear_addr_mod_base()
 
 inline void math_dest_wait()
 {
-    DEBUG_STATUS('W', 'D', 'S', 'W');
+    FWLOG0("XX math_full_dest_sync()->wait for whole dest available");
     TTI_SEMWAIT(p_stall::STALL_MATH|p_stall::STALL_SFPU, semaphore::t6_sem(semaphore::MATH_PACK), p_stall::STALL_ON_MAX);
-    DEBUG_STATUS('W', 'D', 'S', 'D');
 }
 
 inline void dest_section_flip()
@@ -213,34 +223,11 @@ inline void set_dest_section_base()
     TT_SETC16(DEST_TARGET_REG_CFG_MATH_Offset_ADDR32, base_addr);
 }
 
-inline uint32_t get_operand_id(uint32_t operand)
-{
-    const int INTERMEDIATE_BASE_ID = 24;
-    const int OPERAND_BASE_ID = 0;
-    return (operand>=INTERMEDIATE_BASE_ID) ? operand - 8 : operand - OPERAND_BASE_ID;
-}
-
-
-// FIXME: Added this --> Should be generated by compile trisc?
-constexpr std::uint32_t math_tile_dims[32][2] = {
-    {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32},
-    {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32},
-    {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32},
-    {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}
-};
-constexpr std::uint32_t math_tile_num_faces[32] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
-constexpr std::uint32_t math_partial_face[32] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-
-inline constexpr uint32_t get_num_faces(const std::uint32_t operand_id)
-{
-   return math_tile_num_faces[operand_id];
-}
-
-inline constexpr uint32_t get_partial_face(const std::uint32_t operand_id)
-{
-    return math_partial_face[operand_id];
+inline constexpr bool is_32bit_input(const std::uint32_t src_format, const std::uint32_t dst_format) {
+    // True only when BOTH formats are one of DataFormat::Int32 / DataFormat::Float32.
+    const bool src_is_32b = (src_format == (uint)DataFormat::Int32) || (src_format == (uint)DataFormat::Float32);
+    const bool dst_is_32b = (dst_format == (uint)DataFormat::Int32) || (dst_format == (uint)DataFormat::Float32);
+    return src_is_32b && dst_is_32b;
 }
 
 } // namespace ckernel::math
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h
index 57990f84c94..bdc0b6b5063 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h
@@ -7,13 +7,11 @@
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_globals.h"
-#include "debug/fw_debug.h"
+#include "llk_defs.h"
 
 
 namespace ckernel::packer
 {
-   constexpr uint32_t OUTPUT_BASE    = 0;
-   constexpr uint32_t OUTPUT_BASE_ID = 16;
    constexpr uint32_t PACK_CNT       = 4;
 
 
@@ -137,47 +135,6 @@ namespace ckernel::packer
       pack_counters_t f;
    } pack_counters_u;
 
-
-   // FIXME: Added this --> Should be generated by compile trisc?
-   constexpr std::uint32_t pack_tile_dims[32][2] = {
-      {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32},
-      {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32},
-      {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32},
-      {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}
-   };
-   constexpr std::uint32_t pack_tile_num_faces[32] = {
-      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
-   constexpr std::uint32_t pack_tile_face_r_dim[32] = {
-      16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
-   constexpr std::uint32_t pack_partial_face[32] = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-   constexpr std::uint32_t pack_narrow_tile[32] = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-   inline const uint32_t get_num_faces(const std::uint32_t output_id) //FIXME: why we have to always inline
-   {
-      return pack_tile_num_faces[output_id];
-   }
-
-   inline const uint32_t get_face_r_dim(const std::uint32_t output_id)
-   {
-      return pack_tile_face_r_dim[output_id];
-   }
-
-   inline const uint32_t get_tile_c_dim(const std::uint32_t output_id)
-   {
-      return pack_tile_dims[output_id][TileDim::C_IDX];
-   }
-
-   inline constexpr uint32_t get_partial_face(const std::uint32_t operand_id)
-   {
-      return pack_partial_face[operand_id];
-   }
-
-   inline constexpr uint32_t get_narrow_tile(const std::uint32_t operand_id)
-   {
-      return pack_narrow_tile[operand_id];
-   }
-
    // Set unpacker offsets to 0, except for unpacker 0, channel 1, X, which is the tile X dimension
    inline void packer_addr_counter_init()
    {
@@ -185,15 +142,15 @@ namespace ckernel::packer
        TTI_SETADCZW(0b100, 0, 0, 0, 0, 0b1111);
    }
 
-   inline void set_packer_strides(const uint output_id){
+   inline void set_packer_strides(const uint pack_src_format, const uint pack_dst_format){
 
       // Get pointer to registers for current state ID
       volatile uint tt_reg_ptr *cfg = get_cfg_pointer();
 
-      uint x_stride = (uint)(pack_src_format[output_id]&0x3) == (uint)DataFormat::Float32 ? 4 :
-                      (uint)(pack_src_format[output_id]&0x3) == (uint)DataFormat::Float16 ? 2 : 1;
-      uint y_stride = 16*x_stride;
-      uint z_stride = PACK_CNT*16*y_stride;
+      uint x_stride = (uint)(pack_src_format&0x3) == (uint)DataFormat::Float32 ? 4 :
+                      (uint)(pack_src_format&0x3) == (uint)DataFormat::Float16 ? 2 : 1;
+      uint y_stride = FACE_R_DIM*x_stride;
+      uint z_stride = PACK_CNT*FACE_C_DIM*y_stride;
       uint w_stride = z_stride;
 
       TT_SETDMAREG(0, LOWER_HALFWORD((y_stride<<PCK0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT)), 0, LO_16(p_gpr_pack::TMP0)); //x-stride not used!
@@ -206,32 +163,29 @@ namespace ckernel::packer
    }
 
    template <bool is_fp32_dest_acc_en>
-   inline void set_packer_config(const uint output_id){
+   inline void set_packer_config(const uint pack_src_format, const uint pack_dst_format, const uint num_faces = 4, const bool partial_face = false){
 
       // Get pointer to registers for current state ID
       volatile uint tt_reg_ptr *cfg = get_cfg_pointer();
 
-      const uint num_faces = get_num_faces(output_id);
-      const bool partial_face = get_partial_face(output_id);
-
       // Set packer config
       pack_config_u config;
       for (uint i=0; i<4; i++) {
          config.val[i] = 0;
       }
 
-      config.f.exp_section_size = (((uint)pack_dst_format[output_id] == (uint)DataFormat::Lf8) ||
-                                   ((uint)pack_dst_format[output_id] == (uint)DataFormat::Int8)) ? 0 : (partial_face ? 1 : num_faces); // set to num_faces as exp section size is not used for non-bfp formats except for lf8/int8
+      config.f.exp_section_size = ((pack_dst_format == (uint)DataFormat::Lf8) ||
+                                   (pack_dst_format == (uint)DataFormat::Int8)) ? 0 : (partial_face ? 1 : num_faces); // set to num_faces as exp section size is not used for non-bfp formats except for lf8/int8
 
       config.f.uncompress   = 1;
-      config.f.out_data_format   = (uint)pack_dst_format[output_id];
-      config.f.in_data_format    = (uint)pack_src_format[output_id];
+      config.f.out_data_format   = pack_dst_format;
+      config.f.in_data_format    = pack_src_format;
       config.f.pack_per_xy_plane = 1;
 
 
       // Workaround for bug in HW: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1394
       if constexpr (is_fp32_dest_acc_en) {
-         if (IS_BFP_A_FORMAT((uint)pack_dst_format[output_id])) {
+         if (IS_A_FORMAT(pack_dst_format)) {
             config.f.exp_threshold_en = 1;
             config.f.exp_threshold = 113;
          }
@@ -276,27 +230,35 @@ namespace ckernel::packer
 
       dest_rd_ctrl_u dest_rd_ctrl;
       dest_rd_ctrl.val = 0;
-      dest_rd_ctrl.f.PCK_DEST_RD_CTRL_Read_32b_data = ((uint)pack_src_format[output_id] == (uint)DataFormat::Int8) | (is_fp32_dest_acc_en ? 1 : 0);
+      dest_rd_ctrl.f.PCK_DEST_RD_CTRL_Read_32b_data = (pack_src_format == (uint)DataFormat::Int8) |
+                                                      (pack_src_format == (uint)DataFormat::Int32) |
+                                                      (pack_src_format == (uint)DataFormat::Float32) |
+                                                      (is_fp32_dest_acc_en ? 1 : 0);
+
+      //Round to 10 bit mantissa from fp32 dest
+      if(is_fp32_dest_acc_en && (pack_src_format!=(uint)DataFormat::Float32)) {
+         dest_rd_ctrl.f.PCK_DEST_RD_CTRL_Round_10b_mant = 1;
+      }
       cfg[PCK_DEST_RD_CTRL_Read_32b_data_ADDR32] = dest_rd_ctrl.val;
 
-      if (IS_BFP_FORMAT(pack_dst_format[output_id])) {
+      if (IS_BFP_FORMAT(pack_dst_format)) {
          // Override exp section size for packers 1,2,3
          // Tile header + exp size + datum size
-         if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp8 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp8_b) {
+         if ((uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp8 || (uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp8_b) {
             config.f.exp_section_size = 1 + ((num_faces>2) ? 2 : 0) + 16;
             cfg[THCON_SEC0_REG8_Row_start_section_size_ADDR32+0]=config.val[0];
             config.f.exp_section_size = 1 + 1 + 32;
             cfg[THCON_SEC1_REG1_Row_start_section_size_ADDR32+0]=config.val[0];
             config.f.exp_section_size = 1 + 0 + 48;
             cfg[THCON_SEC1_REG8_Row_start_section_size_ADDR32+0]=config.val[0];
-         } else if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp4 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp4_b) {
+         } else if ((uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp4 || (uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp4_b) {
             config.f.exp_section_size = 1 + ((num_faces>2) ? 2 : 0) + 8;
             cfg[THCON_SEC0_REG8_Row_start_section_size_ADDR32+0]=config.val[0];
             config.f.exp_section_size = 1 + 1 + 16;
             cfg[THCON_SEC1_REG1_Row_start_section_size_ADDR32+0]=config.val[0];
             config.f.exp_section_size = 1 + 0 + 24;
             cfg[THCON_SEC1_REG8_Row_start_section_size_ADDR32+0]=config.val[0];
-         } else if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp2 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp2_b) {
+         } else if ((uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp2 || (uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp2_b) {
             config.f.exp_section_size = 1 + ((num_faces>2) ? 2 : 0) + 4;
             cfg[THCON_SEC0_REG8_Row_start_section_size_ADDR32+0]=config.val[0];
             config.f.exp_section_size = 1 + 1 + 8;
@@ -310,7 +272,7 @@ namespace ckernel::packer
       }
 
       // Save to GPR for quick data format reconfig
-      regfile[p_gpr_pack::EXP0_SEC_SIZE_BFP]  = (num_faces) << THCON_SEC0_REG8_Exp_section_size_SHAMT;
+      regfile[p_gpr_pack::EXP0_SEC_SIZE_BFP]  = (partial_face ? 1 : num_faces) << THCON_SEC0_REG8_Exp_section_size_SHAMT;
       regfile[p_gpr_pack::EXP1_SEC_SIZE_BFP8] = (1 + ((num_faces>2) ? 2 : 0) + 16) << THCON_SEC0_REG8_Exp_section_size_SHAMT;
       regfile[p_gpr_pack::EXP2_SEC_SIZE_BFP8] = (1 + 1 + 32) << THCON_SEC0_REG8_Exp_section_size_SHAMT;
       regfile[p_gpr_pack::EXP3_SEC_SIZE_BFP8] = (1 + 0 + 48) << THCON_SEC0_REG8_Exp_section_size_SHAMT;
@@ -323,13 +285,12 @@ namespace ckernel::packer
       sync_regfile_write(p_gpr_pack::EXP3_SEC_SIZE_BFP2);
    }
 
-   inline void set_packer_l1_offset(const uint output_id){
+   inline void set_packer_l1_offset(const uint pack_dst_format, const uint face_r_dim = FACE_R_DIM){
 
-      const uint face_r_dim = get_face_r_dim(output_id);
       const uint face_dim = face_r_dim * FACE_C_DIM;
 
-      uint32_t l1_offset_1 = IS_BFP_FORMAT(pack_dst_format[output_id]) ? 1 : (((uint8_t)(pack_dst_format[output_id]&0x3) == (uint8_t)DataFormat::Float32)  ? (face_dim/16)*4 :
-                                                                               ((uint8_t)(pack_dst_format[output_id]&0x3) == (uint8_t)DataFormat::Float16) ? (face_dim/16)*2 : (face_dim/16));
+      uint32_t l1_offset_1 = IS_BFP_FORMAT(pack_dst_format) ? 1 : (((uint8_t)(pack_dst_format&0x3) == (uint8_t)DataFormat::Float32)  ? (face_dim/16)*4 :
+                                                                               ((uint8_t)(pack_dst_format&0x3) == (uint8_t)DataFormat::Float16) ? (face_dim/16)*2 : (face_dim/16));
       uint32_t l1_offset_2 = 2 * l1_offset_1;
       uint32_t l1_offset_3 = 3 * l1_offset_1;
 
@@ -351,7 +312,11 @@ namespace ckernel::packer
 
 
    template <bool is_fp32_dest_acc_en = false>
-   inline void reconfig_packer_data_format(const uint output_id)
+   inline void reconfig_packer_data_format(
+      const uint pack_src_format,
+      const uint pack_dst_format,
+      const uint tile_size,
+      const uint face_r_dim = FACE_R_DIM)
    {
       // Get pointer to registers for current state ID
       volatile uint *cfg = get_cfg_pointer();
@@ -361,35 +326,35 @@ namespace ckernel::packer
       config.val[2] = 0; // Only need to modify word[2][15:0]
 
       config.f.uncompress   = 1;
-      config.f.out_data_format   = (uint)pack_dst_format[output_id];
-      config.f.in_data_format    = (uint)pack_src_format[output_id];
+      config.f.out_data_format   = pack_dst_format;
+      config.f.in_data_format    = pack_src_format;
       TT_SETDMAREG(0, LOWER_HALFWORD(config.val[2]), 0, LO_16(p_gpr_pack::TMP_LO));
       TTI_REG2FLOP(2,0,0,0,THCON_SEC0_REG1_Row_start_section_size_ADDR32+2-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_LO); //16-bit write
       TTI_REG2FLOP(2,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+2-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_LO);
       TTI_REG2FLOP(2,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+2-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_LO);
       TTI_REG2FLOP(2,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+2-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_LO);
 
-      if (IS_BFP_FORMAT(pack_dst_format[output_id])) {
+      if (IS_BFP_FORMAT(pack_dst_format)) {
          // Override exp section size for packers 1,2,3
          // Tile header + exp size + datum size
          TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP0_SEC_SIZE_BFP);
-         if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp8 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp8_b) {
+         if ((pack_dst_format&0x1F) == (uint)DataFormat::Bfp8 || (pack_dst_format&0x1F) == (uint)DataFormat::Bfp8_b) {
             TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP1_SEC_SIZE_BFP8);
             TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP2_SEC_SIZE_BFP8);
             TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP3_SEC_SIZE_BFP8);
-         } else if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp4 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp4_b) {
+         } else if ((pack_dst_format&0x1F) == (uint)DataFormat::Bfp4 || (pack_dst_format&0x1F) == (uint)DataFormat::Bfp4_b) {
             TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP1_SEC_SIZE_BFP4);
             TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP2_SEC_SIZE_BFP4);
             TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP3_SEC_SIZE_BFP4);
-         } else if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp2 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp2_b) {
+         } else if ((pack_dst_format&0x1F) == (uint)DataFormat::Bfp2 || (pack_dst_format&0x1F) == (uint)DataFormat::Bfp2_b) {
             TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP1_SEC_SIZE_BFP2);
             TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP2_SEC_SIZE_BFP2);
             TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP3_SEC_SIZE_BFP2);
          } else {
             FWASSERT("Other data formats not supported", false);
          }
-      } else if (((uint)pack_dst_format[output_id] == (uint)DataFormat::Lf8) ||
-                 ((uint)pack_dst_format[output_id] == (uint)DataFormat::Int8)) {
+      } else if ((pack_dst_format == (uint)DataFormat::Lf8) ||
+                 (pack_dst_format == (uint)DataFormat::Int8)) {
          TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO);
          TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO);
          TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO);
@@ -397,59 +362,73 @@ namespace ckernel::packer
       }
 
       // Set l1 address offset
-      set_packer_l1_offset(output_id);
+      set_packer_l1_offset(pack_dst_format, face_r_dim);
 
-      TT_SETDMAREG(0, LOWER_HALFWORD((std::uint32_t)cb_interface[output_id].fifo_page_size), 0, LO_16(p_gpr_pack::TILE_HEADER));
+      TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_pack::TILE_HEADER));
 
       // Workaround for HW bug: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1394
       if constexpr (is_fp32_dest_acc_en) {
-         if (IS_BFP_A_FORMAT((uint)pack_dst_format[output_id])) {
+         if (IS_BFP_A_FORMAT(pack_dst_format)) {
             config.val[3] = 0; // Only need to modify word[2][15:0]
             config.f.exp_threshold_en = 1;
             config.f.exp_threshold = 113;
             TT_SETDMAREG(0, UPPER_HALFWORD(config.val[3]), 0, HI_16(p_gpr_pack::TMP_HI));
             TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_HI);
+            TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_HI);
+            TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_HI);
+            TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_HI);
          } else {
             TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO);
+            TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO);
+            TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO);
+            TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO);
          }
       }
 
       // Flush packer pipeline before strides gasket alu format change
       TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::PACK);
-      cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG2_Dstacc_RMW>(pack_src_format[output_id]);
+      cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG2_Dstacc_RMW>(pack_src_format);
 
       tensix_sync(); //FIXME: why stallwait on cfg write doesn't work!
 
       // Set packer strides
-      set_packer_strides(output_id);
+      set_packer_strides(pack_src_format, pack_dst_format);
 
 
    }
 
    template <bool is_fp32_dest_acc_en, bool untilize>
-   inline void configure_pack(uint pack_output_id, uint relu_config = 0)
+   inline void configure_pack(
+      const uint pack_src_format,
+      const uint pack_dst_format,
+      const uint tile_size,
+      const uint face_r_dim = FACE_R_DIM,
+      const uint num_faces = 4,
+      const bool partial_face = false,
+      const bool narrow_tile = false,
+      const uint relu_config = 0)
    {
       // Get pointer to registers for current state ID
       volatile uint *cfg = get_cfg_pointer();
 
-      if (pack_src_format[pack_output_id] != pack_dst_format[pack_output_id]) {
+      if (pack_src_format != pack_dst_format) {
          TTI_STALLWAIT(p_stall::STALL_PACK, p_stall::PACK);
          tensix_sync();
       }
 
-      set_packer_strides(pack_output_id);
+      set_packer_strides(pack_src_format, pack_dst_format);
 
       t6_mutex_acquire(mutex::REG_RMW);
 
-      uint alu_dst_format = pack_src_format[pack_output_id];
+      const uint alu_dst_format = pack_src_format;
 
       cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG2_Dstacc_RMW>(alu_dst_format);
 
       t6_mutex_release(mutex::REG_RMW);
 
-      set_packer_config<is_fp32_dest_acc_en>(pack_output_id);
+      set_packer_config<is_fp32_dest_acc_en>(pack_src_format, pack_dst_format, num_faces, partial_face);
 
-      set_packer_l1_offset(pack_output_id);
+      set_packer_l1_offset(pack_dst_format, face_r_dim);
 
       // PACK_COUNTERS_SEC0_pack_per_xy_plane = cfg_reg_array[3][0 +: 8];
       // PACK_COUNTERS_SEC0_pack_reads_per_xy_plane = cfg_reg_array[3][8 +: 8];
@@ -457,7 +436,7 @@ namespace ckernel::packer
       // PACK_COUNTERS_SEC0_pack_yz_transposed = cfg_reg_array[3][23 +: 1];
       pack_counters_u pack_counters;
       pack_counters.val = 0;
-      pack_counters.f.pack_reads_per_xy_plane = get_face_r_dim(pack_output_id); // Number of reads per face
+      pack_counters.f.pack_reads_per_xy_plane = face_r_dim; // Number of reads per face
                                                                                 // Used for resetting tile posistion generator for edge masks
       for (uint i=0; i<4; i++) cfg[PACK_COUNTERS_SEC0_pack_per_xy_plane_ADDR32+i]=pack_counters.val; // disable auto last generation
 
@@ -468,7 +447,7 @@ namespace ckernel::packer
       cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32]=pck_edge_offset.val;
       cfg[TILE_ROW_SET_MAPPING_0_row_set_mapping_0_ADDR32] = 0x0; // All packers use row set mapping 0, edge offset 0 mask
 
-      regfile[p_gpr_pack::TILE_HEADER]   = (std::uint32_t)cb_interface[pack_output_id].fifo_page_size;
+      regfile[p_gpr_pack::TILE_HEADER]   = tile_size;
       regfile[p_gpr_pack::TILE_HEADER+1] = 0;
       regfile[p_gpr_pack::TILE_HEADER+2] = 0;
       regfile[p_gpr_pack::TILE_HEADER+3] = 0;
@@ -484,74 +463,13 @@ namespace ckernel::packer
 
       cfg[STACC_RELU_ApplyRelu_ADDR32] = hw_relu_config.val[0];
 
-      const uint face_r_dim = get_face_r_dim(pack_output_id);
       const uint face_dim = face_r_dim * FACE_C_DIM;
-      const bool narrow_tile = get_narrow_tile(pack_output_id);
-      const uint pack_x_dim = untilize ? (narrow_tile ? face_dim : 16) : face_dim; // Number of datums to pack per row
-                                                                                   // To untilize narrow tile (32x16) we just pack 2 faces back to back
-      TT_SETADCXX(p_setadc::PAC, pack_x_dim-1, 0x0);
-   }
 
-   template <DstTileFaceLayout FaceLayout, bool untilize, bool is_fp32_dest_acc_en>
-   inline void init_packer_dest_offset_registers()
-   {
-      //Issue #3064: to avoid sfpu and packer stalling when dest is in FP32 mode
-      //             use dest offset of 0x200 instead of 0x100
-      //             Wormhole a0/b0 HW translates these addreses to the correct dest bank,
-      //             however dest capacity is unchanged (e.g 0x100 to 0x1FF should be unused now)
-      constexpr uint32_t DEST_OFFSET_SHIFT = 0; //is_fp32_dest_acc_en ? (1) : (0);
-      constexpr uint32_t DEST_HALF_OFFSET = DEST_REGISTER_HALF_SIZE >> DEST_OFFSET_SHIFT;
-
-      if constexpr (untilize) {
-         if constexpr (FaceLayout == ColMajor) {
-            // Packer0 :  0,32,  1,33 ...  7, 39
-	    // Packer1 :  8,40,  9,41 ... 15, 47
-	    // Packer2 : 16,48, 17,49 ... 23, 55
-	    // Packer3 : 23,56, 24,57 ... 31, 63
-            regfile[p_gpr_pack::DEST_OFFSET_LO]   = 0x0;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+1] = 0x0 + 0x8;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+2] = 0x0 + 0x10;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+3] = 0x0 + 0x18;
-            regfile[p_gpr_pack::DEST_OFFSET_HI]   = DEST_HALF_OFFSET;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+1] = DEST_HALF_OFFSET + 0x8;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+2] = DEST_HALF_OFFSET + 0x10;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+3] = DEST_HALF_OFFSET + 0x18;
-         } else {
-            // Packer0 :  0,16,  1,17 ...  7, 23
-	    // Packer1 :  8,24,  9,25 ... 15, 31
-	    // Packer2 : 32,48, 33,49 ... 39, 55
-	    // Packer3 : 40,56, 41,57 ... 47, 63
-            regfile[p_gpr_pack::DEST_OFFSET_LO]   = 0x0;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+1] = 0x0 + 0x8;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+2] = 0x0 + 0x20;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+3] = 0x0 + 0x28;
-            regfile[p_gpr_pack::DEST_OFFSET_HI]   = DEST_HALF_OFFSET;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+1] = DEST_HALF_OFFSET + 0x8;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+2] = DEST_HALF_OFFSET + 0x20;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+3] = DEST_HALF_OFFSET + 0x28;
-	 }
-      } else {
-         if constexpr (FaceLayout == ColMajor) {
-            regfile[p_gpr_pack::DEST_OFFSET_LO]   = 0x0;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+1] = 0x0 + 0x20;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+2] = 0x0 + 0x10;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+3] = 0x0 + 0x30;
-            regfile[p_gpr_pack::DEST_OFFSET_HI]   = DEST_HALF_OFFSET;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+1] = DEST_HALF_OFFSET + 0x20;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+2] = DEST_HALF_OFFSET + 0x10;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+3] = DEST_HALF_OFFSET + 0x30;
-         } else { // Default to row major layout
-            regfile[p_gpr_pack::DEST_OFFSET_LO]   = 0x0;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+1] = 0x0 + 0x10;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+2] = 0x0 + 0x20;
-            regfile[p_gpr_pack::DEST_OFFSET_LO+3] = 0x0 + 0x30;
-            regfile[p_gpr_pack::DEST_OFFSET_HI]   = DEST_HALF_OFFSET;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+1] = DEST_HALF_OFFSET + 0x10;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+2] = DEST_HALF_OFFSET + 0x20;
-            regfile[p_gpr_pack::DEST_OFFSET_HI+3] = DEST_HALF_OFFSET + 0x30;
-         }
-      }
-      sync_regfile_write(p_gpr_pack::DEST_OFFSET_HI+3);
+      // To untilize narrow tile (32x16) we just pack 2 faces back to back
+      // Number of datums to pack per row
+      const uint pack_x_dim = (narrow_tile || !untilize) ? face_dim : FACE_R_DIM;
+
+      TT_SETADCXX(p_setadc::PAC, pack_x_dim-1, 0x0);
    }
 
    inline uint8_t get_packer_dest_offset_index()
@@ -585,7 +503,7 @@ namespace ckernel::packer
 
    // Program packer destination addresses from GPRs
    template <PackSelMask PackSel=PACK_ALL>
-   inline void program_packer_destination(uint32_t addr, uint8_t pack_output_id)
+   inline void program_packer_destination(uint32_t addr)
    {
       uint32_t new_l1_addr = (1 << 31) | addr;
       TT_SETDMAREG(0, LOWER_HALFWORD(addr), 0, LO_16(p_gpr_pack::OUTPUT_ADDR));
@@ -638,14 +556,4 @@ namespace ckernel::packer
       TTI_STOREIND (1, 0, p_ind::LD_16B, LO_16(0), p_ind::INC_NONE, p_gpr_pack::TILE_HEADER, p_gpr_pack::OUTPUT_ADDR);
    }
 
-   inline uint32_t get_output_id(uint32_t output)
-   {
-      return ((output) - OUTPUT_BASE);
-   }
-
-   inline constexpr uint32_t get_output_base_id()
-   {
-      return (OUTPUT_BASE_ID);
-   }
-
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpriority_queue.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpriority_queue.h
deleted file mode 100644
index 5dff63c55e3..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpriority_queue.h
+++ /dev/null
@@ -1,137 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <cstdint>
-#include <utility>
-#include "debug/fw_debug.h"
-
-// Provides a priority queue where lowest priority value has highest priority (e.g. priority 0 is higher priority than 5)
-// Can be reversed with REVERSE_PRIORITY (e.g. priority 5 will be higher priority than 0)
-// Also provides a version that allocates memory for you (such as on the stack), see below
-
-template <bool REVERSE_PRIORITY=false>
-class FixedSizePriorityQueue
-{
-    protected:
-    std::pair<uint32_t, uint32_t> *heap;
-    uint32_t num_elem;
-    uint32_t max_size;
-
-    public:
-    FixedSizePriorityQueue(uint32_t addr_, uint32_t max_size_)
-    {
-        heap = (std::pair<uint32_t, uint32_t> *) addr_;
-        num_elem = 0;
-        max_size = max_size_;
-    }
-
-    void push(uint32_t value, uint32_t priority)
-    {
-        push(std::make_pair(value, priority));
-    }
-
-    void push(std::pair<uint32_t, uint32_t> value)
-    {
-        FWASSERT("You are trying to push a full priority queue.", !is_full());
-
-        heap[num_elem] = value;
-
-        num_elem++;
-        bubble_up(num_elem - 1);
-    }
-
-    std::pair<uint32_t, uint32_t> pop()
-    {
-        FWASSERT("You are trying to pop an empty priority queue.", !is_empty());
-
-        // Swap first with last
-        auto first_elem = heap[0];
-        heap[0] = heap[num_elem - 1];
-        heap[num_elem - 1] = first_elem;
-
-        num_elem--;
-        bubble_down(0);
-
-        return first_elem;
-    }
-
-    __attribute__((always_inline))
-    inline const std::pair<uint32_t, uint32_t>& top() const
-    {
-        FWASSERT("You are trying to view an empty priority queue.", !is_empty());
-
-        return heap[0];
-    }
-
-    __attribute__((always_inline))
-    inline const uint32_t size() const
-    {
-        return num_elem;
-    }
-
-    __attribute__((always_inline))
-    inline const bool is_empty() const
-    {
-        return size() == 0;
-    }
-
-    __attribute__((always_inline))
-    inline const bool is_full() const
-    {
-        return size() == max_size;
-    }
-
-    protected:
-
-    void bubble_up(uint32_t idx)
-    {
-        if (idx == 0)
-            return;
-
-        uint32_t parent = ((idx + 1) >> 1) - 1;
-
-        if ((REVERSE_PRIORITY && (heap[parent].second < heap[idx].second)) ||
-            (!REVERSE_PRIORITY && (heap[parent].second > heap[idx].second))) {
-            // swap
-            auto tmp = heap[idx];
-            heap[idx] = heap[parent];
-            heap[parent] = tmp;
-
-            bubble_up(parent);
-        }
-    }
-
-    void bubble_down(uint32_t idx)
-    {
-        uint32_t left = ((idx + 1) << 1) - 1;
-        uint32_t right = ((idx + 1) << 1);
-        uint32_t higher_priority = idx;
-
-        if (left < size()) {
-            if ((REVERSE_PRIORITY && (heap[left].second > heap[higher_priority].second)) ||
-                (!REVERSE_PRIORITY && (heap[left].second < heap[higher_priority].second))) {
-                higher_priority = left;
-            }
-        }
-
-        if (right < size()) {
-            if ((REVERSE_PRIORITY && (heap[right].second > heap[higher_priority].second)) ||
-                (!REVERSE_PRIORITY && (heap[right].second < heap[higher_priority].second))) {
-                higher_priority = right;
-            }
-        }
-
-        if (higher_priority != idx) {
-            // swap
-            auto tmp = heap[idx];
-            heap[idx] = heap[higher_priority];
-            heap[higher_priority] = tmp;
-
-            bubble_down(higher_priority);
-        }
-    }
-
-};
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h
index 818793a2680..55404e24d39 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h
@@ -6,8 +6,6 @@
 
 #include "ckernel.h"
 #include "ckernel_globals.h"
-#include "debug/fw_debug.h"
-#include "debug/status.h"
 
 #ifdef PERF_DUMP
 #include "perf_res_decouple.h"
@@ -15,8 +13,6 @@
 
 namespace ckernel::unpacker
 {
-   constexpr uint32_t OPERAND_BASE_ID = 0;
-   constexpr uint32_t INTERMEDIATE_BASE_ID = 24;
    constexpr uint32_t TILE_DESC_SIZE = 2; //Unpacker descriptor size in dwords
    constexpr uint32_t CONFIG_SIZE = 2; //Unpacker configuration size in dwords
 
@@ -156,9 +152,7 @@ namespace ckernel::unpacker
    // Wait for threshold of busy contexts to fall below total available contexts
    inline void wait_for_next_context(const uint num_contexts)
    {
-       DEBUG_STATUS('W', 'N', 'C', 'W');
        while (semaphore_read(semaphore::UNPACK_SYNC) >= num_contexts) {}
-       DEBUG_STATUS('W', 'N', 'C', 'D');
    }
 
    inline void switch_config_context(uint &unp_cfg_context)
@@ -180,56 +174,30 @@ namespace ckernel::unpacker
       TTI_SETC16(UNPACK_MISC_CFG_CfgContextOffset_0_ADDR32, 0x0000);
    }
 
-   // FIXME: Added this --> Should be generated by compile trisc?
-   constexpr std::uint32_t unpack_tile_num_faces[32] = {
-      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
-   constexpr std::uint32_t unpack_tile_face_r_dim[32] = {
-      16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
-   constexpr std::uint32_t unpack_partial_face[32] = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-   constexpr std::uint32_t unpack_narrow_tile[32] = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    // Sync on unpacker idle via waiting busy contexts counter 0
    inline void wait_for_idle()
    {
-       DEBUG_STATUS('W', 'I', 'W');
        while (semaphore_read(semaphore::UNPACK_SYNC) > 0) {}
-       DEBUG_STATUS('W', 'I', 'D');
    }
 
-   inline constexpr uint32_t get_num_faces(const std::uint32_t operand_id)
-   {
-      return unpack_tile_num_faces[operand_id];
-   }
-
-   inline constexpr uint32_t get_face_r_dim(const std::uint32_t operand_id)
-   {
-      return unpack_tile_face_r_dim[operand_id];
-   }
-
-   inline constexpr uint32_t get_partial_face(const std::uint32_t operand_id)
-   {
-      return unpack_partial_face[operand_id];
-   }
-
-   inline constexpr uint32_t get_narrow_tile(const std::uint32_t operand_id)
-   {
-      return unpack_narrow_tile[operand_id];
+   inline void enalbe_int8_fpu_math() { // Sets only the ALU_ACC_CTRL_INT8_math_enabled field ("enalbe" is a misspelling of "enable"; name kept as-is for callers)
+      alu_config_u alu_payload = {.val = 0};
+      alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = 1;
+      cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_ADDR32, 0, ALU_ACC_CTRL_INT8_math_enabled_MASK>(alu_payload.val); // RMW restricted to the INT8_math_enabled mask. NOTE(review): configure_unpack_AB releases mutex::REG_RMW after its RMW of this register; none is taken here - confirm intended
    }
 
+   template<bool row_pool=false, bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
    inline void configure_unpack_AB(
-     uint unpA_operand_id,
-     uint unpB_operand_id,
-     uint unpA_face_r_dim=16,
-     uint unpB_face_r_dim=16,
-     bool row_pool=false,
-     bool transpose_xy_srca_en=false,
-     bool is_fp32_dest_acc_en=false,
-     bool srnd_fpu_en = false,
+     const uint unpA_src_format,
+     const uint unpB_src_format,
+     const uint unpA_dst_format,
+     const uint unpB_dst_format,
+     const uint unpA_face_r_dim=FACE_R_DIM,
+     const uint unpB_face_r_dim=FACE_R_DIM,
+     const bool transpose_xy_srca_en=false,
      const uint unpA_num_faces = 4,
      const uint unpB_num_faces = 4)
    {
-
       // Check that unpacker is done (all contexts freed up) before starting hw configuration
       wait_for_idle();
 
@@ -239,11 +207,11 @@ namespace ckernel::unpacker
       // Get pointer to registers for current state ID
       volatile uint tt_reg_ptr *cfg = get_cfg_pointer();
 
-      uint unpA_ch1_x_stride = (uint) (unpack_dst_format[unpA_operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[unpA_operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1;
-      uint unpB_ch1_x_stride = (uint) (unpack_dst_format[unpB_operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[unpB_operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1;
+      uint unpA_ch1_x_stride = (uint) (unpA_dst_format&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpA_dst_format&0x3) == (uint) DataFormat::Float16 ? 2 : 1;
+      uint unpB_ch1_x_stride = (uint) (unpB_dst_format&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpB_dst_format&0x3) == (uint) DataFormat::Float16 ? 2 : 1;
       uint unpA_ch1_z_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride;
       uint unpB_ch1_z_stride = FACE_C_DIM*FACE_R_DIM*unpB_ch1_x_stride;
-      uint exp_width = ((uint)unpack_dst_format[unpA_operand_id]>>2)&0x1; //0=5-bit, 1=8-bit
+      uint exp_width = ((uint)unpA_dst_format>>2)&0x1; //0=5-bit, 1=8-bit
 
       // Strides for incrementing ch1 address to srcA and srcB
       cfg[UNP0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32] = (0                 << UNP0_ADDR_CTRL_ZW_REG_1_Wstride_SHAMT) |
@@ -262,33 +230,42 @@ namespace ckernel::unpacker
       alu_config_u alu_payload = {.val = 0};
 
       uint32_t fp32_dest_acc_en = (is_fp32_dest_acc_en) ? (1) : (0);
+      uint32_t int8_math_enabled = ((uint)unpA_dst_format == (uint)DataFormat::Int8) ||
+                                   ((uint)unpB_dst_format == (uint)DataFormat::Int8) ||
+                                   ((uint)unpA_dst_format == (uint)DataFormat::Int32) ||
+                                   ((uint)unpB_dst_format == (uint)DataFormat::Int32);
+
+      constexpr uint alu_format_mask = ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK;
+      alu_payload.f.ALU_FORMAT_SPEC_REG0_SrcA = unpA_dst_format;
+      alu_payload.f.ALU_FORMAT_SPEC_REG1_SrcB = row_pool ? ((uint) DataFormat::Float16 | (exp_width<<2)) : unpB_dst_format;
 
-      alu_payload.f.ALU_FORMAT_SPEC_REG0_SrcA = unpack_dst_format[unpA_operand_id];
-      alu_payload.f.ALU_FORMAT_SPEC_REG1_SrcB = row_pool ? ((uint) DataFormat::Float16 | (exp_width<<2)) : unpack_dst_format[unpB_operand_id];
       // FP32 accumulation and SFPU to read dest as FP32
       // NOTE: This assumes these config fields are adjacent and in same register!!
       static_assert(ALU_ACC_CTRL_Fp32_enabled_ADDR32 == ALU_FORMAT_SPEC_REG0_SrcA_ADDR32);
       static_assert(ALU_ACC_CTRL_Fp32_enabled_ADDR32 == ALU_ACC_CTRL_SFPU_Fp32_enabled_ADDR32);
+      constexpr uint alu_dest_format_mask = ALU_ACC_CTRL_INT8_math_enabled_MASK | ALU_ACC_CTRL_SFPU_Fp32_enabled_MASK | ALU_ACC_CTRL_Fp32_enabled_MASK;
       alu_payload.f.ALU_ACC_CTRL_Fp32_enabled = fp32_dest_acc_en;
       alu_payload.f.ALU_ACC_CTRL_SFPU_Fp32_enabled = fp32_dest_acc_en;
-      alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = ((uint)unpack_dst_format[unpA_operand_id] == (uint)DataFormat::Int8) ||
-                                                     ((uint)unpack_dst_format[unpB_operand_id] == (uint)DataFormat::Int8);
+      alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = int8_math_enabled;
 
-      constexpr uint mask1 = ALU_ACC_CTRL_INT8_math_enabled_MASK | ALU_ACC_CTRL_SFPU_Fp32_enabled_MASK | ALU_ACC_CTRL_Fp32_enabled_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK | ALU_FORMAT_SPEC_REG0_SrcA_MASK;
+      constexpr uint alu_stoch_rnd_mask = ALU_ROUNDING_MODE_Fpu_srnd_en_MASK | ALU_ROUNDING_MODE_Gasket_srnd_en_MASK | ALU_ROUNDING_MODE_Packer_srnd_en_MASK;
+      constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndMode::All);
+      alu_payload.f.ALU_ROUNDING_MODE_Fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndMode::Fpu);
+      alu_payload.f.ALU_ROUNDING_MODE_Gasket_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndMode::Pack);
+      alu_payload.f.ALU_ROUNDING_MODE_Packer_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndMode::Pack);
 
-      cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_ADDR32, 0, mask1>(alu_payload.val);
+      constexpr uint alu_mask = alu_format_mask | alu_dest_format_mask | alu_stoch_rnd_mask;
 
-      cfg_reg_rmw_tensix<ALU_ROUNDING_MODE_Fpu_srnd_en_RMW>(srnd_fpu_en);
+      cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_ADDR32, 0, alu_mask>(alu_payload.val);
 
       t6_mutex_release(mutex::REG_RMW);
 
-
       // Set tile descriptor
       unpack_tile_descriptor_u tile_descriptor;
       for (uint i=0; i<TILE_DESC_SIZE; i++) {
          tile_descriptor.val[i] = 0;
       }
-      tile_descriptor.f.in_data_format  = (uint) unpack_src_format[unpA_operand_id];
+      tile_descriptor.f.in_data_format  = (uint) unpA_src_format;
       tile_descriptor.f.uncompressed = 1; // Input tile is uncompressed
       tile_descriptor.f.x_dim        = 0; // Not used for unpA as value is overriden by per context x_dim set below. Used for unpB
       tile_descriptor.f.y_dim        = 1;
@@ -296,7 +273,7 @@ namespace ckernel::unpacker
       //tile_descriptor.f.blobs_per_xy_plane = 0;
       //tile_descriptor.f.blobs_y_start = 0;
       for (uint i=0; i<TILE_DESC_SIZE; i++) cfg[THCON_SEC0_REG0_TileDescriptor_ADDR32+i]=tile_descriptor.val[i];
-      tile_descriptor.f.in_data_format  = row_pool ? (uint) DataFormat::Float32 : unpack_src_format[unpB_operand_id];
+      tile_descriptor.f.in_data_format  = row_pool ? (uint) DataFormat::Float32 : unpB_src_format;
       tile_descriptor.f.x_dim        = unpB_face_r_dim*FACE_C_DIM;
       tile_descriptor.f.z_dim        = unpB_num_faces;
       for (uint i=0; i<TILE_DESC_SIZE; i++) cfg[THCON_SEC1_REG0_TileDescriptor_ADDR32+i]=tile_descriptor.val[i];
@@ -306,7 +283,7 @@ namespace ckernel::unpacker
       for (uint i=0; i<CONFIG_SIZE; i++) {
          config.val[i] = 0;
       }
-      config.f.out_data_format = unpack_dst_format[unpA_operand_id];
+      config.f.out_data_format = unpA_dst_format;
       config.f.throttle_mode   = 2;
       config.f.context_count   = 0;
       config.f.haloize_mode    = transpose_xy_srca_en ? 1 : 0;
@@ -319,7 +296,7 @@ namespace ckernel::unpacker
       //config.f.fifo_size = 0; // Set dynamically
       for (uint i=0; i<CONFIG_SIZE; i++) cfg[THCON_SEC0_REG2_Out_data_format_ADDR32+i]=config.val[i];
 
-      config.f.out_data_format = row_pool ? ((uint) DataFormat::Float16 | (exp_width<<2)) : unpack_dst_format[unpB_operand_id];
+      config.f.out_data_format = row_pool ? ((uint) DataFormat::Float16 | (exp_width<<2)) : unpB_dst_format;
       config.f.haloize_mode    = 0;
 
       for (uint i=0; i<CONFIG_SIZE; i++) cfg[THCON_SEC1_REG2_Out_data_format_ADDR32+i]=config.val[i];
@@ -361,6 +338,11 @@ namespace ckernel::unpacker
                                                                  // workaround for bug https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1372
       }
       */
+      // Workaround for HW bug (int32 dest and movd2a/b is used with srcA/B configured as int8)
+      if (int8_math_enabled) {
+          reg_write(RISCV_DEBUG_REG_DBG_FEATURE_DISABLE, 1<<11); // Set debug feature disable bit 11
+                                                                 // workaround for bug https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1948
+      }
 
       // Clear context ID
       reset_config_context();
@@ -390,16 +372,56 @@ namespace ckernel::unpacker
             TTI_SETADCXX(UNP_SEL, FACE_R_DIM*FACE_C_DIM-1, 0x0);
             TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16);
             break;
-   }
+      }
 
       if constexpr (INSERT_FENCE) {
          TTI_DMANOP; // Insert fence if reg2flop is followed by an unpack
       }
-    }
+   }
+
+   // True only when BOTH the source and the destination unpack formats are Int32 or Float32.
+   inline constexpr bool is_32bit_input(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format) {
+       const bool src_is_32b = (unpack_src_format == (uint)DataFormat::Int32) || (unpack_src_format == (uint)DataFormat::Float32);
+       const bool dst_is_32b = (unpack_dst_format == (uint)DataFormat::Int32) || (unpack_dst_format == (uint)DataFormat::Float32);
+       return src_is_32b && dst_is_32b;
+   }
+
+   inline void wait_for_dest_available() { // Waits on semaphore::UNPACK_TO_DEST (the semaphore unpack_to_dest_tile_done() posts) using the p_stall::UNPACK stall
+      t6_semaphore_wait_on_max<p_stall::UNPACK>(semaphore::UNPACK_TO_DEST);
+   }
+
+   inline void unpack_to_dest_tile_done(uint &context_id) { // Posts UNPACK_TO_DEST, then reverts the settings changed by set_dst_write_addr()
+      t6_semaphore_post<p_stall::UNPACK0>(semaphore::UNPACK_TO_DEST);
+      TTI_WRCFG(p_gpr_unpack::UNPACK_STRIDE, p_cfg::WRCFG_32b, UNP0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32); // Restore unpack stride (the value set_dst_write_addr() saved into UNPACK_STRIDE)
+      // Restore config context
+      if (context_id == 0) { // context 0 uses the cntx0 fields; any other value uses the cntx1 fields
+         cfg_reg_rmw_tensix<THCON_SEC0_REG2_Unpack_if_sel_cntx0_RMW>(0);
+         cfg_reg_rmw_tensix<THCON_SEC0_REG5_Dest_cntx0_address_RMW>(4*16); // Back to the fixed 4*16 dest address
+      } else {
+         cfg_reg_rmw_tensix<THCON_SEC0_REG2_Unpack_if_sel_cntx1_RMW>(0);
+         cfg_reg_rmw_tensix<THCON_SEC0_REG5_Dest_cntx1_address_RMW>(4*16); // Back to the fixed 4*16 dest address
+      }
+      TTI_SETC16(SRCA_SET_Base_ADDR32, 0x4); // re-enable address bit swizzle
+   }
+
 
-   inline uint32_t get_operand_id(uint32_t operand)
+   inline void set_dst_write_addr(const uint32_t &context_id, const uint32_t &unpack_dst_format) // Programs dest address, Z stride and Unpack_if_sel for context_id; reverted by unpack_to_dest_tile_done()
    {
-      return operand;
+      uint32_t dst_byte_addr = 16*(4 + mailbox_read(ThreadId::MathThreadId)); // Apply fixed offset of 4*16 to dest address
+      TTI_SETC16(SRCA_SET_Base_ADDR32, 0x0); // Disable address bit swizzle
+      TTI_RDCFG(p_gpr_unpack::UNPACK_STRIDE, UNP0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32); // Save current stride
+      uint unpA_ch1_x_stride = (uint) (unpack_dst_format&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format&0x3) == (uint) DataFormat::Float16 ? 2 : 1; // x stride from the low two format bits: 4 for Float32, 2 for Float16, otherwise 1
+      uint unpA_ch1_z_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride; // z stride = FACE_C_DIM * FACE_R_DIM * x stride
+      TT_SETDMAREG(0, LOWER_HALFWORD(unpA_ch1_z_stride << UNP0_ADDR_CTRL_ZW_REG_1_Zstride_SHAMT), 0, LO_16(p_gpr_unpack::TMP_LO)); // NOTE(review): appears to load only the low halfword of TMP_LO; confirm its upper half is known before the 32b WRCFG below
+      TTI_WRCFG(p_gpr_unpack::TMP_LO, p_cfg::WRCFG_32b, UNP0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32); // Set unpack stride
+      if (context_id == 0) { // context 0 uses the cntx0 fields; any other value uses the cntx1 fields
+         cfg_reg_rmw_tensix<THCON_SEC0_REG2_Unpack_if_sel_cntx0_RMW>(1);
+         cfg_reg_rmw_tensix<THCON_SEC0_REG5_Dest_cntx0_address_RMW>(dst_byte_addr);
+      } else {
+         cfg_reg_rmw_tensix<THCON_SEC0_REG2_Unpack_if_sel_cntx1_RMW>(1);
+         cfg_reg_rmw_tensix<THCON_SEC0_REG5_Dest_cntx1_address_RMW>(dst_byte_addr);
+      }
+
    }
 
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc
new file mode 100644
index 00000000000..3db907d6b99
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc
@@ -0,0 +1,221 @@
+
+#include "ckernel.h"
+#include "ckernel_addr_map.h"
+#include "ckernel_pcbuf.h"
+#include "ckernel_main.h"
+#include "ckernel_globals.h"
+#include <l1_address_map.h>
+#include <tensix.h>
+#ifdef PERF_DUMP
+#include "ckernel_perf_unpack_pack.h"
+#include "ckernel_perf_math.h"
+#endif
+
+namespace ckernel
+{
+
+enum class ttRiscCores : std::uint32_t { Unpack = 0, Math = 1, Pack = 2, Brisc = 3, Nrisc = 4};
+
+volatile uint tt_reg_ptr *reg_base = reinterpret_cast<volatile uint *>(0xFFB10000);
+volatile uint tt_reg_ptr *pc_buf_base = reinterpret_cast<volatile uint *>(PC_BUF_BASE);
+volatile uint tt_reg_ptr *regfile = reinterpret_cast<volatile uint *>(REGFILE_BASE);
+volatile uint tt_reg_ptr *instrn_buffer = reinterpret_cast<volatile uint *>(INSTRN_BUF_BASE);
+volatile uint tt_reg_ptr *mailbox_base[4] = {
+    reinterpret_cast<volatile uint tt_reg_ptr *>(TENSIX_MAILBOX0_BASE), reinterpret_cast<volatile uint tt_reg_ptr *>(TENSIX_MAILBOX1_BASE),
+    reinterpret_cast<volatile uint tt_reg_ptr *>(TENSIX_MAILBOX2_BASE), reinterpret_cast<volatile uint tt_reg_ptr *>(TENSIX_MAILBOX3_BASE)
+};
+volatile uint tt_reg_ptr *dbg_event_scratch = nullptr;
+
+uint32_t cfg_state_id __attribute__((section(".bss"))) = 0;  // Flip between 0 and 1 to keep state between kernel calls
+uint32_t dest_offset_id __attribute__((section(".bss"))) = 0; // Flip between 0 and 1 to keep dest pointer between kernel calls
+
+uint32_t dbg_event_index __attribute__((section(".bss"))) = 0;
+uint32_t dbg_event_end __attribute__((section(".bss"))) = 0;
+volatile uint16_t tt_reg_ptr *debug_mailbox_base = nullptr;
+uint8_t mailbox_index = 0;
+const uint8_t mailbox_end = 32;
+volatile uint8_t tt_l1_ptr *debug_buffer = nullptr;
+volatile uint8_t tt_l1_ptr *debug_buffer_start = nullptr;
+uint8_t thread_id __attribute__((section(".bss"))) = 0;
+
+#ifdef PERF_DUMP
+uint32_t perf_index __attribute__((section(".bss"))) = 0;
+uint32_t perf_end __attribute__((section(".bss"))) = 0;
+volatile uint32_t *perf_buf_base[2];
+uint8_t perf_buf_base_id __attribute__((section(".bss"))) = 0;
+bool record_perf_events __attribute__((section(".bss"))) = 0;
+uint32_t perf_events_target_idx __attribute__((section(".bss"))) = 0;
+uint16_t current_outer_loop_iter __attribute__((section(".bss"))) = 0;
+int32_t dram_dump_req_local;
+bool first_unpack_recorded __attribute__((section(".bss"))) = 0;
+volatile uint *ncrisc_ack_addr = nullptr;
+uint32_t header;
+#if OVERLAY_DECOUPLE == 1
+uint8_t overlay_output_decouple_mask = 0;
+inline void update_overlay_decoupling_mailbox() { // Caches the low byte of the output-decouple mask mailbox word; threads 0 and 1 then spin on UNPACK_MATH_DONE
+    overlay_output_decouple_mask = PERF_RISC_MAILBOX_OUTPUT_DECOUPLE_MASK_PTR[0] & 0xff;
+    if (thread_id == 0 || thread_id == 1) {
+        while(semaphore_read(semaphore::UNPACK_MATH_DONE) == 0) {} // Busy-wait until the semaphore reads non-zero
+    }
+}
+inline void reset_unpack_pack_sync() { // Thread 2 only: calls semaphore_get on UNPACK_MATH_DONE, the semaphore threads 0/1 poll in update_overlay_decoupling_mailbox()
+    if (thread_id == 2) {
+        semaphore_get(semaphore::UNPACK_MATH_DONE);
+    }
+}
+#endif
+#endif
+
+volatile uint tt_l1_ptr * trisc_l1_mailbox = reinterpret_cast<volatile uint tt_l1_ptr *>(MAILBOX_ADDR);
+
+inline bool ready_for_next_epoch() {         // Always returns true. Original note: place this through compiler into a section that is not going to be overwritten
+    return true;
+    // mailbox_write(ttRiscCores::Nrisc);              // (disabled) signal done epoch to NCRisc
+    // mailbox_read(ttRiscCores::Nrisc);               // (disabled) This is a blocking read, until NCrisc signals epoch is ready
+}
+
+inline void set_thread_id_parameter() {
+    // Thread id follows the firmware start address: TRISC0_BASE -> 0, TRISC1_BASE -> 1, anything else -> 2.
+    const uint fw_start = (uint)__firmware_start;
+    if (fw_start == (uint)l1_mem::address_map::TRISC0_BASE) {
+        thread_id = 0;
+    } else {
+        thread_id = (fw_start == (uint)l1_mem::address_map::TRISC1_BASE) ? 1 : 2;
+    }
+}
+
+inline void allocate_debug_mailbox_buffer() {
+   // Each TRISC gets its own DEBUG_MAILBOX_BUF_SIZE-sized slot, chosen by the firmware start
+   // address: TRISC0_BASE -> slot 0, TRISC1_BASE -> slot 1, anything else -> slot 2.
+   const uint32_t fw_start = (uint32_t)__firmware_start;
+   const std::int32_t slot =
+      (fw_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) ? 0 :
+      (fw_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) ? 1 : 2;
+   const std::int32_t debug_mailbox_addr =
+      l1_mem::address_map::DEBUG_MAILBOX_BUF_BASE + slot*l1_mem::address_map::DEBUG_MAILBOX_BUF_SIZE;
+   debug_mailbox_base = reinterpret_cast<volatile uint16_t tt_l1_ptr *>(debug_mailbox_addr);
+   clear_mailbox_values();
+}
+
+inline void allocate_debug_buffer() {
+   // Select this TRISC's debug buffer from the firmware start address; anything other than TRISC0/TRISC1 uses TRISC2's.
+   const uint32_t fw_start = (uint32_t)__firmware_start;
+   std::int32_t debug_buffer_addr = l1_mem::address_map::TRISC2_DEBUG_BUFFER_BASE;
+   if (fw_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) {
+      debug_buffer_addr = l1_mem::address_map::TRISC0_DEBUG_BUFFER_BASE;
+   } else if (fw_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) {
+      debug_buffer_addr = l1_mem::address_map::TRISC1_DEBUG_BUFFER_BASE;
+   }
+   debug_buffer_start = reinterpret_cast<volatile uint8_t tt_l1_ptr *>(debug_buffer_addr);
+   debug_buffer = debug_buffer_start;
+   debug_buffer_start[l1_mem::address_map::DEBUG_BUFFER_SIZE-1]=0x0; // Clear the last byte, which debug_dump() sets to 0xff on overflow
+}
+
+__attribute__((noinline)) void debug_dump(const uint8_t *data, uint32_t byte_size) {
+  // Append bytes at the debug_buffer cursor; once the cursor's low address bits equal DEBUG_BUFFER_SIZE-1 it stops advancing and that byte is set to 0xff.
+  for (; byte_size != 0; byte_size--, data++) {
+    const bool at_last_byte = (((uint32_t) debug_buffer)&(l1_mem::address_map::DEBUG_BUFFER_SIZE-1)) == l1_mem::address_map::DEBUG_BUFFER_SIZE-1;
+    if (!at_last_byte) {
+       *debug_buffer++ = *data;
+    } else {
+       *debug_buffer = 0xff; //overflow detected
+    }
+  }
+}
+
+__attribute__((noinline)) void debug_dump_seek(uint8_t offset) { // Moves the debug_dump() write cursor to debug_buffer_start + offset (offset is a uint8_t, so at most 255)
+  debug_buffer = reinterpret_cast<volatile uint8_t *>(debug_buffer_start + offset);
+}
+
+} // namespace ckernel
+
+void local_mem_copy() {
+   // Copy words from this TRISC's L1 local-memory base to LOCAL_MEM_BASE_ADDR, one 32-bit word at a time.
+   volatile uint *dst_words = (volatile uint*) LOCAL_MEM_BASE_ADDR;
+   volatile uint tt_l1_ptr *src_words = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC2_LOCAL_MEM_BASE;
+
+   // The L1 source is chosen by the firmware start address (TRISC2's unless TRISC0/TRISC1 matches below).
+   const uint fw_start = (uint)__firmware_start;
+   if (fw_start == (uint)l1_mem::address_map::TRISC0_BASE) {
+      src_words = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC0_LOCAL_MEM_BASE;
+   } else if (fw_start == (uint)l1_mem::address_map::TRISC1_BASE) {
+      src_words = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC1_LOCAL_MEM_BASE;
+   }
+
+   // Word count = byte span between the two local-mem rodata linker symbols, divided by 4.
+   const uint num_words = ((uint)__local_mem_rodata_end_addr - (uint)__local_mem_rodata_start_addr)>>2;
+   for (uint word = 0; word < num_words; word++) {
+      dst_words[word] = src_words[word];
+   }
+
+}
+
+using namespace ckernel;
+
+int main(int argc, char *argv[]) // TRISC entry point: initialize state, run the kernel once, then write KERNEL_COMPLETE to the L1 mailbox
+{
+    FWEVENT("Launching proudction env kernels");
+
+    // Initialize GPRs to all 0s
+    for (int i = 0; i < 64; i++)
+        regfile[i] = 0;
+
+    // Init L1 buffer with 1.0f (used for reduce max)
+    union {
+        float f;
+        uint32_t u;
+    } f2u = {.f = 1.0f};
+
+    // Save a little code space.  GCC fails to remove the loop variable so loop with a ptr
+#pragma GCC unroll 0
+    for (volatile uint32_t tt_l1_ptr *ptr = l1_buffer; ptr < &l1_buffer[16]; *ptr++ = f2u.u) // Load const into L1 buffer
+        continue; // Explicit empty body: without it the next statement (reset_cfg_state_id) becomes the loop body and runs 16 times
+    reset_cfg_state_id();
+
+    trisc_l1_mailbox_write(RESET_VAL);
+
+    if ((uint)l1_mem::address_map::RISC_LOCAL_MEM_BASE ==
+            ((uint)__local_mem_rodata_end_addr&0xfff00000))
+    { // Copy only when the top 12 address bits of the local-mem rodata end symbol match RISC_LOCAL_MEM_BASE
+       local_mem_copy();
+    }
+
+    allocate_debug_mailbox_buffer();
+    allocate_debug_buffer();
+    if ((uint) __firmware_start == (uint)l1_mem::address_map::TRISC0_BASE) {
+        reg_write(RISCV_DEBUG_REG_DBG_FEATURE_DISABLE, 0); // Clear debug feature disable in case it was set by previous kernel on TRISC0
+                                                             // e.g workaround for bug https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1372
+        regfile[p_gpr_unpack::L1_BUFFER_ADDR] = (((uint)l1_buffer) >> 4) - 1; //Store L1 buffer address for reduce input 1
+        sync_regfile_write(p_gpr_unpack::L1_BUFFER_ADDR);
+    }
+
+#ifdef PERF_DUMP
+    set_thread_id_parameter();
+    allocate_perf_buffer();
+    setup_fpu_perf_cnt();
+    record_dummy_math_event();
+#if OVERLAY_DECOUPLE == 1
+    update_overlay_decoupling_mailbox();
+#endif
+#endif
+
+    //while (ready_for_next_epoch())
+    {
+        run_kernel();
+    }
+
+    // Signal completion
+    tensix_sync();
+#ifdef PERF_DUMP
+#if OVERLAY_DECOUPLE == 1
+    reset_unpack_pack_sync();
+#endif
+    record_perf_dump_end_and_check_overflow();
+    // There has to be a tensix_sync() before this last pass.
+    last_trisc_perf_dump_to_dram();
+    tensix_sync();
+#endif
+
+    trisc_l1_mailbox_write(KERNEL_COMPLETE);
+
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc
new file mode 100644
index 00000000000..b2c39df3313
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc
@@ -0,0 +1,21 @@
+
+// This c-file's purpose is:
+// 1) include the generated list of kernels
+//      The files hold run_kernel() definition and inline kernel_main functions for every ckernel
+//      Need to make sure no other file includes these lists since they also include global parameter definitions
+// 2) instantiate global variables
+
+
+#include "ckernel_globals.h"
+
+#if defined(UCK_CHLKC_UNPACK) || defined(UCK_CHLKC_MATH) || defined(UCK_CHLKC_PACK)
+#include "chlkc_list.h"
+#else
+#include "ckernel_list.h"
+#endif
+
+// Global vars
+uint32_t unp_cfg_context = 0;
+uint32_t pack_sync_tile_dst_ptr = 0;
+uint32_t math_sync_tile_dst_index = 0;
+volatile uint32_t tt_l1_ptr l1_buffer[16] __attribute__ ((section (".text#"))) __attribute__ ((aligned (16)));
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc
new file mode 100644
index 00000000000..446e14cb8f6
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc
@@ -0,0 +1,301 @@
+
+#include "ckernel_perf_unpack_pack.h"
+#include "stream_interface.h"
+
+#pragma GCC diagnostic ignored "-Wunused-function"
+
+
+namespace ckernel
+{
+extern uint32_t perf_index; // Next 32-bit word slot to write in the active perf-buffer half.
+extern uint32_t perf_end; // Exclusive upper bound (in 32-bit words) for perf_index.
+// The perf buffer is double-buffered for spill_to_dram:
+// ncrisc moves one half to DRAM while trisc populates the other half.
+// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0].
+extern volatile uint32_t *perf_buf_base[2];
+// Selects the half of perf_buffer that trisc is currently writing into.
+extern uint8_t perf_buf_base_id;
+extern bool record_perf_events; // Gates the tile-count bookkeeping in increment_unpack_tiles / increment_pack_tiles.
+extern uint16_t current_outer_loop_iter; // Passed as the last argument of perf::get_event_id when recording stall events.
+extern uint8_t thread_id; // Indexes EPOCH_INFO_PTR->perf_dram_copy_req[] for this thread.
+extern int32_t dram_dump_req_local; // Local copy of this thread's perf_dram_copy_req counter.
+extern volatile uint* ncrisc_ack_addr; // Points at this thread's perf_dram_copy_ack counter.
+extern uint32_t header; // Header word stored at index 1 of a buffer half in PERF_DUMP_CONCURRENT mode.
+
+void allocate_perf_buffer() { // Selects this thread's perf buffer, write window and ncrisc handshake slots, then seeds the first buffer half.
+   std::int32_t perf_buf_base_addr;
+   if ((uint32_t)__firmware_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) { // The thread is identified by where its firmware starts: TRISC0 here.
+      perf_buf_base_addr = l1_mem::address_map::UNPACK_PACK_PERF_BUF_BASE_ADDR + 0*TRISC_PERF_BUF_SIZE;
+      perf_index = 2; // The first 4B value is always initialized to 0xbaddf00d.
+      if constexpr (PERF_DUMP_CONCURRENT == 1 || INTERMED_DUMP == 1) {
+         perf_end = TRISC_PERF_BUF_SIZE >> 3; // One half of the buffer, counted in 32-bit words (bytes / 2 / 4).
+      } else {
+         perf_end = 3; // Minimal initial window; NOTE(review): perf_end is presumably raised later by the event-recording setup - confirm.
+      }
+      dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[0];
+      ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[0];
+   } else if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { // TRISC1 uses the separate math perf buffer.
+      perf_buf_base_addr = l1_mem::address_map::MATH_PERF_BUF_BASE_ADDR;
+      perf_index = 4; // The first 4 32b regs are skipped in recording math perf counters.
+      perf_end = 16;
+
+      // Initialize dram_dump_req_local in the beginning of epoch.
+      // EPOCH_INFO_PTR->perf_dram_copy_req counters do not get reset between epochs.
+      dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[1];
+      ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[1];
+   } else { // Remaining thread (firmware at neither TRISC0_BASE nor TRISC1_BASE): second slot of the unpack/pack buffer.
+      perf_buf_base_addr = l1_mem::address_map::UNPACK_PACK_PERF_BUF_BASE_ADDR + TRISC_PERF_BUF_SIZE;
+      perf_index = 2; // The first 4B value is always initialized to 0xbaddf00d.
+      if constexpr (PERF_DUMP_CONCURRENT == 1 || INTERMED_DUMP == 1) {
+         perf_end = TRISC_PERF_BUF_SIZE >> 3; // One half of the buffer, counted in 32-bit words (bytes / 2 / 4).
+      } else {
+         perf_end = 3; // Minimal initial window; NOTE(review): perf_end is presumably raised later by the event-recording setup - confirm.
+      }
+      TTI_SEMINIT(1, 0, 1 << semaphore::PACK_DONE); // Initializes the PACK_DONE semaphore; NOTE(review): meaning of the (1, 0) operands is not visible here - confirm.
+      dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[2];
+      ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[2];
+   }
+   // Trisc starts dumping into the first half of the perf_buffers.
+   perf_buf_base_id = 0;
+   // Program the address for the first half of the perf buffer address.
+   perf_buf_base[0] = reinterpret_cast<volatile uint32_t *>(perf_buf_base_addr);
+   // Program the address for the second half of the perf buffer address.
+   perf_buf_base[1] = reinterpret_cast<volatile uint32_t *>(perf_buf_base_addr + (TRISC_PERF_BUF_SIZE >> 1));
+   perf_buf_base[perf_buf_base_id][0] = PERF_DUMP_END_SIGNAL; // Word 0 of the active half always holds the end signal.
+#if PERF_DUMP_CONCURRENT
+   volatile uint32_t* header_ptr = reinterpret_cast<volatile uint32_t *>(l1_mem::address_map::PERF_THREAD_HEADER);
+   header = header_ptr[0];
+   header = (header & 0xfff8ffff) | (((uint32_t)(thread_id) & 0b111) << 16); // Replace bits 18:16 with the low 3 bits of thread_id.
+   perf_buf_base[perf_buf_base_id][1] = header;
+   for (uint i = 2; i < perf_index; i++) { // Fill any remaining words below perf_index with 0xffffffff.
+      perf_buf_base[perf_buf_base_id][i] = 0xffffffff;
+   }
+#else
+   for (uint i = 1; i < perf_index; i++) { // No header word in this mode: fill words 1..perf_index-1 with 0xffffffff.
+      perf_buf_base[perf_buf_base_id][i] = 0xffffffff;
+   }
+#endif
+}
+
+void switch_perf_buffers() {
+   // Closes the active perf-buffer half, requests its DRAM dump from ncrisc, and makes the other half active.
+   if constexpr (INTERMED_DUMP || PERF_DUMP_CONCURRENT) {
+      for (uint i = perf_index; i < perf_end; i++) { // Pad the unused tail of the half being closed.
+         perf_buf_base[perf_buf_base_id][i] = 0xffffffff;
+      }
+      bool stalled = false;
+      uint32_t timestamp_stall_start_l = 0; // Zero-initialized; only meaningful when stalled is true.
+      uint32_t timestamp_stall_start_h = 0;
+      uint32_t timestamp_stall_end_l = 0;
+      uint32_t timestamp_stall_end_h = 0;
+
+      // Before advancing to the other half of perf-buffer, make sure ncrisc is done copying that half into dram
+      int32_t ack_local = *ncrisc_ack_addr;
+      if (ack_local <= dram_dump_req_local - 1) {
+         stalled = true;
+         timestamp_stall_start_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
+         timestamp_stall_start_h = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
+
+         while (ack_local <= dram_dump_req_local - 1) { // Spin until every issued request has been acknowledged.
+            ack_local = *ncrisc_ack_addr;
+         }
+
+         timestamp_stall_end_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
+         timestamp_stall_end_h = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
+      }
+
+      dram_dump_req_local++;
+      EPOCH_INFO_PTR->perf_dram_copy_req[thread_id] = dram_dump_req_local;
+
+      perf_buf_base_id = 1 - perf_buf_base_id;
+      if constexpr(INTERMED_DUMP) {
+         perf_index = 0;
+      } else {
+         // PERF_DUMP_CONCURRENT only: the new half starts with the end signal followed by the thread header.
+         perf_buf_base[perf_buf_base_id][0] = PERF_DUMP_END_SIGNAL;
+         perf_buf_base[perf_buf_base_id][1] = header;
+         perf_index = 2;
+      }
+      if (stalled && perf_index + 5 < perf_end - 1) { // Log the stall (start and end timestamps) only if six words fit.
+         uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::STALL_TRISC_FOR_DRAM_PERF_DUMP, current_outer_loop_iter);
+         perf_buf_base[perf_buf_base_id][perf_index] = event_id;
+         perf_buf_base[perf_buf_base_id][perf_index+1] = timestamp_stall_start_h;
+         perf_buf_base[perf_buf_base_id][perf_index+2] = timestamp_stall_start_l;
+         perf_buf_base[perf_buf_base_id][perf_index+3] = event_id;
+         perf_buf_base[perf_buf_base_id][perf_index+4] = timestamp_stall_end_h;
+         perf_buf_base[perf_buf_base_id][perf_index+5] = timestamp_stall_end_l;
+         perf_index += 6;
+      }
+   }
+}
+
+void last_trisc_perf_dump_to_dram() {
+   if (perf_index > 0) {
+      // Issues one more DRAM-dump request to ncrisc; nothing is requested when perf_index == 0.
+      // First wait until ncrisc has acknowledged every dump request issued so far.
+      int32_t ack_local = *ncrisc_ack_addr;
+      while (ack_local <= dram_dump_req_local - 1) {
+         ack_local = *ncrisc_ack_addr;
+      }
+
+      if constexpr (INTERMED_DUMP) {
+         if (thread_id == 1) {
+            dram_dump_req_local += 2; // NOTE(review): thread 1 bumps the counter by two - presumably covers both halves; confirm with the ncrisc side.
+         } else {
+            dram_dump_req_local++;
+         }
+      } else if constexpr (PERF_DUMP_CONCURRENT) {
+         dram_dump_req_local++;
+      } else {
+         dram_dump_req_local += 2; // NOTE(review): bump by two when neither dump mode is enabled - confirm with the ncrisc side.
+      }
+      EPOCH_INFO_PTR->perf_dram_copy_req[thread_id] = dram_dump_req_local; // Publish the new request count.
+   }
+}
+
+void increment_unpack_tiles(uint operand_idx, uint num_tiles) { // Adds num_tiles to operand_idx's 16-bit tile counter; two counters are packed per regfile entry.
+   if (record_perf_events && (perf_events_target_idx == 1)) {
+      if (operand_idx >= PERF_MAX_NUM_INPUTS) { // Operands at or beyond PERF_MAX_NUM_INPUTS are ignored.
+         return;
+      }
+      uint regfile_base_idx = p_gpr_unpack::PERF_UNPACK_NUM_TILES_0;
+      regfile_base_idx += (operand_idx >> 1); // Two consecutive operands share one register.
+      bool upper = operand_idx & 0b1; // Odd operands use bits 31:16, even operands bits 15:0.
+      uint32_t num_tiles_regfile = regfile[regfile_base_idx];
+      uint32_t current_num_tiles;
+      if (upper) {
+         current_num_tiles = (num_tiles_regfile >> 16) & 0xffff;
+         current_num_tiles += num_tiles;
+         regfile[regfile_base_idx] = (num_tiles_regfile & 0xffff) + ((current_num_tiles & 0xffff) << 16); // Keep the low counter, replace the high one (wraps at 16 bits).
+      } else {
+         current_num_tiles = (num_tiles_regfile + num_tiles) & 0xffff;
+         regfile[regfile_base_idx] = (num_tiles_regfile & 0xffff0000) + (current_num_tiles & 0xffff); // Keep the high counter, replace the low one (wraps at 16 bits).
+      }
+      sync_regfile_write(regfile_base_idx);
+   }
+}
+
+void increment_pack_tiles(uint num_tiles) { // Adds num_tiles to the PERF_PACK_NUM_TILES regfile entry when perf recording is active.
+   if (record_perf_events && (perf_events_target_idx == 1)) { // Same gating condition as increment_unpack_tiles.
+      regfile[p_gpr_pack::PERF_PACK_NUM_TILES] += num_tiles;
+      sync_regfile_write(p_gpr_pack::PERF_PACK_NUM_TILES);
+   }
+}
+
+#if OVERLAY_DECOUPLE == 1
+
+// Records an OUTPUT_NUM_TILES event and a wall-clock OUTPUT_TIMESTAMP event. This runs prior to set_perf_dump_flag_for_input so perf_end has to be adjusted
+void record_overlay_decoupled_output_bw_start(uint32_t num_tiles) {
+   if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) {
+      perf_end += 6; // Widen the write window by six words; NOTE(review): presumably room for the events recorded below - confirm.
+   }
+   if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) { // Clamp to the full buffer size expressed in 32-bit words.
+      perf_end = TRISC_PERF_BUF_SIZE >> 2;
+   }
+   uint32_t event_id = get_event_id(0, 0, perf::EventType::OUTPUT_NUM_TILES, perf_events_target_inputs[0]);
+   record_perf_value_and_check_overflow(event_id, num_tiles, 0);
+   event_id = get_event_id(0, 0, perf::EventType::OUTPUT_TIMESTAMP, perf_events_target_inputs[0]);
+   uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); // Low word is read before the high word.
+   uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
+   record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, 0);
+}
+
+void record_overlay_decoupled_output_bw_end() { // Records a closing wall-clock OUTPUT_TIMESTAMP event for the decoupled output push.
+   if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) {
+      perf_end += 6; // Widen the write window by six words; NOTE(review): presumably room for the event recorded below - confirm.
+   }
+   if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) { // Clamp to the full buffer size expressed in 32-bit words.
+      perf_end = TRISC_PERF_BUF_SIZE >> 2;
+   }
+   uint32_t event_id = get_event_id(0, 0, perf::EventType::OUTPUT_TIMESTAMP, perf_events_target_inputs[0]);
+   uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); // Low word is read before the high word.
+   uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
+   record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, 0);
+}
+
+void llk_push_all_packer_tiles_for_decoupling() { // Writes tile headers into the output stream buffer, then pushes tiles on the output stream and its forks until each stream's epoch_num_tiles is reached.
+   uint32_t operand = OPERAND_OUTPUT_START_INDEX;
+   uint32_t output = operand_to_output_index(operand);
+
+   // Populate the output buffer with headers: the first word of every tile-sized slot gets the tile size in words
+   uint32_t stream_buf_size_bytes = EPOCH_INFO_PTR->outputs[output]->buf_full_size_bytes;
+   uint32_t stream_buf_addr = EPOCH_INFO_PTR->outputs[output]->buf_base_addr;
+   uint32_t stream_msg_info_buf_ptr = (EPOCH_INFO_PTR->outputs[output]->msg_info_buf_start)*MEM_WORD_WIDTH;
+   uint32_t tile_size_words = *(volatile uint32_t tt_l1_ptr *)(stream_msg_info_buf_ptr); // Tile size (in MEM_WORD_WIDTH-byte words) is the first word of the msg-info buffer.
+   uint32_t tile_size_bytes = tile_size_words*MEM_WORD_WIDTH;
+   for (uint32_t tile_header_ptr = stream_buf_addr; tile_header_ptr < stream_buf_addr + stream_buf_size_bytes; tile_header_ptr += tile_size_bytes) {
+         *((uint32_t *)(tile_header_ptr)) = tile_size_words;
+   }
+   // Per-stream tile budget: slot 0 is the output stream itself, slots 1..num_fork_streams are its forks.
+   uint32_t total_num_tiles_to_push = 0;
+   uint32_t num_tiles_to_push[EPOCH_MAX_OUTPUT_FORKS+1];
+   uint32_t stream_id = EPOCH_INFO_PTR->outputs[output]->stream_id;
+   uint32_t active_stream_idx = get_active_stream_idx(stream_id);
+   volatile epoch_stream_info_t * l1_stream_info = EPOCH_INFO_PTR->active_streams[active_stream_idx];
+   for (int32_t k = 0; k < l1_stream_info->num_fork_streams+1; k++) {
+      uint32_t fork_active_streams_idx = k == 0 ? active_stream_idx : l1_stream_info->fork_idxs[k-1];
+      uint32_t epoch_num_tiles = EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->epoch_num_tiles;
+      num_tiles_to_push[k] = epoch_num_tiles;
+      total_num_tiles_to_push += epoch_num_tiles;
+   }
+   if (((l1_stream_info->flags & STREAM_MOVES_RAW_DATA) != 0) || l1_stream_info->legacy_pack) { // Only raw-data or legacy_pack output streams are driven here.
+
+      record_overlay_decoupled_output_bw_start(total_num_tiles_to_push);
+
+      while(total_num_tiles_to_push > 0) { // Poll all streams until every budget is exhausted.
+         uint32_t stream_msg_info_buf_ptr = (l1_stream_info->msg_info_buf_start)*MEM_WORD_WIDTH;
+         uint32_t tile_size_words = *(volatile uint32_t *)(stream_msg_info_buf_ptr);
+         uint32_t stream_buf_size_tiles = l1_stream_info->buf_size_tiles;
+         bool any_streams_busy = false; // NOTE(review): assigned but never read in this function.
+         for (int32_t k = 0; k < l1_stream_info->num_fork_streams+1; k++) {
+               uint32_t fork_active_streams_idx = k == 0 ? active_stream_idx : l1_stream_info->fork_idxs[k-1];
+               uint32_t fork_stream_id = k == 0 ? stream_id : EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->stream_id;
+               if (num_tiles_to_push[k] == 0) {
+                  continue;
+               }
+               uint32_t dram_output_no_push = ((EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->flags & STREAM_DRAM_NO_PUSH) != 0) || ((EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->flags & STREAM_MOVES_RAW_DATA) != 0);
+               if (dram_output_no_push) { // No relay on this path: only the operand's tiles-received counter is advanced.
+                  uint32_t tiles_left_in_phase = stream_src_endpoint_get_phase_tiles_count(fork_stream_id);
+                  uint16_t operand_tiles_received = (uint16_t)*get_operand_tiles_received_ptr(stream_id_to_operand(fork_stream_id));
+                  uint16_t operand_tiles_acked = (uint16_t)*get_operand_tiles_acked_ptr(stream_id_to_operand(fork_stream_id));
+                  uint16_t tiles_available = operand_tiles_received - operand_tiles_acked;// op_pack_tiles_ptr_sub(operand_tiles_received, operand_tiles_acked);
+                  uint32_t stream_buf_free_tiles = stream_buf_size_tiles - tiles_available;
+                  uint32_t num_tiles = tiles_left_in_phase > stream_buf_free_tiles ? stream_buf_free_tiles : tiles_left_in_phase; // min(tiles left in phase, free buffer slots)
+                  if (num_tiles > 0) {
+                     stream_set_tiles_left_in_phase(fork_stream_id, num_tiles);
+                     volatile uint32_t tt_reg_ptr* tiles_received_ptr = (volatile uint32_t tt_reg_ptr*)get_operand_tiles_received_ptr(stream_id_to_operand(fork_stream_id));
+                     operand_tiles_received = (uint16_t)tiles_received_ptr[0];
+                     uint16_t new_epoch_tiles_received = operand_tiles_received + num_tiles;// op_pack_tiles_ptr_add(operand_tiles_received, num_tiles);
+                     tiles_received_ptr[0] = new_epoch_tiles_received;
+
+                     num_tiles_to_push[k] -= num_tiles;
+                     total_num_tiles_to_push -= num_tiles;
+                  }
+               } else { // Other streams: relay whole tiles through the stream while its phase is active.
+                  uint32_t phase_active = stream_phase_is_active(fork_stream_id) && !is_dummy_phase(fork_stream_id);
+                  if (phase_active) {
+                     uint32_t tiles_left_in_phase = stream_src_endpoint_get_phase_tiles_count(fork_stream_id);
+                     uint32_t num_free_words = stream_get_free_words(fork_stream_id);
+                     uint32_t num_tiles = 0;
+                     uint32_t num_words = 0;
+                     while (num_words + tile_size_words <= num_free_words && num_tiles + 1 <= tiles_left_in_phase) { // Count whole tiles that fit in the free words and in the current phase.
+                           num_tiles++;
+                           num_words += tile_size_words;
+                     }
+                     if (num_tiles > 0) {
+                           stream_set_tiles_left_in_phase(fork_stream_id, num_tiles);
+                           stream_relay_tiles(fork_stream_id, num_tiles, num_words);
+
+                           num_tiles_to_push[k] -= num_tiles;
+                           total_num_tiles_to_push -= num_tiles;
+                     }
+                  }
+               }
+         }
+      }
+      record_overlay_decoupled_output_bw_end();
+   }
+}
+#endif
+
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc
index 238301e0566..baeba52c6c6 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc
@@ -1,9 +1,10 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
+/*
+ * SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+*/
 
 #include "ckernel_template.h"
-#include "debug/fw_debug.h"
 
 namespace ckernel
 {
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc
new file mode 100644
index 00000000000..35130c72520
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc
@@ -0,0 +1,10 @@
+// combining multiple C++ source files into a single file
+// to reduce the overhead of the compilation process and
+// improve build times
+#include "ckernel.cc"
+#include "ckernel_template.cc"
+#ifdef PERF_DUMP
+#include "ckernel_perf_unpack_pack.cc"
+#endif
+#include "ckernel_main.cc"
+#include "llk_io.cc" // sw stack specific io interface
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list b/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list
new file mode 100644
index 00000000000..2a66c11d1a6
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list
@@ -0,0 +1,2 @@
+ckernel.cc
+ckernel_template.cc
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_3c.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_3c.h
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h
index e19013f89e0..e205ec12747 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h
@@ -5,22 +5,23 @@
 #pragma once
 
 namespace ckernel {
+
+enum Dim { // NOTE(review): R/C/Z presumably name row/column/Z dimensions and RC/ZR pairs of them - confirm.
+  None      = 0,
+  R         = 1,
+  C         = 2,
+  Z         = 3,
+  RC        = 4,
+  ZR        = 5,
+  Invalid   = 0xFF, // Explicit out-of-sequence value, distinct from the 0..5 enumerators above.
+};
+
 enum ReduceDim {
     REDUCE_ROW,
     REDUCE_COL,
     REDUCE_SCALAR,
 };
 
-enum Dim {
-    None = 0,
-    R = 1,
-    C = 2,
-    Z = 3,
-    RC = 4,
-    ZR = 5,
-    Invalid = 0xFF,
-};
-
 enum TileDim {
     R_IDX = 0,
     C_IDX = 1,
@@ -96,7 +97,6 @@ enum ReluType {
     MAX_THRESHOLD_RELU,
 };
 
-
 enum SfpuType {
     tanh,
     hardtanh,
@@ -155,6 +155,9 @@ enum SfpuType {
     silu,
     mask,
     negative,
+    dequant_int32,
+    requant_int32,
+    quant_int32,
     unused,
 };
 }  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h
index 0dfa2c30fe4..8eb5e084934 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h
@@ -2,22 +2,19 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-
 #pragma once
 
 #include "ckernel_defs.h"
 #include "ckernel_include.h"
 #include "cmath_common.h"
-#include "chlkc_unpack_data_format.h"
 #ifdef PERF_DUMP
 #include "ckernel_perf_api.h"
 #endif
 
-#include "hostdevcommon/common_runtime_address_map.h"
 using namespace ckernel::math;
 
 template <DstSync Dst>
-inline void llk_math_wait_for_dest_available() {
+inline void _llk_math_wait_for_dest_available_() {
     // These liteweight functions for sync with packer imply
     // no mode change - entire epoch is either double buffer or single buffer
 #ifdef PERF_DUMP
@@ -30,7 +27,7 @@ inline void llk_math_wait_for_dest_available() {
 }
 
 template <DstSync Dst = SyncFull, bool is_fp32_dest_acc_en = false>
-inline void llk_math_dest_section_done() {
+inline void _llk_math_dest_section_done_() {
 #ifdef PERF_DUMP
     if constexpr(MATH_PACK_DECOUPLE) {
         return;
@@ -51,7 +48,7 @@ inline void llk_math_dest_section_done() {
 }
 
 template <DstSync Dst, bool is_fp32_dest_acc_en = false>
-inline void llk_math_pack_sync_init() {
+inline void _llk_math_pack_sync_init_() {
 #ifdef PERF_DUMP
     if constexpr(MATH_PACK_DECOUPLE) {
         return;
@@ -87,65 +84,42 @@ inline void llk_math_pack_sync_init() {
     }
 }
 
-inline void llk_math_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {
-    debug_dump(data, byte_size);
-}
+template <bool mail2math=true, bool mail2pack=true> // mail2pack is not referenced in this body.
+inline void _llk_math_get_tile_(std::uint32_t tile_index, std::uint32_t* p_tile) { // Stores in *p_tile a word read from the unpack thread's mailbox, or 0 when mail2math is false; tile_index is unused here.
+    if constexpr (mail2math) {
+       *p_tile = mailbox_read(ThreadId::UnpackThreadId);
+    } else {
+       *p_tile = 0x0;
+    }
 
-inline void llk_math_debug_dump_seek(std::uint8_t offset) {
-    debug_dump_seek(offset);
 }
 
-inline void llk_math_reconfig_data_format(const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand, const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
-    std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
-    std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
-    std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
-    std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
-
-    if((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id]) && (unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) {
-        uint config_data = (unpack_dst_format[new_srca_operand_id] << ALU_FORMAT_SPEC_REG0_SrcA_SHAMT) | (unpack_dst_format[new_srcb_operand_id] << ALU_FORMAT_SPEC_REG1_SrcB_SHAMT);
-        constexpr uint config_mask = ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK;
-        cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_ADDR32, 0, config_mask>(config_data);
-
-    } else if((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])){
-        cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_RMW>((uint)unpack_dst_format[new_srca_operand_id]);
-    } else if((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])){
-        cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG1_SrcB_RMW>((uint)unpack_dst_format[new_srcb_operand_id]);
+template <bool mail2math=true, bool mail2pack=true> // mail2pack is not referenced in this body.
+inline void _llk_math_release_tile_() { // Calls semaphore_get on UNPACK_OPERAND_SYNC when mail2math is true; otherwise does nothing.
+    if constexpr (mail2math) {
+       semaphore_get(semaphore::UNPACK_OPERAND_SYNC); // NOTE(review): presumably tells the unpacker the tile is no longer needed - confirm.
     }
 }
 
-inline void llk_math_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) {
-    std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
-    std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
-
-    uint config_data = (unpack_dst_format[new_srca_operand_id] << ALU_FORMAT_SPEC_REG0_SrcA_SHAMT) | (unpack_dst_format[new_srcb_operand_id] << ALU_FORMAT_SPEC_REG1_SrcB_SHAMT);
-    constexpr uint config_mask = ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK;
-    cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_ADDR32, 0, config_mask>(config_data);
+inline void _llk_math_debug_dump_(std::uint8_t *data, std::uint32_t byte_size) { // Thin wrapper: forwards data and byte_size to debug_dump().
+    debug_dump(data, byte_size);
 }
 
-inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) {
-    std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
-    std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
-
-    if((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])){
-        cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_RMW>((uint)unpack_dst_format[new_srca_operand_id]);
-    }
+inline void _llk_math_debug_dump_seek_(std::uint8_t offset) { // Thin wrapper: forwards offset to debug_dump_seek().
+    debug_dump_seek(offset);
 }
 
-inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_new_operand) {
-    std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
-    cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_RMW>((uint)unpack_dst_format[new_srca_operand_id]);
+inline void _llk_math_reconfig_data_format_srca_(const std::uint32_t srca_data_format) { // Unconditionally writes srca_data_format to the ALU SrcA format field; takes a data format, not an operand id.
+    cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_RMW>(srca_data_format);
 }
 
-inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
-    std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
-    std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
-
-    if((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])){
-        cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG1_SrcB_RMW>((uint)unpack_dst_format[new_srcb_operand_id]);
-    }
+inline void _llk_math_reconfig_data_format_srcb_(const std::uint32_t srcb_data_format) { // Unconditionally writes srcb_data_format to the ALU SrcB format field; takes a data format, not an operand id.
+    cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG1_SrcB_RMW>(srcb_data_format);
 }
 
-inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) {
-    std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
-    cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG1_SrcB_RMW>((uint)unpack_dst_format[new_srcb_operand_id]);
+inline void _llk_math_reconfig_data_format_(const std::uint32_t srca_data_format, const std::uint32_t srcb_data_format) {
+    // Updates the SrcA and SrcB ALU format fields with one masked read-modify-write; no old-vs-new comparison is made.
+    uint config_data = (srca_data_format << ALU_FORMAT_SPEC_REG0_SrcA_SHAMT) | (srcb_data_format << ALU_FORMAT_SPEC_REG1_SrcB_SHAMT);
+    constexpr uint config_mask = ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK;
+    cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_ADDR32, 0, config_mask>(config_data); // Single write through the SrcA register address with the combined mask.
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h
index dbe1512ee0c..0a70d430497 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h
@@ -7,13 +7,11 @@
 #include "ckernel_template.h"
 #include "cmath_common.h"
 #include "llk_math_common.h"
-#include "llk_param_structs.h"
 
 using namespace ckernel;
 
 // local function declarations
 inline void eltwise_binary_configure_addrmod();
-inline void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0);
 
 template <EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
 inline void eltwise_binary_reuse_dest_as_src() {
@@ -32,16 +30,15 @@ template <
     int NUM_FIDELITY_PHASES = 0,
     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
     bool is_fp32_dest_acc_en = false>
-inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const std::uint32_t num_faces_b, uint dst_index, const bool clear_fp32_dst_acc) {
+inline void _llk_math_eltwise_binary_(const std::uint32_t num_faces, uint dst_index, const bool clear_fp32_dst_acc) {
+
     constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0);
     constexpr uint32_t ZERO_ACC_MODE = p_zeroacc::CLR_16;
 
-    // Todo: do something with num_faces_a, num_faces_b
-
     if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) {
         math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(math_sync_tile_dst_index);
 
-            if constexpr (eltwise_binary_type == ELWMUL) {
+        if constexpr (eltwise_binary_type == ELWMUL) {
             if (is_fp32_dest_acc_en && clear_fp32_dst_acc) {
                 #pragma GCC unroll 0
                 for (std::uint32_t i = 0; i < 8; i++) {
@@ -59,7 +56,7 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const
                   (Dst == DstSync::SyncTile2)),
                 "Dst clear in DstSync::SyncTile16 or DstSync::SyncTile2 dst sync mode is not supported!");
             /*
-                if (clear_dest_acc) {
+            if (clear_dest_acc) {
                 if constexpr (is_fp32_dest_acc_en) {
                     #pragma GCC unroll 0
                     for(std::uint32_t i = 0; i < 8; i++) {
@@ -85,13 +82,13 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const
 #pragma GCC unroll 0
             for (std::uint32_t n = 0; n < outerloop; n++) {  // N-num faces
                 eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
-            ckernel_template::run(instrn_buffer);
+                ckernel_template::run(instrn_buffer);
             }
             TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0);
 #pragma GCC unroll 0
             for (std::uint32_t n = 0; n < outerloop; n++) {  // N-num faces
                 eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
-            ckernel_template::run(instrn_buffer);
+                ckernel_template::run(instrn_buffer);
             }
             TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0);
         } else {
@@ -99,8 +96,8 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const
 #pragma GCC unroll 0
             for (std::uint32_t n = 0; n < outerloop; n++) {  // N-num faces
                 eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
-            ckernel_template::run(instrn_buffer);
-        }
+                ckernel_template::run(instrn_buffer);
+            }
             // Manually clear B once mop is done for scaler bcast
             if constexpr (src_b_bcast_type == BroadcastType::SCALAR) {
                 TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, p_setrwc::SET_D);
@@ -135,8 +132,8 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const
                             TT_ZEROACC(ZERO_ACC_MODE, ADDR_MOD_1, ((get_dest_buffer_base() >> 4) + (dst_index << 2)) + (0 +         n)); // Clear faces 0 & 1
                         }
                     }
-                ckernel_template::run(instrn_buffer);
-            }
+                    ckernel_template::run(instrn_buffer);
+                }
             }
             TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0);
             if constexpr (high_fidelity) {
@@ -151,7 +148,7 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const
                             TT_ZEROACC(ZERO_ACC_MODE, ADDR_MOD_1, ((get_dest_buffer_base() >> 4) + (dst_index << 2)) + (2 +         n)); // Clear faces 2 & 3
                         }
                     }
-                ckernel_template::run(instrn_buffer);
+                    ckernel_template::run(instrn_buffer);
                 }
             } else {
 #pragma GCC unroll 0
@@ -165,16 +162,16 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const
                             TT_ZEROACC(ZERO_ACC_MODE, ADDR_MOD_1, ((get_dest_buffer_base() >> 4) + (dst_index << 2)) +  (2 +         n)); // Clear faces 2 & 3
                         }
                     }
-                ckernel_template::run(instrn_buffer);
-            }
+                    ckernel_template::run(instrn_buffer);
+                }
             }
             TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0);
         } else {
             // Row and no broadcasted behaves similarly
-            constexpr uint32_t outerloop = (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE) ? 4 : 1;
+            const uint32_t outerloop = (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE) ? num_faces : 1;
             if constexpr (high_fidelity) {
 #pragma GCC unroll 0
-                for (std::uint32_t n = 0; n < 4; n++) {  // N-num faces
+                for (std::uint32_t n = 0; n < num_faces; n++) {  // N-num faces
                     eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
                     if constexpr (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE) {
                         if (is_fp32_dest_acc_en && clear_fp32_dst_acc) {
@@ -198,9 +195,9 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const
                             TT_ZEROACC(ZERO_ACC_MODE, ADDR_MOD_1, ((get_dest_buffer_base() >> 4) + (dst_index << 2)) + n);
                         }
                     }
-                ckernel_template::run(instrn_buffer);
+                    ckernel_template::run(instrn_buffer);
+                }
             }
-        }
             if constexpr (src_b_bcast_type == BroadcastType::SCALAR) {
                 TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, p_setrwc::SET_D);
             }
@@ -211,33 +208,6 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const
     math::clear_dst_reg_addr();
 }
 
-template <
-    EltwiseBinaryType eltwise_binary_type,
-    BroadcastType src_b_bcast_type,
-    DstSync Dst = DstSync::SyncFull,
-    int NUM_FIDELITY_PHASES = 0,
-    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
-    bool is_fp32_dest_acc_en = false>
-inline void llk_math_eltwise_binary(uint dst_index, const bool clear_fp32_dst_acc = true) {
-    llk_math_eltwise_binary_impl<eltwise_binary_type, src_b_bcast_type, Dst, NUM_FIDELITY_PHASES, binary_reuse_dest, is_fp32_dest_acc_en>(4, 4, dst_index, clear_fp32_dst_acc);
-}
-
-template <
-    EltwiseBinaryType eltwise_binary_type,
-    BroadcastType src_b_bcast_type,
-    DstSync Dst = DstSync::SyncFull,
-    int NUM_FIDELITY_PHASES = 0,
-    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
-    bool is_fp32_dest_acc_en = false>
-inline void llk_math_eltwise_binary(const std::uint32_t operand_A, const std::uint32_t operand_B, uint dst_index, const bool clear_fp32_dst_acc = true) {
-    const std::uint32_t id_A = get_operand_id(operand_A);
-    const std::uint32_t id_B = get_operand_id(operand_B);
-
-    const std::uint32_t num_faces_A = get_num_faces(id_A);
-    const std::uint32_t num_faces_B = get_num_faces(id_B);
-
-    llk_math_eltwise_binary_impl<eltwise_binary_type, src_b_bcast_type, Dst, NUM_FIDELITY_PHASES, binary_reuse_dest, is_fp32_dest_acc_en>(num_faces_A, num_faces_B, dst_index, clear_fp32_dst_acc);
-}
 
 template <EltwiseBinaryType eltwise_binary_type, BroadcastType bcast_type>
 inline void eltwise_binary_configure_addrmod() {
@@ -287,11 +257,11 @@ template <
     BroadcastType bcast_type,
     int NUM_FIDELITY_PHASES = 0,
     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-inline void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0, const std::uint32_t operand_id = 0) {
+inline void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0, const std::uint32_t num_faces = 4) {
     constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0);
     const uint addr_mod = ADDR_MOD_0;
     constexpr uint innerloop = 16 >> 3;  // 8 rows per eltwise op at a time.
-    uint outerloop = get_num_faces(operand_id);
+    uint outerloop = num_faces;
     auto broadcast_type = p_elwise::SRCB_NO_BCAST;
     if constexpr (bcast_type == BroadcastType::COL) {
         // The mop only runs for 2 outer loops and mop is called twice for col broadcast
@@ -362,13 +332,13 @@ template <
     BroadcastType src_b_bcast_type,
     int NUM_FIDELITY_PHASES = 0,
     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-inline void llk_math_eltwise_binary_init_impl(const std::uint32_t operand_id, const std::uint32_t transpose, const std::uint32_t acc_to_dest) {
-    // todo: do something with num_faces
+inline void _llk_math_eltwise_binary_init_(const std::uint32_t num_faces, const std::uint32_t transpose, const std::uint32_t acc_to_dest) {
+
     eltwise_binary_configure_addrmod<eltwise_binary_type, src_b_bcast_type>();
 
     if constexpr (
         (eltwise_binary_type == ELWADD) || (eltwise_binary_type == ELWSUB) || (eltwise_binary_type == ELWMUL)) {
-        eltwise_binary_configure_mop<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(acc_to_dest, operand_id);
+        eltwise_binary_configure_mop<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(acc_to_dest, num_faces);
     } else {
         FWASSERT("Unsupported op!", false);
     }
@@ -377,24 +347,3 @@ inline void llk_math_eltwise_binary_init_impl(const std::uint32_t operand_id, co
 
     math::reset_counters(p_setrwc::SET_ABD_F);
 }
-
-// Version with no operand
-template <
-    EltwiseBinaryType eltwise_binary_type,
-    BroadcastType src_b_bcast_type,
-    int NUM_FIDELITY_PHASES = 0,
-    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-inline void llk_math_eltwise_binary_init(const std::uint32_t transpose=0, const std::uint32_t acc_to_dest = 0) {
-    llk_math_eltwise_binary_init_impl<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(0, transpose, acc_to_dest);
-}
-
-// Version with operands
-template <
-    EltwiseBinaryType eltwise_binary_type,
-    BroadcastType src_b_bcast_type,
-    int NUM_FIDELITY_PHASES = 0,
-    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-inline void llk_math_eltwise_binary_init_with_operands(const std::uint32_t operand_A, const std::uint32_t operand_B, const std::uint32_t transpose=0, const std::uint32_t acc_to_dest = 0) {
-    const std::uint32_t operand_id = get_operand_id(operand_A); // operand_id is used to extract tile dim data which is the same for both operands
-    llk_math_eltwise_binary_init_impl<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(operand_id, transpose, acc_to_dest);
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h
new file mode 100644
index 00000000000..9e23dab17f2
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h
@@ -0,0 +1,119 @@
+/*
+ * SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+*/
+
+#pragma once
+
+#include "ckernel_include.h"
+#include "ckernel_template.h"
+#include <type_traits>
+
+#include "cmath_common.h"
+#include "llk_math_common.h"
+#include "ckernel_globals.h"
+#include "ckernel_sfpu.h"
+
+using namespace ckernel;
+// local function declarations
+template <SfpuType sfpu_op>
+inline void eltwise_binary_sfpu_configure_addrmod(){
+    // Program ADDR_MOD_7 with zero increments on srca, srcb and dest.
+    // NOTE: this kernel is typically paired with A2D, which uses ADDR_MOD_0
+    //       and ADDR_MOD_2, so a slot that does not clash with those is used.
+    addr_mod_t zero_incr_mod{
+        .srca = {.incr = 0},
+        .srcb = {.incr = 0},
+        .dest = {.incr = 0},
+    };
+    zero_incr_mod.set(ADDR_MOD_7);
+
+}
+inline void eltwise_binary_sfpu_configure_mop();
+
+template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
+inline void _llk_math_eltwise_binary_sfpu_(
+    const uint face_r_dim,  // read only in Dim::R mode, to derive the per-face iteration count
+    const uint num_faces,   // <4 changes the Dim::R iteration count; >2 vs <=2 changes the Dim::C skip pattern
+    uint dst_index_a,
+    uint dst_index_b,       // min(a, b) is the dst write index (non-SyncTile modes); |a - b| replaces param0
+    int vector_mode = (int)Dim::RC,
+    uint param0 = 0,        // NOTE: the caller-supplied value is discarded; it is overwritten below
+    uint param1 = 0,
+    uint param2 = 0,
+    uint param3 = 0,
+    uint param4 = 0,
+    uint param5 = 0) {
+    constexpr int ITERATIONS = 8;
+    uint dst_index = (dst_index_a <= dst_index_b) ? dst_index_a : dst_index_b; // lower of the two dst indices
+    param0 = (dst_index_a > dst_index_b) ? dst_index_a-dst_index_b : dst_index_b-dst_index_a; // absolute distance between the two indices
+    if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) {
+        math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(math_sync_tile_dst_index); // dst_index is not used in these sync modes
+    } else {
+        math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(dst_index);
+    }
+    math::set_addr_mod_base();
+    TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH);
+    if (vector_mode == (int)Dim::R) {
+        // Row-vector mode: process Face0 and Face1 only, running `iterations` SFPU iterations per face
+        const int iterations = (num_faces < 4) ?
+                                    ((face_r_dim <= 2) ? 2 : face_r_dim/2) : 2; // At least 2 iterations for odd and even columns
+#pragma GCC unroll 0
+        for (int face = 0; face < 2; face++) {
+            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS>(iterations, param0, param1, param2, param3, param4, param5);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+        }
+        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); // NOTE(review): these four increments presumably step over the two unprocessed faces -- confirm
+        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+    } else if (vector_mode == (int)Dim::C) {
+        // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0 + Face1 if tile is 32x16 -- All iterations for full face
+#pragma GCC unroll 0
+        for (int face = 0; face < 2; face++) {
+            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS>(ITERATIONS, param0, param1, param2, param3, param4, param5);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            if (num_faces>2) { // Skip next 2 faces if tile is 32x32
+                TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+                TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            }
+        }
+        if (num_faces<=2) { // no in-loop skip happened, so issue the four extra increments once after the loop
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+        }
+    } else {
+        // Do all four faces, and iterate through all 4 blocks of 4 rows each
+#pragma GCC unroll 0
+        for (int face = 0; face < 4; face++) {
+            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS>(ITERATIONS, param0, param1, param2, param3, param4, param5);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+        }
+    }
+    math::clear_dst_reg_addr();
+
+    TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::WAIT_SFPU); // NOTE(review): presumably holds the addr-mod base clear until the SFPU is done -- confirm
+    math::clear_addr_mod_base();
+}
+
+// Binary-SFPU init: configures the addr mod, runs sfpu_init for sfpu_op, then resets counters.
+template <SfpuType sfpu_op, bool APPROXIMATE>
+inline void _llk_math_eltwise_binary_sfpu_init_(
+    uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) {
+    eltwise_binary_sfpu_configure_addrmod< sfpu_op >();
+    // Only the int32 quant/requant/dequant ops hand param0 to sfpu_init.
+    constexpr bool forwards_param0 = (sfpu_op == SfpuType::quant_int32) ||
+        (sfpu_op == SfpuType::requant_int32) || (sfpu_op == SfpuType::dequant_int32);
+    if constexpr (forwards_param0) {
+        sfpu::sfpu_init<APPROXIMATE>(sfpu_op, param0);
+    } else {
+        sfpu::sfpu_init<APPROXIMATE>(sfpu_op);
+    }
+    math::reset_counters(p_setrwc::SET_ABD_F);
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h
index aede5275e89..f26d2ca3f46 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_param_structs.h"
 
 #include "ckernel_include.h"
 #include "ckernel_template.h"
@@ -17,35 +16,43 @@ using namespace ckernel;
 // local function declarations
 inline void eltwise_unary_configure_addrmod();
 
-template <DataCopyType type, BroadcastType src_b_bcast_type = BroadcastType::NONE, DstSync Dst = DstSync::SyncFull, bool is_fp32_dest_acc_en = false>
-inline void llk_math_eltwise_unary_datacopy(uint dst_index) {
-    if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) {
-        math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(math_sync_tile_dst_index);
+template <DataCopyType type, BroadcastType src_b_bcast_type = BroadcastType::NONE, DstSync Dst = DstSync::SyncFull, bool is_fp32_dest_acc_en = false, bool unpack_to_dest = false>
+inline void _llk_math_eltwise_unary_datacopy_(const std::uint32_t dst_index, const std::uint32_t src_format, const std::uint32_t dst_format) {
+
+    if (unpack_to_dest && math::is_32bit_input(src_format, dst_format)) {
+        math_unpack_to_dest_math_ready();
+        math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32, true>(dst_index);
+        math::math_unpack_to_dest_tile_ready();
     } else {
-        math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(dst_index);
-    }
 
-    if constexpr (type == A2D) {
-        ckernel_template::run(instrn_buffer);
-    } else if constexpr (type == B2D) {
-        if constexpr (src_b_bcast_type == BroadcastType::SCALAR) {
-            // Manually clear B once mop is done
-            ckernel_template::run(instrn_buffer);
-            TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0);
-        } else if constexpr (src_b_bcast_type == BroadcastType::COL) {
-            // Mop for col broadcast only does 2 outerloops.  Needs to clear B manually and call twice
-            ckernel_template::run(instrn_buffer);
-            TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0);
-            ckernel_template::run(instrn_buffer);
-            TTI_SETRWC(p_setrwc::CLR_AB, 0, 0, 0, 0, 0);
+        if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) {
+            math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(math_sync_tile_dst_index);
         } else {
+            math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(dst_index);
+        }
+
+        if constexpr (type == A2D) {
             ckernel_template::run(instrn_buffer);
+        } else if constexpr (type == B2D) {
+            if constexpr (src_b_bcast_type == BroadcastType::SCALAR) {
+                // Manually clear B once mop is done
+                ckernel_template::run(instrn_buffer);
+                TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0);
+            } else if constexpr (src_b_bcast_type == BroadcastType::COL) {
+                // Mop for col broadcast only does 2 outerloops.  Needs to clear B manually and call twice
+                ckernel_template::run(instrn_buffer);
+                TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0);
+                ckernel_template::run(instrn_buffer);
+                TTI_SETRWC(p_setrwc::CLR_AB, 0, 0, 0, 0, 0);
+            } else {
+                ckernel_template::run(instrn_buffer);
+            }
+        } else {
+            FWASSERT("Unsupported op!", false);
         }
-    } else {
-        FWASSERT("Unsupported op!", false);
-    }
 
-    math::clear_dst_reg_addr();
+        math::clear_dst_reg_addr();
+    }
 }
 
 template <DataCopyType type, BroadcastType bcast_type = BroadcastType::NONE>
@@ -102,11 +109,10 @@ inline void eltwise_unary_configure_addrmod() {
 }
 
 template <DataCopyType type, BroadcastType bcast_type = BroadcastType::NONE>
-inline void eltwise_unary_configure_mop(uint rows_per_inst, uint total_rows, const uint operand_id) {
+inline void eltwise_unary_configure_mop(uint rows_per_inst, uint total_rows, const uint num_faces) {
     // always move 32x32 tile, packed as 16x16x4
 
     if constexpr (type == A2D) {
-        const std::uint32_t num_faces = get_num_faces(operand_id);
         uint addr_mod = (rows_per_inst == p_mova2d::MOV_1_ROW) ? ADDR_MOD_0 : ADDR_MOD_2;
         uint innerloop = (rows_per_inst == p_mova2d::MOV_1_ROW) ? total_rows : (total_rows >> 3);
         uint outerloop = num_faces;
@@ -160,15 +166,14 @@ inline void eltwise_unary_configure_mop(uint rows_per_inst, uint total_rows, con
 
 template <DataCopyType type, BroadcastType src_b_bcast_type = BroadcastType::NONE>
 // within_face_16x16_transpose is used by unpacker, math does not transpose
-inline void llk_math_eltwise_unary_datacopy_init(const std::uint32_t transpose_of_faces=0 /*unused*/, const std::uint32_t within_face_16x16_transpose=0 /* unused */, const std::uint32_t operand = 0) {
-    const std::uint32_t operand_id = get_operand_id(operand);
+inline void _llk_math_eltwise_unary_datacopy_init_(const std::uint32_t transpose_of_faces=0 /*unused*/, const std::uint32_t within_face_16x16_transpose=0 /* unused */, const std::uint32_t num_faces = 4) {
 
     eltwise_unary_configure_addrmod<type, src_b_bcast_type>();
 
     if constexpr (type == A2D) {
-        eltwise_unary_configure_mop<type, src_b_bcast_type>(p_mova2d::MOV_8_ROWS, 16, operand_id);
+        eltwise_unary_configure_mop<type, src_b_bcast_type>(p_mova2d::MOV_8_ROWS, 16, num_faces);
     } else if constexpr (type == B2D) {
-        eltwise_unary_configure_mop<type, src_b_bcast_type>(p_movb2d::MOV_4_ROWS, 16, operand_id);
+        eltwise_unary_configure_mop<type, src_b_bcast_type>(p_movb2d::MOV_4_ROWS, 16, num_faces);
     } else {
         FWASSERT("Unsupported op!", false);
     }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h
index d781a4160dc..3f83bb707b0 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h
@@ -2,16 +2,13 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_param_structs.h"
-
 #include "ckernel_include.h"
 #include "ckernel_template.h"
 #include <type_traits>
 
 #include "cmath_common.h"
 #include "llk_math_common.h"
-#include "llk_format_conversions.h"
 #include "ckernel_globals.h"
 #include "ckernel_sfpi.h"
 
@@ -159,24 +156,3 @@ template <DstSync dst_sync = DstSync::SyncFull>
 inline void llk_math_eltwise_unary_sfpi_test19(uint dst_index) {
     llk_math_eltwise_unary_sfpi<SfpiTestType::test19, dst_sync>(dst_index);
 }
-
-//Logical Not
-template <DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpi_logical_not(uint dst_index) {
-    llk_math_eltwise_unary_sfpi<SfpiTestType::logical_not, dst_sync>(dst_index);
-}
-
-inline void llk_math_eltwise_unary_sfpi_logical_not_init() {
-  llk_math_eltwise_unary_sfpi_init();
-}
-
-//Bitwise Complement
-template <DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpi_bitwise_complement(uint dst_index) {
-    llk_math_eltwise_unary_sfpi<SfpiTestType::bitwise_complement, dst_sync>(dst_index);
-}
-
-
-inline void llk_math_eltwise_unary_sfpi_bitwise_complement_init() {
-  llk_math_eltwise_unary_sfpi_init();
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h
index e695f53e2bd..ccd0dc293ff 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h
@@ -3,20 +3,16 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_param_structs.h"
-
 #include "ckernel_include.h"
 #include "ckernel_template.h"
 #include <type_traits>
 
 #include "cmath_common.h"
 #include "llk_math_common.h"
-#include "llk_format_conversions.h"
 #include "ckernel_globals.h"
 #include "ckernel_sfpu.h"
 
-namespace ckernel {
-
+using namespace ckernel;
 template <SfpuType sfpu_type>
 void static_assert_sfpu_type_dependent() {
     static_assert(sfpu_type == SfpuType::unused, "sfpu_type exception");
@@ -37,16 +33,20 @@ inline void eltwise_unary_sfpu_configure_addrmod(){
 }
 inline void eltwise_unary_sfpu_configure_mop();
 
-template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu(
+template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull, bool IS_INT_SFPU_EN=false>
+inline void _llk_math_eltwise_unary_sfpu_(
+    const uint face_r_dim,
+    const uint num_faces,
     uint dst_index,
-    int vector_mode = Dim::RC,
+    int vector_mode = (int)Dim::RC,
     uint param0 = 0,
     uint param1 = 0,
     uint param2 = 0,
     uint param3 = 0,
     uint param4 = 0,
     uint param5 = 0) {
+
+    constexpr int ITERATIONS = 8;
     if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) {
         math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(math_sync_tile_dst_index);
     } else {
@@ -54,25 +54,33 @@ inline void llk_math_eltwise_unary_sfpu(
     }
     math::set_addr_mod_base();
     TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH);
-    if (vector_mode == Dim::R) {
+    if (vector_mode == (int)Dim::R) {
         // Do a row vector, Face0 + Face1 -- first iteration (first row)
-        const int ITERATIONS = 1;
+        const int iterations = (num_faces < 4) ?
+                                    ((face_r_dim <= 2) ? 2 : face_r_dim/2) : 2; // At least 2 iterations for odd and even columns
 #pragma GCC unroll 0
         for (int face = 0; face < 2; face++) {
-            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS>(param0, param1, param2, param3, param4, param5);
+            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(iterations, param0, param1, param2, param3, param4, param5);
             TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
             TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
         }
-        // Skip the next 2 faces
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-    } else if (vector_mode == Dim::C) {
-        // Do a column vector, Face0 + Face2 -- All iterations for full face
+    } else if (vector_mode == (int)Dim::C) {
+        // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0 + Face1 if tile is 32x16 -- All iterations for full face
 #pragma GCC unroll 0
         for (int face = 0; face < 2; face++) {
-            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0>(param0, param1, param2, param3, param4, param5);
+            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(ITERATIONS, param0, param1, param2, param3, param4, param5);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            if (num_faces>2) { // Skip next 2 faces if tile is 32x32
+                TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+                TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
+            }
+        }
+        if (num_faces<=2) {
             TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
             TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
             TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
@@ -82,7 +90,7 @@ inline void llk_math_eltwise_unary_sfpu(
         // Do all four faces, and iterate through all 4 blocks of 4 rows each
 #pragma GCC unroll 0
         for (int face = 0; face < 4; face++) {
-            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0>(param0, param1, param2, param3, param4, param5);
+            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(ITERATIONS, param0, param1, param2, param3, param4, param5);
             TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
             TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
         }
@@ -94,7 +102,7 @@ inline void llk_math_eltwise_unary_sfpu(
 }
 
 template <SfpuType sfpu_op, bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_init(
+inline void _llk_math_eltwise_unary_sfpu_init_(
     uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) {
     eltwise_unary_sfpu_configure_addrmod< sfpu_op >();
     if constexpr (sfpu_op == SfpuType::dropout) {
@@ -104,304 +112,3 @@ inline void llk_math_eltwise_unary_sfpu_init(
     }
     math::reset_counters(p_setrwc::SET_ABD_F);
 }
-
-// New LLK SFPU APIs
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::rsqrt, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_rsqrt_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::rsqrt, APPROXIMATE>();
-}
-
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::log, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_log_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::log, APPROXIMATE>();
-}
-
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_log_with_base(uint dst_index,uint base_scale) {
-    llk_math_eltwise_unary_sfpu<SfpuType::log_with_base, APPROXIMATE, dst_sync>(dst_index,base_scale);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_log_with_base_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::log_with_base, APPROXIMATE>();
-}
-
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::tanh, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-}
-
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_signbit(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::signbit, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_signbit_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::signbit, APPROXIMATE>();
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_tanh_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::tanh, APPROXIMATE>();
-}
-
-//sign
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_sign(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::sign, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_sign_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::sign, APPROXIMATE>();
-}
-template <DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_dropout(uint dst_index, int vector_mode, int integer_dropout, int scale_factor) {
-    constexpr bool dont_care = false;
-    llk_math_eltwise_unary_sfpu<SfpuType::dropout, dont_care, dst_sync>(dst_index, vector_mode, integer_dropout, scale_factor);
-}
-
-inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) {
-    constexpr bool dont_care = false;
-    constexpr uint dont_care_param = 0;
-
-    llk_math_eltwise_unary_sfpu_init<SfpuType::dropout, dont_care>(dont_care_param, dont_care_param, seed);
-}
-
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::sigmoid, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_sigmoid_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::sigmoid, APPROXIMATE>();
-}
-
-//EQZ
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_eqz(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::equal_zero, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_eqz_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::equal_zero, APPROXIMATE>();
-}
-
-//NEZ
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_nez(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::not_equal_zero, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_nez_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::not_equal_zero, APPROXIMATE>();
-}
-
-//LTZ
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_ltz(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::less_than_zero, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_ltz_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::less_than_zero, APPROXIMATE>();
-}
-
-//GTZ
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_gtz(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::greater_than_zero, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_gtz_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::greater_than_zero, APPROXIMATE>();
-}
-
-//LEZ
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_lez(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::less_than_equal_zero, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_lez_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::less_than_equal_zero, APPROXIMATE>();
-}
-
-//GEZ
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_gez(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::greater_than_equal_zero, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_gez_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::greater_than_equal_zero, APPROXIMATE>();
-}
-
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::max, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_max_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::max, APPROXIMATE>();
-}
-
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::square, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_square_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::square, APPROXIMATE>();
-}
-
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::power, APPROXIMATE, dst_sync>(dst_index, vector_mode, pow);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_power_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::power, APPROXIMATE>();
-}
-
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::abs, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_abs_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::abs, APPROXIMATE>();
-}
-
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::cast_fp32_to_fp16a, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::cast_fp32_to_fp16a, APPROXIMATE>();
-}
-
-//EXP2
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_exp2(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::exp2, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_exp2_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::exp2, APPROXIMATE>();
-}
-
-//heaviside
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::heaviside, APPROXIMATE, dst_sync>(dst_index,vector_mode,param0);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_heaviside_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::heaviside, APPROXIMATE>();
-}
-
-//EXPM1
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_expm1(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::expm1, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_expm1_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::expm1, APPROXIMATE>();
-}
-
-//Asin
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_asin(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::asin, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_asin_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::asin, APPROXIMATE>();
-}
-
-//Atan
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_atan(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::atan, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_atan_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::atan, APPROXIMATE>();
-}
-
-//Acos
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_acos(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::acos, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_acos_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::acos, APPROXIMATE>();
-}
-
-//silu
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_silu(uint dst_index) {
-    llk_math_eltwise_unary_sfpu<SfpuType::silu, APPROXIMATE, dst_sync>(dst_index);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_silu_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::silu, APPROXIMATE>();
-}
-
-//Mask
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::mask, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_mask_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::mask, APPROXIMATE>();
-}
-
-// Negative
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::negative, APPROXIMATE, dst_sync>(dst_index,vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_negative_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::negative, APPROXIMATE>();
-}
-
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h
index 24e8738da78..5ebaefe0d96 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h
@@ -3,8 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_param_structs.h"
-
 #include "ckernel_include.h"
 #include "ckernel_template.h"
 
@@ -17,17 +15,14 @@
 
 using namespace ckernel;
 
-template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout=DstTileFaceLayout::RowMajor>
-inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_id, const std::uint32_t in1_id) {
+template <int NUM_FIDELITY_PHASES>
+inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false) {
 
     constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0);
 
-    const bool is_in0_16x32 = (math_tile_dims[in0_id][TileDim::R_IDX]<=FACE_R_DIM) && (math_tile_dims[in0_id][TileDim::C_IDX]> FACE_C_DIM);
-    const bool is_in0_32x16 = (math_tile_dims[in0_id][TileDim::R_IDX]> FACE_R_DIM) && (math_tile_dims[in0_id][TileDim::C_IDX]<=FACE_C_DIM);
-    const bool is_in1_32x16 = (math_tile_dims[in1_id][TileDim::R_IDX]> FACE_R_DIM) && (math_tile_dims[in1_id][TileDim::C_IDX]<=FACE_C_DIM);
-    const bool partial_face = get_partial_face(in0_id);
-
-    static_assert(FaceLayout == DstTileFaceLayout::RowMajor, "FaceLayout must be RowMajor");
+    const bool is_in0_16x32 = (in0_tile_r_dim <=FACE_R_DIM) && (in0_tile_c_dim > FACE_C_DIM);
+    const bool is_in0_32x16 = (in0_tile_r_dim > FACE_R_DIM) && (in0_tile_c_dim <= FACE_C_DIM);
+    const bool is_in1_32x16 = (in1_tile_r_dim > FACE_R_DIM) && (in1_tile_c_dim <= FACE_C_DIM);
 
     // MVMUL does D = B*A
 
@@ -68,12 +63,21 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c
     const uint8_t dest_increment = transpose == false ? 8 : 24;
 
     if (is_in0_16x32||is_in0_32x16) {
-        addr_mod_t{
-            .srca = {.incr = 16, .clr = 0, .cr = 0},
-            .srcb = {.incr = 0,  .clr = 0, .cr = 1}, // cr=16 before
-            .dest = {.incr = 8,  .clr = 0, .cr = 0},
+        if (transpose) {
+            addr_mod_t{
+                .srca = {.incr = 32, .clr = 0, .cr = 0},
+                .srcb = {.incr = 0,  .clr = 0, .cr = 1}, // cr=16 before
+                .dest = {.incr = 8,  .clr = 0, .cr = 0},
+            }
+                .set(ADDR_MOD_1);
+        } else {
+            addr_mod_t{
+                .srca = {.incr = 16, .clr = 0, .cr = 0},
+                .srcb = {.incr = 0,  .clr = 0, .cr = 1}, // cr=16 before
+                .dest = {.incr = 8,  .clr = 0, .cr = 0},
+            }
+                .set(ADDR_MOD_1);
         }
-            .set(ADDR_MOD_1);
     } else {
         if (is_in1_32x16) {
                 addr_mod_t{
@@ -111,20 +115,39 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c
              .set(ADDR_MOD_2);
     } else if (is_in0_16x32||is_in0_32x16) {
         if (partial_face) {
-            addr_mod_t{
-                .srca = {.incr = 16, .clr = 0, .cr = 0},
-                .srcb = {.incr = 0, .clr = 0, .cr = 0},
-                .dest = {.incr = 16, .clr = 0, .cr = 0},
-                .bias = {.incr = 1},
+            if (transpose) {
+                addr_mod_t{
+                    .srca = {.incr = 32, .clr = 0, .cr = 0},
+                    .srcb = {.incr = 0, .clr = 0, .cr = 0},
+                    .dest = {.incr = 16, .clr = 0, .cr = 0},
+                    .bias = {.incr = 1},
+                }
+                    .set(ADDR_MOD_2);
+            } else {
+                addr_mod_t{
+                    .srca = {.incr = 16, .clr = 0, .cr = 0},
+                    .srcb = {.incr = 0, .clr = 0, .cr = 0},
+                    .dest = {.incr = 16, .clr = 0, .cr = 0},
+                    .bias = {.incr = 1},
+                }
+                    .set(ADDR_MOD_2);
             }
-                .set(ADDR_MOD_2);
         } else {
-            addr_mod_t{
-                .srca = {.incr = 16, .clr = 0, .cr = 0},
-                .srcb = {.incr = 0, .clr = 0, .cr = 1},
-                .dest = {.incr = 8, .clr = 0, .cr = 0},
+            if (transpose) {
+                addr_mod_t{
+                    .srca = {.incr = 32, .clr = 0, .cr = 0},
+                    .srcb = {.incr = 0, .clr = 0, .cr = 1},
+                    .dest = {.incr = 8, .clr = 0, .cr = 0},
+                }
+                    .set(ADDR_MOD_2);
+            } else {
+                addr_mod_t{
+                    .srca = {.incr = 16, .clr = 0, .cr = 0},
+                    .srcb = {.incr = 0, .clr = 0, .cr = 1},
+                    .dest = {.incr = 8, .clr = 0, .cr = 0},
+                }
+                    .set(ADDR_MOD_2);
             }
-                .set(ADDR_MOD_2);
         }
     } else {
         addr_mod_t{
@@ -137,21 +160,41 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c
 
     if (is_in0_16x32) {
         if (partial_face) {
-            addr_mod_t{
-                .srca = {.incr =16, .clr = 0, .cr = 0},
-                .srcb = {.incr =16, .clr = 0, .cr = 0},
-                .dest = {.incr =0 , .clr = 1, .cr = 0},
-                .bias = {.incr = 1},
+            if (transpose) {
+                addr_mod_t{
+                    .srca = {.incr =16, .clr = 0, .cr = 1}, //srca=16
+                    .srcb = {.incr =16, .clr = 0, .cr = 0},
+                    .dest = {.incr =0 , .clr = 1, .cr = 0},
+                    .bias = {.incr = 1},
+                }
+                    .set(ADDR_MOD_4);
+            } else {
+                addr_mod_t{
+                    .srca = {.incr =16, .clr = 0, .cr = 0},
+                    .srcb = {.incr =16, .clr = 0, .cr = 0},
+                    .dest = {.incr =0 , .clr = 1, .cr = 0},
+                    .bias = {.incr = 1},
+                }
+                    .set(ADDR_MOD_4);
             }
-                .set(ADDR_MOD_4);
         } else {
-            addr_mod_t{
-                .srca = {.incr =16, .clr = 0, .cr = 0},
-                .srcb = {.incr =16, .clr = 0, .cr = 1},
-                .dest = {.incr = 0, .clr = 0, .cr = 1},
-                .bias = {.incr = 1},
+            if (transpose) {
+                addr_mod_t{
+                    .srca = {.incr =16, .clr = 0, .cr = 1}, //srca=16
+                    .srcb = {.incr =16, .clr = 0, .cr = 1},
+                    .dest = {.incr = 0, .clr = 0, .cr = 1},
+                    .bias = {.incr = 1},
+                }
+                    .set(ADDR_MOD_4);
+            } else {
+                addr_mod_t{
+                    .srca = {.incr =16, .clr = 0, .cr = 0},
+                    .srcb = {.incr =16, .clr = 0, .cr = 1},
+                    .dest = {.incr = 0, .clr = 0, .cr = 1},
+                    .bias = {.incr = 1},
+                }
+                    .set(ADDR_MOD_4);
             }
-                .set(ADDR_MOD_4);
         }
     } else if (is_in0_32x16) {
         addr_mod_t{
@@ -192,8 +235,8 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c
 
 }
 
-template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout=DstTileFaceLayout::RowMajor>
-inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_id, const std::uint32_t in1_id) {
+template <int NUM_FIDELITY_PHASES>
+inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false) {
 
     // in0 - loaded to SrcB
     // in1 - loaded to SrcA
@@ -208,12 +251,11 @@ inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, con
     const bool reuse_a = ct_dim>=rt_dim;
     const std::uint32_t t_dim = reuse_a ? rt_dim : ct_dim;
 
-    const bool is_in0_16x32 = (math_tile_dims[in0_id][TileDim::R_IDX]<=FACE_R_DIM) && (math_tile_dims[in0_id][TileDim::C_IDX]> FACE_C_DIM);
-    const bool is_in1_32x16 = (math_tile_dims[in1_id][TileDim::R_IDX]> FACE_R_DIM) && (math_tile_dims[in1_id][TileDim::C_IDX]<=FACE_C_DIM);
-    const bool is_in0_32x16 = (math_tile_dims[in0_id][TileDim::R_IDX]> FACE_R_DIM) && (math_tile_dims[in0_id][TileDim::C_IDX]<=FACE_C_DIM);
-    const bool is_in0_16x16 = (math_tile_dims[in0_id][TileDim::R_IDX]<=FACE_R_DIM) && (math_tile_dims[in0_id][TileDim::C_IDX]<=FACE_C_DIM);
-    const bool is_in1_16x16 = (math_tile_dims[in1_id][TileDim::R_IDX]<=FACE_R_DIM) && (math_tile_dims[in1_id][TileDim::C_IDX]<=FACE_C_DIM);
-    const bool partial_face = get_partial_face(in0_id);
+    const bool is_in0_16x32 = (in0_tile_r_dim <=FACE_R_DIM) && (in0_tile_c_dim > FACE_C_DIM);
+    const bool is_in1_32x16 = (in1_tile_r_dim > FACE_R_DIM) && (in1_tile_c_dim <= FACE_C_DIM);
+    const bool is_in0_32x16 = (in0_tile_r_dim > FACE_R_DIM) && (in0_tile_c_dim <= FACE_C_DIM);
+    const bool is_in0_16x16 = (in0_tile_r_dim <= FACE_R_DIM) && (in0_tile_c_dim <= FACE_C_DIM);
+    const bool is_in1_16x16 = (in1_tile_r_dim <= FACE_R_DIM) && (in1_tile_c_dim <= FACE_C_DIM);
 
     const std::uint32_t replay_buf_len = (is_in0_16x16 || is_in1_16x16) ? (partial_face ? 2 : 4) :
                                          ((is_in0_16x32 || is_in1_32x16 || is_in0_32x16) ? (partial_face ? 4 : 8) : 16);
@@ -243,14 +285,14 @@ inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, con
         TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_3, 0); // B3A1 // srca=srca, srcb+=8,  dest+=8, bias=1
     } else if (is_in0_16x32 || is_in0_32x16) {
         if (partial_face) {
-            TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B0A0 // srca+=16,  srcb=0,   dest=+16, bias = 1
-            TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A1 // srca+=16,  srcb+=16,  dest=0 (addr_mod_4), bias=0
-            TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B1A2 // srca+=16,  srcb=0,  dest=+16, bias = 1
+            TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B0A0 // srca+=16/32,  srcb=0,   dest+=16, bias = 1, // srca+=32 if transposed
+            TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A1 // srca+=16/=16,  srcb+=16,  dest=0 (addr_mod_4), bias=0, // srca=16 if transposed
+            TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B1A2 // srca+=16/32,  srcb=0,  dest+=16, bias = 1  // srca+=32 if transposed
         } else {
             TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A0 // srca=srca, srcb+=8,  dest+=8
-            TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B0A0 // srca+=16,  srcb=0,   dest+=8
+            TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B0A0 // srca+=16/32,  srcb=0,   dest+=8 // srca+=32 if transposed
             TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_3, 0); // B0A1 // srca=srca, srcb+=8,  dest+=8,  bias=1
-            TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A1 // srca+=16/=0,  srcb=16,  dest=0/+=8 (addr_mod_4), bias=0 // srca=0 dest+=8 if in0_32x16
+            TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A1 // srca+=16/=0/=16,  srcb=16,  dest=0/+=8 (addr_mod_4), bias=0 // srca=0 dest+=8 if in0_32x16, srca=16 if transposed
 
             TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B1A2 // srca=srca, srcb+=8,  dest+=8
             TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_1, 0); // B1A2 // srca+=16,  srcb=16,  dest+=8/24 // dest+=24 if transposed
@@ -316,14 +358,10 @@ inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, con
     tmp.program(instrn_buffer);
 }
 
-template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout=DstTileFaceLayout::RowMajor>
-inline void llk_math_matmul_init(const std::uint32_t operandA, const std::uint32_t operandB, const std::uint32_t transpose=0, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) {
-
-    const std::uint32_t operandA_id = get_operand_id(operandA);
-    const std::uint32_t operandB_id = get_operand_id(operandB);
-
-    matmul_configure_addrmod<NUM_FIDELITY_PHASES, FaceLayout>(transpose, ct_dim, rt_dim, kt_dim, operandA_id, operandB_id);
+template <int NUM_FIDELITY_PHASES>
+inline void _llk_math_matmul_init_(const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false, const std::uint32_t transpose=0, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) {
 
+    matmul_configure_addrmod<NUM_FIDELITY_PHASES>(transpose, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face);
     const bool reuse_a = ct_dim>=rt_dim;
     const std::uint32_t t_dim = reuse_a ? rt_dim : ct_dim;
     if (t_dim>1) {
@@ -336,12 +374,12 @@ inline void llk_math_matmul_init(const std::uint32_t operandA, const std::uint32
         TTI_SETC16(CLR_DVALID_SrcA_Disable_ADDR32, 0);
     }
 
-    matmul_configure_mop<NUM_FIDELITY_PHASES, FaceLayout>(transpose>0, ct_dim, rt_dim, kt_dim, operandA_id, operandB_id);
+    matmul_configure_mop<NUM_FIDELITY_PHASES>(transpose>0, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face);
     math::reset_counters(p_setrwc::SET_ABD_F);
 }
 
-template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout=DstTileFaceLayout::RowMajor>
-inline void llk_math_matmul(uint dst_index, const bool transpose=false, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) {
+template <int NUM_FIDELITY_PHASES>
+inline void _llk_math_matmul_(uint dst_index, const bool transpose=false, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) {
     const bool reuse_a = ct_dim>=rt_dim;
     const std::uint32_t t_dim = reuse_a ? rt_dim : ct_dim;
     const std::uint32_t rut_dim = reuse_a ? ct_dim : rt_dim; //reuse-dim
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h
index 8c126977d12..4c77069f857 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h
@@ -3,8 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_param_structs.h"
-
 #include "ckernel_include.h"
 #include "ckernel_template.h"
 
@@ -20,8 +18,8 @@ inline void reduce_configure_addrmod();
 template <ReduceDim dim, int num_fidelity_phases>
 inline void reduce_configure_mop();
 
-template <PoolType type, ReduceDim dim, int num_fidelity_phases = 0, bool is_fp32_dest_acc_en = false>
-inline void llk_math_reduce(uint dst_index) {
+template <PoolType type, ReduceDim dim, int num_fidelity_phases = 0, bool is_fp32_dest_acc_en = false, bool is_int_fpu_en = false>
+inline void _llk_math_reduce_(const uint dst_index) {
     constexpr bool high_fidelity = num_fidelity_phases > 0 && num_fidelity_phases <= 4;
     math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(dst_index);
     if constexpr (dim == ReduceDim::REDUCE_ROW) {
@@ -47,6 +45,17 @@ inline void llk_math_reduce(uint dst_index) {
             }
         }
 
+        // Workaround for https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1948
+        if constexpr (is_int_fpu_en) {
+            TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH);
+            TTI_SFPLOAD(0, 4, ADDR_MOD_0, 0);
+            TTI_SFPSTORE(0,5,ADDR_MOD_0,0);
+            TTI_SFPLOAD(0, 4, ADDR_MOD_0, 2);
+            TTI_SFPSTORE(0,5,ADDR_MOD_0,2);
+            TTI_STALLWAIT(p_stall::STALL_MATH, p_stall::WAIT_SFPU);
+            TTI_SETC16(FP16A_FORCE_Enable_ADDR32, 0x1);
+        }
+
         // Move back to B and transpose
         // we avoid clobbering weights in src B by moving to rows 16 - 31
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 0, 0, 0, p_setrwc::SET_AB);
@@ -64,6 +73,9 @@ inline void llk_math_reduce(uint dst_index) {
         // Note: transpose on src B on works on rows 16 - 31
         TTI_TRNSPSRCB;
         TTI_MOVD2B(0, p_movd2b::SRC_ROW16_OFFSET, ADDR_MOD_0, p_movd2b::MOV_1_ROW, 0);
+        if constexpr (is_int_fpu_en) {
+            TTI_SETC16(FP16A_FORCE_Enable_ADDR32, 0x0);
+        }
 
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_B, 0, 8, 0, p_setrwc::SET_B);
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_B, 0, 8, 0, p_setrwc::SET_B);
@@ -102,6 +114,16 @@ inline void llk_math_reduce(uint dst_index) {
                 TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
             }
         }
+        // Workaround for https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1948
+        if constexpr (is_int_fpu_en) {
+            TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH);
+            TTI_SFPLOAD(0, 4, ADDR_MOD_0, 0);
+            TTI_SFPSTORE(0,5,ADDR_MOD_0,0);
+            TTI_SFPLOAD(0, 4, ADDR_MOD_0, 2);
+            TTI_SFPSTORE(0,5,ADDR_MOD_0,2);
+            TTI_STALLWAIT(p_stall::STALL_MATH, p_stall::WAIT_SFPU);
+            TTI_SETC16(FP16A_FORCE_Enable_ADDR32, 0x1);
+        }
 
         // Move back to B and transpose
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 0, 0, 0, p_setrwc::SET_AB);
@@ -119,6 +141,9 @@ inline void llk_math_reduce(uint dst_index) {
         // Note: transpose on src B on works on rows 16 - 31
         TTI_TRNSPSRCB;
         TTI_MOVD2B(0, p_movd2b::SRC_ROW16_OFFSET, ADDR_MOD_0, p_movd2b::MOV_1_ROW, 0);
+        if constexpr (is_int_fpu_en) {
+            TTI_SETC16(FP16A_FORCE_Enable_ADDR32, 0x0);
+        }
 
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_B, 0, 8, 0, p_setrwc::SET_B);
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_B, 0, 8, 0, p_setrwc::SET_B);
@@ -155,6 +180,7 @@ inline void llk_math_reduce(uint dst_index) {
             // Reset Dest Counter
             TTI_SETRWC(p_setrwc::CLR_AB, 0, 0, 0, 0, p_setrwc::SET_AD);
         }
+
     } else if constexpr (dim == ReduceDim::REDUCE_SCALAR) {
         //fp32 dest unsupported with reduce scalar, must fix zeroacc
         static_assert(!is_fp32_dest_acc_en);
@@ -268,7 +294,7 @@ inline void reduce_configure_mop() {
 }
 
 template <PoolType type, ReduceDim dim, int num_fidelity_phases = 0>
-inline void llk_math_reduce_init(const std::uint32_t within_face_16x16_transpose=0) { //within_face_16x16_transpose used for unpack, ignored by math
+inline void _llk_math_reduce_init_(const std::uint32_t within_face_16x16_transpose=0) { //within_face_16x16_transpose used for unpack, ignored by math
 
     constexpr bool high_fidelity = num_fidelity_phases > 0 && num_fidelity_phases <= 4;
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h
index e2a91b56c3f..7df83739dc9 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h
@@ -3,9 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_io_pack.h"
 #include "llk_defs.h"
-#include "llk_param_structs.h"
 
 #include "ckernel.h"
 #include "ckernel_template.h"
@@ -15,15 +13,11 @@
 using namespace ckernel;
 using namespace ckernel::packer;
 
-template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
-inline void llk_pack_mop_config(const uint32_t output_id) {
-
-    const uint num_faces = get_num_faces(output_id);
-    const uint face_r_dim = get_face_r_dim(output_id);
-    const bool partial_face = get_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]);
+template <bool untilize = false>
+inline void _llk_pack_configure_addrmod_() {
 
     addr_mod_pack_t{
-        .y_src = {.incr = untilize ? 0 : 15}, // 4-bit value so max is 15. incadcxy will increment it by 1
+        .y_src = {.incr = 15}, // 4-bit value so max is 15. incadcxy will increment it by 1
         .y_dst = {.incr = 1},
     }
         .set(ADDR_MOD_0);
@@ -44,68 +38,98 @@ inline void llk_pack_mop_config(const uint32_t output_id) {
     }
 
     addr_mod_pack_t{
-        .y_src = { .incr = 0, .clr = 0, .cr = 0  },
+        .y_src = { .incr = 0, .clr = 1, .cr = 0  },
         .y_dst = { .incr = 0, .clr = 0, .cr = 0  },
     }.set(ADDR_MOD_2);
 
-    const uint PACKCNT = partial_face ? 1 : num_faces;
-    const uint MEGAROW = 1;
-    constexpr uint ZERO_OUTPUT_FLAG = zero_output ? p_pacr::P_ZERO_OUTPUT_ENABLED : p_pacr::P_ZERO_OUTPUT_DISABLED;
+}
 
+template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor, bool write_tile_header = true>
+inline void _llk_pack_mop_config_(const std::uint32_t pack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const bool partial_face = false, const bool narrow_tile = false) {
+    static_assert(FaceLayout == DstTileFaceLayout::RowMajor, "FaceLayout must be RowMajor");
+
+    const uint PACKCNT = (partial_face && IS_BFP_FORMAT(pack_dst_format)) ? 1 : num_faces;
+    constexpr uint MEGAROW = 1;
+    constexpr uint ZERO_OUTPUT_FLAG = zero_output ? p_pacr::P_ZERO_OUTPUT_ENABLED : p_pacr::P_ZERO_OUTPUT_DISABLED;
+    constexpr uint MOP_INNER_LOOP = 1;
 
-    // Write header to l1
     if constexpr (!untilize) {
-        const uint MOP_INNER_LOOP = 1;
-        const uint MOP_OUTER_LOOP = 1;
+        constexpr uint MOP_OUTER_LOOP = 1;
 
         ckernel::ckernel_template tmp(MOP_OUTER_LOOP, MOP_INNER_LOOP, TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 1));
 
-        if (partial_face) {
+        if (partial_face && IS_BFP_FORMAT(pack_dst_format)) {
             tmp.set_start_op(TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0)); // Don't close the tile, point to the next face
             tmp.set_loop_op0(TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 1, 0)); // Inc ch0_y+=1 (addr_mod_0 will increment by 15)
             tmp.set_loop_op1(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 1)); // Close the tile
         }
-
+        // Write header to l1
+        if constexpr (write_tile_header) {
+            tmp.set_end_op(TT_OP_STOREIND(
+                1, 0, p_ind::LD_16B, LO_16(0), p_ind::INC_NONE, p_gpr_pack::TILE_HEADER, p_gpr_pack::OUTPUT_ADDR));
+        }
 
         tmp.program(instrn_buffer);
     } else {
-        const bool narrow_tile = get_narrow_tile(output_id);
-        const uint MOP_UNTILIZE_INNER_LOOP = narrow_tile ? 1 : (FaceLayout == DstTileFaceLayout::ColMajor ? 8 : 4);
-        const uint MOP_UNTILIZE_OUTER_LOOP = ((face_r_dim == 1) || narrow_tile) ? 1 : face_r_dim / 2;
+        const uint MOP_OUTER_LOOP = ((face_r_dim == 1) || narrow_tile) ? 1 : (face_r_dim >> 1);
 
-        ckernel::ckernel_template tmp(MOP_UNTILIZE_OUTER_LOOP, MOP_UNTILIZE_INNER_LOOP, TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0));
-        if (narrow_tile) {
-            tmp.set_last_inner_loop_instr(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 1)); // Close the tile and clear the counters
+        if ((face_r_dim == 1) || narrow_tile) {
+            ckernel::ckernel_template tmp(MOP_OUTER_LOOP, MOP_INNER_LOOP, TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 1));
+            tmp.program(instrn_buffer);
         } else {
+            // Inc ch0_y+=1 (addr_mod_0 will increment by 15)
+            ckernel::ckernel_template tmp(MOP_OUTER_LOOP, MOP_INNER_LOOP, TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 1, 0));
             tmp.set_start_op(TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0));
-            if (face_r_dim>1) {
-                tmp.set_loop_op0(TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 4, 0)); // If it's narrow tile (32x16) pack rows back to back otherwise jump between faces
-                tmp.set_end_op(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0));
-            }
-            tmp.set_last_inner_loop_instr(TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 4, 0));
-            tmp.set_last_outer_loop_instr(TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 4, 0));
+            tmp.set_end_op(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0));
+            tmp.program(instrn_buffer);
         }
-        tmp.program(instrn_buffer);
     }
-
 }
 
-template <bool untilize = false, bool is_fp32_dest_acc_en = false>
-inline void llk_pack_hw_configure(const llk_pack_params_t *pack_params) {
-    configure_pack<is_fp32_dest_acc_en, untilize>(get_output_id(pack_params->pack_output), pack_params->relu_config.val);
+template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor, bool write_tile_header = true>
+inline void _llk_pack_reconfig_data_format_(const std::uint32_t pack_src_format, const std::uint32_t pack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const bool partial_face = false, const bool narrow_tile = false) {
+
+    reconfig_packer_data_format<is_fp32_dest_acc_en>(
+        pack_src_format,
+        pack_dst_format,
+        tile_size,
+        face_r_dim
+    );
+
+    if constexpr (is_tile_dim_reconfig_en) {
+        _llk_pack_mop_config_<false, false, FaceLayout, write_tile_header>(pack_dst_format, face_r_dim, num_faces, partial_face, narrow_tile);
+    }
 }
 
-template <bool untilize = false, bool is_fp32_dest_acc_en = false, ReluType relu_type = ReluType::NO_RELU, std::uint32_t relu_threshold = 0>
-inline void llk_pack_hw_configure_disaggregated(std::uint32_t pack_output) {
-    llk_pack_params_t llk_pack_params = {
-        .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold,}}};
-    llk_pack_hw_configure<untilize, is_fp32_dest_acc_en>(&llk_pack_params);
+template <bool untilize = false, bool is_fp32_dest_acc_en = false>
+inline void _llk_pack_hw_configure_(const std::uint32_t pack_src_format, const std::uint32_t pack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const bool partial_face = false, const bool narrow_tile = false, const std::uint32_t relu_config = 0) {
+
+    configure_pack<is_fp32_dest_acc_en, untilize>(
+        pack_src_format,
+        pack_dst_format,
+        tile_size,
+        face_r_dim,
+        num_faces,
+        partial_face,
+        narrow_tile,
+        relu_config
+    );
 }
 
-// FIXME: Remove once edge mask spec is defined
 template <bool untilize = false, PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false>
-inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) {
-    configure_pack<is_fp32_dest_acc_en, untilize>(get_output_id(pack_params->pack_output), pack_params->relu_config.val);
+inline void _llk_pack_reduce_hw_configure_(const std::uint32_t pack_src_format, const std::uint32_t pack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const bool partial_face = false, const bool narrow_tile = false, const std::uint32_t relu_config = 0) {
+
+    configure_pack<is_fp32_dest_acc_en, untilize>(
+        pack_src_format,
+        pack_dst_format,
+        tile_size,
+        face_r_dim,
+        num_faces,
+        partial_face,
+        narrow_tile,
+        relu_config
+    );
+
     volatile uint tt_reg_ptr *cfg = get_cfg_pointer();
 
     ckernel::packer::pck_edge_offset_u pack_edge_offset = {.val = 0};
@@ -143,84 +167,22 @@ inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) {
     }
 }
 
-template <bool untilize = false, PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, ReluType relu_type = ReluType::NO_RELU, std::uint32_t relu_threshold = 0>
-inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output) {
-    llk_pack_params_t llk_pack_params = {
-        .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}};
-    llk_pack_reduce_hw_configure<untilize, type, dim, is_fp32_dest_acc_en>(&llk_pack_params);
-}
-
-template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
-inline void llk_pack_init(const std::uint32_t pack_output = 0) {
-    const std::uint32_t output_id = get_output_id(pack_output);
-    llk_pack_mop_config<untilize, zero_output, FaceLayout>(output_id);
-}
-
-template <bool out_of_order_output, bool untilize>
-inline std::uint32_t get_output_tile_address(std::uint8_t output_id, std::uint32_t output_tile_index) {
-
-    std::uint32_t pack_tile_addr;
-    if constexpr (out_of_order_output) {
-        pack_tile_addr = cb_interface[output_id].fifo_wr_ptr +
-                        (std::uint32_t)(cb_interface[output_id].fifo_page_size)*output_tile_index - 1;
-    } else {
-        if constexpr (untilize) {
-            // FIXME: Do we need support for pack untilize?
-            // std::uint16_t out_tile_index = (cb_interface[output_id].ublock_tile_cnt/cb_interface[output_id].ublock_ct)*cb_interface[output_id].row_tile_dim +
-            //                                 cb_interface[output_id].ublock_tile_cnt%cb_interface[output_id].ublock_ct; //FIXME: optimize perf
-            // pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1;
-            // pack_tile_addr += out_tile_index*(std::uint32_t)(cb_interface[output_id].fifo_page_size);
-
-            // cb_interface[output_id].ublock_tile_cnt++;
-
-            // if (cb_interface[output_id].ublock_tile_cnt == cb_interface[output_id].ublock_tile_dim) {
-            //    cb_interface[output_id].ublock_tile_cnt=0;
-            //    cb_interface[output_id].fifo_wr_tile_ptr += (std::uint32_t)(cb_interface[output_id].fifo_page_size)*cb_interface[output_id].ublock_ct;
-            // }
-        } else {
-            pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1;
-            cb_interface[output_id].fifo_wr_tile_ptr += cb_interface[output_id].fifo_page_size;
-        }
-    }
-    return pack_tile_addr;
-}
-
-#if defined(PERF_DUMP) && MATH_PACK_DECOUPLE
-template <bool out_of_order_output = false, DstSync Dst = SyncFull, bool untilize = false, bool is_fp32_dest_acc_en = false, bool pack_l1_acc_en = false>
-inline void llk_pack_decouple(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0, bool pack_l1_acc = false) {
-
-    std::uint8_t output_id = get_output_id(output);
-
-    static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!");
+template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor, bool write_tile_header = true>
+inline void _llk_pack_init_(const std::uint32_t pack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const bool partial_face = false, const bool narrow_tile = false) {
 
-    std::uint32_t pack_tile_addr = get_output_tile_address<out_of_order_output, untilize>(output_id, output_tile_index);
+    _llk_pack_configure_addrmod_<untilize>();
 
-    if (operand_is_intermediate(output)) {
-        return;
-    }
-
-    if constexpr (!untilize) {
-        uint32_t tile_header[4];
-        uint32_t* l1_dest = reinterpret_cast<uint32_t*>(pack_tile_addr << 4);
-        for (int i = 0; i < 4; i++) {
-            tile_header[i] = regfile[p_gpr_pack::TILE_HEADER + i];
-            l1_dest[i] = tile_header[i];
-        }
-    }
+    _llk_pack_mop_config_<untilize, zero_output, FaceLayout, write_tile_header>(
+        pack_dst_format,
+        face_r_dim,
+        num_faces,
+        partial_face,
+        narrow_tile
+    );
 }
-#endif
-
-template <bool out_of_order_output = false, DstSync Dst = SyncFull, bool untilize = false, bool is_fp32_dest_acc_en = false>
-inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0) {
 
-    std::uint8_t output_id = get_output_id(output);
-
-    // Access tile dims using the following logic:
-    // pack_tile_dims[output_id]
-
-    static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!");
-
-    std::uint32_t pack_tile_addr = get_output_tile_address<out_of_order_output, untilize>(output_id, output_tile_index);
+template <DstSync Dst = SyncFull, bool untilize = false, bool is_fp32_dest_acc_en = false>
+inline void _llk_pack_(const std::uint32_t tile_index, const std::uint32_t address) {
 
     constexpr uint32_t DEST_NUM_TILES_SHIFT = is_fp32_dest_acc_en ? (1) : (0);
     constexpr uint32_t DEST_NUM_TILES = DEST_NUM_TILES_FP16 >> DEST_NUM_TILES_SHIFT;
@@ -237,44 +199,11 @@ inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32
         TT_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_W, tile_index);
     }
 
-    program_packer_destination(pack_tile_addr, output_id);
+    program_packer_destination(address);
 
     mop_run(1, 1);
 
     if constexpr (untilize) {
         TTI_PACR(ADDR_MOD_2, 0, 0xf, 0, 0, 1, 1); // close tile
-        TTI_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_Y, 0);
-        TTI_INCADCZW(p_setadc::PAC, 0, 0, 1, 0);
-    }
-
-}
-// FIXME-WH-UPLIFT
-template <ReduceDim dim, bool at_kernel_start = false, bool revert=false, bool is_fp32_dest_acc_en = false>
-inline void llk_pack_reduce_config_v2(uint32_t icb_out) {
-
-    if constexpr (at_kernel_start)
-        configure_pack<is_fp32_dest_acc_en, false>(get_output_id(icb_out), false);
-    else {
-        TTI_STALLWAIT(p_stall::STALL_PACK, p_stall::PACK);
-        tensix_sync();
-    }
-
-    volatile uint *cfg = get_cfg_pointer();
-    if constexpr (dim == ReduceDim::REDUCE_ROW) {
-        for (uint i = 0; i < 4; i++)
-            //TTI_WRCFG(revert ? 0xFFFFffff : 0x1, p_cfg::WRCFG_32b, PCK_EDGE_OFFSET_SEC0_mask_ADDR32+i);
-            cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32 + i] = revert ? 0xFFFFffff : 0x1;
-    } else if constexpr (dim == ReduceDim::REDUCE_SCALAR) {
-        //TTI_WRCFG(revert ? 0xFFFFffff : 0x0, p_cfg::WRCFG_32b, PCK_EDGE_OFFSET_SEC0_mask_ADDR32+0);
-        //TTI_WRCFG(revert ? 0xFFFFffff : 0x1, p_cfg::WRCFG_32b, PCK_EDGE_OFFSET_SEC0_mask_ADDR32+1);
-        //TTI_WRCFG(revert ? 0xFFFFffff : 0x1, p_cfg::WRCFG_32b, TILE_ROW_SET_MAPPING_0_row_set_mapping_0_ADDR32);
-        cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32+0] = revert ? 0xFFFFffff : 0x0;
-        cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32+1] = revert ? 0xFFFFffff : 0x1;
-        cfg[TILE_ROW_SET_MAPPING_0_row_set_mapping_0_ADDR32] = revert ? 0xF : 0x1;
-    } else {
-        //TTI_WRCFG(revert ? 0xFFFFffff : 0x0,    p_cfg::WRCFG_32b, PCK_EDGE_OFFSET_SEC0_mask_ADDR32+0);
-        //TTI_WRCFG(revert ? 0xFFFFffff : 0xFFFF, p_cfg::WRCFG_32b, PCK_EDGE_OFFSET_SEC0_mask_ADDR32+1);
-        //cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32+0] = revert ? 0xFFFFffff : 0x0;
-        //cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32+1] = revert ? 0xFFFFffff : 0x0000ffff;
     }
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h
index f3385908c37..88dbdb186a9 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h
@@ -6,26 +6,31 @@
 
 #include "ckernel.h"
 #include "ckernel_defs.h"
-#include "debug/fw_debug.h"
+#include "fw_debug.h"
 #include "cpack_common.h"
 #include "llk_defs.h"
-#include "llk_param_structs.h"
-
-#include "hostdevcommon/common_runtime_address_map.h"
-
 
 using namespace ckernel;
 using namespace ckernel::packer;
 
+#ifdef PERF_DUMP
+#include "ckernel_perf_api.h"
+#endif
 
 // wait until math is done and has produced something to pack
-inline void llk_packer_wait_for_math_done() {
+inline void _llk_packer_wait_for_math_done_() {
+#ifdef PERF_DUMP
+    if constexpr (MATH_PACK_DECOUPLE == 0) {
+        TTI_SEMWAIT(p_stall::STALL_TDMA, semaphore::t6_sem(semaphore::MATH_PACK), p_stall::STALL_ON_ZERO);
+    }
+#else
     TTI_SEMWAIT(p_stall::STALL_TDMA, semaphore::t6_sem(semaphore::MATH_PACK), p_stall::STALL_ON_ZERO);
+#endif
 }
 
 // Tell math that it can write again
 template <uint WaitRes = p_stall::NONE>
-inline void llk_packer_set_math_semaphore() {
+inline void _llk_packer_set_math_semaphore_() {
     t6_semaphore_get<WaitRes>(semaphore::MATH_PACK);  // Indicate that packer is done and header is written into L1
 }
 
@@ -33,7 +38,13 @@ inline void llk_packer_set_math_semaphore() {
 // Tell math it can write again
 // Clear dest
 template <DstSync Dst, bool is_fp32_dest_acc_en = false>
-inline void llk_pack_dest_section_done() {
+inline void _llk_pack_dest_section_done_() {
+#ifdef PERF_DUMP
+    if constexpr (MATH_PACK_DECOUPLE) {
+        return;
+    }
+#endif
+
     constexpr bool clear_dest = (Dst != DstSync::SyncTile16);
 
     if constexpr (clear_dest){
@@ -53,7 +64,7 @@ inline void llk_pack_dest_section_done() {
     constexpr uint32_t WaitRes = (Dst == DstSync::SyncTile16) ? (p_stall::PACK) : (p_stall::NONE);
 
     // Tell math that it can write again
-    llk_packer_set_math_semaphore<WaitRes>();
+    _llk_packer_set_math_semaphore_<WaitRes>();
 
     constexpr bool flip_dest = ((Dst == DstSync::SyncHalf) || (Dst == DstSync::SyncTile2));
 
@@ -63,97 +74,99 @@ inline void llk_pack_dest_section_done() {
     }
 }
 
+template <DstSync Dst, DstTileFaceLayout FaceLayout, bool untilize = false>
+inline void _llk_init_packer_dest_offset_registers_(const std::uint32_t face_r_dim = FACE_R_DIM, const bool narrow_tile = false) {
+    TTI_STALLWAIT(p_stall::STALL_TDMA|p_stall::STALL_THCON, p_stall::PACK);  // wait for pack to finish
+    if constexpr (untilize) {
+        const uint face_r_offset = ((face_r_dim == 1) || narrow_tile) ? FACE_R_DIM : (face_r_dim >> 1);
+        if constexpr (FaceLayout == ColMajor) {
+            // Packer0 :  0,32,  1,33 ...  7, 39
+            // Packer1 :  8,40,  9,41 ... 15, 47
+            // Packer2 : 16,48, 17,49 ... 23, 55
+            // Packer3 : 24,56, 25,57 ... 31, 63
+            TT_SETDMAREG(0, 0x000 + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0));
+            TT_SETDMAREG(0, 0x000 + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1));
+            TT_SETDMAREG(0, 0x000 + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2));
+            TT_SETDMAREG(0, 0x000 + 0x18, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x18, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3));
+        } else {
+            // For example, if face_r_offset = 8:
+            // Packer0 :  0,16,  1,17 ...  7, 23
+            // Packer1 :  8,24,  9,25 ... 15, 31
+            // Packer2 : 32,48, 33,49 ... 39, 55
+            // Packer3 : 40,56, 41,57 ... 47, 63
+            TT_SETDMAREG(0, 0x000 + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0));
+            TT_SETDMAREG(0, 0x000 + face_r_offset, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1));
+            TT_SETDMAREG(0, 0x000 + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2));
+            TT_SETDMAREG(0, 0x000 + 0x20 + face_r_offset, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + face_r_offset, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20 + face_r_offset, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3));
+        }
+    } else {
+        if constexpr (FaceLayout == ColMajor) {
+            TT_SETDMAREG(0, 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0));
+            TT_SETDMAREG(0, 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1));
+            TT_SETDMAREG(0, 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2));
+            TT_SETDMAREG(0, 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3));
+        } else {  // Default to row major layout
+            TT_SETDMAREG(0, 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0));
+            TT_SETDMAREG(0, 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1));
+            TT_SETDMAREG(0, 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2));
+            TT_SETDMAREG(0, 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2));
+            TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3));
+        }
+    }
+    select_packer_dest_registers<Dst>();
+}
+
 template <DstSync Dst, DstTileFaceLayout FaceLayout = RowMajor, bool untilize = false, bool is_fp32_dest_acc_en = false>
-inline void llk_pack_dest_init() {
+inline void _llk_pack_dest_init_(const std::uint32_t face_r_dim = FACE_R_DIM, const bool narrow_tile = false) {
     tensix_sync();
     reset_dest_offset_id();
-    init_packer_dest_offset_registers<FaceLayout, untilize, is_fp32_dest_acc_en>();
-    select_packer_dest_registers<Dst>();
+    _llk_init_packer_dest_offset_registers_<Dst,FaceLayout,untilize>(face_r_dim, narrow_tile);
     packer_addr_counter_init();
     pack_sync_tile_dst_ptr = 0;
 }
 
-template <DstSync Dst, DstTileFaceLayout FaceLayout, bool untilize = false>
-inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 0) {
-    TTI_STALLWAIT(p_stall::STALL_TDMA|p_stall::STALL_THCON, p_stall::PACK);  // wait for pack to finish
-    if constexpr (untilize) {
-       if constexpr (FaceLayout == ColMajor) {
-          // Packer0 :  0,32,  1,33 ...  7, 39
-          // Packer1 :  8,40,  9,41 ... 15, 47
-          // Packer2 : 16,48, 17,49 ... 23, 55
-          // Packer3 : 23,56, 24,57 ... 31, 63
-          TT_SETDMAREG(0, 0x000 + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0));
-          TT_SETDMAREG(0, 0x000 + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1));
-          TT_SETDMAREG(0, 0x000 + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2));
-          TT_SETDMAREG(0, 0x000 + 0x18, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3));
-          TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0));
-          TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1));
-          TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2));
-          TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x18, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3));
-       } else {
-          // Packer0 :  0,16,  1,17 ...  7, 23
-          // Packer1 :  8,24,  9,25 ... 15, 31
-          // Packer2 : 32,48, 33,49 ... 39, 55
-          // Packer3 : 40,56, 41,57 ... 47, 63
-          TT_SETDMAREG(0, 0x000 + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0));
-          TT_SETDMAREG(0, 0x000 + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1));
-          TT_SETDMAREG(0, 0x000 + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2));
-          TT_SETDMAREG(0, 0x000 + 0x28, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3));
-          TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0));
-          TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1));
-          TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2));
-          TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x28, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3));
-       }
+
+template <bool mail2math=true, bool mail2pack=true>
+inline void _llk_pack_get_tile_(std::uint32_t tile_index, std::uint32_t *p_tile) {
+    if constexpr (mail2pack) {
+       *p_tile = mailbox_read(ThreadId::UnpackThreadId);
     } else {
-       if constexpr (FaceLayout == ColMajor) {
-           TT_SETDMAREG(0, 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0));
-           TT_SETDMAREG(0, 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1));
-           TT_SETDMAREG(0, 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2));
-           TT_SETDMAREG(0, 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3));
-           TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0));
-           TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1));
-           TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2));
-           TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3));
-       } else {  // Default to row major layout
-           TT_SETDMAREG(0, 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0));
-           TT_SETDMAREG(0, 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1));
-           TT_SETDMAREG(0, 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2));
-           TT_SETDMAREG(0, 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3));
-           TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0));
-           TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1));
-           TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2));
-           TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3));
-       }
+       *p_tile = 0x0;
     }
-    select_packer_dest_registers<Dst>();
-}
 
-inline void llk_pack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {
-    debug_dump(data, byte_size);
 }
 
-inline void llk_pack_debug_dump_seek(std::uint8_t offset) {
-    debug_dump_seek(offset);
+template <bool mail2math=true, bool mail2pack=true>
+inline void _llk_pack_release_tile_() {
+    if constexpr (mail2pack) {
+       semaphore_get(semaphore::UNPACK_OPERAND_SYNC);
+    }
 }
 
-template <bool is_fp32_dest_acc_en = false>
-inline void llk_pack_reconfig_data_format(const std::uint32_t new_operand) {
-    reconfig_packer_data_format<is_fp32_dest_acc_en>(get_output_id(new_operand));
+inline void _llk_pack_debug_dump_(std::uint8_t *data, std::uint32_t byte_size) {
+    debug_dump(data, byte_size);
 }
 
-template <bool is_fp32_dest_acc_en = false>
-inline void llk_pack_reconfig_data_format(const std::uint32_t old_operand, const std::uint32_t new_operand) {
-    std::uint32_t old_operand_id = get_output_id(old_operand);
-    std::uint32_t new_operand_id = get_output_id(new_operand);
-
-    if((pack_dst_format[old_operand_id] != pack_dst_format[new_operand_id])
-       && (pack_dst_format[old_operand_id] != (uint)DataFormat::Invalid)
-       && (pack_dst_format[new_operand_id] != (uint)DataFormat::Invalid)) {
-        reconfig_packer_data_format<is_fp32_dest_acc_en>(new_operand_id);
-    }
+inline void _llk_pack_debug_dump_seek_(std::uint8_t offset) {
+    debug_dump_seek(offset);
 }
 
-inline void llk_pack_relu_config(std::uint32_t config) {
+TT_ALWAYS_INLINE void _llk_pack_relu_config_(const std::uint32_t config) {
     ReluType mode = (config&0xf) == 0 ? ReluType::NO_RELU : ((config&0xf) == 3 ? ReluType::MAX_THRESHOLD_RELU : ReluType::MIN_THRESHOLD_RELU);
     uint32_t val = ((config>>16) << STACC_RELU_ReluThreshold_SHAMT) | (((uint32_t)mode) << STACC_RELU_ApplyRelu_SHAMT);
     TTI_SETDMAREG(0, val&0xffff, 0, LO_16(p_gpr_pack::TMP0));
@@ -163,13 +176,13 @@ inline void llk_pack_relu_config(std::uint32_t config) {
     TTI_NOP; TTI_NOP;
 }
 
-inline void llk_pack_reconfig_l1_acc(const std::uint32_t enable)
+inline void _llk_pack_reconfig_l1_acc_(const std::uint32_t enable)
 {
     reconfigure_packer_l1_acc(enable);
 }
 
 template <bool untilize = false, ReduceDim dim>
-inline void llk_pack_reduce_mask_config() {
+inline void _llk_pack_reduce_mask_config_() {
     ckernel::packer::pck_edge_offset_u pack_edge_offset = {.val = 0};
 
     // We initialize PCK_EDGE_OFFSET_SEC0 mask to clear out all the datums in the row
@@ -230,7 +243,7 @@ inline void llk_pack_reduce_mask_config() {
     TTI_NOP; TTI_NOP;
 }
 
-inline void llk_pack_reduce_mask_clear() {
+inline void _llk_pack_reduce_mask_clear_() {
     // By default, all packers are set to use TILE_ROW_SET_MAPPING_0 and
     // mask is configured to pass through all the datums
     pck_edge_offset_u pack_edge_offset = {.val = 0};
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h
index 6117df04e2e..61dd252e81e 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h
@@ -3,9 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_io_unpack.h"
-#include "llk_param_structs.h"
-
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
@@ -15,14 +12,19 @@
 using namespace ckernel;
 using namespace ckernel::unpacker;
 
-template <BroadcastType BType = BroadcastType::NONE, bool acc_to_dest = false, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-inline void llk_unpack_A_mop_config(const bool transpose_of_faces, const std::uint32_t operand_id) {
+#ifndef SKIP_UNP
+#define SKIP_UNP 0
+#endif
+
+template <BroadcastType BType = BroadcastType::NONE, bool acc_to_dest = false, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, bool unpack_to_dest = false>
+inline void _llk_unpack_A_mop_config_(const bool transpose_of_faces, const std::uint32_t num_faces, const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format = 0) {
 
     static_assert(!((BType != BroadcastType::NONE) && acc_to_dest && (binary_reuse_dest == EltwiseBinaryReuseDestType::DEST_TO_SRCB)), "Not supported configuration!");
+    static_assert((((BType == BroadcastType::NONE) && (!acc_to_dest) && (binary_reuse_dest == EltwiseBinaryReuseDestType::NONE)) || (!unpack_to_dest)), "Not supported configuration when unpacking to dest!");
 
-    const uint32_t num_faces = get_num_faces(operand_id);
     #if SKIP_UNP == 1
         static constexpr uint unpack_srca = TT_OP_NOP;
+        static constexpr uint unpack_srca_to_dest = TT_OP_NOP;
         static constexpr uint unpack_srca_set_dvalid = TT_OP_NOP;
         static constexpr uint unpack_srcb = TT_OP_NOP;
         static constexpr uint unpack_srcb_inc_z_0 = TT_OP_NOP;
@@ -36,6 +38,7 @@ inline void llk_unpack_A_mop_config(const bool transpose_of_faces, const std::ui
         TTI_NOP;
     #else
         static constexpr uint unpack_srca = TT_OP_UNPACR(SrcA, 0b1 /*Z inc*/, 0, 0, 0, 1 /* Set OvrdThreadId*/, 1 /*Set Dvalid*/, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1);
+        static constexpr uint unpack_srca_to_dest = TT_OP_UNPACR(SrcA, 0b00010001 /*Z inc*/, 0, 0, 0, 1 /* Set OvrdThreadId*/, 0 /*Set Dvalid*/, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1); // ch0/ch1 z_inc
         static constexpr uint unpack_srca_set_dvalid = TT_OP_UNPACR_NOP(SrcA, p_unpacr_nop::UNP_ZEROSRC_SET_DVALID);
         static constexpr uint unpack_srcb = TT_OP_UNPACR(SrcB, 0b1 /*Z inc*/, 0, 0, 0, 1 /* Set OvrdThreadId*/, 1 /*Set Dvalid*/, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1);
         static constexpr uint unpack_srcb_inc_z_0 = TT_OP_UNPACR(SrcB, 0b0 /*Z inc*/, 0, 0, 0, 1 /* Set OvrdThreadId*/, 1 /*Set Dvalid*/, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1);
@@ -46,7 +49,12 @@ inline void llk_unpack_A_mop_config(const bool transpose_of_faces, const std::ui
         static constexpr uint srcb_clear_z = TT_OP_SETADCZW(p_setadc::UNP_B, 0, 0, 0, 0, 0b0001); // set srcB ch0_z = 0
     #endif
 
-    if constexpr (BType == BroadcastType::COL) {
+    if (unpack_to_dest && unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) {
+        const uint32_t outerloop = num_faces;
+        constexpr uint32_t innerloop = 1;
+        ckernel_template tmp(outerloop, innerloop, unpack_srca_to_dest);
+        tmp.program(instrn_buffer);
+    } else if constexpr (BType == BroadcastType::COL) {
         if constexpr (acc_to_dest) {
             constexpr uint32_t innerloop = 1;
             constexpr uint32_t outerloop = 2; //TODO: add support for num_faces, add support for dest to srcB
@@ -128,50 +136,30 @@ inline void llk_unpack_A_mop_config(const bool transpose_of_faces, const std::ui
     }
 }
 
-template <bool is_fp32_dest_acc_en = false, bool srnd_fpu_en = false>
-inline void llk_unpack_A_hw_configure(const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) {
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void _llk_unpack_A_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM,  const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) {
     constexpr bool is_row_pool = false;
-    const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand);
-
-    const uint32_t unpA_num_faces = get_num_faces(unpA_operand_id);
-
-    const uint32_t unpA_face_r_dim = get_face_r_dim(unpA_operand_id);
-
-    configure_unpack_AB(unpA_operand_id, unpA_operand_id,
-        unpA_face_r_dim, unpA_face_r_dim, is_row_pool, within_face_16x16_transpose, is_fp32_dest_acc_en, srnd_fpu_en, unpA_num_faces, unpA_num_faces);
-}
-
-template <bool is_fp32_dest_acc_en = false, bool srnd_fpu_en = false>
-inline void llk_unpack_A_hw_configure_disaggregated(const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) {
-
-    const llk_unpack_A_params_t unpack_A_params = {
-        .unpA_operand = unpA_operand
-    };
-    llk_unpack_A_hw_configure<is_fp32_dest_acc_en, srnd_fpu_en>(&unpack_A_params, within_face_16x16_transpose);
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>(
+        unpack_src_format,
+        unpack_src_format,
+        unpack_dst_format,
+        unpack_dst_format,
+        face_r_dim,
+        face_r_dim,
+        within_face_16x16_transpose,
+        num_faces,
+        num_faces);
 }
 
-template <BroadcastType BType = BroadcastType::NONE, bool acc_to_dest = false, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-inline void llk_unpack_A_init(const std::uint32_t transpose_of_faces=0, const std::uint32_t within_face_16x16_transpose=0, const std::uint32_t operand = 0) {
-
-    cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(within_face_16x16_transpose);
-
-    const std::uint32_t operand_id = get_operand_id(operand);
-
-    const std::uint32_t face_r_dim = get_face_r_dim(operand_id);
-
+template <BroadcastType BType = BroadcastType::NONE, bool acc_to_dest = false, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, bool unpack_to_dest = false>
+inline void _llk_unpack_A_init_(const std::uint32_t transpose_of_faces=0, const std::uint32_t within_face_16x16_transpose=0, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const std::uint32_t unpack_src_format = 0, const std::uint32_t unpack_dst_format = 0) {
     constexpr std::uint32_t UNP_SEL = (BType == BroadcastType::NONE) ? p_setadc::UNP_A : p_setadc::UNP_B;
     config_face_dim<false, UNP_SEL>(face_r_dim);
-    llk_unpack_A_mop_config<BType, acc_to_dest, binary_reuse_dest>(transpose_of_faces>0, operand_id);
+    _llk_unpack_A_mop_config_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(transpose_of_faces>0, num_faces, unpack_src_format, unpack_dst_format);
 }
 
-template <BroadcastType BType = BroadcastType::NONE, bool acc_to_dest = false, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-inline void llk_unpack_A(const std::uint32_t operand, const std::uint32_t tile_index, const bool transpose_of_faces = 0 /*not used*/) {
-    std::uint32_t input = get_operand_id(operand);
-    std::uint32_t base_address = cb_interface[input].fifo_rd_ptr;
-    std::uint32_t offset_address = cb_interface[input].fifo_page_size * tile_index;
-    // note: unpacker is programmed to automatically skip the tile header (+1)
-    // since there is no tile header, we need to -1 the address (in terms of 16B words), to offet unpacker's automatic +1
-    std::uint32_t address = base_address + offset_address - 1;
+template <BroadcastType BType = BroadcastType::NONE, bool acc_to_dest = false, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, bool unpack_to_dest = false>
+inline void _llk_unpack_A_(const std::uint32_t address, const bool transpose_of_faces = 0, const std::uint32_t unpack_src_format = 0, const std::uint32_t unpack_dst_format = 0) {
 
     // Clear z/w start counters
     TTI_SETADCZW(0b011, 0, 0, 0, 0, 0b1111);
@@ -206,13 +194,30 @@ inline void llk_unpack_A(const std::uint32_t operand, const std::uint32_t tile_i
         }
     }
 
+    if constexpr (unpack_to_dest) {
+        if (unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) {
+            set_dst_write_addr(unp_cfg_context, unpack_dst_format);
+            wait_for_dest_available();
+        }
+    }
+
     // Run MOP
     ckernel::ckernel_template::run(instrn_buffer);
 
     // T6::SEMGET for context release
     t6_semaphore_get(semaphore::UNPACK_SYNC);
 
+    if constexpr (unpack_to_dest) { // template parameter: resolved at compile time, like the guard before the MOP run
+        if (unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) {
+            unpack_to_dest_tile_done(unp_cfg_context);
+        }
+    }
+
     // Switch unpacker config context
     switch_config_context(unp_cfg_context);
 
+
+#ifdef PERF_DUMP
+    first_unpack_recorded = true;
+#endif
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h
index de144ad5aec..0f6d54f2909 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h
@@ -3,9 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_io_unpack.h"
-#include "llk_param_structs.h"
-
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
@@ -16,7 +13,7 @@ using namespace ckernel;
 using namespace ckernel::unpacker;
 
 template <BroadcastType BType = BroadcastType::NONE>
-inline void llk_unpack_AB_mop_config(const bool transpose_of_faces=false, const std::uint32_t operand_id=0) {
+inline void _llk_unpack_AB_mop_config_(const bool transpose_of_faces=false, const std::uint32_t num_faces=4, const bool narrow_tile=false) {
 #if SKIP_UNP == 1
     static constexpr uint unpack_srca = TT_OP_NOP;
     static constexpr uint unpack_srcb = TT_OP_NOP;
@@ -26,9 +23,6 @@ inline void llk_unpack_AB_mop_config(const bool transpose_of_faces=false, const
     static constexpr uint unpack_srcb =
         TT_OP_UNPACR(SrcB, 0b1, 0, 0, 0, 1, 1, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1);
 #endif
-    const uint32_t num_faces = get_num_faces(operand_id);
-    const bool narrow_tile = get_narrow_tile(operand_id); // if narrow tile read face 0 twice for row broadcast
-                                                          // or read face 0 and 1 for col broadcast
 
     if constexpr (BType == BroadcastType::COL) {
         static constexpr uint unpack_srcb_set_z = TT_OP_SETADCZW(0b010, 0, 0, 0, 2, 0b0001);
@@ -76,60 +70,35 @@ inline void llk_unpack_AB_mop_config(const bool transpose_of_faces=false, const
 
 }
 
-template <bool is_fp32_dest_acc_en = false, bool srnd_fpu_en = false>
-inline void llk_unpack_AB_hw_configure(const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) {
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void _llk_unpack_AB_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format,  const std::uint32_t face_r_dim = FACE_R_DIM,  const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) {
     constexpr bool is_row_pool = false;
-    // In0 -> unpA
-    // In1 -> unpB
-    const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpA_operand);
-    const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpB_operand);
-
-    // unpA -> srcA
-    // unpB -> srcB
-    const uint32_t num_faces = get_num_faces(unpA_operand_id);  // num faces in unpA and unpB are the same
-
-    const uint32_t face_r_dim = get_face_r_dim(unpA_operand_id); // face r dim in unpA and unpB are the same
-
-    configure_unpack_AB(unpA_operand_id, unpB_operand_id,
-                        face_r_dim, face_r_dim, is_row_pool, within_face_16x16_transpose, is_fp32_dest_acc_en, srnd_fpu_en, num_faces, num_faces);
-}
-
-template <bool is_fp32_dest_acc_en = false, bool srnd_fpu_en = false>
-inline void llk_unpack_AB_hw_configure_disaggregated(
-    const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0 ) {
-    const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand};
-    llk_unpack_AB_hw_configure<is_fp32_dest_acc_en, srnd_fpu_en>(&unpack_AB_params, within_face_16x16_transpose);
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>(
+        unpA_src_format,
+        unpB_src_format,
+        unpA_dst_format,
+        unpB_dst_format,
+        face_r_dim,
+        face_r_dim,
+        within_face_16x16_transpose,
+        num_faces,
+        num_faces);
 }
 
 template <BroadcastType BType = BroadcastType::NONE>
-inline void llk_unpack_AB_init(const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose=0, const std::uint32_t acc_to_dest=0) {
-    const uint32_t unpA_operand_id = get_operand_id(unpA_operand);
+inline void _llk_unpack_AB_init_(const std::uint32_t face_r_dim=FACE_R_DIM, const std::uint32_t num_faces=4, const bool narrow_tile=false, const std::uint32_t transpose=0, const std::uint32_t acc_to_dest=0) {
 
-    //Need to be able to configure tranpose srca for fused ops
     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(transpose); // transpose within the face
 
-    const uint32_t face_r_dim = get_face_r_dim(unpA_operand_id); // face r dim in unpA and unpB are the same
-
     constexpr std::uint32_t UNP_SEL = p_setadc::UNP_AB;
     config_face_dim<false, UNP_SEL>(face_r_dim);
 
-    llk_unpack_AB_mop_config<BType>(transpose>0, unpA_operand_id); // transpose of faces 0,2,1,3
+    _llk_unpack_AB_mop_config_<BType>(transpose>0, num_faces, narrow_tile); // transpose of faces 0,2,1,3
 }
 
 template <BroadcastType BType = BroadcastType::NONE>
-inline void llk_unpack_AB(
-    const std::uint32_t operandA, const std::uint32_t operandB, const std::uint32_t tile_index_a, const std::uint32_t tile_index_b, const bool transpose_of_faces = 0 /*not used*/) {
-    std::uint32_t inputA = get_operand_id(operandA);
-    std::uint32_t inputB = get_operand_id(operandB);
-    std::uint32_t base_address_a = cb_interface[inputA].fifo_rd_ptr;
-    std::uint32_t offset_address_a = cb_interface[inputA].fifo_page_size * tile_index_a;
-    std::uint32_t base_address_b = cb_interface[inputB].fifo_rd_ptr;
-    std::uint32_t offset_address_b = cb_interface[inputB].fifo_page_size * tile_index_b;
-
-    // note: unpacker is programmed to automatically skip the tile header (+1)
-    // since there is no tile header, we need to -1 the address (in terms of 16B words), to offet unpacker's automatic +1
-    std::uint32_t address_a = base_address_a + offset_address_a - 1;
-    std::uint32_t address_b = base_address_b + offset_address_b - 1;
+inline void _llk_unpack_AB_(
+    const std::uint32_t address_a, const std::uint32_t address_b, const bool transpose_of_faces = 0 /*not used*/) {
 
     TTI_SETADCZW(0b011, 0, 0, 0, 0, 0b1111); // reset counters
 
@@ -159,4 +128,8 @@ inline void llk_unpack_AB(
 
     // Switch unpacker config context
     switch_config_context(unp_cfg_context);
+
+#ifdef PERF_DUMP
+    first_unpack_recorded = true;
+#endif
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h
index 0b432d532c5..4578126b9e8 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h
@@ -3,9 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_io_unpack.h"
-#include "llk_param_structs.h"
-
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
@@ -16,7 +13,7 @@ using namespace ckernel;
 using namespace ckernel::unpacker;
 
 // transpose is unused, math is adjusted to take into account srca face layout when transpose=true
-inline void llk_unpack_AB_matmul_mop_config(const bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const bool partial_face) {
+inline void _llk_unpack_AB_matmul_mop_config_(const bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const bool partial_face) {
     // in0 - loaded to SrcB
     // in1 - loaded to SrcA
 
@@ -93,26 +90,21 @@ inline void llk_unpack_AB_matmul_mop_config(const bool transpose, const std::uin
 
 }
 
-template<bool is_fp32_dest_acc_en = false, bool srnd_fpu_en = false>
-inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) {
-    constexpr bool is_row_pool = false;
-    const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca;
-
-    // In0 -> unpB
-    // In1 -> unpA
-    const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpB_operand);
-    const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpA_operand);
-
-    // unpA -> srcA
-    // unpB -> srcB
-    const uint32_t unpA_num_faces = get_num_faces(unpA_operand_id);
-    const uint32_t unpB_num_faces = get_num_faces(unpB_operand_id);
+template<bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void _llk_unpack_AB_matmul_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format,  const std::uint32_t unpA_face_r_dim = FACE_R_DIM, const std::uint32_t unpB_face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t unpA_num_faces = 4, const std::uint32_t unpB_num_faces = 4, const std::uint32_t unpA_tile_size = 0, const std::uint32_t unpB_tile_size = 0) {
 
-    const uint32_t unpA_face_r_dim = get_face_r_dim(unpA_operand_id);
-    const uint32_t unpB_face_r_dim = get_face_r_dim(unpB_operand_id);
+    constexpr bool is_row_pool = false;
 
-    configure_unpack_AB(unpA_operand_id, unpB_operand_id,
-                        unpA_face_r_dim, unpB_face_r_dim, is_row_pool, transpose_xy_srca, is_fp32_dest_acc_en, srnd_fpu_en, unpA_num_faces, unpB_num_faces);
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>(
+        unpA_src_format,
+        unpB_src_format,
+        unpA_dst_format,
+        unpB_dst_format,
+        unpA_face_r_dim,
+        unpB_face_r_dim,
+        within_face_16x16_transpose,
+        unpA_num_faces,
+        unpB_num_faces);
 
     // Configure tile size in datums
     const uint32_t unpA_x_end = unpA_num_faces*unpA_face_r_dim*FACE_C_DIM-1;
@@ -120,35 +112,14 @@ inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_
     TT_SETADCXX(p_setadc::UNP_A, unpA_x_end, 0x0);
     TT_SETADCXX(p_setadc::UNP_B, unpB_x_end, 0x0);
 
-    std::uint32_t inputA = get_operand_id(unpack_AB_params->unpB_operand);
-    std::uint32_t inputB = get_operand_id(unpack_AB_params->unpA_operand);
-    regfile[p_gpr_unpack::TILE_SIZE_A] = cb_interface[inputA].fifo_page_size;
-    regfile[p_gpr_unpack::TILE_SIZE_B] = cb_interface[inputB].fifo_page_size;
+    regfile[p_gpr_unpack::TILE_SIZE_A] = unpA_tile_size;
+    regfile[p_gpr_unpack::TILE_SIZE_B] = unpB_tile_size;
     sync_regfile_write(p_gpr_unpack::TILE_SIZE_B);
 }
 
-template<bool is_fp32_dest_acc_en = false, bool srnd_fpu_en = false>
-inline void llk_unpack_AB_matmul_hw_configure_disaggregated(
-    const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) {
-    const llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = {
-        .unpA_operand = unpA_operand, .unpB_operand = unpB_operand, .transpose_xy_srca = transpose_xy_srca };
-    llk_unpack_AB_matmul_hw_configure<is_fp32_dest_acc_en, srnd_fpu_en>(&unpack_AB_matmul_params);
-}
-
-inline void llk_unpack_AB_matmul_init(const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose=0, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) {
-    // In0 -> srcB (supports partial face)
-    // In1 -> srcA
-    const uint32_t unpA_operand_id = get_operand_id(unpB_operand);
-    const uint32_t unpB_operand_id = get_operand_id(unpA_operand);
-
-    const uint32_t unpA_face_r_dim = get_face_r_dim(unpA_operand_id);
-    const uint32_t unpB_face_r_dim = get_face_r_dim(unpB_operand_id);
+__attribute__((always_inline)) inline void _llk_unpack_AB_matmul_init_(const std::uint32_t transpose=0, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1, const std::uint32_t unpA_face_r_dim=FACE_R_DIM, const std::uint32_t unpB_face_r_dim=FACE_R_DIM, const std::uint32_t unpA_num_faces=4, const std::uint32_t unpB_num_faces=4, const bool partial_face=false) {
 
     const bool reuse_a = ct_dim >= rt_dim;
-    const bool partial_face = get_partial_face(unpB_operand_id);
-
-    const uint32_t unpA_num_faces = get_num_faces(unpA_operand_id);
-    const uint32_t unpB_num_faces = partial_face ? 1 : get_num_faces(unpB_operand_id); // if partial face -> unpack face by face
 
     // also turn on within_face_16x16_transpose if it was turned off by datacopy at runtime
     // on WH, the unpacker performs both transpose of faces as well as transpose each face.
@@ -175,50 +146,33 @@ inline void llk_unpack_AB_matmul_init(const std::uint32_t unpA_operand, const st
 
     TT_SETDMAREG(0, LOWER_HALFWORD(kt_dim), 0, LO_16(p_gpr_unpack::KT_DIM)); // store kt_dim to gpr for scaling tile size
 
-    llk_unpack_AB_matmul_mop_config(transpose != 0, ct_dim, rt_dim, kt_dim, partial_face);
+    _llk_unpack_AB_matmul_mop_config_(transpose != 0, ct_dim, rt_dim, kt_dim, partial_face);
 }
 
-inline void llk_unpack_AB_matmul(
-    const std::uint32_t operandA, const std::uint32_t operandB, const std::uint32_t tile_index_a, const std::uint32_t tile_index_b, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) {
+inline void _llk_unpack_AB_matmul_(
+    const std::uint32_t base_address_a, const std::uint32_t base_address_b, const std::uint32_t tile_index_a, const std::uint32_t tile_index_b, const std::uint32_t tile_size_a, const std::uint32_t tile_size_b, const std::uint32_t unpA_face_r_dim=FACE_R_DIM, const std::uint32_t unpB_face_r_dim=FACE_R_DIM, const bool partial_face=false, std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) {
     // In0/InA -> srcB (supports partial face)
     // In1/InB -> srcA
 
-    std::uint32_t inputA = get_operand_id(operandA);
-    std::uint32_t inputB = get_operand_id(operandB);
-    std::uint32_t base_address_a = cb_interface[inputA].fifo_rd_ptr;
-    std::uint32_t base_address_b = cb_interface[inputB].fifo_rd_ptr;
     volatile uint *cfg = get_cfg_pointer();  // get pointer to registers for current state ID
 
-    const std::uint32_t unpA_face_r_dim = get_face_r_dim(inputB); // In1/InB -> srcA
-    const std::uint32_t unpB_face_r_dim = get_face_r_dim(inputA); // In0/InA -> srcB
-
     const bool reuse_a = ct_dim >= rt_dim;
     const std::uint32_t t_dim = reuse_a ? rt_dim : ct_dim;
 
-    const bool partial_face = get_partial_face(inputA);
-
-
     if (!reuse_a) {
         TTI_MULDMAREG(0, p_gpr_unpack::TMP_LO, p_gpr_unpack::TILE_SIZE_B, p_gpr_unpack::KT_DIM);
     }
 
     for (uint t = 0; t < t_dim; t++) {
 
-        std::uint32_t cur_tile_index_a = tile_index_a + (reuse_a ? (t*kt_dim) : (0));
-        std::uint32_t cur_tile_index_b = tile_index_b + (reuse_a ? (0       ) : (t));
-        std::uint32_t next_tile_index_a = tile_index_a + (reuse_a ? ((t+1)*kt_dim) : (0));
-        std::uint32_t next_tile_index_b = tile_index_b + (reuse_a ? (0       ) : (t+1));
-        std::uint32_t offset_address_a = cb_interface[inputA].fifo_page_size * cur_tile_index_a;
-        std::uint32_t offset_address_b = cb_interface[inputB].fifo_page_size * cur_tile_index_b;
-        std::uint32_t next_offset_address_a =cb_interface[inputA].fifo_page_size * next_tile_index_a;
-        std::uint32_t next_offset_address_b = cb_interface[inputB].fifo_page_size * next_tile_index_b;
-        // note: unpacker is programmed to automatically skip the tile header (+1)
-        // since there is no tile header, we need to -1 the address (in terms of 16B words), to offet unpacker's automatic +1
-        std::uint32_t address_a = base_address_a + offset_address_a - 1;
-        std::uint32_t address_b = base_address_b + offset_address_b - 1;
-        std::uint32_t next_address_a = base_address_a + next_offset_address_a - 1;
-        std::uint32_t next_address_b = base_address_b + next_offset_address_b - 1;
-
+        std::uint32_t offset_address_a =tile_size_a*(tile_index_a + (reuse_a ? (t*kt_dim) : (0)));
+        std::uint32_t next_offset_address_a = tile_size_a*(tile_index_a + (reuse_a ? ((t+1)*kt_dim) : (0)));
+        std::uint32_t offset_address_b = tile_size_b*(tile_index_b + (reuse_a ? (0       ) : (t)));
+        std::uint32_t next_offset_address_b = tile_size_b*(tile_index_b + (reuse_a ? (0       ) : (t+1)));
+        std::uint32_t address_a = base_address_a + offset_address_a;
+        std::uint32_t next_address_a = base_address_a + next_offset_address_a;
+        std::uint32_t address_b = base_address_b + offset_address_b;
+        std::uint32_t next_address_b = base_address_b + next_offset_address_b;
 
         // Wait for free context
         wait_for_next_context(2);
@@ -298,4 +252,9 @@ inline void llk_unpack_AB_matmul(
         switch_config_context(unp_cfg_context);
     }
 
+
+    #ifdef PERF_DUMP
+        first_unpack_recorded = true;
+    #endif
+
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h
index 0f5ba7d9b52..92222ddaaa3 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h
@@ -6,10 +6,8 @@
 
 #include "ckernel.h"
 #include "ckernel_defs.h"
-#include "debug/fw_debug.h"
+#include "fw_debug.h"
 #include "cunpack_common.h"
-#include "llk_param_structs.h"
-#include "llk_io_unpack.h"
 
 #ifdef PERF_DUMP
 #include "ckernel_perf_api.h"
@@ -18,17 +16,15 @@
 using namespace ckernel;
 using namespace ckernel::unpacker;
 
-void llk_zero_operand(std::uint32_t operand) {
-    std::uint32_t input = get_operand_id(operand);
+void _llk_zero_buffer_(const std::uint32_t base_address, const std::uint32_t size) {
 
     TT_SETDMAREG(0, 0, 0, LO_16(p_gpr_unpack::OPERAND_OFFSET_ADDR));
     TT_SETDMAREG(0, 0, 0, HI_16(p_gpr_unpack::OPERAND_OFFSET_ADDR));
 
-    std::uint32_t fifo_base_addr = cb_interface[input].fifo_limit - cb_interface[input].fifo_size;
-    TT_SETDMAREG(0, LOWER_HALFWORD(fifo_base_addr), 0, LO_16(p_gpr_unpack::p_gpr_unpack::OPERAND_BASE_ADDR));
-    TT_SETDMAREG(0, UPPER_HALFWORD(fifo_base_addr), 0, HI_16(p_gpr_unpack::p_gpr_unpack::OPERAND_BASE_ADDR));
+    TT_SETDMAREG(0, LOWER_HALFWORD(base_address), 0, LO_16(p_gpr_unpack::p_gpr_unpack::OPERAND_BASE_ADDR));
+    TT_SETDMAREG(0, UPPER_HALFWORD(base_address), 0, HI_16(p_gpr_unpack::p_gpr_unpack::OPERAND_BASE_ADDR));
 
-    for (std::uint32_t i = 0; i < cb_interface[input].fifo_size; i++) {
+    for (std::uint32_t i = 0; i < size; i++) {
         TTI_STOREIND(
             1,
             0,
@@ -40,65 +36,69 @@ void llk_zero_operand(std::uint32_t operand) {
     }
 }
 
-inline void llk_unpack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {
-    debug_dump(data, byte_size);
-}
+template <bool mail2math=true, bool mail2pack=true> // each flag selects one recipient thread (math / pack)
+inline void _llk_unpack_get_tile_(std::uint32_t address, std::uint32_t *p_tile) { // sends the tile address to the selected threads and stores it in *p_tile
+    std::uint32_t byte_address = (address + TILE_HEADER_SIZE)<<4; // skip the header, then x16 (presumably 16B words -> bytes; confirm)
 
-inline void llk_unpack_debug_dump_seek(std::uint8_t offset) {
-    debug_dump_seek(offset);
-}
+    if constexpr (mail2math) { // compiled in only when the math thread should be notified
+       mailbox_write(ThreadId::MathThreadId, byte_address);
+       semaphore_post(semaphore::UNPACK_OPERAND_SYNC); // one post per recipient
+    }
 
-inline void llk_unpack_reconfig_data_format_srca_impl(std::uint32_t srca_operand_id)
-{
-    cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32, 0, 0x0f>(unpack_src_format[srca_operand_id]);
-    cfg_reg_rmw_tensix<THCON_SEC0_REG2_Out_data_format_RMW>(unpack_dst_format[srca_operand_id]);
-    TT_SETDMAREG(0, LOWER_HALFWORD(cb_interface[srca_operand_id].fifo_page_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_A)); // update gpr which holds tile size A
-}
+    if constexpr (mail2pack) { // compiled in only when the pack thread should be notified
+       mailbox_write(ThreadId::PackThreadId, byte_address);
+       semaphore_post(semaphore::UNPACK_OPERAND_SYNC); // one post per recipient
+    }
 
-inline void llk_unpack_reconfig_data_format_srcb_impl(std::uint32_t srcb_operand_id)
-{
-    cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 0, 0x0f>(unpack_src_format[srcb_operand_id]);
-    cfg_reg_rmw_tensix<THCON_SEC1_REG2_Out_data_format_RMW>(unpack_dst_format[srcb_operand_id]);
-    TT_SETDMAREG(0, LOWER_HALFWORD(cb_interface[srcb_operand_id].fifo_page_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_B)); // update gpr which holds tile size B
+    *p_tile = byte_address; // the caller receives the same value that was mailed out
 }
 
-inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_operand) {
-    llk_unpack_reconfig_data_format_srca_impl(get_operand_id(srca_new_operand));
+template <bool mail2math=true, bool mail2pack=true> // NOTE(review): neither flag is referenced in the body
+inline void _llk_unpack_release_tile_() { // blocks until UNPACK_OPERAND_SYNC reads zero
+    while (semaphore_read(semaphore::UNPACK_OPERAND_SYNC) > 0); // intentional empty-body spin
 }
 
-inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) {
-    llk_unpack_reconfig_data_format_srcb_impl(get_operand_id(srcb_new_operand));
+inline void _llk_unpack_debug_dump_(std::uint8_t *data, std::uint32_t byte_size) { // thin wrapper: forwards both arguments unchanged
+    debug_dump(data, byte_size); // NOTE(review): presumably declared in fw_debug.h; confirm
 }
 
-inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) {
-    std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
-    std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
-
-    if((unpack_src_format[old_srca_operand_id] != unpack_src_format[new_srca_operand_id])) {
-        llk_unpack_reconfig_data_format_srca_impl(new_srca_operand_id);
-    }
+inline void _llk_unpack_debug_dump_seek_(std::uint8_t offset) { // thin wrapper: forwards offset unchanged
+    debug_dump_seek(offset); // NOTE(review): presumably declared in fw_debug.h; confirm
 }
 
-inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
-    std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
-    std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+template <bool is_tile_dim_reconfig_en = false> // true: also rewrite the SEC0 face count and per-face x dim
+inline void _llk_unpack_reconfig_data_format_srca_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) // rewrites the THCON_SEC0 formats and the TILE_SIZE_A gpr
+{
+    if constexpr(is_tile_dim_reconfig_en) { // face_r_dim / num_faces are only read inside this branch
+        const uint face_dim = face_r_dim*FACE_C_DIM; // datums per face (rows x columns)
 
-    if((unpack_src_format[old_srcb_operand_id] != unpack_src_format[new_srcb_operand_id])) {
-        llk_unpack_reconfig_data_format_srcb_impl(new_srcb_operand_id);
+        cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32+1, 16, 0xffff0000>(num_faces); // second tile-descriptor word
+        cfg_reg_rmw_tensix<THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32, 0, 0xffffffff>(face_dim | face_dim << 16); // same face_dim in both 16-bit halves
     }
+    cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32, 0, 0x0f>(unpack_src_format); // input (L1) data format
+    cfg_reg_rmw_tensix<THCON_SEC0_REG2_Out_data_format_RMW>(unpack_dst_format); // output data format
+    TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_A)); // update gpr which holds tile size A
 }
 
-inline void llk_unpack_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) {
-    llk_unpack_reconfig_data_format_srca(srca_new_operand);
-    llk_unpack_reconfig_data_format_srcb(srcb_new_operand);
-}
+template <bool is_tile_dim_reconfig_en = false> // true: also rewrite the SEC1 face size and face count
+inline void _llk_unpack_reconfig_data_format_srcb_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) // rewrites the THCON_SEC1 formats and the TILE_SIZE_B gpr
+{
+    if constexpr(is_tile_dim_reconfig_en) { // face_r_dim / num_faces are only read inside this branch
+        const uint face_dim = face_r_dim*FACE_C_DIM; // datums per face (rows x columns)
 
-inline void llk_unpack_reconfig_data_format(const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand, const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
-    llk_unpack_reconfig_data_format_srca(srca_old_operand, srca_new_operand);
-    llk_unpack_reconfig_data_format_srcb(srcb_old_operand, srcb_new_operand);
+        cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 16, 0xffff0000>(face_dim); // first tile-descriptor word, reusing the local
+        cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32+1, 16, 0xffff0000>(num_faces); // second tile-descriptor word
+    }
+    cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 0, 0x0f>(unpack_src_format); // input (L1) data format
+    cfg_reg_rmw_tensix<THCON_SEC1_REG2_Out_data_format_RMW>(unpack_dst_format); // output data format
+    TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_B)); // update gpr which holds tile size B
 }
 
-inline void llk_unpack_dbg_feature_disable(){
+inline void _llk_unpack_dbg_feature_disable_(){ // single register write; see the workaround note on the statement below
     reg_write(RISCV_DEBUG_REG_DBG_FEATURE_DISABLE, 1<<11); // Set debug feature disable bit 11
                                                            // workaround for bug https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1372
 }
+
+inline void _llk_enable_int8_fpu_math_() { // thin wrapper with no arguments and no return value
+    enalbe_int8_fpu_math(); // NOTE(review): "enalbe" spelling presumably matches the callee's definition elsewhere; rename both together
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h
index b59b552c7da..8f0ea52e4fa 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h
@@ -3,9 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_io_unpack.h"
-#include "llk_param_structs.h"
-
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
@@ -16,7 +13,7 @@ using namespace ckernel;
 using namespace ckernel::unpacker;
 
 template <PoolType type, ReduceDim dim>
-inline void llk_unpack_reduce_mop_config() {
+inline void _llk_unpack_reduce_mop_config_() {
 #if SKIP_UNP == 1
     static constexpr uint unpack_srca = TT_OP_NOP;
 #else
@@ -43,75 +40,37 @@ inline void llk_unpack_reduce_mop_config() {
     tmp.program(instrn_buffer);
 }
 
-template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, bool srnd_fpu_en = false>
-inline void llk_unpack_reduce_hw_configure(
-    const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) {
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None> // both are forwarded as template arguments to configure_unpack_AB
+inline void _llk_unpack_reduce_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format,  const std::uint32_t unpA_face_r_dim = FACE_R_DIM, const std::uint32_t unpB_face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t unpA_num_faces = 4, const std::uint32_t unpB_num_faces = 4) { // reduce variant: fixes is_row_pool to true and delegates everything else
 
-    constexpr uint32_t srca_height = 16;
-    constexpr uint32_t srcb_height = 16;
     constexpr bool is_row_pool = true;
-    constexpr bool transpose_xy_per_face = (ReduceDim::REDUCE_ROW == dim);
-
-    configure_unpack_AB(
-        get_operand_id(unpack_reduce_params->unpA_operand),
-        get_operand_id(unpack_reduce_params->unpA_operand),
-        srca_height,
-        srcb_height,
-        is_row_pool,
-        transpose_xy_per_face,
-        is_fp32_dest_acc_en,
-        srnd_fpu_en);
-
-    if constexpr (type != PoolType::MAX) {
-        union {
-            float f;
-            uint32_t u;
-        } f2u = {.f = const_mult};
-
-        for (uint i = 0; i < 16; i++) l1_buffer[i] = f2u.u;  // Load const into L1 buffer
-    }
-}
 
-template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en=false, bool srnd_fpu_en = false>
-inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) {
-    const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand};
-    llk_unpack_reduce_hw_configure<type, dim, is_fp32_dest_acc_en, srnd_fpu_en>(&unpack_reduce_params, mult);
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>( // runtime arguments are passed through in declaration order
+        unpA_src_format,
+        unpB_src_format,
+        unpA_dst_format,
+        unpB_dst_format,
+        unpA_face_r_dim,
+        unpB_face_r_dim,
+        within_face_16x16_transpose,
+        unpA_num_faces,
+        unpB_num_faces);
 }
 
 template <PoolType type, ReduceDim dim>
-inline void llk_unpack_reduce_init(const std::uint32_t within_face_16x16_transpose=0) {
-    llk_unpack_reduce_mop_config<type, dim>();
-    volatile uint tt_reg_ptr *cfg = get_cfg_pointer();  // get pointer to registers for current state ID
-
-    uint unpack_src_df  = (uint) DataFormat::Float32;
-
-    uint unpack_dst_df = (((uint)unpack_dst_format[0]>>2)&0x1) ? (uint) DataFormat::Float16_b : (uint) DataFormat::Float16;
-
-    cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG1_SrcB_RMW>(unpack_dst_df);
+inline void _llk_unpack_reduce_init_(const std::uint32_t within_face_16x16_transpose=0) { // sets the transpose mode and x-end counter, then programs the reduce MOP
 
     // REDUCE_ROW requires transpose itself; additionaly, within_face_16x16_transpose flag could require transpose;
     // if we have the flag set with REDUCE_ROW, we don't need to do anything
     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(ReduceDim::REDUCE_ROW == dim ? !within_face_16x16_transpose : within_face_16x16_transpose);
 
-    TTI_SETADCXX(0b11, FACE_WIDTH*FACE_HEIGHT-1, 0x0);
+    TTI_SETADCXX(0b11, FACE_R_DIM*FACE_C_DIM-1, 0x0); // x end = datums in one default-size face, minus 1; mask 0b11 presumably selects both unpackers (confirm)
 
-    cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 0, 0xf>(unpack_src_df);
-    cfg_reg_rmw_tensix<THCON_SEC1_REG2_Out_data_format_RMW>(unpack_dst_df);
-
-    TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_address_ADDR32);
-    TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_cntx1_address_ADDR32);
-    TTI_NOP; TTI_NOP;
+    _llk_unpack_reduce_mop_config_<type, dim>(); // MOP is programmed last, after the register setup above
 }
 
 template <PoolType type, ReduceDim dim>
-inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t tile_index) {
-    std::uint32_t input = get_operand_id(operand);
-    std::uint32_t base_address = cb_interface[input].fifo_rd_ptr;
-    std::uint32_t offset_address = cb_interface[input].fifo_page_size * tile_index;
-    // note: unpacker is programmed to automatically skip the tile header (+1)
-    // since there is no tile header, we need to -1 the address (in terms of 16B words), to offet unpacker's automatic +1
-    std::uint32_t address = base_address + offset_address - 1;
-
+inline void _llk_unpack_reduce_(const std::uint32_t address) {
     // Clear z/w start counters
     TTI_SETADCZW(0b011, 0, 0, 0, 0, 0b1111);
 
@@ -122,7 +81,7 @@ inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t t
     wait_for_next_context(2);
 
     // Load only 16 datums into srcB
-    TTI_SETADCXX(p_setadc::UNP1, DATUMS_PER_ROW-1, 0x0);
+    TTI_SETADCXX(p_setadc::UNP1, FACE_C_DIM-1, 0x0);
 
     // Trisc::SEMPOST for context acquire
     semaphore_post(semaphore::UNPACK_SYNC);
@@ -138,7 +97,7 @@ inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t t
     mop_run(0, 4);
 
     // Restore face height
-    TTI_SETADCXX(p_setadc::UNP1, FACE_HEIGHT*16-1, 0x0);
+    TTI_SETADCXX(p_setadc::UNP1, FACE_R_DIM*FACE_C_DIM-1, 0x0);
 
     // T6::SEMGET for context release
     t6_semaphore_get(semaphore::UNPACK_SYNC);
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h
index adede517c2e..ae1b22d830e 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h
@@ -3,9 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_io_unpack.h"
-#include "llk_param_structs.h"
-
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
@@ -15,7 +12,7 @@
 using namespace ckernel;
 using namespace ckernel::unpacker;
 
-inline void llk_unpack_tilize_mop_config(const std::uint32_t operand_id) {
+inline void _llk_unpack_tilize_mop_config_(const bool narrow_tile=false) {
     #if SKIP_UNP == 1
         static constexpr uint unpack_srca = TT_OP_NOP;
         static constexpr uint unpack_srcb_zerosrc = TT_OP_NOP;
@@ -26,91 +23,69 @@ inline void llk_unpack_tilize_mop_config(const std::uint32_t operand_id) {
         static constexpr uint unpack_srcb_set_dvalid = TT_OP_UNPACR_NOP(SrcB, p_unpacr_nop::UNP_SET_DVALID); //WA for https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1230
     #endif
 
-    const uint32_t outerloop = get_narrow_tile(operand_id) ? 1 : 2;
+    const uint32_t outerloop = narrow_tile ? 1 : 2;
     constexpr uint32_t innerloop = 1;
     ckernel_template tmp(outerloop, innerloop, unpack_srcb_zerosrc, unpack_srcb_set_dvalid);
     tmp.set_start_op(unpack_srca);
     tmp.program(instrn_buffer);
 }
 
-template <bool is_fp32_dest_acc_en = false>
-inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) {
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None> // both are forwarded as template arguments to configure_unpack_AB
+inline void _llk_unpack_tilize_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM,  const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) { // tilize variant: one operand description is applied to both A and B slots
 
     constexpr bool is_row_pool = false;
-    constexpr bool transpose_xy_srca = false;
-    constexpr bool srnd_fpu_en = false;
-
-    const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand);
-    const uint32_t unpA_num_faces = get_num_faces(unpA_operand_id);
-    const uint32_t unpA_face_r_dim = get_face_r_dim(unpA_operand_id);
-    configure_unpack_AB(unpA_operand_id, unpA_operand_id, unpA_face_r_dim, unpA_face_r_dim, is_row_pool, transpose_xy_srca, is_fp32_dest_acc_en, srnd_fpu_en, unpA_num_faces, unpA_num_faces);
-}
 
-template <bool is_fp32_dest_acc_en = false>
-inline void llk_unpack_tilize_hw_configure_disaggregated(
-    const std::uint32_t unpA_operand, const std::uint32_t unpA_block_ct_dim) {
-    const llk_unpack_A_params_t unpack_tilize_params = {
-        .unpA_operand = unpA_operand
-    };
-    llk_unpack_tilize_hw_configure<is_fp32_dest_acc_en>(&unpack_tilize_params);
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>( // every per-operand value is passed twice, once per slot
+        unpack_src_format,
+        unpack_src_format,
+        unpack_dst_format,
+        unpack_dst_format,
+        face_r_dim,
+        face_r_dim,
+        within_face_16x16_transpose,
+        num_faces,
+        num_faces);
 }
 
-inline void llk_unpack_tilize_init(const std::uint32_t operand = 0, const std::uint32_t ct_dim = 0) {
+inline void _llk_unpack_tilize_init_(const std::uint32_t unpack_src_format=0, const std::uint32_t unpack_dst_format=0, const std::uint32_t ct_dim=0, const std::uint32_t face_r_dim=FACE_R_DIM, const bool narrow_tile=false) { // enables tilize mode in the SEC0 unpack config; the previous config is not saved here
     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(0);
 
-    const std::uint32_t operand_id = get_operand_id(operand);
-    const std::uint32_t face_r_dim = get_face_r_dim(operand_id);
-
-    const std::uint32_t block_c_dim = ct_dim * (get_narrow_tile(operand_id) ? FACE_C_DIM : TILE_C_DIM);
+    const std::uint32_t block_c_dim = ct_dim * (narrow_tile ? FACE_C_DIM : TILE_C_DIM); // block width in datums: ct_dim tiles of FACE_C_DIM (narrow) or TILE_C_DIM columns
 
     // Set face dim
     TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0);
 
-    // Save state of unpacker config for quick restore
-    TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0, THCON_SEC0_REG2_Out_data_format_ADDR32); // Save unpack config[0]
-    TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context
-
     // Override default settings to enable tilize mode
     unpack_config_u config = {0};
-    config.f.out_data_format = (uint)unpack_dst_format[operand_id];
+    config.f.out_data_format = unpack_dst_format; // format value is supplied directly by the caller
     config.f.throttle_mode = 2;
     config.f.tileize_mode = 1;
-    config.f.shift_amount = (SCALE_DATUM_SIZE((uint)unpack_src_format[operand_id], block_c_dim)) >> 4;
+    config.f.shift_amount = (SCALE_DATUM_SIZE(unpack_src_format, block_c_dim)) >> 4; // scaled block width / 16 (presumably bytes -> 16B words; confirm)
 
     TT_SETDMAREG(0, LOWER_HALFWORD(config.val[0]), 0, LO_16(p_gpr_unpack::TMP0));
     TT_SETDMAREG(0, UPPER_HALFWORD(config.val[0]), 0, HI_16(p_gpr_unpack::TMP0));
     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::TMP0); // Load unpack config[0]
     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_1x16); //GPR preloaded with  16 | (16 << 16)
 
-    llk_unpack_tilize_mop_config(operand_id);
+    _llk_unpack_tilize_mop_config_(narrow_tile); // MOP is programmed last, after the config registers are loaded
 }
 
-inline void llk_unpack_tilize_uninit(const std::uint32_t face_r_dim = FACE_R_DIM) {
-    TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0);
-    TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0); // Restore unpack config[0]
-    TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32,  p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1); // Restore tile x dim per context
-}
-
-inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t block_ct_dim) {
-    std::uint32_t operand_id = get_operand_id(operand);
-    std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;  // Remove header size added by descriptor
-    const std::uint32_t face_r_dim = get_face_r_dim(operand_id);
-    const std::uint32_t num_faces = get_num_faces(operand_id);
+inline void _llk_unpack_tilize_(const std::uint32_t base_address, const std::uint32_t tile_index, std::uint32_t unpack_src_format=0, std::uint32_t block_ct_dim=0, const std::uint32_t face_r_dim=FACE_R_DIM, const std::uint32_t num_faces=4, const bool narrow_tile=false) {
     volatile uint tt_reg_ptr *cfg = get_cfg_pointer();  // get pointer to registers for current state ID
 
 
-    std::uint32_t top_face_offset_address = SCALE_DATUM_SIZE((uint)unpack_src_format[operand_id], tile_index) << (get_narrow_tile(operand_id) ? 0 : 1);
+    std::uint32_t top_face_offset_address = SCALE_DATUM_SIZE(unpack_src_format, tile_index) << (narrow_tile ? 0 : 1);
                                                     // Each iteration unpacks 2 face_r_dimx16 faces (1st 0,1 2nd 2,3 unless tile is <=16x32)
                                                     // For narrow tile we unpack 1 face in each iteration
                                                     // Offset address is in 16B words
                                                     // Datum count = tile_index*face_r_dim (/16 to get word count)
 
-    const std::uint32_t block_c_dim_16B = block_ct_dim * (get_narrow_tile(operand_id) ? FACE_C_DIM/16 : TILE_C_DIM/16);
+    const std::uint32_t block_c_dim_16B = block_ct_dim * (narrow_tile ? FACE_C_DIM/16 : TILE_C_DIM/16);
     std::uint32_t bot_face_offset_address =
-        SCALE_DATUM_SIZE((uint)unpack_src_format[operand_id], face_r_dim*block_c_dim_16B);  //*N rows / 16 to get 16B word aligned address
+        SCALE_DATUM_SIZE(unpack_src_format, face_r_dim*block_c_dim_16B);  //*N rows / 16 to get 16B word aligned address
 
     // Program srcA and srcB base addresses
-    std::uint32_t num_loops = get_narrow_tile(operand_id) ? 2 : num_faces/2;
+    std::uint32_t num_loops = narrow_tile ? 2 : num_faces/2;
 
     for (std::uint32_t n = 0; n < num_loops; n++) {
         std::uint32_t address = base_address + top_face_offset_address + ((n == 1) ? bot_face_offset_address : 0);
@@ -145,10 +120,3 @@ inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, s
     first_unpack_recorded = true;
 #endif
 }
-
-inline void llk_unpack_tilize_block(std::uint32_t operand, std::uint32_t block_c_tiles) {
-    std::uint32_t input = get_operand_id(operand);
-    for (std::uint32_t tile_index = 0; tile_index < block_c_tiles; tile_index++) {
-        llk_unpack_tilize(operand, tile_index, block_c_tiles);
-    }
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h
index d6d04cd55d6..723f9716c88 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h
@@ -3,9 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "llk_io_unpack.h"
-#include "llk_param_structs.h"
-
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
@@ -19,18 +16,24 @@ using namespace ckernel::unpacker;
     #define SKIP_UNP (0)
 #endif
 
-inline void llk_unpack_untilize_mop_config() {
+inline void _llk_unpack_untilize_mop_config_() {
 
-    constexpr uint replay_buf_len = 5;
+    constexpr uint replay_buf_len = (SKIP_UNP == 1) ? 1 : 5;
     TTI_REPLAY(0, replay_buf_len, 0, 1);
-
+#if SKIP_UNP == 1
+    TTI_NOP;
+    static constexpr uint load_offset_addr_cntx0 = TT_OP_NOP;
+    static constexpr uint load_offset_addr_cntx1 = TT_OP_NOP;
+#else
     TTI_DMANOP; // REG2FLOP that sets offset in previous loop needs additional cycle to complete
     TTI_UNPACR(SrcA, 0b01000001, 0, 0, 0, 1, 0, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1);
     TTI_UNPACR(SrcA, 0b01000001, 0, 0, 0, 1, 0, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1);
     TTI_ADDDMAREG(0, p_gpr_unpack::TILE_OFFSET, p_gpr_unpack::TILE_OFFSET, p_gpr_unpack::TILE_SIZE);
     TTI_ADDRCRZW(0b001, 0, 0, 0, 0, 0b0001);
+
     static constexpr uint load_offset_addr_cntx0 = TT_OP_REG2FLOP(1, 0, 0, 0, THCON_SEC0_REG7_Offset_address_ADDR32 - THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::TILE_OFFSET);
     static constexpr uint load_offset_addr_cntx1 = TT_OP_REG2FLOP(1, 0, 0, 0, THCON_SEC0_REG7_Offset_cntx1_address_ADDR32 - THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::TILE_OFFSET);
+#endif
 
     ckernel_unpack_template tmp = ckernel_unpack_template(
           true,  // src B
@@ -45,31 +48,25 @@ inline void llk_unpack_untilize_mop_config() {
     tmp.program(instrn_buffer);
 }
 
-template <bool is_fp32_dest_acc_en = false>
-inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) {
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void _llk_unpack_untilize_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM,  const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) {
     constexpr bool is_row_pool = false;
-    constexpr bool transpose_xy_srca = false;
-    constexpr bool srnd_fpu_en = false;
-
-    const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand);
-    const uint32_t unpA_num_faces = 4;
-    const uint32_t unpA_face_r_dim = 16;
-    configure_unpack_AB(unpA_operand_id, unpA_operand_id, unpA_face_r_dim, unpA_face_r_dim, is_row_pool, transpose_xy_srca, is_fp32_dest_acc_en, srnd_fpu_en, unpA_num_faces, unpA_num_faces);
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>(
+        unpack_src_format,
+        unpack_src_format,
+        unpack_dst_format,
+        unpack_dst_format,
+        face_r_dim,
+        face_r_dim,
+        within_face_16x16_transpose,
+        num_faces,
+        num_faces);
 }
 
-inline void llk_unpack_untilize_hw_configure_disaggregated(const std::uint32_t unpA_operand) {
-    const llk_unpack_A_params_t unpack_untilize_params = {
-        .unpA_operand = unpA_operand,
-    };
-    llk_unpack_untilize_hw_configure(&unpack_untilize_params);
-}
-
-inline void llk_unpack_untilize_init(std::uint32_t operand = 0) {
-    std::uint32_t operand_id = get_operand_id(operand);
-    std::uint32_t face_r_dim = 1;
+inline void _llk_unpack_untilize_init_(const std::uint32_t unpack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) {
 
-    std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1;
-    std::uint32_t unpA_ch1_y_stride = FACE_R_DIM*unpA_ch1_x_stride;
+    const std::uint32_t unpA_ch1_x_stride = (unpack_dst_format&0x3) == (std::uint32_t) DataFormat::Float32 ? 4 : (unpack_dst_format&0x3) == (std::uint32_t) DataFormat::Float16 ? 2 : 1;
+    const std::uint32_t unpA_ch1_y_stride = FACE_R_DIM*unpA_ch1_x_stride;
 
     TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0);
 
@@ -79,38 +76,14 @@ inline void llk_unpack_untilize_init(std::uint32_t operand = 0) {
     cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32+1, 0, 0xFFFF>(FACE_C_DIM);
     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_1x16); //GPR preloaded with  16 | (16 << 16)
 
-    std::uint32_t tile_size_words = cb_interface[operand_id].fifo_page_size;
-    TT_SETDMAREG(0, LOWER_HALFWORD(tile_size_words), 0, LO_16(p_gpr_unpack::TILE_SIZE));
-    TT_SETDMAREG(0, UPPER_HALFWORD(tile_size_words), 0, HI_16(p_gpr_unpack::TILE_SIZE));
-    llk_unpack_untilize_mop_config();
-}
-
-inline void llk_unpack_untilize_uninit(const std::uint32_t operand) {
-    std::uint32_t operand_id = get_operand_id(operand);
-    std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1;
-    std::uint32_t unpA_ch1_y_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride;
-
-    // Check that unpacker is done (all contexts freed up) before starting hw configuration
-    wait_for_idle();
+    TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE));
+    TT_SETDMAREG(0, UPPER_HALFWORD(tile_size), 0, HI_16(p_gpr_unpack::TILE_SIZE));
 
-    // Reset address counters
-    unpacker_addr_counter_init();
-
-    // Wait for cfg to be free to edit
-    TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK);
-
-    // Reset the values to default in unpack AB common.
-    TT_SETADCXX(p_setadc::UNP_A, FACE_R_DIM*FACE_C_DIM-1, 0x0);
-    TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16);
-    cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32+1, 0, 0xFFFF>(1);
-    cfg_reg_rmw_tensix<UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32, UNP0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT, UNP0_ADDR_CTRL_XY_REG_1_Ystride_MASK>(unpA_ch1_y_stride);
-    TTI_NOP; TTI_NOP; // Do we need this for WH?
+    _llk_unpack_untilize_mop_config_();
 }
 
 template <bool first_pass = true>
-inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_tile_cols) {
-    std::uint32_t operand_id = get_operand_id(operand);
-    std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
+inline void _llk_unpack_untilize_pass_(const std::uint32_t base_address, const std::uint32_t block_tile_cols) {
     std::uint32_t rem_blocks_in_row = block_tile_cols;
 
     // Program srcA and srcB base addresses
@@ -146,11 +119,13 @@ inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_
             if ((face_2xr_cnt + rem_blocks_in_row) >= (FACE_HEIGHT / 2)) {
                 // Run MOP
                 TT_MOP(0, 8 - face_2xr_cnt - 1, unp_cfg_context == 0 ? 0 : 0xff);                                              // Run the MOP
-
+#if SKIP_UNP == 1
+                TTI_NOP;
+#else
                 TTI_UNPACR(SrcA, 0b0, 0, 0, 0, 1, 1, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1);  // set data valid
                 TTI_UNPACR_NOP(SrcB, p_unpacr_nop::UNP_ZEROSRC);
                 TTI_UNPACR_NOP(SrcB, p_unpacr_nop::UNP_SET_DVALID);
-
+#endif
                 TTI_SETADCXY(0b001, 0, 0, 0, 0, 0b1000);  // Clear srcA addr y cnt
                 rem_blocks_in_row -= (8 - face_2xr_cnt);
                 face_2xr_cnt = 0;
@@ -193,9 +168,7 @@ inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_
     // Switch unpacker config context
     switch_config_context(unp_cfg_context);
 
-}
-
-inline void llk_unpack_untilize(std::uint32_t operand, std::uint32_t block_c_tiles) {
-    llk_unpack_untilize_pass<true>(operand, block_c_tiles);
-    llk_unpack_untilize_pass<false>(operand, block_c_tiles);
+#ifdef PERF_DUMP
+    first_unpack_recorded = true;
+#endif
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math.cpp
deleted file mode 100644
index 2e615199cf5..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include <cstdint>
-#include "llk_math_common.h"
-#include "llk_math_eltwise_unary_datacopy.h"
-namespace NAMESPACE
-{
-
-struct hlk_args_t
-{
-int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core
-int32_t per_core_block_cnt; // Number of blocks of size 1xN tiles (1 rows and N cols)
-int32_t per_core_block_c_dim; // Block c dim  = (Nx32)
-int32_t per_core_block_tile_cnt; // Block tile count = (1xN)
-}
-;
-
-void math_main(const struct hlk_args_t *args,const int outer_loop_cnt)
-{
-int __outer_loop_iter;
-llk_math_eltwise_unary_datacopy_init<A2D, BroadcastType::NONE>(false);
-llk_math_pack_sync_init<SyncTile16>();
-for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) {
-  for (int b = 0; b < args -> per_core_tile_cnt; ++b) {
-    llk_math_wait_for_dest_available<SyncTile16>();
-    llk_math_eltwise_unary_datacopy<A2D, BroadcastType::NONE, SyncTile16>(0);
-    llk_math_dest_section_done<SyncTile16>();
-  }
-}
-}
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math_fidelity.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math_fidelity.h
deleted file mode 100644
index 4e13ffa422a..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math_fidelity.h
+++ /dev/null
@@ -1,5 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-constexpr std::int32_t MATH_FIDELITY = 255;
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack.cpp
deleted file mode 100644
index 15b8912cfff..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include <cstdint>
-#include "llk_pack_common.h"
-#include "llk_pack.h"
-namespace NAMESPACE
-{
-
-struct hlk_args_t
-{
-int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core
-int32_t per_core_block_cnt; // Number of blocks of size 1xN tiles (1 rows and N cols)
-int32_t per_core_block_c_dim; // Block c dim  = (Nx32)
-int32_t per_core_block_tile_cnt; // Block tile count = (1xN)
-}
-;
-
-void pack_main(const struct hlk_args_t *args,const int outer_loop_cnt)
-{
-int __outer_loop_iter;
-llk_pack_init(16);
-llk_pack_hw_configure_disaggregated(16);
-llk_setup_outputs();
-llk_pack_dest_init<SyncTile16>();
-for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) {
-  for (int b = 0; b < args -> per_core_tile_cnt; ++b) {
-    llk_packer_wait_for_math_done();
-    llk_wait_for_free_tiles(16,1);
-    llk_pack<false, SyncTile16>(0,16);
-    llk_push_tiles(16,1);
-    llk_pack_dest_section_done<SyncTile16>();
-  }
-}
-}
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack_data_format.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack_data_format.h
deleted file mode 100644
index f4ff894944f..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack_data_format.h
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-const std::int32_t pack_src_format[16] = {
-    1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-};
-const std::int32_t pack_dst_format[16] = {
-    1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-};
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack.cpp
deleted file mode 100644
index 37ed8574d7a..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include <cstdint>
-#include "llk_unpack_common.h"
-#include "llk_unpack_tilize.h"
-namespace NAMESPACE
-{
-
-struct hlk_args_t
-{
-int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core
-int32_t per_core_block_cnt; // Number of blocks of size 1xN tiles (1 rows and N cols)
-int32_t per_core_block_c_dim; // Block c dim  = (Nx32)
-int32_t per_core_block_tile_cnt; // Block tile count = (1xN)
-}
-;
-
-void unpack_main(const struct hlk_args_t *args,const int outer_loop_cnt)
-{
-int __outer_loop_iter;
-llk_setup_operands();
-llk_unpack_tilize_init();
-llk_unpack_tilize_hw_configure_disaggregated(0, args -> per_core_block_c_dim);
-for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) {
-  for (int i = 0; i < args -> per_core_block_cnt; ++i) {
-    llk_wait_blocks(0,1);
-    for (int j = 0; j < args -> per_core_block_tile_cnt; ++j) {
-      llk_unpack_tilize(0,j,args -> per_core_block_c_dim);
-    }
-    llk_pop_blocks(0,1,args -> per_core_block_c_dim);
-  }
-}
-}
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack_data_format.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack_data_format.h
deleted file mode 100644
index 10d1799f857..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack_data_format.h
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-const std::int32_t unpack_src_format[24] = {
-    1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-};
-const std::int32_t unpack_dst_format[24] = {
-    1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-};
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/hlk_args_struct_init.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/hlk_args_struct_init.h
deleted file mode 100644
index 62a6e634ee4..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/hlk_args_struct_init.h
+++ /dev/null
@@ -1,11 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-const NAMESPACE::hlk_args_t hlk_args =
-  {
-    .per_core_tile_cnt = 0x4,
-    .per_core_block_tile_cnt = 0x4,
-    .per_core_block_cnt = 0x1,
-    .per_core_block_c_dim = 128
-  };
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/loop_count.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/loop_count.h
deleted file mode 100644
index 9be808dfbc5..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/loop_count.h
+++ /dev/null
@@ -1,5 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-constexpr std::int32_t arg_loop_count = 1;
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math.cpp
deleted file mode 100644
index 6e3d0a44332..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include <cstdint>
-#include "llk_math_common.h"
-#include "llk_math_eltwise_unary_datacopy.h"
-namespace NAMESPACE
-{
-
-struct hlk_args_t
-{
-int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core
-int32_t per_core_block_tile_r_dim; // Block tile r dim (RT)
-int32_t per_core_block_tile_c_dim; // Block tile c dim (CT)
-int32_t per_core_block_cnt; // Number of blocks of size (RTxCT)
-}
-;
-
-void math_main(const struct hlk_args_t *args,const int outer_loop_cnt)
-{
-int __outer_loop_iter;
-llk_math_eltwise_unary_datacopy_init<A2D, BroadcastType::NONE>(false);
-llk_math_pack_sync_init<SyncTile16>();
-for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) {
-  for (int b = 0; b < args -> per_core_tile_cnt; ++b) {
-    llk_math_wait_for_dest_available<SyncTile16>();
-    llk_math_eltwise_unary_datacopy<A2D, BroadcastType::NONE, SyncTile16>(0);
-    llk_math_dest_section_done<SyncTile16>();
-  }
-}
-}
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math_fidelity.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math_fidelity.h
deleted file mode 100644
index 4e13ffa422a..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math_fidelity.h
+++ /dev/null
@@ -1,5 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-constexpr std::int32_t MATH_FIDELITY = 255;
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack.cpp
deleted file mode 100644
index e18a8f81483..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include <cstdint>
-#include "llk_pack_common.h"
-#include "llk_pack.h"
-namespace NAMESPACE
-{
-
-struct hlk_args_t
-{
-int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core
-int32_t per_core_block_tile_r_dim; // Block tile r dim (RT)
-int32_t per_core_block_tile_c_dim; // Block tile c dim (CT)
-int32_t per_core_block_cnt; // Number of blocks of size (RTxCT)
-}
-;
-
-void pack_main(const struct hlk_args_t *args,const int outer_loop_cnt)
-{
-int __outer_loop_iter;
-llk_pack_init(16);
-llk_pack_hw_configure_disaggregated<true>(16);
-llk_setup_outputs();
-llk_pack_dest_init<SyncTile16>();
-for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) {
-  for (int b = 0; b < args -> per_core_tile_cnt; ++b) {
-    llk_packer_wait_for_math_done();
-    llk_wait_for_free_blocks(16,1);
-    llk_pack<false, SyncTile16, true>(0,16);
-    llk_push_blocks(16,1);
-    llk_pack_dest_section_done<SyncTile16>();
-  }
-}
-}
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack_data_format.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack_data_format.h
deleted file mode 100644
index f4ff894944f..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack_data_format.h
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-const std::int32_t pack_src_format[16] = {
-    1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-};
-const std::int32_t pack_dst_format[16] = {
-    1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-};
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack.cpp
deleted file mode 100644
index f5da6ad4752..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include <cstdint>
-#include "llk_unpack_common.h"
-#include "llk_unpack_untilize.h"
-namespace NAMESPACE
-{
-
-struct hlk_args_t
-{
-int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core
-int32_t per_core_block_tile_r_dim; // Block tile r dim (RT)
-int32_t per_core_block_tile_c_dim; // Block tile c dim (CT)
-int32_t per_core_block_cnt; // Number of blocks of size (RTxCT)
-}
-;
-
-void unpack_main(const struct hlk_args_t *args,const int outer_loop_cnt)
-{
-int __outer_loop_iter;
-llk_setup_operands();
-llk_unpack_untilize_init();
-llk_unpack_untilize_hw_configure_disaggregated(0);
-for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) {
-  for (int i = 0; i < args -> per_core_block_cnt; ++i) {
-    for (int j = 0; j < args -> per_core_block_tile_r_dim; ++j) {
-      llk_wait_tiles(0,args -> per_core_block_tile_c_dim);
-      llk_unpack_untilize<true>(0,args -> per_core_block_tile_c_dim);
-      llk_unpack_untilize<false>(0,args -> per_core_block_tile_c_dim);
-      llk_pop_tiles(0,args -> per_core_block_tile_c_dim);
-    }
-  }
-}
-}
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack_data_format.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack_data_format.h
deleted file mode 100644
index 10d1799f857..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack_data_format.h
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-const std::int32_t unpack_src_format[24] = {
-    1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-};
-const std::int32_t unpack_dst_format[24] = {
-    1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
-};
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/hlk_args_struct_init.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/hlk_args_struct_init.h
deleted file mode 100644
index 123b7bb0d4a..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/hlk_args_struct_init.h
+++ /dev/null
@@ -1,12 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-const NAMESPACE::hlk_args_t hlk_args =
-  {
-    .per_core_tile_cnt = 0x4,
-    .per_core_block_tile_cnt = 0x4,
-    .per_core_block_tile_r_dim = 0x2, // Block tile r dim (RT)
-    .per_core_block_tile_c_dim = 0x2, // Block tile c dim (CT)
-    .per_core_block_cnt = 0x1
-  };
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/loop_count.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/loop_count.h
deleted file mode 100644
index 9be808dfbc5..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/loop_count.h
+++ /dev/null
@@ -1,5 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-constexpr std::int32_t arg_loop_count = 1;
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/chlkc_list.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/chlkc_list.h
similarity index 96%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/chlkc_list.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/common/chlkc_list.h
index bff17865521..d288ba0114d 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/chlkc_list.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/chlkc_list.h
@@ -14,6 +14,7 @@ using namespace ckernel;
 
 
 #ifdef UCK_CHLKC_MATH
+#include "chlkc_unpack_data_format.h"
 #include "chlkc_math_fidelity.h"
 #include "chlkc_math_approx_mode.h"
 #include "chlkc_math.cpp"
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
new file mode 100644
index 00000000000..f31efd1c3d0
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
@@ -0,0 +1,62 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+#include "ckernel_globals.h"
+#include "tensix_functions.h"
+#include "metal_compile_time_args.h"
+#include "risc_attribs.h"
+#include "hostdevcommon/common_runtime_address_map.h"
+
+extern uint32_t __ldm_bss_start[];
+extern uint32_t __ldm_bss_end[];
+extern uint32_t __ldm_data_start[];
+extern uint32_t __ldm_data_end[];
+extern void (* __init_array_start[])();
+extern void (* __init_array_end[])();
+extern uint32_t __firmware_start[];
+
+extern void kernel_init();
+extern void kernel_launch();
+
+inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
+    // Copies len words from l1_addr to local_mem_addr. Bulk loop: 6 loads, then 6 stores, to cover the 6-cycle L1 load latency
+    int32_t n = 0;
+    while (n < len - 5) {
+        uint32_t v0 = l1_addr[n + 0];
+        uint32_t v1 = l1_addr[n + 1];
+        uint32_t v2 = l1_addr[n + 2];
+        uint32_t v3 = l1_addr[n + 3];
+        uint32_t v4 = l1_addr[n + 4];
+        uint32_t v5 = l1_addr[n + 5];
+        local_mem_addr[n + 0] = v0;
+        local_mem_addr[n + 1] = v1;
+        local_mem_addr[n + 2] = v2;
+        local_mem_addr[n + 3] = v3;
+        local_mem_addr[n + 4] = v4;
+        local_mem_addr[n + 5] = v5;
+        n += 6;
+    }
+    // Tail: remaining (< 6) words one at a time. Could optimize further (eg, loop of 2 or 4), probably not worth it
+    while (n < len) {
+        local_mem_addr[n] = l1_addr[n];
+        n++;
+    }
+}
+
+inline void firmware_kernel_common_init(void *init_local_l1_base) {
+
+    // Startup work typically done by crt0 in asm (easier in C): wzerorange over the bss range, copy the data image from init_local_l1_base, then call each init_array entry
+    wzerorange(__ldm_bss_start, __ldm_bss_end);
+
+    int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
+    uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE;
+    l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words);
+
+    for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
+        (**fptr)();
+    }
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h
new file mode 100644
index 00000000000..e507966c516
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h
@@ -0,0 +1,9 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace ckernel {
+#define get_compile_time_arg_val(arg_idx) KERNEL_COMPILE_TIME_ARG_##arg_idx
+}  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h
new file mode 100644
index 00000000000..e59e64b8ea3
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h
@@ -0,0 +1,92 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b)
+{
+  unsigned int r = 0;
+  while (a)
+    {
+      if (a & 1)
+        r += b;
+      a >>= 1;
+      b <<= 1;
+    }
+  return r;
+}
+
+inline __attribute__((always_inline)) uint32_t fast_udiv_12(uint32_t n)
+{
+    // Divide by 12 via a magic-number multiply: 0xAAAAAAAB == ceil(2^35 / 12) is the
+    // fixed-point reciprocal of 12; the two shifts (32 + 3) drop the 35 fractional bits.
+    // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm
+    return (((uint64_t) n * 0xAAAAAAAB) >> 32) >> 3;
+}
+
+inline __attribute__((always_inline)) uint32_t fast_udiv_94(uint32_t n)
+{
+    // Divide by 94 via a magic-number multiply: 0xAE4C415D == ceil(2^38 / 94) is the
+    // fixed-point reciprocal of 94; the two shifts (32 + 6) drop the 38 fractional bits.
+    // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm
+    return (((uint64_t) n * 0xAE4C415D) >> 32) >> 6;
+}
+
+// Unsigned division n / d for a compile-time divisor d. d == 12 and d == 94 take the
+// multiply-and-shift helpers; every other d runs the shift-subtract loop adapted from
+// LLVM compiler-rt's __udivsi3. Returns 0 when d == 0 (division by zero is not trapped).
+template <uint32_t d>
+inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n)
+{
+    if constexpr (d == 12) {
+        // fast divide for 12 divisor
+        return fast_udiv_12(n);
+    } else if constexpr (d == 94) {
+        // fast divide for 94 divisor. Handles Banked L1 address generation for E75
+        return fast_udiv_94(n);
+    } else {
+        // generic divide from llvm
+        const unsigned n_uword_bits = sizeof(uint32_t) * CHAR_BIT;
+        unsigned int q;
+        unsigned int r;
+        unsigned sr;
+        /* special cases */
+        if (d == 0)
+            return 0; /* ?! */
+        if (n == 0)
+            return 0;
+        sr = __builtin_clz(d) - __builtin_clz(n);
+        /* 0 <= sr <= n_uword_bits - 1 or sr large */
+        if (sr > n_uword_bits - 1)  /* d > n */
+            return 0;
+        if (sr == n_uword_bits - 1)  /* d == 1 */
+            return n;
+        ++sr;
+        /* 1 <= sr <= n_uword_bits - 1 */
+        /* Not a special case */
+        q = n << (n_uword_bits - sr);
+        r = n >> sr;
+        unsigned int  carry = 0;
+        for (; sr > 0; --sr)
+        {
+            /* r:q = ((r:q)  << 1) | carry */
+            r = (r << 1) | (q >> (n_uword_bits - 1));
+            q = (q << 1) | carry;
+            /* Mask form of "if (r >= d) { r -= d; carry = 1; } else { carry = 0; }".
+             * s must be an all-ones mask (not just 1) when r >= d so that d & s == d;
+             * a 0/1 value here would subtract only the low bit of d from r.
+             */
+            const unsigned int s = (r >= d) ? ~0u : 0u;
+            carry = s & 1;
+            r -= d & s;
+        }
+        q = (q << 1) | carry;
+        return q;
+    }
+}
+template <uint32_t d>
+inline __attribute__((always_inline)) uint32_t umodsi3_const_divisor(uint32_t a)
+{
+    return a - udivsi3_const_divisor<d>(a) * d;
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/tt_log.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/tt_log.h
new file mode 100644
index 00000000000..5ff63a3af7d
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/tt_log.h
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Define TT_LOG and its derivatives so the compile passes.
+// If we are running workloads on hardware, TT_LOG will already have been defined.
+#ifndef TT_LOG_DEFINED
+    #define TT_LOG_DEFINED
+    #define TT_LOG(...) (void)sizeof(__VA_ARGS__)
+    #define TT_LOG_NB(...) (void)sizeof(__VA_ARGS__)
+    #define TT_PAUSE(...) (void)sizeof(__VA_ARGS__)
+    #define TT_RISC_ASSERT(...) (void)sizeof(__VA_ARGS__)
+    #define TT_LLK_DUMP(...) (void)sizeof(__VA_ARGS__)
+    #define TT_DUMP_LOG(...) (void)sizeof(__VA_ARGS__)
+    #define TT_DUMP_ASSERT(...) (void)sizeof(__VA_ARGS__)
+#endif
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_api.h
new file mode 100644
index 00000000000..cd282dd2df9
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_api.h
@@ -0,0 +1,86 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_eltwise_binary.h"
+
+/*************************************************************************
+ * LLK ELTWISE BINARY
+ *************************************************************************/
+
+// Operand-less variant: no operand is consulted, so the face count is fixed at 4.
+template <
+    EltwiseBinaryType eltwise_binary_type,
+    BroadcastType src_b_bcast_type,
+    int NUM_FIDELITY_PHASES = 0,
+    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
+inline void llk_math_eltwise_binary_init(const std::uint32_t transpose = 0, const std::uint32_t acc_to_dest = 0) {
+    constexpr std::uint32_t fixed_face_count = 4;
+    // Forward to the LLK primitive with the fixed face count and the caller's flags.
+    _llk_math_eltwise_binary_init_<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(
+        fixed_face_count, transpose, acc_to_dest);
+}
+
+// Operand-aware variant: the face count is looked up from operand_A.
+template <
+    EltwiseBinaryType eltwise_binary_type,
+    BroadcastType src_b_bcast_type,
+    int NUM_FIDELITY_PHASES = 0,
+    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
+inline void llk_math_eltwise_binary_init_with_operands(
+    const std::uint32_t operand_A,
+    const std::uint32_t operand_B,
+    const std::uint32_t transpose = 0,
+    const std::uint32_t acc_to_dest = 0) {
+    // Only operand_A is queried: the tile dim data is stated to be the same for both operands.
+    const std::uint32_t src_a_id = get_operand_id(operand_A);
+    const std::uint32_t face_count = get_operand_num_faces(src_a_id);
+
+    _llk_math_eltwise_binary_init_<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(
+        face_count, transpose, acc_to_dest);
+}
+
+template <
+    EltwiseBinaryType eltwise_binary_type,
+    BroadcastType src_b_bcast_type,
+    DstSync Dst = DstSync::SyncFull,
+    int NUM_FIDELITY_PHASES = 0,
+    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+    bool is_fp32_dest_acc_en = false>
+inline void llk_math_eltwise_binary(uint dst_index, const bool clear_fp32_dst_acc = true) {
+    constexpr std::uint32_t fixed_face_count = 4;
+    // Operand-less overload: the face count is fixed at 4 rather than looked up from an operand.
+    _llk_math_eltwise_binary_<
+        eltwise_binary_type,
+        src_b_bcast_type,
+        Dst,
+        NUM_FIDELITY_PHASES,
+        binary_reuse_dest,
+        is_fp32_dest_acc_en>(fixed_face_count, dst_index, clear_fp32_dst_acc);
+}
+
+template <
+    EltwiseBinaryType eltwise_binary_type,
+    BroadcastType src_b_bcast_type,
+    DstSync Dst = DstSync::SyncFull,
+    int NUM_FIDELITY_PHASES = 0,
+    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+    bool is_fp32_dest_acc_en = false>
+inline void llk_math_eltwise_binary(
+    const std::uint32_t operand_A,
+    const std::uint32_t operand_B,
+    uint dst_index,
+    const bool clear_fp32_dst_acc = true) {
+    const std::uint32_t src_a_id = get_operand_id(operand_A);  // both operands must have same number of faces
+    const std::uint32_t face_count = get_operand_num_faces(src_a_id);
+    // Operand overload: the face count comes from operand_A; operand_B is not read here.
+    _llk_math_eltwise_binary_<
+        eltwise_binary_type,
+        src_b_bcast_type,
+        Dst,
+        NUM_FIDELITY_PHASES,
+        binary_reuse_dest,
+        is_fp32_dest_acc_en>(face_count, dst_index, clear_fp32_dst_acc);
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h
new file mode 100644
index 00000000000..5f662f22081
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h
@@ -0,0 +1,70 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_eltwise_binary_sfpu.h"
+
+/*************************************************************************
+ * LLK ELTWISE BINARY SFPU
+ *************************************************************************/
+
+template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
+inline void llk_math_eltwise_binary_sfpu(
+    const uint operand,  // NOTE(review): never read in this body; geometry is always looked up for operand 0
+    uint dst_index_a,
+    uint dst_index_b,
+    int vector_mode = (int)Dim::RC,
+    uint param0 = 0,
+    uint param1 = 0,
+    uint param2 = 0,
+    uint param3 = 0,
+    uint param4 = 0,
+    uint param5 = 0) {
+    const std::uint32_t operand_id = get_operand_id(0);  // hard-coded operand 0 (see note on `operand`)
+    const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+    const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+
+    _llk_math_eltwise_binary_sfpu_<sfpu_op, APPROXIMATE, Dst>(
+        face_r_dim, num_faces, dst_index_a, dst_index_b, vector_mode, param0, param1, param2, param3, param4, param5);
+}
+
+template <SfpuType sfpu_op, bool APPROXIMATE>  // init counterpart: forwards param0..param5 unchanged
+inline void llk_math_eltwise_binary_sfpu_init(
+    uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) {
+    _llk_math_eltwise_binary_sfpu_init_<sfpu_op, APPROXIMATE>(param0, param1, param2, param3, param4, param5);
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_binary_sfpu_quant_int32(uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+    // The generic entry point takes `operand` first; pass 0 explicitly so the dst indices and mode are not shifted.
+    llk_math_eltwise_binary_sfpu<SfpuType::quant_int32, APPROXIMATE, dst_sync>(0, dst_index_a, dst_index_b, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_binary_sfpu_quant_int32_init(const uint zero_point) {
+    llk_math_eltwise_binary_sfpu_init<SfpuType::quant_int32, APPROXIMATE>(zero_point);  // zero_point lands in param0
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_binary_sfpu_requant_int32(uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+    // The generic entry point takes `operand` first; pass 0 explicitly so the dst indices and mode are not shifted.
+    llk_math_eltwise_binary_sfpu<SfpuType::requant_int32, APPROXIMATE, dst_sync>(0, dst_index_a, dst_index_b, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_binary_sfpu_requant_int32_init(const uint zero_point) {
+    llk_math_eltwise_binary_sfpu_init<SfpuType::requant_int32, APPROXIMATE>(zero_point);  // zero_point lands in param0
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_binary_sfpu_dequant_int32(uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+    // The generic entry point takes `operand` first; pass 0 explicitly so the dst indices and mode are not shifted.
+    llk_math_eltwise_binary_sfpu<SfpuType::dequant_int32, APPROXIMATE, dst_sync>(0, dst_index_a, dst_index_b, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_binary_sfpu_dequant_int32_init(const uint zero_point) {
+    llk_math_eltwise_binary_sfpu_init<SfpuType::dequant_int32, APPROXIMATE>(zero_point);  // zero_point lands in param0
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_common_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_common_api.h
new file mode 100644
index 00000000000..6f7d61cabb5
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_common_api.h
@@ -0,0 +1,108 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "ckernel_globals.h"
+#include "ckernel_template.h"
+#include "cmath_common.h"
+#include "llk_defs.h"
+#include "llk_io.h"
+#include "llk_math_common.h"
+#include "llk_operands.h"
+#include "llk_param_structs.h"
+
+// Need to revisit why we even need this
+#define EPS 1.19209e-07  // std::numeric_limits<float>::epsilon() for FP32
+
+/*************************************************************************
+ * LLK MATH COMMON
+ *************************************************************************/
+
+template <DstSync Dst>
+inline void llk_math_wait_for_dest_available() {
+    _llk_math_wait_for_dest_available_<Dst>();  // API-layer forwarder; the work happens in the LLK primitive
+}
+
+template <DstSync Dst = DstSync::SyncFull, bool is_fp32_dest_acc_en = false>  // qualified like the other defaults
+inline void llk_math_dest_section_done() {
+    _llk_math_dest_section_done_<Dst, is_fp32_dest_acc_en>();  // API-layer forwarder to the LLK primitive
+}
+
+template <DstSync Dst, bool is_fp32_dest_acc_en = false>
+inline void llk_math_pack_sync_init() {
+    _llk_math_pack_sync_init_<Dst, is_fp32_dest_acc_en>();  // API-layer forwarder to the LLK primitive
+}
+
+template <bool mail2math = true, bool mail2pack = true>
+inline void llk_math_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) {
+    _llk_math_get_tile_<mail2math, mail2pack>(tile_index, p_tile);  // `operand` is accepted but not forwarded
+}
+
+template <bool mail2math = true, bool mail2pack = true>
+inline void llk_math_release_tile(std::uint32_t operand) {
+    _llk_math_release_tile_<mail2math, mail2pack>();  // `operand` is accepted but not forwarded
+}
+
+inline void llk_math_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { _llk_math_debug_dump_(data, byte_size); }
+// Both debug-dump helpers forward their arguments unchanged to the same-named LLK primitives.
+inline void llk_math_debug_dump_seek(std::uint8_t offset) { _llk_math_debug_dump_seek_(offset); }
+
+inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_new_operand) {
+    const std::uint32_t srca_id = get_operand_id(srca_new_operand);  // index into unpack_dst_format
+    _llk_math_reconfig_data_format_srca_(unpack_dst_format[srca_id]);
+}
+
+inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) {
+    const std::uint32_t srcb_id = get_operand_id(srcb_new_operand);  // index into unpack_dst_format
+    _llk_math_reconfig_data_format_srcb_(unpack_dst_format[srcb_id]);
+}
+
+inline void llk_math_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) {
+    const std::uint32_t srca_id = get_operand_id(srca_new_operand);
+    const std::uint32_t srcb_id = get_operand_id(srcb_new_operand);
+    // Reconfigure srca and srcb in one call, each with its operand's entry in unpack_dst_format.
+    _llk_math_reconfig_data_format_(unpack_dst_format[srca_id], unpack_dst_format[srcb_id]);
+}
+
+inline void llk_math_reconfig_data_format(
+    const std::uint32_t srca_old_operand,
+    const std::uint32_t srca_new_operand,
+    const std::uint32_t srcb_old_operand,
+    const std::uint32_t srcb_new_operand) {
+    // A side is reconfigured only when the unpack_dst_format entries of its old and new operands differ.
+    const bool srca_changed =
+        unpack_dst_format[get_operand_id(srca_old_operand)] != unpack_dst_format[get_operand_id(srca_new_operand)];
+    const bool srcb_changed =
+        unpack_dst_format[get_operand_id(srcb_old_operand)] != unpack_dst_format[get_operand_id(srcb_new_operand)];
+
+    if (srca_changed && srcb_changed) {
+        llk_math_reconfig_data_format(srca_new_operand, srcb_new_operand);
+    } else if (srca_changed) {
+        llk_math_reconfig_data_format_srca(srca_new_operand);
+    } else if (srcb_changed) {
+        llk_math_reconfig_data_format_srcb(srcb_new_operand);
+    }
+}
+
+inline void llk_math_reconfig_data_format_srca(
+    const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) {
+    const std::uint32_t prev_id = get_operand_id(srca_old_operand);
+    const std::uint32_t next_id = get_operand_id(srca_new_operand);
+    if (unpack_dst_format[prev_id] == unpack_dst_format[next_id]) {  // same format: nothing to reconfigure
+        return;
+    }
+    llk_math_reconfig_data_format_srca(srca_new_operand);
+}
+
+inline void llk_math_reconfig_data_format_srcb(
+    const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
+    const std::uint32_t prev_id = get_operand_id(srcb_old_operand);
+    const std::uint32_t next_id = get_operand_id(srcb_new_operand);
+    if (unpack_dst_format[prev_id] == unpack_dst_format[next_id]) {  // same format: nothing to reconfigure
+        return;
+    }
+    llk_math_reconfig_data_format_srcb(srcb_new_operand);
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h
new file mode 100644
index 00000000000..8f7ea1f5713
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_matmul.h"
+
+/*************************************************************************
+ * LLK MATMUL
+ *************************************************************************/
+
+template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout = DstTileFaceLayout::ColMajor>
+inline void llk_math_matmul_init(
+    const std::uint32_t operandA,
+    const std::uint32_t operandB,
+    const std::uint32_t transpose = 0,
+    const std::uint32_t ct_dim = 1,
+    const std::uint32_t rt_dim = 1,
+    const std::uint32_t kt_dim = 1) {
+    const std::uint32_t in0_id = get_operand_id(operandA);
+    const std::uint32_t in1_id = get_operand_id(operandB);
+    const bool partial_face = get_operand_partial_face(in0_id);  // still derived from in0 only, as before
+    // Each input's tile shape is read from its own operand; in1 previously reused in0's dims, leaving in1_id unused.
+    const auto in0_tile_dims = get_operand_tile_dims(in0_id);
+    const auto in1_tile_dims = get_operand_tile_dims(in1_id);
+    const std::uint32_t in0_tile_r_dim = in0_tile_dims[ckernel::TileDim::R_IDX];
+    const std::uint32_t in0_tile_c_dim = in0_tile_dims[ckernel::TileDim::C_IDX];
+    const std::uint32_t in1_tile_r_dim = in1_tile_dims[ckernel::TileDim::R_IDX];
+    const std::uint32_t in1_tile_c_dim = in1_tile_dims[ckernel::TileDim::C_IDX];
+
+#ifdef ARCH_GRAYSKULL
+    _llk_math_matmul_init_<NUM_FIDELITY_PHASES, FaceLayout>(
+        in0_tile_r_dim,
+        in0_tile_c_dim,
+        in1_tile_r_dim,
+        in1_tile_c_dim,
+        partial_face,
+        transpose,
+        ct_dim,
+        rt_dim,
+        kt_dim);
+#else
+    _llk_math_matmul_init_<NUM_FIDELITY_PHASES>(
+        in0_tile_r_dim,
+        in0_tile_c_dim,
+        in1_tile_r_dim,
+        in1_tile_c_dim,
+        partial_face,
+        transpose,
+        ct_dim,
+        rt_dim,
+        kt_dim);
+#endif
+}
+
+template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout = DstTileFaceLayout::ColMajor>
+inline void llk_math_matmul(
+    uint dst_index,
+    const bool transpose = false,
+    const std::uint32_t ct_dim = 1,
+    const std::uint32_t rt_dim = 1,
+    const std::uint32_t kt_dim = 1) {
+#ifdef ARCH_GRAYSKULL  // only the Grayskull build passes FaceLayout on to the LLK primitive
+    _llk_math_matmul_<NUM_FIDELITY_PHASES, FaceLayout>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
+#else  // other architectures: FaceLayout is accepted by this wrapper but not forwarded
+    _llk_math_matmul_<NUM_FIDELITY_PHASES>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
+#endif
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_reduce_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_reduce_api.h
new file mode 100644
index 00000000000..f6d54ba067c
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_reduce_api.h
@@ -0,0 +1,28 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_reduce.h"
+
+/*************************************************************************
+ * LLK REDUCE
+ *************************************************************************/
+
+template <
+    PoolType type,
+    ReduceDim dim,
+    int num_fidelity_phases = 0,
+    bool is_fp32_dest_acc_en = false,
+    bool is_int_fpu_en = false>
+inline void llk_math_reduce(const uint dst_index) {  // forwards every template argument to the LLK primitive
+    _llk_math_reduce_<type, dim, num_fidelity_phases, is_fp32_dest_acc_en, is_int_fpu_en>(dst_index);
+}
+
+// within_face_16x16_transpose used for unpack, ignored by math (per the original note on this parameter).
+// It is nevertheless forwarded unchanged to the LLK init primitive.
+template <PoolType type, ReduceDim dim, int num_fidelity_phases = 0>
+inline void llk_math_reduce_init(const std::uint32_t within_face_16x16_transpose = 0) {
+    _llk_math_reduce_init_<type, dim, num_fidelity_phases>(within_face_16x16_transpose);
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_datacopy_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_datacopy_api.h
new file mode 100644
index 00000000000..4a280fa8119
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_datacopy_api.h
@@ -0,0 +1,36 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "llk_math_common_api.h"
+#include "llk_math_eltwise_unary_datacopy.h"
+
+/*************************************************************************
+ * LLK ELTWISE UNARY DATACOPY
+ *************************************************************************/
+
+template <
+    DataCopyType type,
+    BroadcastType src_b_bcast_type = BroadcastType::NONE,
+    DstSync Dst = DstSync::SyncFull,
+    bool is_fp32_dest_acc_en = false,
+    bool unpack_to_dest = false>
+inline void llk_math_eltwise_unary_datacopy(uint dst_index, uint operand = 0) {
+    const std::uint32_t operand_id = get_operand_id(operand);  // honour the caller's operand; default 0 as before
+    _llk_math_eltwise_unary_datacopy_<type, src_b_bcast_type, Dst, is_fp32_dest_acc_en, unpack_to_dest>(
+        dst_index, unpack_src_format[operand_id], unpack_dst_format[operand_id]);
+}
+
+template <DataCopyType type, BroadcastType src_b_bcast_type = BroadcastType::NONE>
+// within_face_16x16_transpose is used by unpacker, math does not transpose
+inline void llk_math_eltwise_unary_datacopy_init(
+    const std::uint32_t transpose_of_faces = 0 /*unused*/,
+    const std::uint32_t within_face_16x16_transpose = 0 /* unused */,
+    const std::uint32_t operand = 0) {
+    const std::uint32_t operand_id = get_operand_id(operand);  // honour the caller's operand; default 0 as before
+    const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+    _llk_math_eltwise_unary_datacopy_init_<type, src_b_bcast_type>(
+        transpose_of_faces, within_face_16x16_transpose, num_faces);
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
new file mode 100644
index 00000000000..17bba18f12a
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
@@ -0,0 +1,345 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "metal_ckernel_sfpu.h"
+#include "llk_math_eltwise_unary_sfpu_init.h"
+
+namespace ckernel {
+
+/*************************************************************************
+* LLK ELTWISE UNARY SFPU
+*************************************************************************/
+
+template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull, bool IS_INT_SFPU_EN=false>
+inline void llk_math_eltwise_unary_sfpu(
+    uint dst_index,
+    int vector_mode = (int)Dim::RC,
+    uint param0 = 0,
+    uint param1 = 0,
+    uint param2 = 0,
+    uint param3 = 0,
+    uint param4 = 0,
+    uint param5 = 0) {
+    // Generic entry point: every op-specific wrapper below funnels into this call. Geometry is read for operand 0.
+    const std::uint32_t operand_id = get_operand_id(0);
+    const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+    const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+
+    _llk_math_eltwise_unary_sfpu_<sfpu_op, APPROXIMATE, Dst, IS_INT_SFPU_EN>(
+        face_r_dim,
+        num_faces,
+        dst_index,
+        vector_mode,
+        param0,
+        param1,
+        param2,
+        param3,
+        param4,
+        param5
+    );
+}
+
+// New LLK SFPU APIs
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::rsqrt, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_rsqrt_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::rsqrt, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::log, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_log_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::log, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_log_with_base(uint dst_index, uint base_scale) {  // base_scale -> param0
+    llk_math_eltwise_unary_sfpu<SfpuType::log_with_base, APPROXIMATE, dst_sync>(dst_index, (int)Dim::RC, base_scale);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_log_with_base_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::log_with_base, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::tanh, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_tanh_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::tanh, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_signbit(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::signbit, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_signbit_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::signbit, APPROXIMATE>();
+}
+
+// Sign
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_sign(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::sign, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_sign_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::sign, APPROXIMATE>();
+}
+
+template <DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_dropout(uint dst_index, int vector_mode, int integer_dropout, int scale_factor) {
+    constexpr bool dont_care = false;
+    llk_math_eltwise_unary_sfpu<SfpuType::dropout, dont_care, dst_sync>(dst_index, vector_mode, integer_dropout, scale_factor);
+}
+
+inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) {
+    constexpr bool dont_care = false;
+    constexpr uint dont_care_param = 0;
+
+    llk_math_eltwise_unary_sfpu_init<SfpuType::dropout, dont_care>(dont_care_param, dont_care_param, seed);
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::sigmoid, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_sigmoid_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::sigmoid, APPROXIMATE>();
+}
+
+// EQZ (SfpuType::equal_zero)
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_eqz(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::equal_zero, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_eqz_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::equal_zero, APPROXIMATE>();
+}
+
+// NEZ (SfpuType::not_equal_zero)
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_nez(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::not_equal_zero, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_nez_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::not_equal_zero, APPROXIMATE>();
+}
+
+// LTZ (SfpuType::less_than_zero)
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_ltz(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::less_than_zero, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_ltz_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::less_than_zero, APPROXIMATE>();
+}
+
+// GTZ (SfpuType::greater_than_zero)
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_gtz(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::greater_than_zero, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_gtz_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::greater_than_zero, APPROXIMATE>();
+}
+
+// LEZ (SfpuType::less_than_equal_zero)
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_lez(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::less_than_equal_zero, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_lez_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::less_than_equal_zero, APPROXIMATE>();
+}
+
+// GEZ (SfpuType::greater_than_equal_zero)
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_gez(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::greater_than_equal_zero, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_gez_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::greater_than_equal_zero, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::max, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_max_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::max, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::square, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_square_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::square, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::power, APPROXIMATE, dst_sync>(dst_index, vector_mode, pow);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_power_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::power, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::abs, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_abs_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::abs, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::cast_fp32_to_fp16a, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::cast_fp32_to_fp16a, APPROXIMATE>();
+}
+
+// EXP2
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_exp2(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::exp2, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_exp2_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::exp2, APPROXIMATE>();
+}
+
+// Heaviside (param0 is forwarded as the generic entry point's param0)
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index, uint param0, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::heaviside, APPROXIMATE, dst_sync>(dst_index, vector_mode, param0);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_heaviside_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::heaviside, APPROXIMATE>();
+}
+
+// EXPM1
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_expm1(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::expm1, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_expm1_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::expm1, APPROXIMATE>();
+}
+
+// Asin
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_asin(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::asin, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_asin_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::asin, APPROXIMATE>();
+}
+
+// Atan
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_atan(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::atan, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_atan_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::atan, APPROXIMATE>();
+}
+
+// Acos
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_acos(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::acos, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_acos_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::acos, APPROXIMATE>();
+}
+
+// Silu
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_silu(uint dst_index) {
+    llk_math_eltwise_unary_sfpu<SfpuType::silu, APPROXIMATE, dst_sync>(dst_index);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_silu_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::silu, APPROXIMATE>();
+}
+
+// Mask
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::mask, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_mask_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::mask, APPROXIMATE>();
+}
+
+// Negative
+template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = (int)Dim::RC) {
+    llk_math_eltwise_unary_sfpu<SfpuType::negative, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_negative_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::negative, APPROXIMATE>();
+}
+
+}  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_op_info_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_op_info_api.h
new file mode 100644
index 00000000000..ca7e298a7c2
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_op_info_api.h
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+extern uint32_t op_info_offset;
+
+inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) {
+    // Copies one record of op_info_num_items words from OP_INFO_BASE_ADDR + op_info_offset, then advances the offset.
+    uint32_t* op_info_ptr = reinterpret_cast<uint32_t*>(OP_INFO_BASE_ADDR + op_info_offset);
+    static constexpr uint32_t op_info_num_items = 7;
+    static constexpr uint32_t op_info_num_bytes = op_info_num_items * sizeof(uint32_t);  // 28, derived not hard-coded
+    volatile tt_l1_ptr uint32_t* op_info_struct_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(&op_info_struct);
+    for (uint32_t i = 0; i < op_info_num_items; i++) {
+        op_info_struct_ptr[i] = op_info_ptr[i];
+    }
+    op_info_offset += op_info_num_bytes;
+    // Wrap as soon as another full record would not fit; `==` alone never fires once the offset steps past the end.
+    if (op_info_offset + op_info_num_bytes > OP_INFO_SIZE) {
+        op_info_offset = 0;
+    }
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
new file mode 100644
index 00000000000..808d88a6281
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
@@ -0,0 +1,270 @@
+#pragma once
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "ckernel_template.h"
+#include "cpack_common.h"
+#include "ckernel_globals.h"
+#include "circular_buffer.h"
+
+#include "llk_io.h"
+#include "llk_defs.h"
+#include "llk_outputs.h"
+#include "llk_param_structs.h"
+#include "llk_pack.h"
+#include "llk_pack_common.h"
+
+/*************************************************************************
+* LLK PACK
+*************************************************************************/
+
+template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor, bool write_tile_header = true>
+inline void llk_pack_mop_config(const uint32_t output) {
+    // Gather the output buffer's tile geometry and hand it, with the destination format, to _llk_pack_mop_config_.
+    const std::uint32_t out_id = get_output_id(output);
+    const std::uint32_t n_faces = get_output_num_faces(out_id);
+    const std::uint32_t r_dim = get_output_face_r_dim(out_id);
+    const bool is_partial = get_output_partial_face(out_id) && IS_BFP_FORMAT((uint)pack_dst_format[out_id]);
+    const bool is_narrow = get_output_narrow_tile(out_id);
+    // Note: here the partial-face flag is additionally gated on the destination format passing IS_BFP_FORMAT.
+    _llk_pack_mop_config_<untilize, zero_output, FaceLayout, write_tile_header>(
+        pack_dst_format[out_id],
+        r_dim,
+        n_faces,
+        is_partial,
+        is_narrow
+    );
+}
+
+template <bool untilize = false, bool is_fp32_dest_acc_en = false>
+inline void llk_pack_hw_configure(const llk_pack_params_t *pack_params) {
+    // Resolve the output buffer named in pack_params and collect its tile geometry.
+    const std::uint32_t out_id = get_output_id(pack_params->pack_output);
+    const std::uint32_t r_dim = get_output_face_r_dim(out_id);
+    const std::uint32_t n_faces = get_output_num_faces(out_id);
+    const bool is_partial = get_output_partial_face(out_id);
+    const bool is_narrow = get_output_narrow_tile(out_id);
+    // The circular buffer's page size is what the LLK library receives as the tile size.
+    const std::uint32_t page_size = cb_interface[out_id].fifo_page_size;
+    // Forward formats, geometry and the raw ReLU configuration word to the LLK library.
+    _llk_pack_hw_configure_<untilize, is_fp32_dest_acc_en>(
+        pack_src_format[out_id],
+        pack_dst_format[out_id],
+        page_size,
+        r_dim,
+        n_faces,
+        is_partial,
+        is_narrow,
+        pack_params->relu_config.val
+    );
+}
+
+template <bool untilize = false, bool is_fp32_dest_acc_en = false, ReluType relu_type = ReluType::NO_RELU, std::uint32_t relu_threshold = 0>
+inline void llk_pack_hw_configure_disaggregated(std::uint32_t pack_output) {
+    llk_pack_params_t params = {  // bundle the compile-time ReLU settings with the runtime output index
+        .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = static_cast<std::uint32_t>(relu_type), .Threshold = relu_threshold,}}};
+    llk_pack_hw_configure<untilize, is_fp32_dest_acc_en>(&params);  // reuse the struct-based entry point
+}
+
+template <bool untilize = false, PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false>
+inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) {
+    const std::uint32_t out_id = get_output_id(pack_params->pack_output);
+    const std::uint32_t r_dim = get_output_face_r_dim(out_id);
+    const std::uint32_t n_faces = get_output_num_faces(out_id);
+    const bool is_partial = get_output_partial_face(out_id);
+    const bool is_narrow = get_output_narrow_tile(out_id);
+    // The circular buffer's page size is what the LLK library receives as the tile size.
+    const std::uint32_t page_size = cb_interface[out_id].fifo_page_size;
+    // Forward formats, geometry and the raw ReLU configuration word to the reduce-specific LLK entry point.
+    _llk_pack_reduce_hw_configure_<untilize, type, dim, is_fp32_dest_acc_en>(
+        pack_src_format[out_id],
+        pack_dst_format[out_id],
+        page_size,
+        r_dim,
+        n_faces,
+        is_partial,
+        is_narrow,
+        pack_params->relu_config.val
+    );
+}
+
+template <bool untilize = false, PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, ReluType relu_type = ReluType::NO_RELU, std::uint32_t relu_threshold = 0>
+inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output) {
+    llk_pack_params_t params = {  // bundle the compile-time ReLU settings with the runtime output index
+        .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = static_cast<std::uint32_t>(relu_type), .Threshold = relu_threshold}}};
+    llk_pack_reduce_hw_configure<untilize, type, dim, is_fp32_dest_acc_en>(&params);  // reuse the struct-based entry point
+}
+
+template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor, bool write_tile_header = true>
+inline void llk_pack_init(const std::uint32_t pack_output = 0) {
+    // Gather the output buffer's tile geometry and hand it, with the destination format, to _llk_pack_init_.
+    const std::uint32_t out_id = get_output_id(pack_output);
+    const std::uint32_t r_dim = get_output_face_r_dim(out_id);
+    const std::uint32_t n_faces = get_output_num_faces(out_id);
+    const bool is_partial = get_output_partial_face(out_id);
+    const bool is_narrow = get_output_narrow_tile(out_id);
+    // Unlike llk_pack_mop_config, the partial-face flag is passed through without a format check.
+    _llk_pack_init_<untilize, zero_output, FaceLayout, write_tile_header>(
+        pack_dst_format[out_id],
+        r_dim,
+        n_faces,
+        is_partial,
+        is_narrow
+    );
+}
+
+template <bool out_of_order_output, bool untilize>
+inline std::uint32_t get_output_tile_address(std::uint8_t output_id, std::uint32_t output_tile_index) {
+    // Returns the address the next tile is packed to; the in-order path also advances fifo_wr_tile_ptr by one page.
+    std::uint32_t pack_tile_addr = 0;  // defined value for the unsupported untilize path (was returned uninitialised)
+    if constexpr (out_of_order_output) {
+        pack_tile_addr = cb_interface[output_id].fifo_wr_ptr +
+                        (std::uint32_t)(cb_interface[output_id].fifo_page_size)*output_tile_index - 1;
+    } else {
+        if constexpr (untilize) {
+            // FIXME: Need to support pack-untilize?
+            // std::uint16_t out_tile_index = (cb_interface[output_id].ublock_tile_cnt/cb_interface[output_id].ublock_ct)*cb_interface[output_id].row_tile_dim +
+            //                                 cb_interface[output_id].ublock_tile_cnt%cb_interface[output_id].ublock_ct; //FIXME: optimize perf
+            // pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1;
+            // pack_tile_addr += out_tile_index*(std::uint32_t)(cb_interface[output_id].fifo_page_size);
+
+            // cb_interface[output_id].ublock_tile_cnt++;
+
+            // if (cb_interface[output_id].ublock_tile_cnt == cb_interface[output_id].ublock_tile_dim) {
+            //    cb_interface[output_id].ublock_tile_cnt=0;
+            //    cb_interface[output_id].fifo_wr_tile_ptr += (std::uint32_t)(cb_interface[output_id].fifo_page_size)*cb_interface[output_id].ublock_ct;
+            // }
+        } else {
+            pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1;
+            cb_interface[output_id].fifo_wr_tile_ptr += cb_interface[output_id].fifo_page_size;
+        }
+    }
+    return pack_tile_addr;
+}
+
+template <bool out_of_order_output = false, DstSync Dst = SyncFull, bool untilize = false, bool is_fp32_dest_acc_en = false>
+inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0) {
+    std::uint8_t output_id = get_output_id(output);
+    // Use static_assert's message argument so a violation reports the text (the `cond && "msg"` form is the assert() idiom).
+    static_assert(!(untilize && out_of_order_output), "untilize out of order packing is not supported!");
+    // Resolve the destination address in the output buffer (the in-order mode also advances its write tile pointer).
+    std::uint32_t pack_tile_addr = get_output_tile_address<out_of_order_output, untilize>(output_id, output_tile_index);
+    // Forward the tile index and the resolved address to the LLK library.
+    _llk_pack_<Dst, untilize, is_fp32_dest_acc_en>(
+        tile_index,
+        pack_tile_addr
+    );
+}
+
+/*************************************************************************
+* LLK PACK COMMON
+*************************************************************************/
+
+
+inline void llk_packer_wait_for_math_done() {  // API-layer shim: no arguments, no logic of its own.
+    _llk_packer_wait_for_math_done_();  // all behaviour lives in the LLK library function
+}
+
+template <uint WaitRes = p_stall::NONE>
+inline void llk_packer_set_math_semaphore() {  // API-layer shim; WaitRes defaults to p_stall::NONE.
+    _llk_packer_set_math_semaphore_<WaitRes>();  // WaitRes is passed through unchanged
+}
+
+template <DstSync Dst, bool is_fp32_dest_acc_en = false>
+inline void llk_pack_dest_section_done() {  // API-layer shim: no arguments, no logic of its own.
+    _llk_pack_dest_section_done_<Dst, is_fp32_dest_acc_en>();  // both template arguments are passed through unchanged
+}
+
+template <DstSync Dst, DstTileFaceLayout FaceLayout, bool untilize = false>
+inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 0) {
+    const std::uint32_t out_id = get_output_id(pack_output);
+    const std::uint32_t r_dim = get_output_face_r_dim(out_id);
+    const bool is_narrow = get_output_narrow_tile(out_id);
+    // Only the face row dimension and the narrow-tile flag of the output buffer are needed here.
+    _llk_init_packer_dest_offset_registers_<Dst, FaceLayout, untilize>(
+        r_dim,
+        is_narrow
+    );
+}
+
+template <DstSync Dst, DstTileFaceLayout FaceLayout = RowMajor, bool untilize = false, bool is_fp32_dest_acc_en = false>
+inline void llk_pack_dest_init(const std::uint32_t pack_output = 0) {
+    // Look up the output buffer, then forward its face row dimension and narrow-tile flag to _llk_pack_dest_init_.
+    const std::uint32_t out_id = get_output_id(pack_output);
+    const std::uint32_t r_dim = get_output_face_r_dim(out_id);
+    const bool is_narrow = get_output_narrow_tile(out_id);
+    // All four template arguments are passed through unchanged.
+    _llk_pack_dest_init_<Dst, FaceLayout, untilize, is_fp32_dest_acc_en>(
+        r_dim,
+        is_narrow
+    );
+}
+
+template <bool mail2math=true, bool mail2pack=true>
+inline void llk_pack_get_tile(std::uint32_t output, std::uint32_t tile_index, std::uint32_t *p_tile) {  // `output` is accepted but not used
+    _llk_pack_get_tile_<mail2math, mail2pack>(tile_index, p_tile);  // only tile_index and p_tile are forwarded
+}
+
+template <bool mail2math=true, bool mail2pack=true>
+inline void llk_pack_release_tile(std::uint32_t output) {  // `output` is accepted but not used
+    _llk_pack_release_tile_<mail2math, mail2pack>();  // no runtime arguments are forwarded
+}
+
+inline void llk_pack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {  // API-layer shim for the LLK debug dump
+    _llk_pack_debug_dump_(data, byte_size);  // buffer pointer and byte count are passed through unchanged
+}
+
+inline void llk_pack_debug_dump_seek(std::uint8_t offset) {  // API-layer shim; note offset is only 8 bits wide
+    _llk_pack_debug_dump_seek_(offset);  // offset is passed through unchanged
+}
+
+template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
+inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) {
+    // Unconditional reconfiguration: collect formats, page size and tile geometry of new_output.
+    const std::uint32_t out_id = get_output_id(new_output);
+    const std::uint32_t r_dim = get_output_face_r_dim(out_id);
+    const std::uint32_t n_faces = get_output_num_faces(out_id);
+    const bool is_partial = get_output_partial_face(out_id);
+    const bool is_narrow = get_output_narrow_tile(out_id);
+    // The circular buffer's page size is passed directly as the third argument.
+    _llk_pack_reconfig_data_format_<is_fp32_dest_acc_en, is_tile_dim_reconfig_en, FaceLayout>(
+        pack_src_format[out_id],
+        pack_dst_format[out_id],
+        cb_interface[out_id].fifo_page_size,
+        r_dim,
+        n_faces,
+        is_partial,
+        is_narrow
+    );
+}
+
+template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor, bool write_tile_header = true>
+inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) {
+    const std::uint32_t prev_id = get_output_id(old_output);
+    const std::uint32_t next_id = get_output_id(new_output);
+    // A full reconfiguration happens only when both destination formats are valid and differ from each other.
+    const bool formats_differ = pack_dst_format[prev_id] != pack_dst_format[next_id];
+    const bool both_valid = (pack_dst_format[prev_id] != (uint)DataFormat::Invalid) && (pack_dst_format[next_id] != (uint)DataFormat::Invalid);
+    if (formats_differ && both_valid) {
+        llk_pack_reconfig_data_format<is_fp32_dest_acc_en, is_tile_dim_reconfig_en, FaceLayout>(new_output);
+    } else if constexpr (is_tile_dim_reconfig_en) {
+        // Same format but different tile dims
+        llk_pack_mop_config<false, false, FaceLayout, write_tile_header>(new_output);
+    }
+}
+
+TT_ALWAYS_INLINE void llk_pack_relu_config(const std::uint32_t config) {  // API-layer shim, force-inlined via TT_ALWAYS_INLINE
+    _llk_pack_relu_config_(config);  // the raw config word is passed through unchanged
+}
+
+inline void llk_pack_reconfig_l1_acc(const std::uint32_t enable) {  // API-layer shim: no logic of its own
+    _llk_pack_reconfig_l1_acc_(enable);  // enable is passed through unchanged
+}
+
+template <bool untilize = false, ReduceDim dim>
+inline void llk_pack_reduce_mask_config() {  // API-layer shim: no runtime arguments
+    _llk_pack_reduce_mask_config_<untilize, dim>();  // both template arguments are passed through unchanged
+}
+
+inline void llk_pack_reduce_mask_clear() {  // API-layer shim: no arguments, no logic of its own
+    _llk_pack_reduce_mask_clear_();  // all behaviour lives in the LLK library function
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_param_structs.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_param_structs.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_param_structs.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_param_structs.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_reverseops.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_reverseops.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_reverseops.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_reverseops.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_cdf.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_cdf.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_cdf.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_cdf.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_converter.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_converter.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_converter.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_converter.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_elu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_elu.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_erf_erfc.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_erfinv.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_erfinv.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_exp.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_exp.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_exp.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_exp.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_gelu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_gelu.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_i0.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_i0.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_isinf_isnan.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_isinf_isnan.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_isinf_isnan.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_isinf_isnan.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_logical_not_noti.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_recip.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_recip.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_recip.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_recip.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_relu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_relu.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_sqrt.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_sqrt.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_sqrt.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_sqrt.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_trigonometry.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_trigonometry.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_trigonometry.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_trigonometry.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_0_param.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_0_param.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_1_param.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_1_param.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
similarity index 83%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
index 822699707d1..f1e7d19acc8 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
@@ -8,9 +8,11 @@
 #include "ckernel_globals.h"
 #include "ckernel_include.h"
 #include "ckernel_template.h"
+#include "metal_ckernel_sfpu.h"
 #include "cmath_common.h"
 #include "llk_format_conversions.h"
 #include "llk_math_common.h"
 #include "llk_param_structs.h"
+#include "llk_math_eltwise_unary_sfpu.h"
 
 using namespace ckernel;
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_elu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_elu.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_erf_erfc.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_erfinv.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_erfinv.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_exp.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_exp.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_gelu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_gelu.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_i0.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_i0.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_init.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
similarity index 66%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_init.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
index 691accf168a..b82b1f39cb4 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_init.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
@@ -28,4 +28,18 @@ inline void llk_math_eltwise_unary_sfpu_init(void (*func)()) {
     math::reset_counters(p_setrwc::SET_ABD_F);
 }
 
+template <SfpuType sfpu_op, bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_init(
+    uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) {
+    // Overload taking up to six optional uint parameters (all default to 0); forwards them unchanged to the LLK library init.
+    _llk_math_eltwise_unary_sfpu_init_<sfpu_op, APPROXIMATE>(
+        param0,
+        param1,
+        param2,
+        param3,
+        param4,
+        param5
+    );
+}
+
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_isinf_isnan.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_isinf_isnan.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_logical_not_noti.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_recip.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_recip.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_relu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_relu.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_reverseops.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_reverseops.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_sqrt.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_sqrt.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_trigonometry.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_trigonometry.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
new file mode 100644
index 00000000000..72c27cde02b
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
@@ -0,0 +1,778 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel_defs.h"
+#include "ckernel_sfpu.h"
+#include "ckernel.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+#include "ckernel_sfpu_cdf.h"
+#include "ckernel_sfpu_exp.h"
+#include "ckernel_sfpu_recip.h"
+#include "ckernel_sfpu_converter.h"
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+template <bool APPROXIMATION_MODE, int ITERATIONS, int RECIPROCAL_ITERATIONS>
+inline void calculate_rsqrt()
+{
+    // Reciprocal square root over ITERATIONS dst_reg rows. APPROXIMATION_MODE is not referenced in this body.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        // Lanes holding exactly 0 get +infinity; all other lanes run the Newton refinement below.
+        vFloat in = dst_reg[0];
+        v_if(dst_reg[0] == 0.0f){
+            dst_reg[0] = std::numeric_limits<float>::infinity();
+        }v_else{
+            vFloat result = 1.0f;
+            v_if(dst_reg[0] > 1.0f){
+                result = sfpu_reciprocal(in);
+            }v_endif;
+            // Starting guess: sfpu_reciprocal(in) for inputs above 1, otherwise 1.0.
+            for (int r = 0; r < RECIPROCAL_ITERATIONS; r++)
+            {
+                // y = y * (1.5 - 0.5 * x * y * y) Newton's method iteration.
+                result = result * (1.5F - 0.5F  * dst_reg[0] * result * result);
+            }
+            dst_reg[0] = result;
+        }v_endif;
+        // Step to the next dst_reg row.
+        dst_reg++;
+        // NOTE(review): dst_reg[0] is re-read instead of reusing `in`; presumably deliberate for register pressure - confirm.
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_sigmoid_appx()
+{
+    vUInt l0 = l_reg[LRegs::LReg0];
+    vUInt l1 = l_reg[LRegs::LReg1];
+    vUInt l2 = l_reg[LRegs::LReg2];
+    // Sigmoid approximation: each of ITERATIONS dst_reg rows becomes lut(val, LReg0..2) + 0.5.
+    #pragma GCC unroll 8
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+        // LUT contents come from LReg0-2 as read above; they are not set in this function.
+        dst_reg[0] = lut(val, l0, l1, l2) + 0.5f;
+        // Step to the next dst_reg row.
+        dst_reg++;
+    }
+    // Write the three values back to LReg0-2 (presumably so the LUT stays resident for later calls - confirm).
+    l_reg[LRegs::LReg0] = l0;
+    l_reg[LRegs::LReg1] = l1;
+    l_reg[LRegs::LReg2] = l2;
+}
+
+// TODO: Implement using bitwise comparison
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_signbit()
+{
+    // Per row: 1.0 where val <= -0.0f, else 0.0 where val >= 0.0f; lanes matching neither test keep their value.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+        v_if (val <= -0.0f) {
+            val = 1.0f;
+        } v_elseif (val >= 0.0f) {
+            val = 0.0f;
+        }
+        v_endif;
+        dst_reg[0] = val;
+        // NOTE(review): under IEEE-754 ordering -0.0f == +0.0f, so +0 may take the first branch - confirm SFPU compare semantics.
+       dst_reg++;
+    }
+    // APPROXIMATION_MODE is not referenced in this body.
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_tanh()
+{
+    // SFPU microcode
+    vUInt l0 = l_reg[LRegs::LReg0];
+    vUInt l1 = l_reg[LRegs::LReg1];
+    vUInt l2 = l_reg[LRegs::LReg2];
+    // tanh via table lookup: each of ITERATIONS dst_reg rows is replaced by lut(val, LReg0..2).
+    #pragma GCC unroll 8
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+        val = lut(val, l0, l1, l2);
+        dst_reg[0] = val;
+        // Step to the next dst_reg row.
+        dst_reg++;
+    }
+    // Write the three values back to LReg0-2 (presumably so the LUT stays resident for later calls - confirm).
+    l_reg[LRegs::LReg0] = l0;
+    l_reg[LRegs::LReg1] = l1;
+    l_reg[LRegs::LReg2] = l2;
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_hardtanh(uint param0, uint param1, uint param2)
+{
+    // All params are in FP16_B format
+    // param0 = -(neg_threshold)
+    // param1 = -(pos_threshold - neg_threshold)
+    // param2 = -(pos_threshold)
+    // NOTE(review): per the arithmetic below, recovering clamp(x, neg, pos) needs param2 == +pos_threshold - confirm with caller.
+    vFloat p0 = s2vFloat16(param0);
+    vFloat p1 = s2vFloat16(param1);
+    vFloat p2 = s2vFloat16(param2);
+    // SFPU microcode
+    #pragma GCC unroll 0
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+        // Step 1: shift by p0 and zero every lane that went negative.
+        val += p0;// 12 bits
+        v_if (val < 0.0f) {
+            val = 0.0f;
+        }
+        v_endif;
+        // Step 2: shift by p1 and zero every lane that is still non-negative.
+        val += p1;// 12 bits
+        v_if (val >= 0.0f) {
+            val = 0.0f;
+        }
+        v_endif;
+        // Step 3: final shift by p2.
+        val += p2;// 12 bits
+        // Store the result for this row.
+        dst_reg[0] = val;
+        // Step to the next dst_reg row.
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int WITH_PRECOMPUTED_TANH, int ITERATIONS>
+inline void calculate_tanh_derivative()
+{
+    vUInt l0 = l_reg[LRegs::LReg0];
+    vUInt l1 = l_reg[LRegs::LReg1];
+    vUInt l2 = l_reg[LRegs::LReg2];
+    // When WITH_PRECOMPUTED_TANH is non-zero the row is treated as already holding tanh(x) and the LUT step is skipped.
+    // tanh'(x) = 1 - (tanh(x))^2
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+        // Otherwise compute tanh(x) first with the same lut() call that calculate_tanh uses.
+        if constexpr (!WITH_PRECOMPUTED_TANH) {
+            val = lut(val, l0, l1, l2);
+        }
+        // 1 - val^2, written as val * (-val) + 1.
+        val = val * (-val) + vConst1;
+        dst_reg[0] = val;
+        // Step to the next dst_reg row.
+        dst_reg++;
+    }
+    // Write the three values back to LReg0-2 (presumably so the LUT stays resident for later calls - confirm).
+    l_reg[LRegs::LReg0] = l0;
+    l_reg[LRegs::LReg1] = l1;
+    l_reg[LRegs::LReg2] = l2;
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_dropout(uint prob, uint scale)
+{
+    // SFPU microcode
+    // Dropout: scale every row by `scale`, zero lanes whose random value is below `prob`, then advance the generator.
+    vUInt rand = l_reg[LRegs::LReg3];
+    // The generator state is carried in LReg3: read above, written back after the loop.
+    #pragma GCC unroll 0
+    for (int d = 0; d < ITERATIONS; d++) {
+        ////////////////////////
+        // Scale samples
+        ///////////////////////
+        dst_reg[0] = dst_reg[0] * s2vFloat16b(scale);
+        // `scale` is converted with s2vFloat16b; `prob` is compared as an integer against rand.
+        ////////////////////////
+        // Drop samples
+        ///////////////////////
+        v_if (rand < prob) {
+            dst_reg[0] = vConst0;
+        }
+        v_endif;
+        // Update: shift rand right by one; XOR in vConstIntPrgm0 when (vConstIntPrgm1 & old rand) is non-zero.
+        ////////////////////////
+        // 16-bit PRNG update
+        ///////////////////////
+        vUInt lfsr = vConstIntPrgm1;
+        vUInt tmp = lfsr & rand;
+        rand = rand >> 1;
+        v_if (tmp != 0) {
+            vUInt mask = vConstIntPrgm0;
+            rand ^= mask;
+        }
+        v_endif;
+        // Step to the next dst_reg row.
+        dst_reg++;
+    }
+    // Persist the generator state for the next call.
+    l_reg[LRegs::LReg3] = rand;
+}
+
+template <bool APPROXIMATION_MODE,int ITERATIONS>
+inline void calculate_power_iterative(const uint exponent)
+{   // Raises each of ITERATIONS dst_reg rows to the unsigned integer power `exponent`, in place.
+    #pragma GCC unroll 8
+    for (int d = 0; d < ITERATIONS; d++)  // honour ITERATIONS like the sibling kernels (bound was hard-coded to 8)
+    {
+        vFloat in = dst_reg[0];
+        vFloat result = 1.0f;
+        for (uint i = 0; i < exponent; i++) {  // repeated multiplication; exponent == 0 leaves result at 1.0
+            result *= in;
+        }
+        dst_reg[0] = result;
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_square()
+{
+    #pragma GCC unroll 8
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat in = dst_reg[0];
+        vFloat result = in * in;
+        // Element-wise square of the current row, written back in place.
+        dst_reg[0] = result;
+        // Step to the next dst_reg row. APPROXIMATION_MODE is not referenced in this body.
+        dst_reg++;
+    }
+}
+
+template <bool HAS_BASE_SCALING>
+sfpi_inline void calculate_log_body(const uint log_base_scale_factor)  // log of dst_reg[0], written back in place; caller advances dst_reg
+{
+    ////////////////////////////
+    // Load From dest + "normalize to calculation range"
+    ////////////////////////////
+    vFloat in = dst_reg[0];
+    vFloat x = setexp(in, 127);    // set exp to exp bias (put in range of 1-2)
+    // Result is assembled below as series(x) + exponent * ln2, optionally scaled to another base.
+    // XXXXXX ask Namal? if we can derive the coefficients below to higher precision
+    ////////////////////////////
+    // Calculate Cheby Approximation using Horner Form Multiplication: 3rd Order
+    // x* ( x* (A*x + B) + C) + D
+    // A :0.1058, B: -0.3942, C: 0.9813, D: 0.006
+    // Run above on (x-1) so x is in ln(x+1), plug (x-1 into equation above to
+    // save the subtract and get A',B',C',D'):
+    // A' = A
+    // B' = -3A + B
+    // C' = 3A - 2B + C
+    // D' = -A + B - C + D
+    // A':0.1058, B':-0.7116, C':2.0871, D':-1.4753
+    ////////////////////////////
+    vFloat a = vConstFloatPrgm1;
+    vFloat b = vConstFloatPrgm2;
+    // XXXXX try variants of the below: B'=.7122, C'=2.0869
+    vFloat series_result = x * (x * (x * a + b) + 2.0871) + -1.4753f;
+    // NOTE(review): 2.0871 has no `f` suffix (double literal) while -1.4753f does - confirm this is intended.
+    ////////////////////////////
+    // Convert exponent to float
+    ////////////////////////////
+    vInt exp = exexp(in);
+    v_if (exp < 0) {
+        exp = setsgn(~exp + 1, 1);
+    }
+    v_endif;
+    // Negative exponents are negated (~exp + 1) and get the sign bit set; presumably the form int32_to_float expects - confirm.
+    vFloat expf = int32_to_float(exp, 0);
+    vFloat vConstLn2 = vConstFloatPrgm0;
+    vFloat result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2)
+    // Optional base change: multiply by the caller-supplied factor, converted with s2vFloat16a.
+    if constexpr (HAS_BASE_SCALING) {
+        result *= s2vFloat16a(log_base_scale_factor);
+    }
+    // vConstFloatPrgm0/1/2 are read here as ln2, A' and B'; they are not set in this function.
+    ////////////////////////////
+    // Base case when input is 0. ln(0) = -inf
+    ////////////////////////////
+    v_if (in == 0.0F) { // Reload for register pressure
+        result = -std::numeric_limits<float>::infinity();
+    }
+    v_endif;
+    // Store the final value for this row.
+    dst_reg[0] = result;
+}
+
+template <bool APPROXIMATION_MODE, bool HAS_BASE_SCALING, int ITERATIONS>
+inline void calculate_log(uint log_base_scale_factor)
+{   // Applies calculate_log_body to ITERATIONS consecutive dst_reg positions; APPROXIMATION_MODE is not used here.
+    #pragma GCC unroll 8
+    for(int d = 0; d < ITERATIONS; d++){
+        calculate_log_body<HAS_BASE_SCALING>(log_base_scale_factor);
+        dst_reg++;  // the body works on dst_reg[0], so step the cursor between calls
+    }
+}
+
+sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& flag2, float init)
+{   // Sets flag1 to init unconditionally and flag2 to init only when check is true.
+    flag1 = init;
+    if (check) {
+        flag2 = init;
+    }
+}
+
+template <bool APPROXIMATION_MODE, SfpuType COMP_MODE, int ITERATIONS>
+inline void calculate_comp(uint exponent_size_8)
+{   // Compares each of ITERATIONS dst_reg positions against zero and stores 1.0f where COMP_MODE holds, else 0.0f.
+    const vFloat zero = 0.0f;
+    const vFloat one = 1.0f;
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat v = dst_reg[0];
+        // At most one COMP_MODE branch below is compiled in; exponent_size_8 is only forwarded to sfpu_is_fp16_zero.
+
+        // a[i] == 0
+        if constexpr (COMP_MODE == SfpuType::equal_zero) {
+            v_if (sfpu_is_fp16_zero(v, exponent_size_8)) {
+                v = one;
+            } v_else {
+                v = zero;
+            }
+            v_endif;
+        }
+
+        // a[i] != 0
+        if constexpr (COMP_MODE == SfpuType::not_equal_zero) {
+            v_if (sfpu_is_fp16_zero(v, exponent_size_8)) {
+                v = zero;
+            } v_else {
+                v = one;
+            }
+            v_endif;
+        }
+
+        // a[i] < 0
+        if constexpr (COMP_MODE == SfpuType::less_than_zero) {
+            v_if (v >= 0.0f) {
+                v = zero;
+            } v_else {
+                v = one;
+            }
+            v_endif;
+        }
+
+        // a[i] >= 0
+        if constexpr (COMP_MODE == SfpuType::greater_than_equal_zero) {
+            v_if (v >= 0.0f) {
+                v = one;
+            } v_else {
+                v = zero;
+            }
+            v_endif;
+        }
+
+        // a[i] > 0
+        if constexpr (COMP_MODE == SfpuType::greater_than_zero) {
+            v_if (v > 0.0f) {
+                v = one;
+            } v_else {
+                v = zero;
+            }
+            v_endif;
+        }
+
+        // a[i] <= 0
+        if constexpr (COMP_MODE == SfpuType::less_than_equal_zero) {
+            v_if (v > 0.0f) {
+                v = zero;
+            } v_else {
+                v = one;
+            }
+            v_endif;
+        }
+
+        dst_reg[0] = v;
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_clamp(uint param0, uint param1, uint param2)
+{
+    // Clamps each of ITERATIONS dst_reg positions to [min, max], then adds an offset.
+    // param0 = min (decoded as fp16a)
+    // param1 = max (decoded as fp16a)
+    // param2 = offset added after clamping (decoded with s2vFloat16b)
+    // The fp16a format is fixed here; it is not derived from the parameter bits.
+    s2vFloat16::Format format = s2vFloat16::fp16a;
+
+    // SFPU microcode
+    vFloat min = s2vFloat16(param0, format);
+    vFloat max = s2vFloat16(param1, format);
+    #pragma GCC unroll 0
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+
+        v_if (val < min) {
+            val = s2vFloat16(param0, format);
+        } v_elseif (val >= max) {
+            val = s2vFloat16(param1, format);
+        }
+        v_endif;
+
+        dst_reg[0] = val + s2vFloat16b(param2); // 12 bits
+
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_abs()
+{
+    // Replaces each of ITERATIONS dst_reg positions with sfpi::abs of its value.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat v = dst_reg[0];
+        dst_reg[0] = sfpi::abs(v);
+        dst_reg++;
+    }
+}
+
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_exp2()
+{
+    // Computes 2^x for ITERATIONS dst_reg positions via the identity 2^x = e^(x * ln 2).
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat v = dst_reg[0];
+        // ln(2) = 0.6931471805
+        v = v * 0.6931471805f;
+	    // exp = e^(v), using the shared exponential helper
+	    vFloat exp = calculate_exponential_body_improved<APPROXIMATION_MODE, true>(v);
+	    dst_reg[0] = exp;
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_sign()
+{
+    // Stores -1, +1 or 0 per lane according to the sign of the input; the function takes no parameters.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat v = dst_reg[0];
+	vFloat result = vConst1;  // initial value; every branch below assigns result again
+        v_if (v < 0.0f) {
+           result = vConstNeg1;
+        } v_elseif(v > 0.0f) {
+	  result = vConst1;
+	} v_else {
+	  result = vConst0;  // lanes that are neither below nor above zero
+        }
+        v_endif;
+
+	dst_reg[0] = result;
+        dst_reg++;
+    }
+}
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_max()
+{   // Per-lane maximum of dst_reg[0] and dst_reg[32], left in dst_reg[0], for ITERATIONS positions.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat a = dst_reg[0];
+        vFloat b = dst_reg[32];  // second operand is read 32 positions ahead — presumably the next tile; TODO confirm
+        v_if(a < b) {
+            dst_reg[0] = b;  // only the lanes where b wins need a store
+        }
+        v_endif;
+
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_min()
+{   // Per-lane minimum of dst_reg[0] and dst_reg[32], left in dst_reg[0], for ITERATIONS positions.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat a = dst_reg[0];
+        vFloat b = dst_reg[32];  // second operand is read 32 positions ahead — presumably the next tile; TODO confirm
+        v_if(a > b) {
+            dst_reg[0] = b;  // only the lanes where b wins need a store
+        }
+        v_endif;
+
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_expm1()
+{
+    // Computes e^x - 1 for ITERATIONS dst_reg positions: exponential helper first, then subtract 1.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat v = dst_reg[0];
+        v = calculate_exponential_body_improved<APPROXIMATION_MODE, true>(v);
+        dst_reg[0] = v - 1.0f;
+        dst_reg++;
+    }
+}
+
+
+#define POLYVAL6(coef5, coef4, coef3, coef2, coef1, coef0, t4)  ((t4) * ((t4) * ((t4) * ((t4) * ((coef5) * (t4) + (coef4)) + (coef3)) + (coef2)) + (coef1)) + (coef0))  // Horner form, degree 5 in t4; t4 is expanded five times, so pass a side-effect-free expression
+
+template <bool APPROXIMATION_MODE>
+sfpi_inline vFloat sfpu_atan_maclaurin_series(vFloat val)
+{   // Returns an arctangent approximation of val. Side effect: dst_reg[0] is overwritten and used as scratch.
+    v_if(1 > sfpi::abs(val)){
+        dst_reg[0] = sfpi::abs(val)  ;
+    }
+    v_else{
+        dst_reg[0] =  sfpu_reciprocal(sfpi::abs(val));  // |val| >= 1: evaluate the polynomial at 1/|val| instead
+    }
+    v_endif;
+
+    vFloat t1 = dst_reg[0] * dst_reg[0];
+    // Polynomial in the squared argument. NOTE(review): the coefficients are close to, but not exactly, 1, -1/3, 1/5, ...
+    t1 = POLYVAL6(-0.013480470f, 0.057477314f, -0.121239071f, 0.195635925f, -0.332994597f, 0.999995630f, t1);
+
+    t1 = t1 * dst_reg[0];
+
+    v_if (sfpi::abs(val) > 1){
+        t1 = 1.570796327f - t1;  // undo the reciprocal: atan(x) = pi/2 - atan(1/x) for x > 1
+    }
+    v_endif;
+
+    v_if(val < 0 ){
+        t1 = -t1;  // the steps above worked on |val|; restore the sign
+    }
+    v_endif;
+
+    return t1;
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_atan()
+{
+    // Replaces each of ITERATIONS dst_reg positions with sfpu_atan_maclaurin_series of its value.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+        val = sfpu_atan_maclaurin_series<APPROXIMATION_MODE>(val);
+        dst_reg[0] = val;
+        dst_reg++;
+    }
+}
+
+
+template <bool APPROXIMATION_MODE>
+sfpi_inline vFloat sfpu_asine_maclaurin_series(vFloat val)
+{
+    // Input is expected in [-1, 1]
+    // Maclaurin series
+    // arcsin(x) = x + [(1/2) *x^3/3] + [(1 * 3) / (2 * 4) * x^5 / 5] + [(1 * 3 * 5) / (2 * 4 * 6) * x^7 / 7 ] + ...
+    // arcsin(x) ≈ x + (1/6) * x^3 + (3/40) * x^5 + (5/112) * x^7 + (35/1152) * x^9 + (63/2816) * x^11
+    // NOTE(review): truncated after x^11; the series converges slowly as |x| nears 1, so accuracy presumably drops there.
+    vFloat tmp = val;
+    vFloat val_square = val * val;
+    // x
+    vFloat output = tmp;
+    // (1/6) * x^3
+    tmp = tmp * val_square;
+    output += 0.166666666 * tmp;
+    // (3/40) * x^5
+    tmp = tmp * val_square;
+    output +=  0.075 * tmp;
+
+    // (5/112) * x^7
+    tmp = tmp * val_square;
+    output += 0.044642857 * tmp;
+
+    // (35/1152) * x^9
+    tmp = tmp * val_square;
+    output += 0.03038194 * tmp;
+
+    // (63/2816) * x^11
+    tmp = tmp * val_square;
+    output += 0.02237216 * tmp;
+
+    // Return the accumulated sum; nothing is written to dst_reg here
+    return output;
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_asin()
+{
+    // Replaces each of ITERATIONS dst_reg positions with sfpu_asine_maclaurin_series of its value.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat v = dst_reg[0];
+        v = sfpu_asine_maclaurin_series<APPROXIMATION_MODE>(v);
+        dst_reg[0] = v;
+        dst_reg++;
+    }
+}
+
+
+#define PI_2 (1.570796326794)  // pi / 2
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_acos()
+{
+    // Computes acos for ITERATIONS dst_reg positions through the asin series above:
+    // acos = (pi/2 - asin)
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat v = dst_reg[0];
+        v = sfpu_asine_maclaurin_series<APPROXIMATION_MODE>(v);
+        v = PI_2 - v;
+        dst_reg[0] = v;
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void cast_fp32_to_fp16a()
+{   // Issues a raw load / stochastic-round / store instruction triple per iteration instead of SFPI expressions.
+    #pragma GCC unroll 8
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        // Presumably the hand-written form of: vFloat val = dst_reg[0]; dst_reg[0] = float_to_fp16a(val, 0);
+        // NOTE(review): the operand encodings are defined by the TTI_* macros, which are outside this view.
+        TTI_SFPLOAD(0, 0, 3, 0);
+        TTI_SFP_STOCH_RND(0,0,0,0,0,8);
+        TTI_SFPSTORE(0,1,3,0);
+        dst_reg++;
+    }
+}
+
+
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_negative()
+{
+    // Negates each of ITERATIONS dst_reg positions in place.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+        dst_reg[0] = -val;
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_add1()
+{   // Adds 1.0f to each of ITERATIONS dst_reg positions in place.
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+        dst_reg[0] = 1.0f + val;
+        dst_reg++;
+    }
+}
+
+inline
+vFloat sigmoid_piecewise_linear_positive(vFloat val) {  // piecewise sigmoid approximation; the name implies val >= 0
+        vFloat result = 0.0f;
+	v_if ( val >= +5.0f)  {
+	  result = 1.0f;  // saturate at 1 from 5.0 upward
+	} v_elseif ( val > 1.0f && val < 5.0f ) {
+	  result = POLYVAL5(0.00144462f, -0.01055479f, -0.01203685f,  0.24300185f,  0.50437757f,val);
+	} v_else {
+	  result = 0.229f*val + 0.5f; // linear appx as y = 0.229x + 0.5
+	}
+	v_endif;
+	return result;
+}
+
+// Sigmoid is evaluated on |x| and mirrored for negative inputs using
+// sigmoid(-x) = 1 - sigmoid(x)
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_sigmoid()
+{
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+        vFloat result = 0.0f;
+
+        v_if ( val < 0.0f ) {
+  	   val = -val;
+        }
+        v_endif;
+
+	result = sigmoid_piecewise_linear_positive(val);
+
+	val = dst_reg[0];  // reload the original (signed) input
+        v_if ( val < 0.0f ) {
+            result = 1.0f - result;
+        }
+        v_endif;
+
+        dst_reg[0] = result;
+        dst_reg++;
+    }
+
+    return;
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_heaviside(uint value)
+{
+    // Step function: 0 for negative lanes, 1 for positive lanes, `value` for the rest.
+    Converter c_value;
+    c_value.u = value;  // `value` is stored through .u and read back through .f
+    vFloat s = c_value.f;
+
+    #pragma GCC unroll 0
+    for (int d = 0; d < ITERATIONS; d++) {
+        vFloat v = dst_reg[0];
+
+        v_if (v < 0.0f) {
+            v = 0.0f;
+        }v_elseif (v > 0.0f) {
+            v = 1.0f;
+        }v_else {
+            v = s;  // lanes that are neither below nor above zero
+        }
+        v_endif;
+
+       dst_reg[0] = v;
+
+        dst_reg++;
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void calculate_silu()
+{
+    // SiLU: x * sigmoid(x), with the sigmoid evaluated on |x| and mirrored for negative x.
+    for (int d = 0; d < ITERATIONS; d++) {
+        vFloat val = dst_reg[0];
+        v_if ( val < 0.0f ) {
+            val = -val;
+        }
+        v_endif;
+
+	    vFloat result = sigmoid_piecewise_linear_positive(val);
+
+	    val = dst_reg[0];  // reload the original (signed) input
+        v_if ( val < 0.0f ) {
+            result = 1.0f - result;
+        }
+        v_endif;
+        result = val * result;
+        dst_reg[0] = result;
+        dst_reg++;
+    }
+}
+
+} // namespace sfpu
+} // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h
new file mode 100644
index 00000000000..bce909a4395
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h
@@ -0,0 +1,85 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_AB.h"
+#include "llk_unpack_common_api.h"
+
+/*************************************************************************
+ * LLK UNPACK AB
+ *************************************************************************/
+
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void llk_unpack_AB_hw_configure(
+    const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) {
+    // Operand mapping: In0 -> unpA -> srcA, In1 -> unpB -> srcB.
+    const uint32_t id_a = get_operand_id(unpack_AB_params->unpA_operand);
+    const uint32_t id_b = get_operand_id(unpack_AB_params->unpB_operand);
+
+    // Tile geometry is looked up on unpA only; unpB is taken to have the same
+    // face count and face row dimension.
+    const uint32_t faces_per_tile = get_operand_num_faces(id_a);
+    const uint32_t rows_per_face = get_operand_face_r_dim(id_a);
+
+    // Forward the source/destination formats of both operands plus the geometry
+    // to the underlying LLK configuration routine.
+    _llk_unpack_AB_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+        unpack_src_format[id_a],
+        unpack_src_format[id_b],
+        unpack_dst_format[id_a],
+        unpack_dst_format[id_b],
+        rows_per_face,
+        within_face_16x16_transpose,
+        faces_per_tile);
+}
+
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void llk_unpack_AB_hw_configure_disaggregated(
+    const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0) {
+    // Pack the loose operands into the params struct that the aggregated entry point takes.
+    const llk_unpack_AB_params_t params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand};
+    llk_unpack_AB_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&params, within_face_16x16_transpose);
+}
+
+template <BroadcastType BType = BroadcastType::NONE>
+inline void llk_unpack_AB_mop_config(const bool transpose_of_faces = false, const std::uint32_t operand_id = 0) {
+    // Narrow tiles: face 0 is read twice for row broadcast, faces 0 and 1 for column broadcast.
+    const std::uint32_t face_count = get_operand_num_faces(operand_id);
+    const bool is_narrow = get_operand_narrow_tile(operand_id);
+    _llk_unpack_AB_mop_config_<BType>(transpose_of_faces, face_count, is_narrow);
+}
+
+template <BroadcastType BType = BroadcastType::NONE>
+inline void llk_unpack_AB_init(
+    const std::uint32_t operandA,
+    const std::uint32_t operandB,
+    const std::uint32_t transpose = 0,
+    const std::uint32_t acc_to_dest = 0) {
+    // All tile geometry is taken from operand A; operandB is accepted but not consulted,
+    // on the premise that both operands share the same face layout.
+    const std::uint32_t id_a = get_operand_id(operandA);
+    const std::uint32_t rows_per_face = get_operand_face_r_dim(id_a);
+    const std::uint32_t face_count = get_operand_num_faces(id_a);
+    const bool is_narrow = get_operand_narrow_tile(id_a);  // narrow tile: face 0 is read twice for row broadcast
+    _llk_unpack_AB_init_<BType>(rows_per_face, face_count, is_narrow, transpose, acc_to_dest);
+}
+
+template <BroadcastType BType = BroadcastType::NONE>
+inline void llk_unpack_AB(
+    const std::uint32_t operandA,
+    const std::uint32_t operandB,
+    const std::uint32_t tile_index_a,
+    const std::uint32_t tile_index_b,
+    const bool transpose_of_faces = 0 /*not used*/) {
+    // Resolve both operands to their circular-buffer entries.
+    const std::uint32_t id_a = get_operand_id(operandA);
+    const std::uint32_t id_b = get_operand_id(operandB);
+
+    // Tile address = (read pointer - 1) + page size * tile index, per operand.
+    std::uint32_t address_a = cb_interface[id_a].fifo_rd_ptr - 1;
+    address_a += cb_interface[id_a].fifo_page_size * tile_index_a;
+    std::uint32_t address_b = cb_interface[id_b].fifo_rd_ptr - 1;
+    address_b += cb_interface[id_b].fifo_page_size * tile_index_b;
+    _llk_unpack_AB_<BType>(address_a, address_b, transpose_of_faces);
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h
new file mode 100644
index 00000000000..68eca79f4e9
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h
@@ -0,0 +1,136 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_AB_matmul.h"
+#include "llk_unpack_common_api.h"
+
+/*************************************************************************
+ * LLK UNPACK AB MATMUL
+ *************************************************************************/
+
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) {
+    // Configures the unpackers for matmul from the operands named in unpack_AB_params.
+    const bool transpose_srca = unpack_AB_params->transpose_xy_srca;
+
+    // Operands are crossed over for matmul (In0 -> unpB -> srcB, In1 -> unpA -> srcA):
+    // the id driving unpacker A is taken from unpB_operand and vice versa.
+    const uint32_t srca_id = get_operand_id(unpack_AB_params->unpB_operand);
+    const uint32_t srcb_id = get_operand_id(unpack_AB_params->unpA_operand);
+
+    const uint32_t srca_faces = get_operand_num_faces(srca_id);
+    const uint32_t srcb_faces = get_operand_num_faces(srcb_id);
+
+    const uint32_t srca_face_rows = get_operand_face_r_dim(srca_id);
+    const uint32_t srcb_face_rows = get_operand_face_r_dim(srcb_id);
+
+    // Formats, face geometry and page sizes are passed A-then-B for every pair.
+    _llk_unpack_AB_matmul_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+        unpack_src_format[srca_id],
+        unpack_src_format[srcb_id],
+        unpack_dst_format[srca_id],
+        unpack_dst_format[srcb_id],
+        srca_face_rows,
+        srcb_face_rows,
+        transpose_srca,
+        srca_faces,
+        srcb_faces,
+        cb_interface[srca_id].fifo_page_size,
+        cb_interface[srcb_id].fifo_page_size);
+}
+
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void llk_unpack_AB_matmul_hw_configure_disaggregated(
+    const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) {
+    // Pack the loose arguments into the params struct that the aggregated entry point takes.
+    const llk_unpack_AB_matmul_params_t params = {
+        .unpA_operand = unpA_operand, .unpB_operand = unpB_operand, .transpose_xy_srca = transpose_xy_srca};
+    llk_unpack_AB_matmul_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&params);
+}
+
+inline void llk_unpack_AB_matmul_mop_config(
+    const bool transpose,
+    const std::uint32_t ct_dim,
+    const std::uint32_t rt_dim,
+    const std::uint32_t kt_dim,
+    const bool partial_face) {
+    // Pure pass-through to the LLK implementation; no operand lookup happens here.
+    // in0 - loaded to SrcB, in1 - loaded to SrcA
+    _llk_unpack_AB_matmul_mop_config_(transpose, ct_dim, rt_dim, kt_dim, partial_face);
+}
+
+__attribute__((always_inline)) inline void llk_unpack_AB_matmul_init(
+    const std::uint32_t operandA,
+    const std::uint32_t operandB,
+    const std::uint32_t transpose = 0,
+    const std::uint32_t ct_dim = 1,
+    const std::uint32_t rt_dim = 1,
+    const std::uint32_t kt_dim = 1) {
+    // Matmul crosses the caller's operands over to the unpackers:
+    //   In0 (operandA) -> unpB -> srcB (supports partial face)
+    //   In1 (operandB) -> unpA -> srcA
+    const uint32_t operandA_id = get_operand_id(operandB);  // id feeding unpA / srcA
+    const uint32_t operandB_id = get_operand_id(operandA);  // id feeding unpB / srcB
+
+    const uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandA_id);
+    const uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandB_id);
+
+    const bool partial_face = get_operand_partial_face(operandB_id);
+
+    const uint32_t unpA_num_faces = get_operand_num_faces(operandA_id);
+    // With a partial face, unpB is set up for a single face (unpacked face by face).
+    const uint32_t unpB_num_faces = partial_face ? 1 : get_operand_num_faces(operandB_id);
+
+    _llk_unpack_AB_matmul_init_(
+        transpose,
+        ct_dim,
+        rt_dim,
+        kt_dim,
+        unpA_face_r_dim,
+        unpB_face_r_dim,
+        unpA_num_faces,
+        unpB_num_faces,
+        partial_face);
+}
+
+inline void llk_unpack_AB_matmul(
+    const std::uint32_t operandA,
+    const std::uint32_t operandB,
+    const std::uint32_t tile_index_a,
+    const std::uint32_t tile_index_b,
+    const std::uint32_t ct_dim = 1,
+    const std::uint32_t rt_dim = 1,
+    const std::uint32_t kt_dim = 1) {
+    // Operand routing: In0/InA feeds srcB (partial-face capable), In1/InB feeds srcA.
+    // Addresses and tile sizes are still passed in caller order (A, then B).
+    volatile uint *cfg = get_cfg_pointer();  // get pointer to registers for current state ID
+
+    const std::uint32_t in0_id = get_operand_id(operandA);
+    const std::uint32_t in1_id = get_operand_id(operandB);
+
+    // Face row dimensions are named after the source register they end up in.
+    const std::uint32_t srca_face_rows = get_operand_face_r_dim(in1_id);
+    const std::uint32_t srcb_face_rows = get_operand_face_r_dim(in0_id);
+    const bool in0_partial_face = get_operand_partial_face(in0_id);
+
+    // Base address is the circular-buffer read pointer minus one; tile size is its page size.
+    const std::uint32_t in0_base = cb_interface[in0_id].fifo_rd_ptr - 1;
+    const std::uint32_t in1_base = cb_interface[in1_id].fifo_rd_ptr - 1;
+    const std::uint32_t in0_tile_size = cb_interface[in0_id].fifo_page_size;
+    const std::uint32_t in1_tile_size = cb_interface[in1_id].fifo_page_size;
+
+    _llk_unpack_AB_matmul_(
+        in0_base,
+        in1_base,
+        tile_index_a,
+        tile_index_b,
+        in0_tile_size,
+        in1_tile_size,
+        srca_face_rows,
+        srcb_face_rows,
+        in0_partial_face,
+        ct_dim,
+        rt_dim,
+        kt_dim);
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h
new file mode 100644
index 00000000000..e8918793baa
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h
@@ -0,0 +1,89 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_A.h"
+#include "llk_unpack_common_api.h"
+
+/*************************************************************************
+ * LLK UNPACK A
+ *************************************************************************/
+
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void llk_unpack_A_hw_configure(
+    const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) {
+    const uint32_t id = get_operand_id(unpack_A_params->unpA_operand);
+    const uint32_t face_count = get_operand_num_faces(id);
+    const uint32_t rows_per_face = get_operand_face_r_dim(id);
+    // Hand the operand's formats and face geometry to the LLK configuration routine.
+    _llk_unpack_A_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+        unpack_src_format[id],
+        unpack_dst_format[id],
+        rows_per_face,
+        within_face_16x16_transpose,
+        face_count);
+}
+
+template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void llk_unpack_A_hw_configure_disaggregated(
+    const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) {
+    const llk_unpack_A_params_t params = {.unpA_operand = unpA_operand};  // wrap the operand for the struct-based overload
+    llk_unpack_A_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&params, within_face_16x16_transpose);
+}
+
+template <
+    BroadcastType BType = BroadcastType::NONE,
+    bool acc_to_dest = false,
+    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+    bool unpack_to_dest = false>
+inline void llk_unpack_A_mop_config(
+    const bool transpose_of_faces,
+    const std::uint32_t operand_id,
+    const std::uint32_t unpack_src_format = 0,
+    std::uint32_t unpack_dst_format = 0) {
+    // Only the face count is looked up here; the formats are passed through as given.
+    const std::uint32_t face_count = get_operand_num_faces(operand_id);
+    _llk_unpack_A_mop_config_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
+        transpose_of_faces, face_count, unpack_src_format, unpack_dst_format);
+}
+
+template <
+    BroadcastType BType = BroadcastType::NONE,
+    bool acc_to_dest = false,
+    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+    bool unpack_to_dest = false>
+inline void llk_unpack_A_init(
+    const std::uint32_t transpose_of_faces = 0,
+    const std::uint32_t within_face_16x16_transpose = 0,
+    const std::uint32_t operand = 0) {
+    // Write the transpose flag into the THCON_SEC0_REG2_Haloize_mode field first, then look up the geometry.
+    cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(within_face_16x16_transpose);
+
+    const std::uint32_t id = get_operand_id(operand);
+    const std::uint32_t rows_per_face = get_operand_face_r_dim(id);
+    const std::uint32_t face_count = get_operand_num_faces(id);
+    _llk_unpack_A_init_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
+        transpose_of_faces,
+        within_face_16x16_transpose,
+        rows_per_face,
+        face_count,
+        unpack_src_format[id],
+        unpack_dst_format[id]);
+}
+
+template <
+    BroadcastType BType = BroadcastType::NONE,
+    bool acc_to_dest = false,
+    EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+    bool unpack_to_dest = false>
+inline void llk_unpack_A(
+    const std::uint32_t operand, const std::uint32_t tile_index, const bool transpose_of_faces = 0) {
+    const std::uint32_t id = get_operand_id(operand);
+    // Tile address = (read pointer - 1) + page size * tile index.
+    std::uint32_t tile_address = cb_interface[id].fifo_rd_ptr - 1;
+    tile_address += cb_interface[id].fifo_page_size * tile_index;
+
+    _llk_unpack_A_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
+        tile_address, transpose_of_faces, unpack_src_format[id], unpack_dst_format[id]);
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h
new file mode 100644
index 00000000000..6b61452722a
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h
@@ -0,0 +1,141 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "circular_buffer.h"
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "ckernel_globals.h"
+#include "ckernel_template.h"
+#include "cunpack_common.h"
+#include "llk_defs.h"
+#include "llk_io.h"
+#include "llk_operands.h"
+#include "llk_param_structs.h"
+#include "llk_unpack_common.h"
+
+/*************************************************************************
+ * LLK UNPACK COMMON
+ *************************************************************************/
+
+inline void llk_zero_operand(std::uint32_t operand) {  // header-defined, so `inline` avoids duplicate-symbol link errors
+    const std::uint32_t operand_id = get_operand_id(operand);
+    const std::uint32_t size = cb_interface[operand_id].fifo_size;
+    const std::uint32_t fifo_base_addr = (cb_interface[operand_id].fifo_limit + 1) - size;  // limit + 1 - size
+    _llk_zero_buffer_(fifo_base_addr, size);  // hand the operand's whole (base, size) range to the LLK routine
+}
+
+template <bool mail2math = true, bool mail2pack = true>
+inline void llk_unpack_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) {
+    const std::uint32_t id = get_operand_id(operand);
+    // Tile address = (read pointer - 1) + page size * tile index.
+    std::uint32_t tile_address = cb_interface[id].fifo_rd_ptr - 1;
+    tile_address += cb_interface[id].fifo_page_size * tile_index;
+    _llk_unpack_get_tile_<mail2math, mail2pack>(tile_address, p_tile);
+}
+
+template <bool mail2math = true, bool mail2pack = true>
+inline void llk_unpack_release_tile(std::uint32_t operand) {  // `operand` is accepted but not used
+    _llk_unpack_release_tile_<mail2math, mail2pack>();
+}
+
+inline void llk_unpack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {  // thin forwarder
+    _llk_unpack_debug_dump_(data, byte_size);
+}
+
+inline void llk_unpack_debug_dump_seek(std::uint8_t offset) { _llk_unpack_debug_dump_seek_(offset); }  // thin forwarder
+
+template <bool is_tile_dim_reconfig_en = false>
+inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_operand) {
+    const std::uint32_t id = get_operand_id(srca_new_operand);
+    const std::uint32_t face_count = get_operand_num_faces(id);
+    const std::uint32_t rows_per_face = get_operand_face_r_dim(id);
+    _llk_unpack_reconfig_data_format_srca_impl_<is_tile_dim_reconfig_en>(
+        unpack_src_format[id],  // formats of the new operand
+        unpack_dst_format[id],
+        cb_interface[id].fifo_page_size,  // its tile (page) size
+        rows_per_face,
+        face_count);
+}
+
+template <bool is_tile_dim_reconfig_en = false>
+inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) {
+    const std::uint32_t id = get_operand_id(srcb_new_operand);
+    const std::uint32_t face_count = get_operand_num_faces(id);
+    const std::uint32_t rows_per_face = get_operand_face_r_dim(id);
+    _llk_unpack_reconfig_data_format_srcb_impl_<is_tile_dim_reconfig_en>(
+        unpack_src_format[id],  // formats of the new operand
+        unpack_dst_format[id],
+        cb_interface[id].fifo_page_size,  // its tile (page) size
+        rows_per_face,
+        face_count);
+}
+
+template <bool is_tile_dim_reconfig_en = false>
+inline void llk_unpack_reconfig_data_format_srca(
+    const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) {
+    // Reconfigure when the source format differs between the two operands, or
+    // unconditionally when tile-dimension reconfiguration is enabled.
+    const std::uint32_t old_id = get_operand_id(srca_old_operand);
+    const std::uint32_t new_id = get_operand_id(srca_new_operand);
+    const bool format_changed = unpack_src_format[old_id] != unpack_src_format[new_id];
+    if (is_tile_dim_reconfig_en || format_changed) {
+        llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
+    }
+}
+
+template <bool is_tile_dim_reconfig_en = false>
+inline void llk_unpack_reconfig_data_format_srcb(
+    const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
+    // Reconfigure when the source format differs between the two operands, or
+    // unconditionally when tile-dimension reconfiguration is enabled.
+    const std::uint32_t old_id = get_operand_id(srcb_old_operand);
+    const std::uint32_t new_id = get_operand_id(srcb_new_operand);
+    const bool format_changed = unpack_src_format[old_id] != unpack_src_format[new_id];
+    if (is_tile_dim_reconfig_en || format_changed) {
+        llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
+    }
+}
+
+template <bool is_tile_dim_reconfig_en = false>
+inline void llk_unpack_reconfig_data_format(
+    const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) {  // always reconfigures both sources
+    llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
+    llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
+}
+
+template <bool is_tile_dim_reconfig_en = false>
+inline void llk_unpack_reconfig_data_format(
+    const std::uint32_t srca_old_operand,
+    const std::uint32_t srca_new_operand,
+    const std::uint32_t srcb_old_operand,
+    const std::uint32_t srcb_new_operand) {  // delegates to the old/new overloads, which decide whether to reconfigure
+    llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_old_operand, srca_new_operand);
+    llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_old_operand, srcb_new_operand);
+}
+
+inline void llk_unpack_dbg_feature_disable() { _llk_unpack_dbg_feature_disable_(); }  // thin forwarder
+
+inline void llk_enable_int8_fpu_math() { _llk_enable_int8_fpu_math_(); }  // thin forwarder
+
+// All TILE_SIZE related functions were deprecated in BBE for WH. The following is still needed for pack_shifted,
+// so it is kept here. It multiplies `index` by a per-format constant using shifts and adds.
+// FIXME: Need to review and adjust accordingly
+constexpr static std::int32_t MUL_HEADERLESS_TILE_SIZE_AND_INDEX(uint format, uint index) {
+    switch (format & 0x1F) {  // only the low 5 bits of `format` select the case
+        case ((uint8_t)DataFormat::Float32): return ((index << 8));  // index * 256
+        case ((uint8_t)DataFormat::Float16):
+        case ((uint8_t)DataFormat::Float16_b): return ((index << 7));  // index * 128
+        case ((uint8_t)DataFormat::Bfp8):
+        case ((uint8_t)DataFormat::Bfp8_b): return ((index << 6) + (index << 2));  // index * 68
+        case ((uint8_t)DataFormat::Bfp4):
+        case ((uint8_t)DataFormat::Bfp4_b): return ((index << 5) + (index << 2));  // index * 36
+        case ((uint8_t)DataFormat::Bfp2):
+        case ((uint8_t)DataFormat::Bfp2_b): return ((index << 4) + (index << 2));  // index * 20
+        case ((uint8_t)DataFormat::Int8):
+        case ((uint8_t)DataFormat::Lf8): return ((index << 6));  // index * 64
+        // Keep default as Bfp8?
+        default: return ((index << 6) + (index << 2));  // same value as the Bfp8 cases
+    };
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h
new file mode 100644
index 00000000000..afa60f7947b
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h
@@ -0,0 +1,94 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_reduce.h"
+#include "llk_unpack_common_api.h"
+
+/*************************************************************************
+* LLK UNPACK REDUCE
+*************************************************************************/
+
+template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void llk_unpack_reduce_hw_configure(
+    const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) {
+    // Configure the unpacker for a reduce on unpA_operand; const_mult is staged in l1_buffer unless type is MAX.
+    constexpr bool within_face_16x16_transpose  = (ReduceDim::REDUCE_ROW == dim);
+    // Operand properties are looked up once and reused for the arguments below.
+    const std::uint32_t unpA_operand_id = get_operand_id(unpack_reduce_params->unpA_operand);
+    const std::uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
+    const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
+    // Second-operand formats: source is fixed to Float32; destination is Float16 or Float16_b, chosen from A's dst format.
+    constexpr std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32;
+    const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a
+                               ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? (std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16);
+    // face_r_dim and num_faces of operand A are each passed twice (presumably for unpacker A and B -- confirm).
+    _llk_unpack_reduce_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+        unpack_src_format[unpA_operand_id],
+        unpB_src_format,
+        unpack_dst_format[unpA_operand_id],
+        unpB_dst_format,
+        unpA_face_r_dim,
+        unpA_face_r_dim,
+        within_face_16x16_transpose,
+        unpA_num_faces,
+        unpA_num_faces
+    );
+    // For every pool type except MAX, copy the raw bit pattern of const_mult into 16 entries of l1_buffer.
+    if constexpr (type != PoolType::MAX) {
+        union {
+            float f;
+            uint32_t u;
+        } f2u = {.f = const_mult};
+
+        for (uint i = 0; i < 16; i++) l1_buffer[i] = f2u.u;  // Load const into L1 buffer
+    }
+}
+
+template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en=false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) {  // convenience overload taking the operand directly
+    const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand};  // wrap the operand in the params struct
+    llk_unpack_reduce_hw_configure<type, dim, is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_reduce_params, mult);  // mult becomes const_mult
+}
+
+template <PoolType type, ReduceDim dim>
+inline void llk_unpack_reduce_mop_config() {  // API-layer pass-through; takes no operand-specific arguments
+    _llk_unpack_reduce_mop_config_<type, dim>();
+}
+
+template <PoolType type, ReduceDim dim>
+inline void llk_unpack_reduce_init(const std::uint32_t within_face_16x16_transpose=0) {
+    // NOTE(review): the format lookup below always uses operand id 0 -- confirm callers only reduce from that operand.
+    constexpr std::uint32_t unpA_operand_id = 0;
+    // Same unpB format selection as llk_unpack_reduce_hw_configure (Float32 source, Float16/Float16_b destination).
+    const std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32;
+    const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a
+                               ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? (std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16);
+    // Set the ALU SrcB format field to the unpB destination format.
+    cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG1_SrcB_RMW>(unpB_dst_format);
+    // Set the low 4 bits (mask 0xf) of the SEC1 tile descriptor and the SEC1 output data format.
+    cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 0, 0xf>(unpB_src_format);
+    cfg_reg_rmw_tensix<THCON_SEC1_REG2_Out_data_format_RMW>(unpB_dst_format);
+    // Program both SEC1 base-address registers (base and cntx1) from the L1_BUFFER_ADDR GPR.
+    TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_address_ADDR32);
+    TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_cntx1_address_ADDR32);
+    TTI_NOP; TTI_NOP;
+    // Hand off to the low-level init with the caller's transpose flag.
+    _llk_unpack_reduce_init_<type, dim>(
+        within_face_16x16_transpose
+    );
+}
+
+template <PoolType type, ReduceDim dim>
+inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t tile_index) {
+    // Locate the requested tile inside the operand's circular buffer.
+    const std::uint32_t cb_id = get_operand_id(operand);
+    const auto &cb = cb_interface[cb_id];
+
+    // Tile address = (read pointer - 1) + tile_index pages.
+    const std::uint32_t first_tile = cb.fifo_rd_ptr - 1;
+    const std::uint32_t tile_address = first_tile + cb.fifo_page_size * tile_index;
+
+    _llk_unpack_reduce_<type, dim>(tile_address);
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h
new file mode 100644
index 00000000000..0f0a1b69ab3
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h
@@ -0,0 +1,93 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_tilize.h"
+#include "llk_unpack_common_api.h"
+
+/*************************************************************************
+* LLK UNPACK TILIZE
+*************************************************************************/
+
+template <bool is_fp32_dest_acc_en = false>
+inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) {
+    // Tilize always configures with the transpose flag off and StochRndMode::None.
+    constexpr bool  within_face_16x16_transpose = false;
+    constexpr StochRndMode stoch_rnd_mode = StochRndMode::None;
+    // Face count and face row dimension come from the operand getters.
+    const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand);
+    const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
+    const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
+    // Source/destination formats are read from the per-operand unpack format tables.
+    _llk_unpack_tilize_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+        unpack_src_format[unpA_operand_id],
+        unpack_dst_format[unpA_operand_id],
+        unpA_face_r_dim,
+        within_face_16x16_transpose,
+        unpA_num_faces
+    );
+}
+
+
+template <bool is_fp32_dest_acc_en = false>
+inline void llk_unpack_tilize_hw_configure_disaggregated(  // convenience overload taking the operand directly
+    const std::uint32_t unpA_operand) {
+    const llk_unpack_A_params_t unpack_tilize_params = {  // wrap the operand in the params struct
+        .unpA_operand = unpA_operand
+    };
+    llk_unpack_tilize_hw_configure<is_fp32_dest_acc_en>(&unpack_tilize_params);  // same template flag is forwarded
+}
+
+inline void llk_unpack_tilize_mop_config(const std::uint32_t operand) {
+    // Forward the operand's narrow-tile flag to the low-level tilize MOP configuration.
+    const std::uint32_t cb_id = get_operand_id(operand);
+    _llk_unpack_tilize_mop_config_(static_cast<bool>(get_operand_narrow_tile(cb_id)));
+}
+
+inline void llk_unpack_tilize_init(const std::uint32_t operand = 0, const std::uint32_t ct_dim = 0) {
+    cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(0);  // set the Haloize_mode field to 0
+    // Per-operand parameters forwarded to the low-level init.
+    const std::uint32_t operand_id = get_operand_id(operand);
+    const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+    const bool narrow_tile = get_operand_narrow_tile(operand_id);
+    // The two GPRs written here are the ones llk_unpack_tilize_uninit() restores from.
+    // Save state of unpacker config for quick restore
+    TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0, THCON_SEC0_REG2_Out_data_format_ADDR32); // Save unpack config[0]
+    TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context
+    // Low-level tilize init: formats from the unpack tables, plus ct_dim, face_r_dim and narrow_tile.
+    _llk_unpack_tilize_init_(
+        unpack_src_format[operand_id],
+        unpack_dst_format[operand_id],
+        ct_dim,
+        face_r_dim,
+        narrow_tile
+    );
+    // The state saved above is restored by llk_unpack_tilize_uninit().
+}
+
+inline void llk_unpack_tilize_uninit(const std::uint32_t face_r_dim = FACE_R_DIM) {  // restores what llk_unpack_tilize_init() saved
+    TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0);  // UNP_A argument is (face rows * face cols) - 1
+    TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0); // Restore unpack config[0]
+    TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32,  p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1); // Restore tile x dim per context
+}
+
+inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t block_ct_dim) {
+    // Issue the low-level tilize unpack for `operand`; the face/tile parameters are looked up from its id.
+    std::uint32_t operand_id = get_operand_id(operand);
+    const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+    const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+    const bool narrow_tile = get_operand_narrow_tile(operand_id);
+    // tile_index is passed down as-is; it is not folded into the address in this wrapper.
+    std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;  // Remove header size added by descriptor
+    // Only the source format is forwarded; the destination format is not an argument of this call.
+    _llk_unpack_tilize_(
+        base_address,
+        tile_index,
+        unpack_src_format[operand_id],
+        block_ct_dim,
+        face_r_dim,
+        num_faces,
+        narrow_tile
+    );
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h
new file mode 100644
index 00000000000..5a135ad8903
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h
@@ -0,0 +1,96 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_untilize.h"
+#include "llk_unpack_common_api.h"
+
+/*************************************************************************
+* LLK UNPACK UNTILIZE
+*************************************************************************/
+template <bool is_fp32_dest_acc_en = false>
+inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) {
+    constexpr bool is_row_pool = false;  // NOTE(review): not referenced anywhere else in this function
+    constexpr bool within_face_16x16_transpose = false;
+    constexpr StochRndMode stoch_rnd_mode = StochRndMode::None;
+    // Face count and face row dimension are fixed constants here, not queried from the operand.
+    const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand);
+    const uint32_t unpA_num_faces = 4;
+    const uint32_t unpA_face_r_dim = FACE_R_DIM;
+    // Source/destination formats are read from the per-operand unpack format tables.
+    _llk_unpack_untilize_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+        unpack_src_format[unpA_operand_id],
+        unpack_dst_format[unpA_operand_id],
+        unpA_face_r_dim,
+        within_face_16x16_transpose,
+        unpA_num_faces
+    );
+}
+
+inline void llk_unpack_untilize_hw_configure_disaggregated(const std::uint32_t unpA_operand) {  // convenience overload taking the operand directly
+    const llk_unpack_A_params_t unpack_untilize_params = {  // wrap the operand in the params struct
+        .unpA_operand = unpA_operand,
+    };
+    llk_unpack_untilize_hw_configure(&unpack_untilize_params);  // no template argument given, so is_fp32_dest_acc_en takes its default (false)
+}
+
+inline void llk_unpack_untilize_mop_config() {  // API-layer pass-through; takes no arguments
+    _llk_unpack_untilize_mop_config_();
+}
+
+inline void llk_unpack_untilize_init(std::uint32_t operand = 0) {
+    const std::uint32_t operand_id = get_operand_id(operand);
+    const std::uint32_t face_r_dim = 1;  // fixed to 1 here rather than queried from the operand
+    const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+    // Three config words are copied into the SR_UNPACK_UNTILIZER_STATE_0..2 GPRs before the init changes them.
+    // Save state of unpacker config for quick restore
+    TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_0, UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32); // Save unpack stride config
+    TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context
+    TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_2, THCON_SEC0_REG0_TileDescriptor_ADDR32+1); // Save descriptor 1
+    // Low-level init takes the destination format and the circular buffer's page size.
+    _llk_unpack_untilize_init_(
+        unpack_dst_format[operand_id],
+        cb_interface[operand_id].fifo_page_size,
+        face_r_dim,
+        num_faces
+    );
+}
+
+inline void llk_unpack_untilize_uninit(const std::uint32_t operand, const std::uint32_t face_r_dim = FACE_R_DIM) {
+    std::uint32_t operand_id = get_operand_id(operand);
+    std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1;
+    std::uint32_t unpA_ch1_y_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride;
+    // x stride is 4, 2 or 1 depending on the low two bits of the dst format; y stride is one full face of that.
+    // Check that unpacker is done (all contexts freed up) before starting hw configuration
+    wait_for_idle();
+    // NOTE(review): the face_r_dim parameter is unused; FACE_R_DIM is used directly below.
+    // Reset address counters
+    unpacker_addr_counter_init();
+    // NOTE(review): the SR_UNPACK_UNTILIZER_STATE_* GPRs saved by llk_unpack_untilize_init() are not read here.
+    // Wait for cfg to be free to edit
+    TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK);
+    // NOTE(review): the Ystride write below pairs the REG_1 address/mask with the REG_0 SHAMT -- confirm they match.
+    // Reset the values to default in unpack AB common.
+    TT_SETADCXX(p_setadc::UNP_A, FACE_R_DIM*FACE_C_DIM-1, 0x0);
+    TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16);
+    cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32+1, 0, 0xFFFF>(1);
+    cfg_reg_rmw_tensix<UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32, UNP0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT, UNP0_ADDR_CTRL_XY_REG_1_Ystride_MASK>(unpA_ch1_y_stride);
+    TTI_NOP; TTI_NOP; // Do we need this for WH?
+}
+
+template <bool first_pass = true>
+inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_tile_cols) {
+    // Single untilize pass; first_pass is forwarded as the low-level template argument.
+    const std::uint32_t cb_id = get_operand_id(operand);
+
+    // The address handed down is the circular buffer's read pointer minus one.
+    const std::uint32_t start_address = cb_interface[cb_id].fifo_rd_ptr - 1;
+
+    _llk_unpack_untilize_pass_<first_pass>(start_address, block_tile_cols);
+}
+
+inline void llk_unpack_untilize(std::uint32_t operand, std::uint32_t block_c_tiles) {  // full untilize = two passes over the same operand
+    llk_unpack_untilize_pass<true>(operand, block_c_tiles);   // first pass
+    llk_unpack_untilize_pass<false>(operand, block_c_tiles);  // second pass, same arguments
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.cc b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.cc
new file mode 100644
index 00000000000..b3f31c2c095
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.cc
@@ -0,0 +1,3 @@
+#include "llk_io.h"
+
+CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] = {0};  // the one definition; llk_io.h carries the matching extern declaration
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h
new file mode 100644
index 00000000000..7d3e365a730
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h
@@ -0,0 +1,6 @@
+#pragma once
+#include <cstdint>
+
+#include "circular_buffer.h"
+
+extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS];  // defined in llk_io.cc
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_pack.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_pack.h
similarity index 98%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_pack.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_pack.h
index 7341143fbdb..29dc128e053 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_pack.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_pack.h
@@ -14,8 +14,8 @@
 
 using namespace ckernel;
 
-inline void llk_setup_cb_interface() {
-
+// "llk_setup_outputs" is the old function name that HLKC emits
+inline void llk_setup_outputs() {
     volatile tt_l1_ptr std::uint32_t* circular_buffer_config_addr = (volatile uint32_t*)(CIRCULAR_BUFFER_CONFIG_BASE);
 
     for (std::uint32_t cb_id = 0; cb_id < NUM_CIRCULAR_BUFFERS; cb_id++) {
@@ -40,11 +40,6 @@ inline void llk_setup_cb_interface() {
     }
 }
 
-// "llk_setup_outputs" is the old function name that HLKC emits
-inline void llk_setup_outputs() {
-    llk_setup_cb_interface();
-}
-
 // Blocking call to wait for free space needed to pack N tiles
 template <bool skip_sync = false, bool wait_for_blocks = false, bool brisc_pack = false>
 inline void llk_wait_for_free_tiles(const std::int32_t operand, const std::int32_t num_tiles) {
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_unpack.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_unpack.h
similarity index 97%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_unpack.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_unpack.h
index 0cafd67dfa2..e9a882ce5da 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_unpack.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_unpack.h
@@ -7,14 +7,15 @@
 #include "ckernel_globals.h"
 #include "ckernel.h"
 #include "stream_interface.h"
+#include "stream_io_map.h"
 #include "hostdevcommon/common_runtime_address_map.h"
-#include "llk_unpack_common.h"
+#include "llk_unpack_common_api.h"
 
 
 using namespace ckernel;
 
-inline void llk_setup_cb_interface() {
-
+// "llk_setup_operands" is the old function name that HLKC emits
+inline void llk_setup_operands() {
     volatile tt_l1_ptr std::uint32_t* circular_buffer_config_addr = (volatile uint32_t*)(CIRCULAR_BUFFER_CONFIG_BASE);
 
     for (uint32_t cb_id = 0; cb_id < NUM_CIRCULAR_BUFFERS; cb_id++) {
@@ -35,11 +36,6 @@ inline void llk_setup_cb_interface() {
     }
 }
 
-// "llk_setup_operands" is the old function name that HLKC emits
-inline void llk_setup_operands() {
-    llk_setup_cb_interface();
-}
-
 // Wait for N tiles available in the incoming stream
 inline void llk_wait_tiles(int operand, std::int32_t num_tiles) {
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
new file mode 100644
index 00000000000..c6d1b438f42
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
@@ -0,0 +1,46 @@
+
+#pragma once
+#include <cstdint>
+#include <vector>
+
+inline uint32_t get_operand_id(uint32_t operand)
+{
+    const int INTERMEDIATE_BASE_ID = 24;
+    const int OPERAND_BASE_ID = 0;
+    return (operand>=INTERMEDIATE_BASE_ID) ? operand - 8 : operand - OPERAND_BASE_ID;
+}
+
+inline const uint32_t get_operand_src_format(const std::uint32_t operand_id)
+{
+   return unpack_src_format[operand_id];
+}
+
+inline const uint32_t get_operand_dst_format(const std::uint32_t operand_id)
+{
+   return unpack_dst_format[operand_id];  // destination-format table; get_operand_src_format reads unpack_src_format
+}
+
+inline const uint32_t get_operand_num_faces(const std::uint32_t operand_id)
+{
+   return 4;
+}
+
+inline const uint32_t get_operand_partial_face(const std::uint32_t operand_id)
+{
+   return 0;
+}
+
+inline const uint32_t get_operand_face_r_dim(const std::uint32_t operand_id)
+{
+   return 16;
+}
+
+inline const uint32_t get_operand_narrow_tile(const std::uint32_t operand_id)
+{
+   return 0;
+}
+
+inline const std::vector<uint32_t> get_operand_tile_dims(const std::uint32_t operand_id)
+{
+   return {32, 32};
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
new file mode 100644
index 00000000000..596255257d0
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <cstdint>
+#include <vector>
+
+// Metal specific overrides -- No support for partial tiles so hard-code to fixed 32x32 sizes
+inline uint32_t get_output_id(uint32_t output)
+{
+   const uint32_t OUTPUT_BASE    = 0;
+   return ((output) - OUTPUT_BASE);
+}
+
+inline const uint32_t get_output_base_id()
+{
+   const uint32_t OUTPUT_BASE_ID = 16;
+   return (OUTPUT_BASE_ID);
+}
+
+inline const uint32_t get_output_src_format(const std::uint32_t output_id)
+{
+   return pack_src_format[output_id];
+}
+
+inline const uint32_t get_output_dst_format(const std::uint32_t output_id)
+{
+   return pack_dst_format[output_id];  // destination-format table; get_output_src_format reads pack_src_format
+}
+
+inline const uint32_t get_output_num_faces(const std::uint32_t output_id)
+{
+   return 4;
+}
+
+inline const uint32_t get_output_partial_face(const std::uint32_t output_id)
+{
+   return 0;
+}
+
+inline const uint32_t get_output_face_r_dim(const std::uint32_t output_id)
+{
+   return 16;
+}
+
+inline const uint32_t get_output_narrow_tile(const std::uint32_t output_id)
+{
+   return 0;
+}
+
+inline const std::vector<uint32_t> get_output_tile_dims(const std::uint32_t operand_id)
+{
+   return {32, 32};
+}
diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc
index 6702c298990..465c7b74d44 100644
--- a/tt_metal/hw/firmware/src/brisc.cc
+++ b/tt_metal/hw/firmware/src/brisc.cc
@@ -17,7 +17,7 @@
 #include "c_tensix_core.h"
 #include "tdma_xmov.h"
 #include "noc_nonblocking_api.h"
-#include "ckernel_globals.h"
+#include "metal_ckernel_globals.h"
 #include "tools/profiler/kernel_profiler.hpp"
 #include "dev_msgs.h"
 #include "risc_attribs.h"
diff --git a/tt_metal/hw/firmware/src/brisck.cc b/tt_metal/hw/firmware/src/brisck.cc
index f00d6233b2b..06567a58a7d 100644
--- a/tt_metal/hw/firmware/src/brisck.cc
+++ b/tt_metal/hw/firmware/src/brisck.cc
@@ -15,7 +15,7 @@
 #include "c_tensix_core.h"
 #include "tdma_xmov.h"
 #include "noc_nonblocking_api.h"
-#include "ckernel_globals.h"
+#include "metal_ckernel_globals.h"
 #include "tools/profiler/kernel_profiler.hpp"
 #include "dataflow_api.h"
 #include "noc_addr_ranges_gen.h"
diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc
index ace48e9b4c6..6a96aa0fbb0 100644
--- a/tt_metal/hw/firmware/src/ncrisc.cc
+++ b/tt_metal/hw/firmware/src/ncrisc.cc
@@ -7,7 +7,7 @@
 #include "noc_nonblocking_api.h"
 #include "dev_msgs.h"
 #include "stream_io_map.h"
-#include "ckernel_globals.h"
+#include "metal_ckernel_globals.h"
 #include "tools/profiler/kernel_profiler.hpp"
 #include "risc_attribs.h"
 #include "generated_bank_to_noc_coord_mapping.h"
diff --git a/tt_metal/hw/firmware/src/ncrisck.cc b/tt_metal/hw/firmware/src/ncrisck.cc
index 3aff6217abd..7a6d037733c 100644
--- a/tt_metal/hw/firmware/src/ncrisck.cc
+++ b/tt_metal/hw/firmware/src/ncrisck.cc
@@ -9,7 +9,7 @@
 #ifdef PERF_DUMP
 #include "risc_perf.h"
 #endif
-#include "ckernel_globals.h"
+#include "metal_ckernel_globals.h"
 #include "tools/profiler/kernel_profiler.hpp"
 #include "dataflow_api.h"
 #include "tensix_functions.h"
diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc
index 0267c005839..f1e0aad4b6f 100644
--- a/tt_metal/hw/firmware/src/trisc.cc
+++ b/tt_metal/hw/firmware/src/trisc.cc
@@ -3,7 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "ckernel.h"
-#include "ckernel_globals.h"
+#include "metal_ckernel_globals.h"
 #include "risc_common.h"
 #include <tensix.h>
 #include "dev_msgs.h"
@@ -12,6 +12,7 @@
 
 #include "debug/fw_debug.h"
 #include "debug/status.h"
+#include "circular_buffer.h"
 
 namespace kernel_profiler {
 uint32_t wIndex __attribute__((used));
@@ -22,10 +23,10 @@ namespace ckernel
 
 enum class ttRiscCores : std::uint32_t { Unpack = 0, Math = 1, Pack = 2, Brisc = 3, Nrisc = 4};
 
-volatile tt_reg_ptr uint * const reg_base = reinterpret_cast<volatile uint *>(0xFFB10000);
-volatile tt_reg_ptr uint * const pc_buf_base = reinterpret_cast<volatile uint *>(PC_BUF_BASE);
-volatile tt_reg_ptr uint * const regfile = reinterpret_cast<volatile uint *>(REGFILE_BASE);
-volatile tt_reg_ptr uint * const instrn_buffer = reinterpret_cast<volatile uint *>(INSTRN_BUF_BASE);
+volatile tt_reg_ptr uint * reg_base = reinterpret_cast<volatile uint *>(0xFFB10000);
+volatile tt_reg_ptr uint * pc_buf_base = reinterpret_cast<volatile uint *>(PC_BUF_BASE);
+volatile tt_reg_ptr uint * regfile = reinterpret_cast<volatile uint *>(REGFILE_BASE);
+volatile tt_reg_ptr uint * instrn_buffer = reinterpret_cast<volatile uint *>(INSTRN_BUF_BASE);
 tt_reg_ptr uint *regmem = reinterpret_cast<tt_reg_ptr uint *>(REGFILE_BASE);
 
 uint32_t cfg_state_id __attribute__((used)) = 0;  // Flip between 0 and 1 to keep state between kernel calls
diff --git a/tt_metal/hw/firmware/src/trisck.cc b/tt_metal/hw/firmware/src/trisck.cc
index 1369a6d1bca..174fe265300 100644
--- a/tt_metal/hw/firmware/src/trisck.cc
+++ b/tt_metal/hw/firmware/src/trisck.cc
@@ -9,7 +9,7 @@
 //      Need to make sure no other file includes these lists since it also include global parameter definitions
 // 2) instantiate global variables
 
-#include "ckernel_globals.h"
+#include "metal_ckernel_globals.h"
 
 #include "chlkc_list.h"
 
@@ -23,9 +23,9 @@ uint32_t gl_alu_format_spec_reg = 0;
 
 namespace ckernel
 {
-volatile tt_reg_ptr uint * const regfile = reinterpret_cast<volatile uint *>(REGFILE_BASE);
-volatile tt_reg_ptr uint * const instrn_buffer = reinterpret_cast<volatile uint *>(INSTRN_BUF_BASE);
-volatile tt_reg_ptr uint * const pc_buf_base = reinterpret_cast<volatile uint *>(PC_BUF_BASE);
+volatile tt_reg_ptr uint * regfile = reinterpret_cast<volatile uint *>(REGFILE_BASE);
+volatile tt_reg_ptr uint * instrn_buffer = reinterpret_cast<volatile uint *>(INSTRN_BUF_BASE);
+volatile tt_reg_ptr uint * pc_buf_base = reinterpret_cast<volatile uint *>(PC_BUF_BASE);
 }
 
 void kernel_launch()
diff --git a/tt_metal/hw/inc/debug/fw_debug.h b/tt_metal/hw/inc/debug/fw_debug.h
index 577743d5cc8..4232dcd6ddb 100644
--- a/tt_metal/hw/inc/debug/fw_debug.h
+++ b/tt_metal/hw/inc/debug/fw_debug.h
@@ -3,3 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #define FWASSERT(s, p)
+#define FWLOG0(...) (void)sizeof(__VA_ARGS__)
+#define FWLOG1(...) (void)sizeof(__VA_ARGS__)
+#define FWLOG2(...) (void)sizeof(__VA_ARGS__)
diff --git a/tt_metal/include/compute_kernel_api.h b/tt_metal/include/compute_kernel_api.h
index 1a79892db6e..a1abc4c2d4f 100644
--- a/tt_metal/include/compute_kernel_api.h
+++ b/tt_metal/include/compute_kernel_api.h
@@ -10,18 +10,19 @@
 #include "ckernel_include.h"
 #include "hostdevcommon/kernel_structs.h"
 #include "risc_attribs.h"
+#include "llk_op_info_api.h"
 
 #define SYNC SyncHalf
 
 #define ALWI inline __attribute__((always_inline))
 
 #ifdef TRISC_MATH
-#include "llk_math_common.h"
-#include "llk_math_matmul.h"
-#include "llk_math_eltwise_unary_datacopy.h"
-#include "llk_math_eltwise_binary.h"
-#include "llk_math_eltwise_unary_sfpu.h"
-#include "llk_math_reduce.h"
+#include "llk_math_common_api.h"
+#include "llk_math_matmul_api.h"
+#include "llk_math_unary_datacopy_api.h"
+#include "llk_math_binary_api.h"
+#include "llk_math_unary_sfpu_api.h"
+#include "llk_math_reduce_api.h"
 #define MATH(x) x
 #define MAIN math_main()
 #else
@@ -38,13 +39,13 @@
 #endif
 
 #ifdef TRISC_UNPACK
-#include "llk_unpack_common.h"
-#include "llk_unpack_AB_matmul.h"
-#include "llk_unpack_A.h"
-#include "llk_unpack_AB.h"
-#include "llk_unpack_reduce.h"
-#include "llk_unpack_tilize.h"
-#include "llk_unpack_untilize.h"
+#include "llk_unpack_common_api.h"
+#include "llk_unpack_AB_matmul_api.h"
+#include "llk_unpack_A_api.h"
+#include "llk_unpack_AB_api.h"
+#include "llk_unpack_reduce_api.h"
+#include "llk_unpack_tilize_api.h"
+#include "llk_unpack_untilize_api.h"
 #define UNPACK(x) x
 #define MAIN unpack_main()
 #else
diff --git a/tt_metal/include/compute_kernel_api/bcast.h b/tt_metal/include/compute_kernel_api/bcast.h
index 110cbd2db1d..3e5c8dc8c54 100644
--- a/tt_metal/include/compute_kernel_api/bcast.h
+++ b/tt_metal/include/compute_kernel_api/bcast.h
@@ -7,13 +7,13 @@
 
 #include "compute_kernel_api/common.h"
 #ifdef TRISC_MATH
-#include "llk_math_eltwise_binary.h"
-#include "llk_math_matmul.h"
+#include "llk_math_binary_api.h"
+#include "llk_math_matmul_api.h"
 #include "llk_math_common.h"
 #endif
 #ifdef TRISC_UNPACK
-#include "llk_unpack_AB.h"
-#include "llk_unpack_A.h"
+#include "llk_unpack_AB_api.h"
+#include "llk_unpack_A_api.h"
 #endif
 #ifdef TRISC_PACK
 #include "llk_pack.h"
diff --git a/tt_metal/include/compute_kernel_api/cb_api.h b/tt_metal/include/compute_kernel_api/cb_api.h
index a9ca5d2d97c..dbec2593108 100644
--- a/tt_metal/include/compute_kernel_api/cb_api.h
+++ b/tt_metal/include/compute_kernel_api/cb_api.h
@@ -6,6 +6,14 @@
 
 
 #include "compute_kernel_api/common_globals.h"
+#ifdef TRISC_PACK
+#include "llk_io_pack.h"
+#endif
+#ifdef TRISC_UNPACK
+#include "llk_io_unpack.h"
+#endif
+
+
 namespace ckernel {
 
 /**
diff --git a/tt_metal/include/compute_kernel_api/common_globals.h b/tt_metal/include/compute_kernel_api/common_globals.h
index 8393566a239..213859b1ae4 100644
--- a/tt_metal/include/compute_kernel_api/common_globals.h
+++ b/tt_metal/include/compute_kernel_api/common_globals.h
@@ -10,12 +10,12 @@
 
 #include "chlkc_list.h"
 #include "ckernel.h"
-#include "ckernel_globals.h"
+#include "metal_ckernel_globals.h"
 #include "ckernel_include.h"
 #include "hostdevcommon/kernel_structs.h"
 
 #ifdef TRISC_MATH
-#include "llk_math_common.h"
+#include "llk_math_common_api.h"
 #define MATH(x) x
 #define MAIN math_main()
 #else
@@ -23,8 +23,7 @@
 #endif
 
 #ifdef TRISC_PACK
-#include "llk_pack_common.h"
-#include "llk_pack.h"
+#include "llk_pack_api.h"
 #define PACK(x) x
 #define MAIN pack_main()
 #else
@@ -32,7 +31,6 @@
 #endif
 
 #ifdef TRISC_UNPACK
-#include "llk_unpack_common.h"
 #define UNPACK(x) x
 #define MAIN unpack_main()
 #else
diff --git a/tt_metal/include/compute_kernel_api/eltwise_binary.h b/tt_metal/include/compute_kernel_api/eltwise_binary.h
index 3dbd756686c..dc54de90a9f 100644
--- a/tt_metal/include/compute_kernel_api/eltwise_binary.h
+++ b/tt_metal/include/compute_kernel_api/eltwise_binary.h
@@ -7,10 +7,10 @@
 
 #include "compute_kernel_api/common.h"
 #ifdef TRISC_MATH
-#include "llk_math_eltwise_binary.h"
+#include "llk_math_binary_api.h"
 #endif
 #ifdef TRISC_UNPACK
-#include "llk_unpack_AB.h"
+#include "llk_unpack_AB_api.h"
 #endif
 
 
diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h b/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h
index 418bac60302..0b926fdb253 100644
--- a/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h
+++ b/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h
@@ -7,10 +7,10 @@
 
 #include "compute_kernel_api/common.h"
 #ifdef TRISC_MATH
-#include "llk_math_eltwise_unary_datacopy.h"
+#include "llk_math_unary_datacopy_api.h"
 #endif
 #ifdef TRISC_UNPACK
-#include "llk_unpack_AB.h"
+#include "llk_unpack_AB_api.h"
 #endif
 
 
diff --git a/tt_metal/include/compute_kernel_api/matmul.h b/tt_metal/include/compute_kernel_api/matmul.h
index 9e3ebf6ac19..8a22fe02e4c 100644
--- a/tt_metal/include/compute_kernel_api/matmul.h
+++ b/tt_metal/include/compute_kernel_api/matmul.h
@@ -7,10 +7,10 @@
 
 #include "compute_kernel_api/common.h"
 #ifdef TRISC_MATH
-#include "llk_math_matmul.h"
+#include "llk_math_matmul_api.h"
 #endif
 #ifdef TRISC_UNPACK
-#include "llk_unpack_AB_matmul.h"
+#include "llk_unpack_AB_matmul_api.h"
 #endif
 
 namespace ckernel {
@@ -146,12 +146,21 @@ ALWI void mm_block_init(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t
     #endif
     MATH(( llk_math_pack_sync_init<SYNC>()  ));
 
+    #ifdef ARCH_GRAYSKULL
     PACK(( llk_pack_init<false, false, DstTileFaceLayout::ColMajor>()  ));
     PACK(( llk_pack_hw_configure_disaggregated<false>(out_cb_id) ));
     PACK(( llk_setup_outputs()  ));
     PACK(( llk_pack_dest_init<SYNC, DstTileFaceLayout::ColMajor, false>()  ));
     // TODO(AP): ZM-only kernel
     PACK(( llk_init_packer_dest_offset_registers<SyncHalf,DstTileFaceLayout::ColMajor,false>()  ));
+    #else
+    PACK(( llk_pack_init<false, false, DstTileFaceLayout::RowMajor>()  ));
+    PACK(( llk_pack_hw_configure_disaggregated<false>(out_cb_id) ));
+    PACK(( llk_setup_outputs()  ));
+    PACK(( llk_pack_dest_init<SYNC, DstTileFaceLayout::RowMajor, false>()  ));
+    // TODO(AP): ZM-only kernel
+    PACK(( llk_init_packer_dest_offset_registers<SyncHalf,DstTileFaceLayout::RowMajor,false>()  ));
+    #endif
 }
 
 /**
diff --git a/tt_metal/include/compute_kernel_api/reduce.h b/tt_metal/include/compute_kernel_api/reduce.h
index 82a74919d10..b00b317eda2 100644
--- a/tt_metal/include/compute_kernel_api/reduce.h
+++ b/tt_metal/include/compute_kernel_api/reduce.h
@@ -7,13 +7,13 @@
 
 #include "compute_kernel_api/common.h"
 #ifdef TRISC_MATH
-#include "llk_math_reduce.h"
+#include "llk_math_reduce_api.h"
 #endif
 
 
 #ifdef TRISC_UNPACK
-#include "llk_unpack_AB.h"
-#include "llk_unpack_reduce.h"
+#include "llk_unpack_AB_api.h"
+#include "llk_unpack_reduce_api.h"
 #endif
 
 
diff --git a/tt_metal/include/compute_kernel_api/tile_move_copy.h b/tt_metal/include/compute_kernel_api/tile_move_copy.h
index ac04b9d4f8a..80a056ef038 100644
--- a/tt_metal/include/compute_kernel_api/tile_move_copy.h
+++ b/tt_metal/include/compute_kernel_api/tile_move_copy.h
@@ -7,11 +7,11 @@
 #include "compute_kernel_api/common_globals.h"
 
 #ifdef TRISC_MATH
-#include "llk_math_eltwise_unary_datacopy.h"
+#include "llk_math_unary_datacopy_api.h"
 #endif
 
 #ifdef TRISC_UNPACK
-#include "llk_unpack_A.h"
+#include "llk_unpack_A_api.h"
 #endif
 namespace ckernel {
 
diff --git a/tt_metal/include/compute_kernel_api/tilize.h b/tt_metal/include/compute_kernel_api/tilize.h
index 422069f8ce0..58c1a7fc1d5 100644
--- a/tt_metal/include/compute_kernel_api/tilize.h
+++ b/tt_metal/include/compute_kernel_api/tilize.h
@@ -7,10 +7,10 @@
 
 #include "compute_kernel_api/common.h"
 #ifdef TRISC_MATH
-#include "llk_math_eltwise_unary_datacopy.h"
+#include "llk_math_unary_datacopy_api.h"
 #endif
 #ifdef TRISC_UNPACK
-#include "llk_unpack_tilize.h"
+#include "llk_unpack_tilize_api.h"
 #endif
 
 #include "debug/dprint.h"
diff --git a/tt_metal/include/compute_kernel_api/transpose_wh.h b/tt_metal/include/compute_kernel_api/transpose_wh.h
index 558e78676cd..167185e0947 100644
--- a/tt_metal/include/compute_kernel_api/transpose_wh.h
+++ b/tt_metal/include/compute_kernel_api/transpose_wh.h
@@ -6,10 +6,10 @@
 
 #include "compute_kernel_api/common.h"
 #ifdef TRISC_MATH
-#include "llk_math_eltwise_unary_datacopy.h"
+#include "llk_math_unary_datacopy_api.h"
 #endif
 #ifdef TRISC_UNPACK
-#include "llk_unpack_A.h"
+#include "llk_unpack_A_api.h"
 #endif
 
 
diff --git a/tt_metal/include/compute_kernel_api/untilize.h b/tt_metal/include/compute_kernel_api/untilize.h
index 1b770be282f..7f52753ee3c 100644
--- a/tt_metal/include/compute_kernel_api/untilize.h
+++ b/tt_metal/include/compute_kernel_api/untilize.h
@@ -7,10 +7,10 @@
 
 #include "compute_kernel_api/common.h"
 #ifdef TRISC_MATH
-#include "llk_math_eltwise_unary_datacopy.h"
+#include "llk_math_unary_datacopy_api.h"
 #endif
 #ifdef TRISC_UNPACK
-#include "llk_unpack_untilize.h"
+#include "llk_unpack_untilize_api.h"
 #endif
 
 namespace ckernel {

From 9da02bdf50af593ba6dc3c2d8748f5360a46c4f2 Mon Sep 17 00:00:00 2001
From: acejkov <acejkov@tenstorrent.com>
Date: Wed, 29 Nov 2023 17:59:59 +0000
Subject: [PATCH 02/16] #3908: Fix linker code size overflow error for matmul

---
 .../wormhole_b0/metal/llk_api/llk_math_matmul_api.h      | 9 ++++-----
 .../hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h  | 9 +++++++--
 .../hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h   | 9 +++++++--
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h
index 8f7ea1f5713..ff64fb27b2d 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h
@@ -23,11 +23,10 @@ inline void llk_math_matmul_init(
 
     const bool partial_face = get_operand_partial_face(in0_id);
 
-    const auto unpack_tile_dims = get_operand_tile_dims(in0_id);
-    const std::uint32_t in0_tile_r_dim = unpack_tile_dims[ckernel::TileDim::R_IDX];
-    const std::uint32_t in0_tile_c_dim = unpack_tile_dims[ckernel::TileDim::C_IDX];
-    const std::uint32_t in1_tile_r_dim = unpack_tile_dims[ckernel::TileDim::R_IDX];
-    const std::uint32_t in1_tile_c_dim = unpack_tile_dims[ckernel::TileDim::C_IDX];
+    const std::uint32_t in0_tile_r_dim = get_operand_tile_r_dim(in0_id);
+    const std::uint32_t in0_tile_c_dim = get_operand_tile_c_dim(in0_id);
+    const std::uint32_t in1_tile_r_dim = get_operand_tile_r_dim(in1_id);
+    const std::uint32_t in1_tile_c_dim = get_operand_tile_c_dim(in1_id);
 
 #ifdef ARCH_GRAYSKULL
     _llk_math_matmul_init_<NUM_FIDELITY_PHASES, FaceLayout>(
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
index c6d1b438f42..9c71ef63b52 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
@@ -40,7 +40,12 @@ inline const uint32_t get_operand_narrow_tile(const std::uint32_t operand_id)
    return 0;
 }
 
-inline const std::vector<uint32_t> get_operand_tile_dims(const std::uint32_t operand_id)
+inline const uint32_t get_operand_tile_r_dim(const std::uint32_t operand_id)
 {
-   return {32, 32};
+   return 32;
+}
+
+inline const uint32_t get_operand_tile_c_dim(const std::uint32_t operand_id)
+{
+   return 32;
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
index 596255257d0..cba5398b604 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
@@ -49,7 +49,12 @@ inline const uint32_t get_output_narrow_tile(const std::uint32_t output_id)
    return 0;
 }
 
-inline const std::vector<uint32_t> get_output_tile_dims(const std::uint32_t operand_id)
+inline const uint32_t get_output_tile_r_dim(const std::uint32_t output_id)
 {
-   return {32, 32};
+   return 32;
+}
+
+inline const uint32_t get_output_tile_c_dim(const std::uint32_t output_id)
+{
+   return 32;
 }

From 85865ae299a4cc60b671287da4c941d81a2ca058 Mon Sep 17 00:00:00 2001
From: Reem Tawfik <rtawfik@tenstorrent.com>
Date: Wed, 29 Nov 2023 21:48:17 +0000
Subject: [PATCH 03/16] #3908: Fixes for regressions/pipelines: 	- Fixed packer
 tile header write 	- Added missing SPDX licenses 	- Added missing
 includes/funcs 	- Make mod_div_lib common, remove conflicting
 functions

---
 .../compute/matmul_large_block.cpp            |  2 +-
 .../kernels/compute/bmm_tilize_untilize.cpp   |  1 +
 ...ts_in_l1_single_output_block_width_dim.cpp |  1 +
 .../conv_bmm_tilize_col_major_out_blocks.cpp  |  1 +
 .../wormhole_b0/common/inc/ckernel_defs.h     |  1 +
 .../wormhole_b0/common/inc/ckernel_perf_api.h |  4 +
 .../common/inc/ckernel_perf_include.h         |  4 +
 .../common/inc/ckernel_perf_math.h            |  4 +
 .../common/inc/ckernel_perf_unpack_pack.h     |  4 +
 .../metal/common/metal_ckernel_globals.h      |  1 +
 .../wormhole_b0/metal/llk_api/llk_pack_api.h  | 16 ++--
 .../metal/llk_api/llk_unpack_tilize_api.h     |  6 ++
 .../wormhole_b0/metal/llk_io/llk_io.h         |  4 +
 .../wormhole_b0/metal/llk_io/llk_operands.h   |  3 +
 .../metal_mod_div_lib.h => inc/mod_div_lib.h} |  0
 tt_metal/hw/inc/risc_common.h                 | 88 +------------------
 tt_metal/include/compute_kernel_api/tilize.h  |  4 -
 17 files changed, 46 insertions(+), 98 deletions(-)
 rename tt_metal/hw/{ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h => inc/mod_div_lib.h} (100%)

diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp
index 651955d6944..e0336993506 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp
@@ -9,7 +9,7 @@
 #include "compute_kernel_api/tile_move_copy.h"
 #include "compute_kernel_api/matmul.h"
 
-
+#include "mod_div_lib.h"
 
 inline void tilize_activation(uint32_t in0_cb, uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks, uint32_t out_cb)
 {
diff --git a/tt_eager/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp b/tt_eager/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp
index 9b969a8cb9d..c419a0f2e0f 100644
--- a/tt_eager/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp
+++ b/tt_eager/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp
@@ -4,6 +4,7 @@
 
 #include <cstdint>
 
+#include "mod_div_lib.h"
 #include "compute_kernel_api/tilize.h"
 #include "compute_kernel_api/untilize.h"
 #include "compute_kernel_api/tile_move_copy.h"
diff --git a/tt_eager/tt_dnn/op_library/conv/kernels/bmm_tilize_untilize_all_weights_in_l1_single_output_block_width_dim.cpp b/tt_eager/tt_dnn/op_library/conv/kernels/bmm_tilize_untilize_all_weights_in_l1_single_output_block_width_dim.cpp
index 83c68e74272..f2d348a4e8d 100644
--- a/tt_eager/tt_dnn/op_library/conv/kernels/bmm_tilize_untilize_all_weights_in_l1_single_output_block_width_dim.cpp
+++ b/tt_eager/tt_dnn/op_library/conv/kernels/bmm_tilize_untilize_all_weights_in_l1_single_output_block_width_dim.cpp
@@ -4,6 +4,7 @@
 
 #include <cstdint>
 
+#include "mod_div_lib.h"
 #include "compute_kernel_api/tilize.h"
 #include "compute_kernel_api/untilize.h"
 #include "compute_kernel_api/tile_move_copy.h"
diff --git a/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp b/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp
index 787849b31c0..83a75eacbff 100644
--- a/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp
+++ b/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp
@@ -4,6 +4,7 @@
 
 #include <cstdint>
 
+#include "mod_div_lib.h"
 #include "compute_kernel_api/tilize.h"
 #include "compute_kernel_api/untilize.h"
 #include "compute_kernel_api/tile_move_copy.h"
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h
index ffd8ad6dae9..41450e32f27 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "llk_defs.h"
 #include "ckernel_ops.h"
 #include "tensix_types.h"
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h
index 0e0c729f4b2..9bfa79f6934 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h
@@ -1,3 +1,7 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
 #pragma once
 
 #include <cstdint>
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h
index 50b9ed3f7cc..d9ff57a5403 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h
@@ -1,3 +1,7 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
 #pragma once
 
 #ifdef PERF_DUMP
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h
index a5df5a61f62..812f5cc9884 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h
@@ -1,3 +1,7 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
 #pragma once
 
 #include <cstdint>
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h
index aaa854ebc2f..9a2b21b4756 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h
@@ -1,3 +1,7 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
 #pragma once
 
 #include <cstdint>
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
index f31efd1c3d0..29a2dbf9cfe 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
@@ -10,6 +10,7 @@
 #include "metal_compile_time_args.h"
 #include "risc_attribs.h"
 #include "hostdevcommon/common_runtime_address_map.h"
+#include "hostdevcommon/kernel_structs.h"
 
 extern uint32_t __ldm_bss_start[];
 extern uint32_t __ldm_bss_end[];
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
index 808d88a6281..1e57d003cfc 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
@@ -1,3 +1,7 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
 #pragma once
 #include "ckernel.h"
 #include "ckernel_defs.h"
@@ -17,7 +21,7 @@
 * LLK PACK
 *************************************************************************/
 
-template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor, bool write_tile_header = true>
+template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
 inline void llk_pack_mop_config(const uint32_t output) {
 
     const std::uint32_t output_id = get_output_id(output);
@@ -26,7 +30,7 @@ inline void llk_pack_mop_config(const uint32_t output) {
     const bool partial_face = get_output_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]);
     const bool narrow_tile = get_output_narrow_tile(output_id);
 
-    _llk_pack_mop_config_<untilize, zero_output, FaceLayout, write_tile_header>(
+    _llk_pack_mop_config_<untilize, zero_output, FaceLayout, false>(
         pack_dst_format[output_id],
         face_r_dim,
         num_faces,
@@ -94,7 +98,7 @@ inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output
     llk_pack_reduce_hw_configure<untilize, type, dim, is_fp32_dest_acc_en>(&llk_pack_params);
 }
 
-template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor, bool write_tile_header = true>
+template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
 inline void llk_pack_init(const std::uint32_t pack_output = 0) {
 
     const std::uint32_t output_id = get_output_id(pack_output);
@@ -103,7 +107,7 @@ inline void llk_pack_init(const std::uint32_t pack_output = 0) {
     const bool partial_face = get_output_partial_face(output_id);
     const bool narrow_tile = get_output_narrow_tile(output_id);
 
-    _llk_pack_init_<untilize, zero_output, FaceLayout, write_tile_header>(
+    _llk_pack_init_<untilize, zero_output, FaceLayout, false>(
         pack_dst_format[output_id],
         face_r_dim,
         num_faces,
@@ -237,7 +241,7 @@ inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) {
     );
 }
 
-template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor, bool write_tile_header = true>
+template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
 inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) {
     std::uint32_t old_output_id = get_output_id(old_output);
     std::uint32_t new_output_id = get_output_id(new_output);
@@ -248,7 +252,7 @@ inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const
         llk_pack_reconfig_data_format<is_fp32_dest_acc_en, is_tile_dim_reconfig_en, FaceLayout>(new_output);
     } else if constexpr (is_tile_dim_reconfig_en) {
         // Same format but different tile dims
-        llk_pack_mop_config<false, false, FaceLayout, write_tile_header>(new_output);
+        llk_pack_mop_config<false, false, FaceLayout, false>(new_output);
     }
 }
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h
index 0f0a1b69ab3..71eeb6a0ba2 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h
@@ -91,3 +91,9 @@ inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, s
         narrow_tile
     );
 }
+
+inline void llk_unpack_tilize_block(std::uint32_t operand, std::uint32_t block_c_tiles) {
+    for (std::uint32_t tile_index = 0; tile_index < block_c_tiles; tile_index++) {
+        llk_unpack_tilize(operand, tile_index, block_c_tiles);
+    }
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h
index 7d3e365a730..37e018dc6b8 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h
@@ -1,3 +1,7 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
 #pragma once
 #include <cstdint>
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
index 9c71ef63b52..4a03157715b 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
 
 #pragma once
 #include <cstdint>
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h b/tt_metal/hw/inc/mod_div_lib.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h
rename to tt_metal/hw/inc/mod_div_lib.h
diff --git a/tt_metal/hw/inc/risc_common.h b/tt_metal/hw/inc/risc_common.h
index 7afd2d84974..cd6b02c7a7d 100644
--- a/tt_metal/hw/inc/risc_common.h
+++ b/tt_metal/hw/inc/risc_common.h
@@ -15,6 +15,7 @@
 #include "stream_io_map.h"
 #include "hostdevcommon/common_runtime_address_map.h"
 #include "limits.h"
+#include "mod_div_lib.h"
 
 #define NOC_X(x) (noc_index == 0 ? (x) : (noc_size_x-1-(x)))
 #define NOC_Y(y) (noc_index == 0 ? (y) : (noc_size_y-1-(y)))
@@ -147,93 +148,6 @@ inline uint32_t special_mult(uint32_t a, uint32_t special_b) {
   return 0;
 }
 
-inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b)
-{
-  unsigned int r = 0;
-  while (a)
-    {
-      if (a & 1)
-        r += b;
-      a >>= 1;
-      b <<= 1;
-    }
-  return r;
-}
-
-inline __attribute__((always_inline)) uint32_t fast_udiv_12(uint32_t n)
-{
-    // Uses embedding style magic number
-    // * fixed point 1/12 then shifting.
-    // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm
-    return (((uint64_t) n * 0xAAAAAAAB) >> 32) >> 3;
-}
-
-inline __attribute__((always_inline)) uint32_t fast_udiv_94(uint32_t n)
-{
-    // Uses embedding style magic number
-    // * fixed point 1/12 then shifting.
-    // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm
-    return (((uint64_t) n * 0xAE4C415D) >> 32) >> 6;
-}
-
-template <uint32_t d>
-inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n)
-{
-    if constexpr (d == 12) {
-        // fast divide for 12 divisor
-        return fast_udiv_12(n);
-    } else if constexpr (d == 94) {
-        // fast divide for 94 divisor. Handles Banked L1 address generation for E75
-        return fast_udiv_94(n);
-    } else {
-        // generic divide from llvm
-        const unsigned n_uword_bits = sizeof(uint32_t) * CHAR_BIT;
-        unsigned int q;
-        unsigned int r;
-        unsigned sr;
-        /* special cases */
-        if (d == 0)
-            return 0; /* ?! */
-        if (n == 0)
-            return 0;
-        sr = __builtin_clz(d) - __builtin_clz(n);
-        /* 0 <= sr <= n_uword_bits - 1 or sr large */
-        if (sr > n_uword_bits - 1)  /* d > r */
-            return 0;
-        if (sr == n_uword_bits - 1)  /* d == 1 */
-            return n;
-        ++sr;
-        /* 1 <= sr <= n_uword_bits - 1 */
-        /* Not a special case */
-        q = n << (n_uword_bits - sr);
-        r = n >> sr;
-        unsigned int  carry = 0;
-        for (; sr > 0; --sr)
-        {
-            /* r:q = ((r:q)  << 1) | carry */
-            r = (r << 1) | (q >> (n_uword_bits - 1));
-            q = (q << 1) | carry;
-            /* carry = 0;
-             * if (r.all >= d.all)
-             * {
-             *      r.all -= d.all;
-             *      carry = 1;
-             * }
-             */
-            const int s = (unsigned int)(d - r - 1) >> (n_uword_bits - 1);
-            carry = s & 1;
-            r -= d & s;
-        }
-        q = (q << 1) | carry;
-        return q;
-    }
-}
-template <uint32_t d>
-inline __attribute__((always_inline)) uint32_t umodsi3_const_divisor(uint32_t a)
-{
-    return a - udivsi3_const_divisor<d>(a) * d;
-}
-
 void risc_init();
 void replicate(uint32_t noc_id, uint32_t src_addr, uint64_t dest_addr, uint32_t chunk_size_bytes, uint32_t times_to_replicate);
 void replicate_l1(uint32_t noc_id, uint32_t src_addr, uint64_t dest_addr, uint32_t chunk_size_bytes, uint32_t times_to_replicate);
diff --git a/tt_metal/include/compute_kernel_api/tilize.h b/tt_metal/include/compute_kernel_api/tilize.h
index 58c1a7fc1d5..096b87027b3 100644
--- a/tt_metal/include/compute_kernel_api/tilize.h
+++ b/tt_metal/include/compute_kernel_api/tilize.h
@@ -37,11 +37,7 @@ ALWI void tilize_init(uint32_t icb, uint32_t block, uint32_t ocb = 16)
     PACK(( llk_pack_dest_init<SyncHalf, DstTileFaceLayout::RowMajor, false>() ));
 
     UNPACK(( llk_setup_operands() ));
-    #ifdef ARCH_GRAYSKULL
     UNPACK(( llk_unpack_tilize_hw_configure_disaggregated(icb) ));
-    #else
-    UNPACK(( llk_unpack_tilize_hw_configure_disaggregated<>(icb, block) ));
-    #endif
     UNPACK(( llk_unpack_tilize_init(icb, block) ));
 }
 

From 649c02e35f3a54f6abbb113ece28afcc89a80bfb Mon Sep 17 00:00:00 2001
From: Kei-Ming Kwong <kkwong@tenstorrent.com>
Date: Thu, 30 Nov 2023 19:21:55 +0000
Subject: [PATCH 04/16] #3908: Update llk uplift 	- Fix for reduce +
 build issues with cb_interface

---
 .../ckernels/wormhole_b0/common/inc/ckernel.h |   21 +-
 .../wormhole_b0/common/inc/ckernel_addrmod.h  |    1 +
 .../wormhole_b0/common/inc/ckernel_defs.h     |   93 +-
 .../wormhole_b0/common/inc/ckernel_globals.h  |    1 +
 .../wormhole_b0/common/inc/ckernel_gpr_map.h  |    1 +
 .../wormhole_b0/common/inc/ckernel_include.h  |    1 +
 .../common/inc/ckernel_instr_params.h         |   11 +-
 .../wormhole_b0/common/inc/ckernel_ops.h      | 1130 +++++++++++++++++
 .../wormhole_b0/common/inc/ckernel_pcbuf.h    |    1 +
 .../wormhole_b0/common/inc/ckernel_sfpi.h     |    1 +
 .../wormhole_b0/common/inc/ckernel_sfpu.h     |  596 ++++-----
 .../wormhole_b0/common/inc/ckernel_structs.h  |    1 +
 .../wormhole_b0/common/inc/ckernel_template.h |  218 ++++
 .../wormhole_b0/common/inc/ckernel_xmov.h     |    1 +
 .../wormhole_b0/common/inc/cmath_common.h     |   19 +
 .../wormhole_b0/common/inc/cpack_common.h     |   28 +
 .../wormhole_b0/common/inc/cunpack_common.h   |   43 +-
 .../common/src/ckernel_template.cc            |  230 ----
 .../wormhole_b0/common/src/ckernel_unity.cc   |    1 -
 .../wormhole_b0/common/src/fwlog_list         |    1 -
 .../ckernels/wormhole_b0/llk_lib/llk_defs.h   |   78 +-
 .../wormhole_b0/llk_lib/llk_math_common.h     |    1 +
 .../llk_lib/llk_math_eltwise_binary.h         |   14 +-
 .../llk_lib/llk_math_eltwise_binary_sfpu.h    |   92 +-
 .../llk_lib/llk_math_eltwise_unary_datacopy.h |    3 +-
 .../llk_lib/llk_math_eltwise_unary_sfpi.h     |    1 +
 .../llk_lib/llk_math_eltwise_unary_sfpu.h     |   82 +-
 .../wormhole_b0/llk_lib/llk_math_matmul.h     |   20 +-
 .../wormhole_b0/llk_lib/llk_math_reduce.h     |   50 +-
 .../ckernels/wormhole_b0/llk_lib/llk_pack.h   |    3 +
 .../wormhole_b0/llk_lib/llk_pack_common.h     |   23 +-
 .../wormhole_b0/llk_lib/llk_pack_untilize.h   |   71 ++
 .../wormhole_b0/llk_lib/llk_unpack_A.h        |   16 +-
 .../wormhole_b0/llk_lib/llk_unpack_AB.h       |   10 +-
 .../llk_lib/llk_unpack_AB_matmul.h            |   10 +-
 .../wormhole_b0/llk_lib/llk_unpack_common.h   |   34 +-
 .../wormhole_b0/llk_lib/llk_unpack_reduce.h   |    8 +-
 .../wormhole_b0/llk_lib/llk_unpack_tilize.h   |    8 +-
 .../wormhole_b0/llk_lib/llk_unpack_untilize.h |    8 +-
 .../metal/llk_api/llk_math_matmul_api.h       |    4 +-
 .../metal/llk_api/llk_math_unary_sfpu_api.h   |   30 -
 .../wormhole_b0/metal/llk_api/llk_pack_api.h  |   33 +
 .../metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h |    2 +-
 .../llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h  |    4 +-
 .../llk_api/llk_sfpu/ckernel_sfpu_erfinv.h    |    2 +-
 .../llk_api/llk_sfpu/ckernel_sfpu_gelu.h      |   14 +-
 .../metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h  |    2 +-
 .../llk_sfpu/ckernel_sfpu_logical_not_noti.h  |    2 +-
 .../llk_api/llk_sfpu/ckernel_sfpu_relu.h      |    6 +-
 ..._math_eltwise_unary_sfpu_common_includes.h |  155 +++
 .../llk_math_eltwise_unary_sfpu_init.h        |   45 +-
 .../llk_api/llk_sfpu/metal_ckernel_sfpu.h     |   20 +-
 .../metal/llk_api/llk_sfpu_types.h            |   69 +
 .../metal/llk_api/llk_unpack_AB_api.h         |    4 +-
 .../metal/llk_api/llk_unpack_AB_matmul_api.h  |    4 +-
 .../metal/llk_api/llk_unpack_A_api.h          |    4 +-
 .../metal/llk_api/llk_unpack_common_api.h     |   12 +-
 .../metal/llk_api/llk_unpack_reduce_api.h     |    4 +-
 .../metal/llk_api/llk_unpack_tilize_api.h     |    2 +-
 .../metal/llk_api/llk_unpack_untilize_api.h   |    2 +-
 tt_metal/hw/inc/debug/dprint_tile.h           |    2 +-
 61 files changed, 2278 insertions(+), 1075 deletions(-)
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_ops.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_untilize.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h

diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h
index 2f72476ade2..b731cc4bf81 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include "risc_attribs.h"
@@ -35,6 +36,9 @@
 #define OVERLAY_DECOUPLE 0
 #endif
 
+#ifdef LLK_TB_TEST
+#include "kernel_slowdown_config.h"
+#endif
 
 #ifndef INSERT_UNPACK_DELAY
 #define INSERT_UNPACK_DELAY 0
@@ -56,10 +60,13 @@
 
 #include "ckernel_include.h"
 #include "tensix.h"
-#include "debug/fw_debug.h"
-#include "eth_l1_address_map.h"
-#include "hostdevcommon/common_runtime_address_map.h"
+#include "fw_debug.h"
 // #include <cstring>
+#if defined(PERF_DUMP) || DELAY_EN > 0
+#include <l1_address_map.h>
+#include "tt_log.h"
+#include "perf_lib/scratch_api.h"
+#endif
 
 namespace ckernel
 {
@@ -73,10 +80,10 @@ constexpr uint RESET_VAL = 0;
 constexpr uint KERNEL_IN_PROGRESS = 15;
 constexpr uint KERNEL_COMPLETE = 1;
 
-extern volatile uint tt_reg_ptr * reg_base;
-extern volatile uint tt_reg_ptr * pc_buf_base;
-extern volatile uint tt_reg_ptr * regfile;
-extern volatile uint tt_reg_ptr * instrn_buffer;
+extern volatile uint tt_reg_ptr *reg_base;
+extern volatile uint tt_reg_ptr *pc_buf_base;
+extern volatile uint tt_reg_ptr *regfile;
+extern volatile uint tt_reg_ptr *instrn_buffer;
 extern volatile uint tt_reg_ptr *mailbox_base[4];
 extern volatile uint tt_reg_ptr *dbg_event_scratch;
 extern volatile uint tt_reg_ptr *trisc_l1_mailbox;
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_addrmod.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_addrmod.h
index f917c1e009d..3009fae2695 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_addrmod.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_addrmod.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h
index 41450e32f27..b1630dfe798 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include "llk_defs.h"
@@ -67,24 +68,6 @@ enum PackSelMask
     PACK_23=0xC
 };
 
-/*
-Stochastic rounding modes:
-    None: No stochastic rounding enabled, default rounding is round to nearest even.
-    Fpu: Enables stochastic rounding for every accumulation in the fpu
-    Pack: Enables stochastic rounding in both gasket and packer. Gasket rounding is in
-    data format conversion stage from dest format to pack_src_format. Packer rounding
-    is in data format conversion stage from pack_src_format to pack_dst_format.
-    All: Enables fpu, pack and gasket rounding.
-*/
-enum class StochRndMode : std::uint8_t
-{
-    None    = 0,
-    Fpu     = 1,
-    Pack    = 2,
-    All     = 0xf,
-    Invalid = 0xff,
-};
-
 constexpr std::uint32_t FACE_HEIGHT = 16;
 constexpr std::uint32_t FACE_WIDTH  = 16;
 constexpr std::uint32_t TILE_HEIGHT = 32;
@@ -108,78 +91,6 @@ static_assert((DEST_NUM_TILES_FP16 & (DEST_NUM_TILES_FP16 - 1)) == 0);
 #define LO_16(REG) (2 * (REG))
 #define HI_16(REG) (2 * (REG) + 1)
 
-
-/*
-constexpr static std::int32_t MUL_TILE_SIZE_AND_INDEX(uint format, uint index) {
-    switch (format&0xF) {
-        case ((uint8_t)DataFormat::Float32): return ((index<<8)+(index<<1));
-        case ((uint8_t)DataFormat::Float16):
-        case ((uint8_t)DataFormat::Float16_b): return ((index<<7)+(index<<1));
-        case ((uint8_t)DataFormat::Bfp8):
-        case ((uint8_t)DataFormat::Bfp8_b): return ((index<<6)+(index<<2)+(index<<1));
-        case ((uint8_t)DataFormat::Bfp4):
-        case ((uint8_t)DataFormat::Bfp4_b): return ((index<<5)+(index<<2)+(index<<1));
-        case ((uint8_t)DataFormat::Bfp2):
-        case ((uint8_t)DataFormat::Bfp2_b): return ((index<<4)+(index<<2)+(index<<1));
-        case ((uint8_t)DataFormat::Int8):
-        case ((uint8_t)DataFormat::Lf8): return ((index<<6)+(index<<1));
-        //Keep default as Bfp8?
-        default: return ((index<<6)+(index<<2)+(index<<1));
-    };
-}
-
-constexpr static std::int32_t MUL_DEST_TILE_SIZE_AND_INDEX(uint format, uint index) {
-    switch (format&0xF) {
-        case ((uint8_t)DataFormat::Float32): return (index<<12);
-        case ((uint8_t)DataFormat::Float16):
-        case ((uint8_t)DataFormat::Float16_b): return (index<<11);
-        case ((uint8_t)DataFormat::Bfp8):
-        case ((uint8_t)DataFormat::Bfp8_b): return (index<<10);
-        case ((uint8_t)DataFormat::Bfp4):
-        case ((uint8_t)DataFormat::Bfp4_b): return (index<<9);
-        case ((uint8_t)DataFormat::Bfp2):
-        case ((uint8_t)DataFormat::Bfp2_b): return (index<<8);
-        case ((uint8_t)DataFormat::Int8):
-        case ((uint8_t)DataFormat::Lf8): return (index<<10);
-        default: return (index<<10);
-    };
-}
-
-constexpr static std::int32_t GET_L1_TILE_SIZE(uint format) {
-    switch (format&0xF) {
-        case ((uint8_t)DataFormat::Float32): return ((4096>>4)+(32>>4));
-        case ((uint8_t)DataFormat::Float16):
-        case ((uint8_t)DataFormat::Float16_b): return ((2048>>4)+(32>>4));
-        case ((uint8_t)DataFormat::Bfp8):
-        case ((uint8_t)DataFormat::Bfp8_b): return ((1024>>4)+(64>>4)+(32>>4));
-        case ((uint8_t)DataFormat::Bfp4):
-        case ((uint8_t)DataFormat::Bfp4_b): return ((512>>4)+(64>>4)+(32>>4));
-        case ((uint8_t)DataFormat::Bfp2):
-        case ((uint8_t)DataFormat::Bfp2_b): return ((256>>4)+(64>>4)+(32>>4));
-        case ((uint8_t)DataFormat::Int8):
-        case ((uint8_t)DataFormat::Lf8): return ((1024>>4)+(32>>4));
-        default: return ((1024>>4)+(64>>4)+(32>>4));
-    };
-}
-
-constexpr static std::int32_t GET_DEST_TILE_BYTE_SIZE(uint format) {
-    switch (format&0xF) {
-        case ((uint8_t)DataFormat::Float32): return 4096;
-        case ((uint8_t)DataFormat::Float16):
-        case ((uint8_t)DataFormat::Float16_b): return 2048;
-        case ((uint8_t)DataFormat::Bfp8):
-        case ((uint8_t)DataFormat::Bfp8_b): return 1024;
-        case ((uint8_t)DataFormat::Bfp4):
-        case ((uint8_t)DataFormat::Bfp4_b): return 512;
-        case ((uint8_t)DataFormat::Bfp2):
-        case ((uint8_t)DataFormat::Bfp2_b): return 256;
-        case ((uint8_t)DataFormat::Int8):
-        case ((uint8_t)DataFormat::Lf8): return 1024;
-        default: return 1024;
-    };
-}
-*/
-
 constexpr static std::uint32_t GET_L1_HEADERLESS_TILE_SIZE(uint format) {
     switch (format&0xF) {
         case ((uint8_t)DataFormat::Int32):
@@ -243,6 +154,4 @@ constexpr static std::uint32_t SCALE_DATUM_SIZE(uint format, uint datum_count) {
 #define LOWER_HALFWORD(x) ((x) & 0xFFFF)
 #define UPPER_HALFWORD(x) ((x) >> 16)
 
-constexpr int WHB0_ITERATIONS = 8;
-
 } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
index 90ac67944f5..f9359469e33 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include <cstdint>
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h
index 822704cc9e1..ba1abdca84e 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 // Hand-coded parameter encoding for various GPR mappings
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_include.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_include.h
index 4418cfdb57e..6fa83799cb1 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_include.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_include.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 //
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_instr_params.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_instr_params.h
index bb28714aa71..4c72eecf213 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_instr_params.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_instr_params.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #ifdef PERF_DUMP
@@ -18,13 +19,13 @@ struct p_setrwc
 
 #if SKIP_UNP == 1
     constexpr static uint CLR_A        = 0x0;
-        constexpr static uint CLR_B        = 0x0;
-        constexpr static uint CLR_AB       = 0x0;
+    constexpr static uint CLR_B        = 0x0;
+    constexpr static uint CLR_AB       = 0x0;
 #else
     constexpr static uint CLR_A        = 0x1;
-        constexpr static uint CLR_B        = 0x2;
-        constexpr static uint CLR_AB       = 0x3;
-    #endif
+    constexpr static uint CLR_B        = 0x2;
+    constexpr static uint CLR_AB       = 0x3;
+#endif
 
 #else
     constexpr static uint CLR_A        = 0x1;
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_ops.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_ops.h
new file mode 100644
index 00000000000..bdd4e9e048b
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_ops.h
@@ -0,0 +1,1130 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+
+//
+// Auto-generated file, do not modify!
+//
+
+#pragma once
+
+#define TT_OP(opcode, params) ( ((opcode) << 24) + (params) ) // Opcode goes in bits [31:24], params in the low 24 bits; arguments are parenthesized so compound expressions encode correctly.
+#define INSTRUCTION_WORD(x) __asm__ __volatile__(".word (%0)" : : "i" ((x))) // Drop 32 bits into the instruction stream; x must be a compile-time constant ("i" constraint).
+#define TRISC_OP_SWIZZLE(x) ( (((x) >> 30) & 0x3) | (((x) & 0x3FFFFFFF) << 2) ) // Move the top 2 bits (currently never 'b11) to the bottom and shift the low 30 bits up by 2, indicating to the RISC that the word is not a RISC instruction.
+
+#define TT_OP_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  TT_OP(0x58, (((OpBisConst) << 23) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0)))
+#define TT_ADDDMAREG_VALID(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(ResultRegIndex, 11) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6))
+#define TT_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex)
+#define TTI_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) ))
+
+#define TT_OP_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  TT_OP(0x53, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + ((BitMask) << 0)))
+#define TT_ADDRCRXY_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6))
+#define TT_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  ckernel::instrn_buffer[0] = TT_OP_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask)
+#define TTI_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) ))
+
+#define TT_OP_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  TT_OP(0x56, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + ((BitMask) << 0)))
+#define TT_ADDRCRZW_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6))
+#define TT_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  ckernel::instrn_buffer[0] = TT_OP_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask)
+#define TTI_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) ))
+
+#define TT_OP_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) \
+  TT_OP(0x25, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0)))
+#define TT_APOOL3S1_VALID(clear_dvalid, addr_mode, index_en, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && ckernel::is_valid(dst, 14))
+#define TT_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_APOOL3S1(clear_dvalid, addr_mode, index_en, dst)
+#define TTI_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) ))
+
+#define TT_OP_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) \
+  TT_OP(0x32, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0)))
+#define TT_APOOL3S2_VALID(clear_dvalid, addr_mode, index_en, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && ckernel::is_valid(dst, 14))
+#define TT_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_APOOL3S2(clear_dvalid, addr_mode, index_en, dst)
+#define TTI_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) ))
+
+#define TT_OP_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  TT_OP(0x64, (((MemHierSel) << 23) + ((SwapVal) << 18) + ((CmpVal) << 14) + ((Sel32b) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0)))
+#define TT_ATCAS_VALID(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(SwapVal, 5) && ckernel::is_valid(CmpVal, 4) && ckernel::is_valid(Sel32b, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6))
+#define TT_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex)
+#define TTI_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) ))
+
+#define TT_OP_ATGETM(mutex_index) \
+  TT_OP(0xa0, (((mutex_index) << 0)))
+#define TT_ATGETM_VALID(mutex_index) \
+  (ckernel::is_valid(mutex_index, 24))
+#define TT_ATGETM(mutex_index) \
+  ckernel::instrn_buffer[0] = TT_OP_ATGETM(mutex_index)
+#define TTI_ATGETM(mutex_index) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATGETM(mutex_index) ))
+
+#define TT_OP_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  TT_OP(0x61, (((MemHierSel) << 23) + ((WrapVal) << 14) + ((Sel32b) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0)))
+#define TT_ATINCGET_VALID(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(WrapVal, 9) && ckernel::is_valid(Sel32b, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6))
+#define TT_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex)
+#define TTI_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) ))
+
+#define TT_OP_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  TT_OP(0x62, (((MemHierSel) << 23) + ((NoIncr) << 22) + ((IncrVal) << 18) + ((WrapVal) << 14) + ((Sel32b) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0)))
+#define TT_ATINCGETPTR_VALID(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(NoIncr, 1) && ckernel::is_valid(IncrVal, 4) && ckernel::is_valid(WrapVal, 4) && ckernel::is_valid(Sel32b, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6))
+#define TT_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex)
+#define TTI_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) ))
+
+#define TT_OP_ATRELM(mutex_index) \
+  TT_OP(0xa1, (((mutex_index) << 0)))
+#define TT_ATRELM_VALID(mutex_index) \
+  (ckernel::is_valid(mutex_index, 24))
+#define TT_ATRELM(mutex_index) \
+  ckernel::instrn_buffer[0] = TT_OP_ATRELM(mutex_index)
+#define TTI_ATRELM(mutex_index) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATRELM(mutex_index) ))
+
+#define TT_OP_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \
+  TT_OP(0x63, (((MemHierSel) << 23) + ((SwapMask) << 14) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0)))
+#define TT_ATSWAP_VALID(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \
+  (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(SwapMask, 9) && ckernel::is_valid(DataRegIndex, 8) && ckernel::is_valid(AddrRegIndex, 6))
+#define TT_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex)
+#define TTI_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) ))
+
+#define TT_OP_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  TT_OP(0x5b, (((OpBisConst) << 23) + ((OpSel) << 18) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0)))
+#define TT_BITWOPDMAREG_VALID(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(OpSel, 5) && ckernel::is_valid(ResultRegIndex, 6) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6))
+#define TT_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex)
+#define TTI_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) ))
+
+#define TT_OP_CLEARDVALID(cleardvalid, reset) \
+  TT_OP(0x36, (((cleardvalid) << 22) + ((reset) << 0)))
+#define TT_CLEARDVALID_VALID(cleardvalid, reset) \
+  (ckernel::is_valid(cleardvalid, 2) && ckernel::is_valid(reset, 22))
+#define TT_CLEARDVALID(cleardvalid, reset) \
+  ckernel::instrn_buffer[0] = TT_OP_CLEARDVALID(cleardvalid, reset)
+#define TTI_CLEARDVALID(cleardvalid, reset) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CLEARDVALID(cleardvalid, reset) ))
+
+#define TT_OP_CLREXPHIST\
+  TT_OP(0x21, 0)
+#define TTI_CLREXPHIST\
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CLREXPHIST))
+
+#define TT_OP_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  TT_OP(0x5d, (((OpBisConst) << 23) + ((OpSel) << 18) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0)))
+#define TT_CMPDMAREG_VALID(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(OpSel, 5) && ckernel::is_valid(ResultRegIndex, 6) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6))
+#define TT_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex)
+#define TTI_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) ))
+
+#define TT_OP_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \
+  TT_OP(0x22, (((clear_dvalid) << 22) + ((rotate_weights) << 17) + ((addr_mode) << 15) + ((dst) << 0)))
+#define TT_CONV3S1_VALID(clear_dvalid, rotate_weights, addr_mode, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(rotate_weights, 5) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(dst, 15))
+#define TT_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst)
+#define TTI_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) ))
+
+#define TT_OP_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) \
+  TT_OP(0x23, (((clear_dvalid) << 22) + ((rotate_weights) << 17) + ((addr_mode) << 15) + ((dst) << 0)))
+#define TT_CONV3S2_VALID(clear_dvalid, rotate_weights, addr_mode, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(rotate_weights, 5) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(dst, 15))
+#define TT_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst)
+#define TTI_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) ))
+
+#define TT_OP_DMANOP\
+  TT_OP(0x60, 0)
+#define TTI_DMANOP\
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_DMANOP))
+
+#define TT_OP_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  TT_OP(0x29, (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0)))
+#define TT_DOTPV_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15))
+#define TT_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst)
+#define TTI_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) ))
+
+#define TT_OP_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  TT_OP(0x28, (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0)))
+#define TT_ELWADD_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15))
+#define TT_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst)
+#define TTI_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) ))
+
+#define TT_OP_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  TT_OP(0x27, (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0)))
+#define TT_ELWMUL_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15))
+#define TT_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst)
+#define TTI_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) ))
+
+#define TT_OP_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  TT_OP(0x30, (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0)))
+#define TT_ELWSUB_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15))
+#define TT_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst)
+#define TTI_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) ))
+
+#define TT_OP_FLUSHDMA(FlushSpec) \
+  TT_OP(0x46, (((FlushSpec) << 0)))
+#define TT_FLUSHDMA_VALID(FlushSpec) \
+  (ckernel::is_valid(FlushSpec, 24))
+#define TT_FLUSHDMA(FlushSpec) \
+  ckernel::instrn_buffer[0] = TT_OP_FLUSHDMA(FlushSpec)
+#define TTI_FLUSHDMA(FlushSpec) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_FLUSHDMA(FlushSpec) ))
+
+#define TT_OP_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \
+  TT_OP(0x34, (((clear_dvalid) << 22) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((max_pool_index_en) << 14) + ((dst) << 0)))
+#define TT_GAPOOL_VALID(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(instr_mod19, 3) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(max_pool_index_en, 1) && ckernel::is_valid(dst, 14))
+#define TT_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst)
+#define TTI_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) ))
+
+#define TT_OP_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) \
+  TT_OP(0x35, (((reset_srcb_gate_control) << 1) + ((reset_srca_gate_control) << 0))) // Opcode 0x35: srcb field starts at bit 1, srca field at bit 0.
+#define TT_GATESRCRST_VALID(reset_srcb_gate_control, reset_srca_gate_control) \
+  (ckernel::is_valid(reset_srcb_gate_control, 23) && ckernel::is_valid(reset_srca_gate_control, 1)) // Identifiers must match the macro parameters (underscores, not spaces) or the expansion does not compile.
+#define TT_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) \
+  ckernel::instrn_buffer[0] = TT_OP_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) // Issue the encoded word by storing it to instrn_buffer[0].
+#define TTI_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) )) // Emit the swizzled word inline; arguments must be compile-time constants.
+
+#define TT_OP_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \
+  TT_OP(0x33, (((clear_dvalid) << 22) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((max_pool_index_en) << 14) + ((dst) << 0)))
+#define TT_GMPOOL_VALID(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(instr_mod19, 3) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(max_pool_index_en, 1) && ckernel::is_valid(dst, 14))
+#define TT_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst)
+#define TTI_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) ))
+
+#define TT_OP_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \
+  TT_OP(0x52, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6)))
+#define TT_INCADCXY_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \
+  (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3))
+#define TT_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \
+  ckernel::instrn_buffer[0] = TT_OP_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X)
+#define TTI_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) ))
+
+#define TT_OP_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \
+  TT_OP(0x55, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6)))
+#define TT_INCADCZW_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \
+  (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3))
+#define TT_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \
+  ckernel::instrn_buffer[0] = TT_OP_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X)
+#define TTI_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) ))
+
+#define TT_OP_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) \
+  TT_OP(0x38, (((rwc_cr) << 18) + ((rwc_d) << 14) + ((rwc_b) << 10) + ((rwc_a) << 6)))
+#define TT_INCRWC_VALID(rwc_cr, rwc_d, rwc_b, rwc_a) \
+  (ckernel::is_valid(rwc_cr, 6) && ckernel::is_valid(rwc_d, 4) && ckernel::is_valid(rwc_b, 4) && ckernel::is_valid(rwc_a, 4))
+#define TT_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) \
+  ckernel::instrn_buffer[0] = TT_OP_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a)
+#define TTI_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) ))
+
+#define TT_OP_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \
+  TT_OP(0x49, (((SizeSel) << 22) + ((OffsetIndex) << 14) + ((AutoIncSpec) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0)))
+#define TT_LOADIND_VALID(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \
+  (ckernel::is_valid(SizeSel, 2) && ckernel::is_valid(OffsetIndex, 8) && ckernel::is_valid(AutoIncSpec, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6))
+#define TT_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex)
+#define TTI_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) ))
+
+#define TT_OP_LOADREG(TdmaDataRegIndex, RegAddr) \
+  TT_OP(0x68, (((TdmaDataRegIndex) << 18) + ((RegAddr) << 0)))
+#define TT_LOADREG_VALID(TdmaDataRegIndex, RegAddr) \
+  (ckernel::is_valid(TdmaDataRegIndex, 6) && ckernel::is_valid(RegAddr, 18))
+#define TT_LOADREG(TdmaDataRegIndex, RegAddr) \
+  ckernel::instrn_buffer[0] = TT_OP_LOADREG(TdmaDataRegIndex, RegAddr)
+#define TTI_LOADREG(TdmaDataRegIndex, RegAddr) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_LOADREG(TdmaDataRegIndex, RegAddr) ))
+
+#define TT_OP_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \
+  TT_OP(0x3a, (((clear_dvalid) << 22) + ((rotate_weights) << 17) + ((addr_mode) << 15) + ((dst) << 0)))
+#define TT_MFCONV3S1_VALID(clear_dvalid, rotate_weights, addr_mode, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(rotate_weights, 5) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(dst, 15))
+#define TT_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst)
+#define TTI_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) ))
+
+#define TT_OP_MOP(mop_type, loop_count, zmask_lo16) \
+  TT_OP(0x01, (((mop_type) << 23) + ((loop_count) << 16) + ((zmask_lo16) << 0)))
+#define TT_MOP_VALID(mop_type, loop_count, zmask_lo16) \
+  (ckernel::is_valid(mop_type, 1) && ckernel::is_valid(loop_count, 7) && ckernel::is_valid(zmask_lo16, 16))
+#define TT_MOP(mop_type, loop_count, zmask_lo16) \
+  ckernel::instrn_buffer[0] = TT_OP_MOP(mop_type, loop_count, zmask_lo16)
+#define TTI_MOP(mop_type, loop_count, zmask_lo16) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOP(mop_type, loop_count, zmask_lo16) ))
+
+#define TT_OP_MOP_CFG(zmask_hi16) \
+  TT_OP(0x03, (((zmask_hi16) << 0)))
+#define TT_MOP_CFG_VALID(zmask_hi16) \
+  (ckernel::is_valid(zmask_hi16, 24))
+#define TT_MOP_CFG(zmask_hi16) \
+  ckernel::instrn_buffer[0] = TT_OP_MOP_CFG(zmask_hi16)
+#define TTI_MOP_CFG(zmask_hi16) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOP_CFG(zmask_hi16) ))
+
+#define TT_OP_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  TT_OP(0x12, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0)))
+#define TT_MOVA2D_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12))
+#define TT_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst)
+#define TTI_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) ))
+
+#define TT_OP_MOVB2A(srca, addr_mode, instr_mod, srcb) \
+  TT_OP(0x0b, (((srca) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((srcb) << 0)))
+#define TT_MOVB2A_VALID(srca, addr_mode, instr_mod, srcb) \
+  (ckernel::is_valid(srca, 7) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(srcb, 12))
+#define TT_MOVB2A(srca, addr_mode, instr_mod, srcb) \
+  ckernel::instrn_buffer[0] = TT_OP_MOVB2A(srca, addr_mode, instr_mod, srcb)
+#define TTI_MOVB2A(srca, addr_mode, instr_mod, srcb) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVB2A(srca, addr_mode, instr_mod, srcb) ))
+
+#define TT_OP_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  TT_OP(0x13, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0)))
+#define TT_MOVB2D_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12))
+#define TT_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst)
+#define TTI_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) ))
+
+#define TT_OP_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  TT_OP(0x08, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0)))
+#define TT_MOVD2A_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12))
+#define TT_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst)
+#define TTI_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) ))
+
+#define TT_OP_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  TT_OP(0x0a, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0)))
+#define TT_MOVD2B_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12))
+#define TT_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst)
+#define TTI_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) ))
+
+#define TT_OP_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  TT_OP(0x09, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0)))
+#define TT_MOVDBGA2D_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12))
+#define TT_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst)
+#define TTI_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) ))
+
+#define TT_OP_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) \
+  TT_OP(0x24, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0)))
+#define TT_MPOOL3S1_VALID(clear_dvalid, addr_mode, index_en, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && ckernel::is_valid(dst, 14))
+#define TT_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst)
+#define TTI_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) ))
+
+#define TT_OP_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) \
+  TT_OP(0x31, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0))) // first TT_OP argument 0x31 (presumably the opcode); shifts: clear_dvalid 22, addr_mode 15, index_en 14, dst 0
+#define TT_MPOOL3S2_VALID(clear_dvalid, addr_mode, index_en, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && ckernel::is_valid(dst, 14)) // is_valid sizes (presumably bit widths) 2+7+1+14 = 24, consistent with the shift spacing
+#define TT_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  TT_OP(0x5a, (((OpBisConst) << 23) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0))) // first TT_OP argument 0x5a (presumably the opcode); shifts: OpBisConst 23, ResultRegIndex 12, OpBRegIndex 6, OpARegIndex 0
+#define TT_MULDMAREG_VALID(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(ResultRegIndex, 11) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6)) // is_valid sizes (presumably bit widths) 1+11+6+6 = 24, consistent with the shift spacing
+#define TT_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) \
+  TT_OP(0x26, (((clear_dvalid) << 22) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0))) // first TT_OP argument 0x26 (presumably the opcode); shifts: clear_dvalid 22, instr_mod19 19, addr_mode 15, dst 0
+#define TT_MVMUL_VALID(clear_dvalid, instr_mod19, addr_mode, dst) \
+  (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(instr_mod19, 3) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15)) // is_valid sizes (presumably bit widths) 2+3+4+15 = 24, consistent with the shift spacing
+#define TT_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_NOP\
+  TT_OP(0x02, 0) // first TT_OP argument 0x02 (presumably the opcode) with a zero operand word; object-like macro, takes no operands
+#define TTI_NOP\
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_NOP)) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD; this group defines no _VALID or instrn_buffer variant
+
+#define TT_OP_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \
+  TT_OP(0x41, (((AddrMode) << 15) + ((ZeroWrite) << 12) + ((PackSel) << 8) + ((OvrdThreadId) << 7) + ((Concat) << 4) + ((Flush) << 1) + ((Last) << 0))) // first TT_OP argument 0x41 (presumably the opcode); shifts: AddrMode 15, ZeroWrite 12, PackSel 8, OvrdThreadId 7, Concat 4, Flush 1, Last 0
+#define TT_PACR_VALID(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \
+  (ckernel::is_valid(AddrMode, 9) && ckernel::is_valid(ZeroWrite, 3) && ckernel::is_valid(PackSel, 4) && ckernel::is_valid(OvrdThreadId, 1) && ckernel::is_valid(Concat, 3) && ckernel::is_valid(Flush, 3) && ckernel::is_valid(Last, 1)) // is_valid sizes (presumably bit widths) 9+3+4+1+3+3+1 = 24, consistent with the shift spacing
+#define TT_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \
+  ckernel::instrn_buffer[0] = TT_OP_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \
+  TT_OP(0x4a, (((Push) << 23) + ((AddrSel) << 22) + ((WrData) << 12) + ((PackSel) << 8) + ((StreamId) << 2) + ((Flush) << 1) + ((Last) << 0))) // first TT_OP argument 0x4a (presumably the opcode); shifts: Push 23, AddrSel 22, WrData 12, PackSel 8, StreamId 2, Flush 1, Last 0
+#define TT_PACR_SETREG_VALID(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \
+  (ckernel::is_valid(Push, 1) && ckernel::is_valid(AddrSel, 1) && ckernel::is_valid(WrData, 10) && ckernel::is_valid(PackSel, 4) && ckernel::is_valid(StreamId, 6) && ckernel::is_valid(Flush, 1) && ckernel::is_valid(Last, 1)) // is_valid sizes (presumably bit widths) 1+1+10+4+6+1+1 = 24, consistent with the shift spacing
+#define TT_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \
+  ckernel::instrn_buffer[0] = TT_OP_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_RAREB\
+  TT_OP(0x15, 0) // first TT_OP argument 0x15 (presumably the opcode) with a zero operand word; object-like macro, takes no operands
+#define TTI_RAREB\
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RAREB)) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD; this group defines no _VALID or instrn_buffer variant
+
+#define TT_OP_RDCFG(GprAddress, CfgReg) \
+  TT_OP(0xb1, (((GprAddress) << 16) + ((CfgReg) << 0))) // first TT_OP argument 0xb1 (presumably the opcode); shifts: GprAddress 16, CfgReg 0
+#define TT_RDCFG_VALID(GprAddress, CfgReg) \
+  (ckernel::is_valid(GprAddress, 8) && ckernel::is_valid(CfgReg, 16)) // is_valid sizes (presumably bit widths) 8+16 = 24, consistent with the shift spacing
+#define TT_RDCFG(GprAddress, CfgReg) \
+  ckernel::instrn_buffer[0] = TT_OP_RDCFG(GprAddress, CfgReg) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_RDCFG(GprAddress, CfgReg) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RDCFG(GprAddress, CfgReg) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \
+  TT_OP(0x48, (((SizeSel) << 22) + ((TargetSel) << 20) + ((ByteOffset) << 18) + ((ContextId_2) << 16) + ((FlopIndex) << 6) + ((RegIndex) << 0))) // first TT_OP argument 0x48 (presumably the opcode); shifts: SizeSel 22, TargetSel 20, ByteOffset 18, ContextId_2 16, FlopIndex 6, RegIndex 0
+#define TT_REG2FLOP_VALID(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \
+  (ckernel::is_valid(SizeSel, 2) && ckernel::is_valid(TargetSel, 2) && ckernel::is_valid(ByteOffset, 2) && ckernel::is_valid(ContextId_2, 2) && ckernel::is_valid(FlopIndex, 10) && ckernel::is_valid(RegIndex, 6)) // is_valid sizes (presumably bit widths) 2+2+2+2+10+6 = 24, consistent with the shift spacing
+#define TT_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_REPLAY(start_idx, len, execute_while_loading, load_mode) \
+  TT_OP(0x04, (((start_idx) << 14) + ((len) << 4) + ((execute_while_loading) << 1) + ((load_mode) << 0))) // first TT_OP argument 0x04 (presumably the opcode); shifts: start_idx 14, len 4, execute_while_loading 1, load_mode 0
+#define TT_REPLAY_VALID(start_idx, len, execute_while_loading, load_mode) \
+  (ckernel::is_valid(start_idx, 10) && ckernel::is_valid(len, 10) && ckernel::is_valid(execute_while_loading, 3) && ckernel::is_valid(load_mode, 1)) // is_valid sizes (presumably bit widths) 10+10+3+1 = 24, consistent with the shift spacing
+#define TT_REPLAY(start_idx, len, execute_while_loading, load_mode) \
+  ckernel::instrn_buffer[0] = TT_OP_REPLAY(start_idx, len, execute_while_loading, load_mode) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_REPLAY(start_idx, len, execute_while_loading, load_mode) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_REPLAY(start_idx, len, execute_while_loading, load_mode) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_RMWCIB0(Mask, Data, CfgRegAddr) \
+  TT_OP(0xb3, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) // first TT_OP argument 0xb3 (presumably the opcode); shifts: Mask 16, Data 8, CfgRegAddr 0
+#define TT_RMWCIB0_VALID(Mask, Data, CfgRegAddr) \
+  (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) // is_valid sizes (presumably bit widths) 8+8+8 = 24, consistent with the shift spacing
+#define TT_RMWCIB0(Mask, Data, CfgRegAddr) \
+  ckernel::instrn_buffer[0] = TT_OP_RMWCIB0(Mask, Data, CfgRegAddr) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_RMWCIB0(Mask, Data, CfgRegAddr) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB0(Mask, Data, CfgRegAddr) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_RMWCIB1(Mask, Data, CfgRegAddr) \
+  TT_OP(0xb4, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) // first TT_OP argument 0xb4 (presumably the opcode); shifts: Mask 16, Data 8, CfgRegAddr 0
+#define TT_RMWCIB1_VALID(Mask, Data, CfgRegAddr) \
+  (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) // is_valid sizes (presumably bit widths) 8+8+8 = 24, consistent with the shift spacing
+#define TT_RMWCIB1(Mask, Data, CfgRegAddr) \
+  ckernel::instrn_buffer[0] = TT_OP_RMWCIB1(Mask, Data, CfgRegAddr) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_RMWCIB1(Mask, Data, CfgRegAddr) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB1(Mask, Data, CfgRegAddr) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_RMWCIB2(Mask, Data, CfgRegAddr) \
+  TT_OP(0xb5, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) // first TT_OP argument 0xb5 (presumably the opcode); shifts: Mask 16, Data 8, CfgRegAddr 0
+#define TT_RMWCIB2_VALID(Mask, Data, CfgRegAddr) \
+  (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) // is_valid sizes (presumably bit widths) 8+8+8 = 24, consistent with the shift spacing
+#define TT_RMWCIB2(Mask, Data, CfgRegAddr) \
+  ckernel::instrn_buffer[0] = TT_OP_RMWCIB2(Mask, Data, CfgRegAddr) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_RMWCIB2(Mask, Data, CfgRegAddr) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB2(Mask, Data, CfgRegAddr) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_RMWCIB3(Mask, Data, CfgRegAddr) \
+  TT_OP(0xb6, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) // first TT_OP argument 0xb6 (presumably the opcode); shifts: Mask 16, Data 8, CfgRegAddr 0
+#define TT_RMWCIB3_VALID(Mask, Data, CfgRegAddr) \
+  (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) // is_valid sizes (presumably bit widths) 8+8+8 = 24, consistent with the shift spacing
+#define TT_RMWCIB3(Mask, Data, CfgRegAddr) \
+  ckernel::instrn_buffer[0] = TT_OP_RMWCIB3(Mask, Data, CfgRegAddr) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_RMWCIB3(Mask, Data, CfgRegAddr) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB3(Mask, Data, CfgRegAddr) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_RSTDMA\
+  TT_OP(0x44, 0) // first TT_OP argument 0x44 (presumably the opcode) with a zero operand word; object-like macro, takes no operands
+#define TTI_RSTDMA\
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RSTDMA)) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD; this group defines no _VALID or instrn_buffer variant
+
+#define TT_OP_SEMGET(sem_sel) \
+  TT_OP(0xa5, (((sem_sel) << 2))) // first TT_OP argument 0xa5 (presumably the opcode); sem_sel is shifted by 2, so the two lowest operand bits stay zero
+#define TT_SEMGET_VALID(sem_sel) \
+  (ckernel::is_valid(sem_sel, 22)) // is_valid size 22 (presumably a bit width); 22 plus the shift of 2 gives 24
+#define TT_SEMGET(sem_sel) \
+  ckernel::instrn_buffer[0] = TT_OP_SEMGET(sem_sel) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SEMGET(sem_sel) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMGET(sem_sel) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SEMINIT(max_value, init_value, sem_sel) \
+  TT_OP(0xa3, (((max_value) << 20) + ((init_value) << 16) + ((sem_sel) << 2))) // first TT_OP argument 0xa3 (presumably the opcode); shifts: max_value 20, init_value 16, sem_sel 2 (two lowest operand bits stay zero)
+#define TT_SEMINIT_VALID(max_value, init_value, sem_sel) \
+  (ckernel::is_valid(max_value, 4) && ckernel::is_valid(init_value, 4) && ckernel::is_valid(sem_sel, 14)) // is_valid sizes (presumably bit widths) 4+4+14 = 22; with the shift of 2 on sem_sel this gives 24
+#define TT_SEMINIT(max_value, init_value, sem_sel) \
+  ckernel::instrn_buffer[0] = TT_OP_SEMINIT(max_value, init_value, sem_sel) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SEMINIT(max_value, init_value, sem_sel) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMINIT(max_value, init_value, sem_sel) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SEMPOST(sem_sel) \
+  TT_OP(0xa4, (((sem_sel) << 2))) // first TT_OP argument 0xa4 (presumably the opcode); sem_sel is shifted by 2, so the two lowest operand bits stay zero
+#define TT_SEMPOST_VALID(sem_sel) \
+  (ckernel::is_valid(sem_sel, 22)) // is_valid size 22 (presumably a bit width); 22 plus the shift of 2 gives 24
+#define TT_SEMPOST(sem_sel) \
+  ckernel::instrn_buffer[0] = TT_OP_SEMPOST(sem_sel) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SEMPOST(sem_sel) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMPOST(sem_sel) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SEMWAIT(stall_res, sem_sel, wait_sem_cond) \
+  TT_OP(0xa6, (((stall_res) << 15) + ((sem_sel) << 2) + ((wait_sem_cond) << 0))) // first TT_OP argument 0xa6 (presumably the opcode); shifts: stall_res 15, sem_sel 2, wait_sem_cond 0
+#define TT_SEMWAIT_VALID(stall_res, sem_sel, wait_sem_cond) \
+  (ckernel::is_valid(stall_res, 9) && ckernel::is_valid(sem_sel, 13) && ckernel::is_valid(wait_sem_cond, 2)) // is_valid sizes (presumably bit widths) 9+13+2 = 24, consistent with the shift spacing
+#define TT_SEMWAIT(stall_res, sem_sel, wait_sem_cond) \
+  ckernel::instrn_buffer[0] = TT_OP_SEMWAIT(stall_res, sem_sel, wait_sem_cond) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SEMWAIT(stall_res, sem_sel, wait_sem_cond) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMWAIT(stall_res, sem_sel, wait_sem_cond) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) \
+  TT_OP(0x50, (((CntSetMask) << 21) + ((ChannelIndex) << 20) + ((DimensionIndex) << 18) + ((Value) << 0))) // first TT_OP argument 0x50 (presumably the opcode); shifts: CntSetMask 21, ChannelIndex 20, DimensionIndex 18, Value 0
+#define TT_SETADC_VALID(CntSetMask, ChannelIndex, DimensionIndex, Value) \
+  (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(ChannelIndex, 1) && ckernel::is_valid(DimensionIndex, 2) && ckernel::is_valid(Value, 18)) // is_valid sizes (presumably bit widths) 3+1+2+18 = 24, consistent with the shift spacing
+#define TT_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) \
+  ckernel::instrn_buffer[0] = TT_OP_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETADCXX(CntSetMask, x_end2, x_start) \
+  TT_OP(0x5e, (((CntSetMask) << 21) + ((x_end2) << 10) + ((x_start) << 0))) // first TT_OP argument 0x5e (presumably the opcode); shifts: CntSetMask 21, x_end2 10, x_start 0
+#define TT_SETADCXX_VALID(CntSetMask, x_end2, x_start) \
+  (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(x_end2, 11) && ckernel::is_valid(x_start, 10)) // is_valid sizes (presumably bit widths) 3+11+10 = 24, consistent with the shift spacing
+#define TT_SETADCXX(CntSetMask, x_end2, x_start) \
+  ckernel::instrn_buffer[0] = TT_OP_SETADCXX(CntSetMask, x_end2, x_start) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETADCXX(CntSetMask, x_end2, x_start) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADCXX(CntSetMask, x_end2, x_start) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  TT_OP(0x51, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + ((BitMask) << 0))) // first TT_OP argument 0x51 (presumably the opcode); shifts: CntSetMask 21, Ch1_Y 15, Ch1_X 12, Ch0_Y 9, Ch0_X 6, BitMask 0
+#define TT_SETADCXY_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6)) // is_valid sizes (presumably bit widths) 3+6+3+3+3+6 = 24, consistent with the shift spacing
+#define TT_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  ckernel::instrn_buffer[0] = TT_OP_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  TT_OP(0x54, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + ((BitMask) << 0))) // first TT_OP argument 0x54 (presumably the opcode); shifts: CntSetMask 21, Ch1_Y 15, Ch1_X 12, Ch0_Y 9, Ch0_X 6, BitMask 0
+#define TT_SETADCZW_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6)) // is_valid sizes (presumably bit widths) 3+6+3+3+3+6 = 24, consistent with the shift spacing
+#define TT_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  ckernel::instrn_buffer[0] = TT_OP_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETASHRMH(reg_mask, halo_mask) \
+  TT_OP(0x1e, (((reg_mask) << 1) + ((halo_mask) << 0))) // first TT_OP argument 0x1e (presumably the opcode); shifts: reg_mask 1, halo_mask 0
+#define TT_SETASHRMH_VALID(reg_mask, halo_mask) \
+  (ckernel::is_valid(reg_mask, 23) && ckernel::is_valid(halo_mask, 1)) // is_valid sizes (presumably bit widths) 23+1 = 24, consistent with the shift spacing
+#define TT_SETASHRMH(reg_mask, halo_mask) \
+  ckernel::instrn_buffer[0] = TT_OP_SETASHRMH(reg_mask, halo_mask) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETASHRMH(reg_mask, halo_mask) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMH(reg_mask, halo_mask) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETASHRMH0(reg_mask, halo_mask) \
+  TT_OP(0x1a, (((reg_mask) << 1) + ((halo_mask) << 0))) // first TT_OP argument 0x1a (presumably the opcode); shifts: reg_mask 1, halo_mask 0
+#define TT_SETASHRMH0_VALID(reg_mask, halo_mask) \
+  (ckernel::is_valid(reg_mask, 23) && ckernel::is_valid(halo_mask, 1)) // is_valid sizes (presumably bit widths) 23+1 = 24, consistent with the shift spacing
+#define TT_SETASHRMH0(reg_mask, halo_mask) \
+  ckernel::instrn_buffer[0] = TT_OP_SETASHRMH0(reg_mask, halo_mask) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETASHRMH0(reg_mask, halo_mask) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMH0(reg_mask, halo_mask) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETASHRMH1(reg_mask, halo_mask) \
+  TT_OP(0x1b, (((reg_mask) << 1) + ((halo_mask) << 0))) // first TT_OP argument 0x1b (presumably the opcode); shifts: reg_mask 1, halo_mask 0
+#define TT_SETASHRMH1_VALID(reg_mask, halo_mask) \
+  (ckernel::is_valid(reg_mask, 23) && ckernel::is_valid(halo_mask, 1)) // is_valid sizes (presumably bit widths) 23+1 = 24, consistent with the shift spacing
+#define TT_SETASHRMH1(reg_mask, halo_mask) \
+  ckernel::instrn_buffer[0] = TT_OP_SETASHRMH1(reg_mask, halo_mask) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETASHRMH1(reg_mask, halo_mask) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMH1(reg_mask, halo_mask) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETASHRMV(reg_mask2) \
+  TT_OP(0x1c, (((reg_mask2) << 0))) // first TT_OP argument 0x1c (presumably the opcode); reg_mask2 is the whole operand word, unshifted
+#define TT_SETASHRMV_VALID(reg_mask2) \
+  (ckernel::is_valid(reg_mask2, 24)) // single is_valid size of 24 (presumably a bit width)
+#define TT_SETASHRMV(reg_mask2) \
+  ckernel::instrn_buffer[0] = TT_OP_SETASHRMV(reg_mask2) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETASHRMV(reg_mask2) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMV(reg_mask2) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETC16(setc16_reg, setc16_value) \
+  TT_OP(0xb2, (((setc16_reg) << 16) + ((setc16_value) << 0))) // first TT_OP argument 0xb2 (presumably the opcode); shifts: setc16_reg 16, setc16_value 0
+#define TT_SETC16_VALID(setc16_reg, setc16_value) \
+  (ckernel::is_valid(setc16_reg, 8) && ckernel::is_valid(setc16_value, 16)) // is_valid sizes (presumably bit widths) 8+16 = 24, consistent with the shift spacing
+#define TT_SETC16(setc16_reg, setc16_value) \
+  ckernel::instrn_buffer[0] = TT_OP_SETC16(setc16_reg, setc16_value) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETC16(setc16_reg, setc16_value) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETC16(setc16_reg, setc16_value) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \
+  TT_OP(0x45, (((Payload_SigSelSize) << 22) + ((Payload_SigSel) << 8) + ((SetSignalsMode) << 7) + ((RegIndex16b) << 0))) // first TT_OP argument 0x45 (presumably the opcode); shifts: Payload_SigSelSize 22, Payload_SigSel 8, SetSignalsMode 7, RegIndex16b 0
+#define TT_SETDMAREG_VALID(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \
+  (ckernel::is_valid(Payload_SigSelSize, 2) && ckernel::is_valid(Payload_SigSel, 14) && ckernel::is_valid(SetSignalsMode, 1) && ckernel::is_valid(RegIndex16b, 7)) // is_valid sizes (presumably bit widths) 2+14+1+7 = 24, consistent with the shift spacing
+#define TT_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \
+  ckernel::instrn_buffer[0] = TT_OP_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETDVALID(setvalid) \
+  TT_OP(0x57, (((setvalid) << 0))) // first TT_OP argument 0x57 (presumably the opcode); setvalid is the whole operand word, unshifted
+#define TT_SETDVALID_VALID(setvalid) \
+  (ckernel::is_valid(setvalid, 24)) // single is_valid size of 24 (presumably a bit width)
+#define TT_SETDVALID(setvalid) \
+  ckernel::instrn_buffer[0] = TT_OP_SETDVALID(setvalid) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETDVALID(setvalid) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETDVALID(setvalid) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) \
+  TT_OP(0x39, (((rwc_cr) << 18) + ((rwc_bias) << 6) + ((set_inc_ctrl) << 0))) // first TT_OP argument 0x39 (presumably the opcode); shifts: rwc_cr 18, rwc_bias 6, set_inc_ctrl 0
+#define TT_SETIBRWC_VALID(rwc_cr, rwc_bias, set_inc_ctrl) \
+  (ckernel::is_valid(rwc_cr, 6) && ckernel::is_valid(rwc_bias, 12) && ckernel::is_valid(set_inc_ctrl, 6)) // is_valid sizes (presumably bit widths) 6+12+6 = 24, consistent with the shift spacing
+#define TT_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) \
+  ckernel::instrn_buffer[0] = TT_OP_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETPKEDGOF(y_end, y_start, x_end, x_start) \
+  TT_OP(0x1d, (((y_end) << 12) + ((y_start) << 8) + ((x_end) << 4) + ((x_start) << 0))) // first TT_OP argument 0x1d (presumably the opcode); shifts: y_end 12, y_start 8, x_end 4, x_start 0
+#define TT_SETPKEDGOF_VALID(y_end, y_start, x_end, x_start) \
+  (ckernel::is_valid(y_end, 12) && ckernel::is_valid(y_start, 4) && ckernel::is_valid(x_end, 4) && ckernel::is_valid(x_start, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24; NOTE(review): y_end gets 12 while its siblings get 4 - presumably the top field absorbs the remaining bits, confirm against the ISA spec
+#define TT_SETPKEDGOF(y_end, y_start, x_end, x_start) \
+  ckernel::instrn_buffer[0] = TT_OP_SETPKEDGOF(y_end, y_start, x_end, x_start) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETPKEDGOF(y_end, y_start, x_end, x_start) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETPKEDGOF(y_end, y_start, x_end, x_start) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \
+  TT_OP(0x37, (((clear_ab_vld) << 22) + ((rwc_cr) << 18) + ((rwc_d) << 14) + ((rwc_b) << 10) + ((rwc_a) << 6) + ((BitMask) << 0))) // first TT_OP argument 0x37 (presumably the opcode); shifts: clear_ab_vld 22, rwc_cr 18, rwc_d 14, rwc_b 10, rwc_a 6, BitMask 0
+#define TT_SETRWC_VALID(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \
+  (ckernel::is_valid(clear_ab_vld, 2) && ckernel::is_valid(rwc_cr, 4) && ckernel::is_valid(rwc_d, 4) && ckernel::is_valid(rwc_b, 4) && ckernel::is_valid(rwc_a, 4) && ckernel::is_valid(BitMask, 6)) // is_valid sizes (presumably bit widths) 2+4+4+4+4+6 = 24, consistent with the shift spacing
+#define TT_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \
+  ckernel::instrn_buffer[0] = TT_OP_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x7d, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x7d (presumably the opcode); shifts: imm12_math 12, lreg_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPABS_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24, consistent with the shift spacing
+#define TT_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  TT_OP(0x85, (((lreg_src_a) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x85 (presumably the opcode); shifts: lreg_src_a 16, lreg_src_b 12, lreg_src_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPADD_VALID(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(lreg_src_a, 8) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 8+4+4+4+4 = 24; NOTE(review): lreg_src_a gets 8 while the other lreg operands get 4 - confirm against the ISA spec
+#define TT_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPADDI(imm16_math, lreg_dest, instr_mod1) \
+  TT_OP(0x75, (((imm16_math) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x75 (presumably the opcode); shifts: imm16_math 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPADDI_VALID(imm16_math, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm16_math, 16) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 16+4+4 = 24, consistent with the shift spacing
+#define TT_SFPADDI(imm16_math, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPADDI(imm16_math, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPADDI(imm16_math, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPADDI(imm16_math, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x7e, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x7e (presumably the opcode); shifts: imm12_math 12, lreg_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPAND_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24, consistent with the shift spacing
+#define TT_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) \
+  TT_OP(0x90, (((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x90 (presumably the opcode); shifts: lreg_src_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPCAST_VALID(lreg_src_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(lreg_src_c, 16) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 16+4+4 = 24; NOTE(review): lreg_src_c gets 16 while lreg_dest gets 4 - presumably the top field absorbs the remaining bits, confirm against the ISA spec
+#define TT_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x8b, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x8b (presumably the opcode); shifts: imm12_math 12, lreg_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPCOMPC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24, consistent with the shift spacing
+#define TT_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPCONFIG(imm16_math, config_dest, instr_mod1) \
+  TT_OP(0x91, (((imm16_math) << 8) + ((config_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x91 (presumably the opcode); shifts: imm16_math 8, config_dest 4, instr_mod1 0
+#define TT_SFPCONFIG_VALID(imm16_math, config_dest, instr_mod1) \
+  (ckernel::is_valid(imm16_math, 16) && ckernel::is_valid(config_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 16+4+4 = 24, consistent with the shift spacing
+#define TT_SFPCONFIG(imm16_math, config_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPCONFIG(imm16_math, config_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPCONFIG(imm16_math, config_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPCONFIG(imm16_math, config_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x76, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x76 (presumably the opcode); shifts: imm12_math 12, lreg_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPDIVP2_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24, consistent with the shift spacing
+#define TT_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x8a, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x8a (presumably the opcode); shifts: imm12_math 12, lreg_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPENCC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24, consistent with the shift spacing
+#define TT_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x77, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x77 (presumably the opcode); shifts: imm12_math 12, lreg_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPEXEXP_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24, consistent with the shift spacing
+#define TT_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x78, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x78 (presumably the opcode); shifts: imm12_math 12, lreg_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPEXMAN_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24, consistent with the shift spacing
+#define TT_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x79, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x79 (presumably the opcode); shifts: imm12_math 12, lreg_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPIADD_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24, consistent with the shift spacing
+#define TT_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  TT_OP(0x70, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((sfpu_addr_mode) << 14) + ((dest_reg_addr) << 0))) // first TT_OP argument 0x70 (presumably the opcode); shifts: lreg_ind 20, instr_mod0 16, sfpu_addr_mode 14, dest_reg_addr 0
+#define TT_SFPLOAD_VALID(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(sfpu_addr_mode, 2) && ckernel::is_valid(dest_reg_addr, 14)) // is_valid sizes (presumably bit widths) 4+4+2+14 = 24, consistent with the shift spacing
+#define TT_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPLOADI(lreg_ind, instr_mod0, imm16) \
+  TT_OP(0x71, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((imm16) << 0))) // first TT_OP argument 0x71 (presumably the opcode); shifts: lreg_ind 20, instr_mod0 16, imm16 0
+#define TT_SFPLOADI_VALID(lreg_ind, instr_mod0, imm16) \
+  (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(imm16, 16)) // is_valid sizes (presumably bit widths) 4+4+16 = 24, consistent with the shift spacing
+#define TT_SFPLOADI(lreg_ind, instr_mod0, imm16) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPLOADI(lreg_ind, instr_mod0, imm16) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPLOADI(lreg_ind, instr_mod0, imm16) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLOADI(lreg_ind, instr_mod0, imm16) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  TT_OP(0x93, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((sfpu_addr_mode) << 14) + ((dest_reg_addr) << 0))) // first TT_OP argument 0x93 (presumably the opcode); shifts: lreg_ind 20, instr_mod0 16, sfpu_addr_mode 14, dest_reg_addr 0
+#define TT_SFPLOADMACRO_VALID(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(sfpu_addr_mode, 2) && ckernel::is_valid(dest_reg_addr, 14)) // is_valid sizes (presumably bit widths) 4+4+2+14 = 24, consistent with the shift spacing
+#define TT_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) \
+  TT_OP(0x73, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((dest_reg_addr) << 0))) // first TT_OP argument 0x73 (presumably the opcode); shifts: lreg_ind 20, instr_mod0 16, dest_reg_addr 0
+#define TT_SFPLUT_VALID(lreg_ind, instr_mod0, dest_reg_addr) \
+  (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(dest_reg_addr, 16)) // is_valid sizes (presumably bit widths) 4+4+16 = 24, consistent with the shift spacing
+#define TT_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPLUTFP32(lreg_dest, instr_mod1) \
+  TT_OP(0x95, (((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x95 (presumably the opcode); shifts: lreg_dest 4, instr_mod1 0
+#define TT_SFPLUTFP32_VALID(lreg_dest, instr_mod1) \
+  (ckernel::is_valid(lreg_dest, 20) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 20+4 = 24; NOTE(review): lreg_dest gets 20 here versus 4 in sibling SFP macros - presumably the top field absorbs the remaining bits, confirm against the ISA spec
+#define TT_SFPLUTFP32(lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPLUTFP32(lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPLUTFP32(lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLUTFP32(lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x81, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x81 (presumably the opcode); shifts: imm12_math 12, lreg_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPLZ_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24, consistent with the shift spacing
+#define TT_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  TT_OP(0x84, (((lreg_src_a) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x84 (presumably the opcode); shifts: lreg_src_a 16, lreg_src_b 12, lreg_src_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPMAD_VALID(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(lreg_src_a, 8) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 8+4+4+4+4 = 24; NOTE(review): lreg_src_a gets 8 while the other lreg operands get 4 - confirm against the ISA spec
+#define TT_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x7c, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) // first TT_OP argument 0x7c (presumably the opcode); shifts: imm12_math 12, lreg_c 8, lreg_dest 4, instr_mod1 0
+#define TT_SFPMOV_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) // is_valid sizes (presumably bit widths) 12+4+4+4 = 24, consistent with the shift spacing
+#define TT_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) // assigns the encoded word to ckernel::instrn_buffer[0]; the caller supplies the semicolon
+#define TTI_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) )) // hands the TRISC_OP_SWIZZLE result to INSTRUCTION_WORD
+
+#define TT_OP_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  TT_OP(0x86, (((lreg_src_a) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPMUL_VALID(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(lreg_src_a, 8) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1)
+#define TTI_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPMULI(imm16_math, lreg_dest, instr_mod1) \
+  TT_OP(0x74, (((imm16_math) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPMULI_VALID(imm16_math, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm16_math, 16) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPMULI(imm16_math, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPMULI(imm16_math, lreg_dest, instr_mod1)
+#define TTI_SFPMULI(imm16_math, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMULI(imm16_math, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPNOP\
+  TT_OP(0x8f, 0)
+#define TTI_SFPNOP\
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPNOP))
+
+#define TT_OP_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x80, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPNOT_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x7f, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPOR_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x88, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPPOPC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x87, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPPUSHC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x7b, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPSETCC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x82, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPSETEXP_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x83, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPSETMAN_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x89, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPSETSGN_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x7a, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPSHFT_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \
+  TT_OP(0x94, (((imm12_math) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPSHFT2_VALID(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1)
+#define TTI_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  TT_OP(0x72, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((sfpu_addr_mode) << 14) + ((dest_reg_addr) << 0)))
+#define TT_SFPSTORE_VALID(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(sfpu_addr_mode, 2) && ckernel::is_valid(dest_reg_addr, 14))
+#define TT_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr)
+#define TTI_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) ))
+
+#define TT_OP_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \
+  TT_OP(0x92, (((imm12_math) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPSWAP_VALID(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1)
+#define TTI_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x8c, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPTRANSP_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  TT_OP(0x8d, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFPXOR_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1)
+#define TTI_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  TT_OP(0x8e, (((rnd_mode) << 21) + ((imm8_math) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0)))
+#define TT_SFP_STOCH_RND_VALID(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  (ckernel::is_valid(rnd_mode, 3) && ckernel::is_valid(imm8_math, 5) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4))
+#define TT_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  ckernel::instrn_buffer[0] = TT_OP_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1)
+#define TTI_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) ))
+
+#define TT_OP_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  TT_OP(0x5c, (((OpBisConst) << 23) + ((OpSel) << 18) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0)))
+#define TT_SHIFTDMAREG_VALID(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(OpSel, 5) && ckernel::is_valid(ResultRegIndex, 6) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6))
+#define TT_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex)
+#define TTI_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) ))
+
+#define TT_OP_SHIFTXA(log2_amount2, shift_mode) \
+  TT_OP(0x17, (((log2_amount2) << 2) + ((shift_mode) << 0)))
+#define TT_SHIFTXA_VALID(log2_amount2, shift_mode) \
+  (ckernel::is_valid(log2_amount2, 22) && ckernel::is_valid(shift_mode, 2))
+#define TT_SHIFTXA(log2_amount2, shift_mode) \
+  ckernel::instrn_buffer[0] = TT_OP_SHIFTXA(log2_amount2, shift_mode)
+#define TTI_SHIFTXA(log2_amount2, shift_mode) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SHIFTXA(log2_amount2, shift_mode) ))
+
+#define TT_OP_SHIFTXB(addr_mode, rot_shift, shift_row) \
+  TT_OP(0x18, (((addr_mode) << 15) + ((rot_shift) << 10) + ((shift_row) << 0)))
+#define TT_SHIFTXB_VALID(addr_mode, rot_shift, shift_row) \
+  (ckernel::is_valid(addr_mode, 9) && ckernel::is_valid(rot_shift, 5) && ckernel::is_valid(shift_row, 10))
+#define TT_SHIFTXB(addr_mode, rot_shift, shift_row) \
+  ckernel::instrn_buffer[0] = TT_OP_SHIFTXB(addr_mode, rot_shift, shift_row)
+#define TTI_SHIFTXB(addr_mode, rot_shift, shift_row) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SHIFTXB(addr_mode, rot_shift, shift_row) ))
+
+#define TT_OP_STALLWAIT(stall_res, wait_res) \
+  TT_OP(0xa2, (((stall_res) << 15) + ((wait_res) << 0)))
+#define TT_STALLWAIT_VALID(stall_res, wait_res) \
+  (ckernel::is_valid(stall_res, 9) && ckernel::is_valid(wait_res, 15))
+#define TT_STALLWAIT(stall_res, wait_res) \
+  ckernel::instrn_buffer[0] = TT_OP_STALLWAIT(stall_res, wait_res)
+#define TTI_STALLWAIT(stall_res, wait_res) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_STALLWAIT(stall_res, wait_res) ))
+
+#define TT_OP_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \
+  TT_OP(0x66, (((MemHierSel) << 23) + ((SizeSel) << 22) + ((RegSizeSel) << 21) + ((OffsetIndex) << 14) + ((AutoIncSpec) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0)))
+#define TT_STOREIND_VALID(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \
+  (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(SizeSel, 1) && ckernel::is_valid(RegSizeSel, 1) && ckernel::is_valid(OffsetIndex, 7) && ckernel::is_valid(AutoIncSpec, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6))
+#define TT_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex)
+#define TTI_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) ))
+
+#define TT_OP_STOREREG(TdmaDataRegIndex, RegAddr) \
+  TT_OP(0x67, (((TdmaDataRegIndex) << 18) + ((RegAddr) << 0)))
+#define TT_STOREREG_VALID(TdmaDataRegIndex, RegAddr) \
+  (ckernel::is_valid(TdmaDataRegIndex, 6) && ckernel::is_valid(RegAddr, 18))
+#define TT_STOREREG(TdmaDataRegIndex, RegAddr) \
+  ckernel::instrn_buffer[0] = TT_OP_STOREREG(TdmaDataRegIndex, RegAddr)
+#define TTI_STOREREG(TdmaDataRegIndex, RegAddr) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_STOREREG(TdmaDataRegIndex, RegAddr) ))
+
+#define TT_OP_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  TT_OP(0x59, (((OpBisConst) << 23) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0)))
+#define TT_SUBDMAREG_VALID(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(ResultRegIndex, 11) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6))
+#define TT_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  ckernel::instrn_buffer[0] = TT_OP_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex)
+#define TTI_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) ))
+
+#define TT_OP_TBUFCMD\
+  TT_OP(0x4b, 0)
+#define TTI_TBUFCMD\
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_TBUFCMD))
+
+#define TT_OP_TRNSPSRCA\
+  TT_OP(0x14, 0)
+#define TTI_TRNSPSRCA\
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_TRNSPSRCA))
+
+#define TT_OP_TRNSPSRCB\
+  TT_OP(0x16, 0)
+#define TTI_TRNSPSRCB\
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_TRNSPSRCB))
+
+#define TT_OP_UNPACR(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) \
+  TT_OP(0x42, (((Unpack_block_selection) << 23) + ((AddrMode) << 15) + ((CfgContextCntInc) << 13) + ((CfgContextId) << 10) + ((AddrCntContextId) << 8) + ((OvrdThreadId) << 7) + ((SetDatValid) << 6) + ((rareb_en) << 5) + ((ZeroWrite2) << 4) + ((AutoIncContextID) << 3) + ((RowSearch) << 2) + ((SearchCacheFlush) << 1) + ((Last) << 0)))
+#define TT_UNPACR_VALID(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) \
+  (ckernel::is_valid(Unpack_block_selection, 1) && ckernel::is_valid(AddrMode, 8) && ckernel::is_valid(CfgContextCntInc, 2) && ckernel::is_valid(CfgContextId, 3) && ckernel::is_valid(AddrCntContextId, 2) && ckernel::is_valid(OvrdThreadId, 1) && ckernel::is_valid(SetDatValid, 1) && ckernel::is_valid(rareb_en, 1) && ckernel::is_valid(ZeroWrite2, 1) && ckernel::is_valid(AutoIncContextID, 1) && ckernel::is_valid(RowSearch, 1) && ckernel::is_valid(SearchCacheFlush, 1) && ckernel::is_valid(Last, 1))
+#define TT_UNPACR(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) \
+  ckernel::instrn_buffer[0] = TT_OP_UNPACR(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last)
+#define TTI_UNPACR(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_UNPACR(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) ))
+
+#define TT_OP_UNPACR_NOP(Unpack_block_selection, NoOp) \
+  TT_OP(0x43, (((Unpack_block_selection) << 23) + ((NoOp) << 0)))
+#define TT_UNPACR_NOP_VALID(Unpack_block_selection, NoOp) \
+  (ckernel::is_valid(Unpack_block_selection, 1) && ckernel::is_valid(NoOp, 23))
+#define TT_UNPACR_NOP(Unpack_block_selection, NoOp) \
+  ckernel::instrn_buffer[0] = TT_OP_UNPACR_NOP(Unpack_block_selection, NoOp)
+#define TTI_UNPACR_NOP(Unpack_block_selection, NoOp) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_UNPACR_NOP(Unpack_block_selection, NoOp) ))
+
+#define TT_OP_WRCFG(GprAddress, wr128b, CfgReg) \
+  TT_OP(0xb0, (((GprAddress) << 16) + ((wr128b) << 15) + ((CfgReg) << 0)))
+#define TT_WRCFG_VALID(GprAddress, wr128b, CfgReg) \
+  (ckernel::is_valid(GprAddress, 8) && ckernel::is_valid(wr128b, 1) && ckernel::is_valid(CfgReg, 15))
+#define TT_WRCFG(GprAddress, wr128b, CfgReg) \
+  ckernel::instrn_buffer[0] = TT_OP_WRCFG(GprAddress, wr128b, CfgReg)
+#define TTI_WRCFG(GprAddress, wr128b, CfgReg) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_WRCFG(GprAddress, wr128b, CfgReg) ))
+
+#define TT_OP_XMOV(Mov_block_selection, Last) \
+  TT_OP(0x40, (((Mov_block_selection) << 23) + ((Last) << 0)))  // opcode 0x40: Mov_block_selection at bit 23, Last from bit 0
+#define TT_XMOV_VALID(Mov_block_selection, Last) \
+  (ckernel::is_valid(Mov_block_selection, 1) && ckernel::is_valid(Last, 23))  // widths 1 and 23 mirror the field shifts in TT_OP_XMOV
+#define TT_XMOV(Mov_block_selection, Last) \
+  ckernel::instrn_buffer[0] = TT_OP_XMOV(Mov_block_selection, Last)  // stores the encoded word into instrn_buffer[0]
+#define TTI_XMOV(Mov_block_selection, Last) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_XMOV(Mov_block_selection, Last) ))  // same encoding, swizzled and passed to INSTRUCTION_WORD
+
+#define TT_OP_ZEROACC(clear_mode, AddrMode, dst) \
+  TT_OP(0x10, (((clear_mode) << 19) + ((AddrMode) << 15) + ((dst) << 0)))
+#define TT_ZEROACC_VALID(clear_mode, AddrMode, dst) \
+  (ckernel::is_valid(clear_mode, 5) && ckernel::is_valid(AddrMode, 4) && ckernel::is_valid(dst, 15))
+#define TT_ZEROACC(clear_mode, AddrMode, dst) \
+  ckernel::instrn_buffer[0] = TT_OP_ZEROACC(clear_mode, AddrMode, dst)
+#define TTI_ZEROACC(clear_mode, AddrMode, dst) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ZEROACC(clear_mode, AddrMode, dst) ))
+
+#define TT_OP_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) \
+  TT_OP(0x11, (((zero_val) << 4) + ((write_mode) << 3) + ((bank_mask) << 2) + ((src_mask) << 0)))
+#define TT_ZEROSRC_VALID(zero_val, write_mode, bank_mask, src_mask) \
+  (ckernel::is_valid(zero_val, 20) && ckernel::is_valid(write_mode, 1) && ckernel::is_valid(bank_mask, 1) && ckernel::is_valid(src_mask, 2))
+#define TT_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) \
+  ckernel::instrn_buffer[0] = TT_OP_ZEROSRC(zero_val, write_mode, bank_mask, src_mask)
+#define TTI_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) \
+  INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) ))
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_pcbuf.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_pcbuf.h
index d0c2c755411..ef70dc53f88 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_pcbuf.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_pcbuf.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 // Functions for encoding and decoding PC buffer writes
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h
index 0dd06c65dc8..38d054cd6b1 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #include "ckernel_template.h"
 #include "ckernel.h"
 #include "cmath_common.h"
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h
index e7b80e9cabf..b00ea4a0b1f 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h
@@ -2,12 +2,12 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include "ckernel_defs.h"
 #include "noc_nonblocking_api.h"
 #include "ckernel.h"
-#include "llk_defs.h"
 #include <limits>
 
 #include "sfpi.h"
@@ -20,22 +20,27 @@ namespace sfpu
 {
 
 
-inline void sfpu_load_imm32(const uint dest, const uint val)
+inline void _sfpu_load_imm32_(const uint dest, const uint val)
+{
+        TT_SFPLOADI(dest, 10, (val & 0xFFFF));  // insmod == 10 will write the lower bits, and not affect the upper bits;
+        TT_SFPLOADI(dest, 8, (val>>16) & 0xFFFF);  // insmod == 8 will write the upper bits, and not affect the lower bits;
+}
+
+inline void _sfpu_load_imm16_(const uint dest, const uint val)
 {
-        TT_SFPLOADI(dest, 0xA, (val & 0xFFFF));  // insmod == A will write the lower bits, and not affect the upper bits;
-        TT_SFPLOADI(dest, 0x8, (val>>16) & 0xFFFF);  // insmod == 8 will write the upper bits, and not affect the lower bits;
+        TT_SFPLOADI(dest, 2, val);  // insmod == 2 will write imm16 value treated as unsigned integer, right justified and padded with zeroes on the MSBs
 }
 
-inline void sfpu_load_config32(const uint dest, const uint upper16, const uint lower16)
+inline void _sfpu_load_config32_(const uint dest, const uint upper16, const uint lower16)
 {
         // registers 11 through 14 are programmable "constants" which are shared across all 4 rows
         // They are updated only through the CONFIG path, which uses LREG[0] first and then copies it to the desired register location
-        TTI_SFPLOADI(0, 0xA, lower16);  // insmod == A will write the lower bits, and not affect the upper bits;
-        TTI_SFPLOADI(0, 0x8, upper16);  // insmod == 8 will write the upper bits, and not affect the lower bits;
+        TTI_SFPLOADI(0, 10, lower16);  // insmod == 10 (0xA) will write the lower bits, and not affect the upper bits;
+        TTI_SFPLOADI(0, 8, upper16);  // insmod == 8 will write the upper bits, and not affect the lower bits;
         TTI_SFPCONFIG(0, dest, 0);
 }
 
-sfpi_inline vInt sfpu_is_fp16_zero(const vFloat& v, uint exponent_size_8)
+sfpi_inline vInt _sfpu_is_fp16_zero_(const vFloat& v, uint exponent_size_8)
 {
     if (exponent_size_8) {
         // fp16b
@@ -114,7 +119,8 @@ sfpi_inline vFloat _sfpu_reciprocal_(const vFloat in)
     return setexp(result, new_exp);
 }
 
-inline void init_dropout_seed(uint16_t p2){
+inline void _init_dropout_seed_(uint16_t p2){
+    FWLOG1("calculate_dropout() -- input seed:%x", p2);
 
     uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(0, 0, NOC_NODE_ID);
 
@@ -123,6 +129,8 @@ inline void init_dropout_seed(uint16_t p2){
 
     uint16_t per_tensix_input_seed = p2 ^ (my_x << my_y);
 
+    FWLOG1("calculate_dropout() -- calculated seed:%x", per_tensix_input_seed);
+
     vInt result = l_reg[LRegs::LReg3];
 
     vInt tmp = vConstTileId << 10;
@@ -133,217 +141,187 @@ inline void init_dropout_seed(uint16_t p2){
 }
 
 template <bool APPROXIMATION_MODE>
-inline void configure_programmable_constants(SfpuType operation)
+inline void _init_exponential_()
 {
-    switch (operation) {
-    case SfpuType::gelu:
-        vConstFloatPrgm0 = 0.5f;
-        break;
-    case SfpuType::exponential:
-        if (APPROXIMATION_MODE) {
-            vConstFloatPrgm0 = 1.442695f; // ln2_recip
-            vConstFloatPrgm1 = s2vFloat16b(p_exp::C23_73);
-            vConstFloatPrgm2 = s2vFloat16b(p_exp::ADJ_EXP);
-            break;
-        }
-
-
-
-        // Fall through
-    case SfpuType::gelu_derivative:
-        vConstFloatPrgm2 = 0.863281f;
-
-        // Fall through
-    case SfpuType::reciprocal:
+    if (APPROXIMATION_MODE) {
+        vConstFloatPrgm0 = 1.442695f; // ln2_recip
+        vConstFloatPrgm1 = s2vFloat16b(p_exp::C23_73);
+        vConstFloatPrgm2 = s2vFloat16b(p_exp::ADJ_EXP);
+    } else {
         vConstFloatPrgm0 = 1.442695f; // ln2_recip
         vConstFloatPrgm1 = 2.0f;
-        break;
-
-    case SfpuType::log:
-        // ln2
-        vConstFloatPrgm0 = 0.692871f; // ln2
+        vConstFloatPrgm2 = 0.863281f;
+    }
+}
 
-        // XXXXX could do these to higher precision
-        vConstFloatPrgm1 = 0.1058f;
-        vConstFloatPrgm2 = -0.7166f;
-        break;
+template <bool APPROXIMATION_MODE>
+inline void _init_reciprocal_()
+{
+    vConstFloatPrgm0 = 1.442695f; // ln2_recip
+    vConstFloatPrgm1 = 2.0f;
+}
 
-    case SfpuType::sqrt:
-        if (APPROXIMATION_MODE) {
-            vConstFloatPrgm0 = s2vFloat16b(127 << 7);
-        } else {
-            vConstFloatPrgm0 = s2vFloat16b(0x5f37);
-        }
-        break;
+template <bool APPROXIMATION_MODE>
+inline void _init_log_()
+{
+    vConstFloatPrgm0 = 0.692871f; // ln2
 
-    case SfpuType::dropout:
-        vConstIntPrgm0 = 0xb400;
-        vConstIntPrgm1 = 0x1; // binary 0b1 - used to extract LSB
-        break;
+    // XXXXX could do these to higher precision
+    vConstFloatPrgm1 = 0.1058f;
+    vConstFloatPrgm2 = -0.7166f;
+}
 
-    default:
-        // Should result in compile time error??
-        break;
+template <bool APPROXIMATION_MODE>
+inline void _init_sqrt_()
+{
+    if (APPROXIMATION_MODE) {
+        vConstFloatPrgm0 = s2vFloat16b(127 << 7);
+    } else {
+        vConstFloatPrgm0 = s2vFloat16b(0x5f37);
     }
 }
 
 template <bool APPROXIMATION_MODE>
-inline void sfpu_init(SfpuType operation, uint param0 = 0)
+inline void _init_tanh_()
 {
-    configure_programmable_constants<APPROXIMATION_MODE>(operation);
     uint imm0;
     uint imm1;
     uint imm2;
-    uint imm0_high;
-    uint imm0_low;
-    uint imm1_high;
-    uint imm1_low;
-    uint imm2_high;
-    uint imm2_low;
-    uint imm3_high;
-    uint imm3_low;
-    uint imm4_high;
-    uint imm4_low;
-    uint imm5_high;
-    uint imm5_low;
-    switch (operation) {
-    case SfpuType::tanh:
-    case SfpuType::tanh_derivative:
-        imm0 = 0x1DFF; //0.90625*x
-        imm1 = 0x481A; //0.09375*x + 0.8125
-        imm2 = 0xFF00; //1
-        TTI_SFPLOADI(0, 2, imm0);
-        TTI_SFPLOADI(1, 2, imm1);
-        TTI_SFPLOADI(2, 2, imm2);
-        break;
-    case SfpuType::sigmoid:
-        // imm0 = 0x3DFF;
-        // imm1 = 0x21D8;
-        // imm2 = 0xFF10;
-        // TTI_SFPLOADI(0, 2, imm0);
-        // TTI_SFPLOADI(1, 2, imm1);
-        // TTI_SFPLOADI(2, 2, imm2);
-        // Using a 6 piece LUT to calculate and model sigmoid  directly
-        // x <= 0.5 --> 0.2452x + (-0.0004997)
-        // x <= 1.0 --> 0.2173x + 0.0152
-        // x <= 1.5 --> 0.1731x + 0.05988
-        // x <= 2.0 --> 0.1262x + 0.1298
-        // x <= 4.0 --> 0.0485x + 0.2998
-        // x >  4.0 --> 0.4998
-
-        // imm0[15:0] = A0=0.2452 = 0x33D9 -- imm0[31:16] = A1=0.2173 = 0x32F4
-        sfpu_load_imm32(0,0x32F433D9);
-        // imm4[15:0] = B0= -0.0004997  = 0x9018 -- imm4[31:16] = B1= 0.0152 = 0x23c8
-        sfpu_load_imm32(4,0x23C89018);
-
-        // imm1[15:0] = A2=0.1731 = 0x318a -- imm1[31:16] = A3=0.1262 = 0x300a
-        sfpu_load_imm32(1,0x300A318A);
-        // imm5[15:0] = B2=0.05988 = 0x2BAA -- imm5[31:16] = B3=0.1298 = 0x3027
-        sfpu_load_imm32(5,0x30272BAA);
-
-        // imm2[15:0] = A4=0.0485 = 0x2A35 -- imm2[31:16] = A5=0.0 = 0x7C00
-        sfpu_load_imm32(2,0x7C002A35);
-        // imm6[15:0] = B4=0.2998 = 0x34CC -- imm6[31:16] = B5=0.4998 = 0x37ff
-        sfpu_load_imm32(6,0x37ff34CC);
-
-        break;
-    case SfpuType::gelu_derivative:
-        if constexpr (APPROXIMATION_MODE) {
-            // Using a 6 piece LUT to calculate and model gelu_derivative directly
-            // x <= 0.5 --> 0.8x + 0.5
-            // x <= 1.0 --> 0.4x + 0.7
-            // x <= 1.5 --> 0.1x + 0.99
-            // x <= 2.0 --> -0.09x + 1.27
-            // x <= 3.0 --> -0.075x + 1.235
-            // x >  3.0 --> 1.0
-            // imm0[15:0] = A0=0.8    = 0x3A66 -- imm0[31:16] = A1=0.4   = 0x3666
-            imm0_high = 0x3666;
-            imm0_low  = 0x3A66;
-            // imm1[15:0] = A2=0.1    = 0x2E66 -- imm1[31:16] = A3=-0.09 = 0xADC3
-            imm1_high = 0xADC3;
-            imm1_low  = 0x2E66;
-            // imm2[15:0] = A4=-0.075 = 0xACCD -- imm2[31:16] = A5=0     = 0x7C00
-            imm2_high = 0x7C00;
-            imm2_low  = 0xACCD;
-            // imm3[15:0] = B0=0.5    = 0x3800 -- imm3[31:16] = B1=0.7   = 0x399A
-            imm3_high = 0x399A;
-            imm3_low  = 0x3800;
-            // imm4[15:0] = B2=0.99   = 0x3BEC -- imm4[31:16] = B3=1.27  = 0x3D14
-            imm4_high = 0x3D14;
-            imm4_low  = 0x3BEC;
-            // imm5[15:0] = B4=1.235  = 0x3CF1 -- imm5[31:16] = B5=1.0   = 0x3C00
-            imm5_high = 0x3C00;
-            imm5_low  = 0x3CF1;
-            TTI_SFPLOADI(0, 10, imm0_low);
-            TTI_SFPLOADI(0,  8, imm0_high);
-            TTI_SFPLOADI(1, 10, imm1_low);
-            TTI_SFPLOADI(1,  8, imm1_high);
-            TTI_SFPLOADI(2, 10, imm2_low);
-            TTI_SFPLOADI(2,  8, imm2_high);
-            TTI_SFPLOADI(4, 10, imm3_low);
-            TTI_SFPLOADI(4,  8, imm3_high);
-            TTI_SFPLOADI(5, 10, imm4_low);
-            TTI_SFPLOADI(5,  8, imm4_high);
-            TTI_SFPLOADI(6, 10, imm5_low);
-            TTI_SFPLOADI(6,  8, imm5_high);
-        } else {
-            imm0 = 0x28FF;
-            imm1 = 0x3020;
-            TTI_SFPLOADI(0, 2, imm0);
-            TTI_SFPLOADI(1, 2, imm1);
-        }
-        break;
-    case SfpuType::gelu:
-        // //SG: FIXME
-        // imm0 = 0x18FF;
-        // imm1 = (APPROXIMATION_MODE)? 0x212C : 0x2010;
-        // imm2 = 0xFF00;
-        // TTI_SFPLOADI(0, 2, imm0);
-        // TTI_SFPLOADI(1, 2, imm1);
-        // TTI_SFPLOADI(2, 2, imm2);
-
-        // // >= 3.0f
-        // lreg2_hi=0.50;//3800
-        // lreg6_hi=0.0f;//7c00
-        // // 2.0f -> 3.0f
-        // lreg2_lo= 0.5402f;//3852
-        // lreg6_lo= -0.1194f;//AFA4
-        // // 1.5f -> 2.0f
-        // lreg1_hi= .6099f; //38E1
-        // lreg5_hi= -.2635f; //B437
-        // // 1.0f -> 1.5f
-        // lreg1_lo=0.6189;//38F3
-        // lreg5_lo=-.2797;//B479
-        // // 0.5f -> 1.0f
-        // lreg0_hi=.4939f;//37E7
-        // lreg4_hi=-.1605f;//B122
-        // // 0.0f -> 0.5f
-        // lreg0_lo=0.1928f;//322B
-        // lreg4_lo=-0.0150f;//A3AE
-        sfpu_load_imm32(0,0x37E7322B);
-        //sfpu_load_imm32(4,0xB122A3AE);
-        sfpu_load_imm32(4,0xB12286D8);
-
-
-        sfpu_load_imm32(1,0x38E138F3);
-        sfpu_load_imm32(5,0xB437B479);
-
-        sfpu_load_imm32(2,0x38003852);
-        sfpu_load_imm32(6,0x7c00afa4);
-
-        break;
-    case SfpuType::dropout:
-        init_dropout_seed(param0);
-        break;
-    case SfpuType::quant_int32:
-    case SfpuType::requant_int32:
-    case SfpuType::dequant_int32:
-        sfpu_load_imm32(2,param0);
-        break;
-    default:
-        // Should result in compile time error??
-        break;
+    imm0 = 0x1DFF; //0.90625*x
+    imm1 = 0x481A; //0.09375*x + 0.8125
+    imm2 = 0xFF00; //1
+    _sfpu_load_imm16_(0, imm0);
+    _sfpu_load_imm16_(1, imm1);
+    _sfpu_load_imm16_(2, imm2);
+}
+
+template <bool APPROXIMATION_MODE>
+inline void _init_sigmoid_()
+{
+    // imm0 = 0x3DFF;
+    // imm1 = 0x21D8;
+    // imm2 = 0xFF10;
+    // TTI_SFPLOADI(0, 2, imm0);
+    // TTI_SFPLOADI(1, 2, imm1);
+    // TTI_SFPLOADI(2, 2, imm2);
+    // Using a 6 piece LUT to calculate and model sigmoid  directly
+    // x <= 0.5 --> 0.2452x + (-0.0004997)
+    // x <= 1.0 --> 0.2173x + 0.0152
+    // x <= 1.5 --> 0.1731x + 0.05988
+    // x <= 2.0 --> 0.1262x + 0.1298
+    // x <= 4.0 --> 0.0485x + 0.2998
+    // x >  4.0 --> 0.4998
+
+    // imm0[15:0] = A0=0.2452 = 0x33D9 -- imm0[31:16] = A1=0.2173 = 0x32F4
+    _sfpu_load_imm32_(0,0x32F433D9);
+    // imm4[15:0] = B0= -0.0004997  = 0x9018 -- imm4[31:16] = B1= 0.0152 = 0x23c8
+    _sfpu_load_imm32_(4,0x23C89018);
+
+    // imm1[15:0] = A2=0.1731 = 0x318a -- imm1[31:16] = A3=0.1262 = 0x300a
+    _sfpu_load_imm32_(1,0x300A318A);
+    // imm5[15:0] = B2=0.05988 = 0x2BAA -- imm5[31:16] = B3=0.1298 = 0x3027
+    _sfpu_load_imm32_(5,0x30272BAA);
+
+    // imm2[15:0] = A4=0.0485 = 0x2A35 -- imm2[31:16] = A5=0.0 = 0x7C00
+    _sfpu_load_imm32_(2,0x7C002A35);
+    // imm6[15:0] = B4=0.2998 = 0x34CC -- imm6[31:16] = B5=0.4998 = 0x37ff
+    _sfpu_load_imm32_(6,0x37ff34CC);
+}
+
+template <bool APPROXIMATION_MODE>
+inline void _init_gelu_derivative_()
+{
+    vConstFloatPrgm0 = 1.442695f; // ln2_recip
+    vConstFloatPrgm1 = 2.0f;
+    vConstFloatPrgm2 = 0.863281f;
+
+    uint imm0;
+    uint imm1;
+    uint imm2;
+    uint imm3;
+    uint imm4;
+    uint imm5;
+
+    if constexpr (APPROXIMATION_MODE) {
+        // Using a 6 piece LUT to calculate and model gelu_derivative directly
+        // x <= 0.5 --> 0.8x + 0.5
+        // x <= 1.0 --> 0.4x + 0.7
+        // x <= 1.5 --> 0.1x + 0.99
+        // x <= 2.0 --> -0.09x + 1.27
+        // x <= 3.0 --> -0.075x + 1.235
+        // x >  3.0 --> 1.0
+        // imm0[15:0] = A0=0.8    = 0x3A66 -- imm0[31:16] = A1=0.4   = 0x3666
+        imm0 = 0x36663A66;
+        // imm1[15:0] = A2=0.1    = 0x2E66 -- imm1[31:16] = A3=-0.09 = 0xADC3
+        imm1 = 0xADC32E66;
+        // imm2[15:0] = A4=-0.075 = 0xACCD -- imm2[31:16] = A5=0     = 0x7C00
+        imm2 = 0x7C00ACCD;
+        // imm3[15:0] = B0=0.5    = 0x3800 -- imm3[31:16] = B1=0.7   = 0x399A
+        imm3 = 0x399A3800;
+        // imm4[15:0] = B2=0.99   = 0x3BEC -- imm4[31:16] = B3=1.27  = 0x3D14
+        imm4 = 0x3D143BEC;
+        // imm5[15:0] = B4=1.235  = 0x3CF1 -- imm5[31:16] = B5=1.0   = 0x3C00
+        imm5 = 0x3C003CF1;
+        _sfpu_load_imm32_(0, imm0);
+        _sfpu_load_imm32_(1, imm1);
+        _sfpu_load_imm32_(2, imm2);
+        _sfpu_load_imm32_(4, imm3);
+        _sfpu_load_imm32_(5, imm4);
+        _sfpu_load_imm32_(6, imm5);
+    } else {
+        imm0 = 0x28FF;
+        imm1 = 0x3020;
+        _sfpu_load_imm16_(0, imm0);
+        _sfpu_load_imm16_(1, imm1);
     }
+
+}
+
+template <bool APPROXIMATION_MODE>
+inline void _init_gelu_()
+{
+    vConstFloatPrgm0 = 0.5f;
+
+    // // >= 3.0f
+    // lreg2_hi=0.50;//3800
+    // lreg6_hi=0.0f;//7c00
+    // // 2.0f -> 3.0f
+    // lreg2_lo= 0.5402f;//3852
+    // lreg6_lo= -0.1194f;//AFA4
+    // // 1.5f -> 2.0f
+    // lreg1_hi= .6099f; //38E1
+    // lreg5_hi= -.2635f; //B437
+    // // 1.0f -> 1.5f
+    // lreg1_lo=0.6189;//38F3
+    // lreg5_lo=-.2797;//B479
+    // // 0.5f -> 1.0f
+    // lreg0_hi=.4939f;//37E7
+    // lreg4_hi=-.1605f;//B122
+    // // 0.0f -> 0.5f
+    // lreg0_lo=0.1928f;//322B
+    // lreg4_lo=-0.0150f;//A3AE (NOTE: the load below actually writes 0x86D8, ~ -1.04e-4 as fp16, not 0xA3AE)
+    _sfpu_load_imm32_(0,0x37E7322B);
+    _sfpu_load_imm32_(4,0xB12286D8);
+
+    _sfpu_load_imm32_(1,0x38E138F3);
+    _sfpu_load_imm32_(5,0xB437B479);
+
+    _sfpu_load_imm32_(2,0x38003852);
+    _sfpu_load_imm32_(6,0x7c00afa4);
+
+}
+
+inline void _init_dropout_(const uint seed)
+{
+    vConstIntPrgm0 = 0xb400;
+    vConstIntPrgm1 = 0x1; // binary 0b1 - used to extract LSB
+
+    _init_dropout_seed_(seed);
+}
+
+inline void init_quant_zero_point(const uint zero_point)
+{
+    _sfpu_load_imm32_(2,zero_point);
 }
 
 template <bool APPROXIMATION_MODE>
@@ -403,7 +381,7 @@ void calculate_cube(uint16_t exp_base_scale_factor = 0)
 */
 
 template <bool APPROXIMATION_MODE, bool SCALE_EN, int ITERATIONS>
-void calculate_exponential(const int iterations, uint16_t exp_base_scale_factor = 0)
+void _calculate_exponential_(const int iterations, uint16_t exp_base_scale_factor = 0)
 {
     // Unroll 8 best for approx, unroll 0 for precise, compiler figures this out
     for (int d = 0; d < iterations; d++)
@@ -471,7 +449,7 @@ inline vFloat _calculate_gelu_core_(vFloat in)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_gelu(const int iterations)
+inline void _calculate_gelu_(const int iterations)
 {
 
     vUInt l0 = l_reg[LRegs::LReg0];
@@ -485,7 +463,7 @@ inline void calculate_gelu(const int iterations)
     for (int d = 0; d < iterations; d++)
     {
         // vFloat in = dst_reg[0];
-        // vFloat result = _calculate_gelu_core_<APPROXIMATION_MODE>(in);
+        // vFloat result = _calculate_gelu_core_<APPROXIMATION_MODE>(in);
 
         // vFloat half_in = in * half;
         // result = lut(result, l0, l1, l2);
@@ -522,7 +500,7 @@ inline void calculate_gelu(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_sigmoid(const int iterations)
+inline void _calculate_sigmoid_(const int iterations)
 {
     constexpr int lut_mode = 0; // SFPLUTFP32_MOD0_FP16_6ENTRY_TABLE1
     vUInt l0 = l_reg[LRegs::LReg0];
@@ -553,7 +531,7 @@ inline void calculate_sigmoid(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_tanh(const int iterations)
+inline void _calculate_tanh_(const int iterations)
 {
     // SFPU microcode
     vUInt l0 = l_reg[LRegs::LReg0];
@@ -576,7 +554,7 @@ inline void calculate_tanh(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_hardtanh(const int iterations, uint param0, uint param1, uint param2)
+inline void _calculate_hardtanh_(const int iterations, uint param0, uint param1, uint param2)
 {
     // All params are in FP16_B format
     // param0 = -(neg_threshold)
@@ -613,7 +591,7 @@ inline void calculate_hardtanh(const int iterations, uint param0, uint param1, u
 }
 
 template <bool APPROXIMATION_MODE, int WITH_PRECOMPUTED_TANH, int ITERATIONS>
-inline void calculate_tanh_derivative(const int iterations)
+inline void _calculate_tanh_derivative_(const int iterations)
 {
     vUInt l0 = l_reg[LRegs::LReg0];
     vUInt l1 = l_reg[LRegs::LReg1];
@@ -640,7 +618,7 @@ inline void calculate_tanh_derivative(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_gelu_derivative(const int iterations)
+inline void _calculate_gelu_derivative_(const int iterations)
 {
     if constexpr (APPROXIMATION_MODE) {
         constexpr int lut_mode = 1; // SFPLUTFP32_MOD0_FP16_6ENTRY_TABLE1
@@ -706,7 +684,7 @@ inline void calculate_gelu_derivative(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_reciprocal(const int iterations)
+inline void _calculate_reciprocal_(const int iterations)
 {
     #pragma GCC unroll 8
     for (int d = 0; d < iterations; d++)
@@ -727,7 +705,7 @@ inline void calculate_reciprocal(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS, int RECIPROCAL_ITERATIONS>
-inline void calculate_sqrt(const int iterations)
+inline void _calculate_sqrt_(const int iterations)
 {
     #pragma GCC unroll 8
     for (int d = 0; d < iterations; d++)
@@ -773,9 +751,13 @@ inline void calculate_sqrt(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_dropout(const int iterations, uint prob, uint scale)
+inline void _calculate_dropout_(const int iterations, uint prob, uint scale)
 {
     // SFPU microcode
+
+    FWLOG1("calculate_dropout() -- prob:%x", prob);
+    FWLOG1("calculate_dropout() -- scale:%x", scale);
+
     vUInt rand = l_reg[LRegs::LReg3];
 
     #pragma GCC unroll 0
@@ -812,7 +794,7 @@ inline void calculate_dropout(const int iterations, uint prob, uint scale)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_lrelu(const int iterations, uint slope)
+inline void _calculate_lrelu_(const int iterations, uint slope)
 {
     // SFPU microcode
     vFloat s = s2vFloat16b(slope);
@@ -833,7 +815,7 @@ inline void calculate_lrelu(const int iterations, uint slope)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_power(const int iterations, uint exponent)
+inline void _calculate_power_(const int iterations, uint exponent)
 {
     for (int d = 0; d < iterations; d++)
     {
@@ -850,7 +832,7 @@ inline void calculate_power(const int iterations, uint exponent)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_square(const int iterations)
+inline void _calculate_square_(const int iterations)
 {
     #pragma GCC unroll 8
     for (int d = 0; d < iterations; d++)
@@ -920,7 +902,7 @@ sfpi_inline void _calculate_log_body_(const uint log_base_scale_factor)
 }
 
 template <bool APPROXIMATION_MODE, bool HAS_BASE_SCALING, int ITERATIONS>
-inline void calculate_log(const int iterations, uint log_base_scale_factor)
+inline void _calculate_log_(const int iterations, uint log_base_scale_factor)
 {
     #pragma GCC unroll 8
     for(int d = 0; d < iterations; d++){
@@ -937,13 +919,9 @@ sfpi_inline void _calculate_comp_init_flag_(bool check, vFloat& flag1, vFloat& f
     }
 }
 
-template <bool APPROXIMATION_MODE, SfpuType COMP_MODE, int ITERATIONS>
-inline void calculate_comp(const int iterations, uint exponent_size_8)
+template <bool APPROXIMATION_MODE, bool invert_output, bool check_zero, bool second_check, bool is_less_than_equal_zero, int ITERATIONS>
+inline void _calculate_comp_(const int iterations, uint exponent_size_8)
 {
-    //invert output and use same comparison check
-    constexpr bool invert_output = ((COMP_MODE == SfpuType::greater_than_equal_zero) ||
-                                    (COMP_MODE == SfpuType::not_equal_zero) ||
-                                    (COMP_MODE == SfpuType::greater_than_zero));
 
     // output_0 and output_1 hold the outputs use use when a zero or negative check is true/false.
     // False = 0.0 = kCONST_0 (5/8-bit exponent format)
@@ -954,16 +932,13 @@ inline void calculate_comp(const int iterations, uint exponent_size_8)
     constexpr float output_0 = invert_output ? 0.0f : 1.0f;
     constexpr float output_1 = invert_output ? 1.0f : 0.0f;
 
-    constexpr bool check_zero = (COMP_MODE == SfpuType::equal_zero) || (COMP_MODE == SfpuType::not_equal_zero);
-    constexpr bool second_check = (COMP_MODE == SfpuType::less_than_equal_zero) || (COMP_MODE == SfpuType::greater_than_zero);
-
     for (int d = 0; d < iterations; d++)
     {
         vFloat v = dst_reg[0];
         vFloat flag1, flag2;
         if constexpr(check_zero)
         {
-            v_if (sfpu_is_fp16_zero(v, exponent_size_8)) {
+            v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
                 _calculate_comp_init_flag_(second_check, flag1, flag2, output_0);
             } v_else {
                 _calculate_comp_init_flag_(second_check, flag1, flag2, output_1);
@@ -983,18 +958,18 @@ inline void calculate_comp(const int iterations, uint exponent_size_8)
         vFloat result;
         if constexpr (second_check)
         {
-            // SfpuType::less_than_equal_zero
+            // less_than_equal_zero
             // flag1 = 0x3F80(1.0) if DST < 0 else 0
             // flag2 = 0x3F80(1.0) if DST == 0 else 0
             // Do a bitwise Or (flag1 | flag2) to get <= condition.
             // flag1 < 0 OR flag2 == 0 => DST is Less than or Equal to zero.
             // Result will be either 0x0000(0.0) or 0x3F80(1.0)
-            if constexpr (COMP_MODE == SfpuType::less_than_equal_zero) {
+            if constexpr (is_less_than_equal_zero) {
                 result = reinterpret<vFloat>(reinterpret<vUInt>(flag1) | reinterpret<vUInt>(flag2));
             }
             else
             {
-                // SfpuType::greater_than_zero
+                // greater_than_zero
                 // flag1 = 0x3F80(1.0) if DST >= 0 else 0
                 // flag2 = 0x3F80(1.0) if DST != 0 else 0
                 // Do a bitwise And (flag1 & flag2) to get > condition.
@@ -1013,7 +988,7 @@ inline void calculate_comp(const int iterations, uint exponent_size_8)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_clamp(const int iterations, uint param0, uint param1, uint param2)
+inline void _calculate_clamp_(const int iterations, uint param0, uint param1, uint param2)
 {
     // All params are in FP16 format
     // param0 = min
@@ -1044,7 +1019,7 @@ inline void calculate_clamp(const int iterations, uint param0, uint param1, uint
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_abs(const int iterations)
+inline void _calculate_abs_(const int iterations)
 {
     // SFPU microcode
     for (int d = 0; d < iterations; d++)
@@ -1056,7 +1031,7 @@ inline void calculate_abs(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_sign(const int iterations, uint exponent_size_8)
+inline void _calculate_sign_(const int iterations, uint exponent_size_8)
 {
     // All params are in FP16 format
     // uint format = 1;
@@ -1072,7 +1047,7 @@ inline void calculate_sign(const int iterations, uint exponent_size_8)
 
         //param0 == 0 is Bfp8 format. It does not require bias removal.
         //param0 != 0 is Float16 format and exp bias needs to be removed for zero check.
-        v_if (sfpu_is_fp16_zero(v, exponent_size_8)) {
+        v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
             dst_reg[0] = vConst0;
         }
         v_endif;
@@ -1082,7 +1057,7 @@ inline void calculate_sign(const int iterations, uint exponent_size_8)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_max(const int iterations)
+inline void _calculate_max_(const int iterations)
 {
     for (int d = 0; d < iterations; d++)
     {
@@ -1098,7 +1073,7 @@ inline void calculate_max(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_max_int32(const int iterations)
+inline void _calculate_max_int32_(const int iterations)
 {
     for (int d = 0; d < iterations; d++)
     {
@@ -1113,7 +1088,7 @@ inline void calculate_max_int32(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE>
-sfpi_inline vFloat sfpu_sine_maclaurin_series(vFloat val)
+sfpi_inline vFloat _sfpu_sine_maclaurin_series_(vFloat val)
 {
     // Good for [-pi:pi]
     // Mclauren series = x - x^3/3! + x^5/5! - x^7/7! + x^9/9! - x^11/11!
@@ -1142,7 +1117,7 @@ sfpi_inline vFloat sfpu_sine_maclaurin_series(vFloat val)
     return output;
 }
 template <bool APPROXIMATION_MODE>
-sfpi_inline vFloat sfpu_cosine_maclaurin_series(vFloat val)
+sfpi_inline vFloat _sfpu_cosine_maclaurin_series_(vFloat val)
 {
     // Good for [-pi:pi]
     // Mclauren series = 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8! - x^10/10! + x^12/12!
@@ -1170,7 +1145,7 @@ sfpi_inline vFloat sfpu_cosine_maclaurin_series(vFloat val)
     return output;
 }
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_sine(const int iterations)
+inline void _calculate_sine_(const int iterations)
 {
     // SFPU microcode
     for (int d = 0; d < iterations; d++)
@@ -1181,7 +1156,7 @@ inline void calculate_sine(const int iterations)
         vFloat whole_v_float = int32_to_float(whole_v, 0);
         v = v - whole_v_float;
         v *= 3.141592653589793f; // fractional * pi to get it in [-pi:pi]
-        v = sfpu_sine_maclaurin_series<APPROXIMATION_MODE>(v);
+        v = _sfpu_sine_maclaurin_series_<APPROXIMATION_MODE>(v);
         whole_v = whole_v & 0x1;
         v_if(whole_v != 0) {
             // odd so flip the sign
@@ -1193,7 +1168,7 @@ inline void calculate_sine(const int iterations)
     }
 }
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_cosine(const int iterations)
+inline void _calculate_cosine_(const int iterations)
 {
     // SFPU microcode
     for (int d = 0; d < iterations; d++)
@@ -1204,7 +1179,7 @@ inline void calculate_cosine(const int iterations)
         vFloat whole_v_float = int32_to_float(whole_v, 0);
         v = v - whole_v_float;
         v *= 3.141592653589793f; // fractional * pi to get it in [-pi:pi]
-        v = sfpu_cosine_maclaurin_series<APPROXIMATION_MODE>(v);
+        v = _sfpu_cosine_maclaurin_series_<APPROXIMATION_MODE>(v);
         whole_v = whole_v & 0x1;
         v_if(whole_v != 0) {
             // odd so flip the sign
@@ -1216,7 +1191,7 @@ inline void calculate_cosine(const int iterations)
     }
 }
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void relu_max(const int iterations, uint uint_threshold)
+inline void _relu_max_(const int iterations, uint uint_threshold)
 {
     vFloat threshold = s2vFloat16(uint_threshold, s2vFloat16::fp16a);
     for (int d = 0; d < iterations; d++)
@@ -1235,7 +1210,7 @@ inline void relu_max(const int iterations, uint uint_threshold)
     }
 }
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void relu_min(const int iterations, uint uint_threshold)
+inline void _relu_min_(const int iterations, uint uint_threshold)
 {
     vFloat threshold = s2vFloat16(uint_threshold, s2vFloat16::fp16a);
     for (int d = 0; d < iterations; d++)
@@ -1250,7 +1225,7 @@ inline void relu_min(const int iterations, uint uint_threshold)
     }
 }
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void cast_fp32_to_fp16a(const int iterations)
+inline void _cast_fp32_to_fp16a_(const int iterations)
 {
     #pragma GCC unroll 8
     for (int d = 0; d < iterations; d++)
@@ -1265,7 +1240,7 @@ inline void cast_fp32_to_fp16a(const int iterations)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void quant_int32(const int iterations, const uint dst_offset)
+inline void _quant_int32_(const int iterations, const uint dst_offset)
 {
     // Operand A is input (fp32)
     // Operand B is scaling factor (fp32)
@@ -1290,7 +1265,7 @@ inline void quant_int32(const int iterations, const uint dst_offset)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void requant_int32(const int iterations, const uint dst_offset)
+inline void _requant_int32_(const int iterations, const uint dst_offset)
 {
     // Operand A is input to requant (int32)
     // Operand B is scaling factor (fp32)
@@ -1318,7 +1293,7 @@ inline void requant_int32(const int iterations, const uint dst_offset)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void dequant_int32(const int iterations, const uint dst_offset)
+inline void _dequant_int32_(const int iterations, const uint dst_offset)
 {
     // Operand A[LREG0] is input to dequant (int32)
     // Operand B[LREG1] is scaling factor (fp32)
@@ -1344,126 +1319,5 @@ inline void dequant_int32(const int iterations, const uint dst_offset)
     }
 }
 
-template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_mask()
-{
-    bool exponent_size_8 = true;
-    for (int d = 0; d < ITERATIONS; d++)
-    {
-        vFloat mask = dst_reg[32];
-        v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) {
-            dst_reg[0] = 0;
-        }
-        v_endif;
-        dst_reg++;
-    }
-}
-
-template <SfpuType operation, bool APPROXIMATION_MODE, int SfpuType_PARAM=0, int ITERATIONS=8, bool IS_INT_SFPU_EN = false>
-inline void calculate_sfpu(const int iterations = ITERATIONS, uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0)
-{
-    if constexpr (operation == SfpuType::exponential) {
-        calculate_exponential<APPROXIMATION_MODE, false, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::exp_with_base) {
-        calculate_exponential<APPROXIMATION_MODE, true, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::tanh) {
-        calculate_tanh<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::hardtanh) {
-        calculate_hardtanh<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1, param2);
-    }
-    else if constexpr (operation == SfpuType::gelu) {
-        calculate_gelu<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::reciprocal) {
-        calculate_reciprocal<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::sigmoid) {
-        calculate_sigmoid<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::sqrt) {
-        calculate_sqrt<APPROXIMATION_MODE, ITERATIONS, 2>(iterations);
-    }
-    else if constexpr (operation == SfpuType::tanh_derivative) {
-        calculate_tanh_derivative<APPROXIMATION_MODE, SfpuType_PARAM, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::lrelu) {
-        calculate_lrelu<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::dropout) {
-        calculate_dropout<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1);
-    }
-    else if constexpr (operation == SfpuType::power) {
-        calculate_power<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::square) {
-        calculate_square<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::log) {
-        calculate_log<APPROXIMATION_MODE, false, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::log_with_base) {
-        calculate_log<APPROXIMATION_MODE, true, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::gelu_derivative) {
-        calculate_gelu_derivative<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr ((operation == SfpuType::equal_zero) ||
-                       (operation == SfpuType::not_equal_zero) ||
-                       (operation == SfpuType::less_than_zero) ||
-                       (operation == SfpuType::greater_than_equal_zero) ||
-                       (operation == SfpuType::less_than_equal_zero) ||
-                       (operation == SfpuType::greater_than_zero)) {
-        calculate_comp<APPROXIMATION_MODE, operation, ITERATIONS>(iterations, param5);
-    }
-    else if constexpr (operation == SfpuType::clamp) {
-        calculate_clamp<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1, param2);
-    }
-    else if constexpr (operation == SfpuType::abs) {
-        calculate_abs<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::sign) {
-        calculate_sign<APPROXIMATION_MODE, ITERATIONS>(iterations, param5);
-    }
-    else if constexpr (operation == SfpuType::max) {
-        if constexpr (IS_INT_SFPU_EN)
-            calculate_max_int32<APPROXIMATION_MODE, ITERATIONS>(iterations);
-        else
-            calculate_max<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::sine) {
-        calculate_sine<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::cosine) {
-        calculate_cosine<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::relu_min) {
-        relu_min<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::relu_max) {
-        relu_max<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::cast_fp32_to_fp16a) {
-        cast_fp32_to_fp16a<APPROXIMATION_MODE, ITERATIONS>(iterations);
-    }
-    else if constexpr (operation == SfpuType::quant_int32) {
-        quant_int32<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::requant_int32) {
-        requant_int32<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::dequant_int32) {
-        dequant_int32<APPROXIMATION_MODE, ITERATIONS>(iterations, param0);
-    }
-    else if constexpr (operation == SfpuType::mask) {
-        calculate_mask<APPROXIMATION_MODE, ITERATIONS>();
-    }
-    else if constexpr (operation == SfpuType::negative) {
-        calculate_negative<APPROXIMATION_MODE, ITERATIONS>();
-    }
-}
-
 } // namespace sfpu
 } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h
index a8134eb8d47..d9acf613adc 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_template.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_template.h
index 2e1b7acb36f..35edc65483b 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_template.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_template.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include "ckernel.h"
@@ -243,4 +244,221 @@ class ckernel_unpack_template
     void program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask = 0); // calls program, then run
 };
 
+    inline ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op) // builds a template whose loop body is a single op
+        : m_outer_loop_len(outer_loop_len)
+        , m_inner_loop_len(inner_loop_len)
+        , m_loop_op0(loop_op)
+        , m_loop_op1(TT_OP_NOP) // second loop slot is a NOP in this single-op form
+        , m_end_op0(TT_OP_NOP)
+        , m_end_op1(TT_OP_NOP)
+        , m_start_op0(TT_OP_NOP) // start and end ops all default to NOP
+    {
+        m_loop0_last_instr = loop_op; // both "last instruction" slots default to the loop op itself
+        m_loop1_last_instr = loop_op;
+    }
+
+    inline ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1) // builds a template whose loop body is two ops
+        : m_outer_loop_len(outer_loop_len)
+        , m_inner_loop_len(inner_loop_len)
+        , m_loop_op0(loop_op0)
+        , m_loop_op1(loop_op1)
+        , m_end_op0(TT_OP_NOP)
+        , m_end_op1(TT_OP_NOP)
+        , m_start_op0(TT_OP_NOP) // start and end ops all default to NOP
+    {
+        m_loop0_last_instr = loop_op1; // both "last instruction" slots default to the second loop op
+        m_loop1_last_instr = loop_op1;
+    }
+
+    inline void ckernel_template::set_loop_op0(uint loop_op) // replaces the first loop op; does not touch the last-instruction slots
+    {
+        m_loop_op0 = loop_op;
+    }
+
+    inline void ckernel_template::set_loop_op1(uint loop_op) // replaces the second loop op; does not touch the last-instruction slots
+    {
+        m_loop_op1 = loop_op;
+    }
+
+    inline void ckernel_template::set_end_ops(uint end_op0, uint end_op1) // overrides both end-op slots (NOP by default)
+    {
+        m_end_op0 = end_op0;
+        m_end_op1 = end_op1;
+    }
+
+    inline void ckernel_template::set_end_op(uint end_op0) // single end op: the second end slot is reset to NOP
+    {
+        set_end_ops(end_op0, TT_OP_NOP);
+    }
+
+    inline void ckernel_template::set_start_op(uint start_op0) // overrides the start-op slot (NOP by default)
+    {
+        m_start_op0 = start_op0;
+    }
+
+    inline void ckernel_template::set_last_inner_loop_instr(uint op) // "inner" maps to m_loop1_last_instr
+    {
+        m_loop1_last_instr = op;
+    }
+
+    inline void ckernel_template::set_last_outer_loop_instr(uint op) // "outer" maps to m_loop0_last_instr
+    {
+        m_loop0_last_instr = op;
+    }
+
+    inline void ckernel_template::program_and_run(volatile uint *instrn_buffer) // convenience wrapper: program() followed by run()
+    {
+        program(instrn_buffer);
+        run(instrn_buffer);
+    }
+
+    inline void ckernel_template::run(volatile uint *instrn_buffer) // only issues the MOP; does not write mop_cfg. instrn_buffer is not named in this body
+    {
+        TTI_MOP(1, 0, 0); // run the double-loop template
+    }
+
+    inline void ckernel_template::program(volatile uint *instrn_buffer) // writes the nine template fields to mop_cfg[0..8]; instrn_buffer is not named in this body
+    {
+        volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE); // MOP config words, accessed as volatile uints
+
+        mop_sync(); // wait until previous mops have completed
+
+        mop_cfg[0] = m_outer_loop_len;
+        mop_cfg[1] = m_inner_loop_len;
+        mop_cfg[2] = m_start_op0;
+        mop_cfg[3] = m_end_op0;
+        mop_cfg[4] = m_end_op1;
+        mop_cfg[5] = m_loop_op0;
+        mop_cfg[6] = m_loop_op1;
+        mop_cfg[7] = m_loop0_last_instr; // set via set_last_outer_loop_instr
+        mop_cfg[8] = m_loop1_last_instr; // set via set_last_inner_loop_instr
+    }
+
+    inline void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) // program() then the zmask form of run()
+    {
+        program(instrn_buffer);
+        run(instrn_buffer, count, zmask);
+    }
+
+    inline void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) // splits zmask into high/low 16-bit halves across two instructions
+    {
+        FWASSERT("Unpack template only supports loops up to 128", count <= 128); // NOTE(review): only the upper bound is checked; count == 0 would pass count - 1 == -1 to TT_MOP - confirm callers never pass 0
+        TT_MOP_CFG(zmask >> 16);              // Set the top 16 bits of zmask - we could skip this for count <= 16
+        TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template
+    }
+
+    // Version without zmask: skips the TT_MOP_CFG instruction and passes 0 as the low zmask half, so it should be slightly faster.
+    inline void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count)
+    {
+        FWASSERT("Unpack template only supports loops up to 128", count <= 128); // NOTE(review): only the upper bound is checked; count == 0 would pass count - 1 == -1 to TT_MOP - confirm callers never pass 0
+        TT_MOP(0, count - 1, 0); // Run the template
+    }
+
+    inline void ckernel_unpack_template::program(volatile uint *instrn_buffer) const // writes mop_cfg[1..8]; mop_cfg[0] is not written here
+    {
+        volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE); // MOP config words, accessed as volatile uints
+
+        mop_sync(); // wait until previous mops have completed
+
+        mop_cfg[1] = m_unpackB | (m_unpack_halo << 1); // bit 0 = unpackB flag, bit 1 = halo flag
+        mop_cfg[2] = m_B_instr;
+        mop_cfg[3] = m_A0_instr;
+        mop_cfg[4] = m_A1_instr;
+        mop_cfg[5] = m_A2_instr;
+        mop_cfg[6] = m_A3_instr;
+        mop_cfg[7] = m_skipA_instr;
+        mop_cfg[8] = m_skipB_instr;
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr) // factory: src B off, halo off, one A instruction plus its skip instruction
+    {
+        return ckernel_unpack_template(false, // src B
+            false,                            // halo
+            A_instr, 0, 0, 0, skipA_instr, 0, 0); // presumably (A0, A1, A2, A3, skipA, B, skipB) - constructor is not shown in this hunk
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr) // factory: src B off, halo off, one instruction plus its skip instruction
+    {
+        return ckernel_unpack_template(false, // src B
+            false,                            // halo
+            B_instr, 0, 0, 0, skipB_instr, 0, 0); // NOTE(review): same argument positions as lA and src B is false - confirm this is intended
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr) // factory: src B off, halo on; neginf picks the first instruction
+    {
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0); // DEF_NINFSRCA/DEF_ZEROSRCA first, then A_instr, then two DEF_UNPACR_NOP
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask) // factory: src B off, halo on; bit i of halo_mask enables slot Ai, cleared bits get SKIP_Ai_instr
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; // 0 when halo_mask > 0x7: slots 0-2 then use the non-"last" variants
+
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0); // slot A3 has no "last" variant; trailing args are DEF_SKIP_A, 0, 0
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask) // factory: like lhA but selects the *_fconv_* instruction variants
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; // 0 when halo_mask > 0x7: slots 0-2 then use the non-"last" variants
+
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0); // unlike lhA, TT_OP_NOP is passed where lhA passes DEF_SKIP_A
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy) // factory: src B on, halo on; A slots chosen from halo_mask exactly as in lhA
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; // 0 when halo_mask > 0x7: slots 0-2 then use the non-"last" variants
+
+        return ckernel_unpack_template(true, // src B
+            true,                            // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B); // rarefy selects which B instruction is passed
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask) // factory: src B on, halo on; A slots use the *_fconv_* variants as in flhA
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; // 0 when halo_mask > 0x7: slots 0-2 then use the non-"last" variants
+
+        return ckernel_unpack_template(true, // src B
+            true,                            // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B); // TT_OP_NOP where lBhA passes DEF_SKIP_A; B instruction is fixed
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr, // factory: src B on, halo off
+
+        uint B_instr, uint skipB_instr)
+    {
+        return ckernel_unpack_template(true, // src B
+            false,                           // halo
+            A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr); // one A instruction and one B instruction, each with its skip counterpart
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){ // one-instruction loop: alias for lA
+        return ckernel_unpack_template::lA(instr0, skip0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){ // note the reordering: lBA takes (instr0, skip0, instr1, skip1)
+        // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA.
+        return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1);
+    }
+
 } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_xmov.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_xmov.h
index e9b2559a1b7..120f8898adf 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_xmov.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_xmov.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel.h"
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h
index fa97031b17a..0c06a710cbb 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h
@@ -2,11 +2,13 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 //#include "kernel_types.h"
 #include "ckernel.h"
 #include "ckernel_template.h"
+#include "ckernel_sfpu.h"
 #include "ckernel_globals.h"
 #include "llk_defs.h"
 
@@ -197,6 +199,13 @@ inline void clear_addr_mod_base()
     TTI_SETC16(ADDR_MOD_SET_Base_ADDR32, 0); // clear addr mod base (use addr mods 0..3)
 }
 
+template <uint num_rows=8>
+inline void inc_dst_addr() // issues one SETRWC with CR_D / SET_D and num_rows as its operand; presumably advances the dest counter by num_rows - confirm against SETRWC docs
+{
+    static_assert(num_rows <= 15, "num_rows must be <= 15"); // presumably the SETRWC operand is a 4-bit field - confirm
+    TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, num_rows, 0, 0, p_setrwc::SET_D);
+}
+
 inline void math_dest_wait()
 {
     FWLOG0("XX math_full_dest_sync()->wait for whole dest available");
@@ -230,4 +239,14 @@ inline constexpr bool is_32bit_input(const std::uint32_t src_format, const std::
            ((output_df == (uint)DataFormat::Int32) || (output_df == (uint)DataFormat::Float32));
 }
 
+inline constexpr int get_math_num_fidelity_phases(const int math_fidelity_desc) // phase count is stored in the low three bits of the descriptor
+{
+    return (math_fidelity_desc & 0x7);
+}
+
+inline constexpr int get_math_fidelity_increment(const int math_fidelity_desc) // bit 3 of the descriptor selects the increment: clear -> 1, set -> 2
+{
+    return ((math_fidelity_desc >> 3) & 0x1) + 1;
+}
+
 } // namespace ckernel::math
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h
index bdc0b6b5063..011092787c7 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include "ckernel.h"
@@ -517,6 +518,33 @@ namespace ckernel::packer
       TT_SETDMAREG(0, UPPER_HALFWORD(addr), 0, HI_16(p_gpr_pack::OUTPUT_ADDR));
    }
 
+   template <uint32_t block_ct_dim>
+   inline void program_packer_untilized_destination(const uint32_t addr, const uint32_t pack_dst_format) // gives each of four L1_Dest_addr registers its own address: addr + k*block_size/16, k = 0..3
+   {
+      // Each packer packs 8 rows of block_ct_dim*TILE_C_DIM datums
+      const uint32_t block_size = SCALE_DATUM_SIZE(pack_dst_format, block_ct_dim * TILE_C_DIM * (TILE_R_DIM/4)); // datum count scaled by SCALE_DATUM_SIZE for pack_dst_format
+      constexpr uint32_t offset0 = 0;
+      const uint32_t offset1 = (1*block_size)/16; // presumably /16 converts to 16-byte address units - confirm
+      const uint32_t offset2 = (2*block_size)/16;
+      const uint32_t offset3 = (3*block_size)/16;
+
+      TT_SETDMAREG(0, LOWER_HALFWORD(addr+offset0), 0, LO_16(p_gpr_pack::OUTPUT_ADDR+0)); // each address is loaded as two 16-bit halves into GPR OUTPUT_ADDR+k
+      TT_SETDMAREG(0, UPPER_HALFWORD(addr+offset0), 0, HI_16(p_gpr_pack::OUTPUT_ADDR+0));
+      TT_SETDMAREG(0, LOWER_HALFWORD(addr+offset1), 0, LO_16(p_gpr_pack::OUTPUT_ADDR+1));
+      TT_SETDMAREG(0, UPPER_HALFWORD(addr+offset1), 0, HI_16(p_gpr_pack::OUTPUT_ADDR+1));
+      TT_SETDMAREG(0, LOWER_HALFWORD(addr+offset2), 0, LO_16(p_gpr_pack::OUTPUT_ADDR+2));
+      TT_SETDMAREG(0, UPPER_HALFWORD(addr+offset2), 0, HI_16(p_gpr_pack::OUTPUT_ADDR+2));
+      TT_SETDMAREG(0, LOWER_HALFWORD(addr+offset3), 0, LO_16(p_gpr_pack::OUTPUT_ADDR+3));
+      TT_SETDMAREG(0, UPPER_HALFWORD(addr+offset3), 0, HI_16(p_gpr_pack::OUTPUT_ADDR+3));
+
+      TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR); // GPR OUTPUT_ADDR+k -> the k-th L1_Dest_addr config register
+      TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR+1);
+      TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR+2);
+      TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR+3);
+
+      TTI_PACR(ADDR_MOD_2, 0, 0xf, 0, 0, 1, 0); // pack flush
+   }
+
    inline void program_packer_dest_offset_registers(uint32_t dest_tile_offset)
    {
       TT_SETDMAREG(0, LOWER_HALFWORD(dest_tile_offset), 0, LO_16(p_gpr_pack::TEMP_TILE_OFFSET));
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h
index 55404e24d39..eec20973ec2 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include "ckernel.h"
@@ -180,13 +181,13 @@ namespace ckernel::unpacker
        while (semaphore_read(semaphore::UNPACK_SYNC) > 0) {}
    }
 
-   inline void enalbe_int8_fpu_math() {
+   inline void enable_int8_fpu_math() { // sets ALU_ACC_CTRL_INT8_math_enabled to 1 through cfg_reg_rmw_tensix, restricted to that field's mask
       alu_config_u alu_payload = {.val = 0};
       alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = 1;
       cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_ADDR32, 0, ALU_ACC_CTRL_INT8_math_enabled_MASK>(alu_payload.val);
    }
 
-   template<bool row_pool=false, bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+   template<bool row_pool=false, bool is_fp32_dest_acc_en = false, bool fpu_srnd_en = false, bool pack_srnd_en = false>
    inline void configure_unpack_AB(
      const uint unpA_src_format,
      const uint unpB_src_format,
@@ -249,10 +250,9 @@ namespace ckernel::unpacker
       alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = int8_math_enabled;
 
       constexpr uint alu_stoch_rnd_mask = ALU_ROUNDING_MODE_Fpu_srnd_en_MASK | ALU_ROUNDING_MODE_Gasket_srnd_en_MASK | ALU_ROUNDING_MODE_Packer_srnd_en_MASK;
-      constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndMode::All);
-      alu_payload.f.ALU_ROUNDING_MODE_Fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndMode::Fpu);
-      alu_payload.f.ALU_ROUNDING_MODE_Gasket_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndMode::Pack);
-      alu_payload.f.ALU_ROUNDING_MODE_Packer_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndMode::Pack);
+      alu_payload.f.ALU_ROUNDING_MODE_Fpu_srnd_en = fpu_srnd_en;
+      alu_payload.f.ALU_ROUNDING_MODE_Gasket_srnd_en = pack_srnd_en;
+      alu_payload.f.ALU_ROUNDING_MODE_Packer_srnd_en = pack_srnd_en;
 
       constexpr uint alu_mask = alu_format_mask | alu_dest_format_mask | alu_stoch_rnd_mask;
 
@@ -348,28 +348,47 @@ namespace ckernel::unpacker
       reset_config_context();
    }
 
-   template <bool INSERT_FENCE=false, std::uint32_t UNP_SEL = p_setadc::UNP_AB>
-   inline void config_face_dim(const uint32_t face_r_dim)
+   template <std::uint32_t UNP_SEL = p_setadc::UNP_AB>
+   inline void config_unpacker_x_end(const uint32_t face_r_dim) // issues SETADCXX with face_r_dim*FACE_C_DIM-1 for face_r_dim in {1,2,4,8}; any other value uses FACE_R_DIM*FACE_C_DIM-1
    {
       switch (face_r_dim) {
          case 1:
             TTI_SETADCXX(UNP_SEL, 1*FACE_C_DIM-1, 0x0);
-            TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_1x16);
             break;
          case 2:
             TTI_SETADCXX(UNP_SEL, 2*FACE_C_DIM-1, 0x0);
-            TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_2x16);
             break;
          case 4:
             TTI_SETADCXX(UNP_SEL, 4*FACE_C_DIM-1, 0x0);
-            TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_4x16);
             break;
          case 8:
             TTI_SETADCXX(UNP_SEL, 8*FACE_C_DIM-1, 0x0);
-            TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_8x16);
             break;
          default:
             TTI_SETADCXX(UNP_SEL, FACE_R_DIM*FACE_C_DIM-1, 0x0);
+            break;
+      }
+   }
+
+   template <bool INSERT_FENCE=false, std::uint32_t UNP_SEL = p_setadc::UNP_AB>
+   inline void config_unpacker_0_face_dim(const uint32_t face_r_dim)
+   {
+      // Tile x dim registers exist only for unpacker 0, so selecting UNP_B alone is rejected at compile time
+      static_assert(UNP_SEL != p_setadc::UNP_B);
+      switch (face_r_dim) {
+         case 1:
+            TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_1x16);
+            break;
+         case 2:
+            TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_2x16);
+            break;
+         case 4:
+            TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_4x16);
+            break;
+         case 8:
+            TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_8x16);
+            break;
+         default:
             TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16);
             break;
       }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc
deleted file mode 100644
index baeba52c6c6..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
- *
- * SPDX-License-Identifier: Apache-2.0
-*/
-
-#include "ckernel_template.h"
-
-namespace ckernel
-{
-extern volatile uint *cfg_regs;
-
-ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op)
-    : m_outer_loop_len(outer_loop_len)
-    , m_inner_loop_len(inner_loop_len)
-    , m_loop_op0(loop_op)
-    , m_loop_op1(TT_OP_NOP)
-    , m_end_op0(TT_OP_NOP)
-    , m_end_op1(TT_OP_NOP)
-    , m_start_op0(TT_OP_NOP)
-{
-    m_loop0_last_instr = loop_op;
-    m_loop1_last_instr = loop_op;
-}
-
-ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1)
-    : m_outer_loop_len(outer_loop_len)
-    , m_inner_loop_len(inner_loop_len)
-    , m_loop_op0(loop_op0)
-    , m_loop_op1(loop_op1)
-    , m_end_op0(TT_OP_NOP)
-    , m_end_op1(TT_OP_NOP)
-    , m_start_op0(TT_OP_NOP)
-{
-    m_loop0_last_instr = loop_op1;
-    m_loop1_last_instr = loop_op1;
-}
-
-void ckernel_template::set_loop_op0(uint loop_op)
-{
-    m_loop_op0 = loop_op;
-}
-
-void ckernel_template::set_loop_op1(uint loop_op)
-{
-    m_loop_op1 = loop_op;
-}
-
-void ckernel_template::set_end_ops(uint end_op0, uint end_op1)
-{
-    m_end_op0 = end_op0;
-    m_end_op1 = end_op1;
-}
-
-void ckernel_template::set_end_op(uint end_op0)
-{
-    set_end_ops(end_op0, TT_OP_NOP);
-}
-
-void ckernel_template::set_start_op(uint start_op0)
-{
-    m_start_op0 = start_op0;
-}
-
-void ckernel_template::set_last_inner_loop_instr(uint op)
-{
-    m_loop1_last_instr = op;
-}
-
-void ckernel_template::set_last_outer_loop_instr(uint op)
-{
-    m_loop0_last_instr = op;
-}
-
-void ckernel_template::program_and_run(volatile uint *instrn_buffer)
-{
-    program(instrn_buffer);
-    run(instrn_buffer);
-}
-
-void ckernel_template::run(volatile uint *instrn_buffer)
-{
-    TTI_MOP(1, 0, 0); // run the double-loop template
-}
-
-void ckernel_template::program(volatile uint *instrn_buffer)
-{
-    volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
-
-    mop_sync(); // wait until previous mops have completed
-
-    mop_cfg[0] = m_outer_loop_len;
-    mop_cfg[1] = m_inner_loop_len;
-    mop_cfg[2] = m_start_op0;
-    mop_cfg[3] = m_end_op0;
-    mop_cfg[4] = m_end_op1;
-    mop_cfg[5] = m_loop_op0;
-    mop_cfg[6] = m_loop_op1;
-    mop_cfg[7] = m_loop0_last_instr;
-    mop_cfg[8] = m_loop1_last_instr;
-}
-
-void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
-{
-    program(instrn_buffer);
-    run(instrn_buffer, count, zmask);
-}
-
-void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
-{
-    FWASSERT("Unpack template only supports loops up to 128", count <= 128);
-    TT_MOP_CFG(zmask >> 16);              // Set the top 16 bits of zmask - we could skip this for count <= 16
-    TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template
-}
-
-// Version without zmask, should be slightly faster by eliminating one instruction.
-void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count)
-{
-    FWASSERT("Unpack template only supports loops up to 128", count <= 128);
-    TT_MOP(0, count - 1, 0); // Run the template
-}
-
-void ckernel_unpack_template::program(volatile uint *instrn_buffer) const
-{
-    volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
-
-    mop_sync(); // wait until previous mops have completed
-
-    mop_cfg[1] = m_unpackB | (m_unpack_halo << 1);
-    mop_cfg[2] = m_B_instr;
-    mop_cfg[3] = m_A0_instr;
-    mop_cfg[4] = m_A1_instr;
-    mop_cfg[5] = m_A2_instr;
-    mop_cfg[6] = m_A3_instr;
-    mop_cfg[7] = m_skipA_instr;
-    mop_cfg[8] = m_skipB_instr;
-}
-
-ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr)
-{
-    return ckernel_unpack_template(false, // src B
-        false,                            // halo
-        A_instr, 0, 0, 0, skipA_instr, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr)
-{
-    return ckernel_unpack_template(false, // src B
-        false,                            // halo
-        B_instr, 0, 0, 0, skipB_instr, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr)
-{
-    return ckernel_unpack_template(false, // src B
-        true,                             // halo
-        neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(false, // src B
-        true,                             // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(false, // src B
-        true,                             // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(true, // src B
-        true,                            // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B);
-}
-
-ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(true, // src B
-        true,                            // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr,
-
-    uint B_instr, uint skipB_instr)
-{
-    return ckernel_unpack_template(true, // src B
-        false,                           // halo
-        A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr);
-}
-
-ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){
-    return ckernel_unpack_template::lA(instr0, skip0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){
-    // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA.
-    return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1);
-}
-
-} // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc
index 35130c72520..103269694e5 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc
@@ -2,7 +2,6 @@
 // to reduce the overhead of the compilation process and
 // improve build times
 #include "ckernel.cc"
-#include "ckernel_template.cc"
 #ifdef PERF_DUMP
 #include "ckernel_perf_unpack_pack.cc"
 #endif
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list b/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list
index 2a66c11d1a6..99880029ff6 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list
@@ -1,2 +1 @@
 ckernel.cc
-ckernel_template.cc
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h
index e205ec12747..e5250fb7412 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 namespace ckernel {
@@ -97,67 +98,20 @@ enum ReluType {
     MAX_THRESHOLD_RELU,
 };
 
-enum SfpuType {
-    tanh,
-    hardtanh,
-    gelu,
-    exponential,
-    exp_with_base,
-    sigmoid,
-    reciprocal,
-    sqrt,
-    lrelu,
-    power,
-    square,
-    tanh_derivative,
-    log,
-    log_with_base,
-    equal_zero,
-    not_equal_zero,
-    less_than_zero,
-    greater_than_equal_zero,
-    less_than_equal_zero,
-    greater_than_zero,
-    clamp,
-    gelu_derivative,
-    dropout,
-    abs,
-    sign,
-    max,
-    sine,
-    cosine,
-    tan,
-    relu_max,
-    relu_min,
-    cast_fp32_to_fp16a,
-    sigmoid_appx,
-    gelu_appx,
-    elu,
-    min,
-    exp2,
-    heaviside,
-    expm1,
-    signbit,
-    asin,
-    acos,
-    atan,
-    erf,
-    erfc,
-    rsqrt,
-    isfinite,
-    isinf,
-    isposinf,
-    isneginf,
-    isnan,
-    logical_not_unary,
-    erfinv,
-    i0,
-    silu,
-    mask,
-    negative,
-    dequant_int32,
-    requant_int32,
-    quant_int32,
-    unused,
+/*
+Stochastic rounding modes:
+    None: No stochastic rounding enabled; the default rounding is round-to-nearest-even.
+    Fpu:  Enables stochastic rounding for every accumulation in the FPU.
+    Pack: Enables stochastic rounding in both the gasket and the packer. Gasket rounding happens in
+          the data format conversion stage from dest format to pack_src_format. Packer rounding
+          happens in the data format conversion stage from pack_src_format to pack_dst_format.
+    All:  Enables FPU, packer and gasket rounding.
+*/
+enum struct StochRndType {
+    None    = 0,
+    Fpu     = 1,
+    Pack    = 2,
+    All     = 0xf, // low four bits set, so it includes both the Fpu (1) and Pack (2) bits
 };
+
 }  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h
index 8eb5e084934..a626e1b8180 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include "ckernel_defs.h"
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h
index 0a70d430497..f34cd4aa74b 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel_include.h"
 #include "ckernel_template.h"
@@ -209,7 +210,7 @@ inline void _llk_math_eltwise_binary_(const std::uint32_t num_faces, uint dst_in
 }
 
 
-template <EltwiseBinaryType eltwise_binary_type, BroadcastType bcast_type>
+template <EltwiseBinaryType eltwise_binary_type, BroadcastType bcast_type, std::uint32_t FIDELITY_INCREMENT>
 inline void eltwise_binary_configure_addrmod() {
     // Use srcA for data movement
     if constexpr (
@@ -240,7 +241,7 @@ inline void eltwise_binary_configure_addrmod() {
             .srca = {.incr = 0, .clr = 1},
             .srcb = {.incr = 0, .clr = 1},
             .dest = {.incr = 0, .clr = 0, .cr = 1},
-            .fidelity = {.incr = 1}}
+            .fidelity = {.incr = FIDELITY_INCREMENT}}
             .set(ADDR_MOD_2);
 
         addr_mod_t{
@@ -330,15 +331,18 @@ inline void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0, co
 template <
     EltwiseBinaryType eltwise_binary_type,
     BroadcastType src_b_bcast_type,
-    int NUM_FIDELITY_PHASES = 0,
+    int MATH_FIDELITY_DESC = 0,
     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
 inline void _llk_math_eltwise_binary_init_(const std::uint32_t num_faces, const std::uint32_t transpose, const std::uint32_t acc_to_dest) {
 
-    eltwise_binary_configure_addrmod<eltwise_binary_type, src_b_bcast_type>();
+    constexpr int MATH_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC);
+    constexpr int MATH_FIDELITY_INCREMENT = get_math_fidelity_increment(MATH_FIDELITY_DESC);
+
+    eltwise_binary_configure_addrmod<eltwise_binary_type, src_b_bcast_type, MATH_FIDELITY_INCREMENT>();
 
     if constexpr (
         (eltwise_binary_type == ELWADD) || (eltwise_binary_type == ELWSUB) || (eltwise_binary_type == ELWMUL)) {
-        eltwise_binary_configure_mop<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(acc_to_dest, num_faces);
+        eltwise_binary_configure_mop<eltwise_binary_type, src_b_bcast_type, MATH_FIDELITY_PHASES, binary_reuse_dest>(acc_to_dest, num_faces);
     } else {
         FWASSERT("Unsupported op!", false);
     }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h
index 9e23dab17f2..36b5e02c21e 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h
@@ -1,8 +1,7 @@
-/*
- * SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
- *
- * SPDX-License-Identifier: Apache-2.0
-*/
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
 
 #pragma once
 
@@ -17,7 +16,6 @@
 
 using namespace ckernel;
 // local function declarations
-template <SfpuType sfpu_op>
 inline void eltwise_binary_sfpu_configure_addrmod(){
     // NOTE: this kernel is typically used in conjunction with
     //       A2D, which is using ADDR_MOD_0 and ADDR_MOD_2, so use one
@@ -32,22 +30,8 @@ inline void eltwise_binary_sfpu_configure_addrmod(){
 }
 inline void eltwise_binary_sfpu_configure_mop();
 
-template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
-inline void _llk_math_eltwise_binary_sfpu_(
-    const uint face_r_dim,
-    const uint num_faces,
-    uint dst_index_a,
-    uint dst_index_b,
-    int vector_mode = (int)Dim::RC,
-    uint param0 = 0,
-    uint param1 = 0,
-    uint param2 = 0,
-    uint param3 = 0,
-    uint param4 = 0,
-    uint param5 = 0) {
-    constexpr int ITERATIONS = 8;
-    uint dst_index = (dst_index_a <= dst_index_b) ? dst_index_a : dst_index_b;
-    param0 = (dst_index_a > dst_index_b) ? dst_index_a-dst_index_b : dst_index_b-dst_index_a;
+template <DstSync Dst = DstSync::SyncFull>
+inline void _llk_math_eltwise_binary_sfpu_start_(const uint dst_index) {
     if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) {
         math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(math_sync_tile_dst_index);
     } else {
@@ -55,65 +39,21 @@ inline void _llk_math_eltwise_binary_sfpu_(
     }
     math::set_addr_mod_base();
     TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH);
-    if (vector_mode == (int)Dim::R) {
-        // Do a row vector, Face0 + Face1 -- first iteration (first row)
-        const int iterations = (num_faces < 4) ?
-                                    ((face_r_dim <= 2) ? 2 : face_r_dim/2) : 2; // At least 2 iterations for odd and even columns
-#pragma GCC unroll 0
-        for (int face = 0; face < 2; face++) {
-            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS>(iterations, param0, param1, param2, param3, param4, param5);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        }
-        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-    } else if (vector_mode == (int)Dim::C) {
-        // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for full face
-#pragma GCC unroll 0
-        for (int face = 0; face < 2; face++) {
-            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS>(ITERATIONS, param0, param1, param2, param3, param4, param5);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            if (num_faces>2) { // Skip next 2 faces if tile is 32x32
-                TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-                TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            }
-        }
-        if (num_faces<=2) {
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        }
-    } else {
-        // Do all four faces, and iterate through all 4 blocks of 4 rows each
-#pragma GCC unroll 0
-        for (int face = 0; face < 4; face++) {
-            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS>(ITERATIONS, param0, param1, param2, param3, param4, param5);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        }
-    }
+}
+
+inline void _llk_math_eltwise_binary_sfpu_done_() {
     math::clear_dst_reg_addr();
 
     TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::WAIT_SFPU);
     math::clear_addr_mod_base();
 }
 
-template <SfpuType sfpu_op, bool APPROXIMATE>
-inline void _llk_math_eltwise_binary_sfpu_init_(
-    uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) {
-    eltwise_binary_sfpu_configure_addrmod< sfpu_op >();
-    if constexpr (sfpu_op == SfpuType::quant_int32) {
-        sfpu::sfpu_init<APPROXIMATE>(sfpu_op, param0);
-    } else if constexpr (sfpu_op == SfpuType::requant_int32) {
-        sfpu::sfpu_init<APPROXIMATE>(sfpu_op, param0);
-    } else if constexpr (sfpu_op == SfpuType::dequant_int32) {
-        sfpu::sfpu_init<APPROXIMATE>(sfpu_op, param0);
-    } else {
-        sfpu::sfpu_init<APPROXIMATE>(sfpu_op);
-    }
+inline void _llk_math_eltwise_binary_sfpu_inc_dst_face_addr_() {
+    math::inc_dst_addr<8>();
+    math::inc_dst_addr<8>();
+}
+
+inline void _llk_math_eltwise_binary_sfpu_init_() {
+    eltwise_binary_sfpu_configure_addrmod();
     math::reset_counters(p_setrwc::SET_ABD_F);
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h
index f26d2ca3f46..c471e91a797 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include "ckernel_include.h"
@@ -19,7 +20,7 @@ inline void eltwise_unary_configure_addrmod();
 template <DataCopyType type, BroadcastType src_b_bcast_type = BroadcastType::NONE, DstSync Dst = DstSync::SyncFull, bool is_fp32_dest_acc_en = false, bool unpack_to_dest = false>
 inline void _llk_math_eltwise_unary_datacopy_(const std::uint32_t dst_index, const std::uint32_t src_format, const std::uint32_t dst_format) {
 
-    if (unpack_to_dest && math::is_32bit_input(src_format, dst_format)) {
+    if (unpack_to_dest && is_32bit_input(src_format, dst_format)) {
         math_unpack_to_dest_math_ready();
         math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32, true>(dst_index);
         math::math_unpack_to_dest_tile_ready();
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h
index 3f83bb707b0..e8b293b0597 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #include "ckernel_include.h"
 #include "ckernel_template.h"
 #include <type_traits>
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h
index ccd0dc293ff..33dec5ac11f 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel_include.h"
 #include "ckernel_template.h"
@@ -13,12 +14,7 @@
 #include "ckernel_sfpu.h"
 
 using namespace ckernel;
-template <SfpuType sfpu_type>
-void static_assert_sfpu_type_dependent() {
-    static_assert(sfpu_type == SfpuType::unused, "sfpu_type exception");
-}
 // local function declarations
-template <SfpuType sfpu_op>
 inline void eltwise_unary_sfpu_configure_addrmod(){
     // NOTE: this kernel is typically used in conjunction with
     //       A2D, which is using ADDR_MOD_0 and ADDR_MOD_2, so use one
@@ -33,20 +29,8 @@ inline void eltwise_unary_sfpu_configure_addrmod(){
 }
 inline void eltwise_unary_sfpu_configure_mop();
 
-template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull, bool IS_INT_SFPU_EN=false>
-inline void _llk_math_eltwise_unary_sfpu_(
-    const uint face_r_dim,
-    const uint num_faces,
-    uint dst_index,
-    int vector_mode = (int)Dim::RC,
-    uint param0 = 0,
-    uint param1 = 0,
-    uint param2 = 0,
-    uint param3 = 0,
-    uint param4 = 0,
-    uint param5 = 0) {
-
-    constexpr int ITERATIONS = 8;
+template <DstSync Dst = DstSync::SyncFull>
+inline void _llk_math_eltwise_unary_sfpu_start_(const uint dst_index) {
     if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) {
         math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(math_sync_tile_dst_index);
     } else {
@@ -54,61 +38,21 @@ inline void _llk_math_eltwise_unary_sfpu_(
     }
     math::set_addr_mod_base();
     TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH);
-    if (vector_mode == (int)Dim::R) {
-        // Do a row vector, Face0 + Face1 -- first iteration (first row)
-        const int iterations = (num_faces < 4) ?
-                                    ((face_r_dim <= 2) ? 2 : face_r_dim/2) : 2; // At least 2 iterations for odd and even columns
-#pragma GCC unroll 0
-        for (int face = 0; face < 2; face++) {
-            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(iterations, param0, param1, param2, param3, param4, param5);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        }
-        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-    } else if (vector_mode == (int)Dim::C) {
-        // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for full face
-#pragma GCC unroll 0
-        for (int face = 0; face < 2; face++) {
-            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(ITERATIONS, param0, param1, param2, param3, param4, param5);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            if (num_faces>2) { // Skip next 2 faces if tile is 32x32
-                TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-                TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            }
-        }
-        if (num_faces<=2) {
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        }
-    } else {
-        // Do all four faces, and iterate through all 4 blocks of 4 rows each
-#pragma GCC unroll 0
-        for (int face = 0; face < 4; face++) {
-            sfpu::calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(ITERATIONS, param0, param1, param2, param3, param4, param5);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-            TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-        }
-    }
+}
+
+inline void _llk_math_eltwise_unary_sfpu_done_() {
     math::clear_dst_reg_addr();
 
     TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::WAIT_SFPU);
     math::clear_addr_mod_base();
 }
 
-template <SfpuType sfpu_op, bool APPROXIMATE>
-inline void _llk_math_eltwise_unary_sfpu_init_(
-    uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) {
-    eltwise_unary_sfpu_configure_addrmod< sfpu_op >();
-    if constexpr (sfpu_op == SfpuType::dropout) {
-        sfpu::sfpu_init<APPROXIMATE>(sfpu_op, param2);
-    } else {
-        sfpu::sfpu_init<APPROXIMATE>(sfpu_op);
-    }
+inline void _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_() {
+    math::inc_dst_addr<8>();
+    math::inc_dst_addr<8>();
+}
+
+inline void _llk_math_eltwise_unary_sfpu_init_() {
+    eltwise_unary_sfpu_configure_addrmod();
     math::reset_counters(p_setrwc::SET_ABD_F);
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h
index 5ebaefe0d96..bf111343ccd 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel_include.h"
 #include "ckernel_template.h"
@@ -15,15 +16,19 @@
 
 using namespace ckernel;
 
-template <int NUM_FIDELITY_PHASES>
+template <int MATH_FIDELITY_DESC, DstTileFaceLayout FaceLayout=DstTileFaceLayout::ColMajor>
 inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false) {
 
+    constexpr int NUM_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC);
     constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0);
+    constexpr int FIDELITY_INCREMENT = high_fidelity ? get_math_fidelity_increment(MATH_FIDELITY_DESC) : 0;
 
     const bool is_in0_16x32 = (in0_tile_r_dim <=FACE_R_DIM) && (in0_tile_c_dim > FACE_C_DIM);
     const bool is_in0_32x16 = (in0_tile_r_dim > FACE_R_DIM) && (in0_tile_c_dim <= FACE_C_DIM);
     const bool is_in1_32x16 = (in1_tile_r_dim > FACE_R_DIM) && (in1_tile_c_dim <= FACE_C_DIM);
 
+    static_assert(FaceLayout == DstTileFaceLayout::RowMajor, "FaceLayout must be RowMajor");
+
     // MVMUL does D = B*A
 
     // Inner Loop --> 32/8 = 4 times for the full 32x16 face
@@ -52,7 +57,7 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c
         .srca = {.incr = 0, .clr = 1, .cr = 1},
         .srcb = {.incr = 0, .clr = 1, .cr = 1},
         .dest = {.incr = 0, .clr = 1, .cr = 1},
-        .fidelity = {.incr = high_fidelity, .clr = 0},
+        .fidelity = {.incr = FIDELITY_INCREMENT, .clr = 0},
         .bias = {.incr = 1},
     }
         .set(ADDR_MOD_5);
@@ -235,7 +240,7 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c
 
 }
 
-template <int NUM_FIDELITY_PHASES>
+template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout=DstTileFaceLayout::ColMajor>
 inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false) {
 
     // in0 - loaded to SrcB
@@ -358,10 +363,10 @@ inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, con
     tmp.program(instrn_buffer);
 }
 
-template <int NUM_FIDELITY_PHASES>
+template <int MATH_FIDELITY_DESC, DstTileFaceLayout FaceLayout=DstTileFaceLayout::ColMajor>
 inline void _llk_math_matmul_init_(const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false, const std::uint32_t transpose=0, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) {
 
-    matmul_configure_addrmod<NUM_FIDELITY_PHASES>(transpose, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face);
+    matmul_configure_addrmod<MATH_FIDELITY_DESC, FaceLayout>(transpose, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face);
     const bool reuse_a = ct_dim>=rt_dim;
     const std::uint32_t t_dim = reuse_a ? rt_dim : ct_dim;
     if (t_dim>1) {
@@ -374,11 +379,12 @@ inline void _llk_math_matmul_init_(const std::uint32_t in0_tile_r_dim = TILE_R_D
         TTI_SETC16(CLR_DVALID_SrcA_Disable_ADDR32, 0);
     }
 
-    matmul_configure_mop<NUM_FIDELITY_PHASES>(transpose>0, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face);
+    constexpr int MATH_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC);
+    matmul_configure_mop<MATH_FIDELITY_PHASES, FaceLayout>(transpose>0, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face);
     math::reset_counters(p_setrwc::SET_ABD_F);
 }
 
-template <int NUM_FIDELITY_PHASES>
+template <int MATH_FIDELITY_DESC, DstTileFaceLayout FaceLayout=DstTileFaceLayout::ColMajor>
 inline void _llk_math_matmul_(uint dst_index, const bool transpose=false, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) {
     const bool reuse_a = ct_dim>=rt_dim;
     const std::uint32_t t_dim = reuse_a ? rt_dim : ct_dim;
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h
index 4c77069f857..66879b3ea5a 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel_include.h"
 #include "ckernel_template.h"
@@ -18,16 +19,19 @@ inline void reduce_configure_addrmod();
 template <ReduceDim dim, int num_fidelity_phases>
 inline void reduce_configure_mop();
 
-template <PoolType type, ReduceDim dim, int num_fidelity_phases = 0, bool is_fp32_dest_acc_en = false, bool is_int_fpu_en = false>
+template <PoolType type, ReduceDim dim, int MATH_FIDELITY_DESC = 0, bool is_fp32_dest_acc_en = false, bool is_int_fpu_en = false>
 inline void _llk_math_reduce_(const uint dst_index) {
-    constexpr bool high_fidelity = num_fidelity_phases > 0 && num_fidelity_phases <= 4;
+
+    constexpr int MATH_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC);
+    constexpr bool HIGH_FIDELITY = MATH_FIDELITY_PHASES > 0;
+
     math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(dst_index);
     if constexpr (dim == ReduceDim::REDUCE_ROW) {
         // Transpose for each face in src A done at unpacker, and pool
         if constexpr (type == PoolType::MAX) {
             TTI_GMPOOL(p_setrwc::CLR_AB, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
         } else {
-            if constexpr (high_fidelity) {
+            if constexpr (HIGH_FIDELITY) {
                 ckernel_template::run(instrn_buffer);
                 TTI_CLEARDVALID(p_setrwc::CLR_AB, 0);
             } else {
@@ -38,7 +42,7 @@ inline void _llk_math_reduce_(const uint dst_index) {
         if constexpr (type == PoolType::MAX) {
             TTI_GMPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
         } else {
-            if constexpr (high_fidelity) {
+            if constexpr (HIGH_FIDELITY) {
                 ckernel_template::run(instrn_buffer);
             } else {
                 TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
@@ -97,7 +101,7 @@ inline void _llk_math_reduce_(const uint dst_index) {
         if constexpr (type == PoolType::MAX) {
             TTI_GMPOOL(p_setrwc::CLR_AB, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
         } else {
-            if constexpr (high_fidelity) {
+            if constexpr (HIGH_FIDELITY) {
                 ckernel_template::run(instrn_buffer);
                 TTI_CLEARDVALID(p_setrwc::CLR_AB, 0);
             } else {
@@ -108,7 +112,7 @@ inline void _llk_math_reduce_(const uint dst_index) {
         if constexpr (type == PoolType::MAX) {
             TTI_GMPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
         } else {
-            if constexpr (high_fidelity) {
+            if constexpr (HIGH_FIDELITY) {
                 ckernel_template::run(instrn_buffer);
             } else {
                 TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
@@ -159,7 +163,7 @@ inline void _llk_math_reduce_(const uint dst_index) {
             if constexpr (type == PoolType::MAX) {
                 TTI_GMPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
             } else {
-                if constexpr (high_fidelity) {
+                if constexpr (HIGH_FIDELITY) {
                     ckernel_template::run(instrn_buffer);
                 } else {
                     TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
@@ -171,7 +175,7 @@ inline void _llk_math_reduce_(const uint dst_index) {
             if constexpr (type == PoolType::MAX) {
                 TTI_GMPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
             } else {
-                if constexpr (high_fidelity) {
+                if constexpr (HIGH_FIDELITY) {
                     ckernel_template::run(instrn_buffer);
                 } else {
                     TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
@@ -189,7 +193,7 @@ inline void _llk_math_reduce_(const uint dst_index) {
             if constexpr (type == PoolType::MAX) {
                 TTI_GMPOOL(p_setrwc::CLR_AB, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 4);
             } else {
-                if constexpr (high_fidelity) {
+                if constexpr (HIGH_FIDELITY) {
                     ckernel_template::run(instrn_buffer);
                     TTI_CLEARDVALID(p_setrwc::CLR_AB, 0);
                 } else {
@@ -201,7 +205,7 @@ inline void _llk_math_reduce_(const uint dst_index) {
         if constexpr (type == PoolType::MAX) {
             TTI_GMPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 4);
         } else {
-            if constexpr (high_fidelity) {
+            if constexpr (HIGH_FIDELITY) {
                 ckernel_template::run(instrn_buffer);
             } else {
                 TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 4);
@@ -230,8 +234,8 @@ inline void _llk_math_reduce_(const uint dst_index) {
         if constexpr (type == PoolType::MAX) {
             TTI_GMPOOL(p_setrwc::CLR_AB, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0);
         } else {
-            if constexpr (high_fidelity) {
-                for (int i = 0; i < num_fidelity_phases - 1; i++) {
+            if constexpr (HIGH_FIDELITY) {
+                for (int i = 0; i < MATH_FIDELITY_PHASES - 1; i++) {
                     TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_3, p_gpool::INDEX_DIS, 0);
                 }
             }
@@ -240,8 +244,13 @@ inline void _llk_math_reduce_(const uint dst_index) {
     }
 }
 
-template <PoolType type, bool is_high_fidelity>
+template <PoolType type, int MATH_FIDELITY_DESC>
 inline void reduce_configure_addrmod() {
+
+    constexpr int NUM_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC);
+    constexpr int FIDELITY_INCREMENT = get_math_fidelity_increment(MATH_FIDELITY_DESC);
+    constexpr bool HIGH_FIDELITY = NUM_FIDELITY_PHASES > 0;
+
     addr_mod_t{
         .srca = {.incr = 0 },
         .srcb = {.incr = 0 },
@@ -262,12 +271,12 @@ inline void reduce_configure_addrmod() {
     }
         .set(ADDR_MOD_2);
 
-    if constexpr (is_high_fidelity) {
+    if constexpr (HIGH_FIDELITY) {
         addr_mod_t{
             .srca = {.incr = 0},
             .srcb = {.incr = 0},
             .dest = {.incr = 0},
-            .fidelity = { .incr = 1}
+            .fidelity = { .incr = FIDELITY_INCREMENT}
         }.set(ADDR_MOD_3);
     }
 }
@@ -293,14 +302,15 @@ inline void reduce_configure_mop() {
     }
 }
 
-template <PoolType type, ReduceDim dim, int num_fidelity_phases = 0>
+template <PoolType type, ReduceDim dim, int MATH_FIDELITY_DESC = 0>
 inline void _llk_math_reduce_init_(const std::uint32_t within_face_16x16_transpose=0) { //within_face_16x16_transpose used for unpack, ignored by math
 
-    constexpr bool high_fidelity = num_fidelity_phases > 0 && num_fidelity_phases <= 4;
+    constexpr int MATH_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC);
+    constexpr bool HIGH_FIDELITY = MATH_FIDELITY_PHASES > 0;
 
-    reduce_configure_addrmod<type, high_fidelity>();
-    if constexpr (high_fidelity) {
-        reduce_configure_mop<dim, num_fidelity_phases>();
+    reduce_configure_addrmod<type, MATH_FIDELITY_DESC>();
+    if constexpr (HIGH_FIDELITY) {
+        reduce_configure_mop<dim, MATH_FIDELITY_PHASES>();
     }
 
     TTI_SETC16(CLR_DVALID_SrcA_Disable_ADDR32, 0);
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h
index 7df83739dc9..cb4a26dbf1a 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "llk_defs.h"
 
@@ -207,3 +208,5 @@ inline void _llk_pack_(const std::uint32_t tile_index, const std::uint32_t addre
         TTI_PACR(ADDR_MOD_2, 0, 0xf, 0, 0, 1, 1); // close tile
     }
 }
+
+#include "llk_pack_untilize.h"
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h
index 88dbdb186a9..5f796f4c5b3 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include "ckernel.h"
@@ -187,8 +188,8 @@ inline void _llk_pack_reduce_mask_config_() {
 
     // We initialize PCK_EDGE_OFFSET_SEC0 mask to clear out all the datums in the row
     pack_edge_offset.f.mask = 0x0;
-    uint32_t row_set_mapping_1;
-    uint32_t edge_offset_sec1_mask;
+    uint32_t row_set_mapping_1 = 0;
+    uint32_t edge_offset_sec1_mask = 0;
 
     if constexpr (dim == ReduceDim::REDUCE_ROW) {
         // PCK_EDGE_OFFSET_SEC1 mask will clear out all the datums in the row except the first one
@@ -223,6 +224,24 @@ inline void _llk_pack_reduce_mask_config_() {
             // TILE_ROW_SET_MAPPING_1 configuration sets only first row to use PCK_EDGE_OFFSET_SEC1 mask
             row_set_mapping_1 = 0x00000001; // each packer packs 1x16 row
         }
+    } else if constexpr (dim == ReduceDim::REDUCE_SCALAR) {
+        // PCK_EDGE_OFFSET_SEC1 mask will clear out all the datums in the row except the first one
+        edge_offset_sec1_mask = 0x0001;
+        if constexpr (untilize) {
+            pack_edge_offset.f.tile_row_set_select_pack0 = 1;
+            pack_edge_offset.f.tile_row_set_select_pack1 = 1;
+            pack_edge_offset.f.tile_row_set_select_pack2 = 1;
+            pack_edge_offset.f.tile_row_set_select_pack3 = 1;
+            row_set_mapping_1 = 0x00000005;
+        } else {
+            // Packer 0 and 2 will use TILE_ROW_SET_MAPPING_1, while packer 1 and 3 will keep using
+            // TILE_ROW_SET_MAPPING_0 configuration which is the default one
+            pack_edge_offset.f.tile_row_set_select_pack0 = 1;
+            pack_edge_offset.f.tile_row_set_select_pack2 = 1;
+
+            // TILE_ROW_SET_MAPPING_1 configuration sets only first row to use PCK_EDGE_OFFSET_SEC1 mask
+            row_set_mapping_1 = 0x00000001;
+        }
     }
 
     // Initialize TMP registers with values we need to write in CFG registers
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_untilize.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_untilize.h
new file mode 100644
index 00000000000..a1defc58dde
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_untilize.h
@@ -0,0 +1,71 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+
+#pragma once
+#include "llk_defs.h"
+
+#include "ckernel.h"
+#include "ckernel_template.h"
+#include "llk_pack_common.h"
+#include "ckernel_globals.h"
+
+using namespace ckernel;
+using namespace ckernel::packer;
+
+inline void _llk_pack_untilize_configure_addrmod_() {
+
+    addr_mod_pack_t{
+        .y_src = {.incr = 15}, // 4-bit value so max is 15. incadcxy will increment it by 1
+    }
+    .set(ADDR_MOD_0);
+
+    addr_mod_pack_t{
+        .y_src = { .incr = 0, .clr = 0, .cr = 1  },
+    }.set(ADDR_MOD_1);
+
+    addr_mod_pack_t{
+        .y_src = { .incr = 0, .clr = 1, .cr = 0  },
+    }.set(ADDR_MOD_2);
+
+}
+
+template <std::uint32_t block_ct_dim>
+inline void _llk_pack_untilize_mop_config_() {
+    const uint PACKCNT = 4;
+    constexpr uint MEGAROW = 1;
+    constexpr uint ZERO_OUTPUT_FLAG = p_pacr::P_ZERO_OUTPUT_DISABLED;
+    constexpr uint MOP_INNER_LOOP = 1;
+
+    constexpr uint MOP_OUTER_LOOP = block_ct_dim;
+
+    // Inc ch0_y+=1 (addr_mod_0 will increment by 15)
+    ckernel::ckernel_template tmp(MOP_OUTER_LOOP, MOP_INNER_LOOP, TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 1, 0));
+    tmp.set_start_op(TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0));
+    tmp.set_end_ops(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0),
+                    TT_OP_INCADCZW(p_setadc::PAC, 0, 0, 1, 0)); // w cnt points to the next tile
+    tmp.program(instrn_buffer);
+}
+
+template <std::uint32_t block_ct_dim>
+inline void _llk_pack_untilize_init_() {
+
+    _llk_pack_untilize_configure_addrmod_();
+
+    _llk_pack_untilize_mop_config_<block_ct_dim>();
+}
+
+template <std::uint32_t block_ct_dim>
+inline void _llk_pack_untilize_(const std::uint32_t address, const std::uint32_t pack_dst_format) {
+
+    program_packer_untilized_destination<block_ct_dim>(address, pack_dst_format);
+
+    for (std::uint32_t row=0; row<TILE_R_DIM/4; row++) {
+        TTI_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_W, 0); // Clear tile counter
+        ckernel::ckernel_template::run(instrn_buffer);
+        TTI_ADDRCRXY(p_setadc::PAC, 0, 0, 1, 0, 0b0010); // Read new row in the tile
+    }
+
+    TTI_PACR(ADDR_MOD_2, 0, 0xf, 0, 0, 1, 1); // close block
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h
index 61dd252e81e..af9a75150d9 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel.h"
 #include "ckernel_defs.h"
@@ -49,7 +50,7 @@ inline void _llk_unpack_A_mop_config_(const bool transpose_of_faces, const std::
         static constexpr uint srcb_clear_z = TT_OP_SETADCZW(p_setadc::UNP_B, 0, 0, 0, 0, 0b0001); // set srcB ch0_z = 0
     #endif
 
-    if (unpack_to_dest && unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) {
+    if (unpack_to_dest && is_32bit_input(unpack_src_format, unpack_dst_format)) {
         const uint32_t outerloop = num_faces;
         constexpr uint32_t innerloop = 1;
         ckernel_template tmp(outerloop, innerloop, unpack_srca_to_dest);
@@ -136,10 +137,13 @@ inline void _llk_unpack_A_mop_config_(const bool transpose_of_faces, const std::
     }
 }
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void _llk_unpack_A_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM,  const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) {
     constexpr bool is_row_pool = false;
-    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>(
+    constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All);
+    constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu);
+    constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack);
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, fpu_srnd_en, pack_srnd_en>(
         unpack_src_format,
         unpack_src_format,
         unpack_dst_format,
@@ -154,7 +158,7 @@ inline void _llk_unpack_A_hw_configure_(const std::uint32_t unpack_src_format, c
 template <BroadcastType BType = BroadcastType::NONE, bool acc_to_dest = false, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, bool unpack_to_dest = false>
 inline void _llk_unpack_A_init_(const std::uint32_t transpose_of_faces=0, const std::uint32_t within_face_16x16_transpose=0, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const std::uint32_t unpack_src_format = 0, const std::uint32_t unpack_dst_format = 0) {
     constexpr std::uint32_t UNP_SEL = (BType == BroadcastType::NONE) ? p_setadc::UNP_A : p_setadc::UNP_B;
-    config_face_dim<false, UNP_SEL>(face_r_dim);
+    config_unpacker_x_end<UNP_SEL>(face_r_dim);
     _llk_unpack_A_mop_config_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(transpose_of_faces>0, num_faces, unpack_src_format, unpack_dst_format);
 }
 
@@ -195,7 +199,7 @@ inline void _llk_unpack_A_(const std::uint32_t address, const bool transpose_of_
     }
 
     if constexpr (unpack_to_dest) {
-        if (unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) {
+        if (is_32bit_input(unpack_src_format, unpack_dst_format)) {
             set_dst_write_addr(unp_cfg_context, unpack_dst_format);
             wait_for_dest_available();
         }
@@ -208,7 +212,7 @@ inline void _llk_unpack_A_(const std::uint32_t address, const bool transpose_of_
     t6_semaphore_get(semaphore::UNPACK_SYNC);
 
     if (unpack_to_dest) {
-        if (unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) {
+        if (is_32bit_input(unpack_src_format, unpack_dst_format)) {
             unpack_to_dest_tile_done(unp_cfg_context);
         }
     }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h
index 0f6d54f2909..fb10f53d08e 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel.h"
 #include "ckernel_defs.h"
@@ -70,10 +71,13 @@ inline void _llk_unpack_AB_mop_config_(const bool transpose_of_faces=false, cons
 
 }
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void _llk_unpack_AB_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format,  const std::uint32_t face_r_dim = FACE_R_DIM,  const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) {
     constexpr bool is_row_pool = false;
-    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>(
+    constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All);
+    constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu);
+    constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack);
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, fpu_srnd_en, pack_srnd_en>(
         unpA_src_format,
         unpB_src_format,
         unpA_dst_format,
@@ -91,7 +95,7 @@ inline void _llk_unpack_AB_init_(const std::uint32_t face_r_dim=FACE_R_DIM, cons
     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(transpose); // transpose within the face
 
     constexpr std::uint32_t UNP_SEL = p_setadc::UNP_AB;
-    config_face_dim<false, UNP_SEL>(face_r_dim);
+    config_unpacker_x_end<UNP_SEL>(face_r_dim);
 
     _llk_unpack_AB_mop_config_<BType>(transpose>0, num_faces, narrow_tile); // transpose of faces 0,2,1,3
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h
index 4578126b9e8..6759e3b3065 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel.h"
 #include "ckernel_defs.h"
@@ -90,12 +91,15 @@ inline void _llk_unpack_AB_matmul_mop_config_(const bool transpose, const std::u
 
 }
 
-template<bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template<bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void _llk_unpack_AB_matmul_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format,  const std::uint32_t unpA_face_r_dim = FACE_R_DIM, const std::uint32_t unpB_face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t unpA_num_faces = 4, const std::uint32_t unpB_num_faces = 4, const std::uint32_t unpA_tile_size = 0, const std::uint32_t unpB_tile_size = 0) {
 
     constexpr bool is_row_pool = false;
+    constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All);
+    constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu);
+    constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack);
 
-    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>(
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, fpu_srnd_en, pack_srnd_en>(
         unpA_src_format,
         unpB_src_format,
         unpA_dst_format,
@@ -136,7 +140,7 @@ __attribute__((always_inline)) inline void _llk_unpack_AB_matmul_init_(const std
     if (partial_face) {
         // Do face by face unpacking. Need to program correct face dim
         // to compute address of the next face
-        config_face_dim<false, p_setadc::UNP_B>(unpB_face_r_dim);
+        config_unpacker_x_end<p_setadc::UNP_B>(unpB_face_r_dim);
     } else {
         // Do full tile unpacking. No need to program face dim
         // as address counter pointing to the face is not incremented
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h
index 92222ddaaa3..cb747267e34 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 
 #include "ckernel.h"
@@ -38,7 +39,7 @@ void _llk_zero_buffer_(const std::uint32_t base_address, const std::uint32_t siz
 
 template <bool mail2math=true, bool mail2pack=true>
 inline void _llk_unpack_get_tile_(std::uint32_t address, std::uint32_t *p_tile) {
-    std::uint32_t byte_address = (address + TILE_HEADER_SIZE)<<4;
+    std::uint32_t byte_address = (address)<<4;
 
     if constexpr (mail2math) {
        mailbox_write(ThreadId::MathThreadId, byte_address);
@@ -66,29 +67,28 @@ inline void _llk_unpack_debug_dump_seek_(std::uint8_t offset) {
     debug_dump_seek(offset);
 }
 
-template <bool is_tile_dim_reconfig_en = false>
-inline void _llk_unpack_reconfig_data_format_srca_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4)
+inline void _llk_unpack_config_tile_dim_srca_impl_(const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4)
 {
-    if constexpr(is_tile_dim_reconfig_en) {
-        const uint face_dim = face_r_dim*FACE_C_DIM;
+    cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32+1, 16, 0xffff0000>(num_faces); // num_faces -> upper 16 bits of the second SEC0 tile-descriptor word
+    config_unpacker_0_face_dim<true, p_setadc::UNP_A>(face_r_dim); // face dimension is programmed through the unpacker-0 helper from face_r_dim
+}
 
-        cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32+1, 16, 0xffff0000>(num_faces);
-        cfg_reg_rmw_tensix<THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32, 0, 0xffffffff>(face_dim | face_dim << 16);
-    }
+inline void _llk_unpack_config_tile_dim_srcb_impl_(const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4)
+{ // Writes the srcB (SEC1) tile-descriptor fields that depend on tile geometry
+    const uint face_dim = face_r_dim*FACE_C_DIM; // elements per face: rows times FACE_C_DIM columns
+    cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 16, 0xffff0000>(face_dim); // face_dim -> upper 16 bits of the first descriptor word
+    cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32+1, 16, 0xffff0000>(num_faces); // num_faces -> upper 16 bits of the second descriptor word
+}
+
+inline void _llk_unpack_reconfig_data_format_srca_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size)
+{ // Rewrites the SEC0 input data format (low 4 bits of the tile descriptor), the output data format, and the TILE_SIZE_A GPR; tile dimensions are not touched here
     cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32, 0, 0x0f>(unpack_src_format);
     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Out_data_format_RMW>(unpack_dst_format);
     TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_A)); // update gpr which holds tile size A
 }
 
-template <bool is_tile_dim_reconfig_en = false>
-inline void _llk_unpack_reconfig_data_format_srcb_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4)
+inline void _llk_unpack_reconfig_data_format_srcb_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size)
 {
-    if constexpr(is_tile_dim_reconfig_en) {
-        const uint face_dim = face_r_dim*FACE_C_DIM;
-
-        cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 16, 0xffff0000>(face_r_dim*FACE_C_DIM);
-        cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32+1, 16, 0xffff0000>(num_faces);
-    }
     cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 0, 0x0f>(unpack_src_format);
     cfg_reg_rmw_tensix<THCON_SEC1_REG2_Out_data_format_RMW>(unpack_dst_format);
     TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_B)); // update gpr which holds tile size B
@@ -100,5 +100,5 @@ inline void _llk_unpack_dbg_feature_disable_(){
 }
 
 inline void _llk_enable_int8_fpu_math_() {
-    enalbe_int8_fpu_math();
+    enable_int8_fpu_math();
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h
index 8f0ea52e4fa..0fdaae9df61 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel.h"
 #include "ckernel_defs.h"
@@ -40,12 +41,15 @@ inline void _llk_unpack_reduce_mop_config_() {
     tmp.program(instrn_buffer);
 }
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void _llk_unpack_reduce_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format,  const std::uint32_t unpA_face_r_dim = FACE_R_DIM, const std::uint32_t unpB_face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t unpA_num_faces = 4, const std::uint32_t unpB_num_faces = 4) {
 
     constexpr bool is_row_pool = true;
+    constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All);
+    constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu);
+    constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack);
 
-    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>(
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, fpu_srnd_en, pack_srnd_en>(
         unpA_src_format,
         unpB_src_format,
         unpA_dst_format,
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h
index ae1b22d830e..b695e2f296a 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel.h"
 #include "ckernel_defs.h"
@@ -30,12 +31,15 @@ inline void _llk_unpack_tilize_mop_config_(const bool narrow_tile=false) {
     tmp.program(instrn_buffer);
 }
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void _llk_unpack_tilize_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM,  const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) {
 
     constexpr bool is_row_pool = false;
+    constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All);
+    constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu);
+    constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack);
 
-    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>(
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, fpu_srnd_en, pack_srnd_en>(
         unpack_src_format,
         unpack_src_format,
         unpack_dst_format,
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h
index 723f9716c88..f67b72f5cee 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+
 #pragma once
 #include "ckernel.h"
 #include "ckernel_defs.h"
@@ -48,10 +49,13 @@ inline void _llk_unpack_untilize_mop_config_() {
     tmp.program(instrn_buffer);
 }
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void _llk_unpack_untilize_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM,  const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) {
     constexpr bool is_row_pool = false;
-    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, stoch_rnd_mode>(
+    constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All);
+    constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu);
+    constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack);
+    configure_unpack_AB<is_row_pool, is_fp32_dest_acc_en, fpu_srnd_en, pack_srnd_en>(
         unpack_src_format,
         unpack_src_format,
         unpack_dst_format,
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h
index ff64fb27b2d..513a0a15972 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h
@@ -40,7 +40,7 @@ inline void llk_math_matmul_init(
         rt_dim,
         kt_dim);
 #else
-    _llk_math_matmul_init_<NUM_FIDELITY_PHASES>(
+    _llk_math_matmul_init_<NUM_FIDELITY_PHASES, DstTileFaceLayout::RowMajor>(
         in0_tile_r_dim,
         in0_tile_c_dim,
         in1_tile_r_dim,
@@ -63,6 +63,6 @@ inline void llk_math_matmul(
 #ifdef ARCH_GRAYSKULL
     _llk_math_matmul_<NUM_FIDELITY_PHASES, FaceLayout>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
 #else
-    _llk_math_matmul_<NUM_FIDELITY_PHASES>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
+    _llk_math_matmul_<NUM_FIDELITY_PHASES, DstTileFaceLayout::RowMajor>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
 #endif
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
index 17bba18f12a..898788ca415 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
@@ -13,36 +13,6 @@ namespace ckernel {
 * LLK ELTWISE UNARY SFPU
 *************************************************************************/
 
-template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull, bool IS_INT_SFPU_EN=false>
-inline void llk_math_eltwise_unary_sfpu(
-    uint dst_index,
-    int vector_mode = (int)Dim::RC,
-    uint param0 = 0,
-    uint param1 = 0,
-    uint param2 = 0,
-    uint param3 = 0,
-    uint param4 = 0,
-    uint param5 = 0) {
-
-    const std::uint32_t operand_id = get_operand_id(0);
-    const std::uint32_t num_faces = get_operand_num_faces(0);
-    const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
-
-    _llk_math_eltwise_unary_sfpu_<sfpu_op, APPROXIMATE, Dst, IS_INT_SFPU_EN>(
-        face_r_dim,
-        num_faces,
-        dst_index,
-        vector_mode,
-        param0,
-        param1,
-        param2,
-        param3,
-        param4,
-        param5
-    );
-}
-
-
 // New LLK SFPU APIs
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
 inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) {
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
index 1e57d003cfc..249f62bc71b 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
@@ -272,3 +272,36 @@ inline void llk_pack_reduce_mask_config() {
 inline void llk_pack_reduce_mask_clear() {
     _llk_pack_reduce_mask_clear_();
 }
+
+// FIXME-WH-UPLIFT
+template <ReduceDim dim, bool at_kernel_start = false, bool revert=false, bool is_fp32_dest_acc_en = false>
+inline void llk_pack_reduce_config_v2(uint32_t icb_out) {
+    // Sets up the packer for a reduce along `dim`: full pack HW configure when at_kernel_start, otherwise a pack stall + tensix_sync(), then the reduce mask config
+    const bool untilize = false;
+    if constexpr (at_kernel_start) {
+        // Collect the output operand's tile geometry, page size and a no-ReLU config for the HW configure call
+        const std::uint32_t output_id = get_output_id(icb_out);
+        const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+        const std::uint32_t num_faces = get_output_num_faces(output_id);
+        const bool partial_face = get_output_partial_face(output_id);
+        const bool narrow_tile = get_output_narrow_tile(output_id);
+        const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size;
+        const llk_relu_config_u relu_config = {.f = {.ApplyRelu = (std::uint32_t)ReluType::NO_RELU, .Threshold = 0,}};
+
+        _llk_pack_hw_configure_<untilize, is_fp32_dest_acc_en>(
+            pack_src_format[output_id],
+            pack_dst_format[output_id],
+            tile_size,
+            face_r_dim,
+            num_faces,
+            partial_face,
+            narrow_tile,
+            relu_config.val
+        );
+    } else {
+        TTI_STALLWAIT(p_stall::STALL_PACK, p_stall::PACK);
+        tensix_sync();
+    }
+    // Runs on both paths. NOTE(review): template parameter `revert` is never referenced in this body -- confirm intent
+    _llk_pack_reduce_mask_config_<untilize, dim>();
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h
index eb7928d3090..e8bbec37fc6 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h
@@ -28,7 +28,7 @@ inline void calculate_elu(uint slope)
     vFloat s = c_slope.f;
 
     #pragma GCC unroll 0
-    for (int d = 0; d < WHB0_ITERATIONS; d++) {
+    for (int d = 0; d < 8; d++) {
         vFloat v = dst_reg[0];
 
         v_if (v < 0.0f) {
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h
index 42d4a30a7ce..cec5879a69a 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h
@@ -33,7 +33,7 @@ sfpi_inline vFloat calculate_erf_body(vFloat x) {
 // TODO: Fix assertion error for accurate mode
 template <bool APPROXIMATION_MODE>
 inline void calculate_erf() {
-    for (int d = 0; d < WHB0_ITERATIONS; d++) {
+    for (int d = 0; d < 8; d++) {
         // SFPU microcode:
         vFloat x = dst_reg[0];
         v_if(x < 0.0f) {
@@ -51,7 +51,7 @@ inline void calculate_erf() {
 template <bool APPROXIMATION_MODE>
 inline void calculate_erfc() {
 // SFPU microcode:
-    for (int d = 0; d < WHB0_ITERATIONS; d++) {
+    for (int d = 0; d < 8; d++) {
         vFloat x = dst_reg[0];
         v_if(x < 0.0f) { x = 1.0 + (calculate_erf_body<APPROXIMATION_MODE>(x)); }
         v_else { x = 1.0 - (calculate_erf_body<APPROXIMATION_MODE>(x)); }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h
index 56d1a98cb69..ea77be75900 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h
@@ -59,7 +59,7 @@ template <bool APPROXIMATION_MODE>
 inline void calculate_erfinv()
 {
     // SFPU microcode
-    for (int d = 0; d < WHB0_ITERATIONS; d++)
+    for (int d = 0; d < 8; d++)
     {
         vFloat v = dst_reg[0];
         v_if (v == 1.0f) {
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h
index 6301e66ba55..a60ef1c4628 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h
@@ -89,16 +89,16 @@ template <bool APPROXIMATION_MODE>
 void gelu_init() {
     vConstFloatPrgm0 = 0.5f;
     if constexpr (APPROXIMATION_MODE) {
-        sfpu_load_imm32(0,0x37E7322B);
-        //sfpu_load_imm32(4,0xB122A3AE);
-        sfpu_load_imm32(4,0xB12286D8);
+        _sfpu_load_imm32_(0,0x37E7322B);
+        //_sfpu_load_imm32_(4,0xB122A3AE);
+        _sfpu_load_imm32_(4,0xB12286D8);
 
 
-        sfpu_load_imm32(1,0x38E138F3);
-        sfpu_load_imm32(5,0xB437B479);
+        _sfpu_load_imm32_(1,0x38E138F3);
+        _sfpu_load_imm32_(5,0xB437B479);
 
-        sfpu_load_imm32(2,0x38003852);
-        sfpu_load_imm32(6,0x7c00afa4);
+        _sfpu_load_imm32_(2,0x38003852);
+        _sfpu_load_imm32_(6,0x7c00afa4);
     }
 }
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h
index 5aaac5b8000..b8c818c8bbe 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h
@@ -19,7 +19,7 @@ inline void calculate_i0()
 {
     #pragma GCC unroll 0
 
-    for (int d = 0; d < WHB0_ITERATIONS; d++)
+    for (int d = 0; d < 8; d++)
     {
         vFloat result = 0.0f;
         vFloat input = dst_reg[0];
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h
index 03b72c8962d..4aaadf3c305 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h
@@ -19,7 +19,7 @@ template <bool APPROXIMATION_MODE>
 inline void calculate_logical_not_unary()
 {
     #pragma GCC unroll 0
-    for (int d = 0; d < WHB0_ITERATIONS; d++) {
+    for (int d = 0; d < 8; d++) {
         vFloat v = dst_reg[0];
         v_if (v == 0) {
             dst_reg[0] = 1.0f;
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h
index f75819f4239..4ab63536c78 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h
@@ -20,7 +20,7 @@ template <bool APPROXIMATION_MODE>
 inline void relu_min(uint uint_threshold)
 {
     vFloat threshold = Converter::to_float(uint_threshold);
-    for (int d = 0; d < WHB0_ITERATIONS; d++)
+    for (int d = 0; d < 8; d++)
     {
         vFloat a = dst_reg[0];
         v_if(a < threshold) {
@@ -37,7 +37,7 @@ template <bool APPROXIMATION_MODE>
 inline void relu_max(uint uint_threshold)
 {
     vFloat threshold = Converter::to_float(uint_threshold);
-    for (int d = 0; d < WHB0_ITERATIONS; d++)
+    for (int d = 0; d < 8; d++)
     {
         vFloat a = dst_reg[0];
         v_if(a > threshold) {
@@ -62,7 +62,7 @@ inline void calculate_lrelu(uint slope)
     vFloat s = c_slope.f;
 
     #pragma GCC unroll 0
-    for (int d = 0; d < WHB0_ITERATIONS; d++) {
+    for (int d = 0; d < 8; d++) {
         vFloat v = dst_reg[0];
 
         v_if (v < 0.0f) {
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
index f1e7d19acc8..cc08a9f346c 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
@@ -5,6 +5,7 @@
 #pragma once
 #include <type_traits>
 
+#include "llk_sfpu_types.h"
 #include "ckernel_globals.h"
 #include "ckernel_include.h"
 #include "ckernel_template.h"
@@ -16,3 +17,157 @@
 #include "llk_math_eltwise_unary_sfpu.h"
 
 using namespace ckernel;
+using namespace ckernel::sfpu;
+namespace ckernel {
+
+/*************************************************************************
+ * LLK ELTWISE UNARY SFPU
+ *************************************************************************/
+
+template <
+    SfpuType operation,
+    bool APPROXIMATION_MODE,
+    int SfpuType_PARAM = 0,
+    int ITERATIONS = 8,
+    bool IS_INT_SFPU_EN = false> // NOTE(review): IS_INT_SFPU_EN is not referenced in this function body
+inline void llk_math_calculate_sfpu( // Compile-time dispatch from `operation` to the matching SFPU calculate routine
+    const int iterations = ITERATIONS,
+    uint param0 = 0,
+    uint param1 = 0,
+    uint param2 = 0,
+    uint param3 = 0,
+    uint param4 = 0,
+    uint param5 = 0) { // param3..param5 are accepted but not read by any branch below
+    if constexpr (operation == SfpuType::exp_with_base) {
+        constexpr bool zero_negative = true;
+        _calculate_exponential_<APPROXIMATION_MODE, zero_negative, true, ITERATIONS>(iterations, param0); // runtime `iterations` is forwarded only here and in the tanh/hardtanh branches
+    } else if constexpr (operation == SfpuType::tanh) {
+        _calculate_tanh_<APPROXIMATION_MODE, ITERATIONS>(iterations);
+    } else if constexpr (operation == SfpuType::hardtanh) {
+        _calculate_hardtanh_<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1, param2);
+    } else if constexpr (operation == SfpuType::rsqrt) {
+        // param0 = true  -> approximate fast mode
+        //          false -> high precision mode
+        //  The algorithm uses Newton's method; a higher number of iterations yields a better approximation
+        if (param0) {
+            calculate_rsqrt<true, ITERATIONS, 10>();
+        } else {
+            calculate_rsqrt<false, ITERATIONS, 25>();
+        }
+    } else if constexpr (operation == SfpuType::sigmoid) {
+        calculate_sigmoid<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::sigmoid_appx) {
+        calculate_sigmoid_appx<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::tanh_derivative) {
+        calculate_tanh_derivative<APPROXIMATION_MODE, SfpuType_PARAM, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::dropout) {
+        calculate_dropout<APPROXIMATION_MODE, ITERATIONS>(param0, param1);
+    } else if constexpr (operation == SfpuType::power) {
+        calculate_power_iterative<APPROXIMATION_MODE, ITERATIONS>(param0);
+    } else if constexpr (operation == SfpuType::square) {
+        calculate_square<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::log) {
+        calculate_log<APPROXIMATION_MODE, false, ITERATIONS>(param0);
+    } else if constexpr (operation == SfpuType::log_with_base) {
+        calculate_log<APPROXIMATION_MODE, true, ITERATIONS>(param0);
+    } else if constexpr (
+        (operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) ||
+        (operation == SfpuType::less_than_zero) || (operation == SfpuType::greater_than_equal_zero) ||
+        (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) {
+        calculate_comp<APPROXIMATION_MODE, operation, ITERATIONS>(8);  // BFLOAT16 - exp
+    } else if constexpr (operation == SfpuType::clamp) {
+        calculate_clamp<APPROXIMATION_MODE, ITERATIONS>(param0, param1, param2);
+    } else if constexpr (operation == SfpuType::abs) {
+        calculate_abs<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::sign) {
+        calculate_sign<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::max) {
+        calculate_max<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::min) {
+        calculate_min<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::exp2) {
+        calculate_exp2<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::heaviside) {
+        calculate_heaviside<APPROXIMATION_MODE, ITERATIONS>(param0);
+    } else if constexpr (operation == SfpuType::expm1) {
+        calculate_expm1<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::asin) {
+        calculate_asin<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::acos) {
+        calculate_acos<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::atan) {
+        calculate_atan<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::signbit) {
+        calculate_signbit<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::silu) {
+        calculate_silu<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::mask) {
+        calculate_mask<APPROXIMATION_MODE, ITERATIONS>();
+    } else if constexpr (operation == SfpuType::negative) {
+        calculate_negative<APPROXIMATION_MODE, ITERATIONS>();
+    } // any SfpuType not listed above matches no branch, so nothing is emitted for it
+}
+
+template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull, bool IS_INT_SFPU_EN = false>
+inline void llk_math_eltwise_unary_sfpu(
+    uint dst_index,
+    int vector_mode = (int)Dim::RC,
+    uint param0 = 0,
+    uint param1 = 0,
+    uint param2 = 0,
+    uint param3 = 0,
+    uint param4 = 0,
+    uint param5 = 0) {
+    const std::uint32_t operand_id = get_operand_id(0); // Fixed to operand 0; assumes no tiny-tile support
+    const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+    const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+    // Applies sfpu_op to the dest tile at dst_index; vector_mode selects the faces: Dim::R, Dim::C, or (any other value) all four
+    constexpr int ITERATIONS = 8;
+
+    _llk_math_eltwise_unary_sfpu_start_<Dst>(dst_index);
+
+    if (vector_mode == (int)Dim::R) {
+        // Do a row vector, Face0 + Face1 -- first iteration (first row)
+        const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 2 : face_r_dim / 2)
+                                               : 2;  // At least 2 iterations for odd and even columns
+#pragma GCC unroll 0
+        for (int face = 0; face < 2; face++) {
+            llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+                iterations, param0, param1, param2, param3, param4, param5);
+            // Move to the next face
+            _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+        }
+        // Skip next two faces
+        _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+        _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+    } else if (vector_mode == (int)Dim::C) {
+        // Do a column vector: Face0 + Face2 if the tile is 32x32, or Face0 + Face1 if the tile is 32x16 -- all iterations
+        // for a full face
+#pragma GCC unroll 0
+        for (int face = 0; face < 2; face++) {
+            llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+                ITERATIONS, param0, param1, param2, param3, param4, param5);
+            _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+            if (num_faces > 2) {  // Skip next face if tile is 32x32
+                _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+            }
+        }
+        if (num_faces <= 2) {
+            // Skip next two faces
+            _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+            _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+        }
+    } else {
+        // Do all four faces, and iterate through all 4 blocks of 4 rows each
+#pragma GCC unroll 0
+        for (int face = 0; face < 4; face++) {
+            llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+                ITERATIONS, param0, param1, param2, param3, param4, param5);
+            // Move to the next face
+            _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+        }
+    }
+    _llk_math_eltwise_unary_sfpu_done_();
+}
+
+}  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
index b82b1f39cb4..e43682ab4f2 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
@@ -30,16 +30,41 @@ inline void llk_math_eltwise_unary_sfpu_init(void (*func)()) {
 
 template <SfpuType sfpu_op, bool APPROXIMATE>
 inline void llk_math_eltwise_unary_sfpu_init(
-    uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) {
-
-    _llk_math_eltwise_unary_sfpu_init_<sfpu_op, APPROXIMATE>(
-        param0,
-        param1,
-        param2,
-        param3,
-        param4,
-        param5
-    );
+    const uint param0 = 0, const uint param1 = 0, const uint param2 = 0, const uint param3 = 0, const uint param4 = 0, const uint param5 = 0) {
+    _llk_math_eltwise_unary_sfpu_init_();  // common init, called unconditionally for every sfpu_op
+
+    switch (sfpu_op) {  // sfpu_op is a template parameter; picks the op-specific init, if any
+        case SfpuType::reciprocal:
+            sfpu::_init_reciprocal_<APPROXIMATE>();
+            break;
+        case SfpuType::exponential:
+            sfpu::_init_exponential_<APPROXIMATE>();
+            break;
+        case SfpuType::log:
+            sfpu::_init_log_<APPROXIMATE>();
+            break;
+        case SfpuType::sqrt:
+            sfpu::_init_sqrt_<APPROXIMATE>();
+            break;
+        case SfpuType::tanh:  // tanh and tanh_derivative share the same init routine
+        case SfpuType::tanh_derivative:
+            sfpu::_init_tanh_<APPROXIMATE>();
+            break;
+        case SfpuType::sigmoid:
+            sfpu::_init_sigmoid_<APPROXIMATE>();
+            break;
+        case SfpuType::gelu_derivative:
+            sfpu::_init_gelu_derivative_<APPROXIMATE>();
+            break;
+        case SfpuType::gelu:
+            sfpu::_init_gelu_<APPROXIMATE>();
+            break;
+        case SfpuType::dropout:
+            sfpu::_init_dropout_(param2);  // only param2 is consumed; param0/1/3/4/5 are not read in this function
+            break;
+        default:  // every other SfpuType gets no op-specific init
+            break;
+    }
 }
 
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
index 72c27cde02b..4c059e37585 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "llk_sfpu_types.h"
 #include "ckernel_defs.h"
 #include "ckernel_sfpu.h"
 #include "ckernel.h"
@@ -338,7 +339,7 @@ inline void calculate_comp(uint exponent_size_8)
 
 	//a[i] == 0
 	if constexpr(COMP_MODE == SfpuType::equal_zero) {
-	    v_if (sfpu_is_fp16_zero(v, exponent_size_8)) {
+	    v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
 	      v = one;
 	    } v_else {
 	      v = zero;
@@ -348,7 +349,7 @@ inline void calculate_comp(uint exponent_size_8)
 
 	//a[i] != 0
 	if constexpr(COMP_MODE == SfpuType::not_equal_zero) {
-	    v_if (sfpu_is_fp16_zero(v, exponent_size_8)) {
+	    v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
 	      v = zero;
 	    } v_else {
 	      v = one;
@@ -774,5 +775,20 @@ inline void calculate_silu()
     }
 }
 
+template <bool APPROXIMATION_MODE, int ITERATIONS>  // APPROXIMATION_MODE is not referenced in the body
+inline void calculate_mask()  // Writes 0 to dst_reg[0] wherever the value read from dst_reg[32] tests as zero
+{
+    bool exponent_size_8 = true;  // forwarded unchanged to the zero test below
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat mask = dst_reg[32];  // NOTE(review): offset 32 presumably addresses the mask operand -- confirm
+        v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) {  // renamed helper, matching calculate_comp in this file
+            dst_reg[0] = 0;
+        }
+        v_endif;
+        dst_reg++;  // step dst_reg so the next iteration reads/writes the following entries
+    }
+}
+
 } // namespace sfpu
 } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h
new file mode 100644
index 00000000000..ce290b5ef6c
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+enum SfpuType {  // SFPU operation selector; used e.g. as the sfpu_op template argument of llk_math_eltwise_unary_sfpu_init
+    tanh,  // first enumerator, value 0; the rest follow consecutively, so reordering changes their numeric values
+    hardtanh,
+    gelu,
+    exponential,
+    exp_with_base,
+    sigmoid,
+    reciprocal,
+    sqrt,
+    lrelu,
+    power,
+    square,
+    tanh_derivative,
+    log,
+    log_with_base,
+    equal_zero,
+    not_equal_zero,
+    less_than_zero,
+    greater_than_equal_zero,
+    less_than_equal_zero,
+    greater_than_zero,
+    clamp,
+    gelu_derivative,
+    dropout,
+    abs,
+    sign,
+    max,
+    sine,
+    cosine,
+    tan,
+    relu_max,
+    relu_min,
+    cast_fp32_to_fp16a,
+    sigmoid_appx,
+    gelu_appx,
+    elu,
+    min,
+    exp2,
+    heaviside,
+    expm1,
+    signbit,
+    asin,
+    acos,
+    atan,
+    erf,
+    erfc,
+    rsqrt,
+    isfinite,
+    isinf,
+    isposinf,
+    isneginf,
+    isnan,
+    logical_not_unary,
+    erfinv,
+    i0,
+    silu,
+    mask,
+    negative,
+    dequant_int32,
+    requant_int32,
+    quant_int32,
+    unused,  // NOTE(review): appears to be a trailing placeholder entry -- confirm intended use
+};  // NOTE(review): unscoped enum, so names like tanh/sqrt/log share the enclosing scope and could clash with <cmath> names
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h
index bce909a4395..420cb04a9d3 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h
@@ -10,7 +10,7 @@
  * LLK UNPACK AB
  *************************************************************************/
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void llk_unpack_AB_hw_configure(
     const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) {
     // In0 -> unpA
@@ -34,7 +34,7 @@ inline void llk_unpack_AB_hw_configure(
         num_faces);
 }
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void llk_unpack_AB_hw_configure_disaggregated(
     const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0) {
     const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand};
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h
index 68eca79f4e9..bab5b81e885 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h
@@ -10,7 +10,7 @@
  * LLK UNPACK AB MATMUL
  *************************************************************************/
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) {
     const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca;
 
@@ -41,7 +41,7 @@ inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_
         cb_interface[unpB_operand_id].fifo_page_size);
 }
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void llk_unpack_AB_matmul_hw_configure_disaggregated(
     const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) {
     const llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = {
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h
index e8918793baa..26f943ec1af 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h
@@ -10,7 +10,7 @@
  * LLK UNPACK A
  *************************************************************************/
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void llk_unpack_A_hw_configure(
     const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) {
     const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand);
@@ -25,7 +25,7 @@ inline void llk_unpack_A_hw_configure(
         unpA_num_faces);
 }
 
-template <bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void llk_unpack_A_hw_configure_disaggregated(
     const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) {
     const llk_unpack_A_params_t unpack_A_params = {.unpA_operand = unpA_operand};
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h
index 6b61452722a..114d6f79389 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h
@@ -51,12 +51,10 @@ inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_op
     const std::uint32_t srca_operand_id = get_operand_id(srca_new_operand);
     const std::uint32_t num_faces = get_operand_num_faces(srca_operand_id);
     const std::uint32_t face_r_dim = get_operand_face_r_dim(srca_operand_id);
-    _llk_unpack_reconfig_data_format_srca_impl_<is_tile_dim_reconfig_en>(
+    _llk_unpack_reconfig_data_format_srca_impl_(
         unpack_src_format[srca_operand_id],
         unpack_dst_format[srca_operand_id],
-        cb_interface[srca_operand_id].fifo_page_size,
-        face_r_dim,
-        num_faces);
+        cb_interface[srca_operand_id].fifo_page_size);
 }
 
 template <bool is_tile_dim_reconfig_en = false>
@@ -64,12 +62,10 @@ inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_op
     std::uint32_t srcb_operand_id = get_operand_id(srcb_new_operand);
     const std::uint32_t num_faces = get_operand_num_faces(srcb_operand_id);
     const std::uint32_t face_r_dim = get_operand_face_r_dim(srcb_operand_id);
-    _llk_unpack_reconfig_data_format_srcb_impl_<is_tile_dim_reconfig_en>(
+    _llk_unpack_reconfig_data_format_srcb_impl_(
         unpack_src_format[srcb_operand_id],
         unpack_dst_format[srcb_operand_id],
-        cb_interface[srcb_operand_id].fifo_page_size,
-        face_r_dim,
-        num_faces);
+        cb_interface[srcb_operand_id].fifo_page_size);
 }
 
 template <bool is_tile_dim_reconfig_en = false>
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h
index afa60f7947b..433e33184ec 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h
@@ -10,7 +10,7 @@
 * LLK UNPACK REDUCE
 *************************************************************************/
 
-template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void llk_unpack_reduce_hw_configure(
     const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) {
 
@@ -46,7 +46,7 @@ inline void llk_unpack_reduce_hw_configure(
     }
 }
 
-template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en=false, StochRndMode stoch_rnd_mode = StochRndMode::None>
+template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en=false, StochRndType stoch_rnd_mode = StochRndType::None>
 inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) {
     const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand};
     llk_unpack_reduce_hw_configure<type, dim, is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_reduce_params, mult);
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h
index 71eeb6a0ba2..b0c3f50cca7 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h
@@ -14,7 +14,7 @@ template <bool is_fp32_dest_acc_en = false>
 inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) {
 
     constexpr bool  within_face_16x16_transpose = false;
-    constexpr StochRndMode stoch_rnd_mode = StochRndMode::None;
+    constexpr StochRndType stoch_rnd_mode = StochRndType::None;
 
     const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand);
     const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h
index 5a135ad8903..16751995c93 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h
@@ -13,7 +13,7 @@ template <bool is_fp32_dest_acc_en = false>
 inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) {
     constexpr bool is_row_pool = false;
     constexpr bool within_face_16x16_transpose = false;
-    constexpr StochRndMode stoch_rnd_mode = StochRndMode::None;
+    constexpr StochRndType stoch_rnd_mode = StochRndType::None;
 
     const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand);
     const uint32_t unpA_num_faces = 4;
diff --git a/tt_metal/hw/inc/debug/dprint_tile.h b/tt_metal/hw/inc/debug/dprint_tile.h
index 1b650924d9c..76b23d752d5 100644
--- a/tt_metal/hw/inc/debug/dprint_tile.h
+++ b/tt_metal/hw/inc/debug/dprint_tile.h
@@ -5,7 +5,7 @@
 #pragma once
 
 #include "hostdevcommon/dprint_common.h"
-
+#include "llk_io.h"
 
 struct SliceRange {
     // A slice object encoding semantics of np.slice(h0:h1:hs, w0:w1:ws)

From 8c58fb12a38208b5f55530f552ca43041675af33 Mon Sep 17 00:00:00 2001
From: acejkov <acejkov@tenstorrent.com>
Date: Mon, 4 Dec 2023 20:58:47 +0000
Subject: [PATCH 05/16] #3908: Fix typo in llk_operands.h and llk_outputs.h to
 get *dst* format instead of src. Added revert mode to remove edge mask config
 which was missing

---
 .../wormhole_b0/common/src/ckernel.cc         | 221 -------------
 .../wormhole_b0/common/src/ckernel_main.cc    |  21 --
 .../common/src/ckernel_perf_unpack_pack.cc    | 301 ------------------
 .../wormhole_b0/common/src/ckernel_unity.cc   |   9 -
 .../wormhole_b0/common/src/fwlog_list         |   1 -
 .../wormhole_b0/llk_lib/llk_pack_common.h     |  21 +-
 .../wormhole_b0/llk_lib/llk_pack_shifted.h    | 202 ------------
 .../wormhole_b0/metal/llk_api/llk_pack_api.h  |   9 +-
 .../wormhole_b0/metal/llk_io/llk_operands.h   |   2 +-
 .../wormhole_b0/metal/llk_io/llk_outputs.h    |   2 +-
 10 files changed, 13 insertions(+), 776 deletions(-)
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_shifted.h

diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc
deleted file mode 100644
index 3db907d6b99..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc
+++ /dev/null
@@ -1,221 +0,0 @@
-
-#include "ckernel.h"
-#include "ckernel_addr_map.h"
-#include "ckernel_pcbuf.h"
-#include "ckernel_main.h"
-#include "ckernel_globals.h"
-#include <l1_address_map.h>
-#include <tensix.h>
-#ifdef PERF_DUMP
-#include "ckernel_perf_unpack_pack.h"
-#include "ckernel_perf_math.h"
-#endif
-
-namespace ckernel
-{
-
-enum class ttRiscCores : std::uint32_t { Unpack = 0, Math = 1, Pack = 2, Brisc = 3, Nrisc = 4};
-
-volatile uint tt_reg_ptr *reg_base = reinterpret_cast<volatile uint *>(0xFFB10000);
-volatile uint tt_reg_ptr *pc_buf_base = reinterpret_cast<volatile uint *>(PC_BUF_BASE);
-volatile uint tt_reg_ptr *regfile = reinterpret_cast<volatile uint *>(REGFILE_BASE);
-volatile uint tt_reg_ptr *instrn_buffer = reinterpret_cast<volatile uint *>(INSTRN_BUF_BASE);
-volatile uint tt_reg_ptr *mailbox_base[4] = {
-    reinterpret_cast<volatile uint tt_reg_ptr *>(TENSIX_MAILBOX0_BASE), reinterpret_cast<volatile uint tt_reg_ptr *>(TENSIX_MAILBOX1_BASE),
-    reinterpret_cast<volatile uint tt_reg_ptr *>(TENSIX_MAILBOX2_BASE), reinterpret_cast<volatile uint tt_reg_ptr *>(TENSIX_MAILBOX3_BASE)
-};
-volatile uint tt_reg_ptr *dbg_event_scratch = nullptr;
-
-uint32_t cfg_state_id __attribute__((section(".bss"))) = 0;  // Flip between 0 and 1 to keep state between kernel calls
-uint32_t dest_offset_id __attribute__((section(".bss"))) = 0; // Flip between 0 and 1 to keep dest pointer between kernel calls
-
-uint32_t dbg_event_index __attribute__((section(".bss"))) = 0;
-uint32_t dbg_event_end __attribute__((section(".bss"))) = 0;
-volatile uint16_t tt_reg_ptr *debug_mailbox_base = nullptr;
-uint8_t mailbox_index = 0;
-const uint8_t mailbox_end = 32;
-volatile uint8_t tt_l1_ptr *debug_buffer = nullptr;
-volatile uint8_t tt_l1_ptr *debug_buffer_start = nullptr;
-uint8_t thread_id __attribute__((section(".bss"))) = 0;
-
-#ifdef PERF_DUMP
-uint32_t perf_index __attribute__((section(".bss"))) = 0;
-uint32_t perf_end __attribute__((section(".bss"))) = 0;
-volatile uint32_t *perf_buf_base[2];
-uint8_t perf_buf_base_id __attribute__((section(".bss"))) = 0;
-bool record_perf_events __attribute__((section(".bss"))) = 0;
-uint32_t perf_events_target_idx __attribute__((section(".bss"))) = 0;
-uint16_t current_outer_loop_iter __attribute__((section(".bss"))) = 0;
-int32_t dram_dump_req_local;
-bool first_unpack_recorded __attribute__((section(".bss"))) = 0;
-volatile uint *ncrisc_ack_addr = nullptr;
-uint32_t header;
-#if OVERLAY_DECOUPLE == 1
-uint8_t overlay_output_decouple_mask = 0;
-inline void update_overlay_decoupling_mailbox() {
-    overlay_output_decouple_mask = PERF_RISC_MAILBOX_OUTPUT_DECOUPLE_MASK_PTR[0] & 0xff;
-    if (thread_id == 0 || thread_id == 1) {
-        while(semaphore_read(semaphore::UNPACK_MATH_DONE) == 0) {}
-    }
-}
-inline void reset_unpack_pack_sync() {
-    if (thread_id == 2) {
-        semaphore_get(semaphore::UNPACK_MATH_DONE);
-    }
-}
-#endif
-#endif
-
-volatile uint tt_l1_ptr * trisc_l1_mailbox = reinterpret_cast<volatile uint tt_l1_ptr *>(MAILBOX_ADDR);
-
-inline bool ready_for_next_epoch() {         // place this through compiler into a section that is not going to overwritten
-    return true;
-    // mailbox_write(ttRiscCores::Nrisc);              // signal done epoch to NCRisc
-    // mailbox_read(ttRiscCores::Nrisc);               // This is blocking read, until NCrisc signals epoch is ready
-}
-
-inline void set_thread_id_parameter() {
-    if ((uint)__firmware_start == (uint)l1_mem::address_map::TRISC0_BASE) {
-        thread_id = 0;
-    } else if ((uint) __firmware_start == (uint)l1_mem::address_map::TRISC1_BASE) {
-        thread_id = 1;
-    } else {
-        thread_id = 2;
-    }
-}
-
-inline void allocate_debug_mailbox_buffer() {
-   std::int32_t debug_mailbox_addr;
-   if ((uint32_t)__firmware_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) {
-      debug_mailbox_addr = l1_mem::address_map::DEBUG_MAILBOX_BUF_BASE + 0*l1_mem::address_map::DEBUG_MAILBOX_BUF_SIZE;
-   } else if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) {
-      debug_mailbox_addr = l1_mem::address_map::DEBUG_MAILBOX_BUF_BASE + 1*l1_mem::address_map::DEBUG_MAILBOX_BUF_SIZE;
-   } else {
-      debug_mailbox_addr = l1_mem::address_map::DEBUG_MAILBOX_BUF_BASE + 2*l1_mem::address_map::DEBUG_MAILBOX_BUF_SIZE;
-   }
-   debug_mailbox_base = reinterpret_cast<volatile uint16_t tt_l1_ptr *>(debug_mailbox_addr);
-   clear_mailbox_values();
-}
-
-inline void allocate_debug_buffer() {
-   std::int32_t debug_buffer_addr;
-   if ((uint32_t)__firmware_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) {
-      debug_buffer_addr = l1_mem::address_map::TRISC0_DEBUG_BUFFER_BASE;
-   } else if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) {
-      debug_buffer_addr = l1_mem::address_map::TRISC1_DEBUG_BUFFER_BASE;
-   } else {
-      debug_buffer_addr = l1_mem::address_map::TRISC2_DEBUG_BUFFER_BASE;
-   }
-   debug_buffer = reinterpret_cast<volatile uint8_t tt_l1_ptr *>(debug_buffer_addr);
-   debug_buffer[l1_mem::address_map::DEBUG_BUFFER_SIZE-1]=0x0;
-   debug_buffer_start = debug_buffer;
-}
-
-__attribute__((noinline)) void debug_dump(const uint8_t *data, uint32_t byte_size) {
-  for (uint32_t i = 0; i < byte_size; i++) {
-    if ((((uint32_t) debug_buffer)&(l1_mem::address_map::DEBUG_BUFFER_SIZE-1)) ==
-         l1_mem::address_map::DEBUG_BUFFER_SIZE-1) {
-       *(debug_buffer) = 0xff; //overflow detected
-    } else {
-       *debug_buffer = data[i];
-       debug_buffer++;
-    }
-  }
-}
-
-__attribute__((noinline)) void debug_dump_seek(uint8_t offset) {
-  debug_buffer = reinterpret_cast<volatile uint8_t *>(debug_buffer_start + offset);
-}
-
-} // namespace ckernel
-
-void local_mem_copy() {
-   volatile uint tt_l1_ptr *l1_local_mem_start_addr;
-   volatile uint *local_mem_start_addr = (volatile uint*) LOCAL_MEM_BASE_ADDR;
-
-   if ((uint)__firmware_start == (uint)l1_mem::address_map::TRISC0_BASE) {
-      l1_local_mem_start_addr = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC0_LOCAL_MEM_BASE;
-   } else if ((uint) __firmware_start == (uint)l1_mem::address_map::TRISC1_BASE) {
-      l1_local_mem_start_addr = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC1_LOCAL_MEM_BASE;
-   } else {
-      l1_local_mem_start_addr = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC2_LOCAL_MEM_BASE;
-   }
-   uint word_size = ((uint)__local_mem_rodata_end_addr - (uint)__local_mem_rodata_start_addr)>>2;
-
-   if (word_size>0) {
-      for (uint n=0;n<word_size;n++) {
-         local_mem_start_addr[n] = l1_local_mem_start_addr[n];
-      }
-   }
-
-}
-
-using namespace ckernel;
-
-int main(int argc, char *argv[])
-{
-    FWEVENT("Launching proudction env kernels");
-
-    // Initialize GPRs to all 0s
-    for (int i = 0; i < 64; i++)
-        regfile[i] = 0;
-
-    // Init L1 buffer with 1.0f (used for reduce max)
-    union {
-        float f;
-        uint32_t u;
-    } f2u = {.f = 1.0f};
-
-    // Save a little code space.  GCC fails to remove the loop variable so loop with a ptr
-#pragma GCC unroll 0
-    for (volatile uint32_t tt_l1_ptr *ptr = l1_buffer; ptr < &l1_buffer[16]; *ptr++ = f2u.u) // Load const into L1 buffer
-
-    reset_cfg_state_id();
-
-    trisc_l1_mailbox_write(RESET_VAL);
-
-    if ((uint)l1_mem::address_map::RISC_LOCAL_MEM_BASE ==
-            ((uint)__local_mem_rodata_end_addr&0xfff00000))
-    {
-       local_mem_copy();
-    }
-
-    allocate_debug_mailbox_buffer();
-    allocate_debug_buffer();
-    if ((uint) __firmware_start == (uint)l1_mem::address_map::TRISC0_BASE) {
-        reg_write(RISCV_DEBUG_REG_DBG_FEATURE_DISABLE, 0); // Clear debug feature disable in case it was set by previous kernel on TRISC0
-                                                             // e.g workaround for bug https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1372
-        regfile[p_gpr_unpack::L1_BUFFER_ADDR] = (((uint)l1_buffer) >> 4) - 1; //Store L1 buffer address for reduce input 1
-        sync_regfile_write(p_gpr_unpack::L1_BUFFER_ADDR);
-    }
-
-#ifdef PERF_DUMP
-    set_thread_id_parameter();
-    allocate_perf_buffer();
-    setup_fpu_perf_cnt();
-    record_dummy_math_event();
-#if OVERLAY_DECOUPLE == 1
-    update_overlay_decoupling_mailbox();
-#endif
-#endif
-
-    //while (ready_for_next_epoch())
-    {
-        run_kernel();
-    }
-
-    // Signal completion
-    tensix_sync();
-#ifdef PERF_DUMP
-#if OVERLAY_DECOUPLE == 1
-    reset_unpack_pack_sync();
-#endif
-    record_perf_dump_end_and_check_overflow();
-    // There has to be a tensix_sync() before this last pass.
-    last_trisc_perf_dump_to_dram();
-    tensix_sync();
-#endif
-
-    trisc_l1_mailbox_write(KERNEL_COMPLETE);
-
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc
deleted file mode 100644
index b2c39df3313..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-
-// This c-file's purpose is:
-// 1) include the generated list of kernels
-//      The files hold run_kernel() definition and inline kernel_main functions for every ckernel
-//      Need to make sure no other file includes these lists since it also include global parameter definitions
-// 2) instantiate global variables
-
-
-#include "ckernel_globals.h"
-
-#if defined(UCK_CHLKC_UNPACK) || defined(UCK_CHLKC_MATH) || defined(UCK_CHLKC_PACK)
-#include "chlkc_list.h"
-#else
-#include "ckernel_list.h"
-#endif
-
-// Global vars
-uint32_t unp_cfg_context = 0;
-uint32_t pack_sync_tile_dst_ptr = 0;
-uint32_t math_sync_tile_dst_index = 0;
-volatile uint32_t tt_l1_ptr l1_buffer[16] __attribute__ ((section (".text#"))) __attribute__ ((aligned (16)));
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc
deleted file mode 100644
index 446e14cb8f6..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc
+++ /dev/null
@@ -1,301 +0,0 @@
-
-#include "ckernel_perf_unpack_pack.h"
-#include "stream_interface.h"
-
-#pragma GCC diagnostic ignored "-Wunused-function"
-
-
-namespace ckernel
-{
-extern uint32_t perf_index;
-extern uint32_t perf_end;
-// Perf-buffer are double buffered for spill_to_dram.
-// Ncrisc will move one half to dram while trisc populates the other half.
-// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0].
-extern volatile uint32_t *perf_buf_base[2];
-// Selects the half of perf_buffer that trisc is currently writing into.
-extern uint8_t perf_buf_base_id;
-extern bool record_perf_events;
-extern uint16_t current_outer_loop_iter;
-extern uint8_t thread_id;
-extern int32_t dram_dump_req_local;
-extern volatile uint* ncrisc_ack_addr;
-extern uint32_t header;
-
-void allocate_perf_buffer() {
-   std::int32_t perf_buf_base_addr;
-   if ((uint32_t)__firmware_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) {
-      perf_buf_base_addr = l1_mem::address_map::UNPACK_PACK_PERF_BUF_BASE_ADDR + 0*TRISC_PERF_BUF_SIZE;
-      perf_index = 2; // The first 4B value is always initialized to 0xbaddf00d.
-      if constexpr (PERF_DUMP_CONCURRENT == 1 || INTERMED_DUMP == 1) {
-         perf_end = TRISC_PERF_BUF_SIZE >> 3;
-      } else {
-         perf_end = 3;
-      }
-      dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[0];
-      ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[0];
-   } else if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) {
-      perf_buf_base_addr = l1_mem::address_map::MATH_PERF_BUF_BASE_ADDR;
-      perf_index = 4; // The first 4 32b regs are skipped in recording math perf counters.
-      perf_end = 16;
-
-      // Initialize math_dram_dump_req_local in the beginning of epoch.
-      // EPOCH_INFO_PTR->perf_dram_copy_req counters do not get reset between epochs.
-      dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[1];
-      ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[1];
-   } else {
-      perf_buf_base_addr = l1_mem::address_map::UNPACK_PACK_PERF_BUF_BASE_ADDR + TRISC_PERF_BUF_SIZE;
-      perf_index = 2; // The first 4B value is always initialized to 0xbaddf00d.
-      if constexpr (PERF_DUMP_CONCURRENT == 1 || INTERMED_DUMP == 1) {
-         perf_end = TRISC_PERF_BUF_SIZE >> 3;
-      } else {
-         perf_end = 3;
-      }
-      TTI_SEMINIT(1, 0, 1 << semaphore::PACK_DONE);
-      dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[2];
-      ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[2];
-   }
-   // Tirsc starts dumping into the first half of the perf_buffers.
-   perf_buf_base_id = 0;
-   // Program the address for the first half of the perf buffer address.
-   perf_buf_base[0] = reinterpret_cast<volatile uint32_t *>(perf_buf_base_addr);
-   // Program the address for the second half of the perf buffer address.
-   perf_buf_base[1] = reinterpret_cast<volatile uint32_t *>(perf_buf_base_addr + (TRISC_PERF_BUF_SIZE >> 1));
-   perf_buf_base[perf_buf_base_id][0] = PERF_DUMP_END_SIGNAL;
-#if PERF_DUMP_CONCURRENT
-   volatile uint32_t* header_ptr = reinterpret_cast<volatile uint32_t *>(l1_mem::address_map::PERF_THREAD_HEADER);
-   header = header_ptr[0];
-   header = (header & 0xfff8ffff) | (((uint32_t)(thread_id) & 0b111) << 16);
-   perf_buf_base[perf_buf_base_id][1] = header;
-   for (uint i = 2; i < perf_index; i++) {
-      perf_buf_base[perf_buf_base_id][i] = 0xffffffff;
-   }
-#else
-   for (uint i = 1; i < perf_index; i++) {
-      perf_buf_base[perf_buf_base_id][i] = 0xffffffff;
-   }
-#endif
-}
-
-void switch_perf_buffers() {
-
-   if constexpr (INTERMED_DUMP || PERF_DUMP_CONCURRENT) {
-      for (uint i = perf_index; i < perf_end; i++) {
-         perf_buf_base[perf_buf_base_id][i] = 0xffffffff;
-      }
-      bool stalled = false;
-      uint32_t timestamp_stall_start_l;
-      uint32_t timestamp_stall_start_h;
-      uint32_t timestamp_stall_end_l;
-      uint32_t timestamp_stall_end_h;
-
-      // Before advancing to the other half of perf-buffer, make sure ncrisc is done copying that half into dram
-      int32_t ack_local = *ncrisc_ack_addr;
-      if (ack_local <= dram_dump_req_local - 1) {
-         stalled = true;
-         timestamp_stall_start_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
-         timestamp_stall_start_h = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
-
-         while (ack_local <= dram_dump_req_local - 1) {
-            ack_local = *ncrisc_ack_addr;
-         }
-
-         timestamp_stall_end_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
-         timestamp_stall_end_h = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
-      }
-
-      dram_dump_req_local++;
-      EPOCH_INFO_PTR->perf_dram_copy_req[thread_id] = dram_dump_req_local;
-
-      perf_buf_base_id = 1 - perf_buf_base_id;
-      if constexpr(INTERMED_DUMP) {
-         perf_index = 0;
-      } else {
-         perf_index = 0;
-         perf_buf_base[perf_buf_base_id][perf_index] = PERF_DUMP_END_SIGNAL;
-         perf_buf_base[perf_buf_base_id][perf_index+1] = *(uint32_t*)(&header);
-         perf_index = 2;
-      }
-      if (stalled && perf_index + 5 < perf_end - 1) {
-         uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::STALL_TRISC_FOR_DRAM_PERF_DUMP, current_outer_loop_iter);
-         perf_buf_base[perf_buf_base_id][perf_index] = event_id;
-         perf_buf_base[perf_buf_base_id][perf_index+1] = timestamp_stall_start_h;
-         perf_buf_base[perf_buf_base_id][perf_index+2] = timestamp_stall_start_l;
-         perf_buf_base[perf_buf_base_id][perf_index+3] = event_id;
-         perf_buf_base[perf_buf_base_id][perf_index+4] = timestamp_stall_end_h;
-         perf_buf_base[perf_buf_base_id][perf_index+5] = timestamp_stall_end_l;
-         perf_index += 6;
-      }
-   }
-}
-
-void last_trisc_perf_dump_to_dram() {
-   if (perf_index > 0) {
-
-      // Before advancing to the other half of perf-buffer, make sure ncrisc is done copying that half into dram
-      int32_t ack_local = *ncrisc_ack_addr;
-      while (ack_local <= dram_dump_req_local - 1) {
-         ack_local = *ncrisc_ack_addr;
-      }
-
-      if constexpr (INTERMED_DUMP) {
-         if (thread_id == 1) {
-            dram_dump_req_local += 2;
-         } else {
-            dram_dump_req_local++;
-         }
-      } else if constexpr (PERF_DUMP_CONCURRENT) {
-         dram_dump_req_local++;
-      } else {
-         dram_dump_req_local += 2;
-      }
-      EPOCH_INFO_PTR->perf_dram_copy_req[thread_id] = dram_dump_req_local;
-   }
-}
-
-void increment_unpack_tiles(uint operand_idx, uint num_tiles) {
-   if (record_perf_events && (perf_events_target_idx == 1)) {
-      if (operand_idx >= PERF_MAX_NUM_INPUTS) {
-         return;
-      }
-      uint regfile_base_idx = p_gpr_unpack::PERF_UNPACK_NUM_TILES_0;
-      regfile_base_idx += (operand_idx >> 1);
-      bool upper = operand_idx & 0b1;
-      uint32_t num_tiles_regfile = regfile[regfile_base_idx];
-      uint32_t current_num_tiles;
-      if (upper) {
-         current_num_tiles = (num_tiles_regfile >> 16) & 0xffff;
-         current_num_tiles += num_tiles;
-         regfile[regfile_base_idx] = (num_tiles_regfile & 0xffff) + ((current_num_tiles & 0xffff) << 16);
-      } else {
-         current_num_tiles = (num_tiles_regfile + num_tiles) & 0xffff;
-         regfile[regfile_base_idx] = (num_tiles_regfile & 0xffff0000) + (current_num_tiles & 0xffff);
-      }
-      sync_regfile_write(regfile_base_idx);
-   }
-}
-
-void increment_pack_tiles(uint num_tiles) {
-   if (record_perf_events && (perf_events_target_idx == 1)) {
-      regfile[p_gpr_pack::PERF_PACK_NUM_TILES] += num_tiles;
-      sync_regfile_write(p_gpr_pack::PERF_PACK_NUM_TILES);
-   }
-}
-
-#if OVERLAY_DECOUPLE == 1
-
-// This runs prior to set_perf_dump_flag_for_input so perf_end has to be adjusted
-void record_overlay_decoupled_output_bw_start(uint32_t num_tiles) {
-   if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) {
-      perf_end += 6;
-   }
-   if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) {
-      perf_end = TRISC_PERF_BUF_SIZE >> 2;
-   }
-   uint32_t event_id = get_event_id(0, 0, perf::EventType::OUTPUT_NUM_TILES, perf_events_target_inputs[0]);
-   record_perf_value_and_check_overflow(event_id, num_tiles, 0);
-   event_id = get_event_id(0, 0, perf::EventType::OUTPUT_TIMESTAMP, perf_events_target_inputs[0]);
-   uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
-   uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
-   record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, 0);
-}
-
-void record_overlay_decoupled_output_bw_end() {
-   if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) {
-      perf_end += 6;
-   }
-   if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) {
-      perf_end = TRISC_PERF_BUF_SIZE >> 2;
-   }
-   uint32_t event_id = get_event_id(0, 0, perf::EventType::OUTPUT_TIMESTAMP, perf_events_target_inputs[0]);
-   uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
-   uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
-   record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, 0);
-}
-
-void llk_push_all_packer_tiles_for_decoupling() {
-   uint32_t operand = OPERAND_OUTPUT_START_INDEX;
-   uint32_t output = operand_to_output_index(operand);
-
-   // Populate the output buffer with headers
-   uint32_t stream_buf_size_bytes = EPOCH_INFO_PTR->outputs[output]->buf_full_size_bytes;
-   uint32_t stream_buf_addr = EPOCH_INFO_PTR->outputs[output]->buf_base_addr;
-   uint32_t stream_msg_info_buf_ptr = (EPOCH_INFO_PTR->outputs[output]->msg_info_buf_start)*MEM_WORD_WIDTH;
-   uint32_t tile_size_words = *(volatile uint32_t tt_l1_ptr *)(stream_msg_info_buf_ptr);
-   uint32_t tile_size_bytes = tile_size_words*MEM_WORD_WIDTH;
-   for (uint32_t tile_header_ptr = stream_buf_addr; tile_header_ptr < stream_buf_addr + stream_buf_size_bytes; tile_header_ptr += tile_size_bytes) {
-         *((uint32_t *)(tile_header_ptr)) = tile_size_words;
-   }
-
-   uint32_t total_num_tiles_to_push = 0;
-   uint32_t num_tiles_to_push[EPOCH_MAX_OUTPUT_FORKS+1];
-   uint32_t stream_id = EPOCH_INFO_PTR->outputs[output]->stream_id;
-   uint32_t active_stream_idx = get_active_stream_idx(stream_id);
-   volatile epoch_stream_info_t * l1_stream_info = EPOCH_INFO_PTR->active_streams[active_stream_idx];
-   for (int32_t k = 0; k < l1_stream_info->num_fork_streams+1; k++) {
-      uint32_t fork_active_streams_idx = k == 0 ? active_stream_idx : l1_stream_info->fork_idxs[k-1];
-      uint32_t epoch_num_tiles = EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->epoch_num_tiles;
-      num_tiles_to_push[k] = epoch_num_tiles;
-      total_num_tiles_to_push += epoch_num_tiles;
-   }
-   if (((l1_stream_info->flags & STREAM_MOVES_RAW_DATA) != 0) || l1_stream_info->legacy_pack) {
-
-      record_overlay_decoupled_output_bw_start(total_num_tiles_to_push);
-
-      while(total_num_tiles_to_push > 0) {
-         uint32_t stream_msg_info_buf_ptr = (l1_stream_info->msg_info_buf_start)*MEM_WORD_WIDTH;
-         uint32_t tile_size_words = *(volatile uint32_t *)(stream_msg_info_buf_ptr);
-         uint32_t stream_buf_size_tiles = l1_stream_info->buf_size_tiles;
-         bool any_streams_busy = false;
-         for (int32_t k = 0; k < l1_stream_info->num_fork_streams+1; k++) {
-               uint32_t fork_active_streams_idx = k == 0 ? active_stream_idx : l1_stream_info->fork_idxs[k-1];
-               uint32_t fork_stream_id = k == 0 ? stream_id : EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->stream_id;
-               if (num_tiles_to_push[k] == 0) {
-                  continue;
-               }
-               uint32_t dram_output_no_push = ((EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->flags & STREAM_DRAM_NO_PUSH) != 0) || ((EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->flags & STREAM_MOVES_RAW_DATA) != 0);
-               if (dram_output_no_push) {
-                  uint32_t tiles_left_in_phase = stream_src_endpoint_get_phase_tiles_count(fork_stream_id);
-                  uint16_t operand_tiles_received = (uint16_t)*get_operand_tiles_received_ptr(stream_id_to_operand(fork_stream_id));
-                  uint16_t operand_tiles_acked = (uint16_t)*get_operand_tiles_acked_ptr(stream_id_to_operand(fork_stream_id));
-                  uint16_t tiles_available = operand_tiles_received - operand_tiles_acked;// op_pack_tiles_ptr_sub(operand_tiles_received, operand_tiles_acked);
-                  uint32_t stream_buf_free_tiles = stream_buf_size_tiles - tiles_available;
-                  uint32_t num_tiles = tiles_left_in_phase > stream_buf_free_tiles ? stream_buf_free_tiles : tiles_left_in_phase;
-                  if (num_tiles > 0) {
-                     stream_set_tiles_left_in_phase(fork_stream_id, num_tiles);
-                     volatile uint32_t tt_reg_ptr* tiles_received_ptr = (volatile uint32_t tt_reg_ptr*)get_operand_tiles_received_ptr(stream_id_to_operand(fork_stream_id));
-                     operand_tiles_received = (uint16_t)tiles_received_ptr[0];
-                     uint16_t new_epoch_tiles_received = operand_tiles_received + num_tiles;// op_pack_tiles_ptr_add(operand_tiles_received, num_tiles);
-                     tiles_received_ptr[0] = new_epoch_tiles_received;
-
-                     num_tiles_to_push[k] -= num_tiles;
-                     total_num_tiles_to_push -= num_tiles;
-                  }
-               } else {
-                  uint32_t phase_active = stream_phase_is_active(fork_stream_id) && !is_dummy_phase(fork_stream_id);
-                  if (phase_active) {
-                     uint32_t tiles_left_in_phase = stream_src_endpoint_get_phase_tiles_count(fork_stream_id);
-                     uint32_t num_free_words = stream_get_free_words(fork_stream_id);
-                     uint32_t num_tiles = 0;
-                     uint32_t num_words = 0;
-                     while (num_words + tile_size_words <= num_free_words && num_tiles + 1 <= tiles_left_in_phase) {
-                           num_tiles++;
-                           num_words += tile_size_words;
-                     }
-                     if (num_tiles > 0) {
-                           stream_set_tiles_left_in_phase(fork_stream_id, num_tiles);
-                           stream_relay_tiles(fork_stream_id, num_tiles, num_words);
-
-                           num_tiles_to_push[k] -= num_tiles;
-                           total_num_tiles_to_push -= num_tiles;
-                     }
-                  }
-               }
-         }
-      }
-      record_overlay_decoupled_output_bw_end();
-   }
-}
-#endif
-
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc
deleted file mode 100644
index 103269694e5..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc
+++ /dev/null
@@ -1,9 +0,0 @@
-// combining multiple C++ source files into a single file
-// to reduce the overhead of the compilation process and
-// improve build times
-#include "ckernel.cc"
-#ifdef PERF_DUMP
-#include "ckernel_perf_unpack_pack.cc"
-#endif
-#include "ckernel_main.cc"
-#include "llk_io.cc" // sw stack specific io interface
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list b/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list
deleted file mode 100644
index 99880029ff6..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list
+++ /dev/null
@@ -1 +0,0 @@
-ckernel.cc
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h
index 5f796f4c5b3..8952ec8cb87 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h
@@ -225,23 +225,14 @@ inline void _llk_pack_reduce_mask_config_() {
             row_set_mapping_1 = 0x00000001; // each packer packs 1x16 row
         }
     } else if constexpr (dim == ReduceDim::REDUCE_SCALAR) {
-                // PCK_EDGE_OFFSET_SEC1 mask will clear out all the datums in the row except the first one
+        // PCK_EDGE_OFFSET_SEC1 mask will clear out all the datums in the row except the first one
         edge_offset_sec1_mask = 0x0001;
-        if constexpr (untilize) {
-            pack_edge_offset.f.tile_row_set_select_pack0 = 1;
-            pack_edge_offset.f.tile_row_set_select_pack1 = 1;
-            pack_edge_offset.f.tile_row_set_select_pack2 = 1;
-            pack_edge_offset.f.tile_row_set_select_pack3 = 1;
-            row_set_mapping_1 = 0x00000005;
-        } else {
-            // Packer 0 and 2 will use TILE_ROW_SET_MAPPING_1, while packer 1 and 3 will keep using
-            // TILE_ROW_SET_MAPPING_0 configuration which is the default one
-            pack_edge_offset.f.tile_row_set_select_pack0 = 1;
-            pack_edge_offset.f.tile_row_set_select_pack2 = 1;
+        // Packer 0 will use TILE_ROW_SET_MAPPING_1, while packers 1, 2 and 3 will keep using
+        // TILE_ROW_SET_MAPPING_0 configuration which is the default one
+        pack_edge_offset.f.tile_row_set_select_pack0 = 1;
 
-            // TILE_ROW_SET_MAPPING_1 configuration sets all rows to use PCK_EDGE_OFFSET_SEC1 mask
-            row_set_mapping_1 = 0x00000001;
-        }
+        // TILE_ROW_SET_MAPPING_1 configuration sets only the first row to use PCK_EDGE_OFFSET_SEC1 mask
+        row_set_mapping_1 = 0x00000001;
     }
 
     // Initialize TMP registers with values we need to write in CFG registers
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_shifted.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_shifted.h
deleted file mode 100644
index 725d008b626..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_shifted.h
+++ /dev/null
@@ -1,202 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-#include "llk_io_pack.h"
-#include "llk_param_structs.h"
-
-#include "ckernel.h"
-#include "ckernel_template.h"
-#include "llk_pack_common.h"
-#include "ckernel_globals.h"
-
-using namespace ckernel;
-using namespace ckernel::packer;
-
-inline void llk_pack_shifted_mop_config(std::uint32_t stride) {
-    addr_mod_pack_t{
-        .y_src = {.incr = (std::uint8_t) stride},
-        .y_dst = {.incr = (std::uint8_t) stride},
-    }
-        .set(ADDR_MOD_0);
-
-    addr_mod_pack_t{
-        .y_src = {.incr = 0, .clr = 1, .cr = 0},
-        .y_dst = {.incr = 0, .clr = 1, .cr = 0},
-        .z_src = {.incr = 0, .clr = 0},
-        .z_dst = {.incr = 0, .clr = 0},
-    }
-        .set(ADDR_MOD_1);
-
-    addr_mod_pack_t{
-        .y_src = {.incr = 0, .clr = 0, .cr = 0},
-        .y_dst = {.incr = 0, .clr = 0, .cr = 0},
-        .z_src = {.incr = 0, .clr = 0},
-        .z_dst = {.incr = 0, .clr = 0},
-    }
-        .set(ADDR_MOD_2);
-
-    const uint MOP_INNER_LOOP = 16;
-    const uint MOP_OUTER_LOOP = 1;
-    const uint PACKCNT = 4;
-    const uint MEGAROW = 1;
-    constexpr uint ZERO_OUTPUT_FLAG = p_pacr::P_ZERO_OUTPUT_DISABLED;
-
-    ckernel::ckernel_template tmp(
-        MOP_OUTER_LOOP, MOP_INNER_LOOP, TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0));
-
-    tmp.set_last_inner_loop_instr(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, 0, 0, 0));
-    tmp.set_last_outer_loop_instr(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, 0, 0, 0));
-
-    // Write header to l1
-    tmp.set_end_op(TT_OP_STOREIND(1, 0, p_ind::LD_16B, LO_16(0), p_ind::INC_NONE, p_gpr_pack::TILE_HEADER, p_gpr_pack::OUTPUT_ADDR));
-
-    tmp.program(instrn_buffer);
-}
-
-template <bool untilize = false, bool is_fp32_dest_acc_en = false>
-inline void llk_pack_shifted_hw_configure(const llk_pack_shifted_params_t *pack_params) {
-    configure_pack<is_fp32_dest_acc_en>(get_output_id(pack_params->pack_output), pack_params->relu_config.val);
-
-    std::uint32_t output = get_output_id(pack_params->pack_output);
-}
-
-template <bool untilize = false, ReluType relu_type=ReluType::NO_RELU, std::uint32_t relu_threshold=0>
-inline void llk_pack_shifted_hw_configure_disaggregated(std::uint32_t pack_output) {
-    llk_pack_shifted_params_t llk_pack_shifted_params = {
-        .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}};
-    llk_pack_shifted_hw_configure(&llk_pack_shifted_params);
-    volatile uint *cfg = get_cfg_pointer();
-    // Disable auto-last generation
-    for (uint i=0; i<4; i++) { cfg[PACK_COUNTERS_SEC0_pack_per_xy_plane_ADDR32+i]=0; }
-
-    // FIXME: configure based on initial padding param value
-    //regfile[p_gpr_pack::TMP_DEST_OFFSET]   = 0x0 - 1;
-    //regfile[p_gpr_pack::TMP_DEST_OFFSET+1] = 0x0 + 0x20 - 1;
-    //regfile[p_gpr_pack::TMP_DEST_OFFSET+2] = 0x0 + 0x10 - 1;
-    //regfile[p_gpr_pack::TMP_DEST_OFFSET+3] = 0x0 + 0x30 - 1;
-}
-
-inline void llk_pack_shifted_init(const llk_pack_shifted_params_t *params=0) {
-    llk_pack_shifted_mop_config(params->stride);
-}
-
-inline void llk_pack_shifted(const llk_pack_shifted_params_t *params, llk_pack_shifted_state_t *state, std::uint32_t output, std::uint32_t output_tile_index = 0) {
-    std::uint8_t output_id = get_output_id(output);
-    constexpr std::uint8_t OUTPUT_BASE_ID = (std::uint8_t) get_output_base_id();
-
-    std::uint16_t pack_tile_base_addr;
-    std::uint16_t pack_tile_offset_addr = 0;
-    pack_tile_base_addr = cb_interface[output_id].fifo_wr_ptr + MUL_TILE_SIZE_AND_INDEX((std::uint8_t)pack_dst_format[OUTPUT_BASE_ID], output_tile_index);
-
-    int write_row_index = state->current_wr_ptr;
-
-    if (state->partial_tile) {
-        pack_tile_offset_addr = ((write_row_index&(FACE_HEIGHT-1))+2*(write_row_index&FACE_HEIGHT))*2; //FIXME: scale row index with format
-        state->partial_tile = false;
-    }
-
-    program_packer_destination<PACK_01>((pack_tile_base_addr+pack_tile_offset_addr), OUTPUT_BASE_ID);
-
-    if (params->initial_padding>0) {
-       if (params->initial_padding <= FACE_HEIGHT) {
-           TT_SETADCXX(p_setadc::PAC, ((params->initial_padding*16)-1), 0x0);
-           TTI_PACR(ADDR_MOD_2, 1, 0x3, 0, 0, 0, 0);
-           write_row_index+=params->initial_padding;
-       } else if (params->initial_padding < TILE_HEIGHT) {
-           TTI_SETADCXX(p_setadc::PAC, (16*FACE_HEIGHT)-1, 0x0);
-           TTI_PACR(ADDR_MOD_2, 1, 0x3, 0, 0, 0, 1);
-           TT_SETADCXX(p_setadc::PAC, (((params->initial_padding-FACE_HEIGHT)*16)-1), 0x0);
-           program_packer_destination<PACK_01>((std::uint16_t)(pack_tile_base_addr+2*(2*FACE_HEIGHT)), OUTPUT_BASE_ID); //FIXME: scale based on the format
-           TTI_PACR(ADDR_MOD_2, 1, 0x3, 0, 0, 0, 0);
-           write_row_index+=params->initial_padding;
-       } else {
-           program_packer_destination<PACK_ALL>((std::uint16_t)pack_tile_base_addr, OUTPUT_BASE_ID);
-           TTI_SETADCXX(p_setadc::PAC, (256)-1, 0x0); // zero tile detected
-           TTI_PACR(ADDR_MOD_2, 1, 0xF, 0, 0, 0, 1);
-           write_row_index+=TILE_HEIGHT;
-       }
-       // Pack single rows
-       TTI_SETADCXX(p_setadc::PAC, 16-1, 0x0);
-    }
-
-    int curr_tile_index=-1;
-    while ( (write_row_index < TILE_HEIGHT) &&
-            // Keep going until we reached end of valid dest, unless it's final iteration in which case we just pad to the end
-            ( (state->current_rd_ptr < params->valid_row_count) || params->final_iteration) )
-    {
-        bool insert_blank =
-            ((state->current_y) >= params->original_y) ||  // we're past the end
-            (((state->current_x) < params->row_shift_x) && (params->row_shift_x > 0)) || // initial postive X-shift
-            (((state->current_x) >= (params->original_x + params->row_shift_x)) && (params->row_shift_x < 0)); // final negative X-shift
-
-        if (write_row_index == FACE_HEIGHT) {
-           TTI_PACR(ADDR_MOD_2, 0, 0x3, 0, 0, 1, 1); //close tile in order to update address
-           program_packer_destination<PACK_01>((std::uint16_t)(pack_tile_base_addr+2*(2*FACE_HEIGHT)), OUTPUT_BASE_ID); //FIXME: scale based on the format
-        }
-
-
-
-        if (insert_blank)
-        {
-            // Insert empty rows
-            TTI_PACR(ADDR_MOD_0, 1, 0x3, 0, 0, 0, 0);
-        }
-        else
-        {
-            int tile_index = state->current_rd_ptr / TILE_HEIGHT;
-            int pack_zeros = 0;
-
-            if (curr_tile_index != tile_index) {
-               curr_tile_index = tile_index;
-               if ( (tile_index < 0) || (tile_index >= 16) ) {
-                  pack_zeros = 1;
-               } else {
-                  uint16_t row_index = (state->current_rd_ptr & (TILE_HEIGHT-1));
-                  TT_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_Y, row_index);
-                  TT_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_W, tile_index);
-               }
-            }
-
-            TT_PACR(ADDR_MOD_0, pack_zeros, 0x3, 0, 0, 0, 0);
-
-
-        }
-        write_row_index++;
-
-        // Move read pointers accordingly
-        state->current_rd_ptr += params->stride;
-        state->current_x += params->stride;
-        if (state->current_x >= params->original_x)
-        {
-            if (state->current_x > params->original_x)
-            {
-                // Stride got us too far, let's rewind back
-                state->current_rd_ptr -= state->current_x - params->original_x;
-            }
-            state->current_x = params->stride_offset;
-            state->current_y += params->stride;
-            state->current_rd_ptr += (params->stride - 1) * params->original_x; // stride Y
-            if (params->stride > 1) {
-               uint16_t row_index = (state->current_rd_ptr & (TILE_HEIGHT-1));
-               TT_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_Y, row_index);
-            }
-        }
-
-    }
-
-    if (write_row_index == TILE_HEIGHT) {
-        state->current_wr_ptr = 0;
-        state->partial_tile = false;
-        // write header
-        TT_SETDMAREG(0, pack_tile_base_addr, 0, LO_16(p_gpr_pack::HEADER_ADDR));
-        TTI_STOREIND(1, 0, p_ind::LD_16B, LO_16(0), p_ind::INC_NONE, p_gpr_pack::TILE_HEADER, p_gpr_pack::HEADER_ADDR);
-    }
-    else {
-        state->current_wr_ptr = write_row_index;
-        state->partial_tile = true;
-    }
-
-    TTI_PACR(ADDR_MOD_2, 0, 0x3, 0, 0, 1, 1); //close tile
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
index 249f62bc71b..1a787231608 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
@@ -298,10 +298,11 @@ inline void llk_pack_reduce_config_v2(uint32_t icb_out) {
             narrow_tile,
             relu_config.val
         );
-    } else {
-        TTI_STALLWAIT(p_stall::STALL_PACK, p_stall::PACK);
-        tensix_sync();
     }
 
-    _llk_pack_reduce_mask_config_<untilize, dim>();
+    if constexpr (revert) {
+        _llk_pack_reduce_mask_clear_();
+    } else {
+        _llk_pack_reduce_mask_config_<untilize, dim>();
+    }
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
index 4a03157715b..0207ecc345f 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
@@ -20,7 +20,7 @@ inline const uint32_t get_operand_src_format(const std::uint32_t operand_id)
 
 inline const uint32_t get_operand_dst_format(const std::uint32_t operand_id)
 {
-   return unpack_src_format[operand_id];
+   return unpack_dst_format[operand_id];
 }
 
 inline const uint32_t get_operand_num_faces(const std::uint32_t operand_id)
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
index cba5398b604..11d634c25e4 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
@@ -26,7 +26,7 @@ inline const uint32_t get_output_src_format(const std::uint32_t output_id)
 
 inline const uint32_t get_output_dst_format(const std::uint32_t output_id)
 {
-   return pack_src_format[output_id];
+   return pack_dst_format[output_id];
 }
 
 inline const uint32_t get_output_num_faces(const std::uint32_t output_id)

From 55c5c657a109d54edbff52e6087e52e5ab2b541b Mon Sep 17 00:00:00 2001
From: acejkov <acejkov@tenstorrent.com>
Date: Tue, 5 Dec 2023 00:00:47 +0000
Subject: [PATCH 06/16] #3908: Set default pack_output to 16

---
 .../hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
index 1a787231608..81584704615 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h
@@ -99,7 +99,7 @@ inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output
 }
 
 template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
-inline void llk_pack_init(const std::uint32_t pack_output = 0) {
+inline void llk_pack_init(const std::uint32_t pack_output = 16) {
 
     const std::uint32_t output_id = get_output_id(pack_output);
     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
@@ -179,7 +179,7 @@ inline void llk_pack_dest_section_done() {
 }
 
 template <DstSync Dst, DstTileFaceLayout FaceLayout, bool untilize = false>
-inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 0) {
+inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 16) {
     const std::uint32_t output_id = get_output_id(pack_output);
     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
     const bool narrow_tile = get_output_narrow_tile(output_id);
@@ -191,7 +191,7 @@ inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_outpu
 }
 
 template <DstSync Dst, DstTileFaceLayout FaceLayout = RowMajor, bool untilize = false, bool is_fp32_dest_acc_en = false>
-inline void llk_pack_dest_init(const std::uint32_t pack_output = 0) {
+inline void llk_pack_dest_init(const std::uint32_t pack_output = 16) {
 
     const std::uint32_t output_id = get_output_id(pack_output);
     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);

From 5e838a834169b66bc282a2de640e20d42437ff47 Mon Sep 17 00:00:00 2001
From: acejkov <acejkov@tenstorrent.com>
Date: Tue, 5 Dec 2023 19:57:03 +0000
Subject: [PATCH 07/16] #3908: Fix get_operand_id to return correct value

---
 tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
index 0207ecc345f..2b94607012d 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
@@ -8,9 +8,8 @@
 
 inline uint32_t get_operand_id(uint32_t operand)
 {
-    const int INTERMEDIATE_BASE_ID = 24;
     const int OPERAND_BASE_ID = 0;
-    return (operand>=INTERMEDIATE_BASE_ID) ? operand - 8 : operand - OPERAND_BASE_ID;
+    return (operand);  // identity mapping: the operand value is returned as the id; OPERAND_BASE_ID above is no longer referenced
 }
 
 inline const uint32_t get_operand_src_format(const std::uint32_t operand_id)

From dc9f23993553a7012a0da714fe5a286e0f020169 Mon Sep 17 00:00:00 2001
From: acejkov <acejkov@tenstorrent.com>
Date: Tue, 5 Dec 2023 23:03:10 +0000
Subject: [PATCH 08/16] #3908: Uplift the latest changes from core llk lib.
 Common files are now identical

---
 .../ckernels/wormhole_b0/common/inc/ckernel.h |  4 ++--
 .../wormhole_b0/common/inc/ckernel_sfpu.h     | 20 +++++++++++++++++++
 .../ckernels/wormhole_b0/llk_lib/llk_defs.h   |  4 +---
 .../metal/llk_api/llk_math_binary_sfpu_api.h  |  8 ++++----
 .../metal/llk_api/llk_math_unary_sfpu_api.h   | 18 ++++++++---------
 .../llk_math_eltwise_unary_sfpu_0_param.h     |  6 +++---
 .../llk_math_eltwise_unary_sfpu_1_param.h     |  6 +++---
 ..._math_eltwise_unary_sfpu_common_includes.h |  6 +++---
 .../llk_math_eltwise_unary_sfpu_elu.h         |  2 +-
 .../llk_math_eltwise_unary_sfpu_erf_erfc.h    |  4 ++--
 .../llk_math_eltwise_unary_sfpu_erfinv.h      |  2 +-
 .../llk_math_eltwise_unary_sfpu_exp.h         |  2 +-
 .../llk_math_eltwise_unary_sfpu_gelu.h        |  4 ++--
 .../llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h |  2 +-
 .../llk_math_eltwise_unary_sfpu_isinf_isnan.h | 10 +++++-----
 ...math_eltwise_unary_sfpu_logical_not_noti.h |  2 +-
 .../llk_math_eltwise_unary_sfpu_recip.h       |  2 +-
 .../llk_math_eltwise_unary_sfpu_relu.h        |  8 ++++----
 .../llk_math_eltwise_unary_sfpu_reverseops.h  |  2 +-
 .../llk_math_eltwise_unary_sfpu_sqrt.h        |  2 +-
 ...llk_math_eltwise_unary_sfpu_trigonometry.h |  6 +++---
 21 files changed, 69 insertions(+), 51 deletions(-)

diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h
index b731cc4bf81..ebc48646c8a 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h
@@ -36,7 +36,7 @@
 #define OVERLAY_DECOUPLE 0
 #endif
 
-#ifdef LLK_TB_TEST
+#if defined(EN_KERNEL_SLOWDOWN)
 #include "kernel_slowdown_config.h"
 #endif
 
@@ -61,10 +61,10 @@
 #include "ckernel_include.h"
 #include "tensix.h"
 #include "fw_debug.h"
+#include "tt_log.h"
 // #include <cstring>
 #if defined(PERF_DUMP) || DELAY_EN > 0
 #include <l1_address_map.h>
-#include "tt_log.h"
 #include "perf_lib/scratch_api.h"
 #endif
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h
index b00ea4a0b1f..c0fcaf197cb 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h
@@ -1319,5 +1319,25 @@ inline void _dequant_int32_(const int iterations, const uint dst_offset)
     }
 }
 
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void _add_int32_(const int iterations, const uint dst_offset) {
+    // Int32 add: operand A is input1 (int32), operand B is input2 (int32).
+    // Output is int32; operand B is addressed with dst_offset * 64, operand A and the result with 0.
+    // The loop bound is the ITERATIONS template argument; `iterations` and APPROXIMATION_MODE are not read in this body.
+    #pragma GCC unroll 8
+    for (int d = 0; d < ITERATIONS; d++) {
+        // operand A - int32
+        TTI_SFPLOAD(0, 12, 3, 0);
+        // operand B - int32, addressed with dst_offset * 64 (runtime value, hence TT_ rather than TTI_ - presumably; confirm)
+        TT_SFPLOAD(1, 12, 3, dst_offset * 64);
+        TTI_SFPIADD(0, 1, 0, 4);
+        // One NOP before the store. NOTE(review): the original note cited MAD's 2-cycle pipeline latency, but the op above is SFPIADD - confirm the bubble is needed
+        TTI_NOP;
+        // LREG_0 -> dest as int32
+        TTI_SFPSTORE(0, 12, 3, 0);
+        dst_reg++;
+    }
+}
+
 } // namespace sfpu
 } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h
index e5250fb7412..e80e29d83c7 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h
@@ -7,13 +7,11 @@
 
 namespace ckernel {
 
-enum Dim {
+enum VectorMode {  // cast to int for the vector_mode argument of the SFPU LLK entry points: R = row vector, C = column vector, RC = their default
   None      = 0,
   R         = 1,
   C         = 2,
-  Z         = 3,
   RC        = 4,
-  ZR        = 5,
   Invalid   = 0xFF,
 };
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h
index 5f662f22081..c7c42763d95 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h
@@ -15,7 +15,7 @@ inline void llk_math_eltwise_binary_sfpu(
     const uint operand,
     uint dst_index_a,
     uint dst_index_b,
-    int vector_mode = (int)Dim::RC,
+    int vector_mode = (int)VectorMode::RC,
     uint param0 = 0,
     uint param1 = 0,
     uint param2 = 0,
@@ -38,7 +38,7 @@ inline void llk_math_eltwise_binary_sfpu_init(
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
 inline void llk_math_eltwise_binary_sfpu_quant_int32(
-    uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+    uint dst_index_a, uint dst_index_b, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_binary_sfpu<SfpuType::quant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
 }
 
@@ -49,7 +49,7 @@ inline void llk_math_eltwise_binary_sfpu_quant_int32_init(const uint zero_point)
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
 inline void llk_math_eltwise_binary_sfpu_requant_int32(
-    uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+    uint dst_index_a, uint dst_index_b, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_binary_sfpu<SfpuType::requant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
 }
 
@@ -60,7 +60,7 @@ inline void llk_math_eltwise_binary_sfpu_requant_int32_init(const uint zero_poin
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
 inline void llk_math_eltwise_binary_sfpu_dequant_int32(
-    uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+    uint dst_index_a, uint dst_index_b, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_binary_sfpu<SfpuType::dequant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
 }
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
index 898788ca415..f65a6b86ddd 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
@@ -25,7 +25,7 @@ inline void llk_math_eltwise_unary_sfpu_rsqrt_init() {
 }
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_unary_sfpu<SfpuType::log, APPROXIMATE, dst_sync>(dst_index, vector_mode);
 }
 
@@ -45,7 +45,7 @@ inline void llk_math_eltwise_unary_sfpu_log_with_base_init() {
 }
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_unary_sfpu<SfpuType::tanh, APPROXIMATE, dst_sync>(dst_index, vector_mode);
 }
 
@@ -88,7 +88,7 @@ inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) {
 }
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_unary_sfpu<SfpuType::sigmoid, APPROXIMATE, dst_sync>(dst_index, vector_mode);
 }
 
@@ -164,7 +164,7 @@ inline void llk_math_eltwise_unary_sfpu_gez_init() {
 }
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_unary_sfpu<SfpuType::max, APPROXIMATE, dst_sync>(dst_index, vector_mode);
 }
 
@@ -174,7 +174,7 @@ inline void llk_math_eltwise_unary_sfpu_max_init() {
 }
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_unary_sfpu<SfpuType::square, APPROXIMATE, dst_sync>(dst_index, vector_mode);
 }
 
@@ -184,7 +184,7 @@ inline void llk_math_eltwise_unary_sfpu_square_init() {
 }
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_unary_sfpu<SfpuType::power, APPROXIMATE, dst_sync>(dst_index, vector_mode, pow);
 }
 
@@ -194,7 +194,7 @@ inline void llk_math_eltwise_unary_sfpu_power_init() {
 }
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_unary_sfpu<SfpuType::abs, APPROXIMATE, dst_sync>(dst_index, vector_mode);
 }
 
@@ -204,7 +204,7 @@ inline void llk_math_eltwise_unary_sfpu_abs_init() {
 }
 
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_unary_sfpu<SfpuType::cast_fp32_to_fp16a, APPROXIMATE, dst_sync>(dst_index, vector_mode);
 }
 
@@ -226,7 +226,7 @@ inline void llk_math_eltwise_unary_sfpu_exp2_init() {
 
 //heaviside
 template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_unary_sfpu<SfpuType::heaviside, APPROXIMATE, dst_sync>(dst_index,vector_mode,param0);
 }
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h
index a896c4064c3..c72b136f851 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h
@@ -11,7 +11,7 @@ inline void llk_math_eltwise_unary_sfpu_0_param(
     void (*first_func)(),
     void (*func)(),
     uint dst_index,
-    int vector_mode = Dim::RC) {
+    int vector_mode = (int)VectorMode::RC) {
     if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) {
         math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(math_sync_tile_dst_index);
     } else {
@@ -19,7 +19,7 @@ inline void llk_math_eltwise_unary_sfpu_0_param(
     }
     math::set_addr_mod_base();
     TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH);
-    if (vector_mode == Dim::R) {
+    if (vector_mode == (int)VectorMode::R) {
         // Do a row vector, Face0 + Face1 -- first iteration (first row)
         const int ITERATIONS = 1;
 #pragma GCC unroll 0
@@ -33,7 +33,7 @@ inline void llk_math_eltwise_unary_sfpu_0_param(
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-    } else if (vector_mode == Dim::C) {
+    } else if (vector_mode == (int)VectorMode::C) {
         // Do a column vector, Face0 + Face2 -- All iterations for full face
 #pragma GCC unroll 0
         for (int face = 0; face < 2; face++) {
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h
index c3477ea5c7e..892ed513ea2 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h
@@ -11,7 +11,7 @@ inline void llk_math_eltwise_unary_sfpu_1_param(
     void (*first_func)(uint),
     void (*func)(uint),
     uint dst_index,
-    int vector_mode = Dim::RC,
+    int vector_mode = (int)VectorMode::RC,
     uint param0 = 0) {
     if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) {
         math::set_dst_write_addr<DstTileLayout::Default, DstTileShape::Tile32x32>(math_sync_tile_dst_index);
@@ -20,7 +20,7 @@ inline void llk_math_eltwise_unary_sfpu_1_param(
     }
     math::set_addr_mod_base();
     TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH);
-    if (vector_mode == Dim::R) {
+    if (vector_mode == (int)VectorMode::R) {
         // Do a row vector, Face0 + Face1 -- first iteration (first row)
         const int ITERATIONS = 1;
 #pragma GCC unroll 0
@@ -34,7 +34,7 @@ inline void llk_math_eltwise_unary_sfpu_1_param(
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
         TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D);
-    } else if (vector_mode == Dim::C) {
+    } else if (vector_mode == (int)VectorMode::C) {
         // Do a column vector, Face0 + Face2 -- All iterations for full face
 #pragma GCC unroll 0
         for (int face = 0; face < 2; face++) {
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
index cc08a9f346c..06aa57e9e34 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
@@ -111,7 +111,7 @@ inline void llk_math_calculate_sfpu(
 template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull, bool IS_INT_SFPU_EN = false>
 inline void llk_math_eltwise_unary_sfpu(
     uint dst_index,
-    int vector_mode = (int)Dim::RC,
+    int vector_mode = (int)VectorMode::RC,
     uint param0 = 0,
     uint param1 = 0,
     uint param2 = 0,
@@ -126,7 +126,7 @@ inline void llk_math_eltwise_unary_sfpu(
 
     _llk_math_eltwise_unary_sfpu_start_<Dst>(dst_index);
 
-    if (vector_mode == (int)Dim::R) {
+    if (vector_mode == (int)VectorMode::R) {
         // Do a row vector, Face0 + Face1 -- first iteration (first row)
         const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 2 : face_r_dim / 2)
                                                : 2;  // At least 2 iterations for odd and even columns
@@ -140,7 +140,7 @@ inline void llk_math_eltwise_unary_sfpu(
         // Skip next two faces
         _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
         _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
-    } else if (vector_mode == (int)Dim::C) {
+    } else if (vector_mode == (int)VectorMode::C) {
         // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for
         // full face
 #pragma GCC unroll 0
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h
index 7917165a25f..81250ea1d3b 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h
@@ -24,7 +24,7 @@ inline void llk_math_eltwise_unary_sfpu_elu(uint dst_index, uint param0) {
     llk_math_eltwise_unary_sfpu_1_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_elu<APPROXIMATE>,
                                 ckernel::sfpu::calculate_elu<APPROXIMATE>,
-				                dst_index, Dim::RC, param0);
+				                dst_index, (int)VectorMode::RC, param0);
 }
 
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h
index 48f6b8dc398..da9a44c5382 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h
@@ -29,7 +29,7 @@ inline void llk_math_eltwise_unary_sfpu_erf(uint dst_index, int param0 = 0) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sfpu_erf_erfc<SfpuType::erf, APPROXIMATE>,
                                 ckernel::sfpu::calculate_sfpu_erf_erfc<SfpuType::erf, APPROXIMATE>,
-                                dst_index, Dim::RC);
+                                dst_index, (int)VectorMode::RC);
 }
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
@@ -37,7 +37,7 @@ inline void llk_math_eltwise_unary_sfpu_erfc(uint dst_index, int param0 = 0) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sfpu_erf_erfc<SfpuType::erfc, APPROXIMATE>,
                                 ckernel::sfpu::calculate_sfpu_erf_erfc<SfpuType::erfc, APPROXIMATE>,
-                                dst_index, Dim::RC);
+                                dst_index, (int)VectorMode::RC);
 }
 
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h
index aadcb422609..6b7f6a9311e 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h
@@ -24,7 +24,7 @@ inline void llk_math_eltwise_unary_sfpu_erfinv_op(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_erfinv<APPROXIMATE>,
                                 ckernel::sfpu::calculate_erfinv<APPROXIMATE>,
-                                dst_index, Dim::RC);
+                                dst_index, (int)VectorMode::RC);
 }
 
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h
index 4022d34274b..ca18cfd4fb8 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h
@@ -15,7 +15,7 @@ namespace ckernel {
 // New LLK SFPU APIs
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_exponential(uint dst_index, int vector_mode = Dim::RC, int param0 = 0) {
+inline void llk_math_eltwise_unary_sfpu_exponential(uint dst_index, int vector_mode = (int)VectorMode::RC, int param0 = 0) {
 
 	constexpr bool zero_negative = true;
     constexpr int first_iterations = 1;
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h
index 9c96bd6877a..c4976bae921 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h
@@ -13,7 +13,7 @@ namespace ckernel {
 // New LLK SFPU APIs
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_gelu(uint dst_index, int vector_mode = Dim::RC, int param0=0) {
+inline void llk_math_eltwise_unary_sfpu_gelu(uint dst_index, int vector_mode = (int)VectorMode::RC, int param0=0) {
     constexpr int first_iterations = 1;
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_gelu<APPROXIMATE, first_iterations>,
@@ -27,7 +27,7 @@ inline void llk_math_eltwise_unary_sfpu_gelu_init() {
 }
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_gelu_derivative(uint dst_index, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_gelu_derivative(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     constexpr int first_iterations = 1;
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_gelu_derivative<APPROXIMATE, first_iterations>,
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h
index 3b3e8ef87a3..c0b686a269b 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h
@@ -24,7 +24,7 @@ inline void llk_math_eltwise_unary_sfpu_i0_op(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_i0<APPROXIMATE>,
                                 ckernel::sfpu::calculate_i0<APPROXIMATE>,
-                                dst_index, Dim::RC);
+                                dst_index, (int)VectorMode::RC);
 }
 
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h
index be0d2cfea43..216bcba507f 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h
@@ -26,7 +26,7 @@ inline void llk_math_eltwise_unary_sfpu_isinf(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sfpu_isinf_isnan<SfpuType::isinf, APPROXIMATE>,
                                 ckernel::sfpu::calculate_sfpu_isinf_isnan<SfpuType::isinf, APPROXIMATE>,
- 				                dst_index, Dim::RC);
+ 				                dst_index, (int)VectorMode::RC);
 
 }
 
@@ -41,7 +41,7 @@ inline void llk_math_eltwise_unary_sfpu_isposinf(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sfpu_isinf_isnan<SfpuType::isposinf, APPROXIMATE>,
                                 ckernel::sfpu::calculate_sfpu_isinf_isnan<SfpuType::isposinf, APPROXIMATE>,
- 				                dst_index,Dim::RC);
+ 				                dst_index,(int)VectorMode::RC);
 
 }
 
@@ -58,7 +58,7 @@ inline void llk_math_eltwise_unary_sfpu_isneginf(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sfpu_isinf_isnan<SfpuType::isneginf, APPROXIMATE>,
                                 ckernel::sfpu::calculate_sfpu_isinf_isnan<SfpuType::isneginf, APPROXIMATE>,
-                                dst_index,Dim::RC);
+                                dst_index,(int)VectorMode::RC);
 
 }
 
@@ -73,7 +73,7 @@ inline void llk_math_eltwise_unary_sfpu_isnan(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sfpu_isinf_isnan<SfpuType::isnan, APPROXIMATE>,
                                 ckernel::sfpu::calculate_sfpu_isinf_isnan<SfpuType::isnan, APPROXIMATE>,
- 				                dst_index,Dim::RC);
+ 				                dst_index,(int)VectorMode::RC);
 
 }
 
@@ -88,7 +88,7 @@ inline void llk_math_eltwise_unary_sfpu_isfinite(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sfpu_isinf_isnan<SfpuType::isfinite, APPROXIMATE>,
                                 ckernel::sfpu::calculate_sfpu_isinf_isnan<SfpuType::isfinite, APPROXIMATE>,
- 				                dst_index,Dim::RC);
+ 				                dst_index,(int)VectorMode::RC);
 
 }
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h
index 668701d0aa9..ca3db8419de 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h
@@ -24,7 +24,7 @@ inline void llk_math_eltwise_unary_sfpu_logical_not_unary_op(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_logical_not_unary<APPROXIMATE>,
                                 ckernel::sfpu::calculate_logical_not_unary<APPROXIMATE>,
-                                dst_index, Dim::RC);
+                                dst_index, (int)VectorMode::RC);
 }
 
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h
index a2baf2e58e6..251bed18f26 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h
@@ -14,7 +14,7 @@ namespace ckernel {
 // New LLK SFPU APIs
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_reciprocal(uint dst_index, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_reciprocal(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     constexpr int first_iterations = 1;
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_reciprocal<APPROXIMATE, first_iterations>,
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h
index 9d737a27db4..40862b65b8b 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h
@@ -38,7 +38,7 @@ inline void llk_math_eltwise_unary_sfpu_lrelu(uint dst_index, uint param0 = 0) {
     llk_math_eltwise_unary_sfpu_1_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_lrelu<APPROXIMATE>,
                                 ckernel::sfpu::calculate_lrelu<APPROXIMATE>,
-                                dst_index, Dim::RC, param0);
+                                dst_index, (int)VectorMode::RC, param0);
 }
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
@@ -46,7 +46,7 @@ inline void llk_math_eltwise_unary_sfpu_relu_max(uint dst_index, uint param0 = 0
     llk_math_eltwise_unary_sfpu_1_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::relu_max<APPROXIMATE>,
                                 ckernel::sfpu::relu_max<APPROXIMATE>,
-                                dst_index, Dim::RC, param0);
+                                dst_index, (int)VectorMode::RC, param0);
 }
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
@@ -54,7 +54,7 @@ inline void llk_math_eltwise_unary_sfpu_relu_min(uint dst_index, uint param0 = 0
     llk_math_eltwise_unary_sfpu_1_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::relu_min<APPROXIMATE>,
                                 ckernel::sfpu::relu_min<APPROXIMATE>,
-                                dst_index, Dim::RC, param0);
+                                dst_index, (int)VectorMode::RC, param0);
 }
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
@@ -62,7 +62,7 @@ inline void llk_math_eltwise_unary_sfpu_relu(uint dst_index) {
     llk_math_eltwise_unary_sfpu_1_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::relu_min<APPROXIMATE>,
                                 ckernel::sfpu::relu_min<APPROXIMATE>,
-                                dst_index, Dim::RC, 0);
+                                dst_index, (int)VectorMode::RC, 0);
 }
 
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h
index c1c6c697f81..d307f9490ad 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h
@@ -24,7 +24,7 @@ namespace ckernel {
         llk_math_eltwise_unary_sfpu_1_param<APPROXIMATE, Dst>
                                     (ckernel::sfpu::calculate_rsub<APPROXIMATE,8>,
                                     ckernel::sfpu::calculate_rsub<APPROXIMATE,8>,
-                                    dst_index, Dim::RC, param0);
+                                    dst_index, (int)VectorMode::RC, param0);
     }
 
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h
index dc1be1f16f9..ec50f756429 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h
@@ -13,7 +13,7 @@ namespace ckernel {
 // New LLK SFPU APIs
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_sqrt(uint dst_index, int vector_mode = Dim::RC) {
+inline void llk_math_eltwise_unary_sfpu_sqrt(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     constexpr bool zero_negative = true;
     constexpr int first_iterations = 1;
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h
index e5066307377..94022110bc3 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h
@@ -25,7 +25,7 @@ inline void llk_math_eltwise_unary_sfpu_sine_op(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sfpu_trig<SfpuType::sine, APPROXIMATE>,
                                 ckernel::sfpu::calculate_sfpu_trig<SfpuType::sine, APPROXIMATE>,
-                                dst_index, Dim::RC);
+                                dst_index, (int)VectorMode::RC);
 }
 
 
@@ -40,7 +40,7 @@ inline void llk_math_eltwise_unary_sfpu_cosine_op(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sfpu_trig<SfpuType::cosine, APPROXIMATE>,
                                 ckernel::sfpu::calculate_sfpu_trig<SfpuType::cosine, APPROXIMATE>,
-                                dst_index, Dim::RC);
+                                dst_index, (int)VectorMode::RC);
 }
 
 
@@ -55,7 +55,7 @@ inline void llk_math_eltwise_unary_sfpu_tan_op(uint dst_index) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sfpu_trig<SfpuType::tan, APPROXIMATE>,
                                 ckernel::sfpu::calculate_sfpu_trig<SfpuType::tan, APPROXIMATE>,
-                                dst_index, Dim::RC);
+                                dst_index, (int)VectorMode::RC);
 
 }
 

From c07542fe347205ff2e274f29a48b3817add0ccc2 Mon Sep 17 00:00:00 2001
From: acejkov <acejkov@tenstorrent.com>
Date: Wed, 6 Dec 2023 16:39:21 +0000
Subject: [PATCH 09/16] #3908: Fix kernel compile error for
 test_device_profiler tests

---
 tt_metal/include/compute_kernel_api.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tt_metal/include/compute_kernel_api.h b/tt_metal/include/compute_kernel_api.h
index a1abc4c2d4f..433f9ea07f5 100644
--- a/tt_metal/include/compute_kernel_api.h
+++ b/tt_metal/include/compute_kernel_api.h
@@ -30,8 +30,8 @@
 #endif
 
 #ifdef TRISC_PACK
-#include "llk_pack_common.h"
-#include "llk_pack.h"
+#include "llk_pack_api.h"
+#include "llk_io_pack.h"
 #define PACK(x) x
 #define MAIN pack_main()
 #else
@@ -46,6 +46,7 @@
 #include "llk_unpack_reduce_api.h"
 #include "llk_unpack_tilize_api.h"
 #include "llk_unpack_untilize_api.h"
+#include "llk_io_unpack.h"
 #define UNPACK(x) x
 #define MAIN unpack_main()
 #else

From 24d0029a5c638e34f36c81e0550dc99d14ea63ad Mon Sep 17 00:00:00 2001
From: acejkov <acejkov@tenstorrent.com>
Date: Wed, 6 Dec 2023 18:17:44 +0000
Subject: [PATCH 10/16] #3908: Fix sfpu init for exp2,expm1,rsqrt,atan

---
 .../llk_math_eltwise_unary_sfpu_init.h        | 32 ++++++++-----------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
index e43682ab4f2..6d735702f3c 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
@@ -34,30 +34,24 @@ inline void llk_math_eltwise_unary_sfpu_init(
     _llk_math_eltwise_unary_sfpu_init_();
 
     switch (sfpu_op) {
-        case SfpuType::reciprocal:
-            sfpu::_init_reciprocal_<APPROXIMATE>();
-            break;
-        case SfpuType::exponential:
-            sfpu::_init_exponential_<APPROXIMATE>();
-            break;
-        case SfpuType::log:
-            sfpu::_init_log_<APPROXIMATE>();
-            break;
-        case SfpuType::sqrt:
-            sfpu::_init_sqrt_<APPROXIMATE>();
-            break;
         case SfpuType::tanh:
         case SfpuType::tanh_derivative:
-            sfpu::_init_tanh_<APPROXIMATE>();
-            break;
+             sfpu::_init_tanh_<APPROXIMATE>();
+             break;
         case SfpuType::sigmoid:
-            sfpu::_init_sigmoid_<APPROXIMATE>();
+             sfpu::_init_sigmoid_<APPROXIMATE>();
+             break;
+        case SfpuType::expm1:
+        case SfpuType::exp2:
+            sfpu::_init_exponential_<APPROXIMATE>();
             break;
-        case SfpuType::gelu_derivative:
-            sfpu::_init_gelu_derivative_<APPROXIMATE>();
+        case SfpuType::rsqrt:
+        case SfpuType::atan:
+            sfpu::_init_reciprocal_<APPROXIMATE>();
             break;
-        case SfpuType::gelu:
-            sfpu::_init_gelu_<APPROXIMATE>();
+        case SfpuType::log_with_base:
+        case SfpuType::log:
+            sfpu::_init_log_<APPROXIMATE>();
             break;
         case SfpuType::dropout:
             sfpu::_init_dropout_(param2);

From 1e6d679b8e67d159dc3dbff0840a910036119c60 Mon Sep 17 00:00:00 2001
From: acejkov <acejkov@tenstorrent.com>
Date: Thu, 7 Dec 2023 20:19:29 +0000
Subject: [PATCH 11/16] #3908: Add missing global var to trisck.cc to fix
 kernel compile error for test_graph_interpreter

---
 tt_metal/hw/firmware/src/trisck.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tt_metal/hw/firmware/src/trisck.cc b/tt_metal/hw/firmware/src/trisck.cc
index 174fe265300..0115db2f96f 100644
--- a/tt_metal/hw/firmware/src/trisck.cc
+++ b/tt_metal/hw/firmware/src/trisck.cc
@@ -20,6 +20,7 @@ uint32_t unp_cfg_context = 0;
 uint32_t pack_sync_tile_dst_ptr = 0;
 uint32_t math_sync_tile_dst_index = 0;
 uint32_t gl_alu_format_spec_reg = 0;
+uint32_t op_info_offset = 0;
 
 namespace ckernel
 {

From fae0364d154c5519a6dc7b21688d65c973d629c7 Mon Sep 17 00:00:00 2001
From: Reem Tawfik <rtawfik@tenstorrent.com>
Date: Wed, 6 Dec 2023 18:07:26 +0000
Subject: [PATCH 12/16] #3908: Add GS llk api/lib files to fix compile

---
 .../ckernels/grayskull/common/inc/ckernel.h   | 105 ---
 .../grayskull/common/inc/ckernel_globals.h    |  59 --
 .../grayskull/common/inc/ckernel_template.h   | 217 +++++
 .../grayskull/common/inc/cpack_common.h       |  10 -
 .../grayskull/common/inc/cunpack_common.h     |   5 -
 .../grayskull/common/src/ckernel_template.cc  | 229 -----
 .../hw/ckernels/grayskull/llk_lib/llk_defs.h  |  61 --
 ..._math_eltwise_unary_sfpu_common_includes.h |  16 -
 .../hw/ckernels/grayskull/llk_lib/llk_pack.h  |   2 +-
 .../ckernels/grayskull/llk_lib/llk_unpack_A.h |   3 +
 .../grayskull/llk_lib/llk_unpack_AB.h         |   3 +
 .../grayskull/llk_lib/llk_unpack_AB_matmul.h  |   3 +
 .../grayskull/llk_lib/llk_unpack_common.h     |   3 +
 .../{common/inc => metal/common}/chlkc_list.h |   4 +-
 .../metal/common/metal_ckernel_globals.h      |  63 ++
 .../metal/llk_api/llk_math_binary_api.h       |  86 ++
 .../metal/llk_api/llk_math_binary_sfpu_api.h  |  70 ++
 .../metal/llk_api/llk_math_common_api.h       | 108 +++
 .../metal/llk_api/llk_math_matmul_api.h       |  68 ++
 .../metal/llk_api/llk_math_reduce_api.h       |  28 +
 .../llk_api/llk_math_unary_datacopy_api.h     |  36 +
 .../metal/llk_api/llk_math_unary_sfpu_api.h   | 293 +++++++
 .../grayskull/metal/llk_api/llk_op_info_api.h |  23 +
 .../grayskull/metal/llk_api/llk_pack_api.h    | 308 +++++++
 .../llk_api}/llk_param_structs.h              |   0
 .../llk_api/llk_sfpu}/ckernel_reverseops.h    |   0
 .../llk_api/llk_sfpu}/ckernel_sfpu_cdf.h      |   0
 .../llk_sfpu}/ckernel_sfpu_converter.h        |   0
 .../llk_api/llk_sfpu}/ckernel_sfpu_elu.h      |   0
 .../llk_api/llk_sfpu}/ckernel_sfpu_erf_erfc.h |   0
 .../llk_api/llk_sfpu}/ckernel_sfpu_erfinv.h   |   0
 .../llk_api/llk_sfpu}/ckernel_sfpu_exp.h      |   0
 .../llk_api/llk_sfpu}/ckernel_sfpu_gelu.h     |   0
 .../llk_api/llk_sfpu}/ckernel_sfpu_i0.h       |   0
 .../llk_sfpu}/ckernel_sfpu_isinf_isnan.h      |   0
 .../llk_sfpu}/ckernel_sfpu_logical_not_noti.h |   0
 .../llk_api/llk_sfpu}/ckernel_sfpu_recip.h    |   0
 .../llk_api/llk_sfpu}/ckernel_sfpu_relu.h     |   0
 .../llk_api/llk_sfpu}/ckernel_sfpu_sqrt.h     |   0
 .../llk_sfpu}/ckernel_sfpu_trigonometry.h     |   0
 .../llk_math_eltwise_unary_sfpu_0_param.h     |   0
 .../llk_math_eltwise_unary_sfpu_1_param.h     |   0
 ..._math_eltwise_unary_sfpu_common_includes.h | 171 ++++
 .../llk_math_eltwise_unary_sfpu_elu.h         |   0
 .../llk_math_eltwise_unary_sfpu_erf_erfc.h    |   0
 .../llk_math_eltwise_unary_sfpu_erfinv.h      |   0
 .../llk_math_eltwise_unary_sfpu_exp.h         |   0
 .../llk_math_eltwise_unary_sfpu_gelu.h        |   0
 .../llk_math_eltwise_unary_sfpu_i0.h          |   0
 .../llk_math_eltwise_unary_sfpu_init.h        |   0
 .../llk_math_eltwise_unary_sfpu_isinf_isnan.h |   0
 ...math_eltwise_unary_sfpu_logical_not_noti.h |   0
 .../llk_math_eltwise_unary_sfpu_recip.h       |   0
 .../llk_math_eltwise_unary_sfpu_relu.h        |   0
 .../llk_math_eltwise_unary_sfpu_reverseops.h  |   0
 .../llk_math_eltwise_unary_sfpu_sqrt.h        |   0
 ...llk_math_eltwise_unary_sfpu_trigonometry.h |   0
 .../llk_api/llk_sfpu/metal_ckernel_sfpu.h     | 780 ++++++++++++++++++
 .../grayskull/metal/llk_api/llk_sfpu_types.h  |  64 ++
 .../metal/llk_api/llk_unpack_AB_api.h         |  85 ++
 .../metal/llk_api/llk_unpack_AB_matmul_api.h  | 136 +++
 .../metal/llk_api/llk_unpack_A_api.h          |  89 ++
 .../metal/llk_api/llk_unpack_common_api.h     | 137 +++
 .../metal/llk_api/llk_unpack_reduce_api.h     |  94 +++
 .../metal/llk_api/llk_unpack_tilize_api.h     |  99 +++
 .../metal/llk_api/llk_unpack_untilize_api.h   |  96 +++
 .../ckernels/grayskull/metal/llk_io/llk_io.cc |   3 +
 .../ckernels/grayskull/metal/llk_io/llk_io.h  |  10 +
 .../{llk_lib => metal/llk_io}/llk_io_pack.h   |   0
 .../{llk_lib => metal/llk_io}/llk_io_unpack.h |   0
 .../grayskull/metal/llk_io/llk_operands.h     |  53 ++
 .../grayskull/metal/llk_io/llk_outputs.h      |  61 ++
 .../wormhole_b0/common/inc/ckernel_globals.h  |   1 -
 .../metal/common/metal_ckernel_globals.h      |   2 +
 .../wormhole_b0/metal/llk_io/llk_operands.h   |   3 +-
 .../compute_kernel_api/common_globals.h       |   1 +
 tt_metal/include/compute_kernel_api/unpack.h  |   2 +-
 77 files changed, 3197 insertions(+), 493 deletions(-)
 delete mode 100644 tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc
 delete mode 100644 tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/common}/chlkc_list.h (91%)
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api}/llk_param_structs.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_reverseops.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_cdf.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_converter.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_elu.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_erf_erfc.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_erfinv.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_exp.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_gelu.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_i0.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_isinf_isnan.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_logical_not_noti.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_recip.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_relu.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_sqrt.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_trigonometry.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_0_param.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_1_param.h (100%)
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_elu.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_erf_erfc.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_erfinv.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_exp.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_gelu.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_i0.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_init.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_isinf_isnan.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_logical_not_noti.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_recip.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_relu.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_reverseops.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_sqrt.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_trigonometry.h (100%)
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.cc
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_io}/llk_io_pack.h (100%)
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_io}/llk_io_unpack.h (100%)
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h

diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
index b2de68e862a..8bbf675af9e 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
@@ -62,8 +62,6 @@ extern uint32_t dest_offset_id;
 extern uint32_t dbg_event_index;
 extern uint32_t dbg_event_end;
 
-extern uint32_t op_info_offset;
-
 // Internal scope to namespace methods only (C++ does not allow namespace private ownership)
 namespace internal {
 }
@@ -281,109 +279,6 @@ inline void debug_dump(uint8_t *data, uint32_t byte_size) {
     // TODO(pk) re-implement
 }
 
-inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) {
-
-    uint32_t* op_info_ptr = reinterpret_cast<uint32_t*>(OP_INFO_BASE_ADDR + op_info_offset);
-    static constexpr uint32_t op_info_num_items = 7;
-
-    volatile tt_l1_ptr uint32_t* op_info_struct_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(&op_info_struct);
-    for (uint32_t i = 0; i < op_info_num_items; i++) {
-        op_info_struct_ptr[i] = op_info_ptr[i];
-    }
-    op_info_offset += 28;
-
-    if (op_info_offset == OP_INFO_SIZE) {
-        op_info_offset = 0; // In case we go out of bounds
-    }
-}
-
-inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b)
-{
-  unsigned int r = 0;
-  while (a)
-    {
-      if (a & 1)
-        r += b;
-      a >>= 1;
-      b <<= 1;
-    }
-  return r;
-}
-
-inline __attribute__((always_inline)) uint32_t fast_udiv_12(uint32_t n)
-{
-    // Uses embedding style magic number
-    // * fixed point 1/12 then shifting.
-    // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm
-    return (((uint64_t) n * 0xAAAAAAAB) >> 32) >> 3;
-}
-
-inline __attribute__((always_inline)) uint32_t fast_udiv_94(uint32_t n)
-{
-    // Uses embedding style magic number
-    // * fixed point 1/12 then shifting.
-    // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm
-    return (((uint64_t) n * 0xAE4C415D) >> 32) >> 6;
-}
-
-template <uint32_t d>
-inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n)
-{
-    if constexpr (d == 12) {
-        // fast divide for 12 divisor
-        return fast_udiv_12(n);
-    } else if constexpr (d == 94) {
-        // fast divide for 94 divisor. Handles Banked L1 address generation for E75
-        return fast_udiv_94(n);
-    } else {
-        // generic divide from llvm
-        const unsigned n_uword_bits = sizeof(uint32_t) * CHAR_BIT;
-        unsigned int q;
-        unsigned int r;
-        unsigned sr;
-        /* special cases */
-        if (d == 0)
-            return 0; /* ?! */
-        if (n == 0)
-            return 0;
-        sr = __builtin_clz(d) - __builtin_clz(n);
-        /* 0 <= sr <= n_uword_bits - 1 or sr large */
-        if (sr > n_uword_bits - 1)  /* d > r */
-            return 0;
-        if (sr == n_uword_bits - 1)  /* d == 1 */
-            return n;
-        ++sr;
-        /* 1 <= sr <= n_uword_bits - 1 */
-        /* Not a special case */
-        q = n << (n_uword_bits - sr);
-        r = n >> sr;
-        unsigned int  carry = 0;
-        for (; sr > 0; --sr)
-        {
-            /* r:q = ((r:q)  << 1) | carry */
-            r = (r << 1) | (q >> (n_uword_bits - 1));
-            q = (q << 1) | carry;
-            /* carry = 0;
-             * if (r.all >= d.all)
-             * {
-             *      r.all -= d.all;
-             *      carry = 1;
-             * }
-             */
-            const int s = (unsigned int)(d - r - 1) >> (n_uword_bits - 1);
-            carry = s & 1;
-            r -= d & s;
-        }
-        q = (q << 1) | carry;
-        return q;
-    }
-}
-template <uint32_t d>
-inline __attribute__((always_inline)) uint32_t umodsi3_const_divisor(uint32_t a)
-{
-    return a - udivsi3_const_divisor<d>(a) * d;
-}
-
 inline void tensix_sync()
 {
     volatile uint foo = 0;
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h
index a98ae7577ae..445c77d1e0b 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h
@@ -7,8 +7,6 @@
 #include <cstdint>
 #include "ckernel_structs.h"
 #include "risc_attribs.h"
-#include "tensix_functions.h"
-#include "hostdevcommon/common_runtime_address_map.h"
 
 extern uint32_t cfg_state_id;
 extern uint32_t unp_cfg_context;
@@ -16,62 +14,5 @@ extern uint32_t gl_alu_format_spec_reg;
 
 extern volatile uint32_t l1_buffer[16];
 
-//extern const int32_t unpack_src_format[24];
-//extern const int32_t unpack_dst_format[24];
-//extern const int32_t pack_src_format[16];
-//extern const int32_t pack_dst_format[16];
-
 extern uint32_t pack_sync_tile_dst_ptr;
 extern uint32_t math_sync_tile_dst_index;
-
-extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS];
-
-extern uint32_t __ldm_bss_start[];
-extern uint32_t __ldm_bss_end[];
-extern uint32_t __ldm_data_start[];
-extern uint32_t __ldm_data_end[];
-extern void (* __init_array_start[])();
-extern void (* __init_array_end[])();
-extern uint32_t __firmware_start[];
-
-extern void kernel_init();
-extern void kernel_launch();
-
-inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
-    // Cover L1 load latency of 6 cycles for the bulk of the copy
-    int32_t n = 0;
-    while (n < len - 5) {
-        uint32_t v0 = l1_addr[n + 0];
-        uint32_t v1 = l1_addr[n + 1];
-        uint32_t v2 = l1_addr[n + 2];
-        uint32_t v3 = l1_addr[n + 3];
-        uint32_t v4 = l1_addr[n + 4];
-        uint32_t v5 = l1_addr[n + 5];
-        local_mem_addr[n + 0] = v0;
-        local_mem_addr[n + 1] = v1;
-        local_mem_addr[n + 2] = v2;
-        local_mem_addr[n + 3] = v3;
-        local_mem_addr[n + 4] = v4;
-        local_mem_addr[n + 5] = v5;
-        n += 6;
-    }
-    // Could optimize this further (eg, loop of 2 or 4), probably not worth it
-    while (n < len) {
-        local_mem_addr[n] = l1_addr[n];
-        n++;
-    }
-}
-
-inline void firmware_kernel_common_init(void *init_local_l1_base) {
-
-    // Handle stuff typically done in crt0 in asm.  Easier to do in C
-    wzerorange(__ldm_bss_start, __ldm_bss_end);
-
-    int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
-    uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE;
-    l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words);
-
-    for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
-        (**fptr)();
-    }
-}
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h
index c8968d06577..ba1c08033b1 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h
@@ -237,4 +237,221 @@ class ckernel_unpack_template
     void program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask = 0); // calls program, then run
 };
 
+    inline ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op)
+        : m_outer_loop_len(outer_loop_len)
+        , m_inner_loop_len(inner_loop_len)
+        , m_loop_op0(loop_op)
+        , m_loop_op1(TT_OP_NOP)
+        , m_end_op0(TT_OP_NOP)
+        , m_end_op1(TT_OP_NOP)
+        , m_start_op0(TT_OP_NOP)
+    {
+        m_loop0_last_instr = loop_op; // single-op loop: the last instruction of each loop is the loop op itself
+        m_loop1_last_instr = loop_op;
+    }
+
+    inline ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1)
+        : m_outer_loop_len(outer_loop_len)
+        , m_inner_loop_len(inner_loop_len)
+        , m_loop_op0(loop_op0)
+        , m_loop_op1(loop_op1)
+        , m_end_op0(TT_OP_NOP)
+        , m_end_op1(TT_OP_NOP)
+        , m_start_op0(TT_OP_NOP)
+    {
+        m_loop0_last_instr = loop_op1; // two-op loop: the second op is the last instruction of each loop
+        m_loop1_last_instr = loop_op1;
+    }
+
+    inline void ckernel_template::set_loop_op0(uint loop_op)
+    {
+        m_loop_op0 = loop_op;
+    }
+
+    inline void ckernel_template::set_loop_op1(uint loop_op)
+    {
+        m_loop_op1 = loop_op;
+    }
+
+    inline void ckernel_template::set_end_ops(uint end_op0, uint end_op1)
+    {
+        m_end_op0 = end_op0;
+        m_end_op1 = end_op1;
+    }
+
+    inline void ckernel_template::set_end_op(uint end_op0)
+    {
+        set_end_ops(end_op0, TT_OP_NOP); // second end slot is filled with a NOP
+    }
+
+    inline void ckernel_template::set_start_op(uint start_op0)
+    {
+        m_start_op0 = start_op0;
+    }
+
+    inline void ckernel_template::set_last_inner_loop_instr(uint op)
+    {
+        m_loop1_last_instr = op;
+    }
+
+    inline void ckernel_template::set_last_outer_loop_instr(uint op)
+    {
+        m_loop0_last_instr = op;
+    }
+
+    inline void ckernel_template::program_and_run(volatile uint *instrn_buffer)
+    {
+        program(instrn_buffer);
+        run(instrn_buffer);
+    }
+
+    inline void ckernel_template::run(volatile uint *instrn_buffer)
+    {
+        TTI_MOP(1, 0, 0); // run the double-loop template
+    }
+
+    inline void ckernel_template::program(volatile uint *instrn_buffer)
+    {
+        volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
+
+        mop_sync(); // wait until previous mops have completed
+
+        mop_cfg[0] = m_outer_loop_len;
+        mop_cfg[1] = m_inner_loop_len;
+        mop_cfg[2] = m_start_op0;
+        mop_cfg[3] = m_end_op0;
+        mop_cfg[4] = m_end_op1;
+        mop_cfg[5] = m_loop_op0;
+        mop_cfg[6] = m_loop_op1;
+        mop_cfg[7] = m_loop0_last_instr;
+        mop_cfg[8] = m_loop1_last_instr;
+    }
+
+    inline void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
+    {
+        program(instrn_buffer);
+        run(instrn_buffer, count, zmask);
+    }
+
+    inline void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
+    {
+        FWASSERT("Unpack template only supports loops up to 128", count <= 128);
+        TT_MOP_CFG(zmask >> 16);              // Set the top 16 bits of zmask - we could skip this for count <= 16
+        TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template
+    }
+
+    // Version without zmask, should be slightly faster by eliminating one instruction.
+    inline void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count)
+    {
+        FWASSERT("Unpack template only supports loops up to 128", count <= 128);
+        TT_MOP(0, count - 1, 0); // Run the template
+    }
+
+    inline void ckernel_unpack_template::program(volatile uint *instrn_buffer) const
+    {
+        volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
+
+        mop_sync(); // wait until previous mops have completed
+
+        mop_cfg[1] = m_unpackB | (m_unpack_halo << 1);
+        mop_cfg[2] = m_B_instr;
+        mop_cfg[3] = m_A0_instr;
+        mop_cfg[4] = m_A1_instr;
+        mop_cfg[5] = m_A2_instr;
+        mop_cfg[6] = m_A3_instr;
+        mop_cfg[7] = m_skipA_instr;
+        mop_cfg[8] = m_skipB_instr;
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr)
+    {
+        return ckernel_unpack_template(false, // src B
+            false,                            // halo
+            A_instr, 0, 0, 0, skipA_instr, 0, 0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr)
+    {
+        return ckernel_unpack_template(false, // src B
+            false,                            // halo
+            B_instr, 0, 0, 0, skipB_instr, 0, 0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr)
+    {
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask)
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask)
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy)
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+
+        return ckernel_unpack_template(true, // src B
+            true,                            // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask)
+    {
+        // Figure out which unpack is last
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+
+        return ckernel_unpack_template(true, // src B
+            true,                            // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr,
+
+        uint B_instr, uint skipB_instr)
+    {
+        return ckernel_unpack_template(true, // src B
+            false,                           // halo
+            A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){
+        return ckernel_unpack_template::lA(instr0, skip0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){
+        // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA.
+        return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1);
+    }
+
 } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h b/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h
index b3c32f94d0e..6d3a1b6fa7b 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h
@@ -416,14 +416,4 @@ namespace ckernel::packer
    {
        dest_offset_id = 0;
    }
-
-   inline uint32_t get_output_id(uint32_t output)
-   {
-      return ((output) - OUTPUT_BASE);
-   }
-
-   inline constexpr uint32_t get_output_base_id()
-   {
-      return (OUTPUT_BASE_ID);
-   }
 }
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h b/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h
index 99e4a2c892f..33a3c7f0a58 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h
@@ -335,9 +335,4 @@ namespace ckernel::unpacker
       // Clear context ID
       //reset_config_context();
     }
-
-   inline uint32_t get_operand_id(uint32_t operand)
-   {
-      return operand;
-   }
 }
diff --git a/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc b/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc
deleted file mode 100644
index 238301e0566..00000000000
--- a/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc
+++ /dev/null
@@ -1,229 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include "ckernel_template.h"
-#include "debug/fw_debug.h"
-
-namespace ckernel
-{
-extern volatile uint *cfg_regs;
-
-ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op)
-    : m_outer_loop_len(outer_loop_len)
-    , m_inner_loop_len(inner_loop_len)
-    , m_loop_op0(loop_op)
-    , m_loop_op1(TT_OP_NOP)
-    , m_end_op0(TT_OP_NOP)
-    , m_end_op1(TT_OP_NOP)
-    , m_start_op0(TT_OP_NOP)
-{
-    m_loop0_last_instr = loop_op;
-    m_loop1_last_instr = loop_op;
-}
-
-ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1)
-    : m_outer_loop_len(outer_loop_len)
-    , m_inner_loop_len(inner_loop_len)
-    , m_loop_op0(loop_op0)
-    , m_loop_op1(loop_op1)
-    , m_end_op0(TT_OP_NOP)
-    , m_end_op1(TT_OP_NOP)
-    , m_start_op0(TT_OP_NOP)
-{
-    m_loop0_last_instr = loop_op1;
-    m_loop1_last_instr = loop_op1;
-}
-
-void ckernel_template::set_loop_op0(uint loop_op)
-{
-    m_loop_op0 = loop_op;
-}
-
-void ckernel_template::set_loop_op1(uint loop_op)
-{
-    m_loop_op1 = loop_op;
-}
-
-void ckernel_template::set_end_ops(uint end_op0, uint end_op1)
-{
-    m_end_op0 = end_op0;
-    m_end_op1 = end_op1;
-}
-
-void ckernel_template::set_end_op(uint end_op0)
-{
-    set_end_ops(end_op0, TT_OP_NOP);
-}
-
-void ckernel_template::set_start_op(uint start_op0)
-{
-    m_start_op0 = start_op0;
-}
-
-void ckernel_template::set_last_inner_loop_instr(uint op)
-{
-    m_loop1_last_instr = op;
-}
-
-void ckernel_template::set_last_outer_loop_instr(uint op)
-{
-    m_loop0_last_instr = op;
-}
-
-void ckernel_template::program_and_run(volatile uint *instrn_buffer)
-{
-    program(instrn_buffer);
-    run(instrn_buffer);
-}
-
-void ckernel_template::run(volatile uint *instrn_buffer)
-{
-    TTI_MOP(1, 0, 0); // run the double-loop template
-}
-
-void ckernel_template::program(volatile uint *instrn_buffer)
-{
-    volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
-
-    mop_sync(); // wait until previous mops have completed
-
-    mop_cfg[0] = m_outer_loop_len;
-    mop_cfg[1] = m_inner_loop_len;
-    mop_cfg[2] = m_start_op0;
-    mop_cfg[3] = m_end_op0;
-    mop_cfg[4] = m_end_op1;
-    mop_cfg[5] = m_loop_op0;
-    mop_cfg[6] = m_loop_op1;
-    mop_cfg[7] = m_loop0_last_instr;
-    mop_cfg[8] = m_loop1_last_instr;
-}
-
-void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
-{
-    program(instrn_buffer);
-    run(instrn_buffer, count, zmask);
-}
-
-void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
-{
-    FWASSERT("Unpack template only supports loops up to 128", count <= 128);
-    TT_MOP_CFG(zmask >> 16);              // Set the top 16 bits of zmask - we could skip this for count <= 16
-    TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template
-}
-
-// Version without zmask, should be slightly faster by eliminating one instruction.
-void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count)
-{
-    FWASSERT("Unpack template only supports loops up to 128", count <= 128);
-    TT_MOP(0, count - 1, 0); // Run the template
-}
-
-void ckernel_unpack_template::program(volatile uint *instrn_buffer) const
-{
-    volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
-
-    mop_sync(); // wait until previous mops have completed
-
-    mop_cfg[1] = m_unpackB | (m_unpack_halo << 1);
-    mop_cfg[2] = m_B_instr;
-    mop_cfg[3] = m_A0_instr;
-    mop_cfg[4] = m_A1_instr;
-    mop_cfg[5] = m_A2_instr;
-    mop_cfg[6] = m_A3_instr;
-    mop_cfg[7] = m_skipA_instr;
-    mop_cfg[8] = m_skipB_instr;
-}
-
-ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr)
-{
-    return ckernel_unpack_template(false, // src B
-        false,                            // halo
-        A_instr, 0, 0, 0, skipA_instr, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr)
-{
-    return ckernel_unpack_template(false, // src B
-        false,                            // halo
-        B_instr, 0, 0, 0, skipB_instr, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr)
-{
-    return ckernel_unpack_template(false, // src B
-        true,                             // halo
-        neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(false, // src B
-        true,                             // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(false, // src B
-        true,                             // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(true, // src B
-        true,                            // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B);
-}
-
-ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask)
-{
-    // Figure out which unpack is last
-    const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
-
-    return ckernel_unpack_template(true, // src B
-        true,                            // halo
-        ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
-        ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
-        ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
-        ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B);
-}
-
-ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr,
-
-    uint B_instr, uint skipB_instr)
-{
-    return ckernel_unpack_template(true, // src B
-        false,                           // halo
-        A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr);
-}
-
-ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){
-    return ckernel_unpack_template::lA(instr0, skip0);
-}
-
-ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){
-    // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA.
-    return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1);
-}
-
-} // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h
index 815b76c9d93..2c28acf94e0 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h
@@ -90,67 +90,6 @@ enum ReluType {
   MAX_THRESHOLD_RELU
 };
 
-enum SfpuType
-{
-  tanh,
-  hardtanh,
-  gelu,
-  exponential,
-  exp_with_base,
-  sigmoid,
-  sigmoid_appx,
-  reciprocal,
-  sqrt,
-  rsqrt,
-  lrelu,
-  power,
-  square,
-  tanh_derivative,
-  log,
-  log_with_base,
-  equal_zero,
-  not_equal_zero,
-  less_than_zero,
-  greater_than_equal_zero,
-  less_than_equal_zero,
-  greater_than_zero,
-  clamp,
-  gelu_derivative,
-  dropout,
-  abs,
-  sign,
-  max,
-  min,
-  sine,
-  cosine,
-  tan,
-  relu_min,
-  relu_max,
-  elu,
-  exp2,
-  heaviside,
-  expm1,
-  signbit,
-  asin,
-  acos,
-  atan,
-  erf,
-  erfc,
-  isfinite,
-  isinf,
-  isposinf,
-  isneginf,
-  isnan,
-  logical_not_unary,
-  erfinv,
-  i0,
-  silu,
-  mask,
-  negative,
-  unused
-};
-
-
 enum SfpiTestType
 {
   logical_not,
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h
deleted file mode 100644
index 822699707d1..00000000000
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-#include <type_traits>
-
-#include "ckernel_globals.h"
-#include "ckernel_include.h"
-#include "ckernel_template.h"
-#include "cmath_common.h"
-#include "llk_format_conversions.h"
-#include "llk_math_common.h"
-#include "llk_param_structs.h"
-
-using namespace ckernel;
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h
index a6f0e32f133..97c0b3d4909 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h
@@ -149,7 +149,7 @@ inline void llk_pack_init() {
 template <bool out_of_order_output = false, DstSync Dst = SyncFull, bool untilize = false>
 inline void llk_matmul_pack(std::uint32_t start_tile_index, std::uint32_t output, uint32_t ntiles, std::uint32_t output_tile_index = 0) {
     std::uint8_t output_id = get_output_id(output);
-    constexpr std::uint8_t OUTPUT_BASE_ID = (std::uint8_t) get_output_base_id();
+    const std::uint8_t OUTPUT_BASE_ID = (std::uint8_t) get_output_base_id();
 
     static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!");
 
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h
index 3f1e76ea0d1..03848f843e5 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h
@@ -6,6 +6,9 @@
 #include "llk_io_unpack.h"
 #include "llk_param_structs.h"
 
+//TODO: Remove with GS uplift
+#include "llk_operands.h"
+
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h
index ee9127628ce..307e94b25ef 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h
@@ -6,6 +6,9 @@
 #include "llk_io_unpack.h"
 #include "llk_param_structs.h"
 
+//TODO: Remove with GS uplift
+#include "llk_operands.h"
+
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h
index b438a9715c5..78a28594917 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h
@@ -7,6 +7,9 @@
 #include "llk_io_unpack.h"
 #include "llk_param_structs.h"
 
+//TODO: Remove with GS uplift
+#include "llk_operands.h"
+
 #include "ckernel.h"
 #include "ckernel_defs.h"
 #include "ckernel_template.h"
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h
index 3269aab1192..2c46633312f 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h
@@ -11,6 +11,9 @@
 #include "llk_param_structs.h"
 #include "llk_io_unpack.h"
 
+//TODO: Remove with GS uplift
+#include "llk_operands.h"
+
 #ifdef PERF_DUMP
 #include "ckernel_perf_api.h"
 #endif
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h b/tt_metal/hw/ckernels/grayskull/metal/common/chlkc_list.h
similarity index 91%
rename from tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h
rename to tt_metal/hw/ckernels/grayskull/metal/common/chlkc_list.h
index 3d02d79f908..0a30e5f179b 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/common/chlkc_list.h
@@ -14,20 +14,18 @@ using namespace ckernel;
 
 
 #ifdef UCK_CHLKC_MATH
-// #include "chlkc_math_llk_args.h"
+#include "chlkc_unpack_data_format.h"
 #include "chlkc_math_fidelity.h"
 #include "chlkc_math_approx_mode.h"
 #include "chlkc_math.cpp"
 #endif
 
 #ifdef UCK_CHLKC_PACK
-// #include "chlkc_pack_llk_args.h"
 #include "chlkc_pack_data_format.h"
 #include "chlkc_pack.cpp"
 #endif
 
 #ifdef UCK_CHLKC_UNPACK
-// #include "chlkc_unpack_llk_args.h"
 #include "chlkc_unpack_data_format.h"
 #include "chlkc_unpack.cpp"
 #endif
diff --git a/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h
new file mode 100644
index 00000000000..7800a9934d7
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+//TODO: This file should be deleted after fixing redefinition errors,
+// functions should be moved to ckernel_globals.h
+#pragma once
+
+#include <cstdint>
+#include "ckernel_structs.h"
+#include "risc_attribs.h"
+#include "tensix_functions.h"
+#include "hostdevcommon/common_runtime_address_map.h"
+
+extern uint32_t __ldm_bss_start[];
+extern uint32_t __ldm_bss_end[];
+extern uint32_t __ldm_data_start[];
+extern uint32_t __ldm_data_end[];
+extern void (* __init_array_start[])();
+extern void (* __init_array_end[])();
+extern uint32_t __firmware_start[];
+
+extern void kernel_init();
+extern void kernel_launch();
+
+inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
+    // Copies len 32-bit words from l1_addr to local_mem_addr. Six loads are issued before any store to cover the L1 load latency of 6 cycles for the bulk of the copy.
+    int32_t n = 0;  // index of the next word to copy
+    while (n < len - 5) {  // true only while at least six words remain
+        uint32_t v0 = l1_addr[n + 0];
+        uint32_t v1 = l1_addr[n + 1];
+        uint32_t v2 = l1_addr[n + 2];
+        uint32_t v3 = l1_addr[n + 3];
+        uint32_t v4 = l1_addr[n + 4];
+        uint32_t v5 = l1_addr[n + 5];
+        local_mem_addr[n + 0] = v0;
+        local_mem_addr[n + 1] = v1;
+        local_mem_addr[n + 2] = v2;
+        local_mem_addr[n + 3] = v3;
+        local_mem_addr[n + 4] = v4;
+        local_mem_addr[n + 5] = v5;
+        n += 6;
+    }
+    // Tail: copy the remaining (fewer than six) words one at a time. Could optimize this further (eg, loop of 2 or 4), probably not worth it
+    while (n < len) {
+        local_mem_addr[n] = l1_addr[n];
+        n++;
+    }
+}
+
+inline void firmware_kernel_common_init(void *init_local_l1_base) {
+    // Start-up sequence: wzerorange over the bss range, copy the __ldm_data words in from init_local_l1_base, then call every __init_array entry.
+    // Handle stuff typically done in crt0 in asm.  Easier to do in C
+    wzerorange(__ldm_bss_start, __ldm_bss_end);  // presumably zero-fills [start, end) - confirm against tensix_functions.h
+
+    int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;  // byte size of the data range divided by 4, i.e. a count of 32-bit words
+    uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE;  // byte offset of the data range from MEM_LOCAL_BASE; the source is read at the same offset from init_local_l1_base
+    l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words);
+    // Call each function pointer in [__init_array_start, __init_array_end) in order.
+    for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
+        (**fptr)();
+    }
+}
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h
new file mode 100644
index 00000000000..317c14707ca
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h
@@ -0,0 +1,86 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_eltwise_binary.h"
+
+// /*************************************************************************
+//  * LLK ELTWISE BINARY
+//  *************************************************************************/
+
+// // Version with no operand
+// template <
+//     EltwiseBinaryType eltwise_binary_type,
+//     BroadcastType src_b_bcast_type,
+//     int NUM_FIDELITY_PHASES = 0,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
+// inline void llk_math_eltwise_binary_init(const std::uint32_t transpose = 0, const std::uint32_t acc_to_dest = 0) {
+//     const std::uint32_t num_faces = 4;
+
+//     _llk_math_eltwise_binary_init_<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(
+//         num_faces, transpose, acc_to_dest);
+// }
+
+// // Version with operands
+// template <
+//     EltwiseBinaryType eltwise_binary_type,
+//     BroadcastType src_b_bcast_type,
+//     int NUM_FIDELITY_PHASES = 0,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
+// inline void llk_math_eltwise_binary_init_with_operands(
+//     const std::uint32_t operand_A,
+//     const std::uint32_t operand_B,
+//     const std::uint32_t transpose = 0,
+//     const std::uint32_t acc_to_dest = 0) {
+//     const std::uint32_t operand_id =
+//         get_operand_id(operand_A);  // operand_id is used to extract tile dim data which is the same for both operands
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+
+//     _llk_math_eltwise_binary_init_<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(
+//         num_faces, transpose, acc_to_dest);
+// }
+
+// template <
+//     EltwiseBinaryType eltwise_binary_type,
+//     BroadcastType src_b_bcast_type,
+//     DstSync Dst = DstSync::SyncFull,
+//     int NUM_FIDELITY_PHASES = 0,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+//     bool is_fp32_dest_acc_en = false>
+// inline void llk_math_eltwise_binary(uint dst_index, const bool clear_fp32_dst_acc = true) {
+//     const std::uint32_t num_faces = 4;
+
+//     _llk_math_eltwise_binary_<
+//         eltwise_binary_type,
+//         src_b_bcast_type,
+//         Dst,
+//         NUM_FIDELITY_PHASES,
+//         binary_reuse_dest,
+//         is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc);
+// }
+
+// template <
+//     EltwiseBinaryType eltwise_binary_type,
+//     BroadcastType src_b_bcast_type,
+//     DstSync Dst = DstSync::SyncFull,
+//     int NUM_FIDELITY_PHASES = 0,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+//     bool is_fp32_dest_acc_en = false>
+// inline void llk_math_eltwise_binary(
+//     const std::uint32_t operand_A,
+//     const std::uint32_t operand_B,
+//     uint dst_index,
+//     const bool clear_fp32_dst_acc = true) {
+//     const std::uint32_t operand_id = get_operand_id(operand_A);  // both operands must have same number of faces
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+
+//     _llk_math_eltwise_binary_<
+//         eltwise_binary_type,
+//         src_b_bcast_type,
+//         Dst,
+//         NUM_FIDELITY_PHASES,
+//         binary_reuse_dest,
+//         is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h
new file mode 100644
index 00000000000..21c3e8ae428
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h
@@ -0,0 +1,70 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_eltwise_binary_sfpu.h"
+
+// /*************************************************************************
+//  * LLK ELTWISE BINARY SFPU
+//  *************************************************************************/
+
+// template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
+// inline void llk_math_eltwise_binary_sfpu(
+//     const uint operand,
+//     uint dst_index_a,
+//     uint dst_index_b,
+//     int vector_mode = (int)Dim::RC,
+//     uint param0 = 0,
+//     uint param1 = 0,
+//     uint param2 = 0,
+//     uint param3 = 0,
+//     uint param4 = 0,
+//     uint param5 = 0) {
+//     const std::uint32_t operand_id = get_operand_id(0);
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+
+//     _llk_math_eltwise_binary_sfpu_<sfpu_op, APPROXIMATE, Dst>(
+//         face_r_dim, num_faces, dst_index_a, dst_index_b, vector_mode, param0, param1, param2, param3, param4, param5);
+// }
+
+// template <SfpuType sfpu_op, bool APPROXIMATE>
+// inline void llk_math_eltwise_binary_sfpu_init(
+//     uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) {
+//     _llk_math_eltwise_binary_sfpu_init_<sfpu_op, APPROXIMATE>(param0, param1, param2, param3, param4, param5);
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_binary_sfpu_quant_int32(
+//     uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+//     llk_math_eltwise_binary_sfpu<SfpuType::quant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_binary_sfpu_quant_int32_init(const uint zero_point) {
+//     llk_math_eltwise_binary_sfpu_init<SfpuType::quant_int32, APPROXIMATE>(zero_point);
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_binary_sfpu_requant_int32(
+//     uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+//     llk_math_eltwise_binary_sfpu<SfpuType::requant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_binary_sfpu_requant_int32_init(const uint zero_point) {
+//     llk_math_eltwise_binary_sfpu_init<SfpuType::requant_int32, APPROXIMATE>(zero_point);
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_binary_sfpu_dequant_int32(
+//     uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
+//     llk_math_eltwise_binary_sfpu<SfpuType::dequant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_binary_sfpu_dequant_int32_init(const uint zero_point) {
+//     llk_math_eltwise_binary_sfpu_init<SfpuType::dequant_int32, APPROXIMATE>(zero_point);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h
new file mode 100644
index 00000000000..3da220f0cba
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h
@@ -0,0 +1,108 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "ckernel_globals.h"
+#include "ckernel_template.h"
+#include "cmath_common.h"
+#include "llk_defs.h"
+#include "llk_io.h"
+#include "llk_math_common.h"
+#include "llk_operands.h"
+#include "llk_param_structs.h"
+
+// // Need to revisit why we even need this
+// #define EPS 1.19209e-07  // std::numeric_limits::epsilon() for FP32
+
+// /*************************************************************************
+//  * LLK MATH COMMON
+//  *************************************************************************/
+
+// template <DstSync Dst>
+// inline void llk_math_wait_for_dest_available() {
+//     _llk_math_wait_for_dest_available_<Dst>();
+// }
+
+// template <DstSync Dst = SyncFull, bool is_fp32_dest_acc_en = false>
+// inline void llk_math_dest_section_done() {
+//     _llk_math_dest_section_done_<Dst, is_fp32_dest_acc_en>();
+// }
+
+// template <DstSync Dst, bool is_fp32_dest_acc_en = false>
+// inline void llk_math_pack_sync_init() {
+//     _llk_math_pack_sync_init_<Dst, is_fp32_dest_acc_en>();
+// }
+
+// template <bool mail2math = true, bool mail2pack = true>
+// inline void llk_math_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) {
+//     _llk_math_get_tile_<mail2math, mail2pack>(tile_index, p_tile);
+// }
+
+// template <bool mail2math = true, bool mail2pack = true>
+// inline void llk_math_release_tile(std::uint32_t operand) {
+//     _llk_math_release_tile_<mail2math, mail2pack>();
+// }
+
+// inline void llk_math_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { _llk_math_debug_dump_(data, byte_size); }
+
+// inline void llk_math_debug_dump_seek(std::uint8_t offset) { _llk_math_debug_dump_seek_(offset); }
+
+// inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_new_operand) {
+//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
+//     _llk_math_reconfig_data_format_srca_(unpack_dst_format[new_srca_operand_id]);
+// }
+
+// inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) {
+//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+//     _llk_math_reconfig_data_format_srcb_(unpack_dst_format[new_srcb_operand_id]);
+// }
+
+// inline void llk_math_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) {
+//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
+//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+
+//     _llk_math_reconfig_data_format_(unpack_dst_format[new_srca_operand_id], unpack_dst_format[new_srcb_operand_id]);
+// }
+
+// inline void llk_math_reconfig_data_format(
+//     const std::uint32_t srca_old_operand,
+//     const std::uint32_t srca_new_operand,
+//     const std::uint32_t srcb_old_operand,
+//     const std::uint32_t srcb_new_operand) {
+//     std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
+//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
+//     std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
+//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+
+//     if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id]) &&
+//         (unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) {
+//         llk_math_reconfig_data_format(srca_new_operand, srcb_new_operand);
+//     } else if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) {
+//         llk_math_reconfig_data_format_srca(srca_new_operand);
+//     } else if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) {
+//         llk_math_reconfig_data_format_srcb(srcb_new_operand);
+//     }
+// }
+
+// inline void llk_math_reconfig_data_format_srca(
+//     const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) {
+//     std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
+//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
+
+//     if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) {
+//         llk_math_reconfig_data_format_srca(srca_new_operand);
+//     }
+// }
+
+// inline void llk_math_reconfig_data_format_srcb(
+//     const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
+//     std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
+//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+
+//     if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) {
+//         llk_math_reconfig_data_format_srcb(srcb_new_operand);
+//     }
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h
new file mode 100644
index 00000000000..a12bcca1ef4
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h
@@ -0,0 +1,68 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_matmul.h"
+
+// /*************************************************************************
+//  * LLK MATMUL
+//  *************************************************************************/
+
+// template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout = DstTileFaceLayout::ColMajor>
+// inline void llk_math_matmul_init(
+//     const std::uint32_t operandA,
+//     const std::uint32_t operandB,
+//     const std::uint32_t transpose = 0,
+//     const std::uint32_t ct_dim = 1,
+//     const std::uint32_t rt_dim = 1,
+//     const std::uint32_t kt_dim = 1) {
+//     const std::uint32_t in0_id = get_operand_id(operandA);
+//     const std::uint32_t in1_id = get_operand_id(operandB);
+
+//     const bool partial_face = get_operand_partial_face(in0_id);
+
+//     const std::uint32_t in0_tile_r_dim = get_operand_tile_r_dim(in0_id);
+//     const std::uint32_t in0_tile_c_dim = get_operand_tile_c_dim(in0_id);
+//     const std::uint32_t in1_tile_r_dim = get_operand_tile_r_dim(in1_id);
+//     const std::uint32_t in1_tile_c_dim = get_operand_tile_c_dim(in1_id);
+
+// #ifdef ARCH_GRAYSKULL
+//     _llk_math_matmul_init_<NUM_FIDELITY_PHASES, FaceLayout>(
+//         in0_tile_r_dim,
+//         in0_tile_c_dim,
+//         in1_tile_r_dim,
+//         in1_tile_c_dim,
+//         partial_face,
+//         transpose,
+//         ct_dim,
+//         rt_dim,
+//         kt_dim);
+// #else
+//     _llk_math_matmul_init_<NUM_FIDELITY_PHASES, DstTileFaceLayout::RowMajor>(
+//         in0_tile_r_dim,
+//         in0_tile_c_dim,
+//         in1_tile_r_dim,
+//         in1_tile_c_dim,
+//         partial_face,
+//         transpose,
+//         ct_dim,
+//         rt_dim,
+//         kt_dim);
+// #endif
+// }
+
+// template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout = DstTileFaceLayout::ColMajor>
+// inline void llk_math_matmul(
+//     uint dst_index,
+//     const bool transpose = false,
+//     const std::uint32_t ct_dim = 1,
+//     const std::uint32_t rt_dim = 1,
+//     const std::uint32_t kt_dim = 1) {
+// #ifdef ARCH_GRAYSKULL
+//     _llk_math_matmul_<NUM_FIDELITY_PHASES, FaceLayout>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
+// #else
+//     _llk_math_matmul_<NUM_FIDELITY_PHASES, DstTileFaceLayout::RowMajor>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
+// #endif
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h
new file mode 100644
index 00000000000..c5f11d005f2
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h
@@ -0,0 +1,28 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "llk_math_reduce.h"
+
+// /*************************************************************************
+//  * LLK REDUCE
+//  *************************************************************************/
+
+// template <
+//     PoolType type,
+//     ReduceDim dim,
+//     int num_fidelity_phases = 0,
+//     bool is_fp32_dest_acc_en = false,
+//     bool is_int_fpu_en = false>
+// inline void llk_math_reduce(const uint dst_index) {
+//     _llk_math_reduce_<type, dim, num_fidelity_phases, is_fp32_dest_acc_en, is_int_fpu_en>(dst_index);
+// }
+
+// template <PoolType type, ReduceDim dim, int num_fidelity_phases = 0>
+// inline void llk_math_reduce_init(
+//     const std::uint32_t within_face_16x16_transpose =
+//         0) {  // within_face_16x16_transpose used for unpack, ignored by math
+//     _llk_math_reduce_init_<type, dim, num_fidelity_phases>(within_face_16x16_transpose);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h
new file mode 100644
index 00000000000..ca2a5d39e40
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h
@@ -0,0 +1,36 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "llk_math_common_api.h"
+#include "llk_math_eltwise_unary_datacopy.h"
+
+// /*************************************************************************
+//  * LLK ELTWISE UNARY DATACOPY
+//  *************************************************************************/
+
+// template <
+//     DataCopyType type,
+//     BroadcastType src_b_bcast_type = BroadcastType::NONE,
+//     DstSync Dst = DstSync::SyncFull,
+//     bool is_fp32_dest_acc_en = false,
+//     bool unpack_to_dest = false>
+// inline void llk_math_eltwise_unary_datacopy(uint dst_index, uint operand = 0) {
+//     const std::uint32_t operand_id = get_operand_id(0);
+//     _llk_math_eltwise_unary_datacopy_<type, src_b_bcast_type, Dst, is_fp32_dest_acc_en, unpack_to_dest>(
+//         dst_index, unpack_src_format[operand_id], unpack_dst_format[operand_id]);
+// }
+
+// template <DataCopyType type, BroadcastType src_b_bcast_type = BroadcastType::NONE>
+// // within_face_16x16_transpose is used by unpacker, math does not transpose
+// inline void llk_math_eltwise_unary_datacopy_init(
+//     const std::uint32_t transpose_of_faces = 0 /*unused*/,
+//     const std::uint32_t within_face_16x16_transpose = 0 /* unused */,
+//     const std::uint32_t operand = 0) {
+//     const std::uint32_t operand_id = get_operand_id(0);
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     _llk_math_eltwise_unary_datacopy_init_<type, src_b_bcast_type>(
+//         transpose_of_faces, within_face_16x16_transpose, num_faces);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h
new file mode 100644
index 00000000000..53b9d1afe8b
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h
@@ -0,0 +1,293 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_math_common_api.h"
+#include "metal_ckernel_sfpu.h"
+#include "llk_math_eltwise_unary_sfpu_init.h"
+
+// namespace ckernel {
+
+// /*************************************************************************
+// * LLK ELTWISE UNARY SFPU
+// *************************************************************************/
+
+// // New LLK SFPU APIs
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::rsqrt, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_rsqrt_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::rsqrt, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::log, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_log_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::log, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_log_with_base(uint dst_index,uint base_scale) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::log_with_base, APPROXIMATE, dst_sync>(dst_index,base_scale);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_log_with_base_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::log_with_base, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::tanh, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_signbit(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::signbit, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_signbit_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::signbit, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_tanh_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::tanh, APPROXIMATE>();
+// }
+
+// //sign
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_sign(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::sign, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_sign_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::sign, APPROXIMATE>();
+// }
+// template <DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_dropout(uint dst_index, int vector_mode, int integer_dropout, int scale_factor) {
+//     constexpr bool dont_care = false;
+//     llk_math_eltwise_unary_sfpu<SfpuType::dropout, dont_care, dst_sync>(dst_index, vector_mode, integer_dropout, scale_factor);
+// }
+
+// inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) {
+//     constexpr bool dont_care = false;
+//     constexpr uint dont_care_param = 0;
+
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::dropout, dont_care>(dont_care_param, dont_care_param, seed);
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::sigmoid, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_sigmoid_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::sigmoid, APPROXIMATE>();
+// }
+
+// //EQZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_eqz(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::equal_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_eqz_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::equal_zero, APPROXIMATE>();
+// }
+
+// //NEZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_nez(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::not_equal_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_nez_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::not_equal_zero, APPROXIMATE>();
+// }
+
+// //LTZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_ltz(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::less_than_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_ltz_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::less_than_zero, APPROXIMATE>();
+// }
+
+// //GTZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_gtz(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::greater_than_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_gtz_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::greater_than_zero, APPROXIMATE>();
+// }
+
+// //LEZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_lez(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::less_than_equal_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_lez_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::less_than_equal_zero, APPROXIMATE>();
+// }
+
+// //GEZ
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_gez(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::greater_than_equal_zero, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_gez_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::greater_than_equal_zero, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::max, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_max_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::max, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::square, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_square_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::square, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::power, APPROXIMATE, dst_sync>(dst_index, vector_mode, pow);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_power_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::power, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::abs, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_abs_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::abs, APPROXIMATE>();
+// }
+
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::cast_fp32_to_fp16a, APPROXIMATE, dst_sync>(dst_index, vector_mode);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::cast_fp32_to_fp16a, APPROXIMATE>();
+// }
+
+// //EXP2
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_exp2(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::exp2, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_exp2_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::exp2, APPROXIMATE>();
+// }
+
+// //heaviside
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::heaviside, APPROXIMATE, dst_sync>(dst_index,vector_mode,param0);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_heaviside_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::heaviside, APPROXIMATE>();
+// }
+
+// //EXPM1
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_expm1(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::expm1, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_expm1_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::expm1, APPROXIMATE>();
+// }
+
+// //Asin
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_asin(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::asin, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_asin_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::asin, APPROXIMATE>();
+// }
+
+// //Atan
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_atan(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::atan, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_atan_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::atan, APPROXIMATE>();
+// }
+
+// //Acos
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_acos(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::acos, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_acos_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::acos, APPROXIMATE>();
+// }
+
+// //silu
+// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
+// inline void llk_math_eltwise_unary_sfpu_silu(uint dst_index) {
+//     llk_math_eltwise_unary_sfpu<SfpuType::silu, APPROXIMATE, dst_sync>(dst_index);
+// }
+
+// template <bool APPROXIMATE>
+// inline void llk_math_eltwise_unary_sfpu_silu_init() {
+//     llk_math_eltwise_unary_sfpu_init<SfpuType::silu, APPROXIMATE>();
+// }
+
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h
new file mode 100644
index 00000000000..ca7e298a7c2
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+extern uint32_t op_info_offset;
+
+inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) {
+    // Copies op_info_num_items 32-bit words from OP_INFO_BASE_ADDR + op_info_offset into op_info_struct, then advances op_info_offset.
+    uint32_t* op_info_ptr = reinterpret_cast<uint32_t*>(OP_INFO_BASE_ADDR + op_info_offset);
+    static constexpr uint32_t op_info_num_items = 7;
+    static constexpr uint32_t op_info_num_bytes = op_info_num_items * sizeof(uint32_t);  // byte stride of one record (28)
+    volatile tt_l1_ptr uint32_t* op_info_struct_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(&op_info_struct);
+    for (uint32_t i = 0; i < op_info_num_items; i++) {
+        op_info_struct_ptr[i] = op_info_ptr[i];
+    }
+    op_info_offset += op_info_num_bytes;
+
+    if (op_info_offset >= OP_INFO_SIZE) {
+        op_info_offset = 0;  // Wrap to the first record; >= also resets an offset that stepped past OP_INFO_SIZE
+    }
+}
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h
new file mode 100644
index 00000000000..37ee8a0fe56
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h
@@ -0,0 +1,308 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "ckernel_template.h"
+#include "cpack_common.h"
+#include "ckernel_globals.h"
+#include "circular_buffer.h"
+
+#include "llk_io.h"
+#include "llk_defs.h"
+#include "llk_outputs.h"
+#include "llk_param_structs.h"
+#include "llk_pack.h"
+#include "llk_pack_common.h"
+
+/*************************************************************************
+* LLK PACK
+*************************************************************************/
+
+// template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
+// inline void llk_pack_mop_config(const uint32_t output) {
+
+//     const std::uint32_t output_id = get_output_id(output);
+//     const std::uint32_t num_faces = get_output_num_faces(output_id);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const bool partial_face = get_output_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     _llk_pack_mop_config_<untilize, zero_output, FaceLayout, false>(
+//         pack_dst_format[output_id],
+//         face_r_dim,
+//         num_faces,
+//         partial_face,
+//         narrow_tile
+//     );
+// }
+
+// template <bool untilize = false, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack_hw_configure(const llk_pack_params_t *pack_params) {
+
+//     const std::uint32_t output_id = get_output_id(pack_params->pack_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const std::uint32_t num_faces = get_output_num_faces(output_id);
+//     const bool partial_face = get_output_partial_face(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size;
+
+//     _llk_pack_hw_configure_<untilize, is_fp32_dest_acc_en>(
+//         pack_src_format[output_id],
+//         pack_dst_format[output_id],
+//         tile_size,
+//         face_r_dim,
+//         num_faces,
+//         partial_face,
+//         narrow_tile,
+//         pack_params->relu_config.val
+//     );
+// }
+
+// template <bool untilize = false, bool is_fp32_dest_acc_en = false, ReluType relu_type = ReluType::NO_RELU, std::uint32_t relu_threshold = 0>
+// inline void llk_pack_hw_configure_disaggregated(std::uint32_t pack_output) {
+//     llk_pack_params_t llk_pack_params = {
+//         .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold,}}};
+//     llk_pack_hw_configure<untilize, is_fp32_dest_acc_en>(&llk_pack_params);
+// }
+
+// template <bool untilize = false, PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) {
+//     const std::uint32_t output_id = get_output_id(pack_params->pack_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const std::uint32_t num_faces = get_output_num_faces(output_id);
+//     const bool partial_face = get_output_partial_face(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size;
+
+//     _llk_pack_reduce_hw_configure_<untilize, type, dim, is_fp32_dest_acc_en>(
+//         pack_src_format[output_id],
+//         pack_dst_format[output_id],
+//         tile_size,
+//         face_r_dim,
+//         num_faces,
+//         partial_face,
+//         narrow_tile,
+//         pack_params->relu_config.val
+//     );
+// }
+
+// template <bool untilize = false, PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, ReluType relu_type = ReluType::NO_RELU, std::uint32_t relu_threshold = 0>
+// inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output) {
+//     llk_pack_params_t llk_pack_params = {
+//         .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}};
+//     llk_pack_reduce_hw_configure<untilize, type, dim, is_fp32_dest_acc_en>(&llk_pack_params);
+// }
+
+// template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
+// inline void llk_pack_init(const std::uint32_t pack_output = 16) {
+
+//     const std::uint32_t output_id = get_output_id(pack_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const std::uint32_t num_faces = get_output_num_faces(output_id);
+//     const bool partial_face = get_output_partial_face(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     _llk_pack_init_<untilize, zero_output, FaceLayout, false>(
+//         pack_dst_format[output_id],
+//         face_r_dim,
+//         num_faces,
+//         partial_face,
+//         narrow_tile
+//     );
+// }
+
+// template <bool out_of_order_output, bool untilize>
+// inline std::uint32_t get_output_tile_address(std::uint8_t output_id, std::uint32_t output_tile_index) {
+
+//     std::uint32_t pack_tile_addr;
+//     if constexpr (out_of_order_output) {
+//         pack_tile_addr = cb_interface[output_id].fifo_wr_ptr +
+//                         (std::uint32_t)(cb_interface[output_id].fifo_page_size)*output_tile_index - 1;
+//     } else {
+//         if constexpr (untilize) {
+//             // FIXME: Need to support pack-untilize?
+//             // std::uint16_t out_tile_index = (cb_interface[output_id].ublock_tile_cnt/cb_interface[output_id].ublock_ct)*cb_interface[output_id].row_tile_dim +
+//             //                                 cb_interface[output_id].ublock_tile_cnt%cb_interface[output_id].ublock_ct; //FIXME: optimize perf
+//             // pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1;
+//             // pack_tile_addr += out_tile_index*(std::uint32_t)(cb_interface[output_id].fifo_page_size);
+
+//             // cb_interface[output_id].ublock_tile_cnt++;
+
+//             // if (cb_interface[output_id].ublock_tile_cnt == cb_interface[output_id].ublock_tile_dim) {
+//             //    cb_interface[output_id].ublock_tile_cnt=0;
+//             //    cb_interface[output_id].fifo_wr_tile_ptr += (std::uint32_t)(cb_interface[output_id].fifo_page_size)*cb_interface[output_id].ublock_ct;
+//             // }
+//         } else {
+//             pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1;
+//             cb_interface[output_id].fifo_wr_tile_ptr += cb_interface[output_id].fifo_page_size;
+//         }
+//     }
+//     return pack_tile_addr;
+// }
+
+// template <bool out_of_order_output = false, DstSync Dst = SyncFull, bool untilize = false, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0) {
+//     std::uint8_t output_id = get_output_id(output);
+
+//     static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!");
+
+//     std::uint32_t pack_tile_addr = get_output_tile_address<out_of_order_output, untilize>(output_id, output_tile_index);
+
+//     _llk_pack_<Dst, untilize, is_fp32_dest_acc_en>(
+//         tile_index,
+//         pack_tile_addr
+//     );
+// }
+
+// /*************************************************************************
+// * LLK PACK COMMON
+// *************************************************************************/
+
+
+// inline void llk_packer_wait_for_math_done() {
+//     _llk_packer_wait_for_math_done_();
+// }
+
+// template <uint WaitRes = p_stall::NONE>
+// inline void llk_packer_set_math_semaphore() {
+//     _llk_packer_set_math_semaphore_<WaitRes>();
+// }
+
+// template <DstSync Dst, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack_dest_section_done() {
+//     _llk_pack_dest_section_done_<Dst, is_fp32_dest_acc_en>();
+// }
+
+// template <DstSync Dst, DstTileFaceLayout FaceLayout, bool untilize = false>
+// inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 16) {
+//     const std::uint32_t output_id = get_output_id(pack_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     _llk_init_packer_dest_offset_registers_<Dst, FaceLayout, untilize>(
+//         face_r_dim,
+//         narrow_tile
+//     );
+// }
+
+// template <DstSync Dst, DstTileFaceLayout FaceLayout = RowMajor, bool untilize = false, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack_dest_init(const std::uint32_t pack_output = 16) {
+
+//     const std::uint32_t output_id = get_output_id(pack_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     _llk_pack_dest_init_<Dst, FaceLayout, untilize, is_fp32_dest_acc_en>(
+//         face_r_dim,
+//         narrow_tile
+//     );
+// }
+
+// template <bool mail2math=true, bool mail2pack=true>
+// inline void llk_pack_get_tile(std::uint32_t output, std::uint32_t tile_index, std::uint32_t *p_tile) {
+//     _llk_pack_get_tile_<mail2math, mail2pack>(tile_index, p_tile);
+// }
+
+// template <bool mail2math=true, bool mail2pack=true>
+// inline void llk_pack_release_tile(std::uint32_t output) {
+//     _llk_pack_release_tile_<mail2math, mail2pack>();
+// }
+
+// inline void llk_pack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {
+//     _llk_pack_debug_dump_(data, byte_size);
+// }
+
+// inline void llk_pack_debug_dump_seek(std::uint8_t offset) {
+//     _llk_pack_debug_dump_seek_(offset);
+// }
+
+// template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
+// inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) {
+
+//     const std::uint32_t output_id = get_output_id(new_output);
+//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//     const std::uint32_t num_faces = get_output_num_faces(output_id);
+//     const bool partial_face = get_output_partial_face(output_id);
+//     const bool narrow_tile = get_output_narrow_tile(output_id);
+
+//     _llk_pack_reconfig_data_format_<is_fp32_dest_acc_en, is_tile_dim_reconfig_en, FaceLayout>(
+//         pack_src_format[output_id],
+//         pack_dst_format[output_id],
+//         cb_interface[output_id].fifo_page_size,
+//         face_r_dim,
+//         num_faces,
+//         partial_face,
+//         narrow_tile
+//     );
+// }
+
+// template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
+// inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) {
+//     std::uint32_t old_output_id = get_output_id(old_output);
+//     std::uint32_t new_output_id = get_output_id(new_output);
+
+//     if((pack_dst_format[old_output_id] != pack_dst_format[new_output_id])
+//        && (pack_dst_format[old_output_id] != (uint)DataFormat::Invalid)
+//        && (pack_dst_format[new_output_id] != (uint)DataFormat::Invalid)) {
+//         llk_pack_reconfig_data_format<is_fp32_dest_acc_en, is_tile_dim_reconfig_en, FaceLayout>(new_output);
+//     } else if constexpr (is_tile_dim_reconfig_en) {
+//         // Same format but different tile dims
+//         llk_pack_mop_config<false, false, FaceLayout, false>(new_output);
+//     }
+// }
+
+// TT_ALWAYS_INLINE void llk_pack_relu_config(const std::uint32_t config) {
+//     _llk_pack_relu_config_(config);
+// }
+
+// inline void llk_pack_reconfig_l1_acc(const std::uint32_t enable) {
+//     _llk_pack_reconfig_l1_acc_(enable);
+// }
+
+// template <bool untilize = false, ReduceDim dim>
+// inline void llk_pack_reduce_mask_config() {
+//     _llk_pack_reduce_mask_config_<untilize, dim>();
+// }
+
+// inline void llk_pack_reduce_mask_clear() {
+//     _llk_pack_reduce_mask_clear_();
+// }
+
+// // FIXME-WH-UPLIFT
+// template <ReduceDim dim, bool at_kernel_start = false, bool revert=false, bool is_fp32_dest_acc_en = false>
+// inline void llk_pack_reduce_config_v2(uint32_t icb_out) {
+
+//     const bool untilize = false;
+//     if constexpr (at_kernel_start) {
+
+//         const std::uint32_t output_id = get_output_id(icb_out);
+//         const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
+//         const std::uint32_t num_faces = get_output_num_faces(output_id);
+//         const bool partial_face = get_output_partial_face(output_id);
+//         const bool narrow_tile = get_output_narrow_tile(output_id);
+//         const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size;
+//         const llk_relu_config_u relu_config = {.f = {.ApplyRelu = (std::uint32_t)ReluType::NO_RELU, .Threshold = 0,}};
+
+//         _llk_pack_hw_configure_<untilize, is_fp32_dest_acc_en>(
+//             pack_src_format[output_id],
+//             pack_dst_format[output_id],
+//             tile_size,
+//             face_r_dim,
+//             num_faces,
+//             partial_face,
+//             narrow_tile,
+//             relu_config.val
+//         );
+//     }
+
+//     if constexpr (revert) {
+//         _llk_pack_reduce_mask_clear_();
+//     } else {
+//         _llk_pack_reduce_mask_config_<untilize, dim>();
+//     }
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_param_structs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_param_structs.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_reverseops.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_reverseops.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_cdf.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_cdf.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_cdf.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_cdf.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_converter.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_converter.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_converter.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_converter.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_elu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_elu.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_erf_erfc.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_erfinv.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_erfinv.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_exp.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_exp.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_exp.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_exp.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_gelu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_gelu.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_i0.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_i0.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_isinf_isnan.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_isinf_isnan.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_isinf_isnan.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_isinf_isnan.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_logical_not_noti.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_recip.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_recip.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_recip.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_recip.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_relu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_relu.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_sqrt.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_sqrt.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_sqrt.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_sqrt.h
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_trigonometry.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_trigonometry.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_trigonometry.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_trigonometry.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_0_param.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_0_param.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_1_param.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_1_param.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
new file mode 100644
index 00000000000..83a5fdcca92
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
@@ -0,0 +1,171 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <type_traits>
+
+#include "llk_sfpu_types.h"
+#include "ckernel_globals.h"
+#include "ckernel_include.h"
+#include "ckernel_template.h"
+#include "metal_ckernel_sfpu.h"
+#include "cmath_common.h"
+#include "llk_format_conversions.h"
+#include "llk_math_common.h"
+#include "llk_param_structs.h"
+#include "llk_math_eltwise_unary_sfpu.h"
+
+//TODO: Fix for GS uplift
+
+// using namespace ckernel;
+// using namespace ckernel::sfpu;
+// namespace ckernel {
+
+// /*************************************************************************
+//  * LLK ELTWISE UNARY SFPU
+//  *************************************************************************/
+
+// template <
+//     SfpuType operation,
+//     bool APPROXIMATION_MODE,
+//     int SfpuType_PARAM = 0,
+//     int ITERATIONS = 8,
+//     bool IS_INT_SFPU_EN = false>
+// inline void llk_math_calculate_sfpu(
+//     const int iterations = ITERATIONS,
+//     uint param0 = 0,
+//     uint param1 = 0,
+//     uint param2 = 0,
+//     uint param3 = 0,
+//     uint param4 = 0,
+//     uint param5 = 0) {
+//     if constexpr (operation == SfpuType::exp_with_base) {
+//         constexpr bool zero_negative = true;
+//         _calculate_exponential_<APPROXIMATION_MODE, zero_negative, true, ITERATIONS>(iterations, param0);
+//     } else if constexpr (operation == SfpuType::tanh) {
+//         _calculate_tanh_<APPROXIMATION_MODE, ITERATIONS>(iterations);
+//     } else if constexpr (operation == SfpuType::hardtanh) {
+//         _calculate_hardtanh_<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1, param2);
+//     } else if constexpr (operation == SfpuType::rsqrt) {
+//         // param0 = true -> approximate fast mode
+//         //          false -> high precision mode
+//         //  The algorithm uses Newton's method; a higher number of iterations yields a better approximation
+//         if (param0) {
+//             calculate_rsqrt<true, ITERATIONS, 10>();
+//         } else {
+//             calculate_rsqrt<false, ITERATIONS, 25>();
+//         }
+//     } else if constexpr (operation == SfpuType::sigmoid) {
+//         calculate_sigmoid<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::sigmoid_appx) {
+//         calculate_sigmoid_appx<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::tanh_derivative) {
+//         calculate_tanh_derivative<APPROXIMATION_MODE, SfpuType_PARAM, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::dropout) {
+//         calculate_dropout<APPROXIMATION_MODE, ITERATIONS>(param0, param1);
+//     } else if constexpr (operation == SfpuType::power) {
+//         calculate_power_iterative<APPROXIMATION_MODE, ITERATIONS>(param0);
+//     } else if constexpr (operation == SfpuType::square) {
+//         calculate_square<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::log) {
+//         calculate_log<APPROXIMATION_MODE, false, ITERATIONS>(param0);
+//     } else if constexpr (operation == SfpuType::log_with_base) {
+//         calculate_log<APPROXIMATION_MODE, true, ITERATIONS>(param0);
+//     } else if constexpr (
+//         (operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) ||
+//         (operation == SfpuType::less_than_zero) || (operation == SfpuType::greater_than_equal_zero) ||
+//         (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) {
+//         calculate_comp<APPROXIMATION_MODE, operation, ITERATIONS>(8);  // BFLOAT16 - exp
+//     } else if constexpr (operation == SfpuType::clamp) {
+//         calculate_clamp<APPROXIMATION_MODE, ITERATIONS>(param0, param1, param2);
+//     } else if constexpr (operation == SfpuType::abs) {
+//         calculate_abs<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::sign) {
+//         calculate_sign<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::max) {
+//         calculate_max<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::min) {
+//         calculate_min<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::exp2) {
+//         calculate_exp2<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::heaviside) {
+//         calculate_heaviside<APPROXIMATION_MODE, ITERATIONS>(param0);
+//     } else if constexpr (operation == SfpuType::expm1) {
+//         calculate_expm1<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::asin) {
+//         calculate_asin<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::acos) {
+//         calculate_acos<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::atan) {
+//         calculate_atan<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::signbit) {
+//         calculate_signbit<APPROXIMATION_MODE, ITERATIONS>();
+//     } else if constexpr (operation == SfpuType::silu) {
+//         calculate_silu<APPROXIMATION_MODE, ITERATIONS>();
+//     }
+// }
+
+// template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull, bool IS_INT_SFPU_EN = false>
+// inline void llk_math_eltwise_unary_sfpu(
+//     uint dst_index,
+//     int vector_mode = (int)Dim::RC,
+//     uint param0 = 0,
+//     uint param1 = 0,
+//     uint param2 = 0,
+//     uint param3 = 0,
+//     uint param4 = 0,
+//     uint param5 = 0) {
+//     const std::uint32_t operand_id = get_operand_id(0); // Fix to operand 0. assume no tiny-tile support
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+
+//     constexpr int ITERATIONS = 8;
+
+//     _llk_math_eltwise_unary_sfpu_start_<Dst>(dst_index);
+
+//     if (vector_mode == (int)Dim::R) {
+//         // Do a row vector, Face0 + Face1 -- first iteration (first row)
+//         const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 2 : face_r_dim / 2)
+//                                                : 2;  // At least 2 iterations for odd and even columns
+// #pragma GCC unroll 0
+//         for (int face = 0; face < 2; face++) {
+//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+//                 iterations, param0, param1, param2, param3, param4, param5);
+//             // Move to the next face
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         }
+//         // Skip next two faces
+//         _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//     } else if (vector_mode == (int)Dim::C) {
+//         // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0 + Face1 if tile is 32x16 -- All iterations for
+//         // full face
+// #pragma GCC unroll 0
+//         for (int face = 0; face < 2; face++) {
+//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+//                 ITERATIONS, param0, param1, param2, param3, param4, param5);
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//             if (num_faces > 2) {  // Skip next face if tile is 32x32
+//                 _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//             }
+//         }
+//         if (num_faces <= 2) {
+//             // Skip next two faces
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         }
+//     } else {
+//         // Do all four faces, and iterate through all 4 blocks of 4 rows each
+// #pragma GCC unroll 0
+//         for (int face = 0; face < 4; face++) {
+//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
+//                 ITERATIONS, param0, param1, param2, param3, param4, param5);
+//             // Move to the next face
+//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
+//         }
+//     }
+//     _llk_math_eltwise_unary_sfpu_done_();
+// }
+
+// }  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_elu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_elu.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_erf_erfc.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_erfinv.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_erfinv.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_exp.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_exp.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_gelu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_gelu.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_i0.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_i0.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_init.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_init.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_isinf_isnan.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_isinf_isnan.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_logical_not_noti.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_recip.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_recip.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_relu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_relu.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_reverseops.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_reverseops.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_sqrt.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_sqrt.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_trigonometry.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_trigonometry.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
new file mode 100644
index 00000000000..50018e399c3
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
@@ -0,0 +1,780 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "llk_sfpu_types.h"
+#include "ckernel_defs.h"
+#include "ckernel_sfpu.h"
+#include "ckernel.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+#include "ckernel_sfpu_cdf.h"
+#include "ckernel_sfpu_exp.h"
+#include "ckernel_sfpu_recip.h"
+#include "ckernel_sfpu_converter.h"
+
+//TODO: Delete this file once GS uplift is done
+// using namespace sfpi;
+
+// namespace ckernel
+// {
+// namespace sfpu
+// {
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS, int RECIPROCAL_ITERATIONS>
+// inline void calculate_rsqrt()
+// {
+
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+
+//         vFloat in = dst_reg[0];
+//         v_if(dst_reg[0] == 0.0f){
+//             dst_reg[0] = std::numeric_limits<float>::infinity();
+//         }v_else{
+//             vFloat result = 1.0f;
+//             v_if(dst_reg[0] > 1.0f){
+//                 result = sfpu_reciprocal(in);
+//             }v_endif;
+
+//             for (int r = 0; r < RECIPROCAL_ITERATIONS; r++)
+//             {
+//                 // y = y * (1.5 - 0.5 * x * y * y) Newton's method iteration.
+//                 result = result * (1.5F - 0.5F  * dst_reg[0] * result * result);
+//             }
+//             dst_reg[0] = result;
+//         }v_endif;
+
+//         dst_reg++;
+
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_sigmoid_appx()
+// {
+//     vUInt l0 = l_reg[LRegs::LReg0];
+//     vUInt l1 = l_reg[LRegs::LReg1];
+//     vUInt l2 = l_reg[LRegs::LReg2];
+
+//     #pragma GCC unroll 8
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+
+//         dst_reg[0] = lut(val, l0, l1, l2) + 0.5f;
+
+//         dst_reg++;
+//     }
+
+//     l_reg[LRegs::LReg0] = l0;
+//     l_reg[LRegs::LReg1] = l1;
+//     l_reg[LRegs::LReg2] = l2;
+// }
+
+// // TODO: Implement using bitwise comparison
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_signbit()
+// {
+
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         v_if (val <= -0.0f) {
+//             val = 1.0f;
+//         } v_elseif (val >= 0.0f) {
+//             val = 0.0f;
+//         }
+//         v_endif;
+//         dst_reg[0] = val;
+
+//        dst_reg++;
+//     }
+
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_tanh()
+// {
+//     // SFPU microcode
+//     vUInt l0 = l_reg[LRegs::LReg0];
+//     vUInt l1 = l_reg[LRegs::LReg1];
+//     vUInt l2 = l_reg[LRegs::LReg2];
+
+//     #pragma GCC unroll 8
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         val = lut(val, l0, l1, l2);
+//         dst_reg[0] = val;
+
+//         dst_reg++;
+//     }
+
+//     l_reg[LRegs::LReg0] = l0;
+//     l_reg[LRegs::LReg1] = l1;
+//     l_reg[LRegs::LReg2] = l2;
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_hardtanh(uint param0, uint param1, uint param2)
+// {
+//     // All params are in FP16_B format
+//     // param0 = -(neg_threshold)
+//     // param1 = -(pos_threshold - neg_threshold)
+//     // param2 = -(pos_threshold)
+
+//     vFloat p0 = s2vFloat16(param0);
+//     vFloat p1 = s2vFloat16(param1);
+//     vFloat p2 = s2vFloat16(param2);
+//     // SFPU microcode
+//     #pragma GCC unroll 0
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+
+//         val += p0;// 12 bits
+//         v_if (val < 0.0f) {
+//             val = 0.0f;
+//         }
+//         v_endif;
+
+//         val += p1;// 12 bits
+//         v_if (val >= 0.0f) {
+//             val = 0.0f;
+//         }
+//         v_endif;
+
+//         val += p2;// 12 bits
+
+//         dst_reg[0] = val;
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int WITH_PRECOMPUTED_TANH, int ITERATIONS>
+// inline void calculate_tanh_derivative()
+// {
+//     vUInt l0 = l_reg[LRegs::LReg0];
+//     vUInt l1 = l_reg[LRegs::LReg1];
+//     vUInt l2 = l_reg[LRegs::LReg2];
+
+//     // tanh'(x) = 1 - (tanh(x))^2
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+
+//         if constexpr (!WITH_PRECOMPUTED_TANH) {
+//             val = lut(val, l0, l1, l2);
+//         }
+
+//         val = val * (-val) + vConst1;
+//         dst_reg[0] = val;
+
+//         dst_reg++;
+//     }
+
+//     l_reg[LRegs::LReg0] = l0;
+//     l_reg[LRegs::LReg1] = l1;
+//     l_reg[LRegs::LReg2] = l2;
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_dropout(uint prob, uint scale)
+// {
+//     // SFPU microcode
+
+//     vUInt rand = l_reg[LRegs::LReg3];
+
+//     #pragma GCC unroll 0
+//     for (int d = 0; d < ITERATIONS; d++) {
+//         ////////////////////////
+//         // Scale samples
+//         ///////////////////////
+//         dst_reg[0] = dst_reg[0] * s2vFloat16b(scale);
+
+//         ////////////////////////
+//         // Drop samples
+//         ///////////////////////
+//         v_if (rand < prob) {
+//             dst_reg[0] = vConst0;
+//         }
+//         v_endif;
+
+//         ////////////////////////
+//         // 16-bit PRNG update
+//         ///////////////////////
+//         vUInt lfsr = vConstIntPrgm1;
+//         vUInt tmp = lfsr & rand;
+//         rand = rand >> 1;
+//         v_if (tmp != 0) {
+//             vUInt mask = vConstIntPrgm0;
+//             rand ^= mask;
+//         }
+//         v_endif;
+
+//         dst_reg++;
+//     }
+
+//     l_reg[LRegs::LReg3] = rand;
+// }
+
+// template <bool APPROXIMATION_MODE,int ITERATIONS>
+// inline void calculate_power_iterative(const uint exponent)
+// {
+//     #pragma GCC unroll 8
+//     for (int d = 0; d < 8; d++)
+//     {
+//         vFloat in = dst_reg[0];
+//         vFloat result = 1.0f;
+//         for (uint i = 0; i < exponent; i++) {
+//             result *= in;
+//         }
+// 	dst_reg[0]=result;
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_square()
+// {
+//     #pragma GCC unroll 8
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat in = dst_reg[0];
+//         vFloat result = in * in;
+
+//         dst_reg[0] = result;
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool HAS_BASE_SCALING>
+// sfpi_inline void calculate_log_body(const uint log_base_scale_factor)
+// {
+//     ////////////////////////////
+//     // Load From dest + "normalize to calculation range"
+//     ////////////////////////////
+//     vFloat in = dst_reg[0];
+//     vFloat x = setexp(in, 127);    // set exp to exp bias (put in range of 1-2)
+
+//     // XXXXXX ask Namal? if we can derive the coefficients below to higher precision
+//     ////////////////////////////
+//     // Calculate Cheby Approximation using Horner Form Multiplication: 3rd Order
+//     // x* ( x* (A*x + B) + C) + D
+//     // A :0.1058, B: -0.3942, C: 0.9813, D: 0.006
+//     // Run above on (x-1) so x is in ln(x+1), plug (x-1) into equation above to
+//     // save the subtract and get A',B',C',D'):
+//     // A' = A
+//     // B' = -3A + B
+//     // C' = 3A - 2B + C
+//     // D' = -A + B - C + D
+//     // A':0.1058, B':-0.7116, C':2.0871, D':-1.4753
+//     ////////////////////////////
+//     vFloat a = vConstFloatPrgm1;
+//     vFloat b = vConstFloatPrgm2;
+//     // XXXXX try variants of the below: B'=.7122, C'=2.0869
+//     vFloat series_result = x * (x * (x * a + b) + 2.0871) + -1.4753f;
+
+//     ////////////////////////////
+//     // Convert exponent to float
+//     ////////////////////////////
+//     vInt exp = exexp(in);
+//     v_if (exp < 0) {
+//         exp = setsgn(~exp + 1, 1);
+//     }
+//     v_endif;
+
+//     vFloat expf = int32_to_float(exp, 0);
+//     vFloat vConstLn2 = vConstFloatPrgm0;
+//     vFloat result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2)
+
+//     if constexpr (HAS_BASE_SCALING) {
+//         result *= s2vFloat16a(log_base_scale_factor);
+//     }
+
+//     ////////////////////////////
+//     // Base case when input is 0. ln(0) = -inf
+//     ////////////////////////////
+//     v_if (in == 0.0F) { // Reload for register pressure
+//         result = -std::numeric_limits<float>::infinity();
+//     }
+//     v_endif;
+
+//     dst_reg[0] = result;
+// }
+
+// template <bool APPROXIMATION_MODE, bool HAS_BASE_SCALING, int ITERATIONS>
+// inline void calculate_log(uint log_base_scale_factor)
+// {
+//     #pragma GCC unroll 8
+//     for(int d = 0; d < ITERATIONS; d++){
+//         calculate_log_body<HAS_BASE_SCALING>(log_base_scale_factor);
+//         dst_reg++;
+//     }
+// }
+
+// sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& flag2, float init)
+// {
+//     flag1 = init;
+//     if (check) {
+//         flag2 = init;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, SfpuType COMP_MODE, int ITERATIONS>
+// inline void calculate_comp(uint exponent_size_8)
+// {
+//    const vFloat zero = 0.0f;
+//    const vFloat one = 1.0f;
+//    for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         vFloat flag1, flag2;
+
+// 	//a[i] == 0
+// 	if constexpr(COMP_MODE == SfpuType::equal_zero) {
+// 	    v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
+// 	      v = one;
+// 	    } v_else {
+// 	      v = zero;
+// 	    }
+// 	    v_endif;
+// 	  }
+
+// 	//a[i] != 0
+// 	if constexpr(COMP_MODE == SfpuType::not_equal_zero) {
+// 	    v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
+// 	      v = zero;
+// 	    } v_else {
+// 	      v = one;
+// 	    }
+// 	    v_endif;
+//         }
+
+// 	//a[i] < 0
+// 	if constexpr(COMP_MODE == SfpuType::less_than_zero) {
+// 	    v_if (v >= 0.0f) {
+// 	      v = zero;
+// 	    } v_else {
+// 	      v = one;
+// 	    }
+// 	    v_endif;
+//         }
+
+// 	//a[i] >= 0
+// 	if constexpr(COMP_MODE == SfpuType::greater_than_equal_zero) {
+// 	    v_if (v >= 0.0f) {
+// 	      v = one;
+// 	    } v_else {
+// 	      v = zero;
+// 	    }
+// 	    v_endif;
+//         }
+
+// 	//a[i] > 0
+// 	if constexpr(COMP_MODE == SfpuType::greater_than_zero) {
+// 	    v_if (v > 0.0f) {
+// 	      v = one;
+// 	    } v_else {
+// 	      v = zero;
+// 	    }
+// 	    v_endif;
+//         }
+
+// 	//a[i] <= 0
+// 	if constexpr(COMP_MODE == SfpuType::less_than_equal_zero) {
+// 	    v_if (v > 0.0f) {
+// 	      v = zero;
+// 	    } v_else {
+// 	      v = one;
+// 	    }
+// 	    v_endif;
+//         }
+
+// 	dst_reg[0] = v;
+// 	dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_clamp(uint param0, uint param1, uint param2)
+// {
+//     // All params are in FP16 format
+//     // param0 = min
+//     // param1 = max
+
+//     //uint format = (param0 >> 16)&0x1;
+//     s2vFloat16::Format format = s2vFloat16::fp16a;
+
+//     // SFPU microcode
+//     vFloat min = s2vFloat16(param0, format);
+//     vFloat max = s2vFloat16(param1, format);
+//     #pragma GCC unroll 0
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+
+//         v_if (val < min) {
+//             val = s2vFloat16(param0, format);
+//         } v_elseif (val >= max) {
+//             val = s2vFloat16(param1, format);
+//         }
+//         v_endif;
+
+//         dst_reg[0] = val + s2vFloat16b(param2); // 12 bits
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_abs()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         dst_reg[0] = sfpi::abs(v);
+//         dst_reg++;
+//     }
+// }
+
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_exp2()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         // log(2) = 0.6931471805;
+//         v = v * 0.6931471805f;
+// 	    // exp = e^(v)
+// 	    vFloat exp = calculate_exponential_body_improved<APPROXIMATION_MODE, true>(v);
+// 	    dst_reg[0] = exp;
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_sign()
+// {
+//     // All params are in FP16 format
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+// 	vFloat result = vConst1;
+//         v_if (v < 0.0f) {
+//            result = vConstNeg1;
+//         } v_elseif(v > 0.0f) {
+// 	  result = vConst1;
+// 	} v_else {
+// 	  result = vConst0;
+//         }
+//         v_endif;
+
+// 	dst_reg[0] = result;
+//         dst_reg++;
+//     }
+// }
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_max()
+// {
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat a = dst_reg[0];
+//         vFloat b = dst_reg[32];
+//         v_if(a < b) {
+//             dst_reg[0] = b;
+//         }
+//         v_endif;
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_min()
+// {
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat a = dst_reg[0];
+//         vFloat b = dst_reg[32];
+//         v_if(a > b) {
+//             dst_reg[0] = b;
+//         }
+//         v_endif;
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_expm1()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         v = calculate_exponential_body_improved<APPROXIMATION_MODE, true>(v);
+//         dst_reg[0] = v - 1.0f;
+//         dst_reg++;
+//     }
+// }
+
+
+// #define POLYVAL6(coef5, coef4, coef3, coef2, coef1, coef0, t4)  (t4 * (t4 * (t4 * (t4 * (coef5 * t4 + coef4) + coef3) + coef2) + coef1) + coef0)
+
+// template <bool APPROXIMATION_MODE>
+// sfpi_inline vFloat sfpu_atan_maclaurin_series(vFloat val)
+// {
+//     v_if(1 > sfpi::abs(val)){
+//         dst_reg[0] = sfpi::abs(val)  ;
+//     }
+//     v_else{
+//         dst_reg[0] =  sfpu_reciprocal(sfpi::abs(val));
+//     }
+//     v_endif;
+
+//     vFloat t1 = dst_reg[0] * dst_reg[0];
+
+//     t1 = POLYVAL6(-0.013480470f, 0.057477314f, -0.121239071f, 0.195635925f, -0.332994597f, 0.999995630f, t1);
+
+//     t1 = t1 * dst_reg[0];
+
+//     v_if (sfpi::abs(val) > 1){
+//         t1 = 1.570796327f - t1;
+//     }
+//     v_endif;
+
+//     v_if(val < 0 ){
+//         t1 = -t1;
+//     }
+//     v_endif;
+
+//     return t1;
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_atan()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         val = sfpu_atan_maclaurin_series<APPROXIMATION_MODE>(val);
+//         dst_reg[0] = val;
+//         dst_reg++;
+//     }
+// }
+
+
+// template <bool APPROXIMATION_MODE>
+// sfpi_inline vFloat sfpu_asine_maclaurin_series(vFloat val)
+// {
+//     // input for [-1:1]
+//     // Maclaurin series
+//     // arcsin(x) = x + [(1/2) *x^3/3] + [(1 * 3) / (2 * 4) * x^5 / 5] + [(1 * 3 * 5) / (2 * 4 * 6) * x^7 / 7 ] + ...
+//     // arcsin(x) ≈ x + (1/6) * x^3 + (3/40) * x^5 + (5/112) * x^7 + (35/1152) * x^9 + (63/2816) * x^11
+
+//     vFloat tmp = val;
+//     vFloat val_square = val * val;
+//     // x
+//     vFloat output = tmp;
+//     // (1/6) * x^3
+//     tmp = tmp * val_square;
+//     output += 0.166666666 * tmp;
+//     // (3/40) * x^5
+//     tmp = tmp * val_square;
+//     output +=  0.075 * tmp;
+
+//     //(5/112) * x^7
+//     tmp = tmp * val_square;
+//     output += 0.044642857 * tmp;
+
+//     // (35/1152) *x^9
+//     tmp = tmp * val_square;
+//     output += 0.03038194 * tmp;
+
+//     //(63/2816) * x^11
+//     tmp = tmp * val_square;
+//     output += 0.02237216 * tmp;
+
+//     // Write out output
+//     return output;
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_asin()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         v = sfpu_asine_maclaurin_series<APPROXIMATION_MODE>(v);
+//         dst_reg[0] = v;
+//         dst_reg++;
+//     }
+// }
+
+
+// #define PI_2 (1.570796326794)
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_acos()
+// {
+//     // SFPU microcode
+//     // acos = (pi/2 - asin)
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat v = dst_reg[0];
+//         v = sfpu_asine_maclaurin_series<APPROXIMATION_MODE>(v);
+//         v = PI_2 - v;
+//         dst_reg[0] = v;
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void cast_fp32_to_fp16a()
+// {
+//     #pragma GCC unroll 8
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         //vFloat val = dst_reg[0];
+//         //dst_reg[0] = float_to_fp16a(val, 0);
+//         TTI_SFPLOAD(0, 0, 3, 0);
+//         TTI_SFP_STOCH_RND(0,0,0,0,0,8);
+//         TTI_SFPSTORE(0,1,3,0);
+//         dst_reg++;
+//     }
+// }
+
+
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_negative()
+// {
+
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         dst_reg[0] = -val;
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_add1()
+// {
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         dst_reg[0] = 1.0f + val;
+//         dst_reg++;
+//     }
+// }
+
+// inline
+// vFloat sigmoid_piecewise_linear_positive(vFloat val) {
+//         vFloat result = 0.0f;
+// 	v_if ( val >= +5.0f)  {
+// 	  result = 1.0f;
+// 	} v_elseif ( val > 1.0f && val < 5.0f ) {
+// 	  result = POLYVAL5(0.00144462f, -0.01055479f, -0.01203685f,  0.24300185f,  0.50437757f,val);
+// 	} v_else {
+// 	  result = 0.229f*val + 0.5f; // linear appx as y = 0.229x + 0.5
+// 	}
+// 	v_endif;
+// 	return result;
+// }
+
+// //sigmoid is anti-symmetric and offset by 1
+// //sigmoid[-x] = 1 - sigmoid[x]
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_sigmoid()
+// {
+//     for (int d = 0; d < ITERATIONS; d++)
+//     {
+//         vFloat val = dst_reg[0];
+//         vFloat result = 0.0f;
+
+//         v_if ( val < 0.0f ) {
+//   	   val = -val;
+//         }
+//         v_endif;
+
+// 	result = sigmoid_piecewise_linear_positive(val);
+
+// 	val = dst_reg[0];
+//         v_if ( val < 0.0f ) {
+//             result = 1.0f - result;
+//         }
+//         v_endif;
+
+//         dst_reg[0] = result;
+//         dst_reg++;
+//     }
+
+//     return;
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_heaviside(uint value)
+// {
+//     // SFPU microcode
+//     Converter c_value;
+//     c_value.u = value;
+//     vFloat s = c_value.f;
+
+//     #pragma GCC unroll 0
+//     for (int d = 0; d < ITERATIONS; d++) {
+//         vFloat v = dst_reg[0];
+
+//         v_if (v < 0.0f) {
+//             v = 0.0f;
+//         }v_elseif (v > 0.0f) {
+//             v = 1.0f;
+//         }v_else {
+//             v = s;
+//         }
+//         v_endif;
+
+//        dst_reg[0] = v;
+
+//         dst_reg++;
+//     }
+// }
+
+// template <bool APPROXIMATION_MODE, int ITERATIONS>
+// inline void calculate_silu()
+// {
+//     // SFPU microcode
+//     for (int d = 0; d < ITERATIONS; d++) {
+//         vFloat val = dst_reg[0];
+//         v_if ( val < 0.0f ) {
+//             val = -val;
+//         }
+//         v_endif;
+
+// 	    vFloat result = sigmoid_piecewise_linear_positive(val);
+
+// 	    val = dst_reg[0];
+//         v_if ( val < 0.0f ) {
+//             result = 1.0f - result;
+//         }
+//         v_endif;
+//         result = val * result;
+//         dst_reg[0] = result;
+//         dst_reg++;
+//     }
+// }
+
+// } // namespace sfpu
+// } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h
new file mode 100644
index 00000000000..bf23a084b6d
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h
@@ -0,0 +1,64 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+enum SfpuType { // Identifiers for SFPU operations; unscoped enum, so enumerators are also visible without the SfpuType:: prefix.
+  tanh,
+  hardtanh,
+  gelu,
+  exponential,
+  exp_with_base,
+  sigmoid,
+  sigmoid_appx,
+  reciprocal,
+  sqrt,
+  rsqrt,
+  lrelu,
+  power,
+  square,
+  tanh_derivative,
+  log,
+  log_with_base,
+  equal_zero, // comparison-with-zero modes follow; presumably chosen at compile time as SfpuType::<mode> by the SFPU kernels -- TODO confirm against callers
+  not_equal_zero,
+  less_than_zero,
+  greater_than_equal_zero,
+  less_than_equal_zero,
+  greater_than_zero,
+  clamp,
+  gelu_derivative,
+  dropout,
+  abs,
+  sign,
+  max,
+  min,
+  sine,
+  cosine,
+  tan,
+  relu_min,
+  relu_max,
+  elu,
+  exp2,
+  heaviside,
+  expm1,
+  signbit,
+  asin,
+  acos,
+  atan,
+  erf,
+  erfc,
+  isfinite,
+  isinf,
+  isposinf,
+  isneginf,
+  isnan,
+  logical_not_unary,
+  erfinv,
+  i0,
+  silu,
+  mask,
+  negative,
+  unused, // NOTE(review): last enumerator; looks like a placeholder/sentinel -- confirm before relying on its value
+};
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
new file mode 100644
index 00000000000..642fbb1591e
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
@@ -0,0 +1,85 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_AB.h"
+#include "llk_unpack_common_api.h"
+
+// /*************************************************************************
+//  * LLK UNPACK AB
+//  *************************************************************************/
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_AB_hw_configure(
+//     const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) {
+//     // In0 -> unpA
+//     // In1 -> unpB
+//     const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpA_operand);
+//     const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpB_operand);
+
+//     // unpA -> srcA
+//     // unpB -> srcB
+//     const uint32_t num_faces = get_operand_num_faces(unpA_operand_id);  // num faces in unpA and unpB are the same
+
+//     const uint32_t face_r_dim = get_operand_face_r_dim(unpA_operand_id);  // face r dim in unpA and unpB are the same
+
+//     _llk_unpack_AB_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpack_src_format[unpB_operand_id],
+//         unpack_dst_format[unpA_operand_id],
+//         unpack_dst_format[unpB_operand_id],
+//         face_r_dim,
+//         within_face_16x16_transpose,
+//         num_faces);
+// }
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_AB_hw_configure_disaggregated(
+//     const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0) {
+//     const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand};
+
+//     llk_unpack_AB_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_AB_params, within_face_16x16_transpose);
+// }
+
+// template <BroadcastType BType = BroadcastType::NONE>
+// inline void llk_unpack_AB_mop_config(const bool transpose_of_faces = false, const std::uint32_t operand_id = 0) {
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     const bool narrow_tile = get_operand_narrow_tile(operand_id);  // if narrow tile read face 0 twice for row broadcast
+//                                                                    // or read face 0 and 1 for col broadcast
+//     _llk_unpack_AB_mop_config_<BType>(transpose_of_faces, num_faces, narrow_tile);
+// }
+
+// template <BroadcastType BType = BroadcastType::NONE>
+// inline void llk_unpack_AB_init(
+//     const std::uint32_t operandA,
+//     const std::uint32_t operandB,
+//     const std::uint32_t transpose = 0,
+//     const std::uint32_t acc_to_dest = 0) {
+//     const std::uint32_t operandA_id = get_operand_id(operandA);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operandA_id);  // face r dim in unpA and unpB are the same
+//     const std::uint32_t num_faces = get_operand_num_faces(operandA_id);
+//     const bool narrow_tile =
+//         get_operand_narrow_tile(operandA_id);  // if narrow tile read face 0 twice for row broadcast
+
+//     _llk_unpack_AB_init_<BType>(face_r_dim, num_faces, narrow_tile, transpose, acc_to_dest);
+// }
+
+// template <BroadcastType BType = BroadcastType::NONE>
+// inline void llk_unpack_AB(
+//     const std::uint32_t operandA,
+//     const std::uint32_t operandB,
+//     const std::uint32_t tile_index_a,
+//     const std::uint32_t tile_index_b,
+//     const bool transpose_of_faces = 0 /*not used*/) {
+//     std::uint32_t operandA_id = get_operand_id(operandA);
+//     std::uint32_t operandB_id = get_operand_id(operandB);
+//     std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1;
+//     std::uint32_t offset_address_a = cb_interface[operandA_id].fifo_page_size * tile_index_a;
+//     std::uint32_t address_a = base_address_a + offset_address_a;
+//     std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1;
+//     std::uint32_t offset_address_b = cb_interface[operandB_id].fifo_page_size * tile_index_b;
+//     std::uint32_t address_b = base_address_b + offset_address_b;
+
+//     _llk_unpack_AB_<BType>(address_a, address_b, transpose_of_faces > 0);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h
new file mode 100644
index 00000000000..f4aee2da6bd
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h
@@ -0,0 +1,136 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_AB_matmul.h"
+#include "llk_unpack_common_api.h"
+
+// /*************************************************************************
+//  * LLK UNPACK AB MATMUL
+//  *************************************************************************/
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) {
+//     const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca;
+
+//     // In0 -> unpB
+//     // In1 -> unpA
+//     const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpB_operand);
+//     const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpA_operand);
+
+//     // unpA -> srcA
+//     // unpB -> srcB
+//     const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
+//     const uint32_t unpB_num_faces = get_operand_num_faces(unpB_operand_id);
+
+//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
+//     const uint32_t unpB_face_r_dim = get_operand_face_r_dim(unpB_operand_id);
+
+//     _llk_unpack_AB_matmul_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpack_src_format[unpB_operand_id],
+//         unpack_dst_format[unpA_operand_id],
+//         unpack_dst_format[unpB_operand_id],
+//         unpA_face_r_dim,
+//         unpB_face_r_dim,
+//         transpose_xy_srca,
+//         unpA_num_faces,
+//         unpB_num_faces,
+//         cb_interface[unpA_operand_id].fifo_page_size,
+//         cb_interface[unpB_operand_id].fifo_page_size);
+// }
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_AB_matmul_hw_configure_disaggregated(
+//     const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) {
+//     const llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = {
+//         .unpA_operand = unpA_operand, .unpB_operand = unpB_operand, .transpose_xy_srca = transpose_xy_srca};
+//     llk_unpack_AB_matmul_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_AB_matmul_params);
+// }
+
+// inline void llk_unpack_AB_matmul_mop_config(
+//     const bool transpose,
+//     const std::uint32_t ct_dim,
+//     const std::uint32_t rt_dim,
+//     const std::uint32_t kt_dim,
+//     const bool partial_face) {
+//     // in0 - loaded to SrcB
+//     // in1 - loaded to SrcA
+//     _llk_unpack_AB_matmul_mop_config_(transpose, ct_dim, rt_dim, kt_dim, partial_face);
+// }
+
+// __attribute__((always_inline)) inline void llk_unpack_AB_matmul_init(
+//     const std::uint32_t operandA,
+//     const std::uint32_t operandB,
+//     const std::uint32_t transpose = 0,
+//     const std::uint32_t ct_dim = 1,
+//     const std::uint32_t rt_dim = 1,
+//     const std::uint32_t kt_dim = 1) {
+//     // In0 -> srcB (supports partial face)
+//     // In1 -> srcA
+//     const uint32_t operandA_id = get_operand_id(operandB);
+//     const uint32_t operandB_id = get_operand_id(operandA);
+
+//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandA_id);
+//     const uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandB_id);
+
+//     const bool reuse_a = ct_dim >= rt_dim;
+//     const bool partial_face = get_operand_partial_face(operandB_id);
+
+//     const uint32_t unpA_num_faces = get_operand_num_faces(operandA_id);
+//     const uint32_t unpB_num_faces =
+//         partial_face ? 1 : get_operand_num_faces(operandB_id);  // if partial face -> unpack face by face
+
+//     _llk_unpack_AB_matmul_init_(
+//         transpose,
+//         ct_dim,
+//         rt_dim,
+//         kt_dim,
+//         unpA_face_r_dim,
+//         unpB_face_r_dim,
+//         unpA_num_faces,
+//         unpB_num_faces,
+//         partial_face);
+// }
+
+// inline void llk_unpack_AB_matmul(
+//     const std::uint32_t operandA,
+//     const std::uint32_t operandB,
+//     const std::uint32_t tile_index_a,
+//     const std::uint32_t tile_index_b,
+//     const std::uint32_t ct_dim = 1,
+//     const std::uint32_t rt_dim = 1,
+//     const std::uint32_t kt_dim = 1) {
+//     // In0/InA -> srcB (supports partial face)
+//     // In1/InB -> srcA
+
+//     volatile uint *cfg = get_cfg_pointer();  // get pointer to registers for current state ID
+
+//     const std::uint32_t operandA_id = get_operand_id(operandA);
+//     const std::uint32_t operandB_id = get_operand_id(operandB);
+//     const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandB_id);  // In1/InB -> srcA
+//     const std::uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandA_id);  // In0/InA -> srcB
+
+//     const bool partial_face = get_operand_partial_face(operandA_id);
+
+//     std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1;
+//     std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1;
+
+//     std::uint32_t tile_size_a = cb_interface[operandA_id].fifo_page_size;
+//     std::uint32_t tile_size_b = cb_interface[operandB_id].fifo_page_size;
+
+//     _llk_unpack_AB_matmul_(
+//         base_address_a,
+//         base_address_b,
+//         tile_index_a,
+//         tile_index_b,
+//         tile_size_a,
+//         tile_size_b,
+//         unpA_face_r_dim,
+//         unpB_face_r_dim,
+//         partial_face,
+//         ct_dim,
+//         rt_dim,
+//         kt_dim);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h
new file mode 100644
index 00000000000..ca39397653c
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h
@@ -0,0 +1,89 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_A.h"
+#include "llk_unpack_common_api.h"
+
+// /*************************************************************************
+//  * LLK UNPACK A
+//  *************************************************************************/
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_A_hw_configure(
+//     const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) {
+//     const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand);
+//     const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
+//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
+
+//     _llk_unpack_A_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpack_dst_format[unpA_operand_id],
+//         unpA_face_r_dim,
+//         within_face_16x16_transpose,
+//         unpA_num_faces);
+// }
+
+// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_A_hw_configure_disaggregated(
+//     const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) {
+//     const llk_unpack_A_params_t unpack_A_params = {.unpA_operand = unpA_operand};
+//     llk_unpack_A_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_A_params, within_face_16x16_transpose);
+// }
+
+// template <
+//     BroadcastType BType = BroadcastType::NONE,
+//     bool acc_to_dest = false,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+//     bool unpack_to_dest = false>
+// inline void llk_unpack_A_mop_config(
+//     const bool transpose_of_faces,
+//     const std::uint32_t operand_id,
+//     const std::uint32_t unpack_src_format = 0,
+//     std::uint32_t unpack_dst_format = 0) {
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+
+//     _llk_unpack_A_mop_config_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
+//         transpose_of_faces > 0, num_faces, unpack_src_format, unpack_dst_format);
+// }
+
+// template <
+//     BroadcastType BType = BroadcastType::NONE,
+//     bool acc_to_dest = false,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+//     bool unpack_to_dest = false>
+// inline void llk_unpack_A_init(
+//     const std::uint32_t transpose_of_faces = 0,
+//     const std::uint32_t within_face_16x16_transpose = 0,
+//     const std::uint32_t operand = 0) {
+//     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(within_face_16x16_transpose);
+
+//     const std::uint32_t operand_id = get_operand_id(operand);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+
+//     _llk_unpack_A_init_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
+//         transpose_of_faces,
+//         within_face_16x16_transpose,
+//         face_r_dim,
+//         num_faces,
+//         unpack_src_format[operand_id],
+//         unpack_dst_format[operand_id]);
+// }
+
+// template <
+//     BroadcastType BType = BroadcastType::NONE,
+//     bool acc_to_dest = false,
+//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
+//     bool unpack_to_dest = false>
+// inline void llk_unpack_A(
+//     const std::uint32_t operand, const std::uint32_t tile_index, const bool transpose_of_faces = 0) {
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
+//     std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index;
+//     std::uint32_t address = base_address + offset_address;
+
+//     _llk_unpack_A_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
+//         address, transpose_of_faces > 0, unpack_src_format[operand_id], unpack_dst_format[operand_id]);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h
new file mode 100644
index 00000000000..a2f5d8c675f
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h
@@ -0,0 +1,137 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "circular_buffer.h"
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "ckernel_globals.h"
+#include "ckernel_template.h"
+#include "cunpack_common.h"
+#include "llk_defs.h"
+#include "llk_io.h"
+#include "llk_operands.h"
+#include "llk_param_structs.h"
+#include "llk_unpack_common.h"
+
+// /*************************************************************************
+//  * LLK UNPACK COMMON
+//  *************************************************************************/
+
+// void llk_zero_operand(std::uint32_t operand) {
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     std::uint32_t fifo_base_addr = (cb_interface[operand_id].fifo_limit + 1) - cb_interface[operand_id].fifo_size;
+//     std::uint32_t size = cb_interface[operand_id].fifo_size;
+//     _llk_zero_buffer_(fifo_base_addr, size);
+// }
+
+// template <bool mail2math = true, bool mail2pack = true>
+// inline void llk_unpack_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) {
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
+//     std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index;
+//     std::uint32_t address = base_address + offset_address;
+//     _llk_unpack_get_tile_<mail2math, mail2pack>(address, p_tile);
+// }
+
+// template <bool mail2math = true, bool mail2pack = true>
+// inline void llk_unpack_release_tile(std::uint32_t operand) {
+//     _llk_unpack_release_tile_<mail2math, mail2pack>();
+// }
+
+// inline void llk_unpack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {
+//     _llk_unpack_debug_dump_(data, byte_size);
+// }
+
+// inline void llk_unpack_debug_dump_seek(std::uint8_t offset) { _llk_unpack_debug_dump_seek_(offset); }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_operand) {
+//     const std::uint32_t srca_operand_id = get_operand_id(srca_new_operand);
+//     const std::uint32_t num_faces = get_operand_num_faces(srca_operand_id);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(srca_operand_id);
+//     _llk_unpack_reconfig_data_format_srca_impl_(
+//         unpack_src_format[srca_operand_id],
+//         unpack_dst_format[srca_operand_id],
+//         cb_interface[srca_operand_id].fifo_page_size);
+// }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) {
+//     std::uint32_t srcb_operand_id = get_operand_id(srcb_new_operand);
+//     const std::uint32_t num_faces = get_operand_num_faces(srcb_operand_id);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(srcb_operand_id);
+//     _llk_unpack_reconfig_data_format_srcb_impl_(
+//         unpack_src_format[srcb_operand_id],
+//         unpack_dst_format[srcb_operand_id],
+//         cb_interface[srcb_operand_id].fifo_page_size);
+// }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format_srca(
+//     const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) {
+//     std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
+//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
+
+//     if ((unpack_src_format[old_srca_operand_id] != unpack_src_format[new_srca_operand_id])) {
+//         llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
+//     } else if constexpr (is_tile_dim_reconfig_en) {
+//         llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
+//     }
+// }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format_srcb(
+//     const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
+//     std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
+//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
+
+//     if ((unpack_src_format[old_srcb_operand_id] != unpack_src_format[new_srcb_operand_id])) {
+//         llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
+//     } else if constexpr (is_tile_dim_reconfig_en) {
+//         llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
+//     }
+// }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format(
+//     const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) {
+//     llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
+//     llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
+// }
+
+// template <bool is_tile_dim_reconfig_en = false>
+// inline void llk_unpack_reconfig_data_format(
+//     const std::uint32_t srca_old_operand,
+//     const std::uint32_t srca_new_operand,
+//     const std::uint32_t srcb_old_operand,
+//     const std::uint32_t srcb_new_operand) {
+//     llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_old_operand, srca_new_operand);
+//     llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_old_operand, srcb_new_operand);
+// }
+
+// inline void llk_unpack_dbg_feature_disable() { _llk_unpack_dbg_feature_disable_(); }
+
+// inline void llk_enable_int8_fpu_math() { _llk_enable_int8_fpu_math_(); }
+
+// // All TILE_SIZE related functions were deprecated in BBE for WH.  The following is needed for pack_shifted so just
+// // keeping here.
+// // FIXME: Need to review and adjust accordingly
+// constexpr static std::int32_t MUL_HEADERLESS_TILE_SIZE_AND_INDEX(uint format, uint index) {
+//     switch (format & 0x1F) {
+//         case ((uint8_t)DataFormat::Float32): return ((index << 8));
+//         case ((uint8_t)DataFormat::Float16):
+//         case ((uint8_t)DataFormat::Float16_b): return ((index << 7));
+//         case ((uint8_t)DataFormat::Bfp8):
+//         case ((uint8_t)DataFormat::Bfp8_b): return ((index << 6) + (index << 2));
+//         case ((uint8_t)DataFormat::Bfp4):
+//         case ((uint8_t)DataFormat::Bfp4_b): return ((index << 5) + (index << 2));
+//         case ((uint8_t)DataFormat::Bfp2):
+//         case ((uint8_t)DataFormat::Bfp2_b): return ((index << 4) + (index << 2));
+//         case ((uint8_t)DataFormat::Int8):
+//         case ((uint8_t)DataFormat::Lf8): return ((index << 6));
+//         // Keep default as Bfp8?
+//         default: return ((index << 6) + (index << 2));
+//     };
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h
new file mode 100644
index 00000000000..01a12122375
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h
@@ -0,0 +1,94 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_reduce.h"
+#include "llk_unpack_common_api.h"
+
+/*************************************************************************
+* LLK UNPACK REDUCE
+*************************************************************************/
+
+// template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_reduce_hw_configure(
+//     const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) {
+
+//     constexpr bool within_face_16x16_transpose  = (ReduceDim::REDUCE_ROW == dim);
+
+//     const std::uint32_t unpA_operand_id = get_operand_id(unpack_reduce_params->unpA_operand);
+//     const std::uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
+//     const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
+
+//     constexpr std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32;
+//     const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a
+//                                ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? (std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16);
+
+//     _llk_unpack_reduce_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpB_src_format,
+//         unpack_dst_format[unpA_operand_id],
+//         unpB_dst_format,
+//         unpA_face_r_dim,
+//         unpA_face_r_dim,
+//         within_face_16x16_transpose,
+//         unpA_num_faces,
+//         unpA_num_faces
+//     );
+
+//     if constexpr (type != PoolType::MAX) {
+//         union {
+//             float f;
+//             uint32_t u;
+//         } f2u = {.f = const_mult};
+
+//         for (uint i = 0; i < 16; i++) l1_buffer[i] = f2u.u;  // Load const into L1 buffer
+//     }
+// }
+
+// template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en=false, StochRndType stoch_rnd_mode = StochRndType::None>
+// inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) {
+//     const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand};
+//     llk_unpack_reduce_hw_configure<type, dim, is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_reduce_params, mult);
+// }
+
+// template <PoolType type, ReduceDim dim>
+// inline void llk_unpack_reduce_mop_config() {
+//     _llk_unpack_reduce_mop_config_<type, dim>();
+// }
+
+// template <PoolType type, ReduceDim dim>
+// inline void llk_unpack_reduce_init(const std::uint32_t within_face_16x16_transpose=0) {
+
+//     constexpr std::uint32_t unpA_operand_id = 0;
+
+//     const std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32;
+//     const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a
+//                                ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? (std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16);
+
+//     cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG1_SrcB_RMW>(unpB_dst_format);
+
+//     cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 0, 0xf>(unpB_src_format);
+//     cfg_reg_rmw_tensix<THCON_SEC1_REG2_Out_data_format_RMW>(unpB_dst_format);
+
+//     TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_address_ADDR32);
+//     TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_cntx1_address_ADDR32);
+//     TTI_NOP; TTI_NOP;
+
+//     _llk_unpack_reduce_init_<type, dim>(
+//         within_face_16x16_transpose
+//     );
+// }
+
+// template <PoolType type, ReduceDim dim>
+// inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t tile_index) {
+
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
+//     std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index;
+//     std::uint32_t address = base_address + offset_address;
+
+//     _llk_unpack_reduce_<type, dim>(
+//         address
+//     );
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h
new file mode 100644
index 00000000000..59ede271732
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h
@@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_tilize.h"
+#include "llk_unpack_common_api.h"
+
+/*************************************************************************
+* LLK UNPACK TILIZE
+*************************************************************************/
+
+// template <bool is_fp32_dest_acc_en = false>
+// inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) {
+
+//     constexpr bool  within_face_16x16_transpose = false;
+//     constexpr StochRndType stoch_rnd_mode = StochRndType::None;
+
+//     const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand);
+//     const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
+//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
+
+//     _llk_unpack_tilize_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpack_dst_format[unpA_operand_id],
+//         unpA_face_r_dim,
+//         within_face_16x16_transpose,
+//         unpA_num_faces
+//     );
+// }
+
+
+// template <bool is_fp32_dest_acc_en = false>
+// inline void llk_unpack_tilize_hw_configure_disaggregated(
+//     const std::uint32_t unpA_operand) {
+//     const llk_unpack_A_params_t unpack_tilize_params = {
+//         .unpA_operand = unpA_operand
+//     };
+//     llk_unpack_tilize_hw_configure<is_fp32_dest_acc_en>(&unpack_tilize_params);
+// }
+
+// inline void llk_unpack_tilize_mop_config(const std::uint32_t operand) {
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     const bool narrow_tile = get_operand_narrow_tile(operand_id);
+//     _llk_unpack_tilize_mop_config_(narrow_tile);
+// }
+
+// inline void llk_unpack_tilize_init(const std::uint32_t operand = 0, const std::uint32_t ct_dim = 0) {
+//     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(0);
+
+//     const std::uint32_t operand_id = get_operand_id(operand);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+//     const bool narrow_tile = get_operand_narrow_tile(operand_id);
+
+//     // Save state of unpacker config for quick restore
+//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0, THCON_SEC0_REG2_Out_data_format_ADDR32); // Save unpack config[0]
+//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context
+
+//     _llk_unpack_tilize_init_(
+//         unpack_src_format[operand_id],
+//         unpack_dst_format[operand_id],
+//         ct_dim,
+//         face_r_dim,
+//         narrow_tile
+//     );
+
+// }
+
+// inline void llk_unpack_tilize_uninit(const std::uint32_t face_r_dim = FACE_R_DIM) {
+//     TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0);
+//     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0); // Restore unpack config[0]
+//     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32,  p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1); // Restore tile x dim per context
+// }
+
+// inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t block_ct_dim) {
+
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+//     const bool narrow_tile = get_operand_narrow_tile(operand_id);
+
+//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;  // Remove header size added by descriptor
+
+//     _llk_unpack_tilize_(
+//         base_address,
+//         tile_index,
+//         unpack_src_format[operand_id],
+//         block_ct_dim,
+//         face_r_dim,
+//         num_faces,
+//         narrow_tile
+//     );
+// }
+
+// inline void llk_unpack_tilize_block(std::uint32_t operand, std::uint32_t block_c_tiles) {
+//     for (std::uint32_t tile_index = 0; tile_index < block_c_tiles; tile_index++) {
+//         llk_unpack_tilize(operand, tile_index, block_c_tiles);
+//     }
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h
new file mode 100644
index 00000000000..dded559e94d
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h
@@ -0,0 +1,96 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "llk_unpack_untilize.h"
+#include "llk_unpack_common_api.h"
+
+// /*************************************************************************
+// * LLK UNPACK UNTILIZE
+// *************************************************************************/
+// template <bool is_fp32_dest_acc_en = false>
+// inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) {
+//     constexpr bool is_row_pool = false;
+//     constexpr bool within_face_16x16_transpose = false;
+//     constexpr StochRndType stoch_rnd_mode = StochRndType::None;
+
+//     const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand);
+//     const uint32_t unpA_num_faces = 4;
+//     const uint32_t unpA_face_r_dim = FACE_R_DIM;
+
+//     _llk_unpack_untilize_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
+//         unpack_src_format[unpA_operand_id],
+//         unpack_dst_format[unpA_operand_id],
+//         unpA_face_r_dim,
+//         within_face_16x16_transpose,
+//         unpA_num_faces
+//     );
+// }
+
+// inline void llk_unpack_untilize_hw_configure_disaggregated(const std::uint32_t unpA_operand) {
+//     const llk_unpack_A_params_t unpack_untilize_params = {
+//         .unpA_operand = unpA_operand,
+//     };
+//     llk_unpack_untilize_hw_configure(&unpack_untilize_params);
+// }
+
+// inline void llk_unpack_untilize_mop_config() {
+//     _llk_unpack_untilize_mop_config_();
+// }
+
+// inline void llk_unpack_untilize_init(std::uint32_t operand = 0) {
+//     const std::uint32_t operand_id = get_operand_id(operand);
+//     const std::uint32_t face_r_dim = 1;
+//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
+
+//     // Save state of unpacker config for quick restore
+//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_0, UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32); // Save unpack stride config
+//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context
+//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_2, THCON_SEC0_REG0_TileDescriptor_ADDR32+1); // Save descriptor 1
+
+//     _llk_unpack_untilize_init_(
+//         unpack_dst_format[operand_id],
+//         cb_interface[operand_id].fifo_page_size,
+//         face_r_dim,
+//         num_faces
+//     );
+// }
+
+// inline void llk_unpack_untilize_uninit(const std::uint32_t operand, const std::uint32_t face_r_dim = FACE_R_DIM) {
+//     std::uint32_t operand_id = get_operand_id(operand);
+//     std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1;
+//     std::uint32_t unpA_ch1_y_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride;
+
+//     // Check that unpacker is done (all contexts freed up) before starting hw configuration
+//     wait_for_idle();
+
+//     // Reset address counters
+//     unpacker_addr_counter_init();
+
+//     // Wait for cfg to be free to edit
+//     TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK);
+
+//     // Reset the values to default in unpack AB common.
+//     TT_SETADCXX(p_setadc::UNP_A, FACE_R_DIM*FACE_C_DIM-1, 0x0);
+//     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16);
+//     cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32+1, 0, 0xFFFF>(1);
+//     cfg_reg_rmw_tensix<UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32, UNP0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT, UNP0_ADDR_CTRL_XY_REG_1_Ystride_MASK>(unpA_ch1_y_stride);
+//     TTI_NOP; TTI_NOP; // Do we need this for WH?
+// }
+
+// template <bool first_pass = true>
+// inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_tile_cols) {
+//     const std::uint32_t operand_id = get_operand_id(operand);
+//     const std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
+
+//     _llk_unpack_untilize_pass_<first_pass>(
+//         base_address,
+//         block_tile_cols
+//     );
+// }
+
+// inline void llk_unpack_untilize(std::uint32_t operand, std::uint32_t block_c_tiles) {
+//     llk_unpack_untilize_pass<true>(operand, block_c_tiles);
+//     llk_unpack_untilize_pass<false>(operand, block_c_tiles);
+// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.cc b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.cc
new file mode 100644
index 00000000000..b3f31c2c095
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.cc
@@ -0,0 +1,3 @@
+#include "llk_io.h"
+
+CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] = {0};  // Single definition of the array declared `extern` in llk_io.h.
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h
new file mode 100644
index 00000000000..37e018dc6b8
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h
@@ -0,0 +1,10 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <cstdint>
+
+#include "circular_buffer.h"
+
+extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS];  // Declaration only; the array is defined in llk_io.cc.
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_io_pack.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io_pack.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_io_pack.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io_pack.h
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_io_unpack.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io_unpack.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_io_unpack.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io_unpack.h
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h
new file mode 100644
index 00000000000..1569b4cdcd1
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <cstdint>
+#include <vector>
+
+inline uint32_t get_operand_id(uint32_t operand)  // Maps an operand to its id; here this is the identity mapping.
+{
+   return (operand);
+}
+
+inline const uint32_t get_operand_src_format(const std::uint32_t operand_id)  // Returns unpack_src_format[operand_id].
+{
+   return unpack_src_format[operand_id];  // No bounds check; unpack_src_format must already be declared by the includer.
+}
+
+inline const uint32_t get_operand_dst_format(const std::uint32_t operand_id)  // Returns unpack_dst_format[operand_id].
+{
+   return unpack_dst_format[operand_id];  // No bounds check; unpack_dst_format must already be declared by the includer.
+}
+
+//TODO: Do we need tile dim functions for GS?
+inline const uint32_t get_operand_num_faces(const std::uint32_t operand_id)  // Fixed face count; operand_id is unused.
+{
+   return 4;
+}
+
+inline const uint32_t get_operand_partial_face(const std::uint32_t operand_id)  // Always 0; operand_id is unused.
+{
+   return 0;
+}
+
+inline const uint32_t get_operand_face_r_dim(const std::uint32_t operand_id)  // Fixed face row dim; operand_id is unused.
+{
+   return 16;
+}
+
+inline const uint32_t get_operand_narrow_tile(const std::uint32_t operand_id)  // Always 0; operand_id is unused.
+{
+   return 0;
+}
+
+inline const uint32_t get_operand_tile_r_dim(const std::uint32_t operand_id)  // Fixed tile row dim; operand_id is unused.
+{
+   return 32;
+}
+
+inline const uint32_t get_operand_tile_c_dim(const std::uint32_t operand_id)  // Fixed tile column dim; operand_id is unused.
+{
+   return 32;
+}
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h
new file mode 100644
index 00000000000..bd010082bbd
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h
@@ -0,0 +1,61 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <cstdint>
+#include <vector>
+
+// Metal specific overrides -- No support for partial tiles so hard-code to fixed 32x32 sizes
+inline uint32_t get_output_id(uint32_t output)  // Maps an output to its id by subtracting OUTPUT_BASE.
+{
+   const uint32_t OUTPUT_BASE    = 0;  // With a base of 0 the subtraction below returns `output` unchanged.
+   return ((output) - OUTPUT_BASE);
+}
+
+inline const uint32_t get_output_base_id()  // Returns the constant output base id.
+{
+   const uint32_t OUTPUT_BASE_ID = 16;  // NOTE(review): differs from OUTPUT_BASE (0) in get_output_id -- confirm intended.
+   return (OUTPUT_BASE_ID);
+}
+
+inline const uint32_t get_output_src_format(const std::uint32_t output_id)  // Returns pack_src_format[output_id].
+{
+   return pack_src_format[output_id];  // No bounds check; pack_src_format must already be declared by the includer.
+}
+
+inline const uint32_t get_output_dst_format(const std::uint32_t output_id)  // Returns pack_dst_format[output_id].
+{
+   return pack_dst_format[output_id];  // No bounds check; pack_dst_format must already be declared by the includer.
+}
+
+//TODO: Do we need tile dim functions for GS?
+inline const uint32_t get_output_num_faces(const std::uint32_t output_id)  // Fixed face count; output_id is unused.
+{
+   return 4;
+}
+
+inline const uint32_t get_output_partial_face(const std::uint32_t output_id)  // Always 0; output_id is unused.
+{
+   return 0;
+}
+
+inline const uint32_t get_output_face_r_dim(const std::uint32_t output_id)  // Fixed face row dim; output_id is unused.
+{
+   return 16;
+}
+
+inline const uint32_t get_output_narrow_tile(const std::uint32_t output_id)  // Always 0; output_id is unused.
+{
+   return 0;
+}
+
+inline const uint32_t get_output_tile_r_dim(const std::uint32_t output_id)  // Fixed tile row dim; output_id is unused.
+{
+   return 32;
+}
+
+inline const uint32_t get_output_tile_c_dim(const std::uint32_t output_id)  // Fixed tile column dim; output_id is unused.
+{
+   return 32;
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
index f9359469e33..3dd7dbe114c 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h
@@ -19,4 +19,3 @@ extern uint32_t math_sync_tile_dst_index;
 
 extern uint32_t __local_mem_rodata_start_addr[];
 extern uint32_t __local_mem_rodata_end_addr[];
-extern uint32_t __firmware_start[];
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
index 29a2dbf9cfe..cf08580ad69 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+//TODO: This file should be deleted after fixing redefinition errors;
+// its functions should be moved to ckernel_globals.h.
 #pragma once
 
 #include <cstdint>
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
index 2b94607012d..ea113ce5fa0 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h
@@ -8,8 +8,7 @@
 
 inline uint32_t get_operand_id(uint32_t operand)
 {
-    const int OPERAND_BASE_ID = 0;
-    return (operand);
+   return (operand);
 }
 
 inline const uint32_t get_operand_src_format(const std::uint32_t operand_id)
diff --git a/tt_metal/include/compute_kernel_api/common_globals.h b/tt_metal/include/compute_kernel_api/common_globals.h
index 213859b1ae4..0b0eee877dc 100644
--- a/tt_metal/include/compute_kernel_api/common_globals.h
+++ b/tt_metal/include/compute_kernel_api/common_globals.h
@@ -31,6 +31,7 @@
 #endif
 
 #ifdef TRISC_UNPACK
+#include "llk_unpack_common_api.h"
 #define UNPACK(x) x
 #define MAIN unpack_main()
 #else
diff --git a/tt_metal/include/compute_kernel_api/unpack.h b/tt_metal/include/compute_kernel_api/unpack.h
index 2aaefe1d9d4..ba676d4b938 100644
--- a/tt_metal/include/compute_kernel_api/unpack.h
+++ b/tt_metal/include/compute_kernel_api/unpack.h
@@ -7,7 +7,7 @@
 
 #include "common_globals.h"
 
-
+//TODO: Should WHB0 functions be added here?
 namespace ckernel {
 
 /**

From 655fc5f729927d123e514580d4ee5cfad19b27b4 Mon Sep 17 00:00:00 2001
From: Reem Tawfik <rtawfik@tenstorrent.com>
Date: Thu, 7 Dec 2023 16:59:36 +0000
Subject: [PATCH 13/16] #3908: Fixes for llk lib compile/regressions: 	- Add
 separate llk api files for sfpu negative & mask 	- Move sfpu identity
 to metal api folder 	- Add llk_io files to erisc core compile

---
 .../grayskull/common/inc/ckernel_sfpu.h       |  36 -
 .../llk_lib/llk_math_eltwise_unary_sfpu.h     |  22 -
 .../metal/llk_api/llk_math_binary_api.h       |  75 --
 .../metal/llk_api/llk_math_binary_sfpu_api.h  |  59 --
 .../metal/llk_api/llk_math_common_api.h       |  89 --
 .../metal/llk_api/llk_math_matmul_api.h       |  57 --
 .../metal/llk_api/llk_math_reduce_api.h       |  17 -
 .../llk_api/llk_math_unary_datacopy_api.h     |  24 -
 .../metal/llk_api/llk_math_unary_sfpu_api.h   | 279 -------
 .../grayskull/metal/llk_api/llk_pack_api.h    | 286 -------
 .../metal/llk_api/llk_param_structs.h         |   1 -
 .../llk_api/llk_sfpu}/ckernel_sfpu_identity.h |   0
 .../llk_api/llk_sfpu/ckernel_sfpu_mask.h      |  34 +
 .../llk_api/llk_sfpu/ckernel_sfpu_negative.h  |  31 +
 ..._math_eltwise_unary_sfpu_common_includes.h | 150 ----
 .../llk_math_eltwise_unary_sfpu_identity.h    |   0
 .../llk_math_eltwise_unary_sfpu_mask.h        |  31 +
 .../llk_math_eltwise_unary_sfpu_negative.h    |  31 +
 .../llk_api/llk_sfpu/metal_ckernel_sfpu.h     | 762 ------------------
 .../metal/llk_api/llk_unpack_AB_api.h         |  74 --
 .../metal/llk_api/llk_unpack_AB_matmul_api.h  | 125 ---
 .../metal/llk_api/llk_unpack_A_api.h          |  78 --
 .../metal/llk_api/llk_unpack_common_api.h     | 117 ---
 .../metal/llk_api/llk_unpack_reduce_api.h     |  83 --
 .../metal/llk_api/llk_unpack_tilize_api.h     |  88 --
 .../metal/llk_api/llk_unpack_untilize_api.h   |  85 --
 .../metal/llk_api/llk_math_unary_sfpu_api.h   |  22 -
 .../llk_api/llk_sfpu}/ckernel_sfpu_identity.h |   0
 .../llk_api/llk_sfpu/ckernel_sfpu_mask.h      |  34 +
 .../llk_api/llk_sfpu/ckernel_sfpu_negative.h  |  31 +
 ..._math_eltwise_unary_sfpu_common_includes.h |   4 -
 .../llk_math_eltwise_unary_sfpu_identity.h    |   5 +-
 .../llk_math_eltwise_unary_sfpu_mask.h        |  31 +
 .../llk_math_eltwise_unary_sfpu_negative.h    |  30 +
 .../llk_math_eltwise_unary_sfpu_sqrt.h        |   1 -
 .../llk_api/llk_sfpu/metal_ckernel_sfpu.h     |  29 -
 .../eltwise_unary/negative.h                  |   2 +-
 tt_metal/include/compute_kernel_api/mask.h    |   2 +-
 tt_metal/jit_build/build.cpp                  |  18 +-
 39 files changed, 270 insertions(+), 2573 deletions(-)
 rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_identity.h (100%)
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
 rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_identity.h (100%)
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h
 create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h
 rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_identity.h (100%)
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
 rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_identity.h (88%)
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h
 create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h

diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu.h
index 80b27698ef9..10673511969 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu.h
@@ -244,20 +244,6 @@ inline void calculate_atan()
     }
 }
 
-
-template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_negative()
-{
-
-    for (int d = 0; d < ITERATIONS; d++)
-    {
-        vFloat val = dst_reg[0];
-        dst_reg[0] = -val;
-        dst_reg++;
-    }
-}
-
-
 template <bool APPROXIMATION_MODE, int ITERATIONS, int RECIPROCAL_ITERATIONS>
 inline void calculate_rsqrt()
 {
@@ -888,21 +874,6 @@ inline void calculate_silu()
     }
 }
 
-template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_mask()
-{
-    bool exponent_size_8 = true;
-    for (int d = 0; d < ITERATIONS; d++)
-    {
-        vFloat mask = dst_reg[16];
-        v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) {
-            dst_reg[0] = 0;
-        }
-        v_endif;
-        dst_reg++;
-    }
-}
-
 template <SfpuType operation, bool APPROXIMATION_MODE, int SfpuType_PARAM = 0, int ITERATIONS = 4>
 inline void calculate_sfpu(uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0)
 {
@@ -997,13 +968,6 @@ inline void calculate_sfpu(uint param0 = 0, uint param1 = 0, uint param2 = 0, ui
     else if constexpr (operation == SfpuType::silu) {
         calculate_silu<APPROXIMATION_MODE, ITERATIONS>();
     }
-    else if constexpr (operation == SfpuType::mask) {
-        calculate_mask<APPROXIMATION_MODE, ITERATIONS>();
-    }
-    else if constexpr (operation == SfpuType::negative) {
-        calculate_negative<APPROXIMATION_MODE, ITERATIONS>();
-    }
-
     //erf, erfc are dispatched directly.
 }
 
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu.h
index bdc159d0d08..91b2e60d506 100644
--- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu.h
+++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu.h
@@ -366,26 +366,4 @@ inline void llk_math_eltwise_unary_sfpu_silu_init() {
     llk_math_eltwise_unary_sfpu_init<SfpuType::silu, APPROXIMATE>();
 }
 
-//Mask
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::mask, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_mask_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::mask, APPROXIMATE>();
-}
-
-// Negative
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::negative, APPROXIMATE, dst_sync>(dst_index,vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_negative_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::negative, APPROXIMATE>();
-}
-
 }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h
index 317c14707ca..0dd9613dfe0 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h
@@ -9,78 +9,3 @@
 // /*************************************************************************
 //  * LLK ELTWISE BINARY
 //  *************************************************************************/
-
-// // Version with no operand
-// template <
-//     EltwiseBinaryType eltwise_binary_type,
-//     BroadcastType src_b_bcast_type,
-//     int NUM_FIDELITY_PHASES = 0,
-//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-// inline void llk_math_eltwise_binary_init(const std::uint32_t transpose = 0, const std::uint32_t acc_to_dest = 0) {
-//     const std::uint32_t num_faces = 4;
-
-//     _llk_math_eltwise_binary_init_<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(
-//         num_faces, transpose, acc_to_dest);
-// }
-
-// // Version with operands
-// template <
-//     EltwiseBinaryType eltwise_binary_type,
-//     BroadcastType src_b_bcast_type,
-//     int NUM_FIDELITY_PHASES = 0,
-//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-// inline void llk_math_eltwise_binary_init_with_operands(
-//     const std::uint32_t operand_A,
-//     const std::uint32_t operand_B,
-//     const std::uint32_t transpose = 0,
-//     const std::uint32_t acc_to_dest = 0) {
-//     const std::uint32_t operand_id =
-//         get_operand_id(operand_A);  // operand_id is used to extract tile dim data which is the same for both operands
-//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
-
-//     _llk_math_eltwise_binary_init_<eltwise_binary_type, src_b_bcast_type, NUM_FIDELITY_PHASES, binary_reuse_dest>(
-//         num_faces, transpose, acc_to_dest);
-// }
-
-// template <
-//     EltwiseBinaryType eltwise_binary_type,
-//     BroadcastType src_b_bcast_type,
-//     DstSync Dst = DstSync::SyncFull,
-//     int NUM_FIDELITY_PHASES = 0,
-//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
-//     bool is_fp32_dest_acc_en = false>
-// inline void llk_math_eltwise_binary(uint dst_index, const bool clear_fp32_dst_acc = true) {
-//     const std::uint32_t num_faces = 4;
-
-//     _llk_math_eltwise_binary_<
-//         eltwise_binary_type,
-//         src_b_bcast_type,
-//         Dst,
-//         NUM_FIDELITY_PHASES,
-//         binary_reuse_dest,
-//         is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc);
-// }
-
-// template <
-//     EltwiseBinaryType eltwise_binary_type,
-//     BroadcastType src_b_bcast_type,
-//     DstSync Dst = DstSync::SyncFull,
-//     int NUM_FIDELITY_PHASES = 0,
-//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
-//     bool is_fp32_dest_acc_en = false>
-// inline void llk_math_eltwise_binary(
-//     const std::uint32_t operand_A,
-//     const std::uint32_t operand_B,
-//     uint dst_index,
-//     const bool clear_fp32_dst_acc = true) {
-//     const std::uint32_t operand_id = get_operand_id(operand_A);  // both operands must have same number of faces
-//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
-
-//     _llk_math_eltwise_binary_<
-//         eltwise_binary_type,
-//         src_b_bcast_type,
-//         Dst,
-//         NUM_FIDELITY_PHASES,
-//         binary_reuse_dest,
-//         is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc);
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h
index 21c3e8ae428..41ba7fc4b73 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h
@@ -9,62 +9,3 @@
 // /*************************************************************************
 //  * LLK ELTWISE BINARY SFPU
 //  *************************************************************************/
-
-// template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
-// inline void llk_math_eltwise_binary_sfpu(
-//     const uint operand,
-//     uint dst_index_a,
-//     uint dst_index_b,
-//     int vector_mode = (int)Dim::RC,
-//     uint param0 = 0,
-//     uint param1 = 0,
-//     uint param2 = 0,
-//     uint param3 = 0,
-//     uint param4 = 0,
-//     uint param5 = 0) {
-//     const std::uint32_t operand_id = get_operand_id(0);
-//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
-//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
-
-//     _llk_math_eltwise_binary_sfpu_<sfpu_op, APPROXIMATE, Dst>(
-//         face_r_dim, num_faces, dst_index_a, dst_index_b, vector_mode, param0, param1, param2, param3, param4, param5);
-// }
-
-// template <SfpuType sfpu_op, bool APPROXIMATE>
-// inline void llk_math_eltwise_binary_sfpu_init(
-//     uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) {
-//     _llk_math_eltwise_binary_sfpu_init_<sfpu_op, APPROXIMATE>(param0, param1, param2, param3, param4, param5);
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_binary_sfpu_quant_int32(
-//     uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
-//     llk_math_eltwise_binary_sfpu<SfpuType::quant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_binary_sfpu_quant_int32_init(const uint zero_point) {
-//     llk_math_eltwise_binary_sfpu_init<SfpuType::quant_int32, APPROXIMATE>(zero_point);
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_binary_sfpu_requant_int32(
-//     uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
-//     llk_math_eltwise_binary_sfpu<SfpuType::requant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_binary_sfpu_requant_int32_init(const uint zero_point) {
-//     llk_math_eltwise_binary_sfpu_init<SfpuType::requant_int32, APPROXIMATE>(zero_point);
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_binary_sfpu_dequant_int32(
-//     uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) {
-//     llk_math_eltwise_binary_sfpu<SfpuType::dequant_int32, APPROXIMATE, dst_sync>(dst_index_a, dst_index_b, vector_mode);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_binary_sfpu_dequant_int32_init(const uint zero_point) {
-//     llk_math_eltwise_binary_sfpu_init<SfpuType::dequant_int32, APPROXIMATE>(zero_point);
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h
index 3da220f0cba..f56234c0a0e 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h
@@ -14,95 +14,6 @@
 #include "llk_operands.h"
 #include "llk_param_structs.h"
 
-// // Need to revisit why we even need this
-// #define EPS 1.19209e-07  // std::numeric_limits::epsilon() for FP32
-
 // /*************************************************************************
 //  * LLK MATH COMMON
 //  *************************************************************************/
-
-// template <DstSync Dst>
-// inline void llk_math_wait_for_dest_available() {
-//     _llk_math_wait_for_dest_available_<Dst>();
-// }
-
-// template <DstSync Dst = SyncFull, bool is_fp32_dest_acc_en = false>
-// inline void llk_math_dest_section_done() {
-//     _llk_math_dest_section_done_<Dst, is_fp32_dest_acc_en>();
-// }
-
-// template <DstSync Dst, bool is_fp32_dest_acc_en = false>
-// inline void llk_math_pack_sync_init() {
-//     _llk_math_pack_sync_init_<Dst, is_fp32_dest_acc_en>();
-// }
-
-// template <bool mail2math = true, bool mail2pack = true>
-// inline void llk_math_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) {
-//     _llk_math_get_tile_<mail2math, mail2pack>(tile_index, p_tile);
-// }
-
-// template <bool mail2math = true, bool mail2pack = true>
-// inline void llk_math_release_tile(std::uint32_t operand) {
-//     _llk_math_release_tile_<mail2math, mail2pack>();
-// }
-
-// inline void llk_math_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { _llk_math_debug_dump_(data, byte_size); }
-
-// inline void llk_math_debug_dump_seek(std::uint8_t offset) { _llk_math_debug_dump_seek_(offset); }
-
-// inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_new_operand) {
-//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
-//     _llk_math_reconfig_data_format_srca_(unpack_dst_format[new_srca_operand_id]);
-// }
-
-// inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) {
-//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
-//     _llk_math_reconfig_data_format_srcb_(unpack_dst_format[new_srcb_operand_id]);
-// }
-
-// inline void llk_math_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) {
-//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
-//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
-
-//     _llk_math_reconfig_data_format_(unpack_dst_format[new_srca_operand_id], unpack_dst_format[new_srcb_operand_id]);
-// }
-
-// inline void llk_math_reconfig_data_format(
-//     const std::uint32_t srca_old_operand,
-//     const std::uint32_t srca_new_operand,
-//     const std::uint32_t srcb_old_operand,
-//     const std::uint32_t srcb_new_operand) {
-//     std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
-//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
-//     std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
-//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
-
-//     if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id]) &&
-//         (unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) {
-//         llk_math_reconfig_data_format(srca_new_operand, srcb_new_operand);
-//     } else if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) {
-//         llk_math_reconfig_data_format_srca(srca_new_operand);
-//     } else if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) {
-//         llk_math_reconfig_data_format_srcb(srcb_new_operand);
-//     }
-// }
-
-// inline void llk_math_reconfig_data_format_srca(
-//     const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) {
-//     std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
-//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
-
-//     if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) {
-//         llk_math_reconfig_data_format_srca(srca_new_operand);
-//     }
-// }
-
-// inline void llk_math_reconfig_data_format_srcb(
-//     const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
-//     std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
-//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
-
-//     if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) {
-//         llk_math_reconfig_data_format_srcb(srcb_new_operand);
-//     }
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h
index a12bcca1ef4..70d2109196b 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h
@@ -9,60 +9,3 @@
 // /*************************************************************************
 //  * LLK MATMUL
 //  *************************************************************************/
-
-// template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout = DstTileFaceLayout::ColMajor>
-// inline void llk_math_matmul_init(
-//     const std::uint32_t operandA,
-//     const std::uint32_t operandB,
-//     const std::uint32_t transpose = 0,
-//     const std::uint32_t ct_dim = 1,
-//     const std::uint32_t rt_dim = 1,
-//     const std::uint32_t kt_dim = 1) {
-//     const std::uint32_t in0_id = get_operand_id(operandA);
-//     const std::uint32_t in1_id = get_operand_id(operandB);
-
-//     const bool partial_face = get_operand_partial_face(in0_id);
-
-//     const std::uint32_t in0_tile_r_dim = get_operand_tile_r_dim(in0_id);
-//     const std::uint32_t in0_tile_c_dim = get_operand_tile_c_dim(in0_id);
-//     const std::uint32_t in1_tile_r_dim = get_operand_tile_r_dim(in1_id);
-//     const std::uint32_t in1_tile_c_dim = get_operand_tile_c_dim(in1_id);
-
-// #ifdef ARCH_GRAYSKULL
-//     _llk_math_matmul_init_<NUM_FIDELITY_PHASES, FaceLayout>(
-//         in0_tile_r_dim,
-//         in0_tile_c_dim,
-//         in1_tile_r_dim,
-//         in1_tile_c_dim,
-//         partial_face,
-//         transpose,
-//         ct_dim,
-//         rt_dim,
-//         kt_dim);
-// #else
-//     _llk_math_matmul_init_<NUM_FIDELITY_PHASES, DstTileFaceLayout::RowMajor>(
-//         in0_tile_r_dim,
-//         in0_tile_c_dim,
-//         in1_tile_r_dim,
-//         in1_tile_c_dim,
-//         partial_face,
-//         transpose,
-//         ct_dim,
-//         rt_dim,
-//         kt_dim);
-// #endif
-// }
-
-// template <int NUM_FIDELITY_PHASES, DstTileFaceLayout FaceLayout = DstTileFaceLayout::ColMajor>
-// inline void llk_math_matmul(
-//     uint dst_index,
-//     const bool transpose = false,
-//     const std::uint32_t ct_dim = 1,
-//     const std::uint32_t rt_dim = 1,
-//     const std::uint32_t kt_dim = 1) {
-// #ifdef ARCH_GRAYSKULL
-//     _llk_math_matmul_<NUM_FIDELITY_PHASES, FaceLayout>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
-// #else
-//     _llk_math_matmul_<NUM_FIDELITY_PHASES, DstTileFaceLayout::RowMajor>(dst_index, transpose, ct_dim, rt_dim, kt_dim);
-// #endif
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h
index c5f11d005f2..be0284f144d 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h
@@ -9,20 +9,3 @@
 // /*************************************************************************
 //  * LLK REDUCE
 //  *************************************************************************/
-
-// template <
-//     PoolType type,
-//     ReduceDim dim,
-//     int num_fidelity_phases = 0,
-//     bool is_fp32_dest_acc_en = false,
-//     bool is_int_fpu_en = false>
-// inline void llk_math_reduce(const uint dst_index) {
-//     _llk_math_reduce_<type, dim, num_fidelity_phases, is_fp32_dest_acc_en, is_int_fpu_en>(dst_index);
-// }
-
-// template <PoolType type, ReduceDim dim, int num_fidelity_phases = 0>
-// inline void llk_math_reduce_init(
-//     const std::uint32_t within_face_16x16_transpose =
-//         0) {  // within_face_16x16_transpose used for unpack, ignored by math
-//     _llk_math_reduce_init_<type, dim, num_fidelity_phases>(within_face_16x16_transpose);
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h
index ca2a5d39e40..33ec73901a8 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h
@@ -10,27 +10,3 @@
 // /*************************************************************************
 //  * LLK ELTWISE UNARY DATACOPY
 //  *************************************************************************/
-
-// template <
-//     DataCopyType type,
-//     BroadcastType src_b_bcast_type = BroadcastType::NONE,
-//     DstSync Dst = DstSync::SyncFull,
-//     bool is_fp32_dest_acc_en = false,
-//     bool unpack_to_dest = false>
-// inline void llk_math_eltwise_unary_datacopy(uint dst_index, uint operand = 0) {
-//     const std::uint32_t operand_id = get_operand_id(0);
-//     _llk_math_eltwise_unary_datacopy_<type, src_b_bcast_type, Dst, is_fp32_dest_acc_en, unpack_to_dest>(
-//         dst_index, unpack_src_format[operand_id], unpack_dst_format[operand_id]);
-// }
-
-// template <DataCopyType type, BroadcastType src_b_bcast_type = BroadcastType::NONE>
-// // within_face_16x16_transpose is used by unpacker, math does not transpose
-// inline void llk_math_eltwise_unary_datacopy_init(
-//     const std::uint32_t transpose_of_faces = 0 /*unused*/,
-//     const std::uint32_t within_face_16x16_transpose = 0 /* unused */,
-//     const std::uint32_t operand = 0) {
-//     const std::uint32_t operand_id = get_operand_id(0);
-//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
-//     _llk_math_eltwise_unary_datacopy_init_<type, src_b_bcast_type>(
-//         transpose_of_faces, within_face_16x16_transpose, num_faces);
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h
index 53b9d1afe8b..0972e48ebb0 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h
@@ -12,282 +12,3 @@
 // /*************************************************************************
 // * LLK ELTWISE UNARY SFPU
 // *************************************************************************/
-
-// // New LLK SFPU APIs
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::rsqrt, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_rsqrt_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::rsqrt, APPROXIMATE>();
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::log, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_log_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::log, APPROXIMATE>();
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_log_with_base(uint dst_index,uint base_scale) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::log_with_base, APPROXIMATE, dst_sync>(dst_index,base_scale);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_log_with_base_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::log_with_base, APPROXIMATE>();
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::tanh, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_signbit(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::signbit, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_signbit_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::signbit, APPROXIMATE>();
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_tanh_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::tanh, APPROXIMATE>();
-// }
-
-// //sign
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_sign(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::sign, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_sign_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::sign, APPROXIMATE>();
-// }
-// template <DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_dropout(uint dst_index, int vector_mode, int integer_dropout, int scale_factor) {
-//     constexpr bool dont_care = false;
-//     llk_math_eltwise_unary_sfpu<SfpuType::dropout, dont_care, dst_sync>(dst_index, vector_mode, integer_dropout, scale_factor);
-// }
-
-// inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) {
-//     constexpr bool dont_care = false;
-//     constexpr uint dont_care_param = 0;
-
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::dropout, dont_care>(dont_care_param, dont_care_param, seed);
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::sigmoid, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_sigmoid_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::sigmoid, APPROXIMATE>();
-// }
-
-// //EQZ
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_eqz(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::equal_zero, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_eqz_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::equal_zero, APPROXIMATE>();
-// }
-
-// //NEZ
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_nez(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::not_equal_zero, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_nez_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::not_equal_zero, APPROXIMATE>();
-// }
-
-// //LTZ
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_ltz(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::less_than_zero, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_ltz_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::less_than_zero, APPROXIMATE>();
-// }
-
-// //GTZ
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_gtz(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::greater_than_zero, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_gtz_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::greater_than_zero, APPROXIMATE>();
-// }
-
-// //LEZ
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_lez(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::less_than_equal_zero, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_lez_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::less_than_equal_zero, APPROXIMATE>();
-// }
-
-// //GEZ
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_gez(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::greater_than_equal_zero, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_gez_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::greater_than_equal_zero, APPROXIMATE>();
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::max, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_max_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::max, APPROXIMATE>();
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::square, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_square_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::square, APPROXIMATE>();
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = Dim::RC) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::power, APPROXIMATE, dst_sync>(dst_index, vector_mode, pow);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_power_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::power, APPROXIMATE>();
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::abs, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_abs_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::abs, APPROXIMATE>();
-// }
-
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::cast_fp32_to_fp16a, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::cast_fp32_to_fp16a, APPROXIMATE>();
-// }
-
-// //EXP2
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_exp2(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::exp2, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_exp2_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::exp2, APPROXIMATE>();
-// }
-
-// //heaviside
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::heaviside, APPROXIMATE, dst_sync>(dst_index,vector_mode,param0);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_heaviside_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::heaviside, APPROXIMATE>();
-// }
-
-// //EXPM1
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_expm1(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::expm1, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_expm1_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::expm1, APPROXIMATE>();
-// }
-
-// //Asin
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_asin(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::asin, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_asin_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::asin, APPROXIMATE>();
-// }
-
-// //Atan
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_atan(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::atan, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_atan_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::atan, APPROXIMATE>();
-// }
-
-// //Acos
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_acos(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::acos, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_acos_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::acos, APPROXIMATE>();
-// }
-
-// //silu
-// template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-// inline void llk_math_eltwise_unary_sfpu_silu(uint dst_index) {
-//     llk_math_eltwise_unary_sfpu<SfpuType::silu, APPROXIMATE, dst_sync>(dst_index);
-// }
-
-// template <bool APPROXIMATE>
-// inline void llk_math_eltwise_unary_sfpu_silu_init() {
-//     llk_math_eltwise_unary_sfpu_init<SfpuType::silu, APPROXIMATE>();
-// }
-
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h
index 37ee8a0fe56..b648be30f3c 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h
@@ -20,289 +20,3 @@
 /*************************************************************************
 * LLK PACK
 *************************************************************************/
-
-// template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
-// inline void llk_pack_mop_config(const uint32_t output) {
-
-//     const std::uint32_t output_id = get_output_id(output);
-//     const std::uint32_t num_faces = get_output_num_faces(output_id);
-//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
-//     const bool partial_face = get_output_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]);
-//     const bool narrow_tile = get_output_narrow_tile(output_id);
-
-//     _llk_pack_mop_config_<untilize, zero_output, FaceLayout, false>(
-//         pack_dst_format[output_id],
-//         face_r_dim,
-//         num_faces,
-//         partial_face,
-//         narrow_tile
-//     );
-// }
-
-// template <bool untilize = false, bool is_fp32_dest_acc_en = false>
-// inline void llk_pack_hw_configure(const llk_pack_params_t *pack_params) {
-
-//     const std::uint32_t output_id = get_output_id(pack_params->pack_output);
-//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
-//     const std::uint32_t num_faces = get_output_num_faces(output_id);
-//     const bool partial_face = get_output_partial_face(output_id);
-//     const bool narrow_tile = get_output_narrow_tile(output_id);
-
-//     const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size;
-
-//     _llk_pack_hw_configure_<untilize, is_fp32_dest_acc_en>(
-//         pack_src_format[output_id],
-//         pack_dst_format[output_id],
-//         tile_size,
-//         face_r_dim,
-//         num_faces,
-//         partial_face,
-//         narrow_tile,
-//         pack_params->relu_config.val
-//     );
-// }
-
-// template <bool untilize = false, bool is_fp32_dest_acc_en = false, ReluType relu_type = ReluType::NO_RELU, std::uint32_t relu_threshold = 0>
-// inline void llk_pack_hw_configure_disaggregated(std::uint32_t pack_output) {
-//     llk_pack_params_t llk_pack_params = {
-//         .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold,}}};
-//     llk_pack_hw_configure<untilize, is_fp32_dest_acc_en>(&llk_pack_params);
-// }
-
-// template <bool untilize = false, PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false>
-// inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) {
-//     const std::uint32_t output_id = get_output_id(pack_params->pack_output);
-//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
-//     const std::uint32_t num_faces = get_output_num_faces(output_id);
-//     const bool partial_face = get_output_partial_face(output_id);
-//     const bool narrow_tile = get_output_narrow_tile(output_id);
-
-//     const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size;
-
-//     _llk_pack_reduce_hw_configure_<untilize, type, dim, is_fp32_dest_acc_en>(
-//         pack_src_format[output_id],
-//         pack_dst_format[output_id],
-//         tile_size,
-//         face_r_dim,
-//         num_faces,
-//         partial_face,
-//         narrow_tile,
-//         pack_params->relu_config.val
-//     );
-// }
-
-// template <bool untilize = false, PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, ReluType relu_type = ReluType::NO_RELU, std::uint32_t relu_threshold = 0>
-// inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output) {
-//     llk_pack_params_t llk_pack_params = {
-//         .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}};
-//     llk_pack_reduce_hw_configure<untilize, type, dim, is_fp32_dest_acc_en>(&llk_pack_params);
-// }
-
-// template <bool untilize = false, bool zero_output = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
-// inline void llk_pack_init(const std::uint32_t pack_output = 16) {
-
-//     const std::uint32_t output_id = get_output_id(pack_output);
-//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
-//     const std::uint32_t num_faces = get_output_num_faces(output_id);
-//     const bool partial_face = get_output_partial_face(output_id);
-//     const bool narrow_tile = get_output_narrow_tile(output_id);
-
-//     _llk_pack_init_<untilize, zero_output, FaceLayout, false>(
-//         pack_dst_format[output_id],
-//         face_r_dim,
-//         num_faces,
-//         partial_face,
-//         narrow_tile
-//     );
-// }
-
-// template <bool out_of_order_output, bool untilize>
-// inline std::uint32_t get_output_tile_address(std::uint8_t output_id, std::uint32_t output_tile_index) {
-
-//     std::uint32_t pack_tile_addr;
-//     if constexpr (out_of_order_output) {
-//         pack_tile_addr = cb_interface[output_id].fifo_wr_ptr +
-//                         (std::uint32_t)(cb_interface[output_id].fifo_page_size)*output_tile_index - 1;
-//     } else {
-//         if constexpr (untilize) {
-//             // FIXME: Need to support pack-untilize?
-//             // std::uint16_t out_tile_index = (cb_interface[output_id].ublock_tile_cnt/cb_interface[output_id].ublock_ct)*cb_interface[output_id].row_tile_dim +
-//             //                                 cb_interface[output_id].ublock_tile_cnt%cb_interface[output_id].ublock_ct; //FIXME: optimize perf
-//             // pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1;
-//             // pack_tile_addr += out_tile_index*(std::uint32_t)(cb_interface[output_id].fifo_page_size);
-
-//             // cb_interface[output_id].ublock_tile_cnt++;
-
-//             // if (cb_interface[output_id].ublock_tile_cnt == cb_interface[output_id].ublock_tile_dim) {
-//             //    cb_interface[output_id].ublock_tile_cnt=0;
-//             //    cb_interface[output_id].fifo_wr_tile_ptr += (std::uint32_t)(cb_interface[output_id].fifo_page_size)*cb_interface[output_id].ublock_ct;
-//             // }
-//         } else {
-//             pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1;
-//             cb_interface[output_id].fifo_wr_tile_ptr += cb_interface[output_id].fifo_page_size;
-//         }
-//     }
-//     return pack_tile_addr;
-// }
-
-// template <bool out_of_order_output = false, DstSync Dst = SyncFull, bool untilize = false, bool is_fp32_dest_acc_en = false>
-// inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0) {
-//     std::uint8_t output_id = get_output_id(output);
-
-//     static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!");
-
-//     std::uint32_t pack_tile_addr = get_output_tile_address<out_of_order_output, untilize>(output_id, output_tile_index);
-
-//     _llk_pack_<Dst, untilize, is_fp32_dest_acc_en>(
-//         tile_index,
-//         pack_tile_addr
-//     );
-// }
-
-// /*************************************************************************
-// * LLK PACK COMMON
-// *************************************************************************/
-
-
-// inline void llk_packer_wait_for_math_done() {
-//     _llk_packer_wait_for_math_done_();
-// }
-
-// template <uint WaitRes = p_stall::NONE>
-// inline void llk_packer_set_math_semaphore() {
-//     _llk_packer_set_math_semaphore_<WaitRes>();
-// }
-
-// template <DstSync Dst, bool is_fp32_dest_acc_en = false>
-// inline void llk_pack_dest_section_done() {
-//     _llk_pack_dest_section_done_<Dst, is_fp32_dest_acc_en>();
-// }
-
-// template <DstSync Dst, DstTileFaceLayout FaceLayout, bool untilize = false>
-// inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 16) {
-//     const std::uint32_t output_id = get_output_id(pack_output);
-//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
-//     const bool narrow_tile = get_output_narrow_tile(output_id);
-
-//     _llk_init_packer_dest_offset_registers_<Dst, FaceLayout, untilize>(
-//         face_r_dim,
-//         narrow_tile
-//     );
-// }
-
-// template <DstSync Dst, DstTileFaceLayout FaceLayout = RowMajor, bool untilize = false, bool is_fp32_dest_acc_en = false>
-// inline void llk_pack_dest_init(const std::uint32_t pack_output = 16) {
-
-//     const std::uint32_t output_id = get_output_id(pack_output);
-//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
-//     const bool narrow_tile = get_output_narrow_tile(output_id);
-
-//     _llk_pack_dest_init_<Dst, FaceLayout, untilize, is_fp32_dest_acc_en>(
-//         face_r_dim,
-//         narrow_tile
-//     );
-// }
-
-// template <bool mail2math=true, bool mail2pack=true>
-// inline void llk_pack_get_tile(std::uint32_t output, std::uint32_t tile_index, std::uint32_t *p_tile) {
-//     _llk_pack_get_tile_<mail2math, mail2pack>(tile_index, p_tile);
-// }
-
-// template <bool mail2math=true, bool mail2pack=true>
-// inline void llk_pack_release_tile(std::uint32_t output) {
-//     _llk_pack_release_tile_<mail2math, mail2pack>();
-// }
-
-// inline void llk_pack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {
-//     _llk_pack_debug_dump_(data, byte_size);
-// }
-
-// inline void llk_pack_debug_dump_seek(std::uint8_t offset) {
-//     _llk_pack_debug_dump_seek_(offset);
-// }
-
-// template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
-// inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) {
-
-//     const std::uint32_t output_id = get_output_id(new_output);
-//     const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
-//     const std::uint32_t num_faces = get_output_num_faces(output_id);
-//     const bool partial_face = get_output_partial_face(output_id);
-//     const bool narrow_tile = get_output_narrow_tile(output_id);
-
-//     _llk_pack_reconfig_data_format_<is_fp32_dest_acc_en, is_tile_dim_reconfig_en, FaceLayout>(
-//         pack_src_format[output_id],
-//         pack_dst_format[output_id],
-//         cb_interface[output_id].fifo_page_size,
-//         face_r_dim,
-//         num_faces,
-//         partial_face,
-//         narrow_tile
-//     );
-// }
-
-// template <bool is_fp32_dest_acc_en = false, bool is_tile_dim_reconfig_en = false, DstTileFaceLayout FaceLayout = DstTileFaceLayout::RowMajor>
-// inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) {
-//     std::uint32_t old_output_id = get_output_id(old_output);
-//     std::uint32_t new_output_id = get_output_id(new_output);
-
-//     if((pack_dst_format[old_output_id] != pack_dst_format[new_output_id])
-//        && (pack_dst_format[old_output_id] != (uint)DataFormat::Invalid)
-//        && (pack_dst_format[new_output_id] != (uint)DataFormat::Invalid)) {
-//         llk_pack_reconfig_data_format<is_fp32_dest_acc_en, is_tile_dim_reconfig_en, FaceLayout>(new_output);
-//     } else if constexpr (is_tile_dim_reconfig_en) {
-//         // Same format but different tile dims
-//         llk_pack_mop_config<false, false, FaceLayout, false>(new_output);
-//     }
-// }
-
-// TT_ALWAYS_INLINE void llk_pack_relu_config(const std::uint32_t config) {
-//     _llk_pack_relu_config_(config);
-// }
-
-// inline void llk_pack_reconfig_l1_acc(const std::uint32_t enable) {
-//     _llk_pack_reconfig_l1_acc_(enable);
-// }
-
-// template <bool untilize = false, ReduceDim dim>
-// inline void llk_pack_reduce_mask_config() {
-//     _llk_pack_reduce_mask_config_<untilize, dim>();
-// }
-
-// inline void llk_pack_reduce_mask_clear() {
-//     _llk_pack_reduce_mask_clear_();
-// }
-
-// // FIXME-WH-UPLIFT
-// template <ReduceDim dim, bool at_kernel_start = false, bool revert=false, bool is_fp32_dest_acc_en = false>
-// inline void llk_pack_reduce_config_v2(uint32_t icb_out) {
-
-//     const bool untilize = false;
-//     if constexpr (at_kernel_start) {
-
-//         const std::uint32_t output_id = get_output_id(icb_out);
-//         const std::uint32_t face_r_dim = get_output_face_r_dim(output_id);
-//         const std::uint32_t num_faces = get_output_num_faces(output_id);
-//         const bool partial_face = get_output_partial_face(output_id);
-//         const bool narrow_tile = get_output_narrow_tile(output_id);
-//         const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size;
-//         const llk_relu_config_u relu_config = {.f = {.ApplyRelu = (std::uint32_t)ReluType::NO_RELU, .Threshold = 0,}};
-
-//         _llk_pack_hw_configure_<untilize, is_fp32_dest_acc_en>(
-//             pack_src_format[output_id],
-//             pack_dst_format[output_id],
-//             tile_size,
-//             face_r_dim,
-//             num_faces,
-//             partial_face,
-//             narrow_tile,
-//             relu_config.val
-//         );
-//     }
-
-//     if constexpr (revert) {
-//         _llk_pack_reduce_mask_clear_();
-//     } else {
-//         _llk_pack_reduce_mask_config_<untilize, dim>();
-//     }
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h
index 83f94387efa..62d59b90afe 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h
@@ -80,7 +80,6 @@ struct llk_pack_params_t {
     llk_relu_config_u relu_config;
 };
 
-// TODO: nsmith move this to a common place where the hlk can include it
 struct hlk_pack_shifted_params_t {
     std::uint32_t pack_output;
     llk_relu_config_u relu_config;
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_identity.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_identity.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_identity.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_identity.h
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
new file mode 100644
index 00000000000..302cb97c934
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
@@ -0,0 +1,34 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel {
+namespace sfpu {
+// Runs ITERATIONS passes; each pass reads dst_reg[16] as a mask, writes 0 to dst_reg[0] where
+// sfpu_is_fp16_zero(mask, ...) holds, then advances dst_reg. APPROXIMATION_MODE is unused in this body.
+template <bool APPROXIMATION_MODE, int ITERATIONS=4>
+inline void calculate_mask()
+{
+    bool exponent_size_8 = true;  // fixed to true and forwarded unchanged to sfpu_is_fp16_zero
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat mask = dst_reg[16];  // NOTE(review): offset 16 presumably addresses the mask operand relative to the data at dst_reg[0] -- confirm
+        v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) {
+            dst_reg[0] = 0;
+        }
+        v_endif;
+        dst_reg++;  // step to the next position before the following pass
+    }
+}
+}  // namespace sfpu
+}  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
new file mode 100644
index 00000000000..fd9cfef2da6
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
@@ -0,0 +1,31 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel {
+namespace sfpu {
+// Runs ITERATIONS passes; each pass replaces dst_reg[0] with its negation and then advances dst_reg.
+// Takes no runtime arguments; APPROXIMATION_MODE is not referenced in the body.
+template <bool APPROXIMATION_MODE, int ITERATIONS=4>
+inline void calculate_negative()
+{
+
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];  // load the current value
+        dst_reg[0] = -val;  // store it back negated
+        dst_reg++;
+    }
+}
+}  // namespace sfpu
+}  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
index 83a5fdcca92..fd920521909 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
@@ -16,156 +16,6 @@
 #include "llk_param_structs.h"
 #include "llk_math_eltwise_unary_sfpu.h"
 
-//TODO: Fix for GS uplift
-
-// using namespace ckernel;
-// using namespace ckernel::sfpu;
-// namespace ckernel {
-
 // /*************************************************************************
 //  * LLK ELTWISE UNARY SFPU
 //  *************************************************************************/
-
-// template <
-//     SfpuType operation,
-//     bool APPROXIMATION_MODE,
-//     int SfpuType_PARAM = 0,
-//     int ITERATIONS = 8,
-//     bool IS_INT_SFPU_EN = false>
-// inline void llk_math_calculate_sfpu(
-//     const int iterations = ITERATIONS,
-//     uint param0 = 0,
-//     uint param1 = 0,
-//     uint param2 = 0,
-//     uint param3 = 0,
-//     uint param4 = 0,
-//     uint param5 = 0) {
-//     if constexpr (operation == SfpuType::exp_with_base) {
-//         constexpr bool zero_negative = true;
-//         _calculate_exponential_<APPROXIMATION_MODE, zero_negative, true, ITERATIONS>(iterations, param0);
-//     } else if constexpr (operation == SfpuType::tanh) {
-//         _calculate_tanh_<APPROXIMATION_MODE, ITERATIONS>(iterations);
-//     } else if constexpr (operation == SfpuType::hardtanh) {
-//         _calculate_hardtanh_<APPROXIMATION_MODE, ITERATIONS>(iterations, param0, param1, param2);
-//     } else if constexpr (operation == SfpuType::rsqrt) {
-//         // param0 = true -> approximate fast mode
-//         //          false -> high precision mode
-//         //  The algorithm uses Newton's method based on no.of iteration better approximation can be calculated
-//         if (param0) {
-//             calculate_rsqrt<true, ITERATIONS, 10>();
-//         } else {
-//             calculate_rsqrt<false, ITERATIONS, 25>();
-//         }
-//     } else if constexpr (operation == SfpuType::sigmoid) {
-//         calculate_sigmoid<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::sigmoid_appx) {
-//         calculate_sigmoid_appx<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::tanh_derivative) {
-//         calculate_tanh_derivative<APPROXIMATION_MODE, SfpuType_PARAM, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::dropout) {
-//         calculate_dropout<APPROXIMATION_MODE, ITERATIONS>(param0, param1);
-//     } else if constexpr (operation == SfpuType::power) {
-//         calculate_power_iterative<APPROXIMATION_MODE, ITERATIONS>(param0);
-//     } else if constexpr (operation == SfpuType::square) {
-//         calculate_square<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::log) {
-//         calculate_log<APPROXIMATION_MODE, false, ITERATIONS>(param0);
-//     } else if constexpr (operation == SfpuType::log_with_base) {
-//         calculate_log<APPROXIMATION_MODE, true, ITERATIONS>(param0);
-//     } else if constexpr (
-//         (operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) ||
-//         (operation == SfpuType::less_than_zero) || (operation == SfpuType::greater_than_equal_zero) ||
-//         (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) {
-//         calculate_comp<APPROXIMATION_MODE, operation, ITERATIONS>(8);  // BFLOAT16 - exp
-//     } else if constexpr (operation == SfpuType::clamp) {
-//         calculate_clamp<APPROXIMATION_MODE, ITERATIONS>(param0, param1, param2);
-//     } else if constexpr (operation == SfpuType::abs) {
-//         calculate_abs<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::sign) {
-//         calculate_sign<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::max) {
-//         calculate_max<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::min) {
-//         calculate_min<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::exp2) {
-//         calculate_exp2<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::heaviside) {
-//         calculate_heaviside<APPROXIMATION_MODE, ITERATIONS>(param0);
-//     } else if constexpr (operation == SfpuType::expm1) {
-//         calculate_expm1<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::asin) {
-//         calculate_asin<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::acos) {
-//         calculate_acos<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::atan) {
-//         calculate_atan<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::signbit) {
-//         calculate_signbit<APPROXIMATION_MODE, ITERATIONS>();
-//     } else if constexpr (operation == SfpuType::silu) {
-//         calculate_silu<APPROXIMATION_MODE, ITERATIONS>();
-//     }
-// }
-
-// template <SfpuType sfpu_op, bool APPROXIMATE, DstSync Dst = DstSync::SyncFull, bool IS_INT_SFPU_EN = false>
-// inline void llk_math_eltwise_unary_sfpu(
-//     uint dst_index,
-//     int vector_mode = (int)Dim::RC,
-//     uint param0 = 0,
-//     uint param1 = 0,
-//     uint param2 = 0,
-//     uint param3 = 0,
-//     uint param4 = 0,
-//     uint param5 = 0) {
-//     const std::uint32_t operand_id = get_operand_id(0); // Fix to operand 0. assume no tiny-tile support
-//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
-//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
-
-//     constexpr int ITERATIONS = 8;
-
-//     _llk_math_eltwise_unary_sfpu_start_<Dst>(dst_index);
-
-//     if (vector_mode == (int)Dim::R) {
-//         // Do a row vector, Face0 + Face1 -- first iteration (first row)
-//         const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 2 : face_r_dim / 2)
-//                                                : 2;  // At least 2 iterations for odd and even columns
-// #pragma GCC unroll 0
-//         for (int face = 0; face < 2; face++) {
-//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
-//                 iterations, param0, param1, param2, param3, param4, param5);
-//             // Move to the next face
-//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
-//         }
-//         // Skip next two faces
-//         _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
-//         _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
-//     } else if (vector_mode == (int)Dim::C) {
-//         // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for
-//         // full face
-// #pragma GCC unroll 0
-//         for (int face = 0; face < 2; face++) {
-//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
-//                 ITERATIONS, param0, param1, param2, param3, param4, param5);
-//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
-//             if (num_faces > 2) {  // Skip next face if tile is 32x32
-//                 _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
-//             }
-//         }
-//         if (num_faces <= 2) {
-//             // Skip next two faces
-//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
-//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
-//         }
-//     } else {
-//         // Do all four faces, and iterate through all 4 blocks of 4 rows each
-// #pragma GCC unroll 0
-//         for (int face = 0; face < 4; face++) {
-//             llk_math_calculate_sfpu<sfpu_op, APPROXIMATE, 0, ITERATIONS, IS_INT_SFPU_EN>(
-//                 ITERATIONS, param0, param1, param2, param3, param4, param5);
-//             // Move to the next face
-//             _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_();
-//         }
-//     }
-//     _llk_math_eltwise_unary_sfpu_done_();
-// }
-
-// }  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_identity.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_identity.h
similarity index 100%
rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_identity.h
rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_identity.h
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h
new file mode 100644
index 00000000000..eed4732e5eb
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h
@@ -0,0 +1,31 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+
+#include "llk_math_eltwise_unary_sfpu_common_includes.h"
+#include "llk_math_eltwise_unary_sfpu_init.h"
+#include "llk_math_eltwise_unary_sfpu_0_param.h"
+#include "ckernel_sfpu_mask.h"
+
+namespace ckernel {
+
+// New LLK SFPU APIs
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_mask_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::mask, APPROXIMATE>();  // forwards to the generic SFPU init, selecting SfpuType::mask
+}
+// Mask op entry point: hands two calculate_mask instantiations plus dst_index/vector_mode to llk_math_eltwise_unary_sfpu_0_param.
+template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = Dim::RC) {
+    constexpr int first_iterations = 1;  // ITERATIONS value used for the first callback only
+    llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
+                            (ckernel::sfpu::calculate_mask<APPROXIMATE, first_iterations>,
+                            ckernel::sfpu::calculate_mask<APPROXIMATE>,  // second callback keeps calculate_mask's default ITERATIONS
+                            dst_index, vector_mode);  // NOTE(review): how the helper chooses between the two callbacks is not visible here -- confirm in llk_math_eltwise_unary_sfpu_0_param.h
+}
+
+}
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h
new file mode 100644
index 00000000000..5badb47497b
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h
@@ -0,0 +1,31 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+
+#include "llk_math_eltwise_unary_sfpu_common_includes.h"
+#include "llk_math_eltwise_unary_sfpu_init.h"
+#include "llk_math_eltwise_unary_sfpu_0_param.h"
+#include "ckernel_sfpu_negative.h"
+
+namespace ckernel {
+
+// New LLK SFPU APIs
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_negative_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::negative, APPROXIMATE>();  // forwards to the generic SFPU init, selecting SfpuType::negative
+}
+// Negative op entry point: hands two calculate_negative instantiations plus dst_index/vector_mode to llk_math_eltwise_unary_sfpu_0_param.
+template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) {
+    constexpr int first_iterations = 1;  // ITERATIONS value used for the first callback only
+    llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
+                            (ckernel::sfpu::calculate_negative<APPROXIMATE, first_iterations>,
+                            ckernel::sfpu::calculate_negative<APPROXIMATE>,  // second callback keeps calculate_negative's default ITERATIONS
+                            dst_index, vector_mode);  // NOTE(review): how the helper chooses between the two callbacks is not visible here -- confirm in llk_math_eltwise_unary_sfpu_0_param.h
+}
+
+} // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
index 50018e399c3..f2dd3abf0ce 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
@@ -16,765 +16,3 @@
 #include "ckernel_sfpu_exp.h"
 #include "ckernel_sfpu_recip.h"
 #include "ckernel_sfpu_converter.h"
-
-//TODO: Delete this file once GS uplift is done
-// using namespace sfpi;
-
-// namespace ckernel
-// {
-// namespace sfpu
-// {
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS, int RECIPROCAL_ITERATIONS>
-// inline void calculate_rsqrt()
-// {
-
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-
-//         vFloat in = dst_reg[0];
-//         v_if(dst_reg[0] == 0.0f){
-//             dst_reg[0] = std::numeric_limits<float>::infinity();
-//         }v_else{
-//             vFloat result = 1.0f;
-//             v_if(dst_reg[0] > 1.0f){
-//                 result = sfpu_reciprocal(in);
-//             }v_endif;
-
-//             for (int r = 0; r < RECIPROCAL_ITERATIONS; r++)
-//             {
-//                 // y = y * (1.5 - 0.5 * x * y * y) Newton's method iteration.
-//                 result = result * (1.5F - 0.5F  * dst_reg[0] * result * result);
-//             }
-//             dst_reg[0] = result;
-//         }v_endif;
-
-//         dst_reg++;
-
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_sigmoid_appx()
-// {
-//     vUInt l0 = l_reg[LRegs::LReg0];
-//     vUInt l1 = l_reg[LRegs::LReg1];
-//     vUInt l2 = l_reg[LRegs::LReg2];
-
-//     #pragma GCC unroll 8
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat val = dst_reg[0];
-
-//         dst_reg[0] = lut(val, l0, l1, l2) + 0.5f;
-
-//         dst_reg++;
-//     }
-
-//     l_reg[LRegs::LReg0] = l0;
-//     l_reg[LRegs::LReg1] = l1;
-//     l_reg[LRegs::LReg2] = l2;
-// }
-
-// // TODO: Implement using bitwise comparision
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_signbit()
-// {
-
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat val = dst_reg[0];
-//         v_if (val <= -0.0f) {
-//             val = 1.0f;
-//         } v_elseif (val >= 0.0f) {
-//             val = 0.0f;
-//         }
-//         v_endif;
-//         dst_reg[0] = val;
-
-//        dst_reg++;
-//     }
-
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_tanh()
-// {
-//     // SFPU microcode
-//     vUInt l0 = l_reg[LRegs::LReg0];
-//     vUInt l1 = l_reg[LRegs::LReg1];
-//     vUInt l2 = l_reg[LRegs::LReg2];
-
-//     #pragma GCC unroll 8
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat val = dst_reg[0];
-//         val = lut(val, l0, l1, l2);
-//         dst_reg[0] = val;
-
-//         dst_reg++;
-//     }
-
-//     l_reg[LRegs::LReg0] = l0;
-//     l_reg[LRegs::LReg1] = l1;
-//     l_reg[LRegs::LReg2] = l2;
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_hardtanh(uint param0, uint param1, uint param2)
-// {
-//     // All params are in FP16_B format
-//     // param0 = -(neg_threshold)
-//     // param1 = -(pos_threshold - neg_threshold)
-//     // param2 = -(pos_threshold)
-
-//     vFloat p0 = s2vFloat16(param0);
-//     vFloat p1 = s2vFloat16(param1);
-//     vFloat p2 = s2vFloat16(param2);
-//     // SFPU microcode
-//     #pragma GCC unroll 0
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat val = dst_reg[0];
-
-//         val += p0;// 12 bits
-//         v_if (val < 0.0f) {
-//             val = 0.0f;
-//         }
-//         v_endif;
-
-//         val += p1;// 12 bits
-//         v_if (val >= 0.0f) {
-//             val = 0.0f;
-//         }
-//         v_endif;
-
-//         val += p2;// 12 bits
-
-//         dst_reg[0] = val;
-
-//         dst_reg++;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int WITH_PRECOMPUTED_TANH, int ITERATIONS>
-// inline void calculate_tanh_derivative()
-// {
-//     vUInt l0 = l_reg[LRegs::LReg0];
-//     vUInt l1 = l_reg[LRegs::LReg1];
-//     vUInt l2 = l_reg[LRegs::LReg2];
-
-//     // tanh'(x) = 1 - (tanh(x))^2
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat val = dst_reg[0];
-
-//         if constexpr (!WITH_PRECOMPUTED_TANH) {
-//             val = lut(val, l0, l1, l2);
-//         }
-
-//         val = val * (-val) + vConst1;
-//         dst_reg[0] = val;
-
-//         dst_reg++;
-//     }
-
-//     l_reg[LRegs::LReg0] = l0;
-//     l_reg[LRegs::LReg1] = l1;
-//     l_reg[LRegs::LReg2] = l2;
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_dropout(uint prob, uint scale)
-// {
-//     // SFPU microcode
-
-//     vUInt rand = l_reg[LRegs::LReg3];
-
-//     #pragma GCC unroll 0
-//     for (int d = 0; d < ITERATIONS; d++) {
-//         ////////////////////////
-//         // Scale samples
-//         ///////////////////////
-//         dst_reg[0] = dst_reg[0] * s2vFloat16b(scale);
-
-//         ////////////////////////
-//         // Drop samples
-//         ///////////////////////
-//         v_if (rand < prob) {
-//             dst_reg[0] = vConst0;
-//         }
-//         v_endif;
-
-//         ////////////////////////
-//         // 16-bit PRNG update
-//         ///////////////////////
-//         vUInt lfsr = vConstIntPrgm1;
-//         vUInt tmp = lfsr & rand;
-//         rand = rand >> 1;
-//         v_if (tmp != 0) {
-//             vUInt mask = vConstIntPrgm0;
-//             rand ^= mask;
-//         }
-//         v_endif;
-
-//         dst_reg++;
-//     }
-
-//     l_reg[LRegs::LReg3] = rand;
-// }
-
-// template <bool APPROXIMATION_MODE,int ITERATIONS>
-// inline void calculate_power_iterative(const uint exponent)
-// {
-//     #pragma GCC unroll 8
-//     for (int d = 0; d < 8; d++)
-//     {
-//         vFloat in = dst_reg[0];
-//         vFloat result = 1.0f;
-//         for (uint i = 0; i < exponent; i++) {
-//             result *= in;
-//         }
-// 	dst_reg[0]=result;
-//         dst_reg++;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_square()
-// {
-//     #pragma GCC unroll 8
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat in = dst_reg[0];
-//         vFloat result = in * in;
-
-//         dst_reg[0] = result;
-
-//         dst_reg++;
-//     }
-// }
-
-// template <bool HAS_BASE_SCALING>
-// sfpi_inline void calculate_log_body(const uint log_base_scale_factor)
-// {
-//     ////////////////////////////
-//     // Load From dest + "normalize to calculation range"
-//     ////////////////////////////
-//     vFloat in = dst_reg[0];
-//     vFloat x = setexp(in, 127);    // set exp to exp bias (put in range of 1-2)
-
-//     // XXXXXX ask Namal? if we can derive the coefficients below to higher precision
-//     ////////////////////////////
-//     // Calculate Cheby Approximation using Horner Form Multiplication: 3rd Order
-//     // x* ( x* (A*x + B) + C) + D
-//     // A :0.1058, B: -0.3942, C: 0.9813, D: 0.006
-//     // Run above on (x-1) so x is in ln(x+1), plug (x-1 into equation above to
-//     // save the subtract and get A',B',C',D'):
-//     // A' = A
-//     // B' = -3A + B
-//     // C' = 3a -2B + C
-//     // D' = -A + B - C + D
-//     // A':0.1058, B':-0.7116, C':2.0871, D':-1.4753
-//     ////////////////////////////
-//     vFloat a = vConstFloatPrgm1;
-//     vFloat b = vConstFloatPrgm2;
-//     // XXXXX try variants of the below: B'=.7122, C'=2.0869
-//     vFloat series_result = x * (x * (x * a + b) + 2.0871) + -1.4753f;
-
-//     ////////////////////////////
-//     // Convert exponent to float
-//     ////////////////////////////
-//     vInt exp = exexp(in);
-//     v_if (exp < 0) {
-//         exp = setsgn(~exp + 1, 1);
-//     }
-//     v_endif;
-
-//     vFloat expf = int32_to_float(exp, 0);
-//     vFloat vConstLn2 = vConstFloatPrgm0;
-//     vFloat result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2)
-
-//     if constexpr (HAS_BASE_SCALING) {
-//         result *= s2vFloat16a(log_base_scale_factor);
-//     }
-
-//     ////////////////////////////
-//     // Base case when input is 0. ln(0) = -inf
-//     ////////////////////////////
-//     v_if (in == 0.0F) { // Reload for register pressure
-//         result = -std::numeric_limits<float>::infinity();
-//     }
-//     v_endif;
-
-//     dst_reg[0] = result;
-// }
-
-// template <bool APPROXIMATION_MODE, bool HAS_BASE_SCALING, int ITERATIONS>
-// inline void calculate_log(uint log_base_scale_factor)
-// {
-//     #pragma GCC unroll 8
-//     for(int d = 0; d < ITERATIONS; d++){
-//         calculate_log_body<HAS_BASE_SCALING>(log_base_scale_factor);
-//         dst_reg++;
-//     }
-// }
-
-// sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& flag2, float init)
-// {
-//     flag1 = init;
-//     if (check) {
-//         flag2 = init;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, SfpuType COMP_MODE, int ITERATIONS>
-// inline void calculate_comp(uint exponent_size_8)
-// {
-//    const vFloat zero = 0.0f;
-//    const vFloat one = 1.0f;
-//    for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat v = dst_reg[0];
-//         vFloat flag1, flag2;
-
-// 	//a[i] == 0
-// 	if constexpr(COMP_MODE == SfpuType::equal_zero) {
-// 	    v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
-// 	      v = one;
-// 	    } v_else {
-// 	      v = zero;
-// 	    }
-// 	    v_endif;
-// 	  }
-
-// 	//a[i] != 0
-// 	if constexpr(COMP_MODE == SfpuType::not_equal_zero) {
-// 	    v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
-// 	      v = zero;
-// 	    } v_else {
-// 	      v = one;
-// 	    }
-// 	    v_endif;
-//         }
-
-// 	//a[i] < 0
-// 	if constexpr(COMP_MODE == SfpuType::less_than_zero) {
-// 	    v_if (v >= 0.0f) {
-// 	      v = zero;
-// 	    } v_else {
-// 	      v = one;
-// 	    }
-// 	    v_endif;
-//         }
-
-// 	//a[i] >= 0
-// 	if constexpr(COMP_MODE == SfpuType::greater_than_equal_zero) {
-// 	    v_if (v >= 0.0f) {
-// 	      v = one;
-// 	    } v_else {
-// 	      v = zero;
-// 	    }
-// 	    v_endif;
-//         }
-
-// 	//a[i] > 0
-// 	if constexpr(COMP_MODE == SfpuType::greater_than_zero) {
-// 	    v_if (v > 0.0f) {
-// 	      v = one;
-// 	    } v_else {
-// 	      v = zero;
-// 	    }
-// 	    v_endif;
-//         }
-
-// 	//a[i] <= 0
-// 	if constexpr(COMP_MODE == SfpuType::less_than_equal_zero) {
-// 	    v_if (v > 0.0f) {
-// 	      v = zero;
-// 	    } v_else {
-// 	      v = one;
-// 	    }
-// 	    v_endif;
-//         }
-
-// 	dst_reg[0] = v;
-// 	dst_reg++;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_clamp(uint param0, uint param1, uint param2)
-// {
-//     // All params are in FP16 format
-//     // param0 = min
-//     // param1 = max
-
-//     //uint format = (param0 >> 16)&0x1;
-//     s2vFloat16::Format format = s2vFloat16::fp16a;
-
-//     // SFPU microcode
-//     vFloat min = s2vFloat16(param0, format);
-//     vFloat max = s2vFloat16(param1, format);
-//     #pragma GCC unroll 0
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat val = dst_reg[0];
-
-//         v_if (val < min) {
-//             val = s2vFloat16(param0, format);
-//         } v_elseif (val >= max) {
-//             val = s2vFloat16(param1, format);
-//         }
-//         v_endif;
-
-//         dst_reg[0] = val + s2vFloat16b(param2); // 12 bits
-
-//         dst_reg++;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_abs()
-// {
-//     // SFPU microcode
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat v = dst_reg[0];
-//         dst_reg[0] = sfpi::abs(v);
-//         dst_reg++;
-//     }
-// }
-
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_exp2()
-// {
-//     // SFPU microcode
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat v = dst_reg[0];
-//         // log(2) = 0.6931471805;
-//         v = v * 0.6931471805f;
-// 	    // exp = e^(v)
-// 	    vFloat exp = calculate_exponential_body_improved<APPROXIMATION_MODE, true>(v);
-// 	    dst_reg[0] = exp;
-//         dst_reg++;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_sign()
-// {
-//     // All params are in FP16 format
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat v = dst_reg[0];
-// 	vFloat result = vConst1;
-//         v_if (v < 0.0f) {
-//            result = vConstNeg1;
-//         } v_elseif(v > 0.0f) {
-// 	  result = vConst1;
-// 	} v_else {
-// 	  result = vConst0;
-//         }
-//         v_endif;
-
-// 	dst_reg[0] = result;
-//         dst_reg++;
-//     }
-// }
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_max()
-// {
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat a = dst_reg[0];
-//         vFloat b = dst_reg[32];
-//         v_if(a < b) {
-//             dst_reg[0] = b;
-//         }
-//         v_endif;
-
-//         dst_reg++;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_min()
-// {
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat a = dst_reg[0];
-//         vFloat b = dst_reg[32];
-//         v_if(a > b) {
-//             dst_reg[0] = b;
-//         }
-//         v_endif;
-
-//         dst_reg++;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_expm1()
-// {
-//     // SFPU microcode
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat v = dst_reg[0];
-//         v = calculate_exponential_body_improved<APPROXIMATION_MODE, true>(v);
-//         dst_reg[0] = v - 1.0f;
-//         dst_reg++;
-//     }
-// }
-
-
-// #define POLYVAL6(coef5, coef4, coef3, coef2, coef1, coef0, t4)  (t4 * (t4 * (t4 * (t4 * (coef5 * t4 + coef4) + coef3) + coef2) + coef1) + coef0)
-
-// template <bool APPROXIMATION_MODE>
-// sfpi_inline vFloat sfpu_atan_maclaurin_series(vFloat val)
-// {
-//     v_if(1 > sfpi::abs(val)){
-//         dst_reg[0] = sfpi::abs(val)  ;
-//     }
-//     v_else{
-//         dst_reg[0] =  sfpu_reciprocal(sfpi::abs(val));
-//     }
-//     v_endif;
-
-//     vFloat t1 = dst_reg[0] * dst_reg[0];
-
-//     t1 = POLYVAL6(-0.013480470f, 0.057477314f, -0.121239071f, 0.195635925f, -0.332994597f, 0.999995630f, t1);
-
-//     t1 = t1 * dst_reg[0];
-
-//     v_if (sfpi::abs(val) > 1){
-//         t1 = 1.570796327f - t1;
-//     }
-//     v_endif;
-
-//     v_if(val < 0 ){
-//         t1 = -t1;
-//     }
-//     v_endif;
-
-//     return t1;
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_atan()
-// {
-//     // SFPU microcode
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat val = dst_reg[0];
-//         val = sfpu_atan_maclaurin_series<APPROXIMATION_MODE>(val);
-//         dst_reg[0] = val;
-//         dst_reg++;
-//     }
-// }
-
-
-// template <bool APPROXIMATION_MODE>
-// sfpi_inline vFloat sfpu_asine_maclaurin_series(vFloat val)
-// {
-//     // input for [-1:1]
-//     // Mclauren series
-//     // arcsin(x) = x + [(1/2) *x^3/3] + [(1 * 3) / (2 * 4) * x^5 / 5] + [(1 * 3 * 5) / (2 * 4 * 6) * x^7 / 7 ] + ...
-//     // arcsin(x) ≈ x + (1/6) * x^3 + (3/40) * x^5 + (5/112) * x^7 + (35/1152) * x^9 + (63/2816) * x^11a
-
-//     vFloat tmp = val;
-//     vFloat val_square = val * val;
-//     // x
-//     vFloat output = tmp;
-//     // (1/6) * x^3
-//     tmp = tmp * val_square;
-//     output += 0.166666666 * tmp;
-//     // (3/40) * x^5
-//     tmp = tmp * val_square;
-//     output +=  0.075 * tmp;
-
-//     //(5/112) * x^7
-//     tmp = tmp * val_square;
-//     output += 0.044642857 * tmp;
-
-//     // (35/1152) *x^9
-//     tmp = tmp * val_square;
-//     output += 0.03038194 * tmp;
-
-//     //(63/2816) * x^11
-//     tmp = tmp * val_square;
-//     output += 0.02237216 * tmp;
-
-//     // Write out output
-//     return output;
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_asin()
-// {
-//     // SFPU microcode
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat v = dst_reg[0];
-//         v = sfpu_asine_maclaurin_series<APPROXIMATION_MODE>(v);
-//         dst_reg[0] = v;
-//         dst_reg++;
-//     }
-// }
-
-
-// #define PI_2 (1.570796326794)
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_acos()
-// {
-//     // SFPU microcode
-//     // acos = (pi/2 - asin)
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat v = dst_reg[0];
-//         v = sfpu_asine_maclaurin_series<APPROXIMATION_MODE>(v);
-//         v = PI_2 - v;
-//         dst_reg[0] = v;
-//         dst_reg++;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void cast_fp32_to_fp16a()
-// {
-//     #pragma GCC unroll 8
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         //vFloat val = dst_reg[0];
-//         //dst_reg[0] = float_to_fp16a(val, 0);
-//         TTI_SFPLOAD(0, 0, 3, 0);
-//         TTI_SFP_STOCH_RND(0,0,0,0,0,8);
-//         TTI_SFPSTORE(0,1,3,0);
-//         dst_reg++;
-//     }
-// }
-
-
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_negative()
-// {
-
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat val = dst_reg[0];
-//         dst_reg[0] = -val;
-//         dst_reg++;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_add1()
-// {
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat val = dst_reg[0];
-//         dst_reg[0] = 1.0f + val;
-//         dst_reg++;
-//     }
-// }
-
-// inline
-// vFloat sigmoid_piecewise_linear_positive(vFloat val) {
-//         vFloat result = 0.0f;
-// 	v_if ( val >= +5.0f)  {
-// 	  result = 1.0f;
-// 	} v_elseif ( val > 1.0f && val < 5.0f ) {
-// 	  result = POLYVAL5(0.00144462f, -0.01055479f, -0.01203685f,  0.24300185f,  0.50437757f,val);
-// 	} v_else {
-// 	  result = 0.229f*val + 0.5f; // linear appx as y = 0.229x + 0.5
-// 	}
-// 	v_endif;
-// 	return result;
-// }
-
-// //sigmoid is anti-symmetric and offset by 1
-// //sigmoid[-x] = 1 - sigmoid[x]
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_sigmoid()
-// {
-//     for (int d = 0; d < ITERATIONS; d++)
-//     {
-//         vFloat val = dst_reg[0];
-//         vFloat result = 0.0f;
-
-//         v_if ( val < 0.0f ) {
-//   	   val = -val;
-//         }
-//         v_endif;
-
-// 	result = sigmoid_piecewise_linear_positive(val);
-
-// 	val = dst_reg[0];
-//         v_if ( val < 0.0f ) {
-//             result = 1.0f - result;
-//         }
-//         v_endif;
-
-//         dst_reg[0] = result;
-//         dst_reg++;
-//     }
-
-//     return;
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_heaviside(uint value)
-// {
-//     // SFPU microcode
-//     Converter c_value;
-//     c_value.u = value;
-//     vFloat s = c_value.f;
-
-//     #pragma GCC unroll 0
-//     for (int d = 0; d < ITERATIONS; d++) {
-//         vFloat v = dst_reg[0];
-
-//         v_if (v < 0.0f) {
-//             v = 0.0f;
-//         }v_elseif (v > 0.0f) {
-//             v = 1.0f;
-//         }v_else {
-//             v = s;
-//         }
-//         v_endif;
-
-//        dst_reg[0] = v;
-
-//         dst_reg++;
-//     }
-// }
-
-// template <bool APPROXIMATION_MODE, int ITERATIONS>
-// inline void calculate_silu()
-// {
-//     // SFPU microcode
-//     for (int d = 0; d < ITERATIONS; d++) {
-//         vFloat val = dst_reg[0];
-//         v_if ( val < 0.0f ) {
-//             val = -val;
-//         }
-//         v_endif;
-
-// 	    vFloat result = sigmoid_piecewise_linear_positive(val);
-
-// 	    val = dst_reg[0];
-//         v_if ( val < 0.0f ) {
-//             result = 1.0f - result;
-//         }
-//         v_endif;
-//         result = val * result;
-//         dst_reg[0] = result;
-//         dst_reg++;
-//     }
-// }
-
-// } // namespace sfpu
-// } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
index 642fbb1591e..77c43ef0650 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
@@ -9,77 +9,3 @@
 // /*************************************************************************
 //  * LLK UNPACK AB
 //  *************************************************************************/
-
-// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
-// inline void llk_unpack_AB_hw_configure(
-//     const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) {
-//     // In0 -> unpA
-//     // In1 -> unpB
-//     const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpA_operand);
-//     const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpB_operand);
-
-//     // unpA -> srcA
-//     // unpB -> srcB
-//     const uint32_t num_faces = get_operand_num_faces(unpA_operand_id);  // num faces in unpA and unpB are the same
-
-//     const uint32_t face_r_dim = get_operand_face_r_dim(unpA_operand_id);  // face r dim in unpA and unpB are the same
-
-//     _llk_unpack_AB_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
-//         unpack_src_format[unpA_operand_id],
-//         unpack_src_format[unpB_operand_id],
-//         unpack_dst_format[unpA_operand_id],
-//         unpack_dst_format[unpB_operand_id],
-//         face_r_dim,
-//         within_face_16x16_transpose,
-//         num_faces);
-// }
-
-// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
-// inline void llk_unpack_AB_hw_configure_disaggregated(
-//     const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0) {
-//     const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand};
-
-//     llk_unpack_AB_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_AB_params, within_face_16x16_transpose);
-// }
-
-// template <BroadcastType BType = BroadcastType::NONE>
-// inline void llk_unpack_AB_mop_config(const bool transpose_of_faces = false, const std::uint32_t operand_id = 0) {
-//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
-//     const bool narrow_tile = get_operand_narrow_tile(operand_id);  // if narrow tile read face 0 twice for row broadcast
-//                                                                    // or read face 0 and 1 for col broadcast
-//     _llk_unpack_AB_mop_config_<BType>(transpose_of_faces, num_faces, narrow_tile);
-// }
-
-// template <BroadcastType BType = BroadcastType::NONE>
-// inline void llk_unpack_AB_init(
-//     const std::uint32_t operandA,
-//     const std::uint32_t operandB,
-//     const std::uint32_t transpose = 0,
-//     const std::uint32_t acc_to_dest = 0) {
-//     const std::uint32_t operandA_id = get_operand_id(operandA);
-//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operandA_id);  // face r dim in unpA and unpB are the same
-//     const std::uint32_t num_faces = get_operand_num_faces(operandA_id);
-//     const bool narrow_tile =
-//         get_operand_narrow_tile(operandA_id);  // if narrow tile read face 0 twice for row broadcast
-
-//     _llk_unpack_AB_init_<BType>(face_r_dim, num_faces, narrow_tile, transpose, acc_to_dest);
-// }
-
-// template <BroadcastType BType = BroadcastType::NONE>
-// inline void llk_unpack_AB(
-//     const std::uint32_t operandA,
-//     const std::uint32_t operandB,
-//     const std::uint32_t tile_index_a,
-//     const std::uint32_t tile_index_b,
-//     const bool transpose_of_faces = 0 /*not used*/) {
-//     std::uint32_t operandA_id = get_operand_id(operandA);
-//     std::uint32_t operandB_id = get_operand_id(operandB);
-//     std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1;
-//     std::uint32_t offset_address_a = cb_interface[operandA_id].fifo_page_size * tile_index_a;
-//     std::uint32_t address_a = base_address_a + offset_address_a;
-//     std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1;
-//     std::uint32_t offset_address_b = cb_interface[operandB_id].fifo_page_size * tile_index_b;
-//     std::uint32_t address_b = base_address_b + offset_address_b;
-
-//     _llk_unpack_AB_<BType>(address_a, address_b, transpose_of_faces > 0);
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h
index f4aee2da6bd..d3299a0d299 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h
@@ -9,128 +9,3 @@
 // /*************************************************************************
 //  * LLK UNPACK AB MATMUL
 //  *************************************************************************/
-
-// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
-// inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) {
-//     const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca;
-
-//     // In0 -> unpB
-//     // In1 -> unpA
-//     const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpB_operand);
-//     const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpA_operand);
-
-//     // unpA -> srcA
-//     // unpB -> srcB
-//     const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
-//     const uint32_t unpB_num_faces = get_operand_num_faces(unpB_operand_id);
-
-//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
-//     const uint32_t unpB_face_r_dim = get_operand_face_r_dim(unpB_operand_id);
-
-//     _llk_unpack_AB_matmul_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
-//         unpack_src_format[unpA_operand_id],
-//         unpack_src_format[unpB_operand_id],
-//         unpack_dst_format[unpA_operand_id],
-//         unpack_dst_format[unpB_operand_id],
-//         unpA_face_r_dim,
-//         unpB_face_r_dim,
-//         transpose_xy_srca,
-//         unpA_num_faces,
-//         unpB_num_faces,
-//         cb_interface[unpA_operand_id].fifo_page_size,
-//         cb_interface[unpB_operand_id].fifo_page_size);
-// }
-
-// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
-// inline void llk_unpack_AB_matmul_hw_configure_disaggregated(
-//     const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) {
-//     const llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = {
-//         .unpA_operand = unpA_operand, .unpB_operand = unpB_operand, .transpose_xy_srca = transpose_xy_srca};
-//     llk_unpack_AB_matmul_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_AB_matmul_params);
-// }
-
-// inline void llk_unpack_AB_matmul_mop_config(
-//     const bool transpose,
-//     const std::uint32_t ct_dim,
-//     const std::uint32_t rt_dim,
-//     const std::uint32_t kt_dim,
-//     const bool partial_face) {
-//     // in0 - loaded to SrcB
-//     // in1 - loaded to SrcA
-//     _llk_unpack_AB_matmul_mop_config_(transpose, ct_dim, rt_dim, kt_dim, partial_face);
-// }
-
-// __attribute__((always_inline)) inline void llk_unpack_AB_matmul_init(
-//     const std::uint32_t operandA,
-//     const std::uint32_t operandB,
-//     const std::uint32_t transpose = 0,
-//     const std::uint32_t ct_dim = 1,
-//     const std::uint32_t rt_dim = 1,
-//     const std::uint32_t kt_dim = 1) {
-//     // In0 -> srcB (supports partial face)
-//     // In1 -> srcA
-//     const uint32_t operandA_id = get_operand_id(operandB);
-//     const uint32_t operandB_id = get_operand_id(operandA);
-
-//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandA_id);
-//     const uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandB_id);
-
-//     const bool reuse_a = ct_dim >= rt_dim;
-//     const bool partial_face = get_operand_partial_face(operandB_id);
-
-//     const uint32_t unpA_num_faces = get_operand_num_faces(operandA_id);
-//     const uint32_t unpB_num_faces =
-//         partial_face ? 1 : get_operand_num_faces(operandB_id);  // if partial face -> unpack face by face
-
-//     _llk_unpack_AB_matmul_init_(
-//         transpose,
-//         ct_dim,
-//         rt_dim,
-//         kt_dim,
-//         unpA_face_r_dim,
-//         unpB_face_r_dim,
-//         unpA_num_faces,
-//         unpB_num_faces,
-//         partial_face);
-// }
-
-// inline void llk_unpack_AB_matmul(
-//     const std::uint32_t operandA,
-//     const std::uint32_t operandB,
-//     const std::uint32_t tile_index_a,
-//     const std::uint32_t tile_index_b,
-//     const std::uint32_t ct_dim = 1,
-//     const std::uint32_t rt_dim = 1,
-//     const std::uint32_t kt_dim = 1) {
-//     // In0/InA -> srcB (supports partial face)
-//     // In1/InB -> srcA
-
-//     volatile uint *cfg = get_cfg_pointer();  // get pointer to registers for current state ID
-
-//     const std::uint32_t operandA_id = get_operand_id(operandA);
-//     const std::uint32_t operandB_id = get_operand_id(operandB);
-//     const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandB_id);  // In1/InB -> srcA
-//     const std::uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandA_id);  // In0/InA -> srcB
-
-//     const bool partial_face = get_operand_partial_face(operandA_id);
-
-//     std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1;
-//     std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1;
-
-//     std::uint32_t tile_size_a = cb_interface[operandA_id].fifo_page_size;
-//     std::uint32_t tile_size_b = cb_interface[operandB_id].fifo_page_size;
-
-//     _llk_unpack_AB_matmul_(
-//         base_address_a,
-//         base_address_b,
-//         tile_index_a,
-//         tile_index_b,
-//         tile_size_a,
-//         tile_size_b,
-//         unpA_face_r_dim,
-//         unpB_face_r_dim,
-//         partial_face,
-//         ct_dim,
-//         rt_dim,
-//         kt_dim);
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h
index ca39397653c..9d9f30a6c75 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h
@@ -9,81 +9,3 @@
 // /*************************************************************************
 //  * LLK UNPACK A
 //  *************************************************************************/
-
-// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
-// inline void llk_unpack_A_hw_configure(
-//     const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) {
-//     const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand);
-//     const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
-//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
-
-//     _llk_unpack_A_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
-//         unpack_src_format[unpA_operand_id],
-//         unpack_dst_format[unpA_operand_id],
-//         unpA_face_r_dim,
-//         within_face_16x16_transpose,
-//         unpA_num_faces);
-// }
-
-// template <bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
-// inline void llk_unpack_A_hw_configure_disaggregated(
-//     const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) {
-//     const llk_unpack_A_params_t unpack_A_params = {.unpA_operand = unpA_operand};
-//     llk_unpack_A_hw_configure<is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_A_params, within_face_16x16_transpose);
-// }
-
-// template <
-//     BroadcastType BType = BroadcastType::NONE,
-//     bool acc_to_dest = false,
-//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
-//     bool unpack_to_dest = false>
-// inline void llk_unpack_A_mop_config(
-//     const bool transpose_of_faces,
-//     const std::uint32_t operand_id,
-//     const std::uint32_t unpack_src_format = 0,
-//     std::uint32_t unpack_dst_format = 0) {
-//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
-
-//     _llk_unpack_A_mop_config_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
-//         transpose_of_faces > 0, num_faces, unpack_src_format, unpack_dst_format);
-// }
-
-// template <
-//     BroadcastType BType = BroadcastType::NONE,
-//     bool acc_to_dest = false,
-//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
-//     bool unpack_to_dest = false>
-// inline void llk_unpack_A_init(
-//     const std::uint32_t transpose_of_faces = 0,
-//     const std::uint32_t within_face_16x16_transpose = 0,
-//     const std::uint32_t operand = 0) {
-//     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(within_face_16x16_transpose);
-
-//     const std::uint32_t operand_id = get_operand_id(operand);
-//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
-//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
-
-//     _llk_unpack_A_init_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
-//         transpose_of_faces,
-//         within_face_16x16_transpose,
-//         face_r_dim,
-//         num_faces,
-//         unpack_src_format[operand_id],
-//         unpack_dst_format[operand_id]);
-// }
-
-// template <
-//     BroadcastType BType = BroadcastType::NONE,
-//     bool acc_to_dest = false,
-//     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE,
-//     bool unpack_to_dest = false>
-// inline void llk_unpack_A(
-//     const std::uint32_t operand, const std::uint32_t tile_index, const bool transpose_of_faces = 0) {
-//     std::uint32_t operand_id = get_operand_id(operand);
-//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
-//     std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index;
-//     std::uint32_t address = base_address + offset_address;
-
-//     _llk_unpack_A_<BType, acc_to_dest, binary_reuse_dest, unpack_to_dest>(
-//         address, transpose_of_faces > 0, unpack_src_format[operand_id], unpack_dst_format[operand_id]);
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h
index a2f5d8c675f..1ba062360fb 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h
@@ -18,120 +18,3 @@
 // /*************************************************************************
 //  * LLK UNPACK COMMON
 //  *************************************************************************/
-
-// void llk_zero_operand(std::uint32_t operand) {
-//     std::uint32_t operand_id = get_operand_id(operand);
-//     std::uint32_t fifo_base_addr = (cb_interface[operand_id].fifo_limit + 1) - cb_interface[operand_id].fifo_size;
-//     std::uint32_t size = cb_interface[operand_id].fifo_size;
-//     _llk_zero_buffer_(fifo_base_addr, size);
-// }
-
-// template <bool mail2math = true, bool mail2pack = true>
-// inline void llk_unpack_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) {
-//     std::uint32_t operand_id = get_operand_id(operand);
-//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
-//     std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index;
-//     std::uint32_t address = base_address + offset_address;
-//     _llk_unpack_get_tile_<mail2math, mail2pack>(address, p_tile);
-// }
-
-// template <bool mail2math = true, bool mail2pack = true>
-// inline void llk_unpack_release_tile(std::uint32_t operand) {
-//     _llk_unpack_release_tile_<mail2math, mail2pack>();
-// }
-
-// inline void llk_unpack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) {
-//     _llk_unpack_debug_dump_(data, byte_size);
-// }
-
-// inline void llk_unpack_debug_dump_seek(std::uint8_t offset) { _llk_unpack_debug_dump_seek_(offset); }
-
-// template <bool is_tile_dim_reconfig_en = false>
-// inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_operand) {
-//     const std::uint32_t srca_operand_id = get_operand_id(srca_new_operand);
-//     const std::uint32_t num_faces = get_operand_num_faces(srca_operand_id);
-//     const std::uint32_t face_r_dim = get_operand_face_r_dim(srca_operand_id);
-//     _llk_unpack_reconfig_data_format_srca_impl_(
-//         unpack_src_format[srca_operand_id],
-//         unpack_dst_format[srca_operand_id],
-//         cb_interface[srca_operand_id].fifo_page_size);
-// }
-
-// template <bool is_tile_dim_reconfig_en = false>
-// inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) {
-//     std::uint32_t srcb_operand_id = get_operand_id(srcb_new_operand);
-//     const std::uint32_t num_faces = get_operand_num_faces(srcb_operand_id);
-//     const std::uint32_t face_r_dim = get_operand_face_r_dim(srcb_operand_id);
-//     _llk_unpack_reconfig_data_format_srcb_impl_(
-//         unpack_src_format[srcb_operand_id],
-//         unpack_dst_format[srcb_operand_id],
-//         cb_interface[srcb_operand_id].fifo_page_size);
-// }
-
-// template <bool is_tile_dim_reconfig_en = false>
-// inline void llk_unpack_reconfig_data_format_srca(
-//     const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) {
-//     std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand);
-//     std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand);
-
-//     if ((unpack_src_format[old_srca_operand_id] != unpack_src_format[new_srca_operand_id])) {
-//         llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
-//     } else if constexpr (is_tile_dim_reconfig_en) {
-//         llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
-//     }
-// }
-
-// template <bool is_tile_dim_reconfig_en = false>
-// inline void llk_unpack_reconfig_data_format_srcb(
-//     const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) {
-//     std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand);
-//     std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand);
-
-//     if ((unpack_src_format[old_srcb_operand_id] != unpack_src_format[new_srcb_operand_id])) {
-//         llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
-//     } else if constexpr (is_tile_dim_reconfig_en) {
-//         llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
-//     }
-// }
-
-// template <bool is_tile_dim_reconfig_en = false>
-// inline void llk_unpack_reconfig_data_format(
-//     const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) {
-//     llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_new_operand);
-//     llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_new_operand);
-// }
-
-// template <bool is_tile_dim_reconfig_en = false>
-// inline void llk_unpack_reconfig_data_format(
-//     const std::uint32_t srca_old_operand,
-//     const std::uint32_t srca_new_operand,
-//     const std::uint32_t srcb_old_operand,
-//     const std::uint32_t srcb_new_operand) {
-//     llk_unpack_reconfig_data_format_srca<is_tile_dim_reconfig_en>(srca_old_operand, srca_new_operand);
-//     llk_unpack_reconfig_data_format_srcb<is_tile_dim_reconfig_en>(srcb_old_operand, srcb_new_operand);
-// }
-
-// inline void llk_unpack_dbg_feature_disable() { _llk_unpack_dbg_feature_disable_(); }
-
-// inline void llk_enable_int8_fpu_math() { _llk_enable_int8_fpu_math_(); }
-
-// // All TILE_SIZE related functions were deprecared in BBE for WH.  The following is needed for pack_shifted so just
-// // keeping here.
-// // FIXME: Need to review and adjust accordingly
-// constexpr static std::int32_t MUL_HEADERLESS_TILE_SIZE_AND_INDEX(uint format, uint index) {
-//     switch (format & 0x1F) {
-//         case ((uint8_t)DataFormat::Float32): return ((index << 8));
-//         case ((uint8_t)DataFormat::Float16):
-//         case ((uint8_t)DataFormat::Float16_b): return ((index << 7));
-//         case ((uint8_t)DataFormat::Bfp8):
-//         case ((uint8_t)DataFormat::Bfp8_b): return ((index << 6) + (index << 2));
-//         case ((uint8_t)DataFormat::Bfp4):
-//         case ((uint8_t)DataFormat::Bfp4_b): return ((index << 5) + (index << 2));
-//         case ((uint8_t)DataFormat::Bfp2):
-//         case ((uint8_t)DataFormat::Bfp2_b): return ((index << 4) + (index << 2));
-//         case ((uint8_t)DataFormat::Int8):
-//         case ((uint8_t)DataFormat::Lf8): return ((index << 6));
-//         // Keep default as Bfp8?
-//         default: return ((index << 6) + (index << 2));
-//     };
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h
index 01a12122375..c68c94b6de9 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h
@@ -9,86 +9,3 @@
 /*************************************************************************
 * LLK UNPACK REDUCE
 *************************************************************************/
-
-// template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en = false, StochRndType stoch_rnd_mode = StochRndType::None>
-// inline void llk_unpack_reduce_hw_configure(
-//     const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) {
-
-//     constexpr bool within_face_16x16_transpose  = (ReduceDim::REDUCE_ROW == dim);
-
-//     const std::uint32_t unpA_operand_id = get_operand_id(unpack_reduce_params->unpA_operand);
-//     const std::uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
-//     const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
-
-//     constexpr std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32;
-//     const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a
-//                                ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? (std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16);
-
-//     _llk_unpack_reduce_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
-//         unpack_src_format[unpA_operand_id],
-//         unpB_src_format,
-//         unpack_dst_format[unpA_operand_id],
-//         unpB_dst_format,
-//         unpA_face_r_dim,
-//         unpA_face_r_dim,
-//         within_face_16x16_transpose,
-//         unpA_num_faces,
-//         unpA_num_faces
-//     );
-
-//     if constexpr (type != PoolType::MAX) {
-//         union {
-//             float f;
-//             uint32_t u;
-//         } f2u = {.f = const_mult};
-
-//         for (uint i = 0; i < 16; i++) l1_buffer[i] = f2u.u;  // Load const into L1 buffer
-//     }
-// }
-
-// template <PoolType type, ReduceDim dim, bool is_fp32_dest_acc_en=false, StochRndType stoch_rnd_mode = StochRndType::None>
-// inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) {
-//     const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand};
-//     llk_unpack_reduce_hw_configure<type, dim, is_fp32_dest_acc_en, stoch_rnd_mode>(&unpack_reduce_params, mult);
-// }
-
-// template <PoolType type, ReduceDim dim>
-// inline void llk_unpack_reduce_mop_config() {
-//     _llk_unpack_reduce_mop_config_<type, dim>();
-// }
-
-// template <PoolType type, ReduceDim dim>
-// inline void llk_unpack_reduce_init(const std::uint32_t within_face_16x16_transpose=0) {
-
-//     constexpr std::uint32_t unpA_operand_id = 0;
-
-//     const std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32;
-//     const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a
-//                                ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? (std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16);
-
-//     cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG1_SrcB_RMW>(unpB_dst_format);
-
-//     cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 0, 0xf>(unpB_src_format);
-//     cfg_reg_rmw_tensix<THCON_SEC1_REG2_Out_data_format_RMW>(unpB_dst_format);
-
-//     TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_address_ADDR32);
-//     TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_cntx1_address_ADDR32);
-//     TTI_NOP; TTI_NOP;
-
-//     _llk_unpack_reduce_init_<type, dim>(
-//         within_face_16x16_transpose
-//     );
-// }
-
-// template <PoolType type, ReduceDim dim>
-// inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t tile_index) {
-
-//     std::uint32_t operand_id = get_operand_id(operand);
-//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
-//     std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index;
-//     std::uint32_t address = base_address + offset_address;
-
-//     _llk_unpack_reduce_<type, dim>(
-//         address
-//     );
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h
index 59ede271732..7ac7b91b52e 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h
@@ -9,91 +9,3 @@
 /*************************************************************************
 * LLK UNPACK TILIZE
 *************************************************************************/
-
-// template <bool is_fp32_dest_acc_en = false>
-// inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) {
-
-//     constexpr bool  within_face_16x16_transpose = false;
-//     constexpr StochRndType stoch_rnd_mode = StochRndType::None;
-
-//     const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand);
-//     const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id);
-//     const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id);
-
-//     _llk_unpack_tilize_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
-//         unpack_src_format[unpA_operand_id],
-//         unpack_dst_format[unpA_operand_id],
-//         unpA_face_r_dim,
-//         within_face_16x16_transpose,
-//         unpA_num_faces
-//     );
-// }
-
-
-// template <bool is_fp32_dest_acc_en = false>
-// inline void llk_unpack_tilize_hw_configure_disaggregated(
-//     const std::uint32_t unpA_operand) {
-//     const llk_unpack_A_params_t unpack_tilize_params = {
-//         .unpA_operand = unpA_operand
-//     };
-//     llk_unpack_tilize_hw_configure<is_fp32_dest_acc_en>(&unpack_tilize_params);
-// }
-
-// inline void llk_unpack_tilize_mop_config(const std::uint32_t operand) {
-//     std::uint32_t operand_id = get_operand_id(operand);
-//     const bool narrow_tile = get_operand_narrow_tile(operand_id);
-//     _llk_unpack_tilize_mop_config_(narrow_tile);
-// }
-
-// inline void llk_unpack_tilize_init(const std::uint32_t operand = 0, const std::uint32_t ct_dim = 0) {
-//     cfg_reg_rmw_tensix<THCON_SEC0_REG2_Haloize_mode_RMW>(0);
-
-//     const std::uint32_t operand_id = get_operand_id(operand);
-//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
-//     const bool narrow_tile = get_operand_narrow_tile(operand_id);
-
-//     // Save state of unpacker config for quick restore
-//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0, THCON_SEC0_REG2_Out_data_format_ADDR32); // Save unpack config[0]
-//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context
-
-//     _llk_unpack_tilize_init_(
-//         unpack_src_format[operand_id],
-//         unpack_dst_format[operand_id],
-//         ct_dim,
-//         face_r_dim,
-//         narrow_tile
-//     );
-
-// }
-
-// inline void llk_unpack_tilize_uninit(const std::uint32_t face_r_dim = FACE_R_DIM) {
-//     TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0);
-//     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0); // Restore unpack config[0]
-//     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32,  p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1); // Restore tile x dim per context
-// }
-
-// inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t block_ct_dim) {
-
-//     std::uint32_t operand_id = get_operand_id(operand);
-//     const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id);
-//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
-//     const bool narrow_tile = get_operand_narrow_tile(operand_id);
-
-//     std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;  // Remove header size added by descriptor
-
-//     _llk_unpack_tilize_(
-//         base_address,
-//         tile_index,
-//         unpack_src_format[operand_id],
-//         block_ct_dim,
-//         face_r_dim,
-//         num_faces,
-//         narrow_tile
-//     );
-// }
-
-// inline void llk_unpack_tilize_block(std::uint32_t operand, std::uint32_t block_c_tiles) {
-//     for (std::uint32_t tile_index = 0; tile_index < block_c_tiles; tile_index++) {
-//         llk_unpack_tilize(operand, tile_index, block_c_tiles);
-//     }
-// }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h
index dded559e94d..51f7b91e8bf 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h
@@ -9,88 +9,3 @@
 // /*************************************************************************
 // * LLK UNPACK UNTILIZE
 // *************************************************************************/
-// template <bool is_fp32_dest_acc_en = false>
-// inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) {
-//     constexpr bool is_row_pool = false;
-//     constexpr bool within_face_16x16_transpose = false;
-//     constexpr StochRndType stoch_rnd_mode = StochRndType::None;
-
-//     const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand);
-//     const uint32_t unpA_num_faces = 4;
-//     const uint32_t unpA_face_r_dim = FACE_R_DIM;
-
-//     _llk_unpack_untilize_hw_configure_<is_fp32_dest_acc_en, stoch_rnd_mode>(
-//         unpack_src_format[unpA_operand_id],
-//         unpack_dst_format[unpA_operand_id],
-//         unpA_face_r_dim,
-//         within_face_16x16_transpose,
-//         unpA_num_faces
-//     );
-// }
-
-// inline void llk_unpack_untilize_hw_configure_disaggregated(const std::uint32_t unpA_operand) {
-//     const llk_unpack_A_params_t unpack_untilize_params = {
-//         .unpA_operand = unpA_operand,
-//     };
-//     llk_unpack_untilize_hw_configure(&unpack_untilize_params);
-// }
-
-// inline void llk_unpack_untilize_mop_config() {
-//     _llk_unpack_untilize_mop_config_();
-// }
-
-// inline void llk_unpack_untilize_init(std::uint32_t operand = 0) {
-//     const std::uint32_t operand_id = get_operand_id(operand);
-//     const std::uint32_t face_r_dim = 1;
-//     const std::uint32_t num_faces = get_operand_num_faces(operand_id);
-
-//     // Save state of unpacker config for quick restore
-//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_0, UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32); // Save unpack stride config
-//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context
-//     TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_2, THCON_SEC0_REG0_TileDescriptor_ADDR32+1); // Save descriptor 1
-
-//     _llk_unpack_untilize_init_(
-//         unpack_dst_format[operand_id],
-//         cb_interface[operand_id].fifo_page_size,
-//         face_r_dim,
-//         num_faces
-//     );
-// }
-
-// inline void llk_unpack_untilize_uninit(const std::uint32_t operand, const std::uint32_t face_r_dim = FACE_R_DIM) {
-//     std::uint32_t operand_id = get_operand_id(operand);
-//     std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1;
-//     std::uint32_t unpA_ch1_y_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride;
-
-//     // Check that unpacker is done (all contexts freed up) before starting hw configuration
-//     wait_for_idle();
-
-//     // Reset address counters
-//     unpacker_addr_counter_init();
-
-//     // Wait for cfg to be free to edit
-//     TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK);
-
-//     // Reset the values to default in unpack AB common.
-//     TT_SETADCXX(p_setadc::UNP_A, FACE_R_DIM*FACE_C_DIM-1, 0x0);
-//     TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16);
-//     cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32+1, 0, 0xFFFF>(1);
-//     cfg_reg_rmw_tensix<UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32, UNP0_ADDR_CTRL_XY_REG_0_Ystride_SHAMT, UNP0_ADDR_CTRL_XY_REG_1_Ystride_MASK>(unpA_ch1_y_stride);
-//     TTI_NOP; TTI_NOP; // Do we need this for WH?
-// }
-
-// template <bool first_pass = true>
-// inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_tile_cols) {
-//     const std::uint32_t operand_id = get_operand_id(operand);
-//     const std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1;
-
-//     _llk_unpack_untilize_pass_<first_pass>(
-//         base_address,
-//         block_tile_cols
-//     );
-// }
-
-// inline void llk_unpack_untilize(std::uint32_t operand, std::uint32_t block_c_tiles) {
-//     llk_unpack_untilize_pass<true>(operand, block_c_tiles);
-//     llk_unpack_untilize_pass<false>(operand, block_c_tiles);
-// }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
index f65a6b86ddd..7b504ae34ac 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
@@ -290,26 +290,4 @@ inline void llk_math_eltwise_unary_sfpu_silu_init() {
     llk_math_eltwise_unary_sfpu_init<SfpuType::silu, APPROXIMATE>();
 }
 
-//Mask
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::mask, APPROXIMATE, dst_sync>(dst_index, vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_mask_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::mask, APPROXIMATE>();
-}
-
-// Negative
-template <bool APPROXIMATE, DstSync dst_sync = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) {
-    llk_math_eltwise_unary_sfpu<SfpuType::negative, APPROXIMATE, dst_sync>(dst_index,vector_mode);
-}
-
-template <bool APPROXIMATE>
-inline void llk_math_eltwise_unary_sfpu_negative_init() {
-    llk_math_eltwise_unary_sfpu_init<SfpuType::negative, APPROXIMATE>();
-}
-
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_identity.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_identity.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_identity.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_identity.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
new file mode 100644
index 00000000000..c0f73fb172e
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
@@ -0,0 +1,34 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel {
+namespace sfpu {
+
+
+template <bool APPROXIMATION_MODE, int ITERATIONS=8>  // APPROXIMATION_MODE is not referenced in the body
+inline void calculate_mask()  // Each iteration: writes 0 to dst_reg[0] where the value read from dst_reg[32] passes the zero test
+{
+    bool exponent_size_8 = true;  // fixed second argument for _sfpu_is_fp16_zero_ below
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat mask = dst_reg[32];  // NOTE(review): offset 32 presumably addresses the mask operand held after the data - confirm
+        v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) {
+            dst_reg[0] = 0;  // only executed under the v_if condition; dst_reg[0] is otherwise left as is
+        }
+        v_endif;
+        dst_reg++;  // step to the next position before the following iteration
+    }
+}
+}  // namespace sfpu
+}  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
new file mode 100644
index 00000000000..503843211e7
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
@@ -0,0 +1,31 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel {
+namespace sfpu {
+
+
+template <bool APPROXIMATION_MODE, int ITERATIONS=8>  // APPROXIMATION_MODE is not referenced in the body
+inline void calculate_negative()  // Each iteration: replaces dst_reg[0] with its negation, then advances dst_reg
+{
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat val = dst_reg[0];
+        dst_reg[0] = -val;  // write the negated value back to the position it was read from
+        dst_reg++;  // step to the next position before the following iteration
+    }
+}
+
+}  // namespace sfpu
+}  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
index 06aa57e9e34..360c0c9c9a0 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h
@@ -101,10 +101,6 @@ inline void llk_math_calculate_sfpu(
         calculate_signbit<APPROXIMATION_MODE, ITERATIONS>();
     } else if constexpr (operation == SfpuType::silu) {
         calculate_silu<APPROXIMATION_MODE, ITERATIONS>();
-    } else if constexpr (operation == SfpuType::mask) {
-        calculate_mask<APPROXIMATION_MODE, ITERATIONS>();
-    } else if constexpr (operation == SfpuType::negative) {
-        calculate_negative<APPROXIMATION_MODE, ITERATIONS>();
     }
 }
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_identity.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_identity.h
similarity index 88%
rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_identity.h
rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_identity.h
index 0bd1d26c78d..e59defb4588 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_identity.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_identity.h
@@ -15,10 +15,7 @@ namespace ckernel {
 // New LLK SFPU APIs
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
-inline void llk_math_eltwise_unary_sfpu_identity(uint dst_index, int vector_mode = Dim::RC) {
-
-	constexpr bool zero_negative = true;
-    constexpr int first_iterations = 1;
+inline void llk_math_eltwise_unary_sfpu_identity(uint dst_index, int vector_mode = (int)VectorMode::RC) {
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
       (ckernel::sfpu::calculate_identity<APPROXIMATE,8>,
        ckernel::sfpu::calculate_identity<APPROXIMATE,8>,
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h
new file mode 100644
index 00000000000..c59ab659106
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h
@@ -0,0 +1,31 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+
+#include "llk_math_eltwise_unary_sfpu_common_includes.h"
+#include "llk_math_eltwise_unary_sfpu_init.h"
+#include "llk_math_eltwise_unary_sfpu_0_param.h"
+#include "ckernel_sfpu_mask.h"
+
+namespace ckernel {
+
+// New LLK SFPU APIs
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_mask_init() {  // Init entry point for the mask op
+    llk_math_eltwise_unary_sfpu_init<SfpuType::mask, APPROXIMATE>();  // forwards to the common unary SFPU init, tagged SfpuType::mask
+}
+
+template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = (int)VectorMode::RC) {  // Runs calculate_mask through the 0-param unary SFPU helper
+    constexpr int first_iterations = 1;  // ITERATIONS for the first callable only; the second keeps calculate_mask's default
+    llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
+                            (ckernel::sfpu::calculate_mask<APPROXIMATE, first_iterations>,
+                            ckernel::sfpu::calculate_mask<APPROXIMATE>,
+                            dst_index, vector_mode);  // NOTE(review): when each callable runs is decided in llk_math_eltwise_unary_sfpu_0_param.h - confirm there
+}
+
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h
new file mode 100644
index 00000000000..54f8c84ce6b
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h
@@ -0,0 +1,30 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+
+#include "llk_math_eltwise_unary_sfpu_common_includes.h"
+#include "llk_math_eltwise_unary_sfpu_init.h"
+#include "llk_math_eltwise_unary_sfpu_0_param.h"
+#include "ckernel_sfpu_negative.h"
+
+namespace ckernel {
+
+// New LLK SFPU APIs
+// Negative init: forwards to the shared unary SFPU init, selecting SfpuType::negative.
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_negative_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::negative, APPROXIMATE>();
+}
+// Negative op: passes calculate_negative<APPROXIMATE> as both callables, with dst_index and vector_mode.
+template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
+inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = (int)VectorMode::RC) {
+    llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
+                            (ckernel::sfpu::calculate_negative<APPROXIMATE>, // first callable
+                            ckernel::sfpu::calculate_negative<APPROXIMATE>, // second callable, same instantiation
+                            dst_index, vector_mode);
+}
+
+} // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h
index ec50f756429..d5f48028601 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h
@@ -14,7 +14,6 @@ namespace ckernel {
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
 inline void llk_math_eltwise_unary_sfpu_sqrt(uint dst_index, int vector_mode = (int)VectorMode::RC) {
-    constexpr bool zero_negative = true;
     constexpr int first_iterations = 1;
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
                                 (ckernel::sfpu::calculate_sqrt<APPROXIMATE, first_iterations>,
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
index 4c059e37585..6eb95e8c730 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h
@@ -655,20 +655,6 @@ inline void cast_fp32_to_fp16a()
     }
 }
 
-
-
-template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_negative()
-{
-
-    for (int d = 0; d < ITERATIONS; d++)
-    {
-        vFloat val = dst_reg[0];
-        dst_reg[0] = -val;
-        dst_reg++;
-    }
-}
-
 template <bool APPROXIMATION_MODE, int ITERATIONS>
 inline void calculate_add1()
 {
@@ -775,20 +761,5 @@ inline void calculate_silu()
     }
 }
 
-template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void calculate_mask()
-{
-    bool exponent_size_8 = true;
-    for (int d = 0; d < ITERATIONS; d++)
-    {
-        vFloat mask = dst_reg[32];
-        v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) {
-            dst_reg[0] = 0;
-        }
-        v_endif;
-        dst_reg++;
-    }
-}
-
 } // namespace sfpu
 } // namespace ckernel
diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/negative.h b/tt_metal/include/compute_kernel_api/eltwise_unary/negative.h
index bb65c153f49..b74823e4927 100644
--- a/tt_metal/include/compute_kernel_api/eltwise_unary/negative.h
+++ b/tt_metal/include/compute_kernel_api/eltwise_unary/negative.h
@@ -9,7 +9,7 @@
 
 #include "compute_kernel_api/common_globals.h"
 #ifdef TRISC_MATH
-#include "llk_math_eltwise_unary_sfpu.h"
+#include "llk_math_eltwise_unary_sfpu_negative.h"
 #define MAIN math_main()
 #define MATH(x) x
 #else
diff --git a/tt_metal/include/compute_kernel_api/mask.h b/tt_metal/include/compute_kernel_api/mask.h
index 9b8a75999f7..2d3f370a788 100644
--- a/tt_metal/include/compute_kernel_api/mask.h
+++ b/tt_metal/include/compute_kernel_api/mask.h
@@ -9,7 +9,7 @@
 
 #include "compute_kernel_api/common_globals.h"
 #ifdef TRISC_MATH
-#include "llk_math_eltwise_unary_sfpu.h"
+#include "llk_math_eltwise_unary_sfpu_mask.h"
 #define MAIN math_main()
 #define MATH(x) x
 #else
diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp
index 80efc1fe988..8a0285268e5 100644
--- a/tt_metal/jit_build/build.cpp
+++ b/tt_metal/jit_build/build.cpp
@@ -103,11 +103,14 @@ void JitBuildEnv::init(uint32_t device_id, tt::ARCH arch)
         "-I" + this->root_ + "tt_metal " +
         "-I" + this->root_ + "tt_metal/include " +
         "-I" + this->root_ + "tt_metal/hw/inc " +
+        "-I" + this->root_ + "tt_metal/hw/inc/debug " +
         "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + " " +
         "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + "/" + this->arch_name_ + "_defines " +
         "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + "/noc " +
         "-I" + this->root_ + "tt_metal/third_party/umd/device/" + this->arch_name_ + " " + // TODO(fixme)
-        "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/common/inc "; // TODO(fixme) datamovement fw shouldn't read this
+        "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/common/inc " + // TODO(fixme) datamovement fw shouldn't read this
+        "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/metal/common " +
+        "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/metal/llk_io ";
 
     this->lflags_ = common_flags;
     this->lflags_ += "-fno-exceptions -Wl,-z,max-page-size=16 -Wl,-z,common-page-size=16 -nostartfiles ";
@@ -160,7 +163,9 @@ JitBuildDataMovement::JitBuildDataMovement(const JitBuildEnv& env, int which, bo
         "-Os " +
         "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops
     this->includes_ = env_.includes_ +
-        "-I " + env_.root_ + "tt_metal/hw/firmware/src ";
+        "-I " + env_.root_ + "tt_metal/hw/firmware/src " +
+        "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " +
+        "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io ";
 
     this->defines_ = env_.defines_;
 
@@ -230,6 +235,10 @@ JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, int which, bool is_fw)
     this->includes_ = env_.includes_ +
         "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/inc " +
         "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/llk_lib " +
+        "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " +
+        "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io " +
+        "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_api " +
+        "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_api/llk_sfpu " +
         "-I" + env_.root_ + "tt_metal/third_party/sfpi/include " +
         "-I" + env_.root_ + "tt_metal/hw/firmware/src ";
 
@@ -238,7 +247,6 @@ JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, int which, bool is_fw)
         this->srcs_.push_back("tt_metal/hw/firmware/src/trisc.cc");
         this->srcs_.push_back("tt_metal/hw/toolchain/tmu-crt0.S");
     } else {
-        this->srcs_.push_back("tt_metal/hw/ckernels/" + env_.arch_name_ + "/common/src/ckernel_template.cc");
         this->srcs_.push_back("tt_metal/hw/firmware/src/trisck.cc");
         this->srcs_.push_back("tt_metal/hw/toolchain/tmu-crt0k.S");
     }
@@ -305,7 +313,9 @@ JitBuildEthernet::JitBuildEthernet(const JitBuildEnv& env, int which, bool is_fw
     }
 
     this->includes_ = env_.includes_ +
-        "-I " + env_.root_ + "tt_metal/hw/inc/ethernet ";
+        "-I " + env_.root_ + "tt_metal/hw/inc/ethernet " +
+        "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " +
+        "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io ";
 
     this->srcs_.push_back("tt_metal/hw/toolchain/substitutes.cpp");
     if (this->is_fw_) {

From 525692513edcc4b221d7ca096a309d054d453435 Mon Sep 17 00:00:00 2001
From: Reem Tawfik <rtawfik@tenstorrent.com>
Date: Tue, 12 Dec 2023 23:14:46 +0000
Subject: [PATCH 14/16] #3908: Remove perf kernels, add sfpu loop unroll

---
 .../llk_api/llk_sfpu/ckernel_sfpu_mask.h      |   1 +
 .../llk_api/llk_sfpu/ckernel_sfpu_negative.h  |   2 +-
 .../llk_math_eltwise_unary_sfpu_negative.h    |   3 +-
 .../wormhole_b0/common/inc/ckernel_perf_api.h | 162 ----------
 .../common/inc/ckernel_perf_include.h         |  32 --
 .../common/inc/ckernel_perf_math.h            | 283 ------------------
 .../common/inc/ckernel_perf_unpack_pack.h     | 162 ----------
 .../llk_api/llk_sfpu/ckernel_sfpu_mask.h      |   1 +
 .../llk_api/llk_sfpu/ckernel_sfpu_negative.h  |   1 +
 9 files changed, 5 insertions(+), 642 deletions(-)
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h
 delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h

diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
index 302cb97c934..2dcd2a5d63e 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
@@ -20,6 +20,7 @@ template <bool APPROXIMATION_MODE, int ITERATIONS=4>
 inline void calculate_mask()
 {
     bool exponent_size_8 = true;
+    #pragma GCC unroll 4
     for (int d = 0; d < ITERATIONS; d++)
     {
         vFloat mask = dst_reg[16];
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
index fd9cfef2da6..136877237ab 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
@@ -19,7 +19,7 @@ namespace sfpu {
 template <bool APPROXIMATION_MODE, int ITERATIONS=4>
 inline void calculate_negative()
 {
-
+    #pragma GCC unroll 4
     for (int d = 0; d < ITERATIONS; d++)
     {
         vFloat val = dst_reg[0];
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h
index 5badb47497b..fd6ca275adc 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h
@@ -21,9 +21,8 @@ inline void llk_math_eltwise_unary_sfpu_negative_init() {
 
 template <bool APPROXIMATE, DstSync Dst = DstSync::SyncFull>
 inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) {
-    constexpr int first_iterations = 1;
     llk_math_eltwise_unary_sfpu_0_param<APPROXIMATE, Dst>
-                            (ckernel::sfpu::calculate_negative<APPROXIMATE, first_iterations>,
+                            (ckernel::sfpu::calculate_negative<APPROXIMATE>,
                             ckernel::sfpu::calculate_negative<APPROXIMATE>,
                             dst_index, vector_mode);
 }
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h
deleted file mode 100644
index 9bfa79f6934..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h
+++ /dev/null
@@ -1,162 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <cstdint>
-#include <l1_address_map.h>
-#include "ckernel_include.h"
-#include "ckernel_globals.h"
-#include "ckernel.h"
-#include "tensix.h"
-#include "fw_debug.h"
-#include "epoch.h"
-
-#ifdef PERF_DUMP
-#include "perf_lib/scratch_api.h"
-#include "perf_res_decouple.h"
-#include "ckernel_perf_math.h"
-#include "ckernel_perf_unpack_pack.h"
-#endif
-
-#ifndef INTERMED_DUMP
-#define INTERMED_DUMP 0
-#endif
-
-#pragma GCC diagnostic ignored "-Wunused-function"
-
-// Comment in/out to enable perf scratch even logging
-
-namespace ckernel
-{
-extern uint32_t perf_index;
-extern uint32_t perf_end;
-// Perf-buffer are double buffered for spill_to_dram.
-// Ncrisc will move one half to dram while trisc populates the other half.
-// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0].
-extern volatile uint32_t *perf_buf_base[2];
-// Selects the half of perf_buffer that trisc is currently writing into.
-extern uint8_t perf_buf_base_id;
-extern bool record_perf_events;
-extern uint32_t perf_events_target_idx;
-extern uint16_t current_outer_loop_iter;
-extern uint8_t thread_id;
-extern bool first_unpack_recorded;
-
-inline void set_perf_dump_flag_for_input(int input_idx) {
-   #ifdef PERF_DUMP
-      TT_LLK_DUMP("set_perf_dump_flag_for_input({})", input_idx);
-      if (perf_events_target_inputs[perf_events_target_idx] == input_idx) {
-         record_perf_events = true;
-         perf_events_target_idx++;
-         if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) {
-            if (thread_id == 0 || thread_id == 2) {
-                  perf_end += num_events_per_input;
-                  // The buffer size available for each thread after double buffering is (l1_mem::address_map::TRISC_PERF_BUF_SIZE)/2.
-                  // Max number of events we can record in each half of the buffer will be that size divided by 4, since each event will be 4 bytes.
-                  if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) {
-                     perf_end = TRISC_PERF_BUF_SIZE >> 2;
-                  }
-            }
-         }
-         current_outer_loop_iter = input_idx;
-      } else {
-         record_perf_events = false;
-      }
-      first_unpack_recorded = false;
-   #endif
-}
-
-inline void record_pack_input_init_timestamp() {
-   #ifdef PERF_DUMP
-      TT_LLK_DUMP("record_pack_input_init_timestamp()");
-      if (record_perf_events) {
-         uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::PACK_EACH_INPUT, current_outer_loop_iter);
-         record_timestamp_64b(event_id);
-      }
-   #endif
-}
-
-void record_pack_input_end_timestamp() {
-   #ifdef PERF_DUMP
-      TT_LLK_DUMP("record_pack_input_end_timestamp()");
-      if (record_perf_events) {
-         uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::PACK_EACH_INPUT, current_outer_loop_iter);
-         record_timestamp_64b(event_id);
-         if (perf_events_target_idx == 1) {
-            uint32_t event_id_num_tiles_pack = perf::get_event_id(0, 0, perf::EventType::NUM_TILES_PACK, current_outer_loop_iter);
-            uint16_t num_tiles = regfile[p_gpr_pack::PERF_PACK_NUM_TILES] & 0xffff;
-            record_perf_value_and_check_overflow(event_id_num_tiles_pack, num_tiles, 0);
-         }
-      }
-   #endif
-}
-
-inline void perf_math_counter_start() {
-   #ifdef PERF_DUMP
-      TT_LLK_DUMP("perf_math_counter_start()");
-      if constexpr(SKIP_UNP) {
-         TTI_SETDVALID(p_setrwc::SET_A);
-         TTI_SETDVALID(p_setrwc::SET_B);
-      }
-      if (record_perf_events) {
-         // Due to a race condition that corrupts the write address of the fpu counters, reprogram them for every input
-         dbg_enable_dump_to_mem((uint32_t)&perf_buf_base[perf_buf_base_id][perf_index], (uint32_t)&perf_buf_base[perf_buf_base_id][perf_end]);
-         start_fpu_perf_cnt<true>();
-      }
-   #endif
-}
-
-inline void record_perf_math_counter() {
-   #ifdef PERF_DUMP
-      TT_LLK_DUMP("record_perf_math_counter()");
-      if constexpr(SKIP_UNP) {
-         TTI_CLEARDVALID(0x1, 0);
-         TTI_CLEARDVALID(0x2, 0);
-      }
-      if (record_perf_events) {
-         stop_fpu_perf_cnt<true, true>();
-         // record_fpu_perf_cnt_value();
-      }
-   #endif
-}
-
-void record_unpack_num_tiles() {
-   #ifdef PERF_DUMP
-      if (perf_events_target_idx == 1) {
-         for (uint8_t operand = 0; operand < PERF_MAX_NUM_INPUTS; operand++) {
-            uint regfile_base_idx = p_gpr_unpack::PERF_UNPACK_NUM_TILES_0;
-            regfile_base_idx += (operand >> 1);
-            bool upper = operand & 0b1;
-            uint16_t num_tiles;
-            if (upper) {
-               num_tiles = (regfile[regfile_base_idx] >> 16) & 0xffff;
-            } else {
-               num_tiles = regfile[regfile_base_idx] & 0xffff;
-            }
-            if (num_tiles != 0) {
-               uint32_t event_id_num_tiles_unpack = perf::get_event_id(operand, 0, perf::EventType::NUM_TILES_UNPACK, current_outer_loop_iter);
-               record_perf_value_and_check_overflow(event_id_num_tiles_unpack, num_tiles, 0);
-            }
-         }
-      }
-   #endif
-}
-
-void record_unpack_first_instruction_timestamp() {
-   #ifdef PERF_DUMP
-      TT_LLK_DUMP("record_unpack_first_instruction_timestamp()");
-      if (record_perf_events) {
-         uint32_t clock_lo = regfile[p_gpr_unpack::PERF_FIRST_UNP_LO];
-         uint32_t clock_hi = regfile[p_gpr_unpack::PERF_FIRST_UNP_HI];
-         uint32_t event_id_last_wait_tile = perf::get_event_id(0, 0, perf::EventType::UNPACK_FIRST_INSTRUCTION, current_outer_loop_iter);
-         record_perf_value_and_check_overflow(event_id_last_wait_tile, clock_lo, clock_hi);
-         if (perf_events_target_idx == 1) {
-            record_unpack_num_tiles();
-         }
-      }
-   #endif
-}
-
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h
deleted file mode 100644
index d9ff57a5403..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#ifdef PERF_DUMP
-#include <l1_address_map.h>
-
-#include "perf_events_target_inputs.h"
-#include "perf_lib/scratch_api.h"
-
-#ifndef INTERMED_DUMP
-#define INTERMED_DUMP 0
-#endif
-
-#ifndef PERF_DUMP_CONCURRENT
-#define PERF_DUMP_CONCURRENT 0
-#endif
-
-#pragma GCC diagnostic ignored "-Wunused-function"
-
-static constexpr uint32_t PERF_DUMP_END_SIGNAL = 0xbeeff00d;
-static constexpr uint32_t PERF_CNT_DUMP_ENTRY_SIZE = 16; // Entry size in bytes
-
-#if PERF_DUMP_LEVEL == 0
-static constexpr int32_t TRISC_PERF_BUF_SIZE = l1_mem::address_map::UNPACK_PACK_PERF_BUF_SIZE_LEVEL_0;
-#else
-static constexpr int32_t TRISC_PERF_BUF_SIZE = l1_mem::address_map::UNPACK_PACK_PERF_BUF_SIZE_LEVEL_1;
-#endif
-
-#endif
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h
deleted file mode 100644
index 812f5cc9884..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h
+++ /dev/null
@@ -1,283 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <cstdint>
-#include <l1_address_map.h>
-#include "ckernel_include.h"
-#include "ckernel_globals.h"
-#include "ckernel.h"
-#include "tensix.h"
-#include "fw_debug.h"
-#include "epoch.h"
-
-#include "ckernel_perf_include.h"
-
-#ifndef INTERMED_DUMP
-#define INTERMED_DUMP 0
-#endif
-
-#pragma GCC diagnostic ignored "-Wunused-function"
-
-// Comment in/out to enable perf scratch even logging
-
-namespace ckernel
-{
-extern uint32_t perf_index;
-extern uint32_t perf_end;
-// Perf-buffer are double buffered for spill_to_dram.
-// Ncrisc will move one half to dram while trisc populates the other half.
-// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0].
-extern volatile uint32_t *perf_buf_base[2];
-// Selects the half of perf_buffer that trisc is currently writing into.
-extern uint8_t perf_buf_base_id;
-extern uint16_t current_outer_loop_iter;
-extern uint8_t thread_id;
-extern uint32_t perf_events_target_idx;
-
-// In math thread, THCON dumps perf buffers in l1.
-// Therefore, incrementing the ncrisc perf_dram_buffer_req must be done by THCON as well.
-// Flipping the l1 perf start address must also be done by THCON for math thread.
-// Following variable keeps track of latest value of perf_dram_copy_req[1] from trisc perspective.
-// The actual value might be different, because the queued THCON updates for perf_dram_copy_req[1] might have yet not been executed.
-// We read this value initially for all threads to reduce the l1-reads.
-extern int32_t dram_dump_req_local;
-
-struct cperf_cnt_mode
-{
-    constexpr static uint32_t PERF_CNT_MODE_FREE = 0; // Free running period counter
-    constexpr static uint32_t PERF_CNT_MODE_STOP = 1; // Stop counter
-    constexpr static uint32_t PERF_CNT_MODE_WRAP = 2; // Wrap period counter
-};
-
-struct cperf_cnt_block_sel
-{
-    constexpr static uint32_t PERF_CNT_INSTR_THREAD = 0; // Select all instruction thread perf counters(includes TDMA)
-    constexpr static uint32_t PERF_CNT_FPU = 1; // Select FPU perf counters
-    constexpr static uint32_t PERF_CNT_L1  = 2; // Select L1 perf counters
-    constexpr static uint32_t PERF_CNT_ALL = 3; // Select all perf counters
-};
-
-struct cperf_dbg_daisy_id
-{
-    constexpr static uint32_t DEBUG_DAISY_INSTRN_THREAD = 1; // Thread specific perf counters
-    constexpr static uint32_t DEBUG_DAISY_INSTRN_ISSUE_0 = 4; // TDMA+math
-    constexpr static uint32_t DEBUG_DAISY_INSTRN_ISSUE_1 = 5; // math+instruction issue
-    constexpr static uint32_t DEBUG_DAISY_TENSIX  = 7; // FPU and L1 perf counters
-};
-
-struct cperf_dbg_dump_to_mem_mode
-{
-    constexpr static uint32_t DEBUG_MEM_MODE_MANUAL_WR = 0;
-    constexpr static uint32_t DEBUG_MEM_MODE_AUTO_WR = 1;
-    constexpr static uint32_t DEBUG_MEM_MODE_MANUAL_RD = 2;
-    constexpr static uint32_t DEBUG_MEM_MODE_AUTO_RD = 3;
-};
-
-inline void set_perf_cnt_params(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU, uint32_t ref_period=0xffffffff, uint32_t mode=cperf_cnt_mode::PERF_CNT_MODE_FREE) {
-  uint32_t perf_cnt_ref_period_reg;
-  switch (block_sel) {
-     case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD:     perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD0; break;
-     case cperf_cnt_block_sel::PERF_CNT_L1:              perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_L1_0; break;
-     default: perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_FPU0;
-  }
-  reg_write(perf_cnt_ref_period_reg, ref_period);
-  reg_write(perf_cnt_ref_period_reg+4, 0x00010100);
-}
-
-inline void stop_perf_cnt(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU) {
-  uint32_t perf_cnt_cntl_reg;
-  switch (block_sel) {
-     case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD2; break;
-     case cperf_cnt_block_sel::PERF_CNT_L1:           perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_L1_2; break;
-     case cperf_cnt_block_sel::PERF_CNT_ALL:          perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_ALL; break;
-     default: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_FPU2;
-  }
-  reg_write(perf_cnt_cntl_reg, 0x00000002);
-  reg_write(perf_cnt_cntl_reg, 0x00000000);
-}
-
-template <bool use_tensix=true, bool stall_on_math=false>
-inline void stop_fpu_perf_cnt() {
-   if (perf_events_target_idx <= 1) {
-      if constexpr (use_tensix) {
-            if constexpr (stall_on_math) {
-               TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH);
-            }
-            TTI_STOREREG(p_gpr_math::PERF_CNT_STOP, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff);
-            TTI_STOREREG(p_gpr::ZERO, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff);
-      } else {
-         reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000002);
-         reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000000);
-      }
-   }
-}
-
-inline void start_perf_cnt(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU) {
-  uint32_t perf_cnt_cntl_reg;
-  switch (block_sel) {
-     case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD2; break;
-     case cperf_cnt_block_sel::PERF_CNT_L1:           perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_L1_2; break;
-     case cperf_cnt_block_sel::PERF_CNT_ALL:          perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_ALL; break;
-     default: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_FPU2;
-  }
-  reg_write(perf_cnt_cntl_reg, 0x00000001);
-  reg_write(perf_cnt_cntl_reg, 0x00000000);
-}
-
-template <bool use_tensix=true>
-inline void start_fpu_perf_cnt() {
-   if (perf_events_target_idx <= 1) {
-      if constexpr (use_tensix) {
-            TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH);
-            TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::THCON);
-            TTI_STOREREG(p_gpr_math::PERF_CNT_START, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff);
-            TTI_STOREREG(p_gpr::ZERO, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff);
-      } else {
-         reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000001);
-         reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000000);
-      }
-   }
-}
-
-
-inline void sel_fpu_perf_cnt(uint32_t cnt_id) {
-   riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl;
-   dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG);;
-   dbg_bus_cntl.f.dbg_daisy_sel = cperf_dbg_daisy_id::DEBUG_DAISY_TENSIX;
-   dbg_bus_cntl.f.dbg_sig_sel   = 0x0;
-   dbg_bus_cntl.f.dbg_rd_sel    = cnt_id<<1; //rd_sel is aligned to 16-bit
-   reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val);
-}
-
-// Return value of the selected perf counter
-inline uint32_t get_perf_cnt() {
-   return reg_read(RISCV_DEBUG_REG_DBG_RD_DATA);
-}
-
-template <bool use_tensix=true>
-inline void dump_perf_cnt_to_mem() {
-   if constexpr (use_tensix) {
-      TTI_STOREREG(p_gpr_math::PERF_MEM_DUMP_CNTL_SET,   (RISCV_DEBUG_REG_DBG_L1_MEM_REG2>>2)&0x3ffff);
-      TTI_STOREREG(p_gpr_math::PERF_MEM_DUMP_CNTL_CLEAR, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2>>2)&0x3ffff);
-   } else {
-      riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2;
-      dbg_l1_mem_reg2.val = 0;
-      dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR;
-      dbg_l1_mem_reg2.f.mem_write = 1;
-      reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val);
-      dbg_l1_mem_reg2.f.mem_write = 0;
-      reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val);
-   }
-}
-
-inline void dbg_daisy_enable() {
-   riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl;
-   dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG);
-   dbg_bus_cntl.f.dbg_reg_ovrd_en = 0x1;
-   dbg_bus_cntl.f.dbg_daisy_en = 0x1;
-   reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val);
-}
-
-inline void dbg_daisy_disable() {
-   riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl;
-   dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG);
-   dbg_bus_cntl.f.dbg_reg_ovrd_en = 0x0;
-   dbg_bus_cntl.f.dbg_daisy_en = 0x0;
-   reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val);
-}
-
-inline void dbg_enable_dump_to_mem(uint32_t start_addr, uint32_t end_addr) {
-
-   TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH);
-   uint32_t start_addr_lo = (start_addr >> 4) & 0xffff;
-   uint32_t start_addr_hi = (start_addr >> 4) >> 16;
-   TT_SETDMAREG(0, start_addr_lo, 0, LO_16(p_gpr_math::TMP0));
-   TT_SETDMAREG(0, start_addr_hi, 0, HI_16(p_gpr_math::TMP0));
-   TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG0 >> 2) & 0x3ffff);
-
-   uint32_t end_addr_lo = (end_addr >> 4) & 0xffff;
-   uint32_t end_addr_hi = (end_addr >> 4) >> 16;
-   TT_SETDMAREG(0, end_addr_lo, 0, LO_16(p_gpr_math::TMP0));
-   TT_SETDMAREG(0, end_addr_hi, 0, HI_16(p_gpr_math::TMP0));
-   TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG1 >> 2) & 0x3ffff);
-
-   // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG0, start_addr>>4);
-   // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG1, end_addr>>4);
-   riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2;
-   dbg_l1_mem_reg2.val = 0;
-   dbg_l1_mem_reg2.f.mem_dump_mode = 0xf; //invalid and overriden below to trigger pulse needed to latch start address
-   dbg_l1_mem_reg2.f.skip_cycles = 0;
-
-   uint32_t debug_l1_reg2_lo = dbg_l1_mem_reg2.val & 0xffff;
-   uint32_t debug_l1_reg2_hi = (dbg_l1_mem_reg2.val >> 16) & 0xffff;
-   TT_SETDMAREG(0, debug_l1_reg2_lo, 0, LO_16(p_gpr_math::TMP0));
-   TT_SETDMAREG(0, debug_l1_reg2_hi, 0, HI_16(p_gpr_math::TMP0));
-   TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2 >> 2) & 0x3ffff);
-
-
-   // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val);
-   dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR; // This value must change in order to latch new start address!!!
-   // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val);
-
-   debug_l1_reg2_lo = dbg_l1_mem_reg2.val & 0xffff;
-   debug_l1_reg2_hi = (dbg_l1_mem_reg2.val >> 16) & 0xffff;
-   TT_SETDMAREG(0, debug_l1_reg2_lo, 0, LO_16(p_gpr_math::TMP0));
-   TT_SETDMAREG(0, debug_l1_reg2_hi, 0, HI_16(p_gpr_math::TMP0));
-   TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2 >> 2) & 0x3ffff);
-
-   TTI_STALLWAIT(p_stall::STALL_MATH, p_stall::THCON);
-}
-
-template <bool use_tensix=true>
-inline void record_fpu_perf_cnt_value() {
-   // if (perf_events_target_idx <= 1) {
-   //    // In l1 mode always reserve last event for PERF_DUMP_END_SIGNAL.
-   //    uint32_t reserve_space_for_trisc_end_signal = 1;
-   //    if (perf_index + 3 <= perf_end-reserve_space_for_trisc_end_signal) { // Last event is always set to a default.
-   //       //perf_buf_base[perf_index] = get_perf_cnt();
-   //       //perf_buf_base[perf_index + 1] = get_perf_cnt();
-   //       dump_perf_cnt_to_mem<use_tensix>(); //Dump 16B to L1
-   //       perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t));
-   //    }
-   // }
-}
-
-// Dump a dummy math event to get the initial fpu counter value.
-inline void record_dummy_math_event() {
-   if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) {
-      uint32_t reserve_space_for_trisc_end_signal = 1;
-      if (perf_index + 3 <= perf_end-reserve_space_for_trisc_end_signal) { // Last event is always set to a default.
-         perf_buf_base[perf_buf_base_id][perf_index] = 0;
-         perf_buf_base[perf_buf_base_id][perf_index+1] = 0;
-         perf_buf_base[perf_buf_base_id][perf_index+2] = 0;
-         perf_buf_base[perf_buf_base_id][perf_index+3] = 0;
-         perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t));
-      }
-   }
-}
-
-inline void setup_fpu_perf_cnt() {
-   // Only program perf counters for math thread (trisc1)
-   if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) {
-      set_perf_cnt_params(cperf_cnt_block_sel::PERF_CNT_FPU,0xffffffff,cperf_cnt_mode::PERF_CNT_MODE_FREE);
-      sel_fpu_perf_cnt(0);
-      dbg_daisy_enable();
-      dbg_enable_dump_to_mem((uint32_t)&perf_buf_base[0][PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)], (uint32_t)&perf_buf_base[0][perf_end]);
-
-      riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2;
-      dbg_l1_mem_reg2.val = 0;
-      dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR;
-      dbg_l1_mem_reg2.f.mem_write = 0;
-      regfile[p_gpr_math::PERF_MEM_DUMP_CNTL_CLEAR]=dbg_l1_mem_reg2.val;
-      dbg_l1_mem_reg2.f.mem_write = 1;
-      regfile[p_gpr_math::PERF_MEM_DUMP_CNTL_SET]=dbg_l1_mem_reg2.val;
-
-      regfile[p_gpr_math::PERF_CNT_START]=0x1;
-      regfile[p_gpr_math::PERF_CNT_STOP]=0x2;
-      sync_regfile_write(p_gpr_math::PERF_CNT_STOP);
-   }
-}
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h
deleted file mode 100644
index 9a2b21b4756..00000000000
--- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h
+++ /dev/null
@@ -1,162 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <cstdint>
-#include <l1_address_map.h>
-#include "ckernel_include.h"
-#include "ckernel_globals.h"
-#include "ckernel.h"
-#include "tensix.h"
-#include "fw_debug.h"
-#include "epoch.h"
-
-#include "ckernel_perf_include.h"
-
-#pragma GCC diagnostic ignored "-Wunused-function"
-
-// Comment in/out to enable perf scratch even logging
-
-namespace ckernel
-{
-extern uint32_t perf_index;
-extern uint32_t perf_end;
-// Perf-buffer are double buffered for spill_to_dram.
-// Ncrisc will move one half to dram while trisc populates the other half.
-// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0].
-extern volatile uint32_t *perf_buf_base[2];
-// Selects the half of perf_buffer that trisc is currently writing into.
-extern uint8_t perf_buf_base_id;
-extern uint8_t thread_id;
-
-// In math thread, THCON dumps perf buffers in l1.
-// Therefore, incrementing the ncrisc perf_dram_buffer_req must be done by THCON as well.
-// Flipping the l1 perf start address must also be done by THCON for math thread.
-// Following variable keeps track of latest value of perf_dram_copy_req[1] from trisc perspective.
-// The actual value might be different, because the queued THCON updates for perf_dram_copy_req[1] might have yet not been executed.
-// We read this value initially for all threads to reduce the l1-reads.
-extern int32_t dram_dump_req_local;
-extern bool record_perf_events;
-extern uint32_t perf_events_target_idx;
-extern bool first_unpack_recorded;
-extern volatile uint * ncrisc_ack_addr;
-extern uint16_t current_outer_loop_iter;
-#if OVERLAY_DECOUPLE == 1
-extern uint8_t overlay_output_decouple_mask;
-#endif
-
-void allocate_perf_buffer();
-
-// This function gets called when half-perf-buffer is full and need to switch.
-// Only used for threads 0 and 2.
-// For thread 1 a different function is used: switch_perf_buffers_for_math_thread
-// If ncrisc has not yet finished dumping the next half of perf-buffer, trisc will stall.
-// If is_perf_end_signal is true, we just need to write the PERF_DUMP_END_SIGNAL.
-// This function should only get executed in INTERMED_DUMP mode.
-void switch_perf_buffers();
-void last_trisc_perf_dump_to_dram();
-
-// The two following functions are separated to avoid inline recursive function calls.
-// TODO: Check the behaviour of the compiler if the two following functions were merged into a template function.
-inline void record_perf_value(uint32_t event_id, uint32_t event_value_lo_32b, uint32_t event_value_hi_32b) {
-   perf_buf_base[perf_buf_base_id][perf_index] = event_id;
-   perf_buf_base[perf_buf_base_id][perf_index + 1] = event_value_hi_32b;
-   perf_buf_base[perf_buf_base_id][perf_index + 2] = event_value_lo_32b;
-   perf_index += 3;
-}
-
-inline void record_perf_dump_end() {
-   if (perf_index < perf_end) {
-      perf_buf_base[perf_buf_base_id][perf_index] = PERF_DUMP_END_SIGNAL;
-      perf_index += 1;
-   }
-#if PERF_DUMP_CONCURRENT == 1
-   if (perf_index < perf_end) {
-      perf_buf_base[perf_buf_base_id][perf_end - 1] = PERF_DUMP_END_SIGNAL;
-   }
-#endif
-}
-
-inline void record_perf_value_and_check_overflow(uint32_t event_id, uint32_t event_value_lo_32b, uint32_t event_value_hi_32b, uint32_t leave_space = 0) {
-   // In l1 mode always reserve the last event for PERF_DUMP_END_SIGNAL.
-   int reserve_space_for_trisc_end_signal = 1;
-
-#if (INTERMED_DUMP == 1) || (PERF_DUMP_CONCURRENT == 1)
-   leave_space = 0;
-   reserve_space_for_trisc_end_signal = 0;
-   if (perf_index + 2 >= perf_end - reserve_space_for_trisc_end_signal - leave_space) {
-      switch_perf_buffers();
-   }
-   record_perf_value(event_id, event_value_lo_32b, event_value_hi_32b);
-#else
-   if (perf_index + 2 < perf_end - reserve_space_for_trisc_end_signal - leave_space) {
-      record_perf_value(event_id, event_value_lo_32b, event_value_hi_32b);
-   }
-#endif
-}
-
-inline void record_timestamp_64b(uint event_id, uint leave_space = 0) {
-   if (record_perf_events) {
-      uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
-      uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
-      record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, leave_space);
-   }
-}
-
-inline void record_perf_dump_end_and_check_overflow() {
-   if (thread_id == 1) {
-      uint32_t reserve_space_for_trisc_end_signal = 1;
-      if (perf_index + 3 <= perf_end-reserve_space_for_trisc_end_signal) { // Last event is always set to a default.
-         perf_buf_base[perf_buf_base_id][perf_index] = reg_read(0xFFB12000 + 0x120);
-         perf_buf_base[perf_buf_base_id][perf_index+1] = reg_read(0xFFB12000 + 0x124);
-         perf_buf_base[perf_buf_base_id][perf_index+2] = 0;
-         perf_buf_base[perf_buf_base_id][perf_index+3] = 0;
-         perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t));
-      }
-   }
-
-#if (INTERMED_DUMP == 1) || (PERF_DUMP_CONCURRENT == 1)
-   if (perf_index >= perf_end) {
-      switch_perf_buffers();
-   }
-   record_perf_dump_end();
-#else
-   if (perf_index < perf_end) {
-      record_perf_dump_end();
-   }
-#endif
-}
-
-inline void record_latest_wait_for_tile() {
-#if defined(PERF_DUMP)
-   if (!first_unpack_recorded) {
-      uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
-      uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H);
-      regfile[p_gpr_unpack::PERF_FIRST_UNP_LO] = timestamp_low & 0xffffffff;
-      sync_regfile_write(p_gpr_unpack::PERF_FIRST_UNP_LO);
-      regfile[p_gpr_unpack::PERF_FIRST_UNP_HI] = timestamp_high & 0xffffffff;
-      sync_regfile_write(p_gpr_unpack::PERF_FIRST_UNP_HI);
-   }
-#endif
-}
-
-void increment_unpack_tiles(uint operand_idx, uint num_tiles);
-void increment_pack_tiles(uint num_tiles);
-#if OVERLAY_DECOUPLE == 1
-inline uint32_t get_active_stream_idx(uint32_t stream_id) {
-    std::uint32_t active_stream_idx;
-    for (uint32_t active_streams_idx = 0; active_streams_idx < NOC_NUM_STREAMS; active_streams_idx++) {
-      if (stream_id == EPOCH_INFO_PTR->active_streams[active_streams_idx]->stream_id) {
-        active_stream_idx = active_streams_idx;
-        break;
-      }
-    }
-    return active_stream_idx;
-}
-
-void llk_push_all_packer_tiles_for_decoupling();
-#endif
-
-}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
index c0f73fb172e..f00bc07cbb3 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
@@ -20,6 +20,7 @@ template <bool APPROXIMATION_MODE, int ITERATIONS=8>
 inline void calculate_mask()
 {
     bool exponent_size_8 = true;
+    #pragma GCC unroll 8
     for (int d = 0; d < ITERATIONS; d++)
     {
         vFloat mask = dst_reg[32];
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
index 503843211e7..3af9e78c3d9 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h
@@ -19,6 +19,7 @@ namespace sfpu {
 template <bool APPROXIMATION_MODE, int ITERATIONS=8>
 inline void calculate_negative()
 {
+    #pragma GCC unroll 8
     for (int d = 0; d < ITERATIONS; d++)
     {
         vFloat val = dst_reg[0];

From 77cadf2addfec744531338d54892af260924ed62 Mon Sep 17 00:00:00 2001
From: Reem Tawfik <rtawfik@tenstorrent.com>
Date: Wed, 13 Dec 2023 00:41:36 +0000
Subject: [PATCH 15/16] #3908: Move common files under inc, instead of under
 <arch_name> folders

---
 .../ckernels/grayskull/common/inc/ckernel.h   |  2 -
 .../metal/common/metal_ckernel_globals.h      | 63 -------------------
 tt_metal/hw/firmware/src/brisc.cc             |  2 +-
 tt_metal/hw/firmware/src/brisck.cc            |  2 +-
 tt_metal/hw/firmware/src/ncrisc.cc            |  2 +-
 tt_metal/hw/firmware/src/ncrisck.cc           |  2 +-
 tt_metal/hw/firmware/src/trisc.cc             |  2 +-
 tt_metal/hw/firmware/src/trisck.cc            |  2 +-
 .../compile_time_args.h}                      |  0
 .../firmware_common.h}                        |  2 +-
 .../compute_kernel_api/common_globals.h       |  2 +-
 tt_metal/jit_build/build.cpp                  |  4 +-
 12 files changed, 10 insertions(+), 75 deletions(-)
 delete mode 100644 tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h
 rename tt_metal/hw/{ckernels/wormhole_b0/metal/common/metal_compile_time_args.h => inc/compile_time_args.h} (100%)
 rename tt_metal/hw/{ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h => inc/firmware_common.h} (98%)

diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
index 8bbf675af9e..1c2a86ef572 100644
--- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
+++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
@@ -42,8 +42,6 @@
 namespace ckernel
 {
 
-#define get_compile_time_arg_val(arg_idx) KERNEL_COMPILE_TIME_ARG_ ## arg_idx
-
 constexpr uint PACK_FLUSH_COUNTERS = // counters flush
     (1 << PACK_COUNTERS_SEC2_pack_per_xy_plane_SHAMT) |
     (1 << PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_SHAMT) |
diff --git a/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h
deleted file mode 100644
index 7800a9934d7..00000000000
--- a/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-//TODO: This file should be deleted after fixing redefinition errors,
-// functions should be moved to ckernel_globals.h
-#pragma once
-
-#include <cstdint>
-#include "ckernel_structs.h"
-#include "risc_attribs.h"
-#include "tensix_functions.h"
-#include "hostdevcommon/common_runtime_address_map.h"
-
-extern uint32_t __ldm_bss_start[];
-extern uint32_t __ldm_bss_end[];
-extern uint32_t __ldm_data_start[];
-extern uint32_t __ldm_data_end[];
-extern void (* __init_array_start[])();
-extern void (* __init_array_end[])();
-extern uint32_t __firmware_start[];
-
-extern void kernel_init();
-extern void kernel_launch();
-
-inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
-    // Cover L1 load latency of 6 cycles for the bulk of the copy
-    int32_t n = 0;
-    while (n < len - 5) {
-        uint32_t v0 = l1_addr[n + 0];
-        uint32_t v1 = l1_addr[n + 1];
-        uint32_t v2 = l1_addr[n + 2];
-        uint32_t v3 = l1_addr[n + 3];
-        uint32_t v4 = l1_addr[n + 4];
-        uint32_t v5 = l1_addr[n + 5];
-        local_mem_addr[n + 0] = v0;
-        local_mem_addr[n + 1] = v1;
-        local_mem_addr[n + 2] = v2;
-        local_mem_addr[n + 3] = v3;
-        local_mem_addr[n + 4] = v4;
-        local_mem_addr[n + 5] = v5;
-        n += 6;
-    }
-    // Could optimize this further (eg, loop of 2 or 4), probably not worth it
-    while (n < len) {
-        local_mem_addr[n] = l1_addr[n];
-        n++;
-    }
-}
-
-inline void firmware_kernel_common_init(void *init_local_l1_base) {
-
-    // Handle stuff typically done in crt0 in asm.  Easier to do in C
-    wzerorange(__ldm_bss_start, __ldm_bss_end);
-
-    int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
-    uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE;
-    l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words);
-
-    for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
-        (**fptr)();
-    }
-}
diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc
index 465c7b74d44..b7151c38589 100644
--- a/tt_metal/hw/firmware/src/brisc.cc
+++ b/tt_metal/hw/firmware/src/brisc.cc
@@ -17,7 +17,7 @@
 #include "c_tensix_core.h"
 #include "tdma_xmov.h"
 #include "noc_nonblocking_api.h"
-#include "metal_ckernel_globals.h"
+#include "firmware_common.h"
 #include "tools/profiler/kernel_profiler.hpp"
 #include "dev_msgs.h"
 #include "risc_attribs.h"
diff --git a/tt_metal/hw/firmware/src/brisck.cc b/tt_metal/hw/firmware/src/brisck.cc
index 06567a58a7d..bc6a252eefb 100644
--- a/tt_metal/hw/firmware/src/brisck.cc
+++ b/tt_metal/hw/firmware/src/brisck.cc
@@ -15,7 +15,7 @@
 #include "c_tensix_core.h"
 #include "tdma_xmov.h"
 #include "noc_nonblocking_api.h"
-#include "metal_ckernel_globals.h"
+#include "firmware_common.h"
 #include "tools/profiler/kernel_profiler.hpp"
 #include "dataflow_api.h"
 #include "noc_addr_ranges_gen.h"
diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc
index 6a96aa0fbb0..fe40b9f6eb5 100644
--- a/tt_metal/hw/firmware/src/ncrisc.cc
+++ b/tt_metal/hw/firmware/src/ncrisc.cc
@@ -7,7 +7,7 @@
 #include "noc_nonblocking_api.h"
 #include "dev_msgs.h"
 #include "stream_io_map.h"
-#include "metal_ckernel_globals.h"
+#include "firmware_common.h"
 #include "tools/profiler/kernel_profiler.hpp"
 #include "risc_attribs.h"
 #include "generated_bank_to_noc_coord_mapping.h"
diff --git a/tt_metal/hw/firmware/src/ncrisck.cc b/tt_metal/hw/firmware/src/ncrisck.cc
index 7a6d037733c..ef7f78d6ea8 100644
--- a/tt_metal/hw/firmware/src/ncrisck.cc
+++ b/tt_metal/hw/firmware/src/ncrisck.cc
@@ -9,7 +9,7 @@
 #ifdef PERF_DUMP
 #include "risc_perf.h"
 #endif
-#include "metal_ckernel_globals.h"
+#include "firmware_common.h"
 #include "tools/profiler/kernel_profiler.hpp"
 #include "dataflow_api.h"
 #include "tensix_functions.h"
diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc
index f1e0aad4b6f..78497e3e3e0 100644
--- a/tt_metal/hw/firmware/src/trisc.cc
+++ b/tt_metal/hw/firmware/src/trisc.cc
@@ -3,7 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "ckernel.h"
-#include "metal_ckernel_globals.h"
+#include "firmware_common.h"
 #include "risc_common.h"
 #include <tensix.h>
 #include "dev_msgs.h"
diff --git a/tt_metal/hw/firmware/src/trisck.cc b/tt_metal/hw/firmware/src/trisck.cc
index 0115db2f96f..9746b263a6a 100644
--- a/tt_metal/hw/firmware/src/trisck.cc
+++ b/tt_metal/hw/firmware/src/trisck.cc
@@ -9,7 +9,7 @@
 //      Need to make sure no other file includes these lists since it also include global parameter definitions
 // 2) instantiate global variables
 
-#include "metal_ckernel_globals.h"
+#include "firmware_common.h"
 
 #include "chlkc_list.h"
 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h b/tt_metal/hw/inc/compile_time_args.h
similarity index 100%
rename from tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h
rename to tt_metal/hw/inc/compile_time_args.h
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h b/tt_metal/hw/inc/firmware_common.h
similarity index 98%
rename from tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
rename to tt_metal/hw/inc/firmware_common.h
index cf08580ad69..d6b6b7b9d5f 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h
+++ b/tt_metal/hw/inc/firmware_common.h
@@ -9,8 +9,8 @@
 #include <cstdint>
 #include "ckernel_globals.h"
 #include "tensix_functions.h"
-#include "metal_compile_time_args.h"
 #include "risc_attribs.h"
+#include "compile_time_args.h"
 #include "hostdevcommon/common_runtime_address_map.h"
 #include "hostdevcommon/kernel_structs.h"
 
diff --git a/tt_metal/include/compute_kernel_api/common_globals.h b/tt_metal/include/compute_kernel_api/common_globals.h
index 0b0eee877dc..5c789bdc7d8 100644
--- a/tt_metal/include/compute_kernel_api/common_globals.h
+++ b/tt_metal/include/compute_kernel_api/common_globals.h
@@ -10,7 +10,7 @@
 
 #include "chlkc_list.h"
 #include "ckernel.h"
-#include "metal_ckernel_globals.h"
+#include "firmware_common.h"
 #include "ckernel_include.h"
 #include "hostdevcommon/kernel_structs.h"
 
diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp
index 8a0285268e5..04d4cfa12e4 100644
--- a/tt_metal/jit_build/build.cpp
+++ b/tt_metal/jit_build/build.cpp
@@ -165,7 +165,7 @@ JitBuildDataMovement::JitBuildDataMovement(const JitBuildEnv& env, int which, bo
     this->includes_ = env_.includes_ +
         "-I " + env_.root_ + "tt_metal/hw/firmware/src " +
         "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " +
-        "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io ";
+        "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io ";
 
     this->defines_ = env_.defines_;
 
@@ -315,7 +315,7 @@ JitBuildEthernet::JitBuildEthernet(const JitBuildEnv& env, int which, bool is_fw
     this->includes_ = env_.includes_ +
         "-I " + env_.root_ + "tt_metal/hw/inc/ethernet " +
         "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " +
-        "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io ";
+        "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io ";
 
     this->srcs_.push_back("tt_metal/hw/toolchain/substitutes.cpp");
     if (this->is_fw_) {

From 222653025afdc64a952d9792aa933199a7c07373 Mon Sep 17 00:00:00 2001
From: Reem Tawfik <rtawfik@tenstorrent.com>
Date: Wed, 13 Dec 2023 01:20:41 -0500
Subject: [PATCH 16/16] #3908: Clean up some magic values

---
 .../grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h     | 5 +++--
 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h    | 3 +--
 .../wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h   | 3 ++-
 tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h  | 3 +--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
index 2dcd2a5d63e..1bd2e70d97e 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
@@ -19,11 +19,12 @@ namespace sfpu {
 template <bool APPROXIMATION_MODE, int ITERATIONS=4>
 inline void calculate_mask()
 {
-    bool exponent_size_8 = true;
+    const bool exponent_size_8 = true;
+    const int mask_val_idx = 16;
     #pragma GCC unroll 4
     for (int d = 0; d < ITERATIONS; d++)
     {
-        vFloat mask = dst_reg[16];
+        vFloat mask = dst_reg[mask_val_idx];
         v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) {
             dst_reg[0] = 0;
         }
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h
index bd010082bbd..7558f53219a 100644
--- a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h
@@ -9,8 +9,7 @@
 // Metal specific overrides -- No support for partial tiles so hard-code to fixed 32x32 sizes
 inline uint32_t get_output_id(uint32_t output)
 {
-   const uint32_t OUTPUT_BASE    = 0;
-   return ((output) - OUTPUT_BASE);
+   return (output);
 }
 
 inline const uint32_t get_output_base_id()
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
index f00bc07cbb3..f2292c5281d 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h
@@ -19,7 +19,8 @@ namespace sfpu {
 template <bool APPROXIMATION_MODE, int ITERATIONS=8>
 inline void calculate_mask()
 {
-    bool exponent_size_8 = true;
+    const bool exponent_size_8 = true;
+    const int mask_val_idx = 32; // Wormhole B0 reads the mask operand from dst_reg[32] (Grayskull uses 16)
     #pragma GCC unroll 8
     for (int d = 0; d < ITERATIONS; d++)
     {
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
index 11d634c25e4..b92af5b8ddc 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h
@@ -9,8 +9,7 @@
 // Metal specific overrides -- No support for partial tiles so hard-code to fixed 32x32 sizes
 inline uint32_t get_output_id(uint32_t output)
 {
-   const uint32_t OUTPUT_BASE    = 0;
-   return ((output) - OUTPUT_BASE);
+   return (output);
 }
 
 inline const uint32_t get_output_base_id()