From 364fb1d9098fc1e9c5f5ec44b3c0bb41df7d878c Mon Sep 17 00:00:00 2001 From: Reem Tawfik Date: Wed, 6 Dec 2023 18:07:26 +0000 Subject: [PATCH] #0: Add empty llk api files for grayskull to fix compile --- .../grayskull/common/inc/chlkc_list.h | 4 +- .../ckernels/grayskull/common/inc/ckernel.h | 18 - .../grayskull/common/inc/ckernel_globals.h | 59 -- .../grayskull/common/inc/ckernel_template.h | 217 +++++ .../grayskull/common/inc/cpack_common.h | 10 - .../grayskull/common/inc/cunpack_common.h | 5 - .../grayskull/common/src/ckernel_template.cc | 229 ----- .../hw/ckernels/grayskull/llk_lib/llk_defs.h | 60 -- ..._math_eltwise_unary_sfpu_common_includes.h | 155 +++- .../hw/ckernels/grayskull/llk_lib/llk_pack.h | 2 +- .../ckernels/grayskull/llk_lib/llk_unpack_A.h | 3 + .../grayskull/llk_lib/llk_unpack_AB.h | 3 + .../grayskull/llk_lib/llk_unpack_AB_matmul.h | 3 + .../grayskull/llk_lib/llk_unpack_common.h | 3 + .../metal/common/metal_ckernel_globals.h | 63 ++ .../metal/llk_api/llk_math_binary_api.h | 86 ++ .../metal/llk_api/llk_math_binary_sfpu_api.h | 70 ++ .../metal/llk_api/llk_math_common_api.h | 108 +++ .../metal/llk_api/llk_math_matmul_api.h | 68 ++ .../metal/llk_api/llk_math_reduce_api.h | 28 + .../llk_api/llk_math_unary_datacopy_api.h | 36 + .../metal/llk_api/llk_math_unary_sfpu_api.h | 293 +++++++ .../grayskull/metal/llk_api/llk_op_info_api.h | 23 + .../grayskull/metal/llk_api/llk_pack_api.h | 308 +++++++ .../llk_api}/llk_param_structs.h | 0 ..._math_eltwise_unary_sfpu_common_includes.h | 171 ++++ .../llk_api/llk_sfpu/metal_ckernel_sfpu.h | 780 ++++++++++++++++++ .../grayskull/metal/llk_api/llk_sfpu_types.h | 63 ++ .../metal/llk_api/llk_unpack_AB_api.h | 85 ++ .../metal/llk_api/llk_unpack_AB_matmul_api.h | 136 +++ .../metal/llk_api/llk_unpack_A_api.h | 89 ++ .../metal/llk_api/llk_unpack_common_api.h | 137 +++ .../metal/llk_api/llk_unpack_reduce_api.h | 94 +++ .../metal/llk_api/llk_unpack_tilize_api.h | 99 +++ .../metal/llk_api/llk_unpack_untilize_api.h | 96 +++ 
.../ckernels/grayskull/metal/llk_io/llk_io.h | 10 + .../grayskull/metal/llk_io/llk_operands.h | 53 ++ .../grayskull/metal/llk_io/llk_outputs.h | 61 ++ .../wormhole_b0/common/inc/ckernel_globals.h | 1 - .../metal/common/metal_ckernel_globals.h | 2 + .../wormhole_b0/metal/llk_io/llk_operands.h | 3 +- tt_metal/include/compute_kernel_api/unpack.h | 114 +-- 42 files changed, 3402 insertions(+), 446 deletions(-) delete mode 100644 tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc create mode 100644 tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api}/llk_param_structs.h (100%) create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h create mode 100644 
tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h b/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h index 3d02d79f908..0a30e5f179b 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h @@ -14,20 +14,18 @@ using namespace ckernel; #ifdef UCK_CHLKC_MATH -// #include "chlkc_math_llk_args.h" +#include "chlkc_unpack_data_format.h" #include "chlkc_math_fidelity.h" #include "chlkc_math_approx_mode.h" #include "chlkc_math.cpp" #endif #ifdef UCK_CHLKC_PACK -// #include "chlkc_pack_llk_args.h" #include "chlkc_pack_data_format.h" #include "chlkc_pack.cpp" #endif #ifdef UCK_CHLKC_UNPACK -// #include "chlkc_unpack_llk_args.h" #include "chlkc_unpack_data_format.h" #include "chlkc_unpack.cpp" #endif diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h index b2de68e862a..88bb41af795 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h @@ -62,8 +62,6 @@ extern uint32_t dest_offset_id; extern uint32_t dbg_event_index; extern uint32_t dbg_event_end; -extern uint32_t op_info_offset; - // Internal scope to namespace methods only (C++ does not allow namespace private ownership) namespace internal { } @@ -281,22 +279,6 @@ 
inline void debug_dump(uint8_t *data, uint32_t byte_size) { // TODO(pk) re-implement } -inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) { - - uint32_t* op_info_ptr = reinterpret_cast(OP_INFO_BASE_ADDR + op_info_offset); - static constexpr uint32_t op_info_num_items = 7; - - volatile tt_l1_ptr uint32_t* op_info_struct_ptr = reinterpret_cast(&op_info_struct); - for (uint32_t i = 0; i < op_info_num_items; i++) { - op_info_struct_ptr[i] = op_info_ptr[i]; - } - op_info_offset += 28; - - if (op_info_offset == OP_INFO_SIZE) { - op_info_offset = 0; // In case we go out of bounds - } -} - inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b) { unsigned int r = 0; diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h index a98ae7577ae..445c77d1e0b 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h @@ -7,8 +7,6 @@ #include #include "ckernel_structs.h" #include "risc_attribs.h" -#include "tensix_functions.h" -#include "hostdevcommon/common_runtime_address_map.h" extern uint32_t cfg_state_id; extern uint32_t unp_cfg_context; @@ -16,62 +14,5 @@ extern uint32_t gl_alu_format_spec_reg; extern volatile uint32_t l1_buffer[16]; -//extern const int32_t unpack_src_format[24]; -//extern const int32_t unpack_dst_format[24]; -//extern const int32_t pack_src_format[16]; -//extern const int32_t pack_dst_format[16]; - extern uint32_t pack_sync_tile_dst_ptr; extern uint32_t math_sync_tile_dst_index; - -extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS]; - -extern uint32_t __ldm_bss_start[]; -extern uint32_t __ldm_bss_end[]; -extern uint32_t __ldm_data_start[]; -extern uint32_t __ldm_data_end[]; -extern void (* __init_array_start[])(); -extern void (* __init_array_end[])(); -extern uint32_t __firmware_start[]; - -extern void kernel_init(); -extern void kernel_launch(); - 
-inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) { - // Cover L1 load latency of 6 cycles for the bulk of the copy - int32_t n = 0; - while (n < len - 5) { - uint32_t v0 = l1_addr[n + 0]; - uint32_t v1 = l1_addr[n + 1]; - uint32_t v2 = l1_addr[n + 2]; - uint32_t v3 = l1_addr[n + 3]; - uint32_t v4 = l1_addr[n + 4]; - uint32_t v5 = l1_addr[n + 5]; - local_mem_addr[n + 0] = v0; - local_mem_addr[n + 1] = v1; - local_mem_addr[n + 2] = v2; - local_mem_addr[n + 3] = v3; - local_mem_addr[n + 4] = v4; - local_mem_addr[n + 5] = v5; - n += 6; - } - // Could optimize this further (eg, loop of 2 or 4), probably not worth it - while (n < len) { - local_mem_addr[n] = l1_addr[n]; - n++; - } -} - -inline void firmware_kernel_common_init(void *init_local_l1_base) { - - // Handle stuff typically done in crt0 in asm. Easier to do in C - wzerorange(__ldm_bss_start, __ldm_bss_end); - - int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2; - uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE; - l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words); - - for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) { - (**fptr)(); - } -} diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h index c8968d06577..ba1c08033b1 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h @@ -237,4 +237,221 @@ class ckernel_unpack_template void program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask = 0); // calls program, then run }; + ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op) + : m_outer_loop_len(outer_loop_len) + , m_inner_loop_len(inner_loop_len) + , m_loop_op0(loop_op) + , m_loop_op1(TT_OP_NOP) 
+ , m_end_op0(TT_OP_NOP) + , m_end_op1(TT_OP_NOP) + , m_start_op0(TT_OP_NOP) + { + m_loop0_last_instr = loop_op; + m_loop1_last_instr = loop_op; + } + + ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1) + : m_outer_loop_len(outer_loop_len) + , m_inner_loop_len(inner_loop_len) + , m_loop_op0(loop_op0) + , m_loop_op1(loop_op1) + , m_end_op0(TT_OP_NOP) + , m_end_op1(TT_OP_NOP) + , m_start_op0(TT_OP_NOP) + { + m_loop0_last_instr = loop_op1; + m_loop1_last_instr = loop_op1; + } + + void ckernel_template::set_loop_op0(uint loop_op) + { + m_loop_op0 = loop_op; + } + + void ckernel_template::set_loop_op1(uint loop_op) + { + m_loop_op1 = loop_op; + } + + void ckernel_template::set_end_ops(uint end_op0, uint end_op1) + { + m_end_op0 = end_op0; + m_end_op1 = end_op1; + } + + void ckernel_template::set_end_op(uint end_op0) + { + set_end_ops(end_op0, TT_OP_NOP); + } + + void ckernel_template::set_start_op(uint start_op0) + { + m_start_op0 = start_op0; + } + + void ckernel_template::set_last_inner_loop_instr(uint op) + { + m_loop1_last_instr = op; + } + + void ckernel_template::set_last_outer_loop_instr(uint op) + { + m_loop0_last_instr = op; + } + + void ckernel_template::program_and_run(volatile uint *instrn_buffer) + { + program(instrn_buffer); + run(instrn_buffer); + } + + void ckernel_template::run(volatile uint *instrn_buffer) + { + TTI_MOP(1, 0, 0); // run the double-loop template + } + + void ckernel_template::program(volatile uint *instrn_buffer) + { + volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); + + mop_sync(); // wait until previous mops have completed + + mop_cfg[0] = m_outer_loop_len; + mop_cfg[1] = m_inner_loop_len; + mop_cfg[2] = m_start_op0; + mop_cfg[3] = m_end_op0; + mop_cfg[4] = m_end_op1; + mop_cfg[5] = m_loop_op0; + mop_cfg[6] = m_loop_op1; + mop_cfg[7] = m_loop0_last_instr; + mop_cfg[8] = m_loop1_last_instr; + } + + void ckernel_unpack_template::program_and_run(volatile uint 
*instrn_buffer, const uint8_t count, const uint32_t zmask) + { + program(instrn_buffer); + run(instrn_buffer, count, zmask); + } + + void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) + { + FWASSERT("Unpack template only supports loops up to 128", count <= 128); + TT_MOP_CFG(zmask >> 16); // Set the top 16 bits of zmask - we could skip this for count <= 16 + TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template + } + + // Version without zmask, should be slightly faster by eliminating one instruction. + void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count) + { + FWASSERT("Unpack template only supports loops up to 128", count <= 128); + TT_MOP(0, count - 1, 0); // Run the template + } + + void ckernel_unpack_template::program(volatile uint *instrn_buffer) const + { + volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); + + mop_sync(); // wait until previous mops have completed + + mop_cfg[1] = m_unpackB | (m_unpack_halo << 1); + mop_cfg[2] = m_B_instr; + mop_cfg[3] = m_A0_instr; + mop_cfg[4] = m_A1_instr; + mop_cfg[5] = m_A2_instr; + mop_cfg[6] = m_A3_instr; + mop_cfg[7] = m_skipA_instr; + mop_cfg[8] = m_skipB_instr; + } + + ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr) + { + return ckernel_unpack_template(false, // src B + false, // halo + A_instr, 0, 0, 0, skipA_instr, 0, 0); + } + + ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr) + { + return ckernel_unpack_template(false, // src B + false, // halo + B_instr, 0, 0, 0, skipB_instr, 0, 0); + } + + ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr) + { + return ckernel_unpack_template(false, // src B + true, // halo + neginf ? 
DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0); + } + + ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(false, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0); + } + + ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(false, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0); + } + + ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(true, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? 
((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B); + } + + ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(true, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B); + } + + ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr, + + uint B_instr, uint skipB_instr) + { + return ckernel_unpack_template(true, // src B + false, // halo + A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr); + } + + ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){ + return ckernel_unpack_template::lA(instr0, skip0); + } + + ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){ + // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA. 
+ return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1); + } + } // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h b/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h index b3c32f94d0e..6d3a1b6fa7b 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h @@ -416,14 +416,4 @@ namespace ckernel::packer { dest_offset_id = 0; } - - inline uint32_t get_output_id(uint32_t output) - { - return ((output) - OUTPUT_BASE); - } - - inline constexpr uint32_t get_output_base_id() - { - return (OUTPUT_BASE_ID); - } } diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h b/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h index 99e4a2c892f..33a3c7f0a58 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h @@ -335,9 +335,4 @@ namespace ckernel::unpacker // Clear context ID //reset_config_context(); } - - inline uint32_t get_operand_id(uint32_t operand) - { - return operand; - } } diff --git a/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc b/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc deleted file mode 100644 index 238301e0566..00000000000 --- a/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc +++ /dev/null @@ -1,229 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "ckernel_template.h" -#include "debug/fw_debug.h" - -namespace ckernel -{ -extern volatile uint *cfg_regs; - -ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op) - : m_outer_loop_len(outer_loop_len) - , m_inner_loop_len(inner_loop_len) - , m_loop_op0(loop_op) - , m_loop_op1(TT_OP_NOP) - , m_end_op0(TT_OP_NOP) - , m_end_op1(TT_OP_NOP) - , m_start_op0(TT_OP_NOP) -{ - m_loop0_last_instr = loop_op; - m_loop1_last_instr = loop_op; -} - -ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1) - : m_outer_loop_len(outer_loop_len) - , m_inner_loop_len(inner_loop_len) - , m_loop_op0(loop_op0) - , m_loop_op1(loop_op1) - , m_end_op0(TT_OP_NOP) - , m_end_op1(TT_OP_NOP) - , m_start_op0(TT_OP_NOP) -{ - m_loop0_last_instr = loop_op1; - m_loop1_last_instr = loop_op1; -} - -void ckernel_template::set_loop_op0(uint loop_op) -{ - m_loop_op0 = loop_op; -} - -void ckernel_template::set_loop_op1(uint loop_op) -{ - m_loop_op1 = loop_op; -} - -void ckernel_template::set_end_ops(uint end_op0, uint end_op1) -{ - m_end_op0 = end_op0; - m_end_op1 = end_op1; -} - -void ckernel_template::set_end_op(uint end_op0) -{ - set_end_ops(end_op0, TT_OP_NOP); -} - -void ckernel_template::set_start_op(uint start_op0) -{ - m_start_op0 = start_op0; -} - -void ckernel_template::set_last_inner_loop_instr(uint op) -{ - m_loop1_last_instr = op; -} - -void ckernel_template::set_last_outer_loop_instr(uint op) -{ - m_loop0_last_instr = op; -} - -void ckernel_template::program_and_run(volatile uint *instrn_buffer) -{ - program(instrn_buffer); - run(instrn_buffer); -} - -void ckernel_template::run(volatile uint *instrn_buffer) -{ - TTI_MOP(1, 0, 0); // run the double-loop template -} - -void ckernel_template::program(volatile uint *instrn_buffer) -{ - volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); - - mop_sync(); // wait until previous mops have completed 
- - mop_cfg[0] = m_outer_loop_len; - mop_cfg[1] = m_inner_loop_len; - mop_cfg[2] = m_start_op0; - mop_cfg[3] = m_end_op0; - mop_cfg[4] = m_end_op1; - mop_cfg[5] = m_loop_op0; - mop_cfg[6] = m_loop_op1; - mop_cfg[7] = m_loop0_last_instr; - mop_cfg[8] = m_loop1_last_instr; -} - -void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) -{ - program(instrn_buffer); - run(instrn_buffer, count, zmask); -} - -void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) -{ - FWASSERT("Unpack template only supports loops up to 128", count <= 128); - TT_MOP_CFG(zmask >> 16); // Set the top 16 bits of zmask - we could skip this for count <= 16 - TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template -} - -// Version without zmask, should be slightly faster by eliminating one instruction. -void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count) -{ - FWASSERT("Unpack template only supports loops up to 128", count <= 128); - TT_MOP(0, count - 1, 0); // Run the template -} - -void ckernel_unpack_template::program(volatile uint *instrn_buffer) const -{ - volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); - - mop_sync(); // wait until previous mops have completed - - mop_cfg[1] = m_unpackB | (m_unpack_halo << 1); - mop_cfg[2] = m_B_instr; - mop_cfg[3] = m_A0_instr; - mop_cfg[4] = m_A1_instr; - mop_cfg[5] = m_A2_instr; - mop_cfg[6] = m_A3_instr; - mop_cfg[7] = m_skipA_instr; - mop_cfg[8] = m_skipB_instr; -} - -ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr) -{ - return ckernel_unpack_template(false, // src B - false, // halo - A_instr, 0, 0, 0, skipA_instr, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr) -{ - return ckernel_unpack_template(false, // src B - false, // halo - B_instr, 0, 0, 0, skipB_instr, 0, 0); -} - -ckernel_unpack_template 
ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr) -{ - return ckernel_unpack_template(false, // src B - true, // halo - neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; - - return ckernel_unpack_template(false, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; - - return ckernel_unpack_template(false, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 
0x4 : 0; - - return ckernel_unpack_template(true, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B); -} - -ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; - - return ckernel_unpack_template(true, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B); -} - -ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr, - - uint B_instr, uint skipB_instr) -{ - return ckernel_unpack_template(true, // src B - false, // halo - A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr); -} - -ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){ - return ckernel_unpack_template::lA(instr0, skip0); -} - -ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){ - // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA. 
- return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1); -} - -} // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h index 4736137bb9e..2c28acf94e0 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h @@ -90,66 +90,6 @@ enum ReluType { MAX_THRESHOLD_RELU }; -enum SfpuType -{ - tanh, - hardtanh, - gelu, - exponential, - exp_with_base, - sigmoid, - sigmoid_appx, - reciprocal, - sqrt, - rsqrt, - lrelu, - power, - square, - tanh_derivative, - log, - log_with_base, - equal_zero, - not_equal_zero, - less_than_zero, - greater_than_equal_zero, - less_than_equal_zero, - greater_than_zero, - clamp, - gelu_derivative, - dropout, - abs, - sign, - max, - min, - sine, - cosine, - tan, - relu_min, - relu_max, - elu, - exp2, - heaviside, - expm1, - signbit, - asin, - acos, - atan, - erf, - erfc, - isfinite, - isinf, - isposinf, - isneginf, - isnan, - logical_not_unary, - erfinv, - i0, - silu, - mask, - unused -}; - - enum SfpiTestType { logical_not, diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h index 822699707d1..9ee2ba08a21 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h @@ -5,12 +5,165 @@ #pragma once #include +#include "llk_sfpu_types.h" #include "ckernel_globals.h" #include "ckernel_include.h" #include "ckernel_template.h" +#include "metal_ckernel_sfpu.h" #include "cmath_common.h" #include "llk_format_conversions.h" #include "llk_math_common.h" #include "llk_param_structs.h" +#include "llk_math_eltwise_unary_sfpu.h" -using namespace ckernel; +// using namespace ckernel; +// using namespace ckernel::sfpu; +// namespace ckernel { + +// 
/************************************************************************* +// * LLK ELTWISE UNARY SFPU +// *************************************************************************/ + +// template < +// SfpuType operation, +// bool APPROXIMATION_MODE, +// int SfpuType_PARAM = 0, +// int ITERATIONS = 8, +// bool IS_INT_SFPU_EN = false> +// inline void llk_math_calculate_sfpu( +// const int iterations = ITERATIONS, +// uint param0 = 0, +// uint param1 = 0, +// uint param2 = 0, +// uint param3 = 0, +// uint param4 = 0, +// uint param5 = 0) { +// if constexpr (operation == SfpuType::exp_with_base) { +// constexpr bool zero_negative = true; +// _calculate_exponential_(iterations, param0); +// } else if constexpr (operation == SfpuType::tanh) { +// _calculate_tanh_(iterations); +// } else if constexpr (operation == SfpuType::hardtanh) { +// _calculate_hardtanh_(iterations, param0, param1, param2); +// } else if constexpr (operation == SfpuType::rsqrt) { +// // param0 = true -> approximate fast mode +// // false -> high precision mode +// // The algorithm uses Newton's method based on no.of iteration better approximation can be calculated +// if (param0) { +// calculate_rsqrt(); +// } else { +// calculate_rsqrt(); +// } +// } else if constexpr (operation == SfpuType::sigmoid) { +// calculate_sigmoid(); +// } else if constexpr (operation == SfpuType::sigmoid_appx) { +// calculate_sigmoid_appx(); +// } else if constexpr (operation == SfpuType::tanh_derivative) { +// calculate_tanh_derivative(); +// } else if constexpr (operation == SfpuType::dropout) { +// calculate_dropout(param0, param1); +// } else if constexpr (operation == SfpuType::power) { +// calculate_power_iterative(param0); +// } else if constexpr (operation == SfpuType::square) { +// calculate_square(); +// } else if constexpr (operation == SfpuType::log) { +// calculate_log(param0); +// } else if constexpr (operation == SfpuType::log_with_base) { +// calculate_log(param0); +// } else if constexpr ( +// 
(operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) || +// (operation == SfpuType::less_than_zero) || (operation == SfpuType::greater_than_equal_zero) || +// (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) { +// calculate_comp(8); // BFLOAT16 - exp +// } else if constexpr (operation == SfpuType::clamp) { +// calculate_clamp(param0, param1, param2); +// } else if constexpr (operation == SfpuType::abs) { +// calculate_abs(); +// } else if constexpr (operation == SfpuType::sign) { +// calculate_sign(); +// } else if constexpr (operation == SfpuType::max) { +// calculate_max(); +// } else if constexpr (operation == SfpuType::min) { +// calculate_min(); +// } else if constexpr (operation == SfpuType::exp2) { +// calculate_exp2(); +// } else if constexpr (operation == SfpuType::heaviside) { +// calculate_heaviside(param0); +// } else if constexpr (operation == SfpuType::expm1) { +// calculate_expm1(); +// } else if constexpr (operation == SfpuType::asin) { +// calculate_asin(); +// } else if constexpr (operation == SfpuType::acos) { +// calculate_acos(); +// } else if constexpr (operation == SfpuType::atan) { +// calculate_atan(); +// } else if constexpr (operation == SfpuType::signbit) { +// calculate_signbit(); +// } else if constexpr (operation == SfpuType::silu) { +// calculate_silu(); +// } +// } + +// template +// inline void llk_math_eltwise_unary_sfpu( +// uint dst_index, +// int vector_mode = (int)Dim::RC, +// uint param0 = 0, +// uint param1 = 0, +// uint param2 = 0, +// uint param3 = 0, +// uint param4 = 0, +// uint param5 = 0) { +// const std::uint32_t operand_id = get_operand_id(0); // Fix to operand 0. 
assume no tiny-tile support +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + +// constexpr int ITERATIONS = 8; + +// _llk_math_eltwise_unary_sfpu_start_(dst_index); + +// if (vector_mode == (int)Dim::R) { +// // Do a row vector, Face0 + Face1 -- first iteration (first row) +// const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 2 : face_r_dim / 2) +// : 2; // At least 2 iterations for odd and even columns +// #pragma GCC unroll 0 +// for (int face = 0; face < 2; face++) { +// llk_math_calculate_sfpu( +// iterations, param0, param1, param2, param3, param4, param5); +// // Move to the next face +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// // Skip next two faces +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } else if (vector_mode == (int)Dim::C) { +// // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for +// // full face +// #pragma GCC unroll 0 +// for (int face = 0; face < 2; face++) { +// llk_math_calculate_sfpu( +// ITERATIONS, param0, param1, param2, param3, param4, param5); +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// if (num_faces > 2) { // Skip next face if tile is 32x32 +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// } +// if (num_faces <= 2) { +// // Skip next two faces +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// } else { +// // Do all four faces, and iterate through all 4 blocks of 4 rows each +// #pragma GCC unroll 0 +// for (int face = 0; face < 4; face++) { +// llk_math_calculate_sfpu( +// ITERATIONS, param0, param1, param2, param3, param4, param5); +// // Move to the next face +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// } +// _llk_math_eltwise_unary_sfpu_done_(); +// } + +// } // 
namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h index a6f0e32f133..97c0b3d4909 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h @@ -149,7 +149,7 @@ inline void llk_pack_init() { template inline void llk_matmul_pack(std::uint32_t start_tile_index, std::uint32_t output, uint32_t ntiles, std::uint32_t output_tile_index = 0) { std::uint8_t output_id = get_output_id(output); - constexpr std::uint8_t OUTPUT_BASE_ID = (std::uint8_t) get_output_base_id(); + const std::uint8_t OUTPUT_BASE_ID = (std::uint8_t) get_output_base_id(); static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!"); diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h index 3f1e76ea0d1..03848f843e5 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h @@ -6,6 +6,9 @@ #include "llk_io_unpack.h" #include "llk_param_structs.h" +//TODO: Remove with GS uplift +#include "llk_operands.h" + #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h index ee9127628ce..307e94b25ef 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h @@ -6,6 +6,9 @@ #include "llk_io_unpack.h" #include "llk_param_structs.h" +//TODO: Remove with GS uplift +#include "llk_operands.h" + #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h index b438a9715c5..78a28594917 100644 --- 
a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h @@ -7,6 +7,9 @@ #include "llk_io_unpack.h" #include "llk_param_structs.h" +//TODO: Remove with GS uplift +#include "llk_operands.h" + #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h index 3269aab1192..2c46633312f 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h @@ -11,6 +11,9 @@ #include "llk_param_structs.h" #include "llk_io_unpack.h" +//TODO: Remove with GS uplift +#include "llk_operands.h" + #ifdef PERF_DUMP #include "ckernel_perf_api.h" #endif diff --git a/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h new file mode 100644 index 00000000000..7800a9934d7 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +//TODO: This file should be deleted after fixing redefinition errors, +// functions should be moved to ckernel_globals.h +#pragma once + +#include <cstdint> +#include "ckernel_structs.h" +#include "risc_attribs.h" +#include "tensix_functions.h" +#include "hostdevcommon/common_runtime_address_map.h" + +extern uint32_t __ldm_bss_start[]; +extern uint32_t __ldm_bss_end[]; +extern uint32_t __ldm_data_start[]; +extern uint32_t __ldm_data_end[]; +extern void (* __init_array_start[])(); +extern void (* __init_array_end[])(); +extern uint32_t __firmware_start[]; + +extern void kernel_init(); +extern void kernel_launch(); + +inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) { + // Cover L1 load latency of 6 cycles for the bulk of the copy + int32_t n = 0; + while (n < len - 5) { + uint32_t v0 = l1_addr[n + 0]; + uint32_t v1 = l1_addr[n + 1]; + uint32_t v2 = l1_addr[n + 2]; + uint32_t v3 = l1_addr[n + 3]; + uint32_t v4 = l1_addr[n + 4]; + uint32_t v5 = l1_addr[n + 5]; + local_mem_addr[n + 0] = v0; + local_mem_addr[n + 1] = v1; + local_mem_addr[n + 2] = v2; + local_mem_addr[n + 3] = v3; + local_mem_addr[n + 4] = v4; + local_mem_addr[n + 5] = v5; + n += 6; + } + // Could optimize this further (eg, loop of 2 or 4), probably not worth it + while (n < len) { + local_mem_addr[n] = l1_addr[n]; + n++; + } +} + +inline void firmware_kernel_common_init(void *init_local_l1_base) { + + // Handle stuff typically done in crt0 in asm.
Easier to do in C + wzerorange(__ldm_bss_start, __ldm_bss_end); + + int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2; + uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE; + l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words); + + for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) { + (**fptr)(); + } +} diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h new file mode 100644 index 00000000000..317c14707ca --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h @@ -0,0 +1,86 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_eltwise_binary.h" + +// /************************************************************************* +// * LLK ELTWISE BINARY +// *************************************************************************/ + +// // Version with no operand +// template < +// EltwiseBinaryType eltwise_binary_type, +// BroadcastType src_b_bcast_type, +// int NUM_FIDELITY_PHASES = 0, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> +// inline void llk_math_eltwise_binary_init(const std::uint32_t transpose = 0, const std::uint32_t acc_to_dest = 0) { +// const std::uint32_t num_faces = 4; + +// _llk_math_eltwise_binary_init_( +// num_faces, transpose, acc_to_dest); +// } + +// // Version with operands +// template < +// EltwiseBinaryType eltwise_binary_type, +// BroadcastType src_b_bcast_type, +// int NUM_FIDELITY_PHASES = 0, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> +// inline void llk_math_eltwise_binary_init_with_operands( +// const std::uint32_t operand_A, +// const std::uint32_t operand_B, +// const std::uint32_t transpose = 
0, +// const std::uint32_t acc_to_dest = 0) { +// const std::uint32_t operand_id = +// get_operand_id(operand_A); // operand_id is used to extract tile dim data which is the same for both operands +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); + +// _llk_math_eltwise_binary_init_( +// num_faces, transpose, acc_to_dest); +// } + +// template < +// EltwiseBinaryType eltwise_binary_type, +// BroadcastType src_b_bcast_type, +// DstSync Dst = DstSync::SyncFull, +// int NUM_FIDELITY_PHASES = 0, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, +// bool is_fp32_dest_acc_en = false> +// inline void llk_math_eltwise_binary(uint dst_index, const bool clear_fp32_dst_acc = true) { +// const std::uint32_t num_faces = 4; + +// _llk_math_eltwise_binary_< +// eltwise_binary_type, +// src_b_bcast_type, +// Dst, +// NUM_FIDELITY_PHASES, +// binary_reuse_dest, +// is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc); +// } + +// template < +// EltwiseBinaryType eltwise_binary_type, +// BroadcastType src_b_bcast_type, +// DstSync Dst = DstSync::SyncFull, +// int NUM_FIDELITY_PHASES = 0, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, +// bool is_fp32_dest_acc_en = false> +// inline void llk_math_eltwise_binary( +// const std::uint32_t operand_A, +// const std::uint32_t operand_B, +// uint dst_index, +// const bool clear_fp32_dst_acc = true) { +// const std::uint32_t operand_id = get_operand_id(operand_A); // both operands must have same number of faces +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); + +// _llk_math_eltwise_binary_< +// eltwise_binary_type, +// src_b_bcast_type, +// Dst, +// NUM_FIDELITY_PHASES, +// binary_reuse_dest, +// is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h new file mode 100644 index 00000000000..21c3e8ae428 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_eltwise_binary_sfpu.h" + +// /************************************************************************* +// * LLK ELTWISE BINARY SFPU +// *************************************************************************/ + +// template +// inline void llk_math_eltwise_binary_sfpu( +// const uint operand, +// uint dst_index_a, +// uint dst_index_b, +// int vector_mode = (int)Dim::RC, +// uint param0 = 0, +// uint param1 = 0, +// uint param2 = 0, +// uint param3 = 0, +// uint param4 = 0, +// uint param5 = 0) { +// const std::uint32_t operand_id = get_operand_id(0); +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + +// _llk_math_eltwise_binary_sfpu_( +// face_r_dim, num_faces, dst_index_a, dst_index_b, vector_mode, param0, param1, param2, param3, param4, param5); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_init( +// uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { +// _llk_math_eltwise_binary_sfpu_init_(param0, param1, param2, param3, param4, param5); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_quant_int32( +// uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { +// llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_quant_int32_init(const uint zero_point) { +// llk_math_eltwise_binary_sfpu_init(zero_point); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_requant_int32( +// 
uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { +// llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_requant_int32_init(const uint zero_point) { +// llk_math_eltwise_binary_sfpu_init(zero_point); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_dequant_int32( +// uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { +// llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_dequant_int32_init(const uint zero_point) { +// llk_math_eltwise_binary_sfpu_init(zero_point); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h new file mode 100644 index 00000000000..3da220f0cba --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "ckernel.h" +#include "ckernel_defs.h" +#include "ckernel_globals.h" +#include "ckernel_template.h" +#include "cmath_common.h" +#include "llk_defs.h" +#include "llk_io.h" +#include "llk_math_common.h" +#include "llk_operands.h" +#include "llk_param_structs.h" + +// // Need to revisit why we even need this +// #define EPS 1.19209e-07 // std::numeric_limits::epsilon() for FP32 + +// /************************************************************************* +// * LLK MATH COMMON +// *************************************************************************/ + +// template +// inline void llk_math_wait_for_dest_available() { +// _llk_math_wait_for_dest_available_(); +// } + +// template +// inline void llk_math_dest_section_done() { +// _llk_math_dest_section_done_(); +// } + +// template +// inline void llk_math_pack_sync_init() { +// _llk_math_pack_sync_init_(); +// } + +// template +// inline void llk_math_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) { +// _llk_math_get_tile_(tile_index, p_tile); +// } + +// template +// inline void llk_math_release_tile(std::uint32_t operand) { +// _llk_math_release_tile_(); +// } + +// inline void llk_math_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { _llk_math_debug_dump_(data, byte_size); } + +// inline void llk_math_debug_dump_seek(std::uint8_t offset) { _llk_math_debug_dump_seek_(offset); } + +// inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_new_operand) { +// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); +// _llk_math_reconfig_data_format_srca_(unpack_dst_format[new_srca_operand_id]); +// } + +// inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) { +// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); +// _llk_math_reconfig_data_format_srcb_(unpack_dst_format[new_srcb_operand_id]); +// } + +// inline void 
llk_math_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) { +// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); +// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + +// _llk_math_reconfig_data_format_(unpack_dst_format[new_srca_operand_id], unpack_dst_format[new_srcb_operand_id]); +// } + +// inline void llk_math_reconfig_data_format( +// const std::uint32_t srca_old_operand, +// const std::uint32_t srca_new_operand, +// const std::uint32_t srcb_old_operand, +// const std::uint32_t srcb_new_operand) { +// std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); +// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); +// std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); +// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + +// if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id]) && +// (unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { +// llk_math_reconfig_data_format(srca_new_operand, srcb_new_operand); +// } else if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) { +// llk_math_reconfig_data_format_srca(srca_new_operand); +// } else if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { +// llk_math_reconfig_data_format_srcb(srcb_new_operand); +// } +// } + +// inline void llk_math_reconfig_data_format_srca( +// const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) { +// std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); +// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); + +// if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) { +// llk_math_reconfig_data_format_srca(srca_new_operand); +// } +// } + +// inline void llk_math_reconfig_data_format_srcb( +// const 
std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { +// std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); +// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + +// if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { +// llk_math_reconfig_data_format_srcb(srcb_new_operand); +// } +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h new file mode 100644 index 00000000000..a12bcca1ef4 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_matmul.h" + +// /************************************************************************* +// * LLK MATMUL +// *************************************************************************/ + +// template +// inline void llk_math_matmul_init( +// const std::uint32_t operandA, +// const std::uint32_t operandB, +// const std::uint32_t transpose = 0, +// const std::uint32_t ct_dim = 1, +// const std::uint32_t rt_dim = 1, +// const std::uint32_t kt_dim = 1) { +// const std::uint32_t in0_id = get_operand_id(operandA); +// const std::uint32_t in1_id = get_operand_id(operandB); + +// const bool partial_face = get_operand_partial_face(in0_id); + +// const std::uint32_t in0_tile_r_dim = get_operand_tile_r_dim(in0_id); +// const std::uint32_t in0_tile_c_dim = get_operand_tile_c_dim(in0_id); +// const std::uint32_t in1_tile_r_dim = get_operand_tile_r_dim(in1_id); +// const std::uint32_t in1_tile_c_dim = get_operand_tile_c_dim(in1_id); + +// #ifdef ARCH_GRAYSKULL +// _llk_math_matmul_init_( +// in0_tile_r_dim, +// in0_tile_c_dim, +// in1_tile_r_dim, +// in1_tile_c_dim, +// partial_face, +// transpose, +// ct_dim, +// rt_dim, +// kt_dim); 
+// #else +// _llk_math_matmul_init_( +// in0_tile_r_dim, +// in0_tile_c_dim, +// in1_tile_r_dim, +// in1_tile_c_dim, +// partial_face, +// transpose, +// ct_dim, +// rt_dim, +// kt_dim); +// #endif +// } + +// template +// inline void llk_math_matmul( +// uint dst_index, +// const bool transpose = false, +// const std::uint32_t ct_dim = 1, +// const std::uint32_t rt_dim = 1, +// const std::uint32_t kt_dim = 1) { +// #ifdef ARCH_GRAYSKULL +// _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); +// #else +// _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); +// #endif +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h new file mode 100644 index 00000000000..c5f11d005f2 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_reduce.h" + +// /************************************************************************* +// * LLK REDUCE +// *************************************************************************/ + +// template < +// PoolType type, +// ReduceDim dim, +// int num_fidelity_phases = 0, +// bool is_fp32_dest_acc_en = false, +// bool is_int_fpu_en = false> +// inline void llk_math_reduce(const uint dst_index) { +// _llk_math_reduce_(dst_index); +// } + +// template +// inline void llk_math_reduce_init( +// const std::uint32_t within_face_16x16_transpose = +// 0) { // within_face_16x16_transpose used for unpack, ignored by math +// _llk_math_reduce_init_(within_face_16x16_transpose); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h new file mode 100644 index 00000000000..ca2a5d39e40 --- /dev/null +++ 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_common_api.h" +#include "llk_math_eltwise_unary_datacopy.h" + +// /************************************************************************* +// * LLK ELTWISE UNARY DATACOPY +// *************************************************************************/ + +// template < +// DataCopyType type, +// BroadcastType src_b_bcast_type = BroadcastType::NONE, +// DstSync Dst = DstSync::SyncFull, +// bool is_fp32_dest_acc_en = false, +// bool unpack_to_dest = false> +// inline void llk_math_eltwise_unary_datacopy(uint dst_index, uint operand = 0) { +// const std::uint32_t operand_id = get_operand_id(0); +// _llk_math_eltwise_unary_datacopy_( +// dst_index, unpack_src_format[operand_id], unpack_dst_format[operand_id]); +// } + +// template +// // within_face_16x16_transpose is used by unpacker, math does not transpose +// inline void llk_math_eltwise_unary_datacopy_init( +// const std::uint32_t transpose_of_faces = 0 /*unused*/, +// const std::uint32_t within_face_16x16_transpose = 0 /* unused */, +// const std::uint32_t operand = 0) { +// const std::uint32_t operand_id = get_operand_id(0); +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// _llk_math_eltwise_unary_datacopy_init_( +// transpose_of_faces, within_face_16x16_transpose, num_faces); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h new file mode 100644 index 00000000000..53b9d1afe8b --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h @@ -0,0 +1,293 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "metal_ckernel_sfpu.h" +#include "llk_math_eltwise_unary_sfpu_init.h" + +// namespace ckernel { + +// /************************************************************************* +// * LLK ELTWISE UNARY SFPU +// *************************************************************************/ + +// // New LLK SFPU APIs +// template +// inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_rsqrt_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_log_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_log_with_base(uint dst_index,uint base_scale) { +// llk_math_eltwise_unary_sfpu(dst_index,base_scale); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_log_with_base_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_signbit(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_signbit_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_tanh_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //sign +// template +// inline void llk_math_eltwise_unary_sfpu_sign(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void 
llk_math_eltwise_unary_sfpu_sign_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } +// template +// inline void llk_math_eltwise_unary_sfpu_dropout(uint dst_index, int vector_mode, int integer_dropout, int scale_factor) { +// constexpr bool dont_care = false; +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode, integer_dropout, scale_factor); +// } + +// inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) { +// constexpr bool dont_care = false; +// constexpr uint dont_care_param = 0; + +// llk_math_eltwise_unary_sfpu_init(dont_care_param, dont_care_param, seed); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_sigmoid_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //EQZ +// template +// inline void llk_math_eltwise_unary_sfpu_eqz(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_eqz_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //NEZ +// template +// inline void llk_math_eltwise_unary_sfpu_nez(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_nez_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //LTZ +// template +// inline void llk_math_eltwise_unary_sfpu_ltz(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_ltz_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //GTZ +// template +// inline void llk_math_eltwise_unary_sfpu_gtz(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_gtz_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //LEZ +// template +// inline void 
llk_math_eltwise_unary_sfpu_lez(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_lez_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //GEZ +// template +// inline void llk_math_eltwise_unary_sfpu_gez(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_gez_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_max_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_square_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode, pow); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_power_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_abs_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //EXP2 +// 
template +// inline void llk_math_eltwise_unary_sfpu_exp2(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_exp2_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //heaviside +// template +// inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index,vector_mode,param0); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_heaviside_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //EXPM1 +// template +// inline void llk_math_eltwise_unary_sfpu_expm1(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_expm1_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //Asin +// template +// inline void llk_math_eltwise_unary_sfpu_asin(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_asin_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //Atan +// template +// inline void llk_math_eltwise_unary_sfpu_atan(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_atan_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //Acos +// template +// inline void llk_math_eltwise_unary_sfpu_acos(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_acos_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //silu +// template +// inline void llk_math_eltwise_unary_sfpu_silu(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_silu_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h new file mode 100644 index 00000000000..ca7e298a7c2 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +extern uint32_t op_info_offset; + +inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) { + + uint32_t* op_info_ptr = reinterpret_cast<uint32_t*>(OP_INFO_BASE_ADDR + op_info_offset); + static constexpr uint32_t op_info_num_items = 7; + + volatile tt_l1_ptr uint32_t* op_info_struct_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(&op_info_struct); + for (uint32_t i = 0; i < op_info_num_items; i++) { + op_info_struct_ptr[i] = op_info_ptr[i]; + } + op_info_offset += 28; + + if (op_info_offset == OP_INFO_SIZE) { + op_info_offset = 0; // In case we go out of bounds + } +} diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h new file mode 100644 index 00000000000..37ee8a0fe56 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h @@ -0,0 +1,308 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "ckernel.h" +#include "ckernel_defs.h" +#include "ckernel_template.h" +#include "cpack_common.h" +#include "ckernel_globals.h" +#include "circular_buffer.h" + +#include "llk_io.h" +#include "llk_defs.h" +#include "llk_outputs.h" +#include "llk_param_structs.h" +#include "llk_pack.h" +#include "llk_pack_common.h" + +/************************************************************************* +* LLK PACK +*************************************************************************/ + +// template +// inline void llk_pack_mop_config(const uint32_t output) { + +// const std::uint32_t output_id = get_output_id(output); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const bool partial_face = get_output_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// _llk_pack_mop_config_( +// pack_dst_format[output_id], +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile +// ); +// } + +// template +// inline void llk_pack_hw_configure(const llk_pack_params_t *pack_params) { + +// const std::uint32_t output_id = get_output_id(pack_params->pack_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const bool partial_face = get_output_partial_face(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; + +// _llk_pack_hw_configure_( +// pack_src_format[output_id], +// pack_dst_format[output_id], +// tile_size, +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile, +// pack_params->relu_config.val +// ); +// } + +// template +// inline void llk_pack_hw_configure_disaggregated(std::uint32_t pack_output) { +// 
llk_pack_params_t llk_pack_params = { +// .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold,}}}; +// llk_pack_hw_configure(&llk_pack_params); +// } + +// template +// inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) { +// const std::uint32_t output_id = get_output_id(pack_params->pack_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const bool partial_face = get_output_partial_face(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; + +// _llk_pack_reduce_hw_configure_( +// pack_src_format[output_id], +// pack_dst_format[output_id], +// tile_size, +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile, +// pack_params->relu_config.val +// ); +// } + +// template +// inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output) { +// llk_pack_params_t llk_pack_params = { +// .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}}; +// llk_pack_reduce_hw_configure(&llk_pack_params); +// } + +// template +// inline void llk_pack_init(const std::uint32_t pack_output = 16) { + +// const std::uint32_t output_id = get_output_id(pack_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const bool partial_face = get_output_partial_face(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// _llk_pack_init_( +// pack_dst_format[output_id], +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile +// ); +// } + +// template +// inline std::uint32_t get_output_tile_address(std::uint8_t output_id, std::uint32_t output_tile_index) { + +// std::uint32_t 
pack_tile_addr; +// if constexpr (out_of_order_output) { +// pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + +// (std::uint32_t)(cb_interface[output_id].fifo_page_size)*output_tile_index - 1; +// } else { +// if constexpr (untilize) { +// // FIXME: Need to support pack-untilize? +// // std::uint16_t out_tile_index = (cb_interface[output_id].ublock_tile_cnt/cb_interface[output_id].ublock_ct)*cb_interface[output_id].row_tile_dim + +// // cb_interface[output_id].ublock_tile_cnt%cb_interface[output_id].ublock_ct; //FIXME: optimize perf +// // pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1; +// // pack_tile_addr += out_tile_index*(std::uint32_t)(cb_interface[output_id].fifo_page_size); + +// // cb_interface[output_id].ublock_tile_cnt++; + +// // if (cb_interface[output_id].ublock_tile_cnt == cb_interface[output_id].ublock_tile_dim) { +// // cb_interface[output_id].ublock_tile_cnt=0; +// // cb_interface[output_id].fifo_wr_tile_ptr += (std::uint32_t)(cb_interface[output_id].fifo_page_size)*cb_interface[output_id].ublock_ct; +// // } +// } else { +// pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1; +// cb_interface[output_id].fifo_wr_tile_ptr += cb_interface[output_id].fifo_page_size; +// } +// } +// return pack_tile_addr; +// } + +// template +// inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0) { +// std::uint8_t output_id = get_output_id(output); + +// static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!"); + +// std::uint32_t pack_tile_addr = get_output_tile_address(output_id, output_tile_index); + +// _llk_pack_( +// tile_index, +// pack_tile_addr +// ); +// } + +// /************************************************************************* +// * LLK PACK COMMON +// *************************************************************************/ + + +// inline 
void llk_packer_wait_for_math_done() { +// _llk_packer_wait_for_math_done_(); +// } + +// template +// inline void llk_packer_set_math_semaphore() { +// _llk_packer_set_math_semaphore_(); +// } + +// template +// inline void llk_pack_dest_section_done() { +// _llk_pack_dest_section_done_(); +// } + +// template +// inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 16) { +// const std::uint32_t output_id = get_output_id(pack_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// _llk_init_packer_dest_offset_registers_( +// face_r_dim, +// narrow_tile +// ); +// } + +// template +// inline void llk_pack_dest_init(const std::uint32_t pack_output = 16) { + +// const std::uint32_t output_id = get_output_id(pack_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// _llk_pack_dest_init_( +// face_r_dim, +// narrow_tile +// ); +// } + +// template +// inline void llk_pack_get_tile(std::uint32_t output, std::uint32_t tile_index, std::uint32_t *p_tile) { +// _llk_pack_get_tile_(tile_index, p_tile); +// } + +// template +// inline void llk_pack_release_tile(std::uint32_t output) { +// _llk_pack_release_tile_(); +// } + +// inline void llk_pack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { +// _llk_pack_debug_dump_(data, byte_size); +// } + +// inline void llk_pack_debug_dump_seek(std::uint8_t offset) { +// _llk_pack_debug_dump_seek_(offset); +// } + +// template +// inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) { + +// const std::uint32_t output_id = get_output_id(new_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const bool partial_face = get_output_partial_face(output_id); +// const bool narrow_tile = 
get_output_narrow_tile(output_id); + +// _llk_pack_reconfig_data_format_( +// pack_src_format[output_id], +// pack_dst_format[output_id], +// cb_interface[output_id].fifo_page_size, +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile +// ); +// } + +// template +// inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) { +// std::uint32_t old_output_id = get_output_id(old_output); +// std::uint32_t new_output_id = get_output_id(new_output); + +// if((pack_dst_format[old_output_id] != pack_dst_format[new_output_id]) +// && (pack_dst_format[old_output_id] != (uint)DataFormat::Invalid) +// && (pack_dst_format[new_output_id] != (uint)DataFormat::Invalid)) { +// llk_pack_reconfig_data_format(new_output); +// } else if constexpr (is_tile_dim_reconfig_en) { +// // Same format but different tile dims +// llk_pack_mop_config(new_output); +// } +// } + +// TT_ALWAYS_INLINE void llk_pack_relu_config(const std::uint32_t config) { +// _llk_pack_relu_config_(config); +// } + +// inline void llk_pack_reconfig_l1_acc(const std::uint32_t enable) { +// _llk_pack_reconfig_l1_acc_(enable); +// } + +// template +// inline void llk_pack_reduce_mask_config() { +// _llk_pack_reduce_mask_config_(); +// } + +// inline void llk_pack_reduce_mask_clear() { +// _llk_pack_reduce_mask_clear_(); +// } + +// // FIXME-WH-UPLIFT +// template +// inline void llk_pack_reduce_config_v2(uint32_t icb_out) { + +// const bool untilize = false; +// if constexpr (at_kernel_start) { + +// const std::uint32_t output_id = get_output_id(icb_out); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const bool partial_face = get_output_partial_face(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); +// const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; +// const llk_relu_config_u relu_config = {.f = {.ApplyRelu = 
(std::uint32_t)ReluType::NO_RELU, .Threshold = 0,}}; + +// _llk_pack_hw_configure_( +// pack_src_format[output_id], +// pack_dst_format[output_id], +// tile_size, +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile, +// relu_config.val +// ); +// } + +// if constexpr (revert) { +// _llk_pack_reduce_mask_clear_(); +// } else { +// _llk_pack_reduce_mask_config_(); +// } +// } diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_param_structs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_param_structs.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h new file mode 100644 index 00000000000..83a5fdcca92 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h @@ -0,0 +1,171 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include + +#include "llk_sfpu_types.h" +#include "ckernel_globals.h" +#include "ckernel_include.h" +#include "ckernel_template.h" +#include "metal_ckernel_sfpu.h" +#include "cmath_common.h" +#include "llk_format_conversions.h" +#include "llk_math_common.h" +#include "llk_param_structs.h" +#include "llk_math_eltwise_unary_sfpu.h" + +//TODO: Fix for GS uplift + +// using namespace ckernel; +// using namespace ckernel::sfpu; +// namespace ckernel { + +// /************************************************************************* +// * LLK ELTWISE UNARY SFPU +// *************************************************************************/ + +// template < +// SfpuType operation, +// bool APPROXIMATION_MODE, +// int SfpuType_PARAM = 0, +// int ITERATIONS = 8, +// bool IS_INT_SFPU_EN = false> +// inline void llk_math_calculate_sfpu( +// const int iterations = ITERATIONS, +// uint param0 = 0, +// uint param1 = 0, +// uint param2 = 0, +// uint param3 = 0, +// uint param4 = 0, +// uint param5 = 0) { +// if constexpr (operation == SfpuType::exp_with_base) { +// constexpr bool zero_negative = true; +// _calculate_exponential_(iterations, param0); +// } else if constexpr (operation == SfpuType::tanh) { +// _calculate_tanh_(iterations); +// } else if constexpr (operation == SfpuType::hardtanh) { +// _calculate_hardtanh_(iterations, param0, param1, param2); +// } else if constexpr (operation == SfpuType::rsqrt) { +// // param0 = true -> approximate fast mode +// // false -> high precision mode +// // The algorithm uses Newton's method based on no.of iteration better approximation can be calculated +// if (param0) { +// calculate_rsqrt(); +// } else { +// calculate_rsqrt(); +// } +// } else if constexpr (operation == SfpuType::sigmoid) { +// calculate_sigmoid(); +// } else if constexpr (operation == SfpuType::sigmoid_appx) { +// calculate_sigmoid_appx(); +// } else if constexpr (operation == SfpuType::tanh_derivative) { 
+// calculate_tanh_derivative(); +// } else if constexpr (operation == SfpuType::dropout) { +// calculate_dropout(param0, param1); +// } else if constexpr (operation == SfpuType::power) { +// calculate_power_iterative(param0); +// } else if constexpr (operation == SfpuType::square) { +// calculate_square(); +// } else if constexpr (operation == SfpuType::log) { +// calculate_log(param0); +// } else if constexpr (operation == SfpuType::log_with_base) { +// calculate_log(param0); +// } else if constexpr ( +// (operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) || +// (operation == SfpuType::less_than_zero) || (operation == SfpuType::greater_than_equal_zero) || +// (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) { +// calculate_comp(8); // BFLOAT16 - exp +// } else if constexpr (operation == SfpuType::clamp) { +// calculate_clamp(param0, param1, param2); +// } else if constexpr (operation == SfpuType::abs) { +// calculate_abs(); +// } else if constexpr (operation == SfpuType::sign) { +// calculate_sign(); +// } else if constexpr (operation == SfpuType::max) { +// calculate_max(); +// } else if constexpr (operation == SfpuType::min) { +// calculate_min(); +// } else if constexpr (operation == SfpuType::exp2) { +// calculate_exp2(); +// } else if constexpr (operation == SfpuType::heaviside) { +// calculate_heaviside(param0); +// } else if constexpr (operation == SfpuType::expm1) { +// calculate_expm1(); +// } else if constexpr (operation == SfpuType::asin) { +// calculate_asin(); +// } else if constexpr (operation == SfpuType::acos) { +// calculate_acos(); +// } else if constexpr (operation == SfpuType::atan) { +// calculate_atan(); +// } else if constexpr (operation == SfpuType::signbit) { +// calculate_signbit(); +// } else if constexpr (operation == SfpuType::silu) { +// calculate_silu(); +// } +// } + +// template +// inline void llk_math_eltwise_unary_sfpu( +// uint dst_index, +// int 
vector_mode = (int)Dim::RC, +// uint param0 = 0, +// uint param1 = 0, +// uint param2 = 0, +// uint param3 = 0, +// uint param4 = 0, +// uint param5 = 0) { +// const std::uint32_t operand_id = get_operand_id(0); // Fix to operand 0. assume no tiny-tile support +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + +// constexpr int ITERATIONS = 8; + +// _llk_math_eltwise_unary_sfpu_start_(dst_index); + +// if (vector_mode == (int)Dim::R) { +// // Do a row vector, Face0 + Face1 -- first iteration (first row) +// const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 2 : face_r_dim / 2) +// : 2; // At least 2 iterations for odd and even columns +// #pragma GCC unroll 0 +// for (int face = 0; face < 2; face++) { +// llk_math_calculate_sfpu( +// iterations, param0, param1, param2, param3, param4, param5); +// // Move to the next face +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// // Skip next two faces +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } else if (vector_mode == (int)Dim::C) { +// // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for +// // full face +// #pragma GCC unroll 0 +// for (int face = 0; face < 2; face++) { +// llk_math_calculate_sfpu( +// ITERATIONS, param0, param1, param2, param3, param4, param5); +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// if (num_faces > 2) { // Skip next face if tile is 32x32 +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// } +// if (num_faces <= 2) { +// // Skip next two faces +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// } else { +// // Do all four faces, and iterate through all 4 blocks of 4 rows each +// #pragma GCC unroll 0 +// for (int face = 0; face < 4; face++) { +// 
llk_math_calculate_sfpu( +// ITERATIONS, param0, param1, param2, param3, param4, param5); +// // Move to the next face +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// } +// _llk_math_eltwise_unary_sfpu_done_(); +// } + +// } // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h new file mode 100644 index 00000000000..50018e399c3 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h @@ -0,0 +1,780 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_sfpu_types.h" +#include "ckernel_defs.h" +#include "ckernel_sfpu.h" +#include "ckernel.h" +#include "noc_nonblocking_api.h" + +#include "sfpi.h" + +#include "ckernel_sfpu_cdf.h" +#include "ckernel_sfpu_exp.h" +#include "ckernel_sfpu_recip.h" +#include "ckernel_sfpu_converter.h" + +//TODO: Delete this file once GS uplift is done +// using namespace sfpi; + +// namespace ckernel +// { +// namespace sfpu +// { + +// template +// inline void calculate_rsqrt() +// { + +// for (int d = 0; d < ITERATIONS; d++) +// { + +// vFloat in = dst_reg[0]; +// v_if(dst_reg[0] == 0.0f){ +// dst_reg[0] = std::numeric_limits::infinity(); +// }v_else{ +// vFloat result = 1.0f; +// v_if(dst_reg[0] > 1.0f){ +// result = sfpu_reciprocal(in); +// }v_endif; + +// for (int r = 0; r < RECIPROCAL_ITERATIONS; r++) +// { +// // y = y * (1.5 - 0.5 * x * y * y) Newton's method iteration. 
+// result = result * (1.5F - 0.5F * dst_reg[0] * result * result); +// } +// dst_reg[0] = result; +// }v_endif; + +// dst_reg++; + +// } +// } + +// template +// inline void calculate_sigmoid_appx() +// { +// vUInt l0 = l_reg[LRegs::LReg0]; +// vUInt l1 = l_reg[LRegs::LReg1]; +// vUInt l2 = l_reg[LRegs::LReg2]; + +// #pragma GCC unroll 8 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; + +// dst_reg[0] = lut(val, l0, l1, l2) + 0.5f; + +// dst_reg++; +// } + +// l_reg[LRegs::LReg0] = l0; +// l_reg[LRegs::LReg1] = l1; +// l_reg[LRegs::LReg2] = l2; +// } + +// // TODO: Implement using bitwise comparision +// template +// inline void calculate_signbit() +// { + +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// v_if (val <= -0.0f) { +// val = 1.0f; +// } v_elseif (val >= 0.0f) { +// val = 0.0f; +// } +// v_endif; +// dst_reg[0] = val; + +// dst_reg++; +// } + +// } + +// template +// inline void calculate_tanh() +// { +// // SFPU microcode +// vUInt l0 = l_reg[LRegs::LReg0]; +// vUInt l1 = l_reg[LRegs::LReg1]; +// vUInt l2 = l_reg[LRegs::LReg2]; + +// #pragma GCC unroll 8 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// val = lut(val, l0, l1, l2); +// dst_reg[0] = val; + +// dst_reg++; +// } + +// l_reg[LRegs::LReg0] = l0; +// l_reg[LRegs::LReg1] = l1; +// l_reg[LRegs::LReg2] = l2; +// } + +// template +// inline void calculate_hardtanh(uint param0, uint param1, uint param2) +// { +// // All params are in FP16_B format +// // param0 = -(neg_threshold) +// // param1 = -(pos_threshold - neg_threshold) +// // param2 = -(pos_threshold) + +// vFloat p0 = s2vFloat16(param0); +// vFloat p1 = s2vFloat16(param1); +// vFloat p2 = s2vFloat16(param2); +// // SFPU microcode +// #pragma GCC unroll 0 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; + +// val += p0;// 12 bits +// v_if (val < 0.0f) { +// val = 0.0f; +// } +// v_endif; + +// val += p1;// 12 bits +// v_if 
(val >= 0.0f) { +// val = 0.0f; +// } +// v_endif; + +// val += p2;// 12 bits + +// dst_reg[0] = val; + +// dst_reg++; +// } +// } + +// template +// inline void calculate_tanh_derivative() +// { +// vUInt l0 = l_reg[LRegs::LReg0]; +// vUInt l1 = l_reg[LRegs::LReg1]; +// vUInt l2 = l_reg[LRegs::LReg2]; + +// // tanh'(x) = 1 - (tanh(x))^2 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; + +// if constexpr (!WITH_PRECOMPUTED_TANH) { +// val = lut(val, l0, l1, l2); +// } + +// val = val * (-val) + vConst1; +// dst_reg[0] = val; + +// dst_reg++; +// } + +// l_reg[LRegs::LReg0] = l0; +// l_reg[LRegs::LReg1] = l1; +// l_reg[LRegs::LReg2] = l2; +// } + +// template +// inline void calculate_dropout(uint prob, uint scale) +// { +// // SFPU microcode + +// vUInt rand = l_reg[LRegs::LReg3]; + +// #pragma GCC unroll 0 +// for (int d = 0; d < ITERATIONS; d++) { +// //////////////////////// +// // Scale samples +// /////////////////////// +// dst_reg[0] = dst_reg[0] * s2vFloat16b(scale); + +// //////////////////////// +// // Drop samples +// /////////////////////// +// v_if (rand < prob) { +// dst_reg[0] = vConst0; +// } +// v_endif; + +// //////////////////////// +// // 16-bit PRNG update +// /////////////////////// +// vUInt lfsr = vConstIntPrgm1; +// vUInt tmp = lfsr & rand; +// rand = rand >> 1; +// v_if (tmp != 0) { +// vUInt mask = vConstIntPrgm0; +// rand ^= mask; +// } +// v_endif; + +// dst_reg++; +// } + +// l_reg[LRegs::LReg3] = rand; +// } + +// template +// inline void calculate_power_iterative(const uint exponent) +// { +// #pragma GCC unroll 8 +// for (int d = 0; d < 8; d++) +// { +// vFloat in = dst_reg[0]; +// vFloat result = 1.0f; +// for (uint i = 0; i < exponent; i++) { +// result *= in; +// } +// dst_reg[0]=result; +// dst_reg++; +// } +// } + +// template +// inline void calculate_square() +// { +// #pragma GCC unroll 8 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat in = dst_reg[0]; +// vFloat result = in * in; + +// 
dst_reg[0] = result; + +// dst_reg++; +// } +// } + +// template +// sfpi_inline void calculate_log_body(const uint log_base_scale_factor) +// { +// //////////////////////////// +// // Load From dest + "normalize to calculation range" +// //////////////////////////// +// vFloat in = dst_reg[0]; +// vFloat x = setexp(in, 127); // set exp to exp bias (put in range of 1-2) + +// // XXXXXX ask Namal? if we can derive the coefficients below to higher precision +// //////////////////////////// +// // Calculate Cheby Approximation using Horner Form Multiplication: 3rd Order +// // x* ( x* (A*x + B) + C) + D +// // A :0.1058, B: -0.3942, C: 0.9813, D: 0.006 +// // Run above on (x-1) so x is in ln(x+1), plug (x-1 into equation above to +// // save the subtract and get A',B',C',D'): +// // A' = A +// // B' = -3A + B +// // C' = 3a -2B + C +// // D' = -A + B - C + D +// // A':0.1058, B':-0.7116, C':2.0871, D':-1.4753 +// //////////////////////////// +// vFloat a = vConstFloatPrgm1; +// vFloat b = vConstFloatPrgm2; +// // XXXXX try variants of the below: B'=.7122, C'=2.0869 +// vFloat series_result = x * (x * (x * a + b) + 2.0871) + -1.4753f; + +// //////////////////////////// +// // Convert exponent to float +// //////////////////////////// +// vInt exp = exexp(in); +// v_if (exp < 0) { +// exp = setsgn(~exp + 1, 1); +// } +// v_endif; + +// vFloat expf = int32_to_float(exp, 0); +// vFloat vConstLn2 = vConstFloatPrgm0; +// vFloat result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2) + +// if constexpr (HAS_BASE_SCALING) { +// result *= s2vFloat16a(log_base_scale_factor); +// } + +// //////////////////////////// +// // Base case when input is 0. 
ln(0) = -inf +// //////////////////////////// +// v_if (in == 0.0F) { // Reload for register pressure +// result = -std::numeric_limits::infinity(); +// } +// v_endif; + +// dst_reg[0] = result; +// } + +// template +// inline void calculate_log(uint log_base_scale_factor) +// { +// #pragma GCC unroll 8 +// for(int d = 0; d < ITERATIONS; d++){ +// calculate_log_body(log_base_scale_factor); +// dst_reg++; +// } +// } + +// sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& flag2, float init) +// { +// flag1 = init; +// if (check) { +// flag2 = init; +// } +// } + +// template +// inline void calculate_comp(uint exponent_size_8) +// { +// const vFloat zero = 0.0f; +// const vFloat one = 1.0f; +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// vFloat flag1, flag2; + +// //a[i] == 0 +// if constexpr(COMP_MODE == SfpuType::equal_zero) { +// v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) { +// v = one; +// } v_else { +// v = zero; +// } +// v_endif; +// } + +// //a[i] != 0 +// if constexpr(COMP_MODE == SfpuType::not_equal_zero) { +// v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) { +// v = zero; +// } v_else { +// v = one; +// } +// v_endif; +// } + +// //a[i] < 0 +// if constexpr(COMP_MODE == SfpuType::less_than_zero) { +// v_if (v >= 0.0f) { +// v = zero; +// } v_else { +// v = one; +// } +// v_endif; +// } + +// //a[i] >= 0 +// if constexpr(COMP_MODE == SfpuType::greater_than_equal_zero) { +// v_if (v >= 0.0f) { +// v = one; +// } v_else { +// v = zero; +// } +// v_endif; +// } + +// //a[i] > 0 +// if constexpr(COMP_MODE == SfpuType::greater_than_zero) { +// v_if (v > 0.0f) { +// v = one; +// } v_else { +// v = zero; +// } +// v_endif; +// } + +// //a[i] <= 0 +// if constexpr(COMP_MODE == SfpuType::less_than_equal_zero) { +// v_if (v > 0.0f) { +// v = zero; +// } v_else { +// v = one; +// } +// v_endif; +// } + +// dst_reg[0] = v; +// dst_reg++; +// } +// } + +// template +// inline void calculate_clamp(uint param0, 
uint param1, uint param2) +// { +// // All params are in FP16 format +// // param0 = min +// // param1 = max + +// //uint format = (param0 >> 16)&0x1; +// s2vFloat16::Format format = s2vFloat16::fp16a; + +// // SFPU microcode +// vFloat min = s2vFloat16(param0, format); +// vFloat max = s2vFloat16(param1, format); +// #pragma GCC unroll 0 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; + +// v_if (val < min) { +// val = s2vFloat16(param0, format); +// } v_elseif (val >= max) { +// val = s2vFloat16(param1, format); +// } +// v_endif; + +// dst_reg[0] = val + s2vFloat16b(param2); // 12 bits + +// dst_reg++; +// } +// } + +// template +// inline void calculate_abs() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// dst_reg[0] = sfpi::abs(v); +// dst_reg++; +// } +// } + + +// template +// inline void calculate_exp2() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// // log(2) = 0.6931471805; +// v = v * 0.6931471805f; +// // exp = e^(v) +// vFloat exp = calculate_exponential_body_improved(v); +// dst_reg[0] = exp; +// dst_reg++; +// } +// } + +// template +// inline void calculate_sign() +// { +// // All params are in FP16 format +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// vFloat result = vConst1; +// v_if (v < 0.0f) { +// result = vConstNeg1; +// } v_elseif(v > 0.0f) { +// result = vConst1; +// } v_else { +// result = vConst0; +// } +// v_endif; + +// dst_reg[0] = result; +// dst_reg++; +// } +// } +// template +// inline void calculate_max() +// { +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat a = dst_reg[0]; +// vFloat b = dst_reg[32]; +// v_if(a < b) { +// dst_reg[0] = b; +// } +// v_endif; + +// dst_reg++; +// } +// } + +// template +// inline void calculate_min() +// { +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat a = dst_reg[0]; +// vFloat b = dst_reg[32]; +// v_if(a 
> b) { +// dst_reg[0] = b; +// } +// v_endif; + +// dst_reg++; +// } +// } + +// template +// inline void calculate_expm1() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// v = calculate_exponential_body_improved(v); +// dst_reg[0] = v - 1.0f; +// dst_reg++; +// } +// } + + +// #define POLYVAL6(coef5, coef4, coef3, coef2, coef1, coef0, t4) (t4 * (t4 * (t4 * (t4 * (coef5 * t4 + coef4) + coef3) + coef2) + coef1) + coef0) + +// template +// sfpi_inline vFloat sfpu_atan_maclaurin_series(vFloat val) +// { +// v_if(1 > sfpi::abs(val)){ +// dst_reg[0] = sfpi::abs(val) ; +// } +// v_else{ +// dst_reg[0] = sfpu_reciprocal(sfpi::abs(val)); +// } +// v_endif; + +// vFloat t1 = dst_reg[0] * dst_reg[0]; + +// t1 = POLYVAL6(-0.013480470f, 0.057477314f, -0.121239071f, 0.195635925f, -0.332994597f, 0.999995630f, t1); + +// t1 = t1 * dst_reg[0]; + +// v_if (sfpi::abs(val) > 1){ +// t1 = 1.570796327f - t1; +// } +// v_endif; + +// v_if(val < 0 ){ +// t1 = -t1; +// } +// v_endif; + +// return t1; +// } + +// template +// inline void calculate_atan() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// val = sfpu_atan_maclaurin_series(val); +// dst_reg[0] = val; +// dst_reg++; +// } +// } + + +// template +// sfpi_inline vFloat sfpu_asine_maclaurin_series(vFloat val) +// { +// // input for [-1:1] +// // Mclauren series +// // arcsin(x) = x + [(1/2) *x^3/3] + [(1 * 3) / (2 * 4) * x^5 / 5] + [(1 * 3 * 5) / (2 * 4 * 6) * x^7 / 7 ] + ... 
+// // arcsin(x) ≈ x + (1/6) * x^3 + (3/40) * x^5 + (5/112) * x^7 + (35/1152) * x^9 + (63/2816) * x^11a + +// vFloat tmp = val; +// vFloat val_square = val * val; +// // x +// vFloat output = tmp; +// // (1/6) * x^3 +// tmp = tmp * val_square; +// output += 0.166666666 * tmp; +// // (3/40) * x^5 +// tmp = tmp * val_square; +// output += 0.075 * tmp; + +// //(5/112) * x^7 +// tmp = tmp * val_square; +// output += 0.044642857 * tmp; + +// // (35/1152) *x^9 +// tmp = tmp * val_square; +// output += 0.03038194 * tmp; + +// //(63/2816) * x^11 +// tmp = tmp * val_square; +// output += 0.02237216 * tmp; + +// // Write out output +// return output; +// } + +// template +// inline void calculate_asin() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// v = sfpu_asine_maclaurin_series(v); +// dst_reg[0] = v; +// dst_reg++; +// } +// } + + +// #define PI_2 (1.570796326794) +// template +// inline void calculate_acos() +// { +// // SFPU microcode +// // acos = (pi/2 - asin) +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// v = sfpu_asine_maclaurin_series(v); +// v = PI_2 - v; +// dst_reg[0] = v; +// dst_reg++; +// } +// } + +// template +// inline void cast_fp32_to_fp16a() +// { +// #pragma GCC unroll 8 +// for (int d = 0; d < ITERATIONS; d++) +// { +// //vFloat val = dst_reg[0]; +// //dst_reg[0] = float_to_fp16a(val, 0); +// TTI_SFPLOAD(0, 0, 3, 0); +// TTI_SFP_STOCH_RND(0,0,0,0,0,8); +// TTI_SFPSTORE(0,1,3,0); +// dst_reg++; +// } +// } + + + +// template +// inline void calculate_negative() +// { + +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// dst_reg[0] = -val; +// dst_reg++; +// } +// } + +// template +// inline void calculate_add1() +// { +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// dst_reg[0] = 1.0f + val; +// dst_reg++; +// } +// } + +// inline +// vFloat sigmoid_piecewise_linear_positive(vFloat val) { +// vFloat result 
= 0.0f; +// v_if ( val >= +5.0f) { +// result = 1.0f; +// } v_elseif ( val > 1.0f && val < 5.0f ) { +// result = POLYVAL5(0.00144462f, -0.01055479f, -0.01203685f, 0.24300185f, 0.50437757f,val); +// } v_else { +// result = 0.229f*val + 0.5f; // linear appx as y = 0.229x + 0.5 +// } +// v_endif; +// return result; +// } + +// //sigmoid is anti-symmetric and offset by 1 +// //sigmoid[-x] = 1 - sigmoid[x] +// template +// inline void calculate_sigmoid() +// { +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// vFloat result = 0.0f; + +// v_if ( val < 0.0f ) { +// val = -val; +// } +// v_endif; + +// result = sigmoid_piecewise_linear_positive(val); + +// val = dst_reg[0]; +// v_if ( val < 0.0f ) { +// result = 1.0f - result; +// } +// v_endif; + +// dst_reg[0] = result; +// dst_reg++; +// } + +// return; +// } + +// template +// inline void calculate_heaviside(uint value) +// { +// // SFPU microcode +// Converter c_value; +// c_value.u = value; +// vFloat s = c_value.f; + +// #pragma GCC unroll 0 +// for (int d = 0; d < ITERATIONS; d++) { +// vFloat v = dst_reg[0]; + +// v_if (v < 0.0f) { +// v = 0.0f; +// }v_elseif (v > 0.0f) { +// v = 1.0f; +// }v_else { +// v = s; +// } +// v_endif; + +// dst_reg[0] = v; + +// dst_reg++; +// } +// } + +// template +// inline void calculate_silu() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) { +// vFloat val = dst_reg[0]; +// v_if ( val < 0.0f ) { +// val = -val; +// } +// v_endif; + +// vFloat result = sigmoid_piecewise_linear_positive(val); + +// val = dst_reg[0]; +// v_if ( val < 0.0f ) { +// result = 1.0f - result; +// } +// v_endif; +// result = val * result; +// dst_reg[0] = result; +// dst_reg++; +// } +// } + +// } // namespace sfpu +// } // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h new file mode 100644 index 00000000000..dd04b59a63d --- /dev/null +++ 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h
@@ -0,0 +1,68 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// Identifies an SFPU operation. The enumerators are unscoped and carry no
+// explicit values, so they number 0, 1, 2, ... in declaration order; inserting
+// or reordering entries therefore changes the value of every later one.
+// NOTE(review): `unused` is presumably a trailing placeholder rather than a
+// real operation - confirm before relying on it as a count or sentinel.
+enum SfpuType {
+    tanh,
+    hardtanh,
+    gelu,
+    exponential,
+    exp_with_base,
+    sigmoid,
+    sigmoid_appx,
+    reciprocal,
+    sqrt,
+    rsqrt,
+    lrelu,
+    power,
+    square,
+    tanh_derivative,
+    log,
+    log_with_base,
+    equal_zero,
+    not_equal_zero,
+    less_than_zero,
+    greater_than_equal_zero,
+    less_than_equal_zero,
+    greater_than_zero,
+    clamp,
+    gelu_derivative,
+    dropout,
+    abs,
+    sign,
+    max,
+    min,
+    sine,
+    cosine,
+    tan,
+    relu_min,
+    relu_max,
+    elu,
+    exp2,
+    heaviside,
+    expm1,
+    signbit,
+    asin,
+    acos,
+    atan,
+    erf,
+    erfc,
+    isfinite,
+    isinf,
+    isposinf,
+    isneginf,
+    isnan,
+    logical_not_unary,
+    erfinv,
+    i0,
+    silu,
+    mask,
+    unused,
+};
diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
new file mode 100644
index 00000000000..642fbb1591e
--- /dev/null
+++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h
@@ -0,0 +1,85 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_AB.h" +#include "llk_unpack_common_api.h" + +// /************************************************************************* +// * LLK UNPACK AB +// *************************************************************************/ + +// template +// inline void llk_unpack_AB_hw_configure( +// const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) { +// // In0 -> unpA +// // In1 -> unpB +// const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpA_operand); +// const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpB_operand); + +// // unpA -> srcA +// // unpB -> srcB +// const uint32_t num_faces = get_operand_num_faces(unpA_operand_id); // num faces in unpA and unpB are the same + +// const uint32_t face_r_dim = get_operand_face_r_dim(unpA_operand_id); // face r dim in unpA and unpB are the same + +// _llk_unpack_AB_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpack_src_format[unpB_operand_id], +// unpack_dst_format[unpA_operand_id], +// unpack_dst_format[unpB_operand_id], +// face_r_dim, +// within_face_16x16_transpose, +// num_faces); +// } + +// template +// inline void llk_unpack_AB_hw_configure_disaggregated( +// const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0) { +// const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand}; + +// llk_unpack_AB_hw_configure(&unpack_AB_params, within_face_16x16_transpose); +// } + +// template +// inline void llk_unpack_AB_mop_config(const bool transpose_of_faces = false, const std::uint32_t operand_id = 0) { +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// const bool narrow_tile = get_operand_narrow_tile(operand_id); // if narrow tile read face 0 twice for row broadcast +// // or read face 0 and 1 for col broadcast +// 
_llk_unpack_AB_mop_config_(transpose_of_faces, num_faces, narrow_tile); +// } + +// template +// inline void llk_unpack_AB_init( +// const std::uint32_t operandA, +// const std::uint32_t operandB, +// const std::uint32_t transpose = 0, +// const std::uint32_t acc_to_dest = 0) { +// const std::uint32_t operandA_id = get_operand_id(operandA); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operandA_id); // face r dim in unpA and unpB are the same +// const std::uint32_t num_faces = get_operand_num_faces(operandA_id); +// const bool narrow_tile = +// get_operand_narrow_tile(operandA_id); // if narrow tile read face 0 twice for row broadcast + +// _llk_unpack_AB_init_(face_r_dim, num_faces, narrow_tile, transpose, acc_to_dest); +// } + +// template +// inline void llk_unpack_AB( +// const std::uint32_t operandA, +// const std::uint32_t operandB, +// const std::uint32_t tile_index_a, +// const std::uint32_t tile_index_b, +// const bool transpose_of_faces = 0 /*not used*/) { +// std::uint32_t operandA_id = get_operand_id(operandA); +// std::uint32_t operandB_id = get_operand_id(operandB); +// std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1; +// std::uint32_t offset_address_a = cb_interface[operandA_id].fifo_page_size * tile_index_a; +// std::uint32_t address_a = base_address_a + offset_address_a; +// std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1; +// std::uint32_t offset_address_b = cb_interface[operandB_id].fifo_page_size * tile_index_b; +// std::uint32_t address_b = base_address_b + offset_address_b; + +// _llk_unpack_AB_(address_a, address_b, transpose_of_faces > 0); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h new file mode 100644 index 00000000000..f4aee2da6bd --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h @@ -0,0 +1,136 @@ +// 
SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_AB_matmul.h" +#include "llk_unpack_common_api.h" + +// /************************************************************************* +// * LLK UNPACK AB MATMUL +// *************************************************************************/ + +// template +// inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) { +// const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca; + +// // In0 -> unpB +// // In1 -> unpA +// const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpB_operand); +// const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpA_operand); + +// // unpA -> srcA +// // unpB -> srcB +// const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); +// const uint32_t unpB_num_faces = get_operand_num_faces(unpB_operand_id); + +// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); +// const uint32_t unpB_face_r_dim = get_operand_face_r_dim(unpB_operand_id); + +// _llk_unpack_AB_matmul_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpack_src_format[unpB_operand_id], +// unpack_dst_format[unpA_operand_id], +// unpack_dst_format[unpB_operand_id], +// unpA_face_r_dim, +// unpB_face_r_dim, +// transpose_xy_srca, +// unpA_num_faces, +// unpB_num_faces, +// cb_interface[unpA_operand_id].fifo_page_size, +// cb_interface[unpB_operand_id].fifo_page_size); +// } + +// template +// inline void llk_unpack_AB_matmul_hw_configure_disaggregated( +// const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) { +// const llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = { +// .unpA_operand = unpA_operand, .unpB_operand = unpB_operand, .transpose_xy_srca = transpose_xy_srca}; +// llk_unpack_AB_matmul_hw_configure(&unpack_AB_matmul_params); +// } + +// inline void 
llk_unpack_AB_matmul_mop_config( +// const bool transpose, +// const std::uint32_t ct_dim, +// const std::uint32_t rt_dim, +// const std::uint32_t kt_dim, +// const bool partial_face) { +// // in0 - loaded to SrcB +// // in1 - loaded to SrcA +// _llk_unpack_AB_matmul_mop_config_(transpose, ct_dim, rt_dim, kt_dim, partial_face); +// } + +// __attribute__((always_inline)) inline void llk_unpack_AB_matmul_init( +// const std::uint32_t operandA, +// const std::uint32_t operandB, +// const std::uint32_t transpose = 0, +// const std::uint32_t ct_dim = 1, +// const std::uint32_t rt_dim = 1, +// const std::uint32_t kt_dim = 1) { +// // In0 -> srcB (supports partial face) +// // In1 -> srcA +// const uint32_t operandA_id = get_operand_id(operandB); +// const uint32_t operandB_id = get_operand_id(operandA); + +// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandA_id); +// const uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandB_id); + +// const bool reuse_a = ct_dim >= rt_dim; +// const bool partial_face = get_operand_partial_face(operandB_id); + +// const uint32_t unpA_num_faces = get_operand_num_faces(operandA_id); +// const uint32_t unpB_num_faces = +// partial_face ? 
1 : get_operand_num_faces(operandB_id); // if partial face -> unpack face by face + +// _llk_unpack_AB_matmul_init_( +// transpose, +// ct_dim, +// rt_dim, +// kt_dim, +// unpA_face_r_dim, +// unpB_face_r_dim, +// unpA_num_faces, +// unpB_num_faces, +// partial_face); +// } + +// inline void llk_unpack_AB_matmul( +// const std::uint32_t operandA, +// const std::uint32_t operandB, +// const std::uint32_t tile_index_a, +// const std::uint32_t tile_index_b, +// const std::uint32_t ct_dim = 1, +// const std::uint32_t rt_dim = 1, +// const std::uint32_t kt_dim = 1) { +// // In0/InA -> srcB (supports partial face) +// // In1/InB -> srcA + +// volatile uint *cfg = get_cfg_pointer(); // get pointer to registers for current state ID + +// const std::uint32_t operandA_id = get_operand_id(operandA); +// const std::uint32_t operandB_id = get_operand_id(operandB); +// const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandB_id); // In1/InB -> srcA +// const std::uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandA_id); // In0/InA -> srcB + +// const bool partial_face = get_operand_partial_face(operandA_id); + +// std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1; +// std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1; + +// std::uint32_t tile_size_a = cb_interface[operandA_id].fifo_page_size; +// std::uint32_t tile_size_b = cb_interface[operandB_id].fifo_page_size; + +// _llk_unpack_AB_matmul_( +// base_address_a, +// base_address_b, +// tile_index_a, +// tile_index_b, +// tile_size_a, +// tile_size_b, +// unpA_face_r_dim, +// unpB_face_r_dim, +// partial_face, +// ct_dim, +// rt_dim, +// kt_dim); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h new file mode 100644 index 00000000000..ca39397653c --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h @@ -0,0 +1,89 @@ +// 
SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_A.h" +#include "llk_unpack_common_api.h" + +// /************************************************************************* +// * LLK UNPACK A +// *************************************************************************/ + +// template +// inline void llk_unpack_A_hw_configure( +// const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) { +// const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand); +// const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); +// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); + +// _llk_unpack_A_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpack_dst_format[unpA_operand_id], +// unpA_face_r_dim, +// within_face_16x16_transpose, +// unpA_num_faces); +// } + +// template +// inline void llk_unpack_A_hw_configure_disaggregated( +// const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) { +// const llk_unpack_A_params_t unpack_A_params = {.unpA_operand = unpA_operand}; +// llk_unpack_A_hw_configure(&unpack_A_params, within_face_16x16_transpose); +// } + +// template < +// BroadcastType BType = BroadcastType::NONE, +// bool acc_to_dest = false, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, +// bool unpack_to_dest = false> +// inline void llk_unpack_A_mop_config( +// const bool transpose_of_faces, +// const std::uint32_t operand_id, +// const std::uint32_t unpack_src_format = 0, +// std::uint32_t unpack_dst_format = 0) { +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); + +// _llk_unpack_A_mop_config_( +// transpose_of_faces > 0, num_faces, unpack_src_format, unpack_dst_format); +// } + +// template < +// BroadcastType BType = BroadcastType::NONE, +// bool acc_to_dest = false, +// EltwiseBinaryReuseDestType 
binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, +// bool unpack_to_dest = false> +// inline void llk_unpack_A_init( +// const std::uint32_t transpose_of_faces = 0, +// const std::uint32_t within_face_16x16_transpose = 0, +// const std::uint32_t operand = 0) { +// cfg_reg_rmw_tensix(within_face_16x16_transpose); + +// const std::uint32_t operand_id = get_operand_id(operand); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); + +// _llk_unpack_A_init_( +// transpose_of_faces, +// within_face_16x16_transpose, +// face_r_dim, +// num_faces, +// unpack_src_format[operand_id], +// unpack_dst_format[operand_id]); +// } + +// template < +// BroadcastType BType = BroadcastType::NONE, +// bool acc_to_dest = false, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, +// bool unpack_to_dest = false> +// inline void llk_unpack_A( +// const std::uint32_t operand, const std::uint32_t tile_index, const bool transpose_of_faces = 0) { +// std::uint32_t operand_id = get_operand_id(operand); +// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; +// std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; +// std::uint32_t address = base_address + offset_address; + +// _llk_unpack_A_( +// address, transpose_of_faces > 0, unpack_src_format[operand_id], unpack_dst_format[operand_id]); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h new file mode 100644 index 00000000000..a2f5d8c675f --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "circular_buffer.h" +#include "ckernel.h" +#include "ckernel_defs.h" +#include "ckernel_globals.h" +#include "ckernel_template.h" +#include "cunpack_common.h" +#include "llk_defs.h" +#include "llk_io.h" +#include "llk_operands.h" +#include "llk_param_structs.h" +#include "llk_unpack_common.h" + +// /************************************************************************* +// * LLK UNPACK COMMON +// *************************************************************************/ + +// void llk_zero_operand(std::uint32_t operand) { +// std::uint32_t operand_id = get_operand_id(operand); +// std::uint32_t fifo_base_addr = (cb_interface[operand_id].fifo_limit + 1) - cb_interface[operand_id].fifo_size; +// std::uint32_t size = cb_interface[operand_id].fifo_size; +// _llk_zero_buffer_(fifo_base_addr, size); +// } + +// template +// inline void llk_unpack_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) { +// std::uint32_t operand_id = get_operand_id(operand); +// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; +// std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; +// std::uint32_t address = base_address + offset_address; +// _llk_unpack_get_tile_(address, p_tile); +// } + +// template +// inline void llk_unpack_release_tile(std::uint32_t operand) { +// _llk_unpack_release_tile_(); +// } + +// inline void llk_unpack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { +// _llk_unpack_debug_dump_(data, byte_size); +// } + +// inline void llk_unpack_debug_dump_seek(std::uint8_t offset) { _llk_unpack_debug_dump_seek_(offset); } + +// template +// inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_operand) { +// const std::uint32_t srca_operand_id = get_operand_id(srca_new_operand); +// const std::uint32_t num_faces = get_operand_num_faces(srca_operand_id); +// const std::uint32_t face_r_dim 
= get_operand_face_r_dim(srca_operand_id); +// _llk_unpack_reconfig_data_format_srca_impl_( +// unpack_src_format[srca_operand_id], +// unpack_dst_format[srca_operand_id], +// cb_interface[srca_operand_id].fifo_page_size); +// } + +// template +// inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) { +// std::uint32_t srcb_operand_id = get_operand_id(srcb_new_operand); +// const std::uint32_t num_faces = get_operand_num_faces(srcb_operand_id); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(srcb_operand_id); +// _llk_unpack_reconfig_data_format_srcb_impl_( +// unpack_src_format[srcb_operand_id], +// unpack_dst_format[srcb_operand_id], +// cb_interface[srcb_operand_id].fifo_page_size); +// } + +// template +// inline void llk_unpack_reconfig_data_format_srca( +// const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) { +// std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); +// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); + +// if ((unpack_src_format[old_srca_operand_id] != unpack_src_format[new_srca_operand_id])) { +// llk_unpack_reconfig_data_format_srca(srca_new_operand); +// } else if constexpr (is_tile_dim_reconfig_en) { +// llk_unpack_reconfig_data_format_srca(srca_new_operand); +// } +// } + +// template +// inline void llk_unpack_reconfig_data_format_srcb( +// const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { +// std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); +// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + +// if ((unpack_src_format[old_srcb_operand_id] != unpack_src_format[new_srcb_operand_id])) { +// llk_unpack_reconfig_data_format_srcb(srcb_new_operand); +// } else if constexpr (is_tile_dim_reconfig_en) { +// llk_unpack_reconfig_data_format_srcb(srcb_new_operand); +// } +// } + +// template +// inline void llk_unpack_reconfig_data_format( +// const std::uint32_t 
srca_new_operand, const std::uint32_t srcb_new_operand) { +// llk_unpack_reconfig_data_format_srca(srca_new_operand); +// llk_unpack_reconfig_data_format_srcb(srcb_new_operand); +// } + +// template +// inline void llk_unpack_reconfig_data_format( +// const std::uint32_t srca_old_operand, +// const std::uint32_t srca_new_operand, +// const std::uint32_t srcb_old_operand, +// const std::uint32_t srcb_new_operand) { +// llk_unpack_reconfig_data_format_srca(srca_old_operand, srca_new_operand); +// llk_unpack_reconfig_data_format_srcb(srcb_old_operand, srcb_new_operand); +// } + +// inline void llk_unpack_dbg_feature_disable() { _llk_unpack_dbg_feature_disable_(); } + +// inline void llk_enable_int8_fpu_math() { _llk_enable_int8_fpu_math_(); } + +// // All TILE_SIZE related functions were deprecated in BBE for WH. The following is needed for pack_shifted so just +// // keeping here. +// // FIXME: Need to review and adjust accordingly +// constexpr static std::int32_t MUL_HEADERLESS_TILE_SIZE_AND_INDEX(uint format, uint index) { +// switch (format & 0x1F) { +// case ((uint8_t)DataFormat::Float32): return ((index << 8)); +// case ((uint8_t)DataFormat::Float16): +// case ((uint8_t)DataFormat::Float16_b): return ((index << 7)); +// case ((uint8_t)DataFormat::Bfp8): +// case ((uint8_t)DataFormat::Bfp8_b): return ((index << 6) + (index << 2)); +// case ((uint8_t)DataFormat::Bfp4): +// case ((uint8_t)DataFormat::Bfp4_b): return ((index << 5) + (index << 2)); +// case ((uint8_t)DataFormat::Bfp2): +// case ((uint8_t)DataFormat::Bfp2_b): return ((index << 4) + (index << 2)); +// case ((uint8_t)DataFormat::Int8): +// case ((uint8_t)DataFormat::Lf8): return ((index << 6)); +// // Keep default as Bfp8?
+// default: return ((index << 6) + (index << 2)); +// }; +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h new file mode 100644 index 00000000000..01a12122375 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h @@ -0,0 +1,94 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_reduce.h" +#include "llk_unpack_common_api.h" + +/************************************************************************* +* LLK UNPACK REDUCE +*************************************************************************/ + +// template +// inline void llk_unpack_reduce_hw_configure( +// const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) { + +// constexpr bool within_face_16x16_transpose = (ReduceDim::REDUCE_ROW == dim); + +// const std::uint32_t unpA_operand_id = get_operand_id(unpack_reduce_params->unpA_operand); +// const std::uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); +// const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); + +// constexpr std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32; +// const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a +// ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? 
(std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16); + +// _llk_unpack_reduce_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpB_src_format, +// unpack_dst_format[unpA_operand_id], +// unpB_dst_format, +// unpA_face_r_dim, +// unpA_face_r_dim, +// within_face_16x16_transpose, +// unpA_num_faces, +// unpA_num_faces +// ); + +// if constexpr (type != PoolType::MAX) { +// union { +// float f; +// uint32_t u; +// } f2u = {.f = const_mult}; + +// for (uint i = 0; i < 16; i++) l1_buffer[i] = f2u.u; // Load const into L1 buffer +// } +// } + +// template +// inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) { +// const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand}; +// llk_unpack_reduce_hw_configure(&unpack_reduce_params, mult); +// } + +// template +// inline void llk_unpack_reduce_mop_config() { +// _llk_unpack_reduce_mop_config_(); +// } + +// template +// inline void llk_unpack_reduce_init(const std::uint32_t within_face_16x16_transpose=0) { + +// constexpr std::uint32_t unpA_operand_id = 0; + +// const std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32; +// const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a +// ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? 
(std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16); + +// cfg_reg_rmw_tensix(unpB_dst_format); + +// cfg_reg_rmw_tensix(unpB_src_format); +// cfg_reg_rmw_tensix(unpB_dst_format); + +// TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_address_ADDR32); +// TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_cntx1_address_ADDR32); +// TTI_NOP; TTI_NOP; + +// _llk_unpack_reduce_init_( +// within_face_16x16_transpose +// ); +// } + +// template +// inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t tile_index) { + +// std::uint32_t operand_id = get_operand_id(operand); +// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; +// std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; +// std::uint32_t address = base_address + offset_address; + +// _llk_unpack_reduce_( +// address +// ); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h new file mode 100644 index 00000000000..59ede271732 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h @@ -0,0 +1,99 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_tilize.h" +#include "llk_unpack_common_api.h" + +/************************************************************************* +* LLK UNPACK TILIZE +*************************************************************************/ + +// template +// inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) { + +// constexpr bool within_face_16x16_transpose = false; +// constexpr StochRndType stoch_rnd_mode = StochRndType::None; + +// const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand); +// const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); +// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); + +// _llk_unpack_tilize_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpack_dst_format[unpA_operand_id], +// unpA_face_r_dim, +// within_face_16x16_transpose, +// unpA_num_faces +// ); +// } + + +// template +// inline void llk_unpack_tilize_hw_configure_disaggregated( +// const std::uint32_t unpA_operand) { +// const llk_unpack_A_params_t unpack_tilize_params = { +// .unpA_operand = unpA_operand +// }; +// llk_unpack_tilize_hw_configure(&unpack_tilize_params); +// } + +// inline void llk_unpack_tilize_mop_config(const std::uint32_t operand) { +// std::uint32_t operand_id = get_operand_id(operand); +// const bool narrow_tile = get_operand_narrow_tile(operand_id); +// _llk_unpack_tilize_mop_config_(narrow_tile); +// } + +// inline void llk_unpack_tilize_init(const std::uint32_t operand = 0, const std::uint32_t ct_dim = 0) { +// cfg_reg_rmw_tensix(0); + +// const std::uint32_t operand_id = get_operand_id(operand); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); +// const bool narrow_tile = get_operand_narrow_tile(operand_id); + +// // Save state of unpacker config for quick restore +// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0, 
THCON_SEC0_REG2_Out_data_format_ADDR32); // Save unpack config[0] +// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context + +// _llk_unpack_tilize_init_( +// unpack_src_format[operand_id], +// unpack_dst_format[operand_id], +// ct_dim, +// face_r_dim, +// narrow_tile +// ); + +// } + +// inline void llk_unpack_tilize_uninit(const std::uint32_t face_r_dim = FACE_R_DIM) { +// TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0); +// TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0); // Restore unpack config[0] +// TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1); // Restore tile x dim per context +// } + +// inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t block_ct_dim) { + +// std::uint32_t operand_id = get_operand_id(operand); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// const bool narrow_tile = get_operand_narrow_tile(operand_id); + +// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; // Remove header size added by descriptor + +// _llk_unpack_tilize_( +// base_address, +// tile_index, +// unpack_src_format[operand_id], +// block_ct_dim, +// face_r_dim, +// num_faces, +// narrow_tile +// ); +// } + +// inline void llk_unpack_tilize_block(std::uint32_t operand, std::uint32_t block_c_tiles) { +// for (std::uint32_t tile_index = 0; tile_index < block_c_tiles; tile_index++) { +// llk_unpack_tilize(operand, tile_index, block_c_tiles); +// } +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h new file mode 100644 index 00000000000..dded559e94d --- /dev/null +++ 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h @@ -0,0 +1,96 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_untilize.h" +#include "llk_unpack_common_api.h" + +// /************************************************************************* +// * LLK UNPACK UNTILIZE +// *************************************************************************/ +// template +// inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) { +// constexpr bool is_row_pool = false; +// constexpr bool within_face_16x16_transpose = false; +// constexpr StochRndType stoch_rnd_mode = StochRndType::None; + +// const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand); +// const uint32_t unpA_num_faces = 4; +// const uint32_t unpA_face_r_dim = FACE_R_DIM; + +// _llk_unpack_untilize_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpack_dst_format[unpA_operand_id], +// unpA_face_r_dim, +// within_face_16x16_transpose, +// unpA_num_faces +// ); +// } + +// inline void llk_unpack_untilize_hw_configure_disaggregated(const std::uint32_t unpA_operand) { +// const llk_unpack_A_params_t unpack_untilize_params = { +// .unpA_operand = unpA_operand, +// }; +// llk_unpack_untilize_hw_configure(&unpack_untilize_params); +// } + +// inline void llk_unpack_untilize_mop_config() { +// _llk_unpack_untilize_mop_config_(); +// } + +// inline void llk_unpack_untilize_init(std::uint32_t operand = 0) { +// const std::uint32_t operand_id = get_operand_id(operand); +// const std::uint32_t face_r_dim = 1; +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); + +// // Save state of unpacker config for quick restore +// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_0, UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32); // Save unpack stride config +// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_1, 
THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context +// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_2, THCON_SEC0_REG0_TileDescriptor_ADDR32+1); // Save descriptor 1 + +// _llk_unpack_untilize_init_( +// unpack_dst_format[operand_id], +// cb_interface[operand_id].fifo_page_size, +// face_r_dim, +// num_faces +// ); +// } + +// inline void llk_unpack_untilize_uninit(const std::uint32_t operand, const std::uint32_t face_r_dim = FACE_R_DIM) { +// std::uint32_t operand_id = get_operand_id(operand); +// std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1; +// std::uint32_t unpA_ch1_y_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride; + +// // Check that unpacker is done (all contexts freed up) before starting hw configuration +// wait_for_idle(); + +// // Reset address counters +// unpacker_addr_counter_init(); + +// // Wait for cfg to be free to edit +// TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK); + +// // Reset the values to default in unpack AB common. +// TT_SETADCXX(p_setadc::UNP_A, FACE_R_DIM*FACE_C_DIM-1, 0x0); +// TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16); +// cfg_reg_rmw_tensix(1); +// cfg_reg_rmw_tensix(unpA_ch1_y_stride); +// TTI_NOP; TTI_NOP; // Do we need this for WH? 
+// } + +// template +// inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_tile_cols) { +// const std::uint32_t operand_id = get_operand_id(operand); +// const std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; + +// _llk_unpack_untilize_pass_( +// base_address, +// block_tile_cols +// ); +// } + +// inline void llk_unpack_untilize(std::uint32_t operand, std::uint32_t block_c_tiles) { +// llk_unpack_untilize_pass(operand, block_c_tiles); +// llk_unpack_untilize_pass(operand, block_c_tiles); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h new file mode 100644 index 00000000000..37e018dc6b8 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include + +#include "circular_buffer.h" + +extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS]; /* declaration only (extern): the array of NUM_CIRCULAR_BUFFERS entries is defined elsewhere */ diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h new file mode 100644 index 00000000000..1569b4cdcd1 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include + +inline uint32_t get_operand_id(uint32_t operand) /* identity mapping: returns operand unchanged */ +{ + return (operand); +} + +inline const uint32_t get_operand_src_format(const std::uint32_t operand_id) /* returns unpack_src_format[operand_id]; the index is not range-checked here. NOTE(review): the top-level const on this by-value return type has no effect on callers */ +{ + return unpack_src_format[operand_id]; +} + +inline const uint32_t get_operand_dst_format(const std::uint32_t operand_id) /* returns unpack_dst_format[operand_id]; the index is not range-checked here */ +{ + return unpack_dst_format[operand_id]; +} + +//TODO: Do we need tile dim functions for GS?
+inline const uint32_t get_operand_num_faces(const std::uint32_t operand_id) +{ + return 4; +} + +inline const uint32_t get_operand_partial_face(const std::uint32_t operand_id) +{ + return 0; +} + +inline const uint32_t get_operand_face_r_dim(const std::uint32_t operand_id) +{ + return 16; +} + +inline const uint32_t get_operand_narrow_tile(const std::uint32_t operand_id) +{ + return 0; +} + +inline const uint32_t get_operand_tile_r_dim(const std::uint32_t operand_id) +{ + return 32; +} + +inline const uint32_t get_operand_tile_c_dim(const std::uint32_t operand_id) +{ + return 32; +} diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h new file mode 100644 index 00000000000..bd010082bbd --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include + +// Metal specific overrides -- No support for partial tiles so hard-code to fixed 32x32 sizes +inline uint32_t get_output_id(uint32_t output) +{ + const uint32_t OUTPUT_BASE = 0; + return ((output) - OUTPUT_BASE); +} + +inline const uint32_t get_output_base_id() +{ + const uint32_t OUTPUT_BASE_ID = 16; + return (OUTPUT_BASE_ID); +} + +inline const uint32_t get_output_src_format(const std::uint32_t output_id) +{ + return pack_src_format[output_id]; +} + +inline const uint32_t get_output_dst_format(const std::uint32_t output_id) +{ + return pack_dst_format[output_id]; +} + +//TODO: Do we need tile dim functions for GS? 
+inline const uint32_t get_output_num_faces(const std::uint32_t output_id) +{ + return 4; +} + +inline const uint32_t get_output_partial_face(const std::uint32_t output_id) +{ + return 0; +} + +inline const uint32_t get_output_face_r_dim(const std::uint32_t output_id) +{ + return 16; +} + +inline const uint32_t get_output_narrow_tile(const std::uint32_t output_id) +{ + return 0; +} + +inline const uint32_t get_output_tile_r_dim(const std::uint32_t output_id) +{ + return 32; +} + +inline const uint32_t get_output_tile_c_dim(const std::uint32_t output_id) +{ + return 32; +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h index f9359469e33..3dd7dbe114c 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h @@ -19,4 +19,3 @@ extern uint32_t math_sync_tile_dst_index; extern uint32_t __local_mem_rodata_start_addr[]; extern uint32_t __local_mem_rodata_end_addr[]; -extern uint32_t __firmware_start[]; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h index 29a2dbf9cfe..cf08580ad69 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +//TODO: This file should be deleted after fixing redefinition errors, +// functions should be moved to ckernel_globals.h #pragma once #include diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h index 2b94607012d..ea113ce5fa0 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h @@ -8,8 +8,7 @@ inline uint32_t get_operand_id(uint32_t 
operand) { - const int OPERAND_BASE_ID = 0; - return (operand); + return (operand); } inline const uint32_t get_operand_src_format(const std::uint32_t operand_id) diff --git a/tt_metal/include/compute_kernel_api/unpack.h b/tt_metal/include/compute_kernel_api/unpack.h index 2aaefe1d9d4..c52dc248fea 100644 --- a/tt_metal/include/compute_kernel_api/unpack.h +++ b/tt_metal/include/compute_kernel_api/unpack.h @@ -7,67 +7,67 @@ #include "common_globals.h" +//TODO: Is this file needed? +// namespace ckernel { -namespace ckernel { +// /** +// * Helper function to reconfigure unpacker srca and srcb input data formats. +// */ +// ALWI void unpack_reconfig_data_format(const uint32_t srca_new_operand, const uint32_t srcb_new_operand) { +// #ifdef ARCH_GRAYSKULL +// UNPACK(( llk_unpack_reconfig_data_format(srca_new_operand, srcb_new_operand) )); +// #endif +// // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. +// } -/** - * Helper function to reconfigure unpacker srca and srcb input data formats. - */ -ALWI void unpack_reconfig_data_format(const uint32_t srca_new_operand, const uint32_t srcb_new_operand) { - #ifdef ARCH_GRAYSKULL - UNPACK(( llk_unpack_reconfig_data_format(srca_new_operand, srcb_new_operand) )); - #endif - // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. -} +// /** +// * Helper function to reconfigure srca/srcb input data formats, only if they differ from existing formats. +// */ +// ALWI void unpack_reconfig_data_format(const uint32_t srca_old_operand, const uint32_t srca_new_operand, const uint32_t srcb_old_operand, const uint32_t srcb_new_operand) { +// #ifdef ARCH_GRAYSKULL +// UNPACK(( llk_unpack_reconfig_data_format(srca_old_operand, srca_new_operand, srcb_old_operand, srcb_new_operand) )); +// #endif +// // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. 
+// } -/** - * Helper function to reconfigure srca/srcb input data formats, only if they differ from existing formats. -*/ -ALWI void unpack_reconfig_data_format(const uint32_t srca_old_operand, const uint32_t srca_new_operand, const uint32_t srcb_old_operand, const uint32_t srcb_new_operand) { - #ifdef ARCH_GRAYSKULL - UNPACK(( llk_unpack_reconfig_data_format(srca_old_operand, srca_new_operand, srcb_old_operand, srcb_new_operand) )); - #endif - // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. -} +// /** +// * Helper function to reconfigure unpacker srca input data format. +// */ +// ALWI void unpack_reconfig_data_format_srca(const uint32_t srca_new_operand) { +// #ifdef ARCH_GRAYSKULL +// UNPACK(( llk_unpack_reconfig_data_format_srca(srca_new_operand) )); +// #endif +// // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. +// } -/** - * Helper function to reconfigure unpacker srca input data format. - */ -ALWI void unpack_reconfig_data_format_srca(const uint32_t srca_new_operand) { - #ifdef ARCH_GRAYSKULL - UNPACK(( llk_unpack_reconfig_data_format_srca(srca_new_operand) )); - #endif - // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. -} +// /** +// * Helper function to reconfigure unpacker srca input data format, only if it differs from existing format. +// */ +// ALWI void unpack_reconfig_data_format_srca(const uint32_t srca_old_operand, const uint32_t srca_new_operand) { +// #ifdef ARCH_GRAYSKULL +// UNPACK(( llk_unpack_reconfig_data_format_srca(srca_old_operand, srca_new_operand) )); +// #endif +// // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. +// } -/** - * Helper function to reconfigure unpacker srca input data format, only if it differs from existing format. 
- */ -ALWI void unpack_reconfig_data_format_srca(const uint32_t srca_old_operand, const uint32_t srca_new_operand) { - #ifdef ARCH_GRAYSKULL - UNPACK(( llk_unpack_reconfig_data_format_srca(srca_old_operand, srca_new_operand) )); - #endif - // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. -} +// /** +// * Helper function to reconfigure unpacker srcb input data format. +// */ +// ALWI void unpack_reconfig_data_format_srcb(const uint32_t srcb_new_operand) { +// #ifdef ARCH_GRAYSKULL +// UNPACK(( llk_unpack_reconfig_data_format_srcb(srcb_new_operand) )); +// #endif +// // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. +// } -/** - * Helper function to reconfigure unpacker srcb input data format. - */ -ALWI void unpack_reconfig_data_format_srcb(const uint32_t srcb_new_operand) { - #ifdef ARCH_GRAYSKULL - UNPACK(( llk_unpack_reconfig_data_format_srcb(srcb_new_operand) )); - #endif - // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. -} +// /** +// * Helper function to reconfigure unpacker srcb input data format, only if it differs from existing format. +// */ +// ALWI void unpack_reconfig_data_format_srcb(const uint32_t srcb_old_operand, const uint32_t srcb_new_operand) { +// #ifdef ARCH_GRAYSKULL +// UNPACK(( llk_unpack_reconfig_data_format_srcb(srcb_old_operand, srcb_new_operand) )); +// #endif +// // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. +// } -/** - * Helper function to reconfigure unpacker srcb input data format, only if it differs from existing format. - */ -ALWI void unpack_reconfig_data_format_srcb(const uint32_t srcb_old_operand, const uint32_t srcb_new_operand) { - #ifdef ARCH_GRAYSKULL - UNPACK(( llk_unpack_reconfig_data_format_srcb(srcb_old_operand, srcb_new_operand) )); - #endif - // NOTE: For wormhole_b0, updated unpacker functions don't yet exist, so skip. -} - -} +// }