#0: Add empty llk api files for grayskull to fix compile

tenstorrent · Dec 6, 2023 · 364fb1d · 364fb1d
1 parent 9f7f00d
commit 364fb1d
Show file tree

Hide file tree

Showing 42 changed files with 3,402 additions and 446 deletions.
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h b/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h
@@ -14,20 +14,18 @@ using namespace ckernel;
 
 
 #ifdef UCK_CHLKC_MATH
-// #include "chlkc_math_llk_args.h"
+#include "chlkc_unpack_data_format.h"
 #include "chlkc_math_fidelity.h"
 #include "chlkc_math_approx_mode.h"
 #include "chlkc_math.cpp"
 #endif
 
 #ifdef UCK_CHLKC_PACK
-// #include "chlkc_pack_llk_args.h"
 #include "chlkc_pack_data_format.h"
 #include "chlkc_pack.cpp"
 #endif
 
 #ifdef UCK_CHLKC_UNPACK
-// #include "chlkc_unpack_llk_args.h"
 #include "chlkc_unpack_data_format.h"
 #include "chlkc_unpack.cpp"
 #endif

diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
@@ -62,8 +62,6 @@ extern uint32_t dest_offset_id;
 extern uint32_t dbg_event_index;
 extern uint32_t dbg_event_end;
 
-extern uint32_t op_info_offset;
-
 // Internal scope to namespace methods only (C++ does not allow namespace private ownership)
 namespace internal {
 }
@@ -281,22 +279,6 @@ inline void debug_dump(uint8_t *data, uint32_t byte_size) {
     // TODO(pk) re-implement
 }
 
-inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) {
-
-    uint32_t* op_info_ptr = reinterpret_cast<uint32_t*>(OP_INFO_BASE_ADDR + op_info_offset);
-    static constexpr uint32_t op_info_num_items = 7;
-
-    volatile tt_l1_ptr uint32_t* op_info_struct_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(&op_info_struct);
-    for (uint32_t i = 0; i < op_info_num_items; i++) {
-        op_info_struct_ptr[i] = op_info_ptr[i];
-    }
-    op_info_offset += 28;
-
-    if (op_info_offset == OP_INFO_SIZE) {
-        op_info_offset = 0; // In case we go out of bounds
-    }
-}
-
 inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b)
 {
   unsigned int r = 0;

diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h
@@ -7,71 +7,12 @@
 #include <cstdint>
 #include "ckernel_structs.h"
 #include "risc_attribs.h"
-#include "tensix_functions.h"
-#include "hostdevcommon/common_runtime_address_map.h"
 
 extern uint32_t cfg_state_id;
 extern uint32_t unp_cfg_context;
 extern uint32_t gl_alu_format_spec_reg;
 
 extern volatile uint32_t l1_buffer[16];
 
-//extern const int32_t unpack_src_format[24];
-//extern const int32_t unpack_dst_format[24];
-//extern const int32_t pack_src_format[16];
-//extern const int32_t pack_dst_format[16];
-
 extern uint32_t pack_sync_tile_dst_ptr;
 extern uint32_t math_sync_tile_dst_index;
-
-extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS];
-
-extern uint32_t __ldm_bss_start[];
-extern uint32_t __ldm_bss_end[];
-extern uint32_t __ldm_data_start[];
-extern uint32_t __ldm_data_end[];
-extern void (* __init_array_start[])();
-extern void (* __init_array_end[])();
-extern uint32_t __firmware_start[];
-
-extern void kernel_init();
-extern void kernel_launch();
-
-inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
-    // Cover L1 load latency of 6 cycles for the bulk of the copy
-    int32_t n = 0;
-    while (n < len - 5) {
-        uint32_t v0 = l1_addr[n + 0];
-        uint32_t v1 = l1_addr[n + 1];
-        uint32_t v2 = l1_addr[n + 2];
-        uint32_t v3 = l1_addr[n + 3];
-        uint32_t v4 = l1_addr[n + 4];
-        uint32_t v5 = l1_addr[n + 5];
-        local_mem_addr[n + 0] = v0;
-        local_mem_addr[n + 1] = v1;
-        local_mem_addr[n + 2] = v2;
-        local_mem_addr[n + 3] = v3;
-        local_mem_addr[n + 4] = v4;
-        local_mem_addr[n + 5] = v5;
-        n += 6;
-    }
-    // Could optimize this further (eg, loop of 2 or 4), probably not worth it
-    while (n < len) {
-        local_mem_addr[n] = l1_addr[n];
-        n++;
-    }
-}
-
-inline void firmware_kernel_common_init(void *init_local_l1_base) {
-
-    // Handle stuff typically done in crt0 in asm.  Easier to do in C
-    wzerorange(__ldm_bss_start, __ldm_bss_end);
-
-    int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
-    uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE;
-    l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words);
-
-    for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
-        (**fptr)();
-    }
-}
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h
@@ -237,4 +237,221 @@ class ckernel_unpack_template
     void program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask = 0); // calls program, then run
 };
 
+    inline ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op)
+        : m_outer_loop_len(outer_loop_len)
+        , m_inner_loop_len(inner_loop_len)
+        , m_loop_op0(loop_op)
+        , m_loop_op1(TT_OP_NOP) // only one loop instruction supplied: the second loop slot is a NOP
+        , m_end_op0(TT_OP_NOP)
+        , m_end_op1(TT_OP_NOP)
+        , m_start_op0(TT_OP_NOP)
+    { // defined in a header, hence inline: avoids multiple definitions across translation units
+        m_loop0_last_instr = loop_op; // default; set_last_outer_loop_instr() replaces it
+        m_loop1_last_instr = loop_op; // default; set_last_inner_loop_instr() replaces it
+    }
+
+    inline ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1)
+        : m_outer_loop_len(outer_loop_len)
+        , m_inner_loop_len(inner_loop_len)
+        , m_loop_op0(loop_op0)
+        , m_loop_op1(loop_op1)
+        , m_end_op0(TT_OP_NOP) // start/end slots stay NOPs until set_start_op()/set_end_op(s)() are called
+        , m_end_op1(TT_OP_NOP)
+        , m_start_op0(TT_OP_NOP)
+    { // defined in a header, hence inline: avoids multiple definitions across translation units
+        m_loop0_last_instr = loop_op1; // default is the second loop op; set_last_outer_loop_instr() replaces it
+        m_loop1_last_instr = loop_op1; // default is the second loop op; set_last_inner_loop_instr() replaces it
+    }
+
+    inline void ckernel_template::set_loop_op0(uint loop_op)
+    {
+        m_loop_op0 = loop_op; // takes effect on the next program(), which writes it to mop_cfg[5]
+    }
+
+    inline void ckernel_template::set_loop_op1(uint loop_op)
+    {
+        m_loop_op1 = loop_op; // takes effect on the next program(), which writes it to mop_cfg[6]
+    }
+
+    inline void ckernel_template::set_end_ops(uint end_op0, uint end_op1)
+    {
+        m_end_op0 = end_op0; // program() writes this to mop_cfg[3]
+        m_end_op1 = end_op1; // program() writes this to mop_cfg[4]
+    }
+
+    inline void ckernel_template::set_end_op(uint end_op0)
+    {
+        set_end_ops(end_op0, TT_OP_NOP); // single end op: the second end slot is reset to a NOP
+    }
+
+    inline void ckernel_template::set_start_op(uint start_op0)
+    {
+        m_start_op0 = start_op0; // takes effect on the next program(), which writes it to mop_cfg[2]
+    }
+
+    inline void ckernel_template::set_last_inner_loop_instr(uint op)
+    {
+        m_loop1_last_instr = op; // "inner" maps to the loop1 slot; program() writes it to mop_cfg[8]
+    }
+
+    inline void ckernel_template::set_last_outer_loop_instr(uint op)
+    {
+        m_loop0_last_instr = op; // "outer" maps to the loop0 slot; program() writes it to mop_cfg[7]
+    }
+
+    inline void ckernel_template::program_and_run(volatile uint *instrn_buffer)
+    {
+        program(instrn_buffer); // write this template's fields into the MOP config words
+        run(instrn_buffer);     // then issue the MOP instruction that executes it
+    }
+
+    inline void ckernel_template::run(volatile uint *instrn_buffer)
+    { // instrn_buffer is not named in this body; NOTE(review): confirm TTI_MOP does not rely on it
+        TTI_MOP(1, 0, 0); // run the double-loop template
+    }
+
+    inline void ckernel_template::program(volatile uint *instrn_buffer)
+    {
+        volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
+
+        mop_sync(); // wait until previous mops have completed
+        // Write the nine template words in slot order: loop lengths, start/end ops, loop ops, last-instr ops.
+        mop_cfg[0] = m_outer_loop_len;
+        mop_cfg[1] = m_inner_loop_len;
+        mop_cfg[2] = m_start_op0;
+        mop_cfg[3] = m_end_op0;
+        mop_cfg[4] = m_end_op1;
+        mop_cfg[5] = m_loop_op0;
+        mop_cfg[6] = m_loop_op1;
+        mop_cfg[7] = m_loop0_last_instr;
+        mop_cfg[8] = m_loop1_last_instr;
+    }
+
+    inline void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
+    {
+        program(instrn_buffer);           // write this template's fields into the MOP config words
+        run(instrn_buffer, count, zmask); // then run it via the zmask overload of run()
+    }
+
+    inline void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask)
+    {
+        FWASSERT("Unpack template only supports loops up to 128", count <= 128); // NOTE(review): count == 0 passes this check and count - 1 is then -1 - confirm callers never pass 0
+        TT_MOP_CFG(zmask >> 16);              // Set the top 16 bits of zmask - we could skip this for count <= 16
+        TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template
+    }
+
+    // Version without zmask, should be slightly faster by eliminating one instruction (no TT_MOP_CFG is issued).
+    inline void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count)
+    {
+        FWASSERT("Unpack template only supports loops up to 128", count <= 128); // NOTE(review): count == 0 passes this check and count - 1 is then -1 - confirm callers never pass 0
+        TT_MOP(0, count - 1, 0); // Run the template
+    }
+
+    inline void ckernel_unpack_template::program(volatile uint *instrn_buffer) const
+    {
+        volatile uint *mop_cfg = reinterpret_cast<volatile uint *>(TENSIX_MOP_CFG_BASE);
+
+        mop_sync(); // wait until previous mops have completed
+        // Words 1..8 are written below; word 0 is not touched by this template.
+        mop_cfg[1] = m_unpackB | (m_unpack_halo << 1); // m_unpackB OR'ed with m_unpack_halo shifted up one bit
+        mop_cfg[2] = m_B_instr;
+        mop_cfg[3] = m_A0_instr;
+        mop_cfg[4] = m_A1_instr;
+        mop_cfg[5] = m_A2_instr;
+        mop_cfg[6] = m_A3_instr;
+        mop_cfg[7] = m_skipA_instr;
+        mop_cfg[8] = m_skipB_instr;
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr)
+    {
+        return ckernel_unpack_template(false, // src B
+            false,                            // halo
+            A_instr, 0, 0, 0, skipA_instr, 0, 0); // NOTE(review): order presumably (A0, A1, A2, A3, skipA, B, skipB) - constructor not visible here
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr)
+    {
+        return ckernel_unpack_template(false, // src B
+            false,                            // halo
+            B_instr, 0, 0, 0, skipB_instr, 0, 0); // NOTE(review): same argument positions as lA() with src B = false - presumably intentional, confirm
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr)
+    { // halo form: a DEF_NINFSRCA/DEF_ZEROSRCA instruction (chosen by neginf) is passed ahead of A_instr
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask)
+    {
+        // Figure out which unpack is last: the highest enabled one among A0..A2 gets its *_last_instr form.
+        // For halo_mask > 0x7 last_mask is 0, so none of A0..A2 uses the last form.
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask)
+    {
+        // Figure out which unpack is last: the highest enabled one among A0..A2 gets its *_fconv_last_instr form.
+        // For halo_mask > 0x7 last_mask is 0, so none of A0..A2 uses the last form.
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+        return ckernel_unpack_template(false, // src B
+            true,                             // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy)
+    {
+        // Figure out which unpack is last: the highest enabled one among A0..A2 gets its *_last_instr form.
+        // For halo_mask > 0x7 last_mask is 0, so none of A0..A2 uses the last form. rarefy only picks the B instruction.
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+        return ckernel_unpack_template(true, // src B
+            true,                            // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask)
+    {
+        // Figure out which unpack is last: the highest enabled one among A0..A2 gets its *_fconv_last_instr form.
+        // For halo_mask > 0x7 last_mask is 0, so none of A0..A2 uses the last form.
+        const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0;
+        return ckernel_unpack_template(true, // src B
+            true,                            // halo
+            ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr,
+            ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr,
+            ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr,
+            ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B);
+    }
+
+    // Builds a non-halo template with src B enabled, forwarding one A instruction and one B instruction
+    // together with their skip instructions; the three remaining instruction arguments are passed as 0.
+    inline ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr, uint B_instr, uint skipB_instr)
+    {
+        return ckernel_unpack_template(true, // src B
+            false,                           // halo
+            A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr);
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){
+        return ckernel_unpack_template::lA(instr0, skip0); // one-instruction loop: plain alias for lA()
+    }
+
+    inline ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){
+        // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA (argument order there is A, skipA, B, skipB).
+        return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1);
+    }
+
 } // namespace ckernel
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h b/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h
@@ -416,14 +416,4 @@ namespace ckernel::packer
    {
        dest_offset_id = 0;
    }
-
-   inline uint32_t get_output_id(uint32_t output)
-   {
-      return ((output) - OUTPUT_BASE);
-   }
-
-   inline constexpr uint32_t get_output_base_id()
-   {
-      return (OUTPUT_BASE_ID);
-   }
 }
diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h b/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h
@@ -335,9 +335,4 @@ namespace ckernel::unpacker
       // Clear context ID
       //reset_config_context();
     }
-
-   inline uint32_t get_operand_id(uint32_t operand)
-   {
-      return operand;
-   }
 }