Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Llk refactor uplift #3908

Merged
merged 16 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

#include <cstdint>
#include "llk_math_common.h"
#include "llk_math_eltwise_unary_datacopy.h"
#include "llk_math_eltwise_unary_datacopy.h"
#include "llk_math_matmul.h"
#include "llk_math_unary_datacopy_api.h"
#include "llk_math_unary_datacopy_api.h"
#include "llk_math_matmul_api.h"
namespace NAMESPACE
{

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>
#include "llk_unpack_common.h"
#include "llk_unpack_tilize.h"
#include "llk_unpack_untilize.h"
#include "llk_unpack_A.h"
#include "llk_unpack_AB_matmul.h"
#include "llk_unpack_common_api.h"
#include "llk_unpack_tilize_api.h"
#include "llk_unpack_untilize_api.h"
#include "llk_unpack_A_api.h"
#include "llk_unpack_AB_matmul_api.h"
namespace NAMESPACE
{

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

#include <cstdint>
#include "llk_math_common.h"
#include "llk_math_eltwise_binary.h"
#include "llk_math_eltwise_unary_datacopy.h"
#include "llk_math_binary_api.h"
#include "llk_math_unary_datacopy_api.h"

namespace NAMESPACE
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>
#include "llk_unpack_common.h"
#include "llk_unpack_AB.h"
#include "llk_unpack_untilize.h"
#include "llk_unpack_common_api.h"
#include "llk_unpack_AB_api.h"
#include "llk_unpack_untilize_api.h"

namespace NAMESPACE
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ namespace NAMESPACE {

#ifdef TRISC_MATH
#include "llk_math_common.h"
#include "llk_math_eltwise_unary_datacopy.h"
#include "llk_math_unary_datacopy_api.h"

void math_main()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

#include <cstdint>

#include "llk_3c.h"

namespace NAMESPACE {
void MAIN {
// expands to hlk_relu_config(nullptr, 1); for relu only
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include "compute_kernel_api/tile_move_copy.h"
#include "compute_kernel_api/matmul.h"


#include "mod_div_lib.h"

inline void tilize_activation(uint32_t in0_cb, uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks, uint32_t out_cb)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ namespace NAMESPACE {
#ifdef TRISC_MATH
#include <cstdint>
#include "llk_math_common.h"
#include "llk_math_eltwise_binary.h"
#include "llk_math_eltwise_unary_datacopy.h"
#include "llk_math_binary_api.h"
#include "llk_math_unary_datacopy_api.h"

void math_main()
{
Expand Down Expand Up @@ -49,9 +49,9 @@ void math_main()

#ifdef TRISC_UNPACK
#include <cstdint>
#include "llk_unpack_common.h"
#include "llk_unpack_AB.h"
#include "llk_unpack_untilize.h"
#include "llk_unpack_common_api.h"
#include "llk_unpack_AB_api.h"
#include "llk_unpack_untilize_api.h"

void unpack_main()
{
Expand Down
1 change: 1 addition & 0 deletions tt_eager/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <cstdint>

#include "mod_div_lib.h"
#include "compute_kernel_api/tilize.h"
#include "compute_kernel_api/untilize.h"
#include "compute_kernel_api/tile_move_copy.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <cstdint>

#include "mod_div_lib.h"
#include "compute_kernel_api/tilize.h"
#include "compute_kernel_api/untilize.h"
#include "compute_kernel_api/tile_move_copy.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <cstdint>

#include "mod_div_lib.h"
#include "compute_kernel_api/tilize.h"
#include "compute_kernel_api/untilize.h"
#include "compute_kernel_api/tile_move_copy.h"
Expand Down
115 changes: 4 additions & 111 deletions tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,16 @@
namespace ckernel
{

#define get_compile_time_arg_val(arg_idx) KERNEL_COMPILE_TIME_ARG_ ## arg_idx

constexpr uint PACK_FLUSH_COUNTERS = // counters flush
(1 << PACK_COUNTERS_SEC2_pack_per_xy_plane_SHAMT) |
(1 << PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_SHAMT) |
(1 << PACK_COUNTERS_SEC2_pack_xys_per_tile_SHAMT);

extern volatile uint * const reg_base;
extern volatile uint * const pc_buf_base;
extern volatile uint * const regfile;
extern volatile uint * reg_base;
extern volatile uint * pc_buf_base;
extern volatile uint * regfile;
rtawfik01 marked this conversation as resolved.
Show resolved Hide resolved
extern uint *regmem;
extern volatile uint * const instrn_buffer;
extern volatile uint * instrn_buffer;
extern volatile uint *dbg_event_scratch;
extern volatile uint local_mem_barrier;

Expand All @@ -62,8 +60,6 @@ extern uint32_t dest_offset_id;
extern uint32_t dbg_event_index;
extern uint32_t dbg_event_end;

extern uint32_t op_info_offset;

// Internal scope to namespace methods only (C++ does not allow namespace private ownership)
namespace internal {
}
Expand Down Expand Up @@ -281,109 +277,6 @@ inline void debug_dump(uint8_t *data, uint32_t byte_size) {
// TODO(pk) re-implement
}

// Copies the next op-info record into `op_info_struct` and advances the
// global read cursor `op_info_offset`.
//
// The record is read as seven 32-bit words starting at
// OP_INFO_BASE_ADDR + op_info_offset and written word-by-word over the
// storage of `op_info_struct`. The cursor then moves forward by 28 bytes
// (seven 4-byte words) and is reset to 0 when it lands exactly on
// OP_INFO_SIZE.
inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) {
    static constexpr uint32_t op_info_num_items = 7;

    // Source: current record. Destination: raw words of the caller's struct.
    uint32_t* src_words = reinterpret_cast<uint32_t*>(OP_INFO_BASE_ADDR + op_info_offset);
    volatile tt_l1_ptr uint32_t* dst_words = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(&op_info_struct);

    uint32_t word = 0;
    while (word < op_info_num_items) {
        dst_words[word] = src_words[word];
        word++;
    }

    // Advance to the following record.
    op_info_offset += 28;

    // Wrap the cursor so the next read does not go out of bounds.
    if (op_info_offset == OP_INFO_SIZE) {
        op_info_offset = 0;
    }
}

// Software multiply of two unsigned ints using shift-and-add.
//
// Walks the bits of `a` from least to most significant; for every set bit
// the correspondingly shifted `b` is added to the result. The result wraps
// modulo 2^(width of unsigned int), exactly like the built-in `*`.
inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b)
{
    unsigned int product = 0;
    for (; a != 0; a >>= 1, b <<= 1)
    {
        // Add the current partial product only when the low bit of `a` is set.
        if ((a & 1) != 0)
        {
            product += b;
        }
    }
    return product;
}

// Returns n / 12 without using a divide instruction.
//
// Multiply-by-reciprocal ("magic number") division:
// 0xAAAAAAAB == ceil(2^33 / 3), so (n * 0xAAAAAAAB) >> 33 == n / 3 and the
// total right shift of 35 yields n / 12.
// https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm
inline __attribute__((always_inline)) uint32_t fast_udiv_12(uint32_t n)
{
    return (((uint64_t) n * 0xAAAAAAAB) >> 32) >> 3;
}

// Returns n / 94 without using a divide instruction.
//
// Multiply-by-reciprocal ("magic number") division:
// 0xAE4C415D == ceil(2^38 / 94), i.e. a fixed-point 1/94, followed by a total
// right shift of 38.
// https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm
inline __attribute__((always_inline)) uint32_t fast_udiv_94(uint32_t n)
{
    return (((uint64_t) n * 0xAE4C415D) >> 32) >> 6;
}

// Unsigned 32-bit division of `n` by the compile-time constant `d`.
//
// d == 12 and d == 94 dispatch to the multiply-by-reciprocal helpers above;
// every other divisor uses a branch-free shift-subtract (restoring) division
// derived from LLVM compiler-rt's __udivsi3.
//
// Returns the quotient n / d. Returns 0 when d == 0 (no trap is raised).
template <uint32_t d>
inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n)
{
    if constexpr (d == 12) {
        // fast divide for 12 divisor
        return fast_udiv_12(n);
    } else if constexpr (d == 94) {
        // fast divide for 94 divisor. Handles Banked L1 address generation for E75
        return fast_udiv_94(n);
    } else {
        // generic divide from llvm
        const unsigned n_uword_bits = sizeof(uint32_t) * CHAR_BIT;
        unsigned int q;
        unsigned int r;
        unsigned sr;
        /* special cases */
        if (d == 0)
            return 0; /* ?! */
        if (n == 0)
            return 0;
        sr = __builtin_clz(d) - __builtin_clz(n);
        /* 0 <= sr <= n_uword_bits - 1 or sr large */
        if (sr > n_uword_bits - 1) /* d > n */
            return 0;
        if (sr == n_uword_bits - 1) /* d == 1 */
            return n;
        ++sr;
        /* 1 <= sr <= n_uword_bits - 1 */
        /* Not a special case */
        q = n << (n_uword_bits - sr);
        r = n >> sr;
        unsigned int carry = 0;
        for (; sr > 0; --sr)
        {
            /* r:q = ((r:q) << 1) | carry */
            r = (r << 1) | (q >> (n_uword_bits - 1));
            q = (q << 1) | carry;
            /* carry = 0;
             * if (r >= d)
             * {
             *     r -= d;
             *     carry = 1;
             * }
             */
            // The top bit of (d - r - 1) is set exactly when r >= d.
            carry = (unsigned int)(d - r - 1) >> (n_uword_bits - 1);
            // Widen the 0/1 flag to a 0/all-ones mask so that the whole of d
            // (not just its lowest bit) is subtracted when r >= d.
            r -= d & (0u - carry);
        }
        q = (q << 1) | carry;
        return q;
    }
}
// Unsigned 32-bit remainder of `a` by the compile-time constant `d`.
//
// Computed as a - (a / d) * d, where the quotient comes from
// udivsi3_const_divisor<d>.
template <uint32_t d>
inline __attribute__((always_inline)) uint32_t umodsi3_const_divisor(uint32_t a)
{
    const uint32_t quotient = udivsi3_const_divisor<d>(a);
    return a - quotient * d;
}

inline void tensix_sync()
{
volatile uint foo = 0;
Expand Down
59 changes: 0 additions & 59 deletions tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,71 +7,12 @@
#include <cstdint>
#include "ckernel_structs.h"
#include "risc_attribs.h"
#include "tensix_functions.h"
#include "hostdevcommon/common_runtime_address_map.h"

extern uint32_t cfg_state_id;
extern uint32_t unp_cfg_context;
extern uint32_t gl_alu_format_spec_reg;

extern volatile uint32_t l1_buffer[16];

//extern const int32_t unpack_src_format[24];
//extern const int32_t unpack_dst_format[24];
//extern const int32_t pack_src_format[16];
//extern const int32_t pack_dst_format[16];

extern uint32_t pack_sync_tile_dst_ptr;
extern uint32_t math_sync_tile_dst_index;

extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS];

extern uint32_t __ldm_bss_start[];
extern uint32_t __ldm_bss_end[];
extern uint32_t __ldm_data_start[];
extern uint32_t __ldm_data_end[];
extern void (* __init_array_start[])();
extern void (* __init_array_end[])();
extern uint32_t __firmware_start[];

extern void kernel_init();
extern void kernel_launch();

// Copies `len` 32-bit words from `l1_addr` to `local_mem_addr`.
//
// local_mem_addr: destination buffer, written at indices [0, len).
// l1_addr:        source buffer, read at indices [0, len).
// len:            number of 32-bit words to copy; nothing is copied when len <= 0.
//
// The main loop moves six words per iteration, issuing all six loads before
// any of the six stores; a word-at-a-time loop then copies whatever is left.
inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
// Cover L1 load latency of 6 cycles for the bulk of the copy
int32_t n = 0;
// Runs only while a full group of six words remains (n + 5 < len).
while (n < len - 5) {
// Issue the six loads back to back ...
uint32_t v0 = l1_addr[n + 0];
uint32_t v1 = l1_addr[n + 1];
uint32_t v2 = l1_addr[n + 2];
uint32_t v3 = l1_addr[n + 3];
uint32_t v4 = l1_addr[n + 4];
uint32_t v5 = l1_addr[n + 5];
// ... and only then store the loaded values.
local_mem_addr[n + 0] = v0;
local_mem_addr[n + 1] = v1;
local_mem_addr[n + 2] = v2;
local_mem_addr[n + 3] = v3;
local_mem_addr[n + 4] = v4;
local_mem_addr[n + 5] = v5;
n += 6;
}
// Could optimize this further (eg, loop of 2 or 4), probably not worth it
// Tail: copy the remaining (at most five) words one at a time.
while (n < len) {
local_mem_addr[n] = l1_addr[n];
n++;
}
}

// Performs the start-up work that would normally live in an assembly crt0
// (done here because it is easier to express in C):
//   1. zero the range [__ldm_bss_start, __ldm_bss_end) via wzerorange,
//   2. copy the initialised-data image into [__ldm_data_start, __ldm_data_end),
//   3. call every function pointer in [__init_array_start, __init_array_end).
//
// init_local_l1_base: base address of the image the data section is copied
//                     from; the source is this base plus the offset of
//                     __ldm_data_start from MEM_LOCAL_BASE.
inline void firmware_kernel_common_init(void *init_local_l1_base) {
    // Step 1: clear .bss.
    wzerorange(__ldm_bss_start, __ldm_bss_end);

    // Step 2: copy the data section, measured in 32-bit words (byte size >> 2).
    const int32_t data_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
    const uint32_t data_offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE;
    l1_to_local_mem_copy(
        (uint32_t *)__ldm_data_start,
        (uint32_t *)((uint8_t *)init_local_l1_base + data_offset),
        data_words);

    // Step 3: run the init-array entries in order.
    void (** init_fn)() = __init_array_start;
    while (init_fn < __init_array_end) {
        (**init_fn)();
        init_fn++;
    }
}
36 changes: 0 additions & 36 deletions tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -244,20 +244,6 @@ inline void calculate_atan()
}
}


template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void calculate_negative()
{

for (int d = 0; d < ITERATIONS; d++)
{
vFloat val = dst_reg[0];
dst_reg[0] = -val;
dst_reg++;
}
}


template <bool APPROXIMATION_MODE, int ITERATIONS, int RECIPROCAL_ITERATIONS>
inline void calculate_rsqrt()
{
Expand Down Expand Up @@ -888,21 +874,6 @@ inline void calculate_silu()
}
}

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void calculate_mask()
{
bool exponent_size_8 = true;
for (int d = 0; d < ITERATIONS; d++)
{
vFloat mask = dst_reg[16];
v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) {
dst_reg[0] = 0;
}
v_endif;
dst_reg++;
}
}

template <SfpuType operation, bool APPROXIMATION_MODE, int SfpuType_PARAM = 0, int ITERATIONS = 4>
inline void calculate_sfpu(uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0)
{
Expand Down Expand Up @@ -997,13 +968,6 @@ inline void calculate_sfpu(uint param0 = 0, uint param1 = 0, uint param2 = 0, ui
else if constexpr (operation == SfpuType::silu) {
calculate_silu<APPROXIMATION_MODE, ITERATIONS>();
}
else if constexpr (operation == SfpuType::mask) {
calculate_mask<APPROXIMATION_MODE, ITERATIONS>();
}
else if constexpr (operation == SfpuType::negative) {
calculate_negative<APPROXIMATION_MODE, ITERATIONS>();
}

//erf, erfc are dispatched directly.
}

Expand Down
Loading