Skip to content

Commit

Permalink
Revert "#14826: Remove misoptimizations from init code (#14861)"
Browse files Browse the repository at this point in the history
This reverts commit 650c5c3.
  • Loading branch information
ttmchiou committed Nov 14, 2024
1 parent bd3b1c6 commit 52742eb
Show file tree
Hide file tree
Showing 13 changed files with 116 additions and 91 deletions.
3 changes: 2 additions & 1 deletion tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,8 @@ int main() {
DIRTY_STACK_MEMORY();
WAYPOINT("I");

do_crt1((uint32_t*)MEM_BRISC_INIT_LOCAL_L1_BASE_SCRATCH);
int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint*)__ldm_data_start, (uint tt_l1_ptr*)MEM_BRISC_INIT_LOCAL_L1_BASE_SCRATCH, num_words);

mailboxes->launch_msg_rd_ptr = 0; // Initialize the rdptr to 0
noc_index = 0;
Expand Down
8 changes: 4 additions & 4 deletions tt_metal/hw/firmware/src/brisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
#include "tools/profiler/kernel_profiler.hpp"
#include <kernel_includes.hpp>

extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];

void kernel_launch(uint32_t kernel_base_addr) {

#if defined(DEBUG_NULL_KERNELS) && !defined(DISPATCH_KERNEL)
Expand All @@ -26,10 +29,7 @@ void kernel_launch(uint32_t kernel_base_addr) {
while (c_tensix_core::read_wall_clock() < end_time);
#endif
#else
extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];
do_crt1((uint32_t tt_l1_ptr
*)(kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));
firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));

if constexpr (NOC_MODE == DM_DEDICATED_NOC) {
noc_local_state_init(NOC_INDEX);
Expand Down
8 changes: 3 additions & 5 deletions tt_metal/hw/firmware/src/erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,15 @@ uint32_t tt_l1_ptr *sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used

void __attribute__((noinline)) Application(void) {
WAYPOINT("I");
rtos_context_switch_ptr = (void (*)())RtosTable[0];

// Not using do_crt1 since it is copying to registers???
// Not using firmware_kernel_common_init since it is copying to registers
// TODO: need to find free space that routing FW is not using
extern uint32_t __ldm_bss_start[];
extern uint32_t __ldm_bss_end[];
wzerorange(__ldm_bss_start, __ldm_bss_end);

rtos_context_switch_ptr = (void (*)())RtosTable[0];

risc_init();
noc_init(MEM_NOC_ATOMIC_RET_VAL_ADDR);
wzerorange(__ldm_bss_start, __ldm_bss_end);

for (uint32_t n = 0; n < NUM_NOCS; n++) {
noc_local_state_init(n);
Expand Down
7 changes: 6 additions & 1 deletion tt_metal/hw/firmware/src/idle_erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,13 @@ int main() {
conditionally_disable_l1_cache();
DIRTY_STACK_MEMORY();
WAYPOINT("I");
do_crt1((uint32_t *)MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH);
int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
uint32_t *local_mem_ptr = (uint32_t *)__ldm_data_start;
uint32_t *l1_data_ptr = (uint32_t *)MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH;
uint32_t heartbeat = 0;
for (int32_t i = 0; i < num_words; i++) {
local_mem_ptr[i] = l1_data_ptr[i];
}

risc_init();

Expand Down
8 changes: 4 additions & 4 deletions tt_metal/hw/firmware/src/idle_erisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@

#include <kernel_includes.hpp>

extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];

void kernel_launch(uint32_t kernel_base_addr) {
DeviceZoneScopedMainChildN("ERISC-KERNEL");

extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];
do_crt1((uint32_t tt_l1_ptr
*)(kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));
firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));

noc_local_state_init(NOC_INDEX);

Expand Down
3 changes: 2 additions & 1 deletion tt_metal/hw/firmware/src/ncrisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ int main(int argc, char *argv[]) {
DIRTY_STACK_MEMORY();
WAYPOINT("I");

do_crt1((uint32_t tt_l1_ptr *)MEM_NCRISC_INIT_LOCAL_L1_BASE_SCRATCH);
int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint *)__ldm_data_start, (uint tt_l1_ptr *)MEM_NCRISC_INIT_LOCAL_L1_BASE_SCRATCH, num_words);

risc_init();

Expand Down
13 changes: 7 additions & 6 deletions tt_metal/hw/firmware/src/ncrisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ uint32_t noc_nonposted_writes_acked[NUM_NOCS];
uint32_t noc_nonposted_atomics_acked[NUM_NOCS];
uint32_t noc_posted_writes_num_issued[NUM_NOCS];

extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];

void kernel_launch(uint32_t kernel_base_addr) {

DeviceZoneScopedMainChildN("NCRISC-KERNEL");
Expand All @@ -35,13 +38,11 @@ void kernel_launch(uint32_t kernel_base_addr) {
while (c_tensix_core::read_wall_clock() < KERNEL_RUN_TIME);
#endif
#else
extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];
do_crt1((
uint32_t tt_l1_ptr *)(kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));

if constexpr (NOC_MODE == DM_DEDICATED_NOC) {
noc_local_state_init(NOC_INDEX);
firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));

if constexpr (NOC_MODE == DM_DEDICATED_NOC) {
noc_local_state_init(NOC_INDEX);
} else {
noc_local_state_init(NOC_0);
noc_local_state_init(NOC_1);
Expand Down
5 changes: 4 additions & 1 deletion tt_metal/hw/firmware/src/trisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,10 @@ int main(int argc, char *argv[]) {
DIRTY_STACK_MEMORY();
WAYPOINT("I");

do_crt1((uint32_t tt_l1_ptr *)PREPROCESSOR_EXPAND(MEM_TRISC, COMPILE_FOR_TRISC, _INIT_LOCAL_L1_BASE_SCRATCH));
uint tt_l1_ptr *local_l1_start_addr =
(uint tt_l1_ptr *)PREPROCESSOR_EXPAND(MEM_TRISC, COMPILE_FOR_TRISC, _INIT_LOCAL_L1_BASE_SCRATCH);
int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint *)__ldm_data_start, local_l1_start_addr, num_words);

// Initialize GPRs to all 0s
#pragma GCC unroll 0
Expand Down
8 changes: 4 additions & 4 deletions tt_metal/hw/firmware/src/trisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ volatile tt_reg_ptr uint * mailbox_base[4] = {
};
}

extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];

void kernel_launch(uint32_t kernel_base_addr)
{
DeviceZoneScopedMainChildN("TRISC-KERNEL");
Expand All @@ -41,10 +44,7 @@ void kernel_launch(uint32_t kernel_base_addr)
ckernel::wait(KERNEL_RUN_TIME);
#endif
#else
extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];
do_crt1((
uint32_t tt_l1_ptr *)(kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));
firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));

#if defined(UCK_CHLKC_UNPACK)
// Make sure DBG_FEATURE_DISABLE register is cleared before every kernel is executed
Expand Down
79 changes: 37 additions & 42 deletions tt_metal/hw/inc/firmware_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,58 +14,53 @@
#include "hostdevcommon/kernel_structs.h"
#include "dev_msgs.h"

extern uint32_t __ldm_bss_start[];
extern uint32_t __ldm_bss_end[];
extern uint32_t __ldm_data_start[];
extern uint32_t __ldm_data_end[];
extern void (* __init_array_start[])();
extern void (* __init_array_end[])();

extern void kernel_init(uint32_t kernel_init);
extern void kernel_launch(uint32_t kernel_base_addr);

// Clear bss, copy initial data image, run global constructors.
inline void do_crt1(uint32_t tt_l1_ptr *data_image) {
// Clear bss.
extern uint32_t __ldm_bss_start[];
extern uint32_t __ldm_bss_end[];
wzerorange(__ldm_bss_start, __ldm_bss_end);

// Copy initialized data.
extern uint32_t __ldm_data_start[];
extern uint32_t __ldm_data_end[];
uint32_t *dst = __ldm_data_start;
uint32_t tt_l1_ptr *src = data_image;
unsigned len = __ldm_data_end - __ldm_data_start;
#pragma GCC unroll 0
while (len >= 3) {
auto v0 = src[0], v1 = src[1], v2 = src[2];
// 1) Make sure the optimizer does not think this is memcpy by
// hiding the pointer bookkeeping in an asm.
// 2) The scheduler doesn't know the above loads have 6 cycle
// latency. We emit the 3 bookkeeping adds as a single block
// in the load shadow before the stores. The optimizer will
// not be able to move these.
// 3) We don't need early clobbers here because of the +r
// constraint -- early clobbers would pessimize.
asm inline(
"addi %0,%0,3*%3\n\t"
"addi %1,%1,3*%3\n\t"
"addi %2,%2,-3"
: "+r"(src), "+r"(dst), "+r"(len)
: "i"(sizeof(v0)));
dst[-3] = v0, dst[-2] = v1, dst[-1] = v2;
inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
// Load six words before storing any of them, so that the 6-cycle L1 load latency is covered for the bulk of the copy
int32_t n = 0;
while (n < len - 5) {
uint32_t v0 = l1_addr[n + 0];
uint32_t v1 = l1_addr[n + 1];
uint32_t v2 = l1_addr[n + 2];
uint32_t v3 = l1_addr[n + 3];
uint32_t v4 = l1_addr[n + 4];
uint32_t v5 = l1_addr[n + 5];
local_mem_addr[n + 0] = v0;
local_mem_addr[n + 1] = v1;
local_mem_addr[n + 2] = v2;
local_mem_addr[n + 3] = v3;
local_mem_addr[n + 4] = v4;
local_mem_addr[n + 5] = v5;
n += 6;
}
// There are 0, 1 or 2 words of residue. This is smaller than a loop.
// We get smaller code layout by expecting the conditions to be true.
if (__builtin_expect(len >= 1, true)) {
dst[0] = src[0];
if (__builtin_expect(len >= 2, true))
dst[1] = src[1];
// Copy any remaining words one at a time. This could be optimized further (e.g., a loop of 2 or 4), but it is probably not worth it
while (n < len) {
local_mem_addr[n] = l1_addr[n];
n++;
}
}

inline void firmware_kernel_common_init(void *init_local_l1_base) {

// Handle the setup typically done in crt0 in asm (clear bss, copy the initialized data image, run global constructors). It is easier to do in C
wzerorange(__ldm_bss_start, __ldm_bss_end);

int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base), num_words);

// Run constructors.
extern void (*__init_array_start[])();
extern void (*__init_array_end[])();
#pragma GCC unroll 0
for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
(**fptr)();
}
}

FORCE_INLINE
uint32_t firmware_config_init(tt_l1_ptr mailboxes_t* const mailboxes, uint32_t core_type_index, uint32_t dispatch_class) {

Expand Down
22 changes: 13 additions & 9 deletions tt_metal/hw/toolchain/substitutes.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: © 2023, 2024 Tenstorrent Inc.
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -7,18 +7,22 @@

using namespace std;

extern "C" int atexit(void (*f)(void)) { return 0; }
extern "C" int atexit(void (*f)(void))
{
return 0;
}

extern "C" void exit(int ec) {
extern "C" void exit(int ec)
{
while (1) { asm volatile ("" ::: "memory"); }
}

extern "C" void wzerorange(uint32_t *start, uint32_t *end) {
#pragma GCC unroll 0
while (start != end) {
extern "C" void wzerorange(uint32_t *start, uint32_t *end) __attribute__((aligned(16)));

extern "C" void wzerorange(uint32_t *start, uint32_t *end)
{
for (; start != end; start++)
{
*start = 0;
// Prevent optimizer considering this loop equivalent to
// memset (start, 0, end - start) -- that's code bloat.
asm inline("addi %0,%0,%1" : "+r"(start) : "i"(sizeof(*start)));
}
}
42 changes: 30 additions & 12 deletions tt_metal/hw/toolchain/tmu-crt0.S
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,28 @@ _start:
addi gp,gp,%lo(__global_pointer$)
.option pop

// set stack pointer, reserve 16 bytes for main's arguments
lui sp, %hi(__stack_top - 16)
addi sp, sp, %lo(__stack_top - 16)
// set stack pointer
lui sp, %hi(__stack_top)
addi sp, sp, %lo(__stack_top)

// main is responsible for the rest of crt -- clear bss, copy data image, run global constructors
// Clear bss
lui a0, %hi(__ldm_bss_start)
addi a0, a0, %lo(__ldm_bss_start)
lui a1, %hi(__ldm_bss_end)
addi a1, a1, %lo(__ldm_bss_end)
call wzerorange

// Run global initializers
lui s2, %hi(__init_array_start)
addi s2, s2, %lo(__init_array_start)
lui s3, %hi(__init_array_end)
addi s3, s3, %lo(__init_array_end)
beq s2, s3, 2f
1: lw a0, 0(s2)
jalr a0
addi s2, s2, 4
bne s2, s3, 1b
2:

/* Pass in the tensix coordinates as argv[0][0] through argv[0][3].
argc = 1, envp = NULL. In memory, we'll have
Expand All @@ -27,15 +44,16 @@ _start:
* sp+8: s1
* sp+c: 0
*/
addi sp, sp, -16 /* (stack is aligned to 16 bytes in riscv calling convention) */
addi a0, sp, 8
sw a0, 0(sp) // argv[0]
sw zero, 4(sp) // argv[1]
sw s1, 8(sp) // argv[0][0..3]
sw zero, 12(sp) // argv[0][4..7]

li a0, 1 // argc = 1
mv a1, sp // argv
mv a2, zero // env
sw a0, 0(sp)
sw zero, 4(sp)
sw s1, 8(sp)
sw zero, 12(sp)

li a0, 1 # argc = 1
mv a1, sp
mv a2, zero

call main
tail exit
Expand Down
1 change: 0 additions & 1 deletion tt_metal/hw/toolchain/tmu-crt0k.S
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@
.type _start, @function

_start:
// kernel_launch is responsible for the rest of crt -- clear bss, copy data image, run global constructors
tail _Z13kernel_launchm
.size _start, .-_start

0 comments on commit 52742eb

Please sign in to comment.