Skip to content

Commit

Permalink
#14826: Remove misoptimizations from init code
Browse files Browse the repository at this point in the history
1) Stop wzerorange from being recognized as memset
2) Reduce the number of instructions in the data image copy
3) Do not use a loop for the residue of the data image copy

Rename init code as do_crt1, to make it clearer what it is doing.
  • Loading branch information
nathan-TT committed Nov 7, 2024
1 parent 74c4dea commit a66aeb8
Show file tree
Hide file tree
Showing 13 changed files with 83 additions and 116 deletions.
3 changes: 1 addition & 2 deletions tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -340,8 +340,7 @@ int main() {
DIRTY_STACK_MEMORY();
WAYPOINT("I");

int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint*)__ldm_data_start, (uint tt_l1_ptr*)MEM_BRISC_INIT_LOCAL_L1_BASE_SCRATCH, num_words);
do_crt1((uint32_t*)MEM_BRISC_INIT_LOCAL_L1_BASE_SCRATCH);

mailboxes->launch_msg_rd_ptr = 0; // Initialize the rdptr to 0
noc_index = 0;
Expand Down
8 changes: 4 additions & 4 deletions tt_metal/hw/firmware/src/brisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@
#include "tools/profiler/kernel_profiler.hpp"
#include <kernel_includes.hpp>

extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];

void kernel_launch(uint32_t kernel_base_addr) {

#if defined(DEBUG_NULL_KERNELS) && !defined(DISPATCH_KERNEL)
Expand All @@ -29,7 +26,10 @@ void kernel_launch(uint32_t kernel_base_addr) {
while (c_tensix_core::read_wall_clock() < end_time);
#endif
#else
firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));
extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];
do_crt1((uint32_t tt_l1_ptr
*)(kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));

if constexpr (NOC_MODE == DM_DEDICATED_NOC) {
noc_local_state_init(NOC_INDEX);
Expand Down
8 changes: 5 additions & 3 deletions tt_metal/hw/firmware/src/erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,17 @@ uint32_t tt_l1_ptr *sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used

void __attribute__((noinline)) Application(void) {
WAYPOINT("I");
rtos_context_switch_ptr = (void (*)())RtosTable[0];

// Not using firmware_kernel_common_init since it is copying to registers
// Not using do_crt1 since it is copying to registers???
// TODO: need to find free space that routing FW is not using
extern uint32_t __ldm_bss_start[];
extern uint32_t __ldm_bss_end[];
wzerorange(__ldm_bss_start, __ldm_bss_end);

rtos_context_switch_ptr = (void (*)())RtosTable[0];

risc_init();
noc_init(MEM_NOC_ATOMIC_RET_VAL_ADDR);
wzerorange(__ldm_bss_start, __ldm_bss_end);

for (uint32_t n = 0; n < NUM_NOCS; n++) {
noc_local_state_init(n);
Expand Down
7 changes: 1 addition & 6 deletions tt_metal/hw/firmware/src/idle_erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,8 @@ int main() {
conditionally_disable_l1_cache();
DIRTY_STACK_MEMORY();
WAYPOINT("I");
int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
uint32_t *local_mem_ptr = (uint32_t *)__ldm_data_start;
uint32_t *l1_data_ptr = (uint32_t *)MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH;
do_crt1((uint32_t *)MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH);
uint32_t heartbeat = 0;
for (int32_t i = 0; i < num_words; i++) {
local_mem_ptr[i] = l1_data_ptr[i];
}

risc_init();

Expand Down
8 changes: 4 additions & 4 deletions tt_metal/hw/firmware/src/idle_erisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@

#include <kernel_includes.hpp>

extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];

void kernel_launch(uint32_t kernel_base_addr) {
DeviceZoneScopedMainChildN("ERISC-KERNEL");

firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));
extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];
do_crt1((uint32_t tt_l1_ptr
*)(kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));

noc_local_state_init(NOC_INDEX);

Expand Down
3 changes: 1 addition & 2 deletions tt_metal/hw/firmware/src/ncrisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ int main(int argc, char *argv[]) {
DIRTY_STACK_MEMORY();
WAYPOINT("I");

int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint *)__ldm_data_start, (uint tt_l1_ptr *)MEM_NCRISC_INIT_LOCAL_L1_BASE_SCRATCH, num_words);
do_crt1((uint32_t tt_l1_ptr *)MEM_NCRISC_INIT_LOCAL_L1_BASE_SCRATCH);

risc_init();

Expand Down
13 changes: 6 additions & 7 deletions tt_metal/hw/firmware/src/ncrisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@ uint32_t noc_nonposted_writes_acked[NUM_NOCS];
uint32_t noc_nonposted_atomics_acked[NUM_NOCS];
uint32_t noc_posted_writes_num_issued[NUM_NOCS];

extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];

void kernel_launch(uint32_t kernel_base_addr) {

DeviceZoneScopedMainChildN("NCRISC-KERNEL");
Expand All @@ -38,11 +35,13 @@ void kernel_launch(uint32_t kernel_base_addr) {
while (c_tensix_core::read_wall_clock() < KERNEL_RUN_TIME);
#endif
#else
extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];
do_crt1((
uint32_t tt_l1_ptr *)(kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));

firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));

if constexpr (NOC_MODE == DM_DEDICATED_NOC) {
noc_local_state_init(NOC_INDEX);
if constexpr (NOC_MODE == DM_DEDICATED_NOC) {
noc_local_state_init(NOC_INDEX);
} else {
noc_local_state_init(NOC_0);
noc_local_state_init(NOC_1);
Expand Down
5 changes: 1 addition & 4 deletions tt_metal/hw/firmware/src/trisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,7 @@ int main(int argc, char *argv[]) {
DIRTY_STACK_MEMORY();
WAYPOINT("I");

uint tt_l1_ptr *local_l1_start_addr =
(uint tt_l1_ptr *)PREPROCESSOR_EXPAND(MEM_TRISC, COMPILE_FOR_TRISC, _INIT_LOCAL_L1_BASE_SCRATCH);
int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint *)__ldm_data_start, local_l1_start_addr, num_words);
do_crt1((uint32_t tt_l1_ptr *)PREPROCESSOR_EXPAND(MEM_TRISC, COMPILE_FOR_TRISC, _INIT_LOCAL_L1_BASE_SCRATCH));

// Initialize GPRs to all 0s
#pragma GCC unroll 0
Expand Down
8 changes: 4 additions & 4 deletions tt_metal/hw/firmware/src/trisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@ volatile tt_reg_ptr uint * mailbox_base[4] = {
};
}

extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];

void kernel_launch(uint32_t kernel_base_addr)
{
DeviceZoneScopedMainChildN("TRISC-KERNEL");
Expand All @@ -44,7 +41,10 @@ void kernel_launch(uint32_t kernel_base_addr)
ckernel::wait(KERNEL_RUN_TIME);
#endif
#else
firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));
extern uint32_t __kernel_init_local_l1_base[];
extern uint32_t __fw_export_end_text[];
do_crt1((
uint32_t tt_l1_ptr *)(kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - (uint32_t)__fw_export_end_text));

#if defined(UCK_CHLKC_UNPACK)
// Make sure DBG_FEATURE_DISABLE register is cleared before every kernel is executed
Expand Down
71 changes: 34 additions & 37 deletions tt_metal/hw/inc/firmware_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,53 +14,50 @@
#include "hostdevcommon/kernel_structs.h"
#include "dev_msgs.h"

extern uint32_t __ldm_bss_start[];
extern uint32_t __ldm_bss_end[];
extern uint32_t __ldm_data_start[];
extern uint32_t __ldm_data_end[];
extern void (* __init_array_start[])();
extern void (* __init_array_end[])();

extern void kernel_init(uint32_t kernel_init);
extern void kernel_launch(uint32_t kernel_base_addr);

// Copy `len` 32-bit words from `l1_addr` into `local_mem_addr`.
// A `len` of zero or less copies nothing.
inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
    int32_t idx = 0;

    // Bulk phase: six words per pass. All six loads are written ahead of the
    // first store so that the 6-cycle L1 load latency is covered.
    for (const int32_t bulk_limit = len - 5; idx < bulk_limit; idx += 6) {
        const uint32_t w0 = l1_addr[idx + 0];
        const uint32_t w1 = l1_addr[idx + 1];
        const uint32_t w2 = l1_addr[idx + 2];
        const uint32_t w3 = l1_addr[idx + 3];
        const uint32_t w4 = l1_addr[idx + 4];
        const uint32_t w5 = l1_addr[idx + 5];
        local_mem_addr[idx + 0] = w0;
        local_mem_addr[idx + 1] = w1;
        local_mem_addr[idx + 2] = w2;
        local_mem_addr[idx + 3] = w3;
        local_mem_addr[idx + 4] = w4;
        local_mem_addr[idx + 5] = w5;
    }

    // Tail phase: at most five words remain, copied one at a time. A wider
    // tail loop (2 or 4 words) is possible but probably not worth it.
    for (; idx < len; ++idx) {
        local_mem_addr[idx] = l1_addr[idx];
    }
}

inline void firmware_kernel_common_init(void *init_local_l1_base) {

// Handle stuff typically done in crt0 in asm. Easier to do in C
// Clear bss, copy initial data image, run global constructors.
// data_image: source of the initialized-data image; the words spanning
// __ldm_data_start..__ldm_data_end are filled from it.
inline void do_crt1(uint32_t tt_l1_ptr *data_image) {
// Clear bss.
extern uint32_t __ldm_bss_start[];
extern uint32_t __ldm_bss_end[];
wzerorange(__ldm_bss_start, __ldm_bss_end);

// Byte distance between the two symbols, shifted right by 2 to give a word count.
int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base), num_words);
// Copy initialized data.
extern uint32_t __ldm_data_start[];
extern uint32_t __ldm_data_end[];
uint32_t *dst = __ldm_data_start;
uint32_t tt_l1_ptr *src = data_image;
// Pointer difference between uint32_t arrays, so len counts 32-bit words, not bytes.
unsigned len = __ldm_data_end - __ldm_data_start;
#pragma GCC unroll 0
while (len >= 3) {
// All three loads are written ahead of the stores.
auto v0 = src[0], v1 = src[1], v2 = src[2];
// Make sure optimizer does not think this is memcpy. Also
// prevent it moving the bookkeeping out from between the
// loads and stores -- they occupy load latency slots.
asm volatile("");
len -= 3, src += 3, dst += 3;
asm volatile("");
// dst has already been advanced by 3, hence the negative indices.
dst[-3] = v0, dst[-2] = v1, dst[-1] = v2;
}
// There are 0, 1 or 2 words of residue. This is smaller than a loop.
// We get smaller code layout by expecting the conditions to be true.
if (__builtin_expect(len >= 1, true)) {
dst[0] = src[0];
if (__builtin_expect(len >= 2, true))
dst[1] = src[1];
}

// Run constructors.
extern void (*__init_array_start[])();
extern void (*__init_array_end[])();
#pragma GCC unroll 0
// Call every function pointer in [__init_array_start, __init_array_end), in ascending address order.
for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
(**fptr)();
}
}

FORCE_INLINE
uint32_t firmware_config_init(tt_l1_ptr mailboxes_t* const mailboxes, uint32_t core_type_index, uint32_t dispatch_class) {

Expand Down
22 changes: 9 additions & 13 deletions tt_metal/hw/toolchain/substitutes.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
// SPDX-FileCopyrightText: © 2023, 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -7,22 +7,18 @@

using namespace std;

// Stub replacement for atexit: the handler `f` is ignored (it is not stored
// or called here) and 0 is always returned.
extern "C" int atexit(void (*f)(void))
{
return 0;
}
extern "C" int atexit(void (*f)(void)) { return 0; }

// Stub replacement for exit: never returns and ignores the exit code `ec`.
extern "C" void exit(int ec)
{
extern "C" void exit(int ec) {
// Spin forever. The empty asm statement with a "memory" clobber gives the
// loop body an effect the compiler has to keep.
while (1) { asm volatile ("" ::: "memory"); }
}

// Zero every 32-bit word in [start, end). The loop stops only when
// start == end, so `end` must be reachable from `start` in whole-word steps.
// This separate declaration carries the aligned(16) attribute for the function.
extern "C" void wzerorange(uint32_t *start, uint32_t *end) __attribute__((aligned(16)));

extern "C" void wzerorange(uint32_t *start, uint32_t *end)
{
for (; start != end; start++)
{
extern "C" void wzerorange(uint32_t *start, uint32_t *end) {
// Ask GCC not to unroll the loop below.
#pragma GCC unroll 0
for (; start != end; start++) {
// Prevent optimizer considering this loop equivalent to
// memset (start, 0, end - start) -- that's code bloat.
asm volatile("");
*start = 0;
}
}
42 changes: 12 additions & 30 deletions tt_metal/hw/toolchain/tmu-crt0.S
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,11 @@ _start:
addi gp,gp,%lo(__global_pointer$)
.option pop

// set stack pointer
lui sp, %hi(__stack_top)
addi sp, sp, %lo(__stack_top)
// set stack pointer, reserve 16 bytes for main's arguments
lui sp, %hi(__stack_top - 16)
addi sp, sp, %lo(__stack_top - 16)

// Clear bss
lui a0, %hi(__ldm_bss_start)
addi a0, a0, %lo(__ldm_bss_start)
lui a1, %hi(__ldm_bss_end)
addi a1, a1, %lo(__ldm_bss_end)
call wzerorange

// Run global initializers
lui s2, %hi(__init_array_start)
addi s2, s2, %lo(__init_array_start)
lui s3, %hi(__init_array_end)
addi s3, s3, %lo(__init_array_end)
beq s2, s3, 2f
1: lw a0, 0(s2)
jalr a0
addi s2, s2, 4
bne s2, s3, 1b
2:
// main is responsible for the rest of crt -- clear bss, copy data image, run global constructors

/* Pass in the tensix coordinates as argv[0][0] through argv[0][3].
argc = 1, envp = NULL. In memory, we'll have
Expand All @@ -44,16 +27,15 @@ _start:
* sp+8: s1
* sp+c: 0
*/
addi sp, sp, -16 /* (stack is aligned to 16 bytes in riscv calling convention) */
addi a0, sp, 8
sw a0, 0(sp)
sw zero, 4(sp)
sw s1, 8(sp)
sw zero, 12(sp)

li a0, 1 # argc = 1
mv a1, sp
mv a2, zero
sw a0, 0(sp) // argv[0]
sw zero, 4(sp) // argv[1]
sw s1, 8(sp) // argv[0][0..3]
sw zero, 12(sp) // argv[0][4..7]

li a0, 1 // argc = 1
mv a1, sp // argv
mv a2, zero // env

call main
tail exit
Expand Down
1 change: 1 addition & 0 deletions tt_metal/hw/toolchain/tmu-crt0k.S
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
.type _start, @function

_start:
// kernel_launch is responsible for the rest of crt -- clear bss, copy data image, run global constructors
// _Z13kernel_launchm is the Itanium-ABI mangled name of kernel_launch(unsigned long).
// Tail call: control transfers to kernel_launch without coming back here.
tail _Z13kernel_launchm
.size _start, .-_start

0 comments on commit a66aeb8

Please sign in to comment.