From 9ab8177b4a786b8b974ab2db9cde7d469a03f336 Mon Sep 17 00:00:00 2001 From: Kei-Ming Kwong Date: Sat, 18 Nov 2023 22:47:00 +0000 Subject: [PATCH 01/16] #3908: [llk] Uplift and move metal specialties to API layer --- .../3T/matmul_large_block_zm/zm_3m_math.cpp | 6 +- .../3T/matmul_large_block_zm/zm_3m_unpack.cpp | 10 +- .../chlkc_math.cpp | 4 +- .../chlkc_unpack.cpp | 6 +- .../test_kernels/compute/eltwise_copy_3m.cpp | 2 +- .../test_kernels/compute/eltwise_sfpi.cpp | 2 - .../test_kernels/compute/untilA_elwbin_3m.cpp | 10 +- .../ckernels/grayskull/common/inc/ckernel.h | 8 +- .../grayskull/common/inc/ckernel_reverseops.h | 1 - .../ckernels/wormhole_b0/common/inc/ckernel.h | 352 +++-- .../wormhole_b0/common/inc/ckernel_defs.h | 79 +- .../wormhole_b0/common/inc/ckernel_globals.h | 59 +- .../wormhole_b0/common/inc/ckernel_gpr_map.h | 8 +- .../wormhole_b0/common/inc/ckernel_noc.h | 317 ----- .../wormhole_b0/common/inc/ckernel_perf_api.h | 158 +++ .../common/inc/ckernel_perf_include.h | 28 + .../common/inc/ckernel_perf_math.h | 279 ++++ .../common/inc/ckernel_perf_unpack_pack.h | 158 +++ .../wormhole_b0/common/inc/ckernel_sfpi.h | 24 - .../wormhole_b0/common/inc/ckernel_sfpu.h | 1261 +++++++++++------ .../wormhole_b0/common/inc/ckernel_structs.h | 81 +- .../wormhole_b0/common/inc/cllk_io_headers.h | 58 - .../common/inc/cllk_math_headers.h | 56 - .../common/inc/cllk_packer_headers.h | 41 - .../common/inc/cllk_unpack_headers.h | 26 - .../wormhole_b0/common/inc/cmath_common.h | 67 +- .../wormhole_b0/common/inc/cpack_common.h | 242 +--- .../wormhole_b0/common/inc/cpriority_queue.h | 137 -- .../wormhole_b0/common/inc/cunpack_common.h | 148 +- .../wormhole_b0/common/src/ckernel.cc | 221 +++ .../wormhole_b0/common/src/ckernel_main.cc | 21 + .../common/src/ckernel_perf_unpack_pack.cc | 301 ++++ .../common/src/ckernel_template.cc | 9 +- .../wormhole_b0/common/src/ckernel_unity.cc | 10 + .../wormhole_b0/common/src/fwlog_list | 2 + .../hw/ckernels/wormhole_b0/llk_lib/llk_3c.h | 0 
.../ckernels/wormhole_b0/llk_lib/llk_defs.h | 25 +- .../wormhole_b0/llk_lib/llk_math_common.h | 80 +- .../llk_lib/llk_math_eltwise_binary.h | 95 +- .../llk_lib/llk_math_eltwise_binary_sfpu.h | 119 ++ .../llk_lib/llk_math_eltwise_unary_datacopy.h | 65 +- .../llk_lib/llk_math_eltwise_unary_sfpi.h | 25 - .../llk_lib/llk_math_eltwise_unary_sfpu.h | 343 +---- .../wormhole_b0/llk_lib/llk_math_matmul.h | 160 ++- .../wormhole_b0/llk_lib/llk_math_reduce.h | 36 +- .../ckernels/wormhole_b0/llk_lib/llk_pack.h | 231 ++- .../wormhole_b0/llk_lib/llk_pack_common.h | 183 +-- .../wormhole_b0/llk_lib/llk_unpack_A.h | 93 +- .../wormhole_b0/llk_lib/llk_unpack_AB.h | 69 +- .../llk_lib/llk_unpack_AB_matmul.h | 107 +- .../wormhole_b0/llk_lib/llk_unpack_common.h | 100 +- .../wormhole_b0/llk_lib/llk_unpack_reduce.h | 79 +- .../wormhole_b0/llk_lib/llk_unpack_tilize.h | 80 +- .../wormhole_b0/llk_lib/llk_unpack_untilize.h | 97 +- .../wormhole_b0/llk_ops/tilize/chlkc_math.cpp | 33 - .../llk_ops/tilize/chlkc_math_fidelity.h | 5 - .../wormhole_b0/llk_ops/tilize/chlkc_pack.cpp | 37 - .../llk_ops/tilize/chlkc_pack_data_format.h | 10 - .../llk_ops/tilize/chlkc_unpack.cpp | 36 - .../llk_ops/tilize/chlkc_unpack_data_format.h | 10 - .../llk_ops/tilize/hlk_args_struct_init.h | 11 - .../wormhole_b0/llk_ops/tilize/loop_count.h | 5 - .../llk_ops/untilize/chlkc_math.cpp | 33 - .../llk_ops/untilize/chlkc_math_fidelity.h | 5 - .../llk_ops/untilize/chlkc_pack.cpp | 37 - .../llk_ops/untilize/chlkc_pack_data_format.h | 10 - .../llk_ops/untilize/chlkc_unpack.cpp | 37 - .../untilize/chlkc_unpack_data_format.h | 10 - .../llk_ops/untilize/hlk_args_struct_init.h | 12 - .../wormhole_b0/llk_ops/untilize/loop_count.h | 5 - .../{common/inc => metal/common}/chlkc_list.h | 1 + .../metal/common/metal_ckernel_globals.h | 62 + .../metal/common/metal_compile_time_args.h | 9 + .../metal/common/metal_mod_div_lib.h | 92 ++ .../wormhole_b0/metal/common/tt_log.h | 16 + .../metal/llk_api/llk_math_binary_api.h | 86 ++ 
.../metal/llk_api/llk_math_binary_sfpu_api.h | 70 + .../metal/llk_api/llk_math_common_api.h | 108 ++ .../metal/llk_api/llk_math_matmul_api.h | 69 + .../metal/llk_api/llk_math_reduce_api.h | 28 + .../llk_api/llk_math_unary_datacopy_api.h | 36 + .../metal/llk_api/llk_math_unary_sfpu_api.h | 345 +++++ .../metal/llk_api/llk_op_info_api.h | 23 + .../wormhole_b0/metal/llk_api/llk_pack_api.h | 270 ++++ .../llk_api}/llk_param_structs.h | 0 .../llk_api/llk_sfpu}/ckernel_reverseops.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_cdf.h | 0 .../llk_sfpu}/ckernel_sfpu_converter.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_elu.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_erf_erfc.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_erfinv.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_exp.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_gelu.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_i0.h | 0 .../llk_sfpu}/ckernel_sfpu_isinf_isnan.h | 0 .../llk_sfpu}/ckernel_sfpu_logical_not_noti.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_recip.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_relu.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_sqrt.h | 0 .../llk_sfpu}/ckernel_sfpu_trigonometry.h | 0 .../llk_math_eltwise_unary_sfpu_0_param.h | 0 .../llk_math_eltwise_unary_sfpu_1_param.h | 0 ..._math_eltwise_unary_sfpu_common_includes.h | 2 + .../llk_math_eltwise_unary_sfpu_elu.h | 0 .../llk_math_eltwise_unary_sfpu_erf_erfc.h | 0 .../llk_math_eltwise_unary_sfpu_erfinv.h | 0 .../llk_math_eltwise_unary_sfpu_exp.h | 0 .../llk_math_eltwise_unary_sfpu_gelu.h | 0 .../llk_math_eltwise_unary_sfpu_i0.h | 0 .../llk_math_eltwise_unary_sfpu_init.h | 14 + .../llk_math_eltwise_unary_sfpu_isinf_isnan.h | 0 ...math_eltwise_unary_sfpu_logical_not_noti.h | 0 .../llk_math_eltwise_unary_sfpu_recip.h | 0 .../llk_math_eltwise_unary_sfpu_relu.h | 0 .../llk_math_eltwise_unary_sfpu_reverseops.h | 0 .../llk_math_eltwise_unary_sfpu_sqrt.h | 0 ...llk_math_eltwise_unary_sfpu_trigonometry.h | 0 .../llk_api/llk_sfpu/metal_ckernel_sfpu.h | 778 ++++++++++ 
.../metal/llk_api/llk_unpack_AB_api.h | 85 ++ .../metal/llk_api/llk_unpack_AB_matmul_api.h | 136 ++ .../metal/llk_api/llk_unpack_A_api.h | 89 ++ .../metal/llk_api/llk_unpack_common_api.h | 141 ++ .../metal/llk_api/llk_unpack_reduce_api.h | 94 ++ .../metal/llk_api/llk_unpack_tilize_api.h | 93 ++ .../metal/llk_api/llk_unpack_untilize_api.h | 96 ++ .../wormhole_b0/metal/llk_io/llk_io.cc | 3 + .../wormhole_b0/metal/llk_io/llk_io.h | 6 + .../{llk_lib => metal/llk_io}/llk_io_pack.h | 9 +- .../{llk_lib => metal/llk_io}/llk_io_unpack.h | 12 +- .../wormhole_b0/metal/llk_io/llk_operands.h | 46 + .../wormhole_b0/metal/llk_io/llk_outputs.h | 55 + tt_metal/hw/firmware/src/brisc.cc | 2 +- tt_metal/hw/firmware/src/brisck.cc | 2 +- tt_metal/hw/firmware/src/ncrisc.cc | 2 +- tt_metal/hw/firmware/src/ncrisck.cc | 2 +- tt_metal/hw/firmware/src/trisc.cc | 11 +- tt_metal/hw/firmware/src/trisck.cc | 8 +- tt_metal/hw/inc/debug/fw_debug.h | 3 + tt_metal/include/compute_kernel_api.h | 27 +- tt_metal/include/compute_kernel_api/bcast.h | 8 +- tt_metal/include/compute_kernel_api/cb_api.h | 8 + .../compute_kernel_api/common_globals.h | 8 +- .../compute_kernel_api/eltwise_binary.h | 4 +- .../eltwise_unary/eltwise_unary.h | 4 +- tt_metal/include/compute_kernel_api/matmul.h | 13 +- tt_metal/include/compute_kernel_api/reduce.h | 6 +- .../compute_kernel_api/tile_move_copy.h | 4 +- tt_metal/include/compute_kernel_api/tilize.h | 4 +- .../include/compute_kernel_api/transpose_wh.h | 4 +- .../include/compute_kernel_api/untilize.h | 4 +- 150 files changed, 6226 insertions(+), 3247 deletions(-) delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_noc.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h 
delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_io_headers.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_math_headers.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_packer_headers.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_unpack_headers.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/cpriority_queue.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_3c.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math.cpp delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math_fidelity.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack.cpp delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack_data_format.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack.cpp delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack_data_format.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/hlk_args_struct_init.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/loop_count.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math.cpp delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math_fidelity.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack.cpp delete mode 100644 
tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack_data_format.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack.cpp delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack_data_format.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/hlk_args_struct_init.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/loop_count.h rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/common}/chlkc_list.h (96%) create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/common/tt_log.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_common_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_reduce_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_datacopy_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_op_info_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api}/llk_param_structs.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_reverseops.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_cdf.h (100%) rename 
tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_converter.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_elu.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_erf_erfc.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_erfinv.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_exp.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_gelu.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_i0.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_isinf_isnan.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_logical_not_noti.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_recip.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_relu.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_sqrt.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_trigonometry.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_0_param.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_1_param.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_common_includes.h (83%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_elu.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => 
metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_erf_erfc.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_erfinv.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_exp.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_gelu.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_i0.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_init.h (66%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_isinf_isnan.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_logical_not_noti.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_recip.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_relu.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_reverseops.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_sqrt.h (100%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_trigonometry.h (100%) create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h create mode 100644 
tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.cc create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_io}/llk_io_pack.h (98%) rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_io}/llk_io_unpack.h (97%) create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp index c68f206eb31..7717eb5676c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp @@ -4,9 +4,9 @@ #include #include "llk_math_common.h" -#include "llk_math_eltwise_unary_datacopy.h" -#include "llk_math_eltwise_unary_datacopy.h" -#include "llk_math_matmul.h" +#include "llk_math_unary_datacopy_api.h" +#include "llk_math_unary_datacopy_api.h" +#include "llk_math_matmul_api.h" namespace NAMESPACE { diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp index 7f6b7684c68..751693a6217 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp @@ -3,11 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "llk_unpack_common.h" -#include "llk_unpack_tilize.h" 
-#include "llk_unpack_untilize.h" -#include "llk_unpack_A.h" -#include "llk_unpack_AB_matmul.h" +#include "llk_unpack_common_api.h" +#include "llk_unpack_tilize_api.h" +#include "llk_unpack_untilize_api.h" +#include "llk_unpack_A_api.h" +#include "llk_unpack_AB_matmul_api.h" namespace NAMESPACE { diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp index bdc0507c5ce..0665298d117 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp @@ -4,8 +4,8 @@ #include #include "llk_math_common.h" -#include "llk_math_eltwise_binary.h" -#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_binary_api.h" +#include "llk_math_unary_datacopy_api.h" namespace NAMESPACE { diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp index 7f1e967ac54..d9d15e7a1fa 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp @@ -3,9 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "llk_unpack_common.h" -#include "llk_unpack_AB.h" -#include "llk_unpack_untilize.h" +#include "llk_unpack_common_api.h" +#include "llk_unpack_AB_api.h" +#include "llk_unpack_untilize_api.h" namespace NAMESPACE { diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp index a47d5e02e24..10ede233bbd 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp +++ 
b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp @@ -14,7 +14,7 @@ namespace NAMESPACE { #ifdef TRISC_MATH #include "llk_math_common.h" -#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_unary_datacopy_api.h" void math_main() { diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp index 51b21cff002..2dced1a4c0d 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp @@ -4,8 +4,6 @@ #include -#include "llk_3c.h" - namespace NAMESPACE { void MAIN { // expands to hlk_relu_config(nullptr, 1); for relu only diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp index 56be069bed3..f4feda0dbae 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp @@ -16,8 +16,8 @@ namespace NAMESPACE { #ifdef TRISC_MATH #include #include "llk_math_common.h" -#include "llk_math_eltwise_binary.h" -#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_binary_api.h" +#include "llk_math_unary_datacopy_api.h" void math_main() { @@ -49,9 +49,9 @@ void math_main() #ifdef TRISC_UNPACK #include -#include "llk_unpack_common.h" -#include "llk_unpack_AB.h" -#include "llk_unpack_untilize.h" +#include "llk_unpack_common_api.h" +#include "llk_unpack_AB_api.h" +#include "llk_unpack_untilize_api.h" void unpack_main() { diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h index 5c5489622ec..b2de68e862a 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h @@ -49,11 +49,11 @@ constexpr uint PACK_FLUSH_COUNTERS = // counters flush (1 << 
PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_SHAMT) | (1 << PACK_COUNTERS_SEC2_pack_xys_per_tile_SHAMT); -extern volatile uint * const reg_base; -extern volatile uint * const pc_buf_base; -extern volatile uint * const regfile; +extern volatile uint * reg_base; +extern volatile uint * pc_buf_base; +extern volatile uint * regfile; extern uint *regmem; -extern volatile uint * const instrn_buffer; +extern volatile uint * instrn_buffer; extern volatile uint *dbg_event_scratch; extern volatile uint local_mem_barrier; diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h index 7d1974639fe..e17a51820e1 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h @@ -6,7 +6,6 @@ #include "ckernel_defs.h" #include "ckernel.h" -#include "noc_nonblocking_api.h" #include "sfpi.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h index a0548a80deb..2f72476ade2 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 - #pragma once #include "risc_attribs.h" @@ -27,49 +26,95 @@ #define GPR_DEBUG_REGFILE 0 #endif +#ifdef PERF_DUMP +#define DECOUPLINGS_EN (SKIP_UNP || MATH_PACK_DECOUPLE) +#else +#define SKIP_UNP 0 +#define MATH_PACK_DECOUPLE 0 +#define DECOUPLINGS_EN 0 +#define OVERLAY_DECOUPLE 0 +#endif + + +#ifndef INSERT_UNPACK_DELAY +#define INSERT_UNPACK_DELAY 0 +#endif + +#ifndef INSERT_MATH_DELAY +#define INSERT_MATH_DELAY 0 +#endif + +#ifndef INSERT_PACK_DELAY +#define INSERT_PACK_DELAY 0 +#endif + +#define DELAY_EN (INSERT_UNPACK_DELAY || INSERT_PACK_DELAY || INSERT_MATH_DELAY) + +#define TT_ALWAYS_INLINE inline __attribute__ ((always_inline)) + #include #include "ckernel_include.h" -#include "debug/fw_debug.h" 
#include "tensix.h" +#include "debug/fw_debug.h" #include "eth_l1_address_map.h" -#include "noc_overlay_parameters.h" -#include "stream_io_map.h" #include "hostdevcommon/common_runtime_address_map.h" -#include "limits.h" // #include -//#include "perf_lib/scratch_api.h" // not used unless perf dump enabled? - namespace ckernel { -#define get_compile_time_arg_val(arg_idx) KERNEL_COMPILE_TIME_ARG_ ## arg_idx - constexpr uint PACK_FLUSH_COUNTERS = // counters flush (1 << PACK_COUNTERS_SEC2_pack_per_xy_plane_SHAMT) | (1 << PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_SHAMT) | (1 << PACK_COUNTERS_SEC2_pack_xys_per_tile_SHAMT); -extern volatile uint tt_reg_ptr * const reg_base; -extern volatile uint tt_reg_ptr * const pc_buf_base; -extern volatile uint tt_reg_ptr * const regfile; -extern uint tt_reg_ptr * regmem; -extern volatile uint tt_reg_ptr * const instrn_buffer; +constexpr uint RESET_VAL = 0; +constexpr uint KERNEL_IN_PROGRESS = 15; +constexpr uint KERNEL_COMPLETE = 1; + +extern volatile uint tt_reg_ptr * reg_base; +extern volatile uint tt_reg_ptr * pc_buf_base; +extern volatile uint tt_reg_ptr * regfile; +extern volatile uint tt_reg_ptr * instrn_buffer; +extern volatile uint tt_reg_ptr *mailbox_base[4]; extern volatile uint tt_reg_ptr *dbg_event_scratch; +extern volatile uint tt_reg_ptr *trisc_l1_mailbox; +extern volatile uint8_t tt_l1_ptr *debug_buffer; extern uint32_t cfg_state_id; extern uint32_t dest_offset_id; extern uint32_t dbg_event_index; extern uint32_t dbg_event_end; -extern uint32_t op_info_offset; +extern volatile uint16_t tt_reg_ptr *debug_mailbox_base; +extern uint8_t mailbox_index; +const extern uint8_t mailbox_end; // Internal scope to namespace methods only (C++ does not allow namespace private ownership) namespace internal { } -void tensix_sync(); -void mop_sync(); +inline void tensix_sync() +{ + volatile uint foo = 0; + volatile uint *fooptr = &foo; + // Write to pc buffer to push all writes ahead of us.. 
otherwise, the pc buffer read can bypass older writes + pc_buf_base[1] = foo; + + // Now read -- this read will block until we're idle + *fooptr = pc_buf_base[1]; +} + +inline void mop_sync() +{ + volatile uint foo = 0; + volatile uint *fooptr = &foo; + // Write to pc buffer to push all writes ahead of us.. otherwise, the pc buffer read can bypass older writes + pc_buf_base[2] = foo; + + // Now read -- this read will block until mops are done + *fooptr = pc_buf_base[2]; +} inline void sync_regfile_write(const uint index); @@ -84,6 +129,7 @@ static constexpr bool is_valid(const T val, const uint8_t wid) inline void mmio_register_write(register_space_e space, uint addr, uint data) { const uint regaddr = (space << 6) | (addr & 0x3F); + //FWLOG2("Regaddr: 0x%x, data: 0x%x", regaddr, data); reg_base[regaddr] = data; } @@ -122,6 +168,17 @@ inline void t6_semaphore_get(const uint8_t index) TTI_SEMGET(semaphore::t6_sem(index)); } +template +inline void t6_semaphore_wait_on_max(const uint8_t index) +{ + TTI_SEMWAIT(WaitRes, semaphore::t6_sem(index), p_stall::STALL_ON_MAX); +} +template +inline void t6_semaphore_wait_on_zero(const uint8_t index) +{ + TTI_SEMWAIT(WaitRes, semaphore::t6_sem(index), p_stall::STALL_ON_ZERO); +} + // Tensix thread semaphore get optionally stalled inline void t6_semaphore_init(const uint8_t index, const uint8_t min_value, const uint8_t max_value) { @@ -154,7 +211,7 @@ inline void cfg_write(uint cfg_addr32, uint data) inline uint cfg_read(uint cfg_addr32) { // Declared here instead of globally to prevent direct access, which might ignore current state ID - volatile uint32_t tt_reg_ptr *cfg_regs = reinterpret_cast(TENSIX_CFG_BASE); + volatile uint *cfg_regs = reinterpret_cast(TENSIX_CFG_BASE); return cfg_regs[cfg_addr(cfg_addr32)]; } @@ -199,7 +256,11 @@ inline void mop_run(const uint8_t type, const uint8_t count) TTI_MOP(type, count - 1, 0); // Run the MOP } -inline __attribute__((always_inline)) uint32_t reg_read(uint32_t addr) +// Register read 
(workaround for bug +// https://yyz-gitlab.local.tenstorrent.com/tenstorrent/tensix/issues/976 +// now handled by the compiler) +// workaround is needed only for GS +inline uint reg_read(uint32_t addr) { volatile uint tt_reg_ptr *p_reg = reinterpret_cast (addr); return p_reg[0]; @@ -310,146 +371,169 @@ inline void cfg_reg_rmw_tensix(uint32_t val) } } -template -inline std::uint32_t memory_cast(T *object_ptr) +inline void mailbox_write(const uint8_t thread, const uint32_t data) { - return reinterpret_cast(object_ptr); + mailbox_base[thread + 1][0] = data; } -inline uint64_t read_wall_clock() +// Blocking read +inline uint32_t mailbox_read(const uint8_t thread) { - uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); - uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); - return ((uint64_t)timestamp_high << 32) | timestamp_low; + return mailbox_base[thread + 1][0]; } -void debug_dump(const uint8_t *data, uint32_t byte_size); -void debug_dump_seek(uint8_t offset); +inline bool mailbox_not_empty(const uint8_t thread) +{ + return mailbox_base[thread + 1][1] > 0; +} +inline void mailbox_write_full(const uint8_t thread, const uint32_t data) +{ + mailbox_base[thread][0] = data; +} -inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b) +// Blocking read +inline uint32_t mailbox_read_full(const uint8_t thread) { - unsigned int r = 0; - while (a) - { - if (a & 1) - r += b; - a >>= 1; - b <<= 1; - } - return r; -} - -inline __attribute__((always_inline)) uint32_t fast_udiv_12(uint32_t n) -{ - // Uses embedding style magic number - // * fixed point 1/12 then shifting. - // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm - return (((uint64_t) n * 0xAAAAAAAB) >> 32) >> 3; -} - -inline __attribute__((always_inline)) uint32_t fast_udiv_94(uint32_t n) -{ - // Uses embedding style magic number - // * fixed point 1/12 then shifting. 
- // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm - return (((uint64_t) n * 0xAE4C415D) >> 32) >> 6; -} - -template -inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n) -{ - if constexpr (d == 12) { - // fast divide for 12 divisor - return fast_udiv_12(n); - } else if constexpr (d == 94) { - // fast divide for 94 divisor. Handles Banked L1 address generation for E75 - return fast_udiv_94(n); - } else { - // generic divide from llvm - const unsigned n_uword_bits = sizeof(uint32_t) * CHAR_BIT; - unsigned int q; - unsigned int r; - unsigned sr; - /* special cases */ - if (d == 0) - return 0; /* ?! */ - if (n == 0) - return 0; - sr = __builtin_clz(d) - __builtin_clz(n); - /* 0 <= sr <= n_uword_bits - 1 or sr large */ - if (sr > n_uword_bits - 1) /* d > r */ - return 0; - if (sr == n_uword_bits - 1) /* d == 1 */ - return n; - ++sr; - /* 1 <= sr <= n_uword_bits - 1 */ - /* Not a special case */ - q = n << (n_uword_bits - sr); - r = n >> sr; - unsigned int carry = 0; - for (; sr > 0; --sr) - { - /* r:q = ((r:q) << 1) | carry */ - r = (r << 1) | (q >> (n_uword_bits - 1)); - q = (q << 1) | carry; - /* carry = 0; - * if (r.all >= d.all) - * { - * r.all -= d.all; - * carry = 1; - * } - */ - const int s = (unsigned int)(d - r - 1) >> (n_uword_bits - 1); - carry = s & 1; - r -= d & s; - } - q = (q << 1) | carry; - return q; - } + return mailbox_base[thread][0]; } -template -inline __attribute__((always_inline)) uint32_t umodsi3_const_divisor(uint32_t a) + +inline bool mailbox_not_empty_full(const uint8_t thread) { - return a - udivsi3_const_divisor(a) * d; + return mailbox_base[thread][1] > 0; } -inline void tensix_sync() +inline void trisc_l1_mailbox_write(const uint data) { - volatile uint foo = 0x0; - volatile uint *fooptr = &foo; - // Write to pc buffer to push all writes ahead of us.. 
otherwise, the pc buffer read can bypass older writes - pc_buf_base[1] = foo; + trisc_l1_mailbox[0] = data; +} - // Now read -- this read will block until we're idle - *fooptr = pc_buf_base[1]; +inline uint trisc_l1_mailbox_read() +{ + return trisc_l1_mailbox[0]; } -inline void mop_sync() +template +inline std::uint32_t memory_cast(T *object_ptr) { - volatile uint foo = 0x0; - volatile uint *fooptr = &foo; - // Write to pc buffer to push all writes ahead of us.. otherwise, the pc buffer read can bypass older writes - pc_buf_base[2] = foo; + return reinterpret_cast(object_ptr); +} - // Now read -- this read will block until mops are done - *fooptr = pc_buf_base[2]; +inline void record_mailbox_value(uint16_t event_value) { + if (mailbox_index < mailbox_end) { + debug_mailbox_base[mailbox_index] = event_value; + mailbox_index++; + } } -inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) { +inline void record_mailbox_value_with_index(uint8_t index, uint16_t event_value) { + if (index < mailbox_end) { + debug_mailbox_base[index] = event_value; + } +} - uint32_t* op_info_ptr = reinterpret_cast(OP_INFO_BASE_ADDR + op_info_offset); - static constexpr uint32_t op_info_num_items = 7; +// Initialize debug scratch mailbox values and range +inline void clear_mailbox_values(uint16_t value = 0) { + for (int i = 0; i < mailbox_end; i++) + debug_mailbox_base[i] = value; +} - volatile uint32_t* op_info_struct_ptr = reinterpret_cast(&op_info_struct); - for (uint32_t i = 0; i < op_info_num_items; i++) { - op_info_struct_ptr[i] = op_info_ptr[i]; - } - op_info_offset += 28; +inline uint64_t read_wall_clock() +{ + uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + return ((uint64_t)timestamp_high << 32) | timestamp_low; +} + +inline void record_kernel_runtime(uint64_t kernel_runtime) { + debug_mailbox_base[mailbox_end - 4] = kernel_runtime & 0xffff; + debug_mailbox_base[mailbox_end - 3] = 
(kernel_runtime >> 16) & 0xffff; + debug_mailbox_base[mailbox_end - 2] = (kernel_runtime >> 32) & 0xffff; + debug_mailbox_base[mailbox_end - 1] = (kernel_runtime >> 48) & 0xffff; +} + +void debug_dump(const uint8_t *data, uint32_t byte_size); +void debug_dump_seek(uint8_t offset); - if (op_info_offset == OP_INFO_SIZE) { - op_info_offset = 0; // In case we go out of bounds +inline void stall_kernel(uint32_t num_cycles) { +#if DELAY_EN > 0 + TT_LLK_DUMP("stall_kernel({})", num_cycles); + uint32_t start_clk_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + uint32_t elapsed_time = 0; + while (elapsed_time <= num_cycles) { + uint32_t current_clk_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + if (current_clk_l >= start_clk_l) { + elapsed_time = current_clk_l - start_clk_l; + } else { + elapsed_time = 0xffffffff - (start_clk_l - current_clk_l); + } } +#endif } +#if defined(PERF_DUMP) || DELAY_EN > 0 +extern bool record_perf_events; +#endif + +// This api is inserted in the beginning of each input loop +// Wait for all instructions of previous loop to finish before starting the next loop +// If PERF_DUMP is enabled, always wait but only for the inputs that perf dump is enabled for +// If PERF_DUMP is enabled, and delay is not, no need to insert these apis for unpack and math +template +inline void serialize_input_loop_start() { + #if defined(PERF_DUMP) || DELAY_EN > 0 + TT_LLK_DUMP("serialize_input_loop_start<{}>()", thread_id); + if constexpr (thread_id == 0) { + #if DELAY_EN > 0 + t6_semaphore_post(semaphore::UNPACK_MATH_DONE); + while (semaphore_read(semaphore::UNPACK_MATH_DONE) == 0) {} + #endif + + } else if (thread_id == 1) { + #if DELAY_EN > 0 + t6_semaphore_post(semaphore::UNPACK_MATH_DONE); + while (semaphore_read(semaphore::UNPACK_MATH_DONE) == 0) {} + #endif + + } else if (thread_id == 2) { + #if DELAY_EN == 0 + if (record_perf_events) { + #endif + t6_semaphore_post(semaphore::PACK_DONE); + while (semaphore_read(semaphore::PACK_DONE) == 0) {} + #if DELAY_EN == 0 + } 
+ #endif + } + #endif +} + +template +inline void serialize_input_loop_end() { + #if defined(PERF_DUMP) || DELAY_EN > 0 + TT_LLK_DUMP("serialize_input_loop_end<{}>()", thread_id); + if constexpr (thread_id == 0) { + #if DELAY_EN > 0 + t6_semaphore_get(semaphore::UNPACK_MATH_DONE); + while (semaphore_read(semaphore::UNPACK_MATH_DONE) > 0) {} + #endif + + } else if (thread_id == 1) { + #if DELAY_EN > 0 + t6_semaphore_get(semaphore::UNPACK_MATH_DONE); + while (semaphore_read(semaphore::UNPACK_MATH_DONE) > 0) {} + #endif + + } else if (thread_id == 2) { + #if DELAY_EN == 0 + if (record_perf_events) { + #endif + t6_semaphore_get(semaphore::PACK_DONE); + while (semaphore_read(semaphore::PACK_DONE) > 0) {} + #if DELAY_EN == 0 + } + #endif + } + #endif + } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h index 46b5b775903..ffd8ad6dae9 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h @@ -2,10 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 - #pragma once -#include "llk_defs.h" #include "ckernel_ops.h" #include "tensix_types.h" @@ -68,12 +66,30 @@ enum PackSelMask PACK_23=0xC }; +/* +Stochastic rounding modes: + None: No stochastic rounding enabled, default rounding is round to nearest even. + Fpu: Enables stochastic rounding for every accumulation in the fpu + Pack: Enables stochastic rounding in both gasket and packer. Gasket rounding is in + data format conversion stage from dest format to pack_src_format. Packer rounding + is in data format conversion stage from pack_src_format to pack_dst_format. + All: Enables fpu, pack and gasket rounding. 
+*/ +enum class StochRndMode : std::uint8_t +{ + None = 0, + Fpu = 1, + Pack = 2, + All = 0xf, + Invalid = 0xff, +}; + constexpr std::uint32_t FACE_HEIGHT = 16; constexpr std::uint32_t FACE_WIDTH = 16; constexpr std::uint32_t TILE_HEIGHT = 32; constexpr std::uint32_t TILE_WIDTH = 32; constexpr std::uint32_t DATUMS_PER_ROW = 16; -constexpr std::uint32_t TILE_HEADER_SIZE = 0; +constexpr std::uint32_t TILE_HEADER_SIZE = 1; constexpr std::uint32_t FACE_R_DIM = FACE_HEIGHT; constexpr std::uint32_t FACE_C_DIM = FACE_WIDTH ; @@ -92,29 +108,27 @@ static_assert((DEST_NUM_TILES_FP16 & (DEST_NUM_TILES_FP16 - 1)) == 0); #define HI_16(REG) (2 * (REG) + 1) -// all sizes are headerless -// in 16B words, in L1/DRAM, headerless +/* constexpr static std::int32_t MUL_TILE_SIZE_AND_INDEX(uint format, uint index) { - switch (format&0x1F) { - case ((uint8_t)DataFormat::Float32): return ((index<<8)); + switch (format&0xF) { + case ((uint8_t)DataFormat::Float32): return ((index<<8)+(index<<1)); case ((uint8_t)DataFormat::Float16): - case ((uint8_t)DataFormat::Float16_b): return ((index<<7)); + case ((uint8_t)DataFormat::Float16_b): return ((index<<7)+(index<<1)); case ((uint8_t)DataFormat::Bfp8): - case ((uint8_t)DataFormat::Bfp8_b): return ((index<<6)+(index<<2)); + case ((uint8_t)DataFormat::Bfp8_b): return ((index<<6)+(index<<2)+(index<<1)); case ((uint8_t)DataFormat::Bfp4): - case ((uint8_t)DataFormat::Bfp4_b): return ((index<<5)+(index<<2)); + case ((uint8_t)DataFormat::Bfp4_b): return ((index<<5)+(index<<2)+(index<<1)); case ((uint8_t)DataFormat::Bfp2): - case ((uint8_t)DataFormat::Bfp2_b): return ((index<<4)+(index<<2)); + case ((uint8_t)DataFormat::Bfp2_b): return ((index<<4)+(index<<2)+(index<<1)); case ((uint8_t)DataFormat::Int8): - case ((uint8_t)DataFormat::Lf8): return ((index<<6)); + case ((uint8_t)DataFormat::Lf8): return ((index<<6)+(index<<1)); //Keep default as Bfp8? 
- default: return ((index<<6)+(index<<2)); + default: return ((index<<6)+(index<<2)+(index<<1)); }; } -// in Bytes, in DST REG, headerless constexpr static std::int32_t MUL_DEST_TILE_SIZE_AND_INDEX(uint format, uint index) { - switch (format&0x1F) { + switch (format&0xF) { case ((uint8_t)DataFormat::Float32): return (index<<12); case ((uint8_t)DataFormat::Float16): case ((uint8_t)DataFormat::Float16_b): return (index<<11); @@ -130,27 +144,25 @@ constexpr static std::int32_t MUL_DEST_TILE_SIZE_AND_INDEX(uint format, uint ind }; } -// 16B words, L1/dram headerless! constexpr static std::int32_t GET_L1_TILE_SIZE(uint format) { - switch (format&0x1F) { - case ((uint8_t)DataFormat::Float32): return ((4096>>4)); + switch (format&0xF) { + case ((uint8_t)DataFormat::Float32): return ((4096>>4)+(32>>4)); case ((uint8_t)DataFormat::Float16): - case ((uint8_t)DataFormat::Float16_b): return ((2048>>4)); + case ((uint8_t)DataFormat::Float16_b): return ((2048>>4)+(32>>4)); case ((uint8_t)DataFormat::Bfp8): - case ((uint8_t)DataFormat::Bfp8_b): return ((1024>>4)+(64>>4)); + case ((uint8_t)DataFormat::Bfp8_b): return ((1024>>4)+(64>>4)+(32>>4)); case ((uint8_t)DataFormat::Bfp4): - case ((uint8_t)DataFormat::Bfp4_b): return ((512>>4)+(64>>4)); + case ((uint8_t)DataFormat::Bfp4_b): return ((512>>4)+(64>>4)+(32>>4)); case ((uint8_t)DataFormat::Bfp2): - case ((uint8_t)DataFormat::Bfp2_b): return ((256>>4)+(64>>4)); + case ((uint8_t)DataFormat::Bfp2_b): return ((256>>4)+(64>>4)+(32>>4)); case ((uint8_t)DataFormat::Int8): - case ((uint8_t)DataFormat::Lf8): return ((1024>>4)); - default: return ((1024>>4)+(64>>4)); + case ((uint8_t)DataFormat::Lf8): return ((1024>>4)+(32>>4)); + default: return ((1024>>4)+(64>>4)+(32>>4)); }; } - constexpr static std::int32_t GET_DEST_TILE_BYTE_SIZE(uint format) { - switch (format&0x1F) { + switch (format&0xF) { case ((uint8_t)DataFormat::Float32): return 4096; case ((uint8_t)DataFormat::Float16): case ((uint8_t)DataFormat::Float16_b): return 2048; @@ 
-165,9 +177,11 @@ constexpr static std::int32_t GET_DEST_TILE_BYTE_SIZE(uint format) { default: return 1024; }; } +*/ constexpr static std::uint32_t GET_L1_HEADERLESS_TILE_SIZE(uint format) { switch (format&0xF) { + case ((uint8_t)DataFormat::Int32): case ((uint8_t)DataFormat::Float32): return (4096>>4); case ((uint8_t)DataFormat::Float16): case ((uint8_t)DataFormat::Float16_b): return (2048>>4); @@ -204,8 +218,20 @@ constexpr static bool IS_BFP_A_FORMAT(uint format) { }; } +constexpr static bool IS_A_FORMAT(uint format) { + switch (format&0xF) { + case ((uint8_t)DataFormat::Lf8): + case ((uint8_t)DataFormat::Float16): + case ((uint8_t)DataFormat::Bfp8): + case ((uint8_t)DataFormat::Bfp4): + case ((uint8_t)DataFormat::Bfp2): return true; + default: return false; + }; +} + constexpr static std::uint32_t SCALE_DATUM_SIZE(uint format, uint datum_count) { switch (format&0xF) { + case ((uint8_t)DataFormat::Int32): case ((uint8_t)DataFormat::Float32): return (datum_count<<2); case ((uint8_t)DataFormat::Float16): case ((uint8_t)DataFormat::Float16_b): return (datum_count<<1); @@ -217,4 +243,5 @@ constexpr static std::uint32_t SCALE_DATUM_SIZE(uint format, uint datum_count) { #define UPPER_HALFWORD(x) ((x) >> 16) constexpr int WHB0_ITERATIONS = 8; + } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h index d4ad75f5e7f..90ac67944f5 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h @@ -7,70 +7,15 @@ #include #include "ckernel_structs.h" #include "risc_attribs.h" -#include "tensix_functions.h" -#include "hostdevcommon/common_runtime_address_map.h" extern uint32_t cfg_state_id; extern uint32_t unp_cfg_context; extern uint32_t volatile tt_l1_ptr l1_buffer[16]; -//extern const int32_t unpack_src_format[24]; -//extern const int32_t unpack_dst_format[24]; -//extern const int32_t 
pack_src_format[16]; -//extern const int32_t pack_dst_format[16]; - extern uint32_t pack_sync_tile_dst_ptr; extern uint32_t math_sync_tile_dst_index; -extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS]; - -extern uint32_t __ldm_bss_start[]; -extern uint32_t __ldm_bss_end[]; -extern uint32_t __ldm_data_start[]; -extern uint32_t __ldm_data_end[]; -extern void (* __init_array_start[])(); -extern void (* __init_array_end[])(); +extern uint32_t __local_mem_rodata_start_addr[]; +extern uint32_t __local_mem_rodata_end_addr[]; extern uint32_t __firmware_start[]; - -extern void kernel_init(); -extern void kernel_launch(); - -inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) { - // Cover L1 load latency of 6 cycles for the bulk of the copy - int32_t n = 0; - while (n < len - 5) { - uint32_t v0 = l1_addr[n + 0]; - uint32_t v1 = l1_addr[n + 1]; - uint32_t v2 = l1_addr[n + 2]; - uint32_t v3 = l1_addr[n + 3]; - uint32_t v4 = l1_addr[n + 4]; - uint32_t v5 = l1_addr[n + 5]; - local_mem_addr[n + 0] = v0; - local_mem_addr[n + 1] = v1; - local_mem_addr[n + 2] = v2; - local_mem_addr[n + 3] = v3; - local_mem_addr[n + 4] = v4; - local_mem_addr[n + 5] = v5; - n += 6; - } - // Could optimize this further (eg, loop of 2 or 4), probably not worth it - while (n < len) { - local_mem_addr[n] = l1_addr[n]; - n++; - } -} - -inline void firmware_kernel_common_init(void *init_local_l1_base) { - - // Handle stuff typically done in crt0 in asm. 
Easier to do in C - wzerorange(__ldm_bss_start, __ldm_bss_end); - - int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2; - uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE; - l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words); - - for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) { - (**fptr)(); - } -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h index 9b3f032e624..822704cc9e1 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h @@ -43,13 +43,19 @@ struct p_gpr_unpack constexpr static uint FACE_DIM_4x16 = 42; // Holds face dimension (4x16) constexpr static uint FACE_DIM_2x16 = 43; // Holds face dimension (2x16) constexpr static uint FACE_DIM_1x16 = 44; // Holds face dimension (1x16) + constexpr static uint PERF_UNPACK_NUM_TILES_0 = 45; // num tiles for input operands 0-1 + constexpr static uint PERF_UNPACK_NUM_TILES_1 = 46; // num tiles for input operands 2-3 + constexpr static uint PERF_UNPACK_NUM_TILES_2 = 47; // num tiles for input operands 4-5 + constexpr static uint PERF_UNPACK_NUM_TILES_3 = 48; // num tiles for input operands 6-7 + constexpr static uint UNPACK_STRIDE = 52; // Used to save/restore unpack A stride (UNP0_ADDR_CTRL_ZW_REG_1_Zstride register) + // before/after unpacking directly to dest constexpr static uint SR_UNPACK_TILIZER_STATE_0 = 54; // Save unpack state before tilizer is enabled for quick restore constexpr static uint SR_UNPACK_TILIZER_STATE_1 = 55; constexpr static uint SR_UNPACK_UNTILIZER_STATE_0 = 56; // Save unpack state before tilizer is enabled for quick restore constexpr static uint SR_UNPACK_UNTILIZER_STATE_1 = 57; constexpr static uint SR_UNPACK_UNTILIZER_STATE_2 = 58; constexpr static uint SR_UNPACK_UNTILIZER_STATE_3 = 59; - 
constexpr static uint SR_UNPACK_UNTILIZER_STATE_4 = 59; + }; // Math GPR thread diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_noc.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_noc.h deleted file mode 100644 index 18c150d6388..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_noc.h +++ /dev/null @@ -1,317 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "debug/fw_debug.h" - -#include "noc_overlay_parameters.h" - -struct stream_tile_info_t -{ - uint32_t base_address; - TileHeader tile_header; -}; -// Functions for accessing NOC overlay registers -namespace ckernel -{ - -typedef volatile uint32_t tt_reg_ptr *regp; - -// Only perform the calculation once, as it's expensive to multiply numbers -inline regp get_stream_reg(uint32_t stream_id) -{ - constexpr uint32_t NOC_REGISTER_MMIO_BASE = 0xFFB40000; - constexpr uint32_t PER_STREAM_REG_SIZE = 0x1000; - return (regp) (NOC_REGISTER_MMIO_BASE + PER_STREAM_REG_SIZE * stream_id); -} - -inline uint32_t get_stream_reg_addr(uint32_t stream_id, uint32_t index) -{ - constexpr uint32_t NOC_REGISTER_MMIO_BASE = 0xFFB40000; - constexpr uint32_t PER_STREAM_REG_SIZE = 0x1000; - return (NOC_REGISTER_MMIO_BASE + PER_STREAM_REG_SIZE * stream_id + (index << 2)); -} - -inline void write_stream_register(regp p_stream_reg, uint32_t index, uint32_t value) -{ - p_stream_reg[index] = value; -} - -inline uint32_t read_stream_register(const regp p_stream_reg, uint32_t index) -{ - return p_stream_reg[index]; -} - -inline uint32_t read_stream_register_field(const regp p_stream_reg, uint32_t index, uint32_t shift, uint32_t width) -{ - return (read_stream_register(p_stream_reg, index) >> shift) & ((1 << width) - 1); -} - -// Wait until stream has at least 'count' tiles ready -inline void wait_for_stream_messages(const regp p_stream_reg, const uint count) -{ - uint c = 0; - do - { - c = read_stream_register(p_stream_reg, 
STREAM_NUM_MSGS_RECEIVED_REG_INDEX); - } while (c < count); -} - -inline void wait_for_N_stream_messages(const regp p_stream_reg, const uint num_messages) { - - uint c = 0; - do { - uint32_t msg_info_wr = read_stream_register(p_stream_reg, STREAM_MSG_INFO_WR_PTR_REG_INDEX); - uint32_t msg_info = read_stream_register(p_stream_reg, STREAM_MSG_INFO_PTR_REG_INDEX); - uint32_t num_msg = read_stream_register(p_stream_reg, STREAM_NUM_MSGS_RECEIVED_REG_INDEX); - c = num_msg + (msg_info_wr - msg_info); - // wait while we receive all the tiles from this stream - } while (c < num_messages); -} - -inline void wait_for_stream_phase(const regp p_stream_reg, const uint phase_id) -{ - if (phase_id == 0) - { - return; - } - uint p = 0; - do - { - p = read_stream_register(p_stream_reg, STREAM_CURR_PHASE_REG_INDEX); - } while (p != phase_id); -} - -inline void update_stream_read_pointer(regp p_stream_reg, const uint amount) -{ - write_stream_register(p_stream_reg, STREAM_MSG_INFO_CLEAR_REG_INDEX, amount); -} - -inline uint read_stream_base_address(const regp p_stream_reg, const uint tile_n) -{ - return read_stream_register(p_stream_reg, STREAM_RECEIVER_MSG_INFO_REG_INDEX + tile_n * 6 + 0); //-> activations base address for tile n -} - -inline uint read_stream_zero_mask(const regp p_stream_reg, const uint tile_n) -{ - return read_stream_register(p_stream_reg, STREAM_RECEIVER_MSG_INFO_REG_INDEX + tile_n * 6 + 4); //-> 32-bit zero mask -} - -// Read tile info from a stream -inline stream_tile_info_t read_stream_info(const uint tile_index, const regp p_stream_reg) -{ - const uint n = tile_index; - const uint base_address = read_stream_base_address(p_stream_reg, n); - - TileHeader_u header; - - header.val[0] = read_stream_register(p_stream_reg, STREAM_RECEIVER_MSG_INFO_REG_INDEX + n * 6 + 2); //-> tile n size including header and tile id (15:0 size, 31:16 tile id) - FWASSERT("Tile size must be != 0", (header.val[0] & 0xFFFF) != 0); - header.val[1] = read_stream_register(p_stream_reg, 
STREAM_RECEIVER_MSG_INFO_REG_INDEX + n * 6 + 3); //-> tile n meta data size and format - header.val[2] = read_stream_zero_mask(p_stream_reg, n); //-> 32-bit zero mask - //read_stream_register(p_stream_reg, STREAM_RECEIVER_MSG_INFO_REG_INDEX + n * 6 + 5); //-> Reserved - - return stream_tile_info_t{base_address, header.header}; -} - -inline uint read_dis_zero_compress_group_info(const regp p_stream_reg) -{ - return read_stream_register(p_stream_reg, STREAM_MSG_GROUP_COMPRESS_REG_INDEX); -} - -// Return the offset of a tile given it's tile id and table address -inline uint32_t get_indexed_offset(const uint tile_id, const uint weights_offset, const uint table_addr) -{ - const uint16_t *weight_offset_table = reinterpret_cast(table_addr << 4); - uint weight_offset = weight_offset_table[tile_id + weights_offset]; - return weight_offset; -} - -inline void unpacker_config(const regp p_stream_reg, const uint unpacker_id, const uint fifo_size_factor = 1) -{ - uint fifo_base_addr = read_stream_register(p_stream_reg, STREAM_BUF_START_REG_INDEX); - uint fifo_size = fifo_size_factor * read_stream_register(p_stream_reg, STREAM_BUF_SIZE_REG_INDEX); - cfg_write(unpacker_id ? 
THCON_SEC1_REG2_Unpack_limit_address_ADDR32 : THCON_SEC0_REG2_Unpack_limit_address_ADDR32, - (fifo_base_addr + fifo_size - 1) | (fifo_size << THCON_SEC0_REG2_Unpack_fifo_size_SHAMT)); -} - -// Optimized function that reads base addresses and programs registers for one context -inline void program_halo_strips_cntx0( - volatile uint *cfg, const regp p_stream_reg, const uint first_active_tile, const uint unpack_halo_mask, uint *group_dis_zero_compress) -{ - //const uint strip1_addr = read_stream_base_address(p_stream_reg, 1); - //const uint strip2_addr = read_stream_base_address(p_stream_reg, 2); - //const uint strip3_addr = read_stream_base_address(p_stream_reg, 3); - //cfg[THCON_SEC0_REG3_Base_cntx1_address_ADDR32] = strip1_addr; - //cfg[THCON_SEC0_REG3_Base_cntx2_address_ADDR32] = strip2_addr; - //cfg[THCON_SEC0_REG3_Base_cntx3_address_ADDR32] = strip3_addr; - uint dis_zero_compress_group_info = read_dis_zero_compress_group_info(p_stream_reg); // Get uncompress flag for all 4 tiles - uint dis_zero_compress_mask = ((dis_zero_compress_group_info & 0x1) << first_active_tile); // Get mask for first active tile - - uint index = 0; - uint tile = 0; - ; - for (uint i = 1; i <= 3; i++) - { - if (i == first_active_tile) - continue; - - if ((unpack_halo_mask >> i) & 0x1) - { - index++; - tile++; - const uint strip_addr = read_stream_base_address(p_stream_reg, index); - switch (i) - { - case 1: cfg[THCON_SEC0_REG3_Base_cntx1_address_ADDR32] = strip_addr; break; - case 2: cfg[THCON_SEC0_REG3_Base_cntx2_address_ADDR32] = strip_addr; break; - case 3: cfg[THCON_SEC0_REG3_Base_cntx3_address_ADDR32] = strip_addr; break; - } - dis_zero_compress_mask |= (((dis_zero_compress_group_info >> tile) & 0x1) << i); - } - } - *group_dis_zero_compress &= (~(0xf)); // Clear 4 uncompress flags for context 0 - *group_dis_zero_compress |= dis_zero_compress_mask; -} - -// Optimized function that reads base addresses and programs registers for one context -// FIXME: this is probably pretty slow.... 
need to evaluate, and maybe make a separate one for the 'common' case -// where the unpack halo mask is 0xF -inline void program_halo_strips_cntx1( - volatile uint *cfg, const regp p_stream_reg, const uint first_active_tile, const uint unpack_halo_mask, uint *group_dis_zero_compress) -{ - uint dis_zero_compress_group_info = read_dis_zero_compress_group_info(p_stream_reg); // Get uncompress flag for all 4 tiles - uint dis_zero_compress_mask = ((dis_zero_compress_group_info & 0x1) << first_active_tile); // Get mask for first active tile - - uint index = 0; - uint tile = 0; - for (uint i = 1; i <= 3; i++) - { - if (i == first_active_tile) - continue; - - if ((unpack_halo_mask >> i) & 0x1) - { - index++; - tile++; - const uint strip_addr = read_stream_base_address(p_stream_reg, index); - switch (i) - { - case 1: cfg[THCON_SEC0_REG4_Base_cntx5_address_ADDR32] = strip_addr; break; - case 2: cfg[THCON_SEC0_REG4_Base_cntx6_address_ADDR32] = strip_addr; break; - case 3: cfg[THCON_SEC0_REG4_Base_cntx7_address_ADDR32] = strip_addr; break; - } - dis_zero_compress_mask |= (((dis_zero_compress_group_info >> tile) & 0x1) << i); - } - } - *group_dis_zero_compress &= (~(0xf0000)); // Clear 4 uncompress flags for context 1 - *group_dis_zero_compress |= (dis_zero_compress_mask << 16); -} -} // namespace ckernel - -namespace ckernel::stream -{ - // Only perform the calculation once, as it's expensive to multiply numbers - inline regp get_reg(uint32_t stream_id) - { - return ckernel::get_stream_reg(stream_id); - } - - inline void wait_for_phase(const regp stream_reg, const uint phase_id) - { - ckernel::wait_for_stream_phase(stream_reg, phase_id); - } - - // Wait until stream has at least 'count' tiles ready - inline void wait_for_messages(const regp stream_reg, const uint count) - { - uint c = 0; - do - { - c = read_stream_register(stream_reg, STREAM_NUM_MSGS_RECEIVED_REG_INDEX); - } while (c < count); - } - - // Wait until stream has any messages ready - template - inline void 
wait_for_token(const regp stream_reg) - { - wait_for_messages(stream_reg, 1); - - if constexpr (FastPop) { - write_stream_register(stream_reg, STREAM_MSG_INFO_CLEAR_REG_INDEX, 1); - } - } - - // Wait for a tile for streaming unpacker. Make sure to get address before updating pointer. - inline uint32_t wait_for_tile(const regp stream_reg) - { - constexpr auto tile_count = 1; - stream::wait_for_messages(stream_reg, tile_count); - auto tile_l1_addr = read_stream_base_address(stream_reg, 0); - update_stream_read_pointer(stream_reg, tile_count); - return tile_l1_addr; - } - - inline void pop_messages(const regp stream_reg, const uint count) { - for (uint j = 0; j < count; j++) { - // TODO: Change to do 2 or 4 (only for stream 4/5) pops at each instruction? - uint32_t num_msgs = 1; - // Wait for stream to load tiles into the msg info fifo so that we can pop them - while (read_stream_register(stream_reg, STREAM_NUM_MSGS_RECEIVED_REG_INDEX) == 0) {} - write_stream_register(stream_reg, STREAM_MSG_INFO_CLEAR_REG_INDEX, num_msgs); - write_stream_register(stream_reg, STREAM_MSG_DATA_CLEAR_REG_INDEX, num_msgs); - } - } - - inline void release_token(const regp stream_reg) - { - write_stream_register(stream_reg, STREAM_MSG_INFO_CLEAR_REG_INDEX, 1); - write_stream_register(stream_reg, STREAM_MSG_DATA_CLEAR_REG_INDEX, 1); - } - - // Wait until specific stream register index contains specific value. 
- inline void wait_for_reg_value(const regp p_stream_reg, const uint reg_index, const uint reg_value) - { - uint rd_value = reg_value - 1; // Initial non matching value - do - { - rd_value = read_stream_register(p_stream_reg, reg_index); - } while (rd_value != reg_value); - } - - inline std::uint8_t* get_stream_buf_base_ptr(const regp stream_reg) { - auto base_addr = read_stream_register(stream_reg, STREAM_BUF_START_REG_INDEX) << 4; - return reinterpret_cast(base_addr); - } - - inline std::uint8_t* get_stream_msg_info_wr_ptr(const regp stream_reg) { - auto base_addr = read_stream_register(stream_reg, STREAM_MSG_INFO_WR_PTR_REG_INDEX) << 4; - return reinterpret_cast(base_addr); - } - - inline std::uint8_t* get_stream_buf_limit_ptr(const regp stream_reg) { - auto base_addr = read_stream_register(stream_reg, STREAM_BUF_START_REG_INDEX) << 4;; - auto size = read_stream_register(stream_reg, STREAM_BUF_SIZE_REG_INDEX) << 4; - auto limit_addr = base_addr + size; - return reinterpret_cast(limit_addr); - } - - inline std::uint8_t* get_stream_msg_ptr(const regp stream_reg) { - auto base_addr = read_stream_register(stream_reg, STREAM_BUF_START_REG_INDEX) << 4; - auto rdptr = read_stream_register(stream_reg, STREAM_RD_PTR_REG_INDEX) << 4; - auto tile_addr = base_addr + rdptr; - return reinterpret_cast(tile_addr); - } - - inline std::uint8_t* get_stream_msg_wr_ptr(const regp stream_reg) { - auto base_addr = read_stream_register(stream_reg, STREAM_BUF_START_REG_INDEX) << 4; - auto wrptr = read_stream_register(stream_reg, STREAM_WR_PTR_REG_INDEX) << 4; - auto tile_addr = base_addr + wrptr; - return reinterpret_cast(tile_addr); - } - - -} // namespace ckernel::stream diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h new file mode 100644 index 00000000000..0e0c729f4b2 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h @@ -0,0 +1,158 @@ +#pragma once + +#include 
+#include +#include "ckernel_include.h" +#include "ckernel_globals.h" +#include "ckernel.h" +#include "tensix.h" +#include "fw_debug.h" +#include "epoch.h" + +#ifdef PERF_DUMP +#include "perf_lib/scratch_api.h" +#include "perf_res_decouple.h" +#include "ckernel_perf_math.h" +#include "ckernel_perf_unpack_pack.h" +#endif + +#ifndef INTERMED_DUMP +#define INTERMED_DUMP 0 +#endif + +#pragma GCC diagnostic ignored "-Wunused-function" + +// Comment in/out to enable perf scratch event logging + +namespace ckernel +{ +extern uint32_t perf_index; +extern uint32_t perf_end; +// Perf buffers are double-buffered for spill_to_dram. +// Ncrisc will move one half to dram while trisc populates the other half. +// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0]. +extern volatile uint32_t *perf_buf_base[2]; +// Selects the half of perf_buffer that trisc is currently writing into. +extern uint8_t perf_buf_base_id; +extern bool record_perf_events; +extern uint32_t perf_events_target_idx; +extern uint16_t current_outer_loop_iter; +extern uint8_t thread_id; +extern bool first_unpack_recorded; + +inline void set_perf_dump_flag_for_input(int input_idx) { + #ifdef PERF_DUMP + TT_LLK_DUMP("set_perf_dump_flag_for_input({})", input_idx); + if (perf_events_target_inputs[perf_events_target_idx] == input_idx) { + record_perf_events = true; + perf_events_target_idx++; + if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) { + if (thread_id == 0 || thread_id == 2) { + perf_end += num_events_per_input; + // The buffer size available for each thread after double buffering is (l1_mem::address_map::TRISC_PERF_BUF_SIZE)/2. + // Max number of events we can record in each half of the buffer will be that size divided by 4, since each event will be 4 bytes. 
+ if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) { + perf_end = TRISC_PERF_BUF_SIZE >> 2; + } + } + } + current_outer_loop_iter = input_idx; + } else { + record_perf_events = false; + } + first_unpack_recorded = false; + #endif +} + +inline void record_pack_input_init_timestamp() { + #ifdef PERF_DUMP + TT_LLK_DUMP("record_pack_input_init_timestamp()"); + if (record_perf_events) { + uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::PACK_EACH_INPUT, current_outer_loop_iter); + record_timestamp_64b(event_id); + } + #endif +} + +void record_pack_input_end_timestamp() { + #ifdef PERF_DUMP + TT_LLK_DUMP("record_pack_input_end_timestamp()"); + if (record_perf_events) { + uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::PACK_EACH_INPUT, current_outer_loop_iter); + record_timestamp_64b(event_id); + if (perf_events_target_idx == 1) { + uint32_t event_id_num_tiles_pack = perf::get_event_id(0, 0, perf::EventType::NUM_TILES_PACK, current_outer_loop_iter); + uint16_t num_tiles = regfile[p_gpr_pack::PERF_PACK_NUM_TILES] & 0xffff; + record_perf_value_and_check_overflow(event_id_num_tiles_pack, num_tiles, 0); + } + } + #endif +} + +inline void perf_math_counter_start() { + #ifdef PERF_DUMP + TT_LLK_DUMP("perf_math_counter_start()"); + if constexpr(SKIP_UNP) { + TTI_SETDVALID(p_setrwc::SET_A); + TTI_SETDVALID(p_setrwc::SET_B); + } + if (record_perf_events) { + // Due to a race condition that corrupts the write address of the fpu counters, reprogram them for every input + dbg_enable_dump_to_mem((uint32_t)&perf_buf_base[perf_buf_base_id][perf_index], (uint32_t)&perf_buf_base[perf_buf_base_id][perf_end]); + start_fpu_perf_cnt(); + } + #endif +} + +inline void record_perf_math_counter() { + #ifdef PERF_DUMP + TT_LLK_DUMP("record_perf_math_counter()"); + if constexpr(SKIP_UNP) { + TTI_CLEARDVALID(0x1, 0); + TTI_CLEARDVALID(0x2, 0); + } + if (record_perf_events) { + stop_fpu_perf_cnt(); + // record_fpu_perf_cnt_value(); + } + #endif +} + +void 
record_unpack_num_tiles() { + #ifdef PERF_DUMP + if (perf_events_target_idx == 1) { + for (uint8_t operand = 0; operand < PERF_MAX_NUM_INPUTS; operand++) { + uint regfile_base_idx = p_gpr_unpack::PERF_UNPACK_NUM_TILES_0; + regfile_base_idx += (operand >> 1); + bool upper = operand & 0b1; + uint16_t num_tiles; + if (upper) { + num_tiles = (regfile[regfile_base_idx] >> 16) & 0xffff; + } else { + num_tiles = regfile[regfile_base_idx] & 0xffff; + } + if (num_tiles != 0) { + uint32_t event_id_num_tiles_unpack = perf::get_event_id(operand, 0, perf::EventType::NUM_TILES_UNPACK, current_outer_loop_iter); + record_perf_value_and_check_overflow(event_id_num_tiles_unpack, num_tiles, 0); + } + } + } + #endif +} + +void record_unpack_first_instruction_timestamp() { + #ifdef PERF_DUMP + TT_LLK_DUMP("record_unpack_first_instruction_timestamp()"); + if (record_perf_events) { + uint32_t clock_lo = regfile[p_gpr_unpack::PERF_FIRST_UNP_LO]; + uint32_t clock_hi = regfile[p_gpr_unpack::PERF_FIRST_UNP_HI]; + uint32_t event_id_last_wait_tile = perf::get_event_id(0, 0, perf::EventType::UNPACK_FIRST_INSTRUCTION, current_outer_loop_iter); + record_perf_value_and_check_overflow(event_id_last_wait_tile, clock_lo, clock_hi); + if (perf_events_target_idx == 1) { + record_unpack_num_tiles(); + } + } + #endif +} + +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h new file mode 100644 index 00000000000..50b9ed3f7cc --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h @@ -0,0 +1,28 @@ +#pragma once + +#ifdef PERF_DUMP +#include + +#include "perf_events_target_inputs.h" +#include "perf_lib/scratch_api.h" + +#ifndef INTERMED_DUMP +#define INTERMED_DUMP 0 +#endif + +#ifndef PERF_DUMP_CONCURRENT +#define PERF_DUMP_CONCURRENT 0 +#endif + +#pragma GCC diagnostic ignored "-Wunused-function" + +static constexpr uint32_t PERF_DUMP_END_SIGNAL = 0xbeeff00d; +static constexpr 
uint32_t PERF_CNT_DUMP_ENTRY_SIZE = 16; // Entry size in bytes + +#if PERF_DUMP_LEVEL == 0 +static constexpr int32_t TRISC_PERF_BUF_SIZE = l1_mem::address_map::UNPACK_PACK_PERF_BUF_SIZE_LEVEL_0; +#else +static constexpr int32_t TRISC_PERF_BUF_SIZE = l1_mem::address_map::UNPACK_PACK_PERF_BUF_SIZE_LEVEL_1; +#endif + +#endif diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h new file mode 100644 index 00000000000..a5df5a61f62 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h @@ -0,0 +1,279 @@ +#pragma once + +#include +#include +#include "ckernel_include.h" +#include "ckernel_globals.h" +#include "ckernel.h" +#include "tensix.h" +#include "fw_debug.h" +#include "epoch.h" + +#include "ckernel_perf_include.h" + +#ifndef INTERMED_DUMP +#define INTERMED_DUMP 0 +#endif + +#pragma GCC diagnostic ignored "-Wunused-function" + +// Comment in/out to enable perf scratch event logging + +namespace ckernel +{ +extern uint32_t perf_index; +extern uint32_t perf_end; +// Perf buffers are double-buffered for spill_to_dram. +// Ncrisc will move one half to dram while trisc populates the other half. +// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0]. +extern volatile uint32_t *perf_buf_base[2]; +// Selects the half of perf_buffer that trisc is currently writing into. +extern uint8_t perf_buf_base_id; +extern uint16_t current_outer_loop_iter; +extern uint8_t thread_id; +extern uint32_t perf_events_target_idx; + +// In math thread, THCON dumps perf buffers in l1. +// Therefore, incrementing the ncrisc perf_dram_buffer_req must be done by THCON as well. +// Flipping the l1 perf start address must also be done by THCON for math thread. +// Following variable keeps track of latest value of perf_dram_copy_req[1] from trisc perspective. 
+// The actual value might be different, because the queued THCON updates for perf_dram_copy_req[1] might have yet not been executed. +// We read this value initially for all threads to reduce the l1-reads. +extern int32_t dram_dump_req_local; + +struct cperf_cnt_mode +{ + constexpr static uint32_t PERF_CNT_MODE_FREE = 0; // Free running period counter + constexpr static uint32_t PERF_CNT_MODE_STOP = 1; // Stop counter + constexpr static uint32_t PERF_CNT_MODE_WRAP = 2; // Wrap period counter +}; + +struct cperf_cnt_block_sel +{ + constexpr static uint32_t PERF_CNT_INSTR_THREAD = 0; // Select all instruction thread perf counters(includes TDMA) + constexpr static uint32_t PERF_CNT_FPU = 1; // Select FPU perf counters + constexpr static uint32_t PERF_CNT_L1 = 2; // Select L1 perf counters + constexpr static uint32_t PERF_CNT_ALL = 3; // Select all perf counters +}; + +struct cperf_dbg_daisy_id +{ + constexpr static uint32_t DEBUG_DAISY_INSTRN_THREAD = 1; // Thread specific perf counters + constexpr static uint32_t DEBUG_DAISY_INSTRN_ISSUE_0 = 4; // TDMA+math + constexpr static uint32_t DEBUG_DAISY_INSTRN_ISSUE_1 = 5; // math+instruction issue + constexpr static uint32_t DEBUG_DAISY_TENSIX = 7; // FPU and L1 perf counters +}; + +struct cperf_dbg_dump_to_mem_mode +{ + constexpr static uint32_t DEBUG_MEM_MODE_MANUAL_WR = 0; + constexpr static uint32_t DEBUG_MEM_MODE_AUTO_WR = 1; + constexpr static uint32_t DEBUG_MEM_MODE_MANUAL_RD = 2; + constexpr static uint32_t DEBUG_MEM_MODE_AUTO_RD = 3; +}; + +inline void set_perf_cnt_params(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU, uint32_t ref_period=0xffffffff, uint32_t mode=cperf_cnt_mode::PERF_CNT_MODE_FREE) { + uint32_t perf_cnt_ref_period_reg; + switch (block_sel) { + case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD: perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD0; break; + case cperf_cnt_block_sel::PERF_CNT_L1: perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_L1_0; break; + default: 
perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_FPU0; + } + reg_write(perf_cnt_ref_period_reg, ref_period); + reg_write(perf_cnt_ref_period_reg+4, 0x00010100); +} + +inline void stop_perf_cnt(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU) { + uint32_t perf_cnt_cntl_reg; + switch (block_sel) { + case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD2; break; + case cperf_cnt_block_sel::PERF_CNT_L1: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_L1_2; break; + case cperf_cnt_block_sel::PERF_CNT_ALL: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_ALL; break; + default: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_FPU2; + } + reg_write(perf_cnt_cntl_reg, 0x00000002); + reg_write(perf_cnt_cntl_reg, 0x00000000); +} + +template +inline void stop_fpu_perf_cnt() { + if (perf_events_target_idx <= 1) { + if constexpr (use_tensix) { + if constexpr (stall_on_math) { + TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH); + } + TTI_STOREREG(p_gpr_math::PERF_CNT_STOP, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff); + TTI_STOREREG(p_gpr::ZERO, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff); + } else { + reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000002); + reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000000); + } + } +} + +inline void start_perf_cnt(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU) { + uint32_t perf_cnt_cntl_reg; + switch (block_sel) { + case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD2; break; + case cperf_cnt_block_sel::PERF_CNT_L1: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_L1_2; break; + case cperf_cnt_block_sel::PERF_CNT_ALL: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_ALL; break; + default: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_FPU2; + } + reg_write(perf_cnt_cntl_reg, 0x00000001); + reg_write(perf_cnt_cntl_reg, 0x00000000); +} + +template +inline void start_fpu_perf_cnt() { + if (perf_events_target_idx <= 1) { + if constexpr 
(use_tensix) { + TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH); + TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::THCON); + TTI_STOREREG(p_gpr_math::PERF_CNT_START, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff); + TTI_STOREREG(p_gpr::ZERO, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff); + } else { + reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000001); + reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000000); + } + } +} + + +inline void sel_fpu_perf_cnt(uint32_t cnt_id) { + riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl; + dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG);; + dbg_bus_cntl.f.dbg_daisy_sel = cperf_dbg_daisy_id::DEBUG_DAISY_TENSIX; + dbg_bus_cntl.f.dbg_sig_sel = 0x0; + dbg_bus_cntl.f.dbg_rd_sel = cnt_id<<1; //rd_sel is aligned to 16-bit + reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val); +} + +// Return value of the selected perf counter +inline uint32_t get_perf_cnt() { + return reg_read(RISCV_DEBUG_REG_DBG_RD_DATA); +} + +template +inline void dump_perf_cnt_to_mem() { + if constexpr (use_tensix) { + TTI_STOREREG(p_gpr_math::PERF_MEM_DUMP_CNTL_SET, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2>>2)&0x3ffff); + TTI_STOREREG(p_gpr_math::PERF_MEM_DUMP_CNTL_CLEAR, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2>>2)&0x3ffff); + } else { + riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2; + dbg_l1_mem_reg2.val = 0; + dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR; + dbg_l1_mem_reg2.f.mem_write = 1; + reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val); + dbg_l1_mem_reg2.f.mem_write = 0; + reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val); + } +} + +inline void dbg_daisy_enable() { + riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl; + dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG); + dbg_bus_cntl.f.dbg_reg_ovrd_en = 0x1; + dbg_bus_cntl.f.dbg_daisy_en = 0x1; + reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val); +} + +inline void dbg_daisy_disable() { + riscv_debug_reg_dbg_dbus_cntl_u 
dbg_bus_cntl; + dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG); + dbg_bus_cntl.f.dbg_reg_ovrd_en = 0x0; + dbg_bus_cntl.f.dbg_daisy_en = 0x0; + reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val); +} + +inline void dbg_enable_dump_to_mem(uint32_t start_addr, uint32_t end_addr) { + + TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH); + uint32_t start_addr_lo = (start_addr >> 4) & 0xffff; + uint32_t start_addr_hi = (start_addr >> 4) >> 16; + TT_SETDMAREG(0, start_addr_lo, 0, LO_16(p_gpr_math::TMP0)); + TT_SETDMAREG(0, start_addr_hi, 0, HI_16(p_gpr_math::TMP0)); + TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG0 >> 2) & 0x3ffff); + + uint32_t end_addr_lo = (end_addr >> 4) & 0xffff; + uint32_t end_addr_hi = (end_addr >> 4) >> 16; + TT_SETDMAREG(0, end_addr_lo, 0, LO_16(p_gpr_math::TMP0)); + TT_SETDMAREG(0, end_addr_hi, 0, HI_16(p_gpr_math::TMP0)); + TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG1 >> 2) & 0x3ffff); + + // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG0, start_addr>>4); + // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG1, end_addr>>4); + riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2; + dbg_l1_mem_reg2.val = 0; + dbg_l1_mem_reg2.f.mem_dump_mode = 0xf; //invalid and overriden below to trigger pulse needed to latch start address + dbg_l1_mem_reg2.f.skip_cycles = 0; + + uint32_t debug_l1_reg2_lo = dbg_l1_mem_reg2.val & 0xffff; + uint32_t debug_l1_reg2_hi = (dbg_l1_mem_reg2.val >> 16) & 0xffff; + TT_SETDMAREG(0, debug_l1_reg2_lo, 0, LO_16(p_gpr_math::TMP0)); + TT_SETDMAREG(0, debug_l1_reg2_hi, 0, HI_16(p_gpr_math::TMP0)); + TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2 >> 2) & 0x3ffff); + + + // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val); + dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR; // This value must change in order to latch new start address!!! 
+ // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val); + + debug_l1_reg2_lo = dbg_l1_mem_reg2.val & 0xffff; + debug_l1_reg2_hi = (dbg_l1_mem_reg2.val >> 16) & 0xffff; + TT_SETDMAREG(0, debug_l1_reg2_lo, 0, LO_16(p_gpr_math::TMP0)); + TT_SETDMAREG(0, debug_l1_reg2_hi, 0, HI_16(p_gpr_math::TMP0)); + TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2 >> 2) & 0x3ffff); + + TTI_STALLWAIT(p_stall::STALL_MATH, p_stall::THCON); +} + +template +inline void record_fpu_perf_cnt_value() { + // if (perf_events_target_idx <= 1) { + // // In l1 mode always reserve last event for PERF_DUMP_END_SIGNAL. + // uint32_t reserve_space_for_trisc_end_signal = 1; + // if (perf_index + 3 <= perf_end-reserve_space_for_trisc_end_signal) { // Last event is always set to a default. + // //perf_buf_base[perf_index] = get_perf_cnt(); + // //perf_buf_base[perf_index + 1] = get_perf_cnt(); + // dump_perf_cnt_to_mem(); //Dump 16B to L1 + // perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)); + // } + // } +} + +// Dump a dummy math event to get the initial fpu counter value. +inline void record_dummy_math_event() { + if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { + uint32_t reserve_space_for_trisc_end_signal = 1; + if (perf_index + 3 <= perf_end-reserve_space_for_trisc_end_signal) { // Last event is always set to a default. 
+ perf_buf_base[perf_buf_base_id][perf_index] = 0; + perf_buf_base[perf_buf_base_id][perf_index+1] = 0; + perf_buf_base[perf_buf_base_id][perf_index+2] = 0; + perf_buf_base[perf_buf_base_id][perf_index+3] = 0; + perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)); + } + } +} + +inline void setup_fpu_perf_cnt() { + // Only program perf counters for math thread (trisc1) + if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { + set_perf_cnt_params(cperf_cnt_block_sel::PERF_CNT_FPU,0xffffffff,cperf_cnt_mode::PERF_CNT_MODE_FREE); + sel_fpu_perf_cnt(0); + dbg_daisy_enable(); + dbg_enable_dump_to_mem((uint32_t)&perf_buf_base[0][PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)], (uint32_t)&perf_buf_base[0][perf_end]); + + riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2; + dbg_l1_mem_reg2.val = 0; + dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR; + dbg_l1_mem_reg2.f.mem_write = 0; + regfile[p_gpr_math::PERF_MEM_DUMP_CNTL_CLEAR]=dbg_l1_mem_reg2.val; + dbg_l1_mem_reg2.f.mem_write = 1; + regfile[p_gpr_math::PERF_MEM_DUMP_CNTL_SET]=dbg_l1_mem_reg2.val; + + regfile[p_gpr_math::PERF_CNT_START]=0x1; + regfile[p_gpr_math::PERF_CNT_STOP]=0x2; + sync_regfile_write(p_gpr_math::PERF_CNT_STOP); + } +} +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h new file mode 100644 index 00000000000..aaa854ebc2f --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h @@ -0,0 +1,158 @@ +#pragma once + +#include +#include +#include "ckernel_include.h" +#include "ckernel_globals.h" +#include "ckernel.h" +#include "tensix.h" +#include "fw_debug.h" +#include "epoch.h" + +#include "ckernel_perf_include.h" + +#pragma GCC diagnostic ignored "-Wunused-function" + +// Comment in/out to enable perf scratch even logging + +namespace ckernel +{ +extern uint32_t perf_index; +extern uint32_t perf_end; +// 
Perf-buffer are double buffered for spill_to_dram. +// Ncrisc will move one half to dram while trisc populates the other half. +// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0]. +extern volatile uint32_t *perf_buf_base[2]; +// Selects the half of perf_buffer that trisc is currently writing into. +extern uint8_t perf_buf_base_id; +extern uint8_t thread_id; + +// In math thread, THCON dumps perf buffers in l1. +// Therefore, incrementing the ncrisc perf_dram_buffer_req must be done by THCON as well. +// Flipping the l1 perf start address must also be done by THCON for math thread. +// Following variable keeps track of latest value of perf_dram_copy_req[1] from trisc perspective. +// The actual value might be different, because the queued THCON updates for perf_dram_copy_req[1] might have yet not been executed. +// We read this value initially for all threads to reduce the l1-reads. +extern int32_t dram_dump_req_local; +extern bool record_perf_events; +extern uint32_t perf_events_target_idx; +extern bool first_unpack_recorded; +extern volatile uint * ncrisc_ack_addr; +extern uint16_t current_outer_loop_iter; +#if OVERLAY_DECOUPLE == 1 +extern uint8_t overlay_output_decouple_mask; +#endif + +void allocate_perf_buffer(); + +// This function gets called when half-perf-buffer is full and need to switch. +// Only used for threads 0 and 2. +// For thread 1 a different function is used: switch_perf_buffers_for_math_thread +// If ncrisc has not yet finished dumping the next half of perf-buffer, trisc will stall. +// If is_perf_end_signal is true, we just need to write the PERF_DUMP_END_SIGNAL. +// This function should only get executed in INTERMED_DUMP mode. +void switch_perf_buffers(); +void last_trisc_perf_dump_to_dram(); + +// The two following functions are separated to avoid inline recursive function calls. +// TODO: Check the behaviour of the compiler if the two following functions were merged into a template function. 
+inline void record_perf_value(uint32_t event_id, uint32_t event_value_lo_32b, uint32_t event_value_hi_32b) { + perf_buf_base[perf_buf_base_id][perf_index] = event_id; + perf_buf_base[perf_buf_base_id][perf_index + 1] = event_value_hi_32b; + perf_buf_base[perf_buf_base_id][perf_index + 2] = event_value_lo_32b; + perf_index += 3; +} + +inline void record_perf_dump_end() { + if (perf_index < perf_end) { + perf_buf_base[perf_buf_base_id][perf_index] = PERF_DUMP_END_SIGNAL; + perf_index += 1; + } +#if PERF_DUMP_CONCURRENT == 1 + if (perf_index < perf_end) { + perf_buf_base[perf_buf_base_id][perf_end - 1] = PERF_DUMP_END_SIGNAL; + } +#endif +} + +inline void record_perf_value_and_check_overflow(uint32_t event_id, uint32_t event_value_lo_32b, uint32_t event_value_hi_32b, uint32_t leave_space = 0) { + // In l1 mode always reserve the last event for PERF_DUMP_END_SIGNAL. + int reserve_space_for_trisc_end_signal = 1; + +#if (INTERMED_DUMP == 1) || (PERF_DUMP_CONCURRENT == 1) + leave_space = 0; + reserve_space_for_trisc_end_signal = 0; + if (perf_index + 2 >= perf_end - reserve_space_for_trisc_end_signal - leave_space) { + switch_perf_buffers(); + } + record_perf_value(event_id, event_value_lo_32b, event_value_hi_32b); +#else + if (perf_index + 2 < perf_end - reserve_space_for_trisc_end_signal - leave_space) { + record_perf_value(event_id, event_value_lo_32b, event_value_hi_32b); + } +#endif +} + +inline void record_timestamp_64b(uint event_id, uint leave_space = 0) { + if (record_perf_events) { + uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, leave_space); + } +} + +inline void record_perf_dump_end_and_check_overflow() { + if (thread_id == 1) { + uint32_t reserve_space_for_trisc_end_signal = 1; + if (perf_index + 3 <= perf_end-reserve_space_for_trisc_end_signal) { // Last event is always set to a default. 
+ perf_buf_base[perf_buf_base_id][perf_index] = reg_read(0xFFB12000 + 0x120); + perf_buf_base[perf_buf_base_id][perf_index+1] = reg_read(0xFFB12000 + 0x124); + perf_buf_base[perf_buf_base_id][perf_index+2] = 0; + perf_buf_base[perf_buf_base_id][perf_index+3] = 0; + perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)); + } + } + +#if (INTERMED_DUMP == 1) || (PERF_DUMP_CONCURRENT == 1) + if (perf_index >= perf_end) { + switch_perf_buffers(); + } + record_perf_dump_end(); +#else + if (perf_index < perf_end) { + record_perf_dump_end(); + } +#endif +} + +inline void record_latest_wait_for_tile() { +#if defined(PERF_DUMP) + if (!first_unpack_recorded) { + uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + regfile[p_gpr_unpack::PERF_FIRST_UNP_LO] = timestamp_low & 0xffffffff; + sync_regfile_write(p_gpr_unpack::PERF_FIRST_UNP_LO); + regfile[p_gpr_unpack::PERF_FIRST_UNP_HI] = timestamp_high & 0xffffffff; + sync_regfile_write(p_gpr_unpack::PERF_FIRST_UNP_HI); + } +#endif +} + +void increment_unpack_tiles(uint operand_idx, uint num_tiles); +void increment_pack_tiles(uint num_tiles); +#if OVERLAY_DECOUPLE == 1 +inline uint32_t get_active_stream_idx(uint32_t stream_id) { + std::uint32_t active_stream_idx; + for (uint32_t active_streams_idx = 0; active_streams_idx < NOC_NUM_STREAMS; active_streams_idx++) { + if (stream_id == EPOCH_INFO_PTR->active_streams[active_streams_idx]->stream_id) { + active_stream_idx = active_streams_idx; + break; + } + } + return active_stream_idx; +} + +void llk_push_all_packer_tiles_for_decoupling(); +#endif + +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h index 8b2df1139ff..0dd06c65dc8 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h @@ -2874,25 +2874,6 @@ void test17() 
copy_result_to_dreg0(17); } -inline void calculate_logical_not() -{ - vUInt v(dst_reg[0].get()); - const vUInt vZero(0), vOne(1); - v_if(v == 0) { - dst_reg[0] = vOne; - } v_else { - dst_reg[0] = vZero; - } - v_endif; -} - -inline void calculate_bitwise_complement() -{ - vUInt v( dst_reg[0].get() ); - vUInt v_comp = ~v; - dst_reg[0] = v_comp; -} - ////////////////////////////////////////////////////////////////////////////// // These tests are designed to be incremental so that if a test fails the // earlier tests should be examined/fixed prior to the latter tests. @@ -2934,12 +2915,7 @@ inline void calculate_sfpi(uint param0 = 0, uint param1 = 0, uint param2 = 0, ui test16(); } else if constexpr (operation == SfpiTestType::test17) { test17(); - } else if constexpr (operation == SfpiTestType::logical_not) { - calculate_logical_not(); - } else if constexpr (operation == SfpiTestType::bitwise_complement) { - calculate_bitwise_complement(); } - } } // NAMESPACE diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h index e63621398e4..e7b80e9cabf 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h @@ -5,16 +5,13 @@ #pragma once #include "ckernel_defs.h" -#include "ckernel.h" #include "noc_nonblocking_api.h" +#include "ckernel.h" +#include "llk_defs.h" +#include #include "sfpi.h" -#include "ckernel_sfpu_cdf.h" -#include "ckernel_sfpu_exp.h" -#include "ckernel_sfpu_recip.h" -#include "ckernel_sfpu_converter.h" - using namespace sfpi; namespace ckernel @@ -54,7 +51,71 @@ sfpi_inline vInt sfpu_is_fp16_zero(const vFloat& v, uint exponent_size_8) } } +sfpi_inline vFloat _sfpu_exp_(vFloat val) +{ + // If exponent is > -1 extract it and replace with -1 + vInt exp = exexp(val); + v_if (exp >= 0) { + val = setexp(val, 126); + } + v_endif; + + // Run series in Horner form + vFloat tmp = val * vConst0p8373 + 
s2vFloat16b(0.863281); + val = val * tmp + vConst1; + + v_if (exp >= 0) { + val = val * val; + for (int s_iter = 0; s_iter < 7; s_iter++) { + exp = exp - 1; + // Narrow predication on each loop + v_and(exp >= 0); + val = val * val; + } + } + v_endif; + + return val; +} + +template +sfpi_inline vFloat _sfpu_reciprocal_(const vFloat in) +{ + // Force sign to 1 (make number negative) + vFloat val = setsgn(in, 1); + + val = setexp(val, 126); // Set exponent to 126 to make the number in 0.5-1 + // Use 1.44 as first guess at x, ideal value would be 1.33, but we happen to have 1.44 available, so use that to avoid a load + vFloat vConstLn2Recip = vConstFloatPrgm0; + vFloat two = vConstFloatPrgm1; + vFloat result = vConstLn2Recip * (val * vConstLn2Recip + two); + + for (int s_iter = 0; s_iter < (max_iter-1); s_iter++) { + result = result * (val * result + two); + } + + vInt orig_exp = exexp(in); + vInt new_exp = exexp(result); + + // "Subtract" exponents, and re-bias. + // Execute: -1 - exp, then exp += 127 + new_exp -= orig_exp; + new_exp += 126; + + v_if (new_exp < 0) { + // If rebiased exponent is negative, we need to saturate at 0. 
+ // This means the initial number was too big so reciprocal result should be 0 + result = 0.0F; + new_exp = 0; + } + v_endif; + + // Set newly denormalized exponent to result exponent field + return setexp(result, new_exp); +} + inline void init_dropout_seed(uint16_t p2){ + uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(0, 0, NOC_NODE_ID); uint16_t my_x = noc_id_reg & NOC_NODE_ID_MASK; @@ -75,23 +136,29 @@ template inline void configure_programmable_constants(SfpuType operation) { switch (operation) { - case SfpuType::expm1: - case SfpuType::exp2: + case SfpuType::gelu: + vConstFloatPrgm0 = 0.5f; + break; + case SfpuType::exponential: if (APPROXIMATION_MODE) { vConstFloatPrgm0 = 1.442695f; // ln2_recip vConstFloatPrgm1 = s2vFloat16b(p_exp::C23_73); vConstFloatPrgm2 = s2vFloat16b(p_exp::ADJ_EXP); break; } + + + + // Fall through + case SfpuType::gelu_derivative: vConstFloatPrgm2 = 0.863281f; // Fall through - case SfpuType::rsqrt: - case SfpuType::atan: + case SfpuType::reciprocal: vConstFloatPrgm0 = 1.442695f; // ln2_recip vConstFloatPrgm1 = 2.0f; break; - case SfpuType::log_with_base: + case SfpuType::log: // ln2 vConstFloatPrgm0 = 0.692871f; // ln2 @@ -100,6 +167,15 @@ inline void configure_programmable_constants(SfpuType operation) vConstFloatPrgm1 = 0.1058f; vConstFloatPrgm2 = -0.7166f; break; + + case SfpuType::sqrt: + if (APPROXIMATION_MODE) { + vConstFloatPrgm0 = s2vFloat16b(127 << 7); + } else { + vConstFloatPrgm0 = s2vFloat16b(0x5f37); + } + break; + case SfpuType::dropout: vConstIntPrgm0 = 0xb400; vConstIntPrgm1 = 0x1; // binary 0b1 - used to extract LSB @@ -140,25 +216,173 @@ inline void sfpu_init(SfpuType operation, uint param0 = 0) TTI_SFPLOADI(1, 2, imm1); TTI_SFPLOADI(2, 2, imm2); break; - case SfpuType::sigmoid_appx: - imm0 = 0x3DFF; - imm1 = 0x21D8; - imm2 = 0xFF10; - TTI_SFPLOADI(0, 2, imm0); - TTI_SFPLOADI(1, 2, imm1); - TTI_SFPLOADI(2, 2, imm2); + case SfpuType::sigmoid: + // imm0 = 0x3DFF; + // imm1 = 0x21D8; + // imm2 = 0xFF10; + // 
TTI_SFPLOADI(0, 2, imm0); + // TTI_SFPLOADI(1, 2, imm1); + // TTI_SFPLOADI(2, 2, imm2); + // Using a 6 piece LUT to calculate and model sigmoid directly + // x <= 0.5 --> 0.2452x + (-0.0004997) + // x <= 1.0 --> 0.2173x + 0.0152 + // x <= 1.5 --> 0.1731x + 0.05988 + // x <= 2.0 --> 0.1262x + 0.1298 + // x <= 4.0 --> 0.0485x + 0.2998 + // x > 4.0 --> 0.4998 + + // imm0[15:0] = A0=0.2452 = 0x33D9 -- imm0[31:16] = A1=0.2173 = 0x32F4 + sfpu_load_imm32(0,0x32F433D9); + // imm4[15:0] = B0= -0.0004997 = 0x9018 -- imm4[31:16] = B1= 0.0152 = 0x23c8 + sfpu_load_imm32(4,0x23C89018); + + // imm1[15:0] = A2=0.1731 = 0x318a -- imm1[31:16] = A3=0.1262 = 0x300a + sfpu_load_imm32(1,0x300A318A); + // imm5[15:0] = B2=0.05988 = 0x2BAA -- imm5[31:16] = B3=0.1298 = 0x3027 + sfpu_load_imm32(5,0x30272BAA); + + // imm2[15:0] = A4=0.0485 = 0x2A35 -- imm2[31:16] = A5=0.0 = 0x7C00 + sfpu_load_imm32(2,0x7C002A35); + // imm6[15:0] = B4=0.2998 = 0x34CC -- imm6[31:16] = B5=0.4998 = 0x37ff + sfpu_load_imm32(6,0x37ff34CC); + + break; + case SfpuType::gelu_derivative: + if constexpr (APPROXIMATION_MODE) { + // Using a 6 piece LUT to calculate and model gelu_derivative directly + // x <= 0.5 --> 0.8x + 0.5 + // x <= 1.0 --> 0.4x + 0.7 + // x <= 1.5 --> 0.1x + 0.99 + // x <= 2.0 --> -0.09x + 1.27 + // x <= 3.0 --> -0.075x + 1.235 + // x > 3.0 --> 1.0 + // imm0[15:0] = A0=0.8 = 0x3A66 -- imm0[31:16] = A1=0.4 = 0x3666 + imm0_high = 0x3666; + imm0_low = 0x3A66; + // imm1[15:0] = A2=0.1 = 0x2E66 -- imm1[31:16] = A3=-0.09 = 0xADC3 + imm1_high = 0xADC3; + imm1_low = 0x2E66; + // imm2[15:0] = A4=-0.075 = 0xACCD -- imm2[31:16] = A5=0 = 0x7C00 + imm2_high = 0x7C00; + imm2_low = 0xACCD; + // imm3[15:0] = B0=0.5 = 0x3800 -- imm3[31:16] = B1=0.7 = 0x399A + imm3_high = 0x399A; + imm3_low = 0x3800; + // imm4[15:0] = B2=0.99 = 0x3BEC -- imm4[31:16] = B3=1.27 = 0x3D14 + imm4_high = 0x3D14; + imm4_low = 0x3BEC; + // imm5[15:0] = B4=1.235 = 0x3CF1 -- imm5[31:16] = B5=1.0 = 0x3C00 + imm5_high = 0x3C00; + imm5_low = 
0x3CF1; + TTI_SFPLOADI(0, 10, imm0_low); + TTI_SFPLOADI(0, 8, imm0_high); + TTI_SFPLOADI(1, 10, imm1_low); + TTI_SFPLOADI(1, 8, imm1_high); + TTI_SFPLOADI(2, 10, imm2_low); + TTI_SFPLOADI(2, 8, imm2_high); + TTI_SFPLOADI(4, 10, imm3_low); + TTI_SFPLOADI(4, 8, imm3_high); + TTI_SFPLOADI(5, 10, imm4_low); + TTI_SFPLOADI(5, 8, imm4_high); + TTI_SFPLOADI(6, 10, imm5_low); + TTI_SFPLOADI(6, 8, imm5_high); + } else { + imm0 = 0x28FF; + imm1 = 0x3020; + TTI_SFPLOADI(0, 2, imm0); + TTI_SFPLOADI(1, 2, imm1); + } + break; + case SfpuType::gelu: + // //SG: FIXME + // imm0 = 0x18FF; + // imm1 = (APPROXIMATION_MODE)? 0x212C : 0x2010; + // imm2 = 0xFF00; + // TTI_SFPLOADI(0, 2, imm0); + // TTI_SFPLOADI(1, 2, imm1); + // TTI_SFPLOADI(2, 2, imm2); + + // // >= 3.0f + // lreg2_hi=0.50;//3800 + // lreg6_hi=0.0f;//7c00 + // // 2.0f -> 3.0f + // lreg2_lo= 0.5402f;//3852 + // lreg6_lo= -0.1194f;//AFA4 + // // 1.5f -> 2.0f + // lreg1_hi= .6099f; //38E1 + // lreg5_hi= -.2635f; //B437 + // // 1.0f -> 1.5f + // lreg1_lo=0.6189;//38F3 + // lreg5_lo=-.2797;//B479 + // // 0.5f -> 1.0f + // lreg0_hi=.4939f;//37E7 + // lreg4_hi=-.1605f;//B122 + // // 0.0f -> 0.5f + // lreg0_lo=0.1928f;//322B + // lreg4_lo=-0.0150f;//A3AE + sfpu_load_imm32(0,0x37E7322B); + //sfpu_load_imm32(4,0xB122A3AE); + sfpu_load_imm32(4,0xB12286D8); + + + sfpu_load_imm32(1,0x38E138F3); + sfpu_load_imm32(5,0xB437B479); + + sfpu_load_imm32(2,0x38003852); + sfpu_load_imm32(6,0x7c00afa4); + break; case SfpuType::dropout: init_dropout_seed(param0); break; - case SfpuType::sigmoid: - break; + case SfpuType::quant_int32: + case SfpuType::requant_int32: + case SfpuType::dequant_int32: + sfpu_load_imm32(2,param0); + break; default: // Should result in compile time error?? 
break; } } +template +sfpi_inline vFloat _calculate_exponential_body_(vFloat in) +{ + vFloat out; + + if constexpr (APPROXIMATION_MODE) + { + constexpr int FRAC_BITS = 3; + constexpr uint SP_BIAS = 127 << FRAC_BITS; + + // * by 1/ln2 and add convert to 7.3 FxP format + vFloat vConstLn2Recip = vConstFloatPrgm0; + vFloat conv = in * vConstLn2Recip; + + // Clear exp bits + vInt c23_73 = p_exp::C23_73; + vInt tmp = reinterpret(conv) - c23_73; + + // Add bias + tmp += SP_BIAS; + + // SHL to move integer bits to exponent + out = reinterpret(tmp << (10 - FRAC_BITS)); + } + else + { + // Force sign to 0 (make number positive) + out = _sfpu_exp_(setsgn(in, 0)); + + v_if (in < 0) { + out = _sfpu_reciprocal_(out); + } + v_endif; + } + + return out; +} /* template @@ -178,81 +402,158 @@ void calculate_cube(uint16_t exp_base_scale_factor = 0) } */ - -template -inline void calculate_rsqrt() +template +void calculate_exponential(const int iterations, uint16_t exp_base_scale_factor = 0) { - - for (int d = 0; d < ITERATIONS; d++) + // Unroll 8 best for approx, unroll 0 for precise, compiler figures this out + for (int d = 0; d < iterations; d++) { + vFloat val = dst_reg[0]; + if constexpr(SCALE_EN){ + val = val * s2vFloat16a(exp_base_scale_factor); + } + if constexpr (APPROXIMATION_MODE) + { + v_if (val>=89){ + vFloat val_inf = std::numeric_limits::infinity(); + dst_reg[0] = val_inf; + } v_elseif(val<-42){ + dst_reg[0] = 0.0f; + } v_else { + // * by 1/ln2 and add convert to 7.3 FxP format + vFloat vConstLn2Recip = vConstFloatPrgm0; + vFloat c23_73 = vConstFloatPrgm1; + vInt adj_exp = vConstIntPrgm2; + val = val * vConstLn2Recip + c23_73; + + // Remove Exponent of 7 and bias the Mantissa to 127. 
+ vInt val_short = adj_exp + reinterpret(val); + + // SHL to move integer bits to exponent + val_short <<= 10 - p_exp::FRAC_BITS; + dst_reg[0] = reinterpret(val_short); + } + v_endif; + } + else + { + // Force sign to 0 (make number positive) + vFloat result = _sfpu_exp_(setsgn(val, 0)); - vFloat in = dst_reg[0]; - v_if(dst_reg[0] == 0.0f){ - dst_reg[0] = std::numeric_limits::infinity(); - }v_else{ - vFloat result = 1.0f; - v_if(dst_reg[0] > 1.0f){ - result = sfpu_reciprocal(in); - }v_endif; - - for (int r = 0; r < RECIPROCAL_ITERATIONS; r++) - { - // y = y * (1.5 - 0.5 * x * y * y) Newton's method iteration. - result = result * (1.5F - 0.5F * dst_reg[0] * result * result); + v_if (val < 0) { + result = _sfpu_reciprocal_(result); } + v_endif; + dst_reg[0] = result; - }v_endif; + } dst_reg++; - } } +template +inline vFloat _calculate_gelu_core_(vFloat in) +{ + // SFPU microcode: + // result = (APPROX_MODE == 1) + // ? (1 + erf(x/sqrt(2))) + // : (1 + tanh( sqrt(2/pi) * (x + 0.044715*x^3) ) + vFloat result; + if constexpr (APPROXIMATION_MODE) { + result = in; + } else { + // f = (0.044715*x^3 + x) + result = (in * in) * (in * s2vFloat16b(0.044715f)) + in; + result *= s2vFloat16b(0.79788f); + } + + return result; +} template -inline void calculate_sigmoid_appx() +inline void calculate_gelu(const int iterations) { + vUInt l0 = l_reg[LRegs::LReg0]; vUInt l1 = l_reg[LRegs::LReg1]; vUInt l2 = l_reg[LRegs::LReg2]; + vUInt l4 = l_reg[LRegs::LReg4]; + vUInt l5 = l_reg[LRegs::LReg5]; + vUInt l6 = l_reg[LRegs::LReg6]; #pragma GCC unroll 8 - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { - vFloat val = dst_reg[0]; + // vFloat in = dst_reg[0]; + // vFloat result = _calculate_gelu_core_(in); - dst_reg[0] = lut(val, l0, l1, l2) + 0.5f; + // vFloat half_in = in * half; + // result = lut(result, l0, l1, l2); + // result = half_in * result + half_in; + + //dst_reg[0] = result; + + vFloat in = dst_reg[0]; + vFloat half = vConstFloatPrgm0; + vFloat 
half_in = in * half; + vFloat result = lut2_sign(in, l0, l1, l2, l4, l5, l6); + result = half_in + result; + + dst_reg[0] = result; dst_reg++; + + // dst_reg++; + //TTI_SFPLOAD(3, 0, 1/*load addr mode*/,0); // load from dest + ////TTI_SFPMUL(3,11,9,7,0); // lreg7 = 0.5*lreg3 + //TTI_SFPLUTFP32(7, 2); // lreg7= LUT(3) + //TTI_SFPMAD(3,12,7,3,0); // lreg3 = 0.5*lreg3+lregm7 + //TTI_SFPSTORE(3, 0, 3/*store_addr_mod3*/, 0); // and INCRWC by 4 using mode 3 } l_reg[LRegs::LReg0] = l0; l_reg[LRegs::LReg1] = l1; l_reg[LRegs::LReg2] = l2; + l_reg[LRegs::LReg4] = l4; + l_reg[LRegs::LReg5] = l5; + l_reg[LRegs::LReg6] = l6; + + } -// TODO: Implement using bitwise comparision template -inline void calculate_signbit() +inline void calculate_sigmoid(const int iterations) { + constexpr int lut_mode = 0; // SFPLUTFP32_MOD0_FP16_6ENTRY_TABLE1 + vUInt l0 = l_reg[LRegs::LReg0]; + vUInt l1 = l_reg[LRegs::LReg1]; + vUInt l2 = l_reg[LRegs::LReg2]; + vUInt l4 = l_reg[LRegs::LReg4]; + vUInt l5 = l_reg[LRegs::LReg5]; + vUInt l6 = l_reg[LRegs::LReg6]; - for (int d = 0; d < ITERATIONS; d++) + + #pragma GCC unroll 8 + for (int d = 0; d < iterations; d++) { vFloat val = dst_reg[0]; - v_if (val <= -0.0f) { - val = 1.0f; - } v_elseif (val >= 0.0f) { - val = 0.0f; - } - v_endif; - dst_reg[0] = val; - dst_reg++; + dst_reg[0] = lut2(val, l0, l1, l2, l4, l5, l6, lut_mode) + 0.5f; + + dst_reg++; } + l_reg[LRegs::LReg0] = l0; + l_reg[LRegs::LReg1] = l1; + l_reg[LRegs::LReg2] = l2; + l_reg[LRegs::LReg4] = l4; + l_reg[LRegs::LReg5] = l5; + l_reg[LRegs::LReg6] = l6; + } template -inline void calculate_tanh() +inline void calculate_tanh(const int iterations) { // SFPU microcode vUInt l0 = l_reg[LRegs::LReg0]; @@ -260,7 +561,7 @@ inline void calculate_tanh() vUInt l2 = l_reg[LRegs::LReg2]; #pragma GCC unroll 8 - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { vFloat val = dst_reg[0]; val = lut(val, l0, l1, l2); @@ -275,7 +576,7 @@ inline void calculate_tanh() } template -inline 
void calculate_hardtanh(uint param0, uint param1, uint param2) +inline void calculate_hardtanh(const int iterations, uint param0, uint param1, uint param2) { // All params are in FP16_B format // param0 = -(neg_threshold) @@ -287,7 +588,7 @@ inline void calculate_hardtanh(uint param0, uint param1, uint param2) vFloat p2 = s2vFloat16(param2); // SFPU microcode #pragma GCC unroll 0 - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { vFloat val = dst_reg[0]; @@ -312,14 +613,14 @@ inline void calculate_hardtanh(uint param0, uint param1, uint param2) } template -inline void calculate_tanh_derivative() +inline void calculate_tanh_derivative(const int iterations) { vUInt l0 = l_reg[LRegs::LReg0]; vUInt l1 = l_reg[LRegs::LReg1]; vUInt l2 = l_reg[LRegs::LReg2]; // tanh'(x) = 1 - (tanh(x))^2 - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { vFloat val = dst_reg[0]; @@ -339,14 +640,146 @@ inline void calculate_tanh_derivative() } template -inline void calculate_dropout(uint prob, uint scale) +inline void calculate_gelu_derivative(const int iterations) { - // SFPU microcode + if constexpr (APPROXIMATION_MODE) { + constexpr int lut_mode = 1; // SFPLUTFP32_MOD0_FP16_6ENTRY_TABLE1 + + vUInt l0 = l_reg[LRegs::LReg0]; + vUInt l1 = l_reg[LRegs::LReg1]; + vUInt l2 = l_reg[LRegs::LReg2]; + vUInt l4 = l_reg[LRegs::LReg4]; + vUInt l5 = l_reg[LRegs::LReg5]; + vUInt l6 = l_reg[LRegs::LReg6]; + + // SFPU microcode: + #pragma GCC unroll 0 + for (int d = 0; d < iterations; d++) + { + vFloat val = dst_reg[0]; + val = lut2(val, l0, l1, l2, l4, l5, l6, lut_mode); + v_if (val < 0.0F) { + val = val + 1.0f; + } + v_endif; + dst_reg[0] = val; + dst_reg++; + + } + + l_reg[LRegs::LReg0] = l0; + l_reg[LRegs::LReg1] = l1; + l_reg[LRegs::LReg2] = l2; + l_reg[LRegs::LReg4] = l4; + l_reg[LRegs::LReg5] = l5; + l_reg[LRegs::LReg6] = l6; + } else { + constexpr uint imm2 = 0xFF10; + + vUInt l0 = l_reg[LRegs::LReg0]; + vUInt l1 = l_reg[LRegs::LReg1]; + 
// SFPU microcode: + #pragma GCC unroll 0 + for (int d = 0; d < iterations; d++) + { + vFloat in = dst_reg[0]; + vFloat neg_half_sq_in = in * in * -0.5f; + + // exp = e^(val) + vFloat exp = _calculate_exponential_body_(neg_half_sq_in); + + // exp = exp * 1/sqrt(2*pi) + vFloat partial = exp * in * s2vFloat16b(0.3989423F); + + vFloat result = _calculate_gelu_core_(in); + + result = lut(result, l0, l1, imm2); + + dst_reg[0] = partial + result + 0.5f; + dst_reg++; + } + + l_reg[LRegs::LReg0] = l0; + l_reg[LRegs::LReg1] = l1; + } +} + +template +inline void calculate_reciprocal(const int iterations) +{ + #pragma GCC unroll 8 + for (int d = 0; d < iterations; d++) + { + vFloat in = dst_reg[0]; + vFloat out = _sfpu_reciprocal_(in); + + v_if (in < 0.0F) { + // Invert sign on calculated value if CC=1 (number is negative) + out = -out; + } + v_endif; + + dst_reg[0] = out; + + dst_reg++; + } +} + +template +inline void calculate_sqrt(const int iterations) +{ + #pragma GCC unroll 8 + for (int d = 0; d < iterations; d++) + { + vFloat val = dst_reg[0]; + + if constexpr (APPROXIMATION_MODE) + { + vUInt magic = vConstIntPrgm0; + + //sqrt initial approximation + // adjust bias + vUInt val_s = magic + reinterpret(val); + + // approximation of square root + val_s >>= 1; + dst_reg[0] = reinterpret(val_s); + } + else + { + // Recip root method + //// Init approx + //u.i = SQRT_MAGIC_F - (u.i >> 1); + v_if (val != 0.0f) + { + vUInt magic = vConstIntPrgm0; + vFloat approx = reinterpret(magic - (reinterpret(val) >> 1)); + + //Reciproot iterations + for (int r = 0; r < RECIPROCAL_ITERATIONS; r++) + { + //x*r*(1.5f - xhalf*r*r); + approx = ((approx * approx) * (val * -0.5f) + 1.5f) * approx; + } + + dst_reg[0] = approx * val; + } + v_endif; + } + + dst_reg++; + } +} + +template +inline void calculate_dropout(const int iterations, uint prob, uint scale) +{ + // SFPU microcode vUInt rand = l_reg[LRegs::LReg3]; #pragma GCC unroll 0 - for (int d = 0; d < ITERATIONS; d++) { + for (int d = 0; d < 
iterations; d++) { //////////////////////// // Scale samples /////////////////////// @@ -378,27 +811,49 @@ inline void calculate_dropout(uint prob, uint scale) l_reg[LRegs::LReg3] = rand; } -template -inline void calculate_power_iterative(const uint exponent) +template +inline void calculate_lrelu(const int iterations, uint slope) { - #pragma GCC unroll 8 - for (int d = 0; d < 8; d++) + // SFPU microcode + vFloat s = s2vFloat16b(slope); + + #pragma GCC unroll 0 + for (int d = 0; d < iterations; d++) { + vFloat v = dst_reg[0]; + + v_if (v < 0.0f) { + v *= s; + } + v_endif; + + dst_reg[0] = v; + + dst_reg++; + } +} + +template +inline void calculate_power(const int iterations, uint exponent) +{ + for (int d = 0; d < iterations; d++) { vFloat in = dst_reg[0]; - vFloat result = 1.0f; - for (uint i = 0; i < exponent; i++) { + vFloat result = in * in; + for (uint i = 2; i < exponent; i++) { result *= in; } - dst_reg[0]=result; + + dst_reg[0] = result; + dst_reg++; } } template -inline void calculate_square() +inline void calculate_square(const int iterations) { #pragma GCC unroll 8 - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { vFloat in = dst_reg[0]; vFloat result = in * in; @@ -410,7 +865,7 @@ inline void calculate_square() } template -sfpi_inline void calculate_log_body(const uint log_base_scale_factor) +sfpi_inline void _calculate_log_body_(const uint log_base_scale_factor) { //////////////////////////// // Load From dest + "normalize to calculation range" @@ -465,16 +920,16 @@ sfpi_inline void calculate_log_body(const uint log_base_scale_factor) } template -inline void calculate_log(uint log_base_scale_factor) +inline void calculate_log(const int iterations, uint log_base_scale_factor) { #pragma GCC unroll 8 - for(int d = 0; d < ITERATIONS; d++){ - calculate_log_body(log_base_scale_factor); + for(int d = 0; d < iterations; d++){ + _calculate_log_body_(log_base_scale_factor); dst_reg++; } } -sfpi_inline void 
calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& flag2, float init) +sfpi_inline void _calculate_comp_init_flag_(bool check, vFloat& flag1, vFloat& flag2, float init) { flag1 = init; if (check) { @@ -483,82 +938,82 @@ sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& fla } template -inline void calculate_comp(uint exponent_size_8) +inline void calculate_comp(const int iterations, uint exponent_size_8) { - const vFloat zero = 0.0f; - const vFloat one = 1.0f; - for (int d = 0; d < ITERATIONS; d++) + //invert output and use same comparison check + constexpr bool invert_output = ((COMP_MODE == SfpuType::greater_than_equal_zero) || + (COMP_MODE == SfpuType::not_equal_zero) || + (COMP_MODE == SfpuType::greater_than_zero)); + + // output_0 and output_1 hold the outputs use use when a zero or negative check is true/false. + // False = 0.0 = kCONST_0 (5/8-bit exponent format) + // True = 1.0 = kCONST_1_FP16B (8-bit exponent format) + // SFPU uses 8-bit exponent in operations so loading these constants in 8-bit exponent format. + // Although a command flag can tell SFPU to re-bias a 5-bit exponent to 8-bit, we are loading 8-bit + // exponent and telling SFPU to not add any bias to these constants. + constexpr float output_0 = invert_output ? 0.0f : 1.0f; + constexpr float output_1 = invert_output ? 
1.0f : 0.0f; + + constexpr bool check_zero = (COMP_MODE == SfpuType::equal_zero) || (COMP_MODE == SfpuType::not_equal_zero); + constexpr bool second_check = (COMP_MODE == SfpuType::less_than_equal_zero) || (COMP_MODE == SfpuType::greater_than_zero); + + for (int d = 0; d < iterations; d++) { vFloat v = dst_reg[0]; vFloat flag1, flag2; - - //a[i] == 0 - if constexpr(COMP_MODE == SfpuType::equal_zero) { - v_if (sfpu_is_fp16_zero(v, exponent_size_8)) { - v = one; - } v_else { - v = zero; - } - v_endif; - } - - //a[i] != 0 - if constexpr(COMP_MODE == SfpuType::not_equal_zero) { - v_if (sfpu_is_fp16_zero(v, exponent_size_8)) { - v = zero; - } v_else { - v = one; - } - v_endif; - } - - //a[i] < 0 - if constexpr(COMP_MODE == SfpuType::less_than_zero) { - v_if (v >= 0.0f) { - v = zero; - } v_else { - v = one; - } - v_endif; + if constexpr(check_zero) + { + v_if (sfpu_is_fp16_zero(v, exponent_size_8)) { + _calculate_comp_init_flag_(second_check, flag1, flag2, output_0); + } v_else { + _calculate_comp_init_flag_(second_check, flag1, flag2, output_1); + } + v_endif; } - - //a[i] >= 0 - if constexpr(COMP_MODE == SfpuType::greater_than_equal_zero) { - v_if (v >= 0.0f) { - v = one; - } v_else { - v = zero; - } - v_endif; + else + { + v_if (v < 0.0F) { + _calculate_comp_init_flag_(second_check, flag1, flag2, output_0); + } v_else { + _calculate_comp_init_flag_(second_check, flag1, flag2, output_1); + } + v_endif; } - //a[i] > 0 - if constexpr(COMP_MODE == SfpuType::greater_than_zero) { - v_if (v > 0.0f) { - v = one; - } v_else { - v = zero; - } - v_endif; + vFloat result; + if constexpr (second_check) + { + // SfpuType::less_than_equal_zero + // flag1 = 0x3F80(1.0) if DST < 0 else 0 + // flag2 = 0x3F80(1.0) if DST == 0 else 0 + // Do a bitwise Or (flag1 | flag2) to get <= condition. + // flag1 < 0 OR flag2 == 0 => DST is Less than or Equal to zero. 
+ // Result will be either 0x0000(0.0) or 0x3F80(1.0) + if constexpr (COMP_MODE == SfpuType::less_than_equal_zero) { + result = reinterpret(reinterpret(flag1) | reinterpret(flag2)); + } + else + { + // SfpuType::greater_than_zero + // flag1 = 0x3F80(1.0) if DST >= 0 else 0 + // flag2 = 0x3F80(1.0) if DST != 0 else 0 + // Do a bitwise And (flag1 & flag2) to get > condition. + // flag2 >= 0 AND flag1 != 0 => DST is Greater than zero + // Result will be either 0x0000(0.0) or 0x3F80(1.0) + result = reinterpret(reinterpret(flag1) & reinterpret(flag2)); + } + } else { + result = flag1; } - //a[i] <= 0 - if constexpr(COMP_MODE == SfpuType::less_than_equal_zero) { - v_if (v > 0.0f) { - v = zero; - } v_else { - v = one; - } - v_endif; - } + dst_reg[0] = result; - dst_reg[0] = v; - dst_reg++; + dst_reg++; } } template -inline void calculate_clamp(uint param0, uint param1, uint param2) +inline void calculate_clamp(const int iterations, uint param0, uint param1, uint param2) { // All params are in FP16 format // param0 = min @@ -571,7 +1026,7 @@ inline void calculate_clamp(uint param0, uint param1, uint param2) vFloat min = s2vFloat16(param0, format); vFloat max = s2vFloat16(param1, format); #pragma GCC unroll 0 - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { vFloat val = dst_reg[0]; @@ -589,10 +1044,10 @@ inline void calculate_clamp(uint param0, uint param1, uint param2) } template -inline void calculate_abs() +inline void calculate_abs(const int iterations) { // SFPU microcode - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { vFloat v = dst_reg[0]; dst_reg[0] = sfpi::abs(v); @@ -600,53 +1055,25 @@ inline void calculate_abs() } } - -template -inline void calculate_exp2() -{ - // SFPU microcode - for (int d = 0; d < ITERATIONS; d++) - { - vFloat v = dst_reg[0]; - // log(2) = 0.6931471805; - v = v * 0.6931471805f; - // exp = e^(v) - vFloat exp = calculate_exponential_body_improved(v); - dst_reg[0] = exp; - dst_reg++; 
- } -} - template -inline void calculate_sign() +inline void calculate_sign(const int iterations, uint exponent_size_8) { // All params are in FP16 format - for (int d = 0; d < ITERATIONS; d++) + // uint format = 1; + #pragma GCC unroll 0 + for (int d = 0; d < iterations; d++) { vFloat v = dst_reg[0]; - vFloat result = vConst1; - v_if (v < 0.0f) { - result = vConstNeg1; - } v_elseif(v > 0.0f) { - result = vConst1; - } v_else { - result = vConst0; + dst_reg[0] = vConst1; + v_if (v < 0.0F) { + dst_reg[0] = vConstNeg1; } v_endif; - dst_reg[0] = result; - dst_reg++; - } -} -template -inline void calculate_max() -{ - for (int d = 0; d < ITERATIONS; d++) - { - vFloat a = dst_reg[0]; - vFloat b = dst_reg[32]; - v_if(a < b) { - dst_reg[0] = b; + //param0 == 0 is Bfp8 format. It does not require bias removal. + //param0 != 0 is Float16 format and exp bias needs to be removed for zero check. + v_if (sfpu_is_fp16_zero(v, exponent_size_8)) { + dst_reg[0] = vConst0; } v_endif; @@ -655,13 +1082,13 @@ inline void calculate_max() } template -inline void calculate_min() +inline void calculate_max(const int iterations) { - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { vFloat a = dst_reg[0]; vFloat b = dst_reg[32]; - v_if(a > b) { + v_if(a < b) { dst_reg[0] = b; } v_endif; @@ -671,261 +1098,248 @@ inline void calculate_min() } template -inline void calculate_expm1() +inline void calculate_max_int32(const int iterations) { - // SFPU microcode - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { - vFloat v = dst_reg[0]; - v = calculate_exponential_body_improved(v); - dst_reg[0] = v - 1.0f; + TTI_SFPLOAD(2, 12, 3, 0); + TTI_SFPLOAD(0, 12, 3, 64); + TTI_SFPMOV(0, 0, 1, 0); + TTI_SFPIADD(0, 2, 1, 2); + TTI_SFPSTORE(0, 12, 3, 0); + TTI_SFPENCC(0x003, 0, 0, 10); dst_reg++; } } - -#define POLYVAL6(coef5, coef4, coef3, coef2, coef1, coef0, t4) (t4 * (t4 * (t4 * (t4 * (coef5 * t4 + coef4) + coef3) + coef2) + coef1) + coef0) - template 
-sfpi_inline vFloat sfpu_atan_maclaurin_series(vFloat val) +sfpi_inline vFloat sfpu_sine_maclaurin_series(vFloat val) { - v_if(1 > sfpi::abs(val)){ - dst_reg[0] = sfpi::abs(val) ; - } - v_else{ - dst_reg[0] = sfpu_reciprocal(sfpi::abs(val)); - } - v_endif; - - vFloat t1 = dst_reg[0] * dst_reg[0]; - - t1 = POLYVAL6(-0.013480470f, 0.057477314f, -0.121239071f, 0.195635925f, -0.332994597f, 0.999995630f, t1); - - t1 = t1 * dst_reg[0]; - - v_if (sfpi::abs(val) > 1){ - t1 = 1.570796327f - t1; + // Good for [-pi:pi] + // Mclauren series = x - x^3/3! + x^5/5! - x^7/7! + x^9/9! - x^11/11! + vFloat tmp = val; + // x + vFloat output = tmp; + // x^3/3! + tmp = tmp*val*val; + output += -0.166666666*tmp; + // x^5/5! + tmp = tmp*val*val; + output += 0.0083333333*tmp; + // x^7/7! + tmp = tmp*val*val; + output += -0.0001984126*tmp; + if constexpr (not APPROXIMATION_MODE) { + // x^9/9! + tmp = tmp*val*val; + output += 0.0000027557*tmp; + // x^11/11! + tmp = tmp*val*val; + output += -0.00000002505*tmp; } - v_endif; - v_if(val < 0 ){ - t1 = -t1; - } - v_endif; - - return t1; -} - -template -inline void calculate_atan() -{ - // SFPU microcode - for (int d = 0; d < ITERATIONS; d++) - { - vFloat val = dst_reg[0]; - val = sfpu_atan_maclaurin_series(val); - dst_reg[0] = val; - dst_reg++; - } + // Write out output + return output; } - - template -sfpi_inline vFloat sfpu_asine_maclaurin_series(vFloat val) +sfpi_inline vFloat sfpu_cosine_maclaurin_series(vFloat val) { - // input for [-1:1] - // Mclauren series - // arcsin(x) = x + [(1/2) *x^3/3] + [(1 * 3) / (2 * 4) * x^5 / 5] + [(1 * 3 * 5) / (2 * 4 * 6) * x^7 / 7 ] + ... 
- // arcsin(x) ≈ x + (1/6) * x^3 + (3/40) * x^5 + (5/112) * x^7 + (35/1152) * x^9 + (63/2816) * x^11a - - vFloat tmp = val; - vFloat val_square = val * val; - // x - vFloat output = tmp; - // (1/6) * x^3 - tmp = tmp * val_square; - output += 0.166666666 * tmp; - // (3/40) * x^5 - tmp = tmp * val_square; - output += 0.075 * tmp; - - //(5/112) * x^7 - tmp = tmp * val_square; - output += 0.044642857 * tmp; - - // (35/1152) *x^9 - tmp = tmp * val_square; - output += 0.03038194 * tmp; - - //(63/2816) * x^11 - tmp = tmp * val_square; - output += 0.02237216 * tmp; + // Good for [-pi:pi] + // Mclauren series = 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8! - x^10/10! + x^12/12! + // 1 + vFloat output = 1.0f; + // x^2/2! + vFloat tmp = val*val; + output += -0.5*tmp; + // x^4/4! + tmp = tmp*val*val; + output += 0.0416666666*tmp; + // x^6/6! + tmp = tmp*val*val; + output += -0.0013888888*tmp; + if constexpr (not APPROXIMATION_MODE) { + // x^8/8! + tmp = tmp*val*val; + output += 0.0000248015*tmp; + // x^10/10! + tmp = tmp*val*val; + output += -0.0000002755*tmp; + } // Write out output return output; } - template -inline void calculate_asin() +inline void calculate_sine(const int iterations) { // SFPU microcode - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { vFloat v = dst_reg[0]; - v = sfpu_asine_maclaurin_series(v); + v = 0.318309886183791f*v; // *1/pi to get number of pi rads. 
+ vInt whole_v = float_to_int16(v); + vFloat whole_v_float = int32_to_float(whole_v, 0); + v = v - whole_v_float; + v *= 3.141592653589793f; // fractional * pi to get it in [-pi:pi] + v = sfpu_sine_maclaurin_series(v); + whole_v = whole_v & 0x1; + v_if(whole_v != 0) { + // odd so flip the sign + v *= -1; + } + v_endif; dst_reg[0] = v; dst_reg++; } } - - -#define PI_2 (1.570796326794) template -inline void calculate_acos() +inline void calculate_cosine(const int iterations) { // SFPU microcode - // acos = (pi/2 - asin) - for (int d = 0; d < ITERATIONS; d++) + for (int d = 0; d < iterations; d++) { vFloat v = dst_reg[0]; - v = sfpu_asine_maclaurin_series(v); - v = PI_2 - v; + v = 0.318309886183791f*v; // *1/pi to get number of pi rads. + vInt whole_v = float_to_int16(v); + vFloat whole_v_float = int32_to_float(whole_v, 0); + v = v - whole_v_float; + v *= 3.141592653589793f; // fractional * pi to get it in [-pi:pi] + v = sfpu_cosine_maclaurin_series(v); + whole_v = whole_v & 0x1; + v_if(whole_v != 0) { + // odd so flip the sign + v *= -1; + } + v_endif; dst_reg[0] = v; dst_reg++; } } - template -inline void cast_fp32_to_fp16a() +inline void relu_max(const int iterations, uint uint_threshold) { - #pragma GCC unroll 8 - for (int d = 0; d < ITERATIONS; d++) + vFloat threshold = s2vFloat16(uint_threshold, s2vFloat16::fp16a); + for (int d = 0; d < iterations; d++) { - //vFloat val = dst_reg[0]; - //dst_reg[0] = float_to_fp16a(val, 0); - TTI_SFPLOAD(0, 0, 3, 0); - TTI_SFP_STOCH_RND(0,0,0,0,0,8); - TTI_SFPSTORE(0,1,3,0); + vFloat a = dst_reg[0]; + v_if(a > threshold) { + a = threshold; + } + v_endif; + v_if(a < 0.0f) { + a = 0.0f; + } + v_endif; + dst_reg[0] = a; dst_reg++; } } - - - template -inline void calculate_negative() +inline void relu_min(const int iterations, uint uint_threshold) { - - for (int d = 0; d < ITERATIONS; d++) + vFloat threshold = s2vFloat16(uint_threshold, s2vFloat16::fp16a); + for (int d = 0; d < iterations; d++) { - vFloat val = dst_reg[0]; - 
dst_reg[0] = -val; + vFloat a = dst_reg[0]; + v_if(a < threshold) { + a = 0.0f; + } + v_endif; + dst_reg[0] = a; dst_reg++; } } - template -inline void calculate_add1() +inline void cast_fp32_to_fp16a(const int iterations) { - for (int d = 0; d < ITERATIONS; d++) + #pragma GCC unroll 8 + for (int d = 0; d < iterations; d++) { - vFloat val = dst_reg[0]; - dst_reg[0] = 1.0f + val; + //vFloat val = dst_reg[0]; + //dst_reg[0] = float_to_fp16a(val, 0); + TTI_SFPLOAD(0, 0, 3, 0); + TTI_SFP_STOCH_RND(0,0,0,0,0,8); + TTI_SFPSTORE(0,1,3,0); dst_reg++; } } -inline -vFloat sigmoid_piecewise_linear_positive(vFloat val) { - vFloat result = 0.0f; - v_if ( val >= +5.0f) { - result = 1.0f; - } v_elseif ( val > 1.0f && val < 5.0f ) { - result = POLYVAL5(0.00144462f, -0.01055479f, -0.01203685f, 0.24300185f, 0.50437757f,val); - } v_else { - result = 0.229f*val + 0.5f; // linear appx as y = 0.229x + 0.5 - } - v_endif; - return result; -} - -//sigmoid is anti-symmetric and offset by 1 -//sigmoid[-x] = 1 - sigmoid[x] template -inline void calculate_sigmoid() +inline void quant_int32(const int iterations, const uint dst_offset) { - for (int d = 0; d < ITERATIONS; d++) - { - vFloat val = dst_reg[0]; - vFloat result = 0.0f; - - v_if ( val < 0.0f ) { - val = -val; - } - v_endif; - - result = sigmoid_piecewise_linear_positive(val); - - val = dst_reg[0]; - v_if ( val < 0.0f ) { - result = 1.0f - result; - } - v_endif; - - dst_reg[0] = result; + // Operand A is input (fp32) + // Operand B is scaling factor (fp32) + // Operand C is zero-point constant (fp32) + // Output is int32 scaled to int8 range + #pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) { + // operand A - fp32 + TTI_SFPLOAD(0, 3, 3, 0); + // operand B - fp32 scaler + TT_SFPLOAD(1, 3, 3, dst_offset * 64); + // D(A) = A*B+C, LREG[2] = zero_point + TTI_SFPMAD(0, 1, 2, 0, 0); + // MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result + TTI_NOP; + // fp32->int8, descale value 
is zero (LREG_9) + TTI_SFP_STOCH_RND(0,0,9,0,0,3); + // LREG_0 -> dest as int32 + TTI_SFPSTORE(0,4,3,0); dst_reg++; } - - return; } template -inline void calculate_heaviside(uint value) +inline void requant_int32(const int iterations, const uint dst_offset) { - // SFPU microcode - Converter c_value; - c_value.u = value; - vFloat s = c_value.f; - - #pragma GCC unroll 0 - for (int d = 0; d < ITERATIONS; d++) { - vFloat v = dst_reg[0]; - - v_if (v < 0.0f) { - v = 0.0f; - }v_elseif (v > 0.0f) { - v = 1.0f; - }v_else { - v = s; - } - v_endif; - - dst_reg[0] = v; - + // Operand A is input to requant (int32) + // Operand B is scaling factor (fp32) + // Operand C is zero-point constant (fp32) + // Output is int32 scaled to int8 range + #pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) + { + // operand A - int32 + TTI_SFPLOAD(0, 4, 3, 0); + // operand B - fp32 scaler + TT_SFPLOAD(1, 3, 3, dst_offset*64); + // cast int32->fp32 + TTI_SFPCAST(0, 0, 0); + // D(A) = A*B+C, LREG[2] = zero_point + TTI_SFPMAD(0, 1, 2, 0, 0); + // MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result + TTI_NOP; + // fp32->int8, descale value is zero (LREG_9) + TTI_SFP_STOCH_RND(0,0,9,0,0,3); + // LREG_0 -> dest as int32 + TTI_SFPSTORE(0,4,3,0); dst_reg++; } } template -inline void calculate_silu() +inline void dequant_int32(const int iterations, const uint dst_offset) { - // SFPU microcode + // Operand A[LREG0] is input to dequant (int32) + // Operand B[LREG1] is scaling factor (fp32) + // Operand C[LREG2] is zero-point constant (fp32) + // Output = (A + (-C)) * B (fp32) + #pragma GCC unroll 8 for (int d = 0; d < ITERATIONS; d++) { - vFloat val = dst_reg[0]; - v_if ( val < 0.0f ) { - val = -val; - } - v_endif; - - vFloat result = sigmoid_piecewise_linear_positive(val); - - val = dst_reg[0]; - v_if ( val < 0.0f ) { - result = 1.0f - result; - } - v_endif; - result = val * result; - dst_reg[0] = result; + // operand A - int32 + TTI_SFPLOAD(0, 
4, 3, 0); + // operand B - fp32 scaler + TT_SFPLOAD(1, 3, 3, dst_offset*64); + // cast int32->fp32 + TTI_SFPCAST(0, 0, 0); + // D(A)) = A+(-C), LREG[10] is 1, SFPADD = LREG_A*LREG_B+LREG_C + TTI_SFPADD(0,10,2,0,0); + TTI_NOP; + // D(A)) = (A+(-C))*B, LREG[9] is zero + TTI_SFPMUL(0,1,9,0,0); + TTI_NOP; + // LREG_0 -> dest as fp32 + TTI_SFPSTORE(0,3,3,0); dst_reg++; } } @@ -945,53 +1359,56 @@ inline void calculate_mask() } } - -template -inline void calculate_sfpu(uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) +template +inline void calculate_sfpu(const int iterations = ITERATIONS, uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { - if constexpr (operation == SfpuType::exp_with_base) { - constexpr bool zero_negative = true; - calculate_exponential(param0); + if constexpr (operation == SfpuType::exponential) { + calculate_exponential(iterations, param0); + } + else if constexpr (operation == SfpuType::exp_with_base) { + calculate_exponential(iterations, param0); } else if constexpr (operation == SfpuType::tanh) { - calculate_tanh(); + calculate_tanh(iterations); } else if constexpr (operation == SfpuType::hardtanh) { - calculate_hardtanh(param0, param1, param2); - } - else if constexpr (operation == SfpuType::rsqrt) { - //param0 = true -> approximate fast mode - // false -> high precision mode - // The algorithm uses Newton's method based on no.of iteration better approximation can be calculated - if ( param0 ) { - calculate_rsqrt(); - } else { - calculate_rsqrt(); - } + calculate_hardtanh(iterations, param0, param1, param2); + } + else if constexpr (operation == SfpuType::gelu) { + calculate_gelu(iterations); + } + else if constexpr (operation == SfpuType::reciprocal) { + calculate_reciprocal(iterations); } else if constexpr (operation == SfpuType::sigmoid) { - calculate_sigmoid(); + calculate_sigmoid(iterations); } - else if constexpr (operation == 
SfpuType::sigmoid_appx) { - calculate_sigmoid_appx(); + else if constexpr (operation == SfpuType::sqrt) { + calculate_sqrt(iterations); } else if constexpr (operation == SfpuType::tanh_derivative) { - calculate_tanh_derivative(); + calculate_tanh_derivative(iterations); + } + else if constexpr (operation == SfpuType::lrelu) { + calculate_lrelu(iterations, param0); } else if constexpr (operation == SfpuType::dropout) { - calculate_dropout(param0, param1); + calculate_dropout(iterations, param0, param1); } else if constexpr (operation == SfpuType::power) { - calculate_power_iterative(param0); + calculate_power(iterations, param0); } else if constexpr (operation == SfpuType::square) { - calculate_square(); + calculate_square(iterations); } else if constexpr (operation == SfpuType::log) { - calculate_log(param0); + calculate_log(iterations, param0); } else if constexpr (operation == SfpuType::log_with_base) { - calculate_log(param0); + calculate_log(iterations, param0); + } + else if constexpr (operation == SfpuType::gelu_derivative) { + calculate_gelu_derivative(iterations); } else if constexpr ((operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) || @@ -999,46 +1416,46 @@ inline void calculate_sfpu(uint param0 = 0, uint param1 = 0, uint param2 = 0, ui (operation == SfpuType::greater_than_equal_zero) || (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) { - calculate_comp(8); //BFLOAT16 - exp + calculate_comp(iterations, param5); } else if constexpr (operation == SfpuType::clamp) { - calculate_clamp(param0, param1, param2); + calculate_clamp(iterations, param0, param1, param2); } else if constexpr (operation == SfpuType::abs) { - calculate_abs(); + calculate_abs(iterations); } else if constexpr (operation == SfpuType::sign) { - calculate_sign(); + calculate_sign(iterations, param5); } else if constexpr (operation == SfpuType::max) { - calculate_max(); - } - else if constexpr (operation == 
SfpuType::min) { - calculate_min(); + if constexpr (IS_INT_SFPU_EN) + calculate_max_int32(iterations); + else + calculate_max(iterations); } - else if constexpr (operation == SfpuType::exp2) { - calculate_exp2(); + else if constexpr (operation == SfpuType::sine) { + calculate_sine(iterations); } - else if constexpr (operation == SfpuType::heaviside) { - calculate_heaviside(param0); + else if constexpr (operation == SfpuType::cosine) { + calculate_cosine(iterations); } - else if constexpr (operation == SfpuType::expm1) { - calculate_expm1(); + else if constexpr (operation == SfpuType::relu_min) { + relu_min(iterations, param0); } - else if constexpr (operation == SfpuType::asin) { - calculate_asin(); + else if constexpr (operation == SfpuType::relu_max) { + relu_max(iterations, param0); } - else if constexpr (operation == SfpuType::acos) { - calculate_acos(); + else if constexpr (operation == SfpuType::cast_fp32_to_fp16a) { + cast_fp32_to_fp16a(iterations); } - else if constexpr (operation == SfpuType::atan) { - calculate_atan(); + else if constexpr (operation == SfpuType::quant_int32) { + quant_int32(iterations, param0); } - else if constexpr (operation == SfpuType::signbit) { - calculate_signbit(); + else if constexpr (operation == SfpuType::requant_int32) { + requant_int32(iterations, param0); } - else if constexpr (operation == SfpuType::silu) { - calculate_silu(); + else if constexpr (operation == SfpuType::dequant_int32) { + dequant_int32(iterations, param0); } else if constexpr (operation == SfpuType::mask) { calculate_mask(); diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h index 58ef2c3bb6a..a8134eb8d47 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h @@ -4,9 +4,6 @@ #pragma once -#include "circular_buffer.h" -#include "hostdevcommon/kernel_structs.h" - namespace ckernel { @@ 
-14,13 +11,14 @@ namespace ckernel struct semaphore { constexpr static uint32_t MATH_PACK = 1; // math <-> pack sync on dest register - constexpr static uint32_t UNPACK_PACK = 2; // pack <-> unpack sync on scratch buffer + constexpr static uint32_t UNPACK_TO_DEST = 2; // unpack <-> math sync on unpack to dest constexpr static uint32_t UNPACK_OPERAND_SYNC = 3; // unpack <-> pack, math sync on operand get/release constexpr static uint32_t PACK_DONE = 4; // Wait for beinning and end of each pack-iteration. For recording perf events and inserting delay. constexpr static uint32_t UNPACK_SYNC = 5; // trisc <-> unpack sync on hw kernel // Wait for beinning and end of each unpack or math iteration. For recording perf events and inserting delay. // This semaphore should only be used for either unpack or math. Not both at the same time. constexpr static uint32_t UNPACK_MATH_DONE = 6; + constexpr static uint32_t MATH_DONE = 7; // wait for math to finish when unpacking to dest constexpr static uint16_t t6_sem(const uint8_t sem_index) { @@ -46,79 +44,4 @@ enum firmware_msg_e SET_PERF_SCRATCH = 4 }; -constexpr uint8_t OPERAND_BASE_REG = 16; // base register used for operand storage -constexpr uint8_t OUTPUT_BASE_REG = 16; // base register used for output storage - -typedef struct { - uint32_t fifo_rd_ptr; - uint32_t fifo_limit; - uint16_t tiles_acked; - uint16_t accumulation_buffer; - uint32_t words_acked; - uint32_t fifo_size; - uint16_t blocks_per_iter; // total number of ublocks popped from interm buffer per input - uint16_t curr_block; // current number of ublocks popped per input - uint16_t num_iter; // total number of passes through the interm buffer per input - uint16_t curr_iter; // current numer of passes through the interm buffer per input - uint32_t fifo_rd_base_ptr; - uint32_t tile_size_words; -} operand_t; - -static_assert(sizeof(operand_t) == (sizeof(uint32_t) * 9)); - -typedef union { - operand_t f; - uint32_t val[9]; -} operand_u; - -typedef struct { - uint32_t 
fifo_wr_ptr; - uint32_t fifo_limit; - uint32_t fifo_size; - uint32_t fifo_num_pages; - uint32_t fifo_wr_base_ptr; - uint16_t fifo_wr_tile_ptr; - uint16_t tiles_received; - uint32_t dram_output_no_push; - uint16_t tile_size_words; - bool legacy_pack; - uint8_t fork; - uint8_t num_fork_streams; - bool shared_buffer; // interm buffer is shared with output - uint8_t shared_buffer_operand; //shared buffer output operand - bool accumulation_buffer; // interm buffer used for accumulation - uint8_t fork_stream_ids[16]; - union { - uint16_t ublock_ct; //ublock ct dim in tiles - uint16_t out_tile_dim; //output block dim in tiles - }; - union { - uint16_t ublock_tile_dim; //number of tiles in ublock for untilized output - uint16_t blocks_per_iter; //total number of ublocks pushed to interm buffer per input - }; - union { - uint16_t row_tile_dim; //one row of tiles - }; - union { - uint16_t block_tile_dim; //one row of ublocks for untilized output - uint16_t num_iter; //total number of passes through the interm buffer per input - }; - union { - uint16_t ublock_tile_cnt; - uint16_t curr_block; //current number of ublocks pushed to interm buffer per input - }; - union { - uint16_t block_tile_cnt; //current number of packed tiles for untilized output - uint16_t curr_iter; // current numer of passes through the interm buffer per input - }; -} output_t; - -static_assert(sizeof(output_t) == (sizeof(uint32_t) * 16)); - -typedef union { - output_t f; - uint32_t val[16]; -} output_u; - - } // namespace ckelimitrnel diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_io_headers.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_io_headers.h deleted file mode 100644 index b5bb5b1cbcd..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_io_headers.h +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - - -#include "ckernel_defs.h" - -// -// Receiving from a stream input -// - -// Setup pipe receiving data over stream -inline void llk_setup_input_operand(src_op_id_e operand); - -// Wait for N tiles available in the incoming stream -inline void llk_wait_tiles(src_op_id_e operand, std::uint32_t num_tiles); - -// Pop N tiles from the incoming stream -inline void llk_pop_tiles(src_op_id_e operand, std::uint32_t num_tiles); - -// -// Receiving from a local buffer -// - -// Setup pipe for receiving data over local buffer -inline void llk_setup_local_operand(src_op_id_e operand); - -// Wait for N tiles available in the local buffer -inline void llk_wait_local_tiles(src_op_id_e operand, std::uint32_t num_tiles); - -// Pop N tiles from the incoming stream -inline void llk_pop_local_tiles(src_op_id_e operand, std::uint32_t num_tiles); - -// -// Write to stream output -// - -// Setup pipe for writing output data to stream buffer -inline void llk_setup_output(out_op_id_e output); - -// Blockig call to wait for free space needed to pack N tiles -inline void llk_wait_for_free_tiles(out_op_id_e output, std::uint32_t num_tiles); - -// Push N tiles to stream buffer (increment write pointer) -inline void llk_push_tiles(out_op_id_e output, std::uint32_t num_tiles); - -// -// Write to local output -// - -// Setup pipe for writing output data to local output -inline void llk_setup_local_output(out_op_id_e output); - -// Blockig call to wait for free space needed to pack N tiles -inline void llk_wait_for_free_tiles(out_op_id_e output, std::uint32_t num_tiles); - -// Push N tiles to stream buffer (increment write pointer) -inline void llk_push_tiles(out_op_id_e output, std::uint32_t num_tiles); diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_math_headers.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_math_headers.h deleted file mode 100644 index 6782ceb8cb4..00000000000 --- 
a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_math_headers.h +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - - -#include "ckernel_defs.h" -#include "tensix_types.h" - - -// -// LLK math common -// - -template -inline void llk_math_wait_for_dest_available(); - -inline void llk_math_dest_section_done(); - -template -inline void llk_math_clear_dst(uint tile_index); - -template -inline void llk_math_set_dest_section_base(); - -template -inline void llk_math_set_dest_section_flip(); - -// -// LLK matrix multiplication -// -inline void llk_math_mmul_init(); -inline void llk_math_mmul(); - -// -// LLK Eltwise binary -// -template -inline void llk_math_eltwise_binary(); -template -inline void llk_math_eltwise_binary_init(); - -// -// LLK Eltwise unary sfpu -// -template -inline void llk_math_eltwise_unary_sfpu(); -template -inline void llk_math_eltwise_unary_sfpu_init(); - -// -// LLK Eltwise unary datacopy -// -template -inline void llk_math_eltwise_unary_datacopy(); -template -inline void llk_math_eltwise_unary_datacopy_init(); diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_packer_headers.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_packer_headers.h deleted file mode 100644 index a2adf15f705..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_packer_headers.h +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - - -#include "ckernel_defs.h" -#include "tensix_types.h" - -// MT: is it only formats for packer ?? 
-inline void llk_pack_hw_configure(DataFormat unpack_src, DataFormat unpack_dst); - -template -inline void llk_pack_wait_for_dest_available(); - -template -inline void llk_pack_set_dest_base(); - -template -inline void llk_pack_dest_section_done(); - -template -inline void llk_math_clear_dst(); - -// -// LLK pack tile to output stream - using row tables -// -inline void llk_pack_stream_row_tables_init(); -inline void llk_pack_stream_row_tables(std::uint32_t dst_tile_index); - -// -// LLK pack tile to output stream - using tile tables -// -inline void llk_pack_stream_tile_tables_init(); -inline void llk_pack_stream_tile_tables(std::uint32_t dst_tile_index); - - -// -// LLK pack tile to local L1 buffer - using row tables -// -inline void llk_pack_local_row_tables_init(); -inline void llk_pack_local_row_tables(std::uint32_t dst_tile_index); diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_unpack_headers.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_unpack_headers.h deleted file mode 100644 index fef79486088..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cllk_unpack_headers.h +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - - - -// -// LLK unpack tile A -// -inline void llk_unpack_A_hw_config(llk_unpack_A_params_t params); -inline void llk_unpack_A_init(); -inline void llk_unpack_A(std::uint32_t tile_index); - -// -// LLK unpack tile B -// -inline void llk_unpack_B_hw_config(llk_unpack_B_params_t params); -inline void llk_unpack_B_init(); -inline void llk_unpack_B(std::uint32_t tile_index); - -// -// LLK unpack tiles AB -// -inline void llk_unpack_AB_hw_config(llk_unpack_AB_params_t params); -inline void llk_unpack_AB_init(); -inline void llk_unpack_AB(std::uint32_t tile_index_a, std::uint32_t tile_index_b); diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h index aaa08f8eed0..fa97031b17a 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h @@ -7,11 +7,8 @@ //#include "kernel_types.h" #include "ckernel.h" #include "ckernel_template.h" -#include "ckernel_sfpu.h" #include "ckernel_globals.h" - -#include "debug/fw_debug.h" -#include "debug/status.h" +#include "llk_defs.h" #ifndef SFPU_OP_PARAM #define SFPU_OP_PARAM 0 @@ -21,8 +18,6 @@ #define FUSE_SQRT_RECIP 0 #endif -#define EPS 1.19209e-07 //std::numeric_limits::epsilon() for FP32 - using namespace ckernel; namespace ckernel::math @@ -145,9 +140,7 @@ inline uint32_t get_dest_buffer_base() inline void wait_math_semaphores() { // wait while math semaphore is on max, no room to write math results - DEBUG_STATUS('W', 'M', 'S', 'W'); TTI_SEMWAIT(p_stall::STALL_MATH|p_stall::STALL_SFPU, semaphore::t6_sem(semaphore::MATH_PACK), p_stall::STALL_ON_MAX); - DEBUG_STATUS('W', 'M', 'S', 'D'); } inline void set_math_semaphores() @@ -156,13 +149,31 @@ inline void set_math_semaphores() t6_semaphore_post(semaphore::MATH_PACK); } -template +inline void math_unpack_to_dest_math_ready() +{ + t6_semaphore_wait_on_max(semaphore::MATH_DONE); + 
t6_semaphore_post(semaphore::MATH_DONE); + while (semaphore_read(semaphore::MATH_DONE) == 0) {} + semaphore_get(semaphore::MATH_DONE); +} + +inline void math_unpack_to_dest_tile_ready() +{ + t6_semaphore_wait_on_zero(semaphore::UNPACK_TO_DEST); + t6_semaphore_get(semaphore::UNPACK_TO_DEST); +} + +template inline void set_dst_write_addr(uint32_t tile_index) { if constexpr (layout == DstTileLayout::Default) { uint dst_index = tile_index << DstTileSizeLog2[tile_shape]; dst_index = dst_index + get_dest_buffer_base(); - TT_SETC16(DEST_TARGET_REG_CFG_MATH_Offset_ADDR32, dst_index); + if constexpr (unpack_to_dest) { + mailbox_write(ThreadId::UnpackThreadId, dst_index); // Send to unpacker + } else { + TT_SETC16(DEST_TARGET_REG_CFG_MATH_Offset_ADDR32, dst_index); + } } else { // FIXME MT: add this mapping for other layout } @@ -188,9 +199,8 @@ inline void clear_addr_mod_base() inline void math_dest_wait() { - DEBUG_STATUS('W', 'D', 'S', 'W'); + FWLOG0("XX math_full_dest_sync()->wait for whole dest available"); TTI_SEMWAIT(p_stall::STALL_MATH|p_stall::STALL_SFPU, semaphore::t6_sem(semaphore::MATH_PACK), p_stall::STALL_ON_MAX); - DEBUG_STATUS('W', 'D', 'S', 'D'); } inline void dest_section_flip() @@ -213,34 +223,11 @@ inline void set_dest_section_base() TT_SETC16(DEST_TARGET_REG_CFG_MATH_Offset_ADDR32, base_addr); } -inline uint32_t get_operand_id(uint32_t operand) -{ - const int INTERMEDIATE_BASE_ID = 24; - const int OPERAND_BASE_ID = 0; - return (operand>=INTERMEDIATE_BASE_ID) ? operand - 8 : operand - OPERAND_BASE_ID; -} - - -// FIXME: Added this --> Should be generated by compile trisc? 
-constexpr std::uint32_t math_tile_dims[32][2] = { - {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, - {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, - {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, - {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32} -}; -constexpr std::uint32_t math_tile_num_faces[32] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; -constexpr std::uint32_t math_partial_face[32] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - -inline constexpr uint32_t get_num_faces(const std::uint32_t operand_id) -{ - return math_tile_num_faces[operand_id]; -} - -inline constexpr uint32_t get_partial_face(const std::uint32_t operand_id) -{ - return math_partial_face[operand_id]; +inline constexpr bool is_32bit_input(const std::uint32_t src_format, const std::uint32_t dst_format) { + const uint input_df = src_format; + const uint output_df = dst_format; + return ((input_df == (uint)DataFormat::Int32) || (input_df == (uint)DataFormat::Float32)) && + ((output_df == (uint)DataFormat::Int32) || (output_df == (uint)DataFormat::Float32)); } } // namespace ckernel::math diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h index 57990f84c94..bdc0b6b5063 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h @@ -7,13 +7,11 @@ #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_globals.h" -#include "debug/fw_debug.h" +#include "llk_defs.h" namespace ckernel::packer { - constexpr uint32_t OUTPUT_BASE = 0; - constexpr uint32_t OUTPUT_BASE_ID = 16; constexpr uint32_t PACK_CNT = 4; @@ -137,47 +135,6 @@ namespace ckernel::packer pack_counters_t f; } pack_counters_u; - - 
// FIXME: Added this --> Should be generated by compile trisc? - constexpr std::uint32_t pack_tile_dims[32][2] = { - {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, - {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, - {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, - {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32}, {32, 32} - }; - constexpr std::uint32_t pack_tile_num_faces[32] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; - constexpr std::uint32_t pack_tile_face_r_dim[32] = { - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; - constexpr std::uint32_t pack_partial_face[32] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - constexpr std::uint32_t pack_narrow_tile[32] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - inline const uint32_t get_num_faces(const std::uint32_t output_id) //FIXME: why we have to always inline - { - return pack_tile_num_faces[output_id]; - } - - inline const uint32_t get_face_r_dim(const std::uint32_t output_id) - { - return pack_tile_face_r_dim[output_id]; - } - - inline const uint32_t get_tile_c_dim(const std::uint32_t output_id) - { - return pack_tile_dims[output_id][TileDim::C_IDX]; - } - - inline constexpr uint32_t get_partial_face(const std::uint32_t operand_id) - { - return pack_partial_face[operand_id]; - } - - inline constexpr uint32_t get_narrow_tile(const std::uint32_t operand_id) - { - return pack_narrow_tile[operand_id]; - } - // Set unpacker offsets to 0, except for unpacker 0, channel 1, X, which is the tile X dimension inline void packer_addr_counter_init() { @@ -185,15 +142,15 @@ namespace ckernel::packer TTI_SETADCZW(0b100, 0, 0, 0, 0, 0b1111); } - inline void 
set_packer_strides(const uint output_id){ + inline void set_packer_strides(const uint pack_src_format, const uint pack_dst_format){ // Get pointer to registers for current state ID volatile uint tt_reg_ptr *cfg = get_cfg_pointer(); - uint x_stride = (uint)(pack_src_format[output_id]&0x3) == (uint)DataFormat::Float32 ? 4 : - (uint)(pack_src_format[output_id]&0x3) == (uint)DataFormat::Float16 ? 2 : 1; - uint y_stride = 16*x_stride; - uint z_stride = PACK_CNT*16*y_stride; + uint x_stride = (uint)(pack_src_format&0x3) == (uint)DataFormat::Float32 ? 4 : + (uint)(pack_src_format&0x3) == (uint)DataFormat::Float16 ? 2 : 1; + uint y_stride = FACE_R_DIM*x_stride; + uint z_stride = PACK_CNT*FACE_C_DIM*y_stride; uint w_stride = z_stride; TT_SETDMAREG(0, LOWER_HALFWORD((y_stride< - inline void set_packer_config(const uint output_id){ + inline void set_packer_config(const uint pack_src_format, const uint pack_dst_format, const uint num_faces = 4, const bool partial_face = false){ // Get pointer to registers for current state ID volatile uint tt_reg_ptr *cfg = get_cfg_pointer(); - const uint num_faces = get_num_faces(output_id); - const bool partial_face = get_partial_face(output_id); - // Set packer config pack_config_u config; for (uint i=0; i<4; i++) { config.val[i] = 0; } - config.f.exp_section_size = (((uint)pack_dst_format[output_id] == (uint)DataFormat::Lf8) || - ((uint)pack_dst_format[output_id] == (uint)DataFormat::Int8)) ? 0 : (partial_face ? 1 : num_faces); // set to num_faces as exp section size is not used for non-bfp formats except for lf8/int8 + config.f.exp_section_size = ((pack_dst_format == (uint)DataFormat::Lf8) || + (pack_dst_format == (uint)DataFormat::Int8)) ? 0 : (partial_face ? 
1 : num_faces); // set to num_faces as exp section size is not used for non-bfp formats except for lf8/int8 config.f.uncompress = 1; - config.f.out_data_format = (uint)pack_dst_format[output_id]; - config.f.in_data_format = (uint)pack_src_format[output_id]; + config.f.out_data_format = pack_dst_format; + config.f.in_data_format = pack_src_format; config.f.pack_per_xy_plane = 1; // Workaround for bug in HW: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1394 if constexpr (is_fp32_dest_acc_en) { - if (IS_BFP_A_FORMAT((uint)pack_dst_format[output_id])) { + if (IS_A_FORMAT(pack_dst_format)) { config.f.exp_threshold_en = 1; config.f.exp_threshold = 113; } @@ -276,27 +230,35 @@ namespace ckernel::packer dest_rd_ctrl_u dest_rd_ctrl; dest_rd_ctrl.val = 0; - dest_rd_ctrl.f.PCK_DEST_RD_CTRL_Read_32b_data = ((uint)pack_src_format[output_id] == (uint)DataFormat::Int8) | (is_fp32_dest_acc_en ? 1 : 0); + dest_rd_ctrl.f.PCK_DEST_RD_CTRL_Read_32b_data = (pack_src_format == (uint)DataFormat::Int8) | + (pack_src_format == (uint)DataFormat::Int32) | + (pack_src_format == (uint)DataFormat::Float32) | + (is_fp32_dest_acc_en ? 1 : 0); + + //Round to 10 bit mantissa from fp32 dest + if(is_fp32_dest_acc_en && (pack_src_format!=(uint)DataFormat::Float32)) { + dest_rd_ctrl.f.PCK_DEST_RD_CTRL_Round_10b_mant = 1; + } cfg[PCK_DEST_RD_CTRL_Read_32b_data_ADDR32] = dest_rd_ctrl.val; - if (IS_BFP_FORMAT(pack_dst_format[output_id])) { + if (IS_BFP_FORMAT(pack_dst_format)) { // Override exp section size for packers 1,2,3 // Tile header + exp size + datum size - if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp8 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp8_b) { + if ((uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp8 || (uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp8_b) { config.f.exp_section_size = 1 + ((num_faces>2) ? 
2 : 0) + 16; cfg[THCON_SEC0_REG8_Row_start_section_size_ADDR32+0]=config.val[0]; config.f.exp_section_size = 1 + 1 + 32; cfg[THCON_SEC1_REG1_Row_start_section_size_ADDR32+0]=config.val[0]; config.f.exp_section_size = 1 + 0 + 48; cfg[THCON_SEC1_REG8_Row_start_section_size_ADDR32+0]=config.val[0]; - } else if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp4 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp4_b) { + } else if ((uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp4 || (uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp4_b) { config.f.exp_section_size = 1 + ((num_faces>2) ? 2 : 0) + 8; cfg[THCON_SEC0_REG8_Row_start_section_size_ADDR32+0]=config.val[0]; config.f.exp_section_size = 1 + 1 + 16; cfg[THCON_SEC1_REG1_Row_start_section_size_ADDR32+0]=config.val[0]; config.f.exp_section_size = 1 + 0 + 24; cfg[THCON_SEC1_REG8_Row_start_section_size_ADDR32+0]=config.val[0]; - } else if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp2 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp2_b) { + } else if ((uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp2 || (uint)(pack_dst_format&0x1F) == (uint)DataFormat::Bfp2_b) { config.f.exp_section_size = 1 + ((num_faces>2) ? 2 : 0) + 4; cfg[THCON_SEC0_REG8_Row_start_section_size_ADDR32+0]=config.val[0]; config.f.exp_section_size = 1 + 1 + 8; @@ -310,7 +272,7 @@ namespace ckernel::packer } // Save to GPR for quick data format reconfig - regfile[p_gpr_pack::EXP0_SEC_SIZE_BFP] = (num_faces) << THCON_SEC0_REG8_Exp_section_size_SHAMT; + regfile[p_gpr_pack::EXP0_SEC_SIZE_BFP] = (partial_face ? 1 : num_faces) << THCON_SEC0_REG8_Exp_section_size_SHAMT; regfile[p_gpr_pack::EXP1_SEC_SIZE_BFP8] = (1 + ((num_faces>2) ? 
2 : 0) + 16) << THCON_SEC0_REG8_Exp_section_size_SHAMT; regfile[p_gpr_pack::EXP2_SEC_SIZE_BFP8] = (1 + 1 + 32) << THCON_SEC0_REG8_Exp_section_size_SHAMT; regfile[p_gpr_pack::EXP3_SEC_SIZE_BFP8] = (1 + 0 + 48) << THCON_SEC0_REG8_Exp_section_size_SHAMT; @@ -323,13 +285,12 @@ namespace ckernel::packer sync_regfile_write(p_gpr_pack::EXP3_SEC_SIZE_BFP2); } - inline void set_packer_l1_offset(const uint output_id){ + inline void set_packer_l1_offset(const uint pack_dst_format, const uint face_r_dim = FACE_R_DIM){ - const uint face_r_dim = get_face_r_dim(output_id); const uint face_dim = face_r_dim * FACE_C_DIM; - uint32_t l1_offset_1 = IS_BFP_FORMAT(pack_dst_format[output_id]) ? 1 : (((uint8_t)(pack_dst_format[output_id]&0x3) == (uint8_t)DataFormat::Float32) ? (face_dim/16)*4 : - ((uint8_t)(pack_dst_format[output_id]&0x3) == (uint8_t)DataFormat::Float16) ? (face_dim/16)*2 : (face_dim/16)); + uint32_t l1_offset_1 = IS_BFP_FORMAT(pack_dst_format) ? 1 : (((uint8_t)(pack_dst_format&0x3) == (uint8_t)DataFormat::Float32) ? (face_dim/16)*4 : + ((uint8_t)(pack_dst_format&0x3) == (uint8_t)DataFormat::Float16) ? 
(face_dim/16)*2 : (face_dim/16)); uint32_t l1_offset_2 = 2 * l1_offset_1; uint32_t l1_offset_3 = 3 * l1_offset_1; @@ -351,7 +312,11 @@ namespace ckernel::packer template - inline void reconfig_packer_data_format(const uint output_id) + inline void reconfig_packer_data_format( + const uint pack_src_format, + const uint pack_dst_format, + const uint tile_size, + const uint face_r_dim = FACE_R_DIM) { // Get pointer to registers for current state ID volatile uint *cfg = get_cfg_pointer(); @@ -361,35 +326,35 @@ namespace ckernel::packer config.val[2] = 0; // Only need to modify word[2][15:0] config.f.uncompress = 1; - config.f.out_data_format = (uint)pack_dst_format[output_id]; - config.f.in_data_format = (uint)pack_src_format[output_id]; + config.f.out_data_format = pack_dst_format; + config.f.in_data_format = pack_src_format; TT_SETDMAREG(0, LOWER_HALFWORD(config.val[2]), 0, LO_16(p_gpr_pack::TMP_LO)); TTI_REG2FLOP(2,0,0,0,THCON_SEC0_REG1_Row_start_section_size_ADDR32+2-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_LO); //16-bit write TTI_REG2FLOP(2,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+2-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_LO); TTI_REG2FLOP(2,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+2-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_LO); TTI_REG2FLOP(2,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+2-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_LO); - if (IS_BFP_FORMAT(pack_dst_format[output_id])) { + if (IS_BFP_FORMAT(pack_dst_format)) { // Override exp section size for packers 1,2,3 // Tile header + exp size + datum size TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP0_SEC_SIZE_BFP); - if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp8 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp8_b) { + if ((pack_dst_format&0x1F) == (uint)DataFormat::Bfp8 || (pack_dst_format&0x1F) == (uint)DataFormat::Bfp8_b) { 
TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP1_SEC_SIZE_BFP8); TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP2_SEC_SIZE_BFP8); TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP3_SEC_SIZE_BFP8); - } else if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp4 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp4_b) { + } else if ((pack_dst_format&0x1F) == (uint)DataFormat::Bfp4 || (pack_dst_format&0x1F) == (uint)DataFormat::Bfp4_b) { TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP1_SEC_SIZE_BFP4); TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP2_SEC_SIZE_BFP4); TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP3_SEC_SIZE_BFP4); - } else if ((uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp2 || (uint)(pack_dst_format[output_id]&0x1F) == (uint)DataFormat::Bfp2_b) { + } else if ((pack_dst_format&0x1F) == (uint)DataFormat::Bfp2 || (pack_dst_format&0x1F) == (uint)DataFormat::Bfp2_b) { TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP1_SEC_SIZE_BFP2); TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP2_SEC_SIZE_BFP2); TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::EXP3_SEC_SIZE_BFP2); } else { FWASSERT("Other data formats not supported", false); } - } else if (((uint)pack_dst_format[output_id] == (uint)DataFormat::Lf8) || - ((uint)pack_dst_format[output_id] == (uint)DataFormat::Int8)) { + } else if ((pack_dst_format == (uint)DataFormat::Lf8) || + (pack_dst_format == 
(uint)DataFormat::Int8)) { TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO); TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO); TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO); @@ -397,59 +362,73 @@ namespace ckernel::packer } // Set l1 address offset - set_packer_l1_offset(output_id); + set_packer_l1_offset(pack_dst_format, face_r_dim); - TT_SETDMAREG(0, LOWER_HALFWORD((std::uint32_t)cb_interface[output_id].fifo_page_size), 0, LO_16(p_gpr_pack::TILE_HEADER)); + TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_pack::TILE_HEADER)); // Workaround for HW bug: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1394 if constexpr (is_fp32_dest_acc_en) { - if (IS_BFP_A_FORMAT((uint)pack_dst_format[output_id])) { + if (IS_BFP_A_FORMAT(pack_dst_format)) { config.val[3] = 0; // Only need to modify word[2][15:0] config.f.exp_threshold_en = 1; config.f.exp_threshold = 113; TT_SETDMAREG(0, UPPER_HALFWORD(config.val[3]), 0, HI_16(p_gpr_pack::TMP_HI)); TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_HI); + TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_HI); + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_HI); + TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_HI); } else { TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO); + TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO); + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO); + 
TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+3-THCON_CFGREG_BASE_ADDR32, p_gpr::ZERO); } } // Flush packer pipeline before strides gasket alu format change TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::PACK); - cfg_reg_rmw_tensix(pack_src_format[output_id]); + cfg_reg_rmw_tensix(pack_src_format); tensix_sync(); //FIXME: why stallwait on cfg write doesn't work! // Set packer strides - set_packer_strides(output_id); + set_packer_strides(pack_src_format, pack_dst_format); } template - inline void configure_pack(uint pack_output_id, uint relu_config = 0) + inline void configure_pack( + const uint pack_src_format, + const uint pack_dst_format, + const uint tile_size, + const uint face_r_dim = FACE_R_DIM, + const uint num_faces = 4, + const bool partial_face = false, + const bool narrow_tile = false, + const uint relu_config = 0) { // Get pointer to registers for current state ID volatile uint *cfg = get_cfg_pointer(); - if (pack_src_format[pack_output_id] != pack_dst_format[pack_output_id]) { + if (pack_src_format != pack_dst_format) { TTI_STALLWAIT(p_stall::STALL_PACK, p_stall::PACK); tensix_sync(); } - set_packer_strides(pack_output_id); + set_packer_strides(pack_src_format, pack_dst_format); t6_mutex_acquire(mutex::REG_RMW); - uint alu_dst_format = pack_src_format[pack_output_id]; + const uint alu_dst_format = pack_src_format; cfg_reg_rmw_tensix(alu_dst_format); t6_mutex_release(mutex::REG_RMW); - set_packer_config(pack_output_id); + set_packer_config(pack_src_format, pack_dst_format, num_faces, partial_face); - set_packer_l1_offset(pack_output_id); + set_packer_l1_offset(pack_dst_format, face_r_dim); // PACK_COUNTERS_SEC0_pack_per_xy_plane = cfg_reg_array[3][0 +: 8]; // PACK_COUNTERS_SEC0_pack_reads_per_xy_plane = cfg_reg_array[3][8 +: 8]; @@ -457,7 +436,7 @@ namespace ckernel::packer // PACK_COUNTERS_SEC0_pack_yz_transposed = cfg_reg_array[3][23 +: 1]; pack_counters_u pack_counters; pack_counters.val = 0; - pack_counters.f.pack_reads_per_xy_plane 
= get_face_r_dim(pack_output_id); // Number of reads per face + pack_counters.f.pack_reads_per_xy_plane = face_r_dim; // Number of reads per face // Used for resetting tile posistion generator for edge masks for (uint i=0; i<4; i++) cfg[PACK_COUNTERS_SEC0_pack_per_xy_plane_ADDR32+i]=pack_counters.val; // disable auto last generation @@ -468,7 +447,7 @@ namespace ckernel::packer cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32]=pck_edge_offset.val; cfg[TILE_ROW_SET_MAPPING_0_row_set_mapping_0_ADDR32] = 0x0; // All packers use row set mapping 0, edge offset 0 mask - regfile[p_gpr_pack::TILE_HEADER] = (std::uint32_t)cb_interface[pack_output_id].fifo_page_size; + regfile[p_gpr_pack::TILE_HEADER] = tile_size; regfile[p_gpr_pack::TILE_HEADER+1] = 0; regfile[p_gpr_pack::TILE_HEADER+2] = 0; regfile[p_gpr_pack::TILE_HEADER+3] = 0; @@ -484,74 +463,13 @@ namespace ckernel::packer cfg[STACC_RELU_ApplyRelu_ADDR32] = hw_relu_config.val[0]; - const uint face_r_dim = get_face_r_dim(pack_output_id); const uint face_dim = face_r_dim * FACE_C_DIM; - const bool narrow_tile = get_narrow_tile(pack_output_id); - const uint pack_x_dim = untilize ? (narrow_tile ? face_dim : 16) : face_dim; // Number of datums to pack per row - // To untilize narrow tile (32x16) we just pack 2 faces back to back - TT_SETADCXX(p_setadc::PAC, pack_x_dim-1, 0x0); - } - template - inline void init_packer_dest_offset_registers() - { - //Issue #3064: to avoid sfpu and packer stalling when dest is in FP32 mode - // use dest offset of 0x200 instead of 0x100 - // Wormhole a0/b0 HW translates these addreses to the correct dest bank, - // however dest capacity is unchanged (e.g 0x100 to 0x1FF should be unused now) - constexpr uint32_t DEST_OFFSET_SHIFT = 0; //is_fp32_dest_acc_en ? (1) : (0); - constexpr uint32_t DEST_HALF_OFFSET = DEST_REGISTER_HALF_SIZE >> DEST_OFFSET_SHIFT; - - if constexpr (untilize) { - if constexpr (FaceLayout == ColMajor) { - // Packer0 : 0,32, 1,33 ... 7, 39 - // Packer1 : 8,40, 9,41 ... 
15, 47 - // Packer2 : 16,48, 17,49 ... 23, 55 - // Packer3 : 23,56, 24,57 ... 31, 63 - regfile[p_gpr_pack::DEST_OFFSET_LO] = 0x0; - regfile[p_gpr_pack::DEST_OFFSET_LO+1] = 0x0 + 0x8; - regfile[p_gpr_pack::DEST_OFFSET_LO+2] = 0x0 + 0x10; - regfile[p_gpr_pack::DEST_OFFSET_LO+3] = 0x0 + 0x18; - regfile[p_gpr_pack::DEST_OFFSET_HI] = DEST_HALF_OFFSET; - regfile[p_gpr_pack::DEST_OFFSET_HI+1] = DEST_HALF_OFFSET + 0x8; - regfile[p_gpr_pack::DEST_OFFSET_HI+2] = DEST_HALF_OFFSET + 0x10; - regfile[p_gpr_pack::DEST_OFFSET_HI+3] = DEST_HALF_OFFSET + 0x18; - } else { - // Packer0 : 0,16, 1,17 ... 7, 23 - // Packer1 : 8,24, 9,25 ... 15, 31 - // Packer2 : 32,48, 33,49 ... 39, 55 - // Packer3 : 40,56, 41,57 ... 47, 63 - regfile[p_gpr_pack::DEST_OFFSET_LO] = 0x0; - regfile[p_gpr_pack::DEST_OFFSET_LO+1] = 0x0 + 0x8; - regfile[p_gpr_pack::DEST_OFFSET_LO+2] = 0x0 + 0x20; - regfile[p_gpr_pack::DEST_OFFSET_LO+3] = 0x0 + 0x28; - regfile[p_gpr_pack::DEST_OFFSET_HI] = DEST_HALF_OFFSET; - regfile[p_gpr_pack::DEST_OFFSET_HI+1] = DEST_HALF_OFFSET + 0x8; - regfile[p_gpr_pack::DEST_OFFSET_HI+2] = DEST_HALF_OFFSET + 0x20; - regfile[p_gpr_pack::DEST_OFFSET_HI+3] = DEST_HALF_OFFSET + 0x28; - } - } else { - if constexpr (FaceLayout == ColMajor) { - regfile[p_gpr_pack::DEST_OFFSET_LO] = 0x0; - regfile[p_gpr_pack::DEST_OFFSET_LO+1] = 0x0 + 0x20; - regfile[p_gpr_pack::DEST_OFFSET_LO+2] = 0x0 + 0x10; - regfile[p_gpr_pack::DEST_OFFSET_LO+3] = 0x0 + 0x30; - regfile[p_gpr_pack::DEST_OFFSET_HI] = DEST_HALF_OFFSET; - regfile[p_gpr_pack::DEST_OFFSET_HI+1] = DEST_HALF_OFFSET + 0x20; - regfile[p_gpr_pack::DEST_OFFSET_HI+2] = DEST_HALF_OFFSET + 0x10; - regfile[p_gpr_pack::DEST_OFFSET_HI+3] = DEST_HALF_OFFSET + 0x30; - } else { // Default to row major layout - regfile[p_gpr_pack::DEST_OFFSET_LO] = 0x0; - regfile[p_gpr_pack::DEST_OFFSET_LO+1] = 0x0 + 0x10; - regfile[p_gpr_pack::DEST_OFFSET_LO+2] = 0x0 + 0x20; - regfile[p_gpr_pack::DEST_OFFSET_LO+3] = 0x0 + 0x30; - regfile[p_gpr_pack::DEST_OFFSET_HI] = 
DEST_HALF_OFFSET; - regfile[p_gpr_pack::DEST_OFFSET_HI+1] = DEST_HALF_OFFSET + 0x10; - regfile[p_gpr_pack::DEST_OFFSET_HI+2] = DEST_HALF_OFFSET + 0x20; - regfile[p_gpr_pack::DEST_OFFSET_HI+3] = DEST_HALF_OFFSET + 0x30; - } - } - sync_regfile_write(p_gpr_pack::DEST_OFFSET_HI+3); + // To untilize narrow tile (32x16) we just pack 2 faces back to back + // Number of datums to pack per row + const uint pack_x_dim = (narrow_tile || !untilize) ? face_dim : FACE_R_DIM; + + TT_SETADCXX(p_setadc::PAC, pack_x_dim-1, 0x0); } inline uint8_t get_packer_dest_offset_index() @@ -585,7 +503,7 @@ namespace ckernel::packer // Program packer destination addresses from GPRs template - inline void program_packer_destination(uint32_t addr, uint8_t pack_output_id) + inline void program_packer_destination(uint32_t addr) { uint32_t new_l1_addr = (1 << 31) | addr; TT_SETDMAREG(0, LOWER_HALFWORD(addr), 0, LO_16(p_gpr_pack::OUTPUT_ADDR)); @@ -638,14 +556,4 @@ namespace ckernel::packer TTI_STOREIND (1, 0, p_ind::LD_16B, LO_16(0), p_ind::INC_NONE, p_gpr_pack::TILE_HEADER, p_gpr_pack::OUTPUT_ADDR); } - inline uint32_t get_output_id(uint32_t output) - { - return ((output) - OUTPUT_BASE); - } - - inline constexpr uint32_t get_output_base_id() - { - return (OUTPUT_BASE_ID); - } - } diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpriority_queue.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpriority_queue.h deleted file mode 100644 index 5dff63c55e3..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpriority_queue.h +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include "debug/fw_debug.h" - -// Provides a priority queue where lowest priority value has highest priority (e.g. priority 0 is higher priority than 5) -// Can be reversed with REVERSE_PRIORITY (e.g. 
priority 5 will be higher priority than 0) -// Also provides a version that allocates memory for you (such as on the stack), see below - -template -class FixedSizePriorityQueue -{ - protected: - std::pair *heap; - uint32_t num_elem; - uint32_t max_size; - - public: - FixedSizePriorityQueue(uint32_t addr_, uint32_t max_size_) - { - heap = (std::pair *) addr_; - num_elem = 0; - max_size = max_size_; - } - - void push(uint32_t value, uint32_t priority) - { - push(std::make_pair(value, priority)); - } - - void push(std::pair value) - { - FWASSERT("You are trying to push a full priority queue.", !is_full()); - - heap[num_elem] = value; - - num_elem++; - bubble_up(num_elem - 1); - } - - std::pair pop() - { - FWASSERT("You are trying to pop an empty priority queue.", !is_empty()); - - // Swap first with last - auto first_elem = heap[0]; - heap[0] = heap[num_elem - 1]; - heap[num_elem - 1] = first_elem; - - num_elem--; - bubble_down(0); - - return first_elem; - } - - __attribute__((always_inline)) - inline const std::pair& top() const - { - FWASSERT("You are trying to view an empty priority queue.", !is_empty()); - - return heap[0]; - } - - __attribute__((always_inline)) - inline const uint32_t size() const - { - return num_elem; - } - - __attribute__((always_inline)) - inline const bool is_empty() const - { - return size() == 0; - } - - __attribute__((always_inline)) - inline const bool is_full() const - { - return size() == max_size; - } - - protected: - - void bubble_up(uint32_t idx) - { - if (idx == 0) - return; - - uint32_t parent = ((idx + 1) >> 1) - 1; - - if ((REVERSE_PRIORITY && (heap[parent].second < heap[idx].second)) || - (!REVERSE_PRIORITY && (heap[parent].second > heap[idx].second))) { - // swap - auto tmp = heap[idx]; - heap[idx] = heap[parent]; - heap[parent] = tmp; - - bubble_up(parent); - } - } - - void bubble_down(uint32_t idx) - { - uint32_t left = ((idx + 1) << 1) - 1; - uint32_t right = ((idx + 1) << 1); - uint32_t higher_priority = idx; - - if (left 
< size()) { - if ((REVERSE_PRIORITY && (heap[left].second > heap[higher_priority].second)) || - (!REVERSE_PRIORITY && (heap[left].second < heap[higher_priority].second))) { - higher_priority = left; - } - } - - if (right < size()) { - if ((REVERSE_PRIORITY && (heap[right].second > heap[higher_priority].second)) || - (!REVERSE_PRIORITY && (heap[right].second < heap[higher_priority].second))) { - higher_priority = right; - } - } - - if (higher_priority != idx) { - // swap - auto tmp = heap[idx]; - heap[idx] = heap[higher_priority]; - heap[higher_priority] = tmp; - - bubble_down(higher_priority); - } - } - -}; diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h index 818793a2680..55404e24d39 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h @@ -6,8 +6,6 @@ #include "ckernel.h" #include "ckernel_globals.h" -#include "debug/fw_debug.h" -#include "debug/status.h" #ifdef PERF_DUMP #include "perf_res_decouple.h" @@ -15,8 +13,6 @@ namespace ckernel::unpacker { - constexpr uint32_t OPERAND_BASE_ID = 0; - constexpr uint32_t INTERMEDIATE_BASE_ID = 24; constexpr uint32_t TILE_DESC_SIZE = 2; //Unpacker descriptor size in dwords constexpr uint32_t CONFIG_SIZE = 2; //Unpacker configuration size in dwords @@ -156,9 +152,7 @@ namespace ckernel::unpacker // Wait for threshold of busy contexts to fall below total available contexts inline void wait_for_next_context(const uint num_contexts) { - DEBUG_STATUS('W', 'N', 'C', 'W'); while (semaphore_read(semaphore::UNPACK_SYNC) >= num_contexts) {} - DEBUG_STATUS('W', 'N', 'C', 'D'); } inline void switch_config_context(uint &unp_cfg_context) @@ -180,56 +174,30 @@ namespace ckernel::unpacker TTI_SETC16(UNPACK_MISC_CFG_CfgContextOffset_0_ADDR32, 0x0000); } - // FIXME: Added this --> Should be generated by compile trisc? 
- constexpr std::uint32_t unpack_tile_num_faces[32] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; - constexpr std::uint32_t unpack_tile_face_r_dim[32] = { - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; - constexpr std::uint32_t unpack_partial_face[32] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - constexpr std::uint32_t unpack_narrow_tile[32] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // Sync on unpacker idle via waiting busy contexts counter 0 inline void wait_for_idle() { - DEBUG_STATUS('W', 'I', 'W'); while (semaphore_read(semaphore::UNPACK_SYNC) > 0) {} - DEBUG_STATUS('W', 'I', 'D'); } - inline constexpr uint32_t get_num_faces(const std::uint32_t operand_id) - { - return unpack_tile_num_faces[operand_id]; - } - - inline constexpr uint32_t get_face_r_dim(const std::uint32_t operand_id) - { - return unpack_tile_face_r_dim[operand_id]; - } - - inline constexpr uint32_t get_partial_face(const std::uint32_t operand_id) - { - return unpack_partial_face[operand_id]; - } - - inline constexpr uint32_t get_narrow_tile(const std::uint32_t operand_id) - { - return unpack_narrow_tile[operand_id]; + inline void enalbe_int8_fpu_math() { + alu_config_u alu_payload = {.val = 0}; + alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = 1; + cfg_reg_rmw_tensix(alu_payload.val); } + template inline void configure_unpack_AB( - uint unpA_operand_id, - uint unpB_operand_id, - uint unpA_face_r_dim=16, - uint unpB_face_r_dim=16, - bool row_pool=false, - bool transpose_xy_srca_en=false, - bool is_fp32_dest_acc_en=false, - bool srnd_fpu_en = false, + const uint unpA_src_format, + const uint unpB_src_format, + const uint unpA_dst_format, + const uint unpB_dst_format, + const uint unpA_face_r_dim=FACE_R_DIM, + const uint 
unpB_face_r_dim=FACE_R_DIM, + const bool transpose_xy_srca_en=false, const uint unpA_num_faces = 4, const uint unpB_num_faces = 4) { - // Check that unpacker is done (all contexts freed up) before starting hw configuration wait_for_idle(); @@ -239,11 +207,11 @@ namespace ckernel::unpacker // Get pointer to registers for current state ID volatile uint tt_reg_ptr *cfg = get_cfg_pointer(); - uint unpA_ch1_x_stride = (uint) (unpack_dst_format[unpA_operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[unpA_operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1; - uint unpB_ch1_x_stride = (uint) (unpack_dst_format[unpB_operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[unpB_operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1; + uint unpA_ch1_x_stride = (uint) (unpA_dst_format&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpA_dst_format&0x3) == (uint) DataFormat::Float16 ? 2 : 1; + uint unpB_ch1_x_stride = (uint) (unpB_dst_format&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpB_dst_format&0x3) == (uint) DataFormat::Float16 ? 2 : 1; uint unpA_ch1_z_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride; uint unpB_ch1_z_stride = FACE_C_DIM*FACE_R_DIM*unpB_ch1_x_stride; - uint exp_width = ((uint)unpack_dst_format[unpA_operand_id]>>2)&0x1; //0=5-bit, 1=8-bit + uint exp_width = ((uint)unpA_dst_format>>2)&0x1; //0=5-bit, 1=8-bit // Strides for incrementing ch1 address to srcA and srcB cfg[UNP0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32] = (0 << UNP0_ADDR_CTRL_ZW_REG_1_Wstride_SHAMT) | @@ -262,33 +230,42 @@ namespace ckernel::unpacker alu_config_u alu_payload = {.val = 0}; uint32_t fp32_dest_acc_en = (is_fp32_dest_acc_en) ? 
(1) : (0); + uint32_t int8_math_enabled = ((uint)unpA_dst_format == (uint)DataFormat::Int8) || + ((uint)unpB_dst_format == (uint)DataFormat::Int8) || + ((uint)unpA_dst_format == (uint)DataFormat::Int32) || + ((uint)unpB_dst_format == (uint)DataFormat::Int32); + + constexpr uint alu_format_mask = ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK; + alu_payload.f.ALU_FORMAT_SPEC_REG0_SrcA = unpA_dst_format; + alu_payload.f.ALU_FORMAT_SPEC_REG1_SrcB = row_pool ? ((uint) DataFormat::Float16 | (exp_width<<2)) : unpB_dst_format; - alu_payload.f.ALU_FORMAT_SPEC_REG0_SrcA = unpack_dst_format[unpA_operand_id]; - alu_payload.f.ALU_FORMAT_SPEC_REG1_SrcB = row_pool ? ((uint) DataFormat::Float16 | (exp_width<<2)) : unpack_dst_format[unpB_operand_id]; // FP32 accumulation and SFPU to read dest as FP32 // NOTE: This assumes these config fields are adjacent and in same register!! static_assert(ALU_ACC_CTRL_Fp32_enabled_ADDR32 == ALU_FORMAT_SPEC_REG0_SrcA_ADDR32); static_assert(ALU_ACC_CTRL_Fp32_enabled_ADDR32 == ALU_ACC_CTRL_SFPU_Fp32_enabled_ADDR32); + constexpr uint alu_dest_format_mask = ALU_ACC_CTRL_INT8_math_enabled_MASK | ALU_ACC_CTRL_SFPU_Fp32_enabled_MASK | ALU_ACC_CTRL_Fp32_enabled_MASK; alu_payload.f.ALU_ACC_CTRL_Fp32_enabled = fp32_dest_acc_en; alu_payload.f.ALU_ACC_CTRL_SFPU_Fp32_enabled = fp32_dest_acc_en; - alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = ((uint)unpack_dst_format[unpA_operand_id] == (uint)DataFormat::Int8) || - ((uint)unpack_dst_format[unpB_operand_id] == (uint)DataFormat::Int8); + alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = int8_math_enabled; - constexpr uint mask1 = ALU_ACC_CTRL_INT8_math_enabled_MASK | ALU_ACC_CTRL_SFPU_Fp32_enabled_MASK | ALU_ACC_CTRL_Fp32_enabled_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK | ALU_FORMAT_SPEC_REG0_SrcA_MASK; + constexpr uint alu_stoch_rnd_mask = ALU_ROUNDING_MODE_Fpu_srnd_en_MASK | ALU_ROUNDING_MODE_Gasket_srnd_en_MASK | ALU_ROUNDING_MODE_Packer_srnd_en_MASK; + constexpr bool stoch_rnd_en = 
(stoch_rnd_mode == StochRndMode::All); + alu_payload.f.ALU_ROUNDING_MODE_Fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndMode::Fpu); + alu_payload.f.ALU_ROUNDING_MODE_Gasket_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndMode::Pack); + alu_payload.f.ALU_ROUNDING_MODE_Packer_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndMode::Pack); - cfg_reg_rmw_tensix(alu_payload.val); + constexpr uint alu_mask = alu_format_mask | alu_dest_format_mask | alu_stoch_rnd_mask; - cfg_reg_rmw_tensix(srnd_fpu_en); + cfg_reg_rmw_tensix(alu_payload.val); t6_mutex_release(mutex::REG_RMW); - // Set tile descriptor unpack_tile_descriptor_u tile_descriptor; for (uint i=0; i(semaphore::UNPACK_TO_DEST); + } + + inline void unpack_to_dest_tile_done(uint &context_id) { + t6_semaphore_post(semaphore::UNPACK_TO_DEST); + TTI_WRCFG(p_gpr_unpack::UNPACK_STRIDE, p_cfg::WRCFG_32b, UNP0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32); // Restore unpack stride + // Restore config context + if (context_id == 0) { + cfg_reg_rmw_tensix(0); + cfg_reg_rmw_tensix(4*16); + } else { + cfg_reg_rmw_tensix(0); + cfg_reg_rmw_tensix(4*16); + } + TTI_SETC16(SRCA_SET_Base_ADDR32, 0x4); // re-enable address bit swizzle + } + - inline uint32_t get_operand_id(uint32_t operand) + inline void set_dst_write_addr(const uint32_t &context_id, const uint32_t &unpack_dst_format) { - return operand; + uint32_t dst_byte_addr = 16*(4 + mailbox_read(ThreadId::MathThreadId)); // Apply fixed offset of 4*16 to dest address + TTI_SETC16(SRCA_SET_Base_ADDR32, 0x0); // Disable address bit swizzle + TTI_RDCFG(p_gpr_unpack::UNPACK_STRIDE, UNP0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32); // Save current stride + uint unpA_ch1_x_stride = (uint) (unpack_dst_format&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format&0x3) == (uint) DataFormat::Float16 ? 
2 : 1; + uint unpA_ch1_z_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride; + TT_SETDMAREG(0, LOWER_HALFWORD(unpA_ch1_z_stride << UNP0_ADDR_CTRL_ZW_REG_1_Zstride_SHAMT), 0, LO_16(p_gpr_unpack::TMP_LO)); + TTI_WRCFG(p_gpr_unpack::TMP_LO, p_cfg::WRCFG_32b, UNP0_ADDR_CTRL_ZW_REG_1_Zstride_ADDR32); // Set unpack stride + if (context_id == 0) { + cfg_reg_rmw_tensix(1); + cfg_reg_rmw_tensix(dst_byte_addr); + } else { + cfg_reg_rmw_tensix(1); + cfg_reg_rmw_tensix(dst_byte_addr); + } + } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc new file mode 100644 index 00000000000..3db907d6b99 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc @@ -0,0 +1,221 @@ + +#include "ckernel.h" +#include "ckernel_addr_map.h" +#include "ckernel_pcbuf.h" +#include "ckernel_main.h" +#include "ckernel_globals.h" +#include +#include +#ifdef PERF_DUMP +#include "ckernel_perf_unpack_pack.h" +#include "ckernel_perf_math.h" +#endif + +namespace ckernel +{ + +enum class ttRiscCores : std::uint32_t { Unpack = 0, Math = 1, Pack = 2, Brisc = 3, Nrisc = 4}; + +volatile uint tt_reg_ptr *reg_base = reinterpret_cast(0xFFB10000); +volatile uint tt_reg_ptr *pc_buf_base = reinterpret_cast(PC_BUF_BASE); +volatile uint tt_reg_ptr *regfile = reinterpret_cast(REGFILE_BASE); +volatile uint tt_reg_ptr *instrn_buffer = reinterpret_cast(INSTRN_BUF_BASE); +volatile uint tt_reg_ptr *mailbox_base[4] = { + reinterpret_cast(TENSIX_MAILBOX0_BASE), reinterpret_cast(TENSIX_MAILBOX1_BASE), + reinterpret_cast(TENSIX_MAILBOX2_BASE), reinterpret_cast(TENSIX_MAILBOX3_BASE) +}; +volatile uint tt_reg_ptr *dbg_event_scratch = nullptr; + +uint32_t cfg_state_id __attribute__((section(".bss"))) = 0; // Flip between 0 and 1 to keep state between kernel calls +uint32_t dest_offset_id __attribute__((section(".bss"))) = 0; // Flip between 0 and 1 to keep dest pointer between kernel calls + +uint32_t dbg_event_index 
__attribute__((section(".bss"))) = 0; +uint32_t dbg_event_end __attribute__((section(".bss"))) = 0; +volatile uint16_t tt_reg_ptr *debug_mailbox_base = nullptr; +uint8_t mailbox_index = 0; +const uint8_t mailbox_end = 32; +volatile uint8_t tt_l1_ptr *debug_buffer = nullptr; +volatile uint8_t tt_l1_ptr *debug_buffer_start = nullptr; +uint8_t thread_id __attribute__((section(".bss"))) = 0; + +#ifdef PERF_DUMP +uint32_t perf_index __attribute__((section(".bss"))) = 0; +uint32_t perf_end __attribute__((section(".bss"))) = 0; +volatile uint32_t *perf_buf_base[2]; +uint8_t perf_buf_base_id __attribute__((section(".bss"))) = 0; +bool record_perf_events __attribute__((section(".bss"))) = 0; +uint32_t perf_events_target_idx __attribute__((section(".bss"))) = 0; +uint16_t current_outer_loop_iter __attribute__((section(".bss"))) = 0; +int32_t dram_dump_req_local; +bool first_unpack_recorded __attribute__((section(".bss"))) = 0; +volatile uint *ncrisc_ack_addr = nullptr; +uint32_t header; +#if OVERLAY_DECOUPLE == 1 +uint8_t overlay_output_decouple_mask = 0; +inline void update_overlay_decoupling_mailbox() { + overlay_output_decouple_mask = PERF_RISC_MAILBOX_OUTPUT_DECOUPLE_MASK_PTR[0] & 0xff; + if (thread_id == 0 || thread_id == 1) { + while(semaphore_read(semaphore::UNPACK_MATH_DONE) == 0) {} + } +} +inline void reset_unpack_pack_sync() { + if (thread_id == 2) { + semaphore_get(semaphore::UNPACK_MATH_DONE); + } +} +#endif +#endif + +volatile uint tt_l1_ptr * trisc_l1_mailbox = reinterpret_cast(MAILBOX_ADDR); + +inline bool ready_for_next_epoch() { // place this through compiler into a section that is not going to overwritten + return true; + // mailbox_write(ttRiscCores::Nrisc); // signal done epoch to NCRisc + // mailbox_read(ttRiscCores::Nrisc); // This is blocking read, until NCrisc signals epoch is ready +} + +inline void set_thread_id_parameter() { + if ((uint)__firmware_start == (uint)l1_mem::address_map::TRISC0_BASE) { + thread_id = 0; + } else if ((uint) 
__firmware_start == (uint)l1_mem::address_map::TRISC1_BASE) { + thread_id = 1; + } else { + thread_id = 2; + } +} + +inline void allocate_debug_mailbox_buffer() { + std::int32_t debug_mailbox_addr; + if ((uint32_t)__firmware_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) { + debug_mailbox_addr = l1_mem::address_map::DEBUG_MAILBOX_BUF_BASE + 0*l1_mem::address_map::DEBUG_MAILBOX_BUF_SIZE; + } else if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { + debug_mailbox_addr = l1_mem::address_map::DEBUG_MAILBOX_BUF_BASE + 1*l1_mem::address_map::DEBUG_MAILBOX_BUF_SIZE; + } else { + debug_mailbox_addr = l1_mem::address_map::DEBUG_MAILBOX_BUF_BASE + 2*l1_mem::address_map::DEBUG_MAILBOX_BUF_SIZE; + } + debug_mailbox_base = reinterpret_cast(debug_mailbox_addr); + clear_mailbox_values(); +} + +inline void allocate_debug_buffer() { + std::int32_t debug_buffer_addr; + if ((uint32_t)__firmware_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) { + debug_buffer_addr = l1_mem::address_map::TRISC0_DEBUG_BUFFER_BASE; + } else if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { + debug_buffer_addr = l1_mem::address_map::TRISC1_DEBUG_BUFFER_BASE; + } else { + debug_buffer_addr = l1_mem::address_map::TRISC2_DEBUG_BUFFER_BASE; + } + debug_buffer = reinterpret_cast(debug_buffer_addr); + debug_buffer[l1_mem::address_map::DEBUG_BUFFER_SIZE-1]=0x0; + debug_buffer_start = debug_buffer; +} + +__attribute__((noinline)) void debug_dump(const uint8_t *data, uint32_t byte_size) { + for (uint32_t i = 0; i < byte_size; i++) { + if ((((uint32_t) debug_buffer)&(l1_mem::address_map::DEBUG_BUFFER_SIZE-1)) == + l1_mem::address_map::DEBUG_BUFFER_SIZE-1) { + *(debug_buffer) = 0xff; //overflow detected + } else { + *debug_buffer = data[i]; + debug_buffer++; + } + } +} + +__attribute__((noinline)) void debug_dump_seek(uint8_t offset) { + debug_buffer = reinterpret_cast(debug_buffer_start + offset); +} + +} // namespace ckernel + +void 
local_mem_copy() { + volatile uint tt_l1_ptr *l1_local_mem_start_addr; + volatile uint *local_mem_start_addr = (volatile uint*) LOCAL_MEM_BASE_ADDR; + + if ((uint)__firmware_start == (uint)l1_mem::address_map::TRISC0_BASE) { + l1_local_mem_start_addr = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC0_LOCAL_MEM_BASE; + } else if ((uint) __firmware_start == (uint)l1_mem::address_map::TRISC1_BASE) { + l1_local_mem_start_addr = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC1_LOCAL_MEM_BASE; + } else { + l1_local_mem_start_addr = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC2_LOCAL_MEM_BASE; + } + uint word_size = ((uint)__local_mem_rodata_end_addr - (uint)__local_mem_rodata_start_addr)>>2; + + if (word_size>0) { + for (uint n=0;n> 4) - 1; //Store L1 buffer address for reduce input 1 + sync_regfile_write(p_gpr_unpack::L1_BUFFER_ADDR); + } + +#ifdef PERF_DUMP + set_thread_id_parameter(); + allocate_perf_buffer(); + setup_fpu_perf_cnt(); + record_dummy_math_event(); +#if OVERLAY_DECOUPLE == 1 + update_overlay_decoupling_mailbox(); +#endif +#endif + + //while (ready_for_next_epoch()) + { + run_kernel(); + } + + // Signal completion + tensix_sync(); +#ifdef PERF_DUMP +#if OVERLAY_DECOUPLE == 1 + reset_unpack_pack_sync(); +#endif + record_perf_dump_end_and_check_overflow(); + // There has to be a tensix_sync() before this last pass. 
+ last_trisc_perf_dump_to_dram(); + tensix_sync(); +#endif + + trisc_l1_mailbox_write(KERNEL_COMPLETE); + +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc new file mode 100644 index 00000000000..b2c39df3313 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc @@ -0,0 +1,21 @@ + +// This c-file's purpose is: +// 1) include the generated list of kernels +// The files hold run_kernel() definition and inline kernel_main functions for every ckernel +// Need to make sure no other file includes these lists since it also include global parameter definitions +// 2) instantiate global variables + + +#include "ckernel_globals.h" + +#if defined(UCK_CHLKC_UNPACK) || defined(UCK_CHLKC_MATH) || defined(UCK_CHLKC_PACK) +#include "chlkc_list.h" +#else +#include "ckernel_list.h" +#endif + +// Global vars +uint32_t unp_cfg_context = 0; +uint32_t pack_sync_tile_dst_ptr = 0; +uint32_t math_sync_tile_dst_index = 0; +volatile uint32_t tt_l1_ptr l1_buffer[16] __attribute__ ((section (".text#"))) __attribute__ ((aligned (16))); diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc new file mode 100644 index 00000000000..446e14cb8f6 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc @@ -0,0 +1,301 @@ + +#include "ckernel_perf_unpack_pack.h" +#include "stream_interface.h" + +#pragma GCC diagnostic ignored "-Wunused-function" + + +namespace ckernel +{ +extern uint32_t perf_index; +extern uint32_t perf_end; +// Perf-buffer are double buffered for spill_to_dram. +// Ncrisc will move one half to dram while trisc populates the other half. +// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0]. +extern volatile uint32_t *perf_buf_base[2]; +// Selects the half of perf_buffer that trisc is currently writing into. 
+extern uint8_t perf_buf_base_id; +extern bool record_perf_events; +extern uint16_t current_outer_loop_iter; +extern uint8_t thread_id; +extern int32_t dram_dump_req_local; +extern volatile uint* ncrisc_ack_addr; +extern uint32_t header; + +void allocate_perf_buffer() { + std::int32_t perf_buf_base_addr; + if ((uint32_t)__firmware_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) { + perf_buf_base_addr = l1_mem::address_map::UNPACK_PACK_PERF_BUF_BASE_ADDR + 0*TRISC_PERF_BUF_SIZE; + perf_index = 2; // The first 4B value is always initialized to 0xbaddf00d. + if constexpr (PERF_DUMP_CONCURRENT == 1 || INTERMED_DUMP == 1) { + perf_end = TRISC_PERF_BUF_SIZE >> 3; + } else { + perf_end = 3; + } + dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[0]; + ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[0]; + } else if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { + perf_buf_base_addr = l1_mem::address_map::MATH_PERF_BUF_BASE_ADDR; + perf_index = 4; // The first 4 32b regs are skipped in recording math perf counters. + perf_end = 16; + + // Initialize math_dram_dump_req_local in the beginning of epoch. + // EPOCH_INFO_PTR->perf_dram_copy_req counters do not get reset between epochs. + dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[1]; + ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[1]; + } else { + perf_buf_base_addr = l1_mem::address_map::UNPACK_PACK_PERF_BUF_BASE_ADDR + TRISC_PERF_BUF_SIZE; + perf_index = 2; // The first 4B value is always initialized to 0xbaddf00d. + if constexpr (PERF_DUMP_CONCURRENT == 1 || INTERMED_DUMP == 1) { + perf_end = TRISC_PERF_BUF_SIZE >> 3; + } else { + perf_end = 3; + } + TTI_SEMINIT(1, 0, 1 << semaphore::PACK_DONE); + dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[2]; + ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[2]; + } + // Tirsc starts dumping into the first half of the perf_buffers. 
+ perf_buf_base_id = 0; + // Program the address for the first half of the perf buffer address. + perf_buf_base[0] = reinterpret_cast(perf_buf_base_addr); + // Program the address for the second half of the perf buffer address. + perf_buf_base[1] = reinterpret_cast(perf_buf_base_addr + (TRISC_PERF_BUF_SIZE >> 1)); + perf_buf_base[perf_buf_base_id][0] = PERF_DUMP_END_SIGNAL; +#if PERF_DUMP_CONCURRENT + volatile uint32_t* header_ptr = reinterpret_cast(l1_mem::address_map::PERF_THREAD_HEADER); + header = header_ptr[0]; + header = (header & 0xfff8ffff) | (((uint32_t)(thread_id) & 0b111) << 16); + perf_buf_base[perf_buf_base_id][1] = header; + for (uint i = 2; i < perf_index; i++) { + perf_buf_base[perf_buf_base_id][i] = 0xffffffff; + } +#else + for (uint i = 1; i < perf_index; i++) { + perf_buf_base[perf_buf_base_id][i] = 0xffffffff; + } +#endif +} + +void switch_perf_buffers() { + + if constexpr (INTERMED_DUMP || PERF_DUMP_CONCURRENT) { + for (uint i = perf_index; i < perf_end; i++) { + perf_buf_base[perf_buf_base_id][i] = 0xffffffff; + } + bool stalled = false; + uint32_t timestamp_stall_start_l; + uint32_t timestamp_stall_start_h; + uint32_t timestamp_stall_end_l; + uint32_t timestamp_stall_end_h; + + // Before advancing to the other half of perf-buffer, make sure ncrisc is done copying that half into dram + int32_t ack_local = *ncrisc_ack_addr; + if (ack_local <= dram_dump_req_local - 1) { + stalled = true; + timestamp_stall_start_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + timestamp_stall_start_h = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + + while (ack_local <= dram_dump_req_local - 1) { + ack_local = *ncrisc_ack_addr; + } + + timestamp_stall_end_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + timestamp_stall_end_h = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + } + + dram_dump_req_local++; + EPOCH_INFO_PTR->perf_dram_copy_req[thread_id] = dram_dump_req_local; + + perf_buf_base_id = 1 - perf_buf_base_id; + if constexpr(INTERMED_DUMP) { + perf_index = 0; + } else { + 
perf_index = 0; + perf_buf_base[perf_buf_base_id][perf_index] = PERF_DUMP_END_SIGNAL; + perf_buf_base[perf_buf_base_id][perf_index+1] = *(uint32_t*)(&header); + perf_index = 2; + } + if (stalled && perf_index + 5 < perf_end - 1) { + uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::STALL_TRISC_FOR_DRAM_PERF_DUMP, current_outer_loop_iter); + perf_buf_base[perf_buf_base_id][perf_index] = event_id; + perf_buf_base[perf_buf_base_id][perf_index+1] = timestamp_stall_start_h; + perf_buf_base[perf_buf_base_id][perf_index+2] = timestamp_stall_start_l; + perf_buf_base[perf_buf_base_id][perf_index+3] = event_id; + perf_buf_base[perf_buf_base_id][perf_index+4] = timestamp_stall_end_h; + perf_buf_base[perf_buf_base_id][perf_index+5] = timestamp_stall_end_l; + perf_index += 6; + } + } +} + +void last_trisc_perf_dump_to_dram() { + if (perf_index > 0) { + + // Before advancing to the other half of perf-buffer, make sure ncrisc is done copying that half into dram + int32_t ack_local = *ncrisc_ack_addr; + while (ack_local <= dram_dump_req_local - 1) { + ack_local = *ncrisc_ack_addr; + } + + if constexpr (INTERMED_DUMP) { + if (thread_id == 1) { + dram_dump_req_local += 2; + } else { + dram_dump_req_local++; + } + } else if constexpr (PERF_DUMP_CONCURRENT) { + dram_dump_req_local++; + } else { + dram_dump_req_local += 2; + } + EPOCH_INFO_PTR->perf_dram_copy_req[thread_id] = dram_dump_req_local; + } +} + +void increment_unpack_tiles(uint operand_idx, uint num_tiles) { + if (record_perf_events && (perf_events_target_idx == 1)) { + if (operand_idx >= PERF_MAX_NUM_INPUTS) { + return; + } + uint regfile_base_idx = p_gpr_unpack::PERF_UNPACK_NUM_TILES_0; + regfile_base_idx += (operand_idx >> 1); + bool upper = operand_idx & 0b1; + uint32_t num_tiles_regfile = regfile[regfile_base_idx]; + uint32_t current_num_tiles; + if (upper) { + current_num_tiles = (num_tiles_regfile >> 16) & 0xffff; + current_num_tiles += num_tiles; + regfile[regfile_base_idx] = (num_tiles_regfile & 0xffff) 
+ ((current_num_tiles & 0xffff) << 16); + } else { + current_num_tiles = (num_tiles_regfile + num_tiles) & 0xffff; + regfile[regfile_base_idx] = (num_tiles_regfile & 0xffff0000) + (current_num_tiles & 0xffff); + } + sync_regfile_write(regfile_base_idx); + } +} + +void increment_pack_tiles(uint num_tiles) { + if (record_perf_events && (perf_events_target_idx == 1)) { + regfile[p_gpr_pack::PERF_PACK_NUM_TILES] += num_tiles; + sync_regfile_write(p_gpr_pack::PERF_PACK_NUM_TILES); + } +} + +#if OVERLAY_DECOUPLE == 1 + +// This runs prior to set_perf_dump_flag_for_input so perf_end has to be adjusted +void record_overlay_decoupled_output_bw_start(uint32_t num_tiles) { + if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) { + perf_end += 6; + } + if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) { + perf_end = TRISC_PERF_BUF_SIZE >> 2; + } + uint32_t event_id = get_event_id(0, 0, perf::EventType::OUTPUT_NUM_TILES, perf_events_target_inputs[0]); + record_perf_value_and_check_overflow(event_id, num_tiles, 0); + event_id = get_event_id(0, 0, perf::EventType::OUTPUT_TIMESTAMP, perf_events_target_inputs[0]); + uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, 0); +} + +void record_overlay_decoupled_output_bw_end() { + if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) { + perf_end += 6; + } + if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) { + perf_end = TRISC_PERF_BUF_SIZE >> 2; + } + uint32_t event_id = get_event_id(0, 0, perf::EventType::OUTPUT_TIMESTAMP, perf_events_target_inputs[0]); + uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, 0); +} + +void llk_push_all_packer_tiles_for_decoupling() { + uint32_t operand = OPERAND_OUTPUT_START_INDEX; + 
uint32_t output = operand_to_output_index(operand); + + // Populate the output buffer with headers + uint32_t stream_buf_size_bytes = EPOCH_INFO_PTR->outputs[output]->buf_full_size_bytes; + uint32_t stream_buf_addr = EPOCH_INFO_PTR->outputs[output]->buf_base_addr; + uint32_t stream_msg_info_buf_ptr = (EPOCH_INFO_PTR->outputs[output]->msg_info_buf_start)*MEM_WORD_WIDTH; + uint32_t tile_size_words = *(volatile uint32_t tt_l1_ptr *)(stream_msg_info_buf_ptr); + uint32_t tile_size_bytes = tile_size_words*MEM_WORD_WIDTH; + for (uint32_t tile_header_ptr = stream_buf_addr; tile_header_ptr < stream_buf_addr + stream_buf_size_bytes; tile_header_ptr += tile_size_bytes) { + *((uint32_t *)(tile_header_ptr)) = tile_size_words; + } + + uint32_t total_num_tiles_to_push = 0; + uint32_t num_tiles_to_push[EPOCH_MAX_OUTPUT_FORKS+1]; + uint32_t stream_id = EPOCH_INFO_PTR->outputs[output]->stream_id; + uint32_t active_stream_idx = get_active_stream_idx(stream_id); + volatile epoch_stream_info_t * l1_stream_info = EPOCH_INFO_PTR->active_streams[active_stream_idx]; + for (int32_t k = 0; k < l1_stream_info->num_fork_streams+1; k++) { + uint32_t fork_active_streams_idx = k == 0 ? 
active_stream_idx : l1_stream_info->fork_idxs[k-1]; + uint32_t epoch_num_tiles = EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->epoch_num_tiles; + num_tiles_to_push[k] = epoch_num_tiles; + total_num_tiles_to_push += epoch_num_tiles; + } + if (((l1_stream_info->flags & STREAM_MOVES_RAW_DATA) != 0) || l1_stream_info->legacy_pack) { + + record_overlay_decoupled_output_bw_start(total_num_tiles_to_push); + + while(total_num_tiles_to_push > 0) { + uint32_t stream_msg_info_buf_ptr = (l1_stream_info->msg_info_buf_start)*MEM_WORD_WIDTH; + uint32_t tile_size_words = *(volatile uint32_t *)(stream_msg_info_buf_ptr); + uint32_t stream_buf_size_tiles = l1_stream_info->buf_size_tiles; + bool any_streams_busy = false; + for (int32_t k = 0; k < l1_stream_info->num_fork_streams+1; k++) { + uint32_t fork_active_streams_idx = k == 0 ? active_stream_idx : l1_stream_info->fork_idxs[k-1]; + uint32_t fork_stream_id = k == 0 ? stream_id : EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->stream_id; + if (num_tiles_to_push[k] == 0) { + continue; + } + uint32_t dram_output_no_push = ((EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->flags & STREAM_DRAM_NO_PUSH) != 0) || ((EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->flags & STREAM_MOVES_RAW_DATA) != 0); + if (dram_output_no_push) { + uint32_t tiles_left_in_phase = stream_src_endpoint_get_phase_tiles_count(fork_stream_id); + uint16_t operand_tiles_received = (uint16_t)*get_operand_tiles_received_ptr(stream_id_to_operand(fork_stream_id)); + uint16_t operand_tiles_acked = (uint16_t)*get_operand_tiles_acked_ptr(stream_id_to_operand(fork_stream_id)); + uint16_t tiles_available = operand_tiles_received - operand_tiles_acked;// op_pack_tiles_ptr_sub(operand_tiles_received, operand_tiles_acked); + uint32_t stream_buf_free_tiles = stream_buf_size_tiles - tiles_available; + uint32_t num_tiles = tiles_left_in_phase > stream_buf_free_tiles ? 
stream_buf_free_tiles : tiles_left_in_phase; + if (num_tiles > 0) { + stream_set_tiles_left_in_phase(fork_stream_id, num_tiles); + volatile uint32_t tt_reg_ptr* tiles_received_ptr = (volatile uint32_t tt_reg_ptr*)get_operand_tiles_received_ptr(stream_id_to_operand(fork_stream_id)); + operand_tiles_received = (uint16_t)tiles_received_ptr[0]; + uint16_t new_epoch_tiles_received = operand_tiles_received + num_tiles;// op_pack_tiles_ptr_add(operand_tiles_received, num_tiles); + tiles_received_ptr[0] = new_epoch_tiles_received; + + num_tiles_to_push[k] -= num_tiles; + total_num_tiles_to_push -= num_tiles; + } + } else { + uint32_t phase_active = stream_phase_is_active(fork_stream_id) && !is_dummy_phase(fork_stream_id); + if (phase_active) { + uint32_t tiles_left_in_phase = stream_src_endpoint_get_phase_tiles_count(fork_stream_id); + uint32_t num_free_words = stream_get_free_words(fork_stream_id); + uint32_t num_tiles = 0; + uint32_t num_words = 0; + while (num_words + tile_size_words <= num_free_words && num_tiles + 1 <= tiles_left_in_phase) { + num_tiles++; + num_words += tile_size_words; + } + if (num_tiles > 0) { + stream_set_tiles_left_in_phase(fork_stream_id, num_tiles); + stream_relay_tiles(fork_stream_id, num_tiles, num_words); + + num_tiles_to_push[k] -= num_tiles; + total_num_tiles_to_push -= num_tiles; + } + } + } + } + } + record_overlay_decoupled_output_bw_end(); + } +} +#endif + +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc index 238301e0566..baeba52c6c6 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc +++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc @@ -1,9 +1,10 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 +/* + * SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ * + * SPDX-License-Identifier: Apache-2.0 +*/ #include "ckernel_template.h" -#include "debug/fw_debug.h" namespace ckernel { diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc new file mode 100644 index 00000000000..35130c72520 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc @@ -0,0 +1,10 @@ +// combining multiple C++ source files into a single file +// to reduce the overhead of the compilation process and +// improve build times +#include "ckernel.cc" +#include "ckernel_template.cc" +#ifdef PERF_DUMP +#include "ckernel_perf_unpack_pack.cc" +#endif +#include "ckernel_main.cc" +#include "llk_io.cc" // sw stack specific io interface diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list b/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list new file mode 100644 index 00000000000..2a66c11d1a6 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list @@ -0,0 +1,2 @@ +ckernel.cc +ckernel_template.cc diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_3c.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_3c.h deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h index e19013f89e0..e205ec12747 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h @@ -5,22 +5,23 @@ #pragma once namespace ckernel { + +enum Dim { + None = 0, + R = 1, + C = 2, + Z = 3, + RC = 4, + ZR = 5, + Invalid = 0xFF, +}; + enum ReduceDim { REDUCE_ROW, REDUCE_COL, REDUCE_SCALAR, }; -enum Dim { - None = 0, - R = 1, - C = 2, - Z = 3, - RC = 4, - ZR = 5, - Invalid = 0xFF, -}; - enum TileDim { R_IDX = 0, C_IDX = 1, @@ -96,7 +97,6 @@ enum ReluType { MAX_THRESHOLD_RELU, }; - enum SfpuType { tanh, hardtanh, @@ -155,6 +155,9 @@ enum SfpuType { silu, mask, negative, + 
dequant_int32, + requant_int32, + quant_int32, unused, }; } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h index 0dfa2c30fe4..8eb5e084934 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h @@ -2,22 +2,19 @@ // // SPDX-License-Identifier: Apache-2.0 - #pragma once #include "ckernel_defs.h" #include "ckernel_include.h" #include "cmath_common.h" -#include "chlkc_unpack_data_format.h" #ifdef PERF_DUMP #include "ckernel_perf_api.h" #endif -#include "hostdevcommon/common_runtime_address_map.h" using namespace ckernel::math; template -inline void llk_math_wait_for_dest_available() { +inline void _llk_math_wait_for_dest_available_() { // These liteweight functions for sync with packer imply // no mode change - entire epoch is either double buffer or single buffer #ifdef PERF_DUMP @@ -30,7 +27,7 @@ inline void llk_math_wait_for_dest_available() { } template -inline void llk_math_dest_section_done() { +inline void _llk_math_dest_section_done_() { #ifdef PERF_DUMP if constexpr(MATH_PACK_DECOUPLE) { return; @@ -51,7 +48,7 @@ inline void llk_math_dest_section_done() { } template -inline void llk_math_pack_sync_init() { +inline void _llk_math_pack_sync_init_() { #ifdef PERF_DUMP if constexpr(MATH_PACK_DECOUPLE) { return; @@ -87,65 +84,42 @@ inline void llk_math_pack_sync_init() { } } -inline void llk_math_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { - debug_dump(data, byte_size); -} +template +inline void _llk_math_get_tile_(std::uint32_t tile_index, std::uint32_t* p_tile) { + if constexpr (mail2math) { + *p_tile = mailbox_read(ThreadId::UnpackThreadId); + } else { + *p_tile = 0x0; + } -inline void llk_math_debug_dump_seek(std::uint8_t offset) { - debug_dump_seek(offset); } -inline void llk_math_reconfig_data_format(const std::uint32_t srca_old_operand, const std::uint32_t 
srca_new_operand, const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { - std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); - std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); - std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); - std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); - - if((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id]) && (unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { - uint config_data = (unpack_dst_format[new_srca_operand_id] << ALU_FORMAT_SPEC_REG0_SrcA_SHAMT) | (unpack_dst_format[new_srcb_operand_id] << ALU_FORMAT_SPEC_REG1_SrcB_SHAMT); - constexpr uint config_mask = ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK; - cfg_reg_rmw_tensix(config_data); - - } else if((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])){ - cfg_reg_rmw_tensix((uint)unpack_dst_format[new_srca_operand_id]); - } else if((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])){ - cfg_reg_rmw_tensix((uint)unpack_dst_format[new_srcb_operand_id]); +template +inline void _llk_math_release_tile_() { + if constexpr (mail2math) { + semaphore_get(semaphore::UNPACK_OPERAND_SYNC); } } -inline void llk_math_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) { - std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); - std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); - - uint config_data = (unpack_dst_format[new_srca_operand_id] << ALU_FORMAT_SPEC_REG0_SrcA_SHAMT) | (unpack_dst_format[new_srcb_operand_id] << ALU_FORMAT_SPEC_REG1_SrcB_SHAMT); - constexpr uint config_mask = ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK; - cfg_reg_rmw_tensix(config_data); +inline void _llk_math_debug_dump_(std::uint8_t *data, std::uint32_t byte_size) { + 
debug_dump(data, byte_size); } -inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) { - std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); - std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); - - if((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])){ - cfg_reg_rmw_tensix((uint)unpack_dst_format[new_srca_operand_id]); - } +inline void _llk_math_debug_dump_seek_(std::uint8_t offset) { + debug_dump_seek(offset); } -inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_new_operand) { - std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); - cfg_reg_rmw_tensix((uint)unpack_dst_format[new_srca_operand_id]); +inline void _llk_math_reconfig_data_format_srca_(const std::uint32_t srca_data_format) { + cfg_reg_rmw_tensix(srca_data_format); } -inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { - std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); - std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); - - if((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])){ - cfg_reg_rmw_tensix((uint)unpack_dst_format[new_srcb_operand_id]); - } +inline void _llk_math_reconfig_data_format_srcb_(const std::uint32_t srcb_data_format) { + cfg_reg_rmw_tensix(srcb_data_format); } -inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) { - std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); - cfg_reg_rmw_tensix((uint)unpack_dst_format[new_srcb_operand_id]); +inline void _llk_math_reconfig_data_format_(const std::uint32_t srca_data_format, const std::uint32_t srcb_data_format) { + + uint config_data = (srca_data_format << ALU_FORMAT_SPEC_REG0_SrcA_SHAMT) | (srcb_data_format << ALU_FORMAT_SPEC_REG1_SrcB_SHAMT); + constexpr uint config_mask = 
ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK; + cfg_reg_rmw_tensix(config_data); } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h index dbe1512ee0c..0a70d430497 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h @@ -7,13 +7,11 @@ #include "ckernel_template.h" #include "cmath_common.h" #include "llk_math_common.h" -#include "llk_param_structs.h" using namespace ckernel; // local function declarations inline void eltwise_binary_configure_addrmod(); -inline void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0); template inline void eltwise_binary_reuse_dest_as_src() { @@ -32,16 +30,15 @@ template < int NUM_FIDELITY_PHASES = 0, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, bool is_fp32_dest_acc_en = false> -inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const std::uint32_t num_faces_b, uint dst_index, const bool clear_fp32_dst_acc) { +inline void _llk_math_eltwise_binary_(const std::uint32_t num_faces, uint dst_index, const bool clear_fp32_dst_acc) { + constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0); constexpr uint32_t ZERO_ACC_MODE = p_zeroacc::CLR_16; - // Todo: do something with num_faces_a, num_faces_b - if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) { math::set_dst_write_addr(math_sync_tile_dst_index); - if constexpr (eltwise_binary_type == ELWMUL) { + if constexpr (eltwise_binary_type == ELWMUL) { if (is_fp32_dest_acc_en && clear_fp32_dst_acc) { #pragma GCC unroll 0 for (std::uint32_t i = 0; i < 8; i++) { @@ -59,7 +56,7 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const (Dst == DstSync::SyncTile2)), "Dst clear in DstSync::SyncTile16 or DstSync::SyncTile2 dst sync mode is not supported!"); /* - if 
(clear_dest_acc) { + if (clear_dest_acc) { if constexpr (is_fp32_dest_acc_en) { #pragma GCC unroll 0 for(std::uint32_t i = 0; i < 8; i++) { @@ -85,13 +82,13 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const #pragma GCC unroll 0 for (std::uint32_t n = 0; n < outerloop; n++) { // N-num faces eltwise_binary_reuse_dest_as_src(); - ckernel_template::run(instrn_buffer); + ckernel_template::run(instrn_buffer); } TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0); #pragma GCC unroll 0 for (std::uint32_t n = 0; n < outerloop; n++) { // N-num faces eltwise_binary_reuse_dest_as_src(); - ckernel_template::run(instrn_buffer); + ckernel_template::run(instrn_buffer); } TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0); } else { @@ -99,8 +96,8 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const #pragma GCC unroll 0 for (std::uint32_t n = 0; n < outerloop; n++) { // N-num faces eltwise_binary_reuse_dest_as_src(); - ckernel_template::run(instrn_buffer); - } + ckernel_template::run(instrn_buffer); + } // Manually clear B once mop is done for scaler bcast if constexpr (src_b_bcast_type == BroadcastType::SCALAR) { TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, p_setrwc::SET_D); @@ -135,8 +132,8 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const TT_ZEROACC(ZERO_ACC_MODE, ADDR_MOD_1, ((get_dest_buffer_base() >> 4) + (dst_index << 2)) + (0 + n)); // Clear faces 0 & 1 } } - ckernel_template::run(instrn_buffer); - } + ckernel_template::run(instrn_buffer); + } } TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0); if constexpr (high_fidelity) { @@ -151,7 +148,7 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const TT_ZEROACC(ZERO_ACC_MODE, ADDR_MOD_1, ((get_dest_buffer_base() >> 4) + (dst_index << 2)) + (2 + n)); // Clear faces 2 & 3 } } - ckernel_template::run(instrn_buffer); + ckernel_template::run(instrn_buffer); } } else { #pragma GCC unroll 0 @@ -165,16 +162,16 @@ inline void 
llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const TT_ZEROACC(ZERO_ACC_MODE, ADDR_MOD_1, ((get_dest_buffer_base() >> 4) + (dst_index << 2)) + (2 + n)); // Clear faces 2 & 3 } } - ckernel_template::run(instrn_buffer); - } + ckernel_template::run(instrn_buffer); + } } TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0); } else { // Row and no broadcasted behaves similarly - constexpr uint32_t outerloop = (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE) ? 4 : 1; + const uint32_t outerloop = (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE) ? num_faces : 1; if constexpr (high_fidelity) { #pragma GCC unroll 0 - for (std::uint32_t n = 0; n < 4; n++) { // N-num faces + for (std::uint32_t n = 0; n < num_faces; n++) { // N-num faces eltwise_binary_reuse_dest_as_src(); if constexpr (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE) { if (is_fp32_dest_acc_en && clear_fp32_dst_acc) { @@ -198,9 +195,9 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const TT_ZEROACC(ZERO_ACC_MODE, ADDR_MOD_1, ((get_dest_buffer_base() >> 4) + (dst_index << 2)) + n); } } - ckernel_template::run(instrn_buffer); + ckernel_template::run(instrn_buffer); + } } - } if constexpr (src_b_bcast_type == BroadcastType::SCALAR) { TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, p_setrwc::SET_D); } @@ -211,33 +208,6 @@ inline void llk_math_eltwise_binary_impl(const std::uint32_t num_faces_a, const math::clear_dst_reg_addr(); } -template < - EltwiseBinaryType eltwise_binary_type, - BroadcastType src_b_bcast_type, - DstSync Dst = DstSync::SyncFull, - int NUM_FIDELITY_PHASES = 0, - EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, - bool is_fp32_dest_acc_en = false> -inline void llk_math_eltwise_binary(uint dst_index, const bool clear_fp32_dst_acc = true) { - llk_math_eltwise_binary_impl(4, 4, dst_index, clear_fp32_dst_acc); -} - -template < - EltwiseBinaryType eltwise_binary_type, - BroadcastType src_b_bcast_type, - DstSync Dst = 
DstSync::SyncFull, - int NUM_FIDELITY_PHASES = 0, - EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, - bool is_fp32_dest_acc_en = false> -inline void llk_math_eltwise_binary(const std::uint32_t operand_A, const std::uint32_t operand_B, uint dst_index, const bool clear_fp32_dst_acc = true) { - const std::uint32_t id_A = get_operand_id(operand_A); - const std::uint32_t id_B = get_operand_id(operand_B); - - const std::uint32_t num_faces_A = get_num_faces(id_A); - const std::uint32_t num_faces_B = get_num_faces(id_B); - - llk_math_eltwise_binary_impl(num_faces_A, num_faces_B, dst_index, clear_fp32_dst_acc); -} template inline void eltwise_binary_configure_addrmod() { @@ -287,11 +257,11 @@ template < BroadcastType bcast_type, int NUM_FIDELITY_PHASES = 0, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> -inline void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0, const std::uint32_t operand_id = 0) { +inline void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0, const std::uint32_t num_faces = 4) { constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0); const uint addr_mod = ADDR_MOD_0; constexpr uint innerloop = 16 >> 3; // 8 rows per eltwise op at a time. 
- uint outerloop = get_num_faces(operand_id); + uint outerloop = num_faces; auto broadcast_type = p_elwise::SRCB_NO_BCAST; if constexpr (bcast_type == BroadcastType::COL) { // The mop only runs for 2 outer loops and mop is called twice for col broadcast @@ -362,13 +332,13 @@ template < BroadcastType src_b_bcast_type, int NUM_FIDELITY_PHASES = 0, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> -inline void llk_math_eltwise_binary_init_impl(const std::uint32_t operand_id, const std::uint32_t transpose, const std::uint32_t acc_to_dest) { - // todo: do something with num_faces +inline void _llk_math_eltwise_binary_init_(const std::uint32_t num_faces, const std::uint32_t transpose, const std::uint32_t acc_to_dest) { + eltwise_binary_configure_addrmod(); if constexpr ( (eltwise_binary_type == ELWADD) || (eltwise_binary_type == ELWSUB) || (eltwise_binary_type == ELWMUL)) { - eltwise_binary_configure_mop(acc_to_dest, operand_id); + eltwise_binary_configure_mop(acc_to_dest, num_faces); } else { FWASSERT("Unsupported op!", false); } @@ -377,24 +347,3 @@ inline void llk_math_eltwise_binary_init_impl(const std::uint32_t operand_id, co math::reset_counters(p_setrwc::SET_ABD_F); } - -// Version with no operand -template < - EltwiseBinaryType eltwise_binary_type, - BroadcastType src_b_bcast_type, - int NUM_FIDELITY_PHASES = 0, - EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> -inline void llk_math_eltwise_binary_init(const std::uint32_t transpose=0, const std::uint32_t acc_to_dest = 0) { - llk_math_eltwise_binary_init_impl(0, transpose, acc_to_dest); -} - -// Version with operands -template < - EltwiseBinaryType eltwise_binary_type, - BroadcastType src_b_bcast_type, - int NUM_FIDELITY_PHASES = 0, - EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> -inline void llk_math_eltwise_binary_init_with_operands(const std::uint32_t operand_A, const std::uint32_t operand_B, const std::uint32_t 
transpose=0, const std::uint32_t acc_to_dest = 0) { - const std::uint32_t operand_id = get_operand_id(operand_A); // operand_id is used to extract tile dim data which is the same for both operands - llk_math_eltwise_binary_init_impl(operand_id, transpose, acc_to_dest); -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h new file mode 100644 index 00000000000..9e23dab17f2 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h @@ -0,0 +1,119 @@ +/* + * SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 +*/ + +#pragma once + +#include "ckernel_include.h" +#include "ckernel_template.h" +#include + +#include "cmath_common.h" +#include "llk_math_common.h" +#include "ckernel_globals.h" +#include "ckernel_sfpu.h" + +using namespace ckernel; +// local function declarations +template +inline void eltwise_binary_sfpu_configure_addrmod(){ + // NOTE: this kernel is typically used in conjunction with + // A2D, which is using ADDR_MOD_0 and ADDR_MOD_2, so use one + // that doesn't conflict! + + addr_mod_t{ + .srca = {.incr = 0}, + .srcb = {.incr = 0}, + .dest = {.incr = 0}, + }.set(ADDR_MOD_7); + +} +inline void eltwise_binary_sfpu_configure_mop(); + +template +inline void _llk_math_eltwise_binary_sfpu_( + const uint face_r_dim, + const uint num_faces, + uint dst_index_a, + uint dst_index_b, + int vector_mode = (int)Dim::RC, + uint param0 = 0, + uint param1 = 0, + uint param2 = 0, + uint param3 = 0, + uint param4 = 0, + uint param5 = 0) { + constexpr int ITERATIONS = 8; + uint dst_index = (dst_index_a <= dst_index_b) ? dst_index_a : dst_index_b; + param0 = (dst_index_a > dst_index_b) ? 
dst_index_a-dst_index_b : dst_index_b-dst_index_a; + if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) { + math::set_dst_write_addr(math_sync_tile_dst_index); + } else { + math::set_dst_write_addr(dst_index); + } + math::set_addr_mod_base(); + TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH); + if (vector_mode == (int)Dim::R) { + // Do a row vector, Face0 + Face1 -- first iteration (first row) + const int iterations = (num_faces < 4) ? + ((face_r_dim <= 2) ? 2 : face_r_dim/2) : 2; // At least 2 iterations for odd and even columns +#pragma GCC unroll 0 + for (int face = 0; face < 2; face++) { + sfpu::calculate_sfpu(iterations, param0, param1, param2, param3, param4, param5); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + } + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + } else if (vector_mode == (int)Dim::C) { + // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for full face +#pragma GCC unroll 0 + for (int face = 0; face < 2; face++) { + sfpu::calculate_sfpu(ITERATIONS, param0, param1, param2, param3, param4, param5); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + if (num_faces>2) { // Skip next 2 faces if tile is 32x32 + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + } + } + if (num_faces<=2) { + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 
8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + } + } else { + // Do all four faces, and iterate through all 4 blocks of 4 rows each +#pragma GCC unroll 0 + for (int face = 0; face < 4; face++) { + sfpu::calculate_sfpu(ITERATIONS, param0, param1, param2, param3, param4, param5); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + } + } + math::clear_dst_reg_addr(); + + TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::WAIT_SFPU); + math::clear_addr_mod_base(); +} + +template +inline void _llk_math_eltwise_binary_sfpu_init_( + uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { + eltwise_binary_sfpu_configure_addrmod< sfpu_op >(); + if constexpr (sfpu_op == SfpuType::quant_int32) { + sfpu::sfpu_init(sfpu_op, param0); + } else if constexpr (sfpu_op == SfpuType::requant_int32) { + sfpu::sfpu_init(sfpu_op, param0); + } else if constexpr (sfpu_op == SfpuType::dequant_int32) { + sfpu::sfpu_init(sfpu_op, param0); + } else { + sfpu::sfpu_init(sfpu_op); + } + math::reset_counters(p_setrwc::SET_ABD_F); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h index aede5275e89..f26d2ca3f46 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_param_structs.h" #include "ckernel_include.h" #include "ckernel_template.h" @@ -17,35 +16,43 @@ using namespace ckernel; // local function declarations inline void eltwise_unary_configure_addrmod(); -template -inline void llk_math_eltwise_unary_datacopy(uint 
dst_index) { - if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) { - math::set_dst_write_addr(math_sync_tile_dst_index); +template +inline void _llk_math_eltwise_unary_datacopy_(const std::uint32_t dst_index, const std::uint32_t src_format, const std::uint32_t dst_format) { + + if (unpack_to_dest && math::is_32bit_input(src_format, dst_format)) { + math_unpack_to_dest_math_ready(); + math::set_dst_write_addr(dst_index); + math::math_unpack_to_dest_tile_ready(); } else { - math::set_dst_write_addr(dst_index); - } - if constexpr (type == A2D) { - ckernel_template::run(instrn_buffer); - } else if constexpr (type == B2D) { - if constexpr (src_b_bcast_type == BroadcastType::SCALAR) { - // Manually clear B once mop is done - ckernel_template::run(instrn_buffer); - TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0); - } else if constexpr (src_b_bcast_type == BroadcastType::COL) { - // Mop for col broadcast only does 2 outerloops. Needs to clear B manually and call twice - ckernel_template::run(instrn_buffer); - TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0); - ckernel_template::run(instrn_buffer); - TTI_SETRWC(p_setrwc::CLR_AB, 0, 0, 0, 0, 0); + if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) { + math::set_dst_write_addr(math_sync_tile_dst_index); } else { + math::set_dst_write_addr(dst_index); + } + + if constexpr (type == A2D) { ckernel_template::run(instrn_buffer); + } else if constexpr (type == B2D) { + if constexpr (src_b_bcast_type == BroadcastType::SCALAR) { + // Manually clear B once mop is done + ckernel_template::run(instrn_buffer); + TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0); + } else if constexpr (src_b_bcast_type == BroadcastType::COL) { + // Mop for col broadcast only does 2 outerloops. 
Needs to clear B manually and call twice + ckernel_template::run(instrn_buffer); + TTI_SETRWC(p_setrwc::CLR_B, 0, 0, 0, 0, 0); + ckernel_template::run(instrn_buffer); + TTI_SETRWC(p_setrwc::CLR_AB, 0, 0, 0, 0, 0); + } else { + ckernel_template::run(instrn_buffer); + } + } else { + FWASSERT("Unsupported op!", false); } - } else { - FWASSERT("Unsupported op!", false); - } - math::clear_dst_reg_addr(); + math::clear_dst_reg_addr(); + } } template @@ -102,11 +109,10 @@ inline void eltwise_unary_configure_addrmod() { } template -inline void eltwise_unary_configure_mop(uint rows_per_inst, uint total_rows, const uint operand_id) { +inline void eltwise_unary_configure_mop(uint rows_per_inst, uint total_rows, const uint num_faces) { // always move 32x32 tile, packed as 16x16x4 if constexpr (type == A2D) { - const std::uint32_t num_faces = get_num_faces(operand_id); uint addr_mod = (rows_per_inst == p_mova2d::MOV_1_ROW) ? ADDR_MOD_0 : ADDR_MOD_2; uint innerloop = (rows_per_inst == p_mova2d::MOV_1_ROW) ? 
total_rows : (total_rows >> 3); uint outerloop = num_faces; @@ -160,15 +166,14 @@ inline void eltwise_unary_configure_mop(uint rows_per_inst, uint total_rows, con template // within_face_16x16_transpose is used by unpacker, math does not transpose -inline void llk_math_eltwise_unary_datacopy_init(const std::uint32_t transpose_of_faces=0 /*unused*/, const std::uint32_t within_face_16x16_transpose=0 /* unused */, const std::uint32_t operand = 0) { - const std::uint32_t operand_id = get_operand_id(operand); +inline void _llk_math_eltwise_unary_datacopy_init_(const std::uint32_t transpose_of_faces=0 /*unused*/, const std::uint32_t within_face_16x16_transpose=0 /* unused */, const std::uint32_t num_faces = 4) { eltwise_unary_configure_addrmod(); if constexpr (type == A2D) { - eltwise_unary_configure_mop(p_mova2d::MOV_8_ROWS, 16, operand_id); + eltwise_unary_configure_mop(p_mova2d::MOV_8_ROWS, 16, num_faces); } else if constexpr (type == B2D) { - eltwise_unary_configure_mop(p_movb2d::MOV_4_ROWS, 16, operand_id); + eltwise_unary_configure_mop(p_movb2d::MOV_4_ROWS, 16, num_faces); } else { FWASSERT("Unsupported op!", false); } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h index d781a4160dc..3f83bb707b0 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h @@ -2,16 +2,12 @@ // // SPDX-License-Identifier: Apache-2.0 -#pragma once -#include "llk_param_structs.h" - #include "ckernel_include.h" #include "ckernel_template.h" #include #include "cmath_common.h" #include "llk_math_common.h" -#include "llk_format_conversions.h" #include "ckernel_globals.h" #include "ckernel_sfpi.h" @@ -159,24 +155,3 @@ template inline void llk_math_eltwise_unary_sfpi_test19(uint dst_index) { llk_math_eltwise_unary_sfpi(dst_index); } - -//Logical Not -template -inline void 
llk_math_eltwise_unary_sfpi_logical_not(uint dst_index) { - llk_math_eltwise_unary_sfpi(dst_index); -} - -inline void llk_math_eltwise_unary_sfpi_logical_not_init() { - llk_math_eltwise_unary_sfpi_init(); -} - -//Bitwise Complement -template -inline void llk_math_eltwise_unary_sfpi_bitwise_complement(uint dst_index) { - llk_math_eltwise_unary_sfpi(dst_index); -} - - -inline void llk_math_eltwise_unary_sfpi_bitwise_complement_init() { - llk_math_eltwise_unary_sfpi_init(); -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h index e695f53e2bd..ccd0dc293ff 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h @@ -3,20 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_param_structs.h" - #include "ckernel_include.h" #include "ckernel_template.h" #include #include "cmath_common.h" #include "llk_math_common.h" -#include "llk_format_conversions.h" #include "ckernel_globals.h" #include "ckernel_sfpu.h" -namespace ckernel { - +using namespace ckernel; template void static_assert_sfpu_type_dependent() { static_assert(sfpu_type == SfpuType::unused, "sfpu_type exception"); @@ -37,16 +33,20 @@ inline void eltwise_unary_sfpu_configure_addrmod(){ } inline void eltwise_unary_sfpu_configure_mop(); -template -inline void llk_math_eltwise_unary_sfpu( +template +inline void _llk_math_eltwise_unary_sfpu_( + const uint face_r_dim, + const uint num_faces, uint dst_index, - int vector_mode = Dim::RC, + int vector_mode = (int)Dim::RC, uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { + + constexpr int ITERATIONS = 8; if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) { math::set_dst_write_addr(math_sync_tile_dst_index); } else { @@ -54,25 +54,33 @@ inline void 
llk_math_eltwise_unary_sfpu( } math::set_addr_mod_base(); TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH); - if (vector_mode == Dim::R) { + if (vector_mode == (int)Dim::R) { // Do a row vector, Face0 + Face1 -- first iteration (first row) - const int ITERATIONS = 1; + const int iterations = (num_faces < 4) ? + ((face_r_dim <= 2) ? 2 : face_r_dim/2) : 2; // At least 2 iterations for odd and even columns #pragma GCC unroll 0 for (int face = 0; face < 2; face++) { - sfpu::calculate_sfpu(param0, param1, param2, param3, param4, param5); + sfpu::calculate_sfpu(iterations, param0, param1, param2, param3, param4, param5); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); } - // Skip the next 2 faces TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } else if (vector_mode == Dim::C) { - // Do a column vector, Face0 + Face2 -- All iterations for full face + } else if (vector_mode == (int)Dim::C) { + // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for full face #pragma GCC unroll 0 for (int face = 0; face < 2; face++) { - sfpu::calculate_sfpu(param0, param1, param2, param3, param4, param5); + sfpu::calculate_sfpu(ITERATIONS, param0, param1, param2, param3, param4, param5); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + if (num_faces>2) { // Skip next 2 faces if tile is 32x32 + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); + } + } + if (num_faces<=2) { 
TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); @@ -82,7 +90,7 @@ inline void llk_math_eltwise_unary_sfpu( // Do all four faces, and iterate through all 4 blocks of 4 rows each #pragma GCC unroll 0 for (int face = 0; face < 4; face++) { - sfpu::calculate_sfpu(param0, param1, param2, param3, param4, param5); + sfpu::calculate_sfpu(ITERATIONS, param0, param1, param2, param3, param4, param5); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); } @@ -94,7 +102,7 @@ inline void llk_math_eltwise_unary_sfpu( } template -inline void llk_math_eltwise_unary_sfpu_init( +inline void _llk_math_eltwise_unary_sfpu_init_( uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { eltwise_unary_sfpu_configure_addrmod< sfpu_op >(); if constexpr (sfpu_op == SfpuType::dropout) { @@ -104,304 +112,3 @@ inline void llk_math_eltwise_unary_sfpu_init( } math::reset_counters(p_setrwc::SET_ABD_F); } - -// New LLK SFPU APIs -template -inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_rsqrt_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -template -inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_log_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -template -inline void llk_math_eltwise_unary_sfpu_log_with_base(uint dst_index,uint base_scale) { - llk_math_eltwise_unary_sfpu(dst_index,base_scale); -} - -template -inline void llk_math_eltwise_unary_sfpu_log_with_base_init() { - llk_math_eltwise_unary_sfpu_init(); -} - 
-template -inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_signbit(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_signbit_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -template -inline void llk_math_eltwise_unary_sfpu_tanh_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//sign -template -inline void llk_math_eltwise_unary_sfpu_sign(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_sign_init() { - llk_math_eltwise_unary_sfpu_init(); -} -template -inline void llk_math_eltwise_unary_sfpu_dropout(uint dst_index, int vector_mode, int integer_dropout, int scale_factor) { - constexpr bool dont_care = false; - llk_math_eltwise_unary_sfpu(dst_index, vector_mode, integer_dropout, scale_factor); -} - -inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) { - constexpr bool dont_care = false; - constexpr uint dont_care_param = 0; - - llk_math_eltwise_unary_sfpu_init(dont_care_param, dont_care_param, seed); -} - -template -inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_sigmoid_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//EQZ -template -inline void llk_math_eltwise_unary_sfpu_eqz(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_eqz_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//NEZ -template -inline void llk_math_eltwise_unary_sfpu_nez(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_nez_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//LTZ -template -inline void 
llk_math_eltwise_unary_sfpu_ltz(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_ltz_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//GTZ -template -inline void llk_math_eltwise_unary_sfpu_gtz(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_gtz_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//LEZ -template -inline void llk_math_eltwise_unary_sfpu_lez(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_lez_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//GEZ -template -inline void llk_math_eltwise_unary_sfpu_gez(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_gez_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -template -inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_max_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -template -inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_square_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -template -inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode, pow); -} - -template -inline void llk_math_eltwise_unary_sfpu_power_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -template -inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_abs_init() { - llk_math_eltwise_unary_sfpu_init(); -} 
- -template -inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//EXP2 -template -inline void llk_math_eltwise_unary_sfpu_exp2(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_exp2_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//heaviside -template -inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index,vector_mode,param0); -} - -template -inline void llk_math_eltwise_unary_sfpu_heaviside_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//EXPM1 -template -inline void llk_math_eltwise_unary_sfpu_expm1(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_expm1_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//Asin -template -inline void llk_math_eltwise_unary_sfpu_asin(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_asin_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//Atan -template -inline void llk_math_eltwise_unary_sfpu_atan(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_atan_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//Acos -template -inline void llk_math_eltwise_unary_sfpu_acos(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_acos_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -//silu -template -inline void llk_math_eltwise_unary_sfpu_silu(uint dst_index) { - llk_math_eltwise_unary_sfpu(dst_index); -} - -template -inline void llk_math_eltwise_unary_sfpu_silu_init() { - 
llk_math_eltwise_unary_sfpu_init(); -} - -//Mask -template -inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_mask_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -// Negative -template -inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index,vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_negative_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h index 24e8738da78..5ebaefe0d96 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h @@ -3,8 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_param_structs.h" - #include "ckernel_include.h" #include "ckernel_template.h" @@ -17,17 +15,14 @@ using namespace ckernel; -template -inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_id, const std::uint32_t in1_id) { +template +inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false) { constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0); - const bool is_in0_16x32 = (math_tile_dims[in0_id][TileDim::R_IDX]<=FACE_R_DIM) && (math_tile_dims[in0_id][TileDim::C_IDX]> FACE_C_DIM); - const bool is_in0_32x16 = (math_tile_dims[in0_id][TileDim::R_IDX]> FACE_R_DIM) && 
(math_tile_dims[in0_id][TileDim::C_IDX]<=FACE_C_DIM); - const bool is_in1_32x16 = (math_tile_dims[in1_id][TileDim::R_IDX]> FACE_R_DIM) && (math_tile_dims[in1_id][TileDim::C_IDX]<=FACE_C_DIM); - const bool partial_face = get_partial_face(in0_id); - - static_assert(FaceLayout == DstTileFaceLayout::RowMajor, "FaceLayout must be RowMajor"); + const bool is_in0_16x32 = (in0_tile_r_dim <=FACE_R_DIM) && (in0_tile_c_dim > FACE_C_DIM); + const bool is_in0_32x16 = (in0_tile_r_dim > FACE_R_DIM) && (in0_tile_c_dim <= FACE_C_DIM); + const bool is_in1_32x16 = (in1_tile_r_dim > FACE_R_DIM) && (in1_tile_c_dim <= FACE_C_DIM); // MVMUL does D = B*A @@ -68,12 +63,21 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c const uint8_t dest_increment = transpose == false ? 8 : 24; if (is_in0_16x32||is_in0_32x16) { - addr_mod_t{ - .srca = {.incr = 16, .clr = 0, .cr = 0}, - .srcb = {.incr = 0, .clr = 0, .cr = 1}, // cr=16 before - .dest = {.incr = 8, .clr = 0, .cr = 0}, + if (transpose) { + addr_mod_t{ + .srca = {.incr = 32, .clr = 0, .cr = 0}, + .srcb = {.incr = 0, .clr = 0, .cr = 1}, // cr=16 before + .dest = {.incr = 8, .clr = 0, .cr = 0}, + } + .set(ADDR_MOD_1); + } else { + addr_mod_t{ + .srca = {.incr = 16, .clr = 0, .cr = 0}, + .srcb = {.incr = 0, .clr = 0, .cr = 1}, // cr=16 before + .dest = {.incr = 8, .clr = 0, .cr = 0}, + } + .set(ADDR_MOD_1); } - .set(ADDR_MOD_1); } else { if (is_in1_32x16) { addr_mod_t{ @@ -111,20 +115,39 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c .set(ADDR_MOD_2); } else if (is_in0_16x32||is_in0_32x16) { if (partial_face) { - addr_mod_t{ - .srca = {.incr = 16, .clr = 0, .cr = 0}, - .srcb = {.incr = 0, .clr = 0, .cr = 0}, - .dest = {.incr = 16, .clr = 0, .cr = 0}, - .bias = {.incr = 1}, + if (transpose) { + addr_mod_t{ + .srca = {.incr = 32, .clr = 0, .cr = 0}, + .srcb = {.incr = 0, .clr = 0, .cr = 0}, + .dest = {.incr = 16, .clr = 0, .cr = 0}, + .bias = {.incr = 1}, + } + 
.set(ADDR_MOD_2); + } else { + addr_mod_t{ + .srca = {.incr = 16, .clr = 0, .cr = 0}, + .srcb = {.incr = 0, .clr = 0, .cr = 0}, + .dest = {.incr = 16, .clr = 0, .cr = 0}, + .bias = {.incr = 1}, + } + .set(ADDR_MOD_2); } - .set(ADDR_MOD_2); } else { - addr_mod_t{ - .srca = {.incr = 16, .clr = 0, .cr = 0}, - .srcb = {.incr = 0, .clr = 0, .cr = 1}, - .dest = {.incr = 8, .clr = 0, .cr = 0}, + if (transpose) { + addr_mod_t{ + .srca = {.incr = 32, .clr = 0, .cr = 0}, + .srcb = {.incr = 0, .clr = 0, .cr = 1}, + .dest = {.incr = 8, .clr = 0, .cr = 0}, + } + .set(ADDR_MOD_2); + } else { + addr_mod_t{ + .srca = {.incr = 16, .clr = 0, .cr = 0}, + .srcb = {.incr = 0, .clr = 0, .cr = 1}, + .dest = {.incr = 8, .clr = 0, .cr = 0}, + } + .set(ADDR_MOD_2); } - .set(ADDR_MOD_2); } } else { addr_mod_t{ @@ -137,21 +160,41 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c if (is_in0_16x32) { if (partial_face) { - addr_mod_t{ - .srca = {.incr =16, .clr = 0, .cr = 0}, - .srcb = {.incr =16, .clr = 0, .cr = 0}, - .dest = {.incr =0 , .clr = 1, .cr = 0}, - .bias = {.incr = 1}, + if (transpose) { + addr_mod_t{ + .srca = {.incr =16, .clr = 0, .cr = 1}, //srca=16 + .srcb = {.incr =16, .clr = 0, .cr = 0}, + .dest = {.incr =0 , .clr = 1, .cr = 0}, + .bias = {.incr = 1}, + } + .set(ADDR_MOD_4); + } else { + addr_mod_t{ + .srca = {.incr =16, .clr = 0, .cr = 0}, + .srcb = {.incr =16, .clr = 0, .cr = 0}, + .dest = {.incr =0 , .clr = 1, .cr = 0}, + .bias = {.incr = 1}, + } + .set(ADDR_MOD_4); } - .set(ADDR_MOD_4); } else { - addr_mod_t{ - .srca = {.incr =16, .clr = 0, .cr = 0}, - .srcb = {.incr =16, .clr = 0, .cr = 1}, - .dest = {.incr = 0, .clr = 0, .cr = 1}, - .bias = {.incr = 1}, + if (transpose) { + addr_mod_t{ + .srca = {.incr =16, .clr = 0, .cr = 1}, //srca=16 + .srcb = {.incr =16, .clr = 0, .cr = 1}, + .dest = {.incr = 0, .clr = 0, .cr = 1}, + .bias = {.incr = 1}, + } + .set(ADDR_MOD_4); + } else { + addr_mod_t{ + .srca = {.incr =16, .clr = 0, .cr = 0}, + .srcb 
= {.incr =16, .clr = 0, .cr = 1}, + .dest = {.incr = 0, .clr = 0, .cr = 1}, + .bias = {.incr = 1}, + } + .set(ADDR_MOD_4); } - .set(ADDR_MOD_4); } } else if (is_in0_32x16) { addr_mod_t{ @@ -192,8 +235,8 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c } -template -inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_id, const std::uint32_t in1_id) { +template +inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false) { // in0 - loaded to SrcB // in1 - loaded to SrcA @@ -208,12 +251,11 @@ inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, con const bool reuse_a = ct_dim>=rt_dim; const std::uint32_t t_dim = reuse_a ? 
rt_dim : ct_dim; - const bool is_in0_16x32 = (math_tile_dims[in0_id][TileDim::R_IDX]<=FACE_R_DIM) && (math_tile_dims[in0_id][TileDim::C_IDX]> FACE_C_DIM); - const bool is_in1_32x16 = (math_tile_dims[in1_id][TileDim::R_IDX]> FACE_R_DIM) && (math_tile_dims[in1_id][TileDim::C_IDX]<=FACE_C_DIM); - const bool is_in0_32x16 = (math_tile_dims[in0_id][TileDim::R_IDX]> FACE_R_DIM) && (math_tile_dims[in0_id][TileDim::C_IDX]<=FACE_C_DIM); - const bool is_in0_16x16 = (math_tile_dims[in0_id][TileDim::R_IDX]<=FACE_R_DIM) && (math_tile_dims[in0_id][TileDim::C_IDX]<=FACE_C_DIM); - const bool is_in1_16x16 = (math_tile_dims[in1_id][TileDim::R_IDX]<=FACE_R_DIM) && (math_tile_dims[in1_id][TileDim::C_IDX]<=FACE_C_DIM); - const bool partial_face = get_partial_face(in0_id); + const bool is_in0_16x32 = (in0_tile_r_dim <=FACE_R_DIM) && (in0_tile_c_dim > FACE_C_DIM); + const bool is_in1_32x16 = (in1_tile_r_dim > FACE_R_DIM) && (in1_tile_c_dim <= FACE_C_DIM); + const bool is_in0_32x16 = (in0_tile_r_dim > FACE_R_DIM) && (in0_tile_c_dim <= FACE_C_DIM); + const bool is_in0_16x16 = (in0_tile_r_dim <= FACE_R_DIM) && (in0_tile_c_dim <= FACE_C_DIM); + const bool is_in1_16x16 = (in1_tile_r_dim <= FACE_R_DIM) && (in1_tile_c_dim <= FACE_C_DIM); const std::uint32_t replay_buf_len = (is_in0_16x16 || is_in1_16x16) ? (partial_face ? 2 : 4) : ((is_in0_16x32 || is_in1_32x16 || is_in0_32x16) ? (partial_face ? 
4 : 8) : 16); @@ -243,14 +285,14 @@ inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, con TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_3, 0); // B3A1 // srca=srca, srcb+=8, dest+=8, bias=1 } else if (is_in0_16x32 || is_in0_32x16) { if (partial_face) { - TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B0A0 // srca+=16, srcb=0, dest=+16, bias = 1 - TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A1 // srca+=16, srcb+=16, dest=0 (addr_mod_4), bias=0 - TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B1A2 // srca+=16, srcb=0, dest=+16, bias = 1 + TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B0A0 // srca+=16/32, srcb=0, dest=+16, bias = 1, // srca+=32 if transposed + TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A1 // srca+=16/=16, srcb+=16, dest=0 (addr_mod_4), bias=0, // srca=16 if transposed + TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B1A2 // srca+=16/32, srcb=0, dest=+16, bias = 1 // srca+=32 if transposed } else { TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A0 // srca=srca, srcb+=8, dest+=8 - TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B0A0 // srca+=16, srcb=0, dest+=8 + TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_2, 0); // B0A0 // srca+=16/32, srcb=0, dest+=8 // srca+=32 if transposed TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_3, 0); // B0A1 // srca=srca, srcb+=8, dest+=8, bias=1 - TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A1 // srca+=16/=0, srcb=16, dest=0/+=8 (addr_mod_4), bias=0 // srca=0 dest+=8 if in0_32x16 + TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B0A1 // srca+=16/=0/=16, srcb=16, dest=0/+=8 (addr_mod_4), bias=0 // srca=0 dest+=8 if in0_32x16, srca=16 if transposed TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_0, 0); // B1A2 // srca=srca, srcb+=8, dest+=8 TTI_MVMUL(p_setrwc::CLR_NONE, 0, ADDR_MOD_1, 0); // B1A2 // srca+=16, srcb=16, dest+=8/24 // dest+=24 if transposed @@ -316,14 +358,10 @@ inline void matmul_configure_mop(bool transpose, const std::uint32_t 
ct_dim, con tmp.program(instrn_buffer); } -template -inline void llk_math_matmul_init(const std::uint32_t operandA, const std::uint32_t operandB, const std::uint32_t transpose=0, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) { - - const std::uint32_t operandA_id = get_operand_id(operandA); - const std::uint32_t operandB_id = get_operand_id(operandB); - - matmul_configure_addrmod(transpose, ct_dim, rt_dim, kt_dim, operandA_id, operandB_id); +template +inline void _llk_math_matmul_init_(const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false, const std::uint32_t transpose=0, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) { + matmul_configure_addrmod(transpose, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face); const bool reuse_a = ct_dim>=rt_dim; const std::uint32_t t_dim = reuse_a ? rt_dim : ct_dim; if (t_dim>1) { @@ -336,12 +374,12 @@ inline void llk_math_matmul_init(const std::uint32_t operandA, const std::uint32 TTI_SETC16(CLR_DVALID_SrcA_Disable_ADDR32, 0); } - matmul_configure_mop(transpose>0, ct_dim, rt_dim, kt_dim, operandA_id, operandB_id); + matmul_configure_mop(transpose>0, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face); math::reset_counters(p_setrwc::SET_ABD_F); } -template -inline void llk_math_matmul(uint dst_index, const bool transpose=false, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) { +template +inline void _llk_math_matmul_(uint dst_index, const bool transpose=false, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) { const bool reuse_a = ct_dim>=rt_dim; const std::uint32_t t_dim = reuse_a ? 
rt_dim : ct_dim; const std::uint32_t rut_dim = reuse_a ? ct_dim : rt_dim; //reuse-dim diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h index 8c126977d12..4c77069f857 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h @@ -3,8 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_param_structs.h" - #include "ckernel_include.h" #include "ckernel_template.h" @@ -20,8 +18,8 @@ inline void reduce_configure_addrmod(); template inline void reduce_configure_mop(); -template -inline void llk_math_reduce(uint dst_index) { +template +inline void _llk_math_reduce_(const uint dst_index) { constexpr bool high_fidelity = num_fidelity_phases > 0 && num_fidelity_phases <= 4; math::set_dst_write_addr(dst_index); if constexpr (dim == ReduceDim::REDUCE_ROW) { @@ -47,6 +45,17 @@ inline void llk_math_reduce(uint dst_index) { } } + // Workaround for https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1948 + if constexpr (is_int_fpu_en) { + TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH); + TTI_SFPLOAD(0, 4, ADDR_MOD_0, 0); + TTI_SFPSTORE(0,5,ADDR_MOD_0,0); + TTI_SFPLOAD(0, 4, ADDR_MOD_0, 2); + TTI_SFPSTORE(0,5,ADDR_MOD_0,2); + TTI_STALLWAIT(p_stall::STALL_MATH, p_stall::WAIT_SFPU); + TTI_SETC16(FP16A_FORCE_Enable_ADDR32, 0x1); + } + // Move back to B and transpose // we avoid clobbering weights in src B by moving to rows 16 - 31 TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 0, 0, 0, p_setrwc::SET_AB); @@ -64,6 +73,9 @@ inline void llk_math_reduce(uint dst_index) { // Note: transpose on src B on works on rows 16 - 31 TTI_TRNSPSRCB; TTI_MOVD2B(0, p_movd2b::SRC_ROW16_OFFSET, ADDR_MOD_0, p_movd2b::MOV_1_ROW, 0); + if constexpr (is_int_fpu_en) { + TTI_SETC16(FP16A_FORCE_Enable_ADDR32, 0x0); + } TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_B, 0, 8, 0, p_setrwc::SET_B); 
TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_B, 0, 8, 0, p_setrwc::SET_B); @@ -102,6 +114,16 @@ inline void llk_math_reduce(uint dst_index) { TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); } } + // Workaround for https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1948 + if constexpr (is_int_fpu_en) { + TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH); + TTI_SFPLOAD(0, 4, ADDR_MOD_0, 0); + TTI_SFPSTORE(0,5,ADDR_MOD_0,0); + TTI_SFPLOAD(0, 4, ADDR_MOD_0, 2); + TTI_SFPSTORE(0,5,ADDR_MOD_0,2); + TTI_STALLWAIT(p_stall::STALL_MATH, p_stall::WAIT_SFPU); + TTI_SETC16(FP16A_FORCE_Enable_ADDR32, 0x1); + } // Move back to B and transpose TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 0, 0, 0, p_setrwc::SET_AB); @@ -119,6 +141,9 @@ inline void llk_math_reduce(uint dst_index) { // Note: transpose on src B on works on rows 16 - 31 TTI_TRNSPSRCB; TTI_MOVD2B(0, p_movd2b::SRC_ROW16_OFFSET, ADDR_MOD_0, p_movd2b::MOV_1_ROW, 0); + if constexpr (is_int_fpu_en) { + TTI_SETC16(FP16A_FORCE_Enable_ADDR32, 0x0); + } TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_B, 0, 8, 0, p_setrwc::SET_B); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_B, 0, 8, 0, p_setrwc::SET_B); @@ -155,6 +180,7 @@ inline void llk_math_reduce(uint dst_index) { // Reset Dest Counter TTI_SETRWC(p_setrwc::CLR_AB, 0, 0, 0, 0, p_setrwc::SET_AD); } + } else if constexpr (dim == ReduceDim::REDUCE_SCALAR) { //fp32 dest unsupported with reduce scalar, must fix zeroacc static_assert(!is_fp32_dest_acc_en); @@ -268,7 +294,7 @@ inline void reduce_configure_mop() { } template -inline void llk_math_reduce_init(const std::uint32_t within_face_16x16_transpose=0) { //within_face_16x16_transpose used for unpack, ignored by math +inline void _llk_math_reduce_init_(const std::uint32_t within_face_16x16_transpose=0) { //within_face_16x16_transpose used for unpack, ignored by math constexpr bool high_fidelity = num_fidelity_phases > 0 && num_fidelity_phases <= 4; diff --git 
a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h index e2a91b56c3f..7df83739dc9 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h @@ -3,9 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_io_pack.h" #include "llk_defs.h" -#include "llk_param_structs.h" #include "ckernel.h" #include "ckernel_template.h" @@ -15,15 +13,11 @@ using namespace ckernel; using namespace ckernel::packer; -template -inline void llk_pack_mop_config(const uint32_t output_id) { - - const uint num_faces = get_num_faces(output_id); - const uint face_r_dim = get_face_r_dim(output_id); - const bool partial_face = get_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]); +template +inline void _llk_pack_configure_addrmod_() { addr_mod_pack_t{ - .y_src = {.incr = untilize ? 0 : 15}, // 4-bit value so max is 15. incadcxy will increment it by 1 + .y_src = {.incr = 15}, // 4-bit value so max is 15. incadcxy will increment it by 1 .y_dst = {.incr = 1}, } .set(ADDR_MOD_0); @@ -44,68 +38,98 @@ inline void llk_pack_mop_config(const uint32_t output_id) { } addr_mod_pack_t{ - .y_src = { .incr = 0, .clr = 0, .cr = 0 }, + .y_src = { .incr = 0, .clr = 1, .cr = 0 }, .y_dst = { .incr = 0, .clr = 0, .cr = 0 }, }.set(ADDR_MOD_2); - const uint PACKCNT = partial_face ? 1 : num_faces; - const uint MEGAROW = 1; - constexpr uint ZERO_OUTPUT_FLAG = zero_output ? p_pacr::P_ZERO_OUTPUT_ENABLED : p_pacr::P_ZERO_OUTPUT_DISABLED; +} +template +inline void _llk_pack_mop_config_(const std::uint32_t pack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const bool partial_face = false, const bool narrow_tile = false) { + static_assert(FaceLayout == DstTileFaceLayout::RowMajor, "FaceLayout must be RowMajor"); + + const uint PACKCNT = (partial_face && IS_BFP_FORMAT(pack_dst_format)) ? 
1 : num_faces; + constexpr uint MEGAROW = 1; + constexpr uint ZERO_OUTPUT_FLAG = zero_output ? p_pacr::P_ZERO_OUTPUT_ENABLED : p_pacr::P_ZERO_OUTPUT_DISABLED; + constexpr uint MOP_INNER_LOOP = 1; - // Write header to l1 if constexpr (!untilize) { - const uint MOP_INNER_LOOP = 1; - const uint MOP_OUTER_LOOP = 1; + constexpr uint MOP_OUTER_LOOP = 1; ckernel::ckernel_template tmp(MOP_OUTER_LOOP, MOP_INNER_LOOP, TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 1)); - if (partial_face) { + if (partial_face && IS_BFP_FORMAT(pack_dst_format)) { tmp.set_start_op(TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0)); // Don't close the tile, point to the next face tmp.set_loop_op0(TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 1, 0)); // Inc ch0_y+=1 (addr_mod_0 will increment by 15) tmp.set_loop_op1(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 1)); // Close the tile } - + // Write header to l1 + if constexpr (write_tile_header) { + tmp.set_end_op(TT_OP_STOREIND( + 1, 0, p_ind::LD_16B, LO_16(0), p_ind::INC_NONE, p_gpr_pack::TILE_HEADER, p_gpr_pack::OUTPUT_ADDR)); + } tmp.program(instrn_buffer); } else { - const bool narrow_tile = get_narrow_tile(output_id); - const uint MOP_UNTILIZE_INNER_LOOP = narrow_tile ? 1 : (FaceLayout == DstTileFaceLayout::ColMajor ? 8 : 4); - const uint MOP_UNTILIZE_OUTER_LOOP = ((face_r_dim == 1) || narrow_tile) ? 1 : face_r_dim / 2; + const uint MOP_OUTER_LOOP = ((face_r_dim == 1) || narrow_tile) ? 
1 : (face_r_dim >> 1); - ckernel::ckernel_template tmp(MOP_UNTILIZE_OUTER_LOOP, MOP_UNTILIZE_INNER_LOOP, TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0)); - if (narrow_tile) { - tmp.set_last_inner_loop_instr(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 1)); // Close the tile and clear the counters + if ((face_r_dim == 1) || narrow_tile) { + ckernel::ckernel_template tmp(MOP_OUTER_LOOP, MOP_INNER_LOOP, TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 1)); + tmp.program(instrn_buffer); } else { + // Inc ch0_y+=1 (addr_mod_0 will increment by 15) + ckernel::ckernel_template tmp(MOP_OUTER_LOOP, MOP_INNER_LOOP, TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 1, 0)); tmp.set_start_op(TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0)); - if (face_r_dim>1) { - tmp.set_loop_op0(TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 4, 0)); // If it's narrow tile (32x16) pack rows back to back otherwise jump between faces - tmp.set_end_op(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0)); - } - tmp.set_last_inner_loop_instr(TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 4, 0)); - tmp.set_last_outer_loop_instr(TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 4, 0)); + tmp.set_end_op(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0)); + tmp.program(instrn_buffer); } - tmp.program(instrn_buffer); } - } -template -inline void llk_pack_hw_configure(const llk_pack_params_t *pack_params) { - configure_pack(get_output_id(pack_params->pack_output), pack_params->relu_config.val); +template +inline void _llk_pack_reconfig_data_format_(const std::uint32_t pack_src_format, const std::uint32_t pack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const bool partial_face = false, const bool narrow_tile = false) { + + reconfig_packer_data_format( + pack_src_format, + pack_dst_format, + tile_size, + 
face_r_dim + ); + + if constexpr (is_tile_dim_reconfig_en) { + _llk_pack_mop_config_(pack_dst_format, face_r_dim, num_faces, partial_face, narrow_tile); + } } -template -inline void llk_pack_hw_configure_disaggregated(std::uint32_t pack_output) { - llk_pack_params_t llk_pack_params = { - .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold,}}}; - llk_pack_hw_configure(&llk_pack_params); +template +inline void _llk_pack_hw_configure_(const std::uint32_t pack_src_format, const std::uint32_t pack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const bool partial_face = false, const bool narrow_tile = false, const std::uint32_t relu_config = 0) { + + configure_pack( + pack_src_format, + pack_dst_format, + tile_size, + face_r_dim, + num_faces, + partial_face, + narrow_tile, + relu_config + ); } -// FIXME: Remove once edge mask spec is defined template -inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) { - configure_pack(get_output_id(pack_params->pack_output), pack_params->relu_config.val); +inline void _llk_pack_reduce_hw_configure_(const std::uint32_t pack_src_format, const std::uint32_t pack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const bool partial_face = false, const bool narrow_tile = false, const std::uint32_t relu_config = 0) { + + configure_pack( + pack_src_format, + pack_dst_format, + tile_size, + face_r_dim, + num_faces, + partial_face, + narrow_tile, + relu_config + ); + volatile uint tt_reg_ptr *cfg = get_cfg_pointer(); ckernel::packer::pck_edge_offset_u pack_edge_offset = {.val = 0}; @@ -143,84 +167,22 @@ inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) { } } -template -inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output) { - llk_pack_params_t 
llk_pack_params = { - .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}}; - llk_pack_reduce_hw_configure(&llk_pack_params); -} - -template -inline void llk_pack_init(const std::uint32_t pack_output = 0) { - const std::uint32_t output_id = get_output_id(pack_output); - llk_pack_mop_config(output_id); -} - -template -inline std::uint32_t get_output_tile_address(std::uint8_t output_id, std::uint32_t output_tile_index) { - - std::uint32_t pack_tile_addr; - if constexpr (out_of_order_output) { - pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + - (std::uint32_t)(cb_interface[output_id].fifo_page_size)*output_tile_index - 1; - } else { - if constexpr (untilize) { - // FIXME: Do we need support for pack untilize? - // std::uint16_t out_tile_index = (cb_interface[output_id].ublock_tile_cnt/cb_interface[output_id].ublock_ct)*cb_interface[output_id].row_tile_dim + - // cb_interface[output_id].ublock_tile_cnt%cb_interface[output_id].ublock_ct; //FIXME: optimize perf - // pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1; - // pack_tile_addr += out_tile_index*(std::uint32_t)(cb_interface[output_id].fifo_page_size); - - // cb_interface[output_id].ublock_tile_cnt++; - - // if (cb_interface[output_id].ublock_tile_cnt == cb_interface[output_id].ublock_tile_dim) { - // cb_interface[output_id].ublock_tile_cnt=0; - // cb_interface[output_id].fifo_wr_tile_ptr += (std::uint32_t)(cb_interface[output_id].fifo_page_size)*cb_interface[output_id].ublock_ct; - // } - } else { - pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1; - cb_interface[output_id].fifo_wr_tile_ptr += cb_interface[output_id].fifo_page_size; - } - } - return pack_tile_addr; -} - -#if defined(PERF_DUMP) && MATH_PACK_DECOUPLE -template -inline void llk_pack_decouple(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0, 
bool pack_l1_acc = false) { - - std::uint8_t output_id = get_output_id(output); - - static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!"); +template +inline void _llk_pack_init_(const std::uint32_t pack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const bool partial_face = false, const bool narrow_tile = false) { - std::uint32_t pack_tile_addr = get_output_tile_address(output_id, output_tile_index); + _llk_pack_configure_addrmod_(); - if (operand_is_intermediate(output)) { - return; - } - - if constexpr (!untilize) { - uint32_t tile_header[4]; - uint32_t* l1_dest = reinterpret_cast(pack_tile_addr << 4); - for (int i = 0; i < 4; i++) { - tile_header[i] = regfile[p_gpr_pack::TILE_HEADER + i]; - l1_dest[i] = tile_header[i]; - } - } + _llk_pack_mop_config_( + pack_dst_format, + face_r_dim, + num_faces, + partial_face, + narrow_tile + ); } -#endif - -template -inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0) { - std::uint8_t output_id = get_output_id(output); - - // Access tile dims using the following logic: - // pack_tile_dims[output_id] - - static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!"); - - std::uint32_t pack_tile_addr = get_output_tile_address(output_id, output_tile_index); +template +inline void _llk_pack_(const std::uint32_t tile_index, const std::uint32_t address) { constexpr uint32_t DEST_NUM_TILES_SHIFT = is_fp32_dest_acc_en ? 
(1) : (0); constexpr uint32_t DEST_NUM_TILES = DEST_NUM_TILES_FP16 >> DEST_NUM_TILES_SHIFT; @@ -237,44 +199,11 @@ inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32 TT_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_W, tile_index); } - program_packer_destination(pack_tile_addr, output_id); + program_packer_destination(address); mop_run(1, 1); if constexpr (untilize) { TTI_PACR(ADDR_MOD_2, 0, 0xf, 0, 0, 1, 1); // close tile - TTI_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_Y, 0); - TTI_INCADCZW(p_setadc::PAC, 0, 0, 1, 0); - } - -} -// FIXME-WH-UPLIFT -template -inline void llk_pack_reduce_config_v2(uint32_t icb_out) { - - if constexpr (at_kernel_start) - configure_pack(get_output_id(icb_out), false); - else { - TTI_STALLWAIT(p_stall::STALL_PACK, p_stall::PACK); - tensix_sync(); - } - - volatile uint *cfg = get_cfg_pointer(); - if constexpr (dim == ReduceDim::REDUCE_ROW) { - for (uint i = 0; i < 4; i++) - //TTI_WRCFG(revert ? 0xFFFFffff : 0x1, p_cfg::WRCFG_32b, PCK_EDGE_OFFSET_SEC0_mask_ADDR32+i); - cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32 + i] = revert ? 0xFFFFffff : 0x1; - } else if constexpr (dim == ReduceDim::REDUCE_SCALAR) { - //TTI_WRCFG(revert ? 0xFFFFffff : 0x0, p_cfg::WRCFG_32b, PCK_EDGE_OFFSET_SEC0_mask_ADDR32+0); - //TTI_WRCFG(revert ? 0xFFFFffff : 0x1, p_cfg::WRCFG_32b, PCK_EDGE_OFFSET_SEC0_mask_ADDR32+1); - //TTI_WRCFG(revert ? 0xFFFFffff : 0x1, p_cfg::WRCFG_32b, TILE_ROW_SET_MAPPING_0_row_set_mapping_0_ADDR32); - cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32+0] = revert ? 0xFFFFffff : 0x0; - cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32+1] = revert ? 0xFFFFffff : 0x1; - cfg[TILE_ROW_SET_MAPPING_0_row_set_mapping_0_ADDR32] = revert ? 0xF : 0x1; - } else { - //TTI_WRCFG(revert ? 0xFFFFffff : 0x0, p_cfg::WRCFG_32b, PCK_EDGE_OFFSET_SEC0_mask_ADDR32+0); - //TTI_WRCFG(revert ? 0xFFFFffff : 0xFFFF, p_cfg::WRCFG_32b, PCK_EDGE_OFFSET_SEC0_mask_ADDR32+1); - //cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32+0] = revert ? 
0xFFFFffff : 0x0; - //cfg[PCK_EDGE_OFFSET_SEC0_mask_ADDR32+1] = revert ? 0xFFFFffff : 0x0000ffff; } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h index f3385908c37..88dbdb186a9 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h @@ -6,26 +6,31 @@ #include "ckernel.h" #include "ckernel_defs.h" -#include "debug/fw_debug.h" +#include "fw_debug.h" #include "cpack_common.h" #include "llk_defs.h" -#include "llk_param_structs.h" - -#include "hostdevcommon/common_runtime_address_map.h" - using namespace ckernel; using namespace ckernel::packer; +#ifdef PERF_DUMP +#include "ckernel_perf_api.h" +#endif // wait until math is done and has produced something to pack -inline void llk_packer_wait_for_math_done() { +inline void _llk_packer_wait_for_math_done_() { +#ifdef PERF_DUMP + if constexpr (MATH_PACK_DECOUPLE == 0) { + TTI_SEMWAIT(p_stall::STALL_TDMA, semaphore::t6_sem(semaphore::MATH_PACK), p_stall::STALL_ON_ZERO); + } +#else TTI_SEMWAIT(p_stall::STALL_TDMA, semaphore::t6_sem(semaphore::MATH_PACK), p_stall::STALL_ON_ZERO); +#endif } // Tell math that it can write again template -inline void llk_packer_set_math_semaphore() { +inline void _llk_packer_set_math_semaphore_() { t6_semaphore_get(semaphore::MATH_PACK); // Indicate that packer is done and header is written into L1 } @@ -33,7 +38,13 @@ inline void llk_packer_set_math_semaphore() { // Tell math it can write again // Clear dest template -inline void llk_pack_dest_section_done() { +inline void _llk_pack_dest_section_done_() { +#ifdef PERF_DUMP + if constexpr (MATH_PACK_DECOUPLE) { + return; + } +#endif + constexpr bool clear_dest = (Dst != DstSync::SyncTile16); if constexpr (clear_dest){ @@ -53,7 +64,7 @@ inline void llk_pack_dest_section_done() { constexpr uint32_t WaitRes = (Dst == DstSync::SyncTile16) ? 
(p_stall::PACK) : (p_stall::NONE); // Tell math that it can write again - llk_packer_set_math_semaphore(); + _llk_packer_set_math_semaphore_(); constexpr bool flip_dest = ((Dst == DstSync::SyncHalf) || (Dst == DstSync::SyncTile2)); @@ -63,97 +74,99 @@ inline void llk_pack_dest_section_done() { } } +template +inline void _llk_init_packer_dest_offset_registers_(const std::uint32_t face_r_dim = FACE_R_DIM, const bool narrow_tile = false) { + TTI_STALLWAIT(p_stall::STALL_TDMA|p_stall::STALL_THCON, p_stall::PACK); // wait for pack to finish + if constexpr (untilize) { + const uint face_r_offset = ((face_r_dim == 1) || narrow_tile) ? FACE_R_DIM : (face_r_dim >> 1); + if constexpr (FaceLayout == ColMajor) { + // Packer0 : 0,32, 1,33 ... 7, 39 + // Packer1 : 8,40, 9,41 ... 15, 47 + // Packer2 : 16,48, 17,49 ... 23, 55 + // Packer3 : 23,56, 24,57 ... 31, 63 + TT_SETDMAREG(0, 0x000 + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0)); + TT_SETDMAREG(0, 0x000 + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1)); + TT_SETDMAREG(0, 0x000 + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2)); + TT_SETDMAREG(0, 0x000 + 0x18, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x18, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3)); + } else { + //For example if face_offset = 8: + // Packer0 : 0,16, 1,17 ... 7, 23 + // Packer1 : 8,24, 9,25 ... 15, 31 + // Packer2 : 32,48, 33,49 ... 39, 55 + // Packer3 : 40,56, 41,57 ... 
47, 63 + TT_SETDMAREG(0, 0x000 + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0)); + TT_SETDMAREG(0, 0x000 + face_r_offset, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1)); + TT_SETDMAREG(0, 0x000 + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2)); + TT_SETDMAREG(0, 0x000 + 0x20 + face_r_offset, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + face_r_offset, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20 + face_r_offset, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3)); + } + } else { + if constexpr (FaceLayout == ColMajor) { + TT_SETDMAREG(0, 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0)); + TT_SETDMAREG(0, 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1)); + TT_SETDMAREG(0, 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2)); + TT_SETDMAREG(0, 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3)); + } else { // Default to row major layout + TT_SETDMAREG(0, 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0)); + TT_SETDMAREG(0, 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1)); + TT_SETDMAREG(0, 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2)); + TT_SETDMAREG(0, 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, 
LO_16(p_gpr_pack::DEST_OFFSET_HI + 2)); + TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3)); + } + } + select_packer_dest_registers(); +} + template -inline void llk_pack_dest_init() { +inline void _llk_pack_dest_init_(const std::uint32_t face_r_dim = FACE_R_DIM, const bool narrow_tile = false) { tensix_sync(); reset_dest_offset_id(); - init_packer_dest_offset_registers(); - select_packer_dest_registers(); + _llk_init_packer_dest_offset_registers_(face_r_dim, narrow_tile); packer_addr_counter_init(); pack_sync_tile_dst_ptr = 0; } -template -inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 0) { - TTI_STALLWAIT(p_stall::STALL_TDMA|p_stall::STALL_THCON, p_stall::PACK); // wait for pack to finish - if constexpr (untilize) { - if constexpr (FaceLayout == ColMajor) { - // Packer0 : 0,32, 1,33 ... 7, 39 - // Packer1 : 8,40, 9,41 ... 15, 47 - // Packer2 : 16,48, 17,49 ... 23, 55 - // Packer3 : 23,56, 24,57 ... 31, 63 - TT_SETDMAREG(0, 0x000 + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0)); - TT_SETDMAREG(0, 0x000 + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1)); - TT_SETDMAREG(0, 0x000 + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2)); - TT_SETDMAREG(0, 0x000 + 0x18, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x18, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3)); - } else { - // Packer0 : 0,16, 1,17 ... 7, 23 - // Packer1 : 8,24, 9,25 ... 15, 31 - // Packer2 : 32,48, 33,49 ... 39, 55 - // Packer3 : 40,56, 41,57 ... 
47, 63 - TT_SETDMAREG(0, 0x000 + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0)); - TT_SETDMAREG(0, 0x000 + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1)); - TT_SETDMAREG(0, 0x000 + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2)); - TT_SETDMAREG(0, 0x000 + 0x28, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x08, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x28, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3)); - } + +template +inline void _llk_pack_get_tile_(std::uint32_t tile_index, std::uint32_t *p_tile) { + if constexpr (mail2pack) { + *p_tile = mailbox_read(ThreadId::UnpackThreadId); } else { - if constexpr (FaceLayout == ColMajor) { - TT_SETDMAREG(0, 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0)); - TT_SETDMAREG(0, 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1)); - TT_SETDMAREG(0, 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2)); - TT_SETDMAREG(0, 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3)); - } else { // Default to row major layout - TT_SETDMAREG(0, 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 0)); - TT_SETDMAREG(0, 0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 1)); - TT_SETDMAREG(0, 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 2)); - TT_SETDMAREG(0, 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_LO + 3)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x00, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 0)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 
0x10, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 1)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x20, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 2)); - TT_SETDMAREG(0, DEST_REGISTER_HALF_SIZE + 0x30, 0, LO_16(p_gpr_pack::DEST_OFFSET_HI + 3)); - } + *p_tile = 0x0; } - select_packer_dest_registers(); -} -inline void llk_pack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { - debug_dump(data, byte_size); } -inline void llk_pack_debug_dump_seek(std::uint8_t offset) { - debug_dump_seek(offset); +template +inline void _llk_pack_release_tile_() { + if constexpr (mail2pack) { + semaphore_get(semaphore::UNPACK_OPERAND_SYNC); + } } -template -inline void llk_pack_reconfig_data_format(const std::uint32_t new_operand) { - reconfig_packer_data_format(get_output_id(new_operand)); +inline void _llk_pack_debug_dump_(std::uint8_t *data, std::uint32_t byte_size) { + debug_dump(data, byte_size); } -template -inline void llk_pack_reconfig_data_format(const std::uint32_t old_operand, const std::uint32_t new_operand) { - std::uint32_t old_operand_id = get_output_id(old_operand); - std::uint32_t new_operand_id = get_output_id(new_operand); - - if((pack_dst_format[old_operand_id] != pack_dst_format[new_operand_id]) - && (pack_dst_format[old_operand_id] != (uint)DataFormat::Invalid) - && (pack_dst_format[new_operand_id] != (uint)DataFormat::Invalid)) { - reconfig_packer_data_format(new_operand_id); - } +inline void _llk_pack_debug_dump_seek_(std::uint8_t offset) { + debug_dump_seek(offset); } -inline void llk_pack_relu_config(std::uint32_t config) { +TT_ALWAYS_INLINE void _llk_pack_relu_config_(const std::uint32_t config) { ReluType mode = (config&0xf) == 0 ? ReluType::NO_RELU : ((config&0xf) == 3 ? 
ReluType::MAX_THRESHOLD_RELU : ReluType::MIN_THRESHOLD_RELU); uint32_t val = ((config>>16) << STACC_RELU_ReluThreshold_SHAMT) | (((uint32_t)mode) << STACC_RELU_ApplyRelu_SHAMT); TTI_SETDMAREG(0, val&0xffff, 0, LO_16(p_gpr_pack::TMP0)); @@ -163,13 +176,13 @@ inline void llk_pack_relu_config(std::uint32_t config) { TTI_NOP; TTI_NOP; } -inline void llk_pack_reconfig_l1_acc(const std::uint32_t enable) +inline void _llk_pack_reconfig_l1_acc_(const std::uint32_t enable) { reconfigure_packer_l1_acc(enable); } template -inline void llk_pack_reduce_mask_config() { +inline void _llk_pack_reduce_mask_config_() { ckernel::packer::pck_edge_offset_u pack_edge_offset = {.val = 0}; // We initialize PCK_EDGE_OFFSET_SEC0 mask to clear out all the datums in the row @@ -230,7 +243,7 @@ inline void llk_pack_reduce_mask_config() { TTI_NOP; TTI_NOP; } -inline void llk_pack_reduce_mask_clear() { +inline void _llk_pack_reduce_mask_clear_() { // By default, all packers are set to use TILE_ROW_SET_MAPPING_0 and // mask is configured to pass through all the datums pck_edge_offset_u pack_edge_offset = {.val = 0}; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h index 6117df04e2e..61dd252e81e 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_A.h @@ -3,9 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_io_unpack.h" -#include "llk_param_structs.h" - #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" @@ -15,14 +12,19 @@ using namespace ckernel; using namespace ckernel::unpacker; -template -inline void llk_unpack_A_mop_config(const bool transpose_of_faces, const std::uint32_t operand_id) { +#ifndef SKIP_UNP +#define SKIP_UNP 0 +#endif + +template +inline void _llk_unpack_A_mop_config_(const bool transpose_of_faces, const std::uint32_t num_faces, const std::uint32_t unpack_src_format, const 
std::uint32_t unpack_dst_format = 0) { static_assert(!((BType != BroadcastType::NONE) && acc_to_dest && (binary_reuse_dest == EltwiseBinaryReuseDestType::DEST_TO_SRCB)), "Not supported configuration!"); + static_assert((((BType == BroadcastType::NONE) && (!acc_to_dest) && (binary_reuse_dest == EltwiseBinaryReuseDestType::NONE)) || (!unpack_to_dest)), "Not supported configuration when unpacking to dest!"); - const uint32_t num_faces = get_num_faces(operand_id); #if SKIP_UNP == 1 static constexpr uint unpack_srca = TT_OP_NOP; + static constexpr uint unpack_srca_to_dest = TT_OP_NOP; static constexpr uint unpack_srca_set_dvalid = TT_OP_NOP; static constexpr uint unpack_srcb = TT_OP_NOP; static constexpr uint unpack_srcb_inc_z_0 = TT_OP_NOP; @@ -36,6 +38,7 @@ inline void llk_unpack_A_mop_config(const bool transpose_of_faces, const std::ui TTI_NOP; #else static constexpr uint unpack_srca = TT_OP_UNPACR(SrcA, 0b1 /*Z inc*/, 0, 0, 0, 1 /* Set OvrdThreadId*/, 1 /*Set Dvalid*/, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1); + static constexpr uint unpack_srca_to_dest = TT_OP_UNPACR(SrcA, 0b00010001 /*Z inc*/, 0, 0, 0, 1 /* Set OvrdThreadId*/, 0 /*Set Dvalid*/, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1); // ch0/ch1 z_inc static constexpr uint unpack_srca_set_dvalid = TT_OP_UNPACR_NOP(SrcA, p_unpacr_nop::UNP_ZEROSRC_SET_DVALID); static constexpr uint unpack_srcb = TT_OP_UNPACR(SrcB, 0b1 /*Z inc*/, 0, 0, 0, 1 /* Set OvrdThreadId*/, 1 /*Set Dvalid*/, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1); static constexpr uint unpack_srcb_inc_z_0 = TT_OP_UNPACR(SrcB, 0b0 /*Z inc*/, 0, 0, 0, 1 /* Set OvrdThreadId*/, 1 /*Set Dvalid*/, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1); @@ -46,7 +49,12 @@ inline void llk_unpack_A_mop_config(const bool transpose_of_faces, const std::ui static constexpr uint srcb_clear_z = TT_OP_SETADCZW(p_setadc::UNP_B, 0, 0, 0, 0, 0b0001); // set srcB ch0_z = 0 #endif - if constexpr (BType == BroadcastType::COL) { + if (unpack_to_dest && 
unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) { + const uint32_t outerloop = num_faces; + constexpr uint32_t innerloop = 1; + ckernel_template tmp(outerloop, innerloop, unpack_srca_to_dest); + tmp.program(instrn_buffer); + } else if constexpr (BType == BroadcastType::COL) { if constexpr (acc_to_dest) { constexpr uint32_t innerloop = 1; constexpr uint32_t outerloop = 2; //TODO: add support for num_faces, add support for dest to srcB @@ -128,50 +136,30 @@ inline void llk_unpack_A_mop_config(const bool transpose_of_faces, const std::ui } } -template -inline void llk_unpack_A_hw_configure(const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) { +template +inline void _llk_unpack_A_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) { constexpr bool is_row_pool = false; - const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand); - - const uint32_t unpA_num_faces = get_num_faces(unpA_operand_id); - - const uint32_t unpA_face_r_dim = get_face_r_dim(unpA_operand_id); - - configure_unpack_AB(unpA_operand_id, unpA_operand_id, - unpA_face_r_dim, unpA_face_r_dim, is_row_pool, within_face_16x16_transpose, is_fp32_dest_acc_en, srnd_fpu_en, unpA_num_faces, unpA_num_faces); -} - -template -inline void llk_unpack_A_hw_configure_disaggregated(const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) { - - const llk_unpack_A_params_t unpack_A_params = { - .unpA_operand = unpA_operand - }; - llk_unpack_A_hw_configure(&unpack_A_params, within_face_16x16_transpose); + configure_unpack_AB( + unpack_src_format, + unpack_src_format, + unpack_dst_format, + unpack_dst_format, + face_r_dim, + face_r_dim, + within_face_16x16_transpose, + num_faces, + num_faces); } -template -inline void llk_unpack_A_init(const std::uint32_t 
transpose_of_faces=0, const std::uint32_t within_face_16x16_transpose=0, const std::uint32_t operand = 0) { - - cfg_reg_rmw_tensix(within_face_16x16_transpose); - - const std::uint32_t operand_id = get_operand_id(operand); - - const std::uint32_t face_r_dim = get_face_r_dim(operand_id); - +template +inline void _llk_unpack_A_init_(const std::uint32_t transpose_of_faces=0, const std::uint32_t within_face_16x16_transpose=0, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const std::uint32_t unpack_src_format = 0, const std::uint32_t unpack_dst_format = 0) { constexpr std::uint32_t UNP_SEL = (BType == BroadcastType::NONE) ? p_setadc::UNP_A : p_setadc::UNP_B; config_face_dim(face_r_dim); - llk_unpack_A_mop_config(transpose_of_faces>0, operand_id); + _llk_unpack_A_mop_config_(transpose_of_faces>0, num_faces, unpack_src_format, unpack_dst_format); } -template -inline void llk_unpack_A(const std::uint32_t operand, const std::uint32_t tile_index, const bool transpose_of_faces = 0 /*not used*/) { - std::uint32_t input = get_operand_id(operand); - std::uint32_t base_address = cb_interface[input].fifo_rd_ptr; - std::uint32_t offset_address = cb_interface[input].fifo_page_size * tile_index; - // note: unpacker is programmed to automatically skip the tile header (+1) - // since there is no tile header, we need to -1 the address (in terms of 16B words), to offet unpacker's automatic +1 - std::uint32_t address = base_address + offset_address - 1; +template +inline void _llk_unpack_A_(const std::uint32_t address, const bool transpose_of_faces = 0, const std::uint32_t unpack_src_format = 0, const std::uint32_t unpack_dst_format = 0) { // Clear z/w start counters TTI_SETADCZW(0b011, 0, 0, 0, 0, 0b1111); @@ -206,13 +194,30 @@ inline void llk_unpack_A(const std::uint32_t operand, const std::uint32_t tile_i } } + if constexpr (unpack_to_dest) { + if (unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) { + set_dst_write_addr(unp_cfg_context, 
unpack_dst_format); + wait_for_dest_available(); + } + } + // Run MOP ckernel::ckernel_template::run(instrn_buffer); // T6::SEMGET for context release t6_semaphore_get(semaphore::UNPACK_SYNC); + if (unpack_to_dest) { + if (unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) { + unpack_to_dest_tile_done(unp_cfg_context); + } + } + // Switch unpacker config context switch_config_context(unp_cfg_context); + +#ifdef PERF_DUMP + first_unpack_recorded = true; +#endif } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h index de144ad5aec..0f6d54f2909 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h @@ -3,9 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_io_unpack.h" -#include "llk_param_structs.h" - #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" @@ -16,7 +13,7 @@ using namespace ckernel; using namespace ckernel::unpacker; template -inline void llk_unpack_AB_mop_config(const bool transpose_of_faces=false, const std::uint32_t operand_id=0) { +inline void _llk_unpack_AB_mop_config_(const bool transpose_of_faces=false, const std::uint32_t num_faces=4, const bool narrow_tile=false) { #if SKIP_UNP == 1 static constexpr uint unpack_srca = TT_OP_NOP; static constexpr uint unpack_srcb = TT_OP_NOP; @@ -26,9 +23,6 @@ inline void llk_unpack_AB_mop_config(const bool transpose_of_faces=false, const static constexpr uint unpack_srcb = TT_OP_UNPACR(SrcB, 0b1, 0, 0, 0, 1, 1, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1); #endif - const uint32_t num_faces = get_num_faces(operand_id); - const bool narrow_tile = get_narrow_tile(operand_id); // if narrow tile read face 0 twice for row broadcast - // or read face 0 and 1 for col broadcast if constexpr (BType == BroadcastType::COL) { static constexpr uint unpack_srcb_set_z = TT_OP_SETADCZW(0b010, 0, 0, 0, 2, 0b0001); @@ 
-76,60 +70,35 @@ inline void llk_unpack_AB_mop_config(const bool transpose_of_faces=false, const } -template -inline void llk_unpack_AB_hw_configure(const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) { +template +inline void _llk_unpack_AB_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) { constexpr bool is_row_pool = false; - // In0 -> unpA - // In1 -> unpB - const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpA_operand); - const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpB_operand); - - // unpA -> srcA - // unpB -> srcB - const uint32_t num_faces = get_num_faces(unpA_operand_id); // num faces in unpA and unpB are the same - - const uint32_t face_r_dim = get_face_r_dim(unpA_operand_id); // face r dim in unpA and unpB are the same - - configure_unpack_AB(unpA_operand_id, unpB_operand_id, - face_r_dim, face_r_dim, is_row_pool, within_face_16x16_transpose, is_fp32_dest_acc_en, srnd_fpu_en, num_faces, num_faces); -} - -template -inline void llk_unpack_AB_hw_configure_disaggregated( - const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0 ) { - const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand}; - llk_unpack_AB_hw_configure(&unpack_AB_params, within_face_16x16_transpose); + configure_unpack_AB( + unpA_src_format, + unpB_src_format, + unpA_dst_format, + unpB_dst_format, + face_r_dim, + face_r_dim, + within_face_16x16_transpose, + num_faces, + num_faces); } template -inline void llk_unpack_AB_init(const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose=0, const std::uint32_t acc_to_dest=0) { - const 
uint32_t unpA_operand_id = get_operand_id(unpA_operand); +inline void _llk_unpack_AB_init_(const std::uint32_t face_r_dim=FACE_R_DIM, const std::uint32_t num_faces=4, const bool narrow_tile=false, const std::uint32_t transpose=0, const std::uint32_t acc_to_dest=0) { - //Need to be able to configure tranpose srca for fused ops cfg_reg_rmw_tensix(transpose); // transpose within the face - const uint32_t face_r_dim = get_face_r_dim(unpA_operand_id); // face r dim in unpA and unpB are the same - constexpr std::uint32_t UNP_SEL = p_setadc::UNP_AB; config_face_dim(face_r_dim); - llk_unpack_AB_mop_config(transpose>0, unpA_operand_id); // transpose of faces 0,2,1,3 + _llk_unpack_AB_mop_config_(transpose>0, num_faces, narrow_tile); // transpose of faces 0,2,1,3 } template -inline void llk_unpack_AB( - const std::uint32_t operandA, const std::uint32_t operandB, const std::uint32_t tile_index_a, const std::uint32_t tile_index_b, const bool transpose_of_faces = 0 /*not used*/) { - std::uint32_t inputA = get_operand_id(operandA); - std::uint32_t inputB = get_operand_id(operandB); - std::uint32_t base_address_a = cb_interface[inputA].fifo_rd_ptr; - std::uint32_t offset_address_a = cb_interface[inputA].fifo_page_size * tile_index_a; - std::uint32_t base_address_b = cb_interface[inputB].fifo_rd_ptr; - std::uint32_t offset_address_b = cb_interface[inputB].fifo_page_size * tile_index_b; - - // note: unpacker is programmed to automatically skip the tile header (+1) - // since there is no tile header, we need to -1 the address (in terms of 16B words), to offet unpacker's automatic +1 - std::uint32_t address_a = base_address_a + offset_address_a - 1; - std::uint32_t address_b = base_address_b + offset_address_b - 1; +inline void _llk_unpack_AB_( + const std::uint32_t address_a, const std::uint32_t address_b, const bool transpose_of_faces = 0 /*not used*/) { TTI_SETADCZW(0b011, 0, 0, 0, 0, 0b1111); // reset counters @@ -159,4 +128,8 @@ inline void llk_unpack_AB( // Switch unpacker 
config context switch_config_context(unp_cfg_context); + +#ifdef PERF_DUMP + first_unpack_recorded = true; +#endif } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h index 0b432d532c5..4578126b9e8 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h @@ -3,9 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_io_unpack.h" -#include "llk_param_structs.h" - #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" @@ -16,7 +13,7 @@ using namespace ckernel; using namespace ckernel::unpacker; // transpose is unused, math is adjusted to take into account srca face layout when transpose=true -inline void llk_unpack_AB_matmul_mop_config(const bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const bool partial_face) { +inline void _llk_unpack_AB_matmul_mop_config_(const bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const bool partial_face) { // in0 - loaded to SrcB // in1 - loaded to SrcA @@ -93,26 +90,21 @@ inline void llk_unpack_AB_matmul_mop_config(const bool transpose, const std::uin } -template -inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) { - constexpr bool is_row_pool = false; - const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca; - - // In0 -> unpB - // In1 -> unpA - const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpB_operand); - const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpA_operand); - - // unpA -> srcA - // unpB -> srcB - const uint32_t unpA_num_faces = get_num_faces(unpA_operand_id); - const uint32_t unpB_num_faces = get_num_faces(unpB_operand_id); +template +inline void _llk_unpack_AB_matmul_hw_configure_(const 
std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format, const std::uint32_t unpA_face_r_dim = FACE_R_DIM, const std::uint32_t unpB_face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t unpA_num_faces = 4, const std::uint32_t unpB_num_faces = 4, const std::uint32_t unpA_tile_size = 0, const std::uint32_t unpB_tile_size = 0) { - const uint32_t unpA_face_r_dim = get_face_r_dim(unpA_operand_id); - const uint32_t unpB_face_r_dim = get_face_r_dim(unpB_operand_id); + constexpr bool is_row_pool = false; - configure_unpack_AB(unpA_operand_id, unpB_operand_id, - unpA_face_r_dim, unpB_face_r_dim, is_row_pool, transpose_xy_srca, is_fp32_dest_acc_en, srnd_fpu_en, unpA_num_faces, unpB_num_faces); + configure_unpack_AB( + unpA_src_format, + unpB_src_format, + unpA_dst_format, + unpB_dst_format, + unpA_face_r_dim, + unpB_face_r_dim, + within_face_16x16_transpose, + unpA_num_faces, + unpB_num_faces); // Configure tile size in datums const uint32_t unpA_x_end = unpA_num_faces*unpA_face_r_dim*FACE_C_DIM-1; @@ -120,35 +112,14 @@ inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_ TT_SETADCXX(p_setadc::UNP_A, unpA_x_end, 0x0); TT_SETADCXX(p_setadc::UNP_B, unpB_x_end, 0x0); - std::uint32_t inputA = get_operand_id(unpack_AB_params->unpB_operand); - std::uint32_t inputB = get_operand_id(unpack_AB_params->unpA_operand); - regfile[p_gpr_unpack::TILE_SIZE_A] = cb_interface[inputA].fifo_page_size; - regfile[p_gpr_unpack::TILE_SIZE_B] = cb_interface[inputB].fifo_page_size; + regfile[p_gpr_unpack::TILE_SIZE_A] = unpA_tile_size; + regfile[p_gpr_unpack::TILE_SIZE_B] = unpB_tile_size; sync_regfile_write(p_gpr_unpack::TILE_SIZE_B); } -template -inline void llk_unpack_AB_matmul_hw_configure_disaggregated( - const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) { - const 
llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = { - .unpA_operand = unpA_operand, .unpB_operand = unpB_operand, .transpose_xy_srca = transpose_xy_srca }; - llk_unpack_AB_matmul_hw_configure(&unpack_AB_matmul_params); -} - -inline void llk_unpack_AB_matmul_init(const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose=0, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) { - // In0 -> srcB (supports partial face) - // In1 -> srcA - const uint32_t unpA_operand_id = get_operand_id(unpB_operand); - const uint32_t unpB_operand_id = get_operand_id(unpA_operand); - - const uint32_t unpA_face_r_dim = get_face_r_dim(unpA_operand_id); - const uint32_t unpB_face_r_dim = get_face_r_dim(unpB_operand_id); +__attribute__((always_inline)) inline void _llk_unpack_AB_matmul_init_(const std::uint32_t transpose=0, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1, const std::uint32_t unpA_face_r_dim=FACE_R_DIM, const std::uint32_t unpB_face_r_dim=FACE_R_DIM, const std::uint32_t unpA_num_faces=4, const std::uint32_t unpB_num_faces=4, const bool partial_face=false) { const bool reuse_a = ct_dim >= rt_dim; - const bool partial_face = get_partial_face(unpB_operand_id); - - const uint32_t unpA_num_faces = get_num_faces(unpA_operand_id); - const uint32_t unpB_num_faces = partial_face ? 1 : get_num_faces(unpB_operand_id); // if partial face -> unpack face by face // also turn on within_face_16x16_transpose if it was turned off by datacopy at runtime // on WH, the unpacker performs both transpose of faces as well as transpose each face. 
@@ -175,50 +146,33 @@ inline void llk_unpack_AB_matmul_init(const std::uint32_t unpA_operand, const st TT_SETDMAREG(0, LOWER_HALFWORD(kt_dim), 0, LO_16(p_gpr_unpack::KT_DIM)); // store kt_dim to gpr for scaling tile size - llk_unpack_AB_matmul_mop_config(transpose != 0, ct_dim, rt_dim, kt_dim, partial_face); + _llk_unpack_AB_matmul_mop_config_(transpose != 0, ct_dim, rt_dim, kt_dim, partial_face); } -inline void llk_unpack_AB_matmul( - const std::uint32_t operandA, const std::uint32_t operandB, const std::uint32_t tile_index_a, const std::uint32_t tile_index_b, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) { +inline void _llk_unpack_AB_matmul_( + const std::uint32_t base_address_a, const std::uint32_t base_address_b, const std::uint32_t tile_index_a, const std::uint32_t tile_index_b, const std::uint32_t tile_size_a, const std::uint32_t tile_size_b, const std::uint32_t unpA_face_r_dim=FACE_R_DIM, const std::uint32_t unpB_face_r_dim=FACE_R_DIM, const bool partial_face=false, std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) { // In0/InA -> srcB (supports partial face) // In1/InB -> srcA - std::uint32_t inputA = get_operand_id(operandA); - std::uint32_t inputB = get_operand_id(operandB); - std::uint32_t base_address_a = cb_interface[inputA].fifo_rd_ptr; - std::uint32_t base_address_b = cb_interface[inputB].fifo_rd_ptr; volatile uint *cfg = get_cfg_pointer(); // get pointer to registers for current state ID - const std::uint32_t unpA_face_r_dim = get_face_r_dim(inputB); // In1/InB -> srcA - const std::uint32_t unpB_face_r_dim = get_face_r_dim(inputA); // In0/InA -> srcB - const bool reuse_a = ct_dim >= rt_dim; const std::uint32_t t_dim = reuse_a ? 
rt_dim : ct_dim; - const bool partial_face = get_partial_face(inputA); - - if (!reuse_a) { TTI_MULDMAREG(0, p_gpr_unpack::TMP_LO, p_gpr_unpack::TILE_SIZE_B, p_gpr_unpack::KT_DIM); } for (uint t = 0; t < t_dim; t++) { - std::uint32_t cur_tile_index_a = tile_index_a + (reuse_a ? (t*kt_dim) : (0)); - std::uint32_t cur_tile_index_b = tile_index_b + (reuse_a ? (0 ) : (t)); - std::uint32_t next_tile_index_a = tile_index_a + (reuse_a ? ((t+1)*kt_dim) : (0)); - std::uint32_t next_tile_index_b = tile_index_b + (reuse_a ? (0 ) : (t+1)); - std::uint32_t offset_address_a = cb_interface[inputA].fifo_page_size * cur_tile_index_a; - std::uint32_t offset_address_b = cb_interface[inputB].fifo_page_size * cur_tile_index_b; - std::uint32_t next_offset_address_a =cb_interface[inputA].fifo_page_size * next_tile_index_a; - std::uint32_t next_offset_address_b = cb_interface[inputB].fifo_page_size * next_tile_index_b; - // note: unpacker is programmed to automatically skip the tile header (+1) - // since there is no tile header, we need to -1 the address (in terms of 16B words), to offet unpacker's automatic +1 - std::uint32_t address_a = base_address_a + offset_address_a - 1; - std::uint32_t address_b = base_address_b + offset_address_b - 1; - std::uint32_t next_address_a = base_address_a + next_offset_address_a - 1; - std::uint32_t next_address_b = base_address_b + next_offset_address_b - 1; - + std::uint32_t offset_address_a =tile_size_a*(tile_index_a + (reuse_a ? (t*kt_dim) : (0))); + std::uint32_t next_offset_address_a = tile_size_a*(tile_index_a + (reuse_a ? ((t+1)*kt_dim) : (0))); + std::uint32_t offset_address_b = tile_size_b*(tile_index_b + (reuse_a ? (0 ) : (t))); + std::uint32_t next_offset_address_b = tile_size_b*(tile_index_b + (reuse_a ? 
(0 ) : (t+1))); + std::uint32_t address_a = base_address_a + offset_address_a; + std::uint32_t next_address_a = base_address_a + next_offset_address_a; + std::uint32_t address_b = base_address_b + offset_address_b; + std::uint32_t next_address_b = base_address_b + next_offset_address_b; // Wait for free context wait_for_next_context(2); @@ -298,4 +252,9 @@ inline void llk_unpack_AB_matmul( switch_config_context(unp_cfg_context); } + + #ifdef PERF_DUMP + first_unpack_recorded = true; + #endif + } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h index 0f5ba7d9b52..92222ddaaa3 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h @@ -6,10 +6,8 @@ #include "ckernel.h" #include "ckernel_defs.h" -#include "debug/fw_debug.h" +#include "fw_debug.h" #include "cunpack_common.h" -#include "llk_param_structs.h" -#include "llk_io_unpack.h" #ifdef PERF_DUMP #include "ckernel_perf_api.h" @@ -18,17 +16,15 @@ using namespace ckernel; using namespace ckernel::unpacker; -void llk_zero_operand(std::uint32_t operand) { - std::uint32_t input = get_operand_id(operand); +void _llk_zero_buffer_(const std::uint32_t base_address, const std::uint32_t size) { TT_SETDMAREG(0, 0, 0, LO_16(p_gpr_unpack::OPERAND_OFFSET_ADDR)); TT_SETDMAREG(0, 0, 0, HI_16(p_gpr_unpack::OPERAND_OFFSET_ADDR)); - std::uint32_t fifo_base_addr = cb_interface[input].fifo_limit - cb_interface[input].fifo_size; - TT_SETDMAREG(0, LOWER_HALFWORD(fifo_base_addr), 0, LO_16(p_gpr_unpack::p_gpr_unpack::OPERAND_BASE_ADDR)); - TT_SETDMAREG(0, UPPER_HALFWORD(fifo_base_addr), 0, HI_16(p_gpr_unpack::p_gpr_unpack::OPERAND_BASE_ADDR)); + TT_SETDMAREG(0, LOWER_HALFWORD(base_address), 0, LO_16(p_gpr_unpack::p_gpr_unpack::OPERAND_BASE_ADDR)); + TT_SETDMAREG(0, UPPER_HALFWORD(base_address), 0, HI_16(p_gpr_unpack::p_gpr_unpack::OPERAND_BASE_ADDR)); - for (std::uint32_t 
i = 0; i < cb_interface[input].fifo_size; i++) { + for (std::uint32_t i = 0; i < size; i++) { TTI_STOREIND( 1, 0, @@ -40,65 +36,69 @@ void llk_zero_operand(std::uint32_t operand) { } } -inline void llk_unpack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { - debug_dump(data, byte_size); -} +template +inline void _llk_unpack_get_tile_(std::uint32_t address, std::uint32_t *p_tile) { + std::uint32_t byte_address = (address + TILE_HEADER_SIZE)<<4; -inline void llk_unpack_debug_dump_seek(std::uint8_t offset) { - debug_dump_seek(offset); -} + if constexpr (mail2math) { + mailbox_write(ThreadId::MathThreadId, byte_address); + semaphore_post(semaphore::UNPACK_OPERAND_SYNC); + } -inline void llk_unpack_reconfig_data_format_srca_impl(std::uint32_t srca_operand_id) -{ - cfg_reg_rmw_tensix(unpack_src_format[srca_operand_id]); - cfg_reg_rmw_tensix(unpack_dst_format[srca_operand_id]); - TT_SETDMAREG(0, LOWER_HALFWORD(cb_interface[srca_operand_id].fifo_page_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_A)); // update gpr which holds tile size A -} + if constexpr (mail2pack) { + mailbox_write(ThreadId::PackThreadId, byte_address); + semaphore_post(semaphore::UNPACK_OPERAND_SYNC); + } -inline void llk_unpack_reconfig_data_format_srcb_impl(std::uint32_t srcb_operand_id) -{ - cfg_reg_rmw_tensix(unpack_src_format[srcb_operand_id]); - cfg_reg_rmw_tensix(unpack_dst_format[srcb_operand_id]); - TT_SETDMAREG(0, LOWER_HALFWORD(cb_interface[srcb_operand_id].fifo_page_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_B)); // update gpr which holds tile size B + *p_tile = byte_address; } -inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_operand) { - llk_unpack_reconfig_data_format_srca_impl(get_operand_id(srca_new_operand)); +template +inline void _llk_unpack_release_tile_() { + while (semaphore_read(semaphore::UNPACK_OPERAND_SYNC) > 0); } -inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) { - 
llk_unpack_reconfig_data_format_srcb_impl(get_operand_id(srcb_new_operand)); +inline void _llk_unpack_debug_dump_(std::uint8_t *data, std::uint32_t byte_size) { + debug_dump(data, byte_size); } -inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) { - std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); - std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); - - if((unpack_src_format[old_srca_operand_id] != unpack_src_format[new_srca_operand_id])) { - llk_unpack_reconfig_data_format_srca_impl(new_srca_operand_id); - } +inline void _llk_unpack_debug_dump_seek_(std::uint8_t offset) { + debug_dump_seek(offset); } -inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { - std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); - std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); +template +inline void _llk_unpack_reconfig_data_format_srca_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) +{ + if constexpr(is_tile_dim_reconfig_en) { + const uint face_dim = face_r_dim*FACE_C_DIM; - if((unpack_src_format[old_srcb_operand_id] != unpack_src_format[new_srcb_operand_id])) { - llk_unpack_reconfig_data_format_srcb_impl(new_srcb_operand_id); + cfg_reg_rmw_tensix(num_faces); + cfg_reg_rmw_tensix(face_dim | face_dim << 16); } + cfg_reg_rmw_tensix(unpack_src_format); + cfg_reg_rmw_tensix(unpack_dst_format); + TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_A)); // update gpr which holds tile size A } -inline void llk_unpack_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) { - llk_unpack_reconfig_data_format_srca(srca_new_operand); - 
llk_unpack_reconfig_data_format_srcb(srcb_new_operand); -} +template +inline void _llk_unpack_reconfig_data_format_srcb_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) +{ + if constexpr(is_tile_dim_reconfig_en) { + const uint face_dim = face_r_dim*FACE_C_DIM; -inline void llk_unpack_reconfig_data_format(const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand, const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { - llk_unpack_reconfig_data_format_srca(srca_old_operand, srca_new_operand); - llk_unpack_reconfig_data_format_srcb(srcb_old_operand, srcb_new_operand); + cfg_reg_rmw_tensix(face_r_dim*FACE_C_DIM); + cfg_reg_rmw_tensix(num_faces); + } + cfg_reg_rmw_tensix(unpack_src_format); + cfg_reg_rmw_tensix(unpack_dst_format); + TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_B)); // update gpr which holds tile size B } -inline void llk_unpack_dbg_feature_disable(){ +inline void _llk_unpack_dbg_feature_disable_(){ reg_write(RISCV_DEBUG_REG_DBG_FEATURE_DISABLE, 1<<11); // Set debug feature disable bit 11 // workaround for bug https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1372 } + +inline void _llk_enable_int8_fpu_math_() { + enalbe_int8_fpu_math(); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h index b59b552c7da..8f0ea52e4fa 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h @@ -3,9 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_io_unpack.h" -#include "llk_param_structs.h" - #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" @@ -16,7 +13,7 @@ using namespace ckernel; using namespace 
ckernel::unpacker; template -inline void llk_unpack_reduce_mop_config() { +inline void _llk_unpack_reduce_mop_config_() { #if SKIP_UNP == 1 static constexpr uint unpack_srca = TT_OP_NOP; #else @@ -43,75 +40,37 @@ inline void llk_unpack_reduce_mop_config() { tmp.program(instrn_buffer); } -template -inline void llk_unpack_reduce_hw_configure( - const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) { +template +inline void _llk_unpack_reduce_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format, const std::uint32_t unpA_face_r_dim = FACE_R_DIM, const std::uint32_t unpB_face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t unpA_num_faces = 4, const std::uint32_t unpB_num_faces = 4) { - constexpr uint32_t srca_height = 16; - constexpr uint32_t srcb_height = 16; constexpr bool is_row_pool = true; - constexpr bool transpose_xy_per_face = (ReduceDim::REDUCE_ROW == dim); - - configure_unpack_AB( - get_operand_id(unpack_reduce_params->unpA_operand), - get_operand_id(unpack_reduce_params->unpA_operand), - srca_height, - srcb_height, - is_row_pool, - transpose_xy_per_face, - is_fp32_dest_acc_en, - srnd_fpu_en); - - if constexpr (type != PoolType::MAX) { - union { - float f; - uint32_t u; - } f2u = {.f = const_mult}; - - for (uint i = 0; i < 16; i++) l1_buffer[i] = f2u.u; // Load const into L1 buffer - } -} -template -inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) { - const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand}; - llk_unpack_reduce_hw_configure(&unpack_reduce_params, mult); + configure_unpack_AB( + unpA_src_format, + unpB_src_format, + unpA_dst_format, + unpB_dst_format, + unpA_face_r_dim, + unpB_face_r_dim, + within_face_16x16_transpose, + unpA_num_faces, + unpB_num_faces); } template -inline void 
llk_unpack_reduce_init(const std::uint32_t within_face_16x16_transpose=0) { - llk_unpack_reduce_mop_config(); - volatile uint tt_reg_ptr *cfg = get_cfg_pointer(); // get pointer to registers for current state ID - - uint unpack_src_df = (uint) DataFormat::Float32; - - uint unpack_dst_df = (((uint)unpack_dst_format[0]>>2)&0x1) ? (uint) DataFormat::Float16_b : (uint) DataFormat::Float16; - - cfg_reg_rmw_tensix(unpack_dst_df); +inline void _llk_unpack_reduce_init_(const std::uint32_t within_face_16x16_transpose=0) { // REDUCE_ROW requires transpose itself; additionaly, within_face_16x16_transpose flag could require transpose; // if we have the flag set with REDUCE_ROW, we don't need to do anything cfg_reg_rmw_tensix(ReduceDim::REDUCE_ROW == dim ? !within_face_16x16_transpose : within_face_16x16_transpose); - TTI_SETADCXX(0b11, FACE_WIDTH*FACE_HEIGHT-1, 0x0); + TTI_SETADCXX(0b11, FACE_R_DIM*FACE_C_DIM-1, 0x0); - cfg_reg_rmw_tensix(unpack_src_df); - cfg_reg_rmw_tensix(unpack_dst_df); - - TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_address_ADDR32); - TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_cntx1_address_ADDR32); - TTI_NOP; TTI_NOP; + _llk_unpack_reduce_mop_config_(); } template -inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t tile_index) { - std::uint32_t input = get_operand_id(operand); - std::uint32_t base_address = cb_interface[input].fifo_rd_ptr; - std::uint32_t offset_address = cb_interface[input].fifo_page_size * tile_index; - // note: unpacker is programmed to automatically skip the tile header (+1) - // since there is no tile header, we need to -1 the address (in terms of 16B words), to offet unpacker's automatic +1 - std::uint32_t address = base_address + offset_address - 1; - +inline void _llk_unpack_reduce_(const std::uint32_t address) { // Clear z/w start counters TTI_SETADCZW(0b011, 0, 0, 0, 0, 0b1111); @@ -122,7 +81,7 @@ inline void 
llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t t wait_for_next_context(2); // Load only 16 datums into srcB - TTI_SETADCXX(p_setadc::UNP1, DATUMS_PER_ROW-1, 0x0); + TTI_SETADCXX(p_setadc::UNP1, FACE_C_DIM-1, 0x0); // Trisc::SEMPOST for context acquire semaphore_post(semaphore::UNPACK_SYNC); @@ -138,7 +97,7 @@ inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t t mop_run(0, 4); // Restore face height - TTI_SETADCXX(p_setadc::UNP1, FACE_HEIGHT*16-1, 0x0); + TTI_SETADCXX(p_setadc::UNP1, FACE_R_DIM*FACE_C_DIM-1, 0x0); // T6::SEMGET for context release t6_semaphore_get(semaphore::UNPACK_SYNC); diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h index adede517c2e..ae1b22d830e 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h @@ -3,9 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_io_unpack.h" -#include "llk_param_structs.h" - #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" @@ -15,7 +12,7 @@ using namespace ckernel; using namespace ckernel::unpacker; -inline void llk_unpack_tilize_mop_config(const std::uint32_t operand_id) { +inline void _llk_unpack_tilize_mop_config_(const bool narrow_tile=false) { #if SKIP_UNP == 1 static constexpr uint unpack_srca = TT_OP_NOP; static constexpr uint unpack_srcb_zerosrc = TT_OP_NOP; @@ -26,91 +23,69 @@ inline void llk_unpack_tilize_mop_config(const std::uint32_t operand_id) { static constexpr uint unpack_srcb_set_dvalid = TT_OP_UNPACR_NOP(SrcB, p_unpacr_nop::UNP_SET_DVALID); //WA for https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/issues/1230 #endif - const uint32_t outerloop = get_narrow_tile(operand_id) ? 1 : 2; + const uint32_t outerloop = narrow_tile ? 
1 : 2; constexpr uint32_t innerloop = 1; ckernel_template tmp(outerloop, innerloop, unpack_srcb_zerosrc, unpack_srcb_set_dvalid); tmp.set_start_op(unpack_srca); tmp.program(instrn_buffer); } -template -inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) { +template +inline void _llk_unpack_tilize_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) { constexpr bool is_row_pool = false; - constexpr bool transpose_xy_srca = false; - constexpr bool srnd_fpu_en = false; - - const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand); - const uint32_t unpA_num_faces = get_num_faces(unpA_operand_id); - const uint32_t unpA_face_r_dim = get_face_r_dim(unpA_operand_id); - configure_unpack_AB(unpA_operand_id, unpA_operand_id, unpA_face_r_dim, unpA_face_r_dim, is_row_pool, transpose_xy_srca, is_fp32_dest_acc_en, srnd_fpu_en, unpA_num_faces, unpA_num_faces); -} -template -inline void llk_unpack_tilize_hw_configure_disaggregated( - const std::uint32_t unpA_operand, const std::uint32_t unpA_block_ct_dim) { - const llk_unpack_A_params_t unpack_tilize_params = { - .unpA_operand = unpA_operand - }; - llk_unpack_tilize_hw_configure(&unpack_tilize_params); + configure_unpack_AB( + unpack_src_format, + unpack_src_format, + unpack_dst_format, + unpack_dst_format, + face_r_dim, + face_r_dim, + within_face_16x16_transpose, + num_faces, + num_faces); } -inline void llk_unpack_tilize_init(const std::uint32_t operand = 0, const std::uint32_t ct_dim = 0) { +inline void _llk_unpack_tilize_init_(const std::uint32_t unpack_src_format=0, const std::uint32_t unpack_dst_format=0, const std::uint32_t ct_dim=0, const std::uint32_t face_r_dim=FACE_R_DIM, const bool narrow_tile=false) { cfg_reg_rmw_tensix(0); - const std::uint32_t operand_id = get_operand_id(operand); 
- const std::uint32_t face_r_dim = get_face_r_dim(operand_id); - - const std::uint32_t block_c_dim = ct_dim * (get_narrow_tile(operand_id) ? FACE_C_DIM : TILE_C_DIM); + const std::uint32_t block_c_dim = ct_dim * (narrow_tile ? FACE_C_DIM : TILE_C_DIM); // Set face dim TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0); - // Save state of unpacker config for quick restore - TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0, THCON_SEC0_REG2_Out_data_format_ADDR32); // Save unpack config[0] - TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context - // Override default settings to enable tilize mode unpack_config_u config = {0}; - config.f.out_data_format = (uint)unpack_dst_format[operand_id]; + config.f.out_data_format = unpack_dst_format; config.f.throttle_mode = 2; config.f.tileize_mode = 1; - config.f.shift_amount = (SCALE_DATUM_SIZE((uint)unpack_src_format[operand_id], block_c_dim)) >> 4; + config.f.shift_amount = (SCALE_DATUM_SIZE(unpack_src_format, block_c_dim)) >> 4; TT_SETDMAREG(0, LOWER_HALFWORD(config.val[0]), 0, LO_16(p_gpr_unpack::TMP0)); TT_SETDMAREG(0, UPPER_HALFWORD(config.val[0]), 0, HI_16(p_gpr_unpack::TMP0)); TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::TMP0); // Load unpack config[0] TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_1x16); //GPR preloaded with 16 | (16 << 16) - llk_unpack_tilize_mop_config(operand_id); + _llk_unpack_tilize_mop_config_(narrow_tile); } -inline void llk_unpack_tilize_uninit(const std::uint32_t face_r_dim = FACE_R_DIM) { - TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0); - TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0); // Restore unpack config[0] - TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, 
p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1); // Restore tile x dim per context -} - -inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t block_ct_dim) { - std::uint32_t operand_id = get_operand_id(operand); - std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; // Remove header size added by descriptor - const std::uint32_t face_r_dim = get_face_r_dim(operand_id); - const std::uint32_t num_faces = get_num_faces(operand_id); +inline void _llk_unpack_tilize_(const std::uint32_t base_address, const std::uint32_t tile_index, std::uint32_t unpack_src_format=0, std::uint32_t block_ct_dim=0, const std::uint32_t face_r_dim=FACE_R_DIM, const std::uint32_t num_faces=4, const bool narrow_tile=false) { volatile uint tt_reg_ptr *cfg = get_cfg_pointer(); // get pointer to registers for current state ID - std::uint32_t top_face_offset_address = SCALE_DATUM_SIZE((uint)unpack_src_format[operand_id], tile_index) << (get_narrow_tile(operand_id) ? 0 : 1); + std::uint32_t top_face_offset_address = SCALE_DATUM_SIZE(unpack_src_format, tile_index) << (narrow_tile ? 0 : 1); // Each iteration unpacks 2 face_r_dimx16 faces (1st 0,1 2nd 2,3 unless tile is <=16x32) // For narrow tile we unpack 1 face in each iteration // Offset address is in 16B words // Datum count = tile_index*face_r_dim (/16 to get word count) - const std::uint32_t block_c_dim_16B = block_ct_dim * (get_narrow_tile(operand_id) ? FACE_C_DIM/16 : TILE_C_DIM/16); + const std::uint32_t block_c_dim_16B = block_ct_dim * (narrow_tile ? FACE_C_DIM/16 : TILE_C_DIM/16); std::uint32_t bot_face_offset_address = - SCALE_DATUM_SIZE((uint)unpack_src_format[operand_id], face_r_dim*block_c_dim_16B); //*N rows / 16 to get 16B word aligned address + SCALE_DATUM_SIZE(unpack_src_format, face_r_dim*block_c_dim_16B); //*N rows / 16 to get 16B word aligned address // Program srcA and srcB base addresses - std::uint32_t num_loops = get_narrow_tile(operand_id) ? 
2 : num_faces/2; + std::uint32_t num_loops = narrow_tile ? 2 : num_faces/2; for (std::uint32_t n = 0; n < num_loops; n++) { std::uint32_t address = base_address + top_face_offset_address + ((n == 1) ? bot_face_offset_address : 0); @@ -145,10 +120,3 @@ inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, s first_unpack_recorded = true; #endif } - -inline void llk_unpack_tilize_block(std::uint32_t operand, std::uint32_t block_c_tiles) { - std::uint32_t input = get_operand_id(operand); - for (std::uint32_t tile_index = 0; tile_index < block_c_tiles; tile_index++) { - llk_unpack_tilize(operand, tile_index, block_c_tiles); - } -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h index d6d04cd55d6..723f9716c88 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h @@ -3,9 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llk_io_unpack.h" -#include "llk_param_structs.h" - #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" @@ -19,18 +16,24 @@ using namespace ckernel::unpacker; #define SKIP_UNP (0) #endif -inline void llk_unpack_untilize_mop_config() { +inline void _llk_unpack_untilize_mop_config_() { - constexpr uint replay_buf_len = 5; + constexpr uint replay_buf_len = (SKIP_UNP == 1) ? 
1 : 5; TTI_REPLAY(0, replay_buf_len, 0, 1); - +#if SKIP_UNP == 1 + TTI_NOP; + static constexpr uint load_offset_addr_cntx0 = TT_OP_NOP; + static constexpr uint load_offset_addr_cntx1 = TT_OP_NOP; +#else TTI_DMANOP; // REG2FLOP that sets offset in previous loop needs additional cycle to complete TTI_UNPACR(SrcA, 0b01000001, 0, 0, 0, 1, 0, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1); TTI_UNPACR(SrcA, 0b01000001, 0, 0, 0, 1, 0, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1); TTI_ADDDMAREG(0, p_gpr_unpack::TILE_OFFSET, p_gpr_unpack::TILE_OFFSET, p_gpr_unpack::TILE_SIZE); TTI_ADDRCRZW(0b001, 0, 0, 0, 0, 0b0001); + static constexpr uint load_offset_addr_cntx0 = TT_OP_REG2FLOP(1, 0, 0, 0, THCON_SEC0_REG7_Offset_address_ADDR32 - THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::TILE_OFFSET); static constexpr uint load_offset_addr_cntx1 = TT_OP_REG2FLOP(1, 0, 0, 0, THCON_SEC0_REG7_Offset_cntx1_address_ADDR32 - THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::TILE_OFFSET); +#endif ckernel_unpack_template tmp = ckernel_unpack_template( true, // src B @@ -45,31 +48,25 @@ inline void llk_unpack_untilize_mop_config() { tmp.program(instrn_buffer); } -template -inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) { +template +inline void _llk_unpack_untilize_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) { constexpr bool is_row_pool = false; - constexpr bool transpose_xy_srca = false; - constexpr bool srnd_fpu_en = false; - - const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand); - const uint32_t unpA_num_faces = 4; - const uint32_t unpA_face_r_dim = 16; - configure_unpack_AB(unpA_operand_id, unpA_operand_id, unpA_face_r_dim, unpA_face_r_dim, is_row_pool, transpose_xy_srca, is_fp32_dest_acc_en, srnd_fpu_en, unpA_num_faces, unpA_num_faces); + 
configure_unpack_AB( + unpack_src_format, + unpack_src_format, + unpack_dst_format, + unpack_dst_format, + face_r_dim, + face_r_dim, + within_face_16x16_transpose, + num_faces, + num_faces); } -inline void llk_unpack_untilize_hw_configure_disaggregated(const std::uint32_t unpA_operand) { - const llk_unpack_A_params_t unpack_untilize_params = { - .unpA_operand = unpA_operand, - }; - llk_unpack_untilize_hw_configure(&unpack_untilize_params); -} - -inline void llk_unpack_untilize_init(std::uint32_t operand = 0) { - std::uint32_t operand_id = get_operand_id(operand); - std::uint32_t face_r_dim = 1; +inline void _llk_unpack_untilize_init_(const std::uint32_t unpack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) { - std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1; - std::uint32_t unpA_ch1_y_stride = FACE_R_DIM*unpA_ch1_x_stride; + const std::uint32_t unpA_ch1_x_stride = (unpack_dst_format&0x3) == (std::uint32_t) DataFormat::Float32 ? 4 : (unpack_dst_format&0x3) == (std::uint32_t) DataFormat::Float16 ? 
2 : 1; + const std::uint32_t unpA_ch1_y_stride = FACE_R_DIM*unpA_ch1_x_stride; TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0); @@ -79,38 +76,14 @@ inline void llk_unpack_untilize_init(std::uint32_t operand = 0) { cfg_reg_rmw_tensix(FACE_C_DIM); TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_1x16); //GPR preloaded with 16 | (16 << 16) - std::uint32_t tile_size_words = cb_interface[operand_id].fifo_page_size; - TT_SETDMAREG(0, LOWER_HALFWORD(tile_size_words), 0, LO_16(p_gpr_unpack::TILE_SIZE)); - TT_SETDMAREG(0, UPPER_HALFWORD(tile_size_words), 0, HI_16(p_gpr_unpack::TILE_SIZE)); - llk_unpack_untilize_mop_config(); -} - -inline void llk_unpack_untilize_uninit(const std::uint32_t operand) { - std::uint32_t operand_id = get_operand_id(operand); - std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1; - std::uint32_t unpA_ch1_y_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride; - - // Check that unpacker is done (all contexts freed up) before starting hw configuration - wait_for_idle(); + TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE)); + TT_SETDMAREG(0, UPPER_HALFWORD(tile_size), 0, HI_16(p_gpr_unpack::TILE_SIZE)); - // Reset address counters - unpacker_addr_counter_init(); - - // Wait for cfg to be free to edit - TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK); - - // Reset the values to default in unpack AB common. - TT_SETADCXX(p_setadc::UNP_A, FACE_R_DIM*FACE_C_DIM-1, 0x0); - TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16); - cfg_reg_rmw_tensix(1); - cfg_reg_rmw_tensix(unpA_ch1_y_stride); - TTI_NOP; TTI_NOP; // Do we need this for WH? 
+ _llk_unpack_untilize_mop_config_(); } template -inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_tile_cols) { - std::uint32_t operand_id = get_operand_id(operand); - std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; +inline void _llk_unpack_untilize_pass_(const std::uint32_t base_address, const std::uint32_t block_tile_cols) { std::uint32_t rem_blocks_in_row = block_tile_cols; // Program srcA and srcB base addresses @@ -146,11 +119,13 @@ inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_ if ((face_2xr_cnt + rem_blocks_in_row) >= (FACE_HEIGHT / 2)) { // Run MOP TT_MOP(0, 8 - face_2xr_cnt - 1, unp_cfg_context == 0 ? 0 : 0xff); // Run the MOP - +#if SKIP_UNP == 1 + TTI_NOP; +#else TTI_UNPACR(SrcA, 0b0, 0, 0, 0, 1, 1, p_unpacr::RAREFYB_DISABLE, 0, 0, 0, 0, 1); // set data valid TTI_UNPACR_NOP(SrcB, p_unpacr_nop::UNP_ZEROSRC); TTI_UNPACR_NOP(SrcB, p_unpacr_nop::UNP_SET_DVALID); - +#endif TTI_SETADCXY(0b001, 0, 0, 0, 0, 0b1000); // Clear srcA addr y cnt rem_blocks_in_row -= (8 - face_2xr_cnt); face_2xr_cnt = 0; @@ -193,9 +168,7 @@ inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_ // Switch unpacker config context switch_config_context(unp_cfg_context); -} - -inline void llk_unpack_untilize(std::uint32_t operand, std::uint32_t block_c_tiles) { - llk_unpack_untilize_pass(operand, block_c_tiles); - llk_unpack_untilize_pass(operand, block_c_tiles); +#ifdef PERF_DUMP + first_unpack_recorded = true; +#endif } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math.cpp deleted file mode 100644 index 2e615199cf5..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "llk_math_common.h" -#include "llk_math_eltwise_unary_datacopy.h" -namespace NAMESPACE -{ - -struct hlk_args_t -{ -int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core -int32_t per_core_block_cnt; // Number of blocks of size 1xN tiles (1 rows and N cols) -int32_t per_core_block_c_dim; // Block c dim = (Nx32) -int32_t per_core_block_tile_cnt; // Block tile count = (1xN) -} -; - -void math_main(const struct hlk_args_t *args,const int outer_loop_cnt) -{ -int __outer_loop_iter; -llk_math_eltwise_unary_datacopy_init(false); -llk_math_pack_sync_init(); -for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) { - for (int b = 0; b < args -> per_core_tile_cnt; ++b) { - llk_math_wait_for_dest_available(); - llk_math_eltwise_unary_datacopy(0); - llk_math_dest_section_done(); - } -} -} -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math_fidelity.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math_fidelity.h deleted file mode 100644 index 4e13ffa422a..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_math_fidelity.h +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -constexpr std::int32_t MATH_FIDELITY = 255; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack.cpp deleted file mode 100644 index 15b8912cfff..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "llk_pack_common.h" -#include "llk_pack.h" -namespace NAMESPACE -{ - -struct hlk_args_t -{ -int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core -int32_t per_core_block_cnt; // Number of blocks of size 1xN tiles (1 rows and N cols) -int32_t per_core_block_c_dim; // Block c dim = (Nx32) -int32_t per_core_block_tile_cnt; // Block tile count = (1xN) -} -; - -void pack_main(const struct hlk_args_t *args,const int outer_loop_cnt) -{ -int __outer_loop_iter; -llk_pack_init(16); -llk_pack_hw_configure_disaggregated(16); -llk_setup_outputs(); -llk_pack_dest_init(); -for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) { - for (int b = 0; b < args -> per_core_tile_cnt; ++b) { - llk_packer_wait_for_math_done(); - llk_wait_for_free_tiles(16,1); - llk_pack(0,16); - llk_push_tiles(16,1); - llk_pack_dest_section_done(); - } -} -} -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack_data_format.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack_data_format.h deleted file mode 100644 index f4ff894944f..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_pack_data_format.h +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -const std::int32_t pack_src_format[16] = { - 1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, -}; -const std::int32_t pack_dst_format[16] = { - 1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, -}; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack.cpp deleted file mode 100644 index 37ed8574d7a..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "llk_unpack_common.h" -#include "llk_unpack_tilize.h" -namespace NAMESPACE -{ - -struct hlk_args_t -{ -int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core -int32_t per_core_block_cnt; // Number of blocks of size 1xN tiles (1 rows and N cols) -int32_t per_core_block_c_dim; // Block c dim = (Nx32) -int32_t per_core_block_tile_cnt; // Block tile count = (1xN) -} -; - -void unpack_main(const struct hlk_args_t *args,const int outer_loop_cnt) -{ -int __outer_loop_iter; -llk_setup_operands(); -llk_unpack_tilize_init(); -llk_unpack_tilize_hw_configure_disaggregated(0, args -> per_core_block_c_dim); -for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) { - for (int i = 0; i < args -> per_core_block_cnt; ++i) { - llk_wait_blocks(0,1); - for (int j = 0; j < args -> per_core_block_tile_cnt; ++j) { - llk_unpack_tilize(0,j,args -> per_core_block_c_dim); - } - llk_pop_blocks(0,1,args -> per_core_block_c_dim); - } -} -} -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack_data_format.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack_data_format.h deleted file mode 100644 index 10d1799f857..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/chlkc_unpack_data_format.h +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -const std::int32_t unpack_src_format[24] = { - 1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, -}; -const std::int32_t unpack_dst_format[24] = { - 1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, -}; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/hlk_args_struct_init.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/hlk_args_struct_init.h deleted file mode 100644 index 62a6e634ee4..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/hlk_args_struct_init.h +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -const NAMESPACE::hlk_args_t hlk_args = - { - .per_core_tile_cnt = 0x4, - .per_core_block_tile_cnt = 0x4, - .per_core_block_cnt = 0x1, - .per_core_block_c_dim = 128 - }; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/loop_count.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/loop_count.h deleted file mode 100644 index 9be808dfbc5..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/tilize/loop_count.h +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -constexpr std::int32_t arg_loop_count = 1; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math.cpp deleted file mode 100644 index 6e3d0a44332..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "llk_math_common.h" -#include "llk_math_eltwise_unary_datacopy.h" -namespace NAMESPACE -{ - -struct hlk_args_t -{ -int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core -int32_t per_core_block_tile_r_dim; // Block tile r dim (RT) -int32_t per_core_block_tile_c_dim; // Block tile c dim (CT) -int32_t per_core_block_cnt; // Number of blocks of size (RTxCT) -} -; - -void math_main(const struct hlk_args_t *args,const int outer_loop_cnt) -{ -int __outer_loop_iter; -llk_math_eltwise_unary_datacopy_init(false); -llk_math_pack_sync_init(); -for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) { - for (int b = 0; b < args -> per_core_tile_cnt; ++b) { - llk_math_wait_for_dest_available(); - llk_math_eltwise_unary_datacopy(0); - llk_math_dest_section_done(); - } -} -} -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math_fidelity.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math_fidelity.h deleted file mode 100644 index 4e13ffa422a..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_math_fidelity.h +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -constexpr std::int32_t MATH_FIDELITY = 255; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack.cpp deleted file mode 100644 index e18a8f81483..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "llk_pack_common.h" -#include "llk_pack.h" -namespace NAMESPACE -{ - -struct hlk_args_t -{ -int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core -int32_t per_core_block_tile_r_dim; // Block tile r dim (RT) -int32_t per_core_block_tile_c_dim; // Block tile c dim (CT) -int32_t per_core_block_cnt; // Number of blocks of size (RTxCT) -} -; - -void pack_main(const struct hlk_args_t *args,const int outer_loop_cnt) -{ -int __outer_loop_iter; -llk_pack_init(16); -llk_pack_hw_configure_disaggregated(16); -llk_setup_outputs(); -llk_pack_dest_init(); -for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) { - for (int b = 0; b < args -> per_core_tile_cnt; ++b) { - llk_packer_wait_for_math_done(); - llk_wait_for_free_blocks(16,1); - llk_pack(0,16); - llk_push_blocks(16,1); - llk_pack_dest_section_done(); - } -} -} -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack_data_format.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack_data_format.h deleted file mode 100644 index f4ff894944f..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_pack_data_format.h +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -const std::int32_t pack_src_format[16] = { - 1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, -}; -const std::int32_t pack_dst_format[16] = { - 1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, -}; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack.cpp b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack.cpp deleted file mode 100644 index f5da6ad4752..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "llk_unpack_common.h" -#include "llk_unpack_untilize.h" -namespace NAMESPACE -{ - -struct hlk_args_t -{ -int32_t per_core_tile_cnt; // Total number of tiles produced at the output per core -int32_t per_core_block_tile_r_dim; // Block tile r dim (RT) -int32_t per_core_block_tile_c_dim; // Block tile c dim (CT) -int32_t per_core_block_cnt; // Number of blocks of size (RTxCT) -} -; - -void unpack_main(const struct hlk_args_t *args,const int outer_loop_cnt) -{ -int __outer_loop_iter; -llk_setup_operands(); -llk_unpack_untilize_init(); -llk_unpack_untilize_hw_configure_disaggregated(0); -for (__outer_loop_iter = 0; __outer_loop_iter < outer_loop_cnt; __outer_loop_iter += 1) { - for (int i = 0; i < args -> per_core_block_cnt; ++i) { - for (int j = 0; j < args -> per_core_block_tile_r_dim; ++j) { - llk_wait_tiles(0,args -> per_core_block_tile_c_dim); - llk_unpack_untilize(0,args -> per_core_block_tile_c_dim); - llk_unpack_untilize(0,args -> per_core_block_tile_c_dim); - llk_pop_tiles(0,args -> per_core_block_tile_c_dim); - } - } -} -} -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack_data_format.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack_data_format.h deleted file mode 100644 index 10d1799f857..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/chlkc_unpack_data_format.h +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -const std::int32_t unpack_src_format[24] = { - 1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, -}; -const std::int32_t unpack_dst_format[24] = { - 1,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, -}; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/hlk_args_struct_init.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/hlk_args_struct_init.h deleted file mode 100644 index 123b7bb0d4a..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/hlk_args_struct_init.h +++ /dev/null @@ -1,12 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -const NAMESPACE::hlk_args_t hlk_args = - { - .per_core_tile_cnt = 0x4, - .per_core_block_tile_cnt = 0x4, - .per_core_block_tile_r_dim = 0x2, // Block tile r dim (RT) - .per_core_block_tile_c_dim = 0x2, // Block tile c dim (CT) - .per_core_block_cnt = 0x1 - }; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/loop_count.h b/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/loop_count.h deleted file mode 100644 index 9be808dfbc5..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_ops/untilize/loop_count.h +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -constexpr std::int32_t arg_loop_count = 1; diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/chlkc_list.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/chlkc_list.h similarity index 96% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/chlkc_list.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/common/chlkc_list.h index bff17865521..d288ba0114d 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/chlkc_list.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/chlkc_list.h @@ -14,6 +14,7 @@ using namespace ckernel; #ifdef UCK_CHLKC_MATH +#include "chlkc_unpack_data_format.h" #include "chlkc_math_fidelity.h" #include "chlkc_math_approx_mode.h" #include "chlkc_math.cpp" diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h new file mode 100644 index 00000000000..f31efd1c3d0 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h @@ -0,0 +1,62 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "ckernel_globals.h" +#include "tensix_functions.h" +#include "metal_compile_time_args.h" +#include "risc_attribs.h" +#include "hostdevcommon/common_runtime_address_map.h" + +extern uint32_t __ldm_bss_start[]; +extern uint32_t __ldm_bss_end[]; +extern uint32_t __ldm_data_start[]; +extern uint32_t __ldm_data_end[]; +extern void (* __init_array_start[])(); +extern void (* __init_array_end[])(); +extern uint32_t __firmware_start[]; + +extern void kernel_init(); +extern void kernel_launch(); + +inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) { + // Cover L1 load latency of 6 cycles for the bulk of the copy + int32_t n = 0; + while (n < len - 5) { + uint32_t v0 = l1_addr[n + 0]; + uint32_t v1 = l1_addr[n + 1]; + uint32_t v2 = l1_addr[n + 2]; + uint32_t v3 = l1_addr[n + 3]; + uint32_t v4 = l1_addr[n + 4]; + uint32_t v5 = l1_addr[n + 5]; + local_mem_addr[n + 0] = v0; + local_mem_addr[n + 1] = v1; + local_mem_addr[n + 2] = v2; + local_mem_addr[n + 3] = v3; + local_mem_addr[n + 4] = v4; + local_mem_addr[n + 5] = v5; + n += 6; + } + // Could optimize this further (eg, loop of 2 or 4), probably not worth it + while (n < len) { + local_mem_addr[n] = l1_addr[n]; + n++; + } +} + +inline void firmware_kernel_common_init(void *init_local_l1_base) { + + // Handle stuff typically done in crt0 in asm. 
Easier to do in C + wzerorange(__ldm_bss_start, __ldm_bss_end); + + int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2; + uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE; + l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words); + + for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) { + (**fptr)(); + } +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h new file mode 100644 index 00000000000..e507966c516 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace ckernel { +#define get_compile_time_arg_val(arg_idx) KERNEL_COMPILE_TIME_ARG_##arg_idx +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h new file mode 100644 index 00000000000..e59e64b8ea3 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b) +{ + unsigned int r = 0; + while (a) + { + if (a & 1) + r += b; + a >>= 1; + b <<= 1; + } + return r; +} + +inline __attribute__((always_inline)) uint32_t fast_udiv_12(uint32_t n) +{ + // Uses embedding style magic number + // * fixed point 1/12 then shifting. 
+ // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm + return (((uint64_t) n * 0xAAAAAAAB) >> 32) >> 3; +} + +inline __attribute__((always_inline)) uint32_t fast_udiv_94(uint32_t n) +{ + // Uses embedding style magic number + // * fixed point 1/94 then shifting. + // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm + return (((uint64_t) n * 0xAE4C415D) >> 32) >> 6; +} + +template +inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n) +{ + if constexpr (d == 12) { + // fast divide for 12 divisor + return fast_udiv_12(n); + } else if constexpr (d == 94) { + // fast divide for 94 divisor. Handles Banked L1 address generation for E75 + return fast_udiv_94(n); + } else { + // generic divide from llvm + const unsigned n_uword_bits = sizeof(uint32_t) * CHAR_BIT; + unsigned int q; + unsigned int r; + unsigned sr; + /* special cases */ + if (d == 0) + return 0; /* ?! */ + if (n == 0) + return 0; + sr = __builtin_clz(d) - __builtin_clz(n); + /* 0 <= sr <= n_uword_bits - 1 or sr large */ + if (sr > n_uword_bits - 1) /* d > r */ + return 0; + if (sr == n_uword_bits - 1) /* d == 1 */ + return n; + ++sr; + /* 1 <= sr <= n_uword_bits - 1 */ + /* Not a special case */ + q = n << (n_uword_bits - sr); + r = n >> sr; + unsigned int carry = 0; + for (; sr > 0; --sr) + { + /* r:q = ((r:q) << 1) | carry */ + r = (r << 1) | (q >> (n_uword_bits - 1)); + q = (q << 1) | carry; + /* carry = 0; + * if (r.all >= d.all) + * { + * r.all -= d.all; + * carry = 1; + * } + */ + const int s = (unsigned int)(d - r - 1) >> (n_uword_bits - 1); + carry = s & 1; + r -= d & s; + } + q = (q << 1) | carry; + return q; + } +} +template +inline __attribute__((always_inline)) uint32_t umodsi3_const_divisor(uint32_t a) +{ + return a - udivsi3_const_divisor(a) * d; +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/tt_log.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/tt_log.h new file mode 100644 index 
00000000000..5ff63a3af7d --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/tt_log.h @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// Define TT_LOG and its derivatives so the compile passes. +// If we are running workloads on hardware, TT_LOG will already have been defined. +#ifndef TT_LOG_DEFINED + #define TT_LOG_DEFINED + #define TT_LOG(...) (void)sizeof(__VA_ARGS__) + #define TT_LOG_NB(...) (void)sizeof(__VA_ARGS__) + #define TT_PAUSE(...) (void)sizeof(__VA_ARGS__) + #define TT_RISC_ASSERT(...) (void)sizeof(__VA_ARGS__) + #define TT_LLK_DUMP(...) (void)sizeof(__VA_ARGS__) + #define TT_DUMP_LOG(...) (void)sizeof(__VA_ARGS__) + #define TT_DUMP_ASSERT(...) (void)sizeof(__VA_ARGS__) +#endif diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_api.h new file mode 100644 index 00000000000..cd282dd2df9 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_api.h @@ -0,0 +1,86 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_eltwise_binary.h" + +/************************************************************************* + * LLK ELTWISE BINARY + *************************************************************************/ + +// Version with no operand +template < + EltwiseBinaryType eltwise_binary_type, + BroadcastType src_b_bcast_type, + int NUM_FIDELITY_PHASES = 0, + EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> +inline void llk_math_eltwise_binary_init(const std::uint32_t transpose = 0, const std::uint32_t acc_to_dest = 0) { + const std::uint32_t num_faces = 4; + + _llk_math_eltwise_binary_init_( + num_faces, transpose, acc_to_dest); +} + +// Version with operands +template < + EltwiseBinaryType eltwise_binary_type, + BroadcastType src_b_bcast_type, + int NUM_FIDELITY_PHASES = 0, + EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> +inline void llk_math_eltwise_binary_init_with_operands( + const std::uint32_t operand_A, + const std::uint32_t operand_B, + const std::uint32_t transpose = 0, + const std::uint32_t acc_to_dest = 0) { + const std::uint32_t operand_id = + get_operand_id(operand_A); // operand_id is used to extract tile dim data which is the same for both operands + const std::uint32_t num_faces = get_operand_num_faces(operand_id); + + _llk_math_eltwise_binary_init_( + num_faces, transpose, acc_to_dest); +} + +template < + EltwiseBinaryType eltwise_binary_type, + BroadcastType src_b_bcast_type, + DstSync Dst = DstSync::SyncFull, + int NUM_FIDELITY_PHASES = 0, + EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, + bool is_fp32_dest_acc_en = false> +inline void llk_math_eltwise_binary(uint dst_index, const bool clear_fp32_dst_acc = true) { + const std::uint32_t num_faces = 4; + + _llk_math_eltwise_binary_< + eltwise_binary_type, + src_b_bcast_type, + Dst, + 
NUM_FIDELITY_PHASES, + binary_reuse_dest, + is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc); +} + +template < + EltwiseBinaryType eltwise_binary_type, + BroadcastType src_b_bcast_type, + DstSync Dst = DstSync::SyncFull, + int NUM_FIDELITY_PHASES = 0, + EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, + bool is_fp32_dest_acc_en = false> +inline void llk_math_eltwise_binary( + const std::uint32_t operand_A, + const std::uint32_t operand_B, + uint dst_index, + const bool clear_fp32_dst_acc = true) { + const std::uint32_t operand_id = get_operand_id(operand_A); // both operands must have same number of faces + const std::uint32_t num_faces = get_operand_num_faces(operand_id); + + _llk_math_eltwise_binary_< + eltwise_binary_type, + src_b_bcast_type, + Dst, + NUM_FIDELITY_PHASES, + binary_reuse_dest, + is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h new file mode 100644 index 00000000000..5f662f22081 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_eltwise_binary_sfpu.h" + +/************************************************************************* + * LLK ELTWISE BINARY SFPU + *************************************************************************/ + +template +inline void llk_math_eltwise_binary_sfpu( + const uint operand, + uint dst_index_a, + uint dst_index_b, + int vector_mode = (int)Dim::RC, + uint param0 = 0, + uint param1 = 0, + uint param2 = 0, + uint param3 = 0, + uint param4 = 0, + uint param5 = 0) { + const std::uint32_t operand_id = get_operand_id(0); + const std::uint32_t num_faces = get_operand_num_faces(operand_id); + const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + + _llk_math_eltwise_binary_sfpu_( + face_r_dim, num_faces, dst_index_a, dst_index_b, vector_mode, param0, param1, param2, param3, param4, param5); +} + +template +inline void llk_math_eltwise_binary_sfpu_init( + uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { + _llk_math_eltwise_binary_sfpu_init_(param0, param1, param2, param3, param4, param5); +} + +template +inline void llk_math_eltwise_binary_sfpu_quant_int32( + uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { + llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); +} + +template +inline void llk_math_eltwise_binary_sfpu_quant_int32_init(const uint zero_point) { + llk_math_eltwise_binary_sfpu_init(zero_point); +} + +template +inline void llk_math_eltwise_binary_sfpu_requant_int32( + uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { + llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); +} + +template +inline void llk_math_eltwise_binary_sfpu_requant_int32_init(const uint zero_point) { + llk_math_eltwise_binary_sfpu_init(zero_point); +} + +template +inline void llk_math_eltwise_binary_sfpu_dequant_int32( + uint 
dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { + llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); +} + +template +inline void llk_math_eltwise_binary_sfpu_dequant_int32_init(const uint zero_point) { + llk_math_eltwise_binary_sfpu_init(zero_point); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_common_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_common_api.h new file mode 100644 index 00000000000..6f7d61cabb5 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_common_api.h @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "ckernel.h" +#include "ckernel_defs.h" +#include "ckernel_globals.h" +#include "ckernel_template.h" +#include "cmath_common.h" +#include "llk_defs.h" +#include "llk_io.h" +#include "llk_math_common.h" +#include "llk_operands.h" +#include "llk_param_structs.h" + +// Need to revisit why we even need this +#define EPS 1.19209e-07 // std::numeric_limits::epsilon() for FP32 + +/************************************************************************* + * LLK MATH COMMON + *************************************************************************/ + +template +inline void llk_math_wait_for_dest_available() { + _llk_math_wait_for_dest_available_(); +} + +template +inline void llk_math_dest_section_done() { + _llk_math_dest_section_done_(); +} + +template +inline void llk_math_pack_sync_init() { + _llk_math_pack_sync_init_(); +} + +template +inline void llk_math_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) { + _llk_math_get_tile_(tile_index, p_tile); +} + +template +inline void llk_math_release_tile(std::uint32_t operand) { + _llk_math_release_tile_(); +} + +inline void llk_math_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { _llk_math_debug_dump_(data, byte_size); } + +inline void llk_math_debug_dump_seek(std::uint8_t 
offset) { _llk_math_debug_dump_seek_(offset); } + +inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_new_operand) { + std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); + _llk_math_reconfig_data_format_srca_(unpack_dst_format[new_srca_operand_id]); +} + +inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) { + std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + _llk_math_reconfig_data_format_srcb_(unpack_dst_format[new_srcb_operand_id]); +} + +inline void llk_math_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) { + std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); + std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + + _llk_math_reconfig_data_format_(unpack_dst_format[new_srca_operand_id], unpack_dst_format[new_srcb_operand_id]); +} + +inline void llk_math_reconfig_data_format( + const std::uint32_t srca_old_operand, + const std::uint32_t srca_new_operand, + const std::uint32_t srcb_old_operand, + const std::uint32_t srcb_new_operand) { + std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); + std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); + std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); + std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + + if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id]) && + (unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { + llk_math_reconfig_data_format(srca_new_operand, srcb_new_operand); + } else if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) { + llk_math_reconfig_data_format_srca(srca_new_operand); + } else if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { + llk_math_reconfig_data_format_srcb(srcb_new_operand); + } +} + +inline 
void llk_math_reconfig_data_format_srca( + const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) { + std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); + std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); + + if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) { + llk_math_reconfig_data_format_srca(srca_new_operand); + } +} + +inline void llk_math_reconfig_data_format_srcb( + const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { + std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); + std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + + if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { + llk_math_reconfig_data_format_srcb(srcb_new_operand); + } +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h new file mode 100644 index 00000000000..8f7ea1f5713 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_matmul.h" + +/************************************************************************* + * LLK MATMUL + *************************************************************************/ + +template +inline void llk_math_matmul_init( + const std::uint32_t operandA, + const std::uint32_t operandB, + const std::uint32_t transpose = 0, + const std::uint32_t ct_dim = 1, + const std::uint32_t rt_dim = 1, + const std::uint32_t kt_dim = 1) { + const std::uint32_t in0_id = get_operand_id(operandA); + const std::uint32_t in1_id = get_operand_id(operandB); + + const bool partial_face = get_operand_partial_face(in0_id); + + const auto unpack_tile_dims = get_operand_tile_dims(in0_id); + const std::uint32_t in0_tile_r_dim = unpack_tile_dims[ckernel::TileDim::R_IDX]; + const std::uint32_t in0_tile_c_dim = unpack_tile_dims[ckernel::TileDim::C_IDX]; + const std::uint32_t in1_tile_r_dim = unpack_tile_dims[ckernel::TileDim::R_IDX]; + const std::uint32_t in1_tile_c_dim = unpack_tile_dims[ckernel::TileDim::C_IDX]; + +#ifdef ARCH_GRAYSKULL + _llk_math_matmul_init_( + in0_tile_r_dim, + in0_tile_c_dim, + in1_tile_r_dim, + in1_tile_c_dim, + partial_face, + transpose, + ct_dim, + rt_dim, + kt_dim); +#else + _llk_math_matmul_init_( + in0_tile_r_dim, + in0_tile_c_dim, + in1_tile_r_dim, + in1_tile_c_dim, + partial_face, + transpose, + ct_dim, + rt_dim, + kt_dim); +#endif +} + +template +inline void llk_math_matmul( + uint dst_index, + const bool transpose = false, + const std::uint32_t ct_dim = 1, + const std::uint32_t rt_dim = 1, + const std::uint32_t kt_dim = 1) { +#ifdef ARCH_GRAYSKULL + _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); +#else + _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); +#endif +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_reduce_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_reduce_api.h 
new file mode 100644 index 00000000000..f6d54ba067c --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_reduce_api.h @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_reduce.h" + +/************************************************************************* + * LLK REDUCE + *************************************************************************/ + +template < + PoolType type, + ReduceDim dim, + int num_fidelity_phases = 0, + bool is_fp32_dest_acc_en = false, + bool is_int_fpu_en = false> +inline void llk_math_reduce(const uint dst_index) { + _llk_math_reduce_(dst_index); +} + +template +inline void llk_math_reduce_init( + const std::uint32_t within_face_16x16_transpose = + 0) { // within_face_16x16_transpose used for unpack, ignored by math + _llk_math_reduce_init_(within_face_16x16_transpose); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_datacopy_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_datacopy_api.h new file mode 100644 index 00000000000..4a280fa8119 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_datacopy_api.h @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_common_api.h" +#include "llk_math_eltwise_unary_datacopy.h" + +/************************************************************************* + * LLK ELTWISE UNARY DATACOPY + *************************************************************************/ + +template < + DataCopyType type, + BroadcastType src_b_bcast_type = BroadcastType::NONE, + DstSync Dst = DstSync::SyncFull, + bool is_fp32_dest_acc_en = false, + bool unpack_to_dest = false> +inline void llk_math_eltwise_unary_datacopy(uint dst_index, uint operand = 0) { + const std::uint32_t operand_id = get_operand_id(0); + _llk_math_eltwise_unary_datacopy_( + dst_index, unpack_src_format[operand_id], unpack_dst_format[operand_id]); +} + +template +// within_face_16x16_transpose is used by unpacker, math does not transpose +inline void llk_math_eltwise_unary_datacopy_init( + const std::uint32_t transpose_of_faces = 0 /*unused*/, + const std::uint32_t within_face_16x16_transpose = 0 /* unused */, + const std::uint32_t operand = 0) { + const std::uint32_t operand_id = get_operand_id(0); + const std::uint32_t num_faces = get_operand_num_faces(operand_id); + _llk_math_eltwise_unary_datacopy_init_( + transpose_of_faces, within_face_16x16_transpose, num_faces); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h new file mode 100644 index 00000000000..17bba18f12a --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h @@ -0,0 +1,345 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "metal_ckernel_sfpu.h" +#include "llk_math_eltwise_unary_sfpu_init.h" + +namespace ckernel { + +/************************************************************************* +* LLK ELTWISE UNARY SFPU +*************************************************************************/ + +template +inline void llk_math_eltwise_unary_sfpu( + uint dst_index, + int vector_mode = (int)Dim::RC, + uint param0 = 0, + uint param1 = 0, + uint param2 = 0, + uint param3 = 0, + uint param4 = 0, + uint param5 = 0) { + + const std::uint32_t operand_id = get_operand_id(0); + const std::uint32_t num_faces = get_operand_num_faces(0); + const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + + _llk_math_eltwise_unary_sfpu_( + face_r_dim, + num_faces, + dst_index, + vector_mode, + param0, + param1, + param2, + param3, + param4, + param5 + ); +} + + +// New LLK SFPU APIs +template +inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_rsqrt_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +} + +template +inline void llk_math_eltwise_unary_sfpu_log_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_log_with_base(uint dst_index,uint base_scale) { + llk_math_eltwise_unary_sfpu(dst_index,base_scale); +} + +template +inline void llk_math_eltwise_unary_sfpu_log_with_base_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +} + +template +inline void llk_math_eltwise_unary_sfpu_signbit(uint dst_index) { + 
llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_signbit_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_tanh_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//sign +template +inline void llk_math_eltwise_unary_sfpu_sign(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_sign_init() { + llk_math_eltwise_unary_sfpu_init(); +} +template +inline void llk_math_eltwise_unary_sfpu_dropout(uint dst_index, int vector_mode, int integer_dropout, int scale_factor) { + constexpr bool dont_care = false; + llk_math_eltwise_unary_sfpu(dst_index, vector_mode, integer_dropout, scale_factor); +} + +inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) { + constexpr bool dont_care = false; + constexpr uint dont_care_param = 0; + + llk_math_eltwise_unary_sfpu_init(dont_care_param, dont_care_param, seed); +} + +template +inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +} + +template +inline void llk_math_eltwise_unary_sfpu_sigmoid_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//EQZ +template +inline void llk_math_eltwise_unary_sfpu_eqz(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_eqz_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//NEZ +template +inline void llk_math_eltwise_unary_sfpu_nez(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_nez_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//LTZ +template +inline void llk_math_eltwise_unary_sfpu_ltz(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_ltz_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//GTZ +template +inline void 
llk_math_eltwise_unary_sfpu_gtz(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_gtz_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//LEZ +template +inline void llk_math_eltwise_unary_sfpu_lez(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_lez_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//GEZ +template +inline void llk_math_eltwise_unary_sfpu_gez(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_gez_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +} + +template +inline void llk_math_eltwise_unary_sfpu_max_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +} + +template +inline void llk_math_eltwise_unary_sfpu_square_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index, vector_mode, pow); +} + +template +inline void llk_math_eltwise_unary_sfpu_power_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +} + +template +inline void llk_math_eltwise_unary_sfpu_abs_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +} + +template +inline void 
llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//EXP2 +template +inline void llk_math_eltwise_unary_sfpu_exp2(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_exp2_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//heaviside +template +inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index,vector_mode,param0); +} + +template +inline void llk_math_eltwise_unary_sfpu_heaviside_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//EXPM1 +template +inline void llk_math_eltwise_unary_sfpu_expm1(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_expm1_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//Asin +template +inline void llk_math_eltwise_unary_sfpu_asin(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_asin_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//Atan +template +inline void llk_math_eltwise_unary_sfpu_atan(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_atan_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//Acos +template +inline void llk_math_eltwise_unary_sfpu_acos(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_acos_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//silu +template +inline void llk_math_eltwise_unary_sfpu_silu(uint dst_index) { + llk_math_eltwise_unary_sfpu(dst_index); +} + +template +inline void llk_math_eltwise_unary_sfpu_silu_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +//Mask +template +inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index, vector_mode); 
+} + +template +inline void llk_math_eltwise_unary_sfpu_mask_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +// Negative +template +inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) { + llk_math_eltwise_unary_sfpu(dst_index,vector_mode); +} + +template +inline void llk_math_eltwise_unary_sfpu_negative_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_op_info_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_op_info_api.h new file mode 100644 index 00000000000..ca7e298a7c2 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_op_info_api.h @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +extern uint32_t op_info_offset; + +inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) { + + uint32_t* op_info_ptr = reinterpret_cast(OP_INFO_BASE_ADDR + op_info_offset); + static constexpr uint32_t op_info_num_items = 7; + + volatile tt_l1_ptr uint32_t* op_info_struct_ptr = reinterpret_cast(&op_info_struct); + for (uint32_t i = 0; i < op_info_num_items; i++) { + op_info_struct_ptr[i] = op_info_ptr[i]; + } + op_info_offset += 28; + + if (op_info_offset == OP_INFO_SIZE) { + op_info_offset = 0; // In case we go out of bounds + } +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h new file mode 100644 index 00000000000..808d88a6281 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h @@ -0,0 +1,270 @@ +#pragma once +#include "ckernel.h" +#include "ckernel_defs.h" +#include "ckernel_template.h" +#include "cpack_common.h" +#include "ckernel_globals.h" +#include "circular_buffer.h" + +#include "llk_io.h" +#include "llk_defs.h" +#include "llk_outputs.h" +#include "llk_param_structs.h" +#include "llk_pack.h" +#include "llk_pack_common.h" + 
+/************************************************************************* +* LLK PACK +*************************************************************************/ + +template +inline void llk_pack_mop_config(const uint32_t output) { + + const std::uint32_t output_id = get_output_id(output); + const std::uint32_t num_faces = get_output_num_faces(output_id); + const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); + const bool partial_face = get_output_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]); + const bool narrow_tile = get_output_narrow_tile(output_id); + + _llk_pack_mop_config_( + pack_dst_format[output_id], + face_r_dim, + num_faces, + partial_face, + narrow_tile + ); +} + +template +inline void llk_pack_hw_configure(const llk_pack_params_t *pack_params) { + + const std::uint32_t output_id = get_output_id(pack_params->pack_output); + const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); + const std::uint32_t num_faces = get_output_num_faces(output_id); + const bool partial_face = get_output_partial_face(output_id); + const bool narrow_tile = get_output_narrow_tile(output_id); + + const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; + + _llk_pack_hw_configure_( + pack_src_format[output_id], + pack_dst_format[output_id], + tile_size, + face_r_dim, + num_faces, + partial_face, + narrow_tile, + pack_params->relu_config.val + ); +} + +template +inline void llk_pack_hw_configure_disaggregated(std::uint32_t pack_output) { + llk_pack_params_t llk_pack_params = { + .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold,}}}; + llk_pack_hw_configure(&llk_pack_params); +} + +template +inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) { + const std::uint32_t output_id = get_output_id(pack_params->pack_output); + const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); + const std::uint32_t num_faces 
= get_output_num_faces(output_id); + const bool partial_face = get_output_partial_face(output_id); + const bool narrow_tile = get_output_narrow_tile(output_id); + + const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; + + _llk_pack_reduce_hw_configure_( + pack_src_format[output_id], + pack_dst_format[output_id], + tile_size, + face_r_dim, + num_faces, + partial_face, + narrow_tile, + pack_params->relu_config.val + ); +} + +template +inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output) { + llk_pack_params_t llk_pack_params = { + .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}}; + llk_pack_reduce_hw_configure(&llk_pack_params); +} + +template +inline void llk_pack_init(const std::uint32_t pack_output = 0) { + + const std::uint32_t output_id = get_output_id(pack_output); + const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); + const std::uint32_t num_faces = get_output_num_faces(output_id); + const bool partial_face = get_output_partial_face(output_id); + const bool narrow_tile = get_output_narrow_tile(output_id); + + _llk_pack_init_( + pack_dst_format[output_id], + face_r_dim, + num_faces, + partial_face, + narrow_tile + ); +} + +template +inline std::uint32_t get_output_tile_address(std::uint8_t output_id, std::uint32_t output_tile_index) { + + std::uint32_t pack_tile_addr; + if constexpr (out_of_order_output) { + pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + + (std::uint32_t)(cb_interface[output_id].fifo_page_size)*output_tile_index - 1; + } else { + if constexpr (untilize) { + // FIXME: Need to support pack-untilize? 
+ // std::uint16_t out_tile_index = (cb_interface[output_id].ublock_tile_cnt/cb_interface[output_id].ublock_ct)*cb_interface[output_id].row_tile_dim + + // cb_interface[output_id].ublock_tile_cnt%cb_interface[output_id].ublock_ct; //FIXME: optimize perf + // pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1; + // pack_tile_addr += out_tile_index*(std::uint32_t)(cb_interface[output_id].fifo_page_size); + + // cb_interface[output_id].ublock_tile_cnt++; + + // if (cb_interface[output_id].ublock_tile_cnt == cb_interface[output_id].ublock_tile_dim) { + // cb_interface[output_id].ublock_tile_cnt=0; + // cb_interface[output_id].fifo_wr_tile_ptr += (std::uint32_t)(cb_interface[output_id].fifo_page_size)*cb_interface[output_id].ublock_ct; + // } + } else { + pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1; + cb_interface[output_id].fifo_wr_tile_ptr += cb_interface[output_id].fifo_page_size; + } + } + return pack_tile_addr; +} + +template +inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0) { + std::uint8_t output_id = get_output_id(output); + + static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!"); + + std::uint32_t pack_tile_addr = get_output_tile_address(output_id, output_tile_index); + + _llk_pack_( + tile_index, + pack_tile_addr + ); +} + +/************************************************************************* +* LLK PACK COMMON +*************************************************************************/ + + +inline void llk_packer_wait_for_math_done() { + _llk_packer_wait_for_math_done_(); +} + +template +inline void llk_packer_set_math_semaphore() { + _llk_packer_set_math_semaphore_(); +} + +template +inline void llk_pack_dest_section_done() { + _llk_pack_dest_section_done_(); +} + +template +inline void llk_init_packer_dest_offset_registers(const std::uint32_t 
pack_output = 0) { + const std::uint32_t output_id = get_output_id(pack_output); + const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); + const bool narrow_tile = get_output_narrow_tile(output_id); + + _llk_init_packer_dest_offset_registers_( + face_r_dim, + narrow_tile + ); +} + +template +inline void llk_pack_dest_init(const std::uint32_t pack_output = 0) { + + const std::uint32_t output_id = get_output_id(pack_output); + const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); + const bool narrow_tile = get_output_narrow_tile(output_id); + + _llk_pack_dest_init_( + face_r_dim, + narrow_tile + ); +} + +template +inline void llk_pack_get_tile(std::uint32_t output, std::uint32_t tile_index, std::uint32_t *p_tile) { + _llk_pack_get_tile_(tile_index, p_tile); +} + +template +inline void llk_pack_release_tile(std::uint32_t output) { + _llk_pack_release_tile_(); +} + +inline void llk_pack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { + _llk_pack_debug_dump_(data, byte_size); +} + +inline void llk_pack_debug_dump_seek(std::uint8_t offset) { + _llk_pack_debug_dump_seek_(offset); +} + +template +inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) { + + const std::uint32_t output_id = get_output_id(new_output); + const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); + const std::uint32_t num_faces = get_output_num_faces(output_id); + const bool partial_face = get_output_partial_face(output_id); + const bool narrow_tile = get_output_narrow_tile(output_id); + + _llk_pack_reconfig_data_format_( + pack_src_format[output_id], + pack_dst_format[output_id], + cb_interface[output_id].fifo_page_size, + face_r_dim, + num_faces, + partial_face, + narrow_tile + ); +} + +template +inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) { + std::uint32_t old_output_id = get_output_id(old_output); + std::uint32_t new_output_id = get_output_id(new_output); + + 
if((pack_dst_format[old_output_id] != pack_dst_format[new_output_id]) + && (pack_dst_format[old_output_id] != (uint)DataFormat::Invalid) + && (pack_dst_format[new_output_id] != (uint)DataFormat::Invalid)) { + llk_pack_reconfig_data_format(new_output); + } else if constexpr (is_tile_dim_reconfig_en) { + // Same format but different tile dims + llk_pack_mop_config(new_output); + } +} + +TT_ALWAYS_INLINE void llk_pack_relu_config(const std::uint32_t config) { + _llk_pack_relu_config_(config); +} + +inline void llk_pack_reconfig_l1_acc(const std::uint32_t enable) { + _llk_pack_reconfig_l1_acc_(enable); +} + +template +inline void llk_pack_reduce_mask_config() { + _llk_pack_reduce_mask_config_(); +} + +inline void llk_pack_reduce_mask_clear() { + _llk_pack_reduce_mask_clear_(); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_param_structs.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_param_structs.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_param_structs.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_param_structs.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_reverseops.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_reverseops.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_reverseops.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_reverseops.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_cdf.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_cdf.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_cdf.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_cdf.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_converter.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_converter.h similarity index 100% rename from 
tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_converter.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_converter.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_elu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_elu.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_erf_erfc.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_erfinv.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_erfinv.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_exp.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_exp.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_exp.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_exp.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_gelu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_gelu.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_i0.h 
b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_i0.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_isinf_isnan.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_isinf_isnan.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_isinf_isnan.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_isinf_isnan.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_logical_not_noti.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_recip.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_recip.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_recip.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_recip.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_relu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_relu.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_sqrt.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_sqrt.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_sqrt.h rename to 
tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_sqrt.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_trigonometry.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_trigonometry.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_trigonometry.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_trigonometry.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_0_param.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_0_param.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_1_param.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_1_param.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h similarity index 83% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h index 822699707d1..f1e7d19acc8 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h @@ -8,9 +8,11 @@ #include "ckernel_globals.h" #include 
"ckernel_include.h" #include "ckernel_template.h" +#include "metal_ckernel_sfpu.h" #include "cmath_common.h" #include "llk_format_conversions.h" #include "llk_math_common.h" #include "llk_param_structs.h" +#include "llk_math_eltwise_unary_sfpu.h" using namespace ckernel; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_elu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_elu.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_erf_erfc.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_erfinv.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_erfinv.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_exp.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_exp.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_gelu.h 
b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_gelu.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_i0.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_i0.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_init.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h similarity index 66% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_init.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h index 691accf168a..b82b1f39cb4 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_init.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h @@ -28,4 +28,18 @@ inline void llk_math_eltwise_unary_sfpu_init(void (*func)()) { math::reset_counters(p_setrwc::SET_ABD_F); } +template +inline void llk_math_eltwise_unary_sfpu_init( + uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { + + _llk_math_eltwise_unary_sfpu_init_( + param0, + param1, + param2, + param3, + param4, + param5 + ); +} + } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_isinf_isnan.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h similarity index 100% rename from 
tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_isinf_isnan.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_logical_not_noti.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_recip.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_recip.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_relu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_relu.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_reverseops.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_reverseops.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_sqrt.h 
b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_sqrt.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_trigonometry.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_trigonometry.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h new file mode 100644 index 00000000000..72c27cde02b --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h @@ -0,0 +1,778 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel_defs.h" +#include "ckernel_sfpu.h" +#include "ckernel.h" +#include "noc_nonblocking_api.h" + +#include "sfpi.h" + +#include "ckernel_sfpu_cdf.h" +#include "ckernel_sfpu_exp.h" +#include "ckernel_sfpu_recip.h" +#include "ckernel_sfpu_converter.h" + +using namespace sfpi; + +namespace ckernel +{ +namespace sfpu +{ + +template +inline void calculate_rsqrt() +{ + + for (int d = 0; d < ITERATIONS; d++) + { + + vFloat in = dst_reg[0]; + v_if(dst_reg[0] == 0.0f){ + dst_reg[0] = std::numeric_limits::infinity(); + }v_else{ + vFloat result = 1.0f; + v_if(dst_reg[0] > 1.0f){ + result = sfpu_reciprocal(in); + }v_endif; + + for (int r = 0; r < RECIPROCAL_ITERATIONS; r++) + { + // y = y * (1.5 - 0.5 * x * y * y) Newton's method iteration. 
+ result = result * (1.5F - 0.5F * dst_reg[0] * result * result); + } + dst_reg[0] = result; + }v_endif; + + dst_reg++; + + } +} + +template +inline void calculate_sigmoid_appx() +{ + vUInt l0 = l_reg[LRegs::LReg0]; + vUInt l1 = l_reg[LRegs::LReg1]; + vUInt l2 = l_reg[LRegs::LReg2]; + + #pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + + dst_reg[0] = lut(val, l0, l1, l2) + 0.5f; + + dst_reg++; + } + + l_reg[LRegs::LReg0] = l0; + l_reg[LRegs::LReg1] = l1; + l_reg[LRegs::LReg2] = l2; +} + +// TODO: Implement using bitwise comparision +template +inline void calculate_signbit() +{ + + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + v_if (val <= -0.0f) { + val = 1.0f; + } v_elseif (val >= 0.0f) { + val = 0.0f; + } + v_endif; + dst_reg[0] = val; + + dst_reg++; + } + +} + +template +inline void calculate_tanh() +{ + // SFPU microcode + vUInt l0 = l_reg[LRegs::LReg0]; + vUInt l1 = l_reg[LRegs::LReg1]; + vUInt l2 = l_reg[LRegs::LReg2]; + + #pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + val = lut(val, l0, l1, l2); + dst_reg[0] = val; + + dst_reg++; + } + + l_reg[LRegs::LReg0] = l0; + l_reg[LRegs::LReg1] = l1; + l_reg[LRegs::LReg2] = l2; +} + +template +inline void calculate_hardtanh(uint param0, uint param1, uint param2) +{ + // All params are in FP16_B format + // param0 = -(neg_threshold) + // param1 = -(pos_threshold - neg_threshold) + // param2 = -(pos_threshold) + + vFloat p0 = s2vFloat16(param0); + vFloat p1 = s2vFloat16(param1); + vFloat p2 = s2vFloat16(param2); + // SFPU microcode + #pragma GCC unroll 0 + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + + val += p0;// 12 bits + v_if (val < 0.0f) { + val = 0.0f; + } + v_endif; + + val += p1;// 12 bits + v_if (val >= 0.0f) { + val = 0.0f; + } + v_endif; + + val += p2;// 12 bits + + dst_reg[0] = val; + + dst_reg++; + } +} + +template +inline void calculate_tanh_derivative() +{ + vUInt l0 = 
l_reg[LRegs::LReg0]; + vUInt l1 = l_reg[LRegs::LReg1]; + vUInt l2 = l_reg[LRegs::LReg2]; + + // tanh'(x) = 1 - (tanh(x))^2 + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + + if constexpr (!WITH_PRECOMPUTED_TANH) { + val = lut(val, l0, l1, l2); + } + + val = val * (-val) + vConst1; + dst_reg[0] = val; + + dst_reg++; + } + + l_reg[LRegs::LReg0] = l0; + l_reg[LRegs::LReg1] = l1; + l_reg[LRegs::LReg2] = l2; +} + +template +inline void calculate_dropout(uint prob, uint scale) +{ + // SFPU microcode + + vUInt rand = l_reg[LRegs::LReg3]; + + #pragma GCC unroll 0 + for (int d = 0; d < ITERATIONS; d++) { + //////////////////////// + // Scale samples + /////////////////////// + dst_reg[0] = dst_reg[0] * s2vFloat16b(scale); + + //////////////////////// + // Drop samples + /////////////////////// + v_if (rand < prob) { + dst_reg[0] = vConst0; + } + v_endif; + + //////////////////////// + // 16-bit PRNG update + /////////////////////// + vUInt lfsr = vConstIntPrgm1; + vUInt tmp = lfsr & rand; + rand = rand >> 1; + v_if (tmp != 0) { + vUInt mask = vConstIntPrgm0; + rand ^= mask; + } + v_endif; + + dst_reg++; + } + + l_reg[LRegs::LReg3] = rand; +} + +template +inline void calculate_power_iterative(const uint exponent) +{ + #pragma GCC unroll 8 + for (int d = 0; d < 8; d++) + { + vFloat in = dst_reg[0]; + vFloat result = 1.0f; + for (uint i = 0; i < exponent; i++) { + result *= in; + } + dst_reg[0]=result; + dst_reg++; + } +} + +template +inline void calculate_square() +{ + #pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) + { + vFloat in = dst_reg[0]; + vFloat result = in * in; + + dst_reg[0] = result; + + dst_reg++; + } +} + +template +sfpi_inline void calculate_log_body(const uint log_base_scale_factor) +{ + //////////////////////////// + // Load From dest + "normalize to calculation range" + //////////////////////////// + vFloat in = dst_reg[0]; + vFloat x = setexp(in, 127); // set exp to exp bias (put in range of 1-2) + + // XXXXXX ask Namal? 
if we can derive the coefficients below to higher precision + //////////////////////////// + // Calculate Cheby Approximation using Horner Form Multiplication: 3rd Order + // x* ( x* (A*x + B) + C) + D + // A :0.1058, B: -0.3942, C: 0.9813, D: 0.006 + // Run above on (x-1) so x is in ln(x+1), plug (x-1 into equation above to + // save the subtract and get A',B',C',D'): + // A' = A + // B' = -3A + B + // C' = 3a -2B + C + // D' = -A + B - C + D + // A':0.1058, B':-0.7116, C':2.0871, D':-1.4753 + //////////////////////////// + vFloat a = vConstFloatPrgm1; + vFloat b = vConstFloatPrgm2; + // XXXXX try variants of the below: B'=.7122, C'=2.0869 + vFloat series_result = x * (x * (x * a + b) + 2.0871) + -1.4753f; + + //////////////////////////// + // Convert exponent to float + //////////////////////////// + vInt exp = exexp(in); + v_if (exp < 0) { + exp = setsgn(~exp + 1, 1); + } + v_endif; + + vFloat expf = int32_to_float(exp, 0); + vFloat vConstLn2 = vConstFloatPrgm0; + vFloat result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2) + + if constexpr (HAS_BASE_SCALING) { + result *= s2vFloat16a(log_base_scale_factor); + } + + //////////////////////////// + // Base case when input is 0. 
ln(0) = -inf + //////////////////////////// + v_if (in == 0.0F) { // Reload for register pressure + result = -std::numeric_limits::infinity(); + } + v_endif; + + dst_reg[0] = result; +} + +template +inline void calculate_log(uint log_base_scale_factor) +{ + #pragma GCC unroll 8 + for(int d = 0; d < ITERATIONS; d++){ + calculate_log_body(log_base_scale_factor); + dst_reg++; + } +} + +sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& flag2, float init) +{ + flag1 = init; + if (check) { + flag2 = init; + } +} + +template +inline void calculate_comp(uint exponent_size_8) +{ + const vFloat zero = 0.0f; + const vFloat one = 1.0f; + for (int d = 0; d < ITERATIONS; d++) + { + vFloat v = dst_reg[0]; + vFloat flag1, flag2; + + //a[i] == 0 + if constexpr(COMP_MODE == SfpuType::equal_zero) { + v_if (sfpu_is_fp16_zero(v, exponent_size_8)) { + v = one; + } v_else { + v = zero; + } + v_endif; + } + + //a[i] != 0 + if constexpr(COMP_MODE == SfpuType::not_equal_zero) { + v_if (sfpu_is_fp16_zero(v, exponent_size_8)) { + v = zero; + } v_else { + v = one; + } + v_endif; + } + + //a[i] < 0 + if constexpr(COMP_MODE == SfpuType::less_than_zero) { + v_if (v >= 0.0f) { + v = zero; + } v_else { + v = one; + } + v_endif; + } + + //a[i] >= 0 + if constexpr(COMP_MODE == SfpuType::greater_than_equal_zero) { + v_if (v >= 0.0f) { + v = one; + } v_else { + v = zero; + } + v_endif; + } + + //a[i] > 0 + if constexpr(COMP_MODE == SfpuType::greater_than_zero) { + v_if (v > 0.0f) { + v = one; + } v_else { + v = zero; + } + v_endif; + } + + //a[i] <= 0 + if constexpr(COMP_MODE == SfpuType::less_than_equal_zero) { + v_if (v > 0.0f) { + v = zero; + } v_else { + v = one; + } + v_endif; + } + + dst_reg[0] = v; + dst_reg++; + } +} + +template +inline void calculate_clamp(uint param0, uint param1, uint param2) +{ + // All params are in FP16 format + // param0 = min + // param1 = max + + //uint format = (param0 >> 16)&0x1; + s2vFloat16::Format format = s2vFloat16::fp16a; + + // SFPU 
microcode + vFloat min = s2vFloat16(param0, format); + vFloat max = s2vFloat16(param1, format); + #pragma GCC unroll 0 + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + + v_if (val < min) { + val = s2vFloat16(param0, format); + } v_elseif (val >= max) { + val = s2vFloat16(param1, format); + } + v_endif; + + dst_reg[0] = val + s2vFloat16b(param2); // 12 bits + + dst_reg++; + } +} + +template +inline void calculate_abs() +{ + // SFPU microcode + for (int d = 0; d < ITERATIONS; d++) + { + vFloat v = dst_reg[0]; + dst_reg[0] = sfpi::abs(v); + dst_reg++; + } +} + + +template +inline void calculate_exp2() +{ + // SFPU microcode + for (int d = 0; d < ITERATIONS; d++) + { + vFloat v = dst_reg[0]; + // log(2) = 0.6931471805; + v = v * 0.6931471805f; + // exp = e^(v) + vFloat exp = calculate_exponential_body_improved(v); + dst_reg[0] = exp; + dst_reg++; + } +} + +template +inline void calculate_sign() +{ + // All params are in FP16 format + for (int d = 0; d < ITERATIONS; d++) + { + vFloat v = dst_reg[0]; + vFloat result = vConst1; + v_if (v < 0.0f) { + result = vConstNeg1; + } v_elseif(v > 0.0f) { + result = vConst1; + } v_else { + result = vConst0; + } + v_endif; + + dst_reg[0] = result; + dst_reg++; + } +} +template +inline void calculate_max() +{ + for (int d = 0; d < ITERATIONS; d++) + { + vFloat a = dst_reg[0]; + vFloat b = dst_reg[32]; + v_if(a < b) { + dst_reg[0] = b; + } + v_endif; + + dst_reg++; + } +} + +template +inline void calculate_min() +{ + for (int d = 0; d < ITERATIONS; d++) + { + vFloat a = dst_reg[0]; + vFloat b = dst_reg[32]; + v_if(a > b) { + dst_reg[0] = b; + } + v_endif; + + dst_reg++; + } +} + +template +inline void calculate_expm1() +{ + // SFPU microcode + for (int d = 0; d < ITERATIONS; d++) + { + vFloat v = dst_reg[0]; + v = calculate_exponential_body_improved(v); + dst_reg[0] = v - 1.0f; + dst_reg++; + } +} + + +#define POLYVAL6(coef5, coef4, coef3, coef2, coef1, coef0, t4) (t4 * (t4 * (t4 * (t4 * (coef5 * t4 + coef4) + 
coef3) + coef2) + coef1) + coef0) + +template +sfpi_inline vFloat sfpu_atan_maclaurin_series(vFloat val) +{ + v_if(1 > sfpi::abs(val)){ + dst_reg[0] = sfpi::abs(val) ; + } + v_else{ + dst_reg[0] = sfpu_reciprocal(sfpi::abs(val)); + } + v_endif; + + vFloat t1 = dst_reg[0] * dst_reg[0]; + + t1 = POLYVAL6(-0.013480470f, 0.057477314f, -0.121239071f, 0.195635925f, -0.332994597f, 0.999995630f, t1); + + t1 = t1 * dst_reg[0]; + + v_if (sfpi::abs(val) > 1){ + t1 = 1.570796327f - t1; + } + v_endif; + + v_if(val < 0 ){ + t1 = -t1; + } + v_endif; + + return t1; +} + +template +inline void calculate_atan() +{ + // SFPU microcode + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + val = sfpu_atan_maclaurin_series(val); + dst_reg[0] = val; + dst_reg++; + } +} + + +template +sfpi_inline vFloat sfpu_asine_maclaurin_series(vFloat val) +{ + // input for [-1:1] + // Mclauren series + // arcsin(x) = x + [(1/2) *x^3/3] + [(1 * 3) / (2 * 4) * x^5 / 5] + [(1 * 3 * 5) / (2 * 4 * 6) * x^7 / 7 ] + ... 
+ // arcsin(x) ≈ x + (1/6) * x^3 + (3/40) * x^5 + (5/112) * x^7 + (35/1152) * x^9 + (63/2816) * x^11a + + vFloat tmp = val; + vFloat val_square = val * val; + // x + vFloat output = tmp; + // (1/6) * x^3 + tmp = tmp * val_square; + output += 0.166666666 * tmp; + // (3/40) * x^5 + tmp = tmp * val_square; + output += 0.075 * tmp; + + //(5/112) * x^7 + tmp = tmp * val_square; + output += 0.044642857 * tmp; + + // (35/1152) *x^9 + tmp = tmp * val_square; + output += 0.03038194 * tmp; + + //(63/2816) * x^11 + tmp = tmp * val_square; + output += 0.02237216 * tmp; + + // Write out output + return output; +} + +template +inline void calculate_asin() +{ + // SFPU microcode + for (int d = 0; d < ITERATIONS; d++) + { + vFloat v = dst_reg[0]; + v = sfpu_asine_maclaurin_series(v); + dst_reg[0] = v; + dst_reg++; + } +} + + +#define PI_2 (1.570796326794) +template +inline void calculate_acos() +{ + // SFPU microcode + // acos = (pi/2 - asin) + for (int d = 0; d < ITERATIONS; d++) + { + vFloat v = dst_reg[0]; + v = sfpu_asine_maclaurin_series(v); + v = PI_2 - v; + dst_reg[0] = v; + dst_reg++; + } +} + +template +inline void cast_fp32_to_fp16a() +{ + #pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) + { + //vFloat val = dst_reg[0]; + //dst_reg[0] = float_to_fp16a(val, 0); + TTI_SFPLOAD(0, 0, 3, 0); + TTI_SFP_STOCH_RND(0,0,0,0,0,8); + TTI_SFPSTORE(0,1,3,0); + dst_reg++; + } +} + + + +template +inline void calculate_negative() +{ + + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + dst_reg[0] = -val; + dst_reg++; + } +} + +template +inline void calculate_add1() +{ + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + dst_reg[0] = 1.0f + val; + dst_reg++; + } +} + +inline +vFloat sigmoid_piecewise_linear_positive(vFloat val) { + vFloat result = 0.0f; + v_if ( val >= +5.0f) { + result = 1.0f; + } v_elseif ( val > 1.0f && val < 5.0f ) { + result = POLYVAL5(0.00144462f, -0.01055479f, -0.01203685f, 0.24300185f, 0.50437757f,val); + } v_else 
{ + result = 0.229f*val + 0.5f; // linear appx as y = 0.229x + 0.5 + } + v_endif; + return result; +} + +//sigmoid is anti-symmetric and offset by 1 +//sigmoid[-x] = 1 - sigmoid[x] +template +inline void calculate_sigmoid() +{ + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + vFloat result = 0.0f; + + v_if ( val < 0.0f ) { + val = -val; + } + v_endif; + + result = sigmoid_piecewise_linear_positive(val); + + val = dst_reg[0]; + v_if ( val < 0.0f ) { + result = 1.0f - result; + } + v_endif; + + dst_reg[0] = result; + dst_reg++; + } + + return; +} + +template +inline void calculate_heaviside(uint value) +{ + // SFPU microcode + Converter c_value; + c_value.u = value; + vFloat s = c_value.f; + + #pragma GCC unroll 0 + for (int d = 0; d < ITERATIONS; d++) { + vFloat v = dst_reg[0]; + + v_if (v < 0.0f) { + v = 0.0f; + }v_elseif (v > 0.0f) { + v = 1.0f; + }v_else { + v = s; + } + v_endif; + + dst_reg[0] = v; + + dst_reg++; + } +} + +template +inline void calculate_silu() +{ + // SFPU microcode + for (int d = 0; d < ITERATIONS; d++) { + vFloat val = dst_reg[0]; + v_if ( val < 0.0f ) { + val = -val; + } + v_endif; + + vFloat result = sigmoid_piecewise_linear_positive(val); + + val = dst_reg[0]; + v_if ( val < 0.0f ) { + result = 1.0f - result; + } + v_endif; + result = val * result; + dst_reg[0] = result; + dst_reg++; + } +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h new file mode 100644 index 00000000000..bce909a4395 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_AB.h" +#include "llk_unpack_common_api.h" + +/************************************************************************* + * LLK UNPACK AB + *************************************************************************/ + +template +inline void llk_unpack_AB_hw_configure( + const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) { + // In0 -> unpA + // In1 -> unpB + const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpA_operand); + const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpB_operand); + + // unpA -> srcA + // unpB -> srcB + const uint32_t num_faces = get_operand_num_faces(unpA_operand_id); // num faces in unpA and unpB are the same + + const uint32_t face_r_dim = get_operand_face_r_dim(unpA_operand_id); // face r dim in unpA and unpB are the same + + _llk_unpack_AB_hw_configure_( + unpack_src_format[unpA_operand_id], + unpack_src_format[unpB_operand_id], + unpack_dst_format[unpA_operand_id], + unpack_dst_format[unpB_operand_id], + face_r_dim, + within_face_16x16_transpose, + num_faces); +} + +template +inline void llk_unpack_AB_hw_configure_disaggregated( + const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0) { + const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand}; + + llk_unpack_AB_hw_configure(&unpack_AB_params, within_face_16x16_transpose); +} + +template +inline void llk_unpack_AB_mop_config(const bool transpose_of_faces = false, const std::uint32_t operand_id = 0) { + const std::uint32_t num_faces = get_operand_num_faces(operand_id); + const bool narrow_tile = get_operand_narrow_tile(operand_id); // if narrow tile read face 0 twice for row broadcast + // or read face 0 and 1 for col broadcast + _llk_unpack_AB_mop_config_(transpose_of_faces, num_faces, narrow_tile); +} + +template +inline void 
llk_unpack_AB_init( + const std::uint32_t operandA, + const std::uint32_t operandB, + const std::uint32_t transpose = 0, + const std::uint32_t acc_to_dest = 0) { + const std::uint32_t operandA_id = get_operand_id(operandA); + const std::uint32_t face_r_dim = get_operand_face_r_dim(operandA_id); // face r dim in unpA and unpB are the same + const std::uint32_t num_faces = get_operand_num_faces(operandA_id); + const bool narrow_tile = + get_operand_narrow_tile(operandA_id); // if narrow tile read face 0 twice for row broadcast + + _llk_unpack_AB_init_(face_r_dim, num_faces, narrow_tile, transpose, acc_to_dest); +} + +template +inline void llk_unpack_AB( + const std::uint32_t operandA, + const std::uint32_t operandB, + const std::uint32_t tile_index_a, + const std::uint32_t tile_index_b, + const bool transpose_of_faces = 0 /*not used*/) { + std::uint32_t operandA_id = get_operand_id(operandA); + std::uint32_t operandB_id = get_operand_id(operandB); + std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1; + std::uint32_t offset_address_a = cb_interface[operandA_id].fifo_page_size * tile_index_a; + std::uint32_t address_a = base_address_a + offset_address_a; + std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1; + std::uint32_t offset_address_b = cb_interface[operandB_id].fifo_page_size * tile_index_b; + std::uint32_t address_b = base_address_b + offset_address_b; + + _llk_unpack_AB_(address_a, address_b, transpose_of_faces > 0); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h new file mode 100644 index 00000000000..68eca79f4e9 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h @@ -0,0 +1,136 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_AB_matmul.h" +#include "llk_unpack_common_api.h" + +/************************************************************************* + * LLK UNPACK AB MATMUL + *************************************************************************/ + +template +inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) { + const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca; + + // In0 -> unpB + // In1 -> unpA + const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpB_operand); + const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpA_operand); + + // unpA -> srcA + // unpB -> srcB + const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); + const uint32_t unpB_num_faces = get_operand_num_faces(unpB_operand_id); + + const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); + const uint32_t unpB_face_r_dim = get_operand_face_r_dim(unpB_operand_id); + + _llk_unpack_AB_matmul_hw_configure_( + unpack_src_format[unpA_operand_id], + unpack_src_format[unpB_operand_id], + unpack_dst_format[unpA_operand_id], + unpack_dst_format[unpB_operand_id], + unpA_face_r_dim, + unpB_face_r_dim, + transpose_xy_srca, + unpA_num_faces, + unpB_num_faces, + cb_interface[unpA_operand_id].fifo_page_size, + cb_interface[unpB_operand_id].fifo_page_size); +} + +template +inline void llk_unpack_AB_matmul_hw_configure_disaggregated( + const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) { + const llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = { + .unpA_operand = unpA_operand, .unpB_operand = unpB_operand, .transpose_xy_srca = transpose_xy_srca}; + llk_unpack_AB_matmul_hw_configure(&unpack_AB_matmul_params); +} + +inline void llk_unpack_AB_matmul_mop_config( + const bool transpose, + const std::uint32_t ct_dim, + const std::uint32_t rt_dim, + const 
std::uint32_t kt_dim, + const bool partial_face) { + // in0 - loaded to SrcB + // in1 - loaded to SrcA + _llk_unpack_AB_matmul_mop_config_(transpose, ct_dim, rt_dim, kt_dim, partial_face); +} + +__attribute__((always_inline)) inline void llk_unpack_AB_matmul_init( + const std::uint32_t operandA, + const std::uint32_t operandB, + const std::uint32_t transpose = 0, + const std::uint32_t ct_dim = 1, + const std::uint32_t rt_dim = 1, + const std::uint32_t kt_dim = 1) { + // In0 -> srcB (supports partial face) + // In1 -> srcA + const uint32_t operandA_id = get_operand_id(operandB); + const uint32_t operandB_id = get_operand_id(operandA); + + const uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandA_id); + const uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandB_id); + + const bool reuse_a = ct_dim >= rt_dim; + const bool partial_face = get_operand_partial_face(operandB_id); + + const uint32_t unpA_num_faces = get_operand_num_faces(operandA_id); + const uint32_t unpB_num_faces = + partial_face ? 
1 : get_operand_num_faces(operandB_id); // if partial face -> unpack face by face + + _llk_unpack_AB_matmul_init_( + transpose, + ct_dim, + rt_dim, + kt_dim, + unpA_face_r_dim, + unpB_face_r_dim, + unpA_num_faces, + unpB_num_faces, + partial_face); +} + +inline void llk_unpack_AB_matmul( + const std::uint32_t operandA, + const std::uint32_t operandB, + const std::uint32_t tile_index_a, + const std::uint32_t tile_index_b, + const std::uint32_t ct_dim = 1, + const std::uint32_t rt_dim = 1, + const std::uint32_t kt_dim = 1) { + // In0/InA -> srcB (supports partial face) + // In1/InB -> srcA + + volatile uint *cfg = get_cfg_pointer(); // get pointer to registers for current state ID + + const std::uint32_t operandA_id = get_operand_id(operandA); + const std::uint32_t operandB_id = get_operand_id(operandB); + const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandB_id); // In1/InB -> srcA + const std::uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandA_id); // In0/InA -> srcB + + const bool partial_face = get_operand_partial_face(operandA_id); + + std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1; + std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1; + + std::uint32_t tile_size_a = cb_interface[operandA_id].fifo_page_size; + std::uint32_t tile_size_b = cb_interface[operandB_id].fifo_page_size; + + _llk_unpack_AB_matmul_( + base_address_a, + base_address_b, + tile_index_a, + tile_index_b, + tile_size_a, + tile_size_b, + unpA_face_r_dim, + unpB_face_r_dim, + partial_face, + ct_dim, + rt_dim, + kt_dim); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h new file mode 100644 index 00000000000..e8918793baa --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_A.h" +#include "llk_unpack_common_api.h" + +/************************************************************************* + * LLK UNPACK A + *************************************************************************/ + +template +inline void llk_unpack_A_hw_configure( + const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) { + const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand); + const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); + const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); + + _llk_unpack_A_hw_configure_( + unpack_src_format[unpA_operand_id], + unpack_dst_format[unpA_operand_id], + unpA_face_r_dim, + within_face_16x16_transpose, + unpA_num_faces); +} + +template +inline void llk_unpack_A_hw_configure_disaggregated( + const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) { + const llk_unpack_A_params_t unpack_A_params = {.unpA_operand = unpA_operand}; + llk_unpack_A_hw_configure(&unpack_A_params, within_face_16x16_transpose); +} + +template < + BroadcastType BType = BroadcastType::NONE, + bool acc_to_dest = false, + EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, + bool unpack_to_dest = false> +inline void llk_unpack_A_mop_config( + const bool transpose_of_faces, + const std::uint32_t operand_id, + const std::uint32_t unpack_src_format = 0, + std::uint32_t unpack_dst_format = 0) { + const std::uint32_t num_faces = get_operand_num_faces(operand_id); + + _llk_unpack_A_mop_config_( + transpose_of_faces > 0, num_faces, unpack_src_format, unpack_dst_format); +} + +template < + BroadcastType BType = BroadcastType::NONE, + bool acc_to_dest = false, + EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, + bool unpack_to_dest = false> +inline void llk_unpack_A_init( + const std::uint32_t 
transpose_of_faces = 0, + const std::uint32_t within_face_16x16_transpose = 0, + const std::uint32_t operand = 0) { + cfg_reg_rmw_tensix(within_face_16x16_transpose); + + const std::uint32_t operand_id = get_operand_id(operand); + const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + const std::uint32_t num_faces = get_operand_num_faces(operand_id); + + _llk_unpack_A_init_( + transpose_of_faces, + within_face_16x16_transpose, + face_r_dim, + num_faces, + unpack_src_format[operand_id], + unpack_dst_format[operand_id]); +} + +template < + BroadcastType BType = BroadcastType::NONE, + bool acc_to_dest = false, + EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, + bool unpack_to_dest = false> +inline void llk_unpack_A( + const std::uint32_t operand, const std::uint32_t tile_index, const bool transpose_of_faces = 0) { + std::uint32_t operand_id = get_operand_id(operand); + std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; + std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; + std::uint32_t address = base_address + offset_address; + + _llk_unpack_A_( + address, transpose_of_faces > 0, unpack_src_format[operand_id], unpack_dst_format[operand_id]); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h new file mode 100644 index 00000000000..6b61452722a --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h @@ -0,0 +1,141 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "circular_buffer.h" +#include "ckernel.h" +#include "ckernel_defs.h" +#include "ckernel_globals.h" +#include "ckernel_template.h" +#include "cunpack_common.h" +#include "llk_defs.h" +#include "llk_io.h" +#include "llk_operands.h" +#include "llk_param_structs.h" +#include "llk_unpack_common.h" + +/************************************************************************* + * LLK UNPACK COMMON + *************************************************************************/ + +inline void llk_zero_operand(std::uint32_t operand) { // inline like its siblings: header-defined, a plain definition breaks ODR once two TUs include this file + std::uint32_t operand_id = get_operand_id(operand); + std::uint32_t fifo_base_addr = (cb_interface[operand_id].fifo_limit + 1) - cb_interface[operand_id].fifo_size; + std::uint32_t size = cb_interface[operand_id].fifo_size; + _llk_zero_buffer_(fifo_base_addr, size); +} + +template +inline void llk_unpack_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) { + std::uint32_t operand_id = get_operand_id(operand); + std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; + std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; + std::uint32_t address = base_address + offset_address; + _llk_unpack_get_tile_(address, p_tile); +} + +template +inline void llk_unpack_release_tile(std::uint32_t operand) { + _llk_unpack_release_tile_(); +} + +inline void llk_unpack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { + _llk_unpack_debug_dump_(data, byte_size); +} + +inline void llk_unpack_debug_dump_seek(std::uint8_t offset) { _llk_unpack_debug_dump_seek_(offset); } + +template +inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_operand) { + const std::uint32_t srca_operand_id = get_operand_id(srca_new_operand); + const std::uint32_t num_faces = 
get_operand_num_faces(srca_operand_id); + const std::uint32_t face_r_dim = get_operand_face_r_dim(srca_operand_id); +
_llk_unpack_reconfig_data_format_srca_impl_( + unpack_src_format[srca_operand_id], + unpack_dst_format[srca_operand_id], + cb_interface[srca_operand_id].fifo_page_size, + face_r_dim, + num_faces); +} + +template +inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) { + std::uint32_t srcb_operand_id = get_operand_id(srcb_new_operand); + const std::uint32_t num_faces = get_operand_num_faces(srcb_operand_id); + const std::uint32_t face_r_dim = get_operand_face_r_dim(srcb_operand_id); + _llk_unpack_reconfig_data_format_srcb_impl_( + unpack_src_format[srcb_operand_id], + unpack_dst_format[srcb_operand_id], + cb_interface[srcb_operand_id].fifo_page_size, + face_r_dim, + num_faces); +} + +template +inline void llk_unpack_reconfig_data_format_srca( + const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) { + std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); + std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); + + if ((unpack_src_format[old_srca_operand_id] != unpack_src_format[new_srca_operand_id])) { + llk_unpack_reconfig_data_format_srca(srca_new_operand); + } else if constexpr (is_tile_dim_reconfig_en) { + llk_unpack_reconfig_data_format_srca(srca_new_operand); + } +} + +template +inline void llk_unpack_reconfig_data_format_srcb( + const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { + std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); + std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + + if ((unpack_src_format[old_srcb_operand_id] != unpack_src_format[new_srcb_operand_id])) { + llk_unpack_reconfig_data_format_srcb(srcb_new_operand); + } else if constexpr (is_tile_dim_reconfig_en) { + llk_unpack_reconfig_data_format_srcb(srcb_new_operand); + } +} + +template +inline void llk_unpack_reconfig_data_format( + const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) { + 
llk_unpack_reconfig_data_format_srca(srca_new_operand); + llk_unpack_reconfig_data_format_srcb(srcb_new_operand); +} + +template +inline void llk_unpack_reconfig_data_format( + const std::uint32_t srca_old_operand, + const std::uint32_t srca_new_operand, + const std::uint32_t srcb_old_operand, + const std::uint32_t srcb_new_operand) { + llk_unpack_reconfig_data_format_srca(srca_old_operand, srca_new_operand); + llk_unpack_reconfig_data_format_srcb(srcb_old_operand, srcb_new_operand); +} + +inline void llk_unpack_dbg_feature_disable() { _llk_unpack_dbg_feature_disable_(); } + +inline void llk_enable_int8_fpu_math() { _llk_enable_int8_fpu_math_(); } + +// All TILE_SIZE related functions were deprecated in BBE for WH. The following is needed for pack_shifted so just +// keeping here. +// FIXME: Need to review and adjust accordingly +constexpr static std::int32_t MUL_HEADERLESS_TILE_SIZE_AND_INDEX(uint format, uint index) { + switch (format & 0x1F) { + case ((uint8_t)DataFormat::Float32): return ((index << 8)); + case ((uint8_t)DataFormat::Float16): + case ((uint8_t)DataFormat::Float16_b): return ((index << 7)); + case ((uint8_t)DataFormat::Bfp8): + case ((uint8_t)DataFormat::Bfp8_b): return ((index << 6) + (index << 2)); + case ((uint8_t)DataFormat::Bfp4): + case ((uint8_t)DataFormat::Bfp4_b): return ((index << 5) + (index << 2)); + case ((uint8_t)DataFormat::Bfp2): + case ((uint8_t)DataFormat::Bfp2_b): return ((index << 4) + (index << 2)); + case ((uint8_t)DataFormat::Int8): + case ((uint8_t)DataFormat::Lf8): return ((index << 6)); + // Keep default as Bfp8? 
+ default: return ((index << 6) + (index << 2)); + }; +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h new file mode 100644 index 00000000000..afa60f7947b --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h @@ -0,0 +1,94 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_reduce.h" +#include "llk_unpack_common_api.h" + +/************************************************************************* +* LLK UNPACK REDUCE +*************************************************************************/ + +template +inline void llk_unpack_reduce_hw_configure( + const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) { + + constexpr bool within_face_16x16_transpose = (ReduceDim::REDUCE_ROW == dim); + + const std::uint32_t unpA_operand_id = get_operand_id(unpack_reduce_params->unpA_operand); + const std::uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); + const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); + + constexpr std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32; + const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a + ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? 
(std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16); + + _llk_unpack_reduce_hw_configure_( + unpack_src_format[unpA_operand_id], + unpB_src_format, + unpack_dst_format[unpA_operand_id], + unpB_dst_format, + unpA_face_r_dim, + unpA_face_r_dim, + within_face_16x16_transpose, + unpA_num_faces, + unpA_num_faces + ); + + if constexpr (type != PoolType::MAX) { + union { + float f; + uint32_t u; + } f2u = {.f = const_mult}; + + for (uint i = 0; i < 16; i++) l1_buffer[i] = f2u.u; // Load const into L1 buffer + } +} + +template +inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) { + const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand}; + llk_unpack_reduce_hw_configure(&unpack_reduce_params, mult); +} + +template +inline void llk_unpack_reduce_mop_config() { + _llk_unpack_reduce_mop_config_(); +} + +template +inline void llk_unpack_reduce_init(const std::uint32_t within_face_16x16_transpose=0) { + + constexpr std::uint32_t unpA_operand_id = 0; + + const std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32; + const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a + ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? 
(std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16); + + cfg_reg_rmw_tensix(unpB_dst_format); + + cfg_reg_rmw_tensix(unpB_src_format); + cfg_reg_rmw_tensix(unpB_dst_format); + + TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_address_ADDR32); + TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_cntx1_address_ADDR32); + TTI_NOP; TTI_NOP; + + _llk_unpack_reduce_init_( + within_face_16x16_transpose + ); +} + +template +inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t tile_index) { + + std::uint32_t operand_id = get_operand_id(operand); + std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; + std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; + std::uint32_t address = base_address + offset_address; + + _llk_unpack_reduce_( + address + ); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h new file mode 100644 index 00000000000..0f0a1b69ab3 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_tilize.h" +#include "llk_unpack_common_api.h" + +/************************************************************************* +* LLK UNPACK TILIZE +*************************************************************************/ + +template +inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) { + + constexpr bool within_face_16x16_transpose = false; + constexpr StochRndMode stoch_rnd_mode = StochRndMode::None; + + const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand); + const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); + const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); + + _llk_unpack_tilize_hw_configure_( + unpack_src_format[unpA_operand_id], + unpack_dst_format[unpA_operand_id], + unpA_face_r_dim, + within_face_16x16_transpose, + unpA_num_faces + ); +} + + +template +inline void llk_unpack_tilize_hw_configure_disaggregated( + const std::uint32_t unpA_operand) { + const llk_unpack_A_params_t unpack_tilize_params = { + .unpA_operand = unpA_operand + }; + llk_unpack_tilize_hw_configure(&unpack_tilize_params); +} + +inline void llk_unpack_tilize_mop_config(const std::uint32_t operand) { + std::uint32_t operand_id = get_operand_id(operand); + const bool narrow_tile = get_operand_narrow_tile(operand_id); + _llk_unpack_tilize_mop_config_(narrow_tile); +} + +inline void llk_unpack_tilize_init(const std::uint32_t operand = 0, const std::uint32_t ct_dim = 0) { + cfg_reg_rmw_tensix(0); + + const std::uint32_t operand_id = get_operand_id(operand); + const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + const bool narrow_tile = get_operand_narrow_tile(operand_id); + + // Save state of unpacker config for quick restore + TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0, THCON_SEC0_REG2_Out_data_format_ADDR32); // Save unpack config[0] + 
TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context + + _llk_unpack_tilize_init_( + unpack_src_format[operand_id], + unpack_dst_format[operand_id], + ct_dim, + face_r_dim, + narrow_tile + ); + +} + +inline void llk_unpack_tilize_uninit(const std::uint32_t face_r_dim = FACE_R_DIM) { + TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0); + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0); // Restore unpack config[0] + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1); // Restore tile x dim per context +} + +inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t block_ct_dim) { + + std::uint32_t operand_id = get_operand_id(operand); + const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + const std::uint32_t num_faces = get_operand_num_faces(operand_id); + const bool narrow_tile = get_operand_narrow_tile(operand_id); + + std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; // Remove header size added by descriptor + + _llk_unpack_tilize_( + base_address, + tile_index, + unpack_src_format[operand_id], + block_ct_dim, + face_r_dim, + num_faces, + narrow_tile + ); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h new file mode 100644 index 00000000000..5a135ad8903 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h @@ -0,0 +1,96 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_untilize.h" +#include "llk_unpack_common_api.h" + +/************************************************************************* +* LLK UNPACK UNTILIZE +*************************************************************************/ +template +inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) { + constexpr bool is_row_pool = false; + constexpr bool within_face_16x16_transpose = false; + constexpr StochRndMode stoch_rnd_mode = StochRndMode::None; + + const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand); + const uint32_t unpA_num_faces = 4; + const uint32_t unpA_face_r_dim = FACE_R_DIM; + + _llk_unpack_untilize_hw_configure_( + unpack_src_format[unpA_operand_id], + unpack_dst_format[unpA_operand_id], + unpA_face_r_dim, + within_face_16x16_transpose, + unpA_num_faces + ); +} + +inline void llk_unpack_untilize_hw_configure_disaggregated(const std::uint32_t unpA_operand) { + const llk_unpack_A_params_t unpack_untilize_params = { + .unpA_operand = unpA_operand, + }; + llk_unpack_untilize_hw_configure(&unpack_untilize_params); +} + +inline void llk_unpack_untilize_mop_config() { + _llk_unpack_untilize_mop_config_(); +} + +inline void llk_unpack_untilize_init(std::uint32_t operand = 0) { + const std::uint32_t operand_id = get_operand_id(operand); + const std::uint32_t face_r_dim = 1; + const std::uint32_t num_faces = get_operand_num_faces(operand_id); + + // Save state of unpacker config for quick restore + TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_0, UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32); // Save unpack stride config + TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context + TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_2, THCON_SEC0_REG0_TileDescriptor_ADDR32+1); // Save descriptor 1 + + _llk_unpack_untilize_init_( + 
unpack_dst_format[operand_id], + cb_interface[operand_id].fifo_page_size, + face_r_dim, + num_faces + ); +} + +inline void llk_unpack_untilize_uninit(const std::uint32_t operand, const std::uint32_t face_r_dim = FACE_R_DIM) { + std::uint32_t operand_id = get_operand_id(operand); + std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1; + std::uint32_t unpA_ch1_y_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride; + + // Check that unpacker is done (all contexts freed up) before starting hw configuration + wait_for_idle(); + + // Reset address counters + unpacker_addr_counter_init(); + + // Wait for cfg to be free to edit + TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK); + + // Reset the values to default in unpack AB common. + TT_SETADCXX(p_setadc::UNP_A, FACE_R_DIM*FACE_C_DIM-1, 0x0); + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16); + cfg_reg_rmw_tensix(1); + cfg_reg_rmw_tensix(unpA_ch1_y_stride); + TTI_NOP; TTI_NOP; // Do we need this for WH? 
+} + +template +inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_tile_cols) { + const std::uint32_t operand_id = get_operand_id(operand); + const std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; + + _llk_unpack_untilize_pass_( + base_address, + block_tile_cols + ); +} + +inline void llk_unpack_untilize(std::uint32_t operand, std::uint32_t block_c_tiles) { + llk_unpack_untilize_pass(operand, block_c_tiles); + llk_unpack_untilize_pass(operand, block_c_tiles); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.cc b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.cc new file mode 100644 index 00000000000..b3f31c2c095 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.cc @@ -0,0 +1,3 @@ +#include "llk_io.h" + +CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] = {0}; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h new file mode 100644 index 00000000000..7d3e365a730 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h @@ -0,0 +1,6 @@ +#pragma once +#include + +#include "circular_buffer.h" + +extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS]; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_pack.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_pack.h similarity index 98% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_pack.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_pack.h index 7341143fbdb..29dc128e053 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_pack.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_pack.h @@ -14,8 +14,8 @@ using namespace ckernel; -inline void llk_setup_cb_interface() { - +// "llk_setup_outputs" is the old function name that HLKC emits +inline void llk_setup_outputs() { volatile tt_l1_ptr std::uint32_t* circular_buffer_config_addr = (volatile 
uint32_t*)(CIRCULAR_BUFFER_CONFIG_BASE); for (std::uint32_t cb_id = 0; cb_id < NUM_CIRCULAR_BUFFERS; cb_id++) { @@ -40,11 +40,6 @@ inline void llk_setup_cb_interface() { } } -// "llk_setup_outputs" is the old function name that HLKC emits -inline void llk_setup_outputs() { - llk_setup_cb_interface(); -} - // Blocking call to wait for free space needed to pack N tiles template inline void llk_wait_for_free_tiles(const std::int32_t operand, const std::int32_t num_tiles) { diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_unpack.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_unpack.h similarity index 97% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_unpack.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_unpack.h index 0cafd67dfa2..e9a882ce5da 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_io_unpack.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io_unpack.h @@ -7,14 +7,15 @@ #include "ckernel_globals.h" #include "ckernel.h" #include "stream_interface.h" +#include "stream_io_map.h" #include "hostdevcommon/common_runtime_address_map.h" -#include "llk_unpack_common.h" +#include "llk_unpack_common_api.h" using namespace ckernel; -inline void llk_setup_cb_interface() { - +// "llk_setup_operands" is the old function name that HLKC emits +inline void llk_setup_operands() { volatile tt_l1_ptr std::uint32_t* circular_buffer_config_addr = (volatile uint32_t*)(CIRCULAR_BUFFER_CONFIG_BASE); for (uint32_t cb_id = 0; cb_id < NUM_CIRCULAR_BUFFERS; cb_id++) { @@ -35,11 +36,6 @@ inline void llk_setup_cb_interface() { } } -// "llk_setup_operands" is the old function name that HLKC emits -inline void llk_setup_operands() { - llk_setup_cb_interface(); -} - // Wait for N tiles available in the incoming stream inline void llk_wait_tiles(int operand, std::int32_t num_tiles) { diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h new file 
mode 100644 index 00000000000..c6d1b438f42 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h @@ -0,0 +1,46 @@ + +#pragma once +#include +#include + +inline uint32_t get_operand_id(uint32_t operand) +{ + const int INTERMEDIATE_BASE_ID = 24; + const int OPERAND_BASE_ID = 0; + return (operand>=INTERMEDIATE_BASE_ID) ? operand - 8 : operand - OPERAND_BASE_ID; +} + +inline const uint32_t get_operand_src_format(const std::uint32_t operand_id) +{ + return unpack_src_format[operand_id]; +} + +inline const uint32_t get_operand_dst_format(const std::uint32_t operand_id) +{ + return unpack_dst_format[operand_id]; // dst table, not src: the two differ whenever the unpacker converts formats +} + +inline const uint32_t get_operand_num_faces(const std::uint32_t operand_id) +{ + return 4; +} + +inline const uint32_t get_operand_partial_face(const std::uint32_t operand_id) +{ + return 0; +} + +inline const uint32_t get_operand_face_r_dim(const std::uint32_t operand_id) +{ + return 16; +} + +inline const uint32_t get_operand_narrow_tile(const std::uint32_t operand_id) +{ + return 0; +} + +inline const std::vector get_operand_tile_dims(const std::uint32_t operand_id) +{ + return {32, 32}; +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h new file mode 100644 index 00000000000..596255257d0 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include + +// Metal specific overrides -- No support for partial tiles so hard-code to fixed 32x32 sizes +inline uint32_t get_output_id(uint32_t output) +{ + const uint32_t OUTPUT_BASE = 0; + return ((output) - OUTPUT_BASE); +} + +inline const uint32_t get_output_base_id() +{ + const uint32_t OUTPUT_BASE_ID = 16; + return (OUTPUT_BASE_ID); +} + +inline const uint32_t get_output_src_format(const std::uint32_t output_id) +{ + return pack_src_format[output_id]; +} + +inline const uint32_t get_output_dst_format(const std::uint32_t output_id) +{ + return pack_dst_format[output_id]; // dst table, not src (was a copy-paste of get_output_src_format); NOTE(review): confirm pack_dst_format is the generated array name +} + +inline const uint32_t get_output_num_faces(const std::uint32_t output_id) +{ + return 4; +} + +inline const uint32_t get_output_partial_face(const std::uint32_t output_id) +{ + return 0; +} + +inline const uint32_t get_output_face_r_dim(const std::uint32_t output_id) +{ + return 16; +} + +inline const uint32_t get_output_narrow_tile(const std::uint32_t output_id) +{ + return 0; +} + +inline const std::vector get_output_tile_dims(const std::uint32_t operand_id) +{ + return {32, 32}; +} diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 6702c298990..465c7b74d44 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -17,7 +17,7 @@ #include "c_tensix_core.h" #include "tdma_xmov.h" #include "noc_nonblocking_api.h" -#include "ckernel_globals.h" +#include "metal_ckernel_globals.h" #include "tools/profiler/kernel_profiler.hpp" #include "dev_msgs.h" #include "risc_attribs.h" diff --git a/tt_metal/hw/firmware/src/brisck.cc b/tt_metal/hw/firmware/src/brisck.cc index f00d6233b2b..06567a58a7d 100644 --- a/tt_metal/hw/firmware/src/brisck.cc +++ b/tt_metal/hw/firmware/src/brisck.cc @@ -15,7 +15,7 @@ #include "c_tensix_core.h" #include 
"tdma_xmov.h" #include "noc_nonblocking_api.h" -#include "ckernel_globals.h" +#include "metal_ckernel_globals.h" #include 
"tools/profiler/kernel_profiler.hpp" #include "dataflow_api.h" #include "noc_addr_ranges_gen.h" diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index ace48e9b4c6..6a96aa0fbb0 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -7,7 +7,7 @@ #include "noc_nonblocking_api.h" #include "dev_msgs.h" #include "stream_io_map.h" -#include "ckernel_globals.h" +#include "metal_ckernel_globals.h" #include "tools/profiler/kernel_profiler.hpp" #include "risc_attribs.h" #include "generated_bank_to_noc_coord_mapping.h" diff --git a/tt_metal/hw/firmware/src/ncrisck.cc b/tt_metal/hw/firmware/src/ncrisck.cc index 3aff6217abd..7a6d037733c 100644 --- a/tt_metal/hw/firmware/src/ncrisck.cc +++ b/tt_metal/hw/firmware/src/ncrisck.cc @@ -9,7 +9,7 @@ #ifdef PERF_DUMP #include "risc_perf.h" #endif -#include "ckernel_globals.h" +#include "metal_ckernel_globals.h" #include "tools/profiler/kernel_profiler.hpp" #include "dataflow_api.h" #include "tensix_functions.h" diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc index 0267c005839..f1e0aad4b6f 100644 --- a/tt_metal/hw/firmware/src/trisc.cc +++ b/tt_metal/hw/firmware/src/trisc.cc @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "ckernel.h" -#include "ckernel_globals.h" +#include "metal_ckernel_globals.h" #include "risc_common.h" #include #include "dev_msgs.h" @@ -12,6 +12,7 @@ #include "debug/fw_debug.h" #include "debug/status.h" +#include "circular_buffer.h" namespace kernel_profiler { uint32_t wIndex __attribute__((used)); @@ -22,10 +23,10 @@ namespace ckernel enum class ttRiscCores : std::uint32_t { Unpack = 0, Math = 1, Pack = 2, Brisc = 3, Nrisc = 4}; -volatile tt_reg_ptr uint * const reg_base = reinterpret_cast(0xFFB10000); -volatile tt_reg_ptr uint * const pc_buf_base = reinterpret_cast(PC_BUF_BASE); -volatile tt_reg_ptr uint * const regfile = reinterpret_cast(REGFILE_BASE); -volatile tt_reg_ptr uint * const 
instrn_buffer = reinterpret_cast(INSTRN_BUF_BASE); +volatile tt_reg_ptr uint * reg_base = reinterpret_cast(0xFFB10000); +volatile tt_reg_ptr uint * pc_buf_base = reinterpret_cast(PC_BUF_BASE); +volatile tt_reg_ptr uint * regfile = reinterpret_cast(REGFILE_BASE); +volatile tt_reg_ptr uint * instrn_buffer = reinterpret_cast(INSTRN_BUF_BASE); tt_reg_ptr uint *regmem = reinterpret_cast(REGFILE_BASE); uint32_t cfg_state_id __attribute__((used)) = 0; // Flip between 0 and 1 to keep state between kernel calls diff --git a/tt_metal/hw/firmware/src/trisck.cc b/tt_metal/hw/firmware/src/trisck.cc index 1369a6d1bca..174fe265300 100644 --- a/tt_metal/hw/firmware/src/trisck.cc +++ b/tt_metal/hw/firmware/src/trisck.cc @@ -9,7 +9,7 @@ // Need to make sure no other file includes these lists since it also include global parameter definitions // 2) instantiate global variables -#include "ckernel_globals.h" +#include "metal_ckernel_globals.h" #include "chlkc_list.h" @@ -23,9 +23,9 @@ uint32_t gl_alu_format_spec_reg = 0; namespace ckernel { -volatile tt_reg_ptr uint * const regfile = reinterpret_cast(REGFILE_BASE); -volatile tt_reg_ptr uint * const instrn_buffer = reinterpret_cast(INSTRN_BUF_BASE); -volatile tt_reg_ptr uint * const pc_buf_base = reinterpret_cast(PC_BUF_BASE); +volatile tt_reg_ptr uint * regfile = reinterpret_cast(REGFILE_BASE); +volatile tt_reg_ptr uint * instrn_buffer = reinterpret_cast(INSTRN_BUF_BASE); +volatile tt_reg_ptr uint * pc_buf_base = reinterpret_cast(PC_BUF_BASE); } void kernel_launch() diff --git a/tt_metal/hw/inc/debug/fw_debug.h b/tt_metal/hw/inc/debug/fw_debug.h index 577743d5cc8..4232dcd6ddb 100644 --- a/tt_metal/hw/inc/debug/fw_debug.h +++ b/tt_metal/hw/inc/debug/fw_debug.h @@ -3,3 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #define FWASSERT(s, p) +#define FWLOG0(...) (void)sizeof(__VA_ARGS__) +#define FWLOG1(...) (void)sizeof(__VA_ARGS__) +#define FWLOG2(...) 
(void)sizeof(__VA_ARGS__) diff --git a/tt_metal/include/compute_kernel_api.h b/tt_metal/include/compute_kernel_api.h index 1a79892db6e..a1abc4c2d4f 100644 --- a/tt_metal/include/compute_kernel_api.h +++ b/tt_metal/include/compute_kernel_api.h @@ -10,18 +10,19 @@ #include "ckernel_include.h" #include "hostdevcommon/kernel_structs.h" #include "risc_attribs.h" +#include "llk_op_info_api.h" #define SYNC SyncHalf #define ALWI inline __attribute__((always_inline)) #ifdef TRISC_MATH -#include "llk_math_common.h" -#include "llk_math_matmul.h" -#include "llk_math_eltwise_unary_datacopy.h" -#include "llk_math_eltwise_binary.h" -#include "llk_math_eltwise_unary_sfpu.h" -#include "llk_math_reduce.h" +#include "llk_math_common_api.h" +#include "llk_math_matmul_api.h" +#include "llk_math_unary_datacopy_api.h" +#include "llk_math_binary_api.h" +#include "llk_math_unary_sfpu_api.h" +#include "llk_math_reduce_api.h" #define MATH(x) x #define MAIN math_main() #else @@ -38,13 +39,13 @@ #endif #ifdef TRISC_UNPACK -#include "llk_unpack_common.h" -#include "llk_unpack_AB_matmul.h" -#include "llk_unpack_A.h" -#include "llk_unpack_AB.h" -#include "llk_unpack_reduce.h" -#include "llk_unpack_tilize.h" -#include "llk_unpack_untilize.h" +#include "llk_unpack_common_api.h" +#include "llk_unpack_AB_matmul_api.h" +#include "llk_unpack_A_api.h" +#include "llk_unpack_AB_api.h" +#include "llk_unpack_reduce_api.h" +#include "llk_unpack_tilize_api.h" +#include "llk_unpack_untilize_api.h" #define UNPACK(x) x #define MAIN unpack_main() #else diff --git a/tt_metal/include/compute_kernel_api/bcast.h b/tt_metal/include/compute_kernel_api/bcast.h index 110cbd2db1d..3e5c8dc8c54 100644 --- a/tt_metal/include/compute_kernel_api/bcast.h +++ b/tt_metal/include/compute_kernel_api/bcast.h @@ -7,13 +7,13 @@ #include "compute_kernel_api/common.h" #ifdef TRISC_MATH -#include "llk_math_eltwise_binary.h" -#include "llk_math_matmul.h" +#include "llk_math_binary_api.h" +#include "llk_math_matmul_api.h" #include 
"llk_math_common.h" #endif #ifdef TRISC_UNPACK -#include "llk_unpack_AB.h" -#include "llk_unpack_A.h" +#include "llk_unpack_AB_api.h" +#include "llk_unpack_A_api.h" #endif #ifdef TRISC_PACK #include "llk_pack.h" diff --git a/tt_metal/include/compute_kernel_api/cb_api.h b/tt_metal/include/compute_kernel_api/cb_api.h index a9ca5d2d97c..dbec2593108 100644 --- a/tt_metal/include/compute_kernel_api/cb_api.h +++ b/tt_metal/include/compute_kernel_api/cb_api.h @@ -6,6 +6,14 @@ #include "compute_kernel_api/common_globals.h" +#ifdef TRISC_PACK +#include "llk_io_pack.h" +#endif +#ifdef TRISC_UNPACK +#include "llk_io_unpack.h" +#endif + + namespace ckernel { /** diff --git a/tt_metal/include/compute_kernel_api/common_globals.h b/tt_metal/include/compute_kernel_api/common_globals.h index 8393566a239..213859b1ae4 100644 --- a/tt_metal/include/compute_kernel_api/common_globals.h +++ b/tt_metal/include/compute_kernel_api/common_globals.h @@ -10,12 +10,12 @@ #include "chlkc_list.h" #include "ckernel.h" -#include "ckernel_globals.h" +#include "metal_ckernel_globals.h" #include "ckernel_include.h" #include "hostdevcommon/kernel_structs.h" #ifdef TRISC_MATH -#include "llk_math_common.h" +#include "llk_math_common_api.h" #define MATH(x) x #define MAIN math_main() #else @@ -23,8 +23,7 @@ #endif #ifdef TRISC_PACK -#include "llk_pack_common.h" -#include "llk_pack.h" +#include "llk_pack_api.h" #define PACK(x) x #define MAIN pack_main() #else @@ -32,7 +31,6 @@ #endif #ifdef TRISC_UNPACK -#include "llk_unpack_common.h" #define UNPACK(x) x #define MAIN unpack_main() #else diff --git a/tt_metal/include/compute_kernel_api/eltwise_binary.h b/tt_metal/include/compute_kernel_api/eltwise_binary.h index 3dbd756686c..dc54de90a9f 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_binary.h +++ b/tt_metal/include/compute_kernel_api/eltwise_binary.h @@ -7,10 +7,10 @@ #include "compute_kernel_api/common.h" #ifdef TRISC_MATH -#include "llk_math_eltwise_binary.h" +#include "llk_math_binary_api.h" 
#endif #ifdef TRISC_UNPACK -#include "llk_unpack_AB.h" +#include "llk_unpack_AB_api.h" #endif diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h b/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h index 418bac60302..0b926fdb253 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h @@ -7,10 +7,10 @@ #include "compute_kernel_api/common.h" #ifdef TRISC_MATH -#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_unary_datacopy_api.h" #endif #ifdef TRISC_UNPACK -#include "llk_unpack_AB.h" +#include "llk_unpack_AB_api.h" #endif diff --git a/tt_metal/include/compute_kernel_api/matmul.h b/tt_metal/include/compute_kernel_api/matmul.h index 9e3ebf6ac19..8a22fe02e4c 100644 --- a/tt_metal/include/compute_kernel_api/matmul.h +++ b/tt_metal/include/compute_kernel_api/matmul.h @@ -7,10 +7,10 @@ #include "compute_kernel_api/common.h" #ifdef TRISC_MATH -#include "llk_math_matmul.h" +#include "llk_math_matmul_api.h" #endif #ifdef TRISC_UNPACK -#include "llk_unpack_AB_matmul.h" +#include "llk_unpack_AB_matmul_api.h" #endif namespace ckernel { @@ -146,12 +146,21 @@ ALWI void mm_block_init(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t #endif MATH(( llk_math_pack_sync_init() )); + #ifdef ARCH_GRAYSKULL PACK(( llk_pack_init() )); PACK(( llk_pack_hw_configure_disaggregated(out_cb_id) )); PACK(( llk_setup_outputs() )); PACK(( llk_pack_dest_init() )); // TODO(AP): ZM-only kernel PACK(( llk_init_packer_dest_offset_registers() )); + #else + PACK(( llk_pack_init() )); + PACK(( llk_pack_hw_configure_disaggregated(out_cb_id) )); + PACK(( llk_setup_outputs() )); + PACK(( llk_pack_dest_init() )); + // TODO(AP): ZM-only kernel + PACK(( llk_init_packer_dest_offset_registers() )); + #endif } /** diff --git a/tt_metal/include/compute_kernel_api/reduce.h b/tt_metal/include/compute_kernel_api/reduce.h index 82a74919d10..b00b317eda2 100644 --- 
a/tt_metal/include/compute_kernel_api/reduce.h +++ b/tt_metal/include/compute_kernel_api/reduce.h @@ -7,13 +7,13 @@ #include "compute_kernel_api/common.h" #ifdef TRISC_MATH -#include "llk_math_reduce.h" +#include "llk_math_reduce_api.h" #endif #ifdef TRISC_UNPACK -#include "llk_unpack_AB.h" -#include "llk_unpack_reduce.h" +#include "llk_unpack_AB_api.h" +#include "llk_unpack_reduce_api.h" #endif diff --git a/tt_metal/include/compute_kernel_api/tile_move_copy.h b/tt_metal/include/compute_kernel_api/tile_move_copy.h index ac04b9d4f8a..80a056ef038 100644 --- a/tt_metal/include/compute_kernel_api/tile_move_copy.h +++ b/tt_metal/include/compute_kernel_api/tile_move_copy.h @@ -7,11 +7,11 @@ #include "compute_kernel_api/common_globals.h" #ifdef TRISC_MATH -#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_unary_datacopy_api.h" #endif #ifdef TRISC_UNPACK -#include "llk_unpack_A.h" +#include "llk_unpack_A_api.h" #endif namespace ckernel { diff --git a/tt_metal/include/compute_kernel_api/tilize.h b/tt_metal/include/compute_kernel_api/tilize.h index 422069f8ce0..58c1a7fc1d5 100644 --- a/tt_metal/include/compute_kernel_api/tilize.h +++ b/tt_metal/include/compute_kernel_api/tilize.h @@ -7,10 +7,10 @@ #include "compute_kernel_api/common.h" #ifdef TRISC_MATH -#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_unary_datacopy_api.h" #endif #ifdef TRISC_UNPACK -#include "llk_unpack_tilize.h" +#include "llk_unpack_tilize_api.h" #endif #include "debug/dprint.h" diff --git a/tt_metal/include/compute_kernel_api/transpose_wh.h b/tt_metal/include/compute_kernel_api/transpose_wh.h index 558e78676cd..167185e0947 100644 --- a/tt_metal/include/compute_kernel_api/transpose_wh.h +++ b/tt_metal/include/compute_kernel_api/transpose_wh.h @@ -6,10 +6,10 @@ #include "compute_kernel_api/common.h" #ifdef TRISC_MATH -#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_unary_datacopy_api.h" #endif #ifdef TRISC_UNPACK -#include "llk_unpack_A.h" +#include 
"llk_unpack_A_api.h" #endif diff --git a/tt_metal/include/compute_kernel_api/untilize.h b/tt_metal/include/compute_kernel_api/untilize.h index 1b770be282f..7f52753ee3c 100644 --- a/tt_metal/include/compute_kernel_api/untilize.h +++ b/tt_metal/include/compute_kernel_api/untilize.h @@ -7,10 +7,10 @@ #include "compute_kernel_api/common.h" #ifdef TRISC_MATH -#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_unary_datacopy_api.h" #endif #ifdef TRISC_UNPACK -#include "llk_unpack_untilize.h" +#include "llk_unpack_untilize_api.h" #endif namespace ckernel { From 9da02bdf50af593ba6dc3c2d8748f5360a46c4f2 Mon Sep 17 00:00:00 2001 From: acejkov Date: Wed, 29 Nov 2023 17:59:59 +0000 Subject: [PATCH 02/16] #3908: Fix linker code size overflow error for matmul --- .../wormhole_b0/metal/llk_api/llk_math_matmul_api.h | 9 ++++----- .../hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h | 9 +++++++-- .../hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h | 9 +++++++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h index 8f7ea1f5713..ff64fb27b2d 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h @@ -23,11 +23,10 @@ inline void llk_math_matmul_init( const bool partial_face = get_operand_partial_face(in0_id); - const auto unpack_tile_dims = get_operand_tile_dims(in0_id); - const std::uint32_t in0_tile_r_dim = unpack_tile_dims[ckernel::TileDim::R_IDX]; - const std::uint32_t in0_tile_c_dim = unpack_tile_dims[ckernel::TileDim::C_IDX]; - const std::uint32_t in1_tile_r_dim = unpack_tile_dims[ckernel::TileDim::R_IDX]; - const std::uint32_t in1_tile_c_dim = unpack_tile_dims[ckernel::TileDim::C_IDX]; + const std::uint32_t in0_tile_r_dim = get_operand_tile_r_dim(in0_id); + const std::uint32_t in0_tile_c_dim = 
get_operand_tile_c_dim(in0_id); + const std::uint32_t in1_tile_r_dim = get_operand_tile_r_dim(in1_id); + const std::uint32_t in1_tile_c_dim = get_operand_tile_c_dim(in1_id); #ifdef ARCH_GRAYSKULL _llk_math_matmul_init_( diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h index c6d1b438f42..9c71ef63b52 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h @@ -40,7 +40,12 @@ inline const uint32_t get_operand_narrow_tile(const std::uint32_t operand_id) return 0; } -inline const std::vector get_operand_tile_dims(const std::uint32_t operand_id) +inline const uint32_t get_operand_tile_r_dim(const std::uint32_t operand_id) { - return {32, 32}; + return 32; +} + +inline const uint32_t get_operand_tile_c_dim(const std::uint32_t operand_id) +{ + return 32; } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h index 596255257d0..cba5398b604 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h @@ -49,7 +49,12 @@ inline const uint32_t get_output_narrow_tile(const std::uint32_t output_id) return 0; } -inline const std::vector get_output_tile_dims(const std::uint32_t operand_id) +inline const uint32_t get_output_tile_r_dim(const std::uint32_t output_id) { - return {32, 32}; + return 32; +} + +inline const uint32_t get_output_tile_c_dim(const std::uint32_t output_id) +{ + return 32; } From 85865ae299a4cc60b671287da4c941d81a2ca058 Mon Sep 17 00:00:00 2001 From: Reem Tawfik Date: Wed, 29 Nov 2023 21:48:17 +0000 Subject: [PATCH 03/16] #3908: Fixes for regressions/pipelines: - Fixed packer tile header write - Added missing SPDX licenses - Added missing includes/funcs - Make mod_div_lib common, remove conflicting functions ---
.../compute/matmul_large_block.cpp | 2 +- .../kernels/compute/bmm_tilize_untilize.cpp | 1 + ...ts_in_l1_single_output_block_width_dim.cpp | 1 + .../conv_bmm_tilize_col_major_out_blocks.cpp | 1 + .../wormhole_b0/common/inc/ckernel_defs.h | 1 + .../wormhole_b0/common/inc/ckernel_perf_api.h | 4 + .../common/inc/ckernel_perf_include.h | 4 + .../common/inc/ckernel_perf_math.h | 4 + .../common/inc/ckernel_perf_unpack_pack.h | 4 + .../metal/common/metal_ckernel_globals.h | 1 + .../wormhole_b0/metal/llk_api/llk_pack_api.h | 16 ++-- .../metal/llk_api/llk_unpack_tilize_api.h | 6 ++ .../wormhole_b0/metal/llk_io/llk_io.h | 4 + .../wormhole_b0/metal/llk_io/llk_operands.h | 3 + .../metal_mod_div_lib.h => inc/mod_div_lib.h} | 0 tt_metal/hw/inc/risc_common.h | 88 +------------------ tt_metal/include/compute_kernel_api/tilize.h | 4 - 17 files changed, 46 insertions(+), 98 deletions(-) rename tt_metal/hw/{ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h => inc/mod_div_lib.h} (100%) diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp index 651955d6944..e0336993506 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp @@ -9,7 +9,7 @@ #include "compute_kernel_api/tile_move_copy.h" #include "compute_kernel_api/matmul.h" - +#include "mod_div_lib.h" inline void tilize_activation(uint32_t in0_cb, uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks, uint32_t out_cb) { diff --git a/tt_eager/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp b/tt_eager/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp index 9b969a8cb9d..c419a0f2e0f 100644 --- a/tt_eager/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp +++ b/tt_eager/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp @@ -4,6 +4,7 @@ #include +#include "mod_div_lib.h" #include "compute_kernel_api/tilize.h" #include 
"compute_kernel_api/untilize.h" #include "compute_kernel_api/tile_move_copy.h" diff --git a/tt_eager/tt_dnn/op_library/conv/kernels/bmm_tilize_untilize_all_weights_in_l1_single_output_block_width_dim.cpp b/tt_eager/tt_dnn/op_library/conv/kernels/bmm_tilize_untilize_all_weights_in_l1_single_output_block_width_dim.cpp index 83c68e74272..f2d348a4e8d 100644 --- a/tt_eager/tt_dnn/op_library/conv/kernels/bmm_tilize_untilize_all_weights_in_l1_single_output_block_width_dim.cpp +++ b/tt_eager/tt_dnn/op_library/conv/kernels/bmm_tilize_untilize_all_weights_in_l1_single_output_block_width_dim.cpp @@ -4,6 +4,7 @@ #include +#include "mod_div_lib.h" #include "compute_kernel_api/tilize.h" #include "compute_kernel_api/untilize.h" #include "compute_kernel_api/tile_move_copy.h" diff --git a/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp b/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp index 787849b31c0..83a75eacbff 100644 --- a/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp +++ b/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp @@ -4,6 +4,7 @@ #include +#include "mod_div_lib.h" #include "compute_kernel_api/tilize.h" #include "compute_kernel_api/untilize.h" #include "compute_kernel_api/tile_move_copy.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h index ffd8ad6dae9..41450e32f27 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h @@ -4,6 +4,7 @@ #pragma once +#include "llk_defs.h" #include "ckernel_ops.h" #include "tensix_types.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h index 0e0c729f4b2..9bfa79f6934 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h +++ 
b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h @@ -1,3 +1,7 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + #pragma once #include diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h index 50b9ed3f7cc..d9ff57a5403 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h @@ -1,3 +1,7 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + #pragma once #ifdef PERF_DUMP diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h index a5df5a61f62..812f5cc9884 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h @@ -1,3 +1,7 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + #pragma once #include diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h index aaa854ebc2f..9a2b21b4756 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h @@ -1,3 +1,7 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + #pragma once #include diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h index f31efd1c3d0..29a2dbf9cfe 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h @@ -10,6 +10,7 @@ #include "metal_compile_time_args.h" #include "risc_attribs.h" #include "hostdevcommon/common_runtime_address_map.h" +#include "hostdevcommon/kernel_structs.h" extern uint32_t __ldm_bss_start[]; extern uint32_t __ldm_bss_end[]; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h index 808d88a6281..1e57d003cfc 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h @@ -1,3 +1,7 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" #include "ckernel_defs.h" @@ -17,7 +21,7 @@ * LLK PACK *************************************************************************/ -template +template inline void llk_pack_mop_config(const uint32_t output) { const std::uint32_t output_id = get_output_id(output); @@ -26,7 +30,7 @@ inline void llk_pack_mop_config(const uint32_t output) { const bool partial_face = get_output_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]); const bool narrow_tile = get_output_narrow_tile(output_id); - _llk_pack_mop_config_( + _llk_pack_mop_config_( pack_dst_format[output_id], face_r_dim, num_faces, @@ -94,7 +98,7 @@ inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output llk_pack_reduce_hw_configure(&llk_pack_params); } -template +template inline void llk_pack_init(const std::uint32_t pack_output = 0) { const std::uint32_t output_id = get_output_id(pack_output); @@ -103,7 +107,7 @@ inline void llk_pack_init(const std::uint32_t pack_output = 0) { const bool partial_face = get_output_partial_face(output_id); const bool narrow_tile = get_output_narrow_tile(output_id); - _llk_pack_init_( + _llk_pack_init_( pack_dst_format[output_id], face_r_dim, num_faces, @@ -237,7 +241,7 @@ inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) { ); } -template +template inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) { std::uint32_t old_output_id = get_output_id(old_output); std::uint32_t new_output_id = get_output_id(new_output); @@ -248,7 +252,7 @@ inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const llk_pack_reconfig_data_format(new_output); } else if constexpr (is_tile_dim_reconfig_en) { // Same format but different tile dims - llk_pack_mop_config(new_output); + llk_pack_mop_config(new_output); } } diff --git 
a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h index 0f0a1b69ab3..71eeb6a0ba2 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h @@ -91,3 +91,9 @@ inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, s narrow_tile ); } + +inline void llk_unpack_tilize_block(std::uint32_t operand, std::uint32_t block_c_tiles) { + for (std::uint32_t tile_index = 0; tile_index < block_c_tiles; tile_index++) { + llk_unpack_tilize(operand, tile_index, block_c_tiles); + } +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h index 7d3e365a730..37e018dc6b8 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_io.h @@ -1,3 +1,7 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + #pragma once #include diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h index 9c71ef63b52..4a03157715b 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 #pragma once #include diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h b/tt_metal/hw/inc/mod_div_lib.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_mod_div_lib.h rename to tt_metal/hw/inc/mod_div_lib.h diff --git a/tt_metal/hw/inc/risc_common.h b/tt_metal/hw/inc/risc_common.h index 7afd2d84974..cd6b02c7a7d 100644 --- a/tt_metal/hw/inc/risc_common.h +++ b/tt_metal/hw/inc/risc_common.h @@ -15,6 +15,7 @@ #include "stream_io_map.h" #include "hostdevcommon/common_runtime_address_map.h" #include "limits.h" +#include "mod_div_lib.h" #define NOC_X(x) (noc_index == 0 ? (x) : (noc_size_x-1-(x))) #define NOC_Y(y) (noc_index == 0 ? (y) : (noc_size_y-1-(y))) @@ -147,93 +148,6 @@ inline uint32_t special_mult(uint32_t a, uint32_t special_b) { return 0; } -inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b) -{ - unsigned int r = 0; - while (a) - { - if (a & 1) - r += b; - a >>= 1; - b <<= 1; - } - return r; -} - -inline __attribute__((always_inline)) uint32_t fast_udiv_12(uint32_t n) -{ - // Uses embedding style magic number - // * fixed point 1/12 then shifting. - // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm - return (((uint64_t) n * 0xAAAAAAAB) >> 32) >> 3; -} - -inline __attribute__((always_inline)) uint32_t fast_udiv_94(uint32_t n) -{ - // Uses embedding style magic number - // * fixed point 1/12 then shifting. - // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm - return (((uint64_t) n * 0xAE4C415D) >> 32) >> 6; -} - -template -inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n) -{ - if constexpr (d == 12) { - // fast divide for 12 divisor - return fast_udiv_12(n); - } else if constexpr (d == 94) { - // fast divide for 94 divisor. 
Handles Banked L1 address generation for E75 - return fast_udiv_94(n); - } else { - // generic divide from llvm - const unsigned n_uword_bits = sizeof(uint32_t) * CHAR_BIT; - unsigned int q; - unsigned int r; - unsigned sr; - /* special cases */ - if (d == 0) - return 0; /* ?! */ - if (n == 0) - return 0; - sr = __builtin_clz(d) - __builtin_clz(n); - /* 0 <= sr <= n_uword_bits - 1 or sr large */ - if (sr > n_uword_bits - 1) /* d > r */ - return 0; - if (sr == n_uword_bits - 1) /* d == 1 */ - return n; - ++sr; - /* 1 <= sr <= n_uword_bits - 1 */ - /* Not a special case */ - q = n << (n_uword_bits - sr); - r = n >> sr; - unsigned int carry = 0; - for (; sr > 0; --sr) - { - /* r:q = ((r:q) << 1) | carry */ - r = (r << 1) | (q >> (n_uword_bits - 1)); - q = (q << 1) | carry; - /* carry = 0; - * if (r.all >= d.all) - * { - * r.all -= d.all; - * carry = 1; - * } - */ - const int s = (unsigned int)(d - r - 1) >> (n_uword_bits - 1); - carry = s & 1; - r -= d & s; - } - q = (q << 1) | carry; - return q; - } -} -template -inline __attribute__((always_inline)) uint32_t umodsi3_const_divisor(uint32_t a) -{ - return a - udivsi3_const_divisor(a) * d; -} - void risc_init(); void replicate(uint32_t noc_id, uint32_t src_addr, uint64_t dest_addr, uint32_t chunk_size_bytes, uint32_t times_to_replicate); void replicate_l1(uint32_t noc_id, uint32_t src_addr, uint64_t dest_addr, uint32_t chunk_size_bytes, uint32_t times_to_replicate); diff --git a/tt_metal/include/compute_kernel_api/tilize.h b/tt_metal/include/compute_kernel_api/tilize.h index 58c1a7fc1d5..096b87027b3 100644 --- a/tt_metal/include/compute_kernel_api/tilize.h +++ b/tt_metal/include/compute_kernel_api/tilize.h @@ -37,11 +37,7 @@ ALWI void tilize_init(uint32_t icb, uint32_t block, uint32_t ocb = 16) PACK(( llk_pack_dest_init() )); UNPACK(( llk_setup_operands() )); - #ifdef ARCH_GRAYSKULL UNPACK(( llk_unpack_tilize_hw_configure_disaggregated(icb) )); - #else - UNPACK(( llk_unpack_tilize_hw_configure_disaggregated<>(icb, 
block) )); - #endif UNPACK(( llk_unpack_tilize_init(icb, block) )); } From 649c02e35f3a54f6abbb113ece28afcc89a80bfb Mon Sep 17 00:00:00 2001 From: Kei-Ming Kwong Date: Thu, 30 Nov 2023 19:21:55 +0000 Subject: [PATCH 04/16] #3908: Update llk uplift - Fix for reduce + build issues with cb_interface --- .../ckernels/wormhole_b0/common/inc/ckernel.h | 21 +- .../wormhole_b0/common/inc/ckernel_addrmod.h | 1 + .../wormhole_b0/common/inc/ckernel_defs.h | 93 +- .../wormhole_b0/common/inc/ckernel_globals.h | 1 + .../wormhole_b0/common/inc/ckernel_gpr_map.h | 1 + .../wormhole_b0/common/inc/ckernel_include.h | 1 + .../common/inc/ckernel_instr_params.h | 11 +- .../wormhole_b0/common/inc/ckernel_ops.h | 1130 +++++++++++++++++ .../wormhole_b0/common/inc/ckernel_pcbuf.h | 1 + .../wormhole_b0/common/inc/ckernel_sfpi.h | 1 + .../wormhole_b0/common/inc/ckernel_sfpu.h | 596 ++++----- .../wormhole_b0/common/inc/ckernel_structs.h | 1 + .../wormhole_b0/common/inc/ckernel_template.h | 218 ++++ .../wormhole_b0/common/inc/ckernel_xmov.h | 1 + .../wormhole_b0/common/inc/cmath_common.h | 19 + .../wormhole_b0/common/inc/cpack_common.h | 28 + .../wormhole_b0/common/inc/cunpack_common.h | 43 +- .../common/src/ckernel_template.cc | 230 ---- .../wormhole_b0/common/src/ckernel_unity.cc | 1 - .../wormhole_b0/common/src/fwlog_list | 1 - .../ckernels/wormhole_b0/llk_lib/llk_defs.h | 78 +- .../wormhole_b0/llk_lib/llk_math_common.h | 1 + .../llk_lib/llk_math_eltwise_binary.h | 14 +- .../llk_lib/llk_math_eltwise_binary_sfpu.h | 92 +- .../llk_lib/llk_math_eltwise_unary_datacopy.h | 3 +- .../llk_lib/llk_math_eltwise_unary_sfpi.h | 1 + .../llk_lib/llk_math_eltwise_unary_sfpu.h | 82 +- .../wormhole_b0/llk_lib/llk_math_matmul.h | 20 +- .../wormhole_b0/llk_lib/llk_math_reduce.h | 50 +- .../ckernels/wormhole_b0/llk_lib/llk_pack.h | 3 + .../wormhole_b0/llk_lib/llk_pack_common.h | 23 +- .../wormhole_b0/llk_lib/llk_pack_untilize.h | 71 ++ .../wormhole_b0/llk_lib/llk_unpack_A.h | 16 +- 
.../wormhole_b0/llk_lib/llk_unpack_AB.h | 10 +- .../llk_lib/llk_unpack_AB_matmul.h | 10 +- .../wormhole_b0/llk_lib/llk_unpack_common.h | 34 +- .../wormhole_b0/llk_lib/llk_unpack_reduce.h | 8 +- .../wormhole_b0/llk_lib/llk_unpack_tilize.h | 8 +- .../wormhole_b0/llk_lib/llk_unpack_untilize.h | 8 +- .../metal/llk_api/llk_math_matmul_api.h | 4 +- .../metal/llk_api/llk_math_unary_sfpu_api.h | 30 - .../wormhole_b0/metal/llk_api/llk_pack_api.h | 33 + .../metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h | 2 +- .../llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h | 4 +- .../llk_api/llk_sfpu/ckernel_sfpu_erfinv.h | 2 +- .../llk_api/llk_sfpu/ckernel_sfpu_gelu.h | 14 +- .../metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h | 2 +- .../llk_sfpu/ckernel_sfpu_logical_not_noti.h | 2 +- .../llk_api/llk_sfpu/ckernel_sfpu_relu.h | 6 +- ..._math_eltwise_unary_sfpu_common_includes.h | 155 +++ .../llk_math_eltwise_unary_sfpu_init.h | 45 +- .../llk_api/llk_sfpu/metal_ckernel_sfpu.h | 20 +- .../metal/llk_api/llk_sfpu_types.h | 69 + .../metal/llk_api/llk_unpack_AB_api.h | 4 +- .../metal/llk_api/llk_unpack_AB_matmul_api.h | 4 +- .../metal/llk_api/llk_unpack_A_api.h | 4 +- .../metal/llk_api/llk_unpack_common_api.h | 12 +- .../metal/llk_api/llk_unpack_reduce_api.h | 4 +- .../metal/llk_api/llk_unpack_tilize_api.h | 2 +- .../metal/llk_api/llk_unpack_untilize_api.h | 2 +- tt_metal/hw/inc/debug/dprint_tile.h | 2 +- 61 files changed, 2278 insertions(+), 1075 deletions(-) create mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_ops.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc create mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_untilize.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h index 2f72476ade2..b731cc4bf81 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h +++ 
b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "risc_attribs.h" @@ -35,6 +36,9 @@ #define OVERLAY_DECOUPLE 0 #endif +#ifdef LLK_TB_TEST +#include "kernel_slowdown_config.h" +#endif #ifndef INSERT_UNPACK_DELAY #define INSERT_UNPACK_DELAY 0 @@ -56,10 +60,13 @@ #include "ckernel_include.h" #include "tensix.h" -#include "debug/fw_debug.h" -#include "eth_l1_address_map.h" -#include "hostdevcommon/common_runtime_address_map.h" +#include "fw_debug.h" // #include +#if defined(PERF_DUMP) || DELAY_EN > 0 +#include +#include "tt_log.h" +#include "perf_lib/scratch_api.h" +#endif namespace ckernel { @@ -73,10 +80,10 @@ constexpr uint RESET_VAL = 0; constexpr uint KERNEL_IN_PROGRESS = 15; constexpr uint KERNEL_COMPLETE = 1; -extern volatile uint tt_reg_ptr * reg_base; -extern volatile uint tt_reg_ptr * pc_buf_base; -extern volatile uint tt_reg_ptr * regfile; -extern volatile uint tt_reg_ptr * instrn_buffer; +extern volatile uint tt_reg_ptr *reg_base; +extern volatile uint tt_reg_ptr *pc_buf_base; +extern volatile uint tt_reg_ptr *regfile; +extern volatile uint tt_reg_ptr *instrn_buffer; extern volatile uint tt_reg_ptr *mailbox_base[4]; extern volatile uint tt_reg_ptr *dbg_event_scratch; extern volatile uint tt_reg_ptr *trisc_l1_mailbox; diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_addrmod.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_addrmod.h index f917c1e009d..3009fae2695 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_addrmod.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_addrmod.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h index 41450e32f27..b1630dfe798 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h +++ 
b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_defs.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "llk_defs.h" @@ -67,24 +68,6 @@ enum PackSelMask PACK_23=0xC }; -/* -Stochastic rounding modes: - None: No stochastic rounding enabled, default rounding is round to nearest even. - Fpu: Enables stochastic rounding for every accumulation in the fpu - Pack: Enables stochastic rounding in both gasket and packer. Gasket rounding is in - data format conversion stage from dest format to pack_src_format. Packer rounding - is in data format conversion stage from pack_src_format to pack_dst_format. - All: Enables fpu, pack and gasket rounding. -*/ -enum class StochRndMode : std::uint8_t -{ - None = 0, - Fpu = 1, - Pack = 2, - All = 0xf, - Invalid = 0xff, -}; - constexpr std::uint32_t FACE_HEIGHT = 16; constexpr std::uint32_t FACE_WIDTH = 16; constexpr std::uint32_t TILE_HEIGHT = 32; @@ -108,78 +91,6 @@ static_assert((DEST_NUM_TILES_FP16 & (DEST_NUM_TILES_FP16 - 1)) == 0); #define LO_16(REG) (2 * (REG)) #define HI_16(REG) (2 * (REG) + 1) - -/* -constexpr static std::int32_t MUL_TILE_SIZE_AND_INDEX(uint format, uint index) { - switch (format&0xF) { - case ((uint8_t)DataFormat::Float32): return ((index<<8)+(index<<1)); - case ((uint8_t)DataFormat::Float16): - case ((uint8_t)DataFormat::Float16_b): return ((index<<7)+(index<<1)); - case ((uint8_t)DataFormat::Bfp8): - case ((uint8_t)DataFormat::Bfp8_b): return ((index<<6)+(index<<2)+(index<<1)); - case ((uint8_t)DataFormat::Bfp4): - case ((uint8_t)DataFormat::Bfp4_b): return ((index<<5)+(index<<2)+(index<<1)); - case ((uint8_t)DataFormat::Bfp2): - case ((uint8_t)DataFormat::Bfp2_b): return ((index<<4)+(index<<2)+(index<<1)); - case ((uint8_t)DataFormat::Int8): - case ((uint8_t)DataFormat::Lf8): return ((index<<6)+(index<<1)); - //Keep default as Bfp8? 
- default: return ((index<<6)+(index<<2)+(index<<1)); - }; -} - -constexpr static std::int32_t MUL_DEST_TILE_SIZE_AND_INDEX(uint format, uint index) { - switch (format&0xF) { - case ((uint8_t)DataFormat::Float32): return (index<<12); - case ((uint8_t)DataFormat::Float16): - case ((uint8_t)DataFormat::Float16_b): return (index<<11); - case ((uint8_t)DataFormat::Bfp8): - case ((uint8_t)DataFormat::Bfp8_b): return (index<<10); - case ((uint8_t)DataFormat::Bfp4): - case ((uint8_t)DataFormat::Bfp4_b): return (index<<9); - case ((uint8_t)DataFormat::Bfp2): - case ((uint8_t)DataFormat::Bfp2_b): return (index<<8); - case ((uint8_t)DataFormat::Int8): - case ((uint8_t)DataFormat::Lf8): return (index<<10); - default: return (index<<10); - }; -} - -constexpr static std::int32_t GET_L1_TILE_SIZE(uint format) { - switch (format&0xF) { - case ((uint8_t)DataFormat::Float32): return ((4096>>4)+(32>>4)); - case ((uint8_t)DataFormat::Float16): - case ((uint8_t)DataFormat::Float16_b): return ((2048>>4)+(32>>4)); - case ((uint8_t)DataFormat::Bfp8): - case ((uint8_t)DataFormat::Bfp8_b): return ((1024>>4)+(64>>4)+(32>>4)); - case ((uint8_t)DataFormat::Bfp4): - case ((uint8_t)DataFormat::Bfp4_b): return ((512>>4)+(64>>4)+(32>>4)); - case ((uint8_t)DataFormat::Bfp2): - case ((uint8_t)DataFormat::Bfp2_b): return ((256>>4)+(64>>4)+(32>>4)); - case ((uint8_t)DataFormat::Int8): - case ((uint8_t)DataFormat::Lf8): return ((1024>>4)+(32>>4)); - default: return ((1024>>4)+(64>>4)+(32>>4)); - }; -} - -constexpr static std::int32_t GET_DEST_TILE_BYTE_SIZE(uint format) { - switch (format&0xF) { - case ((uint8_t)DataFormat::Float32): return 4096; - case ((uint8_t)DataFormat::Float16): - case ((uint8_t)DataFormat::Float16_b): return 2048; - case ((uint8_t)DataFormat::Bfp8): - case ((uint8_t)DataFormat::Bfp8_b): return 1024; - case ((uint8_t)DataFormat::Bfp4): - case ((uint8_t)DataFormat::Bfp4_b): return 512; - case ((uint8_t)DataFormat::Bfp2): - case ((uint8_t)DataFormat::Bfp2_b): return 256; - case 
((uint8_t)DataFormat::Int8): - case ((uint8_t)DataFormat::Lf8): return 1024; - default: return 1024; - }; -} -*/ - constexpr static std::uint32_t GET_L1_HEADERLESS_TILE_SIZE(uint format) { switch (format&0xF) { case ((uint8_t)DataFormat::Int32): @@ -243,6 +154,4 @@ constexpr static std::uint32_t SCALE_DATUM_SIZE(uint format, uint datum_count) { #define LOWER_HALFWORD(x) ((x) & 0xFFFF) #define UPPER_HALFWORD(x) ((x) >> 16) -constexpr int WHB0_ITERATIONS = 8; - } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h index 90ac67944f5..f9359469e33 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h index 822704cc9e1..ba1abdca84e 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_gpr_map.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once // Hand-coded parameter encoding for various GPR mappings diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_include.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_include.h index 4418cfdb57e..6fa83799cb1 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_include.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_include.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once // diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_instr_params.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_instr_params.h index bb28714aa71..4c72eecf213 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_instr_params.h +++ 
b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_instr_params.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #ifdef PERF_DUMP @@ -18,13 +19,13 @@ struct p_setrwc #if SKIP_UNP == 1 constexpr static uint CLR_A = 0x0; - constexpr static uint CLR_B = 0x0; - constexpr static uint CLR_AB = 0x0; + constexpr static uint CLR_B = 0x0; + constexpr static uint CLR_AB = 0x0; #else constexpr static uint CLR_A = 0x1; - constexpr static uint CLR_B = 0x2; - constexpr static uint CLR_AB = 0x3; - #endif + constexpr static uint CLR_B = 0x2; + constexpr static uint CLR_AB = 0x3; +#endif #else constexpr static uint CLR_A = 0x1; diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_ops.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_ops.h new file mode 100644 index 00000000000..bdd4e9e048b --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_ops.h @@ -0,0 +1,1130 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + + +// +// Auto-generated file, do not modify! +// + +#pragma once + +#define TT_OP(opcode, params) ( (opcode << 24) + params ) +#define INSTRUCTION_WORD(x) __asm__ __volatile__(".word (%0)" : : "i" ((x))) // Drop 32 bits into the instruction stream. 
+#define TRISC_OP_SWIZZLE(x) ( (((x) >> 30) & 0x3) | (((x) & 0x3FFFFFFF) << 2) ) // Put top 2 bits, which are currently never 'b11 to bottom, indicating to Risc that they are not risc instructions + +#define TT_OP_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + TT_OP(0x58, (((OpBisConst) << 23) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0))) +#define TT_ADDDMAREG_VALID(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(ResultRegIndex, 11) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6)) +#define TT_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) +#define TTI_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ADDDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) )) + +#define TT_OP_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + TT_OP(0x53, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + ((BitMask) << 0))) +#define TT_ADDRCRXY_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6)) +#define TT_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + ckernel::instrn_buffer[0] = TT_OP_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) +#define TTI_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ADDRCRXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) )) + +#define TT_OP_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + TT_OP(0x56, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + 
((BitMask) << 0))) +#define TT_ADDRCRZW_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6)) +#define TT_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + ckernel::instrn_buffer[0] = TT_OP_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) +#define TTI_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ADDRCRZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) )) + +#define TT_OP_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ + TT_OP(0x25, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0))) +#define TT_APOOL3S1_VALID(clear_dvalid, addr_mode, index_en, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && ckernel::is_valid(dst, 14)) +#define TT_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ + ckernel::instrn_buffer[0] = TT_OP_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) +#define TTI_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_APOOL3S1(clear_dvalid, addr_mode, index_en, dst) )) + +#define TT_OP_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ + TT_OP(0x32, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0))) +#define TT_APOOL3S2_VALID(clear_dvalid, addr_mode, index_en, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && ckernel::is_valid(dst, 14)) +#define TT_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ + ckernel::instrn_buffer[0] = TT_OP_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) +#define TTI_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_APOOL3S2(clear_dvalid, addr_mode, index_en, dst) )) + 
+#define TT_OP_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \ + TT_OP(0x64, (((MemHierSel) << 23) + ((SwapVal) << 18) + ((CmpVal) << 14) + ((Sel32b) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0))) +#define TT_ATCAS_VALID(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \ + (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(SwapVal, 5) && ckernel::is_valid(CmpVal, 4) && ckernel::is_valid(Sel32b, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6)) +#define TT_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) +#define TTI_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATCAS(MemHierSel, SwapVal, CmpVal, Sel32b, DataRegIndex, AddrRegIndex) )) + +#define TT_OP_ATGETM(mutex_index) \ + TT_OP(0xa0, (((mutex_index) << 0))) +#define TT_ATGETM_VALID(mutex_index) \ + (ckernel::is_valid(mutex_index, 24)) +#define TT_ATGETM(mutex_index) \ + ckernel::instrn_buffer[0] = TT_OP_ATGETM(mutex_index) +#define TTI_ATGETM(mutex_index) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATGETM(mutex_index) )) + +#define TT_OP_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ + TT_OP(0x61, (((MemHierSel) << 23) + ((WrapVal) << 14) + ((Sel32b) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0))) +#define TT_ATINCGET_VALID(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ + (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(WrapVal, 9) && ckernel::is_valid(Sel32b, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6)) +#define TT_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) +#define TTI_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, 
AddrRegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATINCGET(MemHierSel, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) )) + +#define TT_OP_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ + TT_OP(0x62, (((MemHierSel) << 23) + ((NoIncr) << 22) + ((IncrVal) << 18) + ((WrapVal) << 14) + ((Sel32b) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0))) +#define TT_ATINCGETPTR_VALID(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ + (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(NoIncr, 1) && ckernel::is_valid(IncrVal, 4) && ckernel::is_valid(WrapVal, 4) && ckernel::is_valid(Sel32b, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6)) +#define TT_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) +#define TTI_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATINCGETPTR(MemHierSel, NoIncr, IncrVal, WrapVal, Sel32b, DataRegIndex, AddrRegIndex) )) + +#define TT_OP_ATRELM(mutex_index) \ + TT_OP(0xa1, (((mutex_index) << 0))) +#define TT_ATRELM_VALID(mutex_index) \ + (ckernel::is_valid(mutex_index, 24)) +#define TT_ATRELM(mutex_index) \ + ckernel::instrn_buffer[0] = TT_OP_ATRELM(mutex_index) +#define TTI_ATRELM(mutex_index) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATRELM(mutex_index) )) + +#define TT_OP_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \ + TT_OP(0x63, (((MemHierSel) << 23) + ((SwapMask) << 14) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0))) +#define TT_ATSWAP_VALID(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \ + (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(SwapMask, 9) && ckernel::is_valid(DataRegIndex, 8) && ckernel::is_valid(AddrRegIndex, 6)) +#define TT_ATSWAP(MemHierSel, SwapMask, 
DataRegIndex, AddrRegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) +#define TTI_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ATSWAP(MemHierSel, SwapMask, DataRegIndex, AddrRegIndex) )) + +#define TT_OP_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + TT_OP(0x5b, (((OpBisConst) << 23) + ((OpSel) << 18) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0))) +#define TT_BITWOPDMAREG_VALID(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(OpSel, 5) && ckernel::is_valid(ResultRegIndex, 6) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6)) +#define TT_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) +#define TTI_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_BITWOPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) )) + +#define TT_OP_CLEARDVALID(cleardvalid, reset) \ + TT_OP(0x36, (((cleardvalid) << 22) + ((reset) << 0))) +#define TT_CLEARDVALID_VALID(cleardvalid, reset) \ + (ckernel::is_valid(cleardvalid, 2) && ckernel::is_valid(reset, 22)) +#define TT_CLEARDVALID(cleardvalid, reset) \ + ckernel::instrn_buffer[0] = TT_OP_CLEARDVALID(cleardvalid, reset) +#define TTI_CLEARDVALID(cleardvalid, reset) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CLEARDVALID(cleardvalid, reset) )) + +#define TT_OP_CLREXPHIST\ + TT_OP(0x21, 0) +#define TTI_CLREXPHIST\ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CLREXPHIST)) + +#define TT_OP_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + TT_OP(0x5d, (((OpBisConst) << 23) + ((OpSel) << 18) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + 
((OpARegIndex) << 0))) +#define TT_CMPDMAREG_VALID(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(OpSel, 5) && ckernel::is_valid(ResultRegIndex, 6) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6)) +#define TT_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) +#define TTI_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CMPDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) )) + +#define TT_OP_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ + TT_OP(0x22, (((clear_dvalid) << 22) + ((rotate_weights) << 17) + ((addr_mode) << 15) + ((dst) << 0))) +#define TT_CONV3S1_VALID(clear_dvalid, rotate_weights, addr_mode, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(rotate_weights, 5) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(dst, 15)) +#define TT_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ + ckernel::instrn_buffer[0] = TT_OP_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) +#define TTI_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) )) + +#define TT_OP_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) \ + TT_OP(0x23, (((clear_dvalid) << 22) + ((rotate_weights) << 17) + ((addr_mode) << 15) + ((dst) << 0))) +#define TT_CONV3S2_VALID(clear_dvalid, rotate_weights, addr_mode, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(rotate_weights, 5) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(dst, 15)) +#define TT_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) \ + ckernel::instrn_buffer[0] = TT_OP_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) +#define TTI_CONV3S2(clear_dvalid, 
rotate_weights, addr_mode, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_CONV3S2(clear_dvalid, rotate_weights, addr_mode, dst) )) + +#define TT_OP_DMANOP\ + TT_OP(0x60, 0) +#define TTI_DMANOP\ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_DMANOP)) + +#define TT_OP_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + TT_OP(0x29, (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0))) +#define TT_DOTPV_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15)) +#define TT_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + ckernel::instrn_buffer[0] = TT_OP_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) +#define TTI_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_DOTPV(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) )) + +#define TT_OP_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + TT_OP(0x28, (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0))) +#define TT_ELWADD_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15)) +#define TT_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + ckernel::instrn_buffer[0] = TT_OP_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) +#define TTI_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ELWADD(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) )) + +#define TT_OP_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, 
addr_mode, dst) \ + TT_OP(0x27, (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0))) +#define TT_ELWMUL_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15)) +#define TT_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + ckernel::instrn_buffer[0] = TT_OP_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) +#define TTI_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ELWMUL(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) )) + +#define TT_OP_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + TT_OP(0x30, (((clear_dvalid) << 22) + ((dest_accum_en) << 21) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0))) +#define TT_ELWSUB_VALID(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(dest_accum_en, 1) && ckernel::is_valid(instr_mod19, 2) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15)) +#define TT_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + ckernel::instrn_buffer[0] = TT_OP_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) +#define TTI_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ELWSUB(clear_dvalid, dest_accum_en, instr_mod19, addr_mode, dst) )) + +#define TT_OP_FLUSHDMA(FlushSpec) \ + TT_OP(0x46, (((FlushSpec) << 0))) +#define TT_FLUSHDMA_VALID(FlushSpec) \ + (ckernel::is_valid(FlushSpec, 24)) +#define TT_FLUSHDMA(FlushSpec) \ + ckernel::instrn_buffer[0] = TT_OP_FLUSHDMA(FlushSpec) +#define TTI_FLUSHDMA(FlushSpec) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_FLUSHDMA(FlushSpec) )) + +#define 
TT_OP_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ + TT_OP(0x34, (((clear_dvalid) << 22) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((max_pool_index_en) << 14) + ((dst) << 0))) +#define TT_GAPOOL_VALID(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(instr_mod19, 3) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(max_pool_index_en, 1) && ckernel::is_valid(dst, 14)) +#define TT_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ + ckernel::instrn_buffer[0] = TT_OP_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) +#define TTI_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_GAPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) )) + +#define TT_OP_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) \ + TT_OP(0x35, (((reset_srcb_gate_control) << 1) + ((reset_srca_gate_control) << 0))) +#define TT_GATESRCRST_VALID(reset_srcb_gate_control, reset_srca_gate_control) \ + (ckernel::is_valid(reset_srcb_gate_control, 23) && ckernel::is_valid(reset_srca_gate_control, 1)) +#define TT_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) \ + ckernel::instrn_buffer[0] = TT_OP_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) +#define TTI_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_GATESRCRST(reset_srcb_gate_control, reset_srca_gate_control) )) + +#define TT_OP_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ + TT_OP(0x33, (((clear_dvalid) << 22) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((max_pool_index_en) << 14) + ((dst) << 0))) +#define TT_GMPOOL_VALID(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(instr_mod19, 3) && ckernel::is_valid(addr_mode, 4) && 
ckernel::is_valid(max_pool_index_en, 1) && ckernel::is_valid(dst, 14)) +#define TT_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ + ckernel::instrn_buffer[0] = TT_OP_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) +#define TTI_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_GMPOOL(clear_dvalid, instr_mod19, addr_mode, max_pool_index_en, dst) )) + +#define TT_OP_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ + TT_OP(0x52, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6))) +#define TT_INCADCXY_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ + (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3)) +#define TT_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ + ckernel::instrn_buffer[0] = TT_OP_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) +#define TTI_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_INCADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) )) + +#define TT_OP_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ + TT_OP(0x55, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6))) +#define TT_INCADCZW_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ + (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3)) +#define TT_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ + ckernel::instrn_buffer[0] = TT_OP_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) +#define TTI_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_INCADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X) )) + +#define TT_OP_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) \ + TT_OP(0x38, (((rwc_cr) << 18) + ((rwc_d) << 14) + ((rwc_b) << 
10) + ((rwc_a) << 6))) +#define TT_INCRWC_VALID(rwc_cr, rwc_d, rwc_b, rwc_a) \ + (ckernel::is_valid(rwc_cr, 6) && ckernel::is_valid(rwc_d, 4) && ckernel::is_valid(rwc_b, 4) && ckernel::is_valid(rwc_a, 4)) +#define TT_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) \ + ckernel::instrn_buffer[0] = TT_OP_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) +#define TTI_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_INCRWC(rwc_cr, rwc_d, rwc_b, rwc_a) )) + +#define TT_OP_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ + TT_OP(0x49, (((SizeSel) << 22) + ((OffsetIndex) << 14) + ((AutoIncSpec) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0))) +#define TT_LOADIND_VALID(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ + (ckernel::is_valid(SizeSel, 2) && ckernel::is_valid(OffsetIndex, 8) && ckernel::is_valid(AutoIncSpec, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6)) +#define TT_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) +#define TTI_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_LOADIND(SizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) )) + +#define TT_OP_LOADREG(TdmaDataRegIndex, RegAddr) \ + TT_OP(0x68, (((TdmaDataRegIndex) << 18) + ((RegAddr) << 0))) +#define TT_LOADREG_VALID(TdmaDataRegIndex, RegAddr) \ + (ckernel::is_valid(TdmaDataRegIndex, 6) && ckernel::is_valid(RegAddr, 18)) +#define TT_LOADREG(TdmaDataRegIndex, RegAddr) \ + ckernel::instrn_buffer[0] = TT_OP_LOADREG(TdmaDataRegIndex, RegAddr) +#define TTI_LOADREG(TdmaDataRegIndex, RegAddr) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_LOADREG(TdmaDataRegIndex, RegAddr) )) + +#define TT_OP_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ + TT_OP(0x3a, (((clear_dvalid) << 22) + ((rotate_weights) << 17) + 
((addr_mode) << 15) + ((dst) << 0))) +#define TT_MFCONV3S1_VALID(clear_dvalid, rotate_weights, addr_mode, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(rotate_weights, 5) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(dst, 15)) +#define TT_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ + ckernel::instrn_buffer[0] = TT_OP_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) +#define TTI_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MFCONV3S1(clear_dvalid, rotate_weights, addr_mode, dst) )) + +#define TT_OP_MOP(mop_type, loop_count, zmask_lo16) \ + TT_OP(0x01, (((mop_type) << 23) + ((loop_count) << 16) + ((zmask_lo16) << 0))) +#define TT_MOP_VALID(mop_type, loop_count, zmask_lo16) \ + (ckernel::is_valid(mop_type, 1) && ckernel::is_valid(loop_count, 7) && ckernel::is_valid(zmask_lo16, 16)) +#define TT_MOP(mop_type, loop_count, zmask_lo16) \ + ckernel::instrn_buffer[0] = TT_OP_MOP(mop_type, loop_count, zmask_lo16) +#define TTI_MOP(mop_type, loop_count, zmask_lo16) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOP(mop_type, loop_count, zmask_lo16) )) + +#define TT_OP_MOP_CFG(zmask_hi16) \ + TT_OP(0x03, (((zmask_hi16) << 0))) +#define TT_MOP_CFG_VALID(zmask_hi16) \ + (ckernel::is_valid(zmask_hi16, 24)) +#define TT_MOP_CFG(zmask_hi16) \ + ckernel::instrn_buffer[0] = TT_OP_MOP_CFG(zmask_hi16) +#define TTI_MOP_CFG(zmask_hi16) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOP_CFG(zmask_hi16) )) + +#define TT_OP_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + TT_OP(0x12, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0))) +#define TT_MOVA2D_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12)) +#define TT_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + 
ckernel::instrn_buffer[0] = TT_OP_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) +#define TTI_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) )) + +#define TT_OP_MOVB2A(srca, addr_mode, instr_mod, srcb) \ + TT_OP(0x0b, (((srca) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((srcb) << 0))) +#define TT_MOVB2A_VALID(srca, addr_mode, instr_mod, srcb) \ + (ckernel::is_valid(srca, 7) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(srcb, 12)) +#define TT_MOVB2A(srca, addr_mode, instr_mod, srcb) \ + ckernel::instrn_buffer[0] = TT_OP_MOVB2A(srca, addr_mode, instr_mod, srcb) +#define TTI_MOVB2A(srca, addr_mode, instr_mod, srcb) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVB2A(srca, addr_mode, instr_mod, srcb) )) + +#define TT_OP_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + TT_OP(0x13, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0))) +#define TT_MOVB2D_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12)) +#define TT_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + ckernel::instrn_buffer[0] = TT_OP_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) +#define TTI_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVB2D(dest_32b_lo, src, addr_mode, instr_mod, dst) )) + +#define TT_OP_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + TT_OP(0x08, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0))) +#define TT_MOVD2A_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && 
ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12)) +#define TT_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + ckernel::instrn_buffer[0] = TT_OP_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) +#define TTI_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVD2A(dest_32b_lo, src, addr_mode, instr_mod, dst) )) + +#define TT_OP_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + TT_OP(0x0a, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0))) +#define TT_MOVD2B_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12)) +#define TT_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + ckernel::instrn_buffer[0] = TT_OP_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) +#define TTI_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVD2B(dest_32b_lo, src, addr_mode, instr_mod, dst) )) + +#define TT_OP_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + TT_OP(0x09, (((dest_32b_lo) << 23) + ((src) << 17) + ((addr_mode) << 15) + ((instr_mod) << 12) + ((dst) << 0))) +#define TT_MOVDBGA2D_VALID(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + (ckernel::is_valid(dest_32b_lo, 1) && ckernel::is_valid(src, 6) && ckernel::is_valid(addr_mode, 2) && ckernel::is_valid(instr_mod, 3) && ckernel::is_valid(dst, 12)) +#define TT_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + ckernel::instrn_buffer[0] = TT_OP_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) +#define TTI_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MOVDBGA2D(dest_32b_lo, src, addr_mode, instr_mod, dst) )) + +#define TT_OP_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ + TT_OP(0x24, (((clear_dvalid) << 22) + 
((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0))) +#define TT_MPOOL3S1_VALID(clear_dvalid, addr_mode, index_en, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && ckernel::is_valid(dst, 14)) +#define TT_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ + ckernel::instrn_buffer[0] = TT_OP_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) +#define TTI_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MPOOL3S1(clear_dvalid, addr_mode, index_en, dst) )) + +#define TT_OP_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ + TT_OP(0x31, (((clear_dvalid) << 22) + ((addr_mode) << 15) + ((index_en) << 14) + ((dst) << 0))) +#define TT_MPOOL3S2_VALID(clear_dvalid, addr_mode, index_en, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(addr_mode, 7) && ckernel::is_valid(index_en, 1) && ckernel::is_valid(dst, 14)) +#define TT_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ + ckernel::instrn_buffer[0] = TT_OP_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) +#define TTI_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MPOOL3S2(clear_dvalid, addr_mode, index_en, dst) )) + +#define TT_OP_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + TT_OP(0x5a, (((OpBisConst) << 23) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0))) +#define TT_MULDMAREG_VALID(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(ResultRegIndex, 11) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6)) +#define TT_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) +#define TTI_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + 
INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MULDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) )) + +#define TT_OP_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) \ + TT_OP(0x26, (((clear_dvalid) << 22) + ((instr_mod19) << 19) + ((addr_mode) << 15) + ((dst) << 0))) +#define TT_MVMUL_VALID(clear_dvalid, instr_mod19, addr_mode, dst) \ + (ckernel::is_valid(clear_dvalid, 2) && ckernel::is_valid(instr_mod19, 3) && ckernel::is_valid(addr_mode, 4) && ckernel::is_valid(dst, 15)) +#define TT_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) \ + ckernel::instrn_buffer[0] = TT_OP_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) +#define TTI_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_MVMUL(clear_dvalid, instr_mod19, addr_mode, dst) )) + +#define TT_OP_NOP\ + TT_OP(0x02, 0) +#define TTI_NOP\ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_NOP)) + +#define TT_OP_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \ + TT_OP(0x41, (((AddrMode) << 15) + ((ZeroWrite) << 12) + ((PackSel) << 8) + ((OvrdThreadId) << 7) + ((Concat) << 4) + ((Flush) << 1) + ((Last) << 0))) +#define TT_PACR_VALID(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \ + (ckernel::is_valid(AddrMode, 9) && ckernel::is_valid(ZeroWrite, 3) && ckernel::is_valid(PackSel, 4) && ckernel::is_valid(OvrdThreadId, 1) && ckernel::is_valid(Concat, 3) && ckernel::is_valid(Flush, 3) && ckernel::is_valid(Last, 1)) +#define TT_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \ + ckernel::instrn_buffer[0] = TT_OP_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) +#define TTI_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_PACR(AddrMode, ZeroWrite, PackSel, OvrdThreadId, Concat, Flush, Last) )) + +#define TT_OP_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \ + TT_OP(0x4a, (((Push) << 23) + ((AddrSel) << 22) + ((WrData) 
<< 12) + ((PackSel) << 8) + ((StreamId) << 2) + ((Flush) << 1) + ((Last) << 0))) +#define TT_PACR_SETREG_VALID(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \ + (ckernel::is_valid(Push, 1) && ckernel::is_valid(AddrSel, 1) && ckernel::is_valid(WrData, 10) && ckernel::is_valid(PackSel, 4) && ckernel::is_valid(StreamId, 6) && ckernel::is_valid(Flush, 1) && ckernel::is_valid(Last, 1)) +#define TT_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \ + ckernel::instrn_buffer[0] = TT_OP_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) +#define TTI_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_PACR_SETREG(Push, AddrSel, WrData, PackSel, StreamId, Flush, Last) )) + +#define TT_OP_RAREB\ + TT_OP(0x15, 0) +#define TTI_RAREB\ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RAREB)) + +#define TT_OP_RDCFG(GprAddress, CfgReg) \ + TT_OP(0xb1, (((GprAddress) << 16) + ((CfgReg) << 0))) +#define TT_RDCFG_VALID(GprAddress, CfgReg) \ + (ckernel::is_valid(GprAddress, 8) && ckernel::is_valid(CfgReg, 16)) +#define TT_RDCFG(GprAddress, CfgReg) \ + ckernel::instrn_buffer[0] = TT_OP_RDCFG(GprAddress, CfgReg) +#define TTI_RDCFG(GprAddress, CfgReg) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RDCFG(GprAddress, CfgReg) )) + +#define TT_OP_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \ + TT_OP(0x48, (((SizeSel) << 22) + ((TargetSel) << 20) + ((ByteOffset) << 18) + ((ContextId_2) << 16) + ((FlopIndex) << 6) + ((RegIndex) << 0))) +#define TT_REG2FLOP_VALID(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \ + (ckernel::is_valid(SizeSel, 2) && ckernel::is_valid(TargetSel, 2) && ckernel::is_valid(ByteOffset, 2) && ckernel::is_valid(ContextId_2, 2) && ckernel::is_valid(FlopIndex, 10) && ckernel::is_valid(RegIndex, 6)) +#define TT_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \ + ckernel::instrn_buffer[0] = 
TT_OP_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) +#define TTI_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_REG2FLOP(SizeSel, TargetSel, ByteOffset, ContextId_2, FlopIndex, RegIndex) )) + +#define TT_OP_REPLAY(start_idx, len, execute_while_loading, load_mode) \ + TT_OP(0x04, (((start_idx) << 14) + ((len) << 4) + ((execute_while_loading) << 1) + ((load_mode) << 0))) +#define TT_REPLAY_VALID(start_idx, len, execute_while_loading, load_mode) \ + (ckernel::is_valid(start_idx, 10) && ckernel::is_valid(len, 10) && ckernel::is_valid(execute_while_loading, 3) && ckernel::is_valid(load_mode, 1)) +#define TT_REPLAY(start_idx, len, execute_while_loading, load_mode) \ + ckernel::instrn_buffer[0] = TT_OP_REPLAY(start_idx, len, execute_while_loading, load_mode) +#define TTI_REPLAY(start_idx, len, execute_while_loading, load_mode) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_REPLAY(start_idx, len, execute_while_loading, load_mode) )) + +#define TT_OP_RMWCIB0(Mask, Data, CfgRegAddr) \ + TT_OP(0xb3, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) +#define TT_RMWCIB0_VALID(Mask, Data, CfgRegAddr) \ + (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) +#define TT_RMWCIB0(Mask, Data, CfgRegAddr) \ + ckernel::instrn_buffer[0] = TT_OP_RMWCIB0(Mask, Data, CfgRegAddr) +#define TTI_RMWCIB0(Mask, Data, CfgRegAddr) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB0(Mask, Data, CfgRegAddr) )) + +#define TT_OP_RMWCIB1(Mask, Data, CfgRegAddr) \ + TT_OP(0xb4, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) +#define TT_RMWCIB1_VALID(Mask, Data, CfgRegAddr) \ + (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) +#define TT_RMWCIB1(Mask, Data, CfgRegAddr) \ + ckernel::instrn_buffer[0] = TT_OP_RMWCIB1(Mask, Data, CfgRegAddr) +#define TTI_RMWCIB1(Mask, Data, CfgRegAddr) \ + 
INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB1(Mask, Data, CfgRegAddr) )) + +#define TT_OP_RMWCIB2(Mask, Data, CfgRegAddr) \ + TT_OP(0xb5, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) +#define TT_RMWCIB2_VALID(Mask, Data, CfgRegAddr) \ + (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) +#define TT_RMWCIB2(Mask, Data, CfgRegAddr) \ + ckernel::instrn_buffer[0] = TT_OP_RMWCIB2(Mask, Data, CfgRegAddr) +#define TTI_RMWCIB2(Mask, Data, CfgRegAddr) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB2(Mask, Data, CfgRegAddr) )) + +#define TT_OP_RMWCIB3(Mask, Data, CfgRegAddr) \ + TT_OP(0xb6, (((Mask) << 16) + ((Data) << 8) + ((CfgRegAddr) << 0))) +#define TT_RMWCIB3_VALID(Mask, Data, CfgRegAddr) \ + (ckernel::is_valid(Mask, 8) && ckernel::is_valid(Data, 8) && ckernel::is_valid(CfgRegAddr, 8)) +#define TT_RMWCIB3(Mask, Data, CfgRegAddr) \ + ckernel::instrn_buffer[0] = TT_OP_RMWCIB3(Mask, Data, CfgRegAddr) +#define TTI_RMWCIB3(Mask, Data, CfgRegAddr) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RMWCIB3(Mask, Data, CfgRegAddr) )) + +#define TT_OP_RSTDMA\ + TT_OP(0x44, 0) +#define TTI_RSTDMA\ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_RSTDMA)) + +#define TT_OP_SEMGET(sem_sel) \ + TT_OP(0xa5, (((sem_sel) << 2))) +#define TT_SEMGET_VALID(sem_sel) \ + (ckernel::is_valid(sem_sel, 22)) +#define TT_SEMGET(sem_sel) \ + ckernel::instrn_buffer[0] = TT_OP_SEMGET(sem_sel) +#define TTI_SEMGET(sem_sel) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMGET(sem_sel) )) + +#define TT_OP_SEMINIT(max_value, init_value, sem_sel) \ + TT_OP(0xa3, (((max_value) << 20) + ((init_value) << 16) + ((sem_sel) << 2))) +#define TT_SEMINIT_VALID(max_value, init_value, sem_sel) \ + (ckernel::is_valid(max_value, 4) && ckernel::is_valid(init_value, 4) && ckernel::is_valid(sem_sel, 14)) +#define TT_SEMINIT(max_value, init_value, sem_sel) \ + ckernel::instrn_buffer[0] = TT_OP_SEMINIT(max_value, init_value, sem_sel) +#define TTI_SEMINIT(max_value, init_value, 
sem_sel) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMINIT(max_value, init_value, sem_sel) )) + +#define TT_OP_SEMPOST(sem_sel) \ + TT_OP(0xa4, (((sem_sel) << 2))) +#define TT_SEMPOST_VALID(sem_sel) \ + (ckernel::is_valid(sem_sel, 22)) +#define TT_SEMPOST(sem_sel) \ + ckernel::instrn_buffer[0] = TT_OP_SEMPOST(sem_sel) +#define TTI_SEMPOST(sem_sel) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMPOST(sem_sel) )) + +#define TT_OP_SEMWAIT(stall_res, sem_sel, wait_sem_cond) \ + TT_OP(0xa6, (((stall_res) << 15) + ((sem_sel) << 2) + ((wait_sem_cond) << 0))) +#define TT_SEMWAIT_VALID(stall_res, sem_sel, wait_sem_cond) \ + (ckernel::is_valid(stall_res, 9) && ckernel::is_valid(sem_sel, 13) && ckernel::is_valid(wait_sem_cond, 2)) +#define TT_SEMWAIT(stall_res, sem_sel, wait_sem_cond) \ + ckernel::instrn_buffer[0] = TT_OP_SEMWAIT(stall_res, sem_sel, wait_sem_cond) +#define TTI_SEMWAIT(stall_res, sem_sel, wait_sem_cond) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SEMWAIT(stall_res, sem_sel, wait_sem_cond) )) + +#define TT_OP_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) \ + TT_OP(0x50, (((CntSetMask) << 21) + ((ChannelIndex) << 20) + ((DimensionIndex) << 18) + ((Value) << 0))) +#define TT_SETADC_VALID(CntSetMask, ChannelIndex, DimensionIndex, Value) \ + (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(ChannelIndex, 1) && ckernel::is_valid(DimensionIndex, 2) && ckernel::is_valid(Value, 18)) +#define TT_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) \ + ckernel::instrn_buffer[0] = TT_OP_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) +#define TTI_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADC(CntSetMask, ChannelIndex, DimensionIndex, Value) )) + +#define TT_OP_SETADCXX(CntSetMask, x_end2, x_start) \ + TT_OP(0x5e, (((CntSetMask) << 21) + ((x_end2) << 10) + ((x_start) << 0))) +#define TT_SETADCXX_VALID(CntSetMask, x_end2, x_start) \ + (ckernel::is_valid(CntSetMask, 3) && 
ckernel::is_valid(x_end2, 11) && ckernel::is_valid(x_start, 10)) +#define TT_SETADCXX(CntSetMask, x_end2, x_start) \ + ckernel::instrn_buffer[0] = TT_OP_SETADCXX(CntSetMask, x_end2, x_start) +#define TTI_SETADCXX(CntSetMask, x_end2, x_start) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADCXX(CntSetMask, x_end2, x_start) )) + +#define TT_OP_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + TT_OP(0x51, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + ((BitMask) << 0))) +#define TT_SETADCXY_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6)) +#define TT_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + ckernel::instrn_buffer[0] = TT_OP_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) +#define TTI_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADCXY(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) )) + +#define TT_OP_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + TT_OP(0x54, (((CntSetMask) << 21) + ((Ch1_Y) << 15) + ((Ch1_X) << 12) + ((Ch0_Y) << 9) + ((Ch0_X) << 6) + ((BitMask) << 0))) +#define TT_SETADCZW_VALID(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + (ckernel::is_valid(CntSetMask, 3) && ckernel::is_valid(Ch1_Y, 6) && ckernel::is_valid(Ch1_X, 3) && ckernel::is_valid(Ch0_Y, 3) && ckernel::is_valid(Ch0_X, 3) && ckernel::is_valid(BitMask, 6)) +#define TT_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + ckernel::instrn_buffer[0] = TT_OP_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) +#define TTI_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETADCZW(CntSetMask, Ch1_Y, Ch1_X, Ch0_Y, Ch0_X, BitMask) )) + +#define TT_OP_SETASHRMH(reg_mask, 
halo_mask) \ + TT_OP(0x1e, (((reg_mask) << 1) + ((halo_mask) << 0))) +#define TT_SETASHRMH_VALID(reg_mask, halo_mask) \ + (ckernel::is_valid(reg_mask, 23) && ckernel::is_valid(halo_mask, 1)) +#define TT_SETASHRMH(reg_mask, halo_mask) \ + ckernel::instrn_buffer[0] = TT_OP_SETASHRMH(reg_mask, halo_mask) +#define TTI_SETASHRMH(reg_mask, halo_mask) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMH(reg_mask, halo_mask) )) + +#define TT_OP_SETASHRMH0(reg_mask, halo_mask) \ + TT_OP(0x1a, (((reg_mask) << 1) + ((halo_mask) << 0))) +#define TT_SETASHRMH0_VALID(reg_mask, halo_mask) \ + (ckernel::is_valid(reg_mask, 23) && ckernel::is_valid(halo_mask, 1)) +#define TT_SETASHRMH0(reg_mask, halo_mask) \ + ckernel::instrn_buffer[0] = TT_OP_SETASHRMH0(reg_mask, halo_mask) +#define TTI_SETASHRMH0(reg_mask, halo_mask) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMH0(reg_mask, halo_mask) )) + +#define TT_OP_SETASHRMH1(reg_mask, halo_mask) \ + TT_OP(0x1b, (((reg_mask) << 1) + ((halo_mask) << 0))) +#define TT_SETASHRMH1_VALID(reg_mask, halo_mask) \ + (ckernel::is_valid(reg_mask, 23) && ckernel::is_valid(halo_mask, 1)) +#define TT_SETASHRMH1(reg_mask, halo_mask) \ + ckernel::instrn_buffer[0] = TT_OP_SETASHRMH1(reg_mask, halo_mask) +#define TTI_SETASHRMH1(reg_mask, halo_mask) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMH1(reg_mask, halo_mask) )) + +#define TT_OP_SETASHRMV(reg_mask2) \ + TT_OP(0x1c, (((reg_mask2) << 0))) +#define TT_SETASHRMV_VALID(reg_mask2) \ + (ckernel::is_valid(reg_mask2, 24)) +#define TT_SETASHRMV(reg_mask2) \ + ckernel::instrn_buffer[0] = TT_OP_SETASHRMV(reg_mask2) +#define TTI_SETASHRMV(reg_mask2) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETASHRMV(reg_mask2) )) + +#define TT_OP_SETC16(setc16_reg, setc16_value) \ + TT_OP(0xb2, (((setc16_reg) << 16) + ((setc16_value) << 0))) +#define TT_SETC16_VALID(setc16_reg, setc16_value) \ + (ckernel::is_valid(setc16_reg, 8) && ckernel::is_valid(setc16_value, 16)) +#define TT_SETC16(setc16_reg, setc16_value) \ 
+ ckernel::instrn_buffer[0] = TT_OP_SETC16(setc16_reg, setc16_value) +#define TTI_SETC16(setc16_reg, setc16_value) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETC16(setc16_reg, setc16_value) )) + +#define TT_OP_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \ + TT_OP(0x45, (((Payload_SigSelSize) << 22) + ((Payload_SigSel) << 8) + ((SetSignalsMode) << 7) + ((RegIndex16b) << 0))) +#define TT_SETDMAREG_VALID(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \ + (ckernel::is_valid(Payload_SigSelSize, 2) && ckernel::is_valid(Payload_SigSel, 14) && ckernel::is_valid(SetSignalsMode, 1) && ckernel::is_valid(RegIndex16b, 7)) +#define TT_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \ + ckernel::instrn_buffer[0] = TT_OP_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) +#define TTI_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETDMAREG(Payload_SigSelSize, Payload_SigSel, SetSignalsMode, RegIndex16b) )) + +#define TT_OP_SETDVALID(setvalid) \ + TT_OP(0x57, (((setvalid) << 0))) +#define TT_SETDVALID_VALID(setvalid) \ + (ckernel::is_valid(setvalid, 24)) +#define TT_SETDVALID(setvalid) \ + ckernel::instrn_buffer[0] = TT_OP_SETDVALID(setvalid) +#define TTI_SETDVALID(setvalid) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETDVALID(setvalid) )) + +#define TT_OP_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) \ + TT_OP(0x39, (((rwc_cr) << 18) + ((rwc_bias) << 6) + ((set_inc_ctrl) << 0))) +#define TT_SETIBRWC_VALID(rwc_cr, rwc_bias, set_inc_ctrl) \ + (ckernel::is_valid(rwc_cr, 6) && ckernel::is_valid(rwc_bias, 12) && ckernel::is_valid(set_inc_ctrl, 6)) +#define TT_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) \ + ckernel::instrn_buffer[0] = TT_OP_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) +#define TTI_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETIBRWC(rwc_cr, rwc_bias, set_inc_ctrl) )) + 
+#define TT_OP_SETPKEDGOF(y_end, y_start, x_end, x_start) \ + TT_OP(0x1d, (((y_end) << 12) + ((y_start) << 8) + ((x_end) << 4) + ((x_start) << 0))) +#define TT_SETPKEDGOF_VALID(y_end, y_start, x_end, x_start) \ + (ckernel::is_valid(y_end, 12) && ckernel::is_valid(y_start, 4) && ckernel::is_valid(x_end, 4) && ckernel::is_valid(x_start, 4)) +#define TT_SETPKEDGOF(y_end, y_start, x_end, x_start) \ + ckernel::instrn_buffer[0] = TT_OP_SETPKEDGOF(y_end, y_start, x_end, x_start) +#define TTI_SETPKEDGOF(y_end, y_start, x_end, x_start) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETPKEDGOF(y_end, y_start, x_end, x_start) )) + +#define TT_OP_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \ + TT_OP(0x37, (((clear_ab_vld) << 22) + ((rwc_cr) << 18) + ((rwc_d) << 14) + ((rwc_b) << 10) + ((rwc_a) << 6) + ((BitMask) << 0))) +#define TT_SETRWC_VALID(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \ + (ckernel::is_valid(clear_ab_vld, 2) && ckernel::is_valid(rwc_cr, 4) && ckernel::is_valid(rwc_d, 4) && ckernel::is_valid(rwc_b, 4) && ckernel::is_valid(rwc_a, 4) && ckernel::is_valid(BitMask, 6)) +#define TT_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \ + ckernel::instrn_buffer[0] = TT_OP_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) +#define TTI_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SETRWC(clear_ab_vld, rwc_cr, rwc_d, rwc_b, rwc_a, BitMask) )) + +#define TT_OP_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x7d, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPABS_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define 
TTI_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPABS(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + TT_OP(0x85, (((lreg_src_a) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPADD_VALID(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(lreg_src_a, 8) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) +#define TTI_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPADD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPADDI(imm16_math, lreg_dest, instr_mod1) \ + TT_OP(0x75, (((imm16_math) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPADDI_VALID(imm16_math, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm16_math, 16) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPADDI(imm16_math, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPADDI(imm16_math, lreg_dest, instr_mod1) +#define TTI_SFPADDI(imm16_math, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPADDI(imm16_math, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x7e, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPAND_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define 
TT_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPAND(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) \ + TT_OP(0x90, (((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPCAST_VALID(lreg_src_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(lreg_src_c, 16) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) +#define TTI_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPCAST(lreg_src_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x8b, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPCOMPC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPCOMPC(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPCONFIG(imm16_math, config_dest, instr_mod1) \ + TT_OP(0x91, (((imm16_math) << 8) + ((config_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPCONFIG_VALID(imm16_math, config_dest, instr_mod1) \ + (ckernel::is_valid(imm16_math, 16) && ckernel::is_valid(config_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPCONFIG(imm16_math, config_dest, instr_mod1) \ + 
ckernel::instrn_buffer[0] = TT_OP_SFPCONFIG(imm16_math, config_dest, instr_mod1) +#define TTI_SFPCONFIG(imm16_math, config_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPCONFIG(imm16_math, config_dest, instr_mod1) )) + +#define TT_OP_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x76, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPDIVP2_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPDIVP2(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x8a, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPENCC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPENCC(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x77, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPEXEXP_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && 
ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPEXEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x78, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPEXMAN_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPEXMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x79, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPIADD_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPIADD(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + TT_OP(0x70, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((sfpu_addr_mode) << 14) + ((dest_reg_addr) << 0))) +#define TT_SFPLOAD_VALID(lreg_ind, instr_mod0, 
sfpu_addr_mode, dest_reg_addr) \ + (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(sfpu_addr_mode, 2) && ckernel::is_valid(dest_reg_addr, 14)) +#define TT_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + ckernel::instrn_buffer[0] = TT_OP_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) +#define TTI_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLOAD(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) )) + +#define TT_OP_SFPLOADI(lreg_ind, instr_mod0, imm16) \ + TT_OP(0x71, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((imm16) << 0))) +#define TT_SFPLOADI_VALID(lreg_ind, instr_mod0, imm16) \ + (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(imm16, 16)) +#define TT_SFPLOADI(lreg_ind, instr_mod0, imm16) \ + ckernel::instrn_buffer[0] = TT_OP_SFPLOADI(lreg_ind, instr_mod0, imm16) +#define TTI_SFPLOADI(lreg_ind, instr_mod0, imm16) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLOADI(lreg_ind, instr_mod0, imm16) )) + +#define TT_OP_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + TT_OP(0x93, (((lreg_ind) << 20) + ((instr_mod0) << 16) + ((sfpu_addr_mode) << 14) + ((dest_reg_addr) << 0))) +#define TT_SFPLOADMACRO_VALID(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(sfpu_addr_mode, 2) && ckernel::is_valid(dest_reg_addr, 14)) +#define TT_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + ckernel::instrn_buffer[0] = TT_OP_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) +#define TTI_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLOADMACRO(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) )) + +#define TT_OP_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) \ + TT_OP(0x73, (((lreg_ind) << 20) + 
((instr_mod0) << 16) + ((dest_reg_addr) << 0))) +#define TT_SFPLUT_VALID(lreg_ind, instr_mod0, dest_reg_addr) \ + (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(dest_reg_addr, 16)) +#define TT_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) \ + ckernel::instrn_buffer[0] = TT_OP_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) +#define TTI_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLUT(lreg_ind, instr_mod0, dest_reg_addr) )) + +#define TT_OP_SFPLUTFP32(lreg_dest, instr_mod1) \ + TT_OP(0x95, (((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPLUTFP32_VALID(lreg_dest, instr_mod1) \ + (ckernel::is_valid(lreg_dest, 20) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPLUTFP32(lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPLUTFP32(lreg_dest, instr_mod1) +#define TTI_SFPLUTFP32(lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLUTFP32(lreg_dest, instr_mod1) )) + +#define TT_OP_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x81, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPLZ_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPLZ(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + TT_OP(0x84, (((lreg_src_a) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPMAD_VALID(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + 
(ckernel::is_valid(lreg_src_a, 8) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) +#define TTI_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMAD(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x7c, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPMOV_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMOV(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + TT_OP(0x86, (((lreg_src_a) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPMUL_VALID(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(lreg_src_a, 8) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) +#define TTI_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + 
INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMUL(lreg_src_a, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPMULI(imm16_math, lreg_dest, instr_mod1) \ + TT_OP(0x74, (((imm16_math) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPMULI_VALID(imm16_math, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm16_math, 16) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPMULI(imm16_math, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPMULI(imm16_math, lreg_dest, instr_mod1) +#define TTI_SFPMULI(imm16_math, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPMULI(imm16_math, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPNOP\ + TT_OP(0x8f, 0) +#define TTI_SFPNOP\ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPNOP)) + +#define TT_OP_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x80, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPNOT_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPNOT(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x7f, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPOR_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = 
TT_OP_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPOR(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x88, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPPOPC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPPOPC(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x87, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPPUSHC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPPUSHC(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x7b, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPSETCC_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) 
+#define TT_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETCC(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x82, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPSETEXP_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETEXP(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x83, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPSETMAN_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETMAN(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x89, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPSETSGN_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + 
(ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSETSGN(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x7a, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPSHFT_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSHFT(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ + TT_OP(0x94, (((imm12_math) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPSHFT2_VALID(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) +#define TTI_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSHFT2(imm12_math, lreg_src_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + TT_OP(0x72, (((lreg_ind) << 
20) + ((instr_mod0) << 16) + ((sfpu_addr_mode) << 14) + ((dest_reg_addr) << 0))) +#define TT_SFPSTORE_VALID(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + (ckernel::is_valid(lreg_ind, 4) && ckernel::is_valid(instr_mod0, 4) && ckernel::is_valid(sfpu_addr_mode, 2) && ckernel::is_valid(dest_reg_addr, 14)) +#define TT_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + ckernel::instrn_buffer[0] = TT_OP_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) +#define TTI_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSTORE(lreg_ind, instr_mod0, sfpu_addr_mode, dest_reg_addr) )) + +#define TT_OP_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ + TT_OP(0x92, (((imm12_math) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPSWAP_VALID(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) +#define TTI_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPSWAP(imm12_math, lreg_src_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x8c, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPTRANSP_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPTRANSP(imm12_math, lreg_c, lreg_dest, 
instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPTRANSP(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + TT_OP(0x8d, (((imm12_math) << 12) + ((lreg_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFPXOR_VALID(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(imm12_math, 12) && ckernel::is_valid(lreg_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) +#define TTI_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFPXOR(imm12_math, lreg_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + TT_OP(0x8e, (((rnd_mode) << 21) + ((imm8_math) << 16) + ((lreg_src_b) << 12) + ((lreg_src_c) << 8) + ((lreg_dest) << 4) + ((instr_mod1) << 0))) +#define TT_SFP_STOCH_RND_VALID(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + (ckernel::is_valid(rnd_mode, 3) && ckernel::is_valid(imm8_math, 5) && ckernel::is_valid(lreg_src_b, 4) && ckernel::is_valid(lreg_src_c, 4) && ckernel::is_valid(lreg_dest, 4) && ckernel::is_valid(instr_mod1, 4)) +#define TT_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + ckernel::instrn_buffer[0] = TT_OP_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) +#define TTI_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SFP_STOCH_RND(rnd_mode, imm8_math, lreg_src_b, lreg_src_c, lreg_dest, instr_mod1) )) + +#define TT_OP_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + TT_OP(0x5c, (((OpBisConst) << 23) + ((OpSel) << 18) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + 
((OpARegIndex) << 0))) +#define TT_SHIFTDMAREG_VALID(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(OpSel, 5) && ckernel::is_valid(ResultRegIndex, 6) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6)) +#define TT_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) +#define TTI_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SHIFTDMAREG(OpBisConst, OpSel, ResultRegIndex, OpBRegIndex, OpARegIndex) )) + +#define TT_OP_SHIFTXA(log2_amount2, shift_mode) \ + TT_OP(0x17, (((log2_amount2) << 2) + ((shift_mode) << 0))) +#define TT_SHIFTXA_VALID(log2_amount2, shift_mode) \ + (ckernel::is_valid(log2_amount2, 22) && ckernel::is_valid(shift_mode, 2)) +#define TT_SHIFTXA(log2_amount2, shift_mode) \ + ckernel::instrn_buffer[0] = TT_OP_SHIFTXA(log2_amount2, shift_mode) +#define TTI_SHIFTXA(log2_amount2, shift_mode) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SHIFTXA(log2_amount2, shift_mode) )) + +#define TT_OP_SHIFTXB(addr_mode, rot_shift, shift_row) \ + TT_OP(0x18, (((addr_mode) << 15) + ((rot_shift) << 10) + ((shift_row) << 0))) +#define TT_SHIFTXB_VALID(addr_mode, rot_shift, shift_row) \ + (ckernel::is_valid(addr_mode, 9) && ckernel::is_valid(rot_shift, 5) && ckernel::is_valid(shift_row, 10)) +#define TT_SHIFTXB(addr_mode, rot_shift, shift_row) \ + ckernel::instrn_buffer[0] = TT_OP_SHIFTXB(addr_mode, rot_shift, shift_row) +#define TTI_SHIFTXB(addr_mode, rot_shift, shift_row) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SHIFTXB(addr_mode, rot_shift, shift_row) )) + +#define TT_OP_STALLWAIT(stall_res, wait_res) \ + TT_OP(0xa2, (((stall_res) << 15) + ((wait_res) << 0))) +#define TT_STALLWAIT_VALID(stall_res, wait_res) \ + (ckernel::is_valid(stall_res, 9) && ckernel::is_valid(wait_res, 
15)) +#define TT_STALLWAIT(stall_res, wait_res) \ + ckernel::instrn_buffer[0] = TT_OP_STALLWAIT(stall_res, wait_res) +#define TTI_STALLWAIT(stall_res, wait_res) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_STALLWAIT(stall_res, wait_res) )) + +#define TT_OP_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ + TT_OP(0x66, (((MemHierSel) << 23) + ((SizeSel) << 22) + ((RegSizeSel) << 21) + ((OffsetIndex) << 14) + ((AutoIncSpec) << 12) + ((DataRegIndex) << 6) + ((AddrRegIndex) << 0))) +#define TT_STOREIND_VALID(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ + (ckernel::is_valid(MemHierSel, 1) && ckernel::is_valid(SizeSel, 1) && ckernel::is_valid(RegSizeSel, 1) && ckernel::is_valid(OffsetIndex, 7) && ckernel::is_valid(AutoIncSpec, 2) && ckernel::is_valid(DataRegIndex, 6) && ckernel::is_valid(AddrRegIndex, 6)) +#define TT_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) +#define TTI_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_STOREIND(MemHierSel, SizeSel, RegSizeSel, OffsetIndex, AutoIncSpec, DataRegIndex, AddrRegIndex) )) + +#define TT_OP_STOREREG(TdmaDataRegIndex, RegAddr) \ + TT_OP(0x67, (((TdmaDataRegIndex) << 18) + ((RegAddr) << 0))) +#define TT_STOREREG_VALID(TdmaDataRegIndex, RegAddr) \ + (ckernel::is_valid(TdmaDataRegIndex, 6) && ckernel::is_valid(RegAddr, 18)) +#define TT_STOREREG(TdmaDataRegIndex, RegAddr) \ + ckernel::instrn_buffer[0] = TT_OP_STOREREG(TdmaDataRegIndex, RegAddr) +#define TTI_STOREREG(TdmaDataRegIndex, RegAddr) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_STOREREG(TdmaDataRegIndex, RegAddr) )) + +#define TT_OP_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + TT_OP(0x59, 
(((OpBisConst) << 23) + ((ResultRegIndex) << 12) + ((OpBRegIndex) << 6) + ((OpARegIndex) << 0))) +#define TT_SUBDMAREG_VALID(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + (ckernel::is_valid(OpBisConst, 1) && ckernel::is_valid(ResultRegIndex, 11) && ckernel::is_valid(OpBRegIndex, 6) && ckernel::is_valid(OpARegIndex, 6)) +#define TT_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + ckernel::instrn_buffer[0] = TT_OP_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) +#define TTI_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_SUBDMAREG(OpBisConst, ResultRegIndex, OpBRegIndex, OpARegIndex) )) + +#define TT_OP_TBUFCMD\ + TT_OP(0x4b, 0) +#define TTI_TBUFCMD\ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_TBUFCMD)) + +#define TT_OP_TRNSPSRCA\ + TT_OP(0x14, 0) +#define TTI_TRNSPSRCA\ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_TRNSPSRCA)) + +#define TT_OP_TRNSPSRCB\ + TT_OP(0x16, 0) +#define TTI_TRNSPSRCB\ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_TRNSPSRCB)) + +#define TT_OP_UNPACR(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) \ + TT_OP(0x42, (((Unpack_block_selection) << 23) + ((AddrMode) << 15) + ((CfgContextCntInc) << 13) + ((CfgContextId) << 10) + ((AddrCntContextId) << 8) + ((OvrdThreadId) << 7) + ((SetDatValid) << 6) + ((rareb_en) << 5) + ((ZeroWrite2) << 4) + ((AutoIncContextID) << 3) + ((RowSearch) << 2) + ((SearchCacheFlush) << 1) + ((Last) << 0))) +#define TT_UNPACR_VALID(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) \ + (ckernel::is_valid(Unpack_block_selection, 1) && ckernel::is_valid(AddrMode, 8) && ckernel::is_valid(CfgContextCntInc, 2) && ckernel::is_valid(CfgContextId, 3) && 
ckernel::is_valid(AddrCntContextId, 2) && ckernel::is_valid(OvrdThreadId, 1) && ckernel::is_valid(SetDatValid, 1) && ckernel::is_valid(rareb_en, 1) && ckernel::is_valid(ZeroWrite2, 1) && ckernel::is_valid(AutoIncContextID, 1) && ckernel::is_valid(RowSearch, 1) && ckernel::is_valid(SearchCacheFlush, 1) && ckernel::is_valid(Last, 1)) +#define TT_UNPACR(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) \ + ckernel::instrn_buffer[0] = TT_OP_UNPACR(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) +#define TTI_UNPACR(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_UNPACR(Unpack_block_selection, AddrMode, CfgContextCntInc, CfgContextId, AddrCntContextId, OvrdThreadId, SetDatValid, rareb_en, ZeroWrite2, AutoIncContextID, RowSearch, SearchCacheFlush, Last) )) + +#define TT_OP_UNPACR_NOP(Unpack_block_selection, NoOp) \ + TT_OP(0x43, (((Unpack_block_selection) << 23) + ((NoOp) << 0))) +#define TT_UNPACR_NOP_VALID(Unpack_block_selection, NoOp) \ + (ckernel::is_valid(Unpack_block_selection, 1) && ckernel::is_valid(NoOp, 23)) +#define TT_UNPACR_NOP(Unpack_block_selection, NoOp) \ + ckernel::instrn_buffer[0] = TT_OP_UNPACR_NOP(Unpack_block_selection, NoOp) +#define TTI_UNPACR_NOP(Unpack_block_selection, NoOp) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_UNPACR_NOP(Unpack_block_selection, NoOp) )) + +#define TT_OP_WRCFG(GprAddress, wr128b, CfgReg) \ + TT_OP(0xb0, (((GprAddress) << 16) + ((wr128b) << 15) + ((CfgReg) << 0))) +#define TT_WRCFG_VALID(GprAddress, wr128b, CfgReg) \ + (ckernel::is_valid(GprAddress, 8) && 
ckernel::is_valid(wr128b, 1) && ckernel::is_valid(CfgReg, 15)) +#define TT_WRCFG(GprAddress, wr128b, CfgReg) \ + ckernel::instrn_buffer[0] = TT_OP_WRCFG(GprAddress, wr128b, CfgReg) +#define TTI_WRCFG(GprAddress, wr128b, CfgReg) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_WRCFG(GprAddress, wr128b, CfgReg) )) + +#define TT_OP_XMOV(Mov_block_selection, Last) \ + TT_OP(0x40, (((Mov_block_selection) << 23) + ((Last) << 0))) +#define TT_XMOV_VALID(Mov_block_selection, Last) \ + (ckernel::is_valid(Mov_block_selection, 1) && ckernel::is_valid(Last, 23)) +#define TT_XMOV(Mov_block_selection, Last) \ + ckernel::instrn_buffer[0] = TT_OP_XMOV(Mov_block_selection, Last) +#define TTI_XMOV(Mov_block_selection, Last) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_XMOV(Mov_block_selection, Last) )) + +#define TT_OP_ZEROACC(clear_mode, AddrMode, dst) \ + TT_OP(0x10, (((clear_mode) << 19) + ((AddrMode) << 15) + ((dst) << 0))) +#define TT_ZEROACC_VALID(clear_mode, AddrMode, dst) \ + (ckernel::is_valid(clear_mode, 5) && ckernel::is_valid(AddrMode, 4) && ckernel::is_valid(dst, 15)) +#define TT_ZEROACC(clear_mode, AddrMode, dst) \ + ckernel::instrn_buffer[0] = TT_OP_ZEROACC(clear_mode, AddrMode, dst) +#define TTI_ZEROACC(clear_mode, AddrMode, dst) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ZEROACC(clear_mode, AddrMode, dst) )) + +#define TT_OP_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) \ + TT_OP(0x11, (((zero_val) << 4) + ((write_mode) << 3) + ((bank_mask) << 2) + ((src_mask) << 0))) +#define TT_ZEROSRC_VALID(zero_val, write_mode, bank_mask, src_mask) \ + (ckernel::is_valid(zero_val, 20) && ckernel::is_valid(write_mode, 1) && ckernel::is_valid(bank_mask, 1) && ckernel::is_valid(src_mask, 2)) +#define TT_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) \ + ckernel::instrn_buffer[0] = TT_OP_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) +#define TTI_ZEROSRC(zero_val, write_mode, bank_mask, src_mask) \ + INSTRUCTION_WORD(TRISC_OP_SWIZZLE(TT_OP_ZEROSRC(zero_val, write_mode,
bank_mask, src_mask) )) diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_pcbuf.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_pcbuf.h index d0c2c755411..ef70dc53f88 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_pcbuf.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_pcbuf.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once // Functions for encoding and decoding PC buffer writes diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h index 0dd06c65dc8..38d054cd6b1 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpi.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #include "ckernel_template.h" #include "ckernel.h" #include "cmath_common.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h index e7b80e9cabf..b00ea4a0b1f 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h @@ -2,12 +2,12 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel_defs.h" #include "noc_nonblocking_api.h" #include "ckernel.h" -#include "llk_defs.h" #include #include "sfpi.h" @@ -20,22 +20,27 @@ namespace sfpu { -inline void sfpu_load_imm32(const uint dest, const uint val) +inline void _sfpu_load_imm32_(const uint dest, const uint val) +{ + TT_SFPLOADI(dest, 10, (val & 0xFFFF)); // insmod == 10 will write the lower bits, and not affect the upper bits; + TT_SFPLOADI(dest, 8, (val>>16) & 0xFFFF); // insmod == 8 will write the upper bits, and not affect the lower bits; +} + +inline void _sfpu_load_imm16_(const uint dest, const uint val) { - TT_SFPLOADI(dest, 0xA, (val & 0xFFFF)); // insmod == A will write the lower bits, and not affect the upper bits; - 
TT_SFPLOADI(dest, 0x8, (val>>16) & 0xFFFF); // insmod == 8 will write the upper bits, and not affect the lower bits; + TT_SFPLOADI(dest, 2, val); // insmod == 2 will write imm16 value treated as unsigned integer, right justified and padded with zeroes on the MSBs } -inline void sfpu_load_config32(const uint dest, const uint upper16, const uint lower16) +inline void _sfpu_load_config32_(const uint dest, const uint upper16, const uint lower16) { // registers 11 through 14 are programmable "constants" which are shared across all 4 rows // They are updated only through the CONFIG path, which uses LREG[0] first and then copies it to the desired register location - TTI_SFPLOADI(0, 0xA, lower16); // insmod == A will write the lower bits, and not affect the upper bits; - TTI_SFPLOADI(0, 0x8, upper16); // insmod == 8 will write the upper bits, and not affect the lower bits; + TTI_SFPLOADI(0, 10, lower16); // insmod == A will write the lower bits, and not affect the upper bits; + TTI_SFPLOADI(0, 8, upper16); // insmod == 8 will write the upper bits, and not affect the lower bits; TTI_SFPCONFIG(0, dest, 0); } -sfpi_inline vInt sfpu_is_fp16_zero(const vFloat& v, uint exponent_size_8) +sfpi_inline vInt _sfpu_is_fp16_zero_(const vFloat& v, uint exponent_size_8) { if (exponent_size_8) { // fp16b @@ -114,7 +119,8 @@ sfpi_inline vFloat _sfpu_reciprocal_(const vFloat in) return setexp(result, new_exp); } -inline void init_dropout_seed(uint16_t p2){ +inline void _init_dropout_seed_(uint16_t p2){ + FWLOG1("calculate_dropout() -- input seed:%x", p2); uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(0, 0, NOC_NODE_ID); @@ -123,6 +129,8 @@ inline void init_dropout_seed(uint16_t p2){ uint16_t per_tensix_input_seed = p2 ^ (my_x << my_y); + FWLOG1("calculate_dropout() -- calculated seed:%x", per_tensix_input_seed); + vInt result = l_reg[LRegs::LReg3]; vInt tmp = vConstTileId << 10; @@ -133,217 +141,187 @@ inline void init_dropout_seed(uint16_t p2){ } template -inline void 
configure_programmable_constants(SfpuType operation) +inline void _init_exponential_() { - switch (operation) { - case SfpuType::gelu: - vConstFloatPrgm0 = 0.5f; - break; - case SfpuType::exponential: - if (APPROXIMATION_MODE) { - vConstFloatPrgm0 = 1.442695f; // ln2_recip - vConstFloatPrgm1 = s2vFloat16b(p_exp::C23_73); - vConstFloatPrgm2 = s2vFloat16b(p_exp::ADJ_EXP); - break; - } - - - - // Fall through - case SfpuType::gelu_derivative: - vConstFloatPrgm2 = 0.863281f; - - // Fall through - case SfpuType::reciprocal: + if (APPROXIMATION_MODE) { + vConstFloatPrgm0 = 1.442695f; // ln2_recip + vConstFloatPrgm1 = s2vFloat16b(p_exp::C23_73); + vConstFloatPrgm2 = s2vFloat16b(p_exp::ADJ_EXP); + } else { vConstFloatPrgm0 = 1.442695f; // ln2_recip vConstFloatPrgm1 = 2.0f; - break; - - case SfpuType::log: - // ln2 - vConstFloatPrgm0 = 0.692871f; // ln2 + vConstFloatPrgm2 = 0.863281f; + } +} - // XXXXX could do these to higher precision - vConstFloatPrgm1 = 0.1058f; - vConstFloatPrgm2 = -0.7166f; - break; +template +inline void _init_reciprocal_() +{ + vConstFloatPrgm0 = 1.442695f; // ln2_recip + vConstFloatPrgm1 = 2.0f; +} - case SfpuType::sqrt: - if (APPROXIMATION_MODE) { - vConstFloatPrgm0 = s2vFloat16b(127 << 7); - } else { - vConstFloatPrgm0 = s2vFloat16b(0x5f37); - } - break; +template +inline void _init_log_() +{ + vConstFloatPrgm0 = 0.692871f; // ln2 - case SfpuType::dropout: - vConstIntPrgm0 = 0xb400; - vConstIntPrgm1 = 0x1; // binary 0b1 - used to extract LSB - break; + // XXXXX could do these to higher precision + vConstFloatPrgm1 = 0.1058f; + vConstFloatPrgm2 = -0.7166f; +} - default: - // Should result in compile time error?? 
- break; +template +inline void _init_sqrt_() +{ + if (APPROXIMATION_MODE) { + vConstFloatPrgm0 = s2vFloat16b(127 << 7); + } else { + vConstFloatPrgm0 = s2vFloat16b(0x5f37); } } template -inline void sfpu_init(SfpuType operation, uint param0 = 0) +inline void _init_tanh_() { - configure_programmable_constants(operation); uint imm0; uint imm1; uint imm2; - uint imm0_high; - uint imm0_low; - uint imm1_high; - uint imm1_low; - uint imm2_high; - uint imm2_low; - uint imm3_high; - uint imm3_low; - uint imm4_high; - uint imm4_low; - uint imm5_high; - uint imm5_low; - switch (operation) { - case SfpuType::tanh: - case SfpuType::tanh_derivative: - imm0 = 0x1DFF; //0.90625*x - imm1 = 0x481A; //0.09375*x + 0.8125 - imm2 = 0xFF00; //1 - TTI_SFPLOADI(0, 2, imm0); - TTI_SFPLOADI(1, 2, imm1); - TTI_SFPLOADI(2, 2, imm2); - break; - case SfpuType::sigmoid: - // imm0 = 0x3DFF; - // imm1 = 0x21D8; - // imm2 = 0xFF10; - // TTI_SFPLOADI(0, 2, imm0); - // TTI_SFPLOADI(1, 2, imm1); - // TTI_SFPLOADI(2, 2, imm2); - // Using a 6 piece LUT to calculate and model sigmoid directly - // x <= 0.5 --> 0.2452x + (-0.0004997) - // x <= 1.0 --> 0.2173x + 0.0152 - // x <= 1.5 --> 0.1731x + 0.05988 - // x <= 2.0 --> 0.1262x + 0.1298 - // x <= 4.0 --> 0.0485x + 0.2998 - // x > 4.0 --> 0.4998 - - // imm0[15:0] = A0=0.2452 = 0x33D9 -- imm0[31:16] = A1=0.2173 = 0x32F4 - sfpu_load_imm32(0,0x32F433D9); - // imm4[15:0] = B0= -0.0004997 = 0x9018 -- imm4[31:16] = B1= 0.0152 = 0x23c8 - sfpu_load_imm32(4,0x23C89018); - - // imm1[15:0] = A2=0.1731 = 0x318a -- imm1[31:16] = A3=0.1262 = 0x300a - sfpu_load_imm32(1,0x300A318A); - // imm5[15:0] = B2=0.05988 = 0x2BAA -- imm5[31:16] = B3=0.1298 = 0x3027 - sfpu_load_imm32(5,0x30272BAA); - - // imm2[15:0] = A4=0.0485 = 0x2A35 -- imm2[31:16] = A5=0.0 = 0x7C00 - sfpu_load_imm32(2,0x7C002A35); - // imm6[15:0] = B4=0.2998 = 0x34CC -- imm6[31:16] = B5=0.4998 = 0x37ff - sfpu_load_imm32(6,0x37ff34CC); - - break; - case SfpuType::gelu_derivative: - if constexpr 
(APPROXIMATION_MODE) { - // Using a 6 piece LUT to calculate and model gelu_derivative directly - // x <= 0.5 --> 0.8x + 0.5 - // x <= 1.0 --> 0.4x + 0.7 - // x <= 1.5 --> 0.1x + 0.99 - // x <= 2.0 --> -0.09x + 1.27 - // x <= 3.0 --> -0.075x + 1.235 - // x > 3.0 --> 1.0 - // imm0[15:0] = A0=0.8 = 0x3A66 -- imm0[31:16] = A1=0.4 = 0x3666 - imm0_high = 0x3666; - imm0_low = 0x3A66; - // imm1[15:0] = A2=0.1 = 0x2E66 -- imm1[31:16] = A3=-0.09 = 0xADC3 - imm1_high = 0xADC3; - imm1_low = 0x2E66; - // imm2[15:0] = A4=-0.075 = 0xACCD -- imm2[31:16] = A5=0 = 0x7C00 - imm2_high = 0x7C00; - imm2_low = 0xACCD; - // imm3[15:0] = B0=0.5 = 0x3800 -- imm3[31:16] = B1=0.7 = 0x399A - imm3_high = 0x399A; - imm3_low = 0x3800; - // imm4[15:0] = B2=0.99 = 0x3BEC -- imm4[31:16] = B3=1.27 = 0x3D14 - imm4_high = 0x3D14; - imm4_low = 0x3BEC; - // imm5[15:0] = B4=1.235 = 0x3CF1 -- imm5[31:16] = B5=1.0 = 0x3C00 - imm5_high = 0x3C00; - imm5_low = 0x3CF1; - TTI_SFPLOADI(0, 10, imm0_low); - TTI_SFPLOADI(0, 8, imm0_high); - TTI_SFPLOADI(1, 10, imm1_low); - TTI_SFPLOADI(1, 8, imm1_high); - TTI_SFPLOADI(2, 10, imm2_low); - TTI_SFPLOADI(2, 8, imm2_high); - TTI_SFPLOADI(4, 10, imm3_low); - TTI_SFPLOADI(4, 8, imm3_high); - TTI_SFPLOADI(5, 10, imm4_low); - TTI_SFPLOADI(5, 8, imm4_high); - TTI_SFPLOADI(6, 10, imm5_low); - TTI_SFPLOADI(6, 8, imm5_high); - } else { - imm0 = 0x28FF; - imm1 = 0x3020; - TTI_SFPLOADI(0, 2, imm0); - TTI_SFPLOADI(1, 2, imm1); - } - break; - case SfpuType::gelu: - // //SG: FIXME - // imm0 = 0x18FF; - // imm1 = (APPROXIMATION_MODE)? 
0x212C : 0x2010; - // imm2 = 0xFF00; - // TTI_SFPLOADI(0, 2, imm0); - // TTI_SFPLOADI(1, 2, imm1); - // TTI_SFPLOADI(2, 2, imm2); - - // // >= 3.0f - // lreg2_hi=0.50;//3800 - // lreg6_hi=0.0f;//7c00 - // // 2.0f -> 3.0f - // lreg2_lo= 0.5402f;//3852 - // lreg6_lo= -0.1194f;//AFA4 - // // 1.5f -> 2.0f - // lreg1_hi= .6099f; //38E1 - // lreg5_hi= -.2635f; //B437 - // // 1.0f -> 1.5f - // lreg1_lo=0.6189;//38F3 - // lreg5_lo=-.2797;//B479 - // // 0.5f -> 1.0f - // lreg0_hi=.4939f;//37E7 - // lreg4_hi=-.1605f;//B122 - // // 0.0f -> 0.5f - // lreg0_lo=0.1928f;//322B - // lreg4_lo=-0.0150f;//A3AE - sfpu_load_imm32(0,0x37E7322B); - //sfpu_load_imm32(4,0xB122A3AE); - sfpu_load_imm32(4,0xB12286D8); - - - sfpu_load_imm32(1,0x38E138F3); - sfpu_load_imm32(5,0xB437B479); - - sfpu_load_imm32(2,0x38003852); - sfpu_load_imm32(6,0x7c00afa4); - - break; - case SfpuType::dropout: - init_dropout_seed(param0); - break; - case SfpuType::quant_int32: - case SfpuType::requant_int32: - case SfpuType::dequant_int32: - sfpu_load_imm32(2,param0); - break; - default: - // Should result in compile time error?? 
- break; + imm0 = 0x1DFF; //0.90625*x + imm1 = 0x481A; //0.09375*x + 0.8125 + imm2 = 0xFF00; //1 + _sfpu_load_imm16_(0, imm0); + _sfpu_load_imm16_(1, imm1); + _sfpu_load_imm16_(2, imm2); +} + +template +inline void _init_sigmoid_() +{ + // imm0 = 0x3DFF; + // imm1 = 0x21D8; + // imm2 = 0xFF10; + // TTI_SFPLOADI(0, 2, imm0); + // TTI_SFPLOADI(1, 2, imm1); + // TTI_SFPLOADI(2, 2, imm2); + // Using a 6 piece LUT to calculate and model sigmoid directly + // x <= 0.5 --> 0.2452x + (-0.0004997) + // x <= 1.0 --> 0.2173x + 0.0152 + // x <= 1.5 --> 0.1731x + 0.05988 + // x <= 2.0 --> 0.1262x + 0.1298 + // x <= 4.0 --> 0.0485x + 0.2998 + // x > 4.0 --> 0.4998 + + // imm0[15:0] = A0=0.2452 = 0x33D9 -- imm0[31:16] = A1=0.2173 = 0x32F4 + _sfpu_load_imm32_(0,0x32F433D9); + // imm4[15:0] = B0= -0.0004997 = 0x9018 -- imm4[31:16] = B1= 0.0152 = 0x23c8 + _sfpu_load_imm32_(4,0x23C89018); + + // imm1[15:0] = A2=0.1731 = 0x318a -- imm1[31:16] = A3=0.1262 = 0x300a + _sfpu_load_imm32_(1,0x300A318A); + // imm5[15:0] = B2=0.05988 = 0x2BAA -- imm5[31:16] = B3=0.1298 = 0x3027 + _sfpu_load_imm32_(5,0x30272BAA); + + // imm2[15:0] = A4=0.0485 = 0x2A35 -- imm2[31:16] = A5=0.0 = 0x7C00 + _sfpu_load_imm32_(2,0x7C002A35); + // imm6[15:0] = B4=0.2998 = 0x34CC -- imm6[31:16] = B5=0.4998 = 0x37ff + _sfpu_load_imm32_(6,0x37ff34CC); +} + +template +inline void _init_gelu_derivative_() +{ + vConstFloatPrgm0 = 1.442695f; // ln2_recip + vConstFloatPrgm1 = 2.0f; + vConstFloatPrgm2 = 0.863281f; + + uint imm0; + uint imm1; + uint imm2; + uint imm3; + uint imm4; + uint imm5; + + if constexpr (APPROXIMATION_MODE) { + // Using a 6 piece LUT to calculate and model gelu_derivative directly + // x <= 0.5 --> 0.8x + 0.5 + // x <= 1.0 --> 0.4x + 0.7 + // x <= 1.5 --> 0.1x + 0.99 + // x <= 2.0 --> -0.09x + 1.27 + // x <= 3.0 --> -0.075x + 1.235 + // x > 3.0 --> 1.0 + // imm0[15:0] = A0=0.8 = 0x3A66 -- imm0[31:16] = A1=0.4 = 0x3666 + imm0 = 0x36663A66; + // imm1[15:0] = A2=0.1 = 0x2E66 -- imm1[31:16] = A3=-0.09 = 
0xADC3 + imm1 = 0xADC32E66; + // imm2[15:0] = A4=-0.075 = 0xACCD -- imm2[31:16] = A5=0 = 0x7C00 + imm2 = 0x7C00ACCD; + // imm3[15:0] = B0=0.5 = 0x3800 -- imm3[31:16] = B1=0.7 = 0x399A + imm3 = 0x399A3800; + // imm4[15:0] = B2=0.99 = 0x3BEC -- imm4[31:16] = B3=1.27 = 0x3D14 + imm4 = 0x3D143BEC; + // imm5[15:0] = B4=1.235 = 0x3CF1 -- imm5[31:16] = B5=1.0 = 0x3C00 + imm5 = 0x3C003CF1; + _sfpu_load_imm32_(0, imm0); + _sfpu_load_imm32_(1, imm1); + _sfpu_load_imm32_(2, imm2); + _sfpu_load_imm32_(4, imm3); + _sfpu_load_imm32_(5, imm4); + _sfpu_load_imm32_(6, imm5); + } else { + imm0 = 0x28FF; + imm1 = 0x3020; + _sfpu_load_imm16_(0, imm0); + _sfpu_load_imm16_(1, imm1); } + +} + +template +inline void _init_gelu_() +{ + vConstFloatPrgm0 = 0.5f; + + // // >= 3.0f + // lreg2_hi=0.50;//3800 + // lreg6_hi=0.0f;//7c00 + // // 2.0f -> 3.0f + // lreg2_lo= 0.5402f;//3852 + // lreg6_lo= -0.1194f;//AFA4 + // // 1.5f -> 2.0f + // lreg1_hi= .6099f; //38E1 + // lreg5_hi= -.2635f; //B437 + // // 1.0f -> 1.5f + // lreg1_lo=0.6189;//38F3 + // lreg5_lo=-.2797;//B479 + // // 0.5f -> 1.0f + // lreg0_hi=.4939f;//37E7 + // lreg4_hi=-.1605f;//B122 + // // 0.0f -> 0.5f + // lreg0_lo=0.1928f;//322B + // lreg4_lo=-0.0150f;//A3AE + _sfpu_load_imm32_(0,0x37E7322B); + _sfpu_load_imm32_(4,0xB12286D8); + + _sfpu_load_imm32_(1,0x38E138F3); + _sfpu_load_imm32_(5,0xB437B479); + + _sfpu_load_imm32_(2,0x38003852); + _sfpu_load_imm32_(6,0x7c00afa4); + +} + +inline void _init_dropout_(const uint seed) +{ + vConstIntPrgm0 = 0xb400; + vConstIntPrgm1 = 0x1; // binary 0b1 - used to extract LSB + + _init_dropout_seed_(seed); +} + +inline void init_quant_zero_point(const uint zero_point) +{ + _sfpu_load_imm32_(2,zero_point); } template @@ -403,7 +381,7 @@ void calculate_cube(uint16_t exp_base_scale_factor = 0) */ template -void calculate_exponential(const int iterations, uint16_t exp_base_scale_factor = 0) +void _calculate_exponential_(const int iterations, uint16_t exp_base_scale_factor = 0) { // Unroll 8 best for 
approx, unroll 0 for precise, compiler figures this out for (int d = 0; d < iterations; d++) @@ -471,7 +449,7 @@ inline vFloat _calculate_gelu_core_(vFloat in) } template -inline void calculate_gelu(const int iterations) +inline void _calculate_gelu_(const int iterations) { vUInt l0 = l_reg[LRegs::LReg0]; @@ -485,7 +463,7 @@ inline void calculate_gelu(const int iterations) for (int d = 0; d < iterations; d++) { // vFloat in = dst_reg[0]; - // vFloat result = _calculate_gelu_core_(in); + // vFloat result = calculate_gelu_core(in); // vFloat half_in = in * half; // result = lut(result, l0, l1, l2); @@ -522,7 +500,7 @@ inline void calculate_gelu(const int iterations) } template -inline void calculate_sigmoid(const int iterations) +inline void _calculate_sigmoid_(const int iterations) { constexpr int lut_mode = 0; // SFPLUTFP32_MOD0_FP16_6ENTRY_TABLE1 vUInt l0 = l_reg[LRegs::LReg0]; @@ -553,7 +531,7 @@ inline void calculate_sigmoid(const int iterations) } template -inline void calculate_tanh(const int iterations) +inline void _calculate_tanh_(const int iterations) { // SFPU microcode vUInt l0 = l_reg[LRegs::LReg0]; @@ -576,7 +554,7 @@ inline void calculate_tanh(const int iterations) } template -inline void calculate_hardtanh(const int iterations, uint param0, uint param1, uint param2) +inline void _calculate_hardtanh_(const int iterations, uint param0, uint param1, uint param2) { // All params are in FP16_B format // param0 = -(neg_threshold) @@ -613,7 +591,7 @@ inline void calculate_hardtanh(const int iterations, uint param0, uint param1, u } template -inline void calculate_tanh_derivative(const int iterations) +inline void _calculate_tanh_derivative_(const int iterations) { vUInt l0 = l_reg[LRegs::LReg0]; vUInt l1 = l_reg[LRegs::LReg1]; @@ -640,7 +618,7 @@ inline void calculate_tanh_derivative(const int iterations) } template -inline void calculate_gelu_derivative(const int iterations) +inline void _calculate_gelu_derivative_(const int iterations) { if constexpr 
(APPROXIMATION_MODE) { constexpr int lut_mode = 1; // SFPLUTFP32_MOD0_FP16_6ENTRY_TABLE1 @@ -706,7 +684,7 @@ inline void calculate_gelu_derivative(const int iterations) } template -inline void calculate_reciprocal(const int iterations) +inline void _calculate_reciprocal_(const int iterations) { #pragma GCC unroll 8 for (int d = 0; d < iterations; d++) @@ -727,7 +705,7 @@ inline void calculate_reciprocal(const int iterations) } template -inline void calculate_sqrt(const int iterations) +inline void _calculate_sqrt_(const int iterations) { #pragma GCC unroll 8 for (int d = 0; d < iterations; d++) @@ -773,9 +751,13 @@ inline void calculate_sqrt(const int iterations) } template -inline void calculate_dropout(const int iterations, uint prob, uint scale) +inline void _calculate_dropout_(const int iterations, uint prob, uint scale) { // SFPU microcode + + FWLOG1("calculate_dropout() -- prob:%x", prob); + FWLOG1("calculate_dropout() -- scale:%x", scale); + vUInt rand = l_reg[LRegs::LReg3]; #pragma GCC unroll 0 @@ -812,7 +794,7 @@ inline void calculate_dropout(const int iterations, uint prob, uint scale) } template -inline void calculate_lrelu(const int iterations, uint slope) +inline void _calculate_lrelu_(const int iterations, uint slope) { // SFPU microcode vFloat s = s2vFloat16b(slope); @@ -833,7 +815,7 @@ inline void calculate_lrelu(const int iterations, uint slope) } template -inline void calculate_power(const int iterations, uint exponent) +inline void _calculate_power_(const int iterations, uint exponent) { for (int d = 0; d < iterations; d++) { @@ -850,7 +832,7 @@ inline void calculate_power(const int iterations, uint exponent) } template -inline void calculate_square(const int iterations) +inline void _calculate_square_(const int iterations) { #pragma GCC unroll 8 for (int d = 0; d < iterations; d++) @@ -920,7 +902,7 @@ sfpi_inline void _calculate_log_body_(const uint log_base_scale_factor) } template -inline void calculate_log(const int iterations, uint 
log_base_scale_factor) +inline void _calculate_log_(const int iterations, uint log_base_scale_factor) { #pragma GCC unroll 8 for(int d = 0; d < iterations; d++){ @@ -937,13 +919,9 @@ sfpi_inline void _calculate_comp_init_flag_(bool check, vFloat& flag1, vFloat& f } } -template -inline void calculate_comp(const int iterations, uint exponent_size_8) +template +inline void _calculate_comp_(const int iterations, uint exponent_size_8) { - //invert output and use same comparison check - constexpr bool invert_output = ((COMP_MODE == SfpuType::greater_than_equal_zero) || - (COMP_MODE == SfpuType::not_equal_zero) || - (COMP_MODE == SfpuType::greater_than_zero)); // output_0 and output_1 hold the outputs use use when a zero or negative check is true/false. // False = 0.0 = kCONST_0 (5/8-bit exponent format) @@ -954,16 +932,13 @@ inline void calculate_comp(const int iterations, uint exponent_size_8) constexpr float output_0 = invert_output ? 0.0f : 1.0f; constexpr float output_1 = invert_output ? 1.0f : 0.0f; - constexpr bool check_zero = (COMP_MODE == SfpuType::equal_zero) || (COMP_MODE == SfpuType::not_equal_zero); - constexpr bool second_check = (COMP_MODE == SfpuType::less_than_equal_zero) || (COMP_MODE == SfpuType::greater_than_zero); - for (int d = 0; d < iterations; d++) { vFloat v = dst_reg[0]; vFloat flag1, flag2; if constexpr(check_zero) { - v_if (sfpu_is_fp16_zero(v, exponent_size_8)) { + v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) { _calculate_comp_init_flag_(second_check, flag1, flag2, output_0); } v_else { _calculate_comp_init_flag_(second_check, flag1, flag2, output_1); @@ -983,18 +958,18 @@ inline void calculate_comp(const int iterations, uint exponent_size_8) vFloat result; if constexpr (second_check) { - // SfpuType::less_than_equal_zero + // less_than_equal_zero // flag1 = 0x3F80(1.0) if DST < 0 else 0 // flag2 = 0x3F80(1.0) if DST == 0 else 0 // Do a bitwise Or (flag1 | flag2) to get <= condition. 
// flag1 < 0 OR flag2 == 0 => DST is Less than or Equal to zero. // Result will be either 0x0000(0.0) or 0x3F80(1.0) - if constexpr (COMP_MODE == SfpuType::less_than_equal_zero) { + if constexpr (is_less_than_equal_zero) { result = reinterpret(reinterpret(flag1) | reinterpret(flag2)); } else { - // SfpuType::greater_than_zero + // greater_than_zero // flag1 = 0x3F80(1.0) if DST >= 0 else 0 // flag2 = 0x3F80(1.0) if DST != 0 else 0 // Do a bitwise And (flag1 & flag2) to get > condition. @@ -1013,7 +988,7 @@ inline void calculate_comp(const int iterations, uint exponent_size_8) } template -inline void calculate_clamp(const int iterations, uint param0, uint param1, uint param2) +inline void _calculate_clamp_(const int iterations, uint param0, uint param1, uint param2) { // All params are in FP16 format // param0 = min @@ -1044,7 +1019,7 @@ inline void calculate_clamp(const int iterations, uint param0, uint param1, uint } template -inline void calculate_abs(const int iterations) +inline void _calculate_abs_(const int iterations) { // SFPU microcode for (int d = 0; d < iterations; d++) @@ -1056,7 +1031,7 @@ inline void calculate_abs(const int iterations) } template -inline void calculate_sign(const int iterations, uint exponent_size_8) +inline void _calculate_sign_(const int iterations, uint exponent_size_8) { // All params are in FP16 format // uint format = 1; @@ -1072,7 +1047,7 @@ inline void calculate_sign(const int iterations, uint exponent_size_8) //param0 == 0 is Bfp8 format. It does not require bias removal. //param0 != 0 is Float16 format and exp bias needs to be removed for zero check. 
- v_if (sfpu_is_fp16_zero(v, exponent_size_8)) { + v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) { dst_reg[0] = vConst0; } v_endif; @@ -1082,7 +1057,7 @@ inline void calculate_sign(const int iterations, uint exponent_size_8) } template -inline void calculate_max(const int iterations) +inline void _calculate_max_(const int iterations) { for (int d = 0; d < iterations; d++) { @@ -1098,7 +1073,7 @@ inline void calculate_max(const int iterations) } template -inline void calculate_max_int32(const int iterations) +inline void _calculate_max_int32_(const int iterations) { for (int d = 0; d < iterations; d++) { @@ -1113,7 +1088,7 @@ inline void calculate_max_int32(const int iterations) } template -sfpi_inline vFloat sfpu_sine_maclaurin_series(vFloat val) +sfpi_inline vFloat _sfpu_sine_maclaurin_series_(vFloat val) { // Good for [-pi:pi] // Mclauren series = x - x^3/3! + x^5/5! - x^7/7! + x^9/9! - x^11/11! @@ -1142,7 +1117,7 @@ sfpi_inline vFloat sfpu_sine_maclaurin_series(vFloat val) return output; } template -sfpi_inline vFloat sfpu_cosine_maclaurin_series(vFloat val) +sfpi_inline vFloat _sfpu_cosine_maclaurin_series_(vFloat val) { // Good for [-pi:pi] // Mclauren series = 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8! - x^10/10! + x^12/12! 
@@ -1170,7 +1145,7 @@ sfpi_inline vFloat sfpu_cosine_maclaurin_series(vFloat val) return output; } template -inline void calculate_sine(const int iterations) +inline void _calculate_sine_(const int iterations) { // SFPU microcode for (int d = 0; d < iterations; d++) @@ -1181,7 +1156,7 @@ inline void calculate_sine(const int iterations) vFloat whole_v_float = int32_to_float(whole_v, 0); v = v - whole_v_float; v *= 3.141592653589793f; // fractional * pi to get it in [-pi:pi] - v = sfpu_sine_maclaurin_series(v); + v = _sfpu_sine_maclaurin_series_(v); whole_v = whole_v & 0x1; v_if(whole_v != 0) { // odd so flip the sign @@ -1193,7 +1168,7 @@ inline void calculate_sine(const int iterations) } } template -inline void calculate_cosine(const int iterations) +inline void _calculate_cosine_(const int iterations) { // SFPU microcode for (int d = 0; d < iterations; d++) @@ -1204,7 +1179,7 @@ inline void calculate_cosine(const int iterations) vFloat whole_v_float = int32_to_float(whole_v, 0); v = v - whole_v_float; v *= 3.141592653589793f; // fractional * pi to get it in [-pi:pi] - v = sfpu_cosine_maclaurin_series(v); + v = _sfpu_cosine_maclaurin_series_(v); whole_v = whole_v & 0x1; v_if(whole_v != 0) { // odd so flip the sign @@ -1216,7 +1191,7 @@ inline void calculate_cosine(const int iterations) } } template -inline void relu_max(const int iterations, uint uint_threshold) +inline void _relu_max_(const int iterations, uint uint_threshold) { vFloat threshold = s2vFloat16(uint_threshold, s2vFloat16::fp16a); for (int d = 0; d < iterations; d++) @@ -1235,7 +1210,7 @@ inline void relu_max(const int iterations, uint uint_threshold) } } template -inline void relu_min(const int iterations, uint uint_threshold) +inline void _relu_min_(const int iterations, uint uint_threshold) { vFloat threshold = s2vFloat16(uint_threshold, s2vFloat16::fp16a); for (int d = 0; d < iterations; d++) @@ -1250,7 +1225,7 @@ inline void relu_min(const int iterations, uint uint_threshold) } } template -inline 
void cast_fp32_to_fp16a(const int iterations) +inline void _cast_fp32_to_fp16a_(const int iterations) { #pragma GCC unroll 8 for (int d = 0; d < iterations; d++) @@ -1265,7 +1240,7 @@ inline void cast_fp32_to_fp16a(const int iterations) } template -inline void quant_int32(const int iterations, const uint dst_offset) +inline void _quant_int32_(const int iterations, const uint dst_offset) { // Operand A is input (fp32) // Operand B is scaling factor (fp32) @@ -1290,7 +1265,7 @@ inline void quant_int32(const int iterations, const uint dst_offset) } template -inline void requant_int32(const int iterations, const uint dst_offset) +inline void _requant_int32_(const int iterations, const uint dst_offset) { // Operand A is input to requant (int32) // Operand B is scaling factor (fp32) @@ -1318,7 +1293,7 @@ inline void requant_int32(const int iterations, const uint dst_offset) } template -inline void dequant_int32(const int iterations, const uint dst_offset) +inline void _dequant_int32_(const int iterations, const uint dst_offset) { // Operand A[LREG0] is input to dequant (int32) // Operand B[LREG1] is scaling factor (fp32) @@ -1344,126 +1319,5 @@ inline void dequant_int32(const int iterations, const uint dst_offset) } } -template -inline void calculate_mask() -{ - bool exponent_size_8 = true; - for (int d = 0; d < ITERATIONS; d++) - { - vFloat mask = dst_reg[32]; - v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) { - dst_reg[0] = 0; - } - v_endif; - dst_reg++; - } -} - -template -inline void calculate_sfpu(const int iterations = ITERATIONS, uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) -{ - if constexpr (operation == SfpuType::exponential) { - calculate_exponential(iterations, param0); - } - else if constexpr (operation == SfpuType::exp_with_base) { - calculate_exponential(iterations, param0); - } - else if constexpr (operation == SfpuType::tanh) { - calculate_tanh(iterations); - } - else if constexpr (operation == 
SfpuType::hardtanh) { - calculate_hardtanh(iterations, param0, param1, param2); - } - else if constexpr (operation == SfpuType::gelu) { - calculate_gelu(iterations); - } - else if constexpr (operation == SfpuType::reciprocal) { - calculate_reciprocal(iterations); - } - else if constexpr (operation == SfpuType::sigmoid) { - calculate_sigmoid(iterations); - } - else if constexpr (operation == SfpuType::sqrt) { - calculate_sqrt(iterations); - } - else if constexpr (operation == SfpuType::tanh_derivative) { - calculate_tanh_derivative(iterations); - } - else if constexpr (operation == SfpuType::lrelu) { - calculate_lrelu(iterations, param0); - } - else if constexpr (operation == SfpuType::dropout) { - calculate_dropout(iterations, param0, param1); - } - else if constexpr (operation == SfpuType::power) { - calculate_power(iterations, param0); - } - else if constexpr (operation == SfpuType::square) { - calculate_square(iterations); - } - else if constexpr (operation == SfpuType::log) { - calculate_log(iterations, param0); - } - else if constexpr (operation == SfpuType::log_with_base) { - calculate_log(iterations, param0); - } - else if constexpr (operation == SfpuType::gelu_derivative) { - calculate_gelu_derivative(iterations); - } - else if constexpr ((operation == SfpuType::equal_zero) || - (operation == SfpuType::not_equal_zero) || - (operation == SfpuType::less_than_zero) || - (operation == SfpuType::greater_than_equal_zero) || - (operation == SfpuType::less_than_equal_zero) || - (operation == SfpuType::greater_than_zero)) { - calculate_comp(iterations, param5); - } - else if constexpr (operation == SfpuType::clamp) { - calculate_clamp(iterations, param0, param1, param2); - } - else if constexpr (operation == SfpuType::abs) { - calculate_abs(iterations); - } - else if constexpr (operation == SfpuType::sign) { - calculate_sign(iterations, param5); - } - else if constexpr (operation == SfpuType::max) { - if constexpr (IS_INT_SFPU_EN) - calculate_max_int32(iterations); 
- else - calculate_max(iterations); - } - else if constexpr (operation == SfpuType::sine) { - calculate_sine(iterations); - } - else if constexpr (operation == SfpuType::cosine) { - calculate_cosine(iterations); - } - else if constexpr (operation == SfpuType::relu_min) { - relu_min(iterations, param0); - } - else if constexpr (operation == SfpuType::relu_max) { - relu_max(iterations, param0); - } - else if constexpr (operation == SfpuType::cast_fp32_to_fp16a) { - cast_fp32_to_fp16a(iterations); - } - else if constexpr (operation == SfpuType::quant_int32) { - quant_int32(iterations, param0); - } - else if constexpr (operation == SfpuType::requant_int32) { - requant_int32(iterations, param0); - } - else if constexpr (operation == SfpuType::dequant_int32) { - dequant_int32(iterations, param0); - } - else if constexpr (operation == SfpuType::mask) { - calculate_mask(); - } - else if constexpr (operation == SfpuType::negative) { - calculate_negative(); - } -} - } // namespace sfpu } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h index a8134eb8d47..d9acf613adc 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_structs.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_template.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_template.h index 2e1b7acb36f..35edc65483b 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_template.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_template.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" @@ -243,4 +244,221 @@ class ckernel_unpack_template void program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask = 0); // calls program, 
then run }; + inline ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op) + : m_outer_loop_len(outer_loop_len) + , m_inner_loop_len(inner_loop_len) + , m_loop_op0(loop_op) + , m_loop_op1(TT_OP_NOP) + , m_end_op0(TT_OP_NOP) + , m_end_op1(TT_OP_NOP) + , m_start_op0(TT_OP_NOP) + { + m_loop0_last_instr = loop_op; + m_loop1_last_instr = loop_op; + } + + inline ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1) + : m_outer_loop_len(outer_loop_len) + , m_inner_loop_len(inner_loop_len) + , m_loop_op0(loop_op0) + , m_loop_op1(loop_op1) + , m_end_op0(TT_OP_NOP) + , m_end_op1(TT_OP_NOP) + , m_start_op0(TT_OP_NOP) + { + m_loop0_last_instr = loop_op1; + m_loop1_last_instr = loop_op1; + } + + inline void ckernel_template::set_loop_op0(uint loop_op) + { + m_loop_op0 = loop_op; + } + + inline void ckernel_template::set_loop_op1(uint loop_op) + { + m_loop_op1 = loop_op; + } + + inline void ckernel_template::set_end_ops(uint end_op0, uint end_op1) + { + m_end_op0 = end_op0; + m_end_op1 = end_op1; + } + + inline void ckernel_template::set_end_op(uint end_op0) + { + set_end_ops(end_op0, TT_OP_NOP); + } + + inline void ckernel_template::set_start_op(uint start_op0) + { + m_start_op0 = start_op0; + } + + inline void ckernel_template::set_last_inner_loop_instr(uint op) + { + m_loop1_last_instr = op; + } + + inline void ckernel_template::set_last_outer_loop_instr(uint op) + { + m_loop0_last_instr = op; + } + + inline void ckernel_template::program_and_run(volatile uint *instrn_buffer) + { + program(instrn_buffer); + run(instrn_buffer); + } + + inline void ckernel_template::run(volatile uint *instrn_buffer) + { + TTI_MOP(1, 0, 0); // run the double-loop template + } + + inline void ckernel_template::program(volatile uint *instrn_buffer) + { + volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); + + mop_sync(); // wait until previous mops have completed + + mop_cfg[0] = m_outer_loop_len; 
+ mop_cfg[1] = m_inner_loop_len; + mop_cfg[2] = m_start_op0; + mop_cfg[3] = m_end_op0; + mop_cfg[4] = m_end_op1; + mop_cfg[5] = m_loop_op0; + mop_cfg[6] = m_loop_op1; + mop_cfg[7] = m_loop0_last_instr; + mop_cfg[8] = m_loop1_last_instr; + } + + inline void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) + { + program(instrn_buffer); + run(instrn_buffer, count, zmask); + } + + inline void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) + { + FWASSERT("Unpack template only supports loops up to 128", count <= 128); + TT_MOP_CFG(zmask >> 16); // Set the top 16 bits of zmask - we could skip this for count <= 16 + TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template + } + + // Version without zmask, should be slightly faster by eliminating one instruction. + inline void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count) + { + FWASSERT("Unpack template only supports loops up to 128", count <= 128); + TT_MOP(0, count - 1, 0); // Run the template + } + + inline void ckernel_unpack_template::program(volatile uint *instrn_buffer) const + { + volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); + + mop_sync(); // wait until previous mops have completed + + mop_cfg[1] = m_unpackB | (m_unpack_halo << 1); + mop_cfg[2] = m_B_instr; + mop_cfg[3] = m_A0_instr; + mop_cfg[4] = m_A1_instr; + mop_cfg[5] = m_A2_instr; + mop_cfg[6] = m_A3_instr; + mop_cfg[7] = m_skipA_instr; + mop_cfg[8] = m_skipB_instr; + } + + inline ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr) + { + return ckernel_unpack_template(false, // src B + false, // halo + A_instr, 0, 0, 0, skipA_instr, 0, 0); + } + + inline ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr) + { + return ckernel_unpack_template(false, // src B + false, // halo + B_instr, 0, 0, 0, skipB_instr, 0, 0); + } + + 
inline ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr) + { + return ckernel_unpack_template(false, // src B + true, // halo + neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0); + } + + inline ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(false, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0); + } + + inline ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(false, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0); + } + + inline ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 
0x4 : 0; + + return ckernel_unpack_template(true, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B); + } + + inline ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(true, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B); + } + + inline ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr, + + uint B_instr, uint skipB_instr) + { + return ckernel_unpack_template(true, // src B + false, // halo + A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr); + } + + inline ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){ + return ckernel_unpack_template::lA(instr0, skip0); + } + + inline ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){ + // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA. 
+ return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1); + } + } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_xmov.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_xmov.h index e9b2559a1b7..120f8898adf 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_xmov.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_xmov.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h index fa97031b17a..0c06a710cbb 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cmath_common.h @@ -2,11 +2,13 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once //#include "kernel_types.h" #include "ckernel.h" #include "ckernel_template.h" +#include "ckernel_sfpu.h" #include "ckernel_globals.h" #include "llk_defs.h" @@ -197,6 +199,13 @@ inline void clear_addr_mod_base() TTI_SETC16(ADDR_MOD_SET_Base_ADDR32, 0); // clear addr mod base (use addr mods 0..3) } +template +inline void inc_dst_addr() +{ + static_assert(num_rows <= 15, "num_rows must be <= 15"); + TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, num_rows, 0, 0, p_setrwc::SET_D); +} + inline void math_dest_wait() { FWLOG0("XX math_full_dest_sync()->wait for whole dest available"); @@ -230,4 +239,14 @@ inline constexpr bool is_32bit_input(const std::uint32_t src_format, const std:: ((output_df == (uint)DataFormat::Int32) || (output_df == (uint)DataFormat::Float32)); } +inline constexpr int get_math_num_fidelity_phases(const int math_fidelity_desc) +{ + return (math_fidelity_desc & 0x7); +} + +inline constexpr int get_math_fidelity_increment(const int math_fidelity_desc) +{ + return ((math_fidelity_desc >> 3) & 0x1) + 1; +} + } // namespace ckernel::math diff --git 
a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h index bdc0b6b5063..011092787c7 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cpack_common.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" @@ -517,6 +518,33 @@ namespace ckernel::packer TT_SETDMAREG(0, UPPER_HALFWORD(addr), 0, HI_16(p_gpr_pack::OUTPUT_ADDR)); } + template + inline void program_packer_untilized_destination(const uint32_t addr, const uint32_t pack_dst_format) + { + // Each packer packs 8 rows of block_ct_dim*TILE_C_DIM datums + const uint32_t block_size = SCALE_DATUM_SIZE(pack_dst_format, block_ct_dim * TILE_C_DIM * (TILE_R_DIM/4)); + constexpr uint32_t offset0 = 0; + const uint32_t offset1 = (1*block_size)/16; + const uint32_t offset2 = (2*block_size)/16; + const uint32_t offset3 = (3*block_size)/16; + + TT_SETDMAREG(0, LOWER_HALFWORD(addr+offset0), 0, LO_16(p_gpr_pack::OUTPUT_ADDR+0)); + TT_SETDMAREG(0, UPPER_HALFWORD(addr+offset0), 0, HI_16(p_gpr_pack::OUTPUT_ADDR+0)); + TT_SETDMAREG(0, LOWER_HALFWORD(addr+offset1), 0, LO_16(p_gpr_pack::OUTPUT_ADDR+1)); + TT_SETDMAREG(0, UPPER_HALFWORD(addr+offset1), 0, HI_16(p_gpr_pack::OUTPUT_ADDR+1)); + TT_SETDMAREG(0, LOWER_HALFWORD(addr+offset2), 0, LO_16(p_gpr_pack::OUTPUT_ADDR+2)); + TT_SETDMAREG(0, UPPER_HALFWORD(addr+offset2), 0, HI_16(p_gpr_pack::OUTPUT_ADDR+2)); + TT_SETDMAREG(0, LOWER_HALFWORD(addr+offset3), 0, LO_16(p_gpr_pack::OUTPUT_ADDR+3)); + TT_SETDMAREG(0, UPPER_HALFWORD(addr+offset3), 0, HI_16(p_gpr_pack::OUTPUT_ADDR+3)); + + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR); + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR+1); + TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR+2); 
+ TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR+3); + + TTI_PACR(ADDR_MOD_2, 0, 0xf, 0, 0, 1, 0); // pack flush + } + inline void program_packer_dest_offset_registers(uint32_t dest_tile_offset) { TT_SETDMAREG(0, LOWER_HALFWORD(dest_tile_offset), 0, LO_16(p_gpr_pack::TEMP_TILE_OFFSET)); diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h index 55404e24d39..eec20973ec2 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/cunpack_common.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" @@ -180,13 +181,13 @@ namespace ckernel::unpacker while (semaphore_read(semaphore::UNPACK_SYNC) > 0) {} } - inline void enalbe_int8_fpu_math() { + inline void enable_int8_fpu_math() { alu_config_u alu_payload = {.val = 0}; alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = 1; cfg_reg_rmw_tensix(alu_payload.val); } - template + template inline void configure_unpack_AB( const uint unpA_src_format, const uint unpB_src_format, @@ -249,10 +250,9 @@ namespace ckernel::unpacker alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = int8_math_enabled; constexpr uint alu_stoch_rnd_mask = ALU_ROUNDING_MODE_Fpu_srnd_en_MASK | ALU_ROUNDING_MODE_Gasket_srnd_en_MASK | ALU_ROUNDING_MODE_Packer_srnd_en_MASK; - constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndMode::All); - alu_payload.f.ALU_ROUNDING_MODE_Fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndMode::Fpu); - alu_payload.f.ALU_ROUNDING_MODE_Gasket_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndMode::Pack); - alu_payload.f.ALU_ROUNDING_MODE_Packer_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndMode::Pack); + alu_payload.f.ALU_ROUNDING_MODE_Fpu_srnd_en = fpu_srnd_en; + alu_payload.f.ALU_ROUNDING_MODE_Gasket_srnd_en = pack_srnd_en; + 
alu_payload.f.ALU_ROUNDING_MODE_Packer_srnd_en = pack_srnd_en; constexpr uint alu_mask = alu_format_mask | alu_dest_format_mask | alu_stoch_rnd_mask; @@ -348,28 +348,47 @@ namespace ckernel::unpacker reset_config_context(); } - template - inline void config_face_dim(const uint32_t face_r_dim) + template + inline void config_unpacker_x_end(const uint32_t face_r_dim) { switch (face_r_dim) { case 1: TTI_SETADCXX(UNP_SEL, 1*FACE_C_DIM-1, 0x0); - TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_1x16); break; case 2: TTI_SETADCXX(UNP_SEL, 2*FACE_C_DIM-1, 0x0); - TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_2x16); break; case 4: TTI_SETADCXX(UNP_SEL, 4*FACE_C_DIM-1, 0x0); - TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_4x16); break; case 8: TTI_SETADCXX(UNP_SEL, 8*FACE_C_DIM-1, 0x0); - TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_8x16); break; default: TTI_SETADCXX(UNP_SEL, FACE_R_DIM*FACE_C_DIM-1, 0x0); + break; + } + } + + template + inline void config_unpacker_0_face_dim(const uint32_t face_r_dim) + { + //tile x dim registers are only for unpacker 0 + static_assert(UNP_SEL != p_setadc::UNP_B); + switch (face_r_dim) { + case 1: + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_1x16); + break; + case 2: + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_2x16); + break; + case 4: + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_4x16); + break; + case 8: + TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_8x16); + break; + default: 
TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16); break; } diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc deleted file mode 100644 index baeba52c6c6..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_template.cc +++ /dev/null @@ -1,230 +0,0 @@ -/* - * SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - * - * SPDX-License-Identifier: Apache-2.0 -*/ - -#include "ckernel_template.h" - -namespace ckernel -{ -extern volatile uint *cfg_regs; - -ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op) - : m_outer_loop_len(outer_loop_len) - , m_inner_loop_len(inner_loop_len) - , m_loop_op0(loop_op) - , m_loop_op1(TT_OP_NOP) - , m_end_op0(TT_OP_NOP) - , m_end_op1(TT_OP_NOP) - , m_start_op0(TT_OP_NOP) -{ - m_loop0_last_instr = loop_op; - m_loop1_last_instr = loop_op; -} - -ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1) - : m_outer_loop_len(outer_loop_len) - , m_inner_loop_len(inner_loop_len) - , m_loop_op0(loop_op0) - , m_loop_op1(loop_op1) - , m_end_op0(TT_OP_NOP) - , m_end_op1(TT_OP_NOP) - , m_start_op0(TT_OP_NOP) -{ - m_loop0_last_instr = loop_op1; - m_loop1_last_instr = loop_op1; -} - -void ckernel_template::set_loop_op0(uint loop_op) -{ - m_loop_op0 = loop_op; -} - -void ckernel_template::set_loop_op1(uint loop_op) -{ - m_loop_op1 = loop_op; -} - -void ckernel_template::set_end_ops(uint end_op0, uint end_op1) -{ - m_end_op0 = end_op0; - m_end_op1 = end_op1; -} - -void ckernel_template::set_end_op(uint end_op0) -{ - set_end_ops(end_op0, TT_OP_NOP); -} - -void ckernel_template::set_start_op(uint start_op0) -{ - m_start_op0 = start_op0; -} - -void ckernel_template::set_last_inner_loop_instr(uint op) -{ - m_loop1_last_instr = op; -} - -void ckernel_template::set_last_outer_loop_instr(uint op) -{ - 
m_loop0_last_instr = op; -} - -void ckernel_template::program_and_run(volatile uint *instrn_buffer) -{ - program(instrn_buffer); - run(instrn_buffer); -} - -void ckernel_template::run(volatile uint *instrn_buffer) -{ - TTI_MOP(1, 0, 0); // run the double-loop template -} - -void ckernel_template::program(volatile uint *instrn_buffer) -{ - volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); - - mop_sync(); // wait until previous mops have completed - - mop_cfg[0] = m_outer_loop_len; - mop_cfg[1] = m_inner_loop_len; - mop_cfg[2] = m_start_op0; - mop_cfg[3] = m_end_op0; - mop_cfg[4] = m_end_op1; - mop_cfg[5] = m_loop_op0; - mop_cfg[6] = m_loop_op1; - mop_cfg[7] = m_loop0_last_instr; - mop_cfg[8] = m_loop1_last_instr; -} - -void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) -{ - program(instrn_buffer); - run(instrn_buffer, count, zmask); -} - -void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) -{ - FWASSERT("Unpack template only supports loops up to 128", count <= 128); - TT_MOP_CFG(zmask >> 16); // Set the top 16 bits of zmask - we could skip this for count <= 16 - TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template -} - -// Version without zmask, should be slightly faster by eliminating one instruction. 
-void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count) -{ - FWASSERT("Unpack template only supports loops up to 128", count <= 128); - TT_MOP(0, count - 1, 0); // Run the template -} - -void ckernel_unpack_template::program(volatile uint *instrn_buffer) const -{ - volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); - - mop_sync(); // wait until previous mops have completed - - mop_cfg[1] = m_unpackB | (m_unpack_halo << 1); - mop_cfg[2] = m_B_instr; - mop_cfg[3] = m_A0_instr; - mop_cfg[4] = m_A1_instr; - mop_cfg[5] = m_A2_instr; - mop_cfg[6] = m_A3_instr; - mop_cfg[7] = m_skipA_instr; - mop_cfg[8] = m_skipB_instr; -} - -ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr) -{ - return ckernel_unpack_template(false, // src B - false, // halo - A_instr, 0, 0, 0, skipA_instr, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr) -{ - return ckernel_unpack_template(false, // src B - false, // halo - B_instr, 0, 0, 0, skipB_instr, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr) -{ - return ckernel_unpack_template(false, // src B - true, // halo - neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; - - return ckernel_unpack_template(false, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? 
DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; - - return ckernel_unpack_template(false, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; - - return ckernel_unpack_template(true, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B); -} - -ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; - - return ckernel_unpack_template(true, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? 
DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B); -} - -ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr, - - uint B_instr, uint skipB_instr) -{ - return ckernel_unpack_template(true, // src B - false, // halo - A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr); -} - -ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){ - return ckernel_unpack_template::lA(instr0, skip0); -} - -ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){ - // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA. 
- return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1); -} - -} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc index 35130c72520..103269694e5 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc +++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc @@ -2,7 +2,6 @@ // to reduce the overhead of the compilation process and // improve build times #include "ckernel.cc" -#include "ckernel_template.cc" #ifdef PERF_DUMP #include "ckernel_perf_unpack_pack.cc" #endif diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list b/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list index 2a66c11d1a6..99880029ff6 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list +++ b/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list @@ -1,2 +1 @@ ckernel.cc -ckernel_template.cc diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h index e205ec12747..e5250fb7412 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once namespace ckernel { @@ -97,67 +98,20 @@ enum ReluType { MAX_THRESHOLD_RELU, }; -enum SfpuType { - tanh, - hardtanh, - gelu, - exponential, - exp_with_base, - sigmoid, - reciprocal, - sqrt, - lrelu, - power, - square, - tanh_derivative, - log, - log_with_base, - equal_zero, - not_equal_zero, - less_than_zero, - greater_than_equal_zero, - less_than_equal_zero, - greater_than_zero, - clamp, - gelu_derivative, - dropout, - abs, - sign, - max, - sine, - cosine, - tan, - relu_max, - relu_min, - cast_fp32_to_fp16a, - sigmoid_appx, - gelu_appx, - elu, - min, - exp2, - heaviside, - expm1, - signbit, - asin, - acos, - atan, - erf, - erfc, - rsqrt, - isfinite, - isinf, - isposinf, - isneginf, - isnan, - 
logical_not_unary, - erfinv, - i0, - silu, - mask, - negative, - dequant_int32, - requant_int32, - quant_int32, - unused, +/* +Stochastic rounding modes: + None: No stochastic rounding enabled, default rounding is round to nearest even. + Fpu: Enables stochastic rounding for every accumulation in the fpu + Pack: Enables stochastic rounding in both gasket and packer. Gasket rounding is in + data format conversion stage from dest format to pack_src_format. Packer rounding + is in data format conversion stage from pack_src_format to pack_dst_format. + All: Enables fpu, pack and gasket rounding. +*/ +enum struct StochRndType { + None = 0, + Fpu = 1, + Pack = 2, + All = 0xf, }; + } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h index 8eb5e084934..a626e1b8180 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_common.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel_defs.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h index 0a70d430497..f34cd4aa74b 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel_include.h" #include "ckernel_template.h" @@ -209,7 +210,7 @@ inline void _llk_math_eltwise_binary_(const std::uint32_t num_faces, uint dst_in } -template +template inline void eltwise_binary_configure_addrmod() { // Use srcA for data movement if constexpr ( @@ -240,7 +241,7 @@ inline void eltwise_binary_configure_addrmod() { .srca = {.incr = 0, .clr = 1}, .srcb = {.incr = 0, .clr = 1}, .dest = {.incr = 0, .clr = 0, .cr = 1}, - .fidelity = {.incr = 1}} + .fidelity 
= {.incr = FIDELITY_INCREMENT}} .set(ADDR_MOD_2); addr_mod_t{ @@ -330,15 +331,18 @@ inline void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0, co template < EltwiseBinaryType eltwise_binary_type, BroadcastType src_b_bcast_type, - int NUM_FIDELITY_PHASES = 0, + int MATH_FIDELITY_DESC = 0, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> inline void _llk_math_eltwise_binary_init_(const std::uint32_t num_faces, const std::uint32_t transpose, const std::uint32_t acc_to_dest) { - eltwise_binary_configure_addrmod(); + constexpr int MATH_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC); + constexpr int MATH_FIDELITY_INCREMENT = get_math_fidelity_increment(MATH_FIDELITY_DESC); + + eltwise_binary_configure_addrmod(); if constexpr ( (eltwise_binary_type == ELWADD) || (eltwise_binary_type == ELWSUB) || (eltwise_binary_type == ELWMUL)) { - eltwise_binary_configure_mop(acc_to_dest, num_faces); + eltwise_binary_configure_mop(acc_to_dest, num_faces); } else { FWASSERT("Unsupported op!", false); } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h index 9e23dab17f2..36b5e02c21e 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_binary_sfpu.h @@ -1,8 +1,7 @@ -/* - * SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - * - * SPDX-License-Identifier: Apache-2.0 -*/ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + #pragma once @@ -17,7 +16,6 @@ using namespace ckernel; // local function declarations -template inline void eltwise_binary_sfpu_configure_addrmod(){ // NOTE: this kernel is typically used in conjunction with // A2D, which is using ADDR_MOD_0 and ADDR_MOD_2, so use one @@ -32,22 +30,8 @@ inline void eltwise_binary_sfpu_configure_addrmod(){ } inline void eltwise_binary_sfpu_configure_mop(); -template -inline void _llk_math_eltwise_binary_sfpu_( - const uint face_r_dim, - const uint num_faces, - uint dst_index_a, - uint dst_index_b, - int vector_mode = (int)Dim::RC, - uint param0 = 0, - uint param1 = 0, - uint param2 = 0, - uint param3 = 0, - uint param4 = 0, - uint param5 = 0) { - constexpr int ITERATIONS = 8; - uint dst_index = (dst_index_a <= dst_index_b) ? dst_index_a : dst_index_b; - param0 = (dst_index_a > dst_index_b) ? dst_index_a-dst_index_b : dst_index_b-dst_index_a; +template +inline void _llk_math_eltwise_binary_sfpu_start_(const uint dst_index) { if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) { math::set_dst_write_addr(math_sync_tile_dst_index); } else { @@ -55,65 +39,21 @@ inline void _llk_math_eltwise_binary_sfpu_( } math::set_addr_mod_base(); TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH); - if (vector_mode == (int)Dim::R) { - // Do a row vector, Face0 + Face1 -- first iteration (first row) - const int iterations = (num_faces < 4) ? - ((face_r_dim <= 2) ? 
2 : face_r_dim/2) : 2; // At least 2 iterations for odd and even columns -#pragma GCC unroll 0 - for (int face = 0; face < 2; face++) { - sfpu::calculate_sfpu(iterations, param0, param1, param2, param3, param4, param5); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } else if (vector_mode == (int)Dim::C) { - // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for full face -#pragma GCC unroll 0 - for (int face = 0; face < 2; face++) { - sfpu::calculate_sfpu(ITERATIONS, param0, param1, param2, param3, param4, param5); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - if (num_faces>2) { // Skip next 2 faces if tile is 32x32 - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } - } - if (num_faces<=2) { - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } - } else { - // Do all four faces, and iterate through all 4 blocks of 4 rows each -#pragma GCC unroll 0 - for (int face = 0; face < 4; face++) { - sfpu::calculate_sfpu(ITERATIONS, param0, param1, param2, param3, param4, param5); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - 
TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } - } +} + +inline void _llk_math_eltwise_binary_sfpu_done_() { math::clear_dst_reg_addr(); TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::WAIT_SFPU); math::clear_addr_mod_base(); } -template -inline void _llk_math_eltwise_binary_sfpu_init_( - uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { - eltwise_binary_sfpu_configure_addrmod< sfpu_op >(); - if constexpr (sfpu_op == SfpuType::quant_int32) { - sfpu::sfpu_init(sfpu_op, param0); - } else if constexpr (sfpu_op == SfpuType::requant_int32) { - sfpu::sfpu_init(sfpu_op, param0); - } else if constexpr (sfpu_op == SfpuType::dequant_int32) { - sfpu::sfpu_init(sfpu_op, param0); - } else { - sfpu::sfpu_init(sfpu_op); - } +inline void _llk_math_eltwise_binary_sfpu_inc_dst_face_addr_() { + math::inc_dst_addr<8>(); + math::inc_dst_addr<8>(); +} + +inline void _llk_math_eltwise_binary_sfpu_init_() { + eltwise_binary_sfpu_configure_addrmod(); math::reset_counters(p_setrwc::SET_ABD_F); } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h index f26d2ca3f46..c471e91a797 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_datacopy.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel_include.h" @@ -19,7 +20,7 @@ inline void eltwise_unary_configure_addrmod(); template inline void _llk_math_eltwise_unary_datacopy_(const std::uint32_t dst_index, const std::uint32_t src_format, const std::uint32_t dst_format) { - if (unpack_to_dest && math::is_32bit_input(src_format, dst_format)) { + if (unpack_to_dest && is_32bit_input(src_format, dst_format)) { math_unpack_to_dest_math_ready(); math::set_dst_write_addr(dst_index); math::math_unpack_to_dest_tile_ready(); 
diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h index 3f83bb707b0..e8b293b0597 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpi.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #include "ckernel_include.h" #include "ckernel_template.h" #include diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h index ccd0dc293ff..33dec5ac11f 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel_include.h" #include "ckernel_template.h" @@ -13,12 +14,7 @@ #include "ckernel_sfpu.h" using namespace ckernel; -template -void static_assert_sfpu_type_dependent() { - static_assert(sfpu_type == SfpuType::unused, "sfpu_type exception"); -} // local function declarations -template inline void eltwise_unary_sfpu_configure_addrmod(){ // NOTE: this kernel is typically used in conjunction with // A2D, which is using ADDR_MOD_0 and ADDR_MOD_2, so use one @@ -33,20 +29,8 @@ inline void eltwise_unary_sfpu_configure_addrmod(){ } inline void eltwise_unary_sfpu_configure_mop(); -template -inline void _llk_math_eltwise_unary_sfpu_( - const uint face_r_dim, - const uint num_faces, - uint dst_index, - int vector_mode = (int)Dim::RC, - uint param0 = 0, - uint param1 = 0, - uint param2 = 0, - uint param3 = 0, - uint param4 = 0, - uint param5 = 0) { - - constexpr int ITERATIONS = 8; +template +inline void _llk_math_eltwise_unary_sfpu_start_(const uint dst_index) { if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) { 
math::set_dst_write_addr(math_sync_tile_dst_index); } else { @@ -54,61 +38,21 @@ inline void _llk_math_eltwise_unary_sfpu_( } math::set_addr_mod_base(); TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH); - if (vector_mode == (int)Dim::R) { - // Do a row vector, Face0 + Face1 -- first iteration (first row) - const int iterations = (num_faces < 4) ? - ((face_r_dim <= 2) ? 2 : face_r_dim/2) : 2; // At least 2 iterations for odd and even columns -#pragma GCC unroll 0 - for (int face = 0; face < 2; face++) { - sfpu::calculate_sfpu(iterations, param0, param1, param2, param3, param4, param5); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } else if (vector_mode == (int)Dim::C) { - // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for full face -#pragma GCC unroll 0 - for (int face = 0; face < 2; face++) { - sfpu::calculate_sfpu(ITERATIONS, param0, param1, param2, param3, param4, param5); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - if (num_faces>2) { // Skip next 2 faces if tile is 32x32 - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } - } - if (num_faces<=2) { - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - 
TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } - } else { - // Do all four faces, and iterate through all 4 blocks of 4 rows each -#pragma GCC unroll 0 - for (int face = 0; face < 4; face++) { - sfpu::calculate_sfpu(ITERATIONS, param0, param1, param2, param3, param4, param5); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } - } +} + +inline void _llk_math_eltwise_unary_sfpu_done_() { math::clear_dst_reg_addr(); TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::WAIT_SFPU); math::clear_addr_mod_base(); } -template -inline void _llk_math_eltwise_unary_sfpu_init_( - uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { - eltwise_unary_sfpu_configure_addrmod< sfpu_op >(); - if constexpr (sfpu_op == SfpuType::dropout) { - sfpu::sfpu_init(sfpu_op, param2); - } else { - sfpu::sfpu_init(sfpu_op); - } +inline void _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_() { + math::inc_dst_addr<8>(); + math::inc_dst_addr<8>(); +} + +inline void _llk_math_eltwise_unary_sfpu_init_() { + eltwise_unary_sfpu_configure_addrmod(); math::reset_counters(p_setrwc::SET_ABD_F); } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h index 5ebaefe0d96..bf111343ccd 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_matmul.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel_include.h" #include "ckernel_template.h" @@ -15,15 +16,19 @@ using namespace ckernel; -template +template inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const 
std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false) { + constexpr int NUM_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC); constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0); + constexpr int FIDELITY_INCREMENT = high_fidelity ? get_math_fidelity_increment(MATH_FIDELITY_DESC) : 0; const bool is_in0_16x32 = (in0_tile_r_dim <=FACE_R_DIM) && (in0_tile_c_dim > FACE_C_DIM); const bool is_in0_32x16 = (in0_tile_r_dim > FACE_R_DIM) && (in0_tile_c_dim <= FACE_C_DIM); const bool is_in1_32x16 = (in1_tile_r_dim > FACE_R_DIM) && (in1_tile_c_dim <= FACE_C_DIM); + static_assert(FaceLayout == DstTileFaceLayout::RowMajor, "FaceLayout must be RowMajor"); + // MVMUL does D = B*A // Inner Loop --> 32/8 = 4 times for the full 32x16 face @@ -52,7 +57,7 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c .srca = {.incr = 0, .clr = 1, .cr = 1}, .srcb = {.incr = 0, .clr = 1, .cr = 1}, .dest = {.incr = 0, .clr = 1, .cr = 1}, - .fidelity = {.incr = high_fidelity, .clr = 0}, + .fidelity = {.incr = FIDELITY_INCREMENT, .clr = 0}, .bias = {.incr = 1}, } .set(ADDR_MOD_5); @@ -235,7 +240,7 @@ inline void matmul_configure_addrmod(const bool transpose, const std::uint32_t c } -template +template inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, const std::uint32_t rt_dim, const std::uint32_t kt_dim, const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false) { // in0 - loaded to SrcB @@ -358,10 +363,10 @@ inline void matmul_configure_mop(bool transpose, const std::uint32_t ct_dim, con tmp.program(instrn_buffer); } -template +template inline void _llk_math_matmul_init_(const std::uint32_t in0_tile_r_dim = TILE_R_DIM, const std::uint32_t in0_tile_c_dim = TILE_C_DIM, const 
std::uint32_t in1_tile_r_dim = TILE_R_DIM, const std::uint32_t in1_tile_c_dim = TILE_C_DIM, const bool partial_face = false, const std::uint32_t transpose=0, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) { - matmul_configure_addrmod(transpose, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face); + matmul_configure_addrmod(transpose, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face); const bool reuse_a = ct_dim>=rt_dim; const std::uint32_t t_dim = reuse_a ? rt_dim : ct_dim; if (t_dim>1) { @@ -374,11 +379,12 @@ inline void _llk_math_matmul_init_(const std::uint32_t in0_tile_r_dim = TILE_R_D TTI_SETC16(CLR_DVALID_SrcA_Disable_ADDR32, 0); } - matmul_configure_mop(transpose>0, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face); + constexpr int MATH_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC); + matmul_configure_mop(transpose>0, ct_dim, rt_dim, kt_dim, in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, in1_tile_c_dim, partial_face); math::reset_counters(p_setrwc::SET_ABD_F); } -template +template inline void _llk_math_matmul_(uint dst_index, const bool transpose=false, const std::uint32_t ct_dim=1, const std::uint32_t rt_dim=1, const std::uint32_t kt_dim=1) { const bool reuse_a = ct_dim>=rt_dim; const std::uint32_t t_dim = reuse_a ? 
rt_dim : ct_dim; diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h index 4c77069f857..66879b3ea5a 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_reduce.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel_include.h" #include "ckernel_template.h" @@ -18,16 +19,19 @@ inline void reduce_configure_addrmod(); template inline void reduce_configure_mop(); -template +template inline void _llk_math_reduce_(const uint dst_index) { - constexpr bool high_fidelity = num_fidelity_phases > 0 && num_fidelity_phases <= 4; + + constexpr int MATH_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC); + constexpr bool HIGH_FIDELITY = MATH_FIDELITY_PHASES > 0; + math::set_dst_write_addr(dst_index); if constexpr (dim == ReduceDim::REDUCE_ROW) { // Transpose for each face in src A done at unpacker, and pool if constexpr (type == PoolType::MAX) { TTI_GMPOOL(p_setrwc::CLR_AB, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); } else { - if constexpr (high_fidelity) { + if constexpr (HIGH_FIDELITY) { ckernel_template::run(instrn_buffer); TTI_CLEARDVALID(p_setrwc::CLR_AB, 0); } else { @@ -38,7 +42,7 @@ inline void _llk_math_reduce_(const uint dst_index) { if constexpr (type == PoolType::MAX) { TTI_GMPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); } else { - if constexpr (high_fidelity) { + if constexpr (HIGH_FIDELITY) { ckernel_template::run(instrn_buffer); } else { TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); @@ -97,7 +101,7 @@ inline void _llk_math_reduce_(const uint dst_index) { if constexpr (type == PoolType::MAX) { TTI_GMPOOL(p_setrwc::CLR_AB, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); } else { - if constexpr (high_fidelity) { + if constexpr (HIGH_FIDELITY) { 
ckernel_template::run(instrn_buffer); TTI_CLEARDVALID(p_setrwc::CLR_AB, 0); } else { @@ -108,7 +112,7 @@ inline void _llk_math_reduce_(const uint dst_index) { if constexpr (type == PoolType::MAX) { TTI_GMPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); } else { - if constexpr (high_fidelity) { + if constexpr (HIGH_FIDELITY) { ckernel_template::run(instrn_buffer); } else { TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); @@ -159,7 +163,7 @@ inline void _llk_math_reduce_(const uint dst_index) { if constexpr (type == PoolType::MAX) { TTI_GMPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); } else { - if constexpr (high_fidelity) { + if constexpr (HIGH_FIDELITY) { ckernel_template::run(instrn_buffer); } else { TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); @@ -171,7 +175,7 @@ inline void _llk_math_reduce_(const uint dst_index) { if constexpr (type == PoolType::MAX) { TTI_GMPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); } else { - if constexpr (high_fidelity) { + if constexpr (HIGH_FIDELITY) { ckernel_template::run(instrn_buffer); } else { TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); @@ -189,7 +193,7 @@ inline void _llk_math_reduce_(const uint dst_index) { if constexpr (type == PoolType::MAX) { TTI_GMPOOL(p_setrwc::CLR_AB, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 4); } else { - if constexpr (high_fidelity) { + if constexpr (HIGH_FIDELITY) { ckernel_template::run(instrn_buffer); TTI_CLEARDVALID(p_setrwc::CLR_AB, 0); } else { @@ -201,7 +205,7 @@ inline void _llk_math_reduce_(const uint dst_index) { if constexpr (type == PoolType::MAX) { TTI_GMPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 4); } else { - if constexpr (high_fidelity) { + if constexpr (HIGH_FIDELITY) { ckernel_template::run(instrn_buffer); } else { 
TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 4); @@ -230,8 +234,8 @@ inline void _llk_math_reduce_(const uint dst_index) { if constexpr (type == PoolType::MAX) { TTI_GMPOOL(p_setrwc::CLR_AB, p_gpool::DIM_16X16, ADDR_MOD_0, p_gpool::INDEX_DIS, 0); } else { - if constexpr (high_fidelity) { - for (int i = 0; i < num_fidelity_phases - 1; i++) { + if constexpr (HIGH_FIDELITY) { + for (int i = 0; i < MATH_FIDELITY_PHASES - 1; i++) { TTI_GAPOOL(p_setrwc::CLR_NONE, p_gpool::DIM_16X16, ADDR_MOD_3, p_gpool::INDEX_DIS, 0); } } @@ -240,8 +244,13 @@ inline void _llk_math_reduce_(const uint dst_index) { } } -template +template inline void reduce_configure_addrmod() { + + constexpr int NUM_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC); + constexpr int FIDELITY_INCREMENT = get_math_fidelity_increment(MATH_FIDELITY_DESC); + constexpr bool HIGH_FIDELITY = NUM_FIDELITY_PHASES > 0; + addr_mod_t{ .srca = {.incr = 0 }, .srcb = {.incr = 0 }, @@ -262,12 +271,12 @@ inline void reduce_configure_addrmod() { } .set(ADDR_MOD_2); - if constexpr (is_high_fidelity) { + if constexpr (HIGH_FIDELITY) { addr_mod_t{ .srca = {.incr = 0}, .srcb = {.incr = 0}, .dest = {.incr = 0}, - .fidelity = { .incr = 1} + .fidelity = { .incr = FIDELITY_INCREMENT} }.set(ADDR_MOD_3); } } @@ -293,14 +302,15 @@ inline void reduce_configure_mop() { } } -template +template inline void _llk_math_reduce_init_(const std::uint32_t within_face_16x16_transpose=0) { //within_face_16x16_transpose used for unpack, ignored by math - constexpr bool high_fidelity = num_fidelity_phases > 0 && num_fidelity_phases <= 4; + constexpr int MATH_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC); + constexpr bool HIGH_FIDELITY = MATH_FIDELITY_PHASES > 0; - reduce_configure_addrmod(); - if constexpr (high_fidelity) { - reduce_configure_mop(); + reduce_configure_addrmod(); + if constexpr (HIGH_FIDELITY) { + reduce_configure_mop(); } 
TTI_SETC16(CLR_DVALID_SrcA_Disable_ADDR32, 0); diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h index 7df83739dc9..cb4a26dbf1a 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "llk_defs.h" @@ -207,3 +208,5 @@ inline void _llk_pack_(const std::uint32_t tile_index, const std::uint32_t addre TTI_PACR(ADDR_MOD_2, 0, 0xf, 0, 0, 1, 1); // close tile } } + +#include "llk_pack_untilize.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h index 88dbdb186a9..5f796f4c5b3 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" @@ -187,8 +188,8 @@ inline void _llk_pack_reduce_mask_config_() { // We initialize PCK_EDGE_OFFSET_SEC0 mask to clear out all the datums in the row pack_edge_offset.f.mask = 0x0; - uint32_t row_set_mapping_1; - uint32_t edge_offset_sec1_mask; + uint32_t row_set_mapping_1 = 0; + uint32_t edge_offset_sec1_mask = 0; if constexpr (dim == ReduceDim::REDUCE_ROW) { // PCK_EDGE_OFFSET_SEC1 mask will clear out all the datums in the row except the first one @@ -223,6 +224,24 @@ inline void _llk_pack_reduce_mask_config_() { // TILE_ROW_SET_MAPPING_1 configuration sets only first row to use PCK_EDGE_OFFSET_SEC1 mask row_set_mapping_1 = 0x00000001; // each packer packs 1x16 row } + } else if constexpr (dim == ReduceDim::REDUCE_SCALAR) { + // PCK_EDGE_OFFSET_SEC1 mask will clear out all the datums in the row except the first one + edge_offset_sec1_mask = 0x0001; + if constexpr (untilize) { + pack_edge_offset.f.tile_row_set_select_pack0 = 1; + pack_edge_offset.f.tile_row_set_select_pack1 = 1; + 
pack_edge_offset.f.tile_row_set_select_pack2 = 1; + pack_edge_offset.f.tile_row_set_select_pack3 = 1; + row_set_mapping_1 = 0x00000005; + } else { + // Packer 0 and 2 will use TILE_ROW_SET_MAPPING_1, while packer 1 and 3 will keep using + // TILE_ROW_SET_MAPPING_0 configuration which is the default one + pack_edge_offset.f.tile_row_set_select_pack0 = 1; + pack_edge_offset.f.tile_row_set_select_pack2 = 1; + + // TILE_ROW_SET_MAPPING_1 configuration sets all rows to use PCK_EDGE_OFFSET_SEC1 mask + row_set_mapping_1 = 0x00000001; + } } // Initialize TMP registers with values we need to write in CFG registers diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_untilize.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_untilize.h new file mode 100644 index 00000000000..a1defc58dde --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_untilize.h @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + + +#pragma once +#include "llk_defs.h" + +#include "ckernel.h" +#include "ckernel_template.h" +#include "llk_pack_common.h" +#include "ckernel_globals.h" + +using namespace ckernel; +using namespace ckernel::packer; + +inline void _llk_pack_untilize_configure_addrmod_() { + + addr_mod_pack_t{ + .y_src = {.incr = 15}, // 4-bit value so max is 15. 
incadcxy will increment it by 1 + } + .set(ADDR_MOD_0); + + addr_mod_pack_t{ + .y_src = { .incr = 0, .clr = 0, .cr = 1 }, + }.set(ADDR_MOD_1); + + addr_mod_pack_t{ + .y_src = { .incr = 0, .clr = 1, .cr = 0 }, + }.set(ADDR_MOD_2); + +} + +template +inline void _llk_pack_untilize_mop_config_() { + const uint PACKCNT = 4; + constexpr uint MEGAROW = 1; + constexpr uint ZERO_OUTPUT_FLAG = p_pacr::P_ZERO_OUTPUT_DISABLED; + constexpr uint MOP_INNER_LOOP = 1; + + constexpr uint MOP_OUTER_LOOP = block_ct_dim; + + // Inc ch0_y+=1 (addr_mod_0 will increment by 15) + ckernel::ckernel_template tmp(MOP_OUTER_LOOP, MOP_INNER_LOOP, TT_OP_INCADCXY(p_setadc::PAC, 0, 0, 1, 0)); + tmp.set_start_op(TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0)); + tmp.set_end_ops(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0), + TT_OP_INCADCZW(p_setadc::PAC, 0, 0, 1, 0)); // w cnt points to the next tile + tmp.program(instrn_buffer); +} + +template +inline void _llk_pack_untilize_init_() { + + _llk_pack_untilize_configure_addrmod_(); + + _llk_pack_untilize_mop_config_(); +} + +template +inline void _llk_pack_untilize_(const std::uint32_t address, const std::uint32_t pack_dst_format) { + + program_packer_untilized_destination(address, pack_dst_format); + + for (std::uint32_t row=0; row +template inline void _llk_unpack_A_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) { constexpr bool is_row_pool = false; - configure_unpack_AB( + constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All); + constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu); + constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack); + configure_unpack_AB( unpack_src_format, unpack_src_format, unpack_dst_format, @@ -154,7 +158,7 @@ 
inline void _llk_unpack_A_hw_configure_(const std::uint32_t unpack_src_format, c template inline void _llk_unpack_A_init_(const std::uint32_t transpose_of_faces=0, const std::uint32_t within_face_16x16_transpose=0, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4, const std::uint32_t unpack_src_format = 0, const std::uint32_t unpack_dst_format = 0) { constexpr std::uint32_t UNP_SEL = (BType == BroadcastType::NONE) ? p_setadc::UNP_A : p_setadc::UNP_B; - config_face_dim(face_r_dim); + config_unpacker_x_end(face_r_dim); _llk_unpack_A_mop_config_(transpose_of_faces>0, num_faces, unpack_src_format, unpack_dst_format); } @@ -195,7 +199,7 @@ inline void _llk_unpack_A_(const std::uint32_t address, const bool transpose_of_ } if constexpr (unpack_to_dest) { - if (unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) { + if (is_32bit_input(unpack_src_format, unpack_dst_format)) { set_dst_write_addr(unp_cfg_context, unpack_dst_format); wait_for_dest_available(); } @@ -208,7 +212,7 @@ inline void _llk_unpack_A_(const std::uint32_t address, const bool transpose_of_ t6_semaphore_get(semaphore::UNPACK_SYNC); if (unpack_to_dest) { - if (unpacker::is_32bit_input(unpack_src_format, unpack_dst_format)) { + if (is_32bit_input(unpack_src_format, unpack_dst_format)) { unpack_to_dest_tile_done(unp_cfg_context); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h index 0f6d54f2909..fb10f53d08e 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" #include "ckernel_defs.h" @@ -70,10 +71,13 @@ inline void _llk_unpack_AB_mop_config_(const bool transpose_of_faces=false, cons } -template +template inline void _llk_unpack_AB_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t 
unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) { constexpr bool is_row_pool = false; - configure_unpack_AB( + constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All); + constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu); + constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack); + configure_unpack_AB( unpA_src_format, unpB_src_format, unpA_dst_format, @@ -91,7 +95,7 @@ inline void _llk_unpack_AB_init_(const std::uint32_t face_r_dim=FACE_R_DIM, cons cfg_reg_rmw_tensix(transpose); // transpose within the face constexpr std::uint32_t UNP_SEL = p_setadc::UNP_AB; - config_face_dim(face_r_dim); + config_unpacker_x_end(face_r_dim); _llk_unpack_AB_mop_config_(transpose>0, num_faces, narrow_tile); // transpose of faces 0,2,1,3 } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h index 4578126b9e8..6759e3b3065 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_AB_matmul.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" #include "ckernel_defs.h" @@ -90,12 +91,15 @@ inline void _llk_unpack_AB_matmul_mop_config_(const bool transpose, const std::u } -template +template inline void _llk_unpack_AB_matmul_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format, const std::uint32_t unpA_face_r_dim = FACE_R_DIM, const std::uint32_t unpB_face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t unpA_num_faces = 4, const std::uint32_t unpB_num_faces = 4, const std::uint32_t 
unpA_tile_size = 0, const std::uint32_t unpB_tile_size = 0) { constexpr bool is_row_pool = false; + constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All); + constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu); + constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack); - configure_unpack_AB( + configure_unpack_AB( unpA_src_format, unpB_src_format, unpA_dst_format, @@ -136,7 +140,7 @@ __attribute__((always_inline)) inline void _llk_unpack_AB_matmul_init_(const std if (partial_face) { // Do face by face unpacking. Need to program correct face dim // to compute address of the next face - config_face_dim(unpB_face_r_dim); + config_unpacker_x_end(unpB_face_r_dim); } else { // Do full tile unpacking. No need to program face dim // as address counter pointing to the face is not incremented diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h index 92222ddaaa3..cb747267e34 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_common.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" @@ -38,7 +39,7 @@ void _llk_zero_buffer_(const std::uint32_t base_address, const std::uint32_t siz template inline void _llk_unpack_get_tile_(std::uint32_t address, std::uint32_t *p_tile) { - std::uint32_t byte_address = (address + TILE_HEADER_SIZE)<<4; + std::uint32_t byte_address = (address)<<4; if constexpr (mail2math) { mailbox_write(ThreadId::MathThreadId, byte_address); @@ -66,29 +67,28 @@ inline void _llk_unpack_debug_dump_seek_(std::uint8_t offset) { debug_dump_seek(offset); } -template -inline void _llk_unpack_reconfig_data_format_srca_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t 
num_faces = 4) +inline void _llk_unpack_config_tile_dim_srca_impl_(const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) { - if constexpr(is_tile_dim_reconfig_en) { - const uint face_dim = face_r_dim*FACE_C_DIM; + cfg_reg_rmw_tensix(num_faces); + config_unpacker_0_face_dim(face_r_dim); +} - cfg_reg_rmw_tensix(num_faces); - cfg_reg_rmw_tensix(face_dim | face_dim << 16); - } +inline void _llk_unpack_config_tile_dim_srcb_impl_(const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) +{ + const uint face_dim = face_r_dim*FACE_C_DIM; + cfg_reg_rmw_tensix(face_dim); + cfg_reg_rmw_tensix(num_faces); +} + +inline void _llk_unpack_reconfig_data_format_srca_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size) +{ cfg_reg_rmw_tensix(unpack_src_format); cfg_reg_rmw_tensix(unpack_dst_format); TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_A)); // update gpr which holds tile size A } -template -inline void _llk_unpack_reconfig_data_format_srcb_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) +inline void _llk_unpack_reconfig_data_format_srcb_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size) { - if constexpr(is_tile_dim_reconfig_en) { - const uint face_dim = face_r_dim*FACE_C_DIM; - - cfg_reg_rmw_tensix(face_r_dim*FACE_C_DIM); - cfg_reg_rmw_tensix(num_faces); - } cfg_reg_rmw_tensix(unpack_src_format); cfg_reg_rmw_tensix(unpack_dst_format); TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_B)); // update gpr which holds tile size B @@ -100,5 +100,5 @@ inline void _llk_unpack_dbg_feature_disable_(){ } inline void _llk_enable_int8_fpu_math_() { - enalbe_int8_fpu_math(); + enable_int8_fpu_math(); } diff --git 
a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h index 8f0ea52e4fa..0fdaae9df61 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_reduce.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" #include "ckernel_defs.h" @@ -40,12 +41,15 @@ inline void _llk_unpack_reduce_mop_config_() { tmp.program(instrn_buffer); } -template +template inline void _llk_unpack_reduce_hw_configure_(const std::uint32_t unpA_src_format, const std::uint32_t unpB_src_format, const std::uint32_t unpA_dst_format, const std::uint32_t unpB_dst_format, const std::uint32_t unpA_face_r_dim = FACE_R_DIM, const std::uint32_t unpB_face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t unpA_num_faces = 4, const std::uint32_t unpB_num_faces = 4) { constexpr bool is_row_pool = true; + constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All); + constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu); + constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack); - configure_unpack_AB( + configure_unpack_AB( unpA_src_format, unpB_src_format, unpA_dst_format, diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h index ae1b22d830e..b695e2f296a 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_tilize.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" #include "ckernel_defs.h" @@ -30,12 +31,15 @@ inline void _llk_unpack_tilize_mop_config_(const bool narrow_tile=false) { tmp.program(instrn_buffer); } -template +template inline void _llk_unpack_tilize_hw_configure_(const std::uint32_t unpack_src_format, 
const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) { constexpr bool is_row_pool = false; + constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All); + constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu); + constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack); - configure_unpack_AB( + configure_unpack_AB( unpack_src_format, unpack_src_format, unpack_dst_format, diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h index 723f9716c88..f67b72f5cee 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_unpack_untilize.h @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "ckernel.h" #include "ckernel_defs.h" @@ -48,10 +49,13 @@ inline void _llk_unpack_untilize_mop_config_() { tmp.program(instrn_buffer); } -template +template inline void _llk_unpack_untilize_hw_configure_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t within_face_16x16_transpose = 0, const std::uint32_t num_faces = 4) { constexpr bool is_row_pool = false; - configure_unpack_AB( + constexpr bool stoch_rnd_en = (stoch_rnd_mode == StochRndType::All); + constexpr bool fpu_srnd_en = stoch_rnd_en || (stoch_rnd_mode == StochRndType::Fpu); + constexpr bool pack_srnd_en = stoch_rnd_en ||(stoch_rnd_mode == StochRndType::Pack); + configure_unpack_AB( unpack_src_format, unpack_src_format, unpack_dst_format, diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h index ff64fb27b2d..513a0a15972 100644 --- 
a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h @@ -40,7 +40,7 @@ inline void llk_math_matmul_init( rt_dim, kt_dim); #else - _llk_math_matmul_init_( + _llk_math_matmul_init_( in0_tile_r_dim, in0_tile_c_dim, in1_tile_r_dim, @@ -63,6 +63,6 @@ inline void llk_math_matmul( #ifdef ARCH_GRAYSKULL _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); #else - _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); + _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); #endif } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h index 17bba18f12a..898788ca415 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h @@ -13,36 +13,6 @@ namespace ckernel { * LLK ELTWISE UNARY SFPU *************************************************************************/ -template -inline void llk_math_eltwise_unary_sfpu( - uint dst_index, - int vector_mode = (int)Dim::RC, - uint param0 = 0, - uint param1 = 0, - uint param2 = 0, - uint param3 = 0, - uint param4 = 0, - uint param5 = 0) { - - const std::uint32_t operand_id = get_operand_id(0); - const std::uint32_t num_faces = get_operand_num_faces(0); - const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); - - _llk_math_eltwise_unary_sfpu_( - face_r_dim, - num_faces, - dst_index, - vector_mode, - param0, - param1, - param2, - param3, - param4, - param5 - ); -} - - // New LLK SFPU APIs template inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) { diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h index 1e57d003cfc..249f62bc71b 100644 --- 
a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h @@ -272,3 +272,36 @@ inline void llk_pack_reduce_mask_config() { inline void llk_pack_reduce_mask_clear() { _llk_pack_reduce_mask_clear_(); } + +// FIXME-WH-UPLIFT +template +inline void llk_pack_reduce_config_v2(uint32_t icb_out) { + + const bool untilize = false; + if constexpr (at_kernel_start) { + + const std::uint32_t output_id = get_output_id(icb_out); + const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); + const std::uint32_t num_faces = get_output_num_faces(output_id); + const bool partial_face = get_output_partial_face(output_id); + const bool narrow_tile = get_output_narrow_tile(output_id); + const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; + const llk_relu_config_u relu_config = {.f = {.ApplyRelu = (std::uint32_t)ReluType::NO_RELU, .Threshold = 0,}}; + + _llk_pack_hw_configure_( + pack_src_format[output_id], + pack_dst_format[output_id], + tile_size, + face_r_dim, + num_faces, + partial_face, + narrow_tile, + relu_config.val + ); + } else { + TTI_STALLWAIT(p_stall::STALL_PACK, p_stall::PACK); + tensix_sync(); + } + + _llk_pack_reduce_mask_config_(); +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h index eb7928d3090..e8bbec37fc6 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h @@ -28,7 +28,7 @@ inline void calculate_elu(uint slope) vFloat s = c_slope.f; #pragma GCC unroll 0 - for (int d = 0; d < WHB0_ITERATIONS; d++) { + for (int d = 0; d < 8; d++) { vFloat v = dst_reg[0]; v_if (v < 0.0f) { diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h 
b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h index 42d4a30a7ce..cec5879a69a 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h @@ -33,7 +33,7 @@ sfpi_inline vFloat calculate_erf_body(vFloat x) { // TODO: Fix assertion error for accurate mode template inline void calculate_erf() { - for (int d = 0; d < WHB0_ITERATIONS; d++) { + for (int d = 0; d < 8; d++) { // SFPU microcode: vFloat x = dst_reg[0]; v_if(x < 0.0f) { @@ -51,7 +51,7 @@ inline void calculate_erf() { template inline void calculate_erfc() { // SFPU microcode: - for (int d = 0; d < WHB0_ITERATIONS; d++) { + for (int d = 0; d < 8; d++) { vFloat x = dst_reg[0]; v_if(x < 0.0f) { x = 1.0 + (calculate_erf_body(x)); } v_else { x = 1.0 - (calculate_erf_body(x)); } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h index 56d1a98cb69..ea77be75900 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h @@ -59,7 +59,7 @@ template inline void calculate_erfinv() { // SFPU microcode - for (int d = 0; d < WHB0_ITERATIONS; d++) + for (int d = 0; d < 8; d++) { vFloat v = dst_reg[0]; v_if (v == 1.0f) { diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h index 6301e66ba55..a60ef1c4628 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h @@ -89,16 +89,16 @@ template void gelu_init() { vConstFloatPrgm0 = 0.5f; if constexpr (APPROXIMATION_MODE) { - sfpu_load_imm32(0,0x37E7322B); - 
//sfpu_load_imm32(4,0xB122A3AE); - sfpu_load_imm32(4,0xB12286D8); + _sfpu_load_imm32_(0,0x37E7322B); + //_sfpu_load_imm32_(4,0xB122A3AE); + _sfpu_load_imm32_(4,0xB12286D8); - sfpu_load_imm32(1,0x38E138F3); - sfpu_load_imm32(5,0xB437B479); + _sfpu_load_imm32_(1,0x38E138F3); + _sfpu_load_imm32_(5,0xB437B479); - sfpu_load_imm32(2,0x38003852); - sfpu_load_imm32(6,0x7c00afa4); + _sfpu_load_imm32_(2,0x38003852); + _sfpu_load_imm32_(6,0x7c00afa4); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h index 5aaac5b8000..b8c818c8bbe 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h @@ -19,7 +19,7 @@ inline void calculate_i0() { #pragma GCC unroll 0 - for (int d = 0; d < WHB0_ITERATIONS; d++) + for (int d = 0; d < 8; d++) { vFloat result = 0.0f; vFloat input = dst_reg[0]; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h index 03b72c8962d..4aaadf3c305 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h @@ -19,7 +19,7 @@ template inline void calculate_logical_not_unary() { #pragma GCC unroll 0 - for (int d = 0; d < WHB0_ITERATIONS; d++) { + for (int d = 0; d < 8; d++) { vFloat v = dst_reg[0]; v_if (v == 0) { dst_reg[0] = 1.0f; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h index f75819f4239..4ab63536c78 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h +++ 
b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h @@ -20,7 +20,7 @@ template inline void relu_min(uint uint_threshold) { vFloat threshold = Converter::to_float(uint_threshold); - for (int d = 0; d < WHB0_ITERATIONS; d++) + for (int d = 0; d < 8; d++) { vFloat a = dst_reg[0]; v_if(a < threshold) { @@ -37,7 +37,7 @@ template inline void relu_max(uint uint_threshold) { vFloat threshold = Converter::to_float(uint_threshold); - for (int d = 0; d < WHB0_ITERATIONS; d++) + for (int d = 0; d < 8; d++) { vFloat a = dst_reg[0]; v_if(a > threshold) { @@ -62,7 +62,7 @@ inline void calculate_lrelu(uint slope) vFloat s = c_slope.f; #pragma GCC unroll 0 - for (int d = 0; d < WHB0_ITERATIONS; d++) { + for (int d = 0; d < 8; d++) { vFloat v = dst_reg[0]; v_if (v < 0.0f) { diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h index f1e7d19acc8..cc08a9f346c 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h @@ -5,6 +5,7 @@ #pragma once #include +#include "llk_sfpu_types.h" #include "ckernel_globals.h" #include "ckernel_include.h" #include "ckernel_template.h" @@ -16,3 +17,157 @@ #include "llk_math_eltwise_unary_sfpu.h" using namespace ckernel; +using namespace ckernel::sfpu; +namespace ckernel { + +/************************************************************************* + * LLK ELTWISE UNARY SFPU + *************************************************************************/ + +template < + SfpuType operation, + bool APPROXIMATION_MODE, + int SfpuType_PARAM = 0, + int ITERATIONS = 8, + bool IS_INT_SFPU_EN = false> +inline void llk_math_calculate_sfpu( + const int iterations = ITERATIONS, + uint param0 = 0, + uint param1 = 0, + 
uint param2 = 0, + uint param3 = 0, + uint param4 = 0, + uint param5 = 0) { + if constexpr (operation == SfpuType::exp_with_base) { + constexpr bool zero_negative = true; + _calculate_exponential_(iterations, param0); + } else if constexpr (operation == SfpuType::tanh) { + _calculate_tanh_(iterations); + } else if constexpr (operation == SfpuType::hardtanh) { + _calculate_hardtanh_(iterations, param0, param1, param2); + } else if constexpr (operation == SfpuType::rsqrt) { + // param0 = true -> approximate fast mode + // false -> high precision mode + // The algorithm uses Newton's method based on no.of iteration better approximation can be calculated + if (param0) { + calculate_rsqrt(); + } else { + calculate_rsqrt(); + } + } else if constexpr (operation == SfpuType::sigmoid) { + calculate_sigmoid(); + } else if constexpr (operation == SfpuType::sigmoid_appx) { + calculate_sigmoid_appx(); + } else if constexpr (operation == SfpuType::tanh_derivative) { + calculate_tanh_derivative(); + } else if constexpr (operation == SfpuType::dropout) { + calculate_dropout(param0, param1); + } else if constexpr (operation == SfpuType::power) { + calculate_power_iterative(param0); + } else if constexpr (operation == SfpuType::square) { + calculate_square(); + } else if constexpr (operation == SfpuType::log) { + calculate_log(param0); + } else if constexpr (operation == SfpuType::log_with_base) { + calculate_log(param0); + } else if constexpr ( + (operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) || + (operation == SfpuType::less_than_zero) || (operation == SfpuType::greater_than_equal_zero) || + (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) { + calculate_comp(8); // BFLOAT16 - exp + } else if constexpr (operation == SfpuType::clamp) { + calculate_clamp(param0, param1, param2); + } else if constexpr (operation == SfpuType::abs) { + calculate_abs(); + } else if constexpr (operation == SfpuType::sign) { + 
calculate_sign(); + } else if constexpr (operation == SfpuType::max) { + calculate_max(); + } else if constexpr (operation == SfpuType::min) { + calculate_min(); + } else if constexpr (operation == SfpuType::exp2) { + calculate_exp2(); + } else if constexpr (operation == SfpuType::heaviside) { + calculate_heaviside(param0); + } else if constexpr (operation == SfpuType::expm1) { + calculate_expm1(); + } else if constexpr (operation == SfpuType::asin) { + calculate_asin(); + } else if constexpr (operation == SfpuType::acos) { + calculate_acos(); + } else if constexpr (operation == SfpuType::atan) { + calculate_atan(); + } else if constexpr (operation == SfpuType::signbit) { + calculate_signbit(); + } else if constexpr (operation == SfpuType::silu) { + calculate_silu(); + } else if constexpr (operation == SfpuType::mask) { + calculate_mask(); + } else if constexpr (operation == SfpuType::negative) { + calculate_negative(); + } +} + +template +inline void llk_math_eltwise_unary_sfpu( + uint dst_index, + int vector_mode = (int)Dim::RC, + uint param0 = 0, + uint param1 = 0, + uint param2 = 0, + uint param3 = 0, + uint param4 = 0, + uint param5 = 0) { + const std::uint32_t operand_id = get_operand_id(0); // Fix to operand 0. assume no tiny-tile support + const std::uint32_t num_faces = get_operand_num_faces(operand_id); + const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + + constexpr int ITERATIONS = 8; + + _llk_math_eltwise_unary_sfpu_start_(dst_index); + + if (vector_mode == (int)Dim::R) { + // Do a row vector, Face0 + Face1 -- first iteration (first row) + const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 
2 : face_r_dim / 2) + : 2; // At least 2 iterations for odd and even columns +#pragma GCC unroll 0 + for (int face = 0; face < 2; face++) { + llk_math_calculate_sfpu( + iterations, param0, param1, param2, param3, param4, param5); + // Move to the next face + _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); + } + // Skip next two faces + _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); + _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); + } else if (vector_mode == (int)Dim::C) { + // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for + // full face +#pragma GCC unroll 0 + for (int face = 0; face < 2; face++) { + llk_math_calculate_sfpu( + ITERATIONS, param0, param1, param2, param3, param4, param5); + _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); + if (num_faces > 2) { // Skip next face if tile is 32x32 + _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); + } + } + if (num_faces <= 2) { + // Skip next two faces + _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); + _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); + } + } else { + // Do all four faces, and iterate through all 4 blocks of 4 rows each +#pragma GCC unroll 0 + for (int face = 0; face < 4; face++) { + llk_math_calculate_sfpu( + ITERATIONS, param0, param1, param2, param3, param4, param5); + // Move to the next face + _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); + } + } + _llk_math_eltwise_unary_sfpu_done_(); +} + +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h index b82b1f39cb4..e43682ab4f2 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h @@ -30,16 +30,41 @@ inline void llk_math_eltwise_unary_sfpu_init(void 
(*func)()) { template inline void llk_math_eltwise_unary_sfpu_init( - uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { - - _llk_math_eltwise_unary_sfpu_init_( - param0, - param1, - param2, - param3, - param4, - param5 - ); + const uint param0 = 0, const uint param1 = 0, const uint param2 = 0, const uint param3 = 0, const uint param4 = 0, const uint param5 = 0) { + _llk_math_eltwise_unary_sfpu_init_(); + + switch (sfpu_op) { + case SfpuType::reciprocal: + sfpu::_init_reciprocal_(); + break; + case SfpuType::exponential: + sfpu::_init_exponential_(); + break; + case SfpuType::log: + sfpu::_init_log_(); + break; + case SfpuType::sqrt: + sfpu::_init_sqrt_(); + break; + case SfpuType::tanh: + case SfpuType::tanh_derivative: + sfpu::_init_tanh_(); + break; + case SfpuType::sigmoid: + sfpu::_init_sigmoid_(); + break; + case SfpuType::gelu_derivative: + sfpu::_init_gelu_derivative_(); + break; + case SfpuType::gelu: + sfpu::_init_gelu_(); + break; + case SfpuType::dropout: + sfpu::_init_dropout_(param2); + break; + default: + break; + } } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h index 72c27cde02b..4c059e37585 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h @@ -4,6 +4,7 @@ #pragma once +#include "llk_sfpu_types.h" #include "ckernel_defs.h" #include "ckernel_sfpu.h" #include "ckernel.h" @@ -338,7 +339,7 @@ inline void calculate_comp(uint exponent_size_8) //a[i] == 0 if constexpr(COMP_MODE == SfpuType::equal_zero) { - v_if (sfpu_is_fp16_zero(v, exponent_size_8)) { + v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) { v = one; } v_else { v = zero; @@ -348,7 +349,7 @@ inline void calculate_comp(uint exponent_size_8) //a[i] != 0 if constexpr(COMP_MODE == 
SfpuType::not_equal_zero) { - v_if (sfpu_is_fp16_zero(v, exponent_size_8)) { + v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) { v = zero; } v_else { v = one; @@ -774,5 +775,20 @@ inline void calculate_silu() } } +template +inline void calculate_mask() +{ + bool exponent_size_8 = true; + for (int d = 0; d < ITERATIONS; d++) + { + vFloat mask = dst_reg[32]; + v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) { + dst_reg[0] = 0; + } + v_endif; + dst_reg++; + } +} + } // namespace sfpu } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h new file mode 100644 index 00000000000..ce290b5ef6c --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +enum SfpuType { + tanh, + hardtanh, + gelu, + exponential, + exp_with_base, + sigmoid, + reciprocal, + sqrt, + lrelu, + power, + square, + tanh_derivative, + log, + log_with_base, + equal_zero, + not_equal_zero, + less_than_zero, + greater_than_equal_zero, + less_than_equal_zero, + greater_than_zero, + clamp, + gelu_derivative, + dropout, + abs, + sign, + max, + sine, + cosine, + tan, + relu_max, + relu_min, + cast_fp32_to_fp16a, + sigmoid_appx, + gelu_appx, + elu, + min, + exp2, + heaviside, + expm1, + signbit, + asin, + acos, + atan, + erf, + erfc, + rsqrt, + isfinite, + isinf, + isposinf, + isneginf, + isnan, + logical_not_unary, + erfinv, + i0, + silu, + mask, + negative, + dequant_int32, + requant_int32, + quant_int32, + unused, +}; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h index bce909a4395..420cb04a9d3 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_api.h 
@@ -10,7 +10,7 @@ * LLK UNPACK AB *************************************************************************/ -template +template inline void llk_unpack_AB_hw_configure( const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) { // In0 -> unpA @@ -34,7 +34,7 @@ inline void llk_unpack_AB_hw_configure( num_faces); } -template +template inline void llk_unpack_AB_hw_configure_disaggregated( const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0) { const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand}; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h index 68eca79f4e9..bab5b81e885 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_AB_matmul_api.h @@ -10,7 +10,7 @@ * LLK UNPACK AB MATMUL *************************************************************************/ -template +template inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) { const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca; @@ -41,7 +41,7 @@ inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_ cb_interface[unpB_operand_id].fifo_page_size); } -template +template inline void llk_unpack_AB_matmul_hw_configure_disaggregated( const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) { const llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = { diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h index e8918793baa..26f943ec1af 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h +++ 
b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_A_api.h @@ -10,7 +10,7 @@ * LLK UNPACK A *************************************************************************/ -template +template inline void llk_unpack_A_hw_configure( const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) { const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand); @@ -25,7 +25,7 @@ inline void llk_unpack_A_hw_configure( unpA_num_faces); } -template +template inline void llk_unpack_A_hw_configure_disaggregated( const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) { const llk_unpack_A_params_t unpack_A_params = {.unpA_operand = unpA_operand}; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h index 6b61452722a..114d6f79389 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_common_api.h @@ -51,12 +51,10 @@ inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_op const std::uint32_t srca_operand_id = get_operand_id(srca_new_operand); const std::uint32_t num_faces = get_operand_num_faces(srca_operand_id); const std::uint32_t face_r_dim = get_operand_face_r_dim(srca_operand_id); - _llk_unpack_reconfig_data_format_srca_impl_( + _llk_unpack_reconfig_data_format_srca_impl_( unpack_src_format[srca_operand_id], unpack_dst_format[srca_operand_id], - cb_interface[srca_operand_id].fifo_page_size, - face_r_dim, - num_faces); + cb_interface[srca_operand_id].fifo_page_size); } template @@ -64,12 +62,10 @@ inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_op std::uint32_t srcb_operand_id = get_operand_id(srcb_new_operand); const std::uint32_t num_faces = get_operand_num_faces(srcb_operand_id); const std::uint32_t face_r_dim = get_operand_face_r_dim(srcb_operand_id); - 
_llk_unpack_reconfig_data_format_srcb_impl_( + _llk_unpack_reconfig_data_format_srcb_impl_( unpack_src_format[srcb_operand_id], unpack_dst_format[srcb_operand_id], - cb_interface[srcb_operand_id].fifo_page_size, - face_r_dim, - num_faces); + cb_interface[srcb_operand_id].fifo_page_size); } template diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h index afa60f7947b..433e33184ec 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_reduce_api.h @@ -10,7 +10,7 @@ * LLK UNPACK REDUCE *************************************************************************/ -template +template inline void llk_unpack_reduce_hw_configure( const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) { @@ -46,7 +46,7 @@ inline void llk_unpack_reduce_hw_configure( } } -template +template inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) { const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand}; llk_unpack_reduce_hw_configure(&unpack_reduce_params, mult); diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h index 71eeb6a0ba2..b0c3f50cca7 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_tilize_api.h @@ -14,7 +14,7 @@ template inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) { constexpr bool within_face_16x16_transpose = false; - constexpr StochRndMode stoch_rnd_mode = StochRndMode::None; + constexpr StochRndType stoch_rnd_mode = StochRndType::None; const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand); const 
uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h index 5a135ad8903..16751995c93 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_unpack_untilize_api.h @@ -13,7 +13,7 @@ template inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) { constexpr bool is_row_pool = false; constexpr bool within_face_16x16_transpose = false; - constexpr StochRndMode stoch_rnd_mode = StochRndMode::None; + constexpr StochRndType stoch_rnd_mode = StochRndType::None; const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand); const uint32_t unpA_num_faces = 4; diff --git a/tt_metal/hw/inc/debug/dprint_tile.h b/tt_metal/hw/inc/debug/dprint_tile.h index 1b650924d9c..76b23d752d5 100644 --- a/tt_metal/hw/inc/debug/dprint_tile.h +++ b/tt_metal/hw/inc/debug/dprint_tile.h @@ -5,7 +5,7 @@ #pragma once #include "hostdevcommon/dprint_common.h" - +#include "llk_io.h" struct SliceRange { // A slice object encoding semantics of np.slice(h0:h1:hs, w0:w1:ws) From 8c58fb12a38208b5f55530f552ca43041675af33 Mon Sep 17 00:00:00 2001 From: acejkov Date: Mon, 4 Dec 2023 20:58:47 +0000 Subject: [PATCH 05/16] #3908: Fixtypo in llk_operands.h and llk_outputs.h to get *dst* format instead of src. 
Added revert mode to remove edge mask config which was missing --- .../wormhole_b0/common/src/ckernel.cc | 221 ------------- .../wormhole_b0/common/src/ckernel_main.cc | 21 -- .../common/src/ckernel_perf_unpack_pack.cc | 301 ------------------ .../wormhole_b0/common/src/ckernel_unity.cc | 9 - .../wormhole_b0/common/src/fwlog_list | 1 - .../wormhole_b0/llk_lib/llk_pack_common.h | 21 +- .../wormhole_b0/llk_lib/llk_pack_shifted.h | 202 ------------ .../wormhole_b0/metal/llk_api/llk_pack_api.h | 9 +- .../wormhole_b0/metal/llk_io/llk_operands.h | 2 +- .../wormhole_b0/metal/llk_io/llk_outputs.h | 2 +- 10 files changed, 13 insertions(+), 776 deletions(-) delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_shifted.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc deleted file mode 100644 index 3db907d6b99..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel.cc +++ /dev/null @@ -1,221 +0,0 @@ - -#include "ckernel.h" -#include "ckernel_addr_map.h" -#include "ckernel_pcbuf.h" -#include "ckernel_main.h" -#include "ckernel_globals.h" -#include -#include -#ifdef PERF_DUMP -#include "ckernel_perf_unpack_pack.h" -#include "ckernel_perf_math.h" -#endif - -namespace ckernel -{ - -enum class ttRiscCores : std::uint32_t { Unpack = 0, Math = 1, Pack = 2, Brisc = 3, Nrisc = 4}; - -volatile uint tt_reg_ptr *reg_base = reinterpret_cast(0xFFB10000); -volatile uint tt_reg_ptr *pc_buf_base = reinterpret_cast(PC_BUF_BASE); -volatile uint tt_reg_ptr *regfile = reinterpret_cast(REGFILE_BASE); 
-volatile uint tt_reg_ptr *instrn_buffer = reinterpret_cast(INSTRN_BUF_BASE); -volatile uint tt_reg_ptr *mailbox_base[4] = { - reinterpret_cast(TENSIX_MAILBOX0_BASE), reinterpret_cast(TENSIX_MAILBOX1_BASE), - reinterpret_cast(TENSIX_MAILBOX2_BASE), reinterpret_cast(TENSIX_MAILBOX3_BASE) -}; -volatile uint tt_reg_ptr *dbg_event_scratch = nullptr; - -uint32_t cfg_state_id __attribute__((section(".bss"))) = 0; // Flip between 0 and 1 to keep state between kernel calls -uint32_t dest_offset_id __attribute__((section(".bss"))) = 0; // Flip between 0 and 1 to keep dest pointer between kernel calls - -uint32_t dbg_event_index __attribute__((section(".bss"))) = 0; -uint32_t dbg_event_end __attribute__((section(".bss"))) = 0; -volatile uint16_t tt_reg_ptr *debug_mailbox_base = nullptr; -uint8_t mailbox_index = 0; -const uint8_t mailbox_end = 32; -volatile uint8_t tt_l1_ptr *debug_buffer = nullptr; -volatile uint8_t tt_l1_ptr *debug_buffer_start = nullptr; -uint8_t thread_id __attribute__((section(".bss"))) = 0; - -#ifdef PERF_DUMP -uint32_t perf_index __attribute__((section(".bss"))) = 0; -uint32_t perf_end __attribute__((section(".bss"))) = 0; -volatile uint32_t *perf_buf_base[2]; -uint8_t perf_buf_base_id __attribute__((section(".bss"))) = 0; -bool record_perf_events __attribute__((section(".bss"))) = 0; -uint32_t perf_events_target_idx __attribute__((section(".bss"))) = 0; -uint16_t current_outer_loop_iter __attribute__((section(".bss"))) = 0; -int32_t dram_dump_req_local; -bool first_unpack_recorded __attribute__((section(".bss"))) = 0; -volatile uint *ncrisc_ack_addr = nullptr; -uint32_t header; -#if OVERLAY_DECOUPLE == 1 -uint8_t overlay_output_decouple_mask = 0; -inline void update_overlay_decoupling_mailbox() { - overlay_output_decouple_mask = PERF_RISC_MAILBOX_OUTPUT_DECOUPLE_MASK_PTR[0] & 0xff; - if (thread_id == 0 || thread_id == 1) { - while(semaphore_read(semaphore::UNPACK_MATH_DONE) == 0) {} - } -} -inline void reset_unpack_pack_sync() { - if (thread_id == 2) 
{ - semaphore_get(semaphore::UNPACK_MATH_DONE); - } -} -#endif -#endif - -volatile uint tt_l1_ptr * trisc_l1_mailbox = reinterpret_cast(MAILBOX_ADDR); - -inline bool ready_for_next_epoch() { // place this through compiler into a section that is not going to overwritten - return true; - // mailbox_write(ttRiscCores::Nrisc); // signal done epoch to NCRisc - // mailbox_read(ttRiscCores::Nrisc); // This is blocking read, until NCrisc signals epoch is ready -} - -inline void set_thread_id_parameter() { - if ((uint)__firmware_start == (uint)l1_mem::address_map::TRISC0_BASE) { - thread_id = 0; - } else if ((uint) __firmware_start == (uint)l1_mem::address_map::TRISC1_BASE) { - thread_id = 1; - } else { - thread_id = 2; - } -} - -inline void allocate_debug_mailbox_buffer() { - std::int32_t debug_mailbox_addr; - if ((uint32_t)__firmware_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) { - debug_mailbox_addr = l1_mem::address_map::DEBUG_MAILBOX_BUF_BASE + 0*l1_mem::address_map::DEBUG_MAILBOX_BUF_SIZE; - } else if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { - debug_mailbox_addr = l1_mem::address_map::DEBUG_MAILBOX_BUF_BASE + 1*l1_mem::address_map::DEBUG_MAILBOX_BUF_SIZE; - } else { - debug_mailbox_addr = l1_mem::address_map::DEBUG_MAILBOX_BUF_BASE + 2*l1_mem::address_map::DEBUG_MAILBOX_BUF_SIZE; - } - debug_mailbox_base = reinterpret_cast(debug_mailbox_addr); - clear_mailbox_values(); -} - -inline void allocate_debug_buffer() { - std::int32_t debug_buffer_addr; - if ((uint32_t)__firmware_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) { - debug_buffer_addr = l1_mem::address_map::TRISC0_DEBUG_BUFFER_BASE; - } else if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { - debug_buffer_addr = l1_mem::address_map::TRISC1_DEBUG_BUFFER_BASE; - } else { - debug_buffer_addr = l1_mem::address_map::TRISC2_DEBUG_BUFFER_BASE; - } - debug_buffer = reinterpret_cast(debug_buffer_addr); - 
debug_buffer[l1_mem::address_map::DEBUG_BUFFER_SIZE-1]=0x0; - debug_buffer_start = debug_buffer; -} - -__attribute__((noinline)) void debug_dump(const uint8_t *data, uint32_t byte_size) { - for (uint32_t i = 0; i < byte_size; i++) { - if ((((uint32_t) debug_buffer)&(l1_mem::address_map::DEBUG_BUFFER_SIZE-1)) == - l1_mem::address_map::DEBUG_BUFFER_SIZE-1) { - *(debug_buffer) = 0xff; //overflow detected - } else { - *debug_buffer = data[i]; - debug_buffer++; - } - } -} - -__attribute__((noinline)) void debug_dump_seek(uint8_t offset) { - debug_buffer = reinterpret_cast(debug_buffer_start + offset); -} - -} // namespace ckernel - -void local_mem_copy() { - volatile uint tt_l1_ptr *l1_local_mem_start_addr; - volatile uint *local_mem_start_addr = (volatile uint*) LOCAL_MEM_BASE_ADDR; - - if ((uint)__firmware_start == (uint)l1_mem::address_map::TRISC0_BASE) { - l1_local_mem_start_addr = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC0_LOCAL_MEM_BASE; - } else if ((uint) __firmware_start == (uint)l1_mem::address_map::TRISC1_BASE) { - l1_local_mem_start_addr = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC1_LOCAL_MEM_BASE; - } else { - l1_local_mem_start_addr = (volatile uint tt_l1_ptr *)l1_mem::address_map::TRISC2_LOCAL_MEM_BASE; - } - uint word_size = ((uint)__local_mem_rodata_end_addr - (uint)__local_mem_rodata_start_addr)>>2; - - if (word_size>0) { - for (uint n=0;n> 4) - 1; //Store L1 buffer address for reduce input 1 - sync_regfile_write(p_gpr_unpack::L1_BUFFER_ADDR); - } - -#ifdef PERF_DUMP - set_thread_id_parameter(); - allocate_perf_buffer(); - setup_fpu_perf_cnt(); - record_dummy_math_event(); -#if OVERLAY_DECOUPLE == 1 - update_overlay_decoupling_mailbox(); -#endif -#endif - - //while (ready_for_next_epoch()) - { - run_kernel(); - } - - // Signal completion - tensix_sync(); -#ifdef PERF_DUMP -#if OVERLAY_DECOUPLE == 1 - reset_unpack_pack_sync(); -#endif - record_perf_dump_end_and_check_overflow(); - // There has to be a tensix_sync() before this 
last pass. - last_trisc_perf_dump_to_dram(); - tensix_sync(); -#endif - - trisc_l1_mailbox_write(KERNEL_COMPLETE); - -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc deleted file mode 100644 index b2c39df3313..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_main.cc +++ /dev/null @@ -1,21 +0,0 @@ - -// This c-file's purpose is: -// 1) include the generated list of kernels -// The files hold run_kernel() definition and inline kernel_main functions for every ckernel -// Need to make sure no other file includes these lists since it also include global parameter definitions -// 2) instantiate global variables - - -#include "ckernel_globals.h" - -#if defined(UCK_CHLKC_UNPACK) || defined(UCK_CHLKC_MATH) || defined(UCK_CHLKC_PACK) -#include "chlkc_list.h" -#else -#include "ckernel_list.h" -#endif - -// Global vars -uint32_t unp_cfg_context = 0; -uint32_t pack_sync_tile_dst_ptr = 0; -uint32_t math_sync_tile_dst_index = 0; -volatile uint32_t tt_l1_ptr l1_buffer[16] __attribute__ ((section (".text#"))) __attribute__ ((aligned (16))); diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc deleted file mode 100644 index 446e14cb8f6..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_perf_unpack_pack.cc +++ /dev/null @@ -1,301 +0,0 @@ - -#include "ckernel_perf_unpack_pack.h" -#include "stream_interface.h" - -#pragma GCC diagnostic ignored "-Wunused-function" - - -namespace ckernel -{ -extern uint32_t perf_index; -extern uint32_t perf_end; -// Perf-buffer are double buffered for spill_to_dram. -// Ncrisc will move one half to dram while trisc populates the other half. -// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0]. -extern volatile uint32_t *perf_buf_base[2]; -// Selects the half of perf_buffer that trisc is currently writing into. 
-extern uint8_t perf_buf_base_id; -extern bool record_perf_events; -extern uint16_t current_outer_loop_iter; -extern uint8_t thread_id; -extern int32_t dram_dump_req_local; -extern volatile uint* ncrisc_ack_addr; -extern uint32_t header; - -void allocate_perf_buffer() { - std::int32_t perf_buf_base_addr; - if ((uint32_t)__firmware_start == (uint32_t)l1_mem::address_map::TRISC0_BASE) { - perf_buf_base_addr = l1_mem::address_map::UNPACK_PACK_PERF_BUF_BASE_ADDR + 0*TRISC_PERF_BUF_SIZE; - perf_index = 2; // The first 4B value is always initialized to 0xbaddf00d. - if constexpr (PERF_DUMP_CONCURRENT == 1 || INTERMED_DUMP == 1) { - perf_end = TRISC_PERF_BUF_SIZE >> 3; - } else { - perf_end = 3; - } - dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[0]; - ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[0]; - } else if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { - perf_buf_base_addr = l1_mem::address_map::MATH_PERF_BUF_BASE_ADDR; - perf_index = 4; // The first 4 32b regs are skipped in recording math perf counters. - perf_end = 16; - - // Initialize math_dram_dump_req_local in the beginning of epoch. - // EPOCH_INFO_PTR->perf_dram_copy_req counters do not get reset between epochs. - dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[1]; - ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[1]; - } else { - perf_buf_base_addr = l1_mem::address_map::UNPACK_PACK_PERF_BUF_BASE_ADDR + TRISC_PERF_BUF_SIZE; - perf_index = 2; // The first 4B value is always initialized to 0xbaddf00d. - if constexpr (PERF_DUMP_CONCURRENT == 1 || INTERMED_DUMP == 1) { - perf_end = TRISC_PERF_BUF_SIZE >> 3; - } else { - perf_end = 3; - } - TTI_SEMINIT(1, 0, 1 << semaphore::PACK_DONE); - dram_dump_req_local = EPOCH_INFO_PTR->perf_dram_copy_req[2]; - ncrisc_ack_addr = &EPOCH_INFO_PTR->perf_dram_copy_ack[2]; - } - // Tirsc starts dumping into the first half of the perf_buffers. 
- perf_buf_base_id = 0; - // Program the address for the first half of the perf buffer address. - perf_buf_base[0] = reinterpret_cast(perf_buf_base_addr); - // Program the address for the second half of the perf buffer address. - perf_buf_base[1] = reinterpret_cast(perf_buf_base_addr + (TRISC_PERF_BUF_SIZE >> 1)); - perf_buf_base[perf_buf_base_id][0] = PERF_DUMP_END_SIGNAL; -#if PERF_DUMP_CONCURRENT - volatile uint32_t* header_ptr = reinterpret_cast(l1_mem::address_map::PERF_THREAD_HEADER); - header = header_ptr[0]; - header = (header & 0xfff8ffff) | (((uint32_t)(thread_id) & 0b111) << 16); - perf_buf_base[perf_buf_base_id][1] = header; - for (uint i = 2; i < perf_index; i++) { - perf_buf_base[perf_buf_base_id][i] = 0xffffffff; - } -#else - for (uint i = 1; i < perf_index; i++) { - perf_buf_base[perf_buf_base_id][i] = 0xffffffff; - } -#endif -} - -void switch_perf_buffers() { - - if constexpr (INTERMED_DUMP || PERF_DUMP_CONCURRENT) { - for (uint i = perf_index; i < perf_end; i++) { - perf_buf_base[perf_buf_base_id][i] = 0xffffffff; - } - bool stalled = false; - uint32_t timestamp_stall_start_l; - uint32_t timestamp_stall_start_h; - uint32_t timestamp_stall_end_l; - uint32_t timestamp_stall_end_h; - - // Before advancing to the other half of perf-buffer, make sure ncrisc is done copying that half into dram - int32_t ack_local = *ncrisc_ack_addr; - if (ack_local <= dram_dump_req_local - 1) { - stalled = true; - timestamp_stall_start_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); - timestamp_stall_start_h = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); - - while (ack_local <= dram_dump_req_local - 1) { - ack_local = *ncrisc_ack_addr; - } - - timestamp_stall_end_l = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); - timestamp_stall_end_h = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); - } - - dram_dump_req_local++; - EPOCH_INFO_PTR->perf_dram_copy_req[thread_id] = dram_dump_req_local; - - perf_buf_base_id = 1 - perf_buf_base_id; - if constexpr(INTERMED_DUMP) { - perf_index = 0; - } else { - 
perf_index = 0; - perf_buf_base[perf_buf_base_id][perf_index] = PERF_DUMP_END_SIGNAL; - perf_buf_base[perf_buf_base_id][perf_index+1] = *(uint32_t*)(&header); - perf_index = 2; - } - if (stalled && perf_index + 5 < perf_end - 1) { - uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::STALL_TRISC_FOR_DRAM_PERF_DUMP, current_outer_loop_iter); - perf_buf_base[perf_buf_base_id][perf_index] = event_id; - perf_buf_base[perf_buf_base_id][perf_index+1] = timestamp_stall_start_h; - perf_buf_base[perf_buf_base_id][perf_index+2] = timestamp_stall_start_l; - perf_buf_base[perf_buf_base_id][perf_index+3] = event_id; - perf_buf_base[perf_buf_base_id][perf_index+4] = timestamp_stall_end_h; - perf_buf_base[perf_buf_base_id][perf_index+5] = timestamp_stall_end_l; - perf_index += 6; - } - } -} - -void last_trisc_perf_dump_to_dram() { - if (perf_index > 0) { - - // Before advancing to the other half of perf-buffer, make sure ncrisc is done copying that half into dram - int32_t ack_local = *ncrisc_ack_addr; - while (ack_local <= dram_dump_req_local - 1) { - ack_local = *ncrisc_ack_addr; - } - - if constexpr (INTERMED_DUMP) { - if (thread_id == 1) { - dram_dump_req_local += 2; - } else { - dram_dump_req_local++; - } - } else if constexpr (PERF_DUMP_CONCURRENT) { - dram_dump_req_local++; - } else { - dram_dump_req_local += 2; - } - EPOCH_INFO_PTR->perf_dram_copy_req[thread_id] = dram_dump_req_local; - } -} - -void increment_unpack_tiles(uint operand_idx, uint num_tiles) { - if (record_perf_events && (perf_events_target_idx == 1)) { - if (operand_idx >= PERF_MAX_NUM_INPUTS) { - return; - } - uint regfile_base_idx = p_gpr_unpack::PERF_UNPACK_NUM_TILES_0; - regfile_base_idx += (operand_idx >> 1); - bool upper = operand_idx & 0b1; - uint32_t num_tiles_regfile = regfile[regfile_base_idx]; - uint32_t current_num_tiles; - if (upper) { - current_num_tiles = (num_tiles_regfile >> 16) & 0xffff; - current_num_tiles += num_tiles; - regfile[regfile_base_idx] = (num_tiles_regfile & 0xffff) 
+ ((current_num_tiles & 0xffff) << 16); - } else { - current_num_tiles = (num_tiles_regfile + num_tiles) & 0xffff; - regfile[regfile_base_idx] = (num_tiles_regfile & 0xffff0000) + (current_num_tiles & 0xffff); - } - sync_regfile_write(regfile_base_idx); - } -} - -void increment_pack_tiles(uint num_tiles) { - if (record_perf_events && (perf_events_target_idx == 1)) { - regfile[p_gpr_pack::PERF_PACK_NUM_TILES] += num_tiles; - sync_regfile_write(p_gpr_pack::PERF_PACK_NUM_TILES); - } -} - -#if OVERLAY_DECOUPLE == 1 - -// This runs prior to set_perf_dump_flag_for_input so perf_end has to be adjusted -void record_overlay_decoupled_output_bw_start(uint32_t num_tiles) { - if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) { - perf_end += 6; - } - if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) { - perf_end = TRISC_PERF_BUF_SIZE >> 2; - } - uint32_t event_id = get_event_id(0, 0, perf::EventType::OUTPUT_NUM_TILES, perf_events_target_inputs[0]); - record_perf_value_and_check_overflow(event_id, num_tiles, 0); - event_id = get_event_id(0, 0, perf::EventType::OUTPUT_TIMESTAMP, perf_events_target_inputs[0]); - uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); - uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); - record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, 0); -} - -void record_overlay_decoupled_output_bw_end() { - if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) { - perf_end += 6; - } - if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) { - perf_end = TRISC_PERF_BUF_SIZE >> 2; - } - uint32_t event_id = get_event_id(0, 0, perf::EventType::OUTPUT_TIMESTAMP, perf_events_target_inputs[0]); - uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); - uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); - record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, 0); -} - -void llk_push_all_packer_tiles_for_decoupling() { - uint32_t operand = OPERAND_OUTPUT_START_INDEX; - 
uint32_t output = operand_to_output_index(operand); - - // Populate the output buffer with headers - uint32_t stream_buf_size_bytes = EPOCH_INFO_PTR->outputs[output]->buf_full_size_bytes; - uint32_t stream_buf_addr = EPOCH_INFO_PTR->outputs[output]->buf_base_addr; - uint32_t stream_msg_info_buf_ptr = (EPOCH_INFO_PTR->outputs[output]->msg_info_buf_start)*MEM_WORD_WIDTH; - uint32_t tile_size_words = *(volatile uint32_t tt_l1_ptr *)(stream_msg_info_buf_ptr); - uint32_t tile_size_bytes = tile_size_words*MEM_WORD_WIDTH; - for (uint32_t tile_header_ptr = stream_buf_addr; tile_header_ptr < stream_buf_addr + stream_buf_size_bytes; tile_header_ptr += tile_size_bytes) { - *((uint32_t *)(tile_header_ptr)) = tile_size_words; - } - - uint32_t total_num_tiles_to_push = 0; - uint32_t num_tiles_to_push[EPOCH_MAX_OUTPUT_FORKS+1]; - uint32_t stream_id = EPOCH_INFO_PTR->outputs[output]->stream_id; - uint32_t active_stream_idx = get_active_stream_idx(stream_id); - volatile epoch_stream_info_t * l1_stream_info = EPOCH_INFO_PTR->active_streams[active_stream_idx]; - for (int32_t k = 0; k < l1_stream_info->num_fork_streams+1; k++) { - uint32_t fork_active_streams_idx = k == 0 ? 
active_stream_idx : l1_stream_info->fork_idxs[k-1]; - uint32_t epoch_num_tiles = EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->epoch_num_tiles; - num_tiles_to_push[k] = epoch_num_tiles; - total_num_tiles_to_push += epoch_num_tiles; - } - if (((l1_stream_info->flags & STREAM_MOVES_RAW_DATA) != 0) || l1_stream_info->legacy_pack) { - - record_overlay_decoupled_output_bw_start(total_num_tiles_to_push); - - while(total_num_tiles_to_push > 0) { - uint32_t stream_msg_info_buf_ptr = (l1_stream_info->msg_info_buf_start)*MEM_WORD_WIDTH; - uint32_t tile_size_words = *(volatile uint32_t *)(stream_msg_info_buf_ptr); - uint32_t stream_buf_size_tiles = l1_stream_info->buf_size_tiles; - bool any_streams_busy = false; - for (int32_t k = 0; k < l1_stream_info->num_fork_streams+1; k++) { - uint32_t fork_active_streams_idx = k == 0 ? active_stream_idx : l1_stream_info->fork_idxs[k-1]; - uint32_t fork_stream_id = k == 0 ? stream_id : EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->stream_id; - if (num_tiles_to_push[k] == 0) { - continue; - } - uint32_t dram_output_no_push = ((EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->flags & STREAM_DRAM_NO_PUSH) != 0) || ((EPOCH_INFO_PTR->active_streams[fork_active_streams_idx]->flags & STREAM_MOVES_RAW_DATA) != 0); - if (dram_output_no_push) { - uint32_t tiles_left_in_phase = stream_src_endpoint_get_phase_tiles_count(fork_stream_id); - uint16_t operand_tiles_received = (uint16_t)*get_operand_tiles_received_ptr(stream_id_to_operand(fork_stream_id)); - uint16_t operand_tiles_acked = (uint16_t)*get_operand_tiles_acked_ptr(stream_id_to_operand(fork_stream_id)); - uint16_t tiles_available = operand_tiles_received - operand_tiles_acked;// op_pack_tiles_ptr_sub(operand_tiles_received, operand_tiles_acked); - uint32_t stream_buf_free_tiles = stream_buf_size_tiles - tiles_available; - uint32_t num_tiles = tiles_left_in_phase > stream_buf_free_tiles ? 
stream_buf_free_tiles : tiles_left_in_phase; - if (num_tiles > 0) { - stream_set_tiles_left_in_phase(fork_stream_id, num_tiles); - volatile uint32_t tt_reg_ptr* tiles_received_ptr = (volatile uint32_t tt_reg_ptr*)get_operand_tiles_received_ptr(stream_id_to_operand(fork_stream_id)); - operand_tiles_received = (uint16_t)tiles_received_ptr[0]; - uint16_t new_epoch_tiles_received = operand_tiles_received + num_tiles;// op_pack_tiles_ptr_add(operand_tiles_received, num_tiles); - tiles_received_ptr[0] = new_epoch_tiles_received; - - num_tiles_to_push[k] -= num_tiles; - total_num_tiles_to_push -= num_tiles; - } - } else { - uint32_t phase_active = stream_phase_is_active(fork_stream_id) && !is_dummy_phase(fork_stream_id); - if (phase_active) { - uint32_t tiles_left_in_phase = stream_src_endpoint_get_phase_tiles_count(fork_stream_id); - uint32_t num_free_words = stream_get_free_words(fork_stream_id); - uint32_t num_tiles = 0; - uint32_t num_words = 0; - while (num_words + tile_size_words <= num_free_words && num_tiles + 1 <= tiles_left_in_phase) { - num_tiles++; - num_words += tile_size_words; - } - if (num_tiles > 0) { - stream_set_tiles_left_in_phase(fork_stream_id, num_tiles); - stream_relay_tiles(fork_stream_id, num_tiles, num_words); - - num_tiles_to_push[k] -= num_tiles; - total_num_tiles_to_push -= num_tiles; - } - } - } - } - } - record_overlay_decoupled_output_bw_end(); - } -} -#endif - -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc b/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc deleted file mode 100644 index 103269694e5..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/src/ckernel_unity.cc +++ /dev/null @@ -1,9 +0,0 @@ -// combining multiple C++ source files into a single file -// to reduce the overhead of the compilation process and -// improve build times -#include "ckernel.cc" -#ifdef PERF_DUMP -#include "ckernel_perf_unpack_pack.cc" -#endif -#include "ckernel_main.cc" -#include "llk_io.cc" // sw stack 
specific io interface diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list b/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list deleted file mode 100644 index 99880029ff6..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/src/fwlog_list +++ /dev/null @@ -1 +0,0 @@ -ckernel.cc diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h index 5f796f4c5b3..8952ec8cb87 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_common.h @@ -225,23 +225,14 @@ inline void _llk_pack_reduce_mask_config_() { row_set_mapping_1 = 0x00000001; // each packer packs 1x16 row } } else if constexpr (dim == ReduceDim::REDUCE_SCALAR) { - // PCK_EDGE_OFFSET_SEC1 mask will clear out all the datums in the row except the first one + // PCK_EDGE_OFFSET_SEC1 mask will clear out all the datums in the row except the first one edge_offset_sec1_mask = 0x0001; - if constexpr (untilize) { - pack_edge_offset.f.tile_row_set_select_pack0 = 1; - pack_edge_offset.f.tile_row_set_select_pack1 = 1; - pack_edge_offset.f.tile_row_set_select_pack2 = 1; - pack_edge_offset.f.tile_row_set_select_pack3 = 1; - row_set_mapping_1 = 0x00000005; - } else { - // Packer 0 and 2 will use TILE_ROW_SET_MAPPING_1, while packer 1 and 3 will keep using - // TILE_ROW_SET_MAPPING_0 configuration which is the default one - pack_edge_offset.f.tile_row_set_select_pack0 = 1; - pack_edge_offset.f.tile_row_set_select_pack2 = 1; + // Packer 0 will use TILE_ROW_SET_MAPPING_1, while packers 1,2 and 3 will keep using + // TILE_ROW_SET_MAPPING_0 configuration which is the default one + pack_edge_offset.f.tile_row_set_select_pack0 = 1; - // TILE_ROW_SET_MAPPING_1 configuration sets all rows to use PCK_EDGE_OFFSET_SEC1 mask - row_set_mapping_1 = 0x00000001; - } + // TILE_ROW_SET_MAPPING_1 configuration sets only first row to use PCK_EDGE_OFFSET_SEC1 mask + 
row_set_mapping_1 = 0x00000001; } // Initialize TMP registers with values we need to write in CFG registers diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_shifted.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_shifted.h deleted file mode 100644 index 725d008b626..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_pack_shifted.h +++ /dev/null @@ -1,202 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once -#include "llk_io_pack.h" -#include "llk_param_structs.h" - -#include "ckernel.h" -#include "ckernel_template.h" -#include "llk_pack_common.h" -#include "ckernel_globals.h" - -using namespace ckernel; -using namespace ckernel::packer; - -inline void llk_pack_shifted_mop_config(std::uint32_t stride) { - addr_mod_pack_t{ - .y_src = {.incr = (std::uint8_t) stride}, - .y_dst = {.incr = (std::uint8_t) stride}, - } - .set(ADDR_MOD_0); - - addr_mod_pack_t{ - .y_src = {.incr = 0, .clr = 1, .cr = 0}, - .y_dst = {.incr = 0, .clr = 1, .cr = 0}, - .z_src = {.incr = 0, .clr = 0}, - .z_dst = {.incr = 0, .clr = 0}, - } - .set(ADDR_MOD_1); - - addr_mod_pack_t{ - .y_src = {.incr = 0, .clr = 0, .cr = 0}, - .y_dst = {.incr = 0, .clr = 0, .cr = 0}, - .z_src = {.incr = 0, .clr = 0}, - .z_dst = {.incr = 0, .clr = 0}, - } - .set(ADDR_MOD_2); - - const uint MOP_INNER_LOOP = 16; - const uint MOP_OUTER_LOOP = 1; - const uint PACKCNT = 4; - const uint MEGAROW = 1; - constexpr uint ZERO_OUTPUT_FLAG = p_pacr::P_ZERO_OUTPUT_DISABLED; - - ckernel::ckernel_template tmp( - MOP_OUTER_LOOP, MOP_INNER_LOOP, TT_OP_PACR(ADDR_MOD_0, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, MEGAROW, 0, 0)); - - tmp.set_last_inner_loop_instr(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, 0, 0, 0)); - tmp.set_last_outer_loop_instr(TT_OP_PACR(ADDR_MOD_1, ZERO_OUTPUT_FLAG, PACK_SEL(PACKCNT), 0, 0, 0, 0)); - - // Write header to l1 - tmp.set_end_op(TT_OP_STOREIND(1, 0, p_ind::LD_16B, LO_16(0), p_ind::INC_NONE, 
p_gpr_pack::TILE_HEADER, p_gpr_pack::OUTPUT_ADDR)); - - tmp.program(instrn_buffer); -} - -template -inline void llk_pack_shifted_hw_configure(const llk_pack_shifted_params_t *pack_params) { - configure_pack(get_output_id(pack_params->pack_output), pack_params->relu_config.val); - - std::uint32_t output = get_output_id(pack_params->pack_output); -} - -template -inline void llk_pack_shifted_hw_configure_disaggregated(std::uint32_t pack_output) { - llk_pack_shifted_params_t llk_pack_shifted_params = { - .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}}; - llk_pack_shifted_hw_configure(&llk_pack_shifted_params); - volatile uint *cfg = get_cfg_pointer(); - // Disable auto-last generation - for (uint i=0; i<4; i++) { cfg[PACK_COUNTERS_SEC0_pack_per_xy_plane_ADDR32+i]=0; } - - // FIXME: configure based on initial padding param value - //regfile[p_gpr_pack::TMP_DEST_OFFSET] = 0x0 - 1; - //regfile[p_gpr_pack::TMP_DEST_OFFSET+1] = 0x0 + 0x20 - 1; - //regfile[p_gpr_pack::TMP_DEST_OFFSET+2] = 0x0 + 0x10 - 1; - //regfile[p_gpr_pack::TMP_DEST_OFFSET+3] = 0x0 + 0x30 - 1; -} - -inline void llk_pack_shifted_init(const llk_pack_shifted_params_t *params=0) { - llk_pack_shifted_mop_config(params->stride); -} - -inline void llk_pack_shifted(const llk_pack_shifted_params_t *params, llk_pack_shifted_state_t *state, std::uint32_t output, std::uint32_t output_tile_index = 0) { - std::uint8_t output_id = get_output_id(output); - constexpr std::uint8_t OUTPUT_BASE_ID = (std::uint8_t) get_output_base_id(); - - std::uint16_t pack_tile_base_addr; - std::uint16_t pack_tile_offset_addr = 0; - pack_tile_base_addr = cb_interface[output_id].fifo_wr_ptr + MUL_TILE_SIZE_AND_INDEX((std::uint8_t)pack_dst_format[OUTPUT_BASE_ID], output_tile_index); - - int write_row_index = state->current_wr_ptr; - - if (state->partial_tile) { - pack_tile_offset_addr = ((write_row_index&(FACE_HEIGHT-1))+2*(write_row_index&FACE_HEIGHT))*2; //FIXME: 
scale row index with format - state->partial_tile = false; - } - - program_packer_destination((pack_tile_base_addr+pack_tile_offset_addr), OUTPUT_BASE_ID); - - if (params->initial_padding>0) { - if (params->initial_padding <= FACE_HEIGHT) { - TT_SETADCXX(p_setadc::PAC, ((params->initial_padding*16)-1), 0x0); - TTI_PACR(ADDR_MOD_2, 1, 0x3, 0, 0, 0, 0); - write_row_index+=params->initial_padding; - } else if (params->initial_padding < TILE_HEIGHT) { - TTI_SETADCXX(p_setadc::PAC, (16*FACE_HEIGHT)-1, 0x0); - TTI_PACR(ADDR_MOD_2, 1, 0x3, 0, 0, 0, 1); - TT_SETADCXX(p_setadc::PAC, (((params->initial_padding-FACE_HEIGHT)*16)-1), 0x0); - program_packer_destination((std::uint16_t)(pack_tile_base_addr+2*(2*FACE_HEIGHT)), OUTPUT_BASE_ID); //FIXME: scale based on the format - TTI_PACR(ADDR_MOD_2, 1, 0x3, 0, 0, 0, 0); - write_row_index+=params->initial_padding; - } else { - program_packer_destination((std::uint16_t)pack_tile_base_addr, OUTPUT_BASE_ID); - TTI_SETADCXX(p_setadc::PAC, (256)-1, 0x0); // zero tile detected - TTI_PACR(ADDR_MOD_2, 1, 0xF, 0, 0, 0, 1); - write_row_index+=TILE_HEIGHT; - } - // Pack single rows - TTI_SETADCXX(p_setadc::PAC, 16-1, 0x0); - } - - int curr_tile_index=-1; - while ( (write_row_index < TILE_HEIGHT) && - // Keep going until we reached end of valid dest, unless it's final iteration in which case we just pad to the end - ( (state->current_rd_ptr < params->valid_row_count) || params->final_iteration) ) - { - bool insert_blank = - ((state->current_y) >= params->original_y) || // we're past the end - (((state->current_x) < params->row_shift_x) && (params->row_shift_x > 0)) || // initial postive X-shift - (((state->current_x) >= (params->original_x + params->row_shift_x)) && (params->row_shift_x < 0)); // final negative X-shift - - if (write_row_index == FACE_HEIGHT) { - TTI_PACR(ADDR_MOD_2, 0, 0x3, 0, 0, 1, 1); //close tile in order to update address - program_packer_destination((std::uint16_t)(pack_tile_base_addr+2*(2*FACE_HEIGHT)), OUTPUT_BASE_ID); 
//FIXME: scale based on the format - } - - - - if (insert_blank) - { - // Insert empty rows - TTI_PACR(ADDR_MOD_0, 1, 0x3, 0, 0, 0, 0); - } - else - { - int tile_index = state->current_rd_ptr / TILE_HEIGHT; - int pack_zeros = 0; - - if (curr_tile_index != tile_index) { - curr_tile_index = tile_index; - if ( (tile_index < 0) || (tile_index >= 16) ) { - pack_zeros = 1; - } else { - uint16_t row_index = (state->current_rd_ptr & (TILE_HEIGHT-1)); - TT_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_Y, row_index); - TT_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_W, tile_index); - } - } - - TT_PACR(ADDR_MOD_0, pack_zeros, 0x3, 0, 0, 0, 0); - - - } - write_row_index++; - - // Move read pointers accordingly - state->current_rd_ptr += params->stride; - state->current_x += params->stride; - if (state->current_x >= params->original_x) - { - if (state->current_x > params->original_x) - { - // Stride got us too far, let's rewind back - state->current_rd_ptr -= state->current_x - params->original_x; - } - state->current_x = params->stride_offset; - state->current_y += params->stride; - state->current_rd_ptr += (params->stride - 1) * params->original_x; // stride Y - if (params->stride > 1) { - uint16_t row_index = (state->current_rd_ptr & (TILE_HEIGHT-1)); - TT_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_Y, row_index); - } - } - - } - - if (write_row_index == TILE_HEIGHT) { - state->current_wr_ptr = 0; - state->partial_tile = false; - // write header - TT_SETDMAREG(0, pack_tile_base_addr, 0, LO_16(p_gpr_pack::HEADER_ADDR)); - TTI_STOREIND(1, 0, p_ind::LD_16B, LO_16(0), p_ind::INC_NONE, p_gpr_pack::TILE_HEADER, p_gpr_pack::HEADER_ADDR); - } - else { - state->current_wr_ptr = write_row_index; - state->partial_tile = true; - } - - TTI_PACR(ADDR_MOD_2, 0, 0x3, 0, 0, 1, 1); //close tile -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h index 249f62bc71b..1a787231608 100644 
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h @@ -298,10 +298,11 @@ inline void llk_pack_reduce_config_v2(uint32_t icb_out) { narrow_tile, relu_config.val ); - } else { - TTI_STALLWAIT(p_stall::STALL_PACK, p_stall::PACK); - tensix_sync(); } - _llk_pack_reduce_mask_config_(); + if constexpr (revert) { + _llk_pack_reduce_mask_clear_(); + } else { + _llk_pack_reduce_mask_config_(); + } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h index 4a03157715b..0207ecc345f 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h @@ -20,7 +20,7 @@ inline const uint32_t get_operand_src_format(const std::uint32_t operand_id) inline const uint32_t get_operand_dst_format(const std::uint32_t operand_id) { - return unpack_src_format[operand_id]; + return unpack_dst_format[operand_id]; } inline const uint32_t get_operand_num_faces(const std::uint32_t operand_id) diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h index cba5398b604..11d634c25e4 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h @@ -26,7 +26,7 @@ inline const uint32_t get_output_src_format(const std::uint32_t output_id) inline const uint32_t get_output_dst_format(const std::uint32_t output_id) { - return pack_src_format[output_id]; + return pack_dst_format[output_id]; } inline const uint32_t get_output_num_faces(const std::uint32_t output_id) From 55c5c657a109d54edbff52e6087e52e5ab2b541b Mon Sep 17 00:00:00 2001 From: acejkov Date: Tue, 5 Dec 2023 00:00:47 +0000 Subject: [PATCH 06/16] #3908: Set default pack_output to 16 --- .../hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h | 6 +++--- 1 
file changed, 3 insertions(+), 3 deletions(-) diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h index 1a787231608..81584704615 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h @@ -99,7 +99,7 @@ inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output } template -inline void llk_pack_init(const std::uint32_t pack_output = 0) { +inline void llk_pack_init(const std::uint32_t pack_output = 16) { const std::uint32_t output_id = get_output_id(pack_output); const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); @@ -179,7 +179,7 @@ inline void llk_pack_dest_section_done() { } template -inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 0) { +inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 16) { const std::uint32_t output_id = get_output_id(pack_output); const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); const bool narrow_tile = get_output_narrow_tile(output_id); @@ -191,7 +191,7 @@ inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_outpu } template -inline void llk_pack_dest_init(const std::uint32_t pack_output = 0) { +inline void llk_pack_dest_init(const std::uint32_t pack_output = 16) { const std::uint32_t output_id = get_output_id(pack_output); const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); From 5e838a834169b66bc282a2de640e20d42437ff47 Mon Sep 17 00:00:00 2001 From: acejkov Date: Tue, 5 Dec 2023 19:57:03 +0000 Subject: [PATCH 07/16] #3908: Fix get_operand_id to return correct value --- tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h 
b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h index 0207ecc345f..2b94607012d 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h @@ -8,9 +8,8 @@ inline uint32_t get_operand_id(uint32_t operand) { - const int INTERMEDIATE_BASE_ID = 24; const int OPERAND_BASE_ID = 0; - return (operand>=INTERMEDIATE_BASE_ID) ? operand - 8 : operand - OPERAND_BASE_ID; + return (operand); } inline const uint32_t get_operand_src_format(const std::uint32_t operand_id) From dc9f23993553a7012a0da714fe5a286e0f020169 Mon Sep 17 00:00:00 2001 From: acejkov Date: Tue, 5 Dec 2023 23:03:10 +0000 Subject: [PATCH 08/16] #3908: Uplift the latest changes from core llk lib. Common files are now identical --- .../ckernels/wormhole_b0/common/inc/ckernel.h | 4 ++-- .../wormhole_b0/common/inc/ckernel_sfpu.h | 20 +++++++++++++++++++ .../ckernels/wormhole_b0/llk_lib/llk_defs.h | 4 +--- .../metal/llk_api/llk_math_binary_sfpu_api.h | 8 ++++---- .../metal/llk_api/llk_math_unary_sfpu_api.h | 18 ++++++++--------- .../llk_math_eltwise_unary_sfpu_0_param.h | 6 +++--- .../llk_math_eltwise_unary_sfpu_1_param.h | 6 +++--- ..._math_eltwise_unary_sfpu_common_includes.h | 6 +++--- .../llk_math_eltwise_unary_sfpu_elu.h | 2 +- .../llk_math_eltwise_unary_sfpu_erf_erfc.h | 4 ++-- .../llk_math_eltwise_unary_sfpu_erfinv.h | 2 +- .../llk_math_eltwise_unary_sfpu_exp.h | 2 +- .../llk_math_eltwise_unary_sfpu_gelu.h | 4 ++-- .../llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h | 2 +- .../llk_math_eltwise_unary_sfpu_isinf_isnan.h | 10 +++++----- ...math_eltwise_unary_sfpu_logical_not_noti.h | 2 +- .../llk_math_eltwise_unary_sfpu_recip.h | 2 +- .../llk_math_eltwise_unary_sfpu_relu.h | 8 ++++---- .../llk_math_eltwise_unary_sfpu_reverseops.h | 2 +- .../llk_math_eltwise_unary_sfpu_sqrt.h | 2 +- ...llk_math_eltwise_unary_sfpu_trigonometry.h | 6 +++--- 21 files changed, 69 insertions(+), 51 deletions(-) diff --git 
a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h index b731cc4bf81..ebc48646c8a 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel.h @@ -36,7 +36,7 @@ #define OVERLAY_DECOUPLE 0 #endif -#ifdef LLK_TB_TEST +#if defined(EN_KERNEL_SLOWDOWN) #include "kernel_slowdown_config.h" #endif @@ -61,10 +61,10 @@ #include "ckernel_include.h" #include "tensix.h" #include "fw_debug.h" +#include "tt_log.h" // #include #if defined(PERF_DUMP) || DELAY_EN > 0 #include -#include "tt_log.h" #include "perf_lib/scratch_api.h" #endif diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h index b00ea4a0b1f..c0fcaf197cb 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu.h @@ -1319,5 +1319,25 @@ inline void _dequant_int32_(const int iterations, const uint dst_offset) } } +template +inline void _add_int32_(const int iterations, const uint dst_offset) { + // Operand A is input1 (int32) + // Operand B is input2 (int32) + // Output is int32 + #pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) { + // operand A - int32 + TTI_SFPLOAD(0, 12, 3, 0); + // operand B - int32 + TT_SFPLOAD(1, 12, 3, dst_offset * 64); + TTI_SFPIADD(0, 1, 0, 4); + // MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result + TTI_NOP; + // LREG_0 -> dest as int32 + TTI_SFPSTORE(0, 12, 3, 0); + dst_reg++; + } +} + } // namespace sfpu } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h index e5250fb7412..e80e29d83c7 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h +++ b/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_defs.h @@ -7,13 +7,11 @@ namespace ckernel { -enum Dim { +enum 
VectorMode { None = 0, R = 1, C = 2, - Z = 3, RC = 4, - ZR = 5, Invalid = 0xFF, }; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h index 5f662f22081..c7c42763d95 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_binary_sfpu_api.h @@ -15,7 +15,7 @@ inline void llk_math_eltwise_binary_sfpu( const uint operand, uint dst_index_a, uint dst_index_b, - int vector_mode = (int)Dim::RC, + int vector_mode = (int)VectorMode::RC, uint param0 = 0, uint param1 = 0, uint param2 = 0, @@ -38,7 +38,7 @@ inline void llk_math_eltwise_binary_sfpu_init( template inline void llk_math_eltwise_binary_sfpu_quant_int32( - uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { + uint dst_index_a, uint dst_index_b, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); } @@ -49,7 +49,7 @@ inline void llk_math_eltwise_binary_sfpu_quant_int32_init(const uint zero_point) template inline void llk_math_eltwise_binary_sfpu_requant_int32( - uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { + uint dst_index_a, uint dst_index_b, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); } @@ -60,7 +60,7 @@ inline void llk_math_eltwise_binary_sfpu_requant_int32_init(const uint zero_poin template inline void llk_math_eltwise_binary_sfpu_dequant_int32( - uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { + uint dst_index_a, uint dst_index_b, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h index 898788ca415..f65a6b86ddd 
100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h @@ -25,7 +25,7 @@ inline void llk_math_eltwise_unary_sfpu_rsqrt_init() { } template -inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_unary_sfpu(dst_index, vector_mode); } @@ -45,7 +45,7 @@ inline void llk_math_eltwise_unary_sfpu_log_with_base_init() { } template -inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_unary_sfpu(dst_index, vector_mode); } @@ -88,7 +88,7 @@ inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) { } template -inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_unary_sfpu(dst_index, vector_mode); } @@ -164,7 +164,7 @@ inline void llk_math_eltwise_unary_sfpu_gez_init() { } template -inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_unary_sfpu(dst_index, vector_mode); } @@ -174,7 +174,7 @@ inline void llk_math_eltwise_unary_sfpu_max_init() { } template -inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_unary_sfpu(dst_index, vector_mode); } @@ -184,7 +184,7 @@ inline void llk_math_eltwise_unary_sfpu_square_init() { } template -inline void llk_math_eltwise_unary_sfpu_power(uint 
dst_index, int pow = 0, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_unary_sfpu(dst_index, vector_mode, pow); } @@ -194,7 +194,7 @@ inline void llk_math_eltwise_unary_sfpu_power_init() { } template -inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_unary_sfpu(dst_index, vector_mode); } @@ -204,7 +204,7 @@ inline void llk_math_eltwise_unary_sfpu_abs_init() { } template -inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_unary_sfpu(dst_index, vector_mode); } @@ -226,7 +226,7 @@ inline void llk_math_eltwise_unary_sfpu_exp2_init() { //heaviside template -inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_unary_sfpu(dst_index,vector_mode,param0); } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h index a896c4064c3..c72b136f851 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h @@ -11,7 +11,7 @@ inline void llk_math_eltwise_unary_sfpu_0_param( void (*first_func)(), void (*func)(), uint dst_index, - int vector_mode = Dim::RC) { + int vector_mode = (int)VectorMode::RC) { if constexpr ((Dst == DstSync::SyncTile16) || (Dst == 
DstSync::SyncTile2)) { math::set_dst_write_addr(math_sync_tile_dst_index); } else { @@ -19,7 +19,7 @@ inline void llk_math_eltwise_unary_sfpu_0_param( } math::set_addr_mod_base(); TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH); - if (vector_mode == Dim::R) { + if (vector_mode == (int)VectorMode::R) { // Do a row vector, Face0 + Face1 -- first iteration (first row) const int ITERATIONS = 1; #pragma GCC unroll 0 @@ -33,7 +33,7 @@ inline void llk_math_eltwise_unary_sfpu_0_param( TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } else if (vector_mode == Dim::C) { + } else if (vector_mode == (int)VectorMode::C) { // Do a column vector, Face0 + Face2 -- All iterations for full face #pragma GCC unroll 0 for (int face = 0; face < 2; face++) { diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h index c3477ea5c7e..892ed513ea2 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h @@ -11,7 +11,7 @@ inline void llk_math_eltwise_unary_sfpu_1_param( void (*first_func)(uint), void (*func)(uint), uint dst_index, - int vector_mode = Dim::RC, + int vector_mode = (int)VectorMode::RC, uint param0 = 0) { if constexpr ((Dst == DstSync::SyncTile16) || (Dst == DstSync::SyncTile2)) { math::set_dst_write_addr(math_sync_tile_dst_index); @@ -20,7 +20,7 @@ inline void llk_math_eltwise_unary_sfpu_1_param( } math::set_addr_mod_base(); TTI_STALLWAIT(p_stall::STALL_SFPU, p_stall::MATH); - if (vector_mode == Dim::R) { + if (vector_mode == (int)VectorMode::R) { // Do a row vector, Face0 + Face1 -- first iteration (first row) 
const int ITERATIONS = 1; #pragma GCC unroll 0 @@ -34,7 +34,7 @@ inline void llk_math_eltwise_unary_sfpu_1_param( TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); TTI_SETRWC(p_setrwc::CLR_NONE, p_setrwc::CR_D, 8, 0, 0, p_setrwc::SET_D); - } else if (vector_mode == Dim::C) { + } else if (vector_mode == (int)VectorMode::C) { // Do a column vector, Face0 + Face2 -- All iterations for full face #pragma GCC unroll 0 for (int face = 0; face < 2; face++) { diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h index cc08a9f346c..06aa57e9e34 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h @@ -111,7 +111,7 @@ inline void llk_math_calculate_sfpu( template inline void llk_math_eltwise_unary_sfpu( uint dst_index, - int vector_mode = (int)Dim::RC, + int vector_mode = (int)VectorMode::RC, uint param0 = 0, uint param1 = 0, uint param2 = 0, @@ -126,7 +126,7 @@ inline void llk_math_eltwise_unary_sfpu( _llk_math_eltwise_unary_sfpu_start_(dst_index); - if (vector_mode == (int)Dim::R) { + if (vector_mode == (int)VectorMode::R) { // Do a row vector, Face0 + Face1 -- first iteration (first row) const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 
2 : face_r_dim / 2) : 2; // At least 2 iterations for odd and even columns @@ -140,7 +140,7 @@ inline void llk_math_eltwise_unary_sfpu( // Skip next two faces _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); - } else if (vector_mode == (int)Dim::C) { + } else if (vector_mode == (int)VectorMode::C) { // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for // full face #pragma GCC unroll 0 diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h index 7917165a25f..81250ea1d3b 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h @@ -24,7 +24,7 @@ inline void llk_math_eltwise_unary_sfpu_elu(uint dst_index, uint param0) { llk_math_eltwise_unary_sfpu_1_param (ckernel::sfpu::calculate_elu, ckernel::sfpu::calculate_elu, - dst_index, Dim::RC, param0); + dst_index, (int)VectorMode::RC, param0); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h index 48f6b8dc398..da9a44c5382 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h @@ -29,7 +29,7 @@ inline void llk_math_eltwise_unary_sfpu_erf(uint dst_index, int param0 = 0) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sfpu_erf_erfc, ckernel::sfpu::calculate_sfpu_erf_erfc, - dst_index, Dim::RC); + dst_index, (int)VectorMode::RC); } template @@ -37,7 +37,7 @@ inline void llk_math_eltwise_unary_sfpu_erfc(uint dst_index, int 
param0 = 0) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sfpu_erf_erfc, ckernel::sfpu::calculate_sfpu_erf_erfc, - dst_index, Dim::RC); + dst_index, (int)VectorMode::RC); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h index aadcb422609..6b7f6a9311e 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h @@ -24,7 +24,7 @@ inline void llk_math_eltwise_unary_sfpu_erfinv_op(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_erfinv, ckernel::sfpu::calculate_erfinv, - dst_index, Dim::RC); + dst_index, (int)VectorMode::RC); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h index 4022d34274b..ca18cfd4fb8 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h @@ -15,7 +15,7 @@ namespace ckernel { // New LLK SFPU APIs template -inline void llk_math_eltwise_unary_sfpu_exponential(uint dst_index, int vector_mode = Dim::RC, int param0 = 0) { +inline void llk_math_eltwise_unary_sfpu_exponential(uint dst_index, int vector_mode = (int)VectorMode::RC, int param0 = 0) { constexpr bool zero_negative = true; constexpr int first_iterations = 1; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h index 9c96bd6877a..c4976bae921 100644 --- 
a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h @@ -13,7 +13,7 @@ namespace ckernel { // New LLK SFPU APIs template -inline void llk_math_eltwise_unary_sfpu_gelu(uint dst_index, int vector_mode = Dim::RC, int param0=0) { +inline void llk_math_eltwise_unary_sfpu_gelu(uint dst_index, int vector_mode = (int)VectorMode::RC, int param0=0) { constexpr int first_iterations = 1; llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_gelu, @@ -27,7 +27,7 @@ inline void llk_math_eltwise_unary_sfpu_gelu_init() { } template -inline void llk_math_eltwise_unary_sfpu_gelu_derivative(uint dst_index, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_gelu_derivative(uint dst_index, int vector_mode = (int)VectorMode::RC) { constexpr int first_iterations = 1; llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_gelu_derivative, diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h index 3b3e8ef87a3..c0b686a269b 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h @@ -24,7 +24,7 @@ inline void llk_math_eltwise_unary_sfpu_i0_op(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_i0, ckernel::sfpu::calculate_i0, - dst_index, Dim::RC); + dst_index, (int)VectorMode::RC); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h index be0d2cfea43..216bcba507f 100644 --- 
a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h @@ -26,7 +26,7 @@ inline void llk_math_eltwise_unary_sfpu_isinf(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sfpu_isinf_isnan, ckernel::sfpu::calculate_sfpu_isinf_isnan, - dst_index, Dim::RC); + dst_index, (int)VectorMode::RC); } @@ -41,7 +41,7 @@ inline void llk_math_eltwise_unary_sfpu_isposinf(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sfpu_isinf_isnan, ckernel::sfpu::calculate_sfpu_isinf_isnan, - dst_index,Dim::RC); + dst_index,(int)VectorMode::RC); } @@ -58,7 +58,7 @@ inline void llk_math_eltwise_unary_sfpu_isneginf(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sfpu_isinf_isnan, ckernel::sfpu::calculate_sfpu_isinf_isnan, - dst_index,Dim::RC); + dst_index,(int)VectorMode::RC); } @@ -73,7 +73,7 @@ inline void llk_math_eltwise_unary_sfpu_isnan(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sfpu_isinf_isnan, ckernel::sfpu::calculate_sfpu_isinf_isnan, - dst_index,Dim::RC); + dst_index,(int)VectorMode::RC); } @@ -88,7 +88,7 @@ inline void llk_math_eltwise_unary_sfpu_isfinite(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sfpu_isinf_isnan, ckernel::sfpu::calculate_sfpu_isinf_isnan, - dst_index,Dim::RC); + dst_index,(int)VectorMode::RC); } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h index 668701d0aa9..ca3db8419de 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h +++ 
b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h @@ -24,7 +24,7 @@ inline void llk_math_eltwise_unary_sfpu_logical_not_unary_op(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_logical_not_unary, ckernel::sfpu::calculate_logical_not_unary, - dst_index, Dim::RC); + dst_index, (int)VectorMode::RC); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h index a2baf2e58e6..251bed18f26 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h @@ -14,7 +14,7 @@ namespace ckernel { // New LLK SFPU APIs template -inline void llk_math_eltwise_unary_sfpu_reciprocal(uint dst_index, int vector_mode = Dim::RC) { +inline void llk_math_eltwise_unary_sfpu_reciprocal(uint dst_index, int vector_mode = (int)VectorMode::RC) { constexpr int first_iterations = 1; llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_reciprocal, diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h index 9d737a27db4..40862b65b8b 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h @@ -38,7 +38,7 @@ inline void llk_math_eltwise_unary_sfpu_lrelu(uint dst_index, uint param0 = 0) { llk_math_eltwise_unary_sfpu_1_param (ckernel::sfpu::calculate_lrelu, ckernel::sfpu::calculate_lrelu, - dst_index, Dim::RC, param0); + dst_index, (int)VectorMode::RC, param0); } template @@ -46,7 +46,7 @@ inline void llk_math_eltwise_unary_sfpu_relu_max(uint 
dst_index, uint param0 = 0 llk_math_eltwise_unary_sfpu_1_param (ckernel::sfpu::relu_max, ckernel::sfpu::relu_max, - dst_index, Dim::RC, param0); + dst_index, (int)VectorMode::RC, param0); } template @@ -54,7 +54,7 @@ inline void llk_math_eltwise_unary_sfpu_relu_min(uint dst_index, uint param0 = 0 llk_math_eltwise_unary_sfpu_1_param (ckernel::sfpu::relu_min, ckernel::sfpu::relu_min, - dst_index, Dim::RC, param0); + dst_index, (int)VectorMode::RC, param0); } template @@ -62,7 +62,7 @@ inline void llk_math_eltwise_unary_sfpu_relu(uint dst_index) { llk_math_eltwise_unary_sfpu_1_param (ckernel::sfpu::relu_min, ckernel::sfpu::relu_min, - dst_index, Dim::RC, 0); + dst_index, (int)VectorMode::RC, 0); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h index c1c6c697f81..d307f9490ad 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h @@ -24,7 +24,7 @@ namespace ckernel { llk_math_eltwise_unary_sfpu_1_param (ckernel::sfpu::calculate_rsub, ckernel::sfpu::calculate_rsub, - dst_index, Dim::RC, param0); + dst_index, (int)VectorMode::RC, param0); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h index dc1be1f16f9..ec50f756429 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h @@ -13,7 +13,7 @@ namespace ckernel { // New LLK SFPU APIs template -inline void llk_math_eltwise_unary_sfpu_sqrt(uint dst_index, int vector_mode = Dim::RC) { +inline void 
llk_math_eltwise_unary_sfpu_sqrt(uint dst_index, int vector_mode = (int)VectorMode::RC) { constexpr bool zero_negative = true; constexpr int first_iterations = 1; llk_math_eltwise_unary_sfpu_0_param diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h index e5066307377..94022110bc3 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h @@ -25,7 +25,7 @@ inline void llk_math_eltwise_unary_sfpu_sine_op(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sfpu_trig, ckernel::sfpu::calculate_sfpu_trig, - dst_index, Dim::RC); + dst_index, (int)VectorMode::RC); } @@ -40,7 +40,7 @@ inline void llk_math_eltwise_unary_sfpu_cosine_op(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sfpu_trig, ckernel::sfpu::calculate_sfpu_trig, - dst_index, Dim::RC); + dst_index, (int)VectorMode::RC); } @@ -55,7 +55,7 @@ inline void llk_math_eltwise_unary_sfpu_tan_op(uint dst_index) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sfpu_trig, ckernel::sfpu::calculate_sfpu_trig, - dst_index, Dim::RC); + dst_index, (int)VectorMode::RC); } From c07542fe347205ff2e274f29a48b3817add0ccc2 Mon Sep 17 00:00:00 2001 From: acejkov Date: Wed, 6 Dec 2023 16:39:21 +0000 Subject: [PATCH 09/16] #3908: Fix kernel compile error for test_device_profiler tests --- tt_metal/include/compute_kernel_api.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tt_metal/include/compute_kernel_api.h b/tt_metal/include/compute_kernel_api.h index a1abc4c2d4f..433f9ea07f5 100644 --- a/tt_metal/include/compute_kernel_api.h +++ b/tt_metal/include/compute_kernel_api.h @@ -30,8 +30,8 @@ #endif #ifdef TRISC_PACK -#include 
"llk_pack_common.h" -#include "llk_pack.h" +#include "llk_pack_api.h" +#include "llk_io_pack.h" #define PACK(x) x #define MAIN pack_main() #else @@ -46,6 +46,7 @@ #include "llk_unpack_reduce_api.h" #include "llk_unpack_tilize_api.h" #include "llk_unpack_untilize_api.h" +#include "llk_io_unpack.h" #define UNPACK(x) x #define MAIN unpack_main() #else From 24d0029a5c638e34f36c81e0550dc99d14ea63ad Mon Sep 17 00:00:00 2001 From: acejkov Date: Wed, 6 Dec 2023 18:17:44 +0000 Subject: [PATCH 10/16] #3908: Fix sfpu init for exp2,expm1,rsqrt,atan --- .../llk_math_eltwise_unary_sfpu_init.h | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h index e43682ab4f2..6d735702f3c 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h @@ -34,30 +34,24 @@ inline void llk_math_eltwise_unary_sfpu_init( _llk_math_eltwise_unary_sfpu_init_(); switch (sfpu_op) { - case SfpuType::reciprocal: - sfpu::_init_reciprocal_(); - break; - case SfpuType::exponential: - sfpu::_init_exponential_(); - break; - case SfpuType::log: - sfpu::_init_log_(); - break; - case SfpuType::sqrt: - sfpu::_init_sqrt_(); - break; case SfpuType::tanh: case SfpuType::tanh_derivative: - sfpu::_init_tanh_(); - break; + sfpu::_init_tanh_(); + break; case SfpuType::sigmoid: - sfpu::_init_sigmoid_(); + sfpu::_init_sigmoid_(); + break; + case SfpuType::expm1: + case SfpuType::exp2: + sfpu::_init_exponential_(); break; - case SfpuType::gelu_derivative: - sfpu::_init_gelu_derivative_(); + case SfpuType::rsqrt: + case SfpuType::atan: + sfpu::_init_reciprocal_(); break; - case SfpuType::gelu: - sfpu::_init_gelu_(); + case SfpuType::log_with_base: + case SfpuType::log: + 
sfpu::_init_log_(); break; case SfpuType::dropout: sfpu::_init_dropout_(param2); From 1e6d679b8e67d159dc3dbff0840a910036119c60 Mon Sep 17 00:00:00 2001 From: acejkov Date: Thu, 7 Dec 2023 20:19:29 +0000 Subject: [PATCH 11/16] #3908: Add missing global var to trisck.cc to fix kernel compile error for test_graph_interpreter --- tt_metal/hw/firmware/src/trisck.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tt_metal/hw/firmware/src/trisck.cc b/tt_metal/hw/firmware/src/trisck.cc index 174fe265300..0115db2f96f 100644 --- a/tt_metal/hw/firmware/src/trisck.cc +++ b/tt_metal/hw/firmware/src/trisck.cc @@ -20,6 +20,7 @@ uint32_t unp_cfg_context = 0; uint32_t pack_sync_tile_dst_ptr = 0; uint32_t math_sync_tile_dst_index = 0; uint32_t gl_alu_format_spec_reg = 0; +uint32_t op_info_offset = 0; namespace ckernel { From fae0364d154c5519a6dc7b21688d65c973d629c7 Mon Sep 17 00:00:00 2001 From: Reem Tawfik Date: Wed, 6 Dec 2023 18:07:26 +0000 Subject: [PATCH 12/16] #3908: Add GS llk api/lib files to fix compile --- .../ckernels/grayskull/common/inc/ckernel.h | 105 --- .../grayskull/common/inc/ckernel_globals.h | 59 -- .../grayskull/common/inc/ckernel_template.h | 217 +++++ .../grayskull/common/inc/cpack_common.h | 10 - .../grayskull/common/inc/cunpack_common.h | 5 - .../grayskull/common/src/ckernel_template.cc | 229 ----- .../hw/ckernels/grayskull/llk_lib/llk_defs.h | 61 -- ..._math_eltwise_unary_sfpu_common_includes.h | 16 - .../hw/ckernels/grayskull/llk_lib/llk_pack.h | 2 +- .../ckernels/grayskull/llk_lib/llk_unpack_A.h | 3 + .../grayskull/llk_lib/llk_unpack_AB.h | 3 + .../grayskull/llk_lib/llk_unpack_AB_matmul.h | 3 + .../grayskull/llk_lib/llk_unpack_common.h | 3 + .../{common/inc => metal/common}/chlkc_list.h | 4 +- .../metal/common/metal_ckernel_globals.h | 63 ++ .../metal/llk_api/llk_math_binary_api.h | 86 ++ .../metal/llk_api/llk_math_binary_sfpu_api.h | 70 ++ .../metal/llk_api/llk_math_common_api.h | 108 +++ .../metal/llk_api/llk_math_matmul_api.h | 68 ++ 
.../metal/llk_api/llk_math_reduce_api.h | 28 + .../llk_api/llk_math_unary_datacopy_api.h | 36 + .../metal/llk_api/llk_math_unary_sfpu_api.h | 293 +++++++ .../grayskull/metal/llk_api/llk_op_info_api.h | 23 + .../grayskull/metal/llk_api/llk_pack_api.h | 308 +++++++ .../llk_api}/llk_param_structs.h | 0 .../llk_api/llk_sfpu}/ckernel_reverseops.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_cdf.h | 0 .../llk_sfpu}/ckernel_sfpu_converter.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_elu.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_erf_erfc.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_erfinv.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_exp.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_gelu.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_i0.h | 0 .../llk_sfpu}/ckernel_sfpu_isinf_isnan.h | 0 .../llk_sfpu}/ckernel_sfpu_logical_not_noti.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_recip.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_relu.h | 0 .../llk_api/llk_sfpu}/ckernel_sfpu_sqrt.h | 0 .../llk_sfpu}/ckernel_sfpu_trigonometry.h | 0 .../llk_math_eltwise_unary_sfpu_0_param.h | 0 .../llk_math_eltwise_unary_sfpu_1_param.h | 0 ..._math_eltwise_unary_sfpu_common_includes.h | 171 ++++ .../llk_math_eltwise_unary_sfpu_elu.h | 0 .../llk_math_eltwise_unary_sfpu_erf_erfc.h | 0 .../llk_math_eltwise_unary_sfpu_erfinv.h | 0 .../llk_math_eltwise_unary_sfpu_exp.h | 0 .../llk_math_eltwise_unary_sfpu_gelu.h | 0 .../llk_math_eltwise_unary_sfpu_i0.h | 0 .../llk_math_eltwise_unary_sfpu_init.h | 0 .../llk_math_eltwise_unary_sfpu_isinf_isnan.h | 0 ...math_eltwise_unary_sfpu_logical_not_noti.h | 0 .../llk_math_eltwise_unary_sfpu_recip.h | 0 .../llk_math_eltwise_unary_sfpu_relu.h | 0 .../llk_math_eltwise_unary_sfpu_reverseops.h | 0 .../llk_math_eltwise_unary_sfpu_sqrt.h | 0 ...llk_math_eltwise_unary_sfpu_trigonometry.h | 0 .../llk_api/llk_sfpu/metal_ckernel_sfpu.h | 780 ++++++++++++++++++ .../grayskull/metal/llk_api/llk_sfpu_types.h | 64 ++ .../metal/llk_api/llk_unpack_AB_api.h | 85 ++ .../metal/llk_api/llk_unpack_AB_matmul_api.h | 136 +++ 
.../metal/llk_api/llk_unpack_A_api.h | 89 ++ .../metal/llk_api/llk_unpack_common_api.h | 137 +++ .../metal/llk_api/llk_unpack_reduce_api.h | 94 +++ .../metal/llk_api/llk_unpack_tilize_api.h | 99 +++ .../metal/llk_api/llk_unpack_untilize_api.h | 96 +++ .../ckernels/grayskull/metal/llk_io/llk_io.cc | 3 + .../ckernels/grayskull/metal/llk_io/llk_io.h | 10 + .../{llk_lib => metal/llk_io}/llk_io_pack.h | 0 .../{llk_lib => metal/llk_io}/llk_io_unpack.h | 0 .../grayskull/metal/llk_io/llk_operands.h | 53 ++ .../grayskull/metal/llk_io/llk_outputs.h | 61 ++ .../wormhole_b0/common/inc/ckernel_globals.h | 1 - .../metal/common/metal_ckernel_globals.h | 2 + .../wormhole_b0/metal/llk_io/llk_operands.h | 3 +- .../compute_kernel_api/common_globals.h | 1 + tt_metal/include/compute_kernel_api/unpack.h | 2 +- 77 files changed, 3197 insertions(+), 493 deletions(-) delete mode 100644 tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc delete mode 100644 tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/common}/chlkc_list.h (91%) create mode 100644 tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h rename 
tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api}/llk_param_structs.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_reverseops.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_cdf.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_converter.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_elu.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_erf_erfc.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_erfinv.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_exp.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_gelu.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_i0.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_isinf_isnan.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_logical_not_noti.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_recip.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_relu.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_sqrt.h (100%) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_trigonometry.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_0_param.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_1_param.h (100%) create mode 100644 
tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_elu.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_erf_erfc.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_erfinv.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_exp.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_gelu.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_i0.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_init.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_isinf_isnan.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_logical_not_noti.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_recip.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_relu.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_reverseops.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_sqrt.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_trigonometry.h (100%) create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h create mode 100644 
tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.cc create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_io}/llk_io_pack.h (100%) rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_io}/llk_io_unpack.h (100%) create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h index b2de68e862a..8bbf675af9e 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h @@ -62,8 +62,6 @@ extern uint32_t dest_offset_id; extern uint32_t dbg_event_index; extern uint32_t dbg_event_end; -extern uint32_t op_info_offset; - // Internal scope to namespace methods only (C++ does not allow namespace private ownership) namespace internal { } @@ -281,109 +279,6 @@ inline void debug_dump(uint8_t *data, uint32_t byte_size) { // TODO(pk) re-implement } -inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) { - - uint32_t* op_info_ptr = reinterpret_cast(OP_INFO_BASE_ADDR + op_info_offset); - static constexpr uint32_t op_info_num_items = 7; - - volatile tt_l1_ptr uint32_t* op_info_struct_ptr = 
reinterpret_cast(&op_info_struct); - for (uint32_t i = 0; i < op_info_num_items; i++) { - op_info_struct_ptr[i] = op_info_ptr[i]; - } - op_info_offset += 28; - - if (op_info_offset == OP_INFO_SIZE) { - op_info_offset = 0; // In case we go out of bounds - } -} - -inline __attribute__((always_inline)) unsigned int mulsi3 (unsigned int a, unsigned int b) -{ - unsigned int r = 0; - while (a) - { - if (a & 1) - r += b; - a >>= 1; - b <<= 1; - } - return r; -} - -inline __attribute__((always_inline)) uint32_t fast_udiv_12(uint32_t n) -{ - // Uses embedding style magic number - // * fixed point 1/12 then shifting. - // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm - return (((uint64_t) n * 0xAAAAAAAB) >> 32) >> 3; -} - -inline __attribute__((always_inline)) uint32_t fast_udiv_94(uint32_t n) -{ - // Uses embedding style magic number - // * fixed point 1/12 then shifting. - // https://web.archive.org/web/20190703172151/http://www.hackersdelight.org/magic.htm - return (((uint64_t) n * 0xAE4C415D) >> 32) >> 6; -} - -template -inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n) -{ - if constexpr (d == 12) { - // fast divide for 12 divisor - return fast_udiv_12(n); - } else if constexpr (d == 94) { - // fast divide for 94 divisor. Handles Banked L1 address generation for E75 - return fast_udiv_94(n); - } else { - // generic divide from llvm - const unsigned n_uword_bits = sizeof(uint32_t) * CHAR_BIT; - unsigned int q; - unsigned int r; - unsigned sr; - /* special cases */ - if (d == 0) - return 0; /* ?! 
*/ - if (n == 0) - return 0; - sr = __builtin_clz(d) - __builtin_clz(n); - /* 0 <= sr <= n_uword_bits - 1 or sr large */ - if (sr > n_uword_bits - 1) /* d > r */ - return 0; - if (sr == n_uword_bits - 1) /* d == 1 */ - return n; - ++sr; - /* 1 <= sr <= n_uword_bits - 1 */ - /* Not a special case */ - q = n << (n_uword_bits - sr); - r = n >> sr; - unsigned int carry = 0; - for (; sr > 0; --sr) - { - /* r:q = ((r:q) << 1) | carry */ - r = (r << 1) | (q >> (n_uword_bits - 1)); - q = (q << 1) | carry; - /* carry = 0; - * if (r.all >= d.all) - * { - * r.all -= d.all; - * carry = 1; - * } - */ - const int s = (unsigned int)(d - r - 1) >> (n_uword_bits - 1); - carry = s & 1; - r -= d & s; - } - q = (q << 1) | carry; - return q; - } -} -template -inline __attribute__((always_inline)) uint32_t umodsi3_const_divisor(uint32_t a) -{ - return a - udivsi3_const_divisor(a) * d; -} - inline void tensix_sync() { volatile uint foo = 0; diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h index a98ae7577ae..445c77d1e0b 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_globals.h @@ -7,8 +7,6 @@ #include #include "ckernel_structs.h" #include "risc_attribs.h" -#include "tensix_functions.h" -#include "hostdevcommon/common_runtime_address_map.h" extern uint32_t cfg_state_id; extern uint32_t unp_cfg_context; @@ -16,62 +14,5 @@ extern uint32_t gl_alu_format_spec_reg; extern volatile uint32_t l1_buffer[16]; -//extern const int32_t unpack_src_format[24]; -//extern const int32_t unpack_dst_format[24]; -//extern const int32_t pack_src_format[16]; -//extern const int32_t pack_dst_format[16]; - extern uint32_t pack_sync_tile_dst_ptr; extern uint32_t math_sync_tile_dst_index; - -extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS]; - -extern uint32_t __ldm_bss_start[]; -extern uint32_t __ldm_bss_end[]; -extern uint32_t __ldm_data_start[]; 
-extern uint32_t __ldm_data_end[]; -extern void (* __init_array_start[])(); -extern void (* __init_array_end[])(); -extern uint32_t __firmware_start[]; - -extern void kernel_init(); -extern void kernel_launch(); - -inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) { - // Cover L1 load latency of 6 cycles for the bulk of the copy - int32_t n = 0; - while (n < len - 5) { - uint32_t v0 = l1_addr[n + 0]; - uint32_t v1 = l1_addr[n + 1]; - uint32_t v2 = l1_addr[n + 2]; - uint32_t v3 = l1_addr[n + 3]; - uint32_t v4 = l1_addr[n + 4]; - uint32_t v5 = l1_addr[n + 5]; - local_mem_addr[n + 0] = v0; - local_mem_addr[n + 1] = v1; - local_mem_addr[n + 2] = v2; - local_mem_addr[n + 3] = v3; - local_mem_addr[n + 4] = v4; - local_mem_addr[n + 5] = v5; - n += 6; - } - // Could optimize this further (eg, loop of 2 or 4), probably not worth it - while (n < len) { - local_mem_addr[n] = l1_addr[n]; - n++; - } -} - -inline void firmware_kernel_common_init(void *init_local_l1_base) { - - // Handle stuff typically done in crt0 in asm. 
Easier to do in C - wzerorange(__ldm_bss_start, __ldm_bss_end); - - int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2; - uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE; - l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words); - - for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) { - (**fptr)(); - } -} diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h index c8968d06577..ba1c08033b1 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_template.h @@ -237,4 +237,221 @@ class ckernel_unpack_template void program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask = 0); // calls program, then run }; + ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op) + : m_outer_loop_len(outer_loop_len) + , m_inner_loop_len(inner_loop_len) + , m_loop_op0(loop_op) + , m_loop_op1(TT_OP_NOP) + , m_end_op0(TT_OP_NOP) + , m_end_op1(TT_OP_NOP) + , m_start_op0(TT_OP_NOP) + { + m_loop0_last_instr = loop_op; + m_loop1_last_instr = loop_op; + } + + ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1) + : m_outer_loop_len(outer_loop_len) + , m_inner_loop_len(inner_loop_len) + , m_loop_op0(loop_op0) + , m_loop_op1(loop_op1) + , m_end_op0(TT_OP_NOP) + , m_end_op1(TT_OP_NOP) + , m_start_op0(TT_OP_NOP) + { + m_loop0_last_instr = loop_op1; + m_loop1_last_instr = loop_op1; + } + + void ckernel_template::set_loop_op0(uint loop_op) + { + m_loop_op0 = loop_op; + } + + void ckernel_template::set_loop_op1(uint loop_op) + { + m_loop_op1 = loop_op; + } + + void ckernel_template::set_end_ops(uint end_op0, uint end_op1) + { + m_end_op0 = end_op0; + m_end_op1 = end_op1; + } + + void 
ckernel_template::set_end_op(uint end_op0) + { + set_end_ops(end_op0, TT_OP_NOP); + } + + void ckernel_template::set_start_op(uint start_op0) + { + m_start_op0 = start_op0; + } + + void ckernel_template::set_last_inner_loop_instr(uint op) + { + m_loop1_last_instr = op; + } + + void ckernel_template::set_last_outer_loop_instr(uint op) + { + m_loop0_last_instr = op; + } + + void ckernel_template::program_and_run(volatile uint *instrn_buffer) + { + program(instrn_buffer); + run(instrn_buffer); + } + + void ckernel_template::run(volatile uint *instrn_buffer) + { + TTI_MOP(1, 0, 0); // run the double-loop template + } + + void ckernel_template::program(volatile uint *instrn_buffer) + { + volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); + + mop_sync(); // wait until previous mops have completed + + mop_cfg[0] = m_outer_loop_len; + mop_cfg[1] = m_inner_loop_len; + mop_cfg[2] = m_start_op0; + mop_cfg[3] = m_end_op0; + mop_cfg[4] = m_end_op1; + mop_cfg[5] = m_loop_op0; + mop_cfg[6] = m_loop_op1; + mop_cfg[7] = m_loop0_last_instr; + mop_cfg[8] = m_loop1_last_instr; + } + + void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) + { + program(instrn_buffer); + run(instrn_buffer, count, zmask); + } + + void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) + { + FWASSERT("Unpack template only supports loops up to 128", count <= 128); + TT_MOP_CFG(zmask >> 16); // Set the top 16 bits of zmask - we could skip this for count <= 16 + TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template + } + + // Version without zmask, should be slightly faster by eliminating one instruction. 
+ void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count) + { + FWASSERT("Unpack template only supports loops up to 128", count <= 128); + TT_MOP(0, count - 1, 0); // Run the template + } + + void ckernel_unpack_template::program(volatile uint *instrn_buffer) const + { + volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); + + mop_sync(); // wait until previous mops have completed + + mop_cfg[1] = m_unpackB | (m_unpack_halo << 1); + mop_cfg[2] = m_B_instr; + mop_cfg[3] = m_A0_instr; + mop_cfg[4] = m_A1_instr; + mop_cfg[5] = m_A2_instr; + mop_cfg[6] = m_A3_instr; + mop_cfg[7] = m_skipA_instr; + mop_cfg[8] = m_skipB_instr; + } + + ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr) + { + return ckernel_unpack_template(false, // src B + false, // halo + A_instr, 0, 0, 0, skipA_instr, 0, 0); + } + + ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr) + { + return ckernel_unpack_template(false, // src B + false, // halo + B_instr, 0, 0, 0, skipB_instr, 0, 0); + } + + ckernel_unpack_template ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr) + { + return ckernel_unpack_template(false, // src B + true, // halo + neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0); + } + + ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(false, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? 
DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0); + } + + ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(false, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0); + } + + ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(true, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B); + } + + ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask) + { + // Figure out which unpack is last + const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; + + return ckernel_unpack_template(true, // src B + true, // halo + ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? 
DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, + ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, + ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, + ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B); + } + + ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr, + + uint B_instr, uint skipB_instr) + { + return ckernel_unpack_template(true, // src B + false, // halo + A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr); + } + + ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){ + return ckernel_unpack_template::lA(instr0, skip0); + } + + ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){ + // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA. 
+ return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1); + } + } // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h b/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h index b3c32f94d0e..6d3a1b6fa7b 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/cpack_common.h @@ -416,14 +416,4 @@ namespace ckernel::packer { dest_offset_id = 0; } - - inline uint32_t get_output_id(uint32_t output) - { - return ((output) - OUTPUT_BASE); - } - - inline constexpr uint32_t get_output_base_id() - { - return (OUTPUT_BASE_ID); - } } diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h b/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h index 99e4a2c892f..33a3c7f0a58 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/cunpack_common.h @@ -335,9 +335,4 @@ namespace ckernel::unpacker // Clear context ID //reset_config_context(); } - - inline uint32_t get_operand_id(uint32_t operand) - { - return operand; - } } diff --git a/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc b/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc deleted file mode 100644 index 238301e0566..00000000000 --- a/tt_metal/hw/ckernels/grayskull/common/src/ckernel_template.cc +++ /dev/null @@ -1,229 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "ckernel_template.h" -#include "debug/fw_debug.h" - -namespace ckernel -{ -extern volatile uint *cfg_regs; - -ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op) - : m_outer_loop_len(outer_loop_len) - , m_inner_loop_len(inner_loop_len) - , m_loop_op0(loop_op) - , m_loop_op1(TT_OP_NOP) - , m_end_op0(TT_OP_NOP) - , m_end_op1(TT_OP_NOP) - , m_start_op0(TT_OP_NOP) -{ - m_loop0_last_instr = loop_op; - m_loop1_last_instr = loop_op; -} - -ckernel_template::ckernel_template(uint outer_loop_len, uint inner_loop_len, uint loop_op0, uint loop_op1) - : m_outer_loop_len(outer_loop_len) - , m_inner_loop_len(inner_loop_len) - , m_loop_op0(loop_op0) - , m_loop_op1(loop_op1) - , m_end_op0(TT_OP_NOP) - , m_end_op1(TT_OP_NOP) - , m_start_op0(TT_OP_NOP) -{ - m_loop0_last_instr = loop_op1; - m_loop1_last_instr = loop_op1; -} - -void ckernel_template::set_loop_op0(uint loop_op) -{ - m_loop_op0 = loop_op; -} - -void ckernel_template::set_loop_op1(uint loop_op) -{ - m_loop_op1 = loop_op; -} - -void ckernel_template::set_end_ops(uint end_op0, uint end_op1) -{ - m_end_op0 = end_op0; - m_end_op1 = end_op1; -} - -void ckernel_template::set_end_op(uint end_op0) -{ - set_end_ops(end_op0, TT_OP_NOP); -} - -void ckernel_template::set_start_op(uint start_op0) -{ - m_start_op0 = start_op0; -} - -void ckernel_template::set_last_inner_loop_instr(uint op) -{ - m_loop1_last_instr = op; -} - -void ckernel_template::set_last_outer_loop_instr(uint op) -{ - m_loop0_last_instr = op; -} - -void ckernel_template::program_and_run(volatile uint *instrn_buffer) -{ - program(instrn_buffer); - run(instrn_buffer); -} - -void ckernel_template::run(volatile uint *instrn_buffer) -{ - TTI_MOP(1, 0, 0); // run the double-loop template -} - -void ckernel_template::program(volatile uint *instrn_buffer) -{ - volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); - - mop_sync(); // wait until previous mops have completed 
- - mop_cfg[0] = m_outer_loop_len; - mop_cfg[1] = m_inner_loop_len; - mop_cfg[2] = m_start_op0; - mop_cfg[3] = m_end_op0; - mop_cfg[4] = m_end_op1; - mop_cfg[5] = m_loop_op0; - mop_cfg[6] = m_loop_op1; - mop_cfg[7] = m_loop0_last_instr; - mop_cfg[8] = m_loop1_last_instr; -} - -void ckernel_unpack_template::program_and_run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) -{ - program(instrn_buffer); - run(instrn_buffer, count, zmask); -} - -void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count, const uint32_t zmask) -{ - FWASSERT("Unpack template only supports loops up to 128", count <= 128); - TT_MOP_CFG(zmask >> 16); // Set the top 16 bits of zmask - we could skip this for count <= 16 - TT_MOP(0, count - 1, zmask & 0xFFFF); // Run the template -} - -// Version without zmask, should be slightly faster by eliminating one instruction. -void ckernel_unpack_template::run(volatile uint *instrn_buffer, const uint8_t count) -{ - FWASSERT("Unpack template only supports loops up to 128", count <= 128); - TT_MOP(0, count - 1, 0); // Run the template -} - -void ckernel_unpack_template::program(volatile uint *instrn_buffer) const -{ - volatile uint *mop_cfg = reinterpret_cast(TENSIX_MOP_CFG_BASE); - - mop_sync(); // wait until previous mops have completed - - mop_cfg[1] = m_unpackB | (m_unpack_halo << 1); - mop_cfg[2] = m_B_instr; - mop_cfg[3] = m_A0_instr; - mop_cfg[4] = m_A1_instr; - mop_cfg[5] = m_A2_instr; - mop_cfg[6] = m_A3_instr; - mop_cfg[7] = m_skipA_instr; - mop_cfg[8] = m_skipB_instr; -} - -ckernel_unpack_template ckernel_unpack_template::lA(uint A_instr, uint skipA_instr) -{ - return ckernel_unpack_template(false, // src B - false, // halo - A_instr, 0, 0, 0, skipA_instr, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::lB(uint B_instr, uint skipB_instr) -{ - return ckernel_unpack_template(false, // src B - false, // halo - B_instr, 0, 0, 0, skipB_instr, 0, 0); -} - -ckernel_unpack_template 
ckernel_unpack_template::lzA(bool neginf, uint A_instr, uint skipA_instr) -{ - return ckernel_unpack_template(false, // src B - true, // halo - neginf ? DEF_NINFSRCA : DEF_ZEROSRCA, A_instr, DEF_UNPACR_NOP, DEF_UNPACR_NOP, skipA_instr, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::lhA(const uint32_t halo_mask) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; - - return ckernel_unpack_template(false, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::flhA(const uint32_t halo_mask) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; - - return ckernel_unpack_template(false, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, 0, 0); -} - -ckernel_unpack_template ckernel_unpack_template::lBhA(const uint32_t halo_mask, const bool rarefy) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 
0x4 : 0; - - return ckernel_unpack_template(true, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_last_instr : DEF_A0_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_last_instr : DEF_A1_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_last_instr : DEF_A2_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_instr : SKIP_A3_instr, DEF_SKIP_A, rarefy ? DEF_B_rarefy_cntx_ovrd_instr : DEF_B_cntx_ovrd_instr, DEF_SKIP_B); -} - -ckernel_unpack_template ckernel_unpack_template::flBhA(const uint32_t halo_mask) -{ - // Figure out which unpack is last - const uint last_mask = (halo_mask == 0x1) ? 0x1 : (halo_mask <= 0x3) ? 0x2 : (halo_mask <= 0x7) ? 0x4 : 0; - - return ckernel_unpack_template(true, // src B - true, // halo - ((halo_mask >> 0) & 0x1) ? ((last_mask >> 0) & 0x1) ? DEF_A0_fconv_last_instr : DEF_A0_fconv_instr : SKIP_A0_instr, - ((halo_mask >> 1) & 0x1) ? ((last_mask >> 1) & 0x1) ? DEF_A1_fconv_last_instr : DEF_A1_fconv_instr : SKIP_A1_instr, - ((halo_mask >> 2) & 0x1) ? ((last_mask >> 2) & 0x1) ? DEF_A2_fconv_last_instr : DEF_A2_fconv_instr : SKIP_A2_instr, - ((halo_mask >> 3) & 0x1) ? DEF_A3_fconv_instr : SKIP_A3_instr, TT_OP_NOP, DEF_B_cntx_ovrd_no_z_inc_instr, DEF_SKIP_B); -} - -ckernel_unpack_template ckernel_unpack_template::lBA(uint A_instr, uint skipA_instr, - - uint B_instr, uint skipB_instr) -{ - return ckernel_unpack_template(true, // src B - false, // halo - A_instr, 0, 0, 0, skipA_instr, B_instr, skipB_instr); -} - -ckernel_unpack_template ckernel_unpack_template::loopx1instr(uint instr0, uint skip0){ - return ckernel_unpack_template::lA(instr0, skip0); -} - -ckernel_unpack_template ckernel_unpack_template::loopx2instr(uint instr0, uint instr1, uint skip0, uint skip1){ - // Note - 2 instr loop so we will hijack B_instr slot for 2nd instruction via lBA. 
- return ckernel_unpack_template::lBA(instr0, skip0, instr1, skip1); -} - -} // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h index 815b76c9d93..2c28acf94e0 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_defs.h @@ -90,67 +90,6 @@ enum ReluType { MAX_THRESHOLD_RELU }; -enum SfpuType -{ - tanh, - hardtanh, - gelu, - exponential, - exp_with_base, - sigmoid, - sigmoid_appx, - reciprocal, - sqrt, - rsqrt, - lrelu, - power, - square, - tanh_derivative, - log, - log_with_base, - equal_zero, - not_equal_zero, - less_than_zero, - greater_than_equal_zero, - less_than_equal_zero, - greater_than_zero, - clamp, - gelu_derivative, - dropout, - abs, - sign, - max, - min, - sine, - cosine, - tan, - relu_min, - relu_max, - elu, - exp2, - heaviside, - expm1, - signbit, - asin, - acos, - atan, - erf, - erfc, - isfinite, - isinf, - isposinf, - isneginf, - isnan, - logical_not_unary, - erfinv, - i0, - silu, - mask, - negative, - unused -}; - - enum SfpiTestType { logical_not, diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h deleted file mode 100644 index 822699707d1..00000000000 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_common_includes.h +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once -#include - -#include "ckernel_globals.h" -#include "ckernel_include.h" -#include "ckernel_template.h" -#include "cmath_common.h" -#include "llk_format_conversions.h" -#include "llk_math_common.h" -#include "llk_param_structs.h" - -using namespace ckernel; diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h index a6f0e32f133..97c0b3d4909 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_pack.h @@ -149,7 +149,7 @@ inline void llk_pack_init() { template inline void llk_matmul_pack(std::uint32_t start_tile_index, std::uint32_t output, uint32_t ntiles, std::uint32_t output_tile_index = 0) { std::uint8_t output_id = get_output_id(output); - constexpr std::uint8_t OUTPUT_BASE_ID = (std::uint8_t) get_output_base_id(); + const std::uint8_t OUTPUT_BASE_ID = (std::uint8_t) get_output_base_id(); static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!"); diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h index 3f1e76ea0d1..03848f843e5 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_A.h @@ -6,6 +6,9 @@ #include "llk_io_unpack.h" #include "llk_param_structs.h" +//TODO: Remove with GS uplift +#include "llk_operands.h" + #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h index ee9127628ce..307e94b25ef 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB.h @@ -6,6 +6,9 @@ #include "llk_io_unpack.h" #include "llk_param_structs.h" +//TODO: Remove with GS uplift +#include "llk_operands.h" + 
#include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h index b438a9715c5..78a28594917 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_AB_matmul.h @@ -7,6 +7,9 @@ #include "llk_io_unpack.h" #include "llk_param_structs.h" +//TODO: Remove with GS uplift +#include "llk_operands.h" + #include "ckernel.h" #include "ckernel_defs.h" #include "ckernel_template.h" diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h index 3269aab1192..2c46633312f 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_unpack_common.h @@ -11,6 +11,9 @@ #include "llk_param_structs.h" #include "llk_io_unpack.h" +//TODO: Remove with GS uplift +#include "llk_operands.h" + #ifdef PERF_DUMP #include "ckernel_perf_api.h" #endif diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h b/tt_metal/hw/ckernels/grayskull/metal/common/chlkc_list.h similarity index 91% rename from tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h rename to tt_metal/hw/ckernels/grayskull/metal/common/chlkc_list.h index 3d02d79f908..0a30e5f179b 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/chlkc_list.h +++ b/tt_metal/hw/ckernels/grayskull/metal/common/chlkc_list.h @@ -14,20 +14,18 @@ using namespace ckernel; #ifdef UCK_CHLKC_MATH -// #include "chlkc_math_llk_args.h" +#include "chlkc_unpack_data_format.h" #include "chlkc_math_fidelity.h" #include "chlkc_math_approx_mode.h" #include "chlkc_math.cpp" #endif #ifdef UCK_CHLKC_PACK -// #include "chlkc_pack_llk_args.h" #include "chlkc_pack_data_format.h" #include "chlkc_pack.cpp" #endif #ifdef UCK_CHLKC_UNPACK -// #include "chlkc_unpack_llk_args.h" #include 
"chlkc_unpack_data_format.h" #include "chlkc_unpack.cpp" #endif diff --git a/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h new file mode 100644 index 00000000000..7800a9934d7 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +//TODO: This file should be deleted after fixing redefinition errors, +// functions should be moved to ckernel_globals.h +#pragma once + +#include +#include "ckernel_structs.h" +#include "risc_attribs.h" +#include "tensix_functions.h" +#include "hostdevcommon/common_runtime_address_map.h" + +extern uint32_t __ldm_bss_start[]; +extern uint32_t __ldm_bss_end[]; +extern uint32_t __ldm_data_start[]; +extern uint32_t __ldm_data_end[]; +extern void (* __init_array_start[])(); +extern void (* __init_array_end[])(); +extern uint32_t __firmware_start[]; + +extern void kernel_init(); +extern void kernel_launch(); + +inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) { + // Cover L1 load latency of 6 cycles for the bulk of the copy + int32_t n = 0; + while (n < len - 5) { + uint32_t v0 = l1_addr[n + 0]; + uint32_t v1 = l1_addr[n + 1]; + uint32_t v2 = l1_addr[n + 2]; + uint32_t v3 = l1_addr[n + 3]; + uint32_t v4 = l1_addr[n + 4]; + uint32_t v5 = l1_addr[n + 5]; + local_mem_addr[n + 0] = v0; + local_mem_addr[n + 1] = v1; + local_mem_addr[n + 2] = v2; + local_mem_addr[n + 3] = v3; + local_mem_addr[n + 4] = v4; + local_mem_addr[n + 5] = v5; + n += 6; + } + // Could optimize this further (eg, loop of 2 or 4), probably not worth it + while (n < len) { + local_mem_addr[n] = l1_addr[n]; + n++; + } +} + +inline void firmware_kernel_common_init(void *init_local_l1_base) { + + // Handle stuff typically done in crt0 in asm. 
Easier to do in C + wzerorange(__ldm_bss_start, __ldm_bss_end); + + int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2; + uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE; + l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words); + + for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) { + (**fptr)(); + } +} diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h new file mode 100644 index 00000000000..317c14707ca --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h @@ -0,0 +1,86 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_eltwise_binary.h" + +// /************************************************************************* +// * LLK ELTWISE BINARY +// *************************************************************************/ + +// // Version with no operand +// template < +// EltwiseBinaryType eltwise_binary_type, +// BroadcastType src_b_bcast_type, +// int NUM_FIDELITY_PHASES = 0, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> +// inline void llk_math_eltwise_binary_init(const std::uint32_t transpose = 0, const std::uint32_t acc_to_dest = 0) { +// const std::uint32_t num_faces = 4; + +// _llk_math_eltwise_binary_init_( +// num_faces, transpose, acc_to_dest); +// } + +// // Version with operands +// template < +// EltwiseBinaryType eltwise_binary_type, +// BroadcastType src_b_bcast_type, +// int NUM_FIDELITY_PHASES = 0, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> +// inline void llk_math_eltwise_binary_init_with_operands( +// const std::uint32_t operand_A, +// const std::uint32_t operand_B, +// const std::uint32_t transpose = 
0, +// const std::uint32_t acc_to_dest = 0) { +// const std::uint32_t operand_id = +// get_operand_id(operand_A); // operand_id is used to extract tile dim data which is the same for both operands +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); + +// _llk_math_eltwise_binary_init_( +// num_faces, transpose, acc_to_dest); +// } + +// template < +// EltwiseBinaryType eltwise_binary_type, +// BroadcastType src_b_bcast_type, +// DstSync Dst = DstSync::SyncFull, +// int NUM_FIDELITY_PHASES = 0, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, +// bool is_fp32_dest_acc_en = false> +// inline void llk_math_eltwise_binary(uint dst_index, const bool clear_fp32_dst_acc = true) { +// const std::uint32_t num_faces = 4; + +// _llk_math_eltwise_binary_< +// eltwise_binary_type, +// src_b_bcast_type, +// Dst, +// NUM_FIDELITY_PHASES, +// binary_reuse_dest, +// is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc); +// } + +// template < +// EltwiseBinaryType eltwise_binary_type, +// BroadcastType src_b_bcast_type, +// DstSync Dst = DstSync::SyncFull, +// int NUM_FIDELITY_PHASES = 0, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, +// bool is_fp32_dest_acc_en = false> +// inline void llk_math_eltwise_binary( +// const std::uint32_t operand_A, +// const std::uint32_t operand_B, +// uint dst_index, +// const bool clear_fp32_dst_acc = true) { +// const std::uint32_t operand_id = get_operand_id(operand_A); // both operands must have same number of faces +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); + +// _llk_math_eltwise_binary_< +// eltwise_binary_type, +// src_b_bcast_type, +// Dst, +// NUM_FIDELITY_PHASES, +// binary_reuse_dest, +// is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h new file mode 100644 index 00000000000..21c3e8ae428 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_eltwise_binary_sfpu.h" + +// /************************************************************************* +// * LLK ELTWISE BINARY SFPU +// *************************************************************************/ + +// template +// inline void llk_math_eltwise_binary_sfpu( +// const uint operand, +// uint dst_index_a, +// uint dst_index_b, +// int vector_mode = (int)Dim::RC, +// uint param0 = 0, +// uint param1 = 0, +// uint param2 = 0, +// uint param3 = 0, +// uint param4 = 0, +// uint param5 = 0) { +// const std::uint32_t operand_id = get_operand_id(0); +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + +// _llk_math_eltwise_binary_sfpu_( +// face_r_dim, num_faces, dst_index_a, dst_index_b, vector_mode, param0, param1, param2, param3, param4, param5); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_init( +// uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { +// _llk_math_eltwise_binary_sfpu_init_(param0, param1, param2, param3, param4, param5); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_quant_int32( +// uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { +// llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_quant_int32_init(const uint zero_point) { +// llk_math_eltwise_binary_sfpu_init(zero_point); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_requant_int32( +// 
uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { +// llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_requant_int32_init(const uint zero_point) { +// llk_math_eltwise_binary_sfpu_init(zero_point); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_dequant_int32( +// uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { +// llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); +// } + +// template +// inline void llk_math_eltwise_binary_sfpu_dequant_int32_init(const uint zero_point) { +// llk_math_eltwise_binary_sfpu_init(zero_point); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h new file mode 100644 index 00000000000..3da220f0cba --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "ckernel.h" +#include "ckernel_defs.h" +#include "ckernel_globals.h" +#include "ckernel_template.h" +#include "cmath_common.h" +#include "llk_defs.h" +#include "llk_io.h" +#include "llk_math_common.h" +#include "llk_operands.h" +#include "llk_param_structs.h" + +// // Need to revisit why we even need this +// #define EPS 1.19209e-07 // std::numeric_limits::epsilon() for FP32 + +// /************************************************************************* +// * LLK MATH COMMON +// *************************************************************************/ + +// template +// inline void llk_math_wait_for_dest_available() { +// _llk_math_wait_for_dest_available_(); +// } + +// template +// inline void llk_math_dest_section_done() { +// _llk_math_dest_section_done_(); +// } + +// template +// inline void llk_math_pack_sync_init() { +// _llk_math_pack_sync_init_(); +// } + +// template +// inline void llk_math_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) { +// _llk_math_get_tile_(tile_index, p_tile); +// } + +// template +// inline void llk_math_release_tile(std::uint32_t operand) { +// _llk_math_release_tile_(); +// } + +// inline void llk_math_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { _llk_math_debug_dump_(data, byte_size); } + +// inline void llk_math_debug_dump_seek(std::uint8_t offset) { _llk_math_debug_dump_seek_(offset); } + +// inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_new_operand) { +// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); +// _llk_math_reconfig_data_format_srca_(unpack_dst_format[new_srca_operand_id]); +// } + +// inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) { +// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); +// _llk_math_reconfig_data_format_srcb_(unpack_dst_format[new_srcb_operand_id]); +// } + +// inline void 
llk_math_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) { +// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); +// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + +// _llk_math_reconfig_data_format_(unpack_dst_format[new_srca_operand_id], unpack_dst_format[new_srcb_operand_id]); +// } + +// inline void llk_math_reconfig_data_format( +// const std::uint32_t srca_old_operand, +// const std::uint32_t srca_new_operand, +// const std::uint32_t srcb_old_operand, +// const std::uint32_t srcb_new_operand) { +// std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); +// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); +// std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); +// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + +// if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id]) && +// (unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { +// llk_math_reconfig_data_format(srca_new_operand, srcb_new_operand); +// } else if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) { +// llk_math_reconfig_data_format_srca(srca_new_operand); +// } else if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { +// llk_math_reconfig_data_format_srcb(srcb_new_operand); +// } +// } + +// inline void llk_math_reconfig_data_format_srca( +// const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) { +// std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); +// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); + +// if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) { +// llk_math_reconfig_data_format_srca(srca_new_operand); +// } +// } + +// inline void llk_math_reconfig_data_format_srcb( +// const 
std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { +// std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); +// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + +// if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { +// llk_math_reconfig_data_format_srcb(srcb_new_operand); +// } +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h new file mode 100644 index 00000000000..a12bcca1ef4 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_matmul.h" + +// /************************************************************************* +// * LLK MATMUL +// *************************************************************************/ + +// template +// inline void llk_math_matmul_init( +// const std::uint32_t operandA, +// const std::uint32_t operandB, +// const std::uint32_t transpose = 0, +// const std::uint32_t ct_dim = 1, +// const std::uint32_t rt_dim = 1, +// const std::uint32_t kt_dim = 1) { +// const std::uint32_t in0_id = get_operand_id(operandA); +// const std::uint32_t in1_id = get_operand_id(operandB); + +// const bool partial_face = get_operand_partial_face(in0_id); + +// const std::uint32_t in0_tile_r_dim = get_operand_tile_r_dim(in0_id); +// const std::uint32_t in0_tile_c_dim = get_operand_tile_c_dim(in0_id); +// const std::uint32_t in1_tile_r_dim = get_operand_tile_r_dim(in1_id); +// const std::uint32_t in1_tile_c_dim = get_operand_tile_c_dim(in1_id); + +// #ifdef ARCH_GRAYSKULL +// _llk_math_matmul_init_( +// in0_tile_r_dim, +// in0_tile_c_dim, +// in1_tile_r_dim, +// in1_tile_c_dim, +// partial_face, +// transpose, +// ct_dim, +// rt_dim, +// kt_dim); 
+// #else +// _llk_math_matmul_init_( +// in0_tile_r_dim, +// in0_tile_c_dim, +// in1_tile_r_dim, +// in1_tile_c_dim, +// partial_face, +// transpose, +// ct_dim, +// rt_dim, +// kt_dim); +// #endif +// } + +// template +// inline void llk_math_matmul( +// uint dst_index, +// const bool transpose = false, +// const std::uint32_t ct_dim = 1, +// const std::uint32_t rt_dim = 1, +// const std::uint32_t kt_dim = 1) { +// #ifdef ARCH_GRAYSKULL +// _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); +// #else +// _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); +// #endif +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h new file mode 100644 index 00000000000..c5f11d005f2 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "llk_math_reduce.h" + +// /************************************************************************* +// * LLK REDUCE +// *************************************************************************/ + +// template < +// PoolType type, +// ReduceDim dim, +// int num_fidelity_phases = 0, +// bool is_fp32_dest_acc_en = false, +// bool is_int_fpu_en = false> +// inline void llk_math_reduce(const uint dst_index) { +// _llk_math_reduce_(dst_index); +// } + +// template +// inline void llk_math_reduce_init( +// const std::uint32_t within_face_16x16_transpose = +// 0) { // within_face_16x16_transpose used for unpack, ignored by math +// _llk_math_reduce_init_(within_face_16x16_transpose); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h new file mode 100644 index 00000000000..ca2a5d39e40 --- /dev/null +++ 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_common_api.h" +#include "llk_math_eltwise_unary_datacopy.h" + +// /************************************************************************* +// * LLK ELTWISE UNARY DATACOPY +// *************************************************************************/ + +// template < +// DataCopyType type, +// BroadcastType src_b_bcast_type = BroadcastType::NONE, +// DstSync Dst = DstSync::SyncFull, +// bool is_fp32_dest_acc_en = false, +// bool unpack_to_dest = false> +// inline void llk_math_eltwise_unary_datacopy(uint dst_index, uint operand = 0) { +// const std::uint32_t operand_id = get_operand_id(0); +// _llk_math_eltwise_unary_datacopy_( +// dst_index, unpack_src_format[operand_id], unpack_dst_format[operand_id]); +// } + +// template +// // within_face_16x16_transpose is used by unpacker, math does not transpose +// inline void llk_math_eltwise_unary_datacopy_init( +// const std::uint32_t transpose_of_faces = 0 /*unused*/, +// const std::uint32_t within_face_16x16_transpose = 0 /* unused */, +// const std::uint32_t operand = 0) { +// const std::uint32_t operand_id = get_operand_id(0); +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// _llk_math_eltwise_unary_datacopy_init_( +// transpose_of_faces, within_face_16x16_transpose, num_faces); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h new file mode 100644 index 00000000000..53b9d1afe8b --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h @@ -0,0 +1,293 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_math_common_api.h" +#include "metal_ckernel_sfpu.h" +#include "llk_math_eltwise_unary_sfpu_init.h" + +// namespace ckernel { + +// /************************************************************************* +// * LLK ELTWISE UNARY SFPU +// *************************************************************************/ + +// // New LLK SFPU APIs +// template +// inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_rsqrt_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_log_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_log_with_base(uint dst_index,uint base_scale) { +// llk_math_eltwise_unary_sfpu(dst_index,base_scale); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_log_with_base_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_signbit(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_signbit_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_tanh_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //sign +// template +// inline void llk_math_eltwise_unary_sfpu_sign(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void 
llk_math_eltwise_unary_sfpu_sign_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } +// template +// inline void llk_math_eltwise_unary_sfpu_dropout(uint dst_index, int vector_mode, int integer_dropout, int scale_factor) { +// constexpr bool dont_care = false; +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode, integer_dropout, scale_factor); +// } + +// inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) { +// constexpr bool dont_care = false; +// constexpr uint dont_care_param = 0; + +// llk_math_eltwise_unary_sfpu_init(dont_care_param, dont_care_param, seed); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_sigmoid_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //EQZ +// template +// inline void llk_math_eltwise_unary_sfpu_eqz(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_eqz_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //NEZ +// template +// inline void llk_math_eltwise_unary_sfpu_nez(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_nez_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //LTZ +// template +// inline void llk_math_eltwise_unary_sfpu_ltz(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_ltz_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //GTZ +// template +// inline void llk_math_eltwise_unary_sfpu_gtz(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_gtz_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //LEZ +// template +// inline void 
llk_math_eltwise_unary_sfpu_lez(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_lez_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //GEZ +// template +// inline void llk_math_eltwise_unary_sfpu_gez(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_gez_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_max_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_square_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode, pow); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_power_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_abs_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //EXP2 +// 
template +// inline void llk_math_eltwise_unary_sfpu_exp2(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_exp2_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //heaviside +// template +// inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) { +// llk_math_eltwise_unary_sfpu(dst_index,vector_mode,param0); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_heaviside_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //EXPM1 +// template +// inline void llk_math_eltwise_unary_sfpu_expm1(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_expm1_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //Asin +// template +// inline void llk_math_eltwise_unary_sfpu_asin(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_asin_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //Atan +// template +// inline void llk_math_eltwise_unary_sfpu_atan(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_atan_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //Acos +// template +// inline void llk_math_eltwise_unary_sfpu_acos(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_acos_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// //silu +// template +// inline void llk_math_eltwise_unary_sfpu_silu(uint dst_index) { +// llk_math_eltwise_unary_sfpu(dst_index); +// } + +// template +// inline void llk_math_eltwise_unary_sfpu_silu_init() { +// llk_math_eltwise_unary_sfpu_init(); +// } + +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h new file mode 100644 index 00000000000..ca7e298a7c2 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_op_info_api.h @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +extern uint32_t op_info_offset; + +inline void llk_get_next_op_info(tt::op_info_t& op_info_struct) { + + uint32_t* op_info_ptr = reinterpret_cast(OP_INFO_BASE_ADDR + op_info_offset); + static constexpr uint32_t op_info_num_items = 7; + + volatile tt_l1_ptr uint32_t* op_info_struct_ptr = reinterpret_cast(&op_info_struct); + for (uint32_t i = 0; i < op_info_num_items; i++) { + op_info_struct_ptr[i] = op_info_ptr[i]; + } + op_info_offset += 28; + + if (op_info_offset == OP_INFO_SIZE) { + op_info_offset = 0; // In case we go out of bounds + } +} diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h new file mode 100644 index 00000000000..37ee8a0fe56 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h @@ -0,0 +1,308 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "ckernel.h" +#include "ckernel_defs.h" +#include "ckernel_template.h" +#include "cpack_common.h" +#include "ckernel_globals.h" +#include "circular_buffer.h" + +#include "llk_io.h" +#include "llk_defs.h" +#include "llk_outputs.h" +#include "llk_param_structs.h" +#include "llk_pack.h" +#include "llk_pack_common.h" + +/************************************************************************* +* LLK PACK +*************************************************************************/ + +// template +// inline void llk_pack_mop_config(const uint32_t output) { + +// const std::uint32_t output_id = get_output_id(output); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const bool partial_face = get_output_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// _llk_pack_mop_config_( +// pack_dst_format[output_id], +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile +// ); +// } + +// template +// inline void llk_pack_hw_configure(const llk_pack_params_t *pack_params) { + +// const std::uint32_t output_id = get_output_id(pack_params->pack_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const bool partial_face = get_output_partial_face(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; + +// _llk_pack_hw_configure_( +// pack_src_format[output_id], +// pack_dst_format[output_id], +// tile_size, +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile, +// pack_params->relu_config.val +// ); +// } + +// template +// inline void llk_pack_hw_configure_disaggregated(std::uint32_t pack_output) { +// 
llk_pack_params_t llk_pack_params = { +// .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold,}}}; +// llk_pack_hw_configure(&llk_pack_params); +// } + +// template +// inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) { +// const std::uint32_t output_id = get_output_id(pack_params->pack_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const bool partial_face = get_output_partial_face(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; + +// _llk_pack_reduce_hw_configure_( +// pack_src_format[output_id], +// pack_dst_format[output_id], +// tile_size, +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile, +// pack_params->relu_config.val +// ); +// } + +// template +// inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output) { +// llk_pack_params_t llk_pack_params = { +// .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}}; +// llk_pack_reduce_hw_configure(&llk_pack_params); +// } + +// template +// inline void llk_pack_init(const std::uint32_t pack_output = 16) { + +// const std::uint32_t output_id = get_output_id(pack_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const bool partial_face = get_output_partial_face(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// _llk_pack_init_( +// pack_dst_format[output_id], +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile +// ); +// } + +// template +// inline std::uint32_t get_output_tile_address(std::uint8_t output_id, std::uint32_t output_tile_index) { + +// std::uint32_t 
pack_tile_addr; +// if constexpr (out_of_order_output) { +// pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + +// (std::uint32_t)(cb_interface[output_id].fifo_page_size)*output_tile_index - 1; +// } else { +// if constexpr (untilize) { +// // FIXME: Need to support pack-untilize? +// // std::uint16_t out_tile_index = (cb_interface[output_id].ublock_tile_cnt/cb_interface[output_id].ublock_ct)*cb_interface[output_id].row_tile_dim + +// // cb_interface[output_id].ublock_tile_cnt%cb_interface[output_id].ublock_ct; //FIXME: optimize perf +// // pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1; +// // pack_tile_addr += out_tile_index*(std::uint32_t)(cb_interface[output_id].fifo_page_size); + +// // cb_interface[output_id].ublock_tile_cnt++; + +// // if (cb_interface[output_id].ublock_tile_cnt == cb_interface[output_id].ublock_tile_dim) { +// // cb_interface[output_id].ublock_tile_cnt=0; +// // cb_interface[output_id].fifo_wr_tile_ptr += (std::uint32_t)(cb_interface[output_id].fifo_page_size)*cb_interface[output_id].ublock_ct; +// // } +// } else { +// pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1; +// cb_interface[output_id].fifo_wr_tile_ptr += cb_interface[output_id].fifo_page_size; +// } +// } +// return pack_tile_addr; +// } + +// template +// inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0) { +// std::uint8_t output_id = get_output_id(output); + +// static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!"); + +// std::uint32_t pack_tile_addr = get_output_tile_address(output_id, output_tile_index); + +// _llk_pack_( +// tile_index, +// pack_tile_addr +// ); +// } + +// /************************************************************************* +// * LLK PACK COMMON +// *************************************************************************/ + + +// inline 
void llk_packer_wait_for_math_done() { +// _llk_packer_wait_for_math_done_(); +// } + +// template +// inline void llk_packer_set_math_semaphore() { +// _llk_packer_set_math_semaphore_(); +// } + +// template +// inline void llk_pack_dest_section_done() { +// _llk_pack_dest_section_done_(); +// } + +// template +// inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 16) { +// const std::uint32_t output_id = get_output_id(pack_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// _llk_init_packer_dest_offset_registers_( +// face_r_dim, +// narrow_tile +// ); +// } + +// template +// inline void llk_pack_dest_init(const std::uint32_t pack_output = 16) { + +// const std::uint32_t output_id = get_output_id(pack_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); + +// _llk_pack_dest_init_( +// face_r_dim, +// narrow_tile +// ); +// } + +// template +// inline void llk_pack_get_tile(std::uint32_t output, std::uint32_t tile_index, std::uint32_t *p_tile) { +// _llk_pack_get_tile_(tile_index, p_tile); +// } + +// template +// inline void llk_pack_release_tile(std::uint32_t output) { +// _llk_pack_release_tile_(); +// } + +// inline void llk_pack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { +// _llk_pack_debug_dump_(data, byte_size); +// } + +// inline void llk_pack_debug_dump_seek(std::uint8_t offset) { +// _llk_pack_debug_dump_seek_(offset); +// } + +// template +// inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) { + +// const std::uint32_t output_id = get_output_id(new_output); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const bool partial_face = get_output_partial_face(output_id); +// const bool narrow_tile = 
get_output_narrow_tile(output_id); + +// _llk_pack_reconfig_data_format_( +// pack_src_format[output_id], +// pack_dst_format[output_id], +// cb_interface[output_id].fifo_page_size, +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile +// ); +// } + +// template +// inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) { +// std::uint32_t old_output_id = get_output_id(old_output); +// std::uint32_t new_output_id = get_output_id(new_output); + +// if((pack_dst_format[old_output_id] != pack_dst_format[new_output_id]) +// && (pack_dst_format[old_output_id] != (uint)DataFormat::Invalid) +// && (pack_dst_format[new_output_id] != (uint)DataFormat::Invalid)) { +// llk_pack_reconfig_data_format(new_output); +// } else if constexpr (is_tile_dim_reconfig_en) { +// // Same format but different tile dims +// llk_pack_mop_config(new_output); +// } +// } + +// TT_ALWAYS_INLINE void llk_pack_relu_config(const std::uint32_t config) { +// _llk_pack_relu_config_(config); +// } + +// inline void llk_pack_reconfig_l1_acc(const std::uint32_t enable) { +// _llk_pack_reconfig_l1_acc_(enable); +// } + +// template +// inline void llk_pack_reduce_mask_config() { +// _llk_pack_reduce_mask_config_(); +// } + +// inline void llk_pack_reduce_mask_clear() { +// _llk_pack_reduce_mask_clear_(); +// } + +// // FIXME-WH-UPLIFT +// template +// inline void llk_pack_reduce_config_v2(uint32_t icb_out) { + +// const bool untilize = false; +// if constexpr (at_kernel_start) { + +// const std::uint32_t output_id = get_output_id(icb_out); +// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); +// const std::uint32_t num_faces = get_output_num_faces(output_id); +// const bool partial_face = get_output_partial_face(output_id); +// const bool narrow_tile = get_output_narrow_tile(output_id); +// const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; +// const llk_relu_config_u relu_config = {.f = {.ApplyRelu = 
(std::uint32_t)ReluType::NO_RELU, .Threshold = 0,}}; + +// _llk_pack_hw_configure_( +// pack_src_format[output_id], +// pack_dst_format[output_id], +// tile_size, +// face_r_dim, +// num_faces, +// partial_face, +// narrow_tile, +// relu_config.val +// ); +// } + +// if constexpr (revert) { +// _llk_pack_reduce_mask_clear_(); +// } else { +// _llk_pack_reduce_mask_config_(); +// } +// } diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_param_structs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_param_structs.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_reverseops.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_reverseops.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_reverseops.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_cdf.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_cdf.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_cdf.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_cdf.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_converter.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_converter.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_converter.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_converter.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_elu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_elu.h rename to 
tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_elu.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_erf_erfc.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_erf_erfc.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_erfinv.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_erfinv.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_erfinv.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_exp.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_exp.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_exp.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_exp.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_gelu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_gelu.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_gelu.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_i0.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_i0.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_i0.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_isinf_isnan.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_isinf_isnan.h similarity index 100% rename from 
tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_isinf_isnan.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_isinf_isnan.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_logical_not_noti.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_logical_not_noti.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_recip.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_recip.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_recip.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_recip.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_relu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_relu.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_relu.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_sqrt.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_sqrt.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_sqrt.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_sqrt.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_trigonometry.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_trigonometry.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_trigonometry.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_trigonometry.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_0_param.h 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_0_param.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_0_param.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_1_param.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_1_param.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_1_param.h diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h new file mode 100644 index 00000000000..83a5fdcca92 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h @@ -0,0 +1,171 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include + +#include "llk_sfpu_types.h" +#include "ckernel_globals.h" +#include "ckernel_include.h" +#include "ckernel_template.h" +#include "metal_ckernel_sfpu.h" +#include "cmath_common.h" +#include "llk_format_conversions.h" +#include "llk_math_common.h" +#include "llk_param_structs.h" +#include "llk_math_eltwise_unary_sfpu.h" + +//TODO: Fix for GS uplift + +// using namespace ckernel; +// using namespace ckernel::sfpu; +// namespace ckernel { + +// /************************************************************************* +// * LLK ELTWISE UNARY SFPU +// *************************************************************************/ + +// template < +// SfpuType operation, +// bool APPROXIMATION_MODE, +// int SfpuType_PARAM = 0, +// int ITERATIONS = 8, +// bool IS_INT_SFPU_EN = false> +// inline void llk_math_calculate_sfpu( +// const int iterations = ITERATIONS, +// uint param0 = 0, +// uint param1 = 0, +// uint param2 = 0, +// uint param3 = 0, +// uint param4 = 0, +// uint param5 = 0) { +// if constexpr (operation == SfpuType::exp_with_base) { +// constexpr bool zero_negative = true; +// _calculate_exponential_(iterations, param0); +// } else if constexpr (operation == SfpuType::tanh) { +// _calculate_tanh_(iterations); +// } else if constexpr (operation == SfpuType::hardtanh) { +// _calculate_hardtanh_(iterations, param0, param1, param2); +// } else if constexpr (operation == SfpuType::rsqrt) { +// // param0 = true -> approximate fast mode +// // false -> high precision mode +// // The algorithm uses Newton's method based on no.of iteration better approximation can be calculated +// if (param0) { +// calculate_rsqrt(); +// } else { +// calculate_rsqrt(); +// } +// } else if constexpr (operation == SfpuType::sigmoid) { +// calculate_sigmoid(); +// } else if constexpr (operation == SfpuType::sigmoid_appx) { +// calculate_sigmoid_appx(); +// } else if constexpr (operation == SfpuType::tanh_derivative) { 
+// calculate_tanh_derivative(); +// } else if constexpr (operation == SfpuType::dropout) { +// calculate_dropout(param0, param1); +// } else if constexpr (operation == SfpuType::power) { +// calculate_power_iterative(param0); +// } else if constexpr (operation == SfpuType::square) { +// calculate_square(); +// } else if constexpr (operation == SfpuType::log) { +// calculate_log(param0); +// } else if constexpr (operation == SfpuType::log_with_base) { +// calculate_log(param0); +// } else if constexpr ( +// (operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) || +// (operation == SfpuType::less_than_zero) || (operation == SfpuType::greater_than_equal_zero) || +// (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) { +// calculate_comp(8); // BFLOAT16 - exp +// } else if constexpr (operation == SfpuType::clamp) { +// calculate_clamp(param0, param1, param2); +// } else if constexpr (operation == SfpuType::abs) { +// calculate_abs(); +// } else if constexpr (operation == SfpuType::sign) { +// calculate_sign(); +// } else if constexpr (operation == SfpuType::max) { +// calculate_max(); +// } else if constexpr (operation == SfpuType::min) { +// calculate_min(); +// } else if constexpr (operation == SfpuType::exp2) { +// calculate_exp2(); +// } else if constexpr (operation == SfpuType::heaviside) { +// calculate_heaviside(param0); +// } else if constexpr (operation == SfpuType::expm1) { +// calculate_expm1(); +// } else if constexpr (operation == SfpuType::asin) { +// calculate_asin(); +// } else if constexpr (operation == SfpuType::acos) { +// calculate_acos(); +// } else if constexpr (operation == SfpuType::atan) { +// calculate_atan(); +// } else if constexpr (operation == SfpuType::signbit) { +// calculate_signbit(); +// } else if constexpr (operation == SfpuType::silu) { +// calculate_silu(); +// } +// } + +// template +// inline void llk_math_eltwise_unary_sfpu( +// uint dst_index, +// int 
vector_mode = (int)Dim::RC, +// uint param0 = 0, +// uint param1 = 0, +// uint param2 = 0, +// uint param3 = 0, +// uint param4 = 0, +// uint param5 = 0) { +// const std::uint32_t operand_id = get_operand_id(0); // Fix to operand 0. assume no tiny-tile support +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); + +// constexpr int ITERATIONS = 8; + +// _llk_math_eltwise_unary_sfpu_start_(dst_index); + +// if (vector_mode == (int)Dim::R) { +// // Do a row vector, Face0 + Face1 -- first iteration (first row) +// const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 2 : face_r_dim / 2) +// : 2; // At least 2 iterations for odd and even columns +// #pragma GCC unroll 0 +// for (int face = 0; face < 2; face++) { +// llk_math_calculate_sfpu( +// iterations, param0, param1, param2, param3, param4, param5); +// // Move to the next face +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// // Skip next two faces +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } else if (vector_mode == (int)Dim::C) { +// // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for +// // full face +// #pragma GCC unroll 0 +// for (int face = 0; face < 2; face++) { +// llk_math_calculate_sfpu( +// ITERATIONS, param0, param1, param2, param3, param4, param5); +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// if (num_faces > 2) { // Skip next face if tile is 32x32 +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// } +// if (num_faces <= 2) { +// // Skip next two faces +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// } else { +// // Do all four faces, and iterate through all 4 blocks of 4 rows each +// #pragma GCC unroll 0 +// for (int face = 0; face < 4; face++) { +// 
llk_math_calculate_sfpu( +// ITERATIONS, param0, param1, param2, param3, param4, param5); +// // Move to the next face +// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); +// } +// } +// _llk_math_eltwise_unary_sfpu_done_(); +// } + +// } // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_elu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_elu.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_elu.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_erf_erfc.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_erf_erfc.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erf_erfc.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_erfinv.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_erfinv.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_erfinv.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_exp.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_exp.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_exp.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_gelu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h 
similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_gelu.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_gelu.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_i0.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_i0.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_i0.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_init.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_init.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_init.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_isinf_isnan.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_isinf_isnan.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_isinf_isnan.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_logical_not_noti.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_logical_not_noti.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_logical_not_noti.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_recip.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h 
similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_recip.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_recip.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_relu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_relu.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_relu.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_reverseops.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_reverseops.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_reverseops.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_sqrt.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_sqrt.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_trigonometry.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_trigonometry.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_trigonometry.h diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h new file mode 100644 index 
00000000000..50018e399c3 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h @@ -0,0 +1,780 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_sfpu_types.h" +#include "ckernel_defs.h" +#include "ckernel_sfpu.h" +#include "ckernel.h" +#include "noc_nonblocking_api.h" + +#include "sfpi.h" + +#include "ckernel_sfpu_cdf.h" +#include "ckernel_sfpu_exp.h" +#include "ckernel_sfpu_recip.h" +#include "ckernel_sfpu_converter.h" + +//TODO: Delete this file once GS uplift is done +// using namespace sfpi; + +// namespace ckernel +// { +// namespace sfpu +// { + +// template +// inline void calculate_rsqrt() +// { + +// for (int d = 0; d < ITERATIONS; d++) +// { + +// vFloat in = dst_reg[0]; +// v_if(dst_reg[0] == 0.0f){ +// dst_reg[0] = std::numeric_limits::infinity(); +// }v_else{ +// vFloat result = 1.0f; +// v_if(dst_reg[0] > 1.0f){ +// result = sfpu_reciprocal(in); +// }v_endif; + +// for (int r = 0; r < RECIPROCAL_ITERATIONS; r++) +// { +// // y = y * (1.5 - 0.5 * x * y * y) Newton's method iteration. 
+// result = result * (1.5F - 0.5F * dst_reg[0] * result * result); +// } +// dst_reg[0] = result; +// }v_endif; + +// dst_reg++; + +// } +// } + +// template +// inline void calculate_sigmoid_appx() +// { +// vUInt l0 = l_reg[LRegs::LReg0]; +// vUInt l1 = l_reg[LRegs::LReg1]; +// vUInt l2 = l_reg[LRegs::LReg2]; + +// #pragma GCC unroll 8 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; + +// dst_reg[0] = lut(val, l0, l1, l2) + 0.5f; + +// dst_reg++; +// } + +// l_reg[LRegs::LReg0] = l0; +// l_reg[LRegs::LReg1] = l1; +// l_reg[LRegs::LReg2] = l2; +// } + +// // TODO: Implement using bitwise comparison +// template +// inline void calculate_signbit() +// { + +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// v_if (val <= -0.0f) { +// val = 1.0f; +// } v_elseif (val >= 0.0f) { +// val = 0.0f; +// } +// v_endif; +// dst_reg[0] = val; + +// dst_reg++; +// } + +// } + +// template +// inline void calculate_tanh() +// { +// // SFPU microcode +// vUInt l0 = l_reg[LRegs::LReg0]; +// vUInt l1 = l_reg[LRegs::LReg1]; +// vUInt l2 = l_reg[LRegs::LReg2]; + +// #pragma GCC unroll 8 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// val = lut(val, l0, l1, l2); +// dst_reg[0] = val; + +// dst_reg++; +// } + +// l_reg[LRegs::LReg0] = l0; +// l_reg[LRegs::LReg1] = l1; +// l_reg[LRegs::LReg2] = l2; +// } + +// template +// inline void calculate_hardtanh(uint param0, uint param1, uint param2) +// { +// // All params are in FP16_B format +// // param0 = -(neg_threshold) +// // param1 = -(pos_threshold - neg_threshold) +// // param2 = -(pos_threshold) + +// vFloat p0 = s2vFloat16(param0); +// vFloat p1 = s2vFloat16(param1); +// vFloat p2 = s2vFloat16(param2); +// // SFPU microcode +// #pragma GCC unroll 0 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; + +// val += p0;// 12 bits +// v_if (val < 0.0f) { +// val = 0.0f; +// } +// v_endif; + +// val += p1;// 12 bits +// v_if
(val >= 0.0f) { +// val = 0.0f; +// } +// v_endif; + +// val += p2;// 12 bits + +// dst_reg[0] = val; + +// dst_reg++; +// } +// } + +// template +// inline void calculate_tanh_derivative() +// { +// vUInt l0 = l_reg[LRegs::LReg0]; +// vUInt l1 = l_reg[LRegs::LReg1]; +// vUInt l2 = l_reg[LRegs::LReg2]; + +// // tanh'(x) = 1 - (tanh(x))^2 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; + +// if constexpr (!WITH_PRECOMPUTED_TANH) { +// val = lut(val, l0, l1, l2); +// } + +// val = val * (-val) + vConst1; +// dst_reg[0] = val; + +// dst_reg++; +// } + +// l_reg[LRegs::LReg0] = l0; +// l_reg[LRegs::LReg1] = l1; +// l_reg[LRegs::LReg2] = l2; +// } + +// template +// inline void calculate_dropout(uint prob, uint scale) +// { +// // SFPU microcode + +// vUInt rand = l_reg[LRegs::LReg3]; + +// #pragma GCC unroll 0 +// for (int d = 0; d < ITERATIONS; d++) { +// //////////////////////// +// // Scale samples +// /////////////////////// +// dst_reg[0] = dst_reg[0] * s2vFloat16b(scale); + +// //////////////////////// +// // Drop samples +// /////////////////////// +// v_if (rand < prob) { +// dst_reg[0] = vConst0; +// } +// v_endif; + +// //////////////////////// +// // 16-bit PRNG update +// /////////////////////// +// vUInt lfsr = vConstIntPrgm1; +// vUInt tmp = lfsr & rand; +// rand = rand >> 1; +// v_if (tmp != 0) { +// vUInt mask = vConstIntPrgm0; +// rand ^= mask; +// } +// v_endif; + +// dst_reg++; +// } + +// l_reg[LRegs::LReg3] = rand; +// } + +// template +// inline void calculate_power_iterative(const uint exponent) +// { +// #pragma GCC unroll 8 +// for (int d = 0; d < 8; d++) +// { +// vFloat in = dst_reg[0]; +// vFloat result = 1.0f; +// for (uint i = 0; i < exponent; i++) { +// result *= in; +// } +// dst_reg[0]=result; +// dst_reg++; +// } +// } + +// template +// inline void calculate_square() +// { +// #pragma GCC unroll 8 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat in = dst_reg[0]; +// vFloat result = in * in; + +// 
dst_reg[0] = result; + +// dst_reg++; +// } +// } + +// template +// sfpi_inline void calculate_log_body(const uint log_base_scale_factor) +// { +// //////////////////////////// +// // Load From dest + "normalize to calculation range" +// //////////////////////////// +// vFloat in = dst_reg[0]; +// vFloat x = setexp(in, 127); // set exp to exp bias (put in range of 1-2) + +// // XXXXXX ask Namal? if we can derive the coefficients below to higher precision +// //////////////////////////// +// // Calculate Cheby Approximation using Horner Form Multiplication: 3rd Order +// // x* ( x* (A*x + B) + C) + D +// // A :0.1058, B: -0.3942, C: 0.9813, D: 0.006 +// // Run above on (x-1) so x is in ln(x+1), plug (x-1 into equation above to +// // save the subtract and get A',B',C',D'): +// // A' = A +// // B' = -3A + B +// // C' = 3a -2B + C +// // D' = -A + B - C + D +// // A':0.1058, B':-0.7116, C':2.0871, D':-1.4753 +// //////////////////////////// +// vFloat a = vConstFloatPrgm1; +// vFloat b = vConstFloatPrgm2; +// // XXXXX try variants of the below: B'=.7122, C'=2.0869 +// vFloat series_result = x * (x * (x * a + b) + 2.0871) + -1.4753f; + +// //////////////////////////// +// // Convert exponent to float +// //////////////////////////// +// vInt exp = exexp(in); +// v_if (exp < 0) { +// exp = setsgn(~exp + 1, 1); +// } +// v_endif; + +// vFloat expf = int32_to_float(exp, 0); +// vFloat vConstLn2 = vConstFloatPrgm0; +// vFloat result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2) + +// if constexpr (HAS_BASE_SCALING) { +// result *= s2vFloat16a(log_base_scale_factor); +// } + +// //////////////////////////// +// // Base case when input is 0. 
ln(0) = -inf +// //////////////////////////// +// v_if (in == 0.0F) { // Reload for register pressure +// result = -std::numeric_limits::infinity(); +// } +// v_endif; + +// dst_reg[0] = result; +// } + +// template +// inline void calculate_log(uint log_base_scale_factor) +// { +// #pragma GCC unroll 8 +// for(int d = 0; d < ITERATIONS; d++){ +// calculate_log_body(log_base_scale_factor); +// dst_reg++; +// } +// } + +// sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& flag2, float init) +// { +// flag1 = init; +// if (check) { +// flag2 = init; +// } +// } + +// template +// inline void calculate_comp(uint exponent_size_8) +// { +// const vFloat zero = 0.0f; +// const vFloat one = 1.0f; +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// vFloat flag1, flag2; + +// //a[i] == 0 +// if constexpr(COMP_MODE == SfpuType::equal_zero) { +// v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) { +// v = one; +// } v_else { +// v = zero; +// } +// v_endif; +// } + +// //a[i] != 0 +// if constexpr(COMP_MODE == SfpuType::not_equal_zero) { +// v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) { +// v = zero; +// } v_else { +// v = one; +// } +// v_endif; +// } + +// //a[i] < 0 +// if constexpr(COMP_MODE == SfpuType::less_than_zero) { +// v_if (v >= 0.0f) { +// v = zero; +// } v_else { +// v = one; +// } +// v_endif; +// } + +// //a[i] >= 0 +// if constexpr(COMP_MODE == SfpuType::greater_than_equal_zero) { +// v_if (v >= 0.0f) { +// v = one; +// } v_else { +// v = zero; +// } +// v_endif; +// } + +// //a[i] > 0 +// if constexpr(COMP_MODE == SfpuType::greater_than_zero) { +// v_if (v > 0.0f) { +// v = one; +// } v_else { +// v = zero; +// } +// v_endif; +// } + +// //a[i] <= 0 +// if constexpr(COMP_MODE == SfpuType::less_than_equal_zero) { +// v_if (v > 0.0f) { +// v = zero; +// } v_else { +// v = one; +// } +// v_endif; +// } + +// dst_reg[0] = v; +// dst_reg++; +// } +// } + +// template +// inline void calculate_clamp(uint param0, 
uint param1, uint param2) +// { +// // All params are in FP16 format +// // param0 = min +// // param1 = max + +// //uint format = (param0 >> 16)&0x1; +// s2vFloat16::Format format = s2vFloat16::fp16a; + +// // SFPU microcode +// vFloat min = s2vFloat16(param0, format); +// vFloat max = s2vFloat16(param1, format); +// #pragma GCC unroll 0 +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; + +// v_if (val < min) { +// val = s2vFloat16(param0, format); +// } v_elseif (val >= max) { +// val = s2vFloat16(param1, format); +// } +// v_endif; + +// dst_reg[0] = val + s2vFloat16b(param2); // 12 bits + +// dst_reg++; +// } +// } + +// template +// inline void calculate_abs() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// dst_reg[0] = sfpi::abs(v); +// dst_reg++; +// } +// } + + +// template +// inline void calculate_exp2() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// // log(2) = 0.6931471805; +// v = v * 0.6931471805f; +// // exp = e^(v) +// vFloat exp = calculate_exponential_body_improved(v); +// dst_reg[0] = exp; +// dst_reg++; +// } +// } + +// template +// inline void calculate_sign() +// { +// // All params are in FP16 format +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// vFloat result = vConst1; +// v_if (v < 0.0f) { +// result = vConstNeg1; +// } v_elseif(v > 0.0f) { +// result = vConst1; +// } v_else { +// result = vConst0; +// } +// v_endif; + +// dst_reg[0] = result; +// dst_reg++; +// } +// } +// template +// inline void calculate_max() +// { +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat a = dst_reg[0]; +// vFloat b = dst_reg[32]; +// v_if(a < b) { +// dst_reg[0] = b; +// } +// v_endif; + +// dst_reg++; +// } +// } + +// template +// inline void calculate_min() +// { +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat a = dst_reg[0]; +// vFloat b = dst_reg[32]; +// v_if(a 
> b) { +// dst_reg[0] = b; +// } +// v_endif; + +// dst_reg++; +// } +// } + +// template +// inline void calculate_expm1() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// v = calculate_exponential_body_improved(v); +// dst_reg[0] = v - 1.0f; +// dst_reg++; +// } +// } + + +// #define POLYVAL6(coef5, coef4, coef3, coef2, coef1, coef0, t4) (t4 * (t4 * (t4 * (t4 * (coef5 * t4 + coef4) + coef3) + coef2) + coef1) + coef0) + +// template +// sfpi_inline vFloat sfpu_atan_maclaurin_series(vFloat val) +// { +// v_if(1 > sfpi::abs(val)){ +// dst_reg[0] = sfpi::abs(val) ; +// } +// v_else{ +// dst_reg[0] = sfpu_reciprocal(sfpi::abs(val)); +// } +// v_endif; + +// vFloat t1 = dst_reg[0] * dst_reg[0]; + +// t1 = POLYVAL6(-0.013480470f, 0.057477314f, -0.121239071f, 0.195635925f, -0.332994597f, 0.999995630f, t1); + +// t1 = t1 * dst_reg[0]; + +// v_if (sfpi::abs(val) > 1){ +// t1 = 1.570796327f - t1; +// } +// v_endif; + +// v_if(val < 0 ){ +// t1 = -t1; +// } +// v_endif; + +// return t1; +// } + +// template +// inline void calculate_atan() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// val = sfpu_atan_maclaurin_series(val); +// dst_reg[0] = val; +// dst_reg++; +// } +// } + + +// template +// sfpi_inline vFloat sfpu_asine_maclaurin_series(vFloat val) +// { +// // input for [-1:1] +// // Maclaurin series +// // arcsin(x) = x + [(1/2) *x^3/3] + [(1 * 3) / (2 * 4) * x^5 / 5] + [(1 * 3 * 5) / (2 * 4 * 6) * x^7 / 7 ] + ...
+// // arcsin(x) ≈ x + (1/6) * x^3 + (3/40) * x^5 + (5/112) * x^7 + (35/1152) * x^9 + (63/2816) * x^11 + +// vFloat tmp = val; +// vFloat val_square = val * val; +// // x +// vFloat output = tmp; +// // (1/6) * x^3 +// tmp = tmp * val_square; +// output += 0.166666666 * tmp; +// // (3/40) * x^5 +// tmp = tmp * val_square; +// output += 0.075 * tmp; + +// //(5/112) * x^7 +// tmp = tmp * val_square; +// output += 0.044642857 * tmp; + +// // (35/1152) *x^9 +// tmp = tmp * val_square; +// output += 0.03038194 * tmp; + +// //(63/2816) * x^11 +// tmp = tmp * val_square; +// output += 0.02237216 * tmp; + +// // Write out output +// return output; +// } + +// template +// inline void calculate_asin() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// v = sfpu_asine_maclaurin_series(v); +// dst_reg[0] = v; +// dst_reg++; +// } +// } + + +// #define PI_2 (1.570796326794) +// template +// inline void calculate_acos() +// { +// // SFPU microcode +// // acos = (pi/2 - asin) +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat v = dst_reg[0]; +// v = sfpu_asine_maclaurin_series(v); +// v = PI_2 - v; +// dst_reg[0] = v; +// dst_reg++; +// } +// } + +// template +// inline void cast_fp32_to_fp16a() +// { +// #pragma GCC unroll 8 +// for (int d = 0; d < ITERATIONS; d++) +// { +// //vFloat val = dst_reg[0]; +// //dst_reg[0] = float_to_fp16a(val, 0); +// TTI_SFPLOAD(0, 0, 3, 0); +// TTI_SFP_STOCH_RND(0,0,0,0,0,8); +// TTI_SFPSTORE(0,1,3,0); +// dst_reg++; +// } +// } + + + +// template +// inline void calculate_negative() +// { + +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// dst_reg[0] = -val; +// dst_reg++; +// } +// } + +// template +// inline void calculate_add1() +// { +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// dst_reg[0] = 1.0f + val; +// dst_reg++; +// } +// } + +// inline +// vFloat sigmoid_piecewise_linear_positive(vFloat val) { +// vFloat result
= 0.0f; +// v_if ( val >= +5.0f) { +// result = 1.0f; +// } v_elseif ( val > 1.0f && val < 5.0f ) { +// result = POLYVAL5(0.00144462f, -0.01055479f, -0.01203685f, 0.24300185f, 0.50437757f,val); +// } v_else { +// result = 0.229f*val + 0.5f; // linear appx as y = 0.229x + 0.5 +// } +// v_endif; +// return result; +// } + +// //sigmoid is anti-symmetric and offset by 1 +// //sigmoid[-x] = 1 - sigmoid[x] +// template +// inline void calculate_sigmoid() +// { +// for (int d = 0; d < ITERATIONS; d++) +// { +// vFloat val = dst_reg[0]; +// vFloat result = 0.0f; + +// v_if ( val < 0.0f ) { +// val = -val; +// } +// v_endif; + +// result = sigmoid_piecewise_linear_positive(val); + +// val = dst_reg[0]; +// v_if ( val < 0.0f ) { +// result = 1.0f - result; +// } +// v_endif; + +// dst_reg[0] = result; +// dst_reg++; +// } + +// return; +// } + +// template +// inline void calculate_heaviside(uint value) +// { +// // SFPU microcode +// Converter c_value; +// c_value.u = value; +// vFloat s = c_value.f; + +// #pragma GCC unroll 0 +// for (int d = 0; d < ITERATIONS; d++) { +// vFloat v = dst_reg[0]; + +// v_if (v < 0.0f) { +// v = 0.0f; +// }v_elseif (v > 0.0f) { +// v = 1.0f; +// }v_else { +// v = s; +// } +// v_endif; + +// dst_reg[0] = v; + +// dst_reg++; +// } +// } + +// template +// inline void calculate_silu() +// { +// // SFPU microcode +// for (int d = 0; d < ITERATIONS; d++) { +// vFloat val = dst_reg[0]; +// v_if ( val < 0.0f ) { +// val = -val; +// } +// v_endif; + +// vFloat result = sigmoid_piecewise_linear_positive(val); + +// val = dst_reg[0]; +// v_if ( val < 0.0f ) { +// result = 1.0f - result; +// } +// v_endif; +// result = val * result; +// dst_reg[0] = result; +// dst_reg++; +// } +// } + +// } // namespace sfpu +// } // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h new file mode 100644 index 00000000000..bf23a084b6d --- /dev/null +++ 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +enum SfpuType { // Unscoped enum with no explicit initializers: enumerators are numbered consecutively from tanh = 0 through unused (last). NOTE(review): presumably identifies the SFPU operation to run; confirm against users of this header. + tanh, + hardtanh, + gelu, + exponential, + exp_with_base, + sigmoid, + sigmoid_appx, + reciprocal, + sqrt, + rsqrt, + lrelu, + power, + square, + tanh_derivative, + log, + log_with_base, + equal_zero, + not_equal_zero, + less_than_zero, + greater_than_equal_zero, + less_than_equal_zero, + greater_than_zero, + clamp, + gelu_derivative, + dropout, + abs, + sign, + max, + min, + sine, + cosine, + tan, + relu_min, + relu_max, + elu, + exp2, + heaviside, + expm1, + signbit, + asin, + acos, + atan, + erf, + erfc, + isfinite, + isinf, + isposinf, + isneginf, + isnan, + logical_not_unary, + erfinv, + i0, + silu, + mask, + negative, + unused, +}; diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h new file mode 100644 index 00000000000..642fbb1591e --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_AB.h" +#include "llk_unpack_common_api.h" + +// /************************************************************************* +// * LLK UNPACK AB +// *************************************************************************/ + +// template +// inline void llk_unpack_AB_hw_configure( +// const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) { +// // In0 -> unpA +// // In1 -> unpB +// const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpA_operand); +// const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpB_operand); + +// // unpA -> srcA +// // unpB -> srcB +// const uint32_t num_faces = get_operand_num_faces(unpA_operand_id); // num faces in unpA and unpB are the same + +// const uint32_t face_r_dim = get_operand_face_r_dim(unpA_operand_id); // face r dim in unpA and unpB are the same + +// _llk_unpack_AB_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpack_src_format[unpB_operand_id], +// unpack_dst_format[unpA_operand_id], +// unpack_dst_format[unpB_operand_id], +// face_r_dim, +// within_face_16x16_transpose, +// num_faces); +// } + +// template +// inline void llk_unpack_AB_hw_configure_disaggregated( +// const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0) { +// const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand}; + +// llk_unpack_AB_hw_configure(&unpack_AB_params, within_face_16x16_transpose); +// } + +// template +// inline void llk_unpack_AB_mop_config(const bool transpose_of_faces = false, const std::uint32_t operand_id = 0) { +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// const bool narrow_tile = get_operand_narrow_tile(operand_id); // if narrow tile read face 0 twice for row broadcast +// // or read face 0 and 1 for col broadcast +// 
_llk_unpack_AB_mop_config_(transpose_of_faces, num_faces, narrow_tile); +// } + +// template +// inline void llk_unpack_AB_init( +// const std::uint32_t operandA, +// const std::uint32_t operandB, +// const std::uint32_t transpose = 0, +// const std::uint32_t acc_to_dest = 0) { +// const std::uint32_t operandA_id = get_operand_id(operandA); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operandA_id); // face r dim in unpA and unpB are the same +// const std::uint32_t num_faces = get_operand_num_faces(operandA_id); +// const bool narrow_tile = +// get_operand_narrow_tile(operandA_id); // if narrow tile read face 0 twice for row broadcast + +// _llk_unpack_AB_init_(face_r_dim, num_faces, narrow_tile, transpose, acc_to_dest); +// } + +// template +// inline void llk_unpack_AB( +// const std::uint32_t operandA, +// const std::uint32_t operandB, +// const std::uint32_t tile_index_a, +// const std::uint32_t tile_index_b, +// const bool transpose_of_faces = 0 /*not used*/) { +// std::uint32_t operandA_id = get_operand_id(operandA); +// std::uint32_t operandB_id = get_operand_id(operandB); +// std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1; +// std::uint32_t offset_address_a = cb_interface[operandA_id].fifo_page_size * tile_index_a; +// std::uint32_t address_a = base_address_a + offset_address_a; +// std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1; +// std::uint32_t offset_address_b = cb_interface[operandB_id].fifo_page_size * tile_index_b; +// std::uint32_t address_b = base_address_b + offset_address_b; + +// _llk_unpack_AB_(address_a, address_b, transpose_of_faces > 0); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h new file mode 100644 index 00000000000..f4aee2da6bd --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h @@ -0,0 +1,136 @@ +// 
SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_AB_matmul.h" +#include "llk_unpack_common_api.h" + +// /************************************************************************* +// * LLK UNPACK AB MATMUL +// *************************************************************************/ + +// template +// inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) { +// const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca; + +// // In0 -> unpB +// // In1 -> unpA +// const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpB_operand); +// const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpA_operand); + +// // unpA -> srcA +// // unpB -> srcB +// const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); +// const uint32_t unpB_num_faces = get_operand_num_faces(unpB_operand_id); + +// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); +// const uint32_t unpB_face_r_dim = get_operand_face_r_dim(unpB_operand_id); + +// _llk_unpack_AB_matmul_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpack_src_format[unpB_operand_id], +// unpack_dst_format[unpA_operand_id], +// unpack_dst_format[unpB_operand_id], +// unpA_face_r_dim, +// unpB_face_r_dim, +// transpose_xy_srca, +// unpA_num_faces, +// unpB_num_faces, +// cb_interface[unpA_operand_id].fifo_page_size, +// cb_interface[unpB_operand_id].fifo_page_size); +// } + +// template +// inline void llk_unpack_AB_matmul_hw_configure_disaggregated( +// const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) { +// const llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = { +// .unpA_operand = unpA_operand, .unpB_operand = unpB_operand, .transpose_xy_srca = transpose_xy_srca}; +// llk_unpack_AB_matmul_hw_configure(&unpack_AB_matmul_params); +// } + +// inline void 
llk_unpack_AB_matmul_mop_config( +// const bool transpose, +// const std::uint32_t ct_dim, +// const std::uint32_t rt_dim, +// const std::uint32_t kt_dim, +// const bool partial_face) { +// // in0 - loaded to SrcB +// // in1 - loaded to SrcA +// _llk_unpack_AB_matmul_mop_config_(transpose, ct_dim, rt_dim, kt_dim, partial_face); +// } + +// __attribute__((always_inline)) inline void llk_unpack_AB_matmul_init( +// const std::uint32_t operandA, +// const std::uint32_t operandB, +// const std::uint32_t transpose = 0, +// const std::uint32_t ct_dim = 1, +// const std::uint32_t rt_dim = 1, +// const std::uint32_t kt_dim = 1) { +// // In0 -> srcB (supports partial face) +// // In1 -> srcA +// const uint32_t operandA_id = get_operand_id(operandB); +// const uint32_t operandB_id = get_operand_id(operandA); + +// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandA_id); +// const uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandB_id); + +// const bool reuse_a = ct_dim >= rt_dim; +// const bool partial_face = get_operand_partial_face(operandB_id); + +// const uint32_t unpA_num_faces = get_operand_num_faces(operandA_id); +// const uint32_t unpB_num_faces = +// partial_face ? 
1 : get_operand_num_faces(operandB_id); // if partial face -> unpack face by face + +// _llk_unpack_AB_matmul_init_( +// transpose, +// ct_dim, +// rt_dim, +// kt_dim, +// unpA_face_r_dim, +// unpB_face_r_dim, +// unpA_num_faces, +// unpB_num_faces, +// partial_face); +// } + +// inline void llk_unpack_AB_matmul( +// const std::uint32_t operandA, +// const std::uint32_t operandB, +// const std::uint32_t tile_index_a, +// const std::uint32_t tile_index_b, +// const std::uint32_t ct_dim = 1, +// const std::uint32_t rt_dim = 1, +// const std::uint32_t kt_dim = 1) { +// // In0/InA -> srcB (supports partial face) +// // In1/InB -> srcA + +// volatile uint *cfg = get_cfg_pointer(); // get pointer to registers for current state ID + +// const std::uint32_t operandA_id = get_operand_id(operandA); +// const std::uint32_t operandB_id = get_operand_id(operandB); +// const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandB_id); // In1/InB -> srcA +// const std::uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandA_id); // In0/InA -> srcB + +// const bool partial_face = get_operand_partial_face(operandA_id); + +// std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1; +// std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1; + +// std::uint32_t tile_size_a = cb_interface[operandA_id].fifo_page_size; +// std::uint32_t tile_size_b = cb_interface[operandB_id].fifo_page_size; + +// _llk_unpack_AB_matmul_( +// base_address_a, +// base_address_b, +// tile_index_a, +// tile_index_b, +// tile_size_a, +// tile_size_b, +// unpA_face_r_dim, +// unpB_face_r_dim, +// partial_face, +// ct_dim, +// rt_dim, +// kt_dim); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h new file mode 100644 index 00000000000..ca39397653c --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h @@ -0,0 +1,89 @@ +// 
SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_A.h" +#include "llk_unpack_common_api.h" + +// /************************************************************************* +// * LLK UNPACK A +// *************************************************************************/ + +// template +// inline void llk_unpack_A_hw_configure( +// const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) { +// const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand); +// const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); +// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); + +// _llk_unpack_A_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpack_dst_format[unpA_operand_id], +// unpA_face_r_dim, +// within_face_16x16_transpose, +// unpA_num_faces); +// } + +// template +// inline void llk_unpack_A_hw_configure_disaggregated( +// const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) { +// const llk_unpack_A_params_t unpack_A_params = {.unpA_operand = unpA_operand}; +// llk_unpack_A_hw_configure(&unpack_A_params, within_face_16x16_transpose); +// } + +// template < +// BroadcastType BType = BroadcastType::NONE, +// bool acc_to_dest = false, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, +// bool unpack_to_dest = false> +// inline void llk_unpack_A_mop_config( +// const bool transpose_of_faces, +// const std::uint32_t operand_id, +// const std::uint32_t unpack_src_format = 0, +// std::uint32_t unpack_dst_format = 0) { +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); + +// _llk_unpack_A_mop_config_( +// transpose_of_faces > 0, num_faces, unpack_src_format, unpack_dst_format); +// } + +// template < +// BroadcastType BType = BroadcastType::NONE, +// bool acc_to_dest = false, +// EltwiseBinaryReuseDestType 
binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, +// bool unpack_to_dest = false> +// inline void llk_unpack_A_init( +// const std::uint32_t transpose_of_faces = 0, +// const std::uint32_t within_face_16x16_transpose = 0, +// const std::uint32_t operand = 0) { +// cfg_reg_rmw_tensix(within_face_16x16_transpose); + +// const std::uint32_t operand_id = get_operand_id(operand); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); + +// _llk_unpack_A_init_( +// transpose_of_faces, +// within_face_16x16_transpose, +// face_r_dim, +// num_faces, +// unpack_src_format[operand_id], +// unpack_dst_format[operand_id]); +// } + +// template < +// BroadcastType BType = BroadcastType::NONE, +// bool acc_to_dest = false, +// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, +// bool unpack_to_dest = false> +// inline void llk_unpack_A( +// const std::uint32_t operand, const std::uint32_t tile_index, const bool transpose_of_faces = 0) { +// std::uint32_t operand_id = get_operand_id(operand); +// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; +// std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; +// std::uint32_t address = base_address + offset_address; + +// _llk_unpack_A_( +// address, transpose_of_faces > 0, unpack_src_format[operand_id], unpack_dst_format[operand_id]); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h new file mode 100644 index 00000000000..a2f5d8c675f --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "circular_buffer.h" +#include "ckernel.h" +#include "ckernel_defs.h" +#include "ckernel_globals.h" +#include "ckernel_template.h" +#include "cunpack_common.h" +#include "llk_defs.h" +#include "llk_io.h" +#include "llk_operands.h" +#include "llk_param_structs.h" +#include "llk_unpack_common.h" + +// /************************************************************************* +// * LLK UNPACK COMMON +// *************************************************************************/ + +// void llk_zero_operand(std::uint32_t operand) { +// std::uint32_t operand_id = get_operand_id(operand); +// std::uint32_t fifo_base_addr = (cb_interface[operand_id].fifo_limit + 1) - cb_interface[operand_id].fifo_size; +// std::uint32_t size = cb_interface[operand_id].fifo_size; +// _llk_zero_buffer_(fifo_base_addr, size); +// } + +// template +// inline void llk_unpack_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) { +// std::uint32_t operand_id = get_operand_id(operand); +// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; +// std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; +// std::uint32_t address = base_address + offset_address; +// _llk_unpack_get_tile_(address, p_tile); +// } + +// template +// inline void llk_unpack_release_tile(std::uint32_t operand) { +// _llk_unpack_release_tile_(); +// } + +// inline void llk_unpack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { +// _llk_unpack_debug_dump_(data, byte_size); +// } + +// inline void llk_unpack_debug_dump_seek(std::uint8_t offset) { _llk_unpack_debug_dump_seek_(offset); } + +// template +// inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_operand) { +// const std::uint32_t srca_operand_id = get_operand_id(srca_new_operand); +// const std::uint32_t num_faces = get_operand_num_faces(srca_operand_id); +// const std::uint32_t face_r_dim 
= get_operand_face_r_dim(srca_operand_id); +// _llk_unpack_reconfig_data_format_srca_impl_( +// unpack_src_format[srca_operand_id], +// unpack_dst_format[srca_operand_id], +// cb_interface[srca_operand_id].fifo_page_size); +// } + +// template +// inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) { +// std::uint32_t srcb_operand_id = get_operand_id(srcb_new_operand); +// const std::uint32_t num_faces = get_operand_num_faces(srcb_operand_id); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(srcb_operand_id); +// _llk_unpack_reconfig_data_format_srcb_impl_( +// unpack_src_format[srcb_operand_id], +// unpack_dst_format[srcb_operand_id], +// cb_interface[srcb_operand_id].fifo_page_size); +// } + +// template +// inline void llk_unpack_reconfig_data_format_srca( +// const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) { +// std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); +// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); + +// if ((unpack_src_format[old_srca_operand_id] != unpack_src_format[new_srca_operand_id])) { +// llk_unpack_reconfig_data_format_srca(srca_new_operand); +// } else if constexpr (is_tile_dim_reconfig_en) { +// llk_unpack_reconfig_data_format_srca(srca_new_operand); +// } +// } + +// template +// inline void llk_unpack_reconfig_data_format_srcb( +// const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { +// std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); +// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); + +// if ((unpack_src_format[old_srcb_operand_id] != unpack_src_format[new_srcb_operand_id])) { +// llk_unpack_reconfig_data_format_srcb(srcb_new_operand); +// } else if constexpr (is_tile_dim_reconfig_en) { +// llk_unpack_reconfig_data_format_srcb(srcb_new_operand); +// } +// } + +// template +// inline void llk_unpack_reconfig_data_format( +// const std::uint32_t 
srca_new_operand, const std::uint32_t srcb_new_operand) { +// llk_unpack_reconfig_data_format_srca(srca_new_operand); +// llk_unpack_reconfig_data_format_srcb(srcb_new_operand); +// } + +// template +// inline void llk_unpack_reconfig_data_format( +// const std::uint32_t srca_old_operand, +// const std::uint32_t srca_new_operand, +// const std::uint32_t srcb_old_operand, +// const std::uint32_t srcb_new_operand) { +// llk_unpack_reconfig_data_format_srca(srca_old_operand, srca_new_operand); +// llk_unpack_reconfig_data_format_srcb(srcb_old_operand, srcb_new_operand); +// } + +// inline void llk_unpack_dbg_feature_disable() { _llk_unpack_dbg_feature_disable_(); } + +// inline void llk_enable_int8_fpu_math() { _llk_enable_int8_fpu_math_(); } + +// // All TILE_SIZE related functions were deprecated in BBE for WH. The following is needed for pack_shifted so just +// // keeping here. +// // FIXME: Need to review and adjust accordingly +// constexpr static std::int32_t MUL_HEADERLESS_TILE_SIZE_AND_INDEX(uint format, uint index) { +// switch (format & 0x1F) { +// case ((uint8_t)DataFormat::Float32): return ((index << 8)); +// case ((uint8_t)DataFormat::Float16): +// case ((uint8_t)DataFormat::Float16_b): return ((index << 7)); +// case ((uint8_t)DataFormat::Bfp8): +// case ((uint8_t)DataFormat::Bfp8_b): return ((index << 6) + (index << 2)); +// case ((uint8_t)DataFormat::Bfp4): +// case ((uint8_t)DataFormat::Bfp4_b): return ((index << 5) + (index << 2)); +// case ((uint8_t)DataFormat::Bfp2): +// case ((uint8_t)DataFormat::Bfp2_b): return ((index << 4) + (index << 2)); +// case ((uint8_t)DataFormat::Int8): +// case ((uint8_t)DataFormat::Lf8): return ((index << 6)); +// // Keep default as Bfp8?
+// default: return ((index << 6) + (index << 2)); +// }; +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h new file mode 100644 index 00000000000..01a12122375 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h @@ -0,0 +1,94 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_reduce.h" +#include "llk_unpack_common_api.h" + +/************************************************************************* +* LLK UNPACK REDUCE +*************************************************************************/ + +// template +// inline void llk_unpack_reduce_hw_configure( +// const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) { + +// constexpr bool within_face_16x16_transpose = (ReduceDim::REDUCE_ROW == dim); + +// const std::uint32_t unpA_operand_id = get_operand_id(unpack_reduce_params->unpA_operand); +// const std::uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); +// const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); + +// constexpr std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32; +// const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a +// ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? 
(std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16); + +// _llk_unpack_reduce_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpB_src_format, +// unpack_dst_format[unpA_operand_id], +// unpB_dst_format, +// unpA_face_r_dim, +// unpA_face_r_dim, +// within_face_16x16_transpose, +// unpA_num_faces, +// unpA_num_faces +// ); + +// if constexpr (type != PoolType::MAX) { +// union { +// float f; +// uint32_t u; +// } f2u = {.f = const_mult}; + +// for (uint i = 0; i < 16; i++) l1_buffer[i] = f2u.u; // Load const into L1 buffer +// } +// } + +// template +// inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) { +// const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand}; +// llk_unpack_reduce_hw_configure(&unpack_reduce_params, mult); +// } + +// template +// inline void llk_unpack_reduce_mop_config() { +// _llk_unpack_reduce_mop_config_(); +// } + +// template +// inline void llk_unpack_reduce_init(const std::uint32_t within_face_16x16_transpose=0) { + +// constexpr std::uint32_t unpA_operand_id = 0; + +// const std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32; +// const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a +// ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? 
(std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16); + +// cfg_reg_rmw_tensix(unpB_dst_format); + +// cfg_reg_rmw_tensix(unpB_src_format); +// cfg_reg_rmw_tensix(unpB_dst_format); + +// TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_address_ADDR32); +// TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_cntx1_address_ADDR32); +// TTI_NOP; TTI_NOP; + +// _llk_unpack_reduce_init_( +// within_face_16x16_transpose +// ); +// } + +// template +// inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t tile_index) { + +// std::uint32_t operand_id = get_operand_id(operand); +// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; +// std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; +// std::uint32_t address = base_address + offset_address; + +// _llk_unpack_reduce_( +// address +// ); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h new file mode 100644 index 00000000000..59ede271732 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h @@ -0,0 +1,99 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_tilize.h" +#include "llk_unpack_common_api.h" + +/************************************************************************* +* LLK UNPACK TILIZE +*************************************************************************/ + +// template +// inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) { + +// constexpr bool within_face_16x16_transpose = false; +// constexpr StochRndType stoch_rnd_mode = StochRndType::None; + +// const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand); +// const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); +// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); + +// _llk_unpack_tilize_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpack_dst_format[unpA_operand_id], +// unpA_face_r_dim, +// within_face_16x16_transpose, +// unpA_num_faces +// ); +// } + + +// template +// inline void llk_unpack_tilize_hw_configure_disaggregated( +// const std::uint32_t unpA_operand) { +// const llk_unpack_A_params_t unpack_tilize_params = { +// .unpA_operand = unpA_operand +// }; +// llk_unpack_tilize_hw_configure(&unpack_tilize_params); +// } + +// inline void llk_unpack_tilize_mop_config(const std::uint32_t operand) { +// std::uint32_t operand_id = get_operand_id(operand); +// const bool narrow_tile = get_operand_narrow_tile(operand_id); +// _llk_unpack_tilize_mop_config_(narrow_tile); +// } + +// inline void llk_unpack_tilize_init(const std::uint32_t operand = 0, const std::uint32_t ct_dim = 0) { +// cfg_reg_rmw_tensix(0); + +// const std::uint32_t operand_id = get_operand_id(operand); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); +// const bool narrow_tile = get_operand_narrow_tile(operand_id); + +// // Save state of unpacker config for quick restore +// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0, 
THCON_SEC0_REG2_Out_data_format_ADDR32); // Save unpack config[0] +// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context + +// _llk_unpack_tilize_init_( +// unpack_src_format[operand_id], +// unpack_dst_format[operand_id], +// ct_dim, +// face_r_dim, +// narrow_tile +// ); + +// } + +// inline void llk_unpack_tilize_uninit(const std::uint32_t face_r_dim = FACE_R_DIM) { +// TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0); +// TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0); // Restore unpack config[0] +// TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1); // Restore tile x dim per context +// } + +// inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t block_ct_dim) { + +// std::uint32_t operand_id = get_operand_id(operand); +// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); +// const bool narrow_tile = get_operand_narrow_tile(operand_id); + +// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; // Remove header size added by descriptor + +// _llk_unpack_tilize_( +// base_address, +// tile_index, +// unpack_src_format[operand_id], +// block_ct_dim, +// face_r_dim, +// num_faces, +// narrow_tile +// ); +// } + +// inline void llk_unpack_tilize_block(std::uint32_t operand, std::uint32_t block_c_tiles) { +// for (std::uint32_t tile_index = 0; tile_index < block_c_tiles; tile_index++) { +// llk_unpack_tilize(operand, tile_index, block_c_tiles); +// } +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h new file mode 100644 index 00000000000..dded559e94d --- /dev/null +++ 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h @@ -0,0 +1,96 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "llk_unpack_untilize.h" +#include "llk_unpack_common_api.h" + +// /************************************************************************* +// * LLK UNPACK UNTILIZE +// *************************************************************************/ +// template +// inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) { +// constexpr bool is_row_pool = false; +// constexpr bool within_face_16x16_transpose = false; +// constexpr StochRndType stoch_rnd_mode = StochRndType::None; + +// const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand); +// const uint32_t unpA_num_faces = 4; +// const uint32_t unpA_face_r_dim = FACE_R_DIM; + +// _llk_unpack_untilize_hw_configure_( +// unpack_src_format[unpA_operand_id], +// unpack_dst_format[unpA_operand_id], +// unpA_face_r_dim, +// within_face_16x16_transpose, +// unpA_num_faces +// ); +// } + +// inline void llk_unpack_untilize_hw_configure_disaggregated(const std::uint32_t unpA_operand) { +// const llk_unpack_A_params_t unpack_untilize_params = { +// .unpA_operand = unpA_operand, +// }; +// llk_unpack_untilize_hw_configure(&unpack_untilize_params); +// } + +// inline void llk_unpack_untilize_mop_config() { +// _llk_unpack_untilize_mop_config_(); +// } + +// inline void llk_unpack_untilize_init(std::uint32_t operand = 0) { +// const std::uint32_t operand_id = get_operand_id(operand); +// const std::uint32_t face_r_dim = 1; +// const std::uint32_t num_faces = get_operand_num_faces(operand_id); + +// // Save state of unpacker config for quick restore +// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_0, UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32); // Save unpack stride config +// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_1, 
THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context +// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_2, THCON_SEC0_REG0_TileDescriptor_ADDR32+1); // Save descriptor 1 + +// _llk_unpack_untilize_init_( +// unpack_dst_format[operand_id], +// cb_interface[operand_id].fifo_page_size, +// face_r_dim, +// num_faces +// ); +// } + +// inline void llk_unpack_untilize_uninit(const std::uint32_t operand, const std::uint32_t face_r_dim = FACE_R_DIM) { +// std::uint32_t operand_id = get_operand_id(operand); +// std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 2 : 1; +// std::uint32_t unpA_ch1_y_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride; + +// // Check that unpacker is done (all contexts freed up) before starting hw configuration +// wait_for_idle(); + +// // Reset address counters +// unpacker_addr_counter_init(); + +// // Wait for cfg to be free to edit +// TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK); + +// // Reset the values to default in unpack AB common. +// TT_SETADCXX(p_setadc::UNP_A, FACE_R_DIM*FACE_C_DIM-1, 0x0); +// TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16); +// cfg_reg_rmw_tensix(1); +// cfg_reg_rmw_tensix(unpA_ch1_y_stride); +// TTI_NOP; TTI_NOP; // Do we need this for WH? 
+// } + +// template +// inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_tile_cols) { +// const std::uint32_t operand_id = get_operand_id(operand); +// const std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; + +// _llk_unpack_untilize_pass_( +// base_address, +// block_tile_cols +// ); +// } + +// inline void llk_unpack_untilize(std::uint32_t operand, std::uint32_t block_c_tiles) { +// llk_unpack_untilize_pass(operand, block_c_tiles); +// llk_unpack_untilize_pass(operand, block_c_tiles); +// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.cc b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.cc new file mode 100644 index 00000000000..b3f31c2c095 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.cc @@ -0,0 +1,3 @@ +#include "llk_io.h" + +CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] = {0}; diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h new file mode 100644 index 00000000000..37e018dc6b8 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io.h @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include + +#include "circular_buffer.h" + +extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS]; diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_io_pack.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io_pack.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_io_pack.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io_pack.h diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_io_unpack.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io_unpack.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_io_unpack.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_io_unpack.h diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h new file mode 100644 index 00000000000..1569b4cdcd1 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_operands.h @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include + +inline uint32_t get_operand_id(uint32_t operand) +{ + return (operand); +} + +inline const uint32_t get_operand_src_format(const std::uint32_t operand_id) +{ + return unpack_src_format[operand_id]; +} + +inline const uint32_t get_operand_dst_format(const std::uint32_t operand_id) +{ + return unpack_dst_format[operand_id]; +} + +//TODO: Do we need tile dim functions for GS? 
+inline const uint32_t get_operand_num_faces(const std::uint32_t operand_id) +{ + return 4; +} + +inline const uint32_t get_operand_partial_face(const std::uint32_t operand_id) +{ + return 0; +} + +inline const uint32_t get_operand_face_r_dim(const std::uint32_t operand_id) +{ + return 16; +} + +inline const uint32_t get_operand_narrow_tile(const std::uint32_t operand_id) +{ + return 0; +} + +inline const uint32_t get_operand_tile_r_dim(const std::uint32_t operand_id) +{ + return 32; +} + +inline const uint32_t get_operand_tile_c_dim(const std::uint32_t operand_id) +{ + return 32; +} diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h new file mode 100644 index 00000000000..bd010082bbd --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include + +// Metal specific overrides -- No support for partial tiles so hard-code to fixed 32x32 sizes +inline uint32_t get_output_id(uint32_t output) +{ + const uint32_t OUTPUT_BASE = 0; + return ((output) - OUTPUT_BASE); +} + +inline const uint32_t get_output_base_id() +{ + const uint32_t OUTPUT_BASE_ID = 16; + return (OUTPUT_BASE_ID); +} + +inline const uint32_t get_output_src_format(const std::uint32_t output_id) +{ + return pack_src_format[output_id]; +} + +inline const uint32_t get_output_dst_format(const std::uint32_t output_id) +{ + return pack_dst_format[output_id]; +} + +//TODO: Do we need tile dim functions for GS? 
+inline const uint32_t get_output_num_faces(const std::uint32_t output_id) +{ + return 4; +} + +inline const uint32_t get_output_partial_face(const std::uint32_t output_id) +{ + return 0; +} + +inline const uint32_t get_output_face_r_dim(const std::uint32_t output_id) +{ + return 16; +} + +inline const uint32_t get_output_narrow_tile(const std::uint32_t output_id) +{ + return 0; +} + +inline const uint32_t get_output_tile_r_dim(const std::uint32_t output_id) +{ + return 32; +} + +inline const uint32_t get_output_tile_c_dim(const std::uint32_t output_id) +{ + return 32; +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h index f9359469e33..3dd7dbe114c 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h +++ b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_globals.h @@ -19,4 +19,3 @@ extern uint32_t math_sync_tile_dst_index; extern uint32_t __local_mem_rodata_start_addr[]; extern uint32_t __local_mem_rodata_end_addr[]; -extern uint32_t __firmware_start[]; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h index 29a2dbf9cfe..cf08580ad69 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +//TODO: This file should be deleted after fixing redefinition errors, +// functions should be moved to ckernel_globals.h #pragma once #include diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h index 2b94607012d..ea113ce5fa0 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_operands.h @@ -8,8 +8,7 @@ inline uint32_t get_operand_id(uint32_t 
operand) { - const int OPERAND_BASE_ID = 0; - return (operand); + return (operand); } inline const uint32_t get_operand_src_format(const std::uint32_t operand_id) diff --git a/tt_metal/include/compute_kernel_api/common_globals.h b/tt_metal/include/compute_kernel_api/common_globals.h index 213859b1ae4..0b0eee877dc 100644 --- a/tt_metal/include/compute_kernel_api/common_globals.h +++ b/tt_metal/include/compute_kernel_api/common_globals.h @@ -31,6 +31,7 @@ #endif #ifdef TRISC_UNPACK +#include "llk_unpack_common_api.h" #define UNPACK(x) x #define MAIN unpack_main() #else diff --git a/tt_metal/include/compute_kernel_api/unpack.h b/tt_metal/include/compute_kernel_api/unpack.h index 2aaefe1d9d4..ba676d4b938 100644 --- a/tt_metal/include/compute_kernel_api/unpack.h +++ b/tt_metal/include/compute_kernel_api/unpack.h @@ -7,7 +7,7 @@ #include "common_globals.h" - +//TODO: Should WHB0 functions be added here? namespace ckernel { /** From 655fc5f729927d123e514580d4ee5cfad19b27b4 Mon Sep 17 00:00:00 2001 From: Reem Tawfik Date: Thu, 7 Dec 2023 16:59:36 +0000 Subject: [PATCH 13/16] #3908: Fixes for llk lib compile/regressions: - Add separate llk api files for sfpu negative & mask - Move sfpu identity to metal api folder - Add llk_io files to erisc core compile --- .../grayskull/common/inc/ckernel_sfpu.h | 36 - .../llk_lib/llk_math_eltwise_unary_sfpu.h | 22 - .../metal/llk_api/llk_math_binary_api.h | 75 -- .../metal/llk_api/llk_math_binary_sfpu_api.h | 59 -- .../metal/llk_api/llk_math_common_api.h | 89 -- .../metal/llk_api/llk_math_matmul_api.h | 57 -- .../metal/llk_api/llk_math_reduce_api.h | 17 - .../llk_api/llk_math_unary_datacopy_api.h | 24 - .../metal/llk_api/llk_math_unary_sfpu_api.h | 279 ------- .../grayskull/metal/llk_api/llk_pack_api.h | 286 ------- .../metal/llk_api/llk_param_structs.h | 1 - .../llk_api/llk_sfpu}/ckernel_sfpu_identity.h | 0 .../llk_api/llk_sfpu/ckernel_sfpu_mask.h | 34 + .../llk_api/llk_sfpu/ckernel_sfpu_negative.h | 31 + 
..._math_eltwise_unary_sfpu_common_includes.h | 150 ---- .../llk_math_eltwise_unary_sfpu_identity.h | 0 .../llk_math_eltwise_unary_sfpu_mask.h | 31 + .../llk_math_eltwise_unary_sfpu_negative.h | 31 + .../llk_api/llk_sfpu/metal_ckernel_sfpu.h | 762 ------------------ .../metal/llk_api/llk_unpack_AB_api.h | 74 -- .../metal/llk_api/llk_unpack_AB_matmul_api.h | 125 --- .../metal/llk_api/llk_unpack_A_api.h | 78 -- .../metal/llk_api/llk_unpack_common_api.h | 117 --- .../metal/llk_api/llk_unpack_reduce_api.h | 83 -- .../metal/llk_api/llk_unpack_tilize_api.h | 88 -- .../metal/llk_api/llk_unpack_untilize_api.h | 85 -- .../metal/llk_api/llk_math_unary_sfpu_api.h | 22 - .../llk_api/llk_sfpu}/ckernel_sfpu_identity.h | 0 .../llk_api/llk_sfpu/ckernel_sfpu_mask.h | 34 + .../llk_api/llk_sfpu/ckernel_sfpu_negative.h | 31 + ..._math_eltwise_unary_sfpu_common_includes.h | 4 - .../llk_math_eltwise_unary_sfpu_identity.h | 5 +- .../llk_math_eltwise_unary_sfpu_mask.h | 31 + .../llk_math_eltwise_unary_sfpu_negative.h | 30 + .../llk_math_eltwise_unary_sfpu_sqrt.h | 1 - .../llk_api/llk_sfpu/metal_ckernel_sfpu.h | 29 - .../eltwise_unary/negative.h | 2 +- tt_metal/include/compute_kernel_api/mask.h | 2 +- tt_metal/jit_build/build.cpp | 18 +- 39 files changed, 270 insertions(+), 2573 deletions(-) rename tt_metal/hw/ckernels/grayskull/{common/inc => metal/llk_api/llk_sfpu}/ckernel_sfpu_identity.h (100%) create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h rename tt_metal/hw/ckernels/grayskull/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_identity.h (100%) create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h rename tt_metal/hw/ckernels/wormhole_b0/{common/inc => 
metal/llk_api/llk_sfpu}/ckernel_sfpu_identity.h (100%) create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h rename tt_metal/hw/ckernels/wormhole_b0/{llk_lib => metal/llk_api/llk_sfpu}/llk_math_eltwise_unary_sfpu_identity.h (88%) create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu.h b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu.h index 80b27698ef9..10673511969 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu.h @@ -244,20 +244,6 @@ inline void calculate_atan() } } - -template -inline void calculate_negative() -{ - - for (int d = 0; d < ITERATIONS; d++) - { - vFloat val = dst_reg[0]; - dst_reg[0] = -val; - dst_reg++; - } -} - - template inline void calculate_rsqrt() { @@ -888,21 +874,6 @@ inline void calculate_silu() } } -template -inline void calculate_mask() -{ - bool exponent_size_8 = true; - for (int d = 0; d < ITERATIONS; d++) - { - vFloat mask = dst_reg[16]; - v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) { - dst_reg[0] = 0; - } - v_endif; - dst_reg++; - } -} - template inline void calculate_sfpu(uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { @@ -997,13 +968,6 @@ inline void calculate_sfpu(uint param0 = 0, uint param1 = 0, uint param2 = 0, ui else if constexpr (operation == SfpuType::silu) { calculate_silu(); } - else if constexpr (operation == SfpuType::mask) { - calculate_mask(); - } - else if constexpr (operation == SfpuType::negative) { - calculate_negative(); - } - //erf, erfc are dispatched directly. 
} diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu.h b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu.h index bdc159d0d08..91b2e60d506 100644 --- a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu.h +++ b/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu.h @@ -366,26 +366,4 @@ inline void llk_math_eltwise_unary_sfpu_silu_init() { llk_math_eltwise_unary_sfpu_init(); } -//Mask -template -inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_mask_init() { - llk_math_eltwise_unary_sfpu_init(); -} - -// Negative -template -inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index,vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_negative_init() { - llk_math_eltwise_unary_sfpu_init(); -} - } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h index 317c14707ca..0dd9613dfe0 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_api.h @@ -9,78 +9,3 @@ // /************************************************************************* // * LLK ELTWISE BINARY // *************************************************************************/ - -// // Version with no operand -// template < -// EltwiseBinaryType eltwise_binary_type, -// BroadcastType src_b_bcast_type, -// int NUM_FIDELITY_PHASES = 0, -// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> -// inline void llk_math_eltwise_binary_init(const std::uint32_t transpose = 0, const std::uint32_t acc_to_dest = 0) { -// const std::uint32_t num_faces = 4; - -// _llk_math_eltwise_binary_init_( -// 
num_faces, transpose, acc_to_dest); -// } - -// // Version with operands -// template < -// EltwiseBinaryType eltwise_binary_type, -// BroadcastType src_b_bcast_type, -// int NUM_FIDELITY_PHASES = 0, -// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> -// inline void llk_math_eltwise_binary_init_with_operands( -// const std::uint32_t operand_A, -// const std::uint32_t operand_B, -// const std::uint32_t transpose = 0, -// const std::uint32_t acc_to_dest = 0) { -// const std::uint32_t operand_id = -// get_operand_id(operand_A); // operand_id is used to extract tile dim data which is the same for both operands -// const std::uint32_t num_faces = get_operand_num_faces(operand_id); - -// _llk_math_eltwise_binary_init_( -// num_faces, transpose, acc_to_dest); -// } - -// template < -// EltwiseBinaryType eltwise_binary_type, -// BroadcastType src_b_bcast_type, -// DstSync Dst = DstSync::SyncFull, -// int NUM_FIDELITY_PHASES = 0, -// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, -// bool is_fp32_dest_acc_en = false> -// inline void llk_math_eltwise_binary(uint dst_index, const bool clear_fp32_dst_acc = true) { -// const std::uint32_t num_faces = 4; - -// _llk_math_eltwise_binary_< -// eltwise_binary_type, -// src_b_bcast_type, -// Dst, -// NUM_FIDELITY_PHASES, -// binary_reuse_dest, -// is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc); -// } - -// template < -// EltwiseBinaryType eltwise_binary_type, -// BroadcastType src_b_bcast_type, -// DstSync Dst = DstSync::SyncFull, -// int NUM_FIDELITY_PHASES = 0, -// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, -// bool is_fp32_dest_acc_en = false> -// inline void llk_math_eltwise_binary( -// const std::uint32_t operand_A, -// const std::uint32_t operand_B, -// uint dst_index, -// const bool clear_fp32_dst_acc = true) { -// const std::uint32_t operand_id = get_operand_id(operand_A); // both operands must have same 
number of faces -// const std::uint32_t num_faces = get_operand_num_faces(operand_id); - -// _llk_math_eltwise_binary_< -// eltwise_binary_type, -// src_b_bcast_type, -// Dst, -// NUM_FIDELITY_PHASES, -// binary_reuse_dest, -// is_fp32_dest_acc_en>(num_faces, dst_index, clear_fp32_dst_acc); -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h index 21c3e8ae428..41ba7fc4b73 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_binary_sfpu_api.h @@ -9,62 +9,3 @@ // /************************************************************************* // * LLK ELTWISE BINARY SFPU // *************************************************************************/ - -// template -// inline void llk_math_eltwise_binary_sfpu( -// const uint operand, -// uint dst_index_a, -// uint dst_index_b, -// int vector_mode = (int)Dim::RC, -// uint param0 = 0, -// uint param1 = 0, -// uint param2 = 0, -// uint param3 = 0, -// uint param4 = 0, -// uint param5 = 0) { -// const std::uint32_t operand_id = get_operand_id(0); -// const std::uint32_t num_faces = get_operand_num_faces(operand_id); -// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); - -// _llk_math_eltwise_binary_sfpu_( -// face_r_dim, num_faces, dst_index_a, dst_index_b, vector_mode, param0, param1, param2, param3, param4, param5); -// } - -// template -// inline void llk_math_eltwise_binary_sfpu_init( -// uint param0 = 0, uint param1 = 0, uint param2 = 0, uint param3 = 0, uint param4 = 0, uint param5 = 0) { -// _llk_math_eltwise_binary_sfpu_init_(param0, param1, param2, param3, param4, param5); -// } - -// template -// inline void llk_math_eltwise_binary_sfpu_quant_int32( -// uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { -// llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); -// } 
- -// template -// inline void llk_math_eltwise_binary_sfpu_quant_int32_init(const uint zero_point) { -// llk_math_eltwise_binary_sfpu_init(zero_point); -// } - -// template -// inline void llk_math_eltwise_binary_sfpu_requant_int32( -// uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { -// llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); -// } - -// template -// inline void llk_math_eltwise_binary_sfpu_requant_int32_init(const uint zero_point) { -// llk_math_eltwise_binary_sfpu_init(zero_point); -// } - -// template -// inline void llk_math_eltwise_binary_sfpu_dequant_int32( -// uint dst_index_a, uint dst_index_b, int vector_mode = (int)Dim::RC) { -// llk_math_eltwise_binary_sfpu(dst_index_a, dst_index_b, vector_mode); -// } - -// template -// inline void llk_math_eltwise_binary_sfpu_dequant_int32_init(const uint zero_point) { -// llk_math_eltwise_binary_sfpu_init(zero_point); -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h index 3da220f0cba..f56234c0a0e 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_common_api.h @@ -14,95 +14,6 @@ #include "llk_operands.h" #include "llk_param_structs.h" -// // Need to revisit why we even need this -// #define EPS 1.19209e-07 // std::numeric_limits::epsilon() for FP32 - // /************************************************************************* // * LLK MATH COMMON // *************************************************************************/ - -// template -// inline void llk_math_wait_for_dest_available() { -// _llk_math_wait_for_dest_available_(); -// } - -// template -// inline void llk_math_dest_section_done() { -// _llk_math_dest_section_done_(); -// } - -// template -// inline void llk_math_pack_sync_init() { -// _llk_math_pack_sync_init_(); -// } - -// template -// inline void 
llk_math_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) { -// _llk_math_get_tile_(tile_index, p_tile); -// } - -// template -// inline void llk_math_release_tile(std::uint32_t operand) { -// _llk_math_release_tile_(); -// } - -// inline void llk_math_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { _llk_math_debug_dump_(data, byte_size); } - -// inline void llk_math_debug_dump_seek(std::uint8_t offset) { _llk_math_debug_dump_seek_(offset); } - -// inline void llk_math_reconfig_data_format_srca(const std::uint32_t srca_new_operand) { -// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); -// _llk_math_reconfig_data_format_srca_(unpack_dst_format[new_srca_operand_id]); -// } - -// inline void llk_math_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) { -// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); -// _llk_math_reconfig_data_format_srcb_(unpack_dst_format[new_srcb_operand_id]); -// } - -// inline void llk_math_reconfig_data_format(const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) { -// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); -// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); - -// _llk_math_reconfig_data_format_(unpack_dst_format[new_srca_operand_id], unpack_dst_format[new_srcb_operand_id]); -// } - -// inline void llk_math_reconfig_data_format( -// const std::uint32_t srca_old_operand, -// const std::uint32_t srca_new_operand, -// const std::uint32_t srcb_old_operand, -// const std::uint32_t srcb_new_operand) { -// std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); -// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); -// std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); -// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); - -// if ((unpack_dst_format[old_srca_operand_id] != 
unpack_dst_format[new_srca_operand_id]) && -// (unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { -// llk_math_reconfig_data_format(srca_new_operand, srcb_new_operand); -// } else if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) { -// llk_math_reconfig_data_format_srca(srca_new_operand); -// } else if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { -// llk_math_reconfig_data_format_srcb(srcb_new_operand); -// } -// } - -// inline void llk_math_reconfig_data_format_srca( -// const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) { -// std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); -// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); - -// if ((unpack_dst_format[old_srca_operand_id] != unpack_dst_format[new_srca_operand_id])) { -// llk_math_reconfig_data_format_srca(srca_new_operand); -// } -// } - -// inline void llk_math_reconfig_data_format_srcb( -// const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { -// std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); -// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); - -// if ((unpack_dst_format[old_srcb_operand_id] != unpack_dst_format[new_srcb_operand_id])) { -// llk_math_reconfig_data_format_srcb(srcb_new_operand); -// } -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h index a12bcca1ef4..70d2109196b 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h @@ -9,60 +9,3 @@ // /************************************************************************* // * LLK MATMUL // *************************************************************************/ - -// template -// inline void 
llk_math_matmul_init( -// const std::uint32_t operandA, -// const std::uint32_t operandB, -// const std::uint32_t transpose = 0, -// const std::uint32_t ct_dim = 1, -// const std::uint32_t rt_dim = 1, -// const std::uint32_t kt_dim = 1) { -// const std::uint32_t in0_id = get_operand_id(operandA); -// const std::uint32_t in1_id = get_operand_id(operandB); - -// const bool partial_face = get_operand_partial_face(in0_id); - -// const std::uint32_t in0_tile_r_dim = get_operand_tile_r_dim(in0_id); -// const std::uint32_t in0_tile_c_dim = get_operand_tile_c_dim(in0_id); -// const std::uint32_t in1_tile_r_dim = get_operand_tile_r_dim(in1_id); -// const std::uint32_t in1_tile_c_dim = get_operand_tile_c_dim(in1_id); - -// #ifdef ARCH_GRAYSKULL -// _llk_math_matmul_init_( -// in0_tile_r_dim, -// in0_tile_c_dim, -// in1_tile_r_dim, -// in1_tile_c_dim, -// partial_face, -// transpose, -// ct_dim, -// rt_dim, -// kt_dim); -// #else -// _llk_math_matmul_init_( -// in0_tile_r_dim, -// in0_tile_c_dim, -// in1_tile_r_dim, -// in1_tile_c_dim, -// partial_face, -// transpose, -// ct_dim, -// rt_dim, -// kt_dim); -// #endif -// } - -// template -// inline void llk_math_matmul( -// uint dst_index, -// const bool transpose = false, -// const std::uint32_t ct_dim = 1, -// const std::uint32_t rt_dim = 1, -// const std::uint32_t kt_dim = 1) { -// #ifdef ARCH_GRAYSKULL -// _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); -// #else -// _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); -// #endif -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h index c5f11d005f2..be0284f144d 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_reduce_api.h @@ -9,20 +9,3 @@ // /************************************************************************* // * LLK REDUCE // 
*************************************************************************/ - -// template < -// PoolType type, -// ReduceDim dim, -// int num_fidelity_phases = 0, -// bool is_fp32_dest_acc_en = false, -// bool is_int_fpu_en = false> -// inline void llk_math_reduce(const uint dst_index) { -// _llk_math_reduce_(dst_index); -// } - -// template -// inline void llk_math_reduce_init( -// const std::uint32_t within_face_16x16_transpose = -// 0) { // within_face_16x16_transpose used for unpack, ignored by math -// _llk_math_reduce_init_(within_face_16x16_transpose); -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h index ca2a5d39e40..33ec73901a8 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_datacopy_api.h @@ -10,27 +10,3 @@ // /************************************************************************* // * LLK ELTWISE UNARY DATACOPY // *************************************************************************/ - -// template < -// DataCopyType type, -// BroadcastType src_b_bcast_type = BroadcastType::NONE, -// DstSync Dst = DstSync::SyncFull, -// bool is_fp32_dest_acc_en = false, -// bool unpack_to_dest = false> -// inline void llk_math_eltwise_unary_datacopy(uint dst_index, uint operand = 0) { -// const std::uint32_t operand_id = get_operand_id(0); -// _llk_math_eltwise_unary_datacopy_( -// dst_index, unpack_src_format[operand_id], unpack_dst_format[operand_id]); -// } - -// template -// // within_face_16x16_transpose is used by unpacker, math does not transpose -// inline void llk_math_eltwise_unary_datacopy_init( -// const std::uint32_t transpose_of_faces = 0 /*unused*/, -// const std::uint32_t within_face_16x16_transpose = 0 /* unused */, -// const std::uint32_t operand = 0) { -// const std::uint32_t operand_id = get_operand_id(0); -// const 
std::uint32_t num_faces = get_operand_num_faces(operand_id); -// _llk_math_eltwise_unary_datacopy_init_( -// transpose_of_faces, within_face_16x16_transpose, num_faces); -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h index 53b9d1afe8b..0972e48ebb0 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h @@ -12,282 +12,3 @@ // /************************************************************************* // * LLK ELTWISE UNARY SFPU // *************************************************************************/ - -// // New LLK SFPU APIs -// template -// inline void llk_math_eltwise_unary_sfpu_rsqrt(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_rsqrt_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_log(uint dst_index, int vector_mode = Dim::RC) { -// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_log_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_log_with_base(uint dst_index,uint base_scale) { -// llk_math_eltwise_unary_sfpu(dst_index,base_scale); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_log_with_base_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_tanh(uint dst_index, int vector_mode = Dim::RC) { -// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_signbit(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_signbit_init() { -// 
llk_math_eltwise_unary_sfpu_init(); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_tanh_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //sign -// template -// inline void llk_math_eltwise_unary_sfpu_sign(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_sign_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } -// template -// inline void llk_math_eltwise_unary_sfpu_dropout(uint dst_index, int vector_mode, int integer_dropout, int scale_factor) { -// constexpr bool dont_care = false; -// llk_math_eltwise_unary_sfpu(dst_index, vector_mode, integer_dropout, scale_factor); -// } - -// inline void llk_math_eltwise_unary_sfpu_dropout_init(uint seed = 0) { -// constexpr bool dont_care = false; -// constexpr uint dont_care_param = 0; - -// llk_math_eltwise_unary_sfpu_init(dont_care_param, dont_care_param, seed); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_sigmoid(uint dst_index, int vector_mode = Dim::RC) { -// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_sigmoid_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //EQZ -// template -// inline void llk_math_eltwise_unary_sfpu_eqz(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_eqz_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //NEZ -// template -// inline void llk_math_eltwise_unary_sfpu_nez(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_nez_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //LTZ -// template -// inline void llk_math_eltwise_unary_sfpu_ltz(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_ltz_init() { -// 
llk_math_eltwise_unary_sfpu_init(); -// } - -// //GTZ -// template -// inline void llk_math_eltwise_unary_sfpu_gtz(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_gtz_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //LEZ -// template -// inline void llk_math_eltwise_unary_sfpu_lez(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_lez_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //GEZ -// template -// inline void llk_math_eltwise_unary_sfpu_gez(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_gez_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_max(uint dst_index, int vector_mode = Dim::RC) { -// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_max_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_square(uint dst_index, int vector_mode = Dim::RC) { -// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_square_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_power(uint dst_index, int pow = 0, int vector_mode = Dim::RC) { -// llk_math_eltwise_unary_sfpu(dst_index, vector_mode, pow); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_power_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_abs(uint dst_index, int vector_mode = Dim::RC) { -// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_abs_init() { -// llk_math_eltwise_unary_sfpu_init(); 
-// } - -// template -// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a(uint dst_index, int vector_mode = Dim::RC) { -// llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_cast_fp32_to_fp16a_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //EXP2 -// template -// inline void llk_math_eltwise_unary_sfpu_exp2(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_exp2_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //heaviside -// template -// inline void llk_math_eltwise_unary_sfpu_heaviside(uint dst_index,uint param0, int vector_mode = Dim::RC) { -// llk_math_eltwise_unary_sfpu(dst_index,vector_mode,param0); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_heaviside_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //EXPM1 -// template -// inline void llk_math_eltwise_unary_sfpu_expm1(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_expm1_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //Asin -// template -// inline void llk_math_eltwise_unary_sfpu_asin(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_asin_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //Atan -// template -// inline void llk_math_eltwise_unary_sfpu_atan(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_atan_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //Acos -// template -// inline void llk_math_eltwise_unary_sfpu_acos(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_acos_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// //silu -// template 
-// inline void llk_math_eltwise_unary_sfpu_silu(uint dst_index) { -// llk_math_eltwise_unary_sfpu(dst_index); -// } - -// template -// inline void llk_math_eltwise_unary_sfpu_silu_init() { -// llk_math_eltwise_unary_sfpu_init(); -// } - -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h index 37ee8a0fe56..b648be30f3c 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h @@ -20,289 +20,3 @@ /************************************************************************* * LLK PACK *************************************************************************/ - -// template -// inline void llk_pack_mop_config(const uint32_t output) { - -// const std::uint32_t output_id = get_output_id(output); -// const std::uint32_t num_faces = get_output_num_faces(output_id); -// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); -// const bool partial_face = get_output_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]); -// const bool narrow_tile = get_output_narrow_tile(output_id); - -// _llk_pack_mop_config_( -// pack_dst_format[output_id], -// face_r_dim, -// num_faces, -// partial_face, -// narrow_tile -// ); -// } - -// template -// inline void llk_pack_hw_configure(const llk_pack_params_t *pack_params) { - -// const std::uint32_t output_id = get_output_id(pack_params->pack_output); -// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); -// const std::uint32_t num_faces = get_output_num_faces(output_id); -// const bool partial_face = get_output_partial_face(output_id); -// const bool narrow_tile = get_output_narrow_tile(output_id); - -// const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; - -// _llk_pack_hw_configure_( -// pack_src_format[output_id], -// pack_dst_format[output_id], -// tile_size, -// face_r_dim, -// num_faces, -// 
partial_face, -// narrow_tile, -// pack_params->relu_config.val -// ); -// } - -// template -// inline void llk_pack_hw_configure_disaggregated(std::uint32_t pack_output) { -// llk_pack_params_t llk_pack_params = { -// .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold,}}}; -// llk_pack_hw_configure(&llk_pack_params); -// } - -// template -// inline void llk_pack_reduce_hw_configure(const llk_pack_params_t *pack_params) { -// const std::uint32_t output_id = get_output_id(pack_params->pack_output); -// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); -// const std::uint32_t num_faces = get_output_num_faces(output_id); -// const bool partial_face = get_output_partial_face(output_id); -// const bool narrow_tile = get_output_narrow_tile(output_id); - -// const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; - -// _llk_pack_reduce_hw_configure_( -// pack_src_format[output_id], -// pack_dst_format[output_id], -// tile_size, -// face_r_dim, -// num_faces, -// partial_face, -// narrow_tile, -// pack_params->relu_config.val -// ); -// } - -// template -// inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output) { -// llk_pack_params_t llk_pack_params = { -// .pack_output = pack_output, .relu_config = {.f = {.ApplyRelu = (std::uint32_t)relu_type, .Threshold = relu_threshold}}}; -// llk_pack_reduce_hw_configure(&llk_pack_params); -// } - -// template -// inline void llk_pack_init(const std::uint32_t pack_output = 16) { - -// const std::uint32_t output_id = get_output_id(pack_output); -// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); -// const std::uint32_t num_faces = get_output_num_faces(output_id); -// const bool partial_face = get_output_partial_face(output_id); -// const bool narrow_tile = get_output_narrow_tile(output_id); - -// _llk_pack_init_( -// pack_dst_format[output_id], -// face_r_dim, -// num_faces, -// partial_face, 
-// narrow_tile -// ); -// } - -// template -// inline std::uint32_t get_output_tile_address(std::uint8_t output_id, std::uint32_t output_tile_index) { - -// std::uint32_t pack_tile_addr; -// if constexpr (out_of_order_output) { -// pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + -// (std::uint32_t)(cb_interface[output_id].fifo_page_size)*output_tile_index - 1; -// } else { -// if constexpr (untilize) { -// // FIXME: Need to support pack-untilize? -// // std::uint16_t out_tile_index = (cb_interface[output_id].ublock_tile_cnt/cb_interface[output_id].ublock_ct)*cb_interface[output_id].row_tile_dim + -// // cb_interface[output_id].ublock_tile_cnt%cb_interface[output_id].ublock_ct; //FIXME: optimize perf -// // pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1; -// // pack_tile_addr += out_tile_index*(std::uint32_t)(cb_interface[output_id].fifo_page_size); - -// // cb_interface[output_id].ublock_tile_cnt++; - -// // if (cb_interface[output_id].ublock_tile_cnt == cb_interface[output_id].ublock_tile_dim) { -// // cb_interface[output_id].ublock_tile_cnt=0; -// // cb_interface[output_id].fifo_wr_tile_ptr += (std::uint32_t)(cb_interface[output_id].fifo_page_size)*cb_interface[output_id].ublock_ct; -// // } -// } else { -// pack_tile_addr = cb_interface[output_id].fifo_wr_ptr + cb_interface[output_id].fifo_wr_tile_ptr - 1; -// cb_interface[output_id].fifo_wr_tile_ptr += cb_interface[output_id].fifo_page_size; -// } -// } -// return pack_tile_addr; -// } - -// template -// inline void llk_pack(std::uint32_t tile_index, std::uint32_t output, std::uint32_t output_tile_index = 0) { -// std::uint8_t output_id = get_output_id(output); - -// static_assert((!(untilize && out_of_order_output)) && "untilize out of order packing is not supported!"); - -// std::uint32_t pack_tile_addr = get_output_tile_address(output_id, output_tile_index); - -// _llk_pack_( -// tile_index, -// pack_tile_addr -// ); -// } - -// 
/************************************************************************* -// * LLK PACK COMMON -// *************************************************************************/ - - -// inline void llk_packer_wait_for_math_done() { -// _llk_packer_wait_for_math_done_(); -// } - -// template -// inline void llk_packer_set_math_semaphore() { -// _llk_packer_set_math_semaphore_(); -// } - -// template -// inline void llk_pack_dest_section_done() { -// _llk_pack_dest_section_done_(); -// } - -// template -// inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 16) { -// const std::uint32_t output_id = get_output_id(pack_output); -// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); -// const bool narrow_tile = get_output_narrow_tile(output_id); - -// _llk_init_packer_dest_offset_registers_( -// face_r_dim, -// narrow_tile -// ); -// } - -// template -// inline void llk_pack_dest_init(const std::uint32_t pack_output = 16) { - -// const std::uint32_t output_id = get_output_id(pack_output); -// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); -// const bool narrow_tile = get_output_narrow_tile(output_id); - -// _llk_pack_dest_init_( -// face_r_dim, -// narrow_tile -// ); -// } - -// template -// inline void llk_pack_get_tile(std::uint32_t output, std::uint32_t tile_index, std::uint32_t *p_tile) { -// _llk_pack_get_tile_(tile_index, p_tile); -// } - -// template -// inline void llk_pack_release_tile(std::uint32_t output) { -// _llk_pack_release_tile_(); -// } - -// inline void llk_pack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { -// _llk_pack_debug_dump_(data, byte_size); -// } - -// inline void llk_pack_debug_dump_seek(std::uint8_t offset) { -// _llk_pack_debug_dump_seek_(offset); -// } - -// template -// inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) { - -// const std::uint32_t output_id = get_output_id(new_output); -// const std::uint32_t face_r_dim = 
get_output_face_r_dim(output_id); -// const std::uint32_t num_faces = get_output_num_faces(output_id); -// const bool partial_face = get_output_partial_face(output_id); -// const bool narrow_tile = get_output_narrow_tile(output_id); - -// _llk_pack_reconfig_data_format_( -// pack_src_format[output_id], -// pack_dst_format[output_id], -// cb_interface[output_id].fifo_page_size, -// face_r_dim, -// num_faces, -// partial_face, -// narrow_tile -// ); -// } - -// template -// inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) { -// std::uint32_t old_output_id = get_output_id(old_output); -// std::uint32_t new_output_id = get_output_id(new_output); - -// if((pack_dst_format[old_output_id] != pack_dst_format[new_output_id]) -// && (pack_dst_format[old_output_id] != (uint)DataFormat::Invalid) -// && (pack_dst_format[new_output_id] != (uint)DataFormat::Invalid)) { -// llk_pack_reconfig_data_format(new_output); -// } else if constexpr (is_tile_dim_reconfig_en) { -// // Same format but different tile dims -// llk_pack_mop_config(new_output); -// } -// } - -// TT_ALWAYS_INLINE void llk_pack_relu_config(const std::uint32_t config) { -// _llk_pack_relu_config_(config); -// } - -// inline void llk_pack_reconfig_l1_acc(const std::uint32_t enable) { -// _llk_pack_reconfig_l1_acc_(enable); -// } - -// template -// inline void llk_pack_reduce_mask_config() { -// _llk_pack_reduce_mask_config_(); -// } - -// inline void llk_pack_reduce_mask_clear() { -// _llk_pack_reduce_mask_clear_(); -// } - -// // FIXME-WH-UPLIFT -// template -// inline void llk_pack_reduce_config_v2(uint32_t icb_out) { - -// const bool untilize = false; -// if constexpr (at_kernel_start) { - -// const std::uint32_t output_id = get_output_id(icb_out); -// const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); -// const std::uint32_t num_faces = get_output_num_faces(output_id); -// const bool partial_face = get_output_partial_face(output_id); -// 
const bool narrow_tile = get_output_narrow_tile(output_id); -// const std::uint32_t tile_size = cb_interface[output_id].fifo_page_size; -// const llk_relu_config_u relu_config = {.f = {.ApplyRelu = (std::uint32_t)ReluType::NO_RELU, .Threshold = 0,}}; - -// _llk_pack_hw_configure_( -// pack_src_format[output_id], -// pack_dst_format[output_id], -// tile_size, -// face_r_dim, -// num_faces, -// partial_face, -// narrow_tile, -// relu_config.val -// ); -// } - -// if constexpr (revert) { -// _llk_pack_reduce_mask_clear_(); -// } else { -// _llk_pack_reduce_mask_config_(); -// } -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h index 83f94387efa..62d59b90afe 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_param_structs.h @@ -80,7 +80,6 @@ struct llk_pack_params_t { llk_relu_config_u relu_config; }; -// TODO: nsmith move this to a common place where the hlk can include it struct hlk_pack_shifted_params_t { std::uint32_t pack_output; llk_relu_config_u relu_config; diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_identity.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_identity.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/common/inc/ckernel_sfpu_identity.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_identity.h diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h new file mode 100644 index 00000000000..302cb97c934 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "noc_nonblocking_api.h" + +#include "sfpi.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + + +template +inline void calculate_mask() +{ + bool exponent_size_8 = true; + for (int d = 0; d < ITERATIONS; d++) + { + vFloat mask = dst_reg[16]; + v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) { + dst_reg[0] = 0; + } + v_endif; + dst_reg++; + } +} +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h new file mode 100644 index 00000000000..fd9cfef2da6 --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "noc_nonblocking_api.h" + +#include "sfpi.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + + +template +inline void calculate_negative() +{ + + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + dst_reg[0] = -val; + dst_reg++; + } +} +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h index 83a5fdcca92..fd920521909 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h @@ -16,156 +16,6 @@ #include "llk_param_structs.h" #include "llk_math_eltwise_unary_sfpu.h" -//TODO: Fix for GS uplift - -// using namespace ckernel; -// using namespace 
ckernel::sfpu; -// namespace ckernel { - // /************************************************************************* // * LLK ELTWISE UNARY SFPU // *************************************************************************/ - -// template < -// SfpuType operation, -// bool APPROXIMATION_MODE, -// int SfpuType_PARAM = 0, -// int ITERATIONS = 8, -// bool IS_INT_SFPU_EN = false> -// inline void llk_math_calculate_sfpu( -// const int iterations = ITERATIONS, -// uint param0 = 0, -// uint param1 = 0, -// uint param2 = 0, -// uint param3 = 0, -// uint param4 = 0, -// uint param5 = 0) { -// if constexpr (operation == SfpuType::exp_with_base) { -// constexpr bool zero_negative = true; -// _calculate_exponential_(iterations, param0); -// } else if constexpr (operation == SfpuType::tanh) { -// _calculate_tanh_(iterations); -// } else if constexpr (operation == SfpuType::hardtanh) { -// _calculate_hardtanh_(iterations, param0, param1, param2); -// } else if constexpr (operation == SfpuType::rsqrt) { -// // param0 = true -> approximate fast mode -// // false -> high precision mode -// // The algorithm uses Newton's method based on no.of iteration better approximation can be calculated -// if (param0) { -// calculate_rsqrt(); -// } else { -// calculate_rsqrt(); -// } -// } else if constexpr (operation == SfpuType::sigmoid) { -// calculate_sigmoid(); -// } else if constexpr (operation == SfpuType::sigmoid_appx) { -// calculate_sigmoid_appx(); -// } else if constexpr (operation == SfpuType::tanh_derivative) { -// calculate_tanh_derivative(); -// } else if constexpr (operation == SfpuType::dropout) { -// calculate_dropout(param0, param1); -// } else if constexpr (operation == SfpuType::power) { -// calculate_power_iterative(param0); -// } else if constexpr (operation == SfpuType::square) { -// calculate_square(); -// } else if constexpr (operation == SfpuType::log) { -// calculate_log(param0); -// } else if constexpr (operation == SfpuType::log_with_base) { -// 
calculate_log(param0); -// } else if constexpr ( -// (operation == SfpuType::equal_zero) || (operation == SfpuType::not_equal_zero) || -// (operation == SfpuType::less_than_zero) || (operation == SfpuType::greater_than_equal_zero) || -// (operation == SfpuType::less_than_equal_zero) || (operation == SfpuType::greater_than_zero)) { -// calculate_comp(8); // BFLOAT16 - exp -// } else if constexpr (operation == SfpuType::clamp) { -// calculate_clamp(param0, param1, param2); -// } else if constexpr (operation == SfpuType::abs) { -// calculate_abs(); -// } else if constexpr (operation == SfpuType::sign) { -// calculate_sign(); -// } else if constexpr (operation == SfpuType::max) { -// calculate_max(); -// } else if constexpr (operation == SfpuType::min) { -// calculate_min(); -// } else if constexpr (operation == SfpuType::exp2) { -// calculate_exp2(); -// } else if constexpr (operation == SfpuType::heaviside) { -// calculate_heaviside(param0); -// } else if constexpr (operation == SfpuType::expm1) { -// calculate_expm1(); -// } else if constexpr (operation == SfpuType::asin) { -// calculate_asin(); -// } else if constexpr (operation == SfpuType::acos) { -// calculate_acos(); -// } else if constexpr (operation == SfpuType::atan) { -// calculate_atan(); -// } else if constexpr (operation == SfpuType::signbit) { -// calculate_signbit(); -// } else if constexpr (operation == SfpuType::silu) { -// calculate_silu(); -// } -// } - -// template -// inline void llk_math_eltwise_unary_sfpu( -// uint dst_index, -// int vector_mode = (int)Dim::RC, -// uint param0 = 0, -// uint param1 = 0, -// uint param2 = 0, -// uint param3 = 0, -// uint param4 = 0, -// uint param5 = 0) { -// const std::uint32_t operand_id = get_operand_id(0); // Fix to operand 0. 
assume no tiny-tile support -// const std::uint32_t num_faces = get_operand_num_faces(operand_id); -// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); - -// constexpr int ITERATIONS = 8; - -// _llk_math_eltwise_unary_sfpu_start_(dst_index); - -// if (vector_mode == (int)Dim::R) { -// // Do a row vector, Face0 + Face1 -- first iteration (first row) -// const int iterations = (num_faces < 4) ? ((face_r_dim <= 2) ? 2 : face_r_dim / 2) -// : 2; // At least 2 iterations for odd and even columns -// #pragma GCC unroll 0 -// for (int face = 0; face < 2; face++) { -// llk_math_calculate_sfpu( -// iterations, param0, param1, param2, param3, param4, param5); -// // Move to the next face -// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); -// } -// // Skip next two faces -// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); -// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); -// } else if (vector_mode == (int)Dim::C) { -// // Do a column vector, Face0 + Face2 if tile is 32x32 or Face0+Face1 if tiles is 32x16 -- All iterations for -// // full face -// #pragma GCC unroll 0 -// for (int face = 0; face < 2; face++) { -// llk_math_calculate_sfpu( -// ITERATIONS, param0, param1, param2, param3, param4, param5); -// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); -// if (num_faces > 2) { // Skip next face if tile is 32x32 -// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); -// } -// } -// if (num_faces <= 2) { -// // Skip next two faces -// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); -// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); -// } -// } else { -// // Do all four faces, and iterate through all 4 blocks of 4 rows each -// #pragma GCC unroll 0 -// for (int face = 0; face < 4; face++) { -// llk_math_calculate_sfpu( -// ITERATIONS, param0, param1, param2, param3, param4, param5); -// // Move to the next face -// _llk_math_eltwise_unary_sfpu_inc_dst_face_addr_(); -// } -// } -// _llk_math_eltwise_unary_sfpu_done_(); -// } - -// } // 
namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_identity.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_identity.h similarity index 100% rename from tt_metal/hw/ckernels/grayskull/llk_lib/llk_math_eltwise_unary_sfpu_identity.h rename to tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_identity.h diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h new file mode 100644 index 00000000000..eed4732e5eb --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + + +#include "llk_math_eltwise_unary_sfpu_common_includes.h" +#include "llk_math_eltwise_unary_sfpu_init.h" +#include "llk_math_eltwise_unary_sfpu_0_param.h" +#include "ckernel_sfpu_mask.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_unary_sfpu_mask_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = Dim::RC) { + constexpr int first_iterations = 1; + llk_math_eltwise_unary_sfpu_0_param + (ckernel::sfpu::calculate_mask, + ckernel::sfpu::calculate_mask, + dst_index, vector_mode); +} + +} diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h new file mode 100644 index 00000000000..5badb47497b --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + + +#include "llk_math_eltwise_unary_sfpu_common_includes.h" +#include "llk_math_eltwise_unary_sfpu_init.h" +#include "llk_math_eltwise_unary_sfpu_0_param.h" +#include "ckernel_sfpu_negative.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_unary_sfpu_negative_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) { + constexpr int first_iterations = 1; + llk_math_eltwise_unary_sfpu_0_param + (ckernel::sfpu::calculate_negative, + ckernel::sfpu::calculate_negative, + dst_index, vector_mode); +} + +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h index 50018e399c3..f2dd3abf0ce 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h @@ -16,765 +16,3 @@ #include "ckernel_sfpu_exp.h" #include "ckernel_sfpu_recip.h" #include "ckernel_sfpu_converter.h" - -//TODO: Delete this file once GS uplift is done -// using namespace sfpi; - -// namespace ckernel -// { -// namespace sfpu -// { - -// template -// inline void calculate_rsqrt() -// { - -// for (int d = 0; d < ITERATIONS; d++) -// { - -// vFloat in = dst_reg[0]; -// v_if(dst_reg[0] == 0.0f){ -// dst_reg[0] = std::numeric_limits::infinity(); -// }v_else{ -// vFloat result = 1.0f; -// v_if(dst_reg[0] > 1.0f){ -// result = sfpu_reciprocal(in); -// }v_endif; - -// for (int r = 0; r < RECIPROCAL_ITERATIONS; r++) -// { -// // y = y * (1.5 - 0.5 * x * y * y) Newton's method iteration. 
-// result = result * (1.5F - 0.5F * dst_reg[0] * result * result); -// } -// dst_reg[0] = result; -// }v_endif; - -// dst_reg++; - -// } -// } - -// template -// inline void calculate_sigmoid_appx() -// { -// vUInt l0 = l_reg[LRegs::LReg0]; -// vUInt l1 = l_reg[LRegs::LReg1]; -// vUInt l2 = l_reg[LRegs::LReg2]; - -// #pragma GCC unroll 8 -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat val = dst_reg[0]; - -// dst_reg[0] = lut(val, l0, l1, l2) + 0.5f; - -// dst_reg++; -// } - -// l_reg[LRegs::LReg0] = l0; -// l_reg[LRegs::LReg1] = l1; -// l_reg[LRegs::LReg2] = l2; -// } - -// // TODO: Implement using bitwise comparision -// template -// inline void calculate_signbit() -// { - -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat val = dst_reg[0]; -// v_if (val <= -0.0f) { -// val = 1.0f; -// } v_elseif (val >= 0.0f) { -// val = 0.0f; -// } -// v_endif; -// dst_reg[0] = val; - -// dst_reg++; -// } - -// } - -// template -// inline void calculate_tanh() -// { -// // SFPU microcode -// vUInt l0 = l_reg[LRegs::LReg0]; -// vUInt l1 = l_reg[LRegs::LReg1]; -// vUInt l2 = l_reg[LRegs::LReg2]; - -// #pragma GCC unroll 8 -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat val = dst_reg[0]; -// val = lut(val, l0, l1, l2); -// dst_reg[0] = val; - -// dst_reg++; -// } - -// l_reg[LRegs::LReg0] = l0; -// l_reg[LRegs::LReg1] = l1; -// l_reg[LRegs::LReg2] = l2; -// } - -// template -// inline void calculate_hardtanh(uint param0, uint param1, uint param2) -// { -// // All params are in FP16_B format -// // param0 = -(neg_threshold) -// // param1 = -(pos_threshold - neg_threshold) -// // param2 = -(pos_threshold) - -// vFloat p0 = s2vFloat16(param0); -// vFloat p1 = s2vFloat16(param1); -// vFloat p2 = s2vFloat16(param2); -// // SFPU microcode -// #pragma GCC unroll 0 -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat val = dst_reg[0]; - -// val += p0;// 12 bits -// v_if (val < 0.0f) { -// val = 0.0f; -// } -// v_endif; - -// val += p1;// 12 bits -// v_if 
(val >= 0.0f) { -// val = 0.0f; -// } -// v_endif; - -// val += p2;// 12 bits - -// dst_reg[0] = val; - -// dst_reg++; -// } -// } - -// template -// inline void calculate_tanh_derivative() -// { -// vUInt l0 = l_reg[LRegs::LReg0]; -// vUInt l1 = l_reg[LRegs::LReg1]; -// vUInt l2 = l_reg[LRegs::LReg2]; - -// // tanh'(x) = 1 - (tanh(x))^2 -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat val = dst_reg[0]; - -// if constexpr (!WITH_PRECOMPUTED_TANH) { -// val = lut(val, l0, l1, l2); -// } - -// val = val * (-val) + vConst1; -// dst_reg[0] = val; - -// dst_reg++; -// } - -// l_reg[LRegs::LReg0] = l0; -// l_reg[LRegs::LReg1] = l1; -// l_reg[LRegs::LReg2] = l2; -// } - -// template -// inline void calculate_dropout(uint prob, uint scale) -// { -// // SFPU microcode - -// vUInt rand = l_reg[LRegs::LReg3]; - -// #pragma GCC unroll 0 -// for (int d = 0; d < ITERATIONS; d++) { -// //////////////////////// -// // Scale samples -// /////////////////////// -// dst_reg[0] = dst_reg[0] * s2vFloat16b(scale); - -// //////////////////////// -// // Drop samples -// /////////////////////// -// v_if (rand < prob) { -// dst_reg[0] = vConst0; -// } -// v_endif; - -// //////////////////////// -// // 16-bit PRNG update -// /////////////////////// -// vUInt lfsr = vConstIntPrgm1; -// vUInt tmp = lfsr & rand; -// rand = rand >> 1; -// v_if (tmp != 0) { -// vUInt mask = vConstIntPrgm0; -// rand ^= mask; -// } -// v_endif; - -// dst_reg++; -// } - -// l_reg[LRegs::LReg3] = rand; -// } - -// template -// inline void calculate_power_iterative(const uint exponent) -// { -// #pragma GCC unroll 8 -// for (int d = 0; d < 8; d++) -// { -// vFloat in = dst_reg[0]; -// vFloat result = 1.0f; -// for (uint i = 0; i < exponent; i++) { -// result *= in; -// } -// dst_reg[0]=result; -// dst_reg++; -// } -// } - -// template -// inline void calculate_square() -// { -// #pragma GCC unroll 8 -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat in = dst_reg[0]; -// vFloat result = in * in; - -// 
dst_reg[0] = result; - -// dst_reg++; -// } -// } - -// template -// sfpi_inline void calculate_log_body(const uint log_base_scale_factor) -// { -// //////////////////////////// -// // Load From dest + "normalize to calculation range" -// //////////////////////////// -// vFloat in = dst_reg[0]; -// vFloat x = setexp(in, 127); // set exp to exp bias (put in range of 1-2) - -// // XXXXXX ask Namal? if we can derive the coefficients below to higher precision -// //////////////////////////// -// // Calculate Cheby Approximation using Horner Form Multiplication: 3rd Order -// // x* ( x* (A*x + B) + C) + D -// // A :0.1058, B: -0.3942, C: 0.9813, D: 0.006 -// // Run above on (x-1) so x is in ln(x+1), plug (x-1 into equation above to -// // save the subtract and get A',B',C',D'): -// // A' = A -// // B' = -3A + B -// // C' = 3a -2B + C -// // D' = -A + B - C + D -// // A':0.1058, B':-0.7116, C':2.0871, D':-1.4753 -// //////////////////////////// -// vFloat a = vConstFloatPrgm1; -// vFloat b = vConstFloatPrgm2; -// // XXXXX try variants of the below: B'=.7122, C'=2.0869 -// vFloat series_result = x * (x * (x * a + b) + 2.0871) + -1.4753f; - -// //////////////////////////// -// // Convert exponent to float -// //////////////////////////// -// vInt exp = exexp(in); -// v_if (exp < 0) { -// exp = setsgn(~exp + 1, 1); -// } -// v_endif; - -// vFloat expf = int32_to_float(exp, 0); -// vFloat vConstLn2 = vConstFloatPrgm0; -// vFloat result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2) - -// if constexpr (HAS_BASE_SCALING) { -// result *= s2vFloat16a(log_base_scale_factor); -// } - -// //////////////////////////// -// // Base case when input is 0. 
ln(0) = -inf -// //////////////////////////// -// v_if (in == 0.0F) { // Reload for register pressure -// result = -std::numeric_limits::infinity(); -// } -// v_endif; - -// dst_reg[0] = result; -// } - -// template -// inline void calculate_log(uint log_base_scale_factor) -// { -// #pragma GCC unroll 8 -// for(int d = 0; d < ITERATIONS; d++){ -// calculate_log_body(log_base_scale_factor); -// dst_reg++; -// } -// } - -// sfpi_inline void calculate_comp_init_flag(bool check, vFloat& flag1, vFloat& flag2, float init) -// { -// flag1 = init; -// if (check) { -// flag2 = init; -// } -// } - -// template -// inline void calculate_comp(uint exponent_size_8) -// { -// const vFloat zero = 0.0f; -// const vFloat one = 1.0f; -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat v = dst_reg[0]; -// vFloat flag1, flag2; - -// //a[i] == 0 -// if constexpr(COMP_MODE == SfpuType::equal_zero) { -// v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) { -// v = one; -// } v_else { -// v = zero; -// } -// v_endif; -// } - -// //a[i] != 0 -// if constexpr(COMP_MODE == SfpuType::not_equal_zero) { -// v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) { -// v = zero; -// } v_else { -// v = one; -// } -// v_endif; -// } - -// //a[i] < 0 -// if constexpr(COMP_MODE == SfpuType::less_than_zero) { -// v_if (v >= 0.0f) { -// v = zero; -// } v_else { -// v = one; -// } -// v_endif; -// } - -// //a[i] >= 0 -// if constexpr(COMP_MODE == SfpuType::greater_than_equal_zero) { -// v_if (v >= 0.0f) { -// v = one; -// } v_else { -// v = zero; -// } -// v_endif; -// } - -// //a[i] > 0 -// if constexpr(COMP_MODE == SfpuType::greater_than_zero) { -// v_if (v > 0.0f) { -// v = one; -// } v_else { -// v = zero; -// } -// v_endif; -// } - -// //a[i] <= 0 -// if constexpr(COMP_MODE == SfpuType::less_than_equal_zero) { -// v_if (v > 0.0f) { -// v = zero; -// } v_else { -// v = one; -// } -// v_endif; -// } - -// dst_reg[0] = v; -// dst_reg++; -// } -// } - -// template -// inline void calculate_clamp(uint param0, 
uint param1, uint param2) -// { -// // All params are in FP16 format -// // param0 = min -// // param1 = max - -// //uint format = (param0 >> 16)&0x1; -// s2vFloat16::Format format = s2vFloat16::fp16a; - -// // SFPU microcode -// vFloat min = s2vFloat16(param0, format); -// vFloat max = s2vFloat16(param1, format); -// #pragma GCC unroll 0 -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat val = dst_reg[0]; - -// v_if (val < min) { -// val = s2vFloat16(param0, format); -// } v_elseif (val >= max) { -// val = s2vFloat16(param1, format); -// } -// v_endif; - -// dst_reg[0] = val + s2vFloat16b(param2); // 12 bits - -// dst_reg++; -// } -// } - -// template -// inline void calculate_abs() -// { -// // SFPU microcode -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat v = dst_reg[0]; -// dst_reg[0] = sfpi::abs(v); -// dst_reg++; -// } -// } - - -// template -// inline void calculate_exp2() -// { -// // SFPU microcode -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat v = dst_reg[0]; -// // log(2) = 0.6931471805; -// v = v * 0.6931471805f; -// // exp = e^(v) -// vFloat exp = calculate_exponential_body_improved(v); -// dst_reg[0] = exp; -// dst_reg++; -// } -// } - -// template -// inline void calculate_sign() -// { -// // All params are in FP16 format -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat v = dst_reg[0]; -// vFloat result = vConst1; -// v_if (v < 0.0f) { -// result = vConstNeg1; -// } v_elseif(v > 0.0f) { -// result = vConst1; -// } v_else { -// result = vConst0; -// } -// v_endif; - -// dst_reg[0] = result; -// dst_reg++; -// } -// } -// template -// inline void calculate_max() -// { -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat a = dst_reg[0]; -// vFloat b = dst_reg[32]; -// v_if(a < b) { -// dst_reg[0] = b; -// } -// v_endif; - -// dst_reg++; -// } -// } - -// template -// inline void calculate_min() -// { -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat a = dst_reg[0]; -// vFloat b = dst_reg[32]; -// v_if(a 
> b) { -// dst_reg[0] = b; -// } -// v_endif; - -// dst_reg++; -// } -// } - -// template -// inline void calculate_expm1() -// { -// // SFPU microcode -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat v = dst_reg[0]; -// v = calculate_exponential_body_improved(v); -// dst_reg[0] = v - 1.0f; -// dst_reg++; -// } -// } - - -// #define POLYVAL6(coef5, coef4, coef3, coef2, coef1, coef0, t4) (t4 * (t4 * (t4 * (t4 * (coef5 * t4 + coef4) + coef3) + coef2) + coef1) + coef0) - -// template -// sfpi_inline vFloat sfpu_atan_maclaurin_series(vFloat val) -// { -// v_if(1 > sfpi::abs(val)){ -// dst_reg[0] = sfpi::abs(val) ; -// } -// v_else{ -// dst_reg[0] = sfpu_reciprocal(sfpi::abs(val)); -// } -// v_endif; - -// vFloat t1 = dst_reg[0] * dst_reg[0]; - -// t1 = POLYVAL6(-0.013480470f, 0.057477314f, -0.121239071f, 0.195635925f, -0.332994597f, 0.999995630f, t1); - -// t1 = t1 * dst_reg[0]; - -// v_if (sfpi::abs(val) > 1){ -// t1 = 1.570796327f - t1; -// } -// v_endif; - -// v_if(val < 0 ){ -// t1 = -t1; -// } -// v_endif; - -// return t1; -// } - -// template -// inline void calculate_atan() -// { -// // SFPU microcode -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat val = dst_reg[0]; -// val = sfpu_atan_maclaurin_series(val); -// dst_reg[0] = val; -// dst_reg++; -// } -// } - - -// template -// sfpi_inline vFloat sfpu_asine_maclaurin_series(vFloat val) -// { -// // input for [-1:1] -// // Mclauren series -// // arcsin(x) = x + [(1/2) *x^3/3] + [(1 * 3) / (2 * 4) * x^5 / 5] + [(1 * 3 * 5) / (2 * 4 * 6) * x^7 / 7 ] + ... 
-// // arcsin(x) ≈ x + (1/6) * x^3 + (3/40) * x^5 + (5/112) * x^7 + (35/1152) * x^9 + (63/2816) * x^11a - -// vFloat tmp = val; -// vFloat val_square = val * val; -// // x -// vFloat output = tmp; -// // (1/6) * x^3 -// tmp = tmp * val_square; -// output += 0.166666666 * tmp; -// // (3/40) * x^5 -// tmp = tmp * val_square; -// output += 0.075 * tmp; - -// //(5/112) * x^7 -// tmp = tmp * val_square; -// output += 0.044642857 * tmp; - -// // (35/1152) *x^9 -// tmp = tmp * val_square; -// output += 0.03038194 * tmp; - -// //(63/2816) * x^11 -// tmp = tmp * val_square; -// output += 0.02237216 * tmp; - -// // Write out output -// return output; -// } - -// template -// inline void calculate_asin() -// { -// // SFPU microcode -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat v = dst_reg[0]; -// v = sfpu_asine_maclaurin_series(v); -// dst_reg[0] = v; -// dst_reg++; -// } -// } - - -// #define PI_2 (1.570796326794) -// template -// inline void calculate_acos() -// { -// // SFPU microcode -// // acos = (pi/2 - asin) -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat v = dst_reg[0]; -// v = sfpu_asine_maclaurin_series(v); -// v = PI_2 - v; -// dst_reg[0] = v; -// dst_reg++; -// } -// } - -// template -// inline void cast_fp32_to_fp16a() -// { -// #pragma GCC unroll 8 -// for (int d = 0; d < ITERATIONS; d++) -// { -// //vFloat val = dst_reg[0]; -// //dst_reg[0] = float_to_fp16a(val, 0); -// TTI_SFPLOAD(0, 0, 3, 0); -// TTI_SFP_STOCH_RND(0,0,0,0,0,8); -// TTI_SFPSTORE(0,1,3,0); -// dst_reg++; -// } -// } - - - -// template -// inline void calculate_negative() -// { - -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat val = dst_reg[0]; -// dst_reg[0] = -val; -// dst_reg++; -// } -// } - -// template -// inline void calculate_add1() -// { -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat val = dst_reg[0]; -// dst_reg[0] = 1.0f + val; -// dst_reg++; -// } -// } - -// inline -// vFloat sigmoid_piecewise_linear_positive(vFloat val) { -// vFloat result 
= 0.0f; -// v_if ( val >= +5.0f) { -// result = 1.0f; -// } v_elseif ( val > 1.0f && val < 5.0f ) { -// result = POLYVAL5(0.00144462f, -0.01055479f, -0.01203685f, 0.24300185f, 0.50437757f,val); -// } v_else { -// result = 0.229f*val + 0.5f; // linear appx as y = 0.229x + 0.5 -// } -// v_endif; -// return result; -// } - -// //sigmoid is anti-symmetric and offset by 1 -// //sigmoid[-x] = 1 - sigmoid[x] -// template -// inline void calculate_sigmoid() -// { -// for (int d = 0; d < ITERATIONS; d++) -// { -// vFloat val = dst_reg[0]; -// vFloat result = 0.0f; - -// v_if ( val < 0.0f ) { -// val = -val; -// } -// v_endif; - -// result = sigmoid_piecewise_linear_positive(val); - -// val = dst_reg[0]; -// v_if ( val < 0.0f ) { -// result = 1.0f - result; -// } -// v_endif; - -// dst_reg[0] = result; -// dst_reg++; -// } - -// return; -// } - -// template -// inline void calculate_heaviside(uint value) -// { -// // SFPU microcode -// Converter c_value; -// c_value.u = value; -// vFloat s = c_value.f; - -// #pragma GCC unroll 0 -// for (int d = 0; d < ITERATIONS; d++) { -// vFloat v = dst_reg[0]; - -// v_if (v < 0.0f) { -// v = 0.0f; -// }v_elseif (v > 0.0f) { -// v = 1.0f; -// }v_else { -// v = s; -// } -// v_endif; - -// dst_reg[0] = v; - -// dst_reg++; -// } -// } - -// template -// inline void calculate_silu() -// { -// // SFPU microcode -// for (int d = 0; d < ITERATIONS; d++) { -// vFloat val = dst_reg[0]; -// v_if ( val < 0.0f ) { -// val = -val; -// } -// v_endif; - -// vFloat result = sigmoid_piecewise_linear_positive(val); - -// val = dst_reg[0]; -// v_if ( val < 0.0f ) { -// result = 1.0f - result; -// } -// v_endif; -// result = val * result; -// dst_reg[0] = result; -// dst_reg++; -// } -// } - -// } // namespace sfpu -// } // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h index 642fbb1591e..77c43ef0650 100644 --- 
a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_api.h @@ -9,77 +9,3 @@ // /************************************************************************* // * LLK UNPACK AB // *************************************************************************/ - -// template -// inline void llk_unpack_AB_hw_configure( -// const llk_unpack_AB_params_t *unpack_AB_params, const int within_face_16x16_transpose = 0) { -// // In0 -> unpA -// // In1 -> unpB -// const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpA_operand); -// const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpB_operand); - -// // unpA -> srcA -// // unpB -> srcB -// const uint32_t num_faces = get_operand_num_faces(unpA_operand_id); // num faces in unpA and unpB are the same - -// const uint32_t face_r_dim = get_operand_face_r_dim(unpA_operand_id); // face r dim in unpA and unpB are the same - -// _llk_unpack_AB_hw_configure_( -// unpack_src_format[unpA_operand_id], -// unpack_src_format[unpB_operand_id], -// unpack_dst_format[unpA_operand_id], -// unpack_dst_format[unpB_operand_id], -// face_r_dim, -// within_face_16x16_transpose, -// num_faces); -// } - -// template -// inline void llk_unpack_AB_hw_configure_disaggregated( -// const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const int within_face_16x16_transpose = 0) { -// const llk_unpack_AB_params_t unpack_AB_params = {.unpA_operand = unpA_operand, .unpB_operand = unpB_operand}; - -// llk_unpack_AB_hw_configure(&unpack_AB_params, within_face_16x16_transpose); -// } - -// template -// inline void llk_unpack_AB_mop_config(const bool transpose_of_faces = false, const std::uint32_t operand_id = 0) { -// const std::uint32_t num_faces = get_operand_num_faces(operand_id); -// const bool narrow_tile = get_operand_narrow_tile(operand_id); // if narrow tile read face 0 twice for row broadcast -// // or read face 0 and 1 for col broadcast 
-// _llk_unpack_AB_mop_config_(transpose_of_faces, num_faces, narrow_tile); -// } - -// template -// inline void llk_unpack_AB_init( -// const std::uint32_t operandA, -// const std::uint32_t operandB, -// const std::uint32_t transpose = 0, -// const std::uint32_t acc_to_dest = 0) { -// const std::uint32_t operandA_id = get_operand_id(operandA); -// const std::uint32_t face_r_dim = get_operand_face_r_dim(operandA_id); // face r dim in unpA and unpB are the same -// const std::uint32_t num_faces = get_operand_num_faces(operandA_id); -// const bool narrow_tile = -// get_operand_narrow_tile(operandA_id); // if narrow tile read face 0 twice for row broadcast - -// _llk_unpack_AB_init_(face_r_dim, num_faces, narrow_tile, transpose, acc_to_dest); -// } - -// template -// inline void llk_unpack_AB( -// const std::uint32_t operandA, -// const std::uint32_t operandB, -// const std::uint32_t tile_index_a, -// const std::uint32_t tile_index_b, -// const bool transpose_of_faces = 0 /*not used*/) { -// std::uint32_t operandA_id = get_operand_id(operandA); -// std::uint32_t operandB_id = get_operand_id(operandB); -// std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1; -// std::uint32_t offset_address_a = cb_interface[operandA_id].fifo_page_size * tile_index_a; -// std::uint32_t address_a = base_address_a + offset_address_a; -// std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1; -// std::uint32_t offset_address_b = cb_interface[operandB_id].fifo_page_size * tile_index_b; -// std::uint32_t address_b = base_address_b + offset_address_b; - -// _llk_unpack_AB_(address_a, address_b, transpose_of_faces > 0); -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h index f4aee2da6bd..d3299a0d299 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h +++ 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_AB_matmul_api.h @@ -9,128 +9,3 @@ // /************************************************************************* // * LLK UNPACK AB MATMUL // *************************************************************************/ - -// template -// inline void llk_unpack_AB_matmul_hw_configure(const llk_unpack_AB_matmul_params_t *unpack_AB_params) { -// const bool transpose_xy_srca = unpack_AB_params->transpose_xy_srca; - -// // In0 -> unpB -// // In1 -> unpA -// const uint32_t unpA_operand_id = get_operand_id(unpack_AB_params->unpB_operand); -// const uint32_t unpB_operand_id = get_operand_id(unpack_AB_params->unpA_operand); - -// // unpA -> srcA -// // unpB -> srcB -// const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); -// const uint32_t unpB_num_faces = get_operand_num_faces(unpB_operand_id); - -// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); -// const uint32_t unpB_face_r_dim = get_operand_face_r_dim(unpB_operand_id); - -// _llk_unpack_AB_matmul_hw_configure_( -// unpack_src_format[unpA_operand_id], -// unpack_src_format[unpB_operand_id], -// unpack_dst_format[unpA_operand_id], -// unpack_dst_format[unpB_operand_id], -// unpA_face_r_dim, -// unpB_face_r_dim, -// transpose_xy_srca, -// unpA_num_faces, -// unpB_num_faces, -// cb_interface[unpA_operand_id].fifo_page_size, -// cb_interface[unpB_operand_id].fifo_page_size); -// } - -// template -// inline void llk_unpack_AB_matmul_hw_configure_disaggregated( -// const std::uint32_t unpA_operand, const std::uint32_t unpB_operand, const std::uint32_t transpose_xy_srca = 0) { -// const llk_unpack_AB_matmul_params_t unpack_AB_matmul_params = { -// .unpA_operand = unpA_operand, .unpB_operand = unpB_operand, .transpose_xy_srca = transpose_xy_srca}; -// llk_unpack_AB_matmul_hw_configure(&unpack_AB_matmul_params); -// } - -// inline void llk_unpack_AB_matmul_mop_config( -// const bool transpose, -// const std::uint32_t ct_dim, -// 
const std::uint32_t rt_dim, -// const std::uint32_t kt_dim, -// const bool partial_face) { -// // in0 - loaded to SrcB -// // in1 - loaded to SrcA -// _llk_unpack_AB_matmul_mop_config_(transpose, ct_dim, rt_dim, kt_dim, partial_face); -// } - -// __attribute__((always_inline)) inline void llk_unpack_AB_matmul_init( -// const std::uint32_t operandA, -// const std::uint32_t operandB, -// const std::uint32_t transpose = 0, -// const std::uint32_t ct_dim = 1, -// const std::uint32_t rt_dim = 1, -// const std::uint32_t kt_dim = 1) { -// // In0 -> srcB (supports partial face) -// // In1 -> srcA -// const uint32_t operandA_id = get_operand_id(operandB); -// const uint32_t operandB_id = get_operand_id(operandA); - -// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandA_id); -// const uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandB_id); - -// const bool reuse_a = ct_dim >= rt_dim; -// const bool partial_face = get_operand_partial_face(operandB_id); - -// const uint32_t unpA_num_faces = get_operand_num_faces(operandA_id); -// const uint32_t unpB_num_faces = -// partial_face ? 
1 : get_operand_num_faces(operandB_id); // if partial face -> unpack face by face - -// _llk_unpack_AB_matmul_init_( -// transpose, -// ct_dim, -// rt_dim, -// kt_dim, -// unpA_face_r_dim, -// unpB_face_r_dim, -// unpA_num_faces, -// unpB_num_faces, -// partial_face); -// } - -// inline void llk_unpack_AB_matmul( -// const std::uint32_t operandA, -// const std::uint32_t operandB, -// const std::uint32_t tile_index_a, -// const std::uint32_t tile_index_b, -// const std::uint32_t ct_dim = 1, -// const std::uint32_t rt_dim = 1, -// const std::uint32_t kt_dim = 1) { -// // In0/InA -> srcB (supports partial face) -// // In1/InB -> srcA - -// volatile uint *cfg = get_cfg_pointer(); // get pointer to registers for current state ID - -// const std::uint32_t operandA_id = get_operand_id(operandA); -// const std::uint32_t operandB_id = get_operand_id(operandB); -// const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(operandB_id); // In1/InB -> srcA -// const std::uint32_t unpB_face_r_dim = get_operand_face_r_dim(operandA_id); // In0/InA -> srcB - -// const bool partial_face = get_operand_partial_face(operandA_id); - -// std::uint32_t base_address_a = cb_interface[operandA_id].fifo_rd_ptr - 1; -// std::uint32_t base_address_b = cb_interface[operandB_id].fifo_rd_ptr - 1; - -// std::uint32_t tile_size_a = cb_interface[operandA_id].fifo_page_size; -// std::uint32_t tile_size_b = cb_interface[operandB_id].fifo_page_size; - -// _llk_unpack_AB_matmul_( -// base_address_a, -// base_address_b, -// tile_index_a, -// tile_index_b, -// tile_size_a, -// tile_size_b, -// unpA_face_r_dim, -// unpB_face_r_dim, -// partial_face, -// ct_dim, -// rt_dim, -// kt_dim); -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h index ca39397653c..9d9f30a6c75 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h +++ 
b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_A_api.h @@ -9,81 +9,3 @@ // /************************************************************************* // * LLK UNPACK A // *************************************************************************/ - -// template -// inline void llk_unpack_A_hw_configure( -// const llk_unpack_A_params_t *unpack_A_params, const int within_face_16x16_transpose = 0) { -// const uint32_t unpA_operand_id = get_operand_id(unpack_A_params->unpA_operand); -// const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); -// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); - -// _llk_unpack_A_hw_configure_( -// unpack_src_format[unpA_operand_id], -// unpack_dst_format[unpA_operand_id], -// unpA_face_r_dim, -// within_face_16x16_transpose, -// unpA_num_faces); -// } - -// template -// inline void llk_unpack_A_hw_configure_disaggregated( -// const std::uint32_t unpA_operand, const int within_face_16x16_transpose = 0) { -// const llk_unpack_A_params_t unpack_A_params = {.unpA_operand = unpA_operand}; -// llk_unpack_A_hw_configure(&unpack_A_params, within_face_16x16_transpose); -// } - -// template < -// BroadcastType BType = BroadcastType::NONE, -// bool acc_to_dest = false, -// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, -// bool unpack_to_dest = false> -// inline void llk_unpack_A_mop_config( -// const bool transpose_of_faces, -// const std::uint32_t operand_id, -// const std::uint32_t unpack_src_format = 0, -// std::uint32_t unpack_dst_format = 0) { -// const std::uint32_t num_faces = get_operand_num_faces(operand_id); - -// _llk_unpack_A_mop_config_( -// transpose_of_faces > 0, num_faces, unpack_src_format, unpack_dst_format); -// } - -// template < -// BroadcastType BType = BroadcastType::NONE, -// bool acc_to_dest = false, -// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, -// bool unpack_to_dest = false> -// inline void 
llk_unpack_A_init( -// const std::uint32_t transpose_of_faces = 0, -// const std::uint32_t within_face_16x16_transpose = 0, -// const std::uint32_t operand = 0) { -// cfg_reg_rmw_tensix(within_face_16x16_transpose); - -// const std::uint32_t operand_id = get_operand_id(operand); -// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); -// const std::uint32_t num_faces = get_operand_num_faces(operand_id); - -// _llk_unpack_A_init_( -// transpose_of_faces, -// within_face_16x16_transpose, -// face_r_dim, -// num_faces, -// unpack_src_format[operand_id], -// unpack_dst_format[operand_id]); -// } - -// template < -// BroadcastType BType = BroadcastType::NONE, -// bool acc_to_dest = false, -// EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE, -// bool unpack_to_dest = false> -// inline void llk_unpack_A( -// const std::uint32_t operand, const std::uint32_t tile_index, const bool transpose_of_faces = 0) { -// std::uint32_t operand_id = get_operand_id(operand); -// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; -// std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; -// std::uint32_t address = base_address + offset_address; - -// _llk_unpack_A_( -// address, transpose_of_faces > 0, unpack_src_format[operand_id], unpack_dst_format[operand_id]); -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h index a2f5d8c675f..1ba062360fb 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_common_api.h @@ -18,120 +18,3 @@ // /************************************************************************* // * LLK UNPACK COMMON // *************************************************************************/ - -// void llk_zero_operand(std::uint32_t operand) { -// std::uint32_t operand_id = 
get_operand_id(operand); -// std::uint32_t fifo_base_addr = (cb_interface[operand_id].fifo_limit + 1) - cb_interface[operand_id].fifo_size; -// std::uint32_t size = cb_interface[operand_id].fifo_size; -// _llk_zero_buffer_(fifo_base_addr, size); -// } - -// template -// inline void llk_unpack_get_tile(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t *p_tile) { -// std::uint32_t operand_id = get_operand_id(operand); -// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; -// std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; -// std::uint32_t address = base_address + offset_address; -// _llk_unpack_get_tile_(address, p_tile); -// } - -// template -// inline void llk_unpack_release_tile(std::uint32_t operand) { -// _llk_unpack_release_tile_(); -// } - -// inline void llk_unpack_debug_dump(std::uint8_t *data, std::uint32_t byte_size) { -// _llk_unpack_debug_dump_(data, byte_size); -// } - -// inline void llk_unpack_debug_dump_seek(std::uint8_t offset) { _llk_unpack_debug_dump_seek_(offset); } - -// template -// inline void llk_unpack_reconfig_data_format_srca(const std::uint32_t srca_new_operand) { -// const std::uint32_t srca_operand_id = get_operand_id(srca_new_operand); -// const std::uint32_t num_faces = get_operand_num_faces(srca_operand_id); -// const std::uint32_t face_r_dim = get_operand_face_r_dim(srca_operand_id); -// _llk_unpack_reconfig_data_format_srca_impl_( -// unpack_src_format[srca_operand_id], -// unpack_dst_format[srca_operand_id], -// cb_interface[srca_operand_id].fifo_page_size); -// } - -// template -// inline void llk_unpack_reconfig_data_format_srcb(const std::uint32_t srcb_new_operand) { -// std::uint32_t srcb_operand_id = get_operand_id(srcb_new_operand); -// const std::uint32_t num_faces = get_operand_num_faces(srcb_operand_id); -// const std::uint32_t face_r_dim = get_operand_face_r_dim(srcb_operand_id); -// _llk_unpack_reconfig_data_format_srcb_impl_( -// 
unpack_src_format[srcb_operand_id], -// unpack_dst_format[srcb_operand_id], -// cb_interface[srcb_operand_id].fifo_page_size); -// } - -// template -// inline void llk_unpack_reconfig_data_format_srca( -// const std::uint32_t srca_old_operand, const std::uint32_t srca_new_operand) { -// std::uint32_t old_srca_operand_id = get_operand_id(srca_old_operand); -// std::uint32_t new_srca_operand_id = get_operand_id(srca_new_operand); - -// if ((unpack_src_format[old_srca_operand_id] != unpack_src_format[new_srca_operand_id])) { -// llk_unpack_reconfig_data_format_srca(srca_new_operand); -// } else if constexpr (is_tile_dim_reconfig_en) { -// llk_unpack_reconfig_data_format_srca(srca_new_operand); -// } -// } - -// template -// inline void llk_unpack_reconfig_data_format_srcb( -// const std::uint32_t srcb_old_operand, const std::uint32_t srcb_new_operand) { -// std::uint32_t old_srcb_operand_id = get_operand_id(srcb_old_operand); -// std::uint32_t new_srcb_operand_id = get_operand_id(srcb_new_operand); - -// if ((unpack_src_format[old_srcb_operand_id] != unpack_src_format[new_srcb_operand_id])) { -// llk_unpack_reconfig_data_format_srcb(srcb_new_operand); -// } else if constexpr (is_tile_dim_reconfig_en) { -// llk_unpack_reconfig_data_format_srcb(srcb_new_operand); -// } -// } - -// template -// inline void llk_unpack_reconfig_data_format( -// const std::uint32_t srca_new_operand, const std::uint32_t srcb_new_operand) { -// llk_unpack_reconfig_data_format_srca(srca_new_operand); -// llk_unpack_reconfig_data_format_srcb(srcb_new_operand); -// } - -// template -// inline void llk_unpack_reconfig_data_format( -// const std::uint32_t srca_old_operand, -// const std::uint32_t srca_new_operand, -// const std::uint32_t srcb_old_operand, -// const std::uint32_t srcb_new_operand) { -// llk_unpack_reconfig_data_format_srca(srca_old_operand, srca_new_operand); -// llk_unpack_reconfig_data_format_srcb(srcb_old_operand, srcb_new_operand); -// } - -// inline void 
llk_unpack_dbg_feature_disable() { _llk_unpack_dbg_feature_disable_(); } - -// inline void llk_enable_int8_fpu_math() { _llk_enable_int8_fpu_math_(); } - -// // All TILE_SIZE related functions were deprecared in BBE for WH. The following is needed for pack_shifted so just -// // keeping here. -// // FIXME: Need to review and adjust accordingly -// constexpr static std::int32_t MUL_HEADERLESS_TILE_SIZE_AND_INDEX(uint format, uint index) { -// switch (format & 0x1F) { -// case ((uint8_t)DataFormat::Float32): return ((index << 8)); -// case ((uint8_t)DataFormat::Float16): -// case ((uint8_t)DataFormat::Float16_b): return ((index << 7)); -// case ((uint8_t)DataFormat::Bfp8): -// case ((uint8_t)DataFormat::Bfp8_b): return ((index << 6) + (index << 2)); -// case ((uint8_t)DataFormat::Bfp4): -// case ((uint8_t)DataFormat::Bfp4_b): return ((index << 5) + (index << 2)); -// case ((uint8_t)DataFormat::Bfp2): -// case ((uint8_t)DataFormat::Bfp2_b): return ((index << 4) + (index << 2)); -// case ((uint8_t)DataFormat::Int8): -// case ((uint8_t)DataFormat::Lf8): return ((index << 6)); -// // Keep default as Bfp8? 
-// default: return ((index << 6) + (index << 2)); -// }; -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h index 01a12122375..c68c94b6de9 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_reduce_api.h @@ -9,86 +9,3 @@ /************************************************************************* * LLK UNPACK REDUCE *************************************************************************/ - -// template -// inline void llk_unpack_reduce_hw_configure( -// const llk_unpack_reduce_params_t *unpack_reduce_params, const float const_mult) { - -// constexpr bool within_face_16x16_transpose = (ReduceDim::REDUCE_ROW == dim); - -// const std::uint32_t unpA_operand_id = get_operand_id(unpack_reduce_params->unpA_operand); -// const std::uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); -// const std::uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); - -// constexpr std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32; -// const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a -// ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? 
(std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16); - -// _llk_unpack_reduce_hw_configure_( -// unpack_src_format[unpA_operand_id], -// unpB_src_format, -// unpack_dst_format[unpA_operand_id], -// unpB_dst_format, -// unpA_face_r_dim, -// unpA_face_r_dim, -// within_face_16x16_transpose, -// unpA_num_faces, -// unpA_num_faces -// ); - -// if constexpr (type != PoolType::MAX) { -// union { -// float f; -// uint32_t u; -// } f2u = {.f = const_mult}; - -// for (uint i = 0; i < 16; i++) l1_buffer[i] = f2u.u; // Load const into L1 buffer -// } -// } - -// template -// inline void llk_unpack_reduce_hw_configure_disaggregated(const std::uint32_t unpA_operand, const float mult) { -// const llk_unpack_reduce_params_t unpack_reduce_params = {.unpA_operand = unpA_operand}; -// llk_unpack_reduce_hw_configure(&unpack_reduce_params, mult); -// } - -// template -// inline void llk_unpack_reduce_mop_config() { -// _llk_unpack_reduce_mop_config_(); -// } - -// template -// inline void llk_unpack_reduce_init(const std::uint32_t within_face_16x16_transpose=0) { - -// constexpr std::uint32_t unpA_operand_id = 0; - -// const std::uint32_t unpB_src_format = (std::uint32_t) DataFormat::Float32; -// const std::uint32_t unpB_dst_format = ((std::uint32_t)unpack_dst_format[unpA_operand_id] == (std::uint32_t) DataFormat::Int8) ? (std::uint32_t) DataFormat::Float16 : // Int8 is treated as fp16_a -// ((((std::uint32_t)unpack_dst_format[unpA_operand_id]>>2)&0x1) ? 
(std::uint32_t) DataFormat::Float16_b : (std::uint32_t) DataFormat::Float16); - -// cfg_reg_rmw_tensix(unpB_dst_format); - -// cfg_reg_rmw_tensix(unpB_src_format); -// cfg_reg_rmw_tensix(unpB_dst_format); - -// TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_address_ADDR32); -// TTI_WRCFG(p_gpr_unpack::L1_BUFFER_ADDR, p_cfg::WRCFG_32b, THCON_SEC1_REG3_Base_cntx1_address_ADDR32); -// TTI_NOP; TTI_NOP; - -// _llk_unpack_reduce_init_( -// within_face_16x16_transpose -// ); -// } - -// template -// inline void llk_unpack_reduce(const std::uint32_t operand, const std::uint32_t tile_index) { - -// std::uint32_t operand_id = get_operand_id(operand); -// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; -// std::uint32_t offset_address = cb_interface[operand_id].fifo_page_size * tile_index; -// std::uint32_t address = base_address + offset_address; - -// _llk_unpack_reduce_( -// address -// ); -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h index 59ede271732..7ac7b91b52e 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_tilize_api.h @@ -9,91 +9,3 @@ /************************************************************************* * LLK UNPACK TILIZE *************************************************************************/ - -// template -// inline void llk_unpack_tilize_hw_configure(const llk_unpack_A_params_t *unpack_tilize_params) { - -// constexpr bool within_face_16x16_transpose = false; -// constexpr StochRndType stoch_rnd_mode = StochRndType::None; - -// const uint32_t unpA_operand_id = get_operand_id(unpack_tilize_params->unpA_operand); -// const uint32_t unpA_num_faces = get_operand_num_faces(unpA_operand_id); -// const uint32_t unpA_face_r_dim = get_operand_face_r_dim(unpA_operand_id); - -// 
_llk_unpack_tilize_hw_configure_( -// unpack_src_format[unpA_operand_id], -// unpack_dst_format[unpA_operand_id], -// unpA_face_r_dim, -// within_face_16x16_transpose, -// unpA_num_faces -// ); -// } - - -// template -// inline void llk_unpack_tilize_hw_configure_disaggregated( -// const std::uint32_t unpA_operand) { -// const llk_unpack_A_params_t unpack_tilize_params = { -// .unpA_operand = unpA_operand -// }; -// llk_unpack_tilize_hw_configure(&unpack_tilize_params); -// } - -// inline void llk_unpack_tilize_mop_config(const std::uint32_t operand) { -// std::uint32_t operand_id = get_operand_id(operand); -// const bool narrow_tile = get_operand_narrow_tile(operand_id); -// _llk_unpack_tilize_mop_config_(narrow_tile); -// } - -// inline void llk_unpack_tilize_init(const std::uint32_t operand = 0, const std::uint32_t ct_dim = 0) { -// cfg_reg_rmw_tensix(0); - -// const std::uint32_t operand_id = get_operand_id(operand); -// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); -// const bool narrow_tile = get_operand_narrow_tile(operand_id); - -// // Save state of unpacker config for quick restore -// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0, THCON_SEC0_REG2_Out_data_format_ADDR32); // Save unpack config[0] -// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context - -// _llk_unpack_tilize_init_( -// unpack_src_format[operand_id], -// unpack_dst_format[operand_id], -// ct_dim, -// face_r_dim, -// narrow_tile -// ); - -// } - -// inline void llk_unpack_tilize_uninit(const std::uint32_t face_r_dim = FACE_R_DIM) { -// TT_SETADCXX(p_setadc::UNP_A, face_r_dim*FACE_C_DIM-1, 0x0); -// TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG2_Out_data_format_ADDR32+0-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_0); // Restore unpack config[0] -// TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::SR_UNPACK_TILIZER_STATE_1); // 
Restore tile x dim per context -// } - -// inline void llk_unpack_tilize(std::uint32_t operand, std::uint32_t tile_index, std::uint32_t block_ct_dim) { - -// std::uint32_t operand_id = get_operand_id(operand); -// const std::uint32_t face_r_dim = get_operand_face_r_dim(operand_id); -// const std::uint32_t num_faces = get_operand_num_faces(operand_id); -// const bool narrow_tile = get_operand_narrow_tile(operand_id); - -// std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; // Remove header size added by descriptor - -// _llk_unpack_tilize_( -// base_address, -// tile_index, -// unpack_src_format[operand_id], -// block_ct_dim, -// face_r_dim, -// num_faces, -// narrow_tile -// ); -// } - -// inline void llk_unpack_tilize_block(std::uint32_t operand, std::uint32_t block_c_tiles) { -// for (std::uint32_t tile_index = 0; tile_index < block_c_tiles; tile_index++) { -// llk_unpack_tilize(operand, tile_index, block_c_tiles); -// } -// } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h index dded559e94d..51f7b91e8bf 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_unpack_untilize_api.h @@ -9,88 +9,3 @@ // /************************************************************************* // * LLK UNPACK UNTILIZE // *************************************************************************/ -// template -// inline void llk_unpack_untilize_hw_configure(const llk_unpack_A_params_t *unpack_untilize_params) { -// constexpr bool is_row_pool = false; -// constexpr bool within_face_16x16_transpose = false; -// constexpr StochRndType stoch_rnd_mode = StochRndType::None; - -// const uint32_t unpA_operand_id = get_operand_id(unpack_untilize_params->unpA_operand); -// const uint32_t unpA_num_faces = 4; -// const uint32_t unpA_face_r_dim = FACE_R_DIM; - -// 
_llk_unpack_untilize_hw_configure_( -// unpack_src_format[unpA_operand_id], -// unpack_dst_format[unpA_operand_id], -// unpA_face_r_dim, -// within_face_16x16_transpose, -// unpA_num_faces -// ); -// } - -// inline void llk_unpack_untilize_hw_configure_disaggregated(const std::uint32_t unpA_operand) { -// const llk_unpack_A_params_t unpack_untilize_params = { -// .unpA_operand = unpA_operand, -// }; -// llk_unpack_untilize_hw_configure(&unpack_untilize_params); -// } - -// inline void llk_unpack_untilize_mop_config() { -// _llk_unpack_untilize_mop_config_(); -// } - -// inline void llk_unpack_untilize_init(std::uint32_t operand = 0) { -// const std::uint32_t operand_id = get_operand_id(operand); -// const std::uint32_t face_r_dim = 1; -// const std::uint32_t num_faces = get_operand_num_faces(operand_id); - -// // Save state of unpacker config for quick restore -// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_0, UNP0_ADDR_CTRL_XY_REG_1_Ystride_ADDR32); // Save unpack stride config -// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_1, THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32); // Save tile x dim per context -// TTI_RDCFG(p_gpr_unpack::SR_UNPACK_UNTILIZER_STATE_2, THCON_SEC0_REG0_TileDescriptor_ADDR32+1); // Save descriptor 1 - -// _llk_unpack_untilize_init_( -// unpack_dst_format[operand_id], -// cb_interface[operand_id].fifo_page_size, -// face_r_dim, -// num_faces -// ); -// } - -// inline void llk_unpack_untilize_uninit(const std::uint32_t operand, const std::uint32_t face_r_dim = FACE_R_DIM) { -// std::uint32_t operand_id = get_operand_id(operand); -// std::uint32_t unpA_ch1_x_stride = (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float32 ? 4 : (uint) (unpack_dst_format[operand_id]&0x3) == (uint) DataFormat::Float16 ? 
2 : 1; -// std::uint32_t unpA_ch1_y_stride = FACE_C_DIM*FACE_R_DIM*unpA_ch1_x_stride; - -// // Check that unpacker is done (all contexts freed up) before starting hw configuration -// wait_for_idle(); - -// // Reset address counters -// unpacker_addr_counter_init(); - -// // Wait for cfg to be free to edit -// TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK); - -// // Reset the values to default in unpack AB common. -// TT_SETADCXX(p_setadc::UNP_A, FACE_R_DIM*FACE_C_DIM-1, 0x0); -// TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG5_Tile_x_dim_cntx0_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_unpack::FACE_DIM_16x16); -// cfg_reg_rmw_tensix(1); -// cfg_reg_rmw_tensix(unpA_ch1_y_stride); -// TTI_NOP; TTI_NOP; // Do we need this for WH? -// } - -// template -// inline void llk_unpack_untilize_pass(std::uint32_t operand, std::uint32_t block_tile_cols) { -// const std::uint32_t operand_id = get_operand_id(operand); -// const std::uint32_t base_address = cb_interface[operand_id].fifo_rd_ptr - 1; - -// _llk_unpack_untilize_pass_( -// base_address, -// block_tile_cols -// ); -// } - -// inline void llk_unpack_untilize(std::uint32_t operand, std::uint32_t block_c_tiles) { -// llk_unpack_untilize_pass(operand, block_c_tiles); -// llk_unpack_untilize_pass(operand, block_c_tiles); -// } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h index f65a6b86ddd..7b504ae34ac 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h @@ -290,26 +290,4 @@ inline void llk_math_eltwise_unary_sfpu_silu_init() { llk_math_eltwise_unary_sfpu_init(); } -//Mask -template -inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index, vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_mask_init() { - 
llk_math_eltwise_unary_sfpu_init(); -} - -// Negative -template -inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) { - llk_math_eltwise_unary_sfpu(dst_index,vector_mode); -} - -template -inline void llk_math_eltwise_unary_sfpu_negative_init() { - llk_math_eltwise_unary_sfpu_init(); -} - } diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_identity.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_identity.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_sfpu_identity.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_identity.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h new file mode 100644 index 00000000000..c0f73fb172e --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "noc_nonblocking_api.h" + +#include "sfpi.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + + +template +inline void calculate_mask() +{ + bool exponent_size_8 = true; + for (int d = 0; d < ITERATIONS; d++) + { + vFloat mask = dst_reg[32]; + v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) { + dst_reg[0] = 0; + } + v_endif; + dst_reg++; + } +} +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h new file mode 100644 index 00000000000..503843211e7 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "noc_nonblocking_api.h" + +#include "sfpi.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + + +template +inline void calculate_negative() +{ + for (int d = 0; d < ITERATIONS; d++) + { + vFloat val = dst_reg[0]; + dst_reg[0] = -val; + dst_reg++; + } +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h index 06aa57e9e34..360c0c9c9a0 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_common_includes.h @@ -101,10 +101,6 @@ inline void llk_math_calculate_sfpu( calculate_signbit(); } else if constexpr (operation == SfpuType::silu) { calculate_silu(); - } else if constexpr (operation == SfpuType::mask) { - calculate_mask(); - } else if constexpr (operation == SfpuType::negative) { - calculate_negative(); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_identity.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_identity.h similarity index 88% rename from tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_identity.h rename to tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_identity.h index 0bd1d26c78d..e59defb4588 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/llk_lib/llk_math_eltwise_unary_sfpu_identity.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_identity.h @@ -15,10 +15,7 @@ namespace ckernel { // New LLK SFPU APIs template -inline void llk_math_eltwise_unary_sfpu_identity(uint dst_index, int 
vector_mode = Dim::RC) { - - constexpr bool zero_negative = true; - constexpr int first_iterations = 1; +inline void llk_math_eltwise_unary_sfpu_identity(uint dst_index, int vector_mode = (int)VectorMode::RC) { llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_identity, ckernel::sfpu::calculate_identity, diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h new file mode 100644 index 00000000000..c59ab659106 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + + +#include "llk_math_eltwise_unary_sfpu_common_includes.h" +#include "llk_math_eltwise_unary_sfpu_init.h" +#include "llk_math_eltwise_unary_sfpu_0_param.h" +#include "ckernel_sfpu_mask.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_unary_sfpu_mask_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, int vector_mode = (int)VectorMode::RC) { + constexpr int first_iterations = 1; + llk_math_eltwise_unary_sfpu_0_param + (ckernel::sfpu::calculate_mask, + ckernel::sfpu::calculate_mask, + dst_index, vector_mode); +} + +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h new file mode 100644 index 00000000000..54f8c84ce6b --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + + +#include "llk_math_eltwise_unary_sfpu_common_includes.h" +#include "llk_math_eltwise_unary_sfpu_init.h" +#include "llk_math_eltwise_unary_sfpu_0_param.h" +#include "ckernel_sfpu_negative.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_unary_sfpu_negative_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = (int)VectorMode::RC) { + llk_math_eltwise_unary_sfpu_0_param + (ckernel::sfpu::calculate_negative, + ckernel::sfpu::calculate_negative, + dst_index, vector_mode); +} + +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h index ec50f756429..d5f48028601 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_sqrt.h @@ -14,7 +14,6 @@ namespace ckernel { template inline void llk_math_eltwise_unary_sfpu_sqrt(uint dst_index, int vector_mode = (int)VectorMode::RC) { - constexpr bool zero_negative = true; constexpr int first_iterations = 1; llk_math_eltwise_unary_sfpu_0_param (ckernel::sfpu::calculate_sqrt, diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h index 4c059e37585..6eb95e8c730 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/metal_ckernel_sfpu.h @@ -655,20 +655,6 @@ inline void cast_fp32_to_fp16a() } } - - -template -inline void calculate_negative() -{ - - for (int d = 0; d < ITERATIONS; d++) - { - vFloat val = dst_reg[0]; - dst_reg[0] = -val; - 
dst_reg++; - } -} - template inline void calculate_add1() { @@ -775,20 +761,5 @@ inline void calculate_silu() } } -template -inline void calculate_mask() -{ - bool exponent_size_8 = true; - for (int d = 0; d < ITERATIONS; d++) - { - vFloat mask = dst_reg[32]; - v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) { - dst_reg[0] = 0; - } - v_endif; - dst_reg++; - } -} - } // namespace sfpu } // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/negative.h b/tt_metal/include/compute_kernel_api/eltwise_unary/negative.h index bb65c153f49..b74823e4927 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_unary/negative.h +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/negative.h @@ -9,7 +9,7 @@ #include "compute_kernel_api/common_globals.h" #ifdef TRISC_MATH -#include "llk_math_eltwise_unary_sfpu.h" +#include "llk_math_eltwise_unary_sfpu_negative.h" #define MAIN math_main() #define MATH(x) x #else diff --git a/tt_metal/include/compute_kernel_api/mask.h b/tt_metal/include/compute_kernel_api/mask.h index 9b8a75999f7..2d3f370a788 100644 --- a/tt_metal/include/compute_kernel_api/mask.h +++ b/tt_metal/include/compute_kernel_api/mask.h @@ -9,7 +9,7 @@ #include "compute_kernel_api/common_globals.h" #ifdef TRISC_MATH -#include "llk_math_eltwise_unary_sfpu.h" +#include "llk_math_eltwise_unary_sfpu_mask.h" #define MAIN math_main() #define MATH(x) x #else diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 80efc1fe988..8a0285268e5 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -103,11 +103,14 @@ void JitBuildEnv::init(uint32_t device_id, tt::ARCH arch) "-I" + this->root_ + "tt_metal " + "-I" + this->root_ + "tt_metal/include " + "-I" + this->root_ + "tt_metal/hw/inc " + + "-I" + this->root_ + "tt_metal/hw/inc/debug " + "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + " " + "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + "/" + this->arch_name_ + 
"_defines " + "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + "/noc " + "-I" + this->root_ + "tt_metal/third_party/umd/device/" + this->arch_name_ + " " + // TODO(fixme) - "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/common/inc "; // TODO(fixme) datamovement fw shouldn't read this + "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/common/inc " + // TODO(fixme) datamovement fw shouldn't read this + "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/metal/common " + + "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/metal/llk_io "; this->lflags_ = common_flags; this->lflags_ += "-fno-exceptions -Wl,-z,max-page-size=16 -Wl,-z,common-page-size=16 -nostartfiles "; @@ -160,7 +163,9 @@ JitBuildDataMovement::JitBuildDataMovement(const JitBuildEnv& env, int which, bo "-Os " + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops this->includes_ = env_.includes_ + - "-I " + env_.root_ + "tt_metal/hw/firmware/src "; + "-I " + env_.root_ + "tt_metal/hw/firmware/src " + + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + + "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io "; this->defines_ = env_.defines_; @@ -230,6 +235,10 @@ JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, int which, bool is_fw) this->includes_ = env_.includes_ + "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/inc " + "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/llk_lib " + + "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + + "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io " + + "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_api " + + "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_api/llk_sfpu " + "-I" + env_.root_ + "tt_metal/third_party/sfpi/include " + "-I" + env_.root_ 
+ "tt_metal/hw/firmware/src "; @@ -238,7 +247,6 @@ JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, int which, bool is_fw) this->srcs_.push_back("tt_metal/hw/firmware/src/trisc.cc"); this->srcs_.push_back("tt_metal/hw/toolchain/tmu-crt0.S"); } else { - this->srcs_.push_back("tt_metal/hw/ckernels/" + env_.arch_name_ + "/common/src/ckernel_template.cc"); this->srcs_.push_back("tt_metal/hw/firmware/src/trisck.cc"); this->srcs_.push_back("tt_metal/hw/toolchain/tmu-crt0k.S"); } @@ -305,7 +313,9 @@ JitBuildEthernet::JitBuildEthernet(const JitBuildEnv& env, int which, bool is_fw } this->includes_ = env_.includes_ + - "-I " + env_.root_ + "tt_metal/hw/inc/ethernet "; + "-I " + env_.root_ + "tt_metal/hw/inc/ethernet " + + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + + "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io "; this->srcs_.push_back("tt_metal/hw/toolchain/substitutes.cpp"); if (this->is_fw_) { From 525692513edcc4b221d7ca096a309d054d453435 Mon Sep 17 00:00:00 2001 From: Reem Tawfik Date: Tue, 12 Dec 2023 23:14:46 +0000 Subject: [PATCH 14/16] #3908: Remove perf kernels, add sfpu loop unroll --- .../llk_api/llk_sfpu/ckernel_sfpu_mask.h | 1 + .../llk_api/llk_sfpu/ckernel_sfpu_negative.h | 2 +- .../llk_math_eltwise_unary_sfpu_negative.h | 3 +- .../wormhole_b0/common/inc/ckernel_perf_api.h | 162 ---------- .../common/inc/ckernel_perf_include.h | 32 -- .../common/inc/ckernel_perf_math.h | 283 ------------------ .../common/inc/ckernel_perf_unpack_pack.h | 162 ---------- .../llk_api/llk_sfpu/ckernel_sfpu_mask.h | 1 + .../llk_api/llk_sfpu/ckernel_sfpu_negative.h | 1 + 9 files changed, 5 insertions(+), 642 deletions(-) delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h delete mode 100644 tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h delete mode 100644 
tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h index 302cb97c934..2dcd2a5d63e 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h @@ -20,6 +20,7 @@ template inline void calculate_mask() { bool exponent_size_8 = true; + #pragma GCC unroll 4 for (int d = 0; d < ITERATIONS; d++) { vFloat mask = dst_reg[16]; diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h index fd9cfef2da6..136877237ab 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h @@ -19,7 +19,7 @@ namespace sfpu { template inline void calculate_negative() { - + #pragma GCC unroll 4 for (int d = 0; d < ITERATIONS; d++) { vFloat val = dst_reg[0]; diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h index 5badb47497b..fd6ca275adc 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_negative.h @@ -21,9 +21,8 @@ inline void llk_math_eltwise_unary_sfpu_negative_init() { template inline void llk_math_eltwise_unary_sfpu_negative(uint dst_index, int vector_mode = Dim::RC) { - constexpr int first_iterations = 1; llk_math_eltwise_unary_sfpu_0_param - (ckernel::sfpu::calculate_negative, + (ckernel::sfpu::calculate_negative, ckernel::sfpu::calculate_negative, dst_index, vector_mode); } diff --git 
a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h deleted file mode 100644 index 9bfa79f6934..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_api.h +++ /dev/null @@ -1,162 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include "ckernel_include.h" -#include "ckernel_globals.h" -#include "ckernel.h" -#include "tensix.h" -#include "fw_debug.h" -#include "epoch.h" - -#ifdef PERF_DUMP -#include "perf_lib/scratch_api.h" -#include "perf_res_decouple.h" -#include "ckernel_perf_math.h" -#include "ckernel_perf_unpack_pack.h" -#endif - -#ifndef INTERMED_DUMP -#define INTERMED_DUMP 0 -#endif - -#pragma GCC diagnostic ignored "-Wunused-function" - -// Comment in/out to enable perf scratch even logging - -namespace ckernel -{ -extern uint32_t perf_index; -extern uint32_t perf_end; -// Perf-buffer are double buffered for spill_to_dram. -// Ncrisc will move one half to dram while trisc populates the other half. -// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0]. -extern volatile uint32_t *perf_buf_base[2]; -// Selects the half of perf_buffer that trisc is currently writing into. 
-extern uint8_t perf_buf_base_id; -extern bool record_perf_events; -extern uint32_t perf_events_target_idx; -extern uint16_t current_outer_loop_iter; -extern uint8_t thread_id; -extern bool first_unpack_recorded; - -inline void set_perf_dump_flag_for_input(int input_idx) { - #ifdef PERF_DUMP - TT_LLK_DUMP("set_perf_dump_flag_for_input({})", input_idx); - if (perf_events_target_inputs[perf_events_target_idx] == input_idx) { - record_perf_events = true; - perf_events_target_idx++; - if constexpr(PERF_DUMP_CONCURRENT == 0 && INTERMED_DUMP == 0) { - if (thread_id == 0 || thread_id == 2) { - perf_end += num_events_per_input; - // The buffer size available for each thread after double buffering is (l1_mem::address_map::TRISC_PERF_BUF_SIZE)/2. - // Max number of events we can record in each half of the buffer will be that size divided by 4, since each event will be 4 bytes. - if (perf_end > (TRISC_PERF_BUF_SIZE >> 2)) { - perf_end = TRISC_PERF_BUF_SIZE >> 2; - } - } - } - current_outer_loop_iter = input_idx; - } else { - record_perf_events = false; - } - first_unpack_recorded = false; - #endif -} - -inline void record_pack_input_init_timestamp() { - #ifdef PERF_DUMP - TT_LLK_DUMP("record_pack_input_init_timestamp()"); - if (record_perf_events) { - uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::PACK_EACH_INPUT, current_outer_loop_iter); - record_timestamp_64b(event_id); - } - #endif -} - -void record_pack_input_end_timestamp() { - #ifdef PERF_DUMP - TT_LLK_DUMP("record_pack_input_end_timestamp()"); - if (record_perf_events) { - uint32_t event_id = perf::get_event_id(0, 0, perf::EventType::PACK_EACH_INPUT, current_outer_loop_iter); - record_timestamp_64b(event_id); - if (perf_events_target_idx == 1) { - uint32_t event_id_num_tiles_pack = perf::get_event_id(0, 0, perf::EventType::NUM_TILES_PACK, current_outer_loop_iter); - uint16_t num_tiles = regfile[p_gpr_pack::PERF_PACK_NUM_TILES] & 0xffff; - record_perf_value_and_check_overflow(event_id_num_tiles_pack, 
num_tiles, 0); - } - } - #endif -} - -inline void perf_math_counter_start() { - #ifdef PERF_DUMP - TT_LLK_DUMP("perf_math_counter_start()"); - if constexpr(SKIP_UNP) { - TTI_SETDVALID(p_setrwc::SET_A); - TTI_SETDVALID(p_setrwc::SET_B); - } - if (record_perf_events) { - // Due to a race condition that corrupts the write address of the fpu counters, reprogram them for every input - dbg_enable_dump_to_mem((uint32_t)&perf_buf_base[perf_buf_base_id][perf_index], (uint32_t)&perf_buf_base[perf_buf_base_id][perf_end]); - start_fpu_perf_cnt(); - } - #endif -} - -inline void record_perf_math_counter() { - #ifdef PERF_DUMP - TT_LLK_DUMP("record_perf_math_counter()"); - if constexpr(SKIP_UNP) { - TTI_CLEARDVALID(0x1, 0); - TTI_CLEARDVALID(0x2, 0); - } - if (record_perf_events) { - stop_fpu_perf_cnt(); - // record_fpu_perf_cnt_value(); - } - #endif -} - -void record_unpack_num_tiles() { - #ifdef PERF_DUMP - if (perf_events_target_idx == 1) { - for (uint8_t operand = 0; operand < PERF_MAX_NUM_INPUTS; operand++) { - uint regfile_base_idx = p_gpr_unpack::PERF_UNPACK_NUM_TILES_0; - regfile_base_idx += (operand >> 1); - bool upper = operand & 0b1; - uint16_t num_tiles; - if (upper) { - num_tiles = (regfile[regfile_base_idx] >> 16) & 0xffff; - } else { - num_tiles = regfile[regfile_base_idx] & 0xffff; - } - if (num_tiles != 0) { - uint32_t event_id_num_tiles_unpack = perf::get_event_id(operand, 0, perf::EventType::NUM_TILES_UNPACK, current_outer_loop_iter); - record_perf_value_and_check_overflow(event_id_num_tiles_unpack, num_tiles, 0); - } - } - } - #endif -} - -void record_unpack_first_instruction_timestamp() { - #ifdef PERF_DUMP - TT_LLK_DUMP("record_unpack_first_instruction_timestamp()"); - if (record_perf_events) { - uint32_t clock_lo = regfile[p_gpr_unpack::PERF_FIRST_UNP_LO]; - uint32_t clock_hi = regfile[p_gpr_unpack::PERF_FIRST_UNP_HI]; - uint32_t event_id_last_wait_tile = perf::get_event_id(0, 0, perf::EventType::UNPACK_FIRST_INSTRUCTION, current_outer_loop_iter); - 
record_perf_value_and_check_overflow(event_id_last_wait_tile, clock_lo, clock_hi); - if (perf_events_target_idx == 1) { - record_unpack_num_tiles(); - } - } - #endif -} - -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h deleted file mode 100644 index d9ff57a5403..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_include.h +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#ifdef PERF_DUMP -#include - -#include "perf_events_target_inputs.h" -#include "perf_lib/scratch_api.h" - -#ifndef INTERMED_DUMP -#define INTERMED_DUMP 0 -#endif - -#ifndef PERF_DUMP_CONCURRENT -#define PERF_DUMP_CONCURRENT 0 -#endif - -#pragma GCC diagnostic ignored "-Wunused-function" - -static constexpr uint32_t PERF_DUMP_END_SIGNAL = 0xbeeff00d; -static constexpr uint32_t PERF_CNT_DUMP_ENTRY_SIZE = 16; // Entry size in bytes - -#if PERF_DUMP_LEVEL == 0 -static constexpr int32_t TRISC_PERF_BUF_SIZE = l1_mem::address_map::UNPACK_PACK_PERF_BUF_SIZE_LEVEL_0; -#else -static constexpr int32_t TRISC_PERF_BUF_SIZE = l1_mem::address_map::UNPACK_PACK_PERF_BUF_SIZE_LEVEL_1; -#endif - -#endif diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h deleted file mode 100644 index 812f5cc9884..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_math.h +++ /dev/null @@ -1,283 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include "ckernel_include.h" -#include "ckernel_globals.h" -#include "ckernel.h" -#include "tensix.h" -#include "fw_debug.h" -#include "epoch.h" - -#include "ckernel_perf_include.h" - -#ifndef INTERMED_DUMP -#define INTERMED_DUMP 0 -#endif - -#pragma GCC diagnostic ignored "-Wunused-function" - -// Comment in/out to enable perf scratch even logging - -namespace ckernel -{ -extern uint32_t perf_index; -extern uint32_t perf_end; -// Perf-buffer are double buffered for spill_to_dram. -// Ncrisc will move one half to dram while trisc populates the other half. -// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0]. -extern volatile uint32_t *perf_buf_base[2]; -// Selects the half of perf_buffer that trisc is currently writing into. -extern uint8_t perf_buf_base_id; -extern uint16_t current_outer_loop_iter; -extern uint8_t thread_id; -extern uint32_t perf_events_target_idx; - -// In math thread, THCON dumps perf buffers in l1. -// Therefore, incrementing the ncrisc perf_dram_buffer_req must be done by THCON as well. -// Flipping the l1 perf start address must also be done by THCON for math thread. -// Following variable keeps track of latest value of perf_dram_copy_req[1] from trisc perspective. -// The actual value might be different, because the queued THCON updates for perf_dram_copy_req[1] might have yet not been executed. -// We read this value initially for all threads to reduce the l1-reads. 
-extern int32_t dram_dump_req_local; - -struct cperf_cnt_mode -{ - constexpr static uint32_t PERF_CNT_MODE_FREE = 0; // Free running period counter - constexpr static uint32_t PERF_CNT_MODE_STOP = 1; // Stop counter - constexpr static uint32_t PERF_CNT_MODE_WRAP = 2; // Wrap period counter -}; - -struct cperf_cnt_block_sel -{ - constexpr static uint32_t PERF_CNT_INSTR_THREAD = 0; // Select all instruction thread perf counters(includes TDMA) - constexpr static uint32_t PERF_CNT_FPU = 1; // Select FPU perf counters - constexpr static uint32_t PERF_CNT_L1 = 2; // Select L1 perf counters - constexpr static uint32_t PERF_CNT_ALL = 3; // Select all perf counters -}; - -struct cperf_dbg_daisy_id -{ - constexpr static uint32_t DEBUG_DAISY_INSTRN_THREAD = 1; // Thread specific perf counters - constexpr static uint32_t DEBUG_DAISY_INSTRN_ISSUE_0 = 4; // TDMA+math - constexpr static uint32_t DEBUG_DAISY_INSTRN_ISSUE_1 = 5; // math+instruction issue - constexpr static uint32_t DEBUG_DAISY_TENSIX = 7; // FPU and L1 perf counters -}; - -struct cperf_dbg_dump_to_mem_mode -{ - constexpr static uint32_t DEBUG_MEM_MODE_MANUAL_WR = 0; - constexpr static uint32_t DEBUG_MEM_MODE_AUTO_WR = 1; - constexpr static uint32_t DEBUG_MEM_MODE_MANUAL_RD = 2; - constexpr static uint32_t DEBUG_MEM_MODE_AUTO_RD = 3; -}; - -inline void set_perf_cnt_params(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU, uint32_t ref_period=0xffffffff, uint32_t mode=cperf_cnt_mode::PERF_CNT_MODE_FREE) { - uint32_t perf_cnt_ref_period_reg; - switch (block_sel) { - case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD: perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD0; break; - case cperf_cnt_block_sel::PERF_CNT_L1: perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_L1_0; break; - default: perf_cnt_ref_period_reg = RISCV_DEBUG_REG_PERF_CNT_FPU0; - } - reg_write(perf_cnt_ref_period_reg, ref_period); - reg_write(perf_cnt_ref_period_reg+4, 0x00010100); -} - -inline void stop_perf_cnt(uint32_t 
block_sel=cperf_cnt_block_sel::PERF_CNT_FPU) { - uint32_t perf_cnt_cntl_reg; - switch (block_sel) { - case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD2; break; - case cperf_cnt_block_sel::PERF_CNT_L1: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_L1_2; break; - case cperf_cnt_block_sel::PERF_CNT_ALL: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_ALL; break; - default: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_FPU2; - } - reg_write(perf_cnt_cntl_reg, 0x00000002); - reg_write(perf_cnt_cntl_reg, 0x00000000); -} - -template -inline void stop_fpu_perf_cnt() { - if (perf_events_target_idx <= 1) { - if constexpr (use_tensix) { - if constexpr (stall_on_math) { - TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH); - } - TTI_STOREREG(p_gpr_math::PERF_CNT_STOP, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff); - TTI_STOREREG(p_gpr::ZERO, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff); - } else { - reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000002); - reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000000); - } - } -} - -inline void start_perf_cnt(uint32_t block_sel=cperf_cnt_block_sel::PERF_CNT_FPU) { - uint32_t perf_cnt_cntl_reg; - switch (block_sel) { - case cperf_cnt_block_sel::PERF_CNT_INSTR_THREAD: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_INSTRN_THREAD2; break; - case cperf_cnt_block_sel::PERF_CNT_L1: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_L1_2; break; - case cperf_cnt_block_sel::PERF_CNT_ALL: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_ALL; break; - default: perf_cnt_cntl_reg = RISCV_DEBUG_REG_PERF_CNT_FPU2; - } - reg_write(perf_cnt_cntl_reg, 0x00000001); - reg_write(perf_cnt_cntl_reg, 0x00000000); -} - -template -inline void start_fpu_perf_cnt() { - if (perf_events_target_idx <= 1) { - if constexpr (use_tensix) { - TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH); - TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::THCON); - TTI_STOREREG(p_gpr_math::PERF_CNT_START, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff); - 
TTI_STOREREG(p_gpr::ZERO, (RISCV_DEBUG_REG_PERF_CNT_FPU2>>2)&0x3ffff); - } else { - reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000001); - reg_write(RISCV_DEBUG_REG_PERF_CNT_FPU2, 0x00000000); - } - } -} - - -inline void sel_fpu_perf_cnt(uint32_t cnt_id) { - riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl; - dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG);; - dbg_bus_cntl.f.dbg_daisy_sel = cperf_dbg_daisy_id::DEBUG_DAISY_TENSIX; - dbg_bus_cntl.f.dbg_sig_sel = 0x0; - dbg_bus_cntl.f.dbg_rd_sel = cnt_id<<1; //rd_sel is aligned to 16-bit - reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val); -} - -// Return value of the selected perf counter -inline uint32_t get_perf_cnt() { - return reg_read(RISCV_DEBUG_REG_DBG_RD_DATA); -} - -template -inline void dump_perf_cnt_to_mem() { - if constexpr (use_tensix) { - TTI_STOREREG(p_gpr_math::PERF_MEM_DUMP_CNTL_SET, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2>>2)&0x3ffff); - TTI_STOREREG(p_gpr_math::PERF_MEM_DUMP_CNTL_CLEAR, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2>>2)&0x3ffff); - } else { - riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2; - dbg_l1_mem_reg2.val = 0; - dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR; - dbg_l1_mem_reg2.f.mem_write = 1; - reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val); - dbg_l1_mem_reg2.f.mem_write = 0; - reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val); - } -} - -inline void dbg_daisy_enable() { - riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl; - dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG); - dbg_bus_cntl.f.dbg_reg_ovrd_en = 0x1; - dbg_bus_cntl.f.dbg_daisy_en = 0x1; - reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, dbg_bus_cntl.val); -} - -inline void dbg_daisy_disable() { - riscv_debug_reg_dbg_dbus_cntl_u dbg_bus_cntl; - dbg_bus_cntl.val = reg_read(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG); - dbg_bus_cntl.f.dbg_reg_ovrd_en = 0x0; - dbg_bus_cntl.f.dbg_daisy_en = 0x0; - reg_write(RISCV_DEBUG_REG_DBG_BUS_CNTL_REG, 
dbg_bus_cntl.val); -} - -inline void dbg_enable_dump_to_mem(uint32_t start_addr, uint32_t end_addr) { - - TTI_STALLWAIT(p_stall::STALL_THCON, p_stall::MATH); - uint32_t start_addr_lo = (start_addr >> 4) & 0xffff; - uint32_t start_addr_hi = (start_addr >> 4) >> 16; - TT_SETDMAREG(0, start_addr_lo, 0, LO_16(p_gpr_math::TMP0)); - TT_SETDMAREG(0, start_addr_hi, 0, HI_16(p_gpr_math::TMP0)); - TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG0 >> 2) & 0x3ffff); - - uint32_t end_addr_lo = (end_addr >> 4) & 0xffff; - uint32_t end_addr_hi = (end_addr >> 4) >> 16; - TT_SETDMAREG(0, end_addr_lo, 0, LO_16(p_gpr_math::TMP0)); - TT_SETDMAREG(0, end_addr_hi, 0, HI_16(p_gpr_math::TMP0)); - TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG1 >> 2) & 0x3ffff); - - // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG0, start_addr>>4); - // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG1, end_addr>>4); - riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2; - dbg_l1_mem_reg2.val = 0; - dbg_l1_mem_reg2.f.mem_dump_mode = 0xf; //invalid and overriden below to trigger pulse needed to latch start address - dbg_l1_mem_reg2.f.skip_cycles = 0; - - uint32_t debug_l1_reg2_lo = dbg_l1_mem_reg2.val & 0xffff; - uint32_t debug_l1_reg2_hi = (dbg_l1_mem_reg2.val >> 16) & 0xffff; - TT_SETDMAREG(0, debug_l1_reg2_lo, 0, LO_16(p_gpr_math::TMP0)); - TT_SETDMAREG(0, debug_l1_reg2_hi, 0, HI_16(p_gpr_math::TMP0)); - TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2 >> 2) & 0x3ffff); - - - // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val); - dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR; // This value must change in order to latch new start address!!! 
- // reg_write(RISCV_DEBUG_REG_DBG_L1_MEM_REG2, dbg_l1_mem_reg2.val); - - debug_l1_reg2_lo = dbg_l1_mem_reg2.val & 0xffff; - debug_l1_reg2_hi = (dbg_l1_mem_reg2.val >> 16) & 0xffff; - TT_SETDMAREG(0, debug_l1_reg2_lo, 0, LO_16(p_gpr_math::TMP0)); - TT_SETDMAREG(0, debug_l1_reg2_hi, 0, HI_16(p_gpr_math::TMP0)); - TTI_STOREREG(p_gpr_math::TMP0, (RISCV_DEBUG_REG_DBG_L1_MEM_REG2 >> 2) & 0x3ffff); - - TTI_STALLWAIT(p_stall::STALL_MATH, p_stall::THCON); -} - -template -inline void record_fpu_perf_cnt_value() { - // if (perf_events_target_idx <= 1) { - // // In l1 mode always reserve last event for PERF_DUMP_END_SIGNAL. - // uint32_t reserve_space_for_trisc_end_signal = 1; - // if (perf_index + 3 <= perf_end-reserve_space_for_trisc_end_signal) { // Last event is always set to a default. - // //perf_buf_base[perf_index] = get_perf_cnt(); - // //perf_buf_base[perf_index + 1] = get_perf_cnt(); - // dump_perf_cnt_to_mem(); //Dump 16B to L1 - // perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)); - // } - // } -} - -// Dump a dummy math event to get the initial fpu counter value. -inline void record_dummy_math_event() { - if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { - uint32_t reserve_space_for_trisc_end_signal = 1; - if (perf_index + 3 <= perf_end-reserve_space_for_trisc_end_signal) { // Last event is always set to a default. 
- perf_buf_base[perf_buf_base_id][perf_index] = 0; - perf_buf_base[perf_buf_base_id][perf_index+1] = 0; - perf_buf_base[perf_buf_base_id][perf_index+2] = 0; - perf_buf_base[perf_buf_base_id][perf_index+3] = 0; - perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)); - } - } -} - -inline void setup_fpu_perf_cnt() { - // Only program perf counters for math thread (trisc1) - if ((uint32_t) __firmware_start == (uint32_t)l1_mem::address_map::TRISC1_BASE) { - set_perf_cnt_params(cperf_cnt_block_sel::PERF_CNT_FPU,0xffffffff,cperf_cnt_mode::PERF_CNT_MODE_FREE); - sel_fpu_perf_cnt(0); - dbg_daisy_enable(); - dbg_enable_dump_to_mem((uint32_t)&perf_buf_base[0][PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)], (uint32_t)&perf_buf_base[0][perf_end]); - - riscv_debug_reg_dbg_l1_mem_reg2_u dbg_l1_mem_reg2; - dbg_l1_mem_reg2.val = 0; - dbg_l1_mem_reg2.f.mem_dump_mode = cperf_dbg_dump_to_mem_mode::DEBUG_MEM_MODE_AUTO_WR; - dbg_l1_mem_reg2.f.mem_write = 0; - regfile[p_gpr_math::PERF_MEM_DUMP_CNTL_CLEAR]=dbg_l1_mem_reg2.val; - dbg_l1_mem_reg2.f.mem_write = 1; - regfile[p_gpr_math::PERF_MEM_DUMP_CNTL_SET]=dbg_l1_mem_reg2.val; - - regfile[p_gpr_math::PERF_CNT_START]=0x1; - regfile[p_gpr_math::PERF_CNT_STOP]=0x2; - sync_regfile_write(p_gpr_math::PERF_CNT_STOP); - } -} -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h b/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h deleted file mode 100644 index 9a2b21b4756..00000000000 --- a/tt_metal/hw/ckernels/wormhole_b0/common/inc/ckernel_perf_unpack_pack.h +++ /dev/null @@ -1,162 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include "ckernel_include.h" -#include "ckernel_globals.h" -#include "ckernel.h" -#include "tensix.h" -#include "fw_debug.h" -#include "epoch.h" - -#include "ckernel_perf_include.h" - -#pragma GCC diagnostic ignored "-Wunused-function" - -// Comment in/out to enable perf scratch even logging - -namespace ckernel -{ -extern uint32_t perf_index; -extern uint32_t perf_end; -// Perf-buffer are double buffered for spill_to_dram. -// Ncrisc will move one half to dram while trisc populates the other half. -// When INTERMED_DUMP = 0, we only dump into perf_buf_base[0]. -extern volatile uint32_t *perf_buf_base[2]; -// Selects the half of perf_buffer that trisc is currently writing into. -extern uint8_t perf_buf_base_id; -extern uint8_t thread_id; - -// In math thread, THCON dumps perf buffers in l1. -// Therefore, incrementing the ncrisc perf_dram_buffer_req must be done by THCON as well. -// Flipping the l1 perf start address must also be done by THCON for math thread. -// Following variable keeps track of latest value of perf_dram_copy_req[1] from trisc perspective. -// The actual value might be different, because the queued THCON updates for perf_dram_copy_req[1] might have yet not been executed. -// We read this value initially for all threads to reduce the l1-reads. -extern int32_t dram_dump_req_local; -extern bool record_perf_events; -extern uint32_t perf_events_target_idx; -extern bool first_unpack_recorded; -extern volatile uint * ncrisc_ack_addr; -extern uint16_t current_outer_loop_iter; -#if OVERLAY_DECOUPLE == 1 -extern uint8_t overlay_output_decouple_mask; -#endif - -void allocate_perf_buffer(); - -// This function gets called when half-perf-buffer is full and need to switch. -// Only used for threads 0 and 2. -// For thread 1 a different function is used: switch_perf_buffers_for_math_thread -// If ncrisc has not yet finished dumping the next half of perf-buffer, trisc will stall. 
-// If is_perf_end_signal is true, we just need to write the PERF_DUMP_END_SIGNAL. -// This function should only get executed in INTERMED_DUMP mode. -void switch_perf_buffers(); -void last_trisc_perf_dump_to_dram(); - -// The two following functions are separated to avoid inline recursive function calls. -// TODO: Check the behaviour of the compiler if the two following functions were merged into a template function. -inline void record_perf_value(uint32_t event_id, uint32_t event_value_lo_32b, uint32_t event_value_hi_32b) { - perf_buf_base[perf_buf_base_id][perf_index] = event_id; - perf_buf_base[perf_buf_base_id][perf_index + 1] = event_value_hi_32b; - perf_buf_base[perf_buf_base_id][perf_index + 2] = event_value_lo_32b; - perf_index += 3; -} - -inline void record_perf_dump_end() { - if (perf_index < perf_end) { - perf_buf_base[perf_buf_base_id][perf_index] = PERF_DUMP_END_SIGNAL; - perf_index += 1; - } -#if PERF_DUMP_CONCURRENT == 1 - if (perf_index < perf_end) { - perf_buf_base[perf_buf_base_id][perf_end - 1] = PERF_DUMP_END_SIGNAL; - } -#endif -} - -inline void record_perf_value_and_check_overflow(uint32_t event_id, uint32_t event_value_lo_32b, uint32_t event_value_hi_32b, uint32_t leave_space = 0) { - // In l1 mode always reserve the last event for PERF_DUMP_END_SIGNAL. 
- int reserve_space_for_trisc_end_signal = 1; - -#if (INTERMED_DUMP == 1) || (PERF_DUMP_CONCURRENT == 1) - leave_space = 0; - reserve_space_for_trisc_end_signal = 0; - if (perf_index + 2 >= perf_end - reserve_space_for_trisc_end_signal - leave_space) { - switch_perf_buffers(); - } - record_perf_value(event_id, event_value_lo_32b, event_value_hi_32b); -#else - if (perf_index + 2 < perf_end - reserve_space_for_trisc_end_signal - leave_space) { - record_perf_value(event_id, event_value_lo_32b, event_value_hi_32b); - } -#endif -} - -inline void record_timestamp_64b(uint event_id, uint leave_space = 0) { - if (record_perf_events) { - uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); - uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); - record_perf_value_and_check_overflow(event_id, timestamp_low, timestamp_high, leave_space); - } -} - -inline void record_perf_dump_end_and_check_overflow() { - if (thread_id == 1) { - uint32_t reserve_space_for_trisc_end_signal = 1; - if (perf_index + 3 <= perf_end-reserve_space_for_trisc_end_signal) { // Last event is always set to a default. 
- perf_buf_base[perf_buf_base_id][perf_index] = reg_read(0xFFB12000 + 0x120); - perf_buf_base[perf_buf_base_id][perf_index+1] = reg_read(0xFFB12000 + 0x124); - perf_buf_base[perf_buf_base_id][perf_index+2] = 0; - perf_buf_base[perf_buf_base_id][perf_index+3] = 0; - perf_index += (PERF_CNT_DUMP_ENTRY_SIZE/sizeof(uint32_t)); - } - } - -#if (INTERMED_DUMP == 1) || (PERF_DUMP_CONCURRENT == 1) - if (perf_index >= perf_end) { - switch_perf_buffers(); - } - record_perf_dump_end(); -#else - if (perf_index < perf_end) { - record_perf_dump_end(); - } -#endif -} - -inline void record_latest_wait_for_tile() { -#if defined(PERF_DUMP) - if (!first_unpack_recorded) { - uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); - uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); - regfile[p_gpr_unpack::PERF_FIRST_UNP_LO] = timestamp_low & 0xffffffff; - sync_regfile_write(p_gpr_unpack::PERF_FIRST_UNP_LO); - regfile[p_gpr_unpack::PERF_FIRST_UNP_HI] = timestamp_high & 0xffffffff; - sync_regfile_write(p_gpr_unpack::PERF_FIRST_UNP_HI); - } -#endif -} - -void increment_unpack_tiles(uint operand_idx, uint num_tiles); -void increment_pack_tiles(uint num_tiles); -#if OVERLAY_DECOUPLE == 1 -inline uint32_t get_active_stream_idx(uint32_t stream_id) { - std::uint32_t active_stream_idx; - for (uint32_t active_streams_idx = 0; active_streams_idx < NOC_NUM_STREAMS; active_streams_idx++) { - if (stream_id == EPOCH_INFO_PTR->active_streams[active_streams_idx]->stream_id) { - active_stream_idx = active_streams_idx; - break; - } - } - return active_stream_idx; -} - -void llk_push_all_packer_tiles_for_decoupling(); -#endif - -} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h index c0f73fb172e..f00bc07cbb3 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h +++ 
b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h @@ -20,6 +20,7 @@ template inline void calculate_mask() { bool exponent_size_8 = true; + #pragma GCC unroll 8 for (int d = 0; d < ITERATIONS; d++) { vFloat mask = dst_reg[32]; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h index 503843211e7..3af9e78c3d9 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_negative.h @@ -19,6 +19,7 @@ namespace sfpu { template inline void calculate_negative() { + #pragma GCC unroll 8 for (int d = 0; d < ITERATIONS; d++) { vFloat val = dst_reg[0]; From 77cadf2addfec744531338d54892af260924ed62 Mon Sep 17 00:00:00 2001 From: Reem Tawfik Date: Wed, 13 Dec 2023 00:41:36 +0000 Subject: [PATCH 15/16] #3908: Move common files under inc, instead of under folders --- .../ckernels/grayskull/common/inc/ckernel.h | 2 - .../metal/common/metal_ckernel_globals.h | 63 ------------------- tt_metal/hw/firmware/src/brisc.cc | 2 +- tt_metal/hw/firmware/src/brisck.cc | 2 +- tt_metal/hw/firmware/src/ncrisc.cc | 2 +- tt_metal/hw/firmware/src/ncrisck.cc | 2 +- tt_metal/hw/firmware/src/trisc.cc | 2 +- tt_metal/hw/firmware/src/trisck.cc | 2 +- .../compile_time_args.h} | 0 .../firmware_common.h} | 2 +- .../compute_kernel_api/common_globals.h | 2 +- tt_metal/jit_build/build.cpp | 4 +- 12 files changed, 10 insertions(+), 75 deletions(-) delete mode 100644 tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h rename tt_metal/hw/{ckernels/wormhole_b0/metal/common/metal_compile_time_args.h => inc/compile_time_args.h} (100%) rename tt_metal/hw/{ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h => inc/firmware_common.h} (98%) diff --git a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h 
b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h index 8bbf675af9e..1c2a86ef572 100644 --- a/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h +++ b/tt_metal/hw/ckernels/grayskull/common/inc/ckernel.h @@ -42,8 +42,6 @@ namespace ckernel { -#define get_compile_time_arg_val(arg_idx) KERNEL_COMPILE_TIME_ARG_ ## arg_idx - constexpr uint PACK_FLUSH_COUNTERS = // counters flush (1 << PACK_COUNTERS_SEC2_pack_per_xy_plane_SHAMT) | (1 << PACK_COUNTERS_SEC2_pack_reads_per_xy_plane_SHAMT) | diff --git a/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h b/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h deleted file mode 100644 index 7800a9934d7..00000000000 --- a/tt_metal/hw/ckernels/grayskull/metal/common/metal_ckernel_globals.h +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -//TODO: This file should be deleted after fixing redefinition errors, -// functions should be moved to ckernel_globals.h -#pragma once - -#include -#include "ckernel_structs.h" -#include "risc_attribs.h" -#include "tensix_functions.h" -#include "hostdevcommon/common_runtime_address_map.h" - -extern uint32_t __ldm_bss_start[]; -extern uint32_t __ldm_bss_end[]; -extern uint32_t __ldm_data_start[]; -extern uint32_t __ldm_data_end[]; -extern void (* __init_array_start[])(); -extern void (* __init_array_end[])(); -extern uint32_t __firmware_start[]; - -extern void kernel_init(); -extern void kernel_launch(); - -inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) { - // Cover L1 load latency of 6 cycles for the bulk of the copy - int32_t n = 0; - while (n < len - 5) { - uint32_t v0 = l1_addr[n + 0]; - uint32_t v1 = l1_addr[n + 1]; - uint32_t v2 = l1_addr[n + 2]; - uint32_t v3 = l1_addr[n + 3]; - uint32_t v4 = l1_addr[n + 4]; - uint32_t v5 = l1_addr[n + 5]; - local_mem_addr[n + 0] = v0; - local_mem_addr[n + 1] = v1; - local_mem_addr[n + 
2] = v2; - local_mem_addr[n + 3] = v3; - local_mem_addr[n + 4] = v4; - local_mem_addr[n + 5] = v5; - n += 6; - } - // Could optimize this further (eg, loop of 2 or 4), probably not worth it - while (n < len) { - local_mem_addr[n] = l1_addr[n]; - n++; - } -} - -inline void firmware_kernel_common_init(void *init_local_l1_base) { - - // Handle stuff typically done in crt0 in asm. Easier to do in C - wzerorange(__ldm_bss_start, __ldm_bss_end); - - int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2; - uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE; - l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words); - - for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) { - (**fptr)(); - } -} diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 465c7b74d44..b7151c38589 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -17,7 +17,7 @@ #include "c_tensix_core.h" #include "tdma_xmov.h" #include "noc_nonblocking_api.h" -#include "metal_ckernel_globals.h" +#include "firmware_common.h" #include "tools/profiler/kernel_profiler.hpp" #include "dev_msgs.h" #include "risc_attribs.h" diff --git a/tt_metal/hw/firmware/src/brisck.cc b/tt_metal/hw/firmware/src/brisck.cc index 06567a58a7d..bc6a252eefb 100644 --- a/tt_metal/hw/firmware/src/brisck.cc +++ b/tt_metal/hw/firmware/src/brisck.cc @@ -15,7 +15,7 @@ #include "c_tensix_core.h" #include "tdma_xmov.h" #include "noc_nonblocking_api.h" -#include "metal_ckernel_globals.h" +#include "firmware_common.h" #include "tools/profiler/kernel_profiler.hpp" #include "dataflow_api.h" #include "noc_addr_ranges_gen.h" diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index 6a96aa0fbb0..fe40b9f6eb5 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -7,7 +7,7 @@ #include 
"noc_nonblocking_api.h" #include "dev_msgs.h" #include "stream_io_map.h" -#include "metal_ckernel_globals.h" +#include "firmware_common.h" #include "tools/profiler/kernel_profiler.hpp" #include "risc_attribs.h" #include "generated_bank_to_noc_coord_mapping.h" diff --git a/tt_metal/hw/firmware/src/ncrisck.cc b/tt_metal/hw/firmware/src/ncrisck.cc index 7a6d037733c..ef7f78d6ea8 100644 --- a/tt_metal/hw/firmware/src/ncrisck.cc +++ b/tt_metal/hw/firmware/src/ncrisck.cc @@ -9,7 +9,7 @@ #ifdef PERF_DUMP #include "risc_perf.h" #endif -#include "metal_ckernel_globals.h" +#include "firmware_common.h" #include "tools/profiler/kernel_profiler.hpp" #include "dataflow_api.h" #include "tensix_functions.h" diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc index f1e0aad4b6f..78497e3e3e0 100644 --- a/tt_metal/hw/firmware/src/trisc.cc +++ b/tt_metal/hw/firmware/src/trisc.cc @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "ckernel.h" -#include "metal_ckernel_globals.h" +#include "firmware_common.h" #include "risc_common.h" #include #include "dev_msgs.h" diff --git a/tt_metal/hw/firmware/src/trisck.cc b/tt_metal/hw/firmware/src/trisck.cc index 0115db2f96f..9746b263a6a 100644 --- a/tt_metal/hw/firmware/src/trisck.cc +++ b/tt_metal/hw/firmware/src/trisck.cc @@ -9,7 +9,7 @@ // Need to make sure no other file includes these lists since it also include global parameter definitions // 2) instantiate global variables -#include "metal_ckernel_globals.h" +#include "firmware_common.h" #include "chlkc_list.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h b/tt_metal/hw/inc/compile_time_args.h similarity index 100% rename from tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_compile_time_args.h rename to tt_metal/hw/inc/compile_time_args.h diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h b/tt_metal/hw/inc/firmware_common.h similarity index 98% rename from 
tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h rename to tt_metal/hw/inc/firmware_common.h index cf08580ad69..d6b6b7b9d5f 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/common/metal_ckernel_globals.h +++ b/tt_metal/hw/inc/firmware_common.h @@ -9,8 +9,8 @@ #include #include "ckernel_globals.h" #include "tensix_functions.h" -#include "metal_compile_time_args.h" #include "risc_attribs.h" +#include "compile_time_args.h" #include "hostdevcommon/common_runtime_address_map.h" #include "hostdevcommon/kernel_structs.h" diff --git a/tt_metal/include/compute_kernel_api/common_globals.h b/tt_metal/include/compute_kernel_api/common_globals.h index 0b0eee877dc..5c789bdc7d8 100644 --- a/tt_metal/include/compute_kernel_api/common_globals.h +++ b/tt_metal/include/compute_kernel_api/common_globals.h @@ -10,7 +10,7 @@ #include "chlkc_list.h" #include "ckernel.h" -#include "metal_ckernel_globals.h" +#include "firmware_common.h" #include "ckernel_include.h" #include "hostdevcommon/kernel_structs.h" diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 8a0285268e5..04d4cfa12e4 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -165,7 +165,7 @@ JitBuildDataMovement::JitBuildDataMovement(const JitBuildEnv& env, int which, bo this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/firmware/src " + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + - "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io "; + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io "; this->defines_ = env_.defines_; @@ -315,7 +315,7 @@ JitBuildEthernet::JitBuildEthernet(const JitBuildEnv& env, int which, bool is_fw this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/inc/ethernet " + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + - "-I" + env_.root_ + "tt_metal/hw/ckernels/" + 
env.arch_name_ + "/metal/llk_io "; + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io "; this->srcs_.push_back("tt_metal/hw/toolchain/substitutes.cpp"); if (this->is_fw_) { From 222653025afdc64a952d9792aa933199a7c07373 Mon Sep 17 00:00:00 2001 From: Reem Tawfik Date: Wed, 13 Dec 2023 01:20:41 -0500 Subject: [PATCH 16/16] #3908: Clean up some magic values --- .../grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h | 5 +++-- tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h | 3 +-- .../wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h | 3 ++- tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h | 3 +-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h index 2dcd2a5d63e..1bd2e70d97e 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h @@ -19,11 +19,12 @@ namespace sfpu { template inline void calculate_mask() { - bool exponent_size_8 = true; + const bool exponent_size_8 = true; + const int mask_val_idx = 16; #pragma GCC unroll 4 for (int d = 0; d < ITERATIONS; d++) { - vFloat mask = dst_reg[16]; + vFloat mask = dst_reg[mask_val_idx]; v_if(sfpu_is_fp16_zero(mask, exponent_size_8)) { dst_reg[0] = 0; } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h index bd010082bbd..7558f53219a 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_io/llk_outputs.h @@ -9,8 +9,7 @@ // Metal specific overrides -- No support for partial tiles so hard-code to fixed 32x32 sizes inline uint32_t get_output_id(uint32_t output) { - const uint32_t OUTPUT_BASE = 0; - return ((output) - OUTPUT_BASE); + return (output); } inline const 
uint32_t get_output_base_id() diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h index f00bc07cbb3..f2292c5281d 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h @@ -19,7 +19,8 @@ namespace sfpu { template inline void calculate_mask() { - bool exponent_size_8 = true; + const bool exponent_size_8 = true; + const int mask_val_idx = 16; #pragma GCC unroll 8 for (int d = 0; d < ITERATIONS; d++) { diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h index 11d634c25e4..b92af5b8ddc 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_io/llk_outputs.h @@ -9,8 +9,7 @@ // Metal specific overrides -- No support for partial tiles so hard-code to fixed 32x32 sizes inline uint32_t get_output_id(uint32_t output) { - const uint32_t OUTPUT_BASE = 0; - return ((output) - OUTPUT_BASE); + return (output); } inline const uint32_t get_output_base_id()