From 95f8ddc63a4595618bfb56f54bcdcd473da3ba14 Mon Sep 17 00:00:00 2001 From: Anasuya G Nair Date: Tue, 25 Jun 2024 16:47:39 +0530 Subject: [PATCH] #8682: Add forward support for unary fmod (#9579) Co-authored-by: mouliraj-mcw --- docs/source/ttnn/ttnn/dependencies/tt_lib.rst | 2 + .../python_api_testing/sweep_tests/op_map.py | 4 ++ .../pytests/tt_dnn/test_eltwise_unary.py | 30 ++++++++++ .../sweep_tests/pytorch_ops.py | 6 ++ .../sweep_tests/tt_lib_ops.py | 18 ++++++ .../eltwise_unary/eltwise_unary_op.cpp | 11 ++++ .../eltwise_unary/eltwise_unary_op.hpp | 7 ++- .../csrc/tt_lib_bindings_tensor_xary_ops.cpp | 7 +++ .../metal/llk_api/llk_math_unary_sfpu_api.h | 1 + .../llk_api/llk_sfpu/ckernel_sfpu_fmod.h | 58 +++++++++++++++++++ .../llk_api/llk_sfpu/ckernel_sfpu_remainder.h | 1 - .../llk_math_eltwise_unary_sfpu_fmod.h | 30 ++++++++++ .../metal/llk_api/llk_sfpu_types.h | 1 + .../compute_kernel_api/eltwise_unary/fmod.h | 47 +++++++++++++++ .../eltwise_unary/sfpu_split_includes.h | 4 ++ 15 files changed, 224 insertions(+), 3 deletions(-) create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_fmod.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_fmod.h create mode 100644 tt_metal/include/compute_kernel_api/eltwise_unary/fmod.h diff --git a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst index b617ccc9345..31a8215b510 100644 --- a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst +++ b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst @@ -416,6 +416,8 @@ Tensor elementwise operations .. autofunction:: tt_lib.tensor.unary_remainder .. autofunction:: tt_lib.tensor.remainder + +.. autofunction:: tt_lib.tensor.unary_fmod .. autofunction:: tt_lib.tensor.fmod diff --git a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py index 73d23e276f2..28cbd062841 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py @@ -520,6 +520,10 @@ "tt_op": tt_lib_ops.eltwise_fmod, "pytorch_op": pytorch_ops.fmod, }, + "eltwise-unary_fmod": { + "tt_op": tt_lib_ops.eltwise_unary_fmod, + "pytorch_op": pytorch_ops.unary_fmod, + }, "eltwise-unary_ne": { "tt_op": tt_lib_ops.eltwise_unary_ne, "pytorch_op": pytorch_ops.unary_ne, diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py index 401e76bd6c8..6b6d6e6a4aa 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py @@ -679,6 +679,36 @@ def test_run_eltwise_unary_remainder( test_args, ) + @skip_for_grayskull("#ToDo: GS implementation needs to be done for fmod op") + def test_run_eltwise_unary_fmod( + self, + input_shapes, + device, + function_level_defaults, + input_mem_config, + output_mem_config, + ): + datagen_func = [ + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e5, high=1e5), torch.bfloat16) + ] + test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] + test_args.update({"value": np.random.randint(-100, 100) + 0.5}) + test_args.update( + { + "input_mem_config": [input_mem_config], + "output_mem_config": output_mem_config, + } + ) + comparison_func = comparison_funcs.comp_pcc + run_single_pytorch_test( + "eltwise-unary_fmod", + input_shapes, + datagen_func, + comparison_func, + device, + test_args, + ) + @pytest.mark.parametrize("unary_comp", ["unary_ne"]) @pytest.mark.parametrize("scalar", [0.5, 1.0, -1.0, 0.0]) def test_run_eltwise_unary_comp( diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index 918ebdc87c4..9160c347397 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -566,6 +566,12 @@ def fmod(x, y, *args, **kwargs): return result +def unary_fmod(x, *args, **kwargs): + value = kwargs.pop("value") + result = torch.fmod(x, value) + return result + + def unary_ne(x, *args, **kwargs): value = kwargs.pop("scalar") result = torch.ne(x, value) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py index 3fa455cb021..c99b7eb1b2d 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py @@ -1296,6 +1296,24 @@ def eltwise_unary_remainder( return tt2torch_tensor(t1) +@setup_host_and_device +def eltwise_unary_fmod( + x, + *args, + value, + device, + dtype, + layout, + input_mem_config, + output_mem_config, + **kwargs, +): + t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) + t1 = ttl.tensor.unary_fmod(t0, value, output_mem_config=output_mem_config) + + return tt2torch_tensor(t1) + + @setup_host_and_device def eltwise_heaviside( x, diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp index 37c287490ec..b876ff5f465 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp @@ -71,6 +71,7 @@ void update_macro_defines(UnaryOpType op_type, std::map get_op_init_and_func_parameterized( "remainder_tile_init();", fmt::format("remainder_tile({}, {}u, {}u);", idst, Converter::to_hex(param0), Converter::to_hex(1.0f/param0))}; break; + case UnaryOpType::FMOD: + op_init_and_name = { + "fmod_tile_init();", + fmt::format("fmod_tile({}, {}u, {}u);", idst, Converter::to_hex(param0), Converter::to_hex(1.0f/param0))}; + break; case UnaryOpType::EXP: op_init_and_name = { fmt::format("exp_tile_init<{}u>();", std::to_string((uint32_t)param0)), @@ -365,6 +371,11 @@ inline void validate_supported_arch_dtype(tt::ARCH arch, DataType input_datatype TT_FATAL(input_datatype == DataType::INT32, "Data type is not supported for Bitwise operations"); TT_FATAL(output_datatype == DataType::INT32, "Data type is not supported for Bitwise operations"); break; + case UnaryOpType::FMOD: + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); + TT_FATAL(input_datatype == DataType::BFLOAT16, "Data type is not supported for Fmod operations"); + TT_FATAL(output_datatype == DataType::BFLOAT16, "Data type is not supported for Fmod operations"); + break; default: return; } diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp index 00e6f7fcc61..ffe7874c4e2 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp @@ -84,7 +84,8 @@ enum class UnaryOpType { RIGHT_SHIFT, FLOOR, LEFT_SHIFT, - REMAINDER + REMAINDER, + FMOD }; template @@ -116,7 +117,8 @@ bool is_parametrized_type(T val) { case UnaryOpType::BITWISE_NOT: case UnaryOpType::RIGHT_SHIFT: case UnaryOpType::LEFT_SHIFT: - case UnaryOpType::REMAINDER: return true; + case UnaryOpType::REMAINDER: + case UnaryOpType::FMOD: return true; default: return false; } return false; @@ -424,6 +426,7 @@ constexpr auto bitwise_not = make_eltwise_unary_with_param{}; constexpr auto left_shift = make_eltwise_unary_with_param{}; constexpr auto unary_remainder = make_eltwise_unary_with_param{}; +constexpr auto unary_fmod = make_eltwise_unary_with_param{}; constexpr auto unary_ne = make_eltwise_unary_with_param{}; constexpr auto rsub = make_eltwise_unary_with_param{}; constexpr auto silu = make_eltwise_unary{}; diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp index 307acc33119..82f01945dd2 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp @@ -267,6 +267,13 @@ namespace tt::tt_metal::detail { R"doc(Perform an eltwise-modulus operation on ``{0}`` and ``{1}``. Formula : ``a - a.div(b, rounding_mode="floor") * b`` . Support provided only for WH_B0.)doc", R"doc("value", "float", "")doc" + ); + detail::bind_unary_op_with_param( + m_tensor, "unary_fmod", unary_fmod, + py::arg("value"), + R"doc(Perform an eltwise-fmod operation on ``{0}`` and ``{1}``. Formula : ``a - a.div(b, rounding_mode="trunc") * b`` . Support provided only for WH_B0.)doc", + R"doc("value", "float", "")doc" + ); detail::bind_unary_op_with_param( m_tensor, "unary_ne", unary_ne, diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h index 6a4223dbe64..9309b2c624e 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h @@ -29,5 +29,6 @@ #include "llk_math_eltwise_unary_sfpu_remainder.h" #include "llk_math_eltwise_unary_sfpu_bitwise_xor.h" #include "llk_math_eltwise_unary_sfpu_bitwise_not.h" +#include "llk_math_eltwise_unary_sfpu_fmod.h" #include "llk_math_eltwise_unary_sfpu_right_shift.h" #include "llk_math_eltwise_unary_sfpu_left_shift.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_fmod.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_fmod.h new file mode 100644 index 00000000000..f87ba75de10 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_fmod.h @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "noc_nonblocking_api.h" +#include "ckernel_sfpu_converter.h" +#include "ckernel_sfpu_recip.h" + +using namespace sfpi; +namespace ckernel { +namespace sfpu { + +template +inline void calculate_fmod(const uint value, const uint recip) { + + // SFPU microcode + Converter c_value; + c_value.u = value; + vFloat s = c_value.f; + s = sfpi::abs(s); + + c_value.u = recip; + vFloat recip_val = c_value.f; + recip_val = sfpi::abs(recip_val); + + #pragma GCC unroll 0 + for (int d = 0; d < ITERATIONS; d++) { + vFloat val = dst_reg[0]; + vFloat v = sfpi::abs(val); + + vFloat quotient = v*recip_val; + + vInt tmp = float_to_int16(quotient); //TODO: Replace float_to_int16 to float_to_int32 once it is available + vFloat newquotient= int32_to_float(tmp); + v_if (newquotient > quotient){ + newquotient = newquotient - 1; + } + v_endif; + + v = v - newquotient * s; + v = setsgn(v, val); + + v_if(s==0){ + v = std::numeric_limits::quiet_NaN(); + } + v_endif; + + dst_reg[0] = v; + dst_reg++; + } +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_remainder.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_remainder.h index 97f64dbb48b..72d2d6829d4 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_remainder.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_remainder.h @@ -11,7 +11,6 @@ #include "ckernel_sfpu_recip.h" using namespace sfpi; - namespace ckernel { namespace sfpu { diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_fmod.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_fmod.h new file mode 100644 index 00000000000..ca00daae99f --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_fmod.h @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel_sfpu_fmod.h" +#include "llk_math_eltwise_unary_sfpu_params.h" +#include "llk_math_eltwise_unary_sfpu_init.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_unary_sfpu_fmod_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_fmod(uint dst_index, uint param0, uint param1, int vector_mode = (int)VectorMode::RC) { + llk_math_eltwise_unary_sfpu_params( + ckernel::sfpu::calculate_fmod, + dst_index, + vector_mode, + param0, + param1); +} + +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h index 917f4c08aa1..fcdc1d17202 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h @@ -81,5 +81,6 @@ enum SfpuType { floor, left_shift, remainder, + fmod, unused, }; diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/fmod.h b/tt_metal/include/compute_kernel_api/eltwise_unary/fmod.h new file mode 100644 index 00000000000..ed39cafc0e4 --- /dev/null +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/fmod.h @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + + +#include "compute_kernel_api/common_globals.h" +#ifdef TRISC_MATH +#include "llk_math_eltwise_unary_sfpu_fmod.h" +#define MAIN math_main() +#define MATH(x) x +#else +#define MATH(x) +#endif + + + +namespace ckernel { + +/** + * Performs element-wise fmod computation on input x by y , where x is each element of a tile + * in DST register at index tile_index. The input can be of float data type. The value is provided as const param0 The DST register buffer must be in + * acquired state via *acquire_dst* call. This call is blocking and is only + * available on the compute engine. + * + * Return value: None + * + * | Argument | Description | Type | Valid Range | Required | + * |----------------|-----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------| + * | idst | The index of the tile in DST register buffer to perform fmod operation | uint32_t | Must be less than the size of the DST register buffer | True | + * | param0 | Denominator value to perform fmod operation | uint32_t | | True | + * | param1 | Reciprocal of param0, calculated on-host | uint32_t | | False | + */ + + +ALWI void fmod_tile(uint32_t idst, uint32_t param0, uint32_t param1) { + MATH((llk_math_eltwise_unary_sfpu_fmod(idst, param0, param1))); +} + +/** + * Please refer to documentation for any_init. + */ +ALWI void fmod_tile_init() { MATH((llk_math_eltwise_unary_sfpu_fmod_init())); } + + +} // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h b/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h index fc7bc8fd056..ec4920b91c5 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h @@ -92,6 +92,10 @@ #include "compute_kernel_api/eltwise_unary/remainder.h" #endif +#if SFPU_OP_FMOD_INCLUDE +#include "compute_kernel_api/eltwise_unary/fmod.h" +#endif + #if SFPU_OP_BINOP_WITH_SCALAR_INCLUDE #include "compute_kernel_api/eltwise_unary/binop_with_scalar.h" #endif