Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#4858: Add support for typecast #4840

Merged
merged 1 commit into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/ttnn/dependencies/tt_lib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,8 @@ but in general retaining the data.

.. autofunction:: tt_lib.tensor.clone

.. autofunction:: tt_lib.tensor.typecast

.. autofunction:: tt_lib.tensor.copy

Tensor creation operations
Expand Down
4 changes: 4 additions & 0 deletions tests/tt_eager/python_api_testing/sweep_tests/op_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
"tt_lib_op": tt_lib_ops.clone,
"pytorch_op": pytorch_ops.clone,
},
"typecast": {
"tt_lib_op": tt_lib_ops.typecast,
"pytorch_op": pytorch_ops.typecast,
},
"copy": {
"tt_lib_op": tt_lib_ops.copy,
"pytorch_op": pytorch_ops.copy,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
# SPDX-FileCopyrightText: © 2023-24 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

Expand Down Expand Up @@ -117,21 +117,59 @@ def test_run_assign_binary_op(
test_args,
)


@pytest.mark.parametrize(
"input_shapes",
[
[[1, 1, 1, 30]], # Single core
[[1, 1, 300, 380]], # multi core
[[1, 3, 320, 380]], # multi core
[[1, 1, 32, 32]], # Single core
[[1, 1, 320, 384]], # Multi core
[[1, 3, 320, 384]], # Multi core
],
)
@pytest.mark.parametrize(
"input_mem_config",
mem_configs,
)
@pytest.mark.parametrize(
"dst_mem_config",
mem_configs,
)
@pytest.mark.parametrize(
"output_type",
[
ttl.tensor.DataType.BFLOAT16,
],
)
@pytest.mark.parametrize(
"input_type",
[
torch.float32,
torch.float16,
torch.bfloat16,
],
)
class TestClone:
def test_run_clone_op(
self,
input_type,
output_type,
input_shapes,
input_mem_config,
dst_mem_config,
device,
function_level_defaults,
):
datagen_func = [
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16)
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), input_type)
]
test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0]
test_args["input_mem_config"] = [input_mem_config]
test_args["dtype"] = [output_type]
test_args.update({"output_mem_config": dst_mem_config})
comparison_func = comparison_funcs.comp_equal
comparison_func = partial(comparison_funcs.comp_allclose, rtol=1e-1, atol=1e-1)
run_single_pytorch_test(
"clone",
input_shapes,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# SPDX-FileCopyrightText: © 2023-24 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

import pytest
import torch
from functools import partial
import tt_lib as ttl


from tests.tt_eager.python_api_testing.sweep_tests import (
comparison_funcs,
generation_funcs,
)
from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import (
run_single_pytorch_test,
)

# Memory configurations swept by the typecast tests: interleaved layout in
# both DRAM and L1 buffers (used for input and destination independently).
mem_configs = [
ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM),
ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.L1),
]


@pytest.mark.parametrize(
    "pt_input_dtype, tt_input_dtype",
    (
        (torch.float16, ttl.tensor.DataType.FLOAT32),
        (torch.float32, ttl.tensor.DataType.BFLOAT8_B),
        (torch.bfloat16, ttl.tensor.DataType.BFLOAT16),
        (torch.int, ttl.tensor.DataType.UINT32),
    ),
)
@pytest.mark.parametrize(
    "pt_output_dtype, tt_output_dtype",
    (
        (torch.bfloat16, ttl.tensor.DataType.BFLOAT16),
        (torch.float32, ttl.tensor.DataType.BFLOAT8_B),
    ),
)
@pytest.mark.parametrize(
    "input_shapes",
    [
        # NOTE(review): removed an exact duplicate of [[1, 1, 32, 32]] that
        # doubled the single-core runs without adding coverage.
        [[1, 1, 32, 32]],  # Single core
        [[1, 1, 320, 320]],  # Multi core
        [[1, 3, 320, 320]],  # Multi core
        [[1, 1, 320, 384]],  # Multi core
        [[1, 3, 320, 384]],  # Multi core
    ],
)
@pytest.mark.parametrize(
    "input_mem_config",
    mem_configs,
)
@pytest.mark.parametrize(
    "dst_mem_config",
    mem_configs,
)
class TestTypecast:
    """Sweep tests for tt_lib.tensor.typecast against a torch reference cast."""

    def test_run_typecast_op(
        self,
        pt_output_dtype,
        tt_output_dtype,
        pt_input_dtype,
        tt_input_dtype,
        input_shapes,
        input_mem_config,
        dst_mem_config,
        device,
        function_level_defaults,
    ):
        # FLOAT32/UINT32 inputs are not supported by the typecast op yet.
        if tt_input_dtype in [ttl.tensor.DataType.FLOAT32, ttl.tensor.DataType.UINT32]:
            pytest.skip(f"{tt_input_dtype} cannot be converted yet. Skip")
        # A same-dtype cast is a no-op; nothing meaningful to verify.
        if tt_input_dtype == tt_output_dtype:
            pytest.skip("Same I/O data types. Skip.")
        # Generate random data already cast to the torch-side input dtype.
        datagen_func = [
            generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=0, high=10), pt_input_dtype)
        ]
        test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0]
        test_args["pt_input_dtype"] = [pt_input_dtype]
        test_args["tt_input_dtype"] = [tt_input_dtype]
        test_args["pt_output_dtype"] = [pt_output_dtype]
        test_args["tt_output_dtype"] = [tt_output_dtype]
        test_args["input_mem_config"] = [input_mem_config]
        test_args.update({"output_mem_config": dst_mem_config})
        # PCC comparison: lossy casts (e.g. to BFLOAT8_B) are not bit-exact.
        comparison_func = comparison_funcs.comp_pcc

        run_single_pytorch_test(
            "typecast",
            input_shapes,
            datagen_func,
            comparison_func,
            device,
            test_args,
        )
4 changes: 4 additions & 0 deletions tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ def clone(x, *args, **kwargs):
return torch.clone(x)


def typecast(x, pt_input_dtype, pt_output_dtype, *args, **kwargs):
    """Reference typecast: cast *x* to the input dtype, then to the output dtype.

    Dtypes arrive wrapped in single-element lists (sweep-test convention).
    """
    as_input = x.to(pt_input_dtype[0])
    return as_input.to(pt_output_dtype[0])


def move(x, *args, **kwargs):
    """Reference op for move: on the host side this is the identity."""
    return x

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def run_single_pytorch_test(
env="",
plot_func=None,
):
assert test_name in op_map.keys()
assert test_name in op_map

default_env_dict = {}
# Get env variables from CLI
Expand Down
19 changes: 19 additions & 0 deletions tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,25 @@ def clone(
return tt2torch_tensor(t1)


@setup_host_and_device
def typecast(
    x,
    *args,
    device,
    tt_input_dtype,
    tt_output_dtype,
    layout,
    input_mem_config,
    output_mem_config,
    **kwargs,
):
    """Run tt_lib.tensor.typecast on *x* and return the result as a torch tensor.

    Per sweep-test convention, layout/mem-config/dtype arguments arrive as
    single-element lists.
    """
    device_tensor = setup_tt_tensor(x, device, layout[0], input_mem_config[0], tt_input_dtype[0])
    cast_tensor = ttl.tensor.typecast(device_tensor, tt_output_dtype[0], output_mem_config=output_mem_config)
    return tt2torch_tensor(cast_tensor)


@setup_host_and_device
def move(
x,
Expand Down
24 changes: 24 additions & 0 deletions tt_eager/tt_dnn/op_library/copy/copy_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,30 @@ tt::stl::reflection::Attributes Copy::attributes() const {
};
}

// Copies src_tensor's data into the pre-allocated dst_tensor (the output
// keeps dst_tensor's memory config and dtype) and returns dst_tensor.
Tensor copy(const Tensor& src_tensor, const Tensor& dst_tensor) {
operation::run(Copy{dst_tensor.memory_config(), dst_tensor.dtype()}, {src_tensor, dst_tensor});
return dst_tensor;
}

// Returns a new tensor with a copy of input's data; the output dtype
// defaults to the input's dtype when output_dtype is not supplied.
Tensor clone(const Tensor& input, const MemoryConfig& output_mem_config, std::optional<const DataType> output_dtype) {
return operation::run(Copy{output_mem_config, output_dtype.value_or(input.dtype())}, {input}).at(0);
}

// Returns a copy of input_tensor converted to `dtype`, implemented as a
// Copy op whose output dtype differs from the input's.
Tensor typecast(const Tensor& input_tensor, const DataType& dtype, const MemoryConfig& output_mem_config ) {
return operation::run(Copy{output_mem_config, dtype}, {input_tensor}).at(0);
}

// Unary assign: returns a new tensor holding input's data, optionally
// re-typed to output_dtype (defaults to the input's dtype).
Tensor assign(const Tensor& input, const MemoryConfig& output_mem_config , std::optional<const DataType> output_dtype ) {
return operation::run(Copy{output_mem_config, output_dtype.value_or(input.dtype())}, {input}).at(0);
}

// Binary assign: copies input_a's data into the pre-allocated input_b
// (keeping input_b's memory config and dtype) and returns input_b.
Tensor assign(const Tensor& input_a, const Tensor& input_b) {
operation::run(Copy{input_b.memory_config(), input_b.dtype()}, {input_a, input_b});
return input_b;
}

} // namespace tt_metal

} // namespace tt
20 changes: 6 additions & 14 deletions tt_eager/tt_dnn/op_library/copy/copy_op.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,25 +37,17 @@ struct Copy {
operation::ProgramWithCallbacks copy_multi_core(const Tensor &input, const Tensor &output, bool backwards = false);
operation::ProgramWithCallbacks copy_single_core(const Tensor &input, const Tensor &output, bool backwards = false);

inline Tensor copy(const Tensor& src_tensor, const Tensor& dst_tensor) {
operation::run(Copy{dst_tensor.memory_config(), dst_tensor.dtype()}, {src_tensor, dst_tensor});
return dst_tensor;
}
Tensor copy(const Tensor& src_tensor, const Tensor& dst_tensor);

inline Tensor clone(const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional<const DataType> output_dtype = std::nullopt) {
return operation::run(Copy{output_mem_config, output_dtype.value_or(input.dtype())}, {input}).at(0);
}
Tensor clone(const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional<const DataType> output_dtype = std::nullopt);

Tensor typecast(const Tensor& input_tensor, const DataType& dtype, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG);

//unary assign
inline Tensor assign(const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional<const DataType> output_dtype = std::nullopt) {
return operation::run(Copy{output_mem_config, output_dtype.value_or(input.dtype())}, {input}).at(0);
}
Tensor assign(const Tensor& input, const MemoryConfig& output_mem_config, std::optional<const DataType> output_dtype = std::nullopt);

// binary assign
inline Tensor assign(const Tensor& input_a, const Tensor& input_b) {
operation::run(Copy{input_b.memory_config(), input_b.dtype()}, {input_a, input_b});
return input_b;
}
Tensor assign(const Tensor& input_a, const Tensor& input_b);

} // namespace tt_metal

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include "tt_metal/host_api.hpp"
#include "tt_metal/common/constants.hpp"
#include "tt_metal/detail/util.hpp"
#include "tt_metal/common/tt_backend_api_types.hpp"
#include "common/bfloat8.hpp"

using namespace tt::constants;

Expand Down
18 changes: 18 additions & 0 deletions tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,24 @@ namespace tt::tt_metal::detail{
detail::bind_unary_op<true, true>(m_tensor, "assign", py::overload_cast<const Tensor&, const MemoryConfig&, std::optional<const DataType>>(&assign), R"doc( Returns a new tensor which is a new copy of input tensor ``{0}``.)doc");

// *** tensor manipulation ***
m_tensor.def("typecast", &typecast,
py::arg("input_tensors").noconvert(), py::arg("dtype"), py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc(
Returns a new tensor which is a typecast of input tensor with new datatype ``{0}``.

Input tensors must be on device, in ROW MAJOR or TILE layout, and have matching data type.

Datatype must be one of the following types: BFLOAT16, BFLOAT8_B, UINT32 and UINT16.

Output tensor will be on device, in same layout, and have the given data type.

.. csv-table::
:header: "Argument", "Description", "Data type", "Required"

"input_tensors", "Input tensors to typecast", "List of Tensors", "Yes"
"dtype", "datatype of typecast", "Datatype", "Yes"
"output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "No"
)doc"
);
m_tensor.def("concat", &concat,
py::arg("input_tensors").noconvert(), py::arg("dim") = 0, py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc(
Concatenates shape of tensors ``arg0`` and ``arg1`` to new shape ``[W, Z, Y, X]`` along the specified dimension ``arg1``.
Expand Down
Loading