#8835: added TMP-based device operation infra

tenstorrent · Jun 15, 2024 · ba1108f · ba1108f
1 parent 33ce964
commit ba1108f
Show file tree

Hide file tree

Showing 18 changed files with 1,813 additions and 433 deletions.
diff --git a/tests/ttnn/unit_tests/operations/test_relational.py b/tests/ttnn/unit_tests/operations/test_relational.py
@@ -236,16 +236,16 @@ def test_expand_and_broadcast(device, h, w):
 @pytest.mark.parametrize("h", [500])
 @pytest.mark.parametrize("w", [512])
 def test_expand_and_broadcast_reversed(device, h, w):
-    torch_a = torch.rand((1, h, w), dtype=torch.bfloat16)
-    torch_b = torch.rand((h, w), dtype=torch.bfloat16)
-    torch_output = torch.lt(torch_b, torch_a)
+    torch_input_tensor_a = torch.rand((1, h, w), dtype=torch.bfloat16)
+    torch_input_tensor_b = torch.rand((h, w), dtype=torch.bfloat16)
+    torch_output = torch.lt(torch_input_tensor_b, torch_input_tensor_a)
 
-    a = ttnn.from_torch(torch_a, layout=ttnn.TILE_LAYOUT, device=device)
-    b = ttnn.from_torch(torch_b, layout=ttnn.TILE_LAYOUT, device=device)
-    tt_output = ttnn.lt(b, a)
-    tt_output = ttnn.to_torch(tt_output)
+    input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
+    input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device)
+    output = ttnn.lt(input_tensor_b, input_tensor_a)
+    output = ttnn.to_torch(output)
 
-    assert_with_pcc(torch_output, tt_output, 0.9999)
+    assert_with_pcc(torch_output, output, 0.9999)
 
 
 @pytest.mark.parametrize("atol", [1e-8, 1e-10])

diff --git a/tt_eager/tensor/tensor.hpp b/tt_eager/tensor/tensor.hpp
@@ -43,7 +43,8 @@ struct Tensor {
         bool track_ref_count = false;
         TensorAttributes(const Storage storage, const ttnn::Shape shape, DataType dtype, Layout layout) :
             storage(storage), shape(shape), dtype(dtype), layout(layout) {}
-        TensorAttributes() : shape({0xff, 0xff, 0xff, 0xff}), dtype(DataType::INVALID), layout(Layout::INVALID) {}
+        TensorAttributes() :
+            shape(std::array<uint32_t, 4>{0xff, 0xff, 0xff, 0xff}), dtype(DataType::INVALID), layout(Layout::INVALID) {}
         ~TensorAttributes() = default;
 
         // Use these functions to manage the main_thread_ref_count for a tensor attr instance.
@@ -392,6 +393,15 @@ Tensor create_device_tensor(
     Device *device,
     const MemoryConfig &memory_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED});
 
+static Tensor create_device_tensor(
+    const ttnn::Shape &shape,
+    DataType dtype,
+    Layout layout,
+    Device *device,
+    const MemoryConfig &memory_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) {
+    return create_device_tensor(shape.value(), dtype, layout, device, memory_config);
+}
+
 // template<typename Buffer>
 // void *get_host_buffer(const Tensor &tensor);
 void *get_raw_host_data_ptr(const Tensor &tensor);

diff --git a/tt_eager/tensor/types.hpp b/tt_eager/tensor/types.hpp
@@ -173,19 +173,21 @@ class Shape {
     }
 
     template <std::size_t Rank>
-    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_tile_padding) :
+    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_with_tile_padding) :
         rank_(Rank), dimensions_{}, padding_{Rank} {
         for (auto index = 0; index < Rank; index++) {
-            auto padded_dimension = shape_tile_padding[index];
+            auto padded_dimension = shape_with_tile_padding[index];
             this->dimensions_[index] = padded_dimension;
             this->padding_[index] = {.front = 0, .back = padded_dimension - shape[index]};
         }
     }
-    explicit Shape(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_tile_padding) :
+    explicit Shape(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_with_tile_padding) :
         rank_(shape.size()), dimensions_{}, padding_{shape.size()} {
-        TT_ASSERT(shape.size() == shape_tile_padding.size(), "Shape and shape_tile_padding must have the same size");
+        TT_ASSERT(
+            shape.size() == shape_with_tile_padding.size(),
+            "Shape and shape_with_tile_padding must have the same size");
         for (auto index = 0; index < shape.size(); index++) {
-            auto padded_dimension = shape_tile_padding[index];
+            auto padded_dimension = shape_with_tile_padding[index];
             this->dimensions_[index] = padded_dimension;
             this->padding_[index] = {.front = 0, .back = padded_dimension - shape[index]};
         }
@@ -720,14 +722,20 @@ struct Shape {
     explicit Shape(const std::array<uint32_t, Rank> &shape) : ranked_shape{RankedShape<Rank>{shape}} {}
 
     template <std::size_t Rank>
-    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_tile_padding) :
-        ranked_shape{RankedShape<Rank>{shape, shape_tile_padding}} {}
+    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_with_tile_padding) :
+        ranked_shape{RankedShape<Rank>{shape, shape_with_tile_padding}} {}
 
     template <std::size_t Rank>
     explicit Shape(
         const std::array<uint32_t, Rank> &shape, const std::array<std::array<uint32_t, 2>, Rank> &tile_padding) :
         ranked_shape{RankedShape<Rank>{shape, tile_padding}} {}
 
+    static Shape from_vector(const std::vector<uint32_t> &shape) { return Shape{tt::tt_metal::Shape{shape}}; }
+
+    static Shape from_vector(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_with_tile_padding) {
+        return Shape{tt::tt_metal::Shape{shape, shape_with_tile_padding}};
+    }
+
     const auto rank() const {
         return std::visit(
             []<std::size_t Rank>(const RankedShape<Rank> &shape) -> const auto { return Rank; }, this->ranked_shape);

diff --git a/tt_eager/tt_dnn/op_library/run_operation.cpp b/tt_eager/tt_dnn/op_library/run_operation.cpp
@@ -119,7 +119,7 @@ constexpr auto decorate_device_operation(const Function& function) {
 template <typename OutputTensors>
 OutputTensors run_host_operation(const HostOperation<OutputTensors>& operation, const Tensors& input_tensors) {
     ZoneScopedN("TT_DNN_HOST_OP");
-    uint32_t op_id = assign_id();
+    uint32_t op_id = assign_operation_id();
 
     operation.validate(input_tensors);
     auto output_tensors = operation.compute_output_tensors(input_tensors);
@@ -143,7 +143,7 @@ OutputTensors run_device_operation(
     const OptionalConstTensors& optional_input_tensors,
     const OptionalTensors& optional_output_tensors) {
     ZoneScopedN("TT_DNN_DEVICE_OP");
-    uint32_t op_id = assign_id();
+    uint32_t op_id = assign_operation_id();
 
     std::function<std::variant<std::shared_ptr<Program>, std::reference_wrapper<Program>>(
         const DeviceOperation<OutputTensors>&,

diff --git a/tt_eager/tt_dnn/op_library/run_operation.hpp b/tt_eager/tt_dnn/op_library/run_operation.hpp
@@ -275,12 +275,6 @@ inline void log_operation(
     const OptionalTensors& optional_output_tensors = {}) {}
 #endif
 
-inline uint32_t assign_id()
-{
-    static std::atomic<uint32_t> atomic_count{0};
-    return atomic_count.fetch_add(1);
-}
-
 template<class OutputTensors=Tensors>
 OutputTensors run(
     const HostOperation<OutputTensors>& operation,

diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_pytensor.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_pytensor.cpp
@@ -567,7 +567,7 @@ Tensor convert_python_tensors_to_tt_tensors(py::list tensor_shards, std::optiona
                     ZoneScopedN("TT_DNN_FALLBACK_OP");
                     auto [op, input_tensors] = detail::parse_external_operation(function, args, kwargs, function_name);
                     operation::log_operation(op, input_tensors);
-                    uint32_t op_id = tt::tt_metal::operation::assign_id();
+                    uint32_t op_id = tt::tt_metal::assign_operation_id();
 
                     auto output_tensors = function(*args, **kwargs);
 

diff --git a/tt_metal/impl/device/program_cache.hpp b/tt_metal/impl/device/program_cache.hpp
@@ -51,7 +51,7 @@ struct ProgramCache {
 
    private:
     inline static bool is_enabled_ = false;
-    std::unordered_map<uint64_t, tt::stl::unique_any<1024, 32>> cache_{};
+    std::unordered_map<uint64_t, tt::stl::unique_any<1152, 32>> cache_{};
 };
 
 }

diff --git a/tt_metal/tools/profiler/op_profiler.hpp b/tt_metal/tools/profiler/op_profiler.hpp
@@ -22,6 +22,11 @@ namespace tt {
 
 namespace tt_metal {
 
+static uint32_t assign_operation_id() {
+    static std::atomic<uint32_t> atomic_count{0};
+    return atomic_count.fetch_add(1);
+}
+
 namespace op_profiler {
 
 enum class OpType { python_fallback, tt_dnn_cpu, tt_dnn_device, unknown };
@@ -251,6 +256,23 @@ inline json get_base_json(
     return j;
 }
 
+inline json get_base_json(uint32_t opID, const auto& op) {
+    ZoneScoped;
+    json j;
+    j["global_call_count"] = opID;
+
+    std::string opName = "device operation";
+
+    std::replace(opName.begin(), opName.end(), ',', ';');
+    j["op_code"] = opName;
+
+    json attributesObj;
+    j["attributes"] = attributesObj;
+    j["input_tensors"] = get_tensors_json(std::vector<Tensor>{});
+    j["output_tensors"] = get_tensors_json(std::vector<Tensor>{});
+    return j;
+}
+
 inline std::string op_meta_data_serialized_json(
     uint32_t opID, const tt::tt_metal::operation::ExternalOperation& op, const std::vector<Tensor>& input_tensors) {
     auto j = get_base_json<true>(opID, op, input_tensors);
@@ -321,6 +343,12 @@ inline std::string op_meta_data_serialized_json(
         return fmt::format("{}{}`", cached_ops.at(device_id).at(opHash), opID);
     }
 }
+inline std::string op_meta_data_serialized_json(uint32_t opID, const auto& op) {
+    auto j = get_base_json(opID, op);
+    j["op_type"] = magic_enum::enum_name(OpType::tt_dnn_device);
+    std::string ser = j.dump(4);
+    return fmt::format("`Device Operation:{} ->\n{}`", j["op_code"], ser);
+}
 
 #define TracyOpTTNNDevice(                                                                                           \
     op_id, op_hash, is_cached, device_id, operation, program, input_tensors, optional_input_tensors, output_tensors) \
@@ -338,23 +366,30 @@ inline std::string op_meta_data_serialized_json(
     ZoneText(op_text.c_str(), op_text.size());                                                                       \
     TracyMessage(op_message.c_str(), op_message.size());
 
+#define TracyOpTNNNDeviceV2(op_id, op)                                             \
+    std::string op_message = op_profiler::op_meta_data_serialized_json(op_id, op); \
+    std::string op_text = fmt::format("id:{}", op_id);                             \
+    ZoneText(op_text.c_str(), op_text.size());                                     \
+    TracyMessage(op_message.c_str(), op_message.size());
+
 #define TracyOpTTNNHost(op_id, operation, input_tensors, output_tensors)                            \
     std::string op_message =                                                                        \
         op_profiler::op_meta_data_serialized_json(op_id, operation, input_tensors, output_tensors); \
     std::string op_text = fmt::format("id:{}", op_id);                                              \
     ZoneText(op_text.c_str(), op_text.size());                                                      \
     TracyMessage(op_message.c_str(), op_message.size());
 
-#define TracyOpTTNNExternal(op_id, op, input_tensors)                                             \
-    std::string op_message = op_profiler::op_meta_data_serialized_json(op_id, op, input_tensors); \
-    std::string op_text = fmt::format("id:{}", op_id);                                            \
-    ZoneText(op_text.c_str(), op_text.size());                                                    \
+#define TracyOpTTNNExternal(op_id, op, input_tensors)                              \
+    std::string op_message = op_profiler::op_meta_data_serialized_json(op_id, op); \
+    std::string op_text = fmt::format("id:{}", op_id);                             \
+    ZoneText(op_text.c_str(), op_text.size());                                     \
     TracyMessage(op_message.c_str(), op_message.size());
 
 #else
 
 #define TracyOpTTNNDevice( \
     op_id, op_hash, is_cached, device_id, operation, program, input_tensors, optional_input_tensors, output_tensors)
+#define TracyOpTNNNDeviceV2(op_id, op)
 #define TracyOpTTNNHost(op_id, operation, input_tensors, output_tensors)
 #define TracyOpTTNNExternal(op_id, op, input_tensors)
 

diff --git a/ttnn/cpp/ttnn/decorators.hpp b/ttnn/cpp/ttnn/decorators.hpp
@@ -213,7 +213,7 @@ struct operation_t {
 
     template <typename... args_t>
     auto operator()(args_t&&... args) const {
-        ZoneScoped;
+        ZoneScopedN("Run ttnn operation (struct-based)");
         ZoneName(this->cpp_fully_qualified_name, std::strlen(this->cpp_fully_qualified_name));
         tt::log_debug(tt::LogOp, "Started   C++ ttnn operation: {}", this->cpp_fully_qualified_name);
 
@@ -324,7 +324,7 @@ struct lambda_operation_t {
 
     template <typename... args_t>
     auto operator()(args_t&&... args) const {
-        ZoneScoped;
+        ZoneScopedN("Run ttnn operation (lambda-based)");
         ZoneName(this->cpp_fully_qualified_name, std::strlen(this->cpp_fully_qualified_name));
         tt::log_debug(tt::LogOp, "Started   C++ ttnn operation: {}", this->cpp_fully_qualified_name);
         auto output = this->lambda(std::forward<decltype(args)>(args)...);