Skip to content

Commit

Permalink
#8835: added TMP-based device operation infra
Browse files Browse the repository at this point in the history
  • Loading branch information
arakhmati committed Jun 15, 2024
1 parent 33ce964 commit ba1108f
Show file tree
Hide file tree
Showing 18 changed files with 1,813 additions and 433 deletions.
16 changes: 8 additions & 8 deletions tests/ttnn/unit_tests/operations/test_relational.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,16 +236,16 @@ def test_expand_and_broadcast(device, h, w):
@pytest.mark.parametrize("h", [500])
@pytest.mark.parametrize("w", [512])
def test_expand_and_broadcast_reversed(device, h, w):
torch_a = torch.rand((1, h, w), dtype=torch.bfloat16)
torch_b = torch.rand((h, w), dtype=torch.bfloat16)
torch_output = torch.lt(torch_b, torch_a)
torch_input_tensor_a = torch.rand((1, h, w), dtype=torch.bfloat16)
torch_input_tensor_b = torch.rand((h, w), dtype=torch.bfloat16)
torch_output = torch.lt(torch_input_tensor_b, torch_input_tensor_a)

a = ttnn.from_torch(torch_a, layout=ttnn.TILE_LAYOUT, device=device)
b = ttnn.from_torch(torch_b, layout=ttnn.TILE_LAYOUT, device=device)
tt_output = ttnn.lt(b, a)
tt_output = ttnn.to_torch(tt_output)
input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device)
output = ttnn.lt(input_tensor_b, input_tensor_a)
output = ttnn.to_torch(output)

assert_with_pcc(torch_output, tt_output, 0.9999)
assert_with_pcc(torch_output, output, 0.9999)


@pytest.mark.parametrize("atol", [1e-8, 1e-10])
Expand Down
12 changes: 11 additions & 1 deletion tt_eager/tensor/tensor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ struct Tensor {
bool track_ref_count = false;
TensorAttributes(const Storage storage, const ttnn::Shape shape, DataType dtype, Layout layout) :
storage(storage), shape(shape), dtype(dtype), layout(layout) {}
TensorAttributes() : shape({0xff, 0xff, 0xff, 0xff}), dtype(DataType::INVALID), layout(Layout::INVALID) {}
TensorAttributes() :
shape(std::array<uint32_t, 4>{0xff, 0xff, 0xff, 0xff}), dtype(DataType::INVALID), layout(Layout::INVALID) {}
~TensorAttributes() = default;

// Use these functions to manage the main_thread_ref_count for a tensor attr instance.
Expand Down Expand Up @@ -392,6 +393,15 @@ Tensor create_device_tensor(
Device *device,
const MemoryConfig &memory_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED});

static Tensor create_device_tensor(
const ttnn::Shape &shape,
DataType dtype,
Layout layout,
Device *device,
const MemoryConfig &memory_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) {
return create_device_tensor(shape.value(), dtype, layout, device, memory_config);
}

// template<typename Buffer>
// void *get_host_buffer(const Tensor &tensor);
void *get_raw_host_data_ptr(const Tensor &tensor);
Expand Down
22 changes: 15 additions & 7 deletions tt_eager/tensor/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,19 +173,21 @@ class Shape {
}

template <std::size_t Rank>
explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_tile_padding) :
explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_with_tile_padding) :
rank_(Rank), dimensions_{}, padding_{Rank} {
for (auto index = 0; index < Rank; index++) {
auto padded_dimension = shape_tile_padding[index];
auto padded_dimension = shape_with_tile_padding[index];
this->dimensions_[index] = padded_dimension;
this->padding_[index] = {.front = 0, .back = padded_dimension - shape[index]};
}
}
explicit Shape(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_tile_padding) :
explicit Shape(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_with_tile_padding) :
rank_(shape.size()), dimensions_{}, padding_{shape.size()} {
TT_ASSERT(shape.size() == shape_tile_padding.size(), "Shape and shape_tile_padding must have the same size");
TT_ASSERT(
shape.size() == shape_with_tile_padding.size(),
"Shape and shape_with_tile_padding must have the same size");
for (auto index = 0; index < shape.size(); index++) {
auto padded_dimension = shape_tile_padding[index];
auto padded_dimension = shape_with_tile_padding[index];
this->dimensions_[index] = padded_dimension;
this->padding_[index] = {.front = 0, .back = padded_dimension - shape[index]};
}
Expand Down Expand Up @@ -720,14 +722,20 @@ struct Shape {
explicit Shape(const std::array<uint32_t, Rank> &shape) : ranked_shape{RankedShape<Rank>{shape}} {}

template <std::size_t Rank>
explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_tile_padding) :
ranked_shape{RankedShape<Rank>{shape, shape_tile_padding}} {}
explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_with_tile_padding) :
ranked_shape{RankedShape<Rank>{shape, shape_with_tile_padding}} {}

template <std::size_t Rank>
explicit Shape(
const std::array<uint32_t, Rank> &shape, const std::array<std::array<uint32_t, 2>, Rank> &tile_padding) :
ranked_shape{RankedShape<Rank>{shape, tile_padding}} {}

static Shape from_vector(const std::vector<uint32_t> &shape) { return Shape{tt::tt_metal::Shape{shape}}; }

static Shape from_vector(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_with_tile_padding) {
return Shape{tt::tt_metal::Shape{shape, shape_with_tile_padding}};
}

const auto rank() const {
return std::visit(
[]<std::size_t Rank>(const RankedShape<Rank> &shape) -> const auto { return Rank; }, this->ranked_shape);
Expand Down
4 changes: 2 additions & 2 deletions tt_eager/tt_dnn/op_library/run_operation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ constexpr auto decorate_device_operation(const Function& function) {
template <typename OutputTensors>
OutputTensors run_host_operation(const HostOperation<OutputTensors>& operation, const Tensors& input_tensors) {
ZoneScopedN("TT_DNN_HOST_OP");
uint32_t op_id = assign_id();
uint32_t op_id = assign_operation_id();

operation.validate(input_tensors);
auto output_tensors = operation.compute_output_tensors(input_tensors);
Expand All @@ -143,7 +143,7 @@ OutputTensors run_device_operation(
const OptionalConstTensors& optional_input_tensors,
const OptionalTensors& optional_output_tensors) {
ZoneScopedN("TT_DNN_DEVICE_OP");
uint32_t op_id = assign_id();
uint32_t op_id = assign_operation_id();

std::function<std::variant<std::shared_ptr<Program>, std::reference_wrapper<Program>>(
const DeviceOperation<OutputTensors>&,
Expand Down
6 changes: 0 additions & 6 deletions tt_eager/tt_dnn/op_library/run_operation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,12 +275,6 @@ inline void log_operation(
const OptionalTensors& optional_output_tensors = {}) {}
#endif

inline uint32_t assign_id()
{
static std::atomic<uint32_t> atomic_count{0};
return atomic_count.fetch_add(1);
}

template<class OutputTensors=Tensors>
OutputTensors run(
const HostOperation<OutputTensors>& operation,
Expand Down
2 changes: 1 addition & 1 deletion tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_pytensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ Tensor convert_python_tensors_to_tt_tensors(py::list tensor_shards, std::optiona
ZoneScopedN("TT_DNN_FALLBACK_OP");
auto [op, input_tensors] = detail::parse_external_operation(function, args, kwargs, function_name);
operation::log_operation(op, input_tensors);
uint32_t op_id = tt::tt_metal::operation::assign_id();
uint32_t op_id = tt::tt_metal::assign_operation_id();

auto output_tensors = function(*args, **kwargs);

Expand Down
2 changes: 1 addition & 1 deletion tt_metal/impl/device/program_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ struct ProgramCache {

private:
inline static bool is_enabled_ = false;
std::unordered_map<uint64_t, tt::stl::unique_any<1024, 32>> cache_{};
std::unordered_map<uint64_t, tt::stl::unique_any<1152, 32>> cache_{};
};

}
Expand Down
43 changes: 39 additions & 4 deletions tt_metal/tools/profiler/op_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ namespace tt {

namespace tt_metal {

static uint32_t assign_operation_id() {
static std::atomic<uint32_t> atomic_count{0};
return atomic_count.fetch_add(1);
}

namespace op_profiler {

enum class OpType { python_fallback, tt_dnn_cpu, tt_dnn_device, unknown };
Expand Down Expand Up @@ -251,6 +256,23 @@ inline json get_base_json(
return j;
}

inline json get_base_json(uint32_t opID, const auto& op) {
ZoneScoped;
json j;
j["global_call_count"] = opID;

std::string opName = "device operation";

std::replace(opName.begin(), opName.end(), ',', ';');
j["op_code"] = opName;

json attributesObj;
j["attributes"] = attributesObj;
j["input_tensors"] = get_tensors_json(std::vector<Tensor>{});
j["output_tensors"] = get_tensors_json(std::vector<Tensor>{});
return j;
}

inline std::string op_meta_data_serialized_json(
uint32_t opID, const tt::tt_metal::operation::ExternalOperation& op, const std::vector<Tensor>& input_tensors) {
auto j = get_base_json<true>(opID, op, input_tensors);
Expand Down Expand Up @@ -321,6 +343,12 @@ inline std::string op_meta_data_serialized_json(
return fmt::format("{}{}`", cached_ops.at(device_id).at(opHash), opID);
}
}
inline std::string op_meta_data_serialized_json(uint32_t opID, const auto& op) {
auto j = get_base_json(opID, op);
j["op_type"] = magic_enum::enum_name(OpType::tt_dnn_device);
std::string ser = j.dump(4);
return fmt::format("`Device Operation:{} ->\n{}`", j["op_code"], ser);
}

#define TracyOpTTNNDevice( \
op_id, op_hash, is_cached, device_id, operation, program, input_tensors, optional_input_tensors, output_tensors) \
Expand All @@ -338,23 +366,30 @@ inline std::string op_meta_data_serialized_json(
ZoneText(op_text.c_str(), op_text.size()); \
TracyMessage(op_message.c_str(), op_message.size());

#define TracyOpTNNNDeviceV2(op_id, op) \
std::string op_message = op_profiler::op_meta_data_serialized_json(op_id, op); \
std::string op_text = fmt::format("id:{}", op_id); \
ZoneText(op_text.c_str(), op_text.size()); \
TracyMessage(op_message.c_str(), op_message.size());

#define TracyOpTTNNHost(op_id, operation, input_tensors, output_tensors) \
std::string op_message = \
op_profiler::op_meta_data_serialized_json(op_id, operation, input_tensors, output_tensors); \
std::string op_text = fmt::format("id:{}", op_id); \
ZoneText(op_text.c_str(), op_text.size()); \
TracyMessage(op_message.c_str(), op_message.size());

#define TracyOpTTNNExternal(op_id, op, input_tensors) \
std::string op_message = op_profiler::op_meta_data_serialized_json(op_id, op, input_tensors); \
std::string op_text = fmt::format("id:{}", op_id); \
ZoneText(op_text.c_str(), op_text.size()); \
#define TracyOpTTNNExternal(op_id, op, input_tensors) \
std::string op_message = op_profiler::op_meta_data_serialized_json(op_id, op); \
std::string op_text = fmt::format("id:{}", op_id); \
ZoneText(op_text.c_str(), op_text.size()); \
TracyMessage(op_message.c_str(), op_message.size());

#else

#define TracyOpTTNNDevice( \
op_id, op_hash, is_cached, device_id, operation, program, input_tensors, optional_input_tensors, output_tensors)
#define TracyOpTNNNDeviceV2(op_id, op)
#define TracyOpTTNNHost(op_id, operation, input_tensors, output_tensors)
#define TracyOpTTNNExternal(op_id, op, input_tensors)

Expand Down
4 changes: 2 additions & 2 deletions ttnn/cpp/ttnn/decorators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ struct operation_t {

template <typename... args_t>
auto operator()(args_t&&... args) const {
ZoneScoped;
ZoneScopedN("Run ttnn operation (struct-based)");
ZoneName(this->cpp_fully_qualified_name, std::strlen(this->cpp_fully_qualified_name));
tt::log_debug(tt::LogOp, "Started C++ ttnn operation: {}", this->cpp_fully_qualified_name);

Expand Down Expand Up @@ -324,7 +324,7 @@ struct lambda_operation_t {

template <typename... args_t>
auto operator()(args_t&&... args) const {
ZoneScoped;
ZoneScopedN("Run ttnn operation (lambda-based)");
ZoneName(this->cpp_fully_qualified_name, std::strlen(this->cpp_fully_qualified_name));
tt::log_debug(tt::LogOp, "Started C++ ttnn operation: {}", this->cpp_fully_qualified_name);
auto output = this->lambda(std::forward<decltype(args)>(args)...);
Expand Down
Loading

0 comments on commit ba1108f

Please sign in to comment.