From b4b2a15795e14170c251eb373106b1e3ffe32d27 Mon Sep 17 00:00:00 2001
From: Abhinav Sarje
Date: Thu, 30 May 2024 21:35:15 +0000
Subject: [PATCH] #0: e2e perf script for new c++ ttnn resnet50

---
 .../resnet/tests/test_perf_resnet_new.py      | 25 +++++++++++++------
 .../optimized_conv_op_sharded_v2.cpp          |  1 -
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/models/demos/resnet/tests/test_perf_resnet_new.py b/models/demos/resnet/tests/test_perf_resnet_new.py
index d15b09d6dde..cbeaed45e71 100644
--- a/models/demos/resnet/tests/test_perf_resnet_new.py
+++ b/models/demos/resnet/tests/test_perf_resnet_new.py
@@ -9,9 +9,6 @@
 import pytest
 
 import ttnn
-from models.utility_functions import is_e75
-from models.utility_functions import profiler
-from models.utility_functions import disable_persistent_kernel_cache, skip_for_wormhole_b0
 from models.perf.perf_utils import prep_perf_report
 
 from loguru import logger
@@ -22,6 +19,10 @@
     convert_torch_model_to_ttnn_model,
 )
 from models.utility_functions import (
+    profiler,
+    is_e75,
+    disable_persistent_kernel_cache,
+    skip_for_wormhole_b0,
     pad_and_fold_conv_filters_for_unity_stride,
 )
 
@@ -83,6 +84,8 @@ def run_perf_resnet(
 ):
     disable_persistent_kernel_cache()
 
+    profiler.clear()
+
     cpu_key = f"ref_key_batchsize{batch_size}"
     image = hf_cat_image_sample_input
 
@@ -119,7 +122,6 @@ def run_perf_resnet(
             tt_inputs, device=device, batch_size=batch_size, ops_parallel_config=ops_parallel_config
         ).cpu(blocking=True)
         profiler.end(f"iter_{iter}_key")
-        ttnn.device.DumpDeviceProfiler(device)
 
     num_warm_iterations = 15
     warm_start = warmup_end
@@ -128,10 +130,13 @@
     outputs = []
     profiler.start(f"run")
     for iter in range(warm_start, warm_end):
-        outputs.append(tt_resnet50(tt_inputs).cpu(blocking=False))
-    ttnn.device.Synchronize(device)
+        outputs.append(
+            tt_resnet50(
+                tt_inputs, device=device, batch_size=batch_size, ops_parallel_config=ops_parallel_config
+            ).cpu(blocking=False)
+        )
+    ttnn.synchronize_device(device)
     profiler.end(f"run")
-    ttnn.device.DumpDeviceProfiler(device)
 
     # enable_persistent_kernel_cache()
 
@@ -140,6 +145,10 @@
     # ensuring inference time fluctuations is not noise
     inference_time_avg = profiler.get("run") / num_warm_iterations
 
+    for iter in range(0, 5):
+        logger.info(f'iter_{iter}_key: {profiler.get(f"iter_{iter}_key")}')
+    logger.info(f'{warm_start} to {warm_end} run: {profiler.get("run")}')
+
     cpu_time = profiler.get(cpu_key)
     compile_time = first_iter_time - inference_time_avg
     prep_perf_report(
@@ -158,7 +167,7 @@
 
 
 @skip_for_wormhole_b0(reason_str="Not tested on single WH")
-@pytest.mark.parametrize("device_l1_small_size", [32768], indirect=True)
+@pytest.mark.parametrize("device_l1_small_size", [24576], indirect=True)
 @pytest.mark.models_performance_bare_metal
 @pytest.mark.parametrize(
     "batch_size, expected_inference_time, expected_compile_time",
diff --git a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp
index e4e2e855f50..72589106795 100644
--- a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp
+++ b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp
@@ -1320,7 +1320,6 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
         const std::vector<std::optional<const Tensor>>& optional_input_tensors,
         const std::vector<Tensor>& output_tensors) {
         // Reader config indices is an optional static sharded tensor, so no need to update address
-        TT_ASSERT(input_tensors.size() + optional_input_tensors.size() == 4);
         TT_ASSERT(output_tensors.size() == 1);
 
         auto src_buffer_a = input_tensors.at(0).buffer();
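
Note (not part of the patch to apply): a minimal sketch of the timed warm loop this
change produces, assuming tt_resnet50, tt_inputs, ops_parallel_config, warm_start,
warm_end, and num_warm_iterations are set up as in test_perf_resnet_new.py. Because
.cpu(blocking=False) only enqueues the host readback, ttnn.synchronize_device(device)
must drain the device before the timer stops, otherwise the measured "run" window
would exclude in-flight iterations:

    import ttnn
    from models.utility_functions import profiler

    profiler.start("run")
    for iter in range(warm_start, warm_end):
        # non-blocking readback: the call returns before the device finishes
        outputs.append(
            tt_resnet50(
                tt_inputs, device=device, batch_size=batch_size, ops_parallel_config=ops_parallel_config
            ).cpu(blocking=False)
        )
    # wait for all queued device work so the "run" window covers every iteration
    ttnn.synchronize_device(device)
    profiler.end("run")

    # average end-to-end latency per warm iteration
    inference_time_avg = profiler.get("run") / num_warm_iterations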