From 32fd6cf4c5bed049799a5bdf90979739ae024cbd Mon Sep 17 00:00:00 2001 From: Mo Date: Tue, 29 Oct 2024 17:08:13 +0000 Subject: [PATCH] #9956: Trace profiling smoke test --- tests/scripts/run_profiler_regressions.sh | 56 +++++++++++++++------ tt_metal/tools/profiler/process_ops_logs.py | 6 +-- tt_metal/tools/profiler/tt_metal_tracy.hpp | 1 + 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/tests/scripts/run_profiler_regressions.sh b/tests/scripts/run_profiler_regressions.sh index 5977e761a78..fce757e90bc 100755 --- a/tests/scripts/run_profiler_regressions.sh +++ b/tests/scripts/run_profiler_regressions.sh @@ -4,31 +4,35 @@ source scripts/tools_setup_common.sh set -eo pipefail -run_additional_T3000_test(){ - remove_default_log_locations - mkdir -p $PROFILER_ARTIFACTS_DIR - ./tt_metal/tools/profiler/profile_this.py -c "'pytest tests/ttnn/unit_tests/operations/ccl/test_all_gather.py::test_all_gather_on_t3000_post_commit_for_profiler_regression'" | tee $PROFILER_ARTIFACTS_DIR/test_out.log +run_async_mode_T3000_test(){ + #Some tests here do not skip grayskull + if [ "$ARCH_NAME" == "wormhole_b0" ]; then + remove_default_log_locations + mkdir -p $PROFILER_ARTIFACTS_DIR - if cat $PROFILER_ARTIFACTS_DIR/test_out.log | grep "SKIPPED" - then - echo "No verification as test was skipped" - else - echo "Verifying test results" - runDate=$(ls $PROFILER_OUTPUT_DIR/) - LINE_COUNT=9 #1 header + 8 devices - res=$(verify_perf_line_count "$PROFILER_OUTPUT_DIR/$runDate/ops_perf_results_$runDate.csv" "$LINE_COUNT") - echo $res + ./tt_metal/tools/profiler/profile_this.py -c "pytest -svv models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py::test_falcon_causal_lm[wormhole_b0-True-True-20-2-BFLOAT16-L1-falcon_7b-layers_2-decode_batch32]" | tee $PROFILER_ARTIFACTS_DIR/test_out.log + + if cat $PROFILER_ARTIFACTS_DIR/test_out.log | grep "SKIPPED" + then + echo "No verification as test was skipped" + else + echo "Verifying test results" + runDate=$(ls 
$PROFILER_OUTPUT_DIR/) + LINE_COUNT=1000 # Smoke test to see at least 1000 ops are reported + res=$(verify_perf_line_count_floor "$PROFILER_OUTPUT_DIR/$runDate/ops_perf_results_$runDate.csv" "$LINE_COUNT") + echo $res + fi fi } -run_async_mode_T3000_test(){ +run_tracing_async_mode_T3000_test(){ #Some tests here do not skip grayskull if [ "$ARCH_NAME" == "wormhole_b0" ]; then remove_default_log_locations mkdir -p $PROFILER_ARTIFACTS_DIR - ./tt_metal/tools/profiler/profile_this.py -c "pytest -svv models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py::test_falcon_causal_lm[wormhole_b0-True-True-20-2-BFLOAT16-L1-falcon_7b-layers_2-decode_batch32]" | tee $PROFILER_ARTIFACTS_DIR/test_out.log + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./tt_metal/tools/profiler/profile_this.py -c "pytest models/demos/t3000/resnet50/tests/test_resnet50_performant.py::test_run_resnet50_trace_2cqs_inference[wormhole_b0-True-True-16-act_dtype0-weight_dtype0-math_fidelity0-device_params0]" | tee $PROFILER_ARTIFACTS_DIR/test_out.log if cat $PROFILER_ARTIFACTS_DIR/test_out.log | grep "SKIPPED" then @@ -36,13 +40,33 @@ run_async_mode_T3000_test(){ else echo "Verifying test results" runDate=$(ls $PROFILER_OUTPUT_DIR/) - LINE_COUNT=1000 # Smoke test to see at least 1000 ops are reported + LINE_COUNT=4100 # Smoke test to see at least 4100 ops are reported res=$(verify_perf_line_count_floor "$PROFILER_OUTPUT_DIR/$runDate/ops_perf_results_$runDate.csv" "$LINE_COUNT") echo $res fi fi } +run_additional_T3000_test(){ + remove_default_log_locations + mkdir -p $PROFILER_ARTIFACTS_DIR + + ./tt_metal/tools/profiler/profile_this.py -c "'pytest tests/ttnn/unit_tests/operations/ccl/test_all_gather.py::test_all_gather_on_t3000_post_commit_for_profiler_regression'" | tee $PROFILER_ARTIFACTS_DIR/test_out.log + + if cat $PROFILER_ARTIFACTS_DIR/test_out.log | grep "SKIPPED" + then + echo "No verification as test was skipped" + else + echo "Verifying test results" + runDate=$(ls 
$PROFILER_OUTPUT_DIR/) + LINE_COUNT=9 #1 header + 8 devices + res=$(verify_perf_line_count "$PROFILER_OUTPUT_DIR/$runDate/ops_perf_results_$runDate.csv" "$LINE_COUNT") + echo $res + + run_tracing_async_mode_T3000_test + fi +} + run_profiling_test(){ if [[ -z "$ARCH_NAME" ]]; then echo "Must provide ARCH_NAME in environment" 1>&2 diff --git a/tt_metal/tools/profiler/process_ops_logs.py b/tt_metal/tools/profiler/process_ops_logs.py index ced75eadc8d..f2418ab8523 100755 --- a/tt_metal/tools/profiler/process_ops_logs.py +++ b/tt_metal/tools/profiler/process_ops_logs.py @@ -110,6 +110,7 @@ def import_tracy_op_logs(logFolder): if len(tmpStrs) > 1: # uncached device op, host op, or fallback op jsonStr = tmpStrs[-1] opData = json.loads(jsonStr) + opData["trace_id"] = None if "op_hash" in opData.keys(): assert "device_id" in opData.keys() deviceID = int(opData["device_id"]) @@ -119,9 +120,8 @@ def import_tracy_op_logs(logFolder): else: cached_ops[deviceID] = {opHash: opData.copy()} del cached_ops[deviceID][opHash]["global_call_count"] - opData["trace_id"] = None - if deviceID in traceIDs: - opData["trace_id"] = traceIDs[deviceID] + if deviceID in traceIDs: + opData["trace_id"] = traceIDs[deviceID] else: # cached device op opDataList = opDataStr.split(":", 1)[-1].split(",") assert len(opDataList) > 3, "Wrong cached op info format" diff --git a/tt_metal/tools/profiler/tt_metal_tracy.hpp b/tt_metal/tools/profiler/tt_metal_tracy.hpp index 0b0f94d19ac..7fe76eaa9e9 100644 --- a/tt_metal/tools/profiler/tt_metal_tracy.hpp +++ b/tt_metal/tools/profiler/tt_metal_tracy.hpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 +#pragma once #if defined(TRACY_ENABLE)