From dc23a85325c1348334bff7ef2c86876a724061ea Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Tue, 23 Jul 2024 16:49:03 -0600 Subject: [PATCH] Updated benchmarks output name field that was not in the format key:value (#571) * Fixed FileNotFound error on windows * updated changelog * updated copyright in rtest.py * fixed copyright to 2021-2024 * Fixed FileNotFound error on windows * updated changelog * updated copyright in rtest.py * fixed copyright to 2021-2024 * init; * updated benchmark_block_run_length_decode * added name_format option * updated benchmark_config_dispatch * updated some typos * Update benchmark/benchmark_config_dispatch.cpp Co-authored-by: Nara * tuned up block run length decode and config dispatch * updated changelog Co-authored-by: Nara * removed debug comments --------- Co-authored-by: Nara --- CHANGELOG.md | 5 +- .../benchmark_block_run_length_decode.cpp | 36 +++++----- benchmark/benchmark_config_dispatch.cpp | 70 ++++++++++++++++--- benchmark/benchmark_device_scan_by_key.cpp | 1 + 4 files changed, 86 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 347dbb71f..73e6c18b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,9 @@ Documentation for rocPRIM is available at ## Unreleased rocPRIM-3.3.0 for ROCm 6.3.0 ### Fixes -* Fixed an issue where while running rtest.py on windows and passing in an absolute path to --install_dir - causes FileNotFound error + + +* Fixed an issue where running rtest.py on Windows and passing an absolute path to `--install_dir` caused a `FileNotFound` error. 
## Unreleased rocPRIM-3.2.0 for ROCm 6.2.0 diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp index 7d10630fd..7dd9faf61 100644 --- a/benchmark/benchmark_block_run_length_decode.cpp +++ b/benchmark/benchmark_block_run_length_decode.cpp @@ -171,36 +171,40 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ - benchmark::RegisterBenchmark("block_run_length_decode", \ - &run_benchmark, \ - stream, \ - size) +#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ + benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:block,algo:run_length_decode" \ + ",item_type:" #IT \ + ",offset_type:" #OT \ + ",min_run_length:" #MINRL \ + ",max_run_length:" #MAXRL \ + ",cfg:{block_size:" #BS \ + ",run_per_thread:" #RPT \ + ",decoded_items_per_thread:" #DIPT \ + "}}" \ + ).c_str(), \ + &run_benchmark, \ + stream, \ + size) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); + bench_naming::set_format(parser.get("name_format")); - std::cout << "benchmark_block_run_length_decode" << std::endl; - - // HIP + // // HIP hipStream_t stream = 0; // default - hipDeviceProp_t devProp; - int device_id = 0; - HIP_CHECK(hipGetDevice(&device_id)); - HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); - std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks{ diff --git a/benchmark/benchmark_config_dispatch.cpp 
b/benchmark/benchmark_config_dispatch.cpp index 2b9f917e6..298e5e55c 100644 --- a/benchmark/benchmark_config_dispatch.cpp +++ b/benchmark/benchmark_config_dispatch.cpp @@ -1,6 +1,6 @@ #include "benchmark_utils.hpp" - +#include "cmdparser.hpp" #include #include @@ -9,6 +9,11 @@ #include +#ifndef DEFAULT_N +const size_t DEFAULT_N = 1024 * 1024 * 32; +#endif + + enum class stream_kind { default_stream, @@ -61,15 +66,64 @@ static void BM_kernel_launch(benchmark::State& state) hipStreamSynchronize(stream); } -BENCHMARK_CAPTURE(BM_host_target_arch, default_stream, stream_kind::default_stream); -BENCHMARK_CAPTURE(BM_host_target_arch, per_thread_stream, stream_kind::per_thread_stream); -BENCHMARK_CAPTURE(BM_host_target_arch, explicit_stream, stream_kind::explicit_stream); -BENCHMARK_CAPTURE(BM_host_target_arch, async_stream, stream_kind::async_stream); -BENCHMARK(BM_kernel_launch); +#define CREATE_BENCHMARK(ST, SK) \ + benchmark::RegisterBenchmark( \ + bench_naming::format_name( \ + "{lvl:na" \ + ",algo:" #ST \ + ",cfg:default_config}" \ + ).c_str(), \ + &BM_host_target_arch, \ + SK \ + ) \ + int main(int argc, char** argv) { + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); + parser.run_and_exit_if_error(); + + // Parse argv benchmark::Initialize(&argc, argv); - add_common_benchmark_info(); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + bench_naming::set_format(parser.get("name_format")); + + + // HIP + + std::vector benchmarks{ + CREATE_BENCHMARK(default_stream, stream_kind::default_stream), + CREATE_BENCHMARK(per_thread_stream, stream_kind::per_thread_stream), + CREATE_BENCHMARK(explicit_stream, stream_kind::explicit_stream), + CREATE_BENCHMARK(async_stream, stream_kind::async_stream) + }; + + + // Use manual timing 
+ for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } + } + + // Run benchmarks benchmark::RunSpecifiedBenchmarks(); -} \ No newline at end of file + return 0; + +} diff --git a/benchmark/benchmark_device_scan_by_key.cpp b/benchmark/benchmark_device_scan_by_key.cpp index 7e8cfface..956c0af38 100644 --- a/benchmark/benchmark_device_scan_by_key.cpp +++ b/benchmark/benchmark_device_scan_by_key.cpp @@ -71,6 +71,7 @@ int main(int argc, char* argv[]) "name_format", "human", "either: json,human,txt"); + #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance",