Skip to content

Commit

Permalink
Enhance no-op test logics (#8423)
Browse files Browse the repository at this point in the history
Signed-off-by: AShivangi <[email protected]>
  • Loading branch information
AShivangi authored Sep 18, 2024
1 parent 0cf086c commit fb9dfad
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 47 deletions.
61 changes: 38 additions & 23 deletions src/runtime_src/core/tools/common/tests/TestNPULatency.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "xrt/xrt_device.h"
#include "xrt/xrt_hw_context.h"
#include "xrt/xrt_kernel.h"
#include <experimental/xrt_kernel.h>
namespace XBU = XBUtilities;

#include <filesystem>
Expand Down Expand Up @@ -80,23 +81,38 @@ TestNPULatency::run(std::shared_ptr<xrt_core::device> dev)
return ptree;
}

//Create BOs, the values are not initialized as they are not really used by this special test running on the device
int argno = 1;
xrt::bo bo_ifm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
xrt::bo bo_param(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
xrt::bo bo_ofm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
xrt::bo bo_inter(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
xrt::bo bo_instr(working_dev, buffer_size, XCL_BO_FLAGS_CACHEABLE, testker.group_id(argno++));
argno++;
xrt::bo bo_mc(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
//Create ctrlcode with NOPs
std::memset(bo_instr.map<char*>(), 0, buffer_size);

//Sync BOs
bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_ifm.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_param.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_mc.sync(XCL_BO_SYNC_BO_TO_DEVICE);
xrt::xclbin::ip cu;
for (const auto& ip : xclbin.get_ips()) {
if (ip.get_type() != xrt::xclbin::ip::ip_type::ps)
continue;
cu = ip;
break;
}

std::vector<xrt::bo> global_args;

// create a run and populate it with arguments
auto run = xrt::run(testker);
for (const auto& arg : cu.get_args()) {
auto arg_idx = static_cast<int>(arg.get_index());
if (arg.get_host_type() == "uint64_t")
run.set_arg(arg_idx, static_cast<uint64_t>(1));
else if (arg.get_host_type() == "uint32_t")
run.set_arg(arg_idx, static_cast<uint32_t>(1));
else if (arg.get_host_type().find('*') != std::string::npos) {
xrt::bo bo;

if (arg.get_name() == "instruct")
bo = xrt::bo(hwctx, arg.get_size(), xrt::bo::flags::cacheable, testker.group_id(arg_idx));
else
bo = xrt::bo(working_dev, arg.get_size(), xrt::bo::flags::host_only, testker.group_id(arg_idx));

bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
global_args.push_back(bo);
run.set_arg(arg_idx, bo);
}
}

//Log
if(XBU::getVerbose()) {
logger(ptree, "Details", boost::str(boost::format("Instruction size: '%f' bytes") % buffer_size));
Expand All @@ -105,25 +121,24 @@ TestNPULatency::run(std::shared_ptr<xrt_core::device> dev)

// Run the test to compute latency where we submit one job at a time and wait for its completion before
// we submit the next one
float elapsedSecs = 0.0;
float elapsed_secs = 0.0;

try {
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < itr_count; i++) {
auto hand = testker(host_app, bo_ifm, bo_param, bo_ofm, bo_inter, bo_instr, buffer_size, bo_mc);
// Wait for kernel to be done
hand.wait2();
run.start();
run.wait2();
}
auto end = std::chrono::high_resolution_clock::now();
elapsedSecs = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();
elapsed_secs = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();
}
catch (const std::exception& ex) {
logger(ptree, "Error", ex.what());
ptree.put("status", test_token_failed);
}

// Calculate end-to-end latency of one job execution
const float latency = (elapsedSecs / itr_count) * 1000000; //convert s to us
const float latency = (elapsed_secs / itr_count) * 1000000; //convert s to us
logger(ptree, "Details", boost::str(boost::format("Average latency: '%.1f' us") % latency));
ptree.put("status", test_token_passed);
return ptree;
Expand Down
72 changes: 48 additions & 24 deletions src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@
#include "xrt/xrt_device.h"
#include "xrt/xrt_hw_context.h"
#include "xrt/xrt_kernel.h"
#include <experimental/xrt_kernel.h>
namespace XBU = XBUtilities;

#include <filesystem>

static constexpr size_t host_app = 1; //opcode
static constexpr size_t buffer_size = 20;
static constexpr int itr_count_throughput = 2500;
static constexpr int run_buffer = 9;
static constexpr int itr_count_throughput = 2502;
// ----- C L A S S M E T H O D S -------------------------------------------
TestNPUThroughput::TestNPUThroughput()
: TestRunner("throughput", "Run end-to-end throughput test")
Expand Down Expand Up @@ -79,23 +81,41 @@ TestNPUThroughput::run(std::shared_ptr<xrt_core::device> dev)
return ptree;
}

//Create BOs, the values are not initialized as they are not really used by this special test running on the device
int argno = 1;
xrt::bo bo_ifm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
xrt::bo bo_param(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
xrt::bo bo_ofm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
xrt::bo bo_inter(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
xrt::bo bo_instr(working_dev, buffer_size, XCL_BO_FLAGS_CACHEABLE, testker.group_id(argno++));
argno++;
xrt::bo bo_mc(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
//Create ctrlcode with NOPs
std::memset(bo_instr.map<char*>(), 0, buffer_size);

//Sync BOs
bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_ifm.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_param.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_mc.sync(XCL_BO_SYNC_BO_TO_DEVICE);
xrt::xclbin::ip cu;
for (const auto& ip : xclbin.get_ips()) {
if (ip.get_type() != xrt::xclbin::ip::ip_type::ps)
continue;
cu = ip;
break;
}

// create specified number of runs and populate with arguments
std::vector<xrt::bo> global_args;
std::vector<xrt::run> run_handles;

for (int i=0; i < run_buffer; ++i) {
auto run = xrt::run(testker);
for (const auto& arg : cu.get_args()) {
auto arg_idx = static_cast<int>(arg.get_index());
if (arg.get_host_type() == "uint64_t")
run.set_arg(arg_idx, static_cast<uint64_t>(1));
else if (arg.get_host_type() == "uint32_t")
run.set_arg(arg_idx, static_cast<uint32_t>(1));
else if (arg.get_host_type().find('*') != std::string::npos) {
xrt::bo bo;

if (arg.get_name() == "instruct")
bo = xrt::bo(hwctx, arg.get_size(), xrt::bo::flags::cacheable, testker.group_id(arg_idx));
else
bo = xrt::bo(working_dev, arg.get_size(), xrt::bo::flags::host_only, testker.group_id(arg_idx));

bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
global_args.push_back(bo);
run.set_arg(arg_idx, bo);
}
}
run_handles.push_back(std::move(run));
}

//Log
if(XBU::getVerbose()) {
Expand All @@ -106,14 +126,18 @@ TestNPUThroughput::run(std::shared_ptr<xrt_core::device> dev)
// Run the test to compute throughput where we saturate NPU with jobs and then wait for all
// completions at the end
float elapsedSecs = 0.0;
std::array<xrt::run, itr_count_throughput> runhandles;

try {
auto start = std::chrono::high_resolution_clock::now();
for (auto & hand : runhandles)
hand = testker(host_app, bo_ifm, bo_param, bo_ofm, bo_inter, bo_instr, buffer_size, bo_mc);
for (const auto& hand: runhandles)
hand.wait2();
//enqueue 9 commands
for(int i = 0; i < run_buffer; i++) {
run_handles[i%run_buffer].start();
}
//wait for each command to finish, then resubmit it to the queue
for(int i = 0; i < (itr_count_throughput-run_buffer); i++) {
run_handles[i%run_buffer].wait2();
run_handles[i%run_buffer].start();
}
auto end = std::chrono::high_resolution_clock::now();
elapsedSecs = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();
}
Expand All @@ -123,7 +147,7 @@ TestNPUThroughput::run(std::shared_ptr<xrt_core::device> dev)
}

// Compute the throughput
const double throughput = (elapsedSecs != 0.0) ? runhandles.size() / elapsedSecs : 0.0;
const double throughput = (elapsedSecs != 0.0) ? itr_count_throughput / elapsedSecs : 0.0;

logger(ptree, "Details", boost::str(boost::format("Average throughput: '%.1f' ops") % throughput));
ptree.put("status", test_token_passed);
Expand Down

0 comments on commit fb9dfad

Please sign in to comment.