Skip to content

Commit

Permalink
VITIS-11829 Xbutil: Separate throughput and latency from verify as in…
Browse files Browse the repository at this point in the history
…dividual microbenchmarks (#8154)

* Separate throughput and latency

Signed-off-by: AShivangi <[email protected]>

* fix

Signed-off-by: AShivangi <[email protected]>

---------

Signed-off-by: AShivangi <[email protected]>
  • Loading branch information
AShivangi authored May 13, 2024
1 parent 03825c0 commit 1457461
Show file tree
Hide file tree
Showing 7 changed files with 162 additions and 67 deletions.
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.

// ------ I N C L U D E F I L E S -------------------------------------------
// Local - Include Files
#include "TestIPU.h"
#include "TestNPULatency.h"
#include "tools/common/XBUtilities.h"
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
Expand All @@ -16,14 +16,14 @@ namespace XBU = XBUtilities;
static constexpr size_t host_app = 1; //opcode
static constexpr size_t buffer_size = 20;
static constexpr int itr_count = 10000;
static constexpr int itr_count_throughput = itr_count/4;

// ----- C L A S S M E T H O D S -------------------------------------------
TestIPU::TestIPU()
: TestRunner("verify", "Run end-to-end latency and throughput test on NPU")
TestNPULatency::TestNPULatency()
: TestRunner("latency", "Run end-to-end latency test")
{}

boost::property_tree::ptree
TestIPU::run(std::shared_ptr<xrt_core::device> dev)
TestNPULatency::run(std::shared_ptr<xrt_core::device> dev)
{
boost::property_tree::ptree ptree = get_test_header();

Expand Down Expand Up @@ -90,7 +90,7 @@ TestIPU::run(std::shared_ptr<xrt_core::device> dev)
logger(ptree, "Details", boost::str(boost::format("Instruction size: '%f' bytes") % buffer_size));
logger(ptree, "Details", boost::str(boost::format("No. of iterations: '%f'") % itr_count));

// First run the test to compute latency where we submit one job at a time and wait for its completion before
// Run the test to compute latency where we submit one job at a time and wait for its completion before
// we submit the next one
float elapsedSecs = 0.0;

Expand All @@ -110,39 +110,8 @@ TestIPU::run(std::shared_ptr<xrt_core::device> dev)
}

// Calculate end-to-end latency of one job execution
// if (elapsedSecs == 0.0) then it means we had an exception and we will report 0.0 for latency
const float latency = (elapsedSecs / itr_count) * 1000000; //convert s to us

// Next we run the test to compute throughput where we saturate NPU with jobs and then wait for all
// completions at the end
std::array<xrt::run, itr_count_throughput> runhandles;

// A value of 0.0 indicates there was an exception
elapsedSecs = 0.0;

try {
auto start = std::chrono::high_resolution_clock::now();
for (auto & hand : runhandles)
hand = testker(host_app, bo_ifm, bo_param, bo_ofm, bo_inter, bo_instr, buffer_size, bo_mc);
for (const auto& hand: runhandles)
hand.wait2();
auto end = std::chrono::high_resolution_clock::now();
elapsedSecs = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();
}
catch (const std::exception& ex) {
logger(ptree, "Error", ex.what());
ptree.put("status", test_token_failed);
}

// Now compute the throughput
// if (elapsedSecs == 0.0) then it means we had an exception and we will report 0.0 for throughput
const double throughput = (elapsedSecs != 0.0) ? runhandles.size() / elapsedSecs : 0.0;

// Do we need to store 'Total duration'?"
// logger(ptree, "Details", boost::str(boost::format("Total duration: '%.1f's") % elapsedSecs));
logger(ptree, "Details", boost::str(boost::format("Average throughput: '%.1f' ops/s") % throughput));
logger(ptree, "Details", boost::str(boost::format("Average latency: '%.1f' us") % latency));

ptree.put("status", test_token_passed);
return ptree;
}
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.

#ifndef __TestIPU_h_
#define __TestIPU_h_
#ifndef __TestNPULatency_h_
#define __TestNPULatency_h_

#include "tools/common/TestRunner.h"
#include "xrt/xrt_device.h"

class TestIPU : public TestRunner {
class TestNPULatency : public TestRunner {
public:
boost::property_tree::ptree run(std::shared_ptr<xrt_core::device> dev);

public:
TestIPU();
TestNPULatency();
};

#endif
117 changes: 117 additions & 0 deletions src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.

// ------ I N C L U D E F I L E S -------------------------------------------
// Local - Include Files
#include "TestNPUThroughput.h"
#include "tools/common/XBUtilities.h"
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_hw_context.h"
#include "xrt/xrt_kernel.h"
namespace XBU = XBUtilities;

#include <filesystem>

// Opcode passed as the first argument of every kernel invocation.
static constexpr size_t host_app{1};
// Size used for every buffer object the test allocates; also reported as the instruction size in bytes.
static constexpr size_t buffer_size{20};
// Number of kernel runs submitted back-to-back for one throughput measurement.
static constexpr int itr_count_throughput{2500};
// ----- C L A S S M E T H O D S -------------------------------------------
// Hands the test name ("throughput") and its one-line description to the TestRunner base.
TestNPUThroughput::TestNPUThroughput()
  : TestRunner("throughput",
               "Run end-to-end throughput test")
{
}

/**
 * Run the NPU throughput microbenchmark.
 *
 * Loads the validate xclbin for the device, locates the first kernel whose
 * name starts with "DPU", submits `itr_count_throughput` runs of that kernel
 * without waiting in between, then waits for all of them and reports the
 * average number of completed runs per second.
 *
 * @param dev  Core device handle the test is executed on.
 * @return     Property tree holding the test header, the logged details and
 *             the final "status" token. The status is only set to passed when
 *             every run was submitted and completed without an exception.
 */
boost::property_tree::ptree
TestNPUThroughput::run(std::shared_ptr<xrt_core::device> dev)
{
  boost::property_tree::ptree ptree = get_test_header();

  const auto xclbin_name = xrt_core::device_query<xrt_core::query::xclbin_name>(dev, xrt_core::query::xclbin_name::type::validate);
  auto xclbin_path = findPlatformFile(xclbin_name, ptree);
  // NOTE(review): no status is written here; presumably findPlatformFile has
  // already recorded the reason in ptree -- confirm against TestRunner.
  if (!std::filesystem::exists(xclbin_path))
    return ptree;

  logger(ptree, "Xclbin", xclbin_path);

  xrt::xclbin xclbin;
  try {
    xclbin = xrt::xclbin(xclbin_path);
  }
  catch (const std::runtime_error& ex) {
    logger(ptree, "Error", ex.what());
    ptree.put("status", test_token_failed);
    return ptree;
  }

  // Determine The DPU Kernel Name
  auto xkernels = xclbin.get_kernels();

  auto itr = std::find_if(xkernels.begin(), xkernels.end(), [](const xrt::xclbin::kernel& k) {
    auto name = k.get_name();
    return name.rfind("DPU",0) == 0; // Starts with "DPU"
  });

  xrt::xclbin::kernel xkernel;
  if (itr!=xkernels.end())
    xkernel = *itr;
  else {
    logger(ptree, "Error", "No kernel with `DPU` found in the xclbin");
    ptree.put("status", test_token_failed);
    return ptree;
  }
  auto kernelName = xkernel.get_name();
  logger(ptree, "Details", boost::str(boost::format("Kernel name is '%s'") % kernelName));

  auto working_dev = xrt::device(dev);
  working_dev.register_xclbin(xclbin);
  xrt::hw_context hwctx{working_dev, xclbin.get_uuid()};
  xrt::kernel testker{hwctx, kernelName};

  //Create BOs, the values are not initialized as they are not really used by this special test running on the device
  // Argument 0 of the kernel call below is the opcode, so buffer arguments start at index 1.
  int argno = 1;
  xrt::bo bo_ifm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
  xrt::bo bo_param(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
  xrt::bo bo_ofm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
  xrt::bo bo_inter(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
  xrt::bo bo_instr(working_dev, buffer_size, XCL_BO_FLAGS_CACHEABLE, testker.group_id(argno++));
  // Skip the index occupied by the scalar buffer_size argument in the kernel call; it has no buffer object.
  argno++;
  xrt::bo bo_mc(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
  //Create ctrlcode with NOPs
  std::memset(bo_instr.map<char*>(), 0, buffer_size);

  //Sync BOs
  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
  bo_ifm.sync(XCL_BO_SYNC_BO_TO_DEVICE);
  bo_param.sync(XCL_BO_SYNC_BO_TO_DEVICE);
  bo_mc.sync(XCL_BO_SYNC_BO_TO_DEVICE);

  //Log
  logger(ptree, "Details", boost::str(boost::format("Instruction size: '%f' bytes") % buffer_size));
  logger(ptree, "Details", boost::str(boost::format("No. of iterations: '%f'") % itr_count_throughput));

  // Run the test to compute throughput where we saturate NPU with jobs and then wait for all
  // completions at the end
  float elapsedSecs = 0.0;
  std::array<xrt::run, itr_count_throughput> runhandles;

  try {
    auto start = std::chrono::high_resolution_clock::now();
    for (auto & hand : runhandles)
      hand = testker(host_app, bo_ifm, bo_param, bo_ofm, bo_inter, bo_instr, buffer_size, bo_mc);
    for (const auto& hand: runhandles)
      hand.wait2();
    auto end = std::chrono::high_resolution_clock::now();
    elapsedSecs = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();
  }
  catch (const std::exception& ex) {
    logger(ptree, "Error", ex.what());
    ptree.put("status", test_token_failed);
    // Stop here: falling through would overwrite the failed status with
    // test_token_passed below and report a meaningless throughput figure.
    return ptree;
  }

  // Compute the throughput; guard against a zero-length measurement to avoid dividing by zero
  const double throughput = (elapsedSecs != 0.0) ? runhandles.size() / elapsedSecs : 0.0;

  logger(ptree, "Details", boost::str(boost::format("Average throughput: '%.1f' ops/s") % throughput));
  ptree.put("status", test_token_passed);
  return ptree;
}
18 changes: 18 additions & 0 deletions src/runtime_src/core/tools/common/tests/TestNPUThroughput.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.

#ifndef __TestNPUThroughput_h_
#define __TestNPUThroughput_h_

#include "tools/common/TestRunner.h"
#include "xrt/xrt_device.h"

// Test runner for the "throughput" validate test (end-to-end throughput measurement).
class TestNPUThroughput : public TestRunner {
public:
  TestNPUThroughput();

  // Executes the test on `dev` and returns the property tree holding its log and status.
  boost::property_tree::ptree run(std::shared_ptr<xrt_core::device> dev);
};

#endif
31 changes: 9 additions & 22 deletions src/runtime_src/core/tools/common/tests/TestVerify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
// ------ I N C L U D E F I L E S -------------------------------------------
// Local - Include Files
#include "TestVerify.h"
#include "TestIPU.h"
#include "tools/common/XBUtilities.h"
namespace XBU = XBUtilities;

#include <filesystem>
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"
#define LENGTH 64

static constexpr size_t buffer_size = 64;

// ----- C L A S S M E T H O D S -------------------------------------------
TestVerify::TestVerify()
Expand All @@ -25,28 +25,14 @@ boost::property_tree::ptree
TestVerify::run(std::shared_ptr<xrt_core::device> dev)
{
boost::property_tree::ptree ptree;
switch (xrt_core::device_query<xrt_core::query::device_class>(dev)) {
case xrt_core::query::device_class::type::ryzen:
ptree = TestIPU{}.run(dev);
break;
case xrt_core::query::device_class::type::alveo:
ptree = get_test_header();
runTest(dev, ptree);
break;
}
return ptree;
}

void
TestVerify::runTest(std::shared_ptr<xrt_core::device> dev, boost::property_tree::ptree& ptree)
{
ptree = get_test_header();
xrt::device device(dev);

const std::string test_path = findPlatformPath(dev, ptree);
if (test_path.empty()) {
logger(ptree, "Error", "Platform test path was not found.");
ptree.put("status", test_token_failed);
return;
return ptree;
}

const std::string b_file = findXclbinPath(dev, ptree);
Expand All @@ -57,7 +43,7 @@ TestVerify::runTest(std::shared_ptr<xrt_core::device> dev, boost::property_tree:
if (!logic_uuid.empty() && !std::filesystem::exists(b_file)) {
logger(ptree, "Details", "Verify xclbin not available or shell partition is not programmed. Skipping validation.");
ptree.put("status", test_token_skipped);
return;
return ptree;
}
auto xclbin_uuid = device.load_xclbin(b_file);

Expand All @@ -70,19 +56,19 @@ TestVerify::runTest(std::shared_ptr<xrt_core::device> dev, boost::property_tree:
} catch (const std::exception&) {
logger(ptree, "Error", "Kernel could not be found.");
ptree.put("status", test_token_failed);
return;
return ptree;
}
}

// Allocate the output buffer to hold the kernel output
auto output_buffer = xrt::bo(device, sizeof(char) * LENGTH, krnl.group_id(0));
auto output_buffer = xrt::bo(device, sizeof(char) * buffer_size, krnl.group_id(0));

// Run the kernel and store its contents within the allocated output buffer
auto run = krnl(output_buffer);
run.wait();

// Prepare local buffer
char received_data[LENGTH] = {};
char received_data[buffer_size] = {};

// Acquire and read the buffer data
output_buffer.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
Expand All @@ -96,4 +82,5 @@ TestVerify::runTest(std::shared_ptr<xrt_core::device> dev, boost::property_tree:
}

ptree.put("status", test_token_passed);
return ptree;
}
6 changes: 5 additions & 1 deletion src/runtime_src/core/tools/xbutil2/SubCmdValidate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
#include "tools/common/tests/TestTCTOneColumn.h"
#include "tools/common/tests/TestTCTAllColumn.h"
#include "tools/common/tests/TestGemm.h"
#include "tools/common/tests/TestNPUThroughput.h"
#include "tools/common/tests/TestNPULatency.h"
namespace XBU = XBUtilities;

// 3rd Party Library - Include Files
Expand Down Expand Up @@ -104,7 +106,9 @@ std::vector<std::shared_ptr<TestRunner>> testSuite = {
std::make_shared<TestDF_bandwidth>(),
std::make_shared<TestTCTOneColumn>(),
std::make_shared<TestTCTAllColumn>(),
std::make_shared<TestGemm>()
std::make_shared<TestGemm>(),
std::make_shared<TestNPUThroughput>(),
std::make_shared<TestNPULatency>()
};

/*
Expand Down
2 changes: 1 addition & 1 deletion src/runtime_src/core/tools/xbutil2/xbutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ R"(
}]
},{
"validate": [{
"test": ["verify", "df-bw", "tct-one-col", "tct-all-col", "gemm"]
"test": ["latency", "throughput", "df-bw", "tct-one-col", "tct-all-col", "gemm"]
}]
}]
}]
Expand Down

0 comments on commit 1457461

Please sign in to comment.