Skip to content

Commit

Permalink
Merge branch 'intel-pacsvm' into 'master'
Browse files Browse the repository at this point in the history
Add Intel SVM support to all benchmarks

See merge request pc2/HPCC_FPGA!14
  • Loading branch information
Marius Meyer committed Jun 3, 2020
2 parents ea9d696 + 48d5eed commit b3f5156
Show file tree
Hide file tree
Showing 33 changed files with 607 additions and 140 deletions.
1 change: 1 addition & 0 deletions FFT/src/common/parameters.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#define LOG_FFT_SIZE @LOG_FFT_SIZE@
#define FFT_UNROLL @FFT_UNROLL@

#cmakedefine USE_SVM
/*
Short description of the program.
Moreover the version and build time is also compiled into the description.
Expand Down
3 changes: 3 additions & 0 deletions FFT/src/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ if (INTELFPGAOPENCL_FOUND)
target_link_libraries(${LIB_NAME}_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}")
target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base)
target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel)
if (USE_SVM)
target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0)
endif()
target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA)
target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}")
add_test(NAME test_intel_host_executable COMMAND $<TARGET_FILE:${HOST_EXE_NAME}_intel> -h)
Expand Down
2 changes: 1 addition & 1 deletion FFT/src/host/execution.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ simple exchange of the different calculation methods.
@return The resulting matrix
*/
std::unique_ptr<fft::FFTExecutionTimings>
calculate(hpcc_base::ExecutionSettings<fft::FFTProgramSettings> const& config, std::complex<HOST_DATA_TYPE>* data, unsigned iterations, bool inverse);
calculate(hpcc_base::ExecutionSettings<fft::FFTProgramSettings> const& config, std::complex<HOST_DATA_TYPE>* data, std::complex<HOST_DATA_TYPE>* data_out, unsigned iterations, bool inverse);

} // namespace bm_execution

Expand Down
37 changes: 32 additions & 5 deletions FFT/src/host/execution_default.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,26 +42,45 @@ namespace bm_execution {
std::unique_ptr<fft::FFTExecutionTimings>
calculate(hpcc_base::ExecutionSettings<fft::FFTProgramSettings> const& config,
std::complex<HOST_DATA_TYPE>* data,
std::complex<HOST_DATA_TYPE>* data_out,
unsigned iterations,
bool inverse) {

cl::Buffer inBuffer = cl::Buffer(*config.context, CL_MEM_WRITE_ONLY, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE));
cl::Buffer outBuffer = cl::Buffer(*config.context, CL_MEM_READ_ONLY, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE));

cl::Kernel fetchKernel(*config.program, FETCH_KERNEL_NAME);

fetchKernel.setArg(0, inBuffer);

cl::Kernel fftKernel(*config.program, FFT_KERNEL_NAME);

#ifdef USE_SVM
clSetKernelArgSVMPointer(fetchKernel(), 0,
reinterpret_cast<void*>(data));
clSetKernelArgSVMPointer(fftKernel(), 0,
reinterpret_cast<void*>(data_out));
#else
fetchKernel.setArg(0, inBuffer);
fftKernel.setArg(0, outBuffer);
#endif
fftKernel.setArg(1, iterations);
fftKernel.setArg(2, static_cast<cl_int>(inverse));

cl::CommandQueue fetchQueue(*config.context);
cl::CommandQueue fftQueue(*config.context);

#ifdef USE_SVM
clEnqueueSVMMap(fetchQueue(), CL_TRUE,
CL_MAP_READ,
reinterpret_cast<void *>(data),
(1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), 0,
NULL, NULL);
clEnqueueSVMMap(fftQueue(), CL_TRUE,
CL_MAP_WRITE,
reinterpret_cast<void *>(data_out),
(1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), 0,
NULL, NULL);
#else
fetchQueue.enqueueWriteBuffer(inBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data);
#endif

std::vector<double> calculationTimings;
for (uint r =0; r < config.programSettings->numRepetitions; r++) {
Expand All @@ -77,8 +96,16 @@ namespace bm_execution {
(endCalculation - startCalculation);
calculationTimings.push_back(calculationTime.count());
}

fetchQueue.enqueueReadBuffer(outBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data);
#ifdef USE_SVM
clEnqueueSVMUnmap(fetchQueue(),
reinterpret_cast<void *>(data), 0,
NULL, NULL);
clEnqueueSVMUnmap(fftQueue(),
reinterpret_cast<void *>(data_out), 0,
NULL, NULL);
#else
fetchQueue.enqueueReadBuffer(outBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data_out);
#endif

std::unique_ptr<fft::FFTExecutionTimings> result(new fft::FFTExecutionTimings{
calculationTimings
Expand Down
39 changes: 32 additions & 7 deletions FFT/src/host/fft_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,30 @@ fft::FFTProgramSettings::getSettingsMap() {
return map;
}

/**
 * @brief Allocate the input and output buffers used for the FFT calculation.
 *
 * Each buffer holds iterations * (1 << LOG_FFT_SIZE) complex values.
 * If USE_SVM is defined, the buffers are allocated with clSVMAlloc on the
 * given context using an alignment of 1024 bytes. Otherwise they are
 * allocated with posix_memalign using an alignment of 64 bytes.
 *
 * If an allocation fails, the affected pointer is nullptr afterwards
 * (clSVMAlloc reports failure by returning NULL; the posix_memalign path
 * resets the pointer explicitly), so the destructor can always release it.
 *
 * @param context The OpenCL context used to allocate memory in SVM mode
 * @param iterations Number of FFT data sets stored sequentially in each buffer
 */
fft::FFTData::FFTData(cl::Context context, uint iterations) : context(context) {
    // Begin the product with sizeof() so that every multiplication is carried
    // out in size_t. The previous operand order multiplied two 32-bit values
    // first, which can wrap around for large iteration counts.
    const auto buffer_bytes = sizeof(std::complex<HOST_DATA_TYPE>) * iterations * (1 << LOG_FFT_SIZE);
#ifdef USE_SVM
    data = reinterpret_cast<std::complex<HOST_DATA_TYPE>*>(
                clSVMAlloc(context(), 0, buffer_bytes, 1024));
    data_out = reinterpret_cast<std::complex<HOST_DATA_TYPE>*>(
                clSVMAlloc(context(), 0, buffer_bytes, 1024));
#else
    // posix_memalign may leave the target pointer untouched on failure.
    // Reset it so that the members never hold an indeterminate value.
    if (posix_memalign(reinterpret_cast<void**>(&data), 64, buffer_bytes) != 0) {
        data = nullptr;
    }
    if (posix_memalign(reinterpret_cast<void**>(&data_out), 64, buffer_bytes) != 0) {
        data_out = nullptr;
    }
#endif
}

/**
 * @brief Release the input and output buffers of the FFT data object.
 *
 * The buffers are returned with the counterpart of the allocation function:
 * free() for host memory, clSVMFree() on the stored context in SVM mode.
 */
fft::FFTData::~FFTData() {
#ifndef USE_SVM
    // Plain host allocations
    free(data);
    free(data_out);
#else
    // SVM allocations have to be released through the owning context
    clSVMFree(context(), data);
    clSVMFree(context(), data_out);
#endif
}

fft::FFTBenchmark::FFTBenchmark(int argc, char* argv[]) {
setupBenchmark(argc, argv);
}
Expand All @@ -63,7 +87,7 @@ fft::FFTBenchmark::addAdditionalParseOptions(cxxopts::Options &options) {

std::unique_ptr<fft::FFTExecutionTimings>
fft::FFTBenchmark::executeKernel(FFTData &data) {
return bm_execution::calculate(*executionSettings, data.data,executionSettings->programSettings->iterations,
return bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations,
executionSettings->programSettings->inverse);
}

Expand All @@ -85,33 +109,34 @@ fft::FFTBenchmark::printResults(const fft::FFTExecutionTimings &output) {

std::unique_ptr<fft::FFTData>
fft::FFTBenchmark::generateInputData() {
auto d = std::unique_ptr<fft::FFTData>(new fft::FFTData(executionSettings->programSettings->iterations));
auto d = std::unique_ptr<fft::FFTData>(new fft::FFTData(*executionSettings->context, executionSettings->programSettings->iterations));
std::mt19937 gen(0);
auto dis = std::uniform_real_distribution<HOST_DATA_TYPE>(-1.0, 1.0);
for (int i=0; i< executionSettings->programSettings->iterations * (1 << LOG_FFT_SIZE); i++) {
d->data[i].real(dis(gen));
d->data[i].imag(dis(gen));
d->data_out[i].real(0.0);
d->data_out[i].imag(0.0);
}
return d;
}

bool
fft::FFTBenchmark::validateOutputAndPrintError(fft::FFTData &data) {
auto verify_data = generateInputData();
double residual_max = 0;
for (int i = 0; i < executionSettings->programSettings->iterations; i++) {
// we have to bit reverse the output data of the FPGA kernel, since it will be provided in bit-reversed order.
// Directly applying iFFT on the data would thus not form the identity function we want to have for verification.
// TODO: This might need to be changed for other FPGA implementations that return the data in correct order
fft::bit_reverse(&data.data[i * (1 << LOG_FFT_SIZE)], 1);
fft::fourier_transform_gold(true, LOG_FFT_SIZE, &data.data[i * (1 << LOG_FFT_SIZE)]);
fft::bit_reverse(&data.data_out[i * (1 << LOG_FFT_SIZE)], 1);
fft::fourier_transform_gold(true, LOG_FFT_SIZE, &data.data_out[i * (1 << LOG_FFT_SIZE)]);

// Normalize the data after applying iFFT
for (int j = 0; j < (1 << LOG_FFT_SIZE); j++) {
data.data[i * (1 << LOG_FFT_SIZE) + j] /= (1 << LOG_FFT_SIZE);
data.data_out[i * (1 << LOG_FFT_SIZE) + j] /= (1 << LOG_FFT_SIZE);
}
for (int j = 0; j < (1 << LOG_FFT_SIZE); j++) {
double tmp_error = std::abs(verify_data->data[i * (1 << LOG_FFT_SIZE) + j] - data.data[i * (1 << LOG_FFT_SIZE) + j]);
double tmp_error = std::abs(data.data[i * (1 << LOG_FFT_SIZE) + j] - data.data_out[i * (1 << LOG_FFT_SIZE) + j]);
residual_max = residual_max > tmp_error ? residual_max : tmp_error;
}
}
Expand Down
23 changes: 16 additions & 7 deletions FFT/src/host/fft_benchmark.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,27 +80,36 @@ class FFTData {
public:

/**
* @brief The data array used ofr the FFT calculation
* @brief The data array used as input of the FFT calculation
*
*/
std::complex<HOST_DATA_TYPE>* data;

/**
* @brief The data array used as output of the FFT calculation
*
*/
std::complex<HOST_DATA_TYPE>* data_out;

/**
* @brief The context that is used to allocate memory in SVM mode
*
*/
cl::Context context;

/**
* @brief Construct a new FFT Data object
*
* @param context The OpenCL context used to allocate memory in SVM mode
* @param iterations Number of FFT data that will be stored sequentially in the array
*/
FFTData(uint iterations) {
posix_memalign(reinterpret_cast<void**>(&data), 64, iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex<HOST_DATA_TYPE>));
}
FFTData(cl::Context context, uint iterations);

/**
* @brief Destroy the FFT Data object. Free the allocated memory
*
*/
~FFTData() {
free(data);
}
~FFTData();

};

Expand Down
40 changes: 23 additions & 17 deletions FFT/tests/test_execution_functionality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ TEST_F(FFTKernelTest, FFTReturnsZero) {
}
auto result = bm->executeKernel(*data);
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
EXPECT_FLOAT_EQ(std::abs(data->data[i]), 0.0);
EXPECT_FLOAT_EQ(std::abs(data->data_out[i]), 0.0);
}
}

Expand All @@ -72,11 +72,11 @@ TEST_F(FFTKernelTest, FFTCloseToZeroForAll1And1) {
data->data[i].imag(1.0);
}
auto result = bm->executeKernel(*data);
EXPECT_NEAR(data->data[0].real(), (1 << LOG_FFT_SIZE), 0.00001);
EXPECT_NEAR(data->data[0].imag(), (1 << LOG_FFT_SIZE), 0.00001);
EXPECT_NEAR(data->data_out[0].real(), (1 << LOG_FFT_SIZE), 0.00001);
EXPECT_NEAR(data->data_out[0].imag(), (1 << LOG_FFT_SIZE), 0.00001);
for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001);
EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001);
EXPECT_NEAR(data->data_out[i].real(), 0.0, 0.00001);
EXPECT_NEAR(data->data_out[i].imag(), 0.0, 0.00001);
}
}

Expand All @@ -90,11 +90,11 @@ TEST_F(FFTKernelTest, IFFTCloseToZeroForAll1And1) {
data->data[i].imag(0.0);
}
auto result = bm->executeKernel(*data);
EXPECT_NEAR(data->data[0].real(), static_cast<HOST_DATA_TYPE>(1 << LOG_FFT_SIZE), 0.00001);
EXPECT_NEAR(data->data[0].imag(), 0.0, 0.00001);
EXPECT_NEAR(data->data_out[0].real(), static_cast<HOST_DATA_TYPE>(1 << LOG_FFT_SIZE), 0.00001);
EXPECT_NEAR(data->data_out[0].imag(), 0.0, 0.00001);
for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001);
EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001);
EXPECT_NEAR(data->data_out[i].real(), 0.0, 0.00001);
EXPECT_NEAR(data->data_out[i].imag(), 0.0, 0.00001);
}
}

Expand All @@ -108,18 +108,24 @@ TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) {

// Normalize iFFT result
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
data->data[i] /= (1 << LOG_FFT_SIZE);
data->data_out[i] /= (1 << LOG_FFT_SIZE);
}

// Need to again bit reverse input for iFFT
fft::bit_reverse(data->data, 1);
fft::bit_reverse(data->data_out, 1);

// Copy to input buffer for iFFT
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
data->data[i] = data->data_out[i];
}

bm->getExecutionSettings().programSettings->inverse = true;
auto result2 = bm->executeKernel(*data);
// Since the data was already sorted by the iFFT, the bit reversal of the kernel has to be undone
fft::bit_reverse(data->data, 1);
fft::bit_reverse(data->data_out, 1);

for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
EXPECT_NEAR(std::abs(data->data[i]), std::abs(verify_data->data[i]), 0.001);
EXPECT_NEAR(std::abs(data->data_out[i]), std::abs(verify_data->data[i]), 0.001);
}
}

Expand All @@ -136,10 +142,10 @@ TEST_F(FFTKernelTest, FPGAFFTAndCPUFFTGiveSameResults) {

// Subtract the reference data so that only the residual error remains
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
data->data[i] -= verify_data->data[i];
data->data_out[i] -= verify_data->data[i];
}
for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
EXPECT_NEAR(std::abs(data->data[i]), 0.0, 0.001);
EXPECT_NEAR(std::abs(data->data_out[i]), 0.0, 0.001);
}
}

Expand All @@ -157,9 +163,9 @@ TEST_F(FFTKernelTest, FPGAiFFTAndCPUiFFTGiveSameResults) {

// Subtract the reference data so that only the residual error remains
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
data->data[i] -= verify_data->data[i];
data->data_out[i] -= verify_data->data[i];
}
for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
EXPECT_NEAR(std::abs(data->data[i]), 0.0, 0.001);
EXPECT_NEAR(std::abs(data->data_out[i]), 0.0, 0.001);
}
}
2 changes: 2 additions & 0 deletions GEMM/src/common/parameters.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#define HOST_DATA_TYPE @HOST_DATA_TYPE@
#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@

#cmakedefine USE_SVM

/*
Short description of the program
*/
Expand Down
Loading

0 comments on commit b3f5156

Please sign in to comment.