diff --git a/FFT/src/common/parameters.h.in b/FFT/src/common/parameters.h.in index 2e11cc4c..4d29fbaa 100644 --- a/FFT/src/common/parameters.h.in +++ b/FFT/src/common/parameters.h.in @@ -19,6 +19,7 @@ #define LOG_FFT_SIZE @LOG_FFT_SIZE@ #define FFT_UNROLL @FFT_UNROLL@ +#cmakedefine USE_SVM /* Short description of the program. Moreover the version and build time is also compiled into the description. diff --git a/FFT/src/host/CMakeLists.txt b/FFT/src/host/CMakeLists.txt index 0c505452..8568c69e 100755 --- a/FFT/src/host/CMakeLists.txt +++ b/FFT/src/host/CMakeLists.txt @@ -13,6 +13,9 @@ if (INTELFPGAOPENCL_FOUND) target_link_libraries(${LIB_NAME}_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}") target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base) target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel) + if (USE_SVM) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0) + endif() target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_intel_host_executable COMMAND $ -h) diff --git a/FFT/src/host/execution.h b/FFT/src/host/execution.h index 65800bf7..01989309 100644 --- a/FFT/src/host/execution.h +++ b/FFT/src/host/execution.h @@ -47,7 +47,7 @@ simple exchange of the different calculation methods. @return The resulting matrix */ std::unique_ptr - calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, unsigned iterations, bool inverse); + calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, std::complex* data_out, unsigned iterations, bool inverse); } // namespace bm_execution diff --git a/FFT/src/host/execution_default.cpp b/FFT/src/host/execution_default.cpp index 53a693d1..e08be365 100644 --- a/FFT/src/host/execution_default.cpp +++ b/FFT/src/host/execution_default.cpp @@ -42,6 +42,7 @@ namespace bm_execution { std::unique_ptr calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, + std::complex* data_out, unsigned iterations, bool inverse) { @@ -49,19 +50,37 @@ namespace bm_execution { cl::Buffer outBuffer = cl::Buffer(*config.context, CL_MEM_READ_ONLY, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE)); cl::Kernel fetchKernel(*config.program, FETCH_KERNEL_NAME); - - fetchKernel.setArg(0, inBuffer); - cl::Kernel fftKernel(*config.program, FFT_KERNEL_NAME); +#ifdef USE_SVM + clSetKernelArgSVMPointer(fetchKernel(), 0, + reinterpret_cast(data)); + clSetKernelArgSVMPointer(fftKernel(), 0, + reinterpret_cast(data_out)); +#else + fetchKernel.setArg(0, inBuffer); fftKernel.setArg(0, outBuffer); +#endif fftKernel.setArg(1, iterations); fftKernel.setArg(2, static_cast(inverse)); cl::CommandQueue fetchQueue(*config.context); cl::CommandQueue fftQueue(*config.context); +#ifdef USE_SVM + clEnqueueSVMMap(fetchQueue(), CL_TRUE, + CL_MAP_READ, + reinterpret_cast(data), + (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), 0, + NULL, NULL); + clEnqueueSVMMap(fftQueue(), CL_TRUE, + CL_MAP_WRITE, + reinterpret_cast(data_out), + (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), 0, + NULL, NULL); +#else fetchQueue.enqueueWriteBuffer(inBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data); +#endif std::vector calculationTimings; for (uint r =0; r < config.programSettings->numRepetitions; r++) { @@ -77,8 +96,16 @@ namespace bm_execution { (endCalculation - startCalculation); calculationTimings.push_back(calculationTime.count()); } - - fetchQueue.enqueueReadBuffer(outBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data); +#ifdef USE_SVM + clEnqueueSVMUnmap(fetchQueue(), + reinterpret_cast(data), 0, + NULL, NULL); + clEnqueueSVMUnmap(fftQueue(), + reinterpret_cast(data_out), 0, + NULL, NULL); +#else + fetchQueue.enqueueReadBuffer(outBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data_out); +#endif std::unique_ptr result(new fft::FFTExecutionTimings{ calculationTimings diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index 2a84e492..cf943c21 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -47,6 +47,30 @@ fft::FFTProgramSettings::getSettingsMap() { return map; } +fft::FFTData::FFTData(cl::Context context, uint iterations) : context(context) { +#ifdef USE_SVM + data = reinterpret_cast*>( + clSVMAlloc(context(), 0 , + iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex), 1024)); + data_out = reinterpret_cast*>( + clSVMAlloc(context(), 0 , + iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex), 1024)); +#else + posix_memalign(reinterpret_cast(&data), 64, iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex)); + posix_memalign(reinterpret_cast(&data_out), 64, iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex)); +#endif +} + +fft::FFTData::~FFTData() { +#ifdef USE_SVM + clSVMFree(context(), reinterpret_cast(data)); + clSVMFree(context(), reinterpret_cast(data_out)); +#else + free(data); + free(data_out); +#endif +} + fft::FFTBenchmark::FFTBenchmark(int argc, char* argv[]) { setupBenchmark(argc, argv); } @@ -63,7 +87,7 @@ fft::FFTBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { std::unique_ptr fft::FFTBenchmark::executeKernel(FFTData &data) { - return bm_execution::calculate(*executionSettings, data.data,executionSettings->programSettings->iterations, + return bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations, executionSettings->programSettings->inverse); } @@ -85,33 +109,34 @@ fft::FFTBenchmark::printResults(const fft::FFTExecutionTimings &output) { std::unique_ptr fft::FFTBenchmark::generateInputData() { - auto d = std::unique_ptr(new fft::FFTData(executionSettings->programSettings->iterations)); + auto d = std::unique_ptr(new fft::FFTData(*executionSettings->context, executionSettings->programSettings->iterations)); std::mt19937 gen(0); auto dis = std::uniform_real_distribution(-1.0, 1.0); for (int i=0; i< executionSettings->programSettings->iterations * (1 << LOG_FFT_SIZE); i++) { d->data[i].real(dis(gen)); d->data[i].imag(dis(gen)); + d->data_out[i].real(0.0); + d->data_out[i].imag(0.0); } return d; } bool fft::FFTBenchmark::validateOutputAndPrintError(fft::FFTData &data) { - auto verify_data = generateInputData(); double residual_max = 0; for (int i = 0; i < executionSettings->programSettings->iterations; i++) { // we have to bit reverse the output data of the FPGA kernel, since it will be provided in bit-reversed order. // Directly applying iFFT on the data would thus not form the identity function we want to have for verification. // TODO: This might need to be changed for other FPGA implementations that return the data in correct order - fft::bit_reverse(&data.data[i * (1 << LOG_FFT_SIZE)], 1); - fft::fourier_transform_gold(true, LOG_FFT_SIZE, &data.data[i * (1 << LOG_FFT_SIZE)]); + fft::bit_reverse(&data.data_out[i * (1 << LOG_FFT_SIZE)], 1); + fft::fourier_transform_gold(true, LOG_FFT_SIZE, &data.data_out[i * (1 << LOG_FFT_SIZE)]); // Normalize the data after applying iFFT for (int j = 0; j < (1 << LOG_FFT_SIZE); j++) { - data.data[i * (1 << LOG_FFT_SIZE) + j] /= (1 << LOG_FFT_SIZE); + data.data_out[i * (1 << LOG_FFT_SIZE) + j] /= (1 << LOG_FFT_SIZE); } for (int j = 0; j < (1 << LOG_FFT_SIZE); j++) { - double tmp_error = std::abs(verify_data->data[i * (1 << LOG_FFT_SIZE) + j] - data.data[i * (1 << LOG_FFT_SIZE) + j]); + double tmp_error = std::abs(data.data[i * (1 << LOG_FFT_SIZE) + j] - data.data_out[i * (1 << LOG_FFT_SIZE) + j]); residual_max = residual_max > tmp_error ? residual_max : tmp_error; } } diff --git a/FFT/src/host/fft_benchmark.hpp b/FFT/src/host/fft_benchmark.hpp index 2f1cbd17..53d8dea4 100644 --- a/FFT/src/host/fft_benchmark.hpp +++ b/FFT/src/host/fft_benchmark.hpp @@ -80,27 +80,36 @@ class FFTData { public: /** - * @brief The data array used ofr the FFT calculation + * @brief The data array used as input of the FFT calculation * */ std::complex* data; + /** + * @brief The data array used as output of the FFT calculation + * + */ + std::complex* data_out; + + /** + * @brief The context that is used to allocate memory in SVM mode + * + */ + cl::Context context; + /** * @brief Construct a new FFT Data object * + * @param context The OpenCL context used to allocate memory in SVM mode * @param iterations Number of FFT data that will be stored sequentially in the array */ - FFTData(uint iterations) { - posix_memalign(reinterpret_cast(&data), 64, iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex)); - } + FFTData(cl::Context context, uint iterations); /** * @brief Destroy the FFT Data object. Free the allocated memory * */ - ~FFTData() { - free(data); - } + ~FFTData(); }; diff --git a/FFT/tests/test_execution_functionality.cpp b/FFT/tests/test_execution_functionality.cpp index 3db9d624..cd0d85a0 100644 --- a/FFT/tests/test_execution_functionality.cpp +++ b/FFT/tests/test_execution_functionality.cpp @@ -58,7 +58,7 @@ TEST_F(FFTKernelTest, FFTReturnsZero) { } auto result = bm->executeKernel(*data); for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - EXPECT_FLOAT_EQ(std::abs(data->data[i]), 0.0); + EXPECT_FLOAT_EQ(std::abs(data->data_out[i]), 0.0); } } @@ -72,11 +72,11 @@ TEST_F(FFTKernelTest, FFTCloseToZeroForAll1And1) { data->data[i].imag(1.0); } auto result = bm->executeKernel(*data); - EXPECT_NEAR(data->data[0].real(), (1 << LOG_FFT_SIZE), 0.00001); - EXPECT_NEAR(data->data[0].imag(), (1 << LOG_FFT_SIZE), 0.00001); + EXPECT_NEAR(data->data_out[0].real(), (1 << LOG_FFT_SIZE), 0.00001); + EXPECT_NEAR(data->data_out[0].imag(), (1 << LOG_FFT_SIZE), 0.00001); for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001); - EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001); + EXPECT_NEAR(data->data_out[i].real(), 0.0, 0.00001); + EXPECT_NEAR(data->data_out[i].imag(), 0.0, 0.00001); } } @@ -90,11 +90,11 @@ TEST_F(FFTKernelTest, IFFTCloseToZeroForAll1And1) { data->data[i].imag(0.0); } auto result = bm->executeKernel(*data); - EXPECT_NEAR(data->data[0].real(), static_cast(1 << LOG_FFT_SIZE), 0.00001); - EXPECT_NEAR(data->data[0].imag(), 0.0, 0.00001); + EXPECT_NEAR(data->data_out[0].real(), static_cast(1 << LOG_FFT_SIZE), 0.00001); + EXPECT_NEAR(data->data_out[0].imag(), 0.0, 0.00001); for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001); - EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001); + EXPECT_NEAR(data->data_out[i].real(), 0.0, 0.00001); + EXPECT_NEAR(data->data_out[i].imag(), 0.0, 0.00001); } } @@ -108,18 +108,24 @@ TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) { // Normalize iFFT result for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data->data[i] /= (1 << LOG_FFT_SIZE); + data->data_out[i] /= (1 << LOG_FFT_SIZE); } // Need to again bit reverse input for iFFT - fft::bit_reverse(data->data, 1); + fft::bit_reverse(data->data_out, 1); + + // Copy to input buffer for iFFT + for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { + data->data[i] = data->data_out[i]; + } + bm->getExecutionSettings().programSettings->inverse = true; auto result2 = bm->executeKernel(*data); // Since data was already sorted by iFFT the bit reversal of the kernel has t be undone - fft::bit_reverse(data->data, 1); + fft::bit_reverse(data->data_out, 1); for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(std::abs(data->data[i]), std::abs(verify_data->data[i]), 0.001); + EXPECT_NEAR(std::abs(data->data_out[i]), std::abs(verify_data->data[i]), 0.001); } } @@ -136,10 +142,10 @@ TEST_F(FFTKernelTest, FPGAFFTAndCPUFFTGiveSameResults) { // Normalize iFFT result for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data->data[i] -= verify_data->data[i]; + data->data_out[i] -= verify_data->data[i]; } for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(std::abs(data->data[i]), 0.0, 0.001); + EXPECT_NEAR(std::abs(data->data_out[i]), 0.0, 0.001); } } @@ -157,9 +163,9 @@ TEST_F(FFTKernelTest, FPGAiFFTAndCPUiFFTGiveSameResults) { // Normalize iFFT result for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data->data[i] -= verify_data->data[i]; + data->data_out[i] -= verify_data->data[i]; } for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(std::abs(data->data[i]), 0.0, 0.001); + EXPECT_NEAR(std::abs(data->data_out[i]), 0.0, 0.001); } } diff --git a/GEMM/src/common/parameters.h.in b/GEMM/src/common/parameters.h.in index 275d63be..9686f5e9 100644 --- a/GEMM/src/common/parameters.h.in +++ b/GEMM/src/common/parameters.h.in @@ -18,6 +18,8 @@ #define HOST_DATA_TYPE @HOST_DATA_TYPE@ #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ +#cmakedefine USE_SVM + /* Short description of the program */ diff --git a/GEMM/src/host/execution_cannon.cpp b/GEMM/src/host/execution_cannon.cpp index 927997a8..e615430a 100644 --- a/GEMM/src/host/execution_cannon.cpp +++ b/GEMM/src/host/execution_cannon.cpp @@ -79,6 +79,16 @@ calculate(hpcc_base::ExecutionSettings const& config, // prepare kernels +#ifdef USE_SVM + err = clSetKernelArgSVMPointer(gemmkernel(), 0, + reinterpret_cast(a)); + err = clSetKernelArgSVMPointer(gemmkernel(), 1, + reinterpret_cast(b)); + err = clSetKernelArgSVMPointer(gemmkernel(), 2, + reinterpret_cast(c)); + err = clSetKernelArgSVMPointer(gemmkernel(), 3, + reinterpret_cast(c_out)); +#else err = gemmkernel.setArg(0, Buffer_a); ASSERT_CL(err); err = gemmkernel.setArg(1, Buffer_b); @@ -87,6 +97,7 @@ calculate(hpcc_base::ExecutionSettings const& config, ASSERT_CL(err); err = gemmkernel.setArg(3, Buffer_c_out); ASSERT_CL(err); +#endif err = gemmkernel.setArg(4, alpha); ASSERT_CL(err); err = gemmkernel.setArg(5, beta); @@ -98,7 +109,33 @@ calculate(hpcc_base::ExecutionSettings const& config, double t; std::vector executionTimes; - for (int i = 0; i < config.programSettings->matrixSize; i++) { + for (int i = 0; i < config.programSettings->numRepetitions; i++) { +#ifdef USE_SVM + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_READ, + reinterpret_cast(a), + sizeof(HOST_DATA_TYPE) * + (config.programSettings->matrixSize * config.programSettings->matrixSize), 0, + NULL, NULL); + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_READ, + reinterpret_cast(b), + sizeof(HOST_DATA_TYPE) * + (config.programSettings->matrixSize * config.programSettings->matrixSize), 0, + NULL, NULL); + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_READ, + reinterpret_cast(c), + sizeof(HOST_DATA_TYPE) * + (config.programSettings->matrixSize * config.programSettings->matrixSize), 0, + NULL, NULL); + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_WRITE, + reinterpret_cast(c_out), + sizeof(HOST_DATA_TYPE) * + (config.programSettings->matrixSize * config.programSettings->matrixSize), 0, + NULL, NULL); +#else compute_queue.enqueueWriteBuffer(Buffer_a, CL_TRUE, 0, sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, a); compute_queue.enqueueWriteBuffer(Buffer_b, CL_TRUE, 0, @@ -106,20 +143,33 @@ calculate(hpcc_base::ExecutionSettings const& config, compute_queue.enqueueWriteBuffer(Buffer_c_in, CL_TRUE, 0, sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, c); compute_queue.finish(); +#endif auto t1 = std::chrono::high_resolution_clock::now(); compute_queue.enqueueTask(gemmkernel); compute_queue.finish(); auto t2 = std::chrono::high_resolution_clock::now(); - std::chrono::duration timespan = - std::chrono::duration_cast> - (t2 - t1); + std::chrono::duration timespan = t2 - t1; executionTimes.push_back(timespan.count()); } /* --- Read back results from Device --- */ - +#ifdef USE_SVM + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(a), 0, + NULL, NULL); + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(b), 0, + NULL, NULL); + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(c), 0, + NULL, NULL); + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(c_out), 0, + NULL, NULL); +#else compute_queue.enqueueReadBuffer(Buffer_c_out, CL_TRUE, 0, sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, c_out); +#endif std::unique_ptr results( diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp index a5fe009a..beb7005d 100644 --- a/GEMM/src/host/gemm_benchmark.cpp +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -46,6 +46,42 @@ gemm::GEMMProgramSettings::getSettingsMap() { return map; } +gemm::GEMMData::GEMMData(cl::Context context, uint size) : normtotal(0.0), alpha(0.5), beta(2.0), context(context) { +#ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * size * sizeof(HOST_DATA_TYPE), 1024)); + B = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * size * sizeof(HOST_DATA_TYPE), 1024)); + C = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * size * sizeof(HOST_DATA_TYPE), 1024)); + C_out = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * size * sizeof(HOST_DATA_TYPE), 1024)); +#else + posix_memalign(reinterpret_cast(&A), 4096, size * size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&B), 4096, size * size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&C), 4096, size * size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&C_out), 4096, size * size * sizeof(HOST_DATA_TYPE)); +#endif +} + +gemm::GEMMData::~GEMMData() { +#ifdef USE_SVM + clSVMFree(context(), reinterpret_cast(A)); + clSVMFree(context(), reinterpret_cast(B)); + clSVMFree(context(), reinterpret_cast(C)); + clSVMFree(context(), reinterpret_cast(C_out)); +#else + free(A); + free(B); + free(C); + free(C_out); +#endif +} + gemm::GEMMBenchmark::GEMMBenchmark(int argc, char* argv[]) { setupBenchmark(argc, argv); } @@ -74,9 +110,9 @@ gemm::GEMMBenchmark::printResults(const gemm::GEMMExecutionTimings &output) { double tmean = 0; double tmin = std::numeric_limits::max(); - double gflops = 2.0 * static_cast(executionSettings->programSettings->matrixSize - *executionSettings->programSettings->matrixSize - *executionSettings->programSettings->matrixSize)/1.0e9; + double gflops = 2.0 * (static_cast(executionSettings->programSettings->matrixSize) + *static_cast(executionSettings->programSettings->matrixSize) + *static_cast(executionSettings->programSettings->matrixSize))/1.0e9; for (double currentTime : output.timings) { tmean += currentTime; if (currentTime < tmin) { @@ -94,7 +130,7 @@ gemm::GEMMBenchmark::printResults(const gemm::GEMMExecutionTimings &output) { std::unique_ptr gemm::GEMMBenchmark::generateInputData() { - auto d = std::unique_ptr(new gemm::GEMMData(executionSettings->programSettings->matrixSize)); + auto d = std::unique_ptr(new gemm::GEMMData(*executionSettings->context, executionSettings->programSettings->matrixSize)); std::mt19937 gen(7); std::uniform_real_distribution<> dis(-1.0, 1.0); for (int j = 0; j < executionSettings->programSettings->matrixSize; j++) { diff --git a/GEMM/src/host/gemm_benchmark.hpp b/GEMM/src/host/gemm_benchmark.hpp index 7a7ccd5c..1a632e30 100644 --- a/GEMM/src/host/gemm_benchmark.hpp +++ b/GEMM/src/host/gemm_benchmark.hpp @@ -117,6 +117,12 @@ class GEMMData { */ HOST_DATA_TYPE alpha; + /** + * @brief The context that is used to allocate memory in SVM mode + * + */ + cl::Context context; + /** * @brief The scalar value that will be used for \f$\beta\f$ in the calculation * @@ -126,25 +132,16 @@ class GEMMData { /** * @brief Construct a new GEMM Data object * + * @param context The OpenCL context used to allocate memory in SVM mode * @param size Size of the allocated square matrices */ - GEMMData(uint size) : normtotal(0.0), alpha(0.5), beta(2.0) { - posix_memalign(reinterpret_cast(&A), 4096, size * size * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&B), 4096, size * size * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&C), 4096, size * size * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&C_out), 4096, size * size * sizeof(HOST_DATA_TYPE)); - } + GEMMData(cl::Context context, uint size); /** * @brief Destroy the GEMM Data object. Free the allocated memory * */ - ~GEMMData() { - free(A); - free(B); - free(C); - free(C_out); - } + ~GEMMData(); }; diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in index 1b43c6ae..e1346397 100644 --- a/LINPACK/src/common/parameters.h.in +++ b/LINPACK/src/common/parameters.h.in @@ -20,6 +20,8 @@ #define LOCAL_MEM_BLOCK_LOG @LOCAL_MEM_BLOCK_LOG@ #define REGISTER_BLOCK_LOG @REGISTER_BLOCK_LOG@ +#cmakedefine USE_SVM + /* Short description of the program */ diff --git a/LINPACK/src/host/execution_blocked_pvt.cpp b/LINPACK/src/host/execution_blocked_pvt.cpp index c6ea99d4..0e3dcd65 100644 --- a/LINPACK/src/host/execution_blocked_pvt.cpp +++ b/LINPACK/src/host/execution_blocked_pvt.cpp @@ -66,10 +66,24 @@ calculate(const hpcc_base::ExecutionSettings&co // prepare kernels +#ifdef USE_SVM + // To prevent the reuse of the result of previous repetitions, use this + // buffer instead and copy the result back to the real buffer + HOST_DATA_TYPE* A_tmp = reinterpret_cast( + clSVMAlloc((*config.context)(), 0 , + config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE), 1024)); + + err = clSetKernelArgSVMPointer(gefakernel(), 0, + reinterpret_cast(A_tmp)); + err = clSetKernelArgSVMPointer(gefakernel(), 1, + reinterpret_cast(ipvt)); +#else err = gefakernel.setArg(0, Buffer_a); ASSERT_CL(err); err = gefakernel.setArg(1, Buffer_pivot); ASSERT_CL(err); +#endif err = gefakernel.setArg(2, static_cast(config.programSettings->matrixSize >> LOCAL_MEM_BLOCK_LOG)); ASSERT_CL(err); @@ -78,9 +92,34 @@ calculate(const hpcc_base::ExecutionSettings&co double t; std::vector executionTimes; for (int i = 0; i < config.programSettings->numRepetitions; i++) { +#ifdef USE_SVM + for (int k=0; k < config.programSettings->matrixSize * config.programSettings->matrixSize; k++) { + A_tmp[k] = A[k]; + } + + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, + reinterpret_cast(A_tmp), + sizeof(HOST_DATA_TYPE) * + (config.programSettings->matrixSize * config.programSettings->matrixSize), 0, + NULL, NULL); + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_READ, + reinterpret_cast(b), + sizeof(HOST_DATA_TYPE) * + (config.programSettings->matrixSize), 0, + NULL, NULL); + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_WRITE, + reinterpret_cast(ipvt), + sizeof(cl_int) * + (config.programSettings->matrixSize), 0, + NULL, NULL); +#else compute_queue.enqueueWriteBuffer(Buffer_a, CL_TRUE, 0, sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, A); compute_queue.finish(); +#endif auto t1 = std::chrono::high_resolution_clock::now(); compute_queue.enqueueTask(gefakernel); compute_queue.finish(); @@ -93,10 +132,29 @@ calculate(const hpcc_base::ExecutionSettings&co /* --- Read back results from Device --- */ +#ifdef USE_SVM + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(A), 0, + NULL, NULL); + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(b), 0, + NULL, NULL); + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(ipvt), 0, + NULL, NULL); + + // read back result from temporary buffer + for (int k=0; k < config.programSettings->matrixSize * config.programSettings->matrixSize; k++) { + A[k] = A_tmp[k]; + } + clSVMFree((*config.context)(), reinterpret_cast(A_tmp)); + +#else compute_queue.enqueueReadBuffer(Buffer_a, CL_TRUE, 0, sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, A); compute_queue.enqueueReadBuffer(Buffer_pivot, CL_TRUE, 0, sizeof(cl_int)*config.programSettings->matrixSize, ipvt); +#endif // Solve linear equations on CPU // TODO: This has to be done on FPGA diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index deab852b..300c8dc3 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -46,6 +46,36 @@ linpack::LinpackProgramSettings::getSettingsMap() { return map; } +linpack::LinpackData::LinpackData(cl::Context context, uint size) : norma(0.0), context(context) { +#ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * size * sizeof(HOST_DATA_TYPE), 1024)); + b = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(HOST_DATA_TYPE), 1024)); + ipvt = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(cl_int), 1024)); +#else + posix_memalign(reinterpret_cast(&A), 4096, size * size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&b), 4096, size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&ipvt), 4096, size * sizeof(cl_int)); +#endif + } + +linpack::LinpackData::~LinpackData() { +#ifdef USE_SVM + clSVMFree(context(), reinterpret_cast(A)); + clSVMFree(context(), reinterpret_cast(b)); + clSVMFree(context(), reinterpret_cast(ipvt)); +#else + free(A); + free(b); + free(ipvt); +#endif +} + linpack::LinpackBenchmark::LinpackBenchmark(int argc, char* argv[]) { setupBenchmark(argc, argv); } @@ -100,7 +130,7 @@ linpack::LinpackBenchmark::printResults(const linpack::LinpackExecutionTimings & std::unique_ptr linpack::LinpackBenchmark::generateInputData() { - auto d = std::unique_ptr(new linpack::LinpackData(executionSettings->programSettings->matrixSize)); + auto d = std::unique_ptr(new linpack::LinpackData(*executionSettings->context ,executionSettings->programSettings->matrixSize)); std::mt19937 gen(7); std::uniform_real_distribution<> dis(-1.0, 1.0); d->norma = 0.0; diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index 68efe1f6..c9e2d3c7 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -92,6 +92,12 @@ class LinpackData { */ cl_int* ipvt; + /** + * @brief The context that is used to allocate memory in SVM mode + * + */ + cl::Context context; + /** * @brief The maximum value of A that will be used for the error calculation * @@ -101,23 +107,16 @@ class LinpackData { /** * @brief Construct a new Linpack Data object * + * @param context The OpenCL context used to allocate memory in SVM mode * @param size Size of the allocated square matrix and vectors */ - LinpackData(uint size) : norma(0.0) { - posix_memalign(reinterpret_cast(&A), 4096, size * size * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&b), 4096, size * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&ipvt), 4096, size * sizeof(cl_int)); - } + LinpackData(cl::Context context, uint size); /** * @brief Destroy the Linpack Data object. Free the allocated memory * */ - ~LinpackData() { - free(A); - free(b); - free(ipvt); - } + ~LinpackData(); }; diff --git a/LINPACK/tests/CMakeLists.txt b/LINPACK/tests/CMakeLists.txt index b2f6adc6..a4383d47 100755 --- a/LINPACK/tests/CMakeLists.txt +++ b/LINPACK/tests/CMakeLists.txt @@ -25,6 +25,9 @@ if (INTELFPGAOPENCL_FOUND) target_link_libraries(${HOST_EXE_NAME}_test_intel ${LAPACK_LIBRARIES}) include_directories(SYSTEM $ENV{MKLROOT}/include) endif() + if (USE_SVM) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DCL_VERSION_2_0) + endif() add_dependencies(${HOST_EXE_NAME}_test_intel lu_blocked_pvt_emulate_intel) add_dependencies(${HOST_EXE_NAME}_test_intel lu_blocked_pvt_test_emulate_intel) target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DINTEL_FPGA) diff --git a/LINPACK/tests/test_kernel_functionality_separate_cores.cpp b/LINPACK/tests/test_kernel_functionality_separate_cores.cpp index ec580769..beed8f03 100644 --- a/LINPACK/tests/test_kernel_functionality_separate_cores.cpp +++ b/LINPACK/tests/test_kernel_functionality_separate_cores.cpp @@ -25,7 +25,25 @@ struct LinpackKernelSeparateTest : testing::Test, testing::WithParamInterface(new linpack::LinpackBenchmark(argc, argv)); array_size = (1 << LOCAL_MEM_BLOCK_LOG); + bm->getExecutionSettings().programSettings->numRepetitions = 1; bm->getExecutionSettings().programSettings->matrixSize = array_size; +#ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc((*bm->getExecutionSettings().context)(), 0 , + array_size * array_size * sizeof(HOST_DATA_TYPE), 1024)); + B = reinterpret_cast( + clSVMAlloc((*bm->getExecutionSettings().context)(), 0 , + array_size * array_size * sizeof(HOST_DATA_TYPE), 1024)); + C = reinterpret_cast( + clSVMAlloc((*bm->getExecutionSettings().context)(), 0 , + array_size * array_size * sizeof(HOST_DATA_TYPE), 1024)); + scale = reinterpret_cast( + clSVMAlloc((*bm->getExecutionSettings().context)(), 0 , + array_size * sizeof(HOST_DATA_TYPE), 1024)); + ipvt = reinterpret_cast( + clSVMAlloc((*bm->getExecutionSettings().context)(), 0 , + array_size * sizeof(cl_int), 1024)); +#else posix_memalign(reinterpret_cast(&A), 4096, sizeof(HOST_DATA_TYPE) * array_size * array_size); posix_memalign(reinterpret_cast(&B), 4096, @@ -36,6 +54,7 @@ struct LinpackKernelSeparateTest : testing::Test, testing::WithParamInterface(&ipvt), 4096, sizeof(cl_int) * array_size); +#endif } void initializeData() { @@ -83,6 +102,18 @@ struct LinpackKernelSeparateTest : testing::Test, testing::WithParamInterface(A)); + err = clSetKernelArgSVMPointer(test_c4_kernel(), 1, + reinterpret_cast(B)); + err = clSetKernelArgSVMPointer(test_c4_kernel(), 2, + reinterpret_cast(C)); + err = clSetKernelArgSVMPointer(test_c4_kernel(), 3, + reinterpret_cast(scale)); + err = clSetKernelArgSVMPointer(test_c4_kernel(), 4, + reinterpret_cast(ipvt)); +#else err = test_c4_kernel.setArg(0, Buffer_a); ASSERT_CL(err); err = test_c4_kernel.setArg(1, Buffer_b); @@ -93,6 +124,7 @@ struct LinpackKernelSeparateTest : testing::Test, testing::WithParamInterface(array_size >> LOCAL_MEM_BLOCK_LOG)); ASSERT_CL(err); @@ -101,6 +133,38 @@ struct LinpackKernelSeparateTest : testing::Test, testing::WithParamInterface executionTimes; for (int i = 0; i < bm->getExecutionSettings().programSettings->numRepetitions; i++) { +#ifdef USE_SVM + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, + reinterpret_cast(A), + sizeof(HOST_DATA_TYPE) * + (array_size * array_size), 0, + NULL, NULL); + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, + reinterpret_cast(B), + sizeof(HOST_DATA_TYPE) * + (array_size * array_size), 0, + NULL, NULL); + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, + reinterpret_cast(C), + sizeof(HOST_DATA_TYPE) * + (array_size * array_size), 0, + NULL, NULL); + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, + reinterpret_cast(scale), + sizeof(HOST_DATA_TYPE) * + (array_size * array_size), 0, + NULL, NULL); + clEnqueueSVMMap(compute_queue(), CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, + reinterpret_cast(ipvt), + sizeof(HOST_DATA_TYPE) * + (array_size * array_size), 0, + NULL, NULL); +#else compute_queue.enqueueWriteBuffer(Buffer_a, CL_TRUE, 0, sizeof(HOST_DATA_TYPE)*array_size*array_size, A); compute_queue.enqueueWriteBuffer(Buffer_b, CL_TRUE, 0, @@ -111,6 +175,7 @@ struct LinpackKernelSeparateTest : testing::Test, testing::WithParamInterface(A), 0, + NULL, NULL); + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(B), 0, + NULL, NULL); + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(C), 0, + NULL, NULL); + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(scale), 0, + NULL, NULL); + clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(ipvt), 0, + NULL, NULL); +#else compute_queue.enqueueReadBuffer(Buffer_a, CL_TRUE, 0, sizeof(HOST_DATA_TYPE)*array_size*array_size, A); compute_queue.enqueueReadBuffer(Buffer_b, CL_TRUE, 0, sizeof(HOST_DATA_TYPE)*array_size*array_size, B); compute_queue.enqueueReadBuffer(Buffer_c, CL_TRUE, 0, sizeof(HOST_DATA_TYPE)*array_size*array_size, C); +#endif } ~LinpackKernelSeparateTest() override { +#ifdef USE_SVM + clSVMFree((*bm->getExecutionSettings().context)(), reinterpret_cast(A)); + clSVMFree((*bm->getExecutionSettings().context)(), reinterpret_cast(B)); + clSVMFree((*bm->getExecutionSettings().context)(), reinterpret_cast(C)); + clSVMFree((*bm->getExecutionSettings().context)(), reinterpret_cast(scale)); + clSVMFree((*bm->getExecutionSettings().context)(), reinterpret_cast(ipvt)); +#else free(A); free(B); free(C); free(ipvt); free(scale); +#endif delete [] kernelFileName; } }; diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in index 423a96ca..82f70b47 100644 --- a/PTRANS/src/common/parameters.h.in +++ b/PTRANS/src/common/parameters.h.in @@ -17,6 +17,8 @@ #define HOST_DATA_TYPE @HOST_DATA_TYPE@ #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ +#cmakedefine USE_SVM + /* Short description of the program. Moreover the version and build time is also compiled into the description. diff --git a/PTRANS/src/host/execution_default.cpp b/PTRANS/src/host/execution_default.cpp index 83948912..df9ca2eb 100644 --- a/PTRANS/src/host/execution_default.cpp +++ b/PTRANS/src/host/execution_default.cpp @@ -52,9 +52,18 @@ namespace bm_execution { cl::Kernel transposeKernel(*config.program, KERNEL_NAME); +#ifdef USE_SVM + clSetKernelArgSVMPointer(transposeKernel(), 0, + reinterpret_cast(A)); + clSetKernelArgSVMPointer(transposeKernel(), 1, + reinterpret_cast(B)); + clSetKernelArgSVMPointer(transposeKernel(), 2, + reinterpret_cast(A_out)); +#else transposeKernel.setArg(0, bufferA); transposeKernel.setArg(1, bufferB); transposeKernel.setArg(2, bufferA_out); +#endif transposeKernel.setArg(3, config.programSettings->matrixSize / config.programSettings->blockSize); cl::CommandQueue queue(*config.context); @@ -65,10 +74,31 @@ namespace bm_execution { for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { auto startTransfer = std::chrono::high_resolution_clock::now(); +#ifdef USE_SVM + clEnqueueSVMMap(queue(), CL_TRUE, + CL_MAP_READ, + reinterpret_cast(A), + sizeof(HOST_DATA_TYPE) * + (config.programSettings->matrixSize * config.programSettings->matrixSize), 0, + NULL, NULL); + clEnqueueSVMMap(queue(), CL_TRUE, + CL_MAP_READ, + reinterpret_cast(B), + sizeof(HOST_DATA_TYPE) * + (config.programSettings->matrixSize * config.programSettings->matrixSize), 0, + NULL, NULL); + clEnqueueSVMMap(queue(), CL_TRUE, + CL_MAP_WRITE, + reinterpret_cast(A_out), + sizeof(HOST_DATA_TYPE) * + (config.programSettings->matrixSize * config.programSettings->matrixSize), 0, + NULL, NULL); +#else queue.enqueueWriteBuffer(bufferA, CL_FALSE, 0, sizeof(HOST_DATA_TYPE) * config.programSettings->matrixSize * config.programSettings->matrixSize, A); queue.enqueueWriteBuffer(bufferB, CL_FALSE, 0, sizeof(HOST_DATA_TYPE) * config.programSettings->matrixSize * config.programSettings->matrixSize, B); +#endif queue.finish(); auto endTransfer = std::chrono::high_resolution_clock::now(); std::chrono::duration transferTime = @@ -85,8 +115,20 @@ namespace bm_execution { calculationTimings.push_back(calculationTime.count()); startTransfer = std::chrono::high_resolution_clock::now(); +#ifdef USE_SVM + clEnqueueSVMUnmap(queue(), + reinterpret_cast(A), 0, + NULL, NULL); + clEnqueueSVMUnmap(queue(), + reinterpret_cast(B), 0, + NULL, NULL); + clEnqueueSVMUnmap(queue(), + reinterpret_cast(A_out), 0, + NULL, NULL); +#else queue.enqueueReadBuffer(bufferA_out, CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * config.programSettings->matrixSize * config.programSettings->matrixSize, A_out); +#endif endTransfer = std::chrono::high_resolution_clock::now(); transferTime += std::chrono::duration_cast> diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 89debac9..229de120 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -48,6 +48,39 @@ transpose::TransposeProgramSettings::getSettingsMap() { return map; } +transpose::TransposeData::TransposeData(cl::Context context, uint size) : context(context) { +#ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * size * sizeof(HOST_DATA_TYPE), 1024)); + B = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * size * sizeof(HOST_DATA_TYPE), 1024)); + result = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * size * sizeof(HOST_DATA_TYPE), 1024)); +#else + posix_memalign(reinterpret_cast(&A), 64, + sizeof(HOST_DATA_TYPE) * size * size); + posix_memalign(reinterpret_cast(&B), 64, + sizeof(HOST_DATA_TYPE) * size * size); + posix_memalign(reinterpret_cast(&result), 64, + sizeof(HOST_DATA_TYPE) * size * size); +#endif +} + +transpose::TransposeData::~TransposeData() { +#ifdef USE_SVM + clSVMFree(context(), reinterpret_cast(A)); + clSVMFree(context(), reinterpret_cast(B)); + clSVMFree(context(), reinterpret_cast(result)); +#else + free(A); + free(B); + free(result); +#endif +} + transpose::TransposeBenchmark::TransposeBenchmark(int argc, char* argv[]) { setupBenchmark(argc, argv); } @@ -102,7 +135,7 @@ transpose::TransposeBenchmark::printResults(const transpose::TransposeExecutionT std::unique_ptr transpose::TransposeBenchmark::generateInputData() { - auto d = std::unique_ptr(new transpose::TransposeData(executionSettings->programSettings->matrixSize)); + auto d = std::unique_ptr(new transpose::TransposeData(*executionSettings->context, executionSettings->programSettings->matrixSize)); std::mt19937 gen(7); std::uniform_real_distribution<> dis(-100.0, 100.0); diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 929a1aa5..68459124 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -97,29 +97,25 @@ class TransposeData { */ HOST_DATA_TYPE *result; + /** + * @brief The context that is used to allocate memory in SVM mode + * + */ + cl::Context context; + /** * @brief Construct a new Transpose Data object * + * @param context Context that is used to allocate memory for SVM * @param size Size of the allocated square matrices */ - TransposeData(uint size) { - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * size * size); - posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * size * size); - posix_memalign(reinterpret_cast(&result), 64, - sizeof(HOST_DATA_TYPE) * size * size); - } + TransposeData(cl::Context context, uint size); /** * @brief Destroy the Transpose Data object. Free the allocated memory * */ - ~TransposeData() { - free(A); - free(B); - free(result); - } + ~TransposeData(); }; diff --git a/RandomAccess/CMakeLists.txt b/RandomAccess/CMakeLists.txt index 0569be21..829f8dd0 100755 --- a/RandomAccess/CMakeLists.txt +++ b/RandomAccess/CMakeLists.txt @@ -7,7 +7,6 @@ set(PARALLEL_MEM_ACCESSES 1 CACHE STRING "Unrolling factor that is used for all set(NUM_REPLICATIONS 4 CACHE STRING "Number of times the kernels will be replicated") set(DEVICE_BUFFER_SIZE 1 CACHE STRING "Buffer size in number of values that is used within the single kernel implementation.") set(COMBINE_LOOPS Yes CACHE BOOL "If enabled this will combine the address calculation loop and the load darta loop to a single loop. This can improve the performance when all loops are running sequentially") -set(USE_SVM No CACHE BOOL "Use coarse grained SVM instead of loading the buffer on the FPGA before execution. Device needs to support this feature.") set(DATA_TYPE long) set(HOST_DATA_TYPE cl_ulong) diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index cf67db60..3797c50e 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -50,6 +50,24 @@ random_access::RandomAccessProgramSettings::getSettingsMap() { return map; } +random_access::RandomAccessData::RandomAccessData(cl::Context& context, size_t size) : context(context) { +#ifdef USE_SVM + data = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(HOST_DATA_TYPE), 1024)); +#else + posix_memalign(reinterpret_cast(&data), 4096, size * sizeof(HOST_DATA_TYPE)); +#endif +} + +random_access::RandomAccessData::~RandomAccessData() { +#ifdef USE_SVM + clSVMFree(context(), reinterpret_cast(data)); +#else + free(data); +#endif +} + random_access::RandomAccessBenchmark::RandomAccessBenchmark(int argc, char* argv[]) { setupBenchmark(argc, argv); } diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp index d3c8bc79..65346415 100644 --- a/RandomAccess/src/host/random_access_benchmark.hpp +++ b/RandomAccess/src/host/random_access_benchmark.hpp @@ -86,33 +86,25 @@ class RandomAccessData { */ HOST_DATA_TYPE *data; + /** + * @brief The context that is used to allocate memory in SVM mode + * + */ + cl::Context context; + /** * @brief Construct a new Random Access Data object * * @param context The OpenCL context that will be used to allocate SVM memory * @param size The size of the allocated memory in number of values */ - RandomAccessData(cl::Context& context, size_t size) { - #ifdef USE_SVM - data = reinterpret_cast( - clSVMAlloc(context(), 0 , - size * sizeof(HOST_DATA_TYPE), 1024)); - #else - posix_memalign(reinterpret_cast(&data), 4096, size * sizeof(HOST_DATA_TYPE)); - #endif - } + RandomAccessData(cl::Context& context, size_t size); /** * @brief Destroy the Random Access Data object and free the memory allocated in the constructor * */ - ~RandomAccessData() { - #ifdef USE_SVM - clSVMFree(data); - #else - free(data); - #endif - } + ~RandomAccessData(); }; diff --git a/RandomAccess/tests/CMakeLists.txt b/RandomAccess/tests/CMakeLists.txt index eadba306..565af0dc 100755 --- a/RandomAccess/tests/CMakeLists.txt +++ b/RandomAccess/tests/CMakeLists.txt @@ -14,6 +14,9 @@ if (INTELFPGAOPENCL_FOUND) target_link_libraries(${HOST_EXE_NAME}_test_intel gtest gmock ${LIB_NAME}_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") add_dependencies(${HOST_EXE_NAME}_test_intel random_access_kernels_single_emulate_intel) target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DINTEL_FPGA) + if (USE_SVM) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DCL_VERSION_2_0) + endif() target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME ${HOST_EXE_NAME}_test_intel_single_unit COMMAND $ -f random_access_kernels_single_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/STREAM/CMakeLists.txt b/STREAM/CMakeLists.txt index 87998322..218be8b3 100755 --- a/STREAM/CMakeLists.txt +++ b/STREAM/CMakeLists.txt @@ -8,7 +8,6 @@ set(GLOBAL_MEM_UNROLL 1 CACHE STRING "Unrolling factor that is used for all loop set(NUM_REPLICATIONS 4 CACHE STRING "Number of times the kernels will be replicated") set(DEVICE_BUFFER_SIZE 512 CACHE STRING "Buffer size in number of values that is used within the single kernel implementation.") set(INNER_LOOP_BUFFERS ON CACHE BOOL "Put the local memory buffers inside the outer loop in the kernel code") -set(USE_SVM No CACHE BOOL "Use SVM pointers instead of creating buffers on the board and transferring the data there before execution.") # Set the data type since optional vector types are used set(DATA_TYPE float) diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index 82ce87c8..a5fd6ab3 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -53,6 +53,43 @@ stream::StreamProgramSettings::getSettingsMap() { return map; } +stream::StreamData::StreamData(const cl::Context& _context, size_t size) : context(_context) { +#ifdef INTEL_FPGA +#ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(HOST_DATA_TYPE), 1024)); + B = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(HOST_DATA_TYPE), 1024)); + C = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(HOST_DATA_TYPE), 1024)); +#else + posix_memalign(reinterpret_cast(&A), 64, size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&B), 64, size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&C), 64, size * sizeof(HOST_DATA_TYPE)); +#endif +#endif +#ifdef XILINX_FPGA + posix_memalign(reinterpret_cast(&A), 4096, size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&B), 4096, size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&C), 4096, size * sizeof(HOST_DATA_TYPE)); +#endif +} + +stream::StreamData::~StreamData() { +#ifdef USE_SVM + clSVMFree(context(), reinterpret_cast(A)); + clSVMFree(context(), reinterpret_cast(B)); + clSVMFree(context(), reinterpret_cast(C)); +#else + free(A); + free(B); + free(C); +#endif +} + stream::StreamBenchmark::StreamBenchmark(int argc, char* argv[]) { setupBenchmark(argc, argv); } diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index a38cb925..ab818d02 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -103,52 +103,25 @@ class StreamData { */ HOST_DATA_TYPE *C; + /** + * @brief The context that is used to allocate memory in SVM mode + * + */ + cl::Context context; + /** * @brief Construct a new Stream Data object * - * @param context the context that will be used to allocate SVM memory + * @param _context the context that will be used to allocate SVM memory * @param size the size of the data arrays in number of values */ - StreamData(const cl::Context& context, size_t size) { - #ifdef INTEL_FPGA - #ifdef USE_SVM - A = reinterpret_cast( - clSVMAlloc(context(), 0 , - size * sizeof(HOST_DATA_TYPE), 1024)); - B = reinterpret_cast( - clSVMAlloc(context(), 0 , - size * sizeof(HOST_DATA_TYPE), 1024)); - C = reinterpret_cast( - clSVMAlloc(context(), 0 , - size * sizeof(HOST_DATA_TYPE), 1024)); - #else - posix_memalign(reinterpret_cast(&A), 64, size * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&B), 64, size * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&C), 64, size * sizeof(HOST_DATA_TYPE)); - #endif - #endif - #ifdef XILINX_FPGA - posix_memalign(reinterpret_cast(&A), 4096, size * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&B), 4096, size * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&C), 4096, size * sizeof(HOST_DATA_TYPE)); - #endif - } + StreamData(const cl::Context& _context, size_t size); /** * @brief Destroy the Stream Data object * */ - ~StreamData() { - #ifdef USE_SVM - clSVMFree(A); - clSVMFree(B); - clSVMFree(C); - #else - free(A); - free(B); - free(C); - #endif - } + ~StreamData(); }; diff --git a/STREAM/tests/CMakeLists.txt b/STREAM/tests/CMakeLists.txt index 9097411a..137dbdf9 100755 --- a/STREAM/tests/CMakeLists.txt +++ b/STREAM/tests/CMakeLists.txt @@ -15,6 +15,9 @@ if (INTELFPGAOPENCL_FOUND) target_link_libraries(${HOST_EXE_NAME}_test_intel gtest gmock ${LIB_NAME}_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") add_dependencies(${HOST_EXE_NAME}_test_intel stream_kernels_emulate_intel stream_kernels_single_emulate_intel) target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DINTEL_FPGA) + if (USE_SVM) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DCL_VERSION_2_0) + endif() target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME ${HOST_EXE_NAME}_test_intel_unit COMMAND $ -f stream_kernels_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) add_test(NAME ${HOST_EXE_NAME}_test_intel_single_unit COMMAND $ -f stream_kernels_single_emulate.aocx --single-kernel WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 24176b46..9410ce9d 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -10,6 +10,7 @@ set(DEFAULT_REPETITIONS 10 CACHE STRING "Default number of repetitions") set(DEFAULT_DEVICE -1 CACHE STRING "Index of the default device to use") set(DEFAULT_PLATFORM -1 CACHE STRING "Index of the default platform to use") set(USE_OPENMP ${USE_OPENMP} CACHE BOOL "Use OpenMP in the host code") +set(USE_SVM No CACHE BOOL "Use SVM pointers instead of creating buffers on the board and transferring the data there before execution.") # Set the used data type if (NOT DATA_TYPE) diff --git a/scripts/power_measurements/pac_s10_dc.fpgainfo.sh b/scripts/power_measurements/pac_s10_dc.fpgainfo.sh new file mode 100755 index 00000000..14b00a18 --- /dev/null +++ b/scripts/power_measurements/pac_s10_dc.fpgainfo.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +LOGFILE=powermeasure.csv + +echo "" > $LOGFILE + + +# Start the benchmark + +$@ & + +bm_pid=$! +# Start power measurements + +while $(kill -0 $bm_pid); do + echo $(fpgainfo power | grep "Amps\|Volts" | sed -r 's/.*: ([0-9]+)\.([0-9]+).*/\1.\2/g' | sed -r ':a;N;$!ba;s/\n/,/g') >> $LOGFILE + sleep 0.01 +done \ No newline at end of file diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 3d2df760..f10e73dd 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -354,22 +354,34 @@ class HpccFpgaBenchmark { std::cout << HLINE << "Start benchmark using the given configuration. Generating data..." << std::endl << HLINE; } + + auto gen_start = std::chrono::high_resolution_clock::now(); std::unique_ptr data = generateInputData(); + std::chrono::duration gen_time = std::chrono::high_resolution_clock::now() - gen_start; + if (world_rank == 0) { + std::cout << "Generation Time: " << gen_time.count() << " s" << std::endl; std::cout << HLINE << "Execute benchmark kernel..." << std::endl << HLINE; } - std::unique_ptr output = executeKernel(*data); + auto exe_start = std::chrono::high_resolution_clock::now(); + std::unique_ptr output = executeKernel(*data); + std::chrono::duration exe_time = std::chrono::high_resolution_clock::now() - exe_start; + if (world_rank == 0) { + std::cout << "Execution Time: " << exe_time.count() << " s" << std::endl; std::cout << HLINE << "Validate output..." << std::endl << HLINE; } - + + auto eval_start = std::chrono::high_resolution_clock::now(); bool validateSuccess = validateOutputAndPrintError(*data); + std::chrono::duration eval_time = std::chrono::high_resolution_clock::now() - eval_start; if (world_rank == 0) { printResults(*output); + std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl; } return validateSuccess; diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index d73bcf04..31b79010 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -210,7 +210,7 @@ choose a device. int err; int world_rank = 0; - int world_size = 0; + int world_size = 1; #ifdef _USE_MPI_ MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);