// main_fft_many_floats.cpp
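// NOTE: the #include directives of the original file are not visible in this extract.
// The following is a minimal sketch of what this translation unit appears to need;
// the project header name is hypothetical, standing for whatever header declares
// CHECK_CL_ERROR, read_kernel, rand_float and the imajuscule FFT helpers used below.
#include <iostream>
#include <vector>
#include <complex>
#include <string>
#include <chrono>
#include <cstdlib>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
//#include "gpgpu_helpers.h" // hypothetical project header, see note above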
constexpr auto kernel_file = "vector_fft_floats_multi.cl";
void withInput(cl_context context,
cl_command_queue command_queue,
cl_kernel kernel,
int nButterfliesPerThread,
std::vector<float> const & input,
bool verifyResults
)
{
using namespace imajuscule;
using namespace imajuscule::fft;
verify(is_power_of_two(input.size()) && input.size() >= 2);
// Our GPU kernel doesn't do bit-reversal of the input, so this should be done on the host.
// In the scope below, we verify that when the input is bit-reversed prior to being fed to
// the non-recursive CPU FFT, we get the expected result:
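// (Example, assuming bitReversePermutation is the standard radix-2 reordering:
// for input.size() == 8 the 3-bit indices are bit-reversed, so elements
// 1 (001b) and 4 (100b) swap, 3 (011b) and 6 (110b) swap, and 0, 2, 5, 7 stay in place.)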
if(verifyResults)
{
std::cout << "- make ref 1" << std::endl;
auto refForwardFft = makeRefForwardFft(input); // this implementation is thoroughly unit-tested in another project
std::cout << "- make ref 2" << std::endl;
auto cpuForwardFft = cpu_fft_norecursion(bitReversePermutation(input));
std::cout << "- verify consistency" << std::endl;
verifyVectorsAreEqual(refForwardFft, cpuForwardFft);
std::cout << "- ok" << std::endl;
}
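// Precompute the twiddle factors: twiddle[k] is the k-th N-th root of unity,
// i.e. exp(+/-2*pi*i*k/N), with the sign convention defined by compute_roots_of_unity.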
auto twiddle = imajuscule::compute_roots_of_unity<float>(input.size());
std::vector<std::complex<float>> output;
output.resize(input.size());
cl_int ret;
// Create memory buffers on the device for each vector
cl_mem input_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
input.size() * sizeof(decltype(input[0])), NULL, &ret);
CHECK_CL_ERROR(ret);
cl_mem twiddle_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
twiddle.size() * sizeof(decltype(twiddle[0])), NULL, &ret);
CHECK_CL_ERROR(ret);
cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
output.size() * sizeof(decltype(output[0])), NULL, &ret);
CHECK_CL_ERROR(ret);
// Copy the input and twiddles to their respective memory buffers.
// This can crash if the GPU does not have enough memory.
ret = clEnqueueWriteBuffer(command_queue, input_mem_obj, CL_TRUE, 0,
input.size() * sizeof(decltype(input[0])), &input[0], 0, NULL, NULL);
CHECK_CL_ERROR(ret);
ret = clEnqueueWriteBuffer(command_queue, twiddle_mem_obj, CL_TRUE, 0,
twiddle.size()*sizeof(decltype(twiddle[0])), &twiddle[0], 0, NULL, NULL);
CHECK_CL_ERROR(ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input_mem_obj);
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&twiddle_mem_obj);
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);
CHECK_CL_ERROR(ret);
// Execute the OpenCL kernel
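// Sizing: a radix-2 FFT over input.size() points has input.size()/2 butterflies per stage,
// and each work item processes nButterfliesPerThread of them, hence the global size below.
// local == global means the whole FFT runs in a single work group, presumably so that the
// kernel can synchronize its stages with work-group barriers (the kernel source is not shown here).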
size_t global_item_size = input.size()/(2*nButterfliesPerThread);
size_t local_item_size = global_item_size;
cl_event event;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size,
&local_item_size,
0, NULL, &event);
CHECK_CL_ERROR(ret);
{
auto begin = std::chrono::system_clock::now();
ret = clWaitForEvents(1, &event);
CHECK_CL_ERROR(ret);
auto afterEvent = std::chrono::system_clock::now();
cl_ulong time_start, time_end;
ret = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
CHECK_CL_ERROR(ret);
ret = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
CHECK_CL_ERROR(ret);
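// Event profiling timestamps are in nanoseconds, hence the division by 1000 below.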
double elapsed = time_end - time_start;
std::cout << "kernel duration (us) " << (int)elapsed/1000 << std::endl;
std::cout << "cpu time [enqueue .. kernel event] (us) " << std::chrono::duration_cast<std::chrono::microseconds> (afterEvent - begin).count() <<std::endl;
// Read the memory buffer output_mem_obj on the device to the local variable output
ret = clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
output.size() * sizeof(decltype(output[0])), &output[0], 0, NULL, NULL);
auto end = std::chrono::system_clock::now();
std::cout << "cpu time [enqueue .. results transferred] (us) " << std::chrono::duration_cast<std::chrono::microseconds> (end - begin).count() << std::endl;
}
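// (this checks the return code of the clEnqueueReadBuffer call above)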
CHECK_CL_ERROR(ret);
if(verifyResults) {
std::cout << "verifying results... " << std::endl;
// Check that the output produced by the GPU matches the output produced by the CPU:
verifyVectorsAreEqual(output,
cpu_fft_norecursion(input),
// getFFTEpsilon assumes that the floating-point errors "add up"
// at every butterfly operation, but as explained here:
// https://floating-point-gui.de/errors/propagation/
// this is true for multiplications, not for the additions
// used in butterfly operations.
// Hence the following line is replaced with 0.001f:
//20.f*getFFTEpsilon<float>(input.size()),
0.001f
);
}
// Cleanup
ret = clReleaseMemObject(input_mem_obj);
CHECK_CL_ERROR(ret);
ret = clReleaseMemObject(twiddle_mem_obj);
CHECK_CL_ERROR(ret);
ret = clReleaseMemObject(output_mem_obj);
CHECK_CL_ERROR(ret);
}
std::string ReplaceString(std::string subject, const std::string& search,
const std::string& replace) {
size_t pos = 0;
while ((pos = subject.find(search, pos)) != std::string::npos) {
subject.replace(pos, search.length(), replace);
pos += replace.length();
}
return subject;
}
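// Example: ReplaceString("N = replace_N_LOCAL_BUTTERFLIES;", "replace_N_LOCAL_BUTTERFLIES", "4")
// returns "N = 4;". It is used below to inject the butterfly count into the kernel source
// before it is compiled.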
struct ScopedKernel {
cl_program program;
cl_kernel kernel;
int nButterfliesPerThread;
ScopedKernel(cl_context context, cl_device_id device_id, std::string const & kernel_src, size_t const input_size) {
int const nButterflies = input_size/2;
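// Start with one butterfly per work item (maximum parallelism) and increase
// nButterfliesPerThread until the resulting work group, i.e. nButterflies /
// nButterfliesPerThread work items, fits within this kernel's maximum work-group size.
// Since input sizes are powers of two, the division is assumed to be exact here.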
for(nButterfliesPerThread = 1;;) {
std::string const replaced_str = ReplaceString(kernel_src,
"replace_N_LOCAL_BUTTERFLIES",
std::to_string(nButterfliesPerThread));
size_t const replaced_source_size = replaced_str.size();
const char * rep_src = replaced_str.data();
cl_int ret;
// Create a program from the kernel source
program = clCreateProgramWithSource(context, 1,
(const char **)&rep_src, (const size_t *)&replaced_source_size, &ret);
CHECK_CL_ERROR(ret);
// Build the program
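// (The -I include path below is machine-specific and must be adapted to your checkout;
// the remaining flags trade floating-point strictness for speed.)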
ret = clBuildProgram(program, 1, &device_id,
"-I /Users/Olivier/Dev/gpgpu/ -cl-denorms-are-zero -cl-strict-aliasing -cl-fast-relaxed-math",
NULL, NULL);
CHECK_CL_ERROR(ret);
// Create the OpenCL kernel
kernel = clCreateKernel(program, "kernel_func", &ret);
CHECK_CL_ERROR(ret);
size_t workgroup_max_sz;
ret = clGetKernelWorkGroupInfo(kernel,
device_id,
CL_KERNEL_WORK_GROUP_SIZE,
sizeof(workgroup_max_sz), &workgroup_max_sz, NULL);
CHECK_CL_ERROR(ret);
std::cout << "workgroup max size: " << workgroup_max_sz << " for " << nButterfliesPerThread << " butterfly per thread." << std::endl;
if(nButterflies > nButterfliesPerThread * workgroup_max_sz) {
release();
// To estimate the next value of 'nButterfliesPerThread',
// we make the reasonable assumption that the work-group max size
// won't get bigger if we increase 'nButterfliesPerThread':
nButterfliesPerThread = nButterflies / workgroup_max_sz;
continue;
}
break;
}
}
~ScopedKernel() {
release();
}
private:
void release() {
cl_int ret = clReleaseKernel(kernel);
CHECK_CL_ERROR(ret);
ret = clReleaseProgram(program);
CHECK_CL_ERROR(ret);
kernel = 0;
program = 0;
}
ScopedKernel(const ScopedKernel&) = delete;
ScopedKernel& operator=(const ScopedKernel&) = delete;
ScopedKernel(ScopedKernel&&) = delete;
ScopedKernel& operator=(ScopedKernel&&) = delete;
};
int main(void) {
srand(0); // we use rand() as the random number generator; seed it with a fixed value so that results are reproducible.
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
CHECK_CL_ERROR(ret);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
CHECK_CL_ERROR(ret);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
CHECK_CL_ERROR(ret);
// Create a command queue
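// (profiling is enabled so that clGetEventProfilingInfo can be used in withInput)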
cl_command_queue command_queue = clCreateCommandQueue(context, device_id,
CL_QUEUE_PROFILING_ENABLE, &ret);
CHECK_CL_ERROR(ret);
// read the kernel code
auto kernel_src = read_kernel(kernel_file);
// Note that if the GPU does not have enough memory available, the program will crash.
// On my system, the limit is reached at size 134217728.
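// (134217728 == 2^27 floats, i.e. 512 MiB for the input buffer alone, plus the complex<float>
// twiddle and output buffers. With the loop bound below, the largest size actually tested
// is 2^23 == 8388608.)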
for(int sz=2; sz < 10000000; sz *= 2) {
std::cout << std::endl << "* input size: " << sz << std::endl;
// Create the input vector
std::vector<float> input;
input.reserve(sz);
for(int i=0; i<sz; ++i) {
input.push_back(rand_float(0.f,1.f));
}
const ScopedKernel sc(context, device_id, kernel_src, input.size());
withInput(context,
command_queue,
sc.kernel,
sc.nButterfliesPerThread,
input,
false // set this to true to verify results
);
}
// Clean up
ret = clFlush(command_queue);
CHECK_CL_ERROR(ret);
ret = clFinish(command_queue);
CHECK_CL_ERROR(ret);
ret = clReleaseCommandQueue(command_queue);
CHECK_CL_ERROR(ret);
ret = clReleaseContext(context);
CHECK_CL_ERROR(ret);
return 0;
}