-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.cpp
130 lines (95 loc) · 4.21 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// common includes
#include <complex>
#include <fstream>
#include <iostream>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <unordered_set>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "error_check.cpp"
#include "read_kernel_source.cpp"
#include "math.cpp"
#include "bitReverse.cpp"
#include "rand.cpp"
#include "cpu_fft.cpp"
#include "cpu_fft_norecursion.cpp"
//
// GPGPU algorithms, in order of increasing (GPU kernel) complexity:
//
// 1. This example adds two vectors of 1024 ints:
//
//#include "main_add_1024_ints.cpp"
// 2. This example performs a permutation of adjacent elements
// in a vector of 8 elements:
//
//#include "main_mix_8_floats.cpp"
// 3. This example computes an fft (Cooley-Tukey radix-2, no bit-reversal of the input)
// on a vector of 8 elements:
//
//#include "main_fft_8_floats.cpp"
// 4. This example computes an fft (Cooley-Tukey radix-2, no bit-reversal of the input)
// on vectors of large sizes:
//
//#include "main_fft_many_floats.cpp" // for 8192 fft: 1500 us kernel time
// 5. This example computes an fft (Cooley-Tukey radix-2, no bit-reversal of the input)
// on vectors of large sizes, using local memory to speed up the kernel.
//
// Several variations are possible, by using different kernels (see inside the source)
//
//#include "main_fft_many_floats_local.cpp" // for 8192 fft: 1300 us kernel time
// 6. This example computes an fft (Cooley-Tukey radix-2, no bit-reversal of the input)
// on vectors of large sizes, using local memory to speed up the kernel,
// and computing twiddle factors on the fly instead of reading them from memory:
//
//#include "main_fft_many_floats_local_twiddles.cpp"
// 6.1 This example computes an fft (Cooley-Tukey radix-2, no bit-reversal of the input)
// on vectors of huge sizes, using 2 different kernels to compute the fft: one for the first levels
// and the other for the last levels (the use of sequential kernels allows for global synchronization
// across workgroups), using local memory to speed up the kernel,
// and computing twiddle factors on the fly instead of reading them from memory:
//
//#include "main_fft_huge_floats_local_twiddles.cpp"
// 6.2 This example computes an fft (Stockham radix-2)
// on vectors of huge sizes, using 2 different kernels to compute the fft: one for the first levels
// and the other for the last levels (the use of sequential kernels allows for global synchronization
// across workgroups), using local memory to speed up the kernel,
// and computing twiddle factors on the fly instead of reading them from memory:
//
#include "main_fft_huge_floats_stockham_local_twiddles.cpp"
// 7. This example computes an fft (Cooley-Tukey radix-2, no bit-reversal of the input)
// on vectors of large sizes, using local memory to speed up the kernel,
// computing twiddle factors on the fly instead of reading them from memory,
// and where a separate representation for complex numbers is used to avoid bank conflicts:
//
//#include "main_fft_many_floats_local_twiddles_separate.cpp"
// 8. This example computes an fft (Stockham radix-2)
// on a vector of 8 elements:
//
//#include "main_fft_8_floats_stockham.cpp"
// 9. This example computes an fft (Stockham radix-2)
// on vectors of large sizes.
//
//#include "main_fft_many_floats_stockham.cpp"
// 10. This example computes an fft (Stockham radix-2)
// on vectors of large sizes,
// computing twiddle factors on the fly instead of reading them from memory:
//
//#include "main_fft_many_floats_stockham_twiddles.cpp"
// 11. This example computes an fft (Stockham radix-2)
// on vectors of large sizes,
// computing twiddle factors on the fly instead of reading them from memory,
// and where a separate representation for complex numbers is used to avoid bank conflicts:
//
//#include "main_fft_many_floats_stockham_twiddles_separate.cpp"
// 12. This example computes an fft (Stockham radix-2)
// on vectors of large sizes,
// computing twiddle factors on the fly instead of reading them from memory,
// and using images instead of global memory for global input and output:
//
//#include "main_fft_many_floats_stockham_twiddles_images.cpp"