-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpu_nms.cu
283 lines (242 loc) · 11.1 KB
/
gpu_nms.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#include"gpu_kernels.hpp"
#include"indices.hpp"
#define cond (bx == 14 && by == 14 && tx == 0 && ty == 0)
#define SQRT2 (1.4142135623731)
//----------------------------------------------------------------------------
// Non-maximum suppression (NMS) with subpixel edge localization.
//
// Each block processes a SUBIMGX x SUBIMGY tile of the interpolated image.
// The tile plus a PADDING-wide halo is staged into dynamic shared memory as
// three planes (Ix, Iy, gradient magnitude).  Each thread then checks whether
// its pixel is a local maximum of the gradient magnitude along the gradient
// direction (sampled SN pixels away on either side, linearly interpolated
// between the two nearest neighbors); if it is, a parabola is fitted through
// the three directional samples and the subpixel edge position is stored.
//
// Launch contract (see gpu_nms_template):
//   blockDim  = (THX*THY, 1, 1)       -- flat index, decomposed in-kernel
//   gridDim   = covers the interior of the interp_img_height x
//               interp_img_width image (PADDING band excluded)
//   dyn. smem = 3 * (SUBIMGX+2*PADDING) * (SUBIMGY+2*PADDING) * sizeof(T)
//
// NOTE(review): pixels rejected by the magnitude/direction tests are never
// written, so the output maps are presumably zero-initialized by the caller
// -- TODO confirm.
//
// FIX(review): the original version ended with a __syncthreads() that early-
// returning threads skipped -- a barrier not reached by all threads of the
// block is undefined behavior.  The barrier guarded nothing (no shared-memory
// access follows it), so it was removed and the early returns were replaced
// by the `active` predicate below; per-thread observable behavior (which
// threads write what) is unchanged.
//----------------------------------------------------------------------------
template<typename T, int SUBIMGX, int SUBIMGY, int THX, int THY, int PADDING, int SN>
__global__
void
gpu_nms_kernel(
    int interp_img_height, int interp_img_width,
    T* dev_Ix, T* dev_Iy, T* dev_I_grad_mag,
    T* dev_subpix_pos_x_map,
    T* dev_subpix_pos_y_map )
{
// 2-D accessors: s* index the padded shared tiles, t* index the interior
// view that is offset by PADDING in both dimensions.
#define sIx(i,j)       sIx[(i) * PIMGX + (j)]
#define sIy(i,j)       sIy[(i) * PIMGX + (j)]
#define sgrad_mag(i,j) sgrad_mag[(i) * PIMGX + (j)]
#define tIx(i,j)       tIx[(i) * PIMGX + (j)]
#define tIy(i,j)       tIy[(i) * PIMGX + (j)]
#define tgrad_mag(i,j) tgrad_mag[(i) * PIMGX + (j)]
    extern __shared__ double sdata[];
    // decompose the flat thread index into a THX x THY layout
    const int tid = threadIdx.x;
    const int tx  = tid / THX;
    const int ty  = tid % THX;
    const int bx  = blockIdx.x;
    const int by  = blockIdx.y;
    const int PIMGX = SUBIMGX + (2 * PADDING); // padded tile x-size
    const int PIMGY = SUBIMGY + (2 * PADDING); // padded tile y-size
    // global pixel coordinates owned by this thread (interior region)
    const int base_gtx = bx * SUBIMGX + tx + PADDING;
    const int base_gty = by * SUBIMGY + ty + PADDING;
    // carve the dynamic shared memory into three tile planes
    T* sIx       = (T*)sdata;
    T* sIy       = sIx + PIMGX * PIMGY;
    T* sgrad_mag = sIy + PIMGX * PIMGY;
    // interior views: t*(0,0) aliases s*(PADDING,PADDING)
    T* tIx       = &sIx(PADDING, PADDING);
    T* tIy       = &sIy(PADDING, PADDING);
    T* tgrad_mag = &sgrad_mag(PADDING, PADDING);
    // ---- cooperative load of the padded tile into shared memory ----
    // The THX x THY thread tile sweeps the PIMGX x PIMGY padded region in
    // steps of THX / THY; pixels past the image bounds are zero-filled.
    int ii = 0, jj = 0;
    #pragma unroll
    for(jj = 0; jj < PIMGY-THY; jj+=THY) {
        int ty_ = base_gty + jj - PADDING;
        #pragma unroll
        for(ii = 0; ii < PIMGX-THX; ii+=THX) {
            int tx_ = base_gtx + ii - PADDING;
            sIx(ii+tx,jj+ty)       = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_Ix(tx_, ty_)         : 0.;
            sIy(ii+tx,jj+ty)       = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_Iy(tx_, ty_)         : 0.;
            sgrad_mag(ii+tx,jj+ty) = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_I_grad_mag(tx_, ty_) : 0.;
        }
        // partial tail of the sweep in x
        if(tx < PIMGX-ii) {
            int tx_ = base_gtx + ii - PADDING;
            sIx(ii+tx,jj+ty)       = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_Ix(tx_, ty_)         : 0.;
            sIy(ii+tx,jj+ty)       = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_Iy(tx_, ty_)         : 0.;
            sgrad_mag(ii+tx,jj+ty) = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_I_grad_mag(tx_, ty_) : 0.;
        }
    }
    // partial tail of the sweep in y (last column block)
    if(ty < PIMGY-jj) {
        int ty_ = base_gty + jj - PADDING;
        #pragma unroll
        for(ii = 0; ii < PIMGX-THX; ii+=THX) {
            int tx_ = base_gtx + ii - PADDING;
            sIx(ii+tx,jj+ty)       = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_Ix(tx_, ty_)         : 0.;
            sIy(ii+tx,jj+ty)       = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_Iy(tx_, ty_)         : 0.;
            sgrad_mag(ii+tx,jj+ty) = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_I_grad_mag(tx_, ty_) : 0.;
        }
        // bottom-right corner tail
        if(tx < PIMGX-ii) {
            int tx_ = base_gtx + ii - PADDING;
            sIx(ii+tx,jj+ty)       = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_Ix(tx_, ty_)         : 0.;
            sIy(ii+tx,jj+ty)       = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_Iy(tx_, ty_)         : 0.;
            sgrad_mag(ii+tx,jj+ty) = (tx_ < interp_img_height && ty_ < interp_img_width) ? dev_I_grad_mag(tx_, ty_) : 0.;
        }
    }
    // all threads reach this barrier (no returns above) -- tile is ready
    __syncthreads();
    const int i = tx, j = ty;
    T subpix_pos_x_map = 0, subpix_pos_y_map = 0;
    // A pixel participates in the NMS test only if it has a non-negligible
    // gradient magnitude and a usable (non-zero) gradient direction.
    const bool active =
        (tgrad_mag(i, j) > 2) &&                                  // -- ignore neglectable gradient magnitude --
        !((fabs(tIx(i, j)) < 1e-6) && (fabs(tIy(i, j)) < 1e-6));  // -- ignore invalid gradient direction --
    if (active) {
        T norm_dir_x, norm_dir_y;
        T slope, fp, fm;
        T coeff_A, coeff_B, coeff_C, s, s_star;
        // -- calculate the unit direction --
        norm_dir_x = tIx(i,j) / tgrad_mag(i,j);
        norm_dir_y = tIy(i,j) / tgrad_mag(i,j);
        // -- find the octant of the gradient direction; fp/fm are the
        //    gradient magnitudes SN pixels ahead of / behind the pixel along
        //    that direction, linearly interpolated between the two nearest
        //    neighbors (the code calls the octants "quadrants" 1..8) --
        if ((tIx(i,j) >= 0) && (tIy(i,j) >= 0)) {
            if (tIx(i,j) >= tIy(i,j)) { // -- 1st octant --
                slope = norm_dir_y / norm_dir_x;
                fp = tgrad_mag(i, j+SN) * (1-slope) + tgrad_mag(i+SN, j+SN) * slope;
                fm = tgrad_mag(i, j-SN) * (1-slope) + tgrad_mag(i-SN, j-SN) * slope;
            }
            else { // -- 2nd octant --
                slope = norm_dir_x / norm_dir_y;
                fp = tgrad_mag(i+SN, j) * (1-slope) + tgrad_mag(i+SN, j+SN) * slope;
                fm = tgrad_mag(i-SN, j) * (1-slope) + tgrad_mag(i-SN, j-SN) * slope;
            }
        }
        else if ((tIx(i,j) < 0) && (tIy(i,j) >= 0)) {
            if (fabs(tIx(i,j)) < tIy(i,j)) { // -- 3rd octant --
                slope = -norm_dir_x / norm_dir_y;
                fp = tgrad_mag(i+SN, j) * (1-slope) + tgrad_mag(i+SN, j-SN) * slope;
                fm = tgrad_mag(i-SN, j) * (1-slope) + tgrad_mag(i-SN, j+SN) * slope;
            }
            else { // -- 4th octant --
                slope = -norm_dir_y / norm_dir_x;
                fp = tgrad_mag(i, j-SN) * (1-slope) + tgrad_mag(i+SN, j-SN) * slope;
                fm = tgrad_mag(i, j+SN) * (1-slope) + tgrad_mag(i-SN, j+SN) * slope;
            }
        }
        else if ((tIx(i,j) < 0) && (tIy(i,j) < 0)) {
            if(fabs(tIx(i,j)) >= fabs(tIy(i,j))) { // -- 5th octant --
                slope = norm_dir_y / norm_dir_x;
                fp = tgrad_mag(i, j-SN) * (1-slope) + tgrad_mag(i-SN, j-SN) * slope;
                fm = tgrad_mag(i, j+SN) * (1-slope) + tgrad_mag(i+SN, j+SN) * slope;
            }
            else { // -- 6th octant --
                slope = norm_dir_x / norm_dir_y;
                fp = tgrad_mag(i-SN, j) * (1-slope) + tgrad_mag(i-SN, j-SN) * slope;
                fm = tgrad_mag(i+SN, j) * (1-slope) + tgrad_mag(i+SN, j+SN) * slope;
            }
        }
        else { // (tIx(i,j) >= 0) && (tIy(i,j) < 0)
            if(tIx(i,j) < fabs(tIy(i,j))) { // -- 7th octant --
                slope = -norm_dir_x / norm_dir_y;
                fp = tgrad_mag(i-SN, j) * (1-slope) + tgrad_mag(i-SN, j+SN) * slope;
                fm = tgrad_mag(i+SN, j) * (1-slope) + tgrad_mag(i+SN, j-SN) * slope;
            }
            else { // -- 8th octant --
                slope = -norm_dir_y / norm_dir_x;
                fp = tgrad_mag(i, j+SN) * (1-slope) + tgrad_mag(i-SN, j+SN) * slope;
                fm = tgrad_mag(i, j-SN) * (1-slope) + tgrad_mag(i+SN, j-SN) * slope;
            }
        }
        // -- fit a parabola to find the edge subpixel location when doing max test --
        // s = distance from the pixel to each interpolated sample, measured
        // along the gradient direction
        s = sqrt(1+slope*slope);
        if((tgrad_mag(i, j) > fm && tgrad_mag(i, j) > fp) ||  // -- abs max --
           (tgrad_mag(i, j) > fm && tgrad_mag(i, j) >= fp) || // -- relaxed max --
           (tgrad_mag(i, j) >= fm && tgrad_mag(i, j) > fp))
        {
            // -- parabola f(t) = A*t^2 + B*t + C through (-s,fm),(0,f),(+s,fp) --
            coeff_A = (fm+fp-2*tgrad_mag(i, j))/(2*s*s);
            coeff_B = (fp-fm)/(2*s);
            coeff_C = tgrad_mag(i, j);
            s_star = -coeff_B/(2*coeff_A); // -- location of max --
            // (the max value coeff_A*s_star^2 + coeff_B*s_star + coeff_C was
            //  computed by the original for a gradient-magnitude map that is
            //  currently disabled; dropped here as dead code)
            if(fabs(s_star) <= SQRT2) { // -- significant max is within a pixel --
                // -- store subpixel location in a map (x follows the column /
                //    gty axis, y the row / gtx axis) --
                subpix_pos_x_map = base_gty/*j*/ + s_star * norm_dir_x;
                subpix_pos_y_map = base_gtx/*i*/ + s_star * norm_dir_y;
            }
        }
    }
    // Write results for interior pixels that passed the magnitude/direction
    // tests (matching the original behavior: rejected pixels write nothing).
    if(active && base_gtx < interp_img_height-PADDING && base_gty < interp_img_width-PADDING) {
        dev_subpix_pos_x_map(base_gtx, base_gty) = subpix_pos_x_map;
        dev_subpix_pos_y_map(base_gtx, base_gty) = subpix_pos_y_map;
    }
}
//extern "C"
//----------------------------------------------------------------------------
// Host-side launcher for gpu_nms_kernel.
//
// Computes the launch geometry and the dynamic shared-memory requirement,
// validates both against the device limits, opts in to large dynamic shared
// memory where supported (CUDA >= 9), and launches the kernel on the default
// stream.  On an invalid configuration it prints an error and returns
// WITHOUT launching (the original printed the error but launched anyway).
//
// Parameters:
//   device_id            - CUDA device to query attributes from
//   interp_h, interp_w   - dimensions of the interpolated input image
//   dev_*                - device pointers to inputs (Ix, Iy, |grad|) and
//                          subpixel-position output maps
//----------------------------------------------------------------------------
template<typename T>
void gpu_nms_template(
    int device_id,
    int interp_h, int interp_w,
    T* dev_Ix, T* dev_Iy, T* dev_I_grad_mag,
    T* dev_subpix_pos_x_map,
    T* dev_subpix_pos_y_map )
{
    // kernel parameters
    // TODO: tune
    const int padding = 10; // halo width staged around each tile
    const int subimgx = 8;  // tile x-size processed per block
    const int subimgy = 8;  // tile y-size processed per block
    const int thx     = 8;  // thread-tile x-size
    const int thy     = 8;  // thread-tile y-size
    const int sn      = 1;  // directional sampling step, in pixels
    // end of kernel parameters
    // the kernel assumes one thread per tile pixel and a square thread tile
    assert(subimgx == thx && subimgy == thy && thx == thy);
    // the grid covers only the interior (unpadded) region of the image
    int hh = interp_h - (2*padding);
    int ww = interp_w - (2*padding);
    const int gridx = (hh + subimgx - 1) / subimgx; // ceil-div
    const int gridy = (ww + subimgy - 1) / subimgy; // ceil-div
    dim3 grid(gridx, gridy, 1);
    dim3 threads(thx*thy, 1, 1);
    // dynamic shared memory: three padded tile planes (Ix, Iy, |grad|)
    int shmem = 0;
    shmem += sizeof(T) * (subimgx + padding + padding) * (subimgy + padding + padding); // sIx
    shmem += sizeof(T) * (subimgx + padding + padding) * (subimgy + padding + padding); // sIy
    shmem += sizeof(T) * (subimgx + padding + padding) * (subimgy + padding + padding); // sgrad_mag
    // query the device limits for threads/block and dynamic shared memory
    int nthreads_max = 0, shmem_max = 0;
    cudacheck( cudaDeviceGetAttribute(&nthreads_max, cudaDevAttrMaxThreadsPerBlock, device_id) );
#if CUDA_VERSION >= 9000
    cudacheck( cudaDeviceGetAttribute(&shmem_max, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id) );
    if (shmem <= shmem_max) {
        // opt in to >48KB dynamic shared memory where the device supports it
        cudacheck( cudaFuncSetAttribute(gpu_nms_kernel<T, subimgx, subimgy, thx, thy, padding, sn>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem) );
    }
#else
    cudacheck( cudaDeviceGetAttribute(&shmem_max, cudaDevAttrMaxSharedMemoryPerBlock, device_id) );
#endif // CUDA_VERSION >= 9000
    // FIX(review): check both limits and abort the launch on failure; the
    // thread-count check was previously queried but never applied, and the
    // kernel was launched even when the shared-memory check failed.
    if ( shmem > shmem_max || thx*thy > nthreads_max ) {
        printf("error: kernel %s requires too many threads or too much shared memory\n", __func__);
        return;
    }
    void *kernel_args[] = { &interp_h, &interp_w,
                            &dev_Ix, &dev_Iy, &dev_I_grad_mag,
                            &dev_subpix_pos_x_map, &dev_subpix_pos_y_map };
    cudacheck( cudaLaunchKernel((void*)gpu_nms_kernel<T, subimgx, subimgy, thx, thy, padding, sn>, grid, threads, kernel_args, shmem, NULL) );
}
// single precision NMS
// Thin dispatch wrapper: forwards to the templated implementation, with T
// deduced as float from the pointer arguments.
void gpu_nms(
    int device_id,
    int interp_h, int interp_w,
    float* dev_Ix, float* dev_Iy, float* dev_I_grad_mag,
    float* dev_subpix_pos_x_map,
    float* dev_subpix_pos_y_map )
{
    gpu_nms_template(device_id, interp_h, interp_w,
                     dev_Ix, dev_Iy, dev_I_grad_mag,
                     dev_subpix_pos_x_map, dev_subpix_pos_y_map);
}
// double precision NMS
// Thin dispatch wrapper: forwards to the templated implementation, with T
// deduced as double from the pointer arguments.
void gpu_nms(
    int device_id,
    int interp_h, int interp_w,
    double* dev_Ix, double* dev_Iy, double* dev_I_grad_mag,
    double* dev_subpix_pos_x_map,
    double* dev_subpix_pos_y_map )
{
    gpu_nms_template(device_id, interp_h, interp_w,
                     dev_Ix, dev_Iy, dev_I_grad_mag,
                     dev_subpix_pos_x_map, dev_subpix_pos_y_map);
}