-
Notifications
You must be signed in to change notification settings - Fork 1
/
reduce_utils.cu
46 lines (29 loc) · 963 Bytes
/
reduce_utils.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#include <cuda_runtime.h>

#include "globals.h"
// Sum-reduction kernel: each block accumulates its threads' elements of
// `dat` into one shared-memory partial sum, then thread 0 of the block
// adds that partial into the global accumulator *tot_sum.
//
// Launch contract:
//   - tot_sum : device pointer, must be zeroed by the caller before launch
//               (the kernel only accumulates; it cannot race-free reset it).
//   - dat     : device array of at least `max` floats.
//   - max     : number of valid elements; threads with ind >= max
//               contribute nothing but still reach every barrier.
//   - dynamic shared memory: sizeof(float) per block.
__global__ void d_reduce_float(float* tot_sum, const float* dat,
                               const int max) {
    extern __shared__ float block_sum[];  // block_sum[0] holds this block's partial
    const int tid = threadIdx.x;
    if (tid == 0)
        block_sum[0] = 0.f;
    __syncthreads();  // partial is zeroed before anyone accumulates

    // Out-of-range threads skip the add but must NOT return early:
    // __syncthreads() has to be reached by every thread in the block.
    const int ind = blockIdx.x * blockDim.x + tid;
    if (ind < max)
        atomicAdd(&block_sum[0], dat[ind]);
    __syncthreads();  // partial is complete before thread 0 publishes it

    if (tid == 0)
        atomicAdd(tot_sum, block_sum[0]);
}

// Sums the first `m` floats of the device array `d_dat` and returns the
// total on the host.
//   d_dat   : device pointer to the data to reduce.
//   threads : requested block size (clamped to m when m is smaller).
//   m       : number of elements; returns 0.f when m <= 0.
// Note: the blocking cudaMemcpy below synchronizes with the kernel, so the
// result is complete before it is read.
float reduce_device_float(float* d_dat, const int threads, const int m)
{
    if (m <= 0)
        return 0.f;

    const int loc_threads = (m < threads) ? m : threads;
    // Ceil-divide so the tail elements (m not a multiple of the block
    // size) are covered; the kernel's bounds check handles the overhang.
    const int loc_blocks = (m + loc_threads - 1) / loc_threads;

    float* d_sum = nullptr;
    float h_sum = 0.f;
    cudaMalloc(&d_sum, sizeof(float));
    cudaMemset(d_sum, 0, sizeof(float));  // kernel accumulates into *d_sum

    // One float of dynamic shared memory per block (the block partial).
    d_reduce_float<<<loc_blocks, loc_threads, sizeof(float)>>>(d_sum, d_dat, m);

    cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_sum);  // free the device allocation, not &d_sum (host address)
    return h_sum;
}