// mathFunctions.h
// Source repository is a fork of cathy-kim/Pelee-TensorRT.
#ifndef __MATH_FUNCTINS_H__
#define __MATH_FUNCTINS_H__
#include <stdint.h>
#include <cmath> // for std::fabs and std::signbit
#include <cblas.h>
#include <cudnn.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <driver_types.h> // cuda driver types
#include <algorithm>
#include <glog/logging.h>
// Compile-time constants. Neither one is referenced anywhere else in this header.
// NOTE(review): presumably the number of axes in a permute-layer order array — confirm against its users.
#define PERMUTELAYER_ORDERNUM 4
// NOTE(review): presumably a threads-per-block value used by kernels defined elsewhere.
// It differs from TENSORRT_CUDA_NUM_THREADS (256) declared later in this header —
// confirm which of the two each kernel launch is meant to use.
#define BLOCK 512
//
// CUDA macros
//
// CUDA: various checks for different function calls.
//
// CUDA_CHECK(condition): evaluates `condition` once, stores the result in a
// local cudaError_t named `error`, and hands it to glog's CHECK_EQ against
// cudaSuccess. On mismatch the text from cudaGetErrorString(error) is streamed
// into the CHECK failure message.
// The do { ... } while (0) wrapper makes the expansion a single statement and
// keeps `error` scoped to it, so the macro can be used repeatedly in one scope.
// NOTE(review): `condition` is expanded unparenthesized and the local is named
// `error`; an argument expression that itself mentions a variable called
// `error` would bind to the macro's local instead of the caller's.
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
} while (0)
// CUBLAS_CHECK(condition): cuBLAS counterpart of CUDA_CHECK. Evaluates
// `condition` once into a local cublasStatus_t named `status` and passes it to
// glog's CHECK_EQ against CUBLAS_STATUS_SUCCESS, streaming
// cublasGetErrorString(status) (declared further down in this header) into the
// failure message. Wrapped in do { ... } while (0) so it acts as one statement.
#define CUBLAS_CHECK(condition) \
do { \
cublasStatus_t status = condition; \
CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
<< cublasGetErrorString(status); \
} while (0)
// CURAND_CHECK(condition): cuRAND counterpart of CUDA_CHECK. Evaluates
// `condition` once into a local curandStatus_t named `status` and passes it to
// glog's CHECK_EQ against CURAND_STATUS_SUCCESS, streaming
// curandGetErrorString(status) (declared further down in this header) into the
// failure message. Wrapped in do { ... } while (0) so it acts as one statement.
#define CURAND_CHECK(condition) \
do { \
curandStatus_t status = condition; \
CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
<< curandGetErrorString(status); \
} while (0)
// CUDA: grid stride looping
//
// CUDA_KERNEL_LOOP(i, n) expands to a `for` header only; the statement or
// block that follows the macro becomes the loop body. It declares an int `i`
// that starts at this thread's global x index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
// threads along x (blockDim.x * gridDim.x) for as long as i < (n).
// Only the x dimension of the launch configuration is consulted.
// NOTE(review): `i` is an int, so the index arithmetic assumes that n plus one
// stride still fits in int — confirm for very large element counts.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// CUDA: check for error after kernel execution and exit loudly if there is one.
// Expands to CUDA_CHECK(cudaPeekAtLastError()); cudaPeekAtLastError() reads the
// last recorded error without clearing it. The macro carries no trailing
// semicolon, so the call site supplies it.
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())
// CUDA: library error reporting.
// Declarations only; the definitions are not in the visible part of this header.
// Each takes a library status code and returns a C string; CUBLAS_CHECK and
// CURAND_CHECK stream that string into their failure messages.
// NOTE(review): presumably the returned pointer refers to a static string that
// the caller must not free — confirm against the definitions.
const char* cublasGetErrorString(cublasStatus_t error);
const char* curandGetErrorString(curandStatus_t error);
// CUDA: default number of threads per block used to size kernel launches.
// (The previous comment said 512; the value actually in effect is 256.)
const int TENSORRT_CUDA_NUM_THREADS = 256;

// CUDA: number of blocks needed to cover N work items with
// TENSORRT_CUDA_NUM_THREADS threads per block, i.e. ceil(N / threads).
//
// @param N  number of work items
// @return   0 when N <= 0; otherwise the smallest block count b such that
//           b * TENSORRT_CUDA_NUM_THREADS >= N.
//
// A result of 0 means there is nothing to launch; a zero-sized grid is not a
// valid launch configuration, so callers should skip the launch in that case.
inline int TENSORRT_GET_BLOCKS(const int N) {
  // Non-positive counts need no blocks (and must never yield a negative grid).
  if (N <= 0) {
    return 0;
  }
  // (N - 1) / threads + 1 is the same ceiling division as
  // (N + threads - 1) / threads, but the intermediate value never exceeds N,
  // so it cannot overflow int when N is close to INT_MAX.
  return (N - 1) / TENSORRT_CUDA_NUM_THREADS + 1;
}
/**
 * Constant fill: X[i] = alpha, i.e. initialise X with the constant alpha.
 *
 * Declaration only; the definition is not in the visible part of this header.
 *
 * @tparam Dtype  element type of the buffer
 * @param N       number of elements — presumably i covers [0, N); confirm
 *                against the definition
 * @param alpha   value stored into each element
 * @param X       destination buffer
 *                (NOTE(review): by the function name a GPU/device pointer — confirm)
 */
template <typename Dtype>
void tensorrt_gpu_set(const int N,
                      const Dtype alpha,
                      Dtype *X);
/**
 * Element-wise power: y[index] = pow(a[index], alpha).
 *
 * Declaration only; the definition is not in the visible part of this header.
 *
 * @tparam Dtype  element type of a and y, and type of the exponent
 * @param n       size of a (the original note reads "the dims of matrix a");
 *                presumably the number of elements processed — confirm
 * @param a       input matrix
 * @param alpha   exponent applied to every element of a
 * @param y       output vector receiving the results
 */
template <typename Dtype>
void tensorrt_gpu_powx(const int n,
                       const Dtype* a,
                       const Dtype alpha,
                       Dtype* y);
/**
 * Matrix-vector product with accumulation: y = alpha * A * x + beta * y.
 *
 * Declaration only; the definition is not in the visible part of this header.
 *
 * @tparam Dtype  scalar type of alpha and beta and element type of A, x and y
 * @param handle  cuBLAS handle
 * @param TransA  transpose flag for A
 * @param M       number of rows of A
 * @param N       number of columns of A
 * @param alpha   coefficient applied to A * x
 * @param A       matrix of size M x N
 * @param x       vector x
 * @param beta    coefficient applied to the incoming y
 * @param y       vector y; scaled by beta on input and overwritten with the result
 *
 * NOTE(review): the storage order of A (row- vs column-major) and whether the
 * pointers must be device pointers are not stated here — confirm against the
 * definition.
 */
template <typename Dtype>
void tensorrt_gpu_gemv(cublasHandle_t handle,
                       const CBLAS_TRANSPOSE TransA,
                       const int M,
                       const int N,
                       const Dtype alpha,
                       const Dtype* A,
                       const Dtype* x,
                       const Dtype beta,
                       Dtype* y);
/**
 * Declaration only; no contract is documented here and the definition is not
 * in the visible part of this header.
 *
 * NOTE(review): by its name and parameter list this looks like a broadcast
 * divide — each element of the rows x cols matrix A divided by an element of
 * vector v, written to B, with `trans` choosing the broadcast axis. Confirm
 * against the definition before relying on it.
 *
 * @tparam Dtype    element type of A, v and B
 * @param nthreads  presumably the number of elements / work items — confirm
 * @param A         input matrix
 * @param v         input vector
 * @param rows      row count
 * @param cols      column count
 * @param trans     CBLAS_TRANSPOSE selector
 * @param B         output matrix
 */
template <typename Dtype>
void tensorrt_gpu_divbsx(const int nthreads,
                         const Dtype* A,
                         const Dtype* v,
                         const int rows,
                         const int cols,
                         const CBLAS_TRANSPOSE trans,
                         Dtype* B);
/**
 * Declaration only; no contract is documented here and the definition is not
 * in the visible part of this header.
 *
 * NOTE(review): by its name and parameter list this looks like a broadcast
 * multiply — each element of the rows x cols matrix A multiplied by an element
 * of vector v, written to B, with `trans` choosing the broadcast axis. Confirm
 * against the definition before relying on it.
 *
 * @tparam Dtype    element type of A, v and B
 * @param nthreads  presumably the number of elements / work items — confirm
 * @param A         input matrix
 * @param v         input vector
 * @param rows      row count
 * @param cols      column count
 * @param trans     CBLAS_TRANSPOSE selector
 * @param B         output matrix
 */
template <typename Dtype>
void tensorrt_gpu_mulbsx(const int nthreads,
                         const Dtype* A,
                         const Dtype* v,
                         const int rows,
                         const int cols,
                         const CBLAS_TRANSPOSE trans,
                         Dtype* B);
/**
 * Declaration only; the definition is not in the visible part of this header.
 * Returns a cudaError_t status to the caller.
 *
 * NOTE(review): the parameter names suggest an axis permutation of
 * bottom_data into top_data (or the reverse when `forward` is false), driven
 * by permute_order and the old/new step arrays over num_axes axes, issued on
 * `stream`. None of that is established here — confirm against the definition.
 *
 * @param nthreads       presumably the number of elements / work items — confirm
 * @param bottom_data    float buffer (input side, by its name)
 * @param forward        direction flag
 * @param permute_order  int array describing the axis order
 * @param old_steps      int array, presumably per-axis strides before the permute
 * @param new_steps      int array, presumably per-axis strides after the permute
 * @param num_axes       number of axes
 * @param top_data       float buffer (output side, by its name)
 * @param stream         CUDA stream handle
 */
cudaError_t tensorrt_gpu_permute(const int nthreads,
                                 float* const bottom_data,
                                 const bool forward,
                                 const int* permute_order,
                                 const int* old_steps,
                                 const int* new_steps,
                                 const int num_axes,
                                 float* const top_data,
                                 cudaStream_t stream);
/**
 * Declaration only; the definition is not in the visible part of this header.
 * Returns a cudaError_t status to the caller.
 *
 * NOTE(review): the name and parameters suggest a softmax over `channels`
 * computed from bottom_data into top_data, with scale_data as scratch space
 * and outer_num_ / inner_num_ describing the layout around the channel axis.
 * None of that is established here — confirm against the definition.
 *
 * @param bottom_data  read-only float input
 * @param count        presumably the total element count — confirm
 * @param channels     channel count
 * @param outer_num_   presumably the product of dimensions before the channel axis
 * @param inner_num_   presumably the product of dimensions after the channel axis
 * @param scale_data   float buffer (presumably temporary storage)
 * @param top_data     float output buffer
 * @param stream       CUDA stream handle
 */
cudaError_t SoftmaxLayer(const float *bottom_data,
                         int count,
                         int channels,
                         int outer_num_,
                         int inner_num_,
                         float *scale_data,
                         float *top_data,
                         cudaStream_t stream);
/**
 * Declaration only; the definition is not in the visible part of this header.
 * Returns a cudaError_t status to the caller.
 *
 * NOTE(review): the name and parameters suggest copying one input
 * (bottom_data) into its slot of a concatenated output (top_data) along a
 * concat axis, at offset_concat_axis. None of that is established here —
 * confirm against the definition.
 *
 * @param nthreads            presumably the number of elements / work items — confirm
 * @param bottom_data         read-only float input
 * @param kForward            direction flag
 * @param num_concats_        presumably the number of outer slices — confirm
 * @param concat_input_size_  presumably the element count per inner slice — confirm
 * @param top_concat_axis     presumably the output extent along the concat axis
 * @param bottom_concat_axis  presumably the input extent along the concat axis
 * @param offset_concat_axis  presumably the input's offset along the concat axis
 * @param top_data            float output buffer
 * @param stream              CUDA stream handle
 */
cudaError_t ConcatLayer(int nthreads,
                        const float *bottom_data,
                        bool kForward,
                        int num_concats_,
                        int concat_input_size_,
                        int top_concat_axis,
                        int bottom_concat_axis,
                        int offset_concat_axis,
                        float *top_data,
                        cudaStream_t stream);
//cudaError_t cudaSoftmax(int n, int channels, float* x, float*y, cudaStream_t stream);
//virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
/**
 * Declaration only; the definition is not in the visible part of this header.
 * Returns a cudaError_t status to the caller.
 *
 * NOTE(review): by its name a Caffe-style softmax from x into y — confirm
 * against the definition.
 *
 * @param count     presumably the total element count — confirm
 * @param channels  channel count
 * @param x         float buffer (input, by position)
 * @param y         float buffer (output, by position)
 * @param stream    CUDA stream handle
 */
cudaError_t cudaSoftmax_caffe(int count,
                              int channels,
                              float* x,
                              float* y,
                              cudaStream_t stream);
/**
 * Declaration only; the definition is not in the visible part of this header.
 * Returns a cudaError_t status to the caller.
 *
 * NOTE(review): by its name a Caffe-style detection-output stage. The roles
 * of the bottom* buffers are not documented here; loc_data and prior_data
 * presumably hold box location predictions and prior boxes — confirm against
 * the definition.
 *
 * @param bottom0_count  presumably the element count of the first input — confirm
 * @param bottom1_count  presumably the element count of the second input — confirm
 * @param loc_data       float buffer
 * @param bottom1        float buffer
 * @param prior_data     float buffer
 * @param bottom3        float buffer
 * @param bottom4        float buffer
 * @param y              float buffer (output, by position)
 * @param stream         CUDA stream handle
 */
cudaError_t cudaDetectionOutput_caffe(int bottom0_count, int bottom1_count,
                                      float* loc_data, float* bottom1,
                                      float* prior_data, float* bottom3,
                                      float* bottom4, float* y,
                                      cudaStream_t stream);
#endif