#ifndef _GPU_COMMONS_CU_
#define _GPU_COMMONS_CU_
#include "Apps.cuh"
#include "GPU_Commons.cuh"
#if defined(_MATLAB_DISPLAY_CUDA_SETTING_INFO_)
#include "D:\Program Files (x86)\MATLAB\R2014b\extern\include\mex.h"
#endif
/* **********************************************************************************************************
Commonly used functions
*************************************************************************************************************/
/* Check CUDA errors */
bool CheckCudaError(cudaError errnum, char *ErrMsg, char *InfoString)
{
	/*
	Parameters description:
		errnum (input) - the error code returned by a CUDA function;
		ErrMsg (output) - a string pointer that receives the text of the error description;
		InfoString (input) - a string pointer to some additional information (e.g., the location of the error);
		return value - true if an error is identified, or false otherwise.
	*/
	if (errnum != cudaSuccess){
		sprintf_s(ErrMsg, TempStringBufferSize,
			"%s : %s - '%s'.", __FUNCTION__, InfoString, cudaGetErrorString(errnum));
		return true;
	}
	else
		return false;
}
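/* A minimal usage sketch for CheckCudaError (illustrative only, not part of the original interface).
   It assumes TempStringBufferSize is a compile-time constant from Apps.cuh and that printf is
   available through the headers above; Demo_CheckCudaError and d_data are hypothetical names. */
static bool Demo_CheckCudaError(void)
{
	char ErrMsg[TempStringBufferSize];
	float *d_data = NULL;
	/* Any CUDA runtime call that returns a cudaError_t can be wrapped this way;
	   the last argument names the failing call in the formatted message. */
	if (CheckCudaError(cudaMalloc((void**)&d_data, 256 * sizeof(float)), ErrMsg, "cudaMalloc")){
		printf("%s\n", ErrMsg);
		return false;
	}
	cudaFree(d_data);
	return true;
}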
/* Initialize the CUDA device */
bool CuDeviceInit(struct cudaDeviceProp *deviceProp, int DeviceID, char *ErrMsg)
{
	/*
	Parameters description:
		deviceProp (output) - a struct that receives the CUDA device properties;
		DeviceID (input) - the user-specified GPU device ID number;
		ErrMsg (output) - a string pointer that receives the text of the error description;
		return value - true if the user-specified GPU device is initialized successfully, or
			false otherwise.
	*/
	int deviceCount, targetDevice;
	/* Get the number of CUDA-enabled GPU devices */
	if(CheckCudaError(cudaGetDeviceCount(&deviceCount), ErrMsg, "cudaGetDeviceCount"))
		return(false);
	/* Select and reset the target device */
	targetDevice = DeviceID;
	if(CheckCudaError(cudaSetDevice(targetDevice), ErrMsg, "cudaSetDevice")) return(false);
	if(CheckCudaError(cudaDeviceReset(), ErrMsg, "cudaDeviceReset")) return(false);
	/* Query the target device */
	if(CheckCudaError(cudaGetDeviceProperties(deviceProp, targetDevice),
		ErrMsg, "cudaGetDeviceProperties"))
		return(false);
	return(true);
}
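/* A usage sketch for CuDeviceInit (illustrative only, not part of the original interface).
   Device ID 0 and the name Demo_CuDeviceInit are assumptions; printf and TempStringBufferSize
   are assumed to be available through the headers above. */
static bool Demo_CuDeviceInit(void)
{
	struct cudaDeviceProp deviceProp;
	char ErrMsg[TempStringBufferSize];
	/* Select, reset and query GPU 0; on failure ErrMsg already holds the reason. */
	if (!CuDeviceInit(&deviceProp, 0, ErrMsg)){
		printf("%s\n", ErrMsg);
		return false;
	}
	printf("Using %s with %d multiprocessors.\n", deviceProp.name, deviceProp.multiProcessorCount);
	return true;
}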
/*
Following the instructions in CUDA_Occupancy_Calculator.xls, which is provided with the CUDA toolkit,
the function Get_Kernel_Basic_Config below determines the limit on the CUDA block size
and the limit on the shared memory size per block for a kernel function.
*/
bool Get_Kernel_Basic_Config(int userBlkSize, struct cudaDeviceProp deviceProp,
	struct cudaFuncAttributes KernelAttrib, int *MaxThreadsPer1DBlk,
	size_t *MaxExtSharedMemPerBlk, char *ErrMsg)
{
	/*
	Parameters description:
		userBlkSize (input) - the user-specified maximum number of threads per CUDA block;
		deviceProp (input) - a struct for the CUDA device properties;
		KernelAttrib (input) - a struct for the properties of the kernel function;
		MaxThreadsPer1DBlk (output) - the maximum number of threads per CUDA block
			after considering the factors listed in CUDA_Occupancy_Calculator.xls;
		MaxExtSharedMemPerBlk (output) - the maximum size of the externally (dynamically)
			allocable shared memory per CUDA block for MaxThreadsPer1DBlk threads;
		ErrMsg (output) - a string pointer that receives the text of the error description if an error is found;
		return value - true if the block size and shared memory limits are determined successfully, or
			false otherwise.
	Note:
		To understand this function, please refer to CUDA_Occupancy_Calculator.xls, which is usually
		provided with the CUDA software.
	*/
	int RegisterLimitedBlks;	// Blocks per multiprocessor limited by register usage
	int MaxWarpsLimitedBlks;	// Blocks per multiprocessor limited by the maximum number of warps
	int KernelWarpsPerBlock;	// Number of warps per block
	int MaxWarpsPerMP;			// Maximum allowed warps per multiprocessor
	int RegsWarpsLimitedBlks;	// Blocks per multiprocessor limited by both registers and warps
	size_t MaxSharedMemPerBlk;	/* Maximum available shared memory per block
								   (including the static shared variables in the kernel function). */
	*MaxThreadsPer1DBlk = deviceProp.maxThreadsPerBlock;
	if (userBlkSize>0) *MaxThreadsPer1DBlk = MIN(userBlkSize, *MaxThreadsPer1DBlk);
	*MaxThreadsPer1DBlk = MIN(*MaxThreadsPer1DBlk, KernelAttrib.maxThreadsPerBlock);
	RegisterLimitedBlks = (int)floor((deviceProp.regsPerBlock+0.0f)/(*MaxThreadsPer1DBlk)/KernelAttrib.numRegs);
	KernelWarpsPerBlock = (int)ceil((*MaxThreadsPer1DBlk+0.0f)/deviceProp.warpSize);
	MaxWarpsPerMP = (int)floor((deviceProp.maxThreadsPerMultiProcessor+0.0f)/deviceProp.warpSize);	// per-multiprocessor warp limit, as in the occupancy calculator
	MaxWarpsLimitedBlks = (int)floor((MaxWarpsPerMP+0.0f)/KernelWarpsPerBlock);
	RegsWarpsLimitedBlks = MIN(RegisterLimitedBlks, MaxWarpsLimitedBlks);
	MaxSharedMemPerBlk = (size_t)floor((deviceProp.sharedMemPerBlock+0.0f)/RegsWarpsLimitedBlks);
	if(KernelAttrib.sharedSizeBytes>MaxSharedMemPerBlk){
		sprintf_s(ErrMsg, TempStringBufferSize,
			"%s : not enough shared memory for the kernel's static shared memory!", __FUNCTION__);
		return false;
	}
	// MaxExtSharedMemPerBlk is the maximum externally available shared memory per block
	*MaxExtSharedMemPerBlk = MaxSharedMemPerBlk - KernelAttrib.sharedSizeBytes;
	return(true);
}
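/* A launch-configuration sketch showing how Get_Kernel_Basic_Config can be used
   (illustrative only). Demo_Kernel and Demo_Launch are hypothetical and not part of the
   original interface; a real caller would query its own kernel with cudaFuncGetAttributes. */
__global__ void Demo_Kernel(float *Data, int N)
{
	extern __shared__ float Buf[];	/* sized by the third launch parameter below */
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	Buf[threadIdx.x] = (i < N) ? Data[i] : 0.0f;
	__syncthreads();
	if (i < N) Data[i] = Buf[threadIdx.x] * 2.0f;
}

static bool Demo_Launch(struct cudaDeviceProp deviceProp, float *d_Data, int N, char *ErrMsg)
{
	struct cudaFuncAttributes KernelAttrib;
	int MaxThreadsPer1DBlk;
	size_t MaxExtSharedMemPerBlk;
	if (CheckCudaError(cudaFuncGetAttributes(&KernelAttrib, Demo_Kernel),
		ErrMsg, "cudaFuncGetAttributes")) return false;
	/* userBlkSize = 0 means no user cap; the device and kernel limits decide the block size. */
	if (!Get_Kernel_Basic_Config(0, deviceProp, KernelAttrib,
		&MaxThreadsPer1DBlk, &MaxExtSharedMemPerBlk, ErrMsg)) return false;
	/* Demo_Kernel needs one float of dynamic shared memory per thread, which must fit
	   in the externally allocable budget reported above. */
	size_t ShMemBytes = MaxThreadsPer1DBlk * sizeof(float);
	if (ShMemBytes > MaxExtSharedMemPerBlk){
		sprintf_s(ErrMsg, TempStringBufferSize,
			"%s : requested dynamic shared memory exceeds the per-block budget.", __FUNCTION__);
		return false;
	}
	int NumBlks = (N + MaxThreadsPer1DBlk - 1) / MaxThreadsPer1DBlk;
	Demo_Kernel<<<NumBlks, MaxThreadsPer1DBlk, ShMemBytes>>>(d_Data, N);
	return !CheckCudaError(cudaGetLastError(), ErrMsg, "Demo_Kernel launch");
}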
#endif // _GPU_COMMONS_CU_