Add bit redop test #25

Open: wants to merge 1 commit into master
2 changes: 1 addition & 1 deletion README.md
@@ -52,7 +52,7 @@ All tests support the same set of arguments :
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
* `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
* NCCL operations arguments
* `-o,--op <sum/prod/min/max/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
* `-o,--op <sum/prod/min/max/band/bor/bxor/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
* `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
* `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
* Performance
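With the extended table, a benchmark run can request the new reductions directly (for example `-o band` or `-o bxor`), and `-o all` cycles through every entry. As a point of reference, here is a minimal sketch of how the `-o` string could be resolved against the test_opnames table extended in src/common.cu below; the parseReduceOp name is hypothetical and not part of this change:

// Illustrative only: map a "-o" argument onto an index into test_ops[] /
// test_opnames[] (declared in src/common.cu). Assumes <cstring> for strcmp;
// returns -1 for an unknown name so the caller can fall back to the usage text.
static int parseReduceOp(const char* name) {
  for (int op = 0; op < ncclNumOps; op++) {
    if (strcmp(name, test_opnames[op]) == 0) return op;  // e.g. "band" -> the ncclBitAnd slot
  }
  return -1;
}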
54 changes: 50 additions & 4 deletions src/common.cu
@@ -18,8 +18,8 @@ const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32",
ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"};
#endif
ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin};
const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"};
ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin, ncclBitAnd, ncclBitOr, ncclBitXor};
const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min", "band", "bor", "bxor"};
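// Note (illustrative, not part of this diff): both tables above are sized by
// ncclNumOps, so the change assumes an NCCL build whose ncclRedOp_t enum also
// defines ncclBitAnd, ncclBitOr and ncclBitXor. A compile-time guard would make
// that requirement explicit, e.g.:
static_assert(ncclNumOps >= 7,
              "bitwise reduction tests require an NCCL that exposes band/bor/bxor");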

thread_local int is_main_thread = 0;

@@ -184,6 +184,12 @@ template<typename T>
__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; }
template<typename T>
__device__ T ncclOpMin(T a, T b) { return a<b ? a : b; }
template<typename T>
__device__ T ncclOpBitAnd(T a, T b) { return a&b; }
template<typename T>
__device__ T ncclOpBitOr(T a, T b) { return a|b; }
template<typename T>
__device__ T ncclOpBitXor(T a, T b) { return a^b; }

// Definitions for half
template<>
@@ -195,6 +201,45 @@ __device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(
template<>
__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }

// Definitions for bitwise ops on floating-point types
template<typename T>
union bitConverter;

template<>
union bitConverter<half> {
half storage;
int16_t a;
};
template<>
union bitConverter<float> {
float storage;
int a;
};
template<>
union bitConverter<double> {
double storage;
int64_t a;
};

#define BIT_OPS(dtype, name, op) \
template<> \
__device__ dtype ncclOpBit##name(dtype a, dtype b) { \
union bitConverter<dtype> ca, cb, cr; \
ca.storage = a; \
cb.storage = b; \
cr.a = ca.a op cb.a; \
return cr.storage; \
}

#define BIT_OP_TYPE(name, op) \
BIT_OPS(half, name, op) \
BIT_OPS(float, name, op) \
BIT_OPS(double, name, op)

BIT_OP_TYPE(And, &)
BIT_OP_TYPE(Or, |)
BIT_OP_TYPE(Xor, ^)
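
// Sketch (illustrative, not part of this diff): the unions above reinterpret a
// floating-point value as a same-width integer, so the bitwise operator acts on
// the raw bit pattern rather than on the numeric value. A host-side equivalent
// for float, using the same convention (uint32_t is assumed available alongside
// the int16_t/int64_t already used above):
static float bitAndFloatHost(float a, float b) {
  union { float f; uint32_t u; } ca, cb, cr;
  ca.f = a; cb.f = b;
  cr.u = ca.u & cb.u;   // combine the 32-bit patterns
  return cr.f;          // reinterpret the combined bits as a float
}
// Strictly, C++ leaves union type-punning outside the standard's guarantees; gcc
// and nvcc accept it, and std::memcpy is the portable alternative.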

template<typename T, T (*Op)(T, T)>
__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
@@ -207,7 +252,8 @@ __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offse
}

#define KERN(type, op) (void*)InitDataReduceKernel<type, op<type>>
#define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin)
#define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin), \
KERN(type, ncclOpBitAnd), KERN(type, ncclOpBitOr), KERN(type, ncclOpBitXor)

static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = {
OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double)
@@ -658,7 +704,7 @@ int main(int argc, char* argv[]) {
"[-w,--warmup_iters <warmup iteration count>] \n\t"
"[-p,--parallel_init <0/1>] \n\t"
"[-c,--check <0/1>] \n\t"
"[-o,--op <sum/prod/min/max/all>] \n\t"
"[-o,--op <sum/prod/min/max/band/bor/bxor/all>] \n\t"
"[-d,--datatype <nccltype/all>] \n\t"
"[-r,--root <root>] \n\t"
"[-z,--blocking <0/1>] \n\t"