diff --git a/build/examples/15_ampere_sparse_tensorop_gemm/ncu_run.sh b/build/examples/15_ampere_sparse_tensorop_gemm/ncu_run.sh index e915e9c7..91d5b075 100755 --- a/build/examples/15_ampere_sparse_tensorop_gemm/ncu_run.sh +++ b/build/examples/15_ampere_sparse_tensorop_gemm/ncu_run.sh @@ -4,4 +4,4 @@ if [ $# != 1 ]; then echo "e.g. ./ncu_run.sh output_file_name " exit 100 fi -sudo /usr/local/cuda/bin/ncu --call-stack --nvtx -o $1 --set full ./15_ampere_sparse_tensorop_gemm +sudo /usr/local/cuda/bin/ncu --call-stack --nvtx -o $1 --set full ./15_ampere_sparse_tensorop_gemm diff --git a/build/examples/15_ampere_sparse_tensorop_gemm/nsys_run.sh b/build/examples/15_ampere_sparse_tensorop_gemm/nsys_run.sh index 4f8515b0..8a3765b4 100755 --- a/build/examples/15_ampere_sparse_tensorop_gemm/nsys_run.sh +++ b/build/examples/15_ampere_sparse_tensorop_gemm/nsys_run.sh @@ -4,14 +4,18 @@ if [ $# != 1 ]; then echo "e.g. ./nsys_run.sh output_file_name " exit 100 fi +#for i in 0 32 64 96 128 224 320 416 512 1024 2048 4096 8192 #for i in 0 32 64 96 128 224 320 416 512 #for i in 0 32 64 96 128 224 256 #for i in 0 32 64 96 128 #for i in 0 32 64 -for i in 0 32 +#for i in 0 32 +for i in 3264 +#for i in 0 do nsys profile -t cuda,osrt,nvtx,cudnn,cublas -o $1_$i.qdstrm --stats=true -w true ./15_ampere_sparse_tensorop_gemm $1 $i | grep -e SparseGemm -e Gemm -e vecAddOpt -e M: >>$1.log echo ////////////////////////////////////////////////////////// >>$1.log echo >>$1.log done +cat $1.log #nsys profile -t cuda,osrt,nvtx,cudnn,cublas -o $1.qdstrm --stats=true -w true ./15_ampere_sparse_tensorop_gemm 1024 0 |grep -e SparseGemm -e Gemm -e vecAddOpt -e M: diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu index 513fae71..97cde013 100644 --- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu +++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu @@ -59,16 +59,18 @@ efficiently. /////////////////////////////////////////////// ///// TEST CONFIGURATION /////////////////////////////////////////////// -#define DENSE_GEMM_EN 1 // 0: disable, 1: enable +#define DENSE_GEMM_EN 0 // 0: disable, 1: enable #define VEC_ADD_EN 1 // 0: disable, 1: enable #define REF_EN 2 // 0: disable, 1: host, 2: cutlass #define DBG_LOG_EN 0 // 0: disable, 1: enable -#define LIST_ENTRY_NUM 2048 +#define LIST_ENTRY_NUM 8192 -#define M_SIZE 512 -#define K_SIZE 20480 -#define N_SIZE 5120 +#define M_SIZE 8192 +#define K_SIZE 16384 +#define N_SIZE 4096 +//#define K_SIZE 20480 +//#define N_SIZE 5120 #define M_EXTRA_SIZE 0