From dc481729650851c543e26dbcaa43ec53529cac69 Mon Sep 17 00:00:00 2001 From: Edgar Solomonik Date: Thu, 13 Oct 2016 15:00:16 -0500 Subject: [PATCH] Got ScaLAPACK interface working, at least for the read/write pair use case (direct copy for pure cyclic untested). Brought up to date nonsq_pgemm_test, although now there is overhead to reading/writing the tensor. Got rid of other tests. Tweaked config.mk (generated by configure) to give info on how to add ScaLAPACK libs and turn on MKL sparse BLAS kernels. --- Makefile | 14 +- bench/nonsq_pgemm_bench.cxx | 441 ------------------ configure | 12 + scalapack_tests/Makefile | 8 + .../nonsq_pgemm_test.cxx | 110 +++-- src/interface/matrix.cxx | 53 ++- src/interface/matrix.h | 6 +- src/tensor/untyped_tensor.cxx | 6 +- test/pgemm_test.cxx | 373 --------------- 9 files changed, 123 insertions(+), 900 deletions(-) delete mode 100644 bench/nonsq_pgemm_bench.cxx create mode 100644 scalapack_tests/Makefile rename {test => scalapack_tests}/nonsq_pgemm_test.cxx (87%) delete mode 100644 test/pgemm_test.cxx diff --git a/Makefile b/Makefile index 8e2f894e..b506779d 100644 --- a/Makefile +++ b/Makefile @@ -15,16 +15,19 @@ all: $(BDIR)/lib/libctf.a EXAMPLES = algebraic_multigrid apsp bitonic_sort btwn_central ccsd checkpoint dft_3D fft force_integration force_integration_sparse jacobi matmul neural_network particle_interaction qinformatics recursive_matmul scan sparse_mp3 sparse_permuted_slice spectral_element spmv sssp strassen trace TESTS = bivar_function bivar_transform ccsdt_map_test ccsdt_t3_to_t2 dft diag_ctr diag_sym endomorphism_cust endomorphism_cust_sp endomorphism gemm_4D multi_tsr_sym permute_multiworld readall_test readwrite_test repack scalar speye sptensor_sum subworld_gemm sy_times_ns test_suite univar_function weigh_4D -BENCHMARKS = nonsq_pgemm_bench bench_contraction bench_nosym_transp bench_redistribution model_trainer +BENCHMARKS = bench_contraction bench_nosym_transp bench_redistribution model_trainer + +SCALAPACK_TESTS = nonsq_pgemm_test nonsq_pgemm_bench STUDIES = fast_diagram fast_3mm fast_sym fast_sym_4D \ fast_tensor_ctr fast_sy_as_as_tensor_ctr fast_as_as_sy_tensor_ctr -EXECUTABLES = $(EXAMPLES) $(TESTS) $(BENCHMARKS) $(STUDIES) +EXECUTABLES = $(EXAMPLES) $(TESTS) $(BENCHMARKS) $(SCALAPACK_TESTS) $(STUDIES) export EXAMPLES export TESTS export BENCHMARKS +export SCALAPACK_TESTS export STUDIES @@ -45,6 +48,13 @@ tests: $(TESTS) $(TESTS): $(MAKE) $@ -C test +.PHONY: scalapack_tests +scalapack_tests: $(SCALAPACK_TESTS) +$(SCALAPACK_TESTS): + $(MAKE) $@ -C scalapack_tests + + + .PHONY: bench bench: $(BENCHMARKS) $(BENCHMARKS): diff --git a/bench/nonsq_pgemm_bench.cxx b/bench/nonsq_pgemm_bench.cxx deleted file mode 100644 index 5ba23e23..00000000 --- a/bench/nonsq_pgemm_bench.cxx +++ /dev/null @@ -1,441 +0,0 @@ -/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/ - - - -#include "mpi.h" -#include -#include -#include -#include -//#include -#include "../dist_tensor/cyclopstf.hpp" -#include "../shared/util.h" - -#define ZGEMM_TEST -#ifdef ZGEMM_TEST -typedef std::complex VAL_TYPE; -#else -typedef double VAL_TYPE; -#endif - -#define NUM_ITER 5 - -//proper modulus for 'a' in the range of [-b inf] -#define WRAP(a,b) ((a + b)%b) -#define MIN( a, b ) ( ((a) < (b)) ? (a) : (b) ) -#if (defined BGP || defined BGQ) -#define DESCINIT descinit -#define PDGEMM pdgemm -#define PZGEMM pzgemm -#else -#define DESCINIT descinit_ -#define PDGEMM pdgemm_ -#define PZGEMM pzgemm_ -#endif - -extern "C" { -void Cblacs_pinfo(int*,int*); - -void Cblacs_get(int,int,int*); - -int Cblacs_gridinit(int*,char*,int,int); - -void DESCINIT(int *, const int *, - const int *, const int *, - const int *, const int *, - const int *, const int *, - const int *, int *); - -void PDGEMM( char *, char *, - int *, int *, - int *, double *, - double *, int *, - int *, int *, - double *, int *, - int *, int *, - double *, double *, - int *, int *, - int *); -void PZGEMM( char *, char *, - int *, int *, - int *, std::complex *, - std::complex *, int *, - int *, int *, - std::complex *, int *, - int *, int *, - std::complex *, std::complex *, - int *, int *, - int *); -} - -static void cdesc_init(int * desc, - const int m, const int n, - const int mb, const int nb, - const int irsrc, const int icsrc, - const int ictxt, const int LLD, - int * info){ - DESCINIT(desc,&m,&n,&mb,&nb,&irsrc,&icsrc, - &ictxt, &LLD, info); -} - - -static void cpdgemm( char n1, char n2, - int sz1, int sz2, - int sz3, double ALPHA, - double * A, int ia, - int ja, int * desca, - double * B, int ib, - int jb, int * descb, - double BETA, double * C, - int ic, int jc, - int * descc){ - PDGEMM(&n1, &n2, &sz1, - &sz2, &sz3, &ALPHA, - A, &ia, &ja, desca, - B, &ib, &jb, descb, &BETA, - C, &ic, &jc, descc); -} -static void cpzgemm(char n1, char n2, - int sz1, int sz2, - int sz3, std::complex ALPHA, - std::complex * A, int ia, - int ja, int * desca, - std::complex * B, int ib, - int jb, int * descb, - std::complex BETA, std::complex * C, - int ic, int jc, - int * descc){ - PZGEMM(&n1, &n2, &sz1, - &sz2, &sz3, &ALPHA, - A, &ia, &ja, desca, - B, &ib, &jb, descb, &BETA, - C, &ic, &jc, descc); -} - -int uint_log2( int n){ - int j; - for (j=0; n!=1; j++) n>>=1; - return j; -} - -//extern "C" void pbm(); - -int main(int argc, char **argv) { -/*void pbm() { - - int argc; - char **argv;*/ - int myRank, numPes; - int m, k, n; - int nprow, npcol; - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &numPes); - MPI_Comm_rank(MPI_COMM_WORLD, &myRank); - - - if (argc != 1 && argc >= 7) { - if (myRank == 0) printf("%s [m] [n] [k] [nprow] [niter]\n", - argv[0]); - MPI_Abort(MPI_COMM_WORLD, -1); - } - - if (argc <= 4){ - m = 400; - n = 300; - k = 500; - if (sqrt(numPes)*sqrt(numPes) == numPes){ - nprow = sqrt(numPes); - npcol = sqrt(numPes); - } else { - nprow = numPes; - npcol = 1; - } - } - if (argc > 4){ - m = atoi(argv[1]); - n = atoi(argv[2]); - k = atoi(argv[3]); - nprow = atoi(argv[4]); - npcol = numPes/nprow; - assert(numPes == nprow*npcol); - } - - int num_iter; - if (argc > 5) num_iter = atoi(argv[5]); - else num_iter = NUM_ITER; - -#ifdef ZGEMM_TEST - std::complex ALPHA = std::complex(2.0,-.7); - std::complex BETA =std::complex(1.3,.4); -#else - double ALPHA = 0.3; - double BETA = .7; -#endif - - - - if (myRank == 0){ - printf("MATRIX MULTIPLICATION OF MATRICES\n"); - printf("MATRIX DIMENSIONS ARE %d by %d by %d\n", m, n, k); - printf("PROCESSOR GRID IS %d by %d\n", nprow, npcol); - printf("PERFORMING %d ITERATIONS\n", num_iter); - printf("WITH RANDOM DATA\n"); -// printf("ALPHA = %lf, BETA = %lf\n",ALPHA,BETA); - } - - int iter, i, j, nblk, mblk; - - nblk = n/npcol; - //mblk = m/nprow; - mblk = m/npcol; - - VAL_TYPE * mat_A = (VAL_TYPE*)malloc(k*m*sizeof(VAL_TYPE)/numPes); - VAL_TYPE * mat_B = (VAL_TYPE*)malloc(n*k*sizeof(VAL_TYPE)/numPes); - VAL_TYPE * mat_C = (VAL_TYPE*)malloc(nblk*mblk*sizeof(VAL_TYPE)); - VAL_TYPE * mat_C_CTF = (VAL_TYPE*)malloc(nblk*mblk*sizeof(VAL_TYPE)); - - VAL_TYPE ans_verify; - - srand48(13*myRank); - for (i=0; i > * myctf = new tCTF< std::complex >; - myctf->init(MPI_COMM_WORLD,myRank,numPes,TOPOLOGY_BGQ); - - cpzgemm('T','N', m, n, k, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C, 1, 1, desc_c); - - if (myRank == 0) - printf("Performed ScaLAPACK pzgemm, starting CTF pzgemm\n"); - - myctf->pgemm('T','N', m, n, k, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C_CTF, 1, 1, desc_c); -/* myctf->def_scala_mat(desc_a, mat_A, &tid_A); - myctf->def_scala_mat(desc_b, mat_B, &tid_B); - myctf->def_scala_mat(desc_c, mat_C_CTF, &tid_C); - myctf->pgemm('T', 'N', ALPHA, tid_A, tid_B, BETA, tid_C); - myctf->read_scala_mat(tid_C, mat_C_CTF); -*/ -#if 0 - for (i=0; i 1.E-6 && - fabs(mat_C[i*(nblk)+j].imag()-mat_C_CTF[i*(nblk)+j].imag()) > 1.E-6){ - printf("[%d] incorrect answer %lf %lf at C[%d,%d]=%lf,%lf\n", - myRank, mat_C_CTF[i*(nblk)+j].real(), - mat_C_CTF[i*(nblk)+j].imag(), i,j, - mat_C[i*(nblk)+j].real(), mat_C[i*(nblk)+j].imag()); - } - } - } - if (myRank == 0) - printf("PZGEMM Verification complete, correct if no errors.\n"); -#endif - - double startTime, endTime; - startTime = MPI_Wtime(); - for (iter=0; iter < num_iter; iter++){ - //seq_square_matmul(mat_A, mat_B, mat_C, blockDim, 0); - cpzgemm('T','N', m, n, k, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C, 1, 1, desc_c); - if (iter == 0) - ans_verify = mat_C[2]; - } - - -#else - CTF * myctf = new CTF; - myctf->init(MPI_COMM_WORLD,TOPOLOGY_BGQ,myRank,numPes); - cpdgemm('N','N', m, n, k, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C, 1, 1, desc_c); - - if (myRank == 0) - printf("Performed ScaLAPACK pdgemm, starting CTF pdgemm\n"); - - myctf->pgemm('N','N', m, n, k, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C_CTF, 1, 1, desc_c); - /*myctf->def_scala_mat(desc_a, mat_A, &tid_A); - myctf->def_scala_mat(desc_b, mat_B, &tid_B); - myctf->def_scala_mat(desc_c, mat_C_CTF, &tid_C); - myctf->pgemm('T', 'N', ALPHA, tid_A, tid_B, BETA, tid_C); - myctf->read_scala_mat(tid_C, mat_C_CTF); -*/ - - for (i=0; i 1.E-6){ - printf("[%d] incorrect answer %lf at C[%d,%d]=%lf\n", - myRank, mat_C_CTF[i*(nblk)+j], i,j, - mat_C[i*(nblk)+j]); - } - } - } - if (myRank == 0) - printf("PDGEMM Verification complete, correct if no errors.\n"); - - double startTime, endTime; - startTime = MPI_Wtime(); - for (iter=0; iter < num_iter; iter++){ - //seq_square_matmul(mat_A, mat_B, mat_C, blockDim, 0); - cpdgemm('N','N', m, n, k, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C, 1, 1, desc_c); - if (iter == 0) - ans_verify = mat_C[2]; - } -#endif - - if(myRank == 0) { - endTime = MPI_Wtime(); - printf("Completed %u ScaLAPACK iterations\n", iter); - printf("Time elapsed per iteration: %f\n", (endTime - startTime)/num_iter); -#ifdef ZGEMM_TEST - printf("Gigaflops: %f\n", 8.*m*n*k/ - ((endTime - startTime)/num_iter)*1E-9); -#else - printf("Gigaflops: %f\n", 2.*m*n*k/ - ((endTime - startTime)/num_iter)*1E-9); -#endif - } - -#ifdef TAU - TAU_PROFILE_TIMER(timer, "main", " (int, char**)", TAU_USER); - TAU_PROFILE_START(timer); - TAU_PROFILE_INIT(argc, argv); - TAU_PROFILE_SET_NODE(myRank); - TAU_PROFILE_SET_CONTEXT(0); -#endif - - startTime = MPI_Wtime(); - for (iter=0; iter < num_iter; iter++){ - //seq_square_matmul(mat_A, mat_B, mat_C, blockDim, 0); - TAU_FSTART(ctf_pgemm_bench); - myctf->pgemm('T','N', m, n, k, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C, 1, 1, desc_c); - TAU_FSTOP(ctf_pgemm_bench); -// myctf->pgemm('T', 'N', ALPHA, tid_A, tid_B, BETA, tid_C); - if (iter == 0) - ans_verify = mat_C[2]; - } - - if(myRank == 0) { - endTime = MPI_Wtime(); - printf("Completed %u CTF PGEMM iterations\n", iter); - printf("Time elapsed per iteration: %f\n", (endTime - startTime)/num_iter); -#ifdef ZGEMM_TEST - printf("Gigaflops: %f\n", 8.*m*n*k/ - ((endTime - startTime)/num_iter)*1E-9); -#else - printf("Gigaflops: %f\n", 2.*m*n*k/ - ((endTime - startTime)/num_iter)*1E-9); -#endif - } - - TAU_PROFILE_STOP(timer); - - myctf->exit(); - - MPI_Finalize(); - return 0; -} /* end function main */ - - diff --git a/configure b/configure index 2554fa2e..9855a2ad 100755 --- a/configure +++ b/configure @@ -567,6 +567,12 @@ else fi +SPMKL_COMMENT=" +### Optional: sparse MKL BLAS Routines +#uncomment below to enable MKL BLAS routines if setting up MKL manually +#DEFS += -DUSE_SP_MKL=1 +" + # # Determine BLAS/LAPACK naming convention # @@ -585,6 +591,7 @@ else echo 'BLAS library was not specified, using -mkl.' BLASLIBS=-mkl DEFS="$DEFS -DUSE_SP_MKL=1" + SPMKL_COMMENT="" elif testlink -lblas dgemm_ 0; then UNDERSCORE=1 echo 'BLAS library was not specified, using -lblas (with underscores).' @@ -697,6 +704,7 @@ mkdir -p $BUILDDIR/lib mkdir -p $BUILDDIR/obj mkdir -p $BUILDDIR/bin + cat > $BUILDDIR/config.mk < #include #include -//#include -#include "../dist_tensor/cyclopstf.hpp" -#include "../shared/util.h" -#include "../shared/offload.h" +#include "ctf.hpp" + +using namespace CTF; #define ZGEMM_TEST #ifdef ZGEMM_TEST @@ -178,18 +177,20 @@ int main(int argc, char **argv) { else alloc_host_buf = 1; #ifdef ZGEMM_TEST - std::complex ALPHA = std::complex(2.0,-.7); - std::complex BETA =std::complex(1.3,.4); + std::complex ALPHA = std::complex(.3,0.); + std::complex BETA =std::complex(.7,0.); #else double ALPHA = 0.3; double BETA = .7; #endif + double DALPHA = 0.3; + double DBETA = .7; if (myRank == 0){ printf("matrix multiplication of matrices\n"); - printf("matrix dimensions are " PRId64 " by " PRId64 " by " PRId64 "\n", m, n, k); + printf("matrix dimensions are %ld by %ld by %ld\n", m, n, k); printf("processor grid is %d by %d\n", nprow, npcol); printf("performing %d iterations\n", num_iter); printf("with random data\n"); @@ -201,17 +202,18 @@ int main(int argc, char **argv) { int64_t nblk, mblk; int64_t fnblk, fmblk, fkblk; +//#define TESTPAD #ifdef TESTPAD nblk = n/npcol+4; - mblk = m/npcol; + mblk = m/std::min(nprow,npcol); fnblk = n/npcol+4; - fmblk = m/npcol; + fmblk = m/std::min(nprow,npcol); fkblk = k/nprow; #else nblk = n/npcol; - mblk = m/npcol; + mblk = m/std::min(nprow,npcol); fnblk = n/npcol; - fmblk = m/npcol; + fmblk = m/std::min(nprow,npcol); fkblk = k/nprow; #endif @@ -245,8 +247,8 @@ int main(int argc, char **argv) { for (i=0; i=n){ - mat_B[i*(fkblk)+j].real() = 0.0; - mat_B[i*(fkblk)+j].imag() = 0.0; - } else { - mat_B[i*(fkblk)+j].real() = drand48(); - mat_B[i*(fkblk)+j].imag() = drand48(); + mat_B[i*(fkblk)+j].real(0.0); + mat_B[i*(fkblk)+j].imag(0.0); + } else { + mat_B[i*(fkblk)+j].real(drand48()); + mat_B[i*(fkblk)+j].imag(drand48()); } #else mat_B[i*(fkblk)+j] = drand48(); @@ -273,11 +275,11 @@ int main(int argc, char **argv) { for (j=0; j=n){ - mat_C[i*(mblk)+j].real() = 0.0; - mat_C[i*(mblk)+j].imag() = 0.0; + mat_C[i*(mblk)+j].real(0.0); + mat_C[i*(mblk)+j].imag(0.0); } else { - mat_C[i*(mblk)+j].real() = drand48(); - mat_C[i*(mblk)+j].imag() = drand48(); + mat_C[i*(mblk)+j].real(drand48()); + mat_C[i*(mblk)+j].imag(drand48()); } #else mat_C[i*(mblk)+j] = drand48(); @@ -320,27 +322,27 @@ int main(int argc, char **argv) { int desc_b[9]; int desc_c[9]; cdesc_init(desc_a, k, m, - fkblk, fmblk, - 0, 0, - icontxt, fkblk, - &info); + fkblk, fmblk, + 0, 0, + icontxt, fkblk, + &info); assert(info==0); cdesc_init(desc_b, k, n, - fkblk, fnblk, - 0, 0, - icontxt, fkblk, - &info); + fkblk, fnblk, + 0, 0, + icontxt, fkblk, + &info); assert(info==0); cdesc_init(desc_c, m, n, - mblk, nblk, - 0, 0, - icontxt, mblk, - &info); + mblk, nblk, + 0, 0, + icontxt, mblk, + &info); assert(info==0); + + World dw(MPI_COMM_WORLD); #ifdef ZGEMM_TEST - tCTF< std::complex > * myctf = new tCTF< std::complex >; - myctf->init(MPI_COMM_WORLD, myRank,numPes); double scalaTime = MPI_Wtime(); cpzgemm('T','N', m, n, k, ALPHA, mat_A, 1, 1, desc_a, @@ -349,14 +351,23 @@ int main(int argc, char **argv) { if (myRank == 0) printf("Performed ScaLAPACK pzgemm in %lf sec, starting CTF pzgemm\n", MPI_Wtime()-scalaTime); + + Matrix< std::complex > A(desc_a, mat_A, dw); + Matrix< std::complex > B(desc_b, mat_B, dw); + Matrix< std::complex > C(desc_c, mat_C_CTF, dw); + double ctfTime = MPI_Wtime(); - myctf->pgemm('T','N', m, n, k, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C_CTF, 1, 1, desc_c); + (DBETA*C["ij"])+=DALPHA*A["ki"]*B["kj"]; if (myRank == 0) printf("Performed CTF pzgemm in %lf sec\n", MPI_Wtime()-ctfTime); + C.read_mat(desc_c, mat_C_CTF); + +/* myctf->pgemm('T','N', m, n, k, ALPHA, + mat_A, 1, 1, desc_a, + mat_B, 1, 1, desc_b, BETA, + mat_C_CTF, 1, 1, desc_c); */ + if (myRank < npcol*npcol){ for (i=0; ipgemm( 'T','N', m, n, k, ALPHA, + /*myctf->pgemm( 'T','N', m, n, k, ALPHA, mat_A, 1, 1, desc_a, mat_B, 1, 1, desc_b, BETA, mat_C, 1, 1, desc_c); if (iter == 0) - ans_verify = mat_C[2]; + ans_verify = mat_C[2];*/ } if(myRank == 0) { @@ -471,13 +474,8 @@ int main(int argc, char **argv) { printf("Gigaflops: %f\n", 2.*m*n*k/ ((endTime - startTime)/num_iter)*1E-9); #endif - //printf("Ans=%lf\n",ans_verify); } - TAU_PROFILE_STOP(timer); - - myctf->exit(); - MPI_Finalize(); return 0; } /* end function main */ diff --git a/src/interface/matrix.cxx b/src/interface/matrix.cxx index 2a8d3474..30fb5da0 100644 --- a/src/interface/matrix.cxx +++ b/src/interface/matrix.cxx @@ -150,7 +150,7 @@ namespace CTF { } template - void gen_my_kv_pair(int rank, + void get_my_kv_pair(int rank, int nrow, int ncol, int mb, @@ -159,29 +159,30 @@ namespace CTF { int pc, int rsrc, int csrc, - int64_t & nmyc, int64_t & nmyr, + int64_t & nmyc, Pair *& pairs){ nmyr = mb*(nrow/mb/pr); if ((nrow/mb)%pr > (rank+pr-rsrc)%pr){ nmyr+=mb; } - if (((nrow/mb)%pr)+1 == (rank+pr-rsrc)%pr){ + if (((nrow/mb)%pr) == (rank+pr-rsrc)%pr){ nmyr+=nrow%mb; } nmyc = nb*(ncol/nb/pc); if ((ncol/nb)%pc > (rank/pr+pc-csrc)%pc){ - nmyc+=mb; + nmyc+=nb; } - if (((ncol/nb)%pc)+1 == (rank/pr+pc-csrc)%pc){ + if (((ncol/nb)%pc) == (rank/pr+pc-csrc)%pc){ nmyc+=ncol%nb; } +// printf("nrow = %d ncol = %d nmyr = %ld, nmyc = %ld mb = %d nb = %d pr = %d pc = %d\n",nrow,ncol,nmyr,nmyc,mb,nb,pr,pc); pairs = (Pair*)CTF_int::alloc(sizeof(Pair)*nmyr*nmyc); int cblk = (rank/pr+pc-csrc)%pc; for (int64_t i=0; iedge_map[0].np == pr && this->edge_map[1].np == pc){ if (lda == nrow/pc){ - printf("untested\n"); memcpy(this->data, (char*)data_, sizeof(dtype)*this->size); } else { - printf("untested\n"); for (int i=0; idata+i*nrow*sizeof(dtype)/pr,(char*)(data_+i*nrow/pr), sizeof(dtype)*this->size); + memcpy(this->data+i*nrow*sizeof(dtype)/pr,(char*)(data_+i*nrow/pr), nrow*sizeof(dtype)/pr); } } } else { - printf("untested\n"); - Matrix M(nrow, ncol, mb, nb, pr, pc, rsrc, csrc, lda, data_); + Matrix M(nrow, ncol, mb, nb, pr, pc, rsrc, csrc, lda, data_); (*this)["ab"] = M["ab"]; } } else { Pair * pairs; int64_t nmyr, nmyc; - get_my_kv_pair(this->wrld->rank, nrow, ncol, mb, nb, rsrc, csrc, nmyr, nmyc, &pairs); + get_my_kv_pair(this->wrld->rank, nrow, ncol, mb, nb, pr, pc, rsrc, csrc, nmyr, nmyc, pairs); + //printf("lda = %d, nmyr =%ld, nmyc=%ld\n",lda,nmyr,nmyc); if (lda == nmyr){ - printf("untested\n"); for (int64_t i=0; iwrite(nmyr*nmyc, pairs); - cdealloc(pairs); + CTF_int::cdealloc(pairs); } } @@ -247,30 +245,26 @@ namespace CTF { if (mb==1 && nb==1 && nrow%pr==0 && ncol%pc==0 && rsrc==0 && csrc==0){ if (this->edge_map[0].np == pr && this->edge_map[1].np == pc){ if (lda == nrow/pc){ - printf("untested\n"); memcpy((char*)data_, this->data, sizeof(dtype)*this->size); } else { - printf("untested\n"); for (int i=0; idata+i*nrow*sizeof(dtype)/pr, sizeof(dtype)*this->size); + memcpy((char*)(data_+i*nrow/pr), this->data+i*nrow*sizeof(dtype)/pr, nrow*sizeof(dtype)/pr); } } } else { - printf("untested\n"); int plens[] = {pr, pc}; Partition ip(2, plens); - Matrix M(nrow, ncol, "ij", ip["ij"], 0, this->wrld, this->sr); + Matrix M(nrow, ncol, "ij", ip["ij"], Idx_Partition(), 0, *this->wrld, *this->sr); M["ab"] = (*this)["ab"]; M.read_mat(mb, nb, pr, pc, rsrc, csrc, lda, data_); } } else { Pair * pairs; int64_t nmyr, nmyc; - get_my_kv_pair(this->wrld->rank, nrow, ncol, mb, nb, rsrc, csrc, nmyr, nmyc, &pairs); + get_my_kv_pair(this->wrld->rank, nrow, ncol, mb, nb, pr, pc, rsrc, csrc, nmyr, nmyc, pairs); this->read(nmyr*nmyc, pairs); if (lda == nmyr){ - printf("untested\n"); for (int64_t i=0; i + void Matrix::read_mat(int const * desc, + dtype * data_){ + int ictxt = desc[1]; + int pr, pc, ipr, ipc; + CTF_BLAS::BLACS_GRIDINFO(&ictxt, &pr, &pc, &ipr, &ipc); + IASSERT(ipr == this->wrld->rank%pr); + IASSERT(ipc == this->wrld->rank/pr); + + read_mat(desc[4],desc[5],pr,pc,desc[6],desc[7],desc[8],data_); + } + template Matrix::Matrix(int nrow_, int ncol_, @@ -295,7 +301,7 @@ namespace CTF { int rsrc, int csrc, int lda, - dtype * data, + dtype const * data, World & wrld_, CTF_int::algstrct const & sr_, char const * name_, @@ -327,6 +333,7 @@ namespace CTF { CTF_BLAS::BLACS_GRIDINFO(&ictxt, &pr, &pc, &ipr, &ipc); IASSERT(ipr == wrld_.rank%pr); IASSERT(ipc == wrld_.rank/pr); + IASSERT(pr*pc == wrld_.np); this->set_distribution("ij", Partition(2,CTF_int::int2(pr, pc))["ij"], Idx_Partition()); write_mat(desc[4],desc[5],pr,pc,desc[6],desc[7],desc[8],data_); } diff --git a/src/interface/matrix.h b/src/interface/matrix.h index a0065c46..c4e1f715 100644 --- a/src/interface/matrix.h +++ b/src/interface/matrix.h @@ -182,7 +182,7 @@ namespace CTF { int rsrc, int csrc, int lda, - dtype * data, + dtype const * data, World & wrld=get_universe(), CTF_int::algstrct const & sr=Ring(), char const * name=NULL, @@ -241,8 +241,8 @@ namespace CTF { * see ScaLAPACK docs for "Array Descriptor for In-core Dense Matrices" * \param[in] data locally stored values */ - void read(int const * desc, - dtype * data); + void read_mat(int const * desc, + dtype * data); /* * \brief prints matrix by row and column (modify print(...) overload in set.h if you would like a different print format) diff --git a/src/tensor/untyped_tensor.cxx b/src/tensor/untyped_tensor.cxx index 32fba39f..83753633 100644 --- a/src/tensor/untyped_tensor.cxx +++ b/src/tensor/untyped_tensor.cxx @@ -1167,6 +1167,8 @@ namespace CTF_int { ASSERT(itopo != -1); assert(itopo != -1); + this->clear_mapping(); + this->topo = wrld->topovec[itopo]; for (int i=0; iedge_map+i; @@ -1178,7 +1180,7 @@ namespace CTF_int { map = map->child; } map->type = PHYSICAL_MAP; - map->np = top->dim_comm[j].np; + map->np = this->topo->dim_comm[j].np; map->cdt = j; } } @@ -1203,7 +1205,7 @@ namespace CTF_int { if (!check_self_mapping(this, idx_A)){ if (wrld->rank == 0) printf("CTF ERROR: invalid distribution in read() call, aborting.\n"); - ASSERT(0); + IASSERT(0); assert(0); } diff --git a/test/pgemm_test.cxx b/test/pgemm_test.cxx deleted file mode 100644 index f446d969..00000000 --- a/test/pgemm_test.cxx +++ /dev/null @@ -1,373 +0,0 @@ -/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/ - - - -#include "mpi.h" -#include -#include -#include -#include -//#include -#include "../dist_tensor/cyclopstf.hpp" -#include "../shared/util.h" - -#define ZGEMM_TEST -#ifdef ZGEMM_TEST -typedef std::complex VAL_TYPE; -#else -typedef double VAL_TYPE; -#endif - -#define NUM_ITER 5 - -//proper modulus for 'a' in the range of [-b inf] -#define WRAP(a,b) ((a + b)%b) -#define MIN( a, b ) ( ((a) < (b)) ? (a) : (b) ) -#if (defined BGP || defined BGQ) -#define DESCINIT descinit -#define PDGEMM pdgemm -#define PZGEMM pzgemm -#else -#define DESCINIT descinit_ -#define PDGEMM pdgemm_ -#define PZGEMM pzgemm_ -#endif - -extern "C" { -void Cblacs_pinfo(int*,int*); - -void Cblacs_get(int,int,int*); - -int Cblacs_gridinit(int*,char*,int,int); - -void DESCINIT(int *, const int *, - const int *, const int *, - const int *, const int *, - const int *, const int *, - const int *, int *); - -void PDGEMM( char *, char *, - int *, int *, - int *, double *, - double *, int *, - int *, int *, - double *, int *, - int *, int *, - double *, double *, - int *, int *, - int *); -void PZGEMM( char *, char *, - int *, int *, - int *, std::complex *, - std::complex *, int *, - int *, int *, - std::complex *, int *, - int *, int *, - std::complex *, std::complex *, - int *, int *, - int *); -} - -static void cdesc_init(int * desc, - const int m, const int n, - const int mb, const int nb, - const int irsrc, const int icsrc, - const int ictxt, const int LLD, - int * info){ - DESCINIT(desc,&m,&n,&mb,&nb,&irsrc,&icsrc, - &ictxt, &LLD, info); -} - - -static void cpdgemm( char n1, char n2, - int sz1, int sz2, - int sz3, double ALPHA, - double * A, int ia, - int ja, int * desca, - double * B, int ib, - int jb, int * descb, - double BETA, double * C, - int ic, int jc, - int * descc){ - PDGEMM(&n1, &n2, &sz1, - &sz2, &sz3, &ALPHA, - A, &ia, &ja, desca, - B, &ib, &jb, descb, &BETA, - C, &ic, &jc, descc); -} -static void cpzgemm(char n1, char n2, - int sz1, int sz2, - int sz3, std::complex ALPHA, - std::complex * A, int ia, - int ja, int * desca, - std::complex * B, int ib, - int jb, int * descb, - std::complex BETA, std::complex * C, - int ic, int jc, - int * descc){ - PZGEMM(&n1, &n2, &sz1, - &sz2, &sz3, &ALPHA, - A, &ia, &ja, desca, - B, &ib, &jb, descb, &BETA, - C, &ic, &jc, descc); -} - -int uint_log2( int n){ - int j; - for (j=0; n!=1; j++) n>>=1; - return j; -} - -//extern "C" void pbm(); - -int main(int argc, char **argv) { -/*void pbm() { - - int argc; - char **argv;*/ - int myRank, numPes; - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &numPes); - MPI_Comm_rank(MPI_COMM_WORLD, &myRank); - - if (argc < 3 || argc > 4) { - if (myRank == 0) printf("%s [log2_matrix_dimension] [log2_block_dimension] [number of iterations]\n", - argv[0]); - //printf("%s [matrix_dimension_X] [matrix_dimension_Y] [block_dimension_X] [block_dimension_Y]\n", argv[0]); - MPI_Abort(MPI_COMM_WORLD, -1); - } - - const int log_matrixDim = atoi(argv[1]); - const int log_blockDim = atoi(argv[2]); - const int matrixDim = 1< 3) num_iter = atoi(argv[3]); - else num_iter = NUM_ITER; - - double ALPHA = 2.3; - double BETA = 0.7; - - - - if (myRank == 0){ - printf("MATRIX MULTIPLICATION OF SQUARE MATRICES\n"); - printf("MATRIX DIMENSION IS %d\n", matrixDim); - printf("BLOCK DIMENSION IS %d\n", blockDim); - printf("PERFORMING %d ITERATIONS\n", num_iter); - printf("WITH RANDOM DATA\n"); - printf("ALPHA = %lf, BETA = %lf\n",ALPHA,BETA); - } - - if (matrixDim < blockDim || matrixDim % blockDim != 0) { - if (myRank == 0) printf("array_size_X mod block_size_X != 0!\n"); - MPI_Abort(MPI_COMM_WORLD, -1); - } - if (matrixDim < blockDim || matrixDim % blockDim != 0) { - if (myRank == 0) printf("array_size_Y mod block_size_Y != 0!\n"); - MPI_Abort(MPI_COMM_WORLD, -1); - } - - const int log_num_blocks_dim = log_matrixDim - log_blockDim; - const int num_blocks_dim = 1< > * myctf = new tCTF< std::complex >; - myctf->init(MPI_COMM_WORLD,myRank,numPes,TOPOLOGY_BGQ); - cpzgemm('N','N', matrixDim, matrixDim, matrixDim, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C, 1, 1, desc_c); - - if (myRank == 0) - printf("Performed ScaLAPACK pzgemm, starting CTF pzgemm\n"); - - myctf->pgemm('N','N', matrixDim, matrixDim, matrixDim, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C_CTF, 1, 1, desc_c); - - for (i=0; i 1.E-6 && - fabs(mat_C[i*blockDim+j].imag()-mat_C_CTF[i*blockDim+j].imag()) > 1.E-6){ - printf("[%d] incorrect answer at C[%d,%d]\n",myRank,i,j); - } - } - } - if (myRank == 0) - printf("PZGEMM Verification complete, correct if no errors.\n"); - - double startTime, endTime; - startTime = MPI_Wtime(); - for (iter=0; iter < num_iter; iter++){ - //seq_square_matmul(mat_A, mat_B, mat_C, blockDim, 0); - cpzgemm('N','N', matrixDim, matrixDim, matrixDim, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C, 1, 1, desc_c); - if (iter == 0) - ans_verify = mat_C[2]; - } - - -#else - CTF * myctf = new CTF; - myctf->init(MPI_COMM_WORLD, TOPOLOGY_BGQ, myRank,numPes); - cpdgemm('N','N', matrixDim, matrixDim, matrixDim, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C, 1, 1, desc_c); - - if (myRank == 0) - printf("Performed ScaLAPACK pdgemm, starting CTF pdgemm\n"); - - myctf->pgemm('N','N', matrixDim, matrixDim, matrixDim, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C_CTF, 1, 1, desc_c); - - for (i=0; i 1.E-6){ - printf("[%d] incorrect answer %lf at C[%d,%d] = %lf\n",myRank,mat_C_CTF[i*blockDim+j],i,j,mat_C[i*blockDim+j]); - } - } - } - if (myRank == 0) - printf("PDGEMM Verification complete, correct if no errors.\n"); - - double startTime, endTime; - startTime = MPI_Wtime(); - for (iter=0; iter < num_iter; iter++){ - //seq_square_matmul(mat_A, mat_B, mat_C, blockDim, 0); - cpdgemm('N','N', matrixDim, matrixDim, matrixDim, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C, 1, 1, desc_c); - if (iter == 0) - ans_verify = mat_C[2]; - } -#endif - - if(myRank == 0) { - endTime = MPI_Wtime(); - printf("Completed %u ScaLAPACK iterations\n", iter); - printf("Time elapsed per iteration: %f\n", (endTime - startTime)/num_iter); - printf("Gigaflops: %f\n", 2.*matrixDim*matrixDim*matrixDim/ - ((endTime - startTime)/num_iter)*1E-9); - //printf("Ans=%lf\n",ans_verify); - } - -#ifdef TAU - TAU_PROFILE_TIMER(timer, "main", "int (int, char**)", TAU_USER); - TAU_PROFILE_START(timer); - TAU_PROFILE_INIT(argc, argv); - TAU_PROFILE_SET_NODE(myRank); - TAU_PROFILE_SET_CONTEXT(0); -#endif - - startTime = MPI_Wtime(); - for (iter=0; iter < num_iter; iter++){ - //seq_square_matmul(mat_A, mat_B, mat_C, blockDim, 0); - myctf->pgemm('N','N', matrixDim, matrixDim, matrixDim, ALPHA, - mat_A, 1, 1, desc_a, - mat_B, 1, 1, desc_b, BETA, - mat_C, 1, 1, desc_c); - if (iter == 0) - ans_verify = mat_C[2]; - } - - if(myRank == 0) { - endTime = MPI_Wtime(); - printf("Completed %u CTF PGEMM iterations\n", iter); - printf("Time elapsed per iteration: %f\n", (endTime - startTime)/num_iter); - printf("Gigaflops: %f\n", 2.*matrixDim*matrixDim*matrixDim/ - ((endTime - startTime)/num_iter)*1E-9); - //printf("Ans=%lf\n",ans_verify); - } - - TAU_PROFILE_STOP(timer); - - - MPI_Finalize(); - return 0; -} /* end function main */ - -