diff --git a/examples/spttn_tucker_solve_kernels.cxx b/examples/spttn_tucker_solve_kernels.cxx index 0bdf6549..e2121819 100644 --- a/examples/spttn_tucker_solve_kernels.cxx +++ b/examples/spttn_tucker_solve_kernels.cxx @@ -94,11 +94,11 @@ bool execute_spttn_kernel(int n, int ur, int vr, int wr, UCxx.norm2(norm); int64_t sz = T.get_tot_size(false); bool pass = (norm / sz < 1.e-5); - if (dw.rank == 0){ + if (dw.rank == 0) { if (!pass) - printf("Test passed.\n"); - else printf("Test failed.\n"); + else + printf("Test passed.\n"); } IASSERT(pass); mpass = mpass & pass; @@ -247,11 +247,11 @@ bool execute_spttn_kernel(int n, int ur, int vr, int wr, UCxx.norm2(norm); int64_t sz = T.get_tot_size(false); bool pass = (norm / sz < 1.e-5); - if (dw.rank == 0){ + if (dw.rank == 0) { if (!pass) - printf("Test passed.\n"); - else printf("Test failed.\n"); + else + printf("Test passed.\n"); } IASSERT(pass); mpass = mpass & pass; @@ -358,11 +358,11 @@ bool execute_spttn_kernel(int n, int ur, int vr, int wr, UCxx.norm2(norm); int64_t sz = T.get_tot_size(false); bool pass = (norm / sz < 1.e-5); - if (dw.rank == 0){ + if (dw.rank == 0) { if (!pass) - printf("Test passed.\n"); - else printf("Test failed.\n"); + else + printf("Test passed.\n"); } IASSERT(pass); mpass = mpass & pass; @@ -395,6 +395,40 @@ bool execute_spttn_kernel(int n, int ur, int vr, int wr, for a: buf2 += buf[a] * U[a,i] Z_ijk += buf2 * T_ijk + + + with thres_buf_sz = 2 + path chosen: 0 + ta: 8 tb: 16 tab: 24 inds: 28 + ta: 4 tb: 24 tab: 28 inds: 14 + ta: 2 tb: 28 tab: 30 inds: 7 + ta: 1 tb: 30 tab: 31 inds: 7 + total loop depth: 15 + term id 0: 4 8 16 32 + term id 1: 4 2 8 16 + term id 2: 4 2 1 8 + term id 3: 4 1 2 + niloops: 6 + + i: 1 j: 2 k: 4 a: 8 b: 16 c: 32 + T: 1 U: 2 V: 4 W: 8 C: 16 + 4 + 4 + 4 + 3 = 15 + for k: + for a: + for b: + for c: + buf[a,b,c] += W[c,k] * C[a,b,c] + for j: + for a: + for b: + buf2[a] += buf[a,b,c] * V[b,j] + for i: + for a: + buf[i] += buf2[a] * U[a,i] + for i: + for j: + Z_ijk += buf[i] * T[i,j,k] + */ lens_uc[0] = ur; lens_uc[1] = n1; @@ -418,8 +452,9 @@ bool execute_spttn_kernel(int n, int ur, int vr, int wr, C.fill_random((dtype)0,(dtype)1); Tensor * ops[5] = {&U, &V, &W, &C, &UC}; + int max_buf_dim = 2; stime = MPI_Wtime(); - spttn_kernel(&T, ops, 5, "ijk,ai,bj,ck,abc->ijk"); + spttn_kernel(&T, ops, 5, "ijk,ai,bj,ck,abc->ijk", max_buf_dim); etime = MPI_Wtime(); if (dw.rank == 0) printf("ijk,ai,bj,ck,abc->ijk using SpTTN-Cyclops (NOTE that it includes CSF construction time; please see total time to calculate printed above): %1.2lf\n", (etime - stime)); @@ -433,11 +468,11 @@ bool execute_spttn_kernel(int n, int ur, int vr, int wr, UCxx.norm2(norm); int64_t sz = T.get_tot_size(false); bool pass = (norm / sz < 1.e-5); - if (dw.rank == 0){ + if (dw.rank == 0) { if (!pass) - printf("Test passed.\n"); - else printf("Test failed.\n"); + else + printf("Test passed.\n"); } IASSERT(pass); mpass = mpass & pass; diff --git a/src/interface/multilinear.cxx b/src/interface/multilinear.cxx index 73593421..5038298a 100644 --- a/src/interface/multilinear.cxx +++ b/src/interface/multilinear.cxx @@ -1004,23 +1004,23 @@ namespace CTF { } template - void spttn_kernel(Tensor * A, Tensor ** Bs, int nBs, const char * einsum_expr, std::string * terms, int nterms, std::string * index_order) + void spttn_kernel(Tensor * A, Tensor ** Bs, int nBs, const char * einsum_expr, std::string * terms, int nterms, std::string * index_order, int max_buf_dim) { IASSERT(terms != nullptr && index_order != nullptr); char * idx_A; char ** idx_Bs; CTF_int::parse_einsum(einsum_expr, &idx_A, &idx_Bs, nBs); - CTF_int::spttn_contraction ctr = CTF_int::spttn_contraction(A, idx_A, (CTF_int::tensor **)Bs, nBs, idx_Bs, terms, nterms, index_order); + CTF_int::spttn_contraction ctr = CTF_int::spttn_contraction(A, idx_A, (CTF_int::tensor **)Bs, nBs, idx_Bs, terms, nterms, index_order, max_buf_dim); ctr.execute(); } template - void spttn_kernel(Tensor *A, Tensor **Bs, int nBs, const char *einsum_expr) + void spttn_kernel(Tensor *A, Tensor **Bs, int nBs, const char *einsum_expr, int max_buf_dim) { char *idx_A; char **idx_Bs; CTF_int::parse_einsum(einsum_expr, &idx_A, &idx_Bs, nBs); - CTF_int::spttn_contraction ctr = CTF_int::spttn_contraction(A, idx_A, (CTF_int::tensor **)Bs, nBs, idx_Bs); + CTF_int::spttn_contraction ctr = CTF_int::spttn_contraction(A, idx_A, (CTF_int::tensor **)Bs, nBs, idx_Bs, max_buf_dim); ctr.execute(); } diff --git a/src/interface/multilinear.h b/src/interface/multilinear.h index eeafc435..3827dd25 100644 --- a/src/interface/multilinear.h +++ b/src/interface/multilinear.h @@ -60,10 +60,10 @@ namespace CTF { void Solve_Factor(Tensor * T, Tensor ** mat_list, Tensor * RHS, int mode, bool aux_mode_first); template - void spttn_kernel(Tensor * A, Tensor ** Bs, int nBs, const char * einsum_expr, std::string * terms, int nterms, std::string * index_order); + void spttn_kernel(Tensor * A, Tensor ** Bs, int nBs, const char * einsum_expr, std::string * terms, int nterms, std::string * index_order, int max_buf_dim = 2); template - void spttn_kernel(Tensor * A, Tensor ** B, int nBs, const char * einsum_expr); + void spttn_kernel(Tensor * A, Tensor ** B, int nBs, const char * einsum_expr, int max_buf_dim = 2); } #endif diff --git a/src/spttn_cyclops/cp_io.h b/src/spttn_cyclops/cp_io.h index 1fb9f677..086d19c0 100644 --- a/src/spttn_cyclops/cp_io.h +++ b/src/spttn_cyclops/cp_io.h @@ -479,6 +479,31 @@ namespace CTF_int { return false; } + // populate a single term + void optimize_for_blas (uint16_t S, uint8_t sT, uint8_t eT, uint16_t rem_inds) + { + // first push sparse indices + uint16_t cp_S = S; + for (int i = nindices; i >= 0; i--) { + if (((1 << i) & sp_inds) && (rem_inds & (1 << i))) { + if (apply_constraints(cp_S, sT, eT, (1< * dt; + // for the sparse output tensor in SpTTN kernels that have the same sparsity pattern as the input tensor + CTF::Pair * dt_sp_op; int64_t * ldas; int64_t * nnz_level; int * phys_phase; @@ -121,6 +123,11 @@ namespace CTF_int { return dt[pt].d; } + void init_sp_op(CTF::Pair * pairs) + { + dt_sp_op = pairs; + } + void traverse_CSF(int64_t st_ptr, int64_t en_ptr, int level) diff --git a/src/spttn_cyclops/prepare_kernel.cxx b/src/spttn_cyclops/prepare_kernel.cxx index b7613487..1badfd2c 100644 --- a/src/spttn_cyclops/prepare_kernel.cxx +++ b/src/spttn_cyclops/prepare_kernel.cxx @@ -61,6 +61,7 @@ namespace CTF_int { const std::string * terms_, int nterms_, const std::string & sindex_order_, + int max_buf_dim_, bool retain_op_, tensor ** redis_op_, char const * alpha_, @@ -76,6 +77,7 @@ namespace CTF_int { tensor ** Bs_, int nBs_, const char * const * cidx_Bs, + int max_buf_dim_, bool retain_op_, tensor ** redis_op_, char const * alpha_, @@ -86,6 +88,7 @@ namespace CTF_int { nBs = nBs_; // nBs is number of Bs including the ouput. The number of input tensors including A is nBs nterms = nBs-1; + max_buf_dim = max_buf_dim_; retain_op = retain_op_; redis_op = redis_op_; func = func_; @@ -128,7 +131,7 @@ namespace CTF_int { for (int i = 0; i < nterms; i++) { new(terms + i) contraction_terms(dim_max, nBs); } - select_cp_io_populate_terms(cidx_A, cidx_Bs, nterms, terms, order_A, idx_A, nBs, order_Bs, idx_Bs, num_indices, A->wrld->rank); + select_cp_io_populate_terms(cidx_A, cidx_Bs, nterms, terms, order_A, idx_A, nBs, order_Bs, idx_Bs, num_indices, max_buf_dim, A->wrld->rank); } template @@ -140,6 +143,7 @@ namespace CTF_int { const std::string * sterms, int nterms_, const std::string * sindex_order, + int max_buf_dim_, bool retain_op_, tensor ** redis_op_, char const * alpha_, @@ -149,6 +153,7 @@ namespace CTF_int { Bs = Bs_; nBs = nBs_; nterms = nterms_; + max_buf_dim = max_buf_dim_; retain_op = retain_op_; redis_op = redis_op_; func = func_; @@ -207,6 +212,7 @@ namespace CTF_int { const std::string * sterms, int nterms_, const std::string & sindex_order, + int max_buf_dim_, bool retain_op_, tensor ** redis_op_, char const * alpha_, @@ -352,8 +358,7 @@ namespace CTF_int { if (A->wrld->rank == 0) printf("tree construction time: %1.2lf\n", (etime - stime)); if (Bs[nBs-1]->is_sparse) { - // allocate the sparse output tensor in the same tree structure as the input tensor - IASSERT(0); + A_csf.init_sp_op((Pair*)Bs[nBs-1]->data); } stime = MPI_Wtime(); diff --git a/src/spttn_cyclops/prepare_kernel.h b/src/spttn_cyclops/prepare_kernel.h index 7a274004..773e8d8b 100644 --- a/src/spttn_cyclops/prepare_kernel.h +++ b/src/spttn_cyclops/prepare_kernel.h @@ -54,6 +54,8 @@ namespace CTF_int { int nterms; /** \brief contraction order by index */ int * index_order; + /** \brief maximum intermediate tensor dimension */ + int max_buf_dim; /** \brief number of indices */ int num_indices; /** \brief if true, do not redistribute the output tensor */ @@ -90,6 +92,7 @@ namespace CTF_int { const std::string * terms, int nterms, const std::string & index_order, + int max_buf_dim, bool retain_op=false, tensor ** redis_op=nullptr, char const * alpha=NULL, @@ -102,6 +105,7 @@ namespace CTF_int { const std::string * terms, int nterms, const std::string & index_order, + int max_buf_dim, bool retain_op=false, tensor ** redis_op=nullptr, char const * alpha=NULL, @@ -114,6 +118,7 @@ namespace CTF_int { const std::string * terms, int nterms, const std::string * index_order, + int max_buf_dim, bool retain_op=false, tensor ** redis_op=nullptr, char const * alpha=NULL, @@ -123,6 +128,7 @@ namespace CTF_int { tensor ** Bs, int nBs, const char * const * idx_Bs, + int max_buf_dim, bool retain_op=false, tensor ** redis_op=nullptr, char const * alpha=NULL, @@ -1239,6 +1245,7 @@ namespace CTF_int { int * order_Bs, int ** idx_Bs, int num_indices, + int max_buf_dim, const int rank) { // sparse tensor + (nBs-1), excluding the output tensor @@ -1309,8 +1316,8 @@ namespace CTF_int { for (size_t i = 0; i < paths.size(); i++) { // choose a contraction path that can be implemented with the given constraints if (path_cost[i] != pick_cp_cost) continue; - int thres_buf_sz = 2; - local_index_order * lio = new local_index_order(nterms, all_inds, sp_inds, numones, paths[i], cp_cache, thres_buf_sz, sp_buffer); + // int thres_buf_sz = 2; + local_index_order * lio = new local_index_order(nterms, all_inds, sp_inds, numones, paths[i], cp_cache, max_buf_dim, sp_buffer); uint16_t S = 0; uint8_t sT = 0; uint8_t eT = nterms-1; @@ -1483,4 +1490,4 @@ namespace CTF_int { } } -#endif \ No newline at end of file +#endif