From 28da2231604befd10ef3f45a75ac5099eaa4f5b2 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 11:25:33 +0900 Subject: [PATCH 01/39] Port from libVMC --- src/common/blalink.hh | 518 ----------------------------------- src/common/blalink_fort.h | 184 ------------- src/common/colmaj.hh | 26 -- src/ltl2inv/blalink_gemmt.c | 42 --- src/ltl2inv/blalink_gemmt.hh | 80 ------ src/ltl2inv/eigen_gemmt.tcc | 77 ++++++ src/ltl2inv/error.hh | 42 +++ src/ltl2inv/gemmt2blas.cc | 35 +++ src/ltl2inv/ilaenv_lauum.cc | 10 +- src/ltl2inv/ilaenv_lauum.hh | 3 +- src/ltl2inv/invert.tcc | 92 ++++--- src/ltl2inv/lapack2eigen.hh | 272 ++++++++++++++++++ src/ltl2inv/ltl2inv.cc | 54 ++-- src/ltl2inv/makefile_ltl2inv | 9 +- src/ltl2inv/pfaffian.tcc | 80 ++++-- src/ltl2inv/testinv.cc | 125 --------- src/ltl2inv/trmmt.tcc | 28 +- 17 files changed, 602 insertions(+), 1075 deletions(-) delete mode 100644 src/common/blalink.hh delete mode 100644 src/common/blalink_fort.h delete mode 100644 src/common/colmaj.hh delete mode 100644 src/ltl2inv/blalink_gemmt.c delete mode 100644 src/ltl2inv/blalink_gemmt.hh create mode 100644 src/ltl2inv/eigen_gemmt.tcc create mode 100644 src/ltl2inv/error.hh create mode 100644 src/ltl2inv/gemmt2blas.cc create mode 100644 src/ltl2inv/lapack2eigen.hh delete mode 100644 src/ltl2inv/testinv.cc diff --git a/src/common/blalink.hh b/src/common/blalink.hh deleted file mode 100644 index 2750c34c..00000000 --- a/src/common/blalink.hh +++ /dev/null @@ -1,518 +0,0 @@ -/** - * \file blalink.hh - * C++ to C type wrapper for BLIS type-apis. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at https://mozilla.org/MPL/2.0/. - */ -#pragma once -#include -#include -typedef std::complex ccscmplx; -typedef std::complex ccdcmplx; -// BLIS definitions. -#include "blis.h" -// BLAS definitions -#include "blalink_fort.h" - -// error info -typedef enum { - Pfaffine_SIGN_ERR = 1, - Pfaffine_OUT_OF_BOUND, - Pfaffine_BAD_SCRATCHPAD, - Pfaffine_BAD_REPRESENTATION, - Pfaffine_INTEGER_OVERFLOW, - Pfaffine_DOUBLE_NAN_DETECTED, - Pfaffine_NOT_IMPLEMNTED, - Pfaffine_NUM_ERROR_TYPE -} skpfa_error_t; -inline signed err_info(signed type, signed pos) -{ return type * Pfaffine_NUM_ERROR_TYPE + pos; } - -// gemm -template -inline void gemm(trans_t transa, trans_t transb, - dim_t m, dim_t n, dim_t k, - T alpha, - T *a, inc_t lda, - T *b, inc_t ldb, - T beta, - T *c, inc_t ldc); -#ifndef BLAS_EXTERNAL -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void gemm(trans_t transa, trans_t transb, \ - dim_t m, dim_t n, dim_t k, \ - cctype alpha, \ - cctype *a, inc_t lda, \ - cctype *b, inc_t ldb, \ - cctype beta, \ - cctype *c, inc_t ldc) \ - { \ - bli_##cchar##gemm(transa, transb, \ - m, n, k, \ - (ctype *)&alpha, \ - (ctype *)a, 1, lda, \ - (ctype *)b, 1, ldb, \ - (ctype *)&beta, \ - (ctype *)c, 1, ldc); \ - } -#else -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void gemm(trans_t transa, trans_t transb, \ - dim_t m, dim_t n, dim_t k, \ - cctype alpha, \ - cctype *a, inc_t lda, \ - cctype *b, inc_t ldb, \ - cctype beta, \ - cctype *c, inc_t ldc) \ - { \ - char ta = trans2char(transa), \ - tb = trans2char(transb); \ - cchar##gemm_(&ta, &tb, &m, &n, &k, \ - (ctype *)&alpha, \ - (ctype *)a, &lda, \ - (ctype *)b, &ldb, \ - (ctype *)&beta, \ - (ctype *)c, &ldc); \ - } -#endif -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - - -// ger -template -inline void ger(dim_t m, dim_t n, - T alpha, - T *x, inc_t incx, - T *y, inc_t incy, - T *a, inc_t lda); -#ifndef BLAS_EXTERNAL -#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \ - template <> inline void ger(dim_t m, dim_t n, \ - cctype alpha, \ - cctype *x, inc_t incx, \ - cctype *y, inc_t incy, \ - cctype *a, inc_t lda) \ - { \ - bli_##cchar##ger(BLIS_NO_CONJUGATE, \ - BLIS_NO_CONJUGATE, \ - m, n, \ - (ctype *)&alpha, \ - (ctype *)x, incx, \ - (ctype *)y, incy, \ - (ctype *)a, 1, lda); \ - } -#else -#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \ - template <> inline void ger(dim_t m, dim_t n, \ - cctype alpha, \ - cctype *x, inc_t incx, \ - cctype *y, inc_t incy, \ - cctype *a, inc_t lda) \ - { \ - cchar##cfunc##_(&m, &n, \ - (ctype *)&alpha, \ - (ctype *)x, &incx, \ - (ctype *)y, &incy, \ - (ctype *)a, &lda); \ - } -#endif -BLALINK_MAC( float, float, s, ger ) -BLALINK_MAC( double, double, d, ger ) -BLALINK_MAC( ccscmplx, scomplex, c, geru ) -BLALINK_MAC( ccdcmplx, dcomplex, z, geru ) -#undef BLALINK_MAC - - -// gemv -template -inline void gemv(trans_t trans, - dim_t m, dim_t n, - T alpha, - T *a, inc_t lda, - T *x, inc_t incx, - T beta, - T *y, inc_t incy); -#ifndef BLAS_EXTERNAL -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void gemv(trans_t trans, \ - dim_t m, dim_t n, \ - cctype alpha, \ - cctype *a, inc_t lda, \ - cctype *x, inc_t incx, \ - cctype beta, \ - cctype *y, inc_t incy) \ - { \ - bli_##cchar##gemv(trans, \ - BLIS_NO_CONJUGATE, \ - m, n, \ - (ctype *)&alpha, \ - (ctype *)a, 1, lda, \ - (ctype *)x, incx, \ - (ctype *)&beta, \ - (ctype *)y, incy); \ - } -#else -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void gemv(trans_t trans, \ - dim_t m, dim_t n, \ - cctype alpha, \ - cctype *a, inc_t lda, \ - cctype *x, inc_t incx, \ - cctype beta, \ - cctype *y, inc_t incy) \ - { \ - char t = trans2char(trans); \ - cchar##gemv_(&t, &m, &n, \ - (ctype *)&alpha, \ - (ctype *)a, &lda, \ - (ctype *)x, &incx, \ - (ctype *)&beta, \ - (ctype *)y, &incy); \ - } -#endif -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - - -// trmm -template -inline void trmm(side_t sidea, - uplo_t uploa, - trans_t transa, - diag_t diaga, - dim_t m, dim_t n, - T alpha, - T *a, inc_t lda, - T *b, inc_t ldb); -#ifndef BLAS_EXTERNAL -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void trmm(side_t sidea, uplo_t uploa, trans_t transa, diag_t diaga, \ - dim_t m, dim_t n, \ - cctype alpha, \ - cctype *a, inc_t lda, \ - cctype *b, inc_t ldb) \ - { \ - bli_##cchar##trmm(sidea, uploa, transa, diaga, \ - m, n, \ - (ctype *)&alpha, \ - (ctype *)a, 1, lda, \ - (ctype *)b, 1, ldb); \ - } -#else -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void trmm(side_t sidea, uplo_t uploa, trans_t transa, diag_t diaga, \ - dim_t m, dim_t n, \ - cctype alpha, \ - cctype *a, inc_t lda, \ - cctype *b, inc_t ldb) \ - { \ - char ul = uplo2char(uploa), \ - si = side2char(sidea), \ - tr = trans2char(transa), \ - dg = diag2char(diaga); \ - cchar##trmm_(&si, &ul, &tr, &dg, \ - &m, &n, \ - (ctype *)&alpha, \ - (ctype *)a, &lda, \ - (ctype *)b, &ldb); \ - } -#endif -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - - -// trmv -template -inline void trmv(uplo_t uploa, - trans_t transa, - dim_t m, - T alpha, - T *a, inc_t lda, - T *x, inc_t incx); -#ifndef BLAS_EXTERNAL -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void trmv(uplo_t uploa, trans_t transa, dim_t m, \ - cctype alpha, \ - cctype *a, inc_t lda, \ - cctype *x, inc_t incx) \ - { \ - bli_##cchar##trmv(uploa, transa, \ - BLIS_NONUNIT_DIAG, m, \ - (ctype *)&alpha, \ - (ctype *)a, 1, lda, \ - (ctype *)x, incx); \ - } -#else -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void trmv(uplo_t uploa, trans_t transa, dim_t m, \ - cctype alpha, \ - cctype *a, inc_t lda, \ - cctype *x, inc_t incx) \ - { \ - char ul = uplo2char(uploa), \ - tr = trans2char(transa), \ - dg = 'N'; \ - /* - * TRMV has alpha in its templated interface, which is absent in BLAS. - */ \ - assert(alpha == cctype(1.0)); \ - cchar##trmv_(&ul, &tr, &dg, &m, \ - (ctype *)a, &lda, \ - (ctype *)x, &incx); \ - } -#endif -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - - -// swap -template -inline void swap(dim_t n, - T *x, inc_t incx, - T *y, inc_t incy); -#ifndef BLAS_EXTERNAL -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void swap(dim_t n, \ - cctype *x, inc_t incx, \ - cctype *y, inc_t incy) \ - { \ - bli_##cchar##swapv(n, \ - (ctype *)x, incx, \ - (ctype *)y, incy); \ - } -#else -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void swap(dim_t n, \ - cctype *x, inc_t incx, \ - cctype *y, inc_t incy) \ - { \ - cchar##swap_(&n, \ - (ctype *)x, &incx, \ - (ctype *)y, &incy); \ - } -#endif -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - - -// axpy -template -inline void axpy(dim_t n, - T alpha, - T *x, inc_t incx, - T *y, inc_t incy); -#ifndef BLAS_EXTERNAL -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void axpy(dim_t n, \ - cctype alpha, \ - cctype *x, inc_t incx, \ - cctype *y, inc_t incy) \ - { \ - bli_##cchar##axpyv(BLIS_NO_CONJUGATE, n, \ - (ctype *)&alpha, \ - (ctype *)x, incx, \ - (ctype *)y, incy); \ - } -#else -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void axpy(dim_t n, \ - cctype alpha, \ - cctype *x, inc_t incx, \ - cctype *y, inc_t incy) \ - { \ - cchar##axpy_(&n, \ - (ctype *)&alpha, \ - (ctype *)x, &incx, \ - (ctype *)y, &incy); \ - } -#endif -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - -// dot -template -inline T dot(dim_t n, - T *sx, inc_t incx, - T *sy, inc_t incy); -#ifndef BLAS_EXTERNAL -#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \ - template <> inline cctype dot(dim_t n, \ - cctype *sx, inc_t incx, \ - cctype *sy, inc_t incy) \ - { \ - cctype rho; \ - bli_##cchar##dotv(BLIS_NO_CONJUGATE, \ - BLIS_NO_CONJUGATE, \ - n, \ - (ctype *)sx, incx, \ - (ctype *)sy, incy, \ - (ctype *)&rho); \ - return rho; \ - } -#else -#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \ - template <> inline cctype dot(dim_t n, \ - cctype *sx, inc_t incx, \ - cctype *sy, inc_t incy) \ - { \ - ctype rho = cchar##cfunc##_(&n, \ - (ctype *)sx, &incx, \ - (ctype *)sy, &incy); \ - return *((cctype *)&rho); \ - } -#endif -BLALINK_MAC( float, float, s, dot ) -BLALINK_MAC( double, double, d, dot ) -#if defined(BLAS_EXTERNAL) && defined(F77_COMPLEX_RET_INTEL) -// Intel style complex return. -#undef BLALINK_MAC -#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \ - template <> inline cctype dot(dim_t n, \ - cctype *sx, inc_t incx, \ - cctype *sy, inc_t incy) \ - { \ - cctype rho; \ - cchar##cfunc##_((ctype *)&rho, &n, \ - (ctype *)sx, &incx, \ - (ctype *)sy, &incy); \ - return rho; \ - } -#endif -BLALINK_MAC( ccscmplx, scomplex, c, dotu ) -BLALINK_MAC( ccdcmplx, dcomplex, z, dotu ) -#undef BLALINK_MAC - - - -// [LAPACK] lacpy -template -inline void lacpy(uplo_t uploa, - dim_t m, dim_t n, - T *a, inc_t lda, - T *b, inc_t ldb); -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void lacpy(uplo_t uploa, \ - dim_t m, dim_t n, \ - cctype *a, inc_t lda, \ - cctype *b, inc_t ldb) \ - { \ - char ul = uplo2char(uploa); \ - cchar##lacpy_(&ul, &m, &n, a, &lda, b, &ldb); \ - } -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - -// [LAPACK] trtri -template -inline int trtri(uplo_t uploa, - diag_t diaga, - dim_t n, - T *a, inc_t lda); -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline int trtri(uplo_t uploa, \ - diag_t diaga, \ - dim_t n, \ - cctype *a, inc_t lda) \ - { \ - char ul = uplo2char(uploa); \ - char dg = diag2char(diaga); \ - int info; \ - cchar##trtri_(&ul, &dg, &n, a, &lda, &info); \ - return info; \ - } -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - -// [PFAPACK] sktrf -template -inline int sktrf(uplo_t uploa, - dim_t n, - T *a, inc_t lda, - int *ipiv, - T *work, dim_t lwork); -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline int sktrf(uplo_t uploa, \ - dim_t n, \ - cctype *a, inc_t lda, \ - int *ipiv, \ - cctype *work, dim_t lwork) \ - { \ - char ul = uplo2char(uploa); \ - char mode = 'N'; \ - int info; \ - cchar##sktrf_(&ul, &mode, &n, a, &lda, ipiv, work, &lwork, &info); \ - return info; \ - } -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - -// [PFAPACK] skpfa -template -inline int skpfa(uplo_t uploa, - dim_t n, - T *a, inc_t lda, - T *pfa, int *ipiv, - T *work, dim_t lwork); -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline int skpfa(uplo_t uploa, \ - dim_t n, \ - cctype *a, inc_t lda, \ - cctype *pfa, int *ipiv, \ - cctype *work, dim_t lwork) \ - { \ - char ul = uplo2char(uploa); \ - char mthd = 'P'; \ - int info; \ - cchar##skpfa_(&ul, &mthd, &n, a, &lda, pfa, ipiv, work, &lwork, &info); \ - return info; \ - } -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -#undef BLALINK_MAC -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline int skpfa(uplo_t uploa, \ - dim_t n, \ - cctype *a, inc_t lda, \ - cctype *pfa, int *ipiv, \ - cctype *work, dim_t lwork) \ - { \ - char ul = uplo2char(uploa); \ - char mthd = 'P'; \ - int info; \ - cchar##skpfa_(&ul, &mthd, &n, a, &lda, pfa, ipiv, work, &lwork, 0, &info); \ - return info; \ - } -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - diff --git a/src/common/blalink_fort.h b/src/common/blalink_fort.h deleted file mode 100644 index eb0b3a39..00000000 --- a/src/common/blalink_fort.h +++ /dev/null @@ -1,184 +0,0 @@ -/** - * \file blalink_fort.hh - * Wrapper for external BLAS. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at https://mozilla.org/MPL/2.0/. - */ -#pragma once -#include -#include "blis.h" - - -// converter from BLIS enum to BLAS char. -BLIS_INLINE char trans2char(trans_t t) -{ - switch (t) { - case BLIS_NO_TRANSPOSE: - return 'N'; - case BLIS_TRANSPOSE: - return 'T'; - case BLIS_CONJ_NO_TRANSPOSE: - return 'C'; - default: - abort(); - } -} -BLIS_INLINE char uplo2char(uplo_t t) -{ - switch (t) { - case BLIS_UPPER: - return 'U'; - case BLIS_LOWER: - return 'L'; - default: - abort(); - } -} -BLIS_INLINE char side2char(side_t t) -{ - switch (t) { - case BLIS_LEFT: - return 'L'; - case BLIS_RIGHT: - return 'R'; - default: - abort(); - } -} -BLIS_INLINE char diag2char(diag_t t) -{ - switch (t) { - case BLIS_UNIT_DIAG: - return 'U'; - case BLIS_NONUNIT_DIAG: - return 'N'; - } -} - - -#ifdef __cplusplus -extern "C" -{ -#endif - -// BLAS { -#ifdef BLAS_EXTERNAL - -// gemm -void sgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, float *alpha, - float *a, dim_t *lda, float *b, dim_t *ldb, float *beta, float *c, dim_t *ldc); -void dgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, double *alpha, - double *a, dim_t *lda, double *b, dim_t *ldb, double *beta, double *c, dim_t *ldc); -void cgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, void *alpha, - void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc); -void zgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, void *alpha, - void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc); - - -// ger -void sger_(dim_t *m, dim_t *n, float *alpha, - float *x, dim_t *incx, float *y, dim_t *incy, float *a, dim_t *lda); -void dger_(dim_t *m, dim_t *n, double *alpha, - double *x, dim_t *incx, double *y, dim_t *incy, double *a, dim_t *lda); -void cgeru_(dim_t *m, dim_t *n, void *alpha, - void *x, dim_t *incx, void *y, dim_t *incy, void *a, dim_t *lda); -void zgeru_(dim_t *m, dim_t *n, void *alpha, - void *x, dim_t *incx, void *y, dim_t *incy, void *a, dim_t *lda); - - -// gemv -void sgemv_(char *trans, dim_t *m, dim_t *n, float *alpha, float *a, dim_t *lda, - float *x, dim_t *incx, float *beta, float *y, dim_t *incy); -void dgemv_(char *trans, dim_t *m, dim_t *n, double *alpha, double *a, dim_t *lda, - double *x, dim_t *incx, double *beta, double *y, dim_t *incy); -void cgemv_(char *trans, dim_t *m, dim_t *n, void *alpha, void *a, dim_t *lda, - void *x, dim_t *incx, void *beta, void *y, dim_t *incy); -void zgemv_(char *trans, dim_t *m, dim_t *n, void *alpha, void *a, dim_t *lda, - void *x, dim_t *incx, void *beta, void *y, dim_t *incy); - - -// trmm -void strmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n, - float *alpha, float *a, inc_t *lda, float *b, inc_t *ldb); -void dtrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n, - double *alpha, double *a, inc_t *lda, double *b, inc_t *ldb); -void ctrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n, - void *alpha, void *a, inc_t *lda, void *b, inc_t *ldb); -void ztrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n, - void *alpha, void *a, inc_t *lda, void *b, inc_t *ldb); - - -// trmv -void strmv_(char *uplo, char *transa, char *diag, dim_t *n, - float *a, inc_t *lda, float *b, inc_t *ldb); -void dtrmv_(char *uplo, char *transa, char *diag, dim_t *n, - double *a, inc_t *lda, double *b, inc_t *ldb); -void ctrmv_(char *uplo, char *transa, char *diag, dim_t *n, - void *a, inc_t *lda, void *b, inc_t *ldb); -void ztrmv_(char *uplo, char *transa, char *diag, dim_t *n, - void *a, inc_t *lda, void *b, inc_t *ldb); - -// swap -void sswap_(dim_t *n, float *sx, dim_t *incx, float *sy, dim_t *incy); -void dswap_(dim_t *n, double *sx, dim_t *incx, double *sy, dim_t *incy); -void cswap_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); -void zswap_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); - -// axpy -void saxpy_(dim_t *n, float *alpha, float *sx, dim_t *incx, float *sy, dim_t *incy); -void daxpy_(dim_t *n, double *alpha, double *sx, dim_t *incx, double *sy, dim_t *incy); -void caxpy_(dim_t *n, void *alpha, void *sx, dim_t *incx, void *sy, dim_t *incy); -void zaxpy_(dim_t *n, void *alpha, void *sx, dim_t *incx, void *sy, dim_t *incy); - -// dot -float sdot_(dim_t *n, float *sx, dim_t *incx, float *sy, dim_t *incy); -double ddot_(dim_t *n, double *sx, dim_t *incx, double *sy, dim_t *incy); -scomplex cdotc_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); -dcomplex zdotc_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); -#ifndef F77_COMPLEX_RET_INTEL -scomplex cdotu_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); -dcomplex zdotu_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); -#else -void cdotu_(scomplex *rho, dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); -void zdotu_(dcomplex *rho, dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); -#endif - -// } -#endif - -// LAPACK { - -// lacpy -void slacpy_(char *uplo, dim_t *m, dim_t *n, float *a, inc_t *lda, float *b, inc_t *ldb); -void dlacpy_(char *uplo, dim_t *m, dim_t *n, double *a, inc_t *lda, double *b, inc_t *ldb); -void clacpy_(char *uplo, dim_t *m, dim_t *n, void *a, inc_t *lda, void *b, inc_t *ldb); -void zlacpy_(char *uplo, dim_t *m, dim_t *n, void *a, inc_t *lda, void *b, inc_t *ldb); - -// trtri -void strtri_(char *uplo, char *diag, dim_t *n, float *a, inc_t *lda, int *info); -void dtrtri_(char *uplo, char *diag, dim_t *n, double *a, inc_t *lda, int *info); -void ctrtri_(char *uplo, char *diag, dim_t *n, void *a, inc_t *lda, int *info); -void ztrtri_(char *uplo, char *diag, dim_t *n, void *a, inc_t *lda, int *info); - -// } - -// PFAPACK77 { - -void ssktrf_(char *uplo, char *mode, dim_t *n, float *a, dim_t *lda, int *ipiv, float *work, dim_t *lwork, int *info); -void dsktrf_(char *uplo, char *mode, dim_t *n, double *a, dim_t *lda, int *ipiv, double *work, dim_t *lwork, int *info); -void csktrf_(char *uplo, char *mode, dim_t *n, void *a, dim_t *lda, int *ipiv, void *work, dim_t *lwork, int *info); -void zsktrf_(char *uplo, char *mode, dim_t *n, void *a, dim_t *lda, int *ipiv, void *work, dim_t *lwork, int *info); - -void sskpfa_(char *uplo, char *mthd, dim_t *n, float *a, dim_t *lda, float *pfa, int *ipiv, float *work, dim_t *lwork, int *info); -void dskpfa_(char *uplo, char *mthd, dim_t *n, double *a, dim_t *lda, double *pfa, int *ipiv, double *work, dim_t *lwork, int *info); -void cskpfa_(char *uplo, char *mthd, dim_t *n, void *a, dim_t *lda, void *pfa, int *ipiv, void *work, dim_t *lwork, void *rwork, int *info); -void zskpfa_(char *uplo, char *mthd, dim_t *n, void *a, dim_t *lda, void *pfa, int *ipiv, void *work, dim_t *lwork, void *rwork, int *info); - -// } - -#ifdef __cplusplus -} -#endif - diff --git a/src/common/colmaj.hh b/src/common/colmaj.hh deleted file mode 100644 index d9a03c1e..00000000 --- a/src/common/colmaj.hh +++ /dev/null @@ -1,26 +0,0 @@ -/** - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at https://mozilla.org/MPL/2.0/. - */ -#pragma once - -template -struct colmaj -{ - T *dat; - int ld; - - colmaj() = delete; - colmaj(T *dat_, int ld_) - : dat(dat_), ld(ld_) { } - - T &operator()(int i, int j) - { - return dat[ i + j * ld ]; - } - T &operator()() - { - return dat[ 0 ]; - } -}; diff --git a/src/ltl2inv/blalink_gemmt.c b/src/ltl2inv/blalink_gemmt.c deleted file mode 100644 index fbaee562..00000000 --- a/src/ltl2inv/blalink_gemmt.c +++ /dev/null @@ -1,42 +0,0 @@ -/** - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at https://mozilla.org/MPL/2.0/. - */ -#include "blis.h" -#include "blalink_fort.h" - -#if !( defined(BLAS_EXTERNAL) && defined(MKL) ) -// Redefine GEMMT Fortran from BLIS (which is built without BLAS API). -// Required by Pfapack77. -#define BLAGEN_MAC(cctype, ctype, cchar) \ - void cchar##gemmt_(const char *uplo_, \ - const char *transa_, \ - const char *transb_, \ - const int *m, const int *k, \ - const ctype *alpha, \ - const ctype *a, const int *lda, \ - const ctype *b, const int *ldb, \ - const ctype *beta, ctype *c, const int *ldc) \ - { \ - uplo_t uploc; \ - trans_t transa; \ - trans_t transb; \ - bli_param_map_netlib_to_blis_uplo(*uplo_, &uploc) ; \ - bli_param_map_netlib_to_blis_trans(*transa_, &transa) ; \ - bli_param_map_netlib_to_blis_trans(*transb_, &transb) ; \ - bli_##cchar##gemmt(uploc, \ - transa, transb, \ - *m, *k, \ - alpha, \ - a, 1, *lda, \ - b, 1, *ldb, \ - beta, \ - c, 1, *ldc); \ - } -BLAGEN_MAC( float, float, s ) -BLAGEN_MAC( double, double, d ) -BLAGEN_MAC( ccscmplx, scomplex, c ) -BLAGEN_MAC( ccdcmplx, dcomplex, z ) -#undef BLAGEN_MAC -#endif diff --git a/src/ltl2inv/blalink_gemmt.hh b/src/ltl2inv/blalink_gemmt.hh deleted file mode 100644 index 52bb2c45..00000000 --- a/src/ltl2inv/blalink_gemmt.hh +++ /dev/null @@ -1,80 +0,0 @@ -/** - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at https://mozilla.org/MPL/2.0/. - */ -#pragma once -#include "blalink.hh" - - -// gemmt is not part of BLAS standard. -// It's currently know as exposed by BLIS and MKL. -template -inline void gemmt(uplo_t uploc, - trans_t transa, trans_t transb, - dim_t m, dim_t k, - T alpha, - T *a, inc_t lda, - T *b, inc_t ldb, - T beta, - T *c, inc_t ldc); -#if !( defined(BLAS_EXTERNAL) && defined(MKL) ) -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void gemmt(uplo_t uploc, \ - trans_t transa, trans_t transb, \ - dim_t m, dim_t k, \ - cctype alpha, \ - cctype *a, inc_t lda, \ - cctype *b, inc_t ldb, \ - cctype beta, \ - cctype *c, inc_t ldc) \ - { \ - bli_##cchar##gemmt(uploc, \ - transa, transb, \ - m, k, \ - (ctype *)&alpha, \ - (ctype *)a, 1, lda, \ - (ctype *)b, 1, ldb, \ - (ctype *)&beta, \ - (ctype *)c, 1, ldc); \ - } -#else -#define BLALINK_MAC(cctype, ctype, cchar) \ - template <> inline void gemmt(uplo_t uploc, \ - trans_t transa, trans_t transb, \ - dim_t m, dim_t k, \ - cctype alpha, \ - cctype *a, inc_t lda, \ - cctype *b, inc_t ldb, \ - cctype beta, \ - cctype *c, inc_t ldc) \ - { \ - char ul = uplo2char(uploc); \ - char ta = trans2char(transa); \ - char tb = trans2char(transb); \ - cchar##gemmt_(&ul, \ - &ta, &tb, \ - &m, &k, \ - (ctype *)&alpha, \ - (ctype *)a, &lda, \ - (ctype *)b, &ldb, \ - (ctype *)&beta, \ - (ctype *)c, &ldc); \ - } -extern "C" { -void sgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, float *alpha, - float *a, dim_t *lda, float *b, dim_t *ldb, float *beta, float *c, dim_t *ldc); -void dgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, double *alpha, - double *a, dim_t *lda, double *b, dim_t *ldb, double *beta, double *c, dim_t *ldc); -void cgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, void *alpha, - void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc); -void zgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, void *alpha, - void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc); -} -#endif -BLALINK_MAC( float, float, s ) -BLALINK_MAC( double, double, d ) -BLALINK_MAC( ccscmplx, scomplex, c ) -BLALINK_MAC( ccdcmplx, dcomplex, z ) -#undef BLALINK_MAC - diff --git a/src/ltl2inv/eigen_gemmt.tcc b/src/ltl2inv/eigen_gemmt.tcc new file mode 100644 index 00000000..4e569df9 --- /dev/null +++ b/src/ltl2inv/eigen_gemmt.tcc @@ -0,0 +1,77 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "lapack2eigen.hh" +#include "ilaenv_lauum.hh" + + +template +inline void gemmt(const char &uploC, + const char &transA, + const char &transB, + const T &alpha, + const l2e::le_mat_t &A_, + const l2e::le_mat_t &B_, + const T &beta, + l2e::le_mat_t &C_) +{ + using LProxy = l2e::le_mat_t; + using Matrix = Eigen::Matrix; + const int npanel = ilaenv_lauum(uploC, C_.rows); + + bool conjA = transA == 'C' || transA == 'c'; + bool conjB = transB == 'C' || transB == 'c'; + + const auto &A = (transA == 'T' || transA == 't') ? A_.transpose().to_eigen() : A_.to_eigen(); + const auto &B = (transB == 'T' || transB == 't') ? B_.transpose().to_eigen() : B_.to_eigen(); + auto C = C_.to_eigen(); + + if (uploC == 'L' || uploC == 'l') { + + for (int ipanel = 0; ipanel < A.rows(); ipanel += npanel) { + int sz_panel = std::min(npanel, A.rows() - ipanel); + + // Diagonal. + const auto &Apanel = A(Eigen::seq(ipanel, ipanel+sz_panel-1), Eigen::all); + const auto &Bpanel = B(Eigen::all, Eigen::seq(ipanel, ipanel+sz_panel-1)); + const auto &Cpanel = C(Eigen::seq(ipanel, ipanel+sz_panel-1), + Eigen::seq(ipanel, ipanel+sz_panel-1)); + Matrix Cupdated = Cpanel * beta + Apanel * Bpanel * alpha; + LProxy Cpanel_(Cpanel); + lacpy(uploC, LProxy(Cupdated), Cpanel_); + + // Off-diagonal. + const auto &Aremain = A(Eigen::seq(ipanel+sz_panel, A.rows()-1), Eigen::all); + const auto &Bremain = B(Eigen::all, Eigen::seq(ipanel+sz_panel, B.cols()-1)); + auto Cremain = C(Eigen::seq(ipanel+sz_panel, C.rows()-1), + Eigen::seq(ipanel, ipanel+sz_panel-1)); + Cremain = Cremain * beta + Aremain * Bpanel * alpha; + } + + } else { // (uploC == 'U' || uploC == 'u') + + for (int ipanel = (A.rows()-1) / npanel * npanel; ipanel >= 0; ipanel -= npanel) { + int sz_panel = std::min(npanel, A.rows() - ipanel); + + // Diagonal. + const auto &Apanel = A(Eigen::seq(ipanel, ipanel+sz_panel-1), Eigen::all); + const auto &Bpanel = B(Eigen::all, Eigen::seq(ipanel, ipanel+sz_panel-1)); + const auto &Cpanel = C(Eigen::seq(ipanel, ipanel+sz_panel-1), + Eigen::seq(ipanel, ipanel+sz_panel-1)); + Matrix Cupdated = Cpanel * beta + Apanel * Bpanel * alpha; + LProxy Cpanel_(Cpanel); + lacpy(uploC, LProxy(Cupdated), Cpanel_); + + // Off-diagonal. + const auto &Aremain = A(Eigen::seq(0, ipanel-1), Eigen::all); + const auto &Bremain = B(Eigen::all, Eigen::seq(0, ipanel-1)); + auto Cremain = C(Eigen::seq(0, ipanel-1), + Eigen::seq(ipanel, ipanel+sz_panel-1)); + Cremain = Cremain * beta + Aremain * Bpanel * alpha; + } + + } +} diff --git a/src/ltl2inv/error.hh b/src/ltl2inv/error.hh new file mode 100644 index 00000000..5c0a06e4 --- /dev/null +++ b/src/ltl2inv/error.hh @@ -0,0 +1,42 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include +#include + + +inline void abort(const char *msg) +{ + std::cout << msg << std::endl; + abort(); +} + +inline void abort(const char *msg0, const char *msg1) +{ + std::cout << msg0 << ": " << msg1 << std::endl; + abort(); +} + +#if (defined(_DEBUG) || !defined(SKIP_CHECKS)) + +inline void assert_(const bool cond, const char *msg) +{ + if (!cond) abort(msg); +} + +inline void assert_(const bool cond, const char *msg0, const char *msg1) +{ + if (!cond) abort(msg0, msg1); +} + +#else + +inline void assert_(const bool cond, const char *msg) { } + +inline void assert_(const bool cond, const char *msg0, const char *msg1) { } + +#endif + diff --git a/src/ltl2inv/gemmt2blas.cc b/src/ltl2inv/gemmt2blas.cc new file mode 100644 index 00000000..56cd6fe4 --- /dev/null +++ b/src/ltl2inv/gemmt2blas.cc @@ -0,0 +1,35 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#include "lapack2eigen.hh" + + +#if !defined(_BLAS_HAS_GEMMT) +// Import the customized GEMMT impl. to the BLAS API. +// Required by Pfapack77. +#define BLAGEN_MAC(cctype, ctype, cchar) \ + extern "C" \ + void cchar##gemmt_(const char *uplo_, \ + const char *transa_, \ + const char *transb_, \ + const int *m, const int *k, \ + const ctype *alpha, \ + ctype *a, const int *lda, \ + ctype *b, const int *ldb, \ + const ctype *beta, ctype *c, const int *ldc) \ + { \ + bool tA = (*transa_ == 'T' || *transa_ == 't'); \ + bool tB = (*transb_ == 'T' || *transb_ == 't'); \ + l2e::le_mat_t A(tA ? *k : *m, tA ? *m : *k, 1, *lda, (cctype *)a); \ + l2e::le_mat_t B(tB ? *k : *m, tB ? *m : *k, 1, *ldb, (cctype *)b); \ + l2e::le_mat_t C(*m, *m, 1, *ldc, (cctype *)c); \ + gemmt(*uplo_, *transa_, *transb_, *(cctype *)alpha, A, B, *(cctype *)beta, C); \ + } +BLAGEN_MAC( float, float, s ) +BLAGEN_MAC( double, double, d ) +BLAGEN_MAC( ccscmplx, scomplex, c ) +BLAGEN_MAC( ccdcmplx, dcomplex, z ) +#undef BLAGEN_MAC +#endif diff --git a/src/ltl2inv/ilaenv_lauum.cc b/src/ltl2inv/ilaenv_lauum.cc index f150384f..6dcb8b90 100644 --- a/src/ltl2inv/ilaenv_lauum.cc +++ b/src/ltl2inv/ilaenv_lauum.cc @@ -3,17 +3,19 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ -#include "blalink.hh" #include "ilaenv.h" #include "ilaenv_lauum.hh" +#include +using ccscmplx = std::complex; +using ccdcmplx = std::complex; + #define EXPANDMAC(cctype, name) \ -template <> int ilaenv_lauum(uplo_t uplo, int n) \ +template <> int ilaenv_lauum(const char uplo, const int n) \ { \ - char uplo_ = uplo2char(uplo); \ int ispec = 1; \ int dummy = 0; \ - return ilaenv_(&ispec, #name, &uplo_, &n, &dummy, &dummy, &dummy); \ + return ilaenv_(&ispec, #name, &uplo, &n, &dummy, &dummy, &dummy); \ } EXPANDMAC( float, SLAUUM ) EXPANDMAC( double, DLAUUM ) diff --git a/src/ltl2inv/ilaenv_lauum.hh b/src/ltl2inv/ilaenv_lauum.hh index 68d89b57..46a59b0f 100644 --- a/src/ltl2inv/ilaenv_lauum.hh +++ b/src/ltl2inv/ilaenv_lauum.hh @@ -4,8 +4,7 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "blalink.hh" template -int ilaenv_lauum(uplo_t uplo, int n); +int ilaenv_lauum(const char uplo, const int n); diff --git a/src/ltl2inv/invert.tcc b/src/ltl2inv/invert.tcc index 0b11edf1..4fd24cda 100644 --- a/src/ltl2inv/invert.tcc +++ b/src/ltl2inv/invert.tcc @@ -4,15 +4,15 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "colmaj.hh" -#include "blalink.hh" +#include "lapack2eigen.hh" #include "trmmt.tcc" +#include "error.hh" -template -void sktdsmx(int n, T *vT, T *B_, int ldB, T *C_, int ldC) +template +void sktdsmx(const Vec &vT, const Mat &B, Mat &C) { - colmaj B(B_, ldB); - colmaj C(C_, ldC); + int n = B.rows(); + assert_(B.cols() == n && C.cols() == n && C.rows() == n, "sktdsmx: Dimension mismatch."); for (int j = 0; j < n; ++j) C(1, j) = B(0, j) / -vT[0]; @@ -27,13 +27,13 @@ void sktdsmx(int n, T *vT, T *B_, int ldB, T *C_, int ldC) C(i-1, j) = (B(i, j) + C(i+1, j) * vT[i]) / vT[i-1]; } -template -void ltl2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM) +template +void ltl2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M) { - colmaj A(A_, ldA); - colmaj M(M_, ldM); - - int npanel_trmm = ilaenv_lauum(BLIS_LOWER, n); + using namespace Eigen; + using T = typename Mat::Scalar; + int n = A.rows(); + int npanel_trmm = ilaenv_lauum('L', n); int full = npanel_trmm <= 1 || npanel_trmm >= n; // Set M. @@ -41,15 +41,21 @@ void ltl2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM) for (int i = 0; i < n; ++i) M(i, j) = T(!(i - j)); - trtri(BLIS_LOWER, BLIS_UNIT_DIAG, n-1, &A(1, 0), ldA); - lacpy(BLIS_LOWER, n-2, n-2, &A(2, 0), ldA, &M(2, 1), ldM); + l2e::le_mat_t A_(A); + l2e::le_mat_t M_(M); + l2e::le_mat_t A_Lx(A(seq(1, n-1), seq(0, n-2))); + l2e::le_mat_t A_L(A(seq(2, n-1), seq(0, n-3))); + l2e::le_mat_t M_L(M(seq(2, n-1), seq(1, n-2))); + + trtri('L', 'U', A_Lx); + lacpy('L', A_L, M_L); for (int i = 0; i < n-1; ++i) vT[i] = A(i+1, i); - sktdsmx(n, vT, &M(0, 0), ldM, &A(0, 0), ldA); + sktdsmx(vT, M, A); if (!full) { - trmmt(BLIS_LOWER, BLIS_UNIT_DIAG, n, T(1.0), M, A); + trmmt('L', 'U', T(1.0), M, A); for (int j = 0; j < n; ++j) { A(j, j) = 0.0; for (int i = 0; i < j; ++i) @@ -60,25 +66,24 @@ void ltl2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM) // In-place permute columns. for (int j = n-1; j >= 0; --j) if (iPiv[j]-1 != j) - swap(n, &A(0, j), 1, &A(0, iPiv[j]-1), 1); + A(Eigen::all, j).swap(A(Eigen::all, iPiv[j] - 1)); if (full) - trmm(BLIS_LEFT, BLIS_LOWER, BLIS_TRANSPOSE, BLIS_UNIT_DIAG, - n, n, T(1.0), &M(0, 0), ldM, &A(0, 0), ldA); + A = M.template triangularView().transpose() * A; // In-place permute rows. for (int i = n-1; i >= 0; --i) if (iPiv[i]-1 != i) - swap(n, &A(i, 0), ldA, &A(iPiv[i]-1, 0), ldA); + A(i, Eigen::all).swap(A(iPiv[i] - 1, Eigen::all)); } -template -void utu2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM) +template +void utu2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M) { - colmaj A(A_, ldA); - colmaj M(M_, ldM); - - int npanel_trmm = ilaenv_lauum(BLIS_UPPER, n); + using namespace Eigen; + using T = typename Mat::Scalar; + int n = A.rows(); + int npanel_trmm = ilaenv_lauum('U', n); int full = npanel_trmm <= 1 || npanel_trmm >= n; // Set M. @@ -86,15 +91,21 @@ void utu2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM) for (int i = 0; i < n; ++i) M(i, j) = T(!(i - j)); - trtri(BLIS_UPPER, BLIS_UNIT_DIAG, n-1, &A(0, 1), ldA); - lacpy(BLIS_UPPER, n-2, n-2, &A(0, 2), ldA, &M(0, 1), ldM); + l2e::le_mat_t A_(A); + l2e::le_mat_t M_(M); + l2e::le_mat_t A_Ux(A(seq(0, n-2), seq(1, n-1))); + l2e::le_mat_t A_U(A(seq(0, n-3), seq(2, n-1))); + l2e::le_mat_t M_U(M(seq(0, n-3), seq(1, n-2))); + + trtri('U', 'U', A_Ux); + lacpy('U', A_U, M_U); for (int i = 0; i < n-1; ++i) vT[i] = -A(i, i+1); - sktdsmx(n, vT, &M(0, 0), ldM, &A(0, 0), ldA); + sktdsmx(vT, M, A); if (!full) { - trmmt(BLIS_UPPER, BLIS_UNIT_DIAG, n, T(1.0), M, A); + trmmt('U', 'U', T(1.0), M, A); for (int j = 0; j < n; ++j) { A(j, j) = 0.0; for (int i = j + 1; i < n; ++i) @@ -105,26 +116,27 @@ void utu2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM) // In-place permute columns. for (int j = 0; j < n; ++j) if (iPiv[j]-1 != j) - swap(n, &A(0, j), 1, &A(0, iPiv[j]-1), 1); + A(Eigen::all, j).swap(A(Eigen::all, iPiv[j]-1)); if (full) - trmm(BLIS_LEFT, BLIS_UPPER, BLIS_TRANSPOSE, BLIS_UNIT_DIAG, - n, n, T(1.0), &M(0, 0), ldM, &A(0, 0), ldA); + A = M.template triangularView().transpose() * A; // In-place permute rows. for (int i = 0; i < n; ++i) if (iPiv[i]-1 != i) - swap(n, &A(i, 0), ldA, &A(iPiv[i]-1, 0), ldA); + A(i, Eigen::all).swap(A(iPiv[i]-1, Eigen::all)); } -template -void ltl2inv(uplo_t uplo, int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM) +template +void ltl2inv(const char uplo, Mat &A, const iVec &iPiv, Vec &vT, Mat &M) { switch (uplo) { - case BLIS_LOWER: - ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); break; - case BLIS_UPPER: - utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); break; + case 'l': + case 'L': + ltl2inv(A, iPiv, vT, M); break; + case 'u': + case 'U': + utu2inv(A, iPiv, vT, M); break; default: break; } diff --git a/src/ltl2inv/lapack2eigen.hh b/src/ltl2inv/lapack2eigen.hh new file mode 100644 index 00000000..1d850068 --- /dev/null +++ b/src/ltl2inv/lapack2eigen.hh @@ -0,0 +1,272 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include +#include +#ifndef EIGEN_USE_LAPACKE // This lets Eigen/Dense include LAPACK definitions. +#include +#endif +#ifdef _BLIS +#define BLIS_DISABLE_BLAS // Screen BLAS definition even if BLIS' built with them. +#include "blis/blis.h" +#else +using Eigen::scomplex; +using Eigen::dcomplex; +#endif +#include "error.hh" +#include "fortran_pfapack.h" +using f77_int = lapack_int; +using ccscmplx = std::complex; +using ccdcmplx = std::complex; + +namespace l2e +{ + +template +struct le_mat_t +{ + const f77_int rows, cols; ///< Matrix size. + const f77_int rs, cs; ///< Strides. + + T *const data; + +protected: + template + static f77_int _rs(const Mat &mat) + { + f77_int rs = 0; + if (mat.rows() > 1) + rs = &(mat(1, 0)) - &(mat(0, 0)); + return rs; + } + template + static f77_int _cs(const Mat &mat) + { + f77_int cs = 0; + if (mat.cols() > 1) + cs = &(mat(0, 1)) - &(mat(0, 0)); + return cs; + } + +public: + le_mat_t(const f77_int &rows_, const f77_int &cols_, + const f77_int &rs_, const f77_int &cs_, T *const data_) + : rows(rows_), cols(cols_), rs(rs_), cs(cs_), data(data_) { } + + template + le_mat_t(Eigen::Matrix &mat) + : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { } + + template + le_mat_t(Eigen::Block mat) // TODO: Check for data copying. + : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { } + + template + le_mat_t(Eigen::Map, Alignment, Stride> &mat) + : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { } + + template + Eigen::Map, Alignment, + Eigen::Stride > to_eigen() const + { + using matrix_t = Eigen::Matrix; + using stride_t = Eigen::Stride; + return Eigen::Map(data, rows, cols, stride_t(cs, rs)); + } + + le_mat_t transpose() const + { return le_mat_t(cols, rows, cs, rs, data); } +}; + +} + + +// ---- BLAS/LAPACK routines used in our settings. ----- + + +// [LAPACK] lacpy +#define BLALINK_PARAMS(T) const char &uploA, \ + const l2e::le_mat_t &A, \ + l2e::le_mat_t &B +template +inline void lacpy( BLALINK_PARAMS(T) ); +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline void lacpy( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1 && B.rs == 1, "BLAS requires column-major."); \ + /* Eigen-vendored lapack.h misses const qualifier. Do force conversion here. */ \ + cchar##lacpy_((char *)&uploA, (f77_int *)&A.rows, (f77_int *)&A.cols, \ + (ctype *)A.data, (f77_int *)&A.cs, \ + (ctype *)B.data, (f77_int *)&B.cs); \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, _Complex float, c ) +BLALINK_MAC( ccdcmplx, _Complex double, z ) +#undef BLALINK_MAC +#undef BLALINK_PARAMS + + +// [LAPACK] trtri +#define BLALINK_PARAMS(T) const char &uploA, \ + const char &diagA, \ + l2e::le_mat_t &A +template +inline f77_int trtri( BLALINK_PARAMS(T) ); +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline f77_int trtri( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1, "BLAS requires column-major."); \ + f77_int info; \ + /* Eigen-vendored lapack.h misses const qualifier. Do force conversion here. */ \ + cchar##trtri_((char *)&uploA, (char *)&diagA, (f77_int *)&A.rows, \ + (ctype *)A.data, (f77_int *)&A.cs, &info); \ + return info; \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, _Complex float, c ) +BLALINK_MAC( ccdcmplx, _Complex double, z ) +#undef BLALINK_MAC +#undef BLALINK_PARAMS + + +// [PFAPACK] sktrf +#define BLALINK_PARAMS(T) const char &uplo, \ + const char &mode, \ + l2e::le_mat_t &A, \ + f77_int *ipiv, \ + T *work, f77_int lwork +template +inline f77_int sktrf( BLALINK_PARAMS(T) ); +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline f77_int sktrf( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1, "BLAS requires column-major."); \ + f77_int info; \ + cchar##sktrf_(&uplo, &mode, &A.rows, (ctype *)A.data, &A.cs, ipiv, work, &lwork, &info); \ + return info; \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, floatcmplx, c ) +BLALINK_MAC( ccdcmplx, doublecmplx, z ) +#undef BLALINK_MAC +#undef BLALINK_PARAMS + + +// [PFAPACK] skpfa +#define BLALINK_PARAMS(T) const char &uplo, \ + const char &mthd, \ + l2e::le_mat_t &A, \ + T *pfa, f77_int *ipiv, \ + T *work, f77_int lwork +template +inline f77_int skpfa( BLALINK_PARAMS(T) ); +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline f77_int skpfa( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1, "BLAS requires column-major."); \ + int info; \ + cchar##skpfa_(&uplo, &mthd, &A.rows, (ctype *)A.data, &A.cs, pfa, ipiv, work, &lwork, &info); \ + return info; \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +#undef BLALINK_MAC +#define BLALINK_MAC(T, ctype, rtype, cchar) \ + template <> inline f77_int skpfa( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1, "BLAS requires column-major."); \ + rtype *rwork = 0; \ + if (mthd != 'p' && mthd != 'P') \ + rwork = new rtype[lwork]; \ + int info; \ + cchar##skpfa_(&uplo, &mthd, &A.rows, A.data, &A.cs, pfa, ipiv, work, &lwork, rwork, &info); \ + delete[] rwork; \ + return info; \ + } +BLALINK_MAC( ccscmplx, floatcmplx, float, c ) +BLALINK_MAC( ccdcmplx, doublecmplx, double, z ) +#undef BLALINK_MAC +#undef BLALINK_PARAMS + + +// [Non-standard] gemmt +#define BLALINK_PARAMS(T) const char &uploC, \ + const char &transA, \ + const char &transB, \ + const T &alpha, \ + const l2e::le_mat_t &A, \ + const l2e::le_mat_t &B, \ + const T &beta, \ + l2e::le_mat_t &C +template +inline void gemmt( BLALINK_PARAMS(T) ); +#if !defined(_BLIS) && !defined(_BLAS_HAS_GEMMT) + #include "eigen_gemmt.tcc" +#else +#ifdef _BLIS +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline void gemmt( BLALINK_PARAMS(T) ) \ + { \ + uplo_t bli_uploC; \ + trans_t bli_transA, bli_transB; \ + bli_param_map_netlib_to_blis_uplo( uploC, &bli_uploC ); \ + bli_param_map_netlib_to_blis_trans( transA, &bli_transA ); \ + bli_param_map_netlib_to_blis_trans( transB, &bli_transB ); \ + bli_##cchar##gemmt(bli_uploC, \ + bli_transA, \ + bli_transB, \ + (transA == 'T' || transA == 't') ? A.cols : A.rows, \ + (transA == 'T' || transA == 't') ? A.rows : A.cols, \ + (const ctype *)&alpha, \ + (const ctype *)A.data, A.rs, A.cs, \ + (const ctype *)B.data, B.rs, B.cs, \ + (const ctype *)&beta, \ + (ctype *)C.data, C.rs, C.cs); \ + } +#else + // ifdef _BLAS_HAS_GEMMT +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline void gemmt( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1 && B.rs == 1 && C.rs == 1, "BLAS requires column-major."); \ + cchar##gemmt_(&uploC, \ + &transA, &transB, \ + (transA == 'T' || transA == 't') ? &A.cols : &A.rows, \ + (transA == 'T' || transA == 't') ? &A.rows : &A.cols, \ + (const ctype *)&alpha, \ + (const ctype *)A.data, &A.cs, \ + (const ctype *)B.data, &B.cs, \ + (const ctype *)&beta, \ + (ctype *)C.data, &C.cs); \ + } +extern "C" { +#define GEMMT_F77_PARAMS(T) const char *uploC, \ + const char *transA, \ + const char *transB, \ + const f77_int *m, \ + const f77_int *k, \ + const T *alpha, \ + const T *A, const f77_int *ldA, \ + const T *B, const f77_int *ldB, \ + const T *beta, \ + T *C, const f77_int *ldC +void sgemmt_( GEMMT_F77_PARAMS(float) ); +void dgemmt_( GEMMT_F77_PARAMS(double) ); +void cgemmt_( GEMMT_F77_PARAMS(scomplex) ); +void zgemmt_( GEMMT_F77_PARAMS(dcomplex) ); +#undef GEMMT_F77_PARAMS +} +#endif +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC +#endif +#undef BLALINK_PARAMS diff --git a/src/ltl2inv/ltl2inv.cc b/src/ltl2inv/ltl2inv.cc index 19f10256..e7ef84fc 100644 --- a/src/ltl2inv/ltl2inv.cc +++ b/src/ltl2inv/ltl2inv.cc @@ -10,24 +10,42 @@ extern "C" { -void ltl2inv_s(int n, float *A_, int ldA, int *iPiv, float *vT, float *M_, int ldM) { ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); } -void ltl2inv_d(int n, double *A_, int ldA, int *iPiv, double *vT, double *M_, int ldM) { ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); } -void ltl2inv_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *vT, ccscmplx *M_, int ldM) { ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); } -void ltl2inv_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *vT, ccdcmplx *M_, int ldM) { ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); } -void utu2inv_s(int n, float *A_, int ldA, int *iPiv, float *vT, float *M_, int ldM) { utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); } -void utu2inv_d(int n, double *A_, int ldA, int *iPiv, double *vT, double *M_, int ldM) { utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); } -void utu2inv_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *vT, ccscmplx *M_, int ldM) { utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); } -void utu2inv_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *vT, ccdcmplx *M_, int ldM) { utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); } - - -void ltl2pfa_s(int n, float *A_, int ldA, int *iPiv, float *Pfa) { *Pfa = ltl2pfa(n, A_, ldA, iPiv); } -void ltl2pfa_d(int n, double *A_, int ldA, int *iPiv, double *Pfa) { *Pfa = ltl2pfa(n, A_, ldA, iPiv); } -void ltl2pfa_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *Pfa) { *Pfa = ltl2pfa(n, A_, ldA, iPiv); } -void ltl2pfa_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *Pfa) { *Pfa = ltl2pfa(n, A_, ldA, iPiv); } -void utu2pfa_s(int n, float *A_, int ldA, int *iPiv, float *Pfa) { *Pfa = utu2pfa(n, A_, ldA, iPiv); } -void utu2pfa_d(int n, double *A_, int ldA, int *iPiv, double *Pfa) { *Pfa = utu2pfa(n, A_, ldA, iPiv); } -void utu2pfa_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *Pfa) { *Pfa = utu2pfa(n, A_, ldA, iPiv); } -void utu2pfa_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *Pfa) { *Pfa = utu2pfa(n, A_, ldA, iPiv); } +#define GENDEF(schema, cchar, ctype, cctype) \ +void schema ##2inv_## cchar(int n, cctype *A_, int ldA, int *iPiv, cctype *vT, cctype *M_, int ldM) \ +{ \ + using namespace Eigen; \ + Map, 0, OuterStride<> > A(A_, n, n, OuterStride<>(ldA)); \ + Map, 0, OuterStride<> > M(M_, n, n, OuterStride<>(ldM)); \ + Map Piv(iPiv, n); \ + Map> T(vT, n); \ + schema ##2inv(A, Piv, vT, M); \ +} +GENDEF(ltl, s, float, float) +GENDEF(ltl, d, double, double) +GENDEF(ltl, c, ccscmplx, ccscmplx) +GENDEF(ltl, z, ccdcmplx, ccdcmplx) +GENDEF(utu, s, float, float) +GENDEF(utu, d, double, double) +GENDEF(utu, c, ccscmplx, ccscmplx) +GENDEF(utu, z, ccdcmplx, ccdcmplx) + +#undef GENDEF +#define GENDEF(schema, cchar, ctype, cctype) \ +void schema ##2pfa_## cchar(int n, cctype *A_, int ldA, int *iPiv, cctype *Pfa) \ +{ \ + using namespace Eigen; \ + Map, 0, OuterStride<> > A(A_, n, n, OuterStride<>(ldA)); \ + Map Piv(iPiv, n); \ + *Pfa = schema ##2pfa(A, Piv); \ +} +GENDEF(ltl, s, float, float) +GENDEF(ltl, d, double, double) +GENDEF(ltl, c, ccscmplx, ccscmplx) +GENDEF(ltl, z, ccdcmplx, ccdcmplx) +GENDEF(utu, s, float, float) +GENDEF(utu, d, double, double) +GENDEF(utu, c, ccscmplx, ccscmplx) +GENDEF(utu, z, ccdcmplx, ccdcmplx) } diff --git a/src/ltl2inv/makefile_ltl2inv b/src/ltl2inv/makefile_ltl2inv index 8c76afb1..2b9d4aeb 100644 --- a/src/ltl2inv/makefile_ltl2inv +++ b/src/ltl2inv/makefile_ltl2inv @@ -1,9 +1,8 @@ include ../make.sys -OBJ = ltl2inv.o blalink_gemmt.o ilaenv_lauum.o -SRC = ltl2inv.cc blalink_gemmt.c ilaenv_lauum.cc -HDR = invert.tcc pfaffian.tcc \ - ../common/blalink.hh ../common/blalink_fort.h ../common/colmaj.hh +OBJ = ilaenv_lauum.o gemmt2blas.o ltl2inv.o +SRC = ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc +HDR = ilaenv.h ilaenv_lauum.hh lapack2eigen.hh error.hh eigen_gemmt.tcc invert.tcc pfaffian.tcc trmmt.tcc ltl2inv.a: $(OBJ) $(AR) rvu $@ $(OBJ) @@ -17,6 +16,7 @@ SUFFIXES: .o .c $(CC) $(OPTION) $(CFLAGS) -c $< -o $@ \ -Wno-incompatible-pointer-types-discards-qualifiers \ -I../common \ + -I../pfapack/c_interface \ -I$(BLIS_ROOT)/include \ -I$(BLIS_ROOT)/include/blis @@ -25,5 +25,6 @@ SUFFIXES: .o .cc .cc.o: $(HDR) $(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \ -I../common \ + -I../pfapack/c_interface \ -I$(BLIS_ROOT)/include \ -I$(BLIS_ROOT)/include/blis diff --git a/src/ltl2inv/pfaffian.tcc b/src/ltl2inv/pfaffian.tcc index ed028524..51777f0b 100644 --- a/src/ltl2inv/pfaffian.tcc +++ b/src/ltl2inv/pfaffian.tcc @@ -4,47 +4,83 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "colmaj.hh" -#include "blalink.hh" +#include "lapack2eigen.hh" -template -T ltl2pfa(int n, T *A_, int ldA, int *iPiv) + +template +struct SignedLogScale +{ + using T = T_; + + T_ val; + int sgn; +}; + + +template +SignedLogScale utu2logpfa(const Mat &A, const Vec &iPiv) +{ + SignedLogScale logpfa{ 0.0, 1 }; + + for (int i = 0; i < A.rows(); i += 2) { + logpfa.val += std::log(std::abs(Amp( A(i, i+1) ))); + logpfa.sgn *= A(i, i+1) > 0 ? 1 : -1; + + if (iPiv(i) -1 != i ) logpfa.sgn *= -1; + if (iPiv(i+1)-1 != i+1) logpfa.sgn *= -1; + } + return logpfa; +} + +// Additional parameter for return type deduction. +template +Ret utu2logpfa(const Mat &A, const Vec &iPiv, Ret &ret) +{ return utu2logpfa(A, iPiv); } + + + +template +Amp ltl2pfa(const Mat &A, const Vec &iPiv) { - T pfa = 1.0; - colmaj A(A_, ldA); + Amp pfa = 1.0; - for (int i = 0; i < n; i += 2) { + for (int i = 0; i < A.rows(); i += 2) { pfa *= -A(i+1, i); - if (iPiv[i ]-1 != i ) pfa = -pfa; - if (iPiv[i+1]-1 != i+1) pfa = -pfa; + if (iPiv(i) -1 != i ) pfa = -pfa; + if (iPiv(i+1)-1 != i+1) pfa = -pfa; } return pfa; } -template -T utu2pfa(int n, T *A_, int ldA, int *iPiv) +template +Amp utu2pfa(const Mat &A, const Vec &iPiv) { - T pfa = 1.0; - colmaj A(A_, ldA); + Amp pfa = 1.0; - for (int i = 0; i < n; i += 2) { + for (int i = 0; i < A.rows(); i += 2) { pfa *= A(i, i+1); - if (iPiv[i ]-1 != i ) pfa = -pfa; - if (iPiv[i+1]-1 != i+1) pfa = -pfa; + if (iPiv(i) -1 != i ) pfa = -pfa; + if (iPiv(i+1)-1 != i+1) pfa = -pfa; } return pfa; } -template -T ltl2pfa(uplo_t uplo, int n, T *A_, int ldA, int *iPiv) +template +Amp ltl2pfa(const char &uplo, const Mat &A, const Vec &iPiv) { switch (uplo) { - case BLIS_LOWER: - return ltl2pfa(n, A_, ldA, iPiv); - case BLIS_UPPER: - return utu2pfa(n, A_, ldA, iPiv); + case 'l': + case 'L': + return ltl2pfa(A, iPiv); + case 'u': + case 'U': + return utu2pfa(A, iPiv); default: return 0.0; } diff --git a/src/ltl2inv/testinv.cc b/src/ltl2inv/testinv.cc deleted file mode 100644 index c7cbc358..00000000 --- a/src/ltl2inv/testinv.cc +++ /dev/null @@ -1,125 +0,0 @@ -#include -#include -#include -#include - -#include "colmaj.hh" -#include "invert.tcc" - -struct info_t { - double err; - long long tim; -}; - -template -info_t test_decomp2inv(uplo_t uplo, int n) -{ - using namespace std::chrono; - - colmaj A(new T[n * n], n); - colmaj M(new T[n * n], n); - colmaj X(new T[n * n], n); - int *ipiv = new int[n]; - T *vT = new T[n-1]; - - assert(&A(0, 0)); - assert(&M(0, 0)); - assert(&X(0, 0)); - assert(ipiv); - assert(vT); - - for (int j = 0; j < n; ++j) { - A(j, j) = 0.0; - X(j, j) = 0.0; - - for (int i = j+1; i < n; ++i) { - A(i, j) = rand(); - A(i, j) /= RAND_MAX; - - A(j, i) = -A(i, j); - switch (uplo) { - case BLIS_LOWER: - X(i, j) = A(i, j); break; - case BLIS_UPPER: - X(j, i) = A(j, i); break; - default: - break; - } - } - } - - auto start = high_resolution_clock::now(); - - sktrf(uplo, n, &X(0, 0), n, ipiv, &M(0, 0), n*n); - switch (uplo) { - case BLIS_LOWER: - ltl2inv(n, &X(0, 0), n, ipiv, vT, &M(0, 0), n); break; - case BLIS_UPPER: - utu2inv(n, &X(0, 0), n, ipiv, vT, &M(0, 0), n); break; - default: - break; - } - - auto tim = duration_cast(high_resolution_clock::now() - start).count(); - - gemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, - n, n, n, - 1.0, - &A(0, 0), n, - &X(0, 0), n, - 0.0, - &M(0, 0), n); - -#ifdef VERBOSE - printf("iPiv = [ "); - for (int i = 0; i < n; ++i) - printf("%d ", ipiv[i]); - printf("]\n"); - printf("vT = [ "); - for (int i = 0; i < n-1; ++i) - printf("%f ", vT[i]); - printf("]\n"); - - printf("A*inv(A) =\n"); - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) - printf("%10.6f ", M(i, j)); - printf("\n"); - } -#endif - - double err = 0.0; - for (int j = 0; j < n; ++j) - for (int i = 0; i < n; ++i) - err += std::abs(M(i, j) - double(!(i - j))); - err /= n; // sqrt(n*n); - - delete[] &A(0, 0); - delete[] &M(0, 0); - delete[] &X(0, 0); - delete[] ipiv; - delete[] vT; - - return info_t{ err, tim }; -} - -int main(void) -{ - info_t info; int n; - n= 10; info = test_decomp2inv (BLIS_LOWER, n); printf("double n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim); - n= 10; info = test_decomp2inv (BLIS_LOWER, n); printf("float n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim); - n= 10; info = test_decomp2inv(BLIS_LOWER, n); printf("dcomplex n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(10 ,3)/info.tim); - n=200; info = test_decomp2inv (BLIS_LOWER, n); printf("double n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim); - n=200; info = test_decomp2inv (BLIS_LOWER, n); printf("float n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim); - n=200; info = test_decomp2inv(BLIS_LOWER, n); printf("dcomplex n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(200,3)/info.tim); - - n= 10; info = test_decomp2inv (BLIS_UPPER, n); printf("double n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim); - n= 10; info = test_decomp2inv (BLIS_UPPER, n); printf("float n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim); - n= 10; info = test_decomp2inv(BLIS_UPPER, n); printf("dcomplex n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(10 ,3)/info.tim); - n=200; info = test_decomp2inv (BLIS_UPPER, n); printf("double n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim); - n=200; info = test_decomp2inv (BLIS_UPPER, n); printf("float n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim); - n=200; info = test_decomp2inv(BLIS_UPPER, n); printf("dcomplex n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(200,3)/info.tim); - - return 0; -} - diff --git a/src/ltl2inv/trmmt.tcc b/src/ltl2inv/trmmt.tcc index 79bc7170..78399b55 100644 --- a/src/ltl2inv/trmmt.tcc +++ b/src/ltl2inv/trmmt.tcc @@ -4,31 +4,39 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "colmaj.hh" -#include "blalink.hh" +#include "lapack2eigen.hh" #include "ilaenv_lauum.hh" -template -inline void trmmt(uplo_t uploab, diag_t diaga, int n, T alpha, colmaj &A, colmaj &B) +template +inline void trmmt(const char &uploAB, const char &diagA, const typename Mat::Scalar &alpha, Mat &A, Mat &B) { - int npanel = ilaenv_lauum(uploab, n); + using namespace Eigen; + using T = typename Mat::Scalar; + int n = A.rows(); + int npanel = ilaenv_lauum(uploAB, n); - if (uploab == BLIS_UPPER) { + if (uploAB == 'U' || uploAB == 'u') { // A upper => A' lower -> write to UPPER part of B. for (int j = 0; j < n; j += npanel) { int nloc = std::min(npanel, n - j); - trmm(BLIS_LEFT, BLIS_UPPER, BLIS_TRANSPOSE, diaga, - j+nloc, nloc, alpha, &A(0, 0), A.ld, &B(0, j), B.ld); + auto Bslice = B(seq(0, j+nloc-1), seq(j, j+nloc-1)); + if (diagA == 'U' || diagA == 'u') + Bslice = A(seq(0, j+nloc-1), seq(0, j+nloc-1)).template triangularView().transpose() * Bslice * alpha; + else + Bslice = A(seq(0, j+nloc-1), seq(0, j+nloc-1)).template triangularView().transpose() * Bslice * alpha; } } else { // A lower => A' upper -> write to LOWER part of B. for (int j = 0; j < n; j += npanel) { int nloc = std::min(npanel, n - j); - trmm(BLIS_LEFT, BLIS_LOWER, BLIS_TRANSPOSE, diaga, - n-j, nloc, alpha, &A(j, j), A.ld, &B(j, j), B.ld); + auto Bslice = B(seq(j, n-1), seq(j, j+nloc-1)); + if (diagA == 'U' || diagA == 'u') + Bslice = A(seq(j, n-1), seq(j, n-1)).template triangularView().transpose() * Bslice * alpha; + else + Bslice = A(seq(j, n-1), seq(j, n-1)).template triangularView().transpose() * Bslice * alpha; } } } From a07f273686140735d46fe6ed2bd7699443d5ee4a Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 11:35:15 +0900 Subject: [PATCH 02/39] ltl2inv via Eig Build Sys --- src/ltl2inv/CMakeLists.txt | 11 +++++++---- src/ltl2inv/makefile_ltl2inv | 2 -- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ltl2inv/CMakeLists.txt b/src/ltl2inv/CMakeLists.txt index 8aa7141c..04fe165e 100644 --- a/src/ltl2inv/CMakeLists.txt +++ b/src/ltl2inv/CMakeLists.txt @@ -1,5 +1,5 @@ # include guard -cmake_minimum_required(VERSION 2.8.0 ) +cmake_minimum_required(VERSION 3.0) if(${CMAKE_PROJECT_NAME} STREQUAL "Project") message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.") @@ -8,9 +8,12 @@ endif(${CMAKE_PROJECT_NAME} STREQUAL "Project") if("${ARCHITECTURE}" STREQUAL "x86_64") add_definitions(-DF77_COMPLEX_RET_INTEL) endif() -include_directories(../common) +include_directories(../pfapack/c_interface) -# TODO: Move blalink_gemmt.c to other subprojects? -add_library(ltl2inv STATIC ltl2inv.cc blalink_gemmt.c ilaenv_lauum.cc) +# Require Eigen +find_package(Eigen3 3.3 REQUIRED NO_MODULE) +include_directories(${EIGEN3_INCLUDE_DIR}) + +add_library(ltl2inv STATIC ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc) target_compile_definitions(ltl2inv PRIVATE -D_CC_IMPL) diff --git a/src/ltl2inv/makefile_ltl2inv b/src/ltl2inv/makefile_ltl2inv index 2b9d4aeb..d57c5fcd 100644 --- a/src/ltl2inv/makefile_ltl2inv +++ b/src/ltl2inv/makefile_ltl2inv @@ -15,7 +15,6 @@ SUFFIXES: .o .c .c.o: $(HDR) $(CC) $(OPTION) $(CFLAGS) -c $< -o $@ \ -Wno-incompatible-pointer-types-discards-qualifiers \ - -I../common \ -I../pfapack/c_interface \ -I$(BLIS_ROOT)/include \ -I$(BLIS_ROOT)/include/blis @@ -24,7 +23,6 @@ SUFFIXES: .o .cc .cc.o: $(HDR) $(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \ - -I../common \ -I../pfapack/c_interface \ -I$(BLIS_ROOT)/include \ -I$(BLIS_ROOT)/include/blis From 24700aae2c3be25b22b411d68a7a9fe782b1ea95 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 14:48:31 +0900 Subject: [PATCH 03/39] Port from libVMC + Fixes for Complex Nums. --- src/ltl2inv/invert.tcc | 16 +- src/ltl2inv/trmmt.tcc | 4 +- src/pfupdates/CMakeLists.txt | 2 +- src/pfupdates/config_manager.tcc | 279 ++++++++++++ src/pfupdates/makefile_pfupdates | 8 +- src/pfupdates/orbital_mat.tcc | 53 ++- src/pfupdates/pf_interface.cc | 38 +- src/pfupdates/pf_interface.h | 8 +- src/pfupdates/skmv.tcc | 51 +-- src/pfupdates/skr2k.tcc | 36 +- src/pfupdates/skslc.tcc | 25 +- src/pfupdates/testupdate.cc | 70 --- src/pfupdates/updated_tdi.tcc | 736 ++++++++++++++++++------------- 13 files changed, 854 insertions(+), 472 deletions(-) create mode 100644 src/pfupdates/config_manager.tcc delete mode 100644 src/pfupdates/testupdate.cc diff --git a/src/ltl2inv/invert.tcc b/src/ltl2inv/invert.tcc index 4fd24cda..ba3bfd8a 100644 --- a/src/ltl2inv/invert.tcc +++ b/src/ltl2inv/invert.tcc @@ -8,8 +8,8 @@ #include "trmmt.tcc" #include "error.hh" -template -void sktdsmx(const Vec &vT, const Mat &B, Mat &C) +template +void sktdsmx(const Vec &vT, const Mat &B, Mat_ &C) { int n = B.rows(); assert_(B.cols() == n && C.cols() == n && C.rows() == n, "sktdsmx: Dimension mismatch."); @@ -27,8 +27,8 @@ void sktdsmx(const Vec &vT, const Mat &B, Mat &C) C(i-1, j) = (B(i, j) + C(i+1, j) * vT[i]) / vT[i-1]; } -template -void ltl2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M) +template +void ltl2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M) { using namespace Eigen; using T = typename Mat::Scalar; @@ -77,8 +77,8 @@ void ltl2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M) A(i, Eigen::all).swap(A(iPiv[i] - 1, Eigen::all)); } -template -void utu2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M) +template +void utu2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M) { using namespace Eigen; using T = typename Mat::Scalar; @@ -127,8 +127,8 @@ void utu2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M) A(i, Eigen::all).swap(A(iPiv[i]-1, Eigen::all)); } -template -void ltl2inv(const char uplo, Mat &A, const iVec &iPiv, Vec &vT, Mat &M) +template +void ltl2inv(const char uplo, Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M) { switch (uplo) { case 'l': diff --git a/src/ltl2inv/trmmt.tcc b/src/ltl2inv/trmmt.tcc index 78399b55..db60335a 100644 --- a/src/ltl2inv/trmmt.tcc +++ b/src/ltl2inv/trmmt.tcc @@ -8,8 +8,8 @@ #include "ilaenv_lauum.hh" -template -inline void trmmt(const char &uploAB, const char &diagA, const typename Mat::Scalar &alpha, Mat &A, Mat &B) +template +inline void trmmt(const char &uploAB, const char &diagA, const typename Mat::Scalar &alpha, Mat &A, Mat_ &B) { using namespace Eigen; using T = typename Mat::Scalar; diff --git a/src/pfupdates/CMakeLists.txt b/src/pfupdates/CMakeLists.txt index 1a06ef74..11b0202b 100644 --- a/src/pfupdates/CMakeLists.txt +++ b/src/pfupdates/CMakeLists.txt @@ -8,8 +8,8 @@ endif(${CMAKE_PROJECT_NAME} STREQUAL "Project") if(${ARCHITECTURE} STREQUAL "x86_64") add_definitions(-DF77_COMPLEX_RET_INTEL) endif() -include_directories(../common) include_directories(../ltl2inv) +include_directories(../pfapack/c_interface) add_library(pfupdates STATIC pf_interface.cc) target_compile_definitions(pfupdates PRIVATE -D_CC_IMPL) diff --git a/src/pfupdates/config_manager.tcc b/src/pfupdates/config_manager.tcc new file mode 100644 index 00000000..44eb1f07 --- /dev/null +++ b/src/pfupdates/config_manager.tcc @@ -0,0 +1,279 @@ +/** + * \copyright Copyright (c) Dept. Phys., Univ. Tokyo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "error.hh" +#include +#include +#include + +namespace vmc +{ +struct config_manager +{ + using index_t = int32_t; + using base_t = std::vector; + + const index_t nsites; + index_t norbs() + { return nsites * 2; } + + const base_t &config_base() const { return _cfg_base; } + const index_t config_base(index_t i) const { return _cfg_base.at(i); } + const base_t & from_idx() const { return _from_idx; } + const index_t from_idx(index_t i) const { return _from_idx.at(i); } + const base_t & to_orb() const { return _to_orb; } + const index_t to_orb(index_t i) const { return _to_orb.at(i); } + const base_t & config() const { return _cfg; } + const index_t config(index_t i) const { return _cfg.at(i); } + const base_t & num() const { return _num; } + const index_t num(index_t i) const { return _num.at(i); } + const base_t & inv() const { return _inv; } + const index_t inv(index_t i) const { return _inv.at(i); } + const index_t spin(index_t i) const { return _num.at(i) == 1 ? 1 : _num.at(i) == 2 ? -1 : 0; } + + void reserve_space(index_t mmax) + { + _from_idx.reserve(mmax * sizeof(index_t)); + _to_orb.reserve(mmax * sizeof(index_t)); + } + + void attach_config(const base_t &cfg) + { + _cfg_base = cfg; + _cfg = cfg; + _from_idx.clear(); + _to_orb.clear(); + + // Reset _num. + for (index_t i = 0; i < nsites; ++i) + _num.at(i) = 0; + + // Generate num. + for (index_t i = 0; i < cfg.size(); ++i) { + // 0b01 for up, 0b10 for down, 0b11 for double occupation. + index_t xi = cfg.at(i) % nsites; + index_t si = cfg.at(i) / nsites; + _num.at(xi) += (si + 1); + } + + // Generate inv. + for (index_t i = 0; i < _inv.size(); ++i) + _inv.at(i) = -1; + for (index_t i = 0; i < cfg.size(); ++i) + _inv.at(cfg.at(i)) = i; + } + + void merge_config() + { + // OR: cfg_base = cfg; + for (index_t j = 0; j < _from_idx.size(); ++j) + _cfg_base.at(_from_idx.at(j)) = _to_orb.at(j); + _from_idx.clear(); + _to_orb.clear(); + } + + void push_update(index_t osi, index_t msj) + { + for (auto i = _from_idx.begin(); i != _from_idx.end(); ++i) + assert_(*i != msj, typeid(*this).name(), "Consecutive hopping unsupported. Please merge manually before proceeding."); + index_t osj = _cfg.at(msj); + index_t xj = osj % nsites; + index_t sj = osj / nsites; + index_t xi = osi % nsites; + index_t si = osi / nsites; + assert_(_inv.at(osj) == msj && _inv.at(osi) == -1, typeid(*this).name(), "Inv error @ push."); + + _from_idx.push_back(msj); + _to_orb.push_back(osi); + _cfg.at(msj) = osi; + // Update num. + _num.at(xj) -= sj + 1; + _num.at(xi) += si + 1; + // Update inv. + _inv.at(osj) = -1; + _inv.at(osi) = msj; + } + + void pop_update() + { + assert_(_from_idx.size(), typeid(*this).name(), "Trying to pop empty updates."); + index_t msj = _from_idx.back(); + index_t osj = _cfg_base.at(msj); + index_t xj = osj % nsites; + index_t sj = osj / nsites; + index_t osi = _cfg.at(msj); + index_t xi = osi % nsites; + index_t si = osi / nsites; + assert_(_inv.at(osi) == msj && _inv.at(osj) == -1, typeid(*this).name(), "Inv error @ pop."); + + _cfg.at(msj) = osj; + // Update num. + _num.at(xi) -= si + 1; + _num.at(xj) += sj + 1; + // Update inv. + _inv.at(osi) = -1; + _inv.at(osj) = msj; + _from_idx.pop_back(); + _to_orb.pop_back(); + } + + void check() + { + assert_(_cfg.size() <= norbs(), typeid(*this).name(), "More Fermions than orbitals."); + + base_t num(nsites, 0); + for (index_t i = 0; i < num.size(); ++i) { + index_t xi = _cfg.at(i) % nsites; + index_t si = _cfg.at(i) / nsites; + + assert_(!(num.at(xi) & (si + 1)), typeid(*this).name(), "More than two Fermions occupying the same orbital."); + num.at(xi) += (si + 1); + } + for (index_t i = 0; i < nsites; ++i) + assert_(num.at(i) == _num.at(i), typeid(*this).name(), "Found broken member: _num."); + + base_t cfg_curr = _cfg_base; + for (index_t j = 0; j < _from_idx.size(); ++j) + cfg_curr.at(_from_idx.at(j)) = _to_orb.at(j); + for (index_t i = 0; i < cfg_curr.size(); ++i) + assert_(cfg_curr.at(i) == _cfg.at(i), typeid(*this).name(), "Found broken member: _cfg."); + // Check inv. + } + + config_manager(index_t nsites_) + : nsites(nsites_), _num(nsites_), _inv(norbs()) { } + + config_manager(index_t nsites_, base_t cfg) + : nsites(nsites_), _num(nsites_), _inv(norbs()) + { attach_config(cfg); } + + template + static base_t singlet_config(unsigned nsite, unsigned nsinglet, rng_t &rng) + { + using dist_t = std::uniform_int_distribution; + + bool allow_doublon = nsinglet * 2 > nsite; + base_t marks(nsite, 0); + base_t cfg(0); + + // Spin up. + for (int i = 0; i < nsinglet; ++i) { + int iseq = dist_t(0, nsite - i - 1)(rng); + + int isite; + int isite_c = 0; + do { + // Find the next available site. + while (marks.at(isite_c)) { isite_c++; } + isite = isite_c; + isite_c++; + } while (iseq-- > 0); + + marks.at(isite) = 1; + cfg.push_back(isite); + } + + // Spin down. + for (int i = 0; i < nsinglet; ++i) { + int iseq; + if (allow_doublon) + iseq = dist_t(0, nsite - i - 1)(rng); + else + iseq = dist_t(0, nsite - i - 1 - nsinglet)(rng); + + int isite; + int isite_c = 0; + do { + // Find the next unoccupied site / orb. + if (allow_doublon) + while (marks.at(isite_c) > 1) { isite_c++; } + else + while (marks.at(isite_c) > 0) { isite_c++; } + isite = isite_c; + isite_c++; + } while (iseq-- > 0); + + marks.at(isite) = 2; + cfg.push_back(isite + nsite); + } + return cfg; + } + + template + std::pair propose_hop(rng_t &rng) const + { + int loop_breaker = 0; + int xi; + int msj = std::uniform_int_distribution(0, _cfg.size() - 1)(rng); + std::uniform_int_distribution select_site(0, nsites - 1); + int sj = msj / nsites; + + do { + assert_(loop_breaker++ < 10000000, typeid(*this).name(), "Too many loops."); + xi = select_site(rng); + } while (_num.at(xi) & (sj + 1)); + + return std::make_pair(xi + sj * nsites, msj); + } + + template + std::tuple propose_exc(rng_t &rng) const + { + using dist_t = std::uniform_int_distribution; + + int nup = 0; + int ndn = 0; + int loop_breaker = 0; + int msj, xj = 0; + int msl, xl = 0; + + // Count # of spin-up/down sites. + for (int i = 0; i < _num.size(); ++i) + if (num(i) == 1) + nup++; + else if (num(i) == 2) + ndn++; + + // Pick a singly occupied spin-up site. + int iseq_up = dist_t(0, nup - 1)(rng); + while (true) { + while (num(xj) != 1) ++xj; + if (--iseq_up < 0) + break; + else + ++xj; + } + + // Pick a singly occupied spin-down site. + int iseq_dn = dist_t(0, ndn - 1)(rng); + while (true) { + while (num(xl) != 2) ++xl; + if (--iseq_dn < 0) + break; + else + ++xl; + } + + // Reverse lookup config indices. + msj = inv(xj); + msl = inv(xl + nsites); + assert_(msj >= 0 && msl >= 0, typeid(*this).name(), "ExchangeConfig error."); + + return std::make_tuple(xl, msj, xj + nsites, msl); + } + +private: + base_t _cfg_base; + base_t _from_idx; + base_t _to_orb; + base_t _cfg; + base_t _num; + base_t _inv; +}; +} + diff --git a/src/pfupdates/makefile_pfupdates b/src/pfupdates/makefile_pfupdates index bd8502e3..ab9be063 100644 --- a/src/pfupdates/makefile_pfupdates +++ b/src/pfupdates/makefile_pfupdates @@ -2,12 +2,14 @@ include ../make.sys OBJ = pfupdates.o SRC = pf_interface.cc -HDR = pf_interface.h orbital_mat.tcc skmv.tcc skr2k.tcc skslc.tcc updated_tdi.tcc \ - ../common/blalink.hh ../common/blalink_fort.h ../common/colmaj.hh +HDR = pf_interface.h \ + orbital_mat.tcc skmv.tcc skr2k.tcc skslc.tcc updated_tdi.tcc \ + ../ltl2inv/eigen_gemmt.tcc ../ltl2inv/invert.tcc ../ltl2inv/pfaffian.tcc ../ltl2inv/trmmt.tcc \ + ../ltl2inv/error.hh ../ltl2inv/ilaenv_lauum.hh ../ltl2inv/lapack2eigen.hh $(OBJ): $(SRC) $(HDR) $(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \ - -I../common -I../ltl2inv \ + -I../pfapack/c_interface -I../ltl2inv \ -I$(BLIS_ROOT)/include \ -I$(BLIS_ROOT)/include/blis diff --git a/src/pfupdates/orbital_mat.tcc b/src/pfupdates/orbital_mat.tcc index 3e7ccf76..ddb4ce6c 100644 --- a/src/pfupdates/orbital_mat.tcc +++ b/src/pfupdates/orbital_mat.tcc @@ -6,33 +6,36 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "blis.h" -#include "colmaj.hh" +#include #include -template -using matrix_t = colmaj; -template +namespace vmc +{ +namespace orbital +{ +template struct orbital_mat { - const uplo_t uplo; - const dim_t nsite; - matrix_t X; ///< nsite*nsite. + using matrix_t = matrix_t_; + using T = typename matrix_t::Scalar; + using index_t = int32_t; - orbital_mat(uplo_t uplo_, dim_t nsite_, T *X_, inc_t ldX) - : uplo(uplo_), nsite(nsite_), - X(X_, ldX) { } + const char uplo; + const index_t norb_; + matrix_t &X; ///< norb*norb. - orbital_mat(uplo_t uplo_, dim_t nsite_, matrix_t &X_) - : uplo(uplo_), nsite(nsite_), X(X_) { } + orbital_mat(char uplo_, index_t norb__, matrix_t &X_) + : uplo(uplo_), norb_(norb__), X(X_) { } - void randomize(double amplitude, unsigned seed) { + virtual index_t norb() { return norb_; } + + virtual void randomize(double amplitude, unsigned seed) { using namespace std; mt19937_64 rng(seed); uniform_real_distribution dist(-0.1, 1.0); - for (dim_t j = 0; j < nsite; ++j) { - for (dim_t i = 0; i < j; ++i) { + for (index_t j = 0; j < norb_; ++j) { + for (index_t i = 0; i < j; ++i) { X(i, j) = T(dist(rng)) * amplitude; X(j, i) = -X(i, j); } @@ -42,18 +45,28 @@ struct orbital_mat void randomize(double amplitude) { randomize(amplitude, 511); } - T operator()(dim_t osi, dim_t osj) { + virtual index_t pack_idx(index_t osi, index_t osj) { + abort(); + } + + virtual void unpack_idx(index_t &osi, index_t &osj, index_t idx) { + abort(); + } + + virtual T operator()(index_t osi, index_t osj) { if (osi == osj) return T(0.0); switch (uplo) { - case BLIS_UPPER: + case 'U': + case 'u': if (osi < osj) return X(osi, osj); else return -X(osj, osi); - case BLIS_LOWER: + case 'L': + case 'l': if (osi > osj) return X(osi, osj); else @@ -64,3 +77,5 @@ struct orbital_mat } } }; +} +} diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc index a95efe01..fcdcfaca 100644 --- a/src/pfupdates/pf_interface.cc +++ b/src/pfupdates/pf_interface.cc @@ -4,6 +4,7 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #include "updated_tdi.tcc" +#include "orbital_mat.tcc" // In implementation, this should appear AFTER blis.h. #include "pf_interface.h" @@ -16,8 +17,15 @@ #error "Valid non-preprocessor _pragma() not found." #endif -#define orbv( i, ctype ) ( (orbital_mat *)orbv[i] ) -#define objv( i, ctype ) ( (updated_tdi *)objv[i] ) +using namespace Eigen; +using namespace vmc::orbital; +template +using matrix_t = Map, 0, OuterStride<> >; + +#define matv( i, ctype ) ( ( matrix_t *)matv[i] ) +#define mapv( i, ctype ) ( ( matrix_t *)mapv[i] ) +#define orbv( i, ctype ) ( ( orbital_mat > *)orbv[i] ) +#define objv( i, ctype ) ( (updated_tdi > > *)objv[i] ) #define EXPANDNAME( funcname, cblachar ) funcname##_##cblachar @@ -35,19 +43,23 @@ int32_t *elespn, \ uint64_t mmax, \ void *objv[], \ - void *orbv[] ) \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ) \ { \ OMP_PARALLEL_FOR_SHARED \ for (int iqp = 0; iqp < num_qp; ++iqp) { \ - orbv[iqp] = new orbital_mat( \ - BLIS_UPPER, norbs, orbmat_base + iqp * orbmat_stride, norbs); \ - objv[iqp] = new updated_tdi( \ - *orbv(iqp, ctype), nelec, invmat_base + iqp * invmat_stride, nelec, mmax); \ + mapv[iqp] = new matrix_t(orbmat_base + iqp * orbmat_stride, norbs, norbs, OuterStride<>(norbs)); \ + matv[iqp] = new matrix_t(invmat_base + iqp * invmat_stride, nelec, nelec, OuterStride<>(nelec)); \ + orbv[iqp] = new orbital_mat>('U', norbs, *mapv(iqp, ctype) ); \ + objv[iqp] = new updated_tdi>> \ + ( *orbv(iqp, ctype), *matv(iqp, ctype), nelec, mmax); \ \ /* Compute initial. */ \ - auto &cfg_i = objv(iqp, ctype)->elem_cfg; \ + vmc::config_manager::base_t cfg_i(nelec); \ for (int msi = 0; msi < nelec; ++msi) \ cfg_i.at(msi) = eleidx[msi] + elespn[msi]*nsite; \ + objv(iqp, ctype)->attach_config(cfg_i); \ objv(iqp, ctype)->initialize(); \ } \ } @@ -61,11 +73,15 @@ GENIMPL( ccdcmplx, z ) void EXPANDNAME( updated_tdi_v_free, cblachar ) \ ( uint64_t num_qp, \ void *objv[], \ - void *orbv[] ) \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ) \ { \ for (int iqp = 0; iqp < num_qp; ++iqp) { \ delete orbv(iqp, ctype); \ delete objv(iqp, ctype); \ + delete matv(iqp, ctype); \ + delete mapv(iqp, ctype); \ } \ } // GENIMPL( float, s ) @@ -82,7 +98,7 @@ GENIMPL( ccdcmplx, z ) { \ /* Minus sign appears from that SlaterElm is in fact interpreted as row-major. */ \ for (int iqp = 0; iqp < num_qp; ++iqp) \ - pfav[iqp] = -objv(iqp, ctype)->get_Pfa(); \ + pfav[iqp] = -objv(iqp, ctype)->get_amplitude(); \ } // GENIMPL( float, s ) GENIMPL( double, d ) @@ -120,7 +136,7 @@ GENIMPL( ccdcmplx, z ) { \ OMP_PARALLEL_FOR_SHARED \ for (int iqp = 0; iqp < num_qp; ++iqp) { \ - auto &from_i = objv(iqp, ctype)->from_idx; \ + const auto &from_i = objv(iqp, ctype)->get_config_manager().from_idx(); \ if (from_i.size() > objv(iqp, ctype)->mmax - 2) \ objv(iqp, ctype)->merge_updates(); \ else \ diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h index 8593a12c..903a90ef 100644 --- a/src/pfupdates/pf_interface.h +++ b/src/pfupdates/pf_interface.h @@ -38,7 +38,9 @@ extern "C" { int32_t *elespn, \ uint64_t mmax, \ void *objv[], \ - void *orbv[] ); + void *orbv[], \ + void *matv[], \ + void *mapv[] ); // GENDEF( float, s ) GENDEF( double, d ) @@ -50,7 +52,9 @@ GENDEF( ccdcmplx, z ) void EXPANDNAME( updated_tdi_v_free, cblachar ) \ ( uint64_t num_qp, \ void *objv[], \ - void *orbv[] ); + void *orbv[], \ + void *matv[], \ + void *mapv[] ); // GENDEF( float, s ) GENDEF( double, d ) diff --git a/src/pfupdates/skmv.tcc b/src/pfupdates/skmv.tcc index 2abc2c6d..31a0947c 100644 --- a/src/pfupdates/skmv.tcc +++ b/src/pfupdates/skmv.tcc @@ -6,50 +6,39 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "colmaj.hh" -#include "blalink.hh" -#include +#include "lapack2eigen.hh" -template -void skmv(uplo_t uploA, - dim_t m, - T alpha, - T *_A, inc_t ldA, - T *X, - T *Y) { - using namespace std; - colmaj A(_A, ldA); +template +Eigen::Matrix + skmv(const char &uploA, + const typename Mat::Scalar &alpha, + const Mat &A, + const Vec &X) { + using namespace Eigen; + Eigen::Matrix Y; // Way 1: - // skmm(BLIS_LEFT, uploA, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, m, 1, - // T(1.0), &A(0, 0), A.ld, &X[0], ldA, T(0.0), &Y[0], ldA); - - // Way 2: - vector Z(m); - for (dim_t i = 0; i < m; ++i) { - Z[i] = X[i]; - Y[i] = X[i]; + if (uploA == 'U' || uploA == 'u') { + Y = A.template triangularView() * X - A.template triangularView().transpose() * X; + } else { + Y = A.template triangularView() * X - A.template triangularView().transpose() * X; } - trmv(uploA, BLIS_NO_TRANSPOSE, - m, alpha, &A(0, 0), A.ld, &Y[0], 1); - trmv(uploA, BLIS_TRANSPOSE, - m, alpha, &A(0, 0), A.ld, &Z[0], 1); - for (dim_t i = 0; i < m; ++i) - Y[i] -= Z[i]; + Y *= alpha; - // Way 3: - // for (dim_t i = 0; i < m; ++i) + // Way 2: + // for (int i = 0; i < m; ++i) // Y[i] = 0.0; - // for (dim_t j = 0; j < m; ++j) { + // for (int j = 0; j < m; ++j) { // T x_j = X[j]; - // for (dim_t i = 0; i < j; ++i) { + // for (int i = 0; i < j; ++i) { // T A_ij = A(i, j); // Y[i] += A_ij * x_j; // Y[j] -= A_ij * X[i]; // } // } - // for (dim_t i = 0; i < m; ++i) + // for (int i = 0; i < m; ++i) // Y[i] *= alpha; + return Y; } diff --git a/src/pfupdates/skr2k.tcc b/src/pfupdates/skr2k.tcc index 0b8cdcb6..a6bba999 100644 --- a/src/pfupdates/skr2k.tcc +++ b/src/pfupdates/skr2k.tcc @@ -7,29 +7,29 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ -#include "blalink.hh" -#include "blalink_gemmt.hh" -#include "colmaj.hh" +#pragma once +#include "lapack2eigen.hh" #include -template -inline void skr2k(uplo_t uploc, - trans_t transab, - dim_t m, dim_t k, - T alpha, - T *_A, inc_t ldA, - T *_B, inc_t ldB, - T beta, - T *_C, inc_t ldC) +template +inline void skr2k(const char &uploC, + const char &transAB, + const typename Mat2::Scalar &alpha, + const Mat0 &A, + const Mat1 &B, + const typename Mat2::Scalar &beta, + Mat2 &C) { - T one = 1.0; - if (transab == BLIS_NO_TRANSPOSE) { - gemmt(uploc, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, m, k, alpha, _A, ldA, _B, ldB, beta, _C, ldC); - gemmt(uploc, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, m, k,-alpha, _B, ldB, _A, ldA, one , _C, ldC); + using T = typename Mat0::Scalar; + + l2e::le_mat_t C_(C); + if (transAB == 'N' || transAB == 'n') { + gemmt(uploC, 'N', 'T', alpha, l2e::le_mat_t(A), l2e::le_mat_t(B), beta, C_); + gemmt(uploC, 'N', 'T', -alpha, l2e::le_mat_t(B), l2e::le_mat_t(A), T(1), C_); } else { - gemmt(uploc, BLIS_TRANSPOSE, BLIS_NO_TRANSPOSE, m, k, alpha, _A, ldA, _B, ldB, beta, _C, ldC); - gemmt(uploc, BLIS_TRANSPOSE, BLIS_NO_TRANSPOSE, m, k,-alpha, _B, ldB, _A, ldA, one , _C, ldC); + gemmt(uploC, 'T', 'N', alpha, l2e::le_mat_t(A), l2e::le_mat_t(B), beta, C_); + gemmt(uploC, 'T', 'N', -alpha, l2e::le_mat_t(B), l2e::le_mat_t(A), T(1), C_); } } diff --git a/src/pfupdates/skslc.tcc b/src/pfupdates/skslc.tcc index 0fd5a4cf..084c965d 100644 --- a/src/pfupdates/skslc.tcc +++ b/src/pfupdates/skslc.tcc @@ -6,28 +6,29 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ -#include "colmaj.hh" +#pragma once +#include "lapack2eigen.hh" -template -inline signed skslc(uplo_t uploA, - unsigned n, - unsigned i, - T *x, - T *_A, unsigned ldA) +template +inline Eigen::Matrix + skslc(const char &uploA, unsigned i, const Mat &A) { - colmaj A(_A, ldA); + Eigen::Matrix x(A.rows()); + x[i] = 0.0; switch (uploA) { - case BLIS_UPPER: + case 'U': + case 'u': for (unsigned j = 0; j < i; ++j) x[j] = A(j, i); - for (unsigned j = i+1; j < n; ++j) + for (unsigned j = i+1; j < A.rows(); ++j) x[j] = -A(i, j); - return 0; + break; default: std::cerr << "SKSLC: Lower triangular storage not implemented. Sorry." << std::endl; - return err_info(Pfaffine_NOT_IMPLEMNTED, 0); + break;; } + return x; } diff --git a/src/pfupdates/testupdate.cc b/src/pfupdates/testupdate.cc deleted file mode 100644 index 8f1c237b..00000000 --- a/src/pfupdates/testupdate.cc +++ /dev/null @@ -1,70 +0,0 @@ -/** - * \copyright Copyright (c) Dept. Phys., Univ. Tokyo - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at https://mozilla.org/MPL/2.0/. - */ -#include "updated_tdi.tcc" -#include -#include -#include - -struct info_t { - double err; - long long tim; -}; - -template -info_t test_tdi_update(dim_t nsite, std::vector cfg, std::vector from, std::vector to) -{ - dim_t nelec = cfg.size(); - dim_t mmax = from.size(); - T err = 0; - long long tim; - - T *orb__ = new T[nsite * nsite]; - colmaj orb_(orb__, nsite); - orbital_mat orb(BLIS_UPPER, nsite, orb_); - orb.randomize(0.6, 1234); - - T *mat_ = new T[nelec * nelec]; - updated_tdi tdi(orb, cfg, mat_, nelec, mmax); - - for (dim_t i = 0; i < mmax; ++i) { - if (from.at(i) < 0) - tdi.pop_update(true); - else - tdi.push_update_safe(to.at(i), from[i], true); - - std::vector cfg2(tdi.elem_cfg); - // Apply hopping. - for (dim_t j = 0; j < tdi.from_idx.size(); ++j) - cfg2.at(tdi.from_idx.at(j)) = tdi.to_site.at(j); - - T *mat2_ = new T[nelec * nelec]; - T pfa2 = updated_tdi(orb, cfg2, mat2_, nelec, 1).Pfa; - - double err_ = std::abs(pfa2 - tdi.Pfa * tdi.PfaRatio); - printf("Pfa = %e; Pfa diff = %e\n", tdi.Pfa, err_); - err += err_; - - delete[] mat2_; - } - - delete[] mat_; - delete[] orb__; - - return info_t{ err, tim }; -} - -int main(void) -{ - test_tdi_update(200, - std::vector{ 8, 63, 20, 53, 57, 14, 7, 22, 32, 67, 56, 66, 46, 16, 18, 26, 70, 28, 51, 64, 17, 68, 49, 4, 48, 41, 43, 21, 19, 45, 58, 11, 1, 24, 2, 69, 12, 23, 30, 10, 65, 3, 31, 55, 52, 37, 42, 34, 25, 60, 59, 50, 40, 9, 44, 54, 6, 36, 13, 29, 27, 38, 61, 33, 47, 5, 15, 39, 62, 35 }, - std::vector{ 0, 39, 49, 10, 60, 3 }, - std::vector{ 99, 101, 120, 199, 133, 177 }); - - return 0; -} - diff --git a/src/pfupdates/updated_tdi.tcc b/src/pfupdates/updated_tdi.tcc index 40a338af..5eda1701 100644 --- a/src/pfupdates/updated_tdi.tcc +++ b/src/pfupdates/updated_tdi.tcc @@ -6,9 +6,7 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "orbital_mat.tcc" -#include "blalink.hh" -#include "blalink_gemmt.hh" +#include "config_manager.tcc" #include "pfaffian.tcc" #include "invert.tcc" #include "skr2k.tcc" @@ -17,52 +15,72 @@ #include #include -template struct updated_tdi { - orbital_mat &Xij; - uplo_t uplo; ///< Can be switched only with update_uplo(). - const dim_t nelec; - const dim_t mmax; - dim_t nq_updated; // Update of Q(:, i) can be delayed against U. - - // Performance tuning constants. +namespace vmc +{ +namespace orbital +{ +template +struct updated_tdi { + using orbital_t = orbital_t_; + using index_t = vmc::config_manager::index_t; + using amp_t = amp_t_; + using T = typename orbital_t::T; + using invmat_t = typename orbital_t::matrix_t; ///< Just for mVMC's C wrapper. Mathematically not so good. + using matrix_t = Eigen::Matrix; + using submat_t = Eigen::Block; + using vector_t = Eigen::Matrix; + using idxvec_t = Eigen::VectorXi; + + orbital_t &Xij; + const index_t nelec; ///< Total Fermionic number / Matrix size. + const index_t mmax; ///< Max number of updates contained. const bool single_hop_alpha; ///< Whether to simplify k=1 situations. +private: + char uplo; ///< Can be switched only with update_uplo(). + index_t nq_updated; // Update of Q(:, i) can be delayed against U. + // Matrix M. - matrix_t M; + invmat_t &M; ///< (nelec, nelec) // Updator blocks U, inv(M)U and inv(M)V. // Note that V is not stored. - matrix_t U; ///< Serves also as B*inv - matrix_t Q; ///< Contains the whole B buffer. Assert![ Q = M U ] - matrix_t P; ///< Q(0, mmax) + matrix_t U; ///< Serves also as B*inv + matrix_t Q; ///< Contains the whole B buffer. Assert![ Q = M U ] + // This member causes pointer problem on mixed precision. + // (Which was fixed by constructing Block objects on-the-fly.) + // TODO: Check its reason. + // submat_t P; ///< &Q(0, mmax). // Central update buffer and its blocks. - matrix_t W; - matrix_t UMU, UMV, VMV; - matrix_t Cp; ///< C+BMB = [ W -I; I 0 ] + [ UMU UMV; -UMV VMV ] - matrix_t Gc; ///< Used to be Gaussian vectors. Now just scratchpads. - signed *const cPov; ///< Pivot when tri-diagonalizing C. - T Pfa; - T PfaRatio; //< Updated Pfaffian / Base Pfaffian. - // TODO: Maybe one should log all Pfaffian histories. - - std::vector elem_cfg; - std::vector from_idx; - std::vector to_site; + matrix_t W; + matrix_t UMU, UMV, VMV; + matrix_t Cp; ///< C+BMB = [ W -I; I 0 ] + [ UMU UMV; -UMV VMV ] + matrix_t Gc; ///< Used to be Gaussian vectors. Now just scratchpads. + idxvec_t cPiv; ///< Pivot when tri-diagonalizing C. + amp_t Pfa; + amp_t PfaRatio; //< Updated Pfaffian / Base Pfaffian. + // TODO: Maybe one should log partially the computed Pfaffian history. + + // Configuration. + vmc::config_manager elem_cfg; + +public: void initialize() { using namespace std; - auto &cfg = elem_cfg; - matrix_t G(new T[nelec * nelec], nelec); - from_idx.clear(); - to_site.clear(); - from_idx.reserve(mmax * sizeof(dim_t)); - to_site.reserve(mmax * sizeof(dim_t)); + elem_cfg.merge_config(); + elem_cfg.reserve_space(mmax); + const auto &cfg = elem_cfg.config_base(); + matrix_t G(nelec, nelec); + nq_updated = 0; switch (uplo) { - case BLIS_UPPER: - for (dim_t j = 0; j < nelec; ++j) { - for (dim_t i = 0; i < j; ++i) { + case 'U': + case 'u': + for (index_t j = 0; j < nelec; ++j) { + for (index_t i = 0; i < j; ++i) { M(i, j) = Xij(cfg.at(i), cfg.at(j)); } M(j, j) = T(0.0); @@ -70,104 +88,103 @@ template struct updated_tdi { break; default: - cerr << "updated_tdi: BLIS_LOWER is not supported. The error is fetal." + cerr << "updated_tdi: Lower triangular is not implemented yet." << endl; } // Allocate scratchpad. - T *vT = new T[nelec-1]; - signed *iPiv = new signed[nelec]; + vector_t vT(nelec - 1); + idxvec_t iPiv(nelec); + l2e::le_mat_t M_(M); // Perform LTL factorization regardless of `uplo`. - signed info = sktrf(uplo, nelec, &M(0, 0), M.ld, iPiv, &G(0, 0), G.ld * nelec); - Pfa = ltl2pfa(uplo, nelec, &M(0, 0), M.ld, iPiv); - ltl2inv(uplo, nelec, &M(0, 0), M.ld, iPiv, vT, &G(0, 0), G.ld); -#ifdef _DEBUG - cerr << "SKPFA+INV: n=" << nelec << " info=" << info << endl; -#endif - - delete[] vT; - delete[] iPiv; - delete[](&G(0, 0)); + signed info = sktrf(uplo, 'N', M_, &iPiv(0), &G(0, 0), G.size()); + Pfa = ltl2pfa(uplo, M, iPiv); + PfaRatio = 1.0; // Reset Pfaffian ratio. + ltl2inv(uplo, M, iPiv, vT, G); } - ~updated_tdi() { - delete[](&U(0, 0)); - delete[](&Q(0, 0)); - - delete[](&Cp(0, 0)); - delete[](&Gc(0, 0)); + updated_tdi(orbital_t &Xij_, invmat_t &M_, index_t nelec_, index_t mmax_, std::vector cfg) + : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), single_hop_alpha(true), + M(M_), U(nelec, mmax * 2), Q(nelec, mmax * 2), + W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax), + // Cp, Gc and cPiv are dynamically allocated. + Pfa(0.0), PfaRatio(1.0), elem_cfg(Xij_.norb()/2, cfg), uplo('U') + { initialize(); } - delete[](&W(0, 0)); - delete[](&UMU(0, 0)); - delete[](&UMV(0, 0)); - delete[](&VMV(0, 0)); - - delete[] cPov; + // Unsafe construction without initializaion. + updated_tdi(orbital_t &Xij_, invmat_t &M_, index_t nelec_, index_t mmax_) + : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), single_hop_alpha(true), + M(M_), U(nelec, mmax * 2), Q(nelec, mmax * 2), + W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax), + Pfa(0.0), PfaRatio(1.0), elem_cfg(Xij_.norb()/2), uplo('U') { } + + // Copy construction could be unsafe since it does not always contain initializaion. + // M is automatically allocated in this case. + updated_tdi(const updated_tdi &tdi_) = delete; + /* + : Xij(tdi_.Xij), nelec(tdi_.nelec), mmax(tdi_.mmax), nq_updated(0), single_hop_alpha(true), + M(new invmat_t(...)), U(nelec, mmax * 2), Q(nelec, mmax * 2), + W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax), + Pfa(0.0), PfaRatio(1.0), elem_cfg(tdi_.Xij.norb()/2, tdi_.get_config()), uplo('U') { + if (elem_cfg.config().size() == nelec && + elem_cfg.config(0) != elem_cfg.config(1) && // Short-circuit complicated check. + is_perm(elem_cfg.config())) + initialize(); + } */ + + bool is_perm(const vmc::config_manager::base_t &cfg) const + { + std::vector occupied(Xij.norb(), false); + for (const auto &xi : cfg) { + if (xi > Xij.norb()) + return false; + if (occupied.at(xi)) + return false; + occupied.at(xi) = true; + } + return true; } - updated_tdi(orbital_mat &Xij_, std::vector &cfg, T *M_, inc_t ldM, - dim_t mmax_) - : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0), - single_hop_alpha(true), M(M_, ldM), - U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec), - P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax), - UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax), - VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax), - Gc(new T[2 * mmax * 2 * mmax], 2 * mmax), - cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(cfg), - from_idx(0), to_site(0), uplo(BLIS_UPPER) { - initialize(); + amp_t get_amplitude() { return Pfa * get_amplitude_ratio(); } + amp_t get_amplitude_ratio() { + if (PfaRatio == 0.0) + update_pfaffian_ratio(); + return PfaRatio; } + int max_updates() const { return mmax; } + int num_updates() const { return elem_cfg.from_idx().size(); } - /* - updated_tdi(orbital_mat &Xij_, std::vector &cfg, matrix_t &M_, - dim_t mmax_) - : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0), - single_hop_alpha(true), M(M_), - U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec), - P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax), - UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax), - VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax), - Gc(new T[2 * mmax * 2 * mmax], 2 * mmax), - cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(cfg), - from_idx(0), to_site(0), uplo(BLIS_UPPER) { - initialize(); - }*/ + const vmc::config_manager::base_t get_config() const + { return elem_cfg.config(); } - // Unsafe construction without initializaion. - updated_tdi(orbital_mat &Xij_, dim_t nelec_, T *M_, inc_t ldM, dim_t mmax_) - : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), - single_hop_alpha(true), M(M_, ldM), - U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec), - P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax), - UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax), - VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax), - Gc(new T[2 * mmax * 2 * mmax], 2 * mmax), - cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(nelec, 0), - from_idx(0), to_site(0), uplo(BLIS_UPPER) { } - - T get_Pfa() { return Pfa * PfaRatio; } + const vmc::config_manager::base_t &get_config_base() const + { return elem_cfg.config_base(); } + + const vmc::config_manager &get_config_manager() const + { return elem_cfg; } + + void attach_config(const vmc::config_manager::base_t &cfg) + { + assert_(cfg.size() == nelec, typeid(*this).name(), "Invalid config feeded."); + elem_cfg.attach_config(cfg); + } void assemble_C_BMB() { using namespace std; - dim_t k = from_idx.size(); + using namespace Eigen; + index_t k = elem_cfg.from_idx().size(); + if (!k) + return; // Assemble (half of) C+BMB buffer. + Cp = matrix_t(2 * k, 2 * k); switch (uplo) { - case BLIS_UPPER: - for (dim_t j = 0; j < k; ++j) - for (dim_t i = 0; i < j; ++i) - Cp(i, j) = W(i, j) + UMU(i, j); - for (dim_t j = 0; j < k; ++j) { - for (dim_t i = 0; i < k; ++i) - Cp(i, j + k) = +UMV(i, j); - - Cp(j, j + k) -= T(1.0); - } - for (dim_t j = 0; j < k; ++j) - for (dim_t i = 0; i < j; ++i) - Cp(i + k, j + k) = VMV(i, j); + case 'U': + case 'u': + // Left-lower block skipped due to uplo='U'. + Cp << W(seq(0, k-1), seq(0, k-1)) + UMU(seq(0, k-1), seq(0, k-1)), UMV(seq(0, k-1), seq(0, k-1)) - matrix_t::Identity(k, k), + matrix_t::Zero(k, k), VMV(seq(0, k-1), seq(0, k-1)); break; default: @@ -177,12 +194,15 @@ template struct updated_tdi { } // Update Q rows buffer. - bool require_Q(bool all) { - const dim_t k = from_idx.size(); - const dim_t n = nelec; - dim_t k_cal; + bool require_Q(bool require_all) { + using namespace Eigen; + using namespace l2e; + + const index_t k = elem_cfg.from_idx().size(); + const index_t n = nelec; + index_t k_cal; // Require all K columns instead of first K-1. - if (all) + if (require_all) k_cal = k; else k_cal = k - 1; @@ -193,12 +213,12 @@ template struct updated_tdi { for (; nq_updated < k_cal; ++nq_updated) { // Update single column. Use SKMV. - skmv(uplo, n, T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), &Q(0, nq_updated)); + Q(all, nq_updated) = skmv(uplo, T(1.0), M, U(all, nq_updated)); } /* else { // Update multiple columns. Use SKMM. - skmm(BLIS_LEFT, uplo, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, n, k_cal - nq_updated, - T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), U.ld, - T(0.0), &Q(0, nq_updated), Q.ld); + skmm('L', uplo, 'N', 'N', + T(1.0), le_mat_t(M), le_mat_tU(all, seq(nq_updated, k_cal-1)), + T(0.0), le_mat_t(Q(all, seq(nq_updated, k_cal-1)))); } nq_updated = k_cal; */ @@ -208,18 +228,22 @@ template struct updated_tdi { // UMU needs to be recalculated for hopping change. // TODO: Find some way to avoid recalculating UQ? bool require_UMU() { - const dim_t k = from_idx.size(); - const dim_t n = nelec; + using namespace Eigen; - for (dim_t o = 0; o < k; ++o) - for (dim_t l = 0; l < o; ++l) + const index_t k = elem_cfg.from_idx().size(); + const index_t n = nelec; + + for (index_t o = 0; o < k; ++o) + for (index_t l = 0; l < o; ++l) switch (uplo) { - case BLIS_UPPER: - UMU(l, o) = -dot(n, &U(0, o), 1, &Q(0, l), 1); + case 'U': + case 'u': + UMU(l, o) = -(U(all, o).transpose() * Q(all, l))(0, 0); break; - case BLIS_LOWER: + case 'L': + case 'l': // Always assume l < o to calculate one less column of Q. - UMU(o, l) = dot(n, &U(0, o), 1, &Q(0, l), 1); + UMU(o, l) = (U(all, o).transpose() * Q(all, l))(0, 0); break; default: break; @@ -227,101 +251,95 @@ template struct updated_tdi { return true; } - // Inverts a configuration from fermion index to site index. - void config_to_site(std::vector &cfg, std::vector &site, bool init) { - using namespace std; - - if (site.size() != Xij.nsite || cfg.size() != nelec) { - cerr << "updated_tdi::inv_config:" - << " Invalid config / invert config buffer." << endl; - return; - } - - if (init) - for (dim_t i = 0; i < site.size(); ++i) - site.at(i) = -1; - for (dim_t i = 0; i < cfg.size(); ++i) - site.at(cfg.at(i)) = i; - - return; - } - // Update osi <- os[msj]. // i.e. c+_i c_{x_j}. - void push_update(dim_t osi, dim_t msj, bool compute_pfa) { + void push_update(index_t osi, index_t msj, bool compute_pfa) { using namespace std; + using namespace Eigen; // This is the k-th hopping. - dim_t n = nelec; - dim_t k = from_idx.size(); - dim_t osj = elem_cfg.at(msj); + index_t n = nelec; + index_t k = elem_cfg.from_idx().size(); + index_t osj = elem_cfg.config_base(msj); - // TODO: Check bounds, duplicates, etc. - from_idx.push_back(msj); - to_site.push_back(osi); + elem_cfg.push_update(osi, msj); - // TODO: Check for hopping-backs. + // TODO: Check for hopping-backs. // This can only be handled by cancellation. // Singularity will emerge otherwise. - // Speed-up lookup of Xij. - std::vector sitecfg(Xij.nsite, -1); - config_to_site(elem_cfg, sitecfg, false); - for (dim_t i = 0; i < Xij.nsite; ++i) - // U(i, k) = Xij(elem_cfg.at(i), osi) - Xij(elem_cfg.at(i), osj); - if (sitecfg.at(i) >= 0) - // Direct access: special case when installed into VMC. - U(sitecfg.at(i), k) = Xij.X(i, osi) - Xij.X(i, osj); + // TODO: Speed-up lookup of Xij. + for (index_t i = 0; i < n; ++i) + U(i, k) = Xij(elem_cfg.config_base(i), osi) - Xij(elem_cfg.config_base(i), osj); + // for (index_t i = 0; i < Xij.norb(); ++i) + // if (elem_cfg.inv(i) >= 0) + // U(elem_cfg.inv(i), k) = Xij(i, osi) - Xij(i, osj); + + // P matrix from latter half of Q (or rigorously speaking, B matrix). + submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1))); - skslc(uplo, n, msj, &P(0, k), &M(0, 0), M.ld); + P(all, k) = skslc(uplo, msj, M); // Updated the already logged U. - for (dim_t l = 0; l < k; ++l) { - dim_t msl = from_idx.at(l); + for (index_t l = 0; l < k; ++l) { + index_t msl = elem_cfg.from_idx().at(l); T U_jl_ = U(msj, l); ///< Backup this value before change. // Write updates. - U(msl, k) = Xij(to_site.at(l), osi) - Xij(elem_cfg.at(msl), osj); - U(msj, l) = Xij(osi, to_site.at(l)) - Xij(osj, elem_cfg.at(msl)); + U(msl, k) = Xij(elem_cfg.to_orb().at(l), osi) - Xij(elem_cfg.config_base(msl), osj); + U(msj, l) = Xij(osi, elem_cfg.to_orb().at(l)) - Xij(osj, elem_cfg.config_base(msl)); // Change of Q[:, l] induced by change of U[msj, l]; // Utilize that P[:, k] = M[:, msj] ///< M is skew-symmetric. - axpy(n, U(msj, l) - U_jl_, &P(0, k), 1, &Q(0, l), 1); + Q(all, l) += P(all, k) * (U(msj, l) - U_jl_); // Change of UMV[l, k] induced by change of U[msj, l]. - for (dim_t o = 0; o < k; ++o) + for (index_t o = 0; o < k; ++o) UMV(l, o) += (U(msj, l) - U_jl_) * P(msj, o); } // Update UMV and VMV. UMU is updated elsewhere. switch (uplo) { - case BLIS_UPPER: - for (dim_t l = 0; l < k; ++l) - W(l, k) = -Xij(to_site.at(l), osi) + Xij(elem_cfg.at(from_idx.at(l)), osj); - - for (dim_t l = 0; l < k; ++l) { - UMV(l, k) = dot(n, &U(0, l), 1, &P(0, k), 1); - UMV(k, l) = dot(n, &U(0, k), 1, &P(0, l), 1); + case 'U': + case 'u': + for (index_t l = 0; l < k; ++l) + W(l, k) = -Xij(elem_cfg.to_orb(l), osi) + Xij(elem_cfg.config_base(elem_cfg.from_idx(l)), osj); + + for (index_t l = 0; l < k; ++l) { + UMV(l, k) = (U(all, l).transpose() * P(all, k))(0, 0); + UMV(k, l) = (U(all, k).transpose() * P(all, l))(0, 0); } - UMV(k, k) = dot(n, &U(0, k), 1, &P(0, k), 1); - for (dim_t l = 0; l < k; ++l) + UMV(k, k) = (U(all, k).transpose() * P(all, k))(0, 0); + for (index_t l = 0; l < k; ++l) VMV(l, k) /* -VMV(k, l) */ = -P(msj, l); ///< P(from_idx.at(l), k); break; default: - cerr << "updated_tdi::push_update:" + cerr << "updated_tdi::push_update:" << " Only upper-triangular storage is supported." << endl; } if (compute_pfa) { - // NOTE: Update k to be new size. - k += 1; + // NOTE: Update k to be new size? + // k += 1; - // Calculate unupdated columns of U. + update_pfaffian_ratio(); + } else + // Set to 0.0 to denote dirty. + PfaRatio = 0.0; + } + + void update_pfaffian_ratio(void) + { + index_t k = elem_cfg.from_idx().size(); + submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1))); + { + // All compute_pfa requires first K-1 of Q. require_Q(false); + // Recompute UMU. require_UMU(); - // Assemble (half of) C+BMB buffer. + // Reassemble C and scratchpads. assemble_C_BMB(); // If it's the first update Pfafian can be directly read out. @@ -331,67 +349,76 @@ template struct updated_tdi { } // Compute pfaffian. - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &PfaRatio, cPov, &Gc(0, 0), Gc.ld * 2 * k); -#ifdef _DEBUG - cerr << "SKPFA: info=" << info << endl; -#endif + l2e::le_mat_t Cp_(Cp); + Gc = matrix_t::Zero(2*k, 2*k); + cPiv = idxvec_t::Zero(2*k); + // Use not skpfa for mixed precision. + signed info = sktrf(uplo, 'P', Cp_, &cPiv(0), &Gc(0, 0), Gc.size()); + PfaRatio = ltl2pfa(uplo, Cp, cPiv); + // Pfaffian of C = [ W -I; I 0 ]. PfaRatio *= pow(-1.0, k * (k + 1) / 2); + } + } - } else - // Set to 0.0 to denote dirty. - PfaRatio = 0.0; + bool check_update_safety(const std::vector &ms, bool merge_error = false) + { + if (elem_cfg.from_idx().size() + ms.size() > mmax) + return false; + + // Check if alrady hopped out. + for (index_t l = 0; l < elem_cfg.from_idx().size(); ++l) + for (index_t j = 0; j < ms.size(); ++j) + if (ms.at(j) == elem_cfg.from_idx().at(l)) { + assert_(!merge_error, typeid(*this).name(), "Conflict on non-mergable push."); + return false; + } + return true; } - void push_update(dim_t osi, dim_t msj) { push_update(osi, msj, true); } + void push_update(index_t osi, index_t msj) { push_update(osi, msj, true); } // Check duplicate and push. - void push_update_safe(dim_t osi, dim_t msj, bool compute_pfa) { - if (from_idx.size() >= mmax) + void push_update_safe(index_t osi, index_t msj, bool compute_pfa, bool merge_error = false) { + if (!check_update_safety({ msj })) merge_updates(); - // Check if already hopped out. - for (dim_t l = 0; l < from_idx.size(); ++l) - if (msj == from_idx.at(l)) { - merge_updates(); - break; - } push_update(osi, msj, compute_pfa); } - void push_update_safe(dim_t osi, dim_t msj) { + void push_update_safe(index_t osi, index_t msj) { push_update_safe(osi, msj, true); } void pop_update(bool compute_pfa) { using namespace std; + using namespace Eigen; // Pop out. - dim_t msj_ = from_idx.at(from_idx.size() - 1); - dim_t osi_ = to_site.at(to_site.size() - 1); - dim_t osj_ = elem_cfg.at(msj_); - from_idx.pop_back(); - to_site.pop_back(); + index_t msj_ = elem_cfg.from_idx().back(); + index_t osi_ = elem_cfg.to_orb().back(); + index_t osj_ = elem_cfg.config_base(msj_); + elem_cfg.pop_update(); // New update size. - dim_t k = from_idx.size(); + index_t k = elem_cfg.from_idx().size(); + submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1))); // Revert U[:, 1:k-1] update. - for (dim_t l = 0; l < k; ++l) { - dim_t msl = from_idx.at(l); + for (index_t l = 0; l < k; ++l) { + index_t msl = elem_cfg.from_idx(l); T U_jl_ = U(msj_, l); // Revert U rows. (final column is popped out). - U(msj_, l) = Xij(osj_, to_site.at(l)) - Xij(osj_, elem_cfg.at(msl)); + U(msj_, l) = Xij(osj_, elem_cfg.to_orb(l)) - Xij(osj_, elem_cfg.config_base(msl)); // Change of Q[:, l] induced by change of U[msj, l]; T U_diff_msj = U(msj_, l) - U_jl_; switch (uplo) { - case BLIS_UPPER: - for (dim_t i = 0; i < msj_; ++i) - Q(i, l) += M(i, msj_) * U_diff_msj; - for (dim_t i = msj_ + 1; i < nelec; ++i) - Q(i, l) -= M(msj_, i) * U_diff_msj; + case 'U': + case 'u': + Q(seq(0, msj_-1), l) += M(seq(0, msj_-1), msj_) * U_diff_msj; + Q(seq(msj_+1, nelec-1), l) -= M(msj_, seq(msj_+1, nelec-1)) * U_diff_msj; break; default: @@ -400,7 +427,7 @@ template struct updated_tdi { } // Change of UMV[l, k] induced by change of U[msj, l]. - for (dim_t o = 0; o < k; ++o) + for (index_t o = 0; o < k; ++o) UMV(l, o) += (U(msj_, l) - U_jl_) * P(msj_, o); } // Pop out outdated Q. @@ -408,19 +435,10 @@ template struct updated_tdi { nq_updated = k; // Compute new (previous, in fact) Pfaffian. - if (!k) + if (k == 0) PfaRatio = 1.0; else if (compute_pfa) { - // All compute_pfa requires first K-1 of Q. - require_Q(false); - - // Recompute UMU. - require_UMU(); - // Reassemble C and scratchpads. - assemble_C_BMB(); - - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &PfaRatio, cPov, &Gc(0, 0), Gc.ld * 2 * k); - PfaRatio *= pow(-1.0, k * (k + 1) / 2); + update_pfaffian_ratio(); } else // Set to 0.0 to denote dirty. @@ -430,20 +448,20 @@ template struct updated_tdi { void merge_updates() { using namespace std; - dim_t n = nelec; - dim_t k = from_idx.size(); + index_t n = nelec; + index_t k = elem_cfg.from_idx().size(); if (k == 0) return; // Allocate scratchpad. - T *vT = new T[2 * k - 1]; + vector_t vT(2 * k - 1); // Update whole Q. require_Q(true); if (k == 1) { // M is dirty. - if (PfaRatio == T(0.0)) { + if (PfaRatio == 0.0) { require_UMU(); assemble_C_BMB(); PfaRatio = -Cp(0, 1); @@ -455,34 +473,38 @@ template struct updated_tdi { // Redo the tridiagonal factorization. require_UMU(); assemble_C_BMB(); - signed info = sktrf(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov, &Gc(0, 0), Gc.ld * 2 * k); + + l2e::le_mat_t Cp_(Cp); + Gc = matrix_t::Zero(2*k, 2*k); + cPiv = idxvec_t::Zero(2*k); + signed info = sktrf(uplo, 'N', Cp_, &cPiv(0), &Gc(0, 0), 2*k * 2*k); // PfaRatio is dirty. - if (PfaRatio == T(0.0)) - PfaRatio = ltl2pfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov) * T(pow(-1.0, k * (k + 1) / 2)); - ltl2inv(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov, vT, &Gc(0, 0), Gc.ld); + if (PfaRatio == 0.0) + PfaRatio = ltl2pfa(uplo, Cp, cPiv) * + T(pow(-1.0, k * (k + 1) / 2)); + ltl2inv(uplo, Cp, cPiv, vT, Gc); } - inv_update(k, Cp); + assert_(Cp.rows() == 2 * k, typeid(*this).name(), "Cp is bad."); + inv_update(Cp); // Apply hopping. - for (int j = 0; j < k; ++j) - elem_cfg.at(from_idx.at(j)) = to_site.at(j); - from_idx.clear(); - to_site.clear(); + elem_cfg.merge_config(); nq_updated = 0; Pfa *= PfaRatio; PfaRatio = 1.0; - - delete[] vT; } - void inv_update(dim_t k, matrix_t &C) { - matrix_t ABC(&U(0, 0), U.ld); ///< Use U as inv(A)*B*upper(C) buffer. - matrix_t AB(&Q(0, 0), Q.ld); + void inv_update(matrix_t &C) { + using namespace Eigen; + const index_t k = C.rows() / 2; + submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1))); + + auto ABC = U(all, seq(0, 2 * k - 1)); ///< Use U as inv(A)*B*upper(C) buffer. + auto AB = Q(all, seq(0, 2 * k - 1)); if (k == 1 && single_hop_alpha) { // k == 1 requires no copying - skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 1, C(0, 1), &Q(0, 0), Q.ld, - &P(0, 0), P.ld, T(1.0), &M(0, 0), M.ld); + skr2k(uplo, 'N', C(0, 1), Q(all, 0), P(all, 0), T(1.0), M); // See below for reason of calling this procedule. // update_uplo(uplo); return; @@ -490,39 +512,15 @@ template struct updated_tdi { // Close empty space between Q and P. if (k != mmax) - for (dim_t j = 0; j < k; ++j) - memcpy(&AB(0, k + j), &P(0, j), nelec * sizeof(T)); - - # if 0 - // Copy AB to ABC for TRMM interface. - for (dim_t j = 0; j < 2 * k - 1; ++j) - // inv(A)*U [ 0 + + + - // 0 0 + + - // 0 0 0 + - // 0 0 0 0 ] => AB[:, 0:2] -> ABC[:, 1:3] - memcpy(&ABC(0, j + 1), &AB(0, j), nelec * sizeof(T)); - trmm(BLIS_RIGHT, BLIS_UPPER, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), - &C(0, 1), C.ld, &ABC(0, 1), ABC.ld); - - // Update: write to M. - skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &ABC(0, 1), ABC.ld, - &AB(0, 1), AB.ld, T(1.0), &M(0, 0), M.ld); - #else - gemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, - nelec, 2 * k, 2 * k, - T(1.0), - &AB(), AB.ld, - &C(), C.ld, - T(0.0), - &ABC(), ABC.ld); - gemmt(uplo, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, - nelec, 2 * k, - T(1.0), - &ABC(), ABC.ld, - &AB(), AB.ld, - T(1.0), - &M(), M.ld); - #endif + for (index_t j = 0; j < k; ++j) + AB(all, k + j) = P(all, j); + + l2e::le_mat_t M_(M); + ABC = AB * C; + gemmt(uplo, 'N', 'T', T(1.0), + l2e::le_mat_t(ABC), + l2e::le_mat_t(AB), + T(1.0), M_); // Identity update to complete antisymmetric matrix. // This called due to lack of skmm support at the moment. @@ -532,15 +530,19 @@ template struct updated_tdi { /** * Complete antisymmetric matrix. */ - void skcomplete(uplo_t uplo_, dim_t n, matrix_t &A) { - for (dim_t j = 0; j < n; ++j) { - for (dim_t i = 0; i < j; ++i) { + static void skcomplete(const char &uplo_, matrix_t &A) { + const index_t n = A.rows(); + + for (index_t j = 0; j < n; ++j) { + for (index_t i = 0; i < j; ++i) { switch (uplo_) { - case BLIS_UPPER: + case 'U': + case 'u': A(j, i) = -A(i, j); break; - case BLIS_LOWER: + case 'L': + case 'l': A(i, j) = -A(j, i); break; @@ -552,15 +554,159 @@ template struct updated_tdi { } } - void update_uplo(uplo_t uplo_new) { - skcomplete(uplo /* NOTE: old uplo */, nelec, M); - if (from_idx.size()) { - skcomplete(uplo, from_idx.size(), UMU); - skcomplete(uplo, from_idx.size(), VMV); - skcomplete(uplo, from_idx.size(), W); + void update_uplo(const char &uplo_new) { + using namespace Eigen; + + skcomplete(uplo /* NOTE: Old uplo. */, M); + const int k = elem_cfg.from_idx().size(); + if (k) { + skcomplete(uplo, UMU(seq(0, k-1), seq(0, k-1))); + skcomplete(uplo, VMV(seq(0, k-1), seq(0, k-1))); + skcomplete(uplo, W(seq(0, k-1), seq(0, k-1))); } uplo = uplo_new; } -}; + Eigen::Vector batch_query_amplitudes(int N, idxvec_t to_orbs, idxvec_t from_ids) { + using namespace Eigen; + using ampvec_t = Eigen::Vector; + + bool upper = (uplo == 'U' || uplo == 'u'); + int k0 = from_ids.size(); + int k1 = k0 / N; + assert_(k0 % N == 0, typeid(*this).name(), "Misaligned batch query."); + ampvec_t result(k1); + + // Works w/ merged M only. + merge_updates(); + // Operates on complete M. + skcomplete(uplo, M); + + matrix_t UbB; + if (N > 1) + UbB = matrix_t::Zero(nelec, k1 * (N-1)); + matrix_t UbF= matrix_t::Zero(nelec, k1); + matrix_t Pb = matrix_t::Zero(nelec, k0); + + // Grouped comput. of Ub. + for (index_t l1 = 0; l1 < k1; ++l1) { + for (index_t l = l1 * N; l < (l1 + 1) * N; ++l) + elem_cfg.push_update(to_orbs(l), from_ids(l)); + + int osi, osj; + + for (index_t l2 = 0; l2 < N-1; ++l2) { + index_t l = l1 * N + l2; + index_t lB = l1 * (N-1) + l2; + + osi = to_orbs(l); + osj = elem_cfg.config_base(from_ids(l)); + + for (index_t i = 0; i < nelec; ++i) { + index_t osx = i == from_ids(l) ? elem_cfg.config_base(i) : elem_cfg.config(i); + UbB(i, lB) = Xij(osx, osi) - Xij(elem_cfg.config_base(i), osj); + } + } + + // Final column in the grouped U. + { + index_t l = l1 * N + N - 1; + osi = to_orbs(l); + osj = elem_cfg.config_base(from_ids(l)); + + for (index_t i = 0; i < nelec; ++i) { + index_t osx = i == from_ids(l) ? elem_cfg.config_base(i) : elem_cfg.config(i); + UbF(i, l1) = Xij(osx, osi) - Xij(elem_cfg.config_base(i), osj); + } + } + + for (index_t ll = 0; ll < N; ++ll) + elem_cfg.pop_update(); + } + + // Pb needs no grouping. + for (index_t l = 0; l < k0; ++l) + Pb(all, l) = M(all, from_ids(l)); + + matrix_t QbB; + if (N > 1) + // Require Q. + QbB = M * UbB; + + // Batch assemble C. + matrix_t Cb = matrix_t::Zero(2*N, 2*N * k1); + for (index_t l1 = 0; l1 < k1; ++l1) { + auto CCur = Cb(all, seq(2*N * l1, 2*N * (l1+1) - 1)); + auto UCur = UbB(all, seq((N-1) * l1, (N-1) * (l1+1) - 1)); + auto QCur = QbB(all, seq((N-1) * l1, (N-1) * (l1+1) - 1)); + auto PCur = Pb(all, seq(N * l1, N * (l1+1) - 1)); + + // UMU + W. + for (index_t o = 0; o < N-1; ++o) + for (index_t l = 0; l < o; ++l) + if (upper) + CCur(l, o) = -UCur(all, o).transpose() * QCur(all, l) - + Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) + + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), + elem_cfg.config_base(from_ids(l1 * N + o))); + else + CCur(o, l) = UCur(all, o).transpose() * QCur(all, l) - + Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) + + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), + elem_cfg.config_base(from_ids(l1 * N + o))); + { + for (index_t l = 0; l < N-1; ++l) + if (upper) + CCur(l, N-1) = -UbF(all, l1).transpose() * QCur(all, l) - + Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) + + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), + elem_cfg.config_base(from_ids(l1 * N + N-1))); + else + CCur(N-1, l) = UbF(all, l1).transpose() * QCur(all, l) - + Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) + + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), + elem_cfg.config_base(from_ids(l1 * N + N-1))); + } + + // UMV + if (upper) { + if (N > 1) + CCur(seq(0, N - 2), seq(N, 2*N - 1)) = UCur.transpose() * PCur; + CCur(N - 1, seq(N, 2*N - 1)) = UbF(all, l1).transpose() * PCur; + } else + assert_(false, typeid(*this).name(), "Lower diag not implemented."); + // -I + CCur(seq(0, N - 1), seq(N, 2*N - 1)) -= matrix_t::Identity(N, N); + + // VMV + for (index_t o = 0; o < N; ++o) + for (index_t l = 0; l < o; ++l) + if (upper) + CCur(N + l, N + o) = -PCur(from_ids(l1 * N + o), l); + else + CCur(N + o, N + l) = PCur(from_ids(l1 * N + o), l); + } + + // Batch Pfaffian call. + matrix_t Gc = matrix_t::Zero(2*N, 2*N); + idxvec_t cPiv = idxvec_t::Zero(2*N); + for (index_t l1 = 0; l1 < k1; ++l1) { + auto CCur = Cb(all, seq(2*N * l1, 2*N * (l1+1) - 1)); + if (N == 1) { + result(l1) = -CCur(0, 1) + T(1.0); + continue; + } + l2e::le_mat_t C_(CCur); + signed info = sktrf(uplo, 'P', C_, &cPiv(0), &Gc(0, 0), Gc.size()); + result(l1) = ltl2pfa(uplo, CCur, cPiv); + } + if (N > 1) + result = result * pow(-1.0, N * (N + 1) / 2); + + return result; + } + +}; +} +} From 26ae54635beaa07e3b172dc4bcdbbd8fe9736637 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 14:54:55 +0900 Subject: [PATCH 04/39] Adapt mVMC for Modified PfUpdates Interface --- src/mVMC/vmcmake.c | 14 ++++++++------ src/mVMC/vmcmake_fsz.c | 14 ++++++++------ src/mVMC/vmcmake_fsz_real.c | 14 ++++++++------ src/mVMC/vmcmake_real.c | 14 ++++++++------ 4 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/mVMC/vmcmake.c b/src/mVMC/vmcmake.c index 57ab2dd2..aa6e22ca 100644 --- a/src/mVMC/vmcmake.c +++ b/src/mVMC/vmcmake.c @@ -74,6 +74,8 @@ void VMCMakeSample(MPI_Comm comm) { // TODO: Compute from qpStart to qpEnd to support loop splitting. void *pfOrbital[NQPFull]; void *pfUpdator[NQPFull]; + void *pfMat[NQPFull]; + void *pfMap[NQPFull]; // Read block size from input. const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE"); if (optBlockSize) @@ -94,7 +96,7 @@ void VMCMakeSample(MPI_Comm comm) { InvM, Nsize*Nsize, TmpEleIdx, EleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator); #else CalculateMAll_fcmp(TmpEleIdx,qpStart,qpEnd); @@ -108,13 +110,13 @@ void VMCMakeSample(MPI_Comm comm) { qpStart,qpEnd,comm); #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize, SlaterElm, Nsite2*Nsite2, InvM, Nsize*Nsize, TmpEleIdx, EleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator); #else CalculateMAll_fcmp(TmpEleIdx,qpStart,qpEnd); @@ -276,13 +278,13 @@ void VMCMakeSample(MPI_Comm comm) { StartTimer(34); #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize, SlaterElm, Nsite2*Nsite2, InvM, Nsize*Nsize, TmpEleIdx, EleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator); #else CalculateMAll_fcmp(TmpEleIdx,qpStart,qpEnd); @@ -309,7 +311,7 @@ void VMCMakeSample(MPI_Comm comm) { #ifdef _pf_block_update // Free-up updator space. - updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); #endif return; diff --git a/src/mVMC/vmcmake_fsz.c b/src/mVMC/vmcmake_fsz.c index 404318e4..801ba523 100644 --- a/src/mVMC/vmcmake_fsz.c +++ b/src/mVMC/vmcmake_fsz.c @@ -83,6 +83,8 @@ void VMCMakeSample_fsz(MPI_Comm comm) { // TODO: Compute from qpStart to qpEnd to support loop splitting. void *pfOrbital[NQPFull]; void *pfUpdator[NQPFull]; + void *pfMat[NQPFull]; + void *pfMap[NQPFull]; // Read block size from input. const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE"); if (optBlockSize) @@ -100,7 +102,7 @@ void VMCMakeSample_fsz(MPI_Comm comm) { InvM, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator); #else CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -117,13 +119,13 @@ void VMCMakeSample_fsz(MPI_Comm comm) { #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize, SlaterElm, Nsite2*Nsite2, InvM, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator); #else CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -382,13 +384,13 @@ void VMCMakeSample_fsz(MPI_Comm comm) { StartTimer(34); #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize, SlaterElm, Nsite2*Nsite2, InvM, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator); #else CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -426,7 +428,7 @@ void VMCMakeSample_fsz(MPI_Comm comm) { #ifdef _pf_block_update // Free-up updator space. - updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); #endif return; diff --git a/src/mVMC/vmcmake_fsz_real.c b/src/mVMC/vmcmake_fsz_real.c index b76e6505..f3dda1d6 100644 --- a/src/mVMC/vmcmake_fsz_real.c +++ b/src/mVMC/vmcmake_fsz_real.c @@ -79,6 +79,8 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) { // TODO: Compute from qpStart to qpEnd to support loop splitting. void *pfOrbital[NQPFull]; void *pfUpdator[NQPFull]; + void *pfMat[NQPFull]; + void *pfMap[NQPFull]; // TODO: Make it input parameter. if (NExUpdatePath == 0) NBlockUpdateSize = 4; @@ -91,7 +93,7 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) { InvM_real, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator); #else CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -108,13 +110,13 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) { #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize, SlaterElm_real, Nsite2*Nsite2, InvM_real, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator); #else CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -368,13 +370,13 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) { StartTimer(34); #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize, SlaterElm_real, Nsite2*Nsite2, InvM_real, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator); #else CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -412,7 +414,7 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) { #ifdef _pf_block_update // Free-up updator space. - updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); #endif return; diff --git a/src/mVMC/vmcmake_real.c b/src/mVMC/vmcmake_real.c index 6c9bde7d..f567c04f 100644 --- a/src/mVMC/vmcmake_real.c +++ b/src/mVMC/vmcmake_real.c @@ -76,6 +76,8 @@ void VMCMakeSample_real(MPI_Comm comm) { // TODO: Compute from qpStart to qpEnd to support loop splitting. void *pfOrbital[NQPFull]; void *pfUpdator[NQPFull]; + void *pfMat[NQPFull]; + void *pfMap[NQPFull]; // Read block size from input. const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE"); if (optBlockSize) @@ -96,7 +98,7 @@ void VMCMakeSample_real(MPI_Comm comm) { InvM_real, Nsize*Nsize, TmpEleIdx, EleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator); #else CalculateMAll_real(TmpEleIdx, qpStart, qpEnd); @@ -110,13 +112,13 @@ void VMCMakeSample_real(MPI_Comm comm) { qpStart, qpEnd, comm); #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize, SlaterElm_real, Nsite2*Nsite2, InvM_real, Nsize*Nsize, TmpEleIdx, EleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator); #else CalculateMAll_real(TmpEleIdx, qpStart, qpEnd); @@ -283,13 +285,13 @@ void VMCMakeSample_real(MPI_Comm comm) { StartTimer(34); #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize, SlaterElm_real, Nsite2*Nsite2, InvM_real, Nsize*Nsize, TmpEleIdx, EleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator); #else CalculateMAll_real(TmpEleIdx, qpStart, qpEnd); @@ -316,7 +318,7 @@ void VMCMakeSample_real(MPI_Comm comm) { #ifdef _pf_block_update // Free-up updator space. - updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); #endif return; From 01b2b6061dbfcdf76d37474ad0d8cd35786e0711 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 15:10:08 +0900 Subject: [PATCH 05/39] Fix CI --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7d2322e3..e7b4a465 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,7 +12,7 @@ jobs: - name: apt run: | sudo apt update - sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev + sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev libeigen3-dev - name: pip run: | From 75efd6954804c36ab4975b9664425f6cb88ee0c1 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 15:29:58 +0900 Subject: [PATCH 06/39] Fix CI --- .github/workflows/main.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e7b4a465..85f74a72 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,13 +12,19 @@ jobs: - name: apt run: | sudo apt update - sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev libeigen3-dev + sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev wget - name: pip run: | python -m pip install numpy python3 -m pip install numpy + - name: eigen + working-directory: ${{runner.workspace}} + run: | + wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz + tar -zxvf eigen-3.4.0.tar.gz + - name: make workspace run: cmake -E make_directory ${{runner.workspace}}/build @@ -26,7 +32,7 @@ jobs: working-directory: ${{runner.workspace}}/build shell: bash run: | - cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE + cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH=${{runner.workspace}}/eigen-3.4.0 -DEigen3_DIR=${{runner.workspace}}/eigen-3.4.0/cmake $GITHUB_WORKSPACE - name: build working-directory: ${{runner.workspace}}/build From d50eac07207317dffe964798a55186ae6201526f Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 15:38:13 +0900 Subject: [PATCH 07/39] Fix CI --- .github/workflows/main.yml | 7 ++++++- src/ltl2inv/CMakeLists.txt | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 85f74a72..b2dcbf1e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -24,6 +24,11 @@ jobs: run: | wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz tar -zxvf eigen-3.4.0.tar.gz + mkdir eigen-build + cd eigen-build + cmake ../eigen-3.4.0 + make -j4 + sudo make install - name: make workspace run: cmake -E make_directory ${{runner.workspace}}/build @@ -32,7 +37,7 @@ jobs: working-directory: ${{runner.workspace}}/build shell: bash run: | - cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH=${{runner.workspace}}/eigen-3.4.0 -DEigen3_DIR=${{runner.workspace}}/eigen-3.4.0/cmake $GITHUB_WORKSPACE + cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE - name: build working-directory: ${{runner.workspace}}/build diff --git a/src/ltl2inv/CMakeLists.txt b/src/ltl2inv/CMakeLists.txt index 04fe165e..1c704a0d 100644 --- a/src/ltl2inv/CMakeLists.txt +++ b/src/ltl2inv/CMakeLists.txt @@ -11,7 +11,7 @@ endif() include_directories(../pfapack/c_interface) # Require Eigen -find_package(Eigen3 3.3 REQUIRED NO_MODULE) +find_package(Eigen3 3.4 REQUIRED NO_MODULE) include_directories(${EIGEN3_INCLUDE_DIR}) add_library(ltl2inv STATIC ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc) From 5766c58ea33eeea0644f955fd202dbe4a74b7988 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 16:20:33 +0900 Subject: [PATCH 08/39] Upgrade BLIS --- .gitmodules | 2 +- src/blis | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 44e5b4ac..825489fb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "src/blis"] path = src/blis - url = https://github.com/xrq-phys/blis + url = https://github.com/flame/blis branch = mvmc [submodule "src/pfapack"] path = src/pfapack diff --git a/src/blis b/src/blis index a32257ee..aeb5f0cc 160000 --- a/src/blis +++ b/src/blis @@ -1 +1 @@ -Subproject commit a32257eeab2e9946e71546a05a1847a39341ec6b +Subproject commit aeb5f0cc19665456e990a7ffccdb09da2e3f504b From 8334613de8afe5c31d6ba0ea4784afd57240a427 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 18:09:36 +0900 Subject: [PATCH 09/39] Upgrade CI BLIS Don't use the artifacts. --- .github/workflows/main.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b2dcbf1e..8a67bd55 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,6 +30,20 @@ jobs: make -j4 sudo make install + - name: blis + working-directory: ${{runner.workspace}} + run: | + mkdir blis + cd blis + git init + git remote add origin https://github.com/flame/blis + git fetch --depth 1 origin e3d352f1fcc93e6a46fde1aa4a7f0a18fb27bd42 + git checkout FETCH_HEAD + ./configure -t none -p ${{runner.workspace}}/blis-inst x86_64 + make -j8 install + cd ${{runner.workspace}}/blis-inst + tar -zcvf ${{runner.workspace}}/libblis_artifact.tar.gz include lib + - name: make workspace run: cmake -E make_directory ${{runner.workspace}}/build @@ -38,6 +52,9 @@ jobs: shell: bash run: | cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE + # Override downloaded artifact. + # TODO: Rebuild the artifacts. + mv ${{runner.workspace}}/libblis_artifact.tar.gz . - name: build working-directory: ${{runner.workspace}}/build From 0375f1c889ace9ff7d86a87c44f55845869495e4 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 18:53:06 +0900 Subject: [PATCH 10/39] Fix: Use BLIS GEMMT --- CMakeLists.txt | 1 + download_blis_artifact.cmake | 1 + mVMCconfig.sh | 12 ++++++------ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3257d79d..1099e4ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -137,6 +137,7 @@ enable_language(CXX) set(CMAKE_CXX_STANDARD 11) if(Require_BLIS) + add_definitions(-D_BLIS) include("download_blis_artifact.cmake") else(Require_BLIS) # Use bundled blis.h diff --git a/download_blis_artifact.cmake b/download_blis_artifact.cmake index f6274bcc..ec9fc8a7 100644 --- a/download_blis_artifact.cmake +++ b/download_blis_artifact.cmake @@ -53,6 +53,7 @@ add_custom_target(blis_target DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/lib/libblis.a) add_custom_target(blis_include DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/include/blis/blis.h) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/blis) add_library(blis STATIC IMPORTED GLOBAL) diff --git a/mVMCconfig.sh b/mVMCconfig.sh index 0b7e1d62..cfc7453a 100755 --- a/mVMCconfig.sh +++ b/mVMCconfig.sh @@ -41,7 +41,7 @@ FFLAGS = -DNDEBUG -DFUJITSU -Kfast CFLAGS = -DNDEBUG -Ofast -fopenmp CXXFLAGS = -DNDEBUG -Ofast -fopenmp CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += -DBLAS_EXTERNAL +CXXFLAGS += -DBLAS_EXTERNAL -D_BLIS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.so -SSL2 -lm -lpthread @@ -125,7 +125,7 @@ FFLAGS = -O3 CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -fPIC CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += +CXXFLAGS += -D_BLIS BLIS_ROOT = ${PWD}/src/blis-install LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread @@ -149,7 +149,7 @@ FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -fPIC CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += +CXXFLAGS += -D_BLIS BLIS_ROOT = ${PWD}/src/blis-install LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread @@ -173,7 +173,7 @@ FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL +CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -D_BLIS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lm -lpthread @@ -197,7 +197,7 @@ FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += +CXXFLAGS += -D_BLIS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread @@ -221,7 +221,7 @@ FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple. +CXXFLAGS += -D_BLIS # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple. BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread From 2687fd407c12931432c2b7c3af1394b6ad6b0494 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 01:38:04 +0900 Subject: [PATCH 11/39] First Step: Extract PfM from GF2 for FCMP Exchange Only --- src/mVMC/calham.c | 34 ++++++++++++++++++++++++++++++---- src/mVMC/locgrn.c | 26 ++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c index d35445eb..0b1fb3b2 100644 --- a/src/mVMC/calham.c +++ b/src/mVMC/calham.c @@ -67,6 +67,11 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const double complex *myBuffer; double complex myEnergy; + int *lazy_info = malloc(7 * sizeof(int) * NExchangeCoupling * 2); + double complex *lazy_ip = malloc(sizeof(double) * NExchangeCoupling * 2); + double complex *lazy_pfa = malloc(sizeof(double) * NExchangeCoupling * 2); + memset(lazy_info, 0, 7 * NExchangeCoupling * 2); + RequestWorkSpaceThreadInt(Nsize+Nsite2+NProj); RequestWorkSpaceThreadComplex(NQPFull+2*Nsize); /* GreenFunc1: NQPFull, GreenFunc2: NQPFull+2*Nsize */ @@ -82,7 +87,7 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const NCoulombInter, CoulombInter, ParaCoulombInter, NHundCoupling, HundCoupling, ParaHundCoupling, \ NTransfer, Transfer, ParaTransfer, NPairHopping, PairHopping, ParaPairHopping, \ NExchangeCoupling, ExchangeCoupling, ParaExchangeCoupling, NInterAll, InterAll, ParaInterAll, n0, n1) \ - shared(eleCfg, eleProjCnt, eleIdx, eleNum) reduction(+:e) + shared(eleCfg, eleProjCnt, eleIdx, eleNum, lazy_info) reduction(+:e) { myEleIdx = GetWorkSpaceThreadInt(Nsize); myEleNum = GetWorkSpaceThreadInt(Nsite2); @@ -158,9 +163,27 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const ri = ExchangeCoupling[idx][0]; rj = ExchangeCoupling[idx][1]; - tmp = GreenFunc2(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer); - tmp += GreenFunc2(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer); - myEnergy += ParaExchangeCoupling[idx] * tmp; + lazy_ip[idx * 2] = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info + 7 * (idx * 2)); + lazy_ip[idx * 2 + 1] = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info + 7 * (idx * 2 + 1)); + if ( !lazy_info[7 * (idx * 2) + 6] ) myEnergy += lazy_ip[idx * 2]; + if ( !lazy_info[7 * (idx * 2 + 1) + 6] ) myEnergy += lazy_ip[idx * 2 + 1]; + } + #pragma omp barrier + #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait + for(idx=0;idx/ */ /* buffer size = NQPFull+2*Nsize */ -double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl, +double complex GreenFunc2_(const int ri, const int rj, const int rk, const int rl, const int s, const int t, const double complex ip, int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt, - int *projCntNew, double complex *buffer) { + int *projCntNew, double complex *buffer, int *lazy_buffer) { double complex z; int mj,msj,ml,mtl; int rsi,rsj,rtk,rtl; @@ -145,8 +145,20 @@ double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl z = ProjRatio(projCntNew,eleProjCnt); /* calculate Pfaffian */ - CalculateNewPfMTwo_fcmp(ml, t, mj, s, pfMNew, eleIdx, 0, NQPFull, bufV); - z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + if (!lazy_buffer) { + CalculateNewPfMTwo_fcmp(ml, t, mj, s, pfMNew, eleIdx, 0, NQPFull, bufV); + z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + } else { + // Save invocation hook instead of calling. + // REMEMBER THE ORDER OF S/T, ML, MJ. + lazy_buffer[0] = mj; + lazy_buffer[1] = ml; + lazy_buffer[2] = msj; //< to edit to ri, revert to rj + lazy_buffer[3] = mtl; //< to edit to rk, revert to rl + lazy_buffer[4] = ri; + lazy_buffer[5] = rk; + lazy_buffer[6] = 1; //< "is delayed" flag.. + } /* revert hopping */ eleIdx[mtl] = rl; @@ -159,6 +171,12 @@ double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl return conj(z/ip);//TBC } +double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl, + const int s, const int t, const double complex ip, + int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt, + int *projCntNew, double complex *buffer) { + return GreenFunc2_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, projCntNew, buffer, 0); +} // ignore GreenFuncN: to be added From 78d3c0c309c66d0afe90fe366ef6151dc1eebff7 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Fri, 10 Feb 2023 22:53:10 +0900 Subject: [PATCH 12/39] Fix Malloc/Memset/Conj Typos --- src/mVMC/calham.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c index 0b1fb3b2..91e2fc8c 100644 --- a/src/mVMC/calham.c +++ b/src/mVMC/calham.c @@ -68,9 +68,9 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const double complex myEnergy; int *lazy_info = malloc(7 * sizeof(int) * NExchangeCoupling * 2); - double complex *lazy_ip = malloc(sizeof(double) * NExchangeCoupling * 2); - double complex *lazy_pfa = malloc(sizeof(double) * NExchangeCoupling * 2); - memset(lazy_info, 0, 7 * NExchangeCoupling * 2); + double complex *lazy_ip = malloc(sizeof(double complex) * NExchangeCoupling * 2); + double complex *lazy_pfa = malloc(sizeof(double complex) * NExchangeCoupling * 2); + memset(lazy_info, 0, 7 * NExchangeCoupling * 2 * sizeof(int)); RequestWorkSpaceThreadInt(Nsize+Nsite2+NProj); RequestWorkSpaceThreadComplex(NQPFull+2*Nsize); @@ -180,7 +180,7 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const myEleIdx[msj] = lazy_info_loc[4]; myEleIdx[mtl] = lazy_info_loc[5]; CalculateNewPfMTwo_fcmp(lazy_info_loc[1], mtl/Ne, lazy_info_loc[0], msj/Ne, myBuffer, myEleIdx, 0, NQPFull, myBuffer+NQPFull); - myEnergy += CalculateIP_fcmp(myBuffer, 0, NQPFull, MPI_COMM_SELF) * lazy_ip[idx]; + myEnergy += conj(CalculateIP_fcmp(myBuffer, 0, NQPFull, MPI_COMM_SELF)) * lazy_ip[idx]; myEleIdx[msj] = rj; myEleIdx[mtl] = rl; } From d6ce91898803c5f31978f39283862125e028ff8e Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 11 Feb 2023 00:44:28 +0900 Subject: [PATCH 13/39] Env Adjustments - __cplusplus - Include in common hdr. - Compile PfUpdates regardless of whether PFAFFIAN_BLOCKED This is consistent w/ traditional Makefile. --- CMakeLists.txt | 8 ++++---- src/mVMC/include/vmcmain.h | 1 + src/mVMC/vmcmake.c | 5 ----- src/mVMC/vmcmake_real.c | 5 ----- src/pfupdates/pf_interface.h | 4 ++-- 5 files changed, 7 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1099e4ca..fe3208ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -146,11 +146,11 @@ endif(Require_BLIS) if(PFAFFIAN_BLOCKED) add_definitions(-D_pf_block_update) - add_subdirectory(src/pfupdates) - if(Require_BLIS) - add_dependencies(pfupdates blis_include) - endif(Require_BLIS) endif(PFAFFIAN_BLOCKED) +add_subdirectory(src/pfupdates) +if(Require_BLIS) + add_dependencies(pfupdates blis_include) +endif(Require_BLIS) if (Document) add_subdirectory(doc) diff --git a/src/mVMC/include/vmcmain.h b/src/mVMC/include/vmcmain.h index 090c7b69..cf642973 100644 --- a/src/mVMC/include/vmcmain.h +++ b/src/mVMC/include/vmcmain.h @@ -72,6 +72,7 @@ extern int omp_get_thread_num(void); #else #include "../stcopt_pdposv.c" #endif +#include "../../pfupdates/pf_interface.h" #include "../stcopt_cg.c" diff --git a/src/mVMC/vmcmake.c b/src/mVMC/vmcmake.c index aa6e22ca..349c1301 100644 --- a/src/mVMC/vmcmake.c +++ b/src/mVMC/vmcmake.c @@ -36,11 +36,6 @@ along with this program. If not, see http://www.gnu.org/licenses/. #include "splitloop.h" #include "qp.h" -#ifdef _pf_block_update -// Block-update extension. -#include "../pfupdates/pf_interface.h" -#endif - void VMCMakeSample(MPI_Comm comm) { int outStep,nOutStep; int inStep,nInStep; diff --git a/src/mVMC/vmcmake_real.c b/src/mVMC/vmcmake_real.c index f567c04f..7c0d57c4 100644 --- a/src/mVMC/vmcmake_real.c +++ b/src/mVMC/vmcmake_real.c @@ -37,11 +37,6 @@ which follows "The BSD 3-Clause License". #include "splitloop.h" #include "vmcmake.h" -#ifdef _pf_block_update -// Block-update extension. -#include "../pfupdates/pf_interface.h" -#endif - void VMCMakeSample_real(MPI_Comm comm) { int outStep, nOutStep; int inStep, nInStep; diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h index 903a90ef..62bcee3f 100644 --- a/src/pfupdates/pf_interface.h +++ b/src/pfupdates/pf_interface.h @@ -12,7 +12,7 @@ #include "stdint.h" // Redefine double complex for (future) non-C99 compatibility. -#ifndef _CC_IMPL +#ifndef __cplusplus typedef _Complex double ccdcmplx; typedef _Complex float ccscmplx; #else @@ -118,7 +118,7 @@ GENDEF( ccdcmplx, z ) #undef EXPANDNAME -#ifdef _CC_IMPL +#ifdef __cplusplus } #endif From f345f2506887fadfa7335608ce9695ffc92fe01c Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 11 Feb 2023 20:28:46 +0900 Subject: [PATCH 14/39] Not-working Batch GF2 - OMP sync problem unresolved - More problems pending --- src/mVMC/calham.c | 72 +++++++++++++++-------- src/mVMC/locgrn.c | 20 +++---- src/pfupdates/pf_interface.cc | 107 ++++++++++++++++++++++++++++++++-- src/pfupdates/pf_interface.h | 44 ++++++++++++++ src/pfupdates/updated_tdi.tcc | 22 +++++-- 5 files changed, 219 insertions(+), 46 deletions(-) diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c index 91e2fc8c..a2e5c841 100644 --- a/src/mVMC/calham.c +++ b/src/mVMC/calham.c @@ -67,10 +67,16 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const double complex *myBuffer; double complex myEnergy; - int *lazy_info = malloc(7 * sizeof(int) * NExchangeCoupling * 2); - double complex *lazy_ip = malloc(sizeof(double complex) * NExchangeCoupling * 2); - double complex *lazy_pfa = malloc(sizeof(double complex) * NExchangeCoupling * 2); - memset(lazy_info, 0, 7 * NExchangeCoupling * 2 * sizeof(int)); + int *lazy_info = malloc( sizeof(int) * NExchangeCoupling * 2 * 2); + int *lazy_rsi = malloc(2 * sizeof(int) * NExchangeCoupling * 2); + int *lazy_msj = malloc(2 * sizeof(int) * NExchangeCoupling * 2); + double complex *lazy_ip = malloc(sizeof(double complex) * NExchangeCoupling * 2); + double complex *lazy_pfa = malloc(sizeof(double complex) * NExchangeCoupling * 2 * NQPFull); + memset(lazy_info, 0, sizeof(int) * NExchangeCoupling * 2); + memset(lazy_info + NExchangeCoupling * 2, -1, sizeof(int) * NExchangeCoupling * 2); + + for (int mi=0; mi // Non-# Pragma support differs from compiler to compiler #if defined(__INTEL_COMPILER) -#define OMP_PARALLEL_FOR_SHARED __pragma(omp parallel for default(shared)) +#define _ccPragma(_1) __pragma(_1) #elif defined(__GNUC__) -#define OMP_PARALLEL_FOR_SHARED _Pragma("omp parallel for default(shared)") +#define _ccPragma(_1) _Pragma(#_1) #else -#error "Valid non-preprocessor _pragma() not found." +#error "Valid non-preprocessor _Pragma() not found." #endif using namespace Eigen; @@ -29,6 +30,46 @@ using matrix_t = Map, 0, OuterStride<> >; #define EXPANDNAME( funcname, cblachar ) funcname##_##cblachar +#define GENIMPL( ctype, cblachar ) \ + void EXPANDNAME( updated_tdi_v_seq_init_precomp, cblachar ) \ + ( uint64_t num_qp, \ + uint64_t nsite, \ + uint64_t norbs, \ + uint64_t nelec, \ + ctype *orbmat_base, \ + int64_t orbmat_stride, \ + ctype *invmat_base, \ + int64_t invmat_stride, \ + int32_t *eleidx, \ + int32_t *elespn, \ + uint64_t mmax, \ + ctype pfav[], \ + void *objv[], \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ) \ +{ \ + for (int iqp = 0; iqp < num_qp; ++iqp) { \ + mapv[iqp] = new matrix_t(orbmat_base + iqp * orbmat_stride, norbs, norbs, OuterStride<>(norbs)); \ + matv[iqp] = new matrix_t(invmat_base + iqp * invmat_stride, nelec, nelec, OuterStride<>(nelec)); \ + orbv[iqp] = new orbital_mat>('U', norbs, *mapv(iqp, ctype) ); \ + objv[iqp] = new updated_tdi>> \ + ( *orbv(iqp, ctype), *matv(iqp, ctype), nelec, mmax); \ +\ + /* Compute initial. */ \ + vmc::config_manager::base_t cfg_i(nelec); \ + for (int msi = 0; msi < nelec; ++msi) \ + cfg_i.at(msi) = eleidx[msi] + elespn[msi]*nsite; \ + objv(iqp, ctype)->attach_config(cfg_i); \ + objv(iqp, ctype)->initialize_precomputed(pfav[iqp]); \ + } \ +} +// GENIMPL( float, s ) +GENIMPL( double, d ) +// GENIMPL( ccscmplx, c ) +GENIMPL( ccdcmplx, z ) +#undef GENIMPL + #define GENIMPL( ctype, cblachar ) \ void EXPANDNAME( updated_tdi_v_init, cblachar ) \ ( uint64_t num_qp, \ @@ -47,7 +88,7 @@ using matrix_t = Map, 0, OuterStride<> >; void *matv[], \ void *mapv[] ) \ { \ - OMP_PARALLEL_FOR_SHARED \ + _ccPragma(omp parallel for default(shared)) \ for (int iqp = 0; iqp < num_qp; ++iqp) { \ mapv[iqp] = new matrix_t(orbmat_base + iqp * orbmat_stride, norbs, norbs, OuterStride<>(norbs)); \ matv[iqp] = new matrix_t(invmat_base + iqp * invmat_stride, nelec, nelec, OuterStride<>(nelec)); \ @@ -114,7 +155,7 @@ GENIMPL( ccdcmplx, z ) int64_t cal_pfa, \ void *objv[] ) \ { \ - OMP_PARALLEL_FOR_SHARED \ + _ccPragma(omp parallel for default(shared)) \ for (int iqp = 0; iqp < num_qp; ++iqp) \ objv(iqp, ctype)->push_update_safe(osi, msj, cal_pfa!=0); \ } @@ -134,7 +175,7 @@ GENIMPL( ccdcmplx, z ) int64_t cal_pfa, \ void *objv[] ) \ { \ - OMP_PARALLEL_FOR_SHARED \ + _ccPragma(omp parallel for default(shared)) \ for (int iqp = 0; iqp < num_qp; ++iqp) { \ const auto &from_i = objv(iqp, ctype)->get_config_manager().from_idx(); \ if (from_i.size() > objv(iqp, ctype)->mmax - 2) \ @@ -172,5 +213,59 @@ GENIMPL( double, d ) GENIMPL( ccdcmplx, z ) #undef GENIMPL +#define GENIMPL( ctype, cblachar ) \ + void EXPANDNAME( updated_tdi_v_omp_proc_batch_greentwo, cblachar ) \ + ( uint64_t num_qp, \ + int num_gf, \ + int needs_comput[], \ + int unpack_idx[], \ + int to_orbs[], \ + int from_ids[], \ + ctype pfav[], \ + void *objv[], \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ) \ +{ \ + int ith = omp_get_thread_num(); \ + int nth = omp_get_num_threads(); \ +\ + int gf_count = 0; \ + for (int i = 0; i < num_gf; ++i) \ + if ( needs_comput[i] ) { \ + if ( ith == 0 ) { \ + /* Pack info space. */ \ + to_orbs[gf_count * 2 ] = to_orbs[i * 2]; \ + to_orbs[gf_count * 2 + 1] = to_orbs[i * 2 + 1]; \ + from_ids[gf_count * 2 ] = from_ids[i * 2]; \ + from_ids[gf_count * 2 + 1] = from_ids[i * 2 + 1]; \ + unpack_idx[gf_count] = i; \ + } \ + /* Count Green's funcs. */ \ + gf_count++; \ + } \ +\ + int gf_cnt_thread = (gf_count + nth - 1) / nth; \ + int gfStart = gf_cnt_thread * ith; \ + int gfEnd = std::min(gf_count, gfStart + gf_cnt_thread); \ +\ + Eigen::Map to_orbs_thread( to_orbs + gfStart * 2, (gfEnd - gfStart) * 2); \ + Eigen::Map from_ids_thread(from_ids + gfStart * 2, (gfEnd - gfStart) * 2); \ +\ + for (int iqp = 0; iqp < num_qp; ++iqp) { \ + Eigen::Vector pfaBatch = \ + objv(iqp, ctype)->batch_query_amplitudes(2, to_orbs_thread, from_ids_thread); \ +\ + /* Unpack computed Pfaffians. */ \ + for (int igf = gfStart; igf < gfEnd; ++igf) \ + pfav[unpack_idx[igf] * num_qp + iqp] = pfaBatch[igf - gfStart]; \ + } \ +} +// GENIMPL( float, s ) +GENIMPL( double, d ) +// GENIMPL( ccscmplx, c ) +GENIMPL( ccdcmplx, z ) +#undef GENIMPL + #undef EXPANDNAME diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h index 62bcee3f..7a858868 100644 --- a/src/pfupdates/pf_interface.h +++ b/src/pfupdates/pf_interface.h @@ -24,6 +24,31 @@ extern "C" { #define EXPANDNAME( funcname, cblachar ) funcname##_##cblachar +#define GENDEF( ctype, cblachar ) \ + void EXPANDNAME( updated_tdi_v_seq_init_precomp, cblachar ) \ + ( uint64_t num_qp, \ + uint64_t nsite, \ + uint64_t norbs, \ + uint64_t nelec, \ + ctype *orbmat_base, \ + int64_t orbmat_stride, \ + ctype *invmat_base, \ + int64_t invmat_stride, \ + int32_t *eleidx, \ + int32_t *elespn, \ + uint64_t mmax, \ + ctype pfav[], \ + void *objv[], \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ); +// GENDEF( float, s ) +GENDEF( double, d ) +// GENDEF( ccscmplx, c ) +GENDEF( ccdcmplx, z ) +#undef GENDEF + + #define GENDEF( ctype, cblachar ) \ void EXPANDNAME( updated_tdi_v_init, cblachar ) \ ( uint64_t num_qp, \ @@ -116,6 +141,25 @@ GENDEF( double, d ) GENDEF( ccdcmplx, z ) #undef GENDEF +#define GENDEF( ctype, cblachar ) \ + void EXPANDNAME( updated_tdi_v_omp_proc_batch_greentwo, cblachar ) \ + ( uint64_t num_qp, \ + int num_gf, \ + int needs_comput[], \ + int unpack_idx[], \ + int to_orbs[], \ + int from_ids[], \ + ctype pfav[], \ + void *objv[], \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ); +// GENDEF( float, s ) +GENDEF( double, d ) +// GENDEF( ccscmplx, c ) +GENDEF( ccdcmplx, z ) +#undef GENDEF + #undef EXPANDNAME #ifdef __cplusplus diff --git a/src/pfupdates/updated_tdi.tcc b/src/pfupdates/updated_tdi.tcc index 5eda1701..6705a2fb 100644 --- a/src/pfupdates/updated_tdi.tcc +++ b/src/pfupdates/updated_tdi.tcc @@ -68,6 +68,14 @@ private: public: + void initialize_precomputed(T Pfa_) { + elem_cfg.merge_config(); + elem_cfg.reserve_space(mmax); + nq_updated = 0; + Pfa = Pfa_; + PfaRatio = 1.0; + } + void initialize() { using namespace std; elem_cfg.merge_config(); @@ -530,7 +538,8 @@ public: /** * Complete antisymmetric matrix. */ - static void skcomplete(const char &uplo_, matrix_t &A) { + template + static void skcomplete(const char &uplo_, Mat &A) { const index_t n = A.rows(); for (index_t j = 0; j < n; ++j) { @@ -568,7 +577,8 @@ public: uplo = uplo_new; } - Eigen::Vector batch_query_amplitudes(int N, idxvec_t to_orbs, idxvec_t from_ids) { + Eigen::Vector batch_query_amplitudes + (int N, const Eigen::Map &to_orbs, const Eigen::Map &from_ids) { using namespace Eigen; using ampvec_t = Eigen::Vector; @@ -646,24 +656,24 @@ public: for (index_t o = 0; o < N-1; ++o) for (index_t l = 0; l < o; ++l) if (upper) - CCur(l, o) = -UCur(all, o).transpose() * QCur(all, l) - + CCur(l, o) = -(UCur(all, o).transpose() * QCur(all, l))(0,0) - Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), elem_cfg.config_base(from_ids(l1 * N + o))); else - CCur(o, l) = UCur(all, o).transpose() * QCur(all, l) - + CCur(o, l) = (UCur(all, o).transpose() * QCur(all, l))(0,0) - Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), elem_cfg.config_base(from_ids(l1 * N + o))); { for (index_t l = 0; l < N-1; ++l) if (upper) - CCur(l, N-1) = -UbF(all, l1).transpose() * QCur(all, l) - + CCur(l, N-1) = -(UbF(all, l1).transpose() * QCur(all, l))(0,0) - Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), elem_cfg.config_base(from_ids(l1 * N + N-1))); else - CCur(N-1, l) = UbF(all, l1).transpose() * QCur(all, l) - + CCur(N-1, l) = (UbF(all, l1).transpose() * QCur(all, l))(0,0) - Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), elem_cfg.config_base(from_ids(l1 * N + N-1))); From 880612760e776fe9d8762e69d37cff0fe0e9f67c Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 11 Feb 2023 20:38:41 +0900 Subject: [PATCH 15/39] Fix: Batch GF2 Forgot to Convert AmpRatio to Amp --- src/pfupdates/pf_interface.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc index 26815109..0ccb2218 100644 --- a/src/pfupdates/pf_interface.cc +++ b/src/pfupdates/pf_interface.cc @@ -255,6 +255,7 @@ GENIMPL( ccdcmplx, z ) for (int iqp = 0; iqp < num_qp; ++iqp) { \ Eigen::Vector pfaBatch = \ objv(iqp, ctype)->batch_query_amplitudes(2, to_orbs_thread, from_ids_thread); \ + pfaBatch *= objv(iqp, ctype)->get_amplitude(); \ \ /* Unpack computed Pfaffians. */ \ for (int igf = gfStart; igf < gfEnd; ++igf) \ From e77f0ae041b33f589afe2102e3c9aa0b82cd8269 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 11 Feb 2023 21:48:48 +0900 Subject: [PATCH 16/39] Fix: Batch GF2 OMP - Not tested for NQP>1 --- src/pfupdates/pf_interface.cc | 52 +++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc index 0ccb2218..223946f0 100644 --- a/src/pfupdates/pf_interface.cc +++ b/src/pfupdates/pf_interface.cc @@ -227,30 +227,46 @@ GENIMPL( ccdcmplx, z ) void *matv[], \ void *mapv[] ) \ { \ - int ith = omp_get_thread_num(); \ - int nth = omp_get_num_threads(); \ + const int ith = omp_get_thread_num(); \ + const int nth = omp_get_num_threads(); \ \ + /* Count Green's funcs. */ \ int gf_count = 0; \ for (int i = 0; i < num_gf; ++i) \ + if ( needs_comput[i] ) \ + gf_count++; \ +\ + /* Partitioning. */ \ + int gfCountThread = (gf_count + nth - 1) / nth; \ + const int gfStart = gfCountThread * ith; \ + gfCountThread = std::min(gf_count - gfStart, gfCountThread); \ +\ + /* Pack info space. */ \ + gf_count = -gfStart; \ + int *to_orbs_thread_, *from_ids_thread_, *unpack_idx_thread; \ + for (int i = 0; i < num_gf && gf_count < gfCountThread; ++i) \ if ( needs_comput[i] ) { \ - if ( ith == 0 ) { \ - /* Pack info space. */ \ - to_orbs[gf_count * 2 ] = to_orbs[i * 2]; \ - to_orbs[gf_count * 2 + 1] = to_orbs[i * 2 + 1]; \ - from_ids[gf_count * 2 ] = from_ids[i * 2]; \ - from_ids[gf_count * 2 + 1] = from_ids[i * 2 + 1]; \ - unpack_idx[gf_count] = i; \ + if ( gf_count >= 0 ) { \ + if ( gf_count == 0 ) { \ + /* Pin down scratchpad spots. */ \ + to_orbs_thread_ = to_orbs + i * 2; \ + from_ids_thread_ = from_ids + i * 2; \ + unpack_idx_thread = unpack_idx + i; \ + unpack_idx_thread[0] = i; \ + } else { \ + /* 0th idx already in the place. Pack from 1st. */ \ + to_orbs_thread_[gf_count * 2 ] = to_orbs[i * 2]; \ + to_orbs_thread_[gf_count * 2 + 1] = to_orbs[i * 2 + 1]; \ + from_ids_thread_[gf_count * 2 ] = from_ids[i * 2]; \ + from_ids_thread_[gf_count * 2 + 1] = from_ids[i * 2 + 1]; \ + unpack_idx_thread[gf_count] = i; \ + } \ } \ - /* Count Green's funcs. */ \ gf_count++; \ } \ \ - int gf_cnt_thread = (gf_count + nth - 1) / nth; \ - int gfStart = gf_cnt_thread * ith; \ - int gfEnd = std::min(gf_count, gfStart + gf_cnt_thread); \ -\ - Eigen::Map to_orbs_thread( to_orbs + gfStart * 2, (gfEnd - gfStart) * 2); \ - Eigen::Map from_ids_thread(from_ids + gfStart * 2, (gfEnd - gfStart) * 2); \ + Eigen::Map to_orbs_thread( to_orbs_thread_, gfCountThread * 2); \ + Eigen::Map from_ids_thread(from_ids_thread_, gfCountThread * 2); \ \ for (int iqp = 0; iqp < num_qp; ++iqp) { \ Eigen::Vector pfaBatch = \ @@ -258,8 +274,8 @@ GENIMPL( ccdcmplx, z ) pfaBatch *= objv(iqp, ctype)->get_amplitude(); \ \ /* Unpack computed Pfaffians. */ \ - for (int igf = gfStart; igf < gfEnd; ++igf) \ - pfav[unpack_idx[igf] * num_qp + iqp] = pfaBatch[igf - gfStart]; \ + for (int igf = 0; igf < gfCountThread; ++igf) \ + pfav[unpack_idx_thread[igf] * num_qp + iqp] = pfaBatch[igf]; \ } \ } // GENIMPL( float, s ) From 5389c0ce32272163f4d41fa0081837e13aec88f9 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 11 Feb 2023 22:14:39 +0900 Subject: [PATCH 17/39] Fix: CMake Eigen --- CMakeLists.txt | 4 ++++ src/ltl2inv/CMakeLists.txt | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fe3208ed..f69b4ac9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,6 +136,10 @@ add_subdirectory("${STDFACE_DIR}") enable_language(CXX) set(CMAKE_CXX_STANDARD 11) +# Require Eigen +find_package(Eigen3 3.4 REQUIRED NO_MODULE) +include_directories(${EIGEN3_INCLUDE_DIR}) + if(Require_BLIS) add_definitions(-D_BLIS) include("download_blis_artifact.cmake") diff --git a/src/ltl2inv/CMakeLists.txt b/src/ltl2inv/CMakeLists.txt index 1c704a0d..165ea01c 100644 --- a/src/ltl2inv/CMakeLists.txt +++ b/src/ltl2inv/CMakeLists.txt @@ -10,10 +10,6 @@ if("${ARCHITECTURE}" STREQUAL "x86_64") endif() include_directories(../pfapack/c_interface) -# Require Eigen -find_package(Eigen3 3.4 REQUIRED NO_MODULE) -include_directories(${EIGEN3_INCLUDE_DIR}) - add_library(ltl2inv STATIC ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc) target_compile_definitions(ltl2inv PRIVATE -D_CC_IMPL) From c987aeeee312214f875dd51bfc64ed50c16e0e4d Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 11 Feb 2023 22:23:56 +0900 Subject: [PATCH 18/39] Eigen Should Offload MMuls to BLAS We often have the best impl of BLAS. --- CMakeLists.txt | 2 ++ mVMCconfig.sh | 16 ++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f69b4ac9..76b217fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,6 +139,8 @@ set(CMAKE_CXX_STANDARD 11) # Require Eigen find_package(Eigen3 3.4 REQUIRED NO_MODULE) include_directories(${EIGEN3_INCLUDE_DIR}) +# We surely want to offload mmuls to BLAS. +add_definitions(-DEIGEN_USE_BLAS) if(Require_BLIS) add_definitions(-D_BLIS) diff --git a/mVMCconfig.sh b/mVMCconfig.sh index cfc7453a..1e0d7a57 100755 --- a/mVMCconfig.sh +++ b/mVMCconfig.sh @@ -41,7 +41,7 @@ FFLAGS = -DNDEBUG -DFUJITSU -Kfast CFLAGS = -DNDEBUG -Ofast -fopenmp CXXFLAGS = -DNDEBUG -Ofast -fopenmp CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += -DBLAS_EXTERNAL -D_BLIS +CXXFLAGS += -DBLAS_EXTERNAL -D_BLIS -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.so -SSL2 -lm -lpthread @@ -75,7 +75,7 @@ FFLAGS = -O3 -implicitnone CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL +CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -mkl=sequential -lm -lpthread @@ -101,7 +101,7 @@ FFLAGS = -O3 -implicitnone CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL +CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -mkl=sequential -lm -lpthread @@ -125,7 +125,7 @@ FFLAGS = -O3 CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -fPIC CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += -D_BLIS +CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread @@ -149,7 +149,7 @@ FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -fPIC CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += -D_BLIS +CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread @@ -173,7 +173,7 @@ FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -D_BLIS +CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -D_BLIS -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lm -lpthread @@ -197,7 +197,7 @@ FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += -D_BLIS +CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread @@ -221,7 +221,7 @@ FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 CFLAGS += -D_lapack -D_pf_block_update -CXXFLAGS += -D_BLIS # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple. +CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple. BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread From 32887d03e6e46e2297c79407671598b96beb6184 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 11 Feb 2023 22:25:57 +0900 Subject: [PATCH 19/39] Fix: Include --- src/pfupdates/config_manager.tcc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pfupdates/config_manager.tcc b/src/pfupdates/config_manager.tcc index 44eb1f07..978cbcf5 100644 --- a/src/pfupdates/config_manager.tcc +++ b/src/pfupdates/config_manager.tcc @@ -9,6 +9,7 @@ #include "error.hh" #include #include +#include #include namespace vmc From 88c18843de06a4dfc6d93dbd15d65200fb204a38 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 11 Feb 2023 23:05:59 +0900 Subject: [PATCH 20/39] Fix: Link PfUpdates Always --- src/mVMC/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/mVMC/CMakeLists.txt b/src/mVMC/CMakeLists.txt index 44ae92cf..dd898044 100644 --- a/src/mVMC/CMakeLists.txt +++ b/src/mVMC/CMakeLists.txt @@ -28,9 +28,7 @@ add_executable(vmc.out ${SOURCES_vmcmain} ${SOURCES_sfmt}) target_link_libraries(vmc.out StdFace) target_link_libraries(vmc.out pfapack) target_link_libraries(vmc.out ltl2inv) -if(PFAFFIAN_BLOCKED) - target_link_libraries(vmc.out pfupdates) -endif(PFAFFIAN_BLOCKED) +target_link_libraries(vmc.out pfupdates) if(Require_BLIS) target_link_libraries(vmc.out blis pthread) endif(Require_BLIS) From e34ed8d05fae3815111abe0eb48e1255271bd03b Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 11 Feb 2023 23:52:55 +0900 Subject: [PATCH 21/39] Batch GF2 Parallel in QP This is faster than partitioning the batch since it retains thickness. --- src/mVMC/calham.c | 32 ++++++++++++++++++++++---- src/pfupdates/pf_interface.cc | 43 ++++++++++++++++++++++++++++++++++- src/pfupdates/pf_interface.h | 16 ++++++++----- 3 files changed, 79 insertions(+), 12 deletions(-) diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c index a2e5c841..442c280f 100644 --- a/src/mVMC/calham.c +++ b/src/mVMC/calham.c @@ -196,11 +196,33 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const if ( !*lazy_info_odd ) myEnergy += lazy_ip[idx * 2 + 1]; } #pragma omp barrier - updated_tdi_v_omp_proc_batch_greentwo_z(NQPFull, NExchangeCoupling*2, - lazy_info, lazy_info + NExchangeCoupling*2, - lazy_rsi, lazy_msj, - lazy_pfa, - pfUpdator, pfOrbital, pfMat, pfMap); +#if 1 + int num_qp_var0 = 0; + // Pack lazy info. + for (idx=0; idx to_orbs( to_orbs_, num_gf * 2); \ + Eigen::Map from_ids(from_ids_, num_gf * 2); \ +\ + for (int iqp = qpStart; iqp < qpEnd; ++iqp) { \ + Eigen::Vector pfaBatch = \ + objv(iqp, ctype)->batch_query_amplitudes(2, to_orbs, from_ids); \ + pfaBatch *= objv(iqp, ctype)->get_amplitude(); \ +\ + /* Unpack computed Pfaffians. */ \ + for (int igf = 0; igf < num_gf; ++igf) \ + pfav[unpack_idx[igf] * num_qp + iqp] = pfaBatch[igf]; \ + } \ +} +// GENIMPL( float, s ) +GENIMPL( double, d ) +// GENIMPL( ccscmplx, c ) +GENIMPL( ccdcmplx, z ) +#undef GENIMPL + +#define GENIMPL( ctype, cblachar ) \ + void EXPANDNAME( updated_tdi_v_omp_var1_proc_batch_greentwo, cblachar ) \ ( uint64_t num_qp, \ int num_gf, \ int needs_comput[], \ diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h index 7a858868..49ad0252 100644 --- a/src/pfupdates/pf_interface.h +++ b/src/pfupdates/pf_interface.h @@ -141,8 +141,8 @@ GENDEF( double, d ) GENDEF( ccdcmplx, z ) #undef GENDEF -#define GENDEF( ctype, cblachar ) \ - void EXPANDNAME( updated_tdi_v_omp_proc_batch_greentwo, cblachar ) \ +#define GENDEF( ctype, infix, cblachar ) \ + void EXPANDNAME( updated_tdi_v_omp_## infix ##_proc_batch_greentwo, cblachar ) \ ( uint64_t num_qp, \ int num_gf, \ int needs_comput[], \ @@ -154,10 +154,14 @@ GENDEF( ccdcmplx, z ) void *orbv[], \ void *matv[], \ void *mapv[] ); -// GENDEF( float, s ) -GENDEF( double, d ) -// GENDEF( ccscmplx, c ) -GENDEF( ccdcmplx, z ) +// GENDEF( float, var0, s ) +// GENDEF( float, var1, s ) +GENDEF( double, var0, d ) +GENDEF( double, var1, d ) +// GENDEF( ccscmplx, var0, c ) +// GENDEF( ccscmplx, var1, c ) +GENDEF( ccdcmplx, var0, z ) +GENDEF( ccdcmplx, var1, z ) #undef GENDEF #undef EXPANDNAME From 932b1417ec67f48fa6277b471681bfe463f41ff0 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Tue, 14 Feb 2023 18:41:51 +0900 Subject: [PATCH 22/39] Silent Bracket Warning --- src/mVMC/vmcmake.c | 3 ++- src/mVMC/vmcmake_fsz.c | 3 ++- src/mVMC/vmcmake_real.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/mVMC/vmcmake.c b/src/mVMC/vmcmake.c index 349c1301..fa35d7a5 100644 --- a/src/mVMC/vmcmake.c +++ b/src/mVMC/vmcmake.c @@ -76,11 +76,12 @@ void VMCMakeSample(MPI_Comm comm) { if (optBlockSize) NBlockUpdateSize = atoi(optBlockSize); // Fall back to default if input is invalid. - if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) + if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) { if (NExUpdatePath == 0) NBlockUpdateSize = 4; else NBlockUpdateSize = 20; + } // Set one universal EleSpn. for (mi=0; mi 100) + if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) { if (NExUpdatePath == 0) NBlockUpdateSize = 4; else NBlockUpdateSize = 20; + } // Initialize with free spin configuration. updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize, diff --git a/src/mVMC/vmcmake_real.c b/src/mVMC/vmcmake_real.c index 7c0d57c4..3570b956 100644 --- a/src/mVMC/vmcmake_real.c +++ b/src/mVMC/vmcmake_real.c @@ -78,11 +78,12 @@ void VMCMakeSample_real(MPI_Comm comm) { if (optBlockSize) NBlockUpdateSize = atoi(optBlockSize); // Fall back to default if input is invalid. - if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) + if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) { if (NExUpdatePath == 0) NBlockUpdateSize = 2; else NBlockUpdateSize = 20; + } // Set one universal EleSpn. for (mi=0; mi Date: Mon, 22 May 2023 18:17:40 +0900 Subject: [PATCH 23/39] Batch GF2 for all Ham in complex cSz --- src/mVMC/calham.c | 114 +++++++++++++++++++++++++++++----------------- 1 file changed, 72 insertions(+), 42 deletions(-) diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c index 442c280f..d698f0a9 100644 --- a/src/mVMC/calham.c +++ b/src/mVMC/calham.c @@ -67,13 +67,14 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const double complex *myBuffer; double complex myEnergy; - int *lazy_info = malloc( sizeof(int) * NExchangeCoupling * 2 * 2); - int *lazy_rsi = malloc(2 * sizeof(int) * NExchangeCoupling * 2); - int *lazy_msj = malloc(2 * sizeof(int) * NExchangeCoupling * 2); - double complex *lazy_ip = malloc(sizeof(double complex) * NExchangeCoupling * 2); - double complex *lazy_pfa = malloc(sizeof(double complex) * NExchangeCoupling * 2 * NQPFull); - memset(lazy_info, 0, sizeof(int) * NExchangeCoupling * 2); - memset(lazy_info + NExchangeCoupling * 2, -1, sizeof(int) * NExchangeCoupling * 2); + const int nHamiltonianTwo = NPairHopping + NExchangeCoupling * 2 + NInterAll; + int *lazy_info = malloc( sizeof(int) * nHamiltonianTwo * 2); + int *lazy_rsi = malloc(2 * sizeof(int) * nHamiltonianTwo); + int *lazy_msj = malloc(2 * sizeof(int) * nHamiltonianTwo); + double complex *lazy_ip = malloc(sizeof(double complex) * nHamiltonianTwo); + double complex *lazy_pfa = malloc(sizeof(double complex) * nHamiltonianTwo * NQPFull); + memset(lazy_info, 0, sizeof(int) * nHamiltonianTwo); + memset(lazy_info + nHamiltonianTwo, -1, sizeof(int) * nHamiltonianTwo); for (int mi=0; mi Date: Mon, 22 May 2023 23:54:24 +0900 Subject: [PATCH 24/39] Batch GF2 var 1/2 Heuristics Based on Nelec ~ 200. --- src/mVMC/calham.c | 52 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c index d698f0a9..a44bf73b 100644 --- a/src/mVMC/calham.c +++ b/src/mVMC/calham.c @@ -240,33 +240,33 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const /* Batch-compute 2-body Green's functions. */ #pragma omp barrier -#if 0 - int num_qp_var0 = 0; - // Pack lazy info. - for (idx=0; idx Date: Tue, 23 May 2023 20:49:26 +0900 Subject: [PATCH 25/39] Batch GF2 for all Ham in complex fSz --- src/mVMC/calham_fsz.c | 135 ++++++++++++++++++++++++++++++++++++------ src/mVMC/locgrn_fsz.c | 50 +++++++++++++--- 2 files changed, 159 insertions(+), 26 deletions(-) diff --git a/src/mVMC/calham_fsz.c b/src/mVMC/calham_fsz.c index 10239e14..c3935ada 100644 --- a/src/mVMC/calham_fsz.c +++ b/src/mVMC/calham_fsz.c @@ -2,7 +2,7 @@ mVMC - A numerical solver package for a wide range of quantum lattice models based on many-variable Variational Monte Carlo method Copyright (C) 2016 The University of Tokyo, All rights reserved. -his program is developed based on the mVMC-mini program +This program is developed based on the mVMC-mini program (https://github.com/fiber-miniapp/mVMC-mini) which follows "The BSD 3-Clause License". @@ -57,6 +57,15 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co double complex *myBuffer; double complex myEnergy; + const int nHamiltonianTwo = NPairHopping + NExchangeCoupling * 2 + NInterAll; + int *lazy_info = malloc( sizeof(int) * nHamiltonianTwo * 2); + int *lazy_rsi = malloc(2 * sizeof(int) * nHamiltonianTwo); + int *lazy_msj = malloc(2 * sizeof(int) * nHamiltonianTwo); + double complex *lazy_ip = malloc(sizeof(double complex) * nHamiltonianTwo); + double complex *lazy_pfa = malloc(sizeof(double complex) * nHamiltonianTwo * NQPFull); + memset(lazy_info, 0, sizeof(int) * nHamiltonianTwo); + memset(lazy_info + nHamiltonianTwo, -1, sizeof(int) * nHamiltonianTwo); + RequestWorkSpaceThreadInt(Nsize+Nsize+Nsite2+NProj); RequestWorkSpaceThreadComplex(NQPFull+2*Nsize); /* GreenFunc1: NQPFull, GreenFunc2: NQPFull+2*Nsize */ @@ -66,7 +75,7 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co private(myEleIdx,myEleNum,myProjCntNew,myBuffer,myEnergy,idx) \ reduction(+:e) */ -#pragma omp parallel default(none) \ +#pragma omp parallel default(shared) \ private(myEleIdx,myEleSpn,myEleNum,myProjCntNew,myBuffer,myEnergy, idx, ri, rj, rk, rl, s, t,u,v) \ firstprivate(ip, Nsize, Nsite2, NProj, NQPFull, NCoulombIntra, CoulombIntra, ParaCoulombIntra, \ NCoulombInter, CoulombInter, ParaCoulombInter, NHundCoupling, HundCoupling, ParaHundCoupling, \ @@ -80,6 +89,20 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co myProjCntNew = GetWorkSpaceThreadInt(NProj); myBuffer = GetWorkSpaceThreadComplex(NQPFull+2*Nsize); + void *pfOrbital[NQPFull]; + void *pfUpdator[NQPFull]; + void *pfMat[NQPFull]; + void *pfMap[NQPFull]; + + // Attaching thread-private objects to thread-shared InvM. + // These objects no long need mutating states in this use. Just functor-like stuff. + updated_tdi_v_seq_init_precomp_z(NQPFull, Nsite, Nsite2, Nsize, + SlaterElm, Nsite2*Nsite2, + InvM, Nsize*Nsize, + eleIdx, eleSpn, + 2 /* GF @ measure: 2 at max. */, PfM, + pfUpdator, pfOrbital, pfMat, pfMap); + #pragma loop noalias for(idx=0;idx/ */ /* buffer size = NQPFull+2*Nsize */ -double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const int rl, +double complex GreenFunc2_fsz_(const int ri, const int rj, const int rk, const int rl, const int s, const int t, const double complex ip, int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn, - int *projCntNew, double complex *buffer) { + int *projCntNew, double complex *buffer, + int *lazy_info, int *lazy_rsi, int *lazy_msj) { double complex z; int mj,msj,ml,mtl; int rsi,rsj,rtk,rtl; @@ -199,8 +200,17 @@ double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const in z = ProjRatio(projCntNew,eleProjCnt); /* calculate Pfaffian */ - CalculateNewPfMTwo_fsz(ml, t, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); - z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + if (!lazy_info) { + CalculateNewPfMTwo_fsz(ml, t, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); + z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + } else { + // Save invocation hook instead of calling. + lazy_msj[0] = msj; //< to edit to ri, revert to rj + lazy_msj[1] = mtl; //< to edit to rk, revert to rl + lazy_rsi[0] = rsi; + lazy_rsi[1] = rtk; + lazy_info[0] = 1; //< "is delayed" flag.. + } /* revert hopping */ eleIdx[mtl] = rl; @@ -215,12 +225,21 @@ double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const in return conj(z/ip);//TBC } +double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const int rl, + const int s, const int t, const double complex ip, + int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn, + int *projCntNew, double complex *buffer) { + return GreenFunc2_fsz_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, eleSpn, + projCntNew, buffer, 0, 0, 0); +} + /* Calculate 2-body Green function / */ /* buffer size = NQPFull+2*Nsize */ -double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const int rl, +double complex GreenFunc2_fsz2_(const int ri, const int rj, const int rk, const int rl, const int s, const int t,const int u,const int v, const double complex ip, int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn, - int *projCntNew, double complex *buffer) { + int *projCntNew, double complex *buffer, + int *lazy_info, int *lazy_rsi, int *lazy_msj) { double complex z; //int mj,msj,ml,mtl; int mj,ml; // mj: (rj,t) -> (ri,s) ml:(rl,v) -> (rk,u) @@ -325,8 +344,16 @@ double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const i z = ProjRatio(projCntNew,eleProjCnt); /* calculate Pfaffian */ - CalculateNewPfMTwo_fsz(ml, u, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); // ml -> rk,u; mj -> ri,s - z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + if (!lazy_info) { + CalculateNewPfMTwo_fsz(ml, u, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); // ml -> rk,u; mj -> ri,s + z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + } else { + lazy_msj[0] = mj; + lazy_msj[1] = ml; + lazy_rsi[0] = XI; + lazy_rsi[1] = XK; + lazy_info[0] = 1; + } /* revert hopping */ eleIdx[ml] = rl; // ml : (rl,v) @@ -342,6 +369,13 @@ double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const i } +double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const int rl, + const int s, const int t,const int u,const int v, const double complex ip, + int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn, + int *projCntNew, double complex *buffer) { + return GreenFunc2_fsz2_(ri, rj, rk, rl, s, t, u, v, ip, + eleIdx, eleCfg, eleNum, eleProjCnt, eleSpn, projCntNew, buffer, 0, 0, 0); +} // ignore GreenFuncN: to be added From aedba0832418d8e9678d5db75459948d68ab5535 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Tue, 1 Aug 2023 18:24:18 +0900 Subject: [PATCH 26/39] Upgrade PFAPACK77 w/ 2-Step Algos --- src/pfapack | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pfapack b/src/pfapack index ebaa11b7..af8471ae 160000 --- a/src/pfapack +++ b/src/pfapack @@ -1 +1 @@ -Subproject commit ebaa11b722dbb5f1ddc62da607592d2e301c0c4a +Subproject commit af8471aeaf1ff1c07aef6be196ca2ddbb09726d2 From 01824c2e3c986e56f64309f789aa67082f809749 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Mon, 6 Nov 2023 17:17:38 +0900 Subject: [PATCH 27/39] Adopt Fix in PFAPACK77 2-Step --- src/pfapack | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pfapack b/src/pfapack index af8471ae..7826344a 160000 --- a/src/pfapack +++ b/src/pfapack @@ -1 +1 @@ -Subproject commit af8471aeaf1ff1c07aef6be196ca2ddbb09726d2 +Subproject commit 7826344af031cfe554d1e26f0e7d0b21cb510c9c From f40bd6bd4a277ae09dde187c286cdfae8246ac0c Mon Sep 17 00:00:00 2001 From: Ruqing Xu Date: Sat, 29 Jun 2024 17:28:00 +0900 Subject: [PATCH 28/39] Batched GF2 for Real non-fsz --- src/mVMC/calham_real.c | 131 ++++++++++++++++++++++++++++++++++------- src/mVMC/locgrn_real.c | 25 ++++++-- 2 files changed, 132 insertions(+), 24 deletions(-) diff --git a/src/mVMC/calham_real.c b/src/mVMC/calham_real.c index 660aa2d4..08149c56 100644 --- a/src/mVMC/calham_real.c +++ b/src/mVMC/calham_real.c @@ -51,6 +51,18 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg double *myBuffer; double myEnergy; + const int nHamiltonianTwo = NPairHopping + NExchangeCoupling * 2 + NInterAll; + int *lazy_info = malloc( sizeof(int) * nHamiltonianTwo * 2); + int *lazy_rsi = malloc(2 * sizeof(int) * nHamiltonianTwo); + int *lazy_msj = malloc(2 * sizeof(int) * nHamiltonianTwo); + double *lazy_ip = malloc(sizeof(double) * nHamiltonianTwo); + double *lazy_pfa = malloc(sizeof(double) * nHamiltonianTwo * NQPFull); + memset(lazy_info, 0, sizeof(int) * nHamiltonianTwo); + memset(lazy_info + nHamiltonianTwo, -1, sizeof(int) * nHamiltonianTwo); + + for (int mi=0; mi/ */ /* buffer size = NQPFull+2*Nsize */ -double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl, +double GreenFunc2_real_(const int ri, const int rj, const int rk, const int rl, const int s, const int t, const double ip, int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt, - int *projCntNew, double *buffer) { + int *projCntNew, double *buffer, + int *lazy_info, int *lazy_rsi, int *lazy_msj) { double z; int mj,msj,ml,mtl; int rsi,rsj,rtk,rtl; @@ -149,8 +150,17 @@ double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl, z = ProjRatio(projCntNew,eleProjCnt); /* calculate Pfaffian */ - CalculateNewPfMTwo_real(ml, t, mj, s, pfMNew_real, eleIdx, 0, NQPFull, bufV); - z *= CalculateIP_real(pfMNew_real, 0, NQPFull, MPI_COMM_SELF); + if (!lazy_info) { + CalculateNewPfMTwo_real(ml, t, mj, s, pfMNew_real, eleIdx, 0, NQPFull, bufV); + z *= CalculateIP_real(pfMNew_real, 0, NQPFull, MPI_COMM_SELF); + } else { + // Save invocation hook instead of calling. + lazy_msj[0] = msj; //< to edit to ri, revert to rj + lazy_msj[1] = mtl; //< to edit to rk, revert to rl + lazy_rsi[0] = rsi; + lazy_rsi[1] = rtk; + lazy_info[0] = 1; //< "is delayed" flag.. + } /* revert hopping */ eleIdx[mtl] = rl; @@ -163,6 +173,13 @@ double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl, return z/ip;//TBC } +double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl, + const int s, const int t, const double ip, + int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt, + int *projCntNew, double *buffer) { + return GreenFunc2_real_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, projCntNew, buffer, 0, 0, 0); +} + /// Calculate n-body Green function /// From 8b831ea853cbba32a15e90b20b5b500aef23e908 Mon Sep 17 00:00:00 2001 From: Ruqing Xu Date: Sat, 29 Jun 2024 17:30:06 +0900 Subject: [PATCH 29/39] Heuristics --- src/mVMC/calham_real.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mVMC/calham_real.c b/src/mVMC/calham_real.c index 08149c56..f9111197 100644 --- a/src/mVMC/calham_real.c +++ b/src/mVMC/calham_real.c @@ -244,8 +244,7 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg /* Batch-compute 2-body Green's functions. */ #pragma omp barrier - // if ( Nsize <= 100 ) // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2. - if ( 1 ) + if ( Nsize <= 400 ) // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2. { int num_qp_var0 = 0; // Pack lazy info. From 5b474aa9c42ecb729af94e6131dd65a8ca09ab9c Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 29 Jun 2024 17:38:50 +0900 Subject: [PATCH 30/39] Remove clutter print --- src/mVMC/calham_real.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mVMC/calham_real.c b/src/mVMC/calham_real.c index f9111197..fc2e76b9 100644 --- a/src/mVMC/calham_real.c +++ b/src/mVMC/calham_real.c @@ -259,7 +259,6 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg } num_qp_var0++; } - fprintf(stdout, "GF2: %d / %d\n", num_qp_var0, nHamiltonianTwo); #pragma omp barrier updated_tdi_v_omp_var0_proc_batch_greentwo_d(NQPFull, num_qp_var0, NULL, lazy_info + nHamiltonianTwo, From fa841f48b51bc7b7d3ed43ca7161673c70227af8 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 29 Jun 2024 17:53:53 +0900 Subject: [PATCH 31/39] real calc use makeInitialSample: non-fsz --- src/mVMC/vmcmake_real.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mVMC/vmcmake_real.c b/src/mVMC/vmcmake_real.c index 3570b956..9b73521f 100644 --- a/src/mVMC/vmcmake_real.c +++ b/src/mVMC/vmcmake_real.c @@ -61,7 +61,7 @@ void VMCMakeSample_real(MPI_Comm comm) { StartTimer(30); if (BurnFlag == 0) { - makeInitialSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt, + makeInitialSample_real(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt, qpStart, qpEnd, comm); } else { copyFromBurnSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt); @@ -104,7 +104,7 @@ void VMCMakeSample_real(MPI_Comm comm) { if (!isfinite(logIpOld)) { if (rank == 0) fprintf(stderr, "waring: VMCMakeSample remakeSample logIpOld=%e\n", creal(logIpOld)); //TBC - makeInitialSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt, + makeInitialSample_real(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt, qpStart, qpEnd, comm); #ifdef _pf_block_update // Clear and reinitialize. From 5a43674237eed281c2e6d6fba21f572497e18d7f Mon Sep 17 00:00:00 2001 From: Ruqing Xu Date: Sun, 30 Jun 2024 20:14:45 +0900 Subject: [PATCH 32/39] Use Direct-blocked+Left-looking Pfapack - Strange (Complex&ABI suspected) error for skip=2 aggregate-block - Direct-block + left-looking is faster --- src/mVMC/matrix.c | 26 ++++++++++++++------------ src/pfapack | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/mVMC/matrix.c b/src/mVMC/matrix.c index c91ba200..646c8f18 100644 --- a/src/mVMC/matrix.c +++ b/src/mVMC/matrix.c @@ -162,15 +162,15 @@ int calculateMAll_child_fsz(const int *eleIdx,const int *eleSpn, const int qpSta } } - M_ZSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info); + M_ZSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info); if(info!=0) return info; - utu2pfa_z(n, invM, lda, iwork, &pfaff); + ltl2pfa_z(n, invM, lda, iwork, &pfaff); if(!isfinite(creal(pfaff) + cimag(pfaff))) return qpidx+1; PfM[qpidx] = pfaff; /* Calculate inverse. */ - utu2inv_z(n, invM, lda, iwork, work, bufM, lda); + ltl2inv_z(n, invM, lda, iwork, work, bufM, lda); /* InvM -> InvM(T) -> -InvM */ M_ZSCAL(&nsq, &minus_one, invM, &one); @@ -261,15 +261,15 @@ int calculateMAll_child_fsz_real(const int *eleIdx,const int *eleSpn, const int } /* Calculate Pf M */ - M_DSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info); + M_DSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info); if(info!=0) return info; - utu2pfa_d(n, invM, lda, iwork, &pfaff); + ltl2pfa_d(n, invM, lda, iwork, &pfaff); if(!isfinite(pfaff)) return qpidx+1; PfM_real[qpidx] = pfaff; /* Calculate inverse. */ - utu2inv_d(n, invM, lda, iwork, work, bufM, lda); + ltl2inv_d(n, invM, lda, iwork, work, bufM, lda); /* InvM -> InvM(T) -> -InvM */ M_DSCAL(&nsq, &minus_one, invM, &one); @@ -369,15 +369,15 @@ int calculateMAll_child_fcmp(const int *eleIdx, const int qpStart, const int qpE } /* Calculate Pf M */ - M_ZSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info); + M_ZSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info); if(info!=0) return info; - utu2pfa_z(n, invM, lda, iwork, &pfaff); + ltl2pfa_z(n, invM, lda, iwork, &pfaff); if(!isfinite(creal(pfaff) + cimag(pfaff))) return qpidx+1; PfM[qpidx] = pfaff; /* Calculate inverse. */ - utu2inv_z(n, invM, lda, iwork, work, bufM, lda); + ltl2inv_z(n, invM, lda, iwork, work, bufM, lda); /* mVMC's handling InvM as row-major, * i.e. InvM needs a transpose, InvM -> -InvM according antisymmetric properties. */ @@ -596,15 +596,17 @@ int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpE } /* calculate Pf M */ - M_DSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info); + M_DSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info); + // printf("M_DSKTRF info:%d\n", info); if(info!=0) return info; - utu2pfa_d(n, invM, lda, iwork, &pfaff); + ltl2pfa_d(n, invM, lda, iwork, &pfaff); + // printf("pfaff:%e\n", pfaff); if(!isfinite(pfaff)) return qpidx+1; PfM_real[qpidx] = pfaff; /* Compute inverse */ - utu2inv_d(n, invM, lda, iwork, work, bufM, lda); + ltl2inv_d(n, invM, lda, iwork, work, bufM, lda); // InvM -> InvM' = -InvM // TODO: Include this in ltl2inv diff --git a/src/pfapack b/src/pfapack index 7826344a..f541a51a 160000 --- a/src/pfapack +++ b/src/pfapack @@ -1 +1 @@ -Subproject commit 7826344af031cfe554d1e26f0e7d0b21cb510c9c +Subproject commit f541a51a3a6968b84117cf12f69b0776053a79e3 From 27dab862991c1bd3628295622d7907b9d55de8f4 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sun, 30 Jun 2024 20:16:42 +0900 Subject: [PATCH 33/39] Correct para amp See TorchVMC dev notes --- src/mVMC/parameter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mVMC/parameter.c b/src/mVMC/parameter.c index 9167c941..fc0b59ef 100644 --- a/src/mVMC/parameter.c +++ b/src/mVMC/parameter.c @@ -29,7 +29,7 @@ along with this program. If not, see http://www.gnu.org/licenses/. #ifndef _SRC_PARAMETER #define _SRC_PARAMETER -#define D_AmpMax 4.0 +#define D_AmpMax 0.5 /* initialize variational parameters */ void InitParameter() { From 65fed488cd7d678d8235de45d0a234079fad5bbb Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Mon, 1 Jul 2024 00:12:35 +0900 Subject: [PATCH 34/39] Batched GF2 for non-fsz --- src/mVMC/calgrn.c | 86 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 5 deletions(-) diff --git a/src/mVMC/calgrn.c b/src/mVMC/calgrn.c index 8d13215b..e81503da 100644 --- a/src/mVMC/calgrn.c +++ b/src/mVMC/calgrn.c @@ -38,6 +38,17 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in int *myEleIdx, *myEleNum, *myProjCntNew; double complex *myBuffer; + int *lazy_info = malloc( sizeof(int) * NCisAjsCktAltDC * 2); + int *lazy_rsi = malloc(2 * sizeof(int) * NCisAjsCktAltDC); + int *lazy_msj = malloc(2 * sizeof(int) * NCisAjsCktAltDC); + double complex *lazy_ip = malloc(sizeof(double complex) * NCisAjsCktAltDC); + double complex *lazy_pfa = malloc(sizeof(double complex) * NCisAjsCktAltDC * NQPFull); + memset(lazy_info, 0, sizeof(int) * NCisAjsCktAltDC); + memset(lazy_info + NCisAjsCktAltDC, -1, sizeof(int) * NCisAjsCktAltDC); + + for (int mi=0; mi Date: Wed, 2 Oct 2024 22:53:15 +0900 Subject: [PATCH 35/39] Merge .github/workflows changes ahead of #64 s.t. testing goes ahead of actual merging --- .github/workflows/deploy_docs.yml | 16 +++++++++++----- .github/workflows/main.yml | 8 +++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml index 02ab63ca..9c6855af 100644 --- a/.github/workflows/deploy_docs.yml +++ b/.github/workflows/deploy_docs.yml @@ -4,6 +4,7 @@ on: push: branches: - master + - develop - gh_actions # test branch - '!gh-pages' tags: '*' @@ -13,21 +14,21 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Inject slug/short variables - uses: rlespinasse/github-slug-action@v3.x + uses: rlespinasse/github-slug-action@v4.x - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: path: main - name: Checkout gh-pages - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: ref: gh-pages path: gh-pages - name: Set up Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v4 with: python-version: 3.8 @@ -42,7 +43,7 @@ jobs: cmake -E make_directory build cd build cmake -DDocument=ON -DENABLE_MPI=OFF ../ - make doc-ja-html doc-en-html + make doc-ja-html doc-en-html tutorial-ja-html - name: Deploy Configuration run: | @@ -63,6 +64,11 @@ jobs: mkdir -p "gh-pages/doc/${TARGET_NAME}" cp -r "main/build/doc/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/${lang}" done + for lang in ja; do + rm -rf "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}" + mkdir -p "gh-pages/doc/${TARGET_NAME}"/tutorial/ + cp -r "main/build/doc/tutorial/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}" + done cd gh-pages git config --local user.name "${GIT_USER}" git config --local user.email "${GIT_EMAIL}" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b49b05fe..4f66d3a6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,6 +2,7 @@ name: CI on: push: + pull_request: schedule: - cron: '0 0 1,15 * *' # JST 9:00 on 1st and 15th every month @@ -32,7 +33,7 @@ jobs: - name: brew if: ${{ runner.os == 'macOS' }} run: | - brew install openmpi scalapack libomp + brew install openmpi scalapack libomp blis - name: Setup Python uses: actions/setup-python@v5 @@ -52,13 +53,10 @@ jobs: run: | if [ ${{ runner.os }} = "macOS" ] ; then # CONFIG=apple requires gfortran but macOS runner has not, but gfortran-11, 12, ... - ln -s `which gfortran-11` gfortran - env PATH=`pwd`:$PATH cmake -DCONFIG=apple -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE + cmake -DCONFIG=apple -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_Fortran_COMPILER=gfortran-14 $GITHUB_WORKSPACE else cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE fi - env: - HOMEBREW_PREFIX: /opt/homebrew - name: build working-directory: ${{runner.workspace}}/build From 10bdc6c59396646628c28e1f7b6afd221d03cb93 Mon Sep 17 00:00:00 2001 From: Ruqing Xu Date: Wed, 2 Oct 2024 23:02:34 +0900 Subject: [PATCH 36/39] Merge upstream config changes --- .github/workflows/deploy_docs.yml | 16 +++++--- .github/workflows/main.yml | 68 ++++++++++++++++--------------- config/apple.cmake | 12 +++--- config/gcc.cmake | 1 + 4 files changed, 54 insertions(+), 43 deletions(-) diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml index 02ab63ca..9c6855af 100644 --- a/.github/workflows/deploy_docs.yml +++ b/.github/workflows/deploy_docs.yml @@ -4,6 +4,7 @@ on: push: branches: - master + - develop - gh_actions # test branch - '!gh-pages' tags: '*' @@ -13,21 +14,21 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Inject slug/short variables - uses: rlespinasse/github-slug-action@v3.x + uses: rlespinasse/github-slug-action@v4.x - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: path: main - name: Checkout gh-pages - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: ref: gh-pages path: gh-pages - name: Set up Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v4 with: python-version: 3.8 @@ -42,7 +43,7 @@ jobs: cmake -E make_directory build cd build cmake -DDocument=ON -DENABLE_MPI=OFF ../ - make doc-ja-html doc-en-html + make doc-ja-html doc-en-html tutorial-ja-html - name: Deploy Configuration run: | @@ -63,6 +64,11 @@ jobs: mkdir -p "gh-pages/doc/${TARGET_NAME}" cp -r "main/build/doc/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/${lang}" done + for lang in ja; do + rm -rf "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}" + mkdir -p "gh-pages/doc/${TARGET_NAME}"/tutorial/ + cp -r "main/build/doc/tutorial/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}" + done cd gh-pages git config --local user.name "${GIT_USER}" git config --local user.email "${GIT_EMAIL}" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8a67bd55..4f66d3a6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,48 +1,48 @@ name: CI -on: [push] +on: + push: + pull_request: + schedule: + - cron: '0 0 1,15 * *' # JST 9:00 on 1st and 15th every month jobs: ctest: - runs-on: ubuntu-20.04 + runs-on: ${{ matrix.os }} + + strategy: + matrix: + os: ["ubuntu-22.04", "ubuntu-20.04", "macos-latest"] + ompsize: [1, 4] + exclude: + - os: "macos-latest" + ompsize: 4 # OMP on macOS is too slow + fail-fast: false + env: + OMP_NUM_THREADS: ${{ matrix.ompsize }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: apt + if: ${{ runner.os == 'Linux' }} run: | sudo apt update - sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev wget + sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev - - name: pip + - name: brew + if: ${{ runner.os == 'macOS' }} run: | - python -m pip install numpy - python3 -m pip install numpy + brew install openmpi scalapack libomp blis - - name: eigen - working-directory: ${{runner.workspace}} - run: | - wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz - tar -zxvf eigen-3.4.0.tar.gz - mkdir eigen-build - cd eigen-build - cmake ../eigen-3.4.0 - make -j4 - sudo make install + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" - - name: blis - working-directory: ${{runner.workspace}} + - name: pip run: | - mkdir blis - cd blis - git init - git remote add origin https://github.com/flame/blis - git fetch --depth 1 origin e3d352f1fcc93e6a46fde1aa4a7f0a18fb27bd42 - git checkout FETCH_HEAD - ./configure -t none -p ${{runner.workspace}}/blis-inst x86_64 - make -j8 install - cd ${{runner.workspace}}/blis-inst - tar -zcvf ${{runner.workspace}}/libblis_artifact.tar.gz include lib + python3 -m pip install numpy - name: make workspace run: cmake -E make_directory ${{runner.workspace}}/build @@ -51,10 +51,12 @@ jobs: working-directory: ${{runner.workspace}}/build shell: bash run: | - cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE - # Override downloaded artifact. - # TODO: Rebuild the artifacts. - mv ${{runner.workspace}}/libblis_artifact.tar.gz . + if [ ${{ runner.os }} = "macOS" ] ; then + # CONFIG=apple requires gfortran but macOS runner has not, but gfortran-11, 12, ... + cmake -DCONFIG=apple -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_Fortran_COMPILER=gfortran-14 $GITHUB_WORKSPACE + else + cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE + fi - name: build working-directory: ${{runner.workspace}}/build diff --git a/config/apple.cmake b/config/apple.cmake index 16ff1130..37a7a290 100644 --- a/config/apple.cmake +++ b/config/apple.cmake @@ -1,16 +1,18 @@ # for Apple clang compiler # additional libomp and gfortran installation required # mac computers are suggested to use this configuration for better performance + +if(NOT DEFINED ENV{HOMEBREW_PREFIX}) + message(FATAL "Homebrew is not installed. Please install Homebrew first.") +endif() + set(CMAKE_C_COMPILER "clang" CACHE STRING "" FORCE) set(CMAKE_CXX_COMPILER "clang++" CACHE STRING "" FORCE) set(CMAKE_C_FLAGS_DEBUG "-g -O0 -Wall -Wformat -Werror=format-security") set(CMAKE_C_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas -Wno-logical-not-parentheses") -set(CMAKE_Fortran_COMPILER "gfortran" CACHE STRING "" FORCE) -add_definitions(-DF77_COMPLEX_RET_INTEL) # OpenMP with libomp -set(CMAKE_EXE_LINKER_FLAGS "-L/usr/local/lib -lomp") -set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp" CACHE STRING "" FORCE) +set(CMAKE_EXE_LINKER_FLAGS "-L$ENV{HOMEBREW_PREFIX}/opt/libomp/lib -lomp") +set(OpenMP_C_FLAGS "-I$ENV{HOMEBREW_PREFIX}/opt/libomp/include -Xpreprocessor -fopenmp" CACHE STRING "" FORCE) set(OpenMP_C_LIB_NAMES "") set(CMAKE_OSX_SYSROOT "") - diff --git a/config/gcc.cmake b/config/gcc.cmake index 751912d7..8a15d2dc 100644 --- a/config/gcc.cmake +++ b/config/gcc.cmake @@ -3,4 +3,5 @@ set(CMAKE_C_COMPILER "gcc" CACHE STRING "" FORCE) set(CMAKE_CXX_COMPILER "g++" CACHE STRING "" FORCE) set(CMAKE_C_FLAGS_DEBUG "-g -O0 -Wall -Wformat -Werror=format-security") set(CMAKE_C_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas ") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas ") set(CMAKE_Fortran_COMPILER "gfortran" CACHE STRING "" FORCE) From 1c190121b6d4e297a5750729689aacffe0cbfa41 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Thu, 3 Oct 2024 17:16:10 +0900 Subject: [PATCH 37/39] Update main.yml Add Eigen3 to CI deps --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4f66d3a6..25c18f04 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -28,12 +28,12 @@ jobs: if: ${{ runner.os == 'Linux' }} run: | sudo apt update - sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev + sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev libeigen3-dev - name: brew if: ${{ runner.os == 'macOS' }} run: | - brew install openmpi scalapack libomp blis + brew install openmpi scalapack libomp blis eigen - name: Setup Python uses: actions/setup-python@v5 From 805b19d10a5537a9effe28614b874a5507cc3249 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Thu, 3 Oct 2024 17:29:37 +0900 Subject: [PATCH 38/39] Update download_blis_artifact.cmake Use BLIS artifact from Julia binary builders --- download_blis_artifact.cmake | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/download_blis_artifact.cmake b/download_blis_artifact.cmake index ec9fc8a7..b2033f92 100644 --- a/download_blis_artifact.cmake +++ b/download_blis_artifact.cmake @@ -4,20 +4,8 @@ if(NOT DEFINED BLIS_ARTIFACT_CONFIG) COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE) if(${ARCHITECTURE} STREQUAL "x86_64") - execute_process( - COMMAND cat /proc/cpuinfo - COMMAND grep Intel - COMMAND head -1 - COMMAND tr -d '\n' - OUTPUT_VARIABLE UARCH_INTEL) - if(NOT "${UARCH_INTEL} " STREQUAL " ") - set(BLIS_ARTIFACT_CONFIG "intel64") - else() - set(BLIS_ARTIFACT_CONFIG "amd64") - endif() - elseif(${ARCHITECTURE} STREQUAL "aarch64" OR - ${ARCHITECTURE} STREQUAL "arm64") - # Currently SVE requires being set manually. + set(BLIS_ARTIFACT_CONFIG "x86_64") + elseif(${ARCHITECTURE} STREQUAL "aarch64" OR ${ARCHITECTURE} STREQUAL "arm64") set(BLIS_ARTIFACT_CONFIG "arm64") else() message(FATAL_ERROR "Failed to recognize architecture ${ARCHITECTURE}.") @@ -27,14 +15,10 @@ endif(NOT DEFINED BLIS_ARTIFACT_CONFIG) message(STATUS "Downloading BLIS artifact...") set(BLIS_ARCHIVE ${CMAKE_CURRENT_BINARY_DIR}/libblis_artifact.tar.gz) -if(${BLIS_ARTIFACT_CONFIG} STREQUAL "intel64") - set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_intel64_gcc.tar.gz) -elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "amd64") - set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_amd64_gcc.tar.gz) -elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "arm64") - set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_cortexa57_aarch64-linux-gnu-gcc.tar.gz) -elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "armsve") - set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_armsve_aarch64-linux-gnu-gcc-10.tar.gz) +if(${BLIS_ARTIFACT_CONFIG} STREQUAL "x86_64") + set(BLIS_ARTIFACT_URL https://github.com/JuliaBinaryWrappers/blis_jll.jl/releases/download/blis-v0.9.0%2B4/blis.v0.9.0.x86_64-linux-gnu.tar.gz) +elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "arm64" OR ${BLIS_ARTIFACT_CONFIG} STREQUAL "armsve") + set(BLIS_ARTIFACT_URL https://github.com/JuliaBinaryWrappers/blis_jll.jl/releases/download/blis-v0.9.0%2B4/blis.v0.9.0.aarch64-linux-gnu.tar.gz) elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "a64fx") set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_a64fx_aarch64-linux-gnu-gcc-10.tar.gz) else() From 3203b346aab774b72806965ebb4599641e402a89 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Thu, 3 Oct 2024 23:10:28 +0900 Subject: [PATCH 39/39] Update CMakeLists.txt ${ARCHITECTURE} might be empty --- src/pfupdates/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pfupdates/CMakeLists.txt b/src/pfupdates/CMakeLists.txt index 11b0202b..a7ef83a4 100644 --- a/src/pfupdates/CMakeLists.txt +++ b/src/pfupdates/CMakeLists.txt @@ -5,7 +5,7 @@ if(${CMAKE_PROJECT_NAME} STREQUAL "Project") message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.") endif(${CMAKE_PROJECT_NAME} STREQUAL "Project") -if(${ARCHITECTURE} STREQUAL "x86_64") +if("${ARCHITECTURE}" STREQUAL "x86_64") add_definitions(-DF77_COMPLEX_RET_INTEL) endif() include_directories(../ltl2inv)