From 28da2231604befd10ef3f45a75ac5099eaa4f5b2 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 11:25:33 +0900
Subject: [PATCH 01/39] Port from libVMC

---
 src/common/blalink.hh        | 518 -----------------------------------
 src/common/blalink_fort.h    | 184 -------------
 src/common/colmaj.hh         |  26 --
 src/ltl2inv/blalink_gemmt.c  |  42 ---
 src/ltl2inv/blalink_gemmt.hh |  80 ------
 src/ltl2inv/eigen_gemmt.tcc  |  77 ++++++
 src/ltl2inv/error.hh         |  42 +++
 src/ltl2inv/gemmt2blas.cc    |  35 +++
 src/ltl2inv/ilaenv_lauum.cc  |  10 +-
 src/ltl2inv/ilaenv_lauum.hh  |   3 +-
 src/ltl2inv/invert.tcc       |  92 ++++---
 src/ltl2inv/lapack2eigen.hh  | 272 ++++++++++++++++++
 src/ltl2inv/ltl2inv.cc       |  54 ++--
 src/ltl2inv/makefile_ltl2inv |   9 +-
 src/ltl2inv/pfaffian.tcc     |  80 ++++--
 src/ltl2inv/testinv.cc       | 125 ---------
 src/ltl2inv/trmmt.tcc        |  28 +-
 17 files changed, 602 insertions(+), 1075 deletions(-)
 delete mode 100644 src/common/blalink.hh
 delete mode 100644 src/common/blalink_fort.h
 delete mode 100644 src/common/colmaj.hh
 delete mode 100644 src/ltl2inv/blalink_gemmt.c
 delete mode 100644 src/ltl2inv/blalink_gemmt.hh
 create mode 100644 src/ltl2inv/eigen_gemmt.tcc
 create mode 100644 src/ltl2inv/error.hh
 create mode 100644 src/ltl2inv/gemmt2blas.cc
 create mode 100644 src/ltl2inv/lapack2eigen.hh
 delete mode 100644 src/ltl2inv/testinv.cc
diff --git a/src/common/blalink.hh b/src/common/blalink.hh
deleted file mode 100644
index 2750c34c..00000000
--- a/src/common/blalink.hh
+++ /dev/null
@@ -1,518 +0,0 @@
-/**
- * \file blalink.hh
- * C++ to C type wrapper for BLIS type-apis.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at https://mozilla.org/MPL/2.0/.
- */
-#pragma once
-#include <complex>
-#include <cassert>
-typedef std::complex<float>  ccscmplx;
-typedef std::complex<double> ccdcmplx;
-// BLIS definitions.
-#include "blis.h"
-// BLAS definitions
-#include "blalink_fort.h"
-
-// error info
-typedef enum {
-    Pfaffine_SIGN_ERR = 1,
-    Pfaffine_OUT_OF_BOUND,
-    Pfaffine_BAD_SCRATCHPAD,
-    Pfaffine_BAD_REPRESENTATION,
-    Pfaffine_INTEGER_OVERFLOW,
-    Pfaffine_DOUBLE_NAN_DETECTED,
-    Pfaffine_NOT_IMPLEMNTED,
-    Pfaffine_NUM_ERROR_TYPE
-} skpfa_error_t;
-inline signed err_info(signed type, signed pos)
-{ return type * Pfaffine_NUM_ERROR_TYPE + pos; }
-
-// gemm
-template <typename T>
-inline void gemm(trans_t transa, trans_t transb,
-                 dim_t m, dim_t n, dim_t k,
-                 T alpha,
-                 T *a, inc_t lda,
-                 T *b, inc_t ldb,
-                 T beta,
-                 T *c, inc_t ldc);
-#ifndef BLAS_EXTERNAL
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void gemm<cctype>(trans_t transa, trans_t transb, \
-                                         dim_t m, dim_t n, dim_t k, \
-                                         cctype alpha, \
-                                         cctype *a, inc_t lda, \
-                                         cctype *b, inc_t ldb, \
-                                         cctype beta, \
-                                         cctype *c, inc_t ldc) \
-    { \
-        bli_##cchar##gemm(transa, transb, \
-                          m, n, k, \
-                          (ctype *)&alpha, \
-                          (ctype *)a, 1, lda, \
-                          (ctype *)b, 1, ldb, \
-                          (ctype *)&beta, \
-                          (ctype *)c, 1, ldc); \
-    }
-#else
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void gemm<cctype>(trans_t transa, trans_t transb, \
-                                         dim_t m, dim_t n, dim_t k, \
-                                         cctype alpha, \
-                                         cctype *a, inc_t lda, \
-                                         cctype *b, inc_t ldb, \
-                                         cctype beta, \
-                                         cctype *c, inc_t ldc) \
-    { \
-        char ta = trans2char(transa), \
-             tb = trans2char(transb); \
-        cchar##gemm_(&ta, &tb, &m, &n, &k, \
-                     (ctype *)&alpha, \
-                     (ctype *)a, &lda, \
-                     (ctype *)b, &ldb, \
-                     (ctype *)&beta, \
-                     (ctype *)c, &ldc); \
-    }
-#endif
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
-
-// ger
-template <typename T>
-inline void ger(dim_t m, dim_t n,
-                T alpha,
-                T *x, inc_t incx,
-                T *y, inc_t incy,
-                T *a, inc_t lda);
-#ifndef BLAS_EXTERNAL
-#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \
-    template <> inline void ger<cctype>(dim_t m, dim_t n, \
-                                        cctype alpha, \
-                                        cctype *x, inc_t incx, \
-                                        cctype *y, inc_t incy, \
-                                        cctype *a, inc_t lda) \
-    { \
-        bli_##cchar##ger(BLIS_NO_CONJUGATE, \
-                         BLIS_NO_CONJUGATE, \
-                         m, n, \
-                         (ctype *)&alpha, \
-                         (ctype *)x, incx, \
-                         (ctype *)y, incy, \
-                         (ctype *)a, 1, lda); \
-    }
-#else
-#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \
-    template <> inline void ger<cctype>(dim_t m, dim_t n, \
-                                        cctype alpha, \
-                                        cctype *x, inc_t incx, \
-                                        cctype *y, inc_t incy, \
-                                        cctype *a, inc_t lda) \
-    { \
-        cchar##cfunc##_(&m, &n, \
-                        (ctype *)&alpha, \
-                        (ctype *)x, &incx, \
-                        (ctype *)y, &incy, \
-                        (ctype *)a, &lda); \
-    }
-#endif
-BLALINK_MAC( float,    float,    s, ger  )
-BLALINK_MAC( double,   double,   d, ger  )
-BLALINK_MAC( ccscmplx, scomplex, c, geru )
-BLALINK_MAC( ccdcmplx, dcomplex, z, geru )
-#undef BLALINK_MAC
-
-
-// gemv
-template <typename T>
-inline void gemv(trans_t trans,
-                 dim_t m, dim_t n,
-                 T alpha,
-                 T *a, inc_t lda,
-                 T *x, inc_t incx,
-                 T beta,
-                 T *y, inc_t incy);
-#ifndef BLAS_EXTERNAL
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void gemv<cctype>(trans_t trans, \
-                                         dim_t m, dim_t n, \
-                                         cctype alpha, \
-                                         cctype *a, inc_t lda, \
-                                         cctype *x, inc_t incx, \
-                                         cctype beta, \
-                                         cctype *y, inc_t incy) \
-    { \
-        bli_##cchar##gemv(trans, \
-                          BLIS_NO_CONJUGATE, \
-                          m, n, \
-                          (ctype *)&alpha, \
-                          (ctype *)a, 1, lda, \
-                          (ctype *)x, incx, \
-                          (ctype *)&beta, \
-                          (ctype *)y, incy); \
-    }
-#else
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void gemv<cctype>(trans_t trans, \
-                                         dim_t m, dim_t n, \
-                                         cctype alpha, \
-                                         cctype *a, inc_t lda, \
-                                         cctype *x, inc_t incx, \
-                                         cctype beta, \
-                                         cctype *y, inc_t incy) \
-    { \
-        char t = trans2char(trans); \
-        cchar##gemv_(&t, &m, &n, \
-                     (ctype *)&alpha, \
-                     (ctype *)a, &lda, \
-                     (ctype *)x, &incx, \
-                     (ctype *)&beta, \
-                     (ctype *)y, &incy); \
-    }
-#endif
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
-
-// trmm
-template <typename T>
-inline void trmm(side_t sidea, 
-                 uplo_t uploa, 
-                 trans_t transa, 
-                 diag_t diaga,
-                 dim_t m, dim_t n,
-                 T alpha, 
-                 T *a, inc_t lda, 
-                 T *b, inc_t ldb);
-#ifndef BLAS_EXTERNAL
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void trmm<cctype>(side_t sidea, uplo_t uploa, trans_t transa, diag_t diaga, \
-                                         dim_t m, dim_t n, \
-                                         cctype alpha, \
-                                         cctype *a, inc_t lda, \
-                                         cctype *b, inc_t ldb) \
-    { \
-        bli_##cchar##trmm(sidea, uploa, transa, diaga, \
-                          m, n, \
-                          (ctype *)&alpha, \
-                          (ctype *)a, 1, lda, \
-                          (ctype *)b, 1, ldb); \
-    }
-#else
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void trmm<cctype>(side_t sidea, uplo_t uploa, trans_t transa, diag_t diaga, \
-                                         dim_t m, dim_t n, \
-                                         cctype alpha, \
-                                         cctype *a, inc_t lda, \
-                                         cctype *b, inc_t ldb) \
-    { \
-        char ul = uplo2char(uploa), \
-             si = side2char(sidea), \
-             tr = trans2char(transa), \
-             dg = diag2char(diaga); \
-        cchar##trmm_(&si, &ul, &tr, &dg, \
-                     &m, &n, \
-                     (ctype *)&alpha, \
-                     (ctype *)a, &lda, \
-                     (ctype *)b, &ldb); \
-    }
-#endif
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
-
-// trmv
-template <typename T>
-inline void trmv(uplo_t uploa,
-                 trans_t transa,
-                 dim_t m,
-                 T alpha,
-                 T *a, inc_t lda,
-                 T *x, inc_t incx);
-#ifndef BLAS_EXTERNAL
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void trmv<cctype>(uplo_t uploa, trans_t transa, dim_t m, \
-                                         cctype alpha, \
-                                         cctype *a, inc_t lda, \
-                                         cctype *x, inc_t incx) \
-    { \
-        bli_##cchar##trmv(uploa, transa, \
-                          BLIS_NONUNIT_DIAG, m, \
-                          (ctype *)&alpha, \
-                          (ctype *)a, 1, lda, \
-                          (ctype *)x, incx); \
-    }
-#else
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void trmv<cctype>(uplo_t uploa, trans_t transa, dim_t m, \
-                                         cctype alpha, \
-                                         cctype *a, inc_t lda, \
-                                         cctype *x, inc_t incx) \
-    { \
-        char ul = uplo2char(uploa), \
-             tr = trans2char(transa), \
-             dg = 'N'; \
-        /*
-         * TRMV has alpha in its templated interface, which is absent in BLAS.
-         */ \
-        assert(alpha == cctype(1.0)); \
-        cchar##trmv_(&ul, &tr, &dg, &m, \
-                     (ctype *)a, &lda, \
-                     (ctype *)x, &incx); \
-    }
-#endif
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
-
-// swap
-template <typename T>
-inline void swap(dim_t n,
-                 T *x, inc_t incx,
-                 T *y, inc_t incy);
-#ifndef BLAS_EXTERNAL
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void swap<cctype>(dim_t n, \
-                                         cctype *x, inc_t incx, \
-                                         cctype *y, inc_t incy) \
-    { \
-        bli_##cchar##swapv(n, \
-                           (ctype *)x, incx, \
-                           (ctype *)y, incy); \
-    }
-#else
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void swap<cctype>(dim_t n, \
-                                         cctype *x, inc_t incx, \
-                                         cctype *y, inc_t incy) \
-    { \
-        cchar##swap_(&n, \
-                     (ctype *)x, &incx, \
-                     (ctype *)y, &incy); \
-    }
-#endif
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
-
-// axpy
-template <typename T>
-inline void axpy(dim_t n,
-                 T alpha,
-                 T *x, inc_t incx,
-                 T *y, inc_t incy);
-#ifndef BLAS_EXTERNAL
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void axpy<cctype>(dim_t n, \
-                                         cctype alpha, \
-                                         cctype *x, inc_t incx, \
-                                         cctype *y, inc_t incy) \
-    { \
-        bli_##cchar##axpyv(BLIS_NO_CONJUGATE, n, \
-                           (ctype *)&alpha, \
-                           (ctype *)x, incx, \
-                           (ctype *)y, incy); \
-    }
-#else
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void axpy<cctype>(dim_t n, \
-                                         cctype alpha, \
-                                         cctype *x, inc_t incx, \
-                                         cctype *y, inc_t incy) \
-    { \
-        cchar##axpy_(&n, \
-                     (ctype *)&alpha, \
-                     (ctype *)x, &incx, \
-                     (ctype *)y, &incy); \
-    }
-#endif
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
-// dot
-template <typename T>
-inline T dot(dim_t n,
-             T *sx, inc_t incx,
-             T *sy, inc_t incy);
-#ifndef BLAS_EXTERNAL
-#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \
-    template <> inline cctype dot<cctype>(dim_t n, \
-                                          cctype *sx, inc_t incx, \
-                                          cctype *sy, inc_t incy) \
-    { \
-        cctype rho; \
-        bli_##cchar##dotv(BLIS_NO_CONJUGATE, \
-                          BLIS_NO_CONJUGATE, \
-                          n, \
-                          (ctype *)sx, incx, \
-                          (ctype *)sy, incy, \
-                          (ctype *)&rho); \
-        return rho; \
-    }
-#else
-#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \
-    template <> inline cctype dot<cctype>(dim_t n, \
-                                          cctype *sx, inc_t incx, \
-                                          cctype *sy, inc_t incy) \
-    { \
-        ctype rho = cchar##cfunc##_(&n, \
-                                    (ctype *)sx, &incx, \
-                                    (ctype *)sy, &incy); \
-        return *((cctype *)&rho); \
-    }
-#endif
-BLALINK_MAC( float,    float,    s, dot  )
-BLALINK_MAC( double,   double,   d, dot  )
-#if defined(BLAS_EXTERNAL) && defined(F77_COMPLEX_RET_INTEL)
-// Intel style complex return.
-#undef BLALINK_MAC
-#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \
-    template <> inline cctype dot<cctype>(dim_t n, \
-                                          cctype *sx, inc_t incx, \
-                                          cctype *sy, inc_t incy) \
-    { \
-        cctype rho; \
-        cchar##cfunc##_((ctype *)&rho, &n, \
-                        (ctype *)sx, &incx, \
-                        (ctype *)sy, &incy); \
-        return rho; \
-    }
-#endif
-BLALINK_MAC( ccscmplx, scomplex, c, dotu )
-BLALINK_MAC( ccdcmplx, dcomplex, z, dotu )
-#undef BLALINK_MAC
-
-
-
-// [LAPACK] lacpy
-template <typename T>
-inline void lacpy(uplo_t uploa,
-                  dim_t m, dim_t n,
-                  T *a, inc_t lda,
-                  T *b, inc_t ldb);
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void lacpy<cctype>(uplo_t uploa, \
-                                          dim_t m, dim_t n, \
-                                          cctype *a, inc_t lda, \
-                                          cctype *b, inc_t ldb) \
-    { \
-        char ul = uplo2char(uploa); \
-        cchar##lacpy_(&ul, &m, &n, a, &lda, b, &ldb); \
-    }
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
-// [LAPACK] trtri
-template <typename T>
-inline int trtri(uplo_t uploa,
-                 diag_t diaga,
-                 dim_t n,
-                 T *a, inc_t lda);
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline int trtri(uplo_t uploa, \
-                                 diag_t diaga, \
-                                 dim_t n, \
-                                 cctype *a, inc_t lda) \
-    { \
-        char ul = uplo2char(uploa); \
-        char dg = diag2char(diaga); \
-        int info; \
-        cchar##trtri_(&ul, &dg, &n, a, &lda, &info); \
-        return info; \
-    }
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
-// [PFAPACK] sktrf
-template <typename T>
-inline int sktrf(uplo_t uploa,
-                 dim_t n,
-                 T *a, inc_t lda,
-                 int *ipiv,
-                 T *work, dim_t lwork);
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline int sktrf(uplo_t uploa, \
-                                 dim_t n, \
-                                 cctype *a, inc_t lda, \
-                                 int *ipiv, \
-                                 cctype *work, dim_t lwork) \
-    { \
-        char ul = uplo2char(uploa); \
-        char mode = 'N'; \
-        int info; \
-        cchar##sktrf_(&ul, &mode, &n, a, &lda, ipiv, work, &lwork, &info); \
-        return info; \
-    }
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
-// [PFAPACK] skpfa
-template <typename T>
-inline int skpfa(uplo_t uploa,
-                 dim_t n,
-                 T *a, inc_t lda,
-                 T *pfa, int *ipiv,
-                 T *work, dim_t lwork);
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline int skpfa(uplo_t uploa, \
-                                 dim_t n, \
-                                 cctype *a, inc_t lda, \
-                                 cctype *pfa, int *ipiv, \
-                                 cctype *work, dim_t lwork) \
-    { \
-        char ul = uplo2char(uploa); \
-        char mthd = 'P'; \
-        int info; \
-        cchar##skpfa_(&ul, &mthd, &n, a, &lda, pfa, ipiv, work, &lwork, &info); \
-        return info; \
-    }
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-#undef BLALINK_MAC
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline int skpfa(uplo_t uploa, \
-                                 dim_t n, \
-                                 cctype *a, inc_t lda, \
-                                 cctype *pfa, int *ipiv, \
-                                 cctype *work, dim_t lwork) \
-    { \
-        char ul = uplo2char(uploa); \
-        char mthd = 'P'; \
-        int info; \
-        cchar##skpfa_(&ul, &mthd, &n, a, &lda, pfa, ipiv, work, &lwork, 0, &info); \
-        return info; \
-    }
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
diff --git a/src/common/blalink_fort.h b/src/common/blalink_fort.h
deleted file mode 100644
index eb0b3a39..00000000
--- a/src/common/blalink_fort.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/**
- * \file blalink_fort.hh
- * Wrapper for external BLAS.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at https://mozilla.org/MPL/2.0/.
- */
-#pragma once
-#include <stdlib.h>
-#include "blis.h"
-
-
-// converter from BLIS enum to BLAS char.
-BLIS_INLINE char trans2char(trans_t t)
-{
-    switch (t) {
-    case BLIS_NO_TRANSPOSE:
-        return 'N';
-    case BLIS_TRANSPOSE:
-        return 'T';
-    case BLIS_CONJ_NO_TRANSPOSE:
-        return 'C';
-    default:
-        abort();
-    }
-}
-BLIS_INLINE char uplo2char(uplo_t t)
-{
-    switch (t) {
-    case BLIS_UPPER:
-        return 'U';
-    case BLIS_LOWER:
-        return 'L';
-    default:
-        abort();
-    }
-}
-BLIS_INLINE char side2char(side_t t)
-{
-    switch (t) {
-    case BLIS_LEFT:
-        return 'L';
-    case BLIS_RIGHT:
-        return 'R';
-    default:
-        abort();
-    }
-}
-BLIS_INLINE char diag2char(diag_t t)
-{
-    switch (t) {
-    case BLIS_UNIT_DIAG:
-        return 'U';
-    case BLIS_NONUNIT_DIAG:
-        return 'N';
-    }
-}
-
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-// BLAS {
-#ifdef BLAS_EXTERNAL
-
-// gemm
-void sgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, float *alpha,
-            float *a, dim_t *lda, float *b, dim_t *ldb, float *beta, float *c, dim_t *ldc);
-void dgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, double *alpha,
-            double *a, dim_t *lda, double *b, dim_t *ldb, double *beta, double *c, dim_t *ldc);
-void cgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, void *alpha,
-            void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc);
-void zgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, void *alpha,
-            void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc);
-
-
-// ger
-void sger_(dim_t *m, dim_t *n, float *alpha,
-           float *x, dim_t *incx, float *y, dim_t *incy, float *a, dim_t *lda);
-void dger_(dim_t *m, dim_t *n, double *alpha,
-           double *x, dim_t *incx, double *y, dim_t *incy, double *a, dim_t *lda);
-void cgeru_(dim_t *m, dim_t *n, void *alpha,
-            void *x, dim_t *incx, void *y, dim_t *incy, void *a, dim_t *lda);
-void zgeru_(dim_t *m, dim_t *n, void *alpha,
-            void *x, dim_t *incx, void *y, dim_t *incy, void *a, dim_t *lda);
-
-
-// gemv
-void sgemv_(char *trans, dim_t *m, dim_t *n, float *alpha, float *a, dim_t *lda,
-            float *x, dim_t *incx, float *beta, float *y, dim_t *incy);
-void dgemv_(char *trans, dim_t *m, dim_t *n, double *alpha, double *a, dim_t *lda,
-            double *x, dim_t *incx, double *beta, double *y, dim_t *incy);
-void cgemv_(char *trans, dim_t *m, dim_t *n, void *alpha, void *a, dim_t *lda,
-            void *x, dim_t *incx, void *beta, void *y, dim_t *incy);
-void zgemv_(char *trans, dim_t *m, dim_t *n, void *alpha, void *a, dim_t *lda,
-            void *x, dim_t *incx, void *beta, void *y, dim_t *incy);
-
-
-// trmm
-void strmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n,
-            float *alpha, float *a, inc_t *lda, float *b, inc_t *ldb);	
-void dtrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n,
-            double *alpha, double *a, inc_t *lda, double *b, inc_t *ldb);	
-void ctrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n,
-            void *alpha, void *a, inc_t *lda, void *b, inc_t *ldb);	
-void ztrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n,
-            void *alpha, void *a, inc_t *lda, void *b, inc_t *ldb);	
-
-
-// trmv
-void strmv_(char *uplo, char *transa, char *diag, dim_t *n,
-            float *a, inc_t *lda, float *b, inc_t *ldb);	
-void dtrmv_(char *uplo, char *transa, char *diag, dim_t *n,
-            double *a, inc_t *lda, double *b, inc_t *ldb);	
-void ctrmv_(char *uplo, char *transa, char *diag, dim_t *n,
-            void *a, inc_t *lda, void *b, inc_t *ldb);	
-void ztrmv_(char *uplo, char *transa, char *diag, dim_t *n,
-            void *a, inc_t *lda, void *b, inc_t *ldb);	
-
-// swap
-void sswap_(dim_t *n, float *sx, dim_t *incx, float *sy, dim_t *incy);
-void dswap_(dim_t *n, double *sx, dim_t *incx, double *sy, dim_t *incy);
-void cswap_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
-void zswap_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
-
-// axpy
-void saxpy_(dim_t *n, float *alpha, float *sx, dim_t *incx, float *sy, dim_t *incy);
-void daxpy_(dim_t *n, double *alpha, double *sx, dim_t *incx, double *sy, dim_t *incy);
-void caxpy_(dim_t *n, void *alpha, void *sx, dim_t *incx, void *sy, dim_t *incy);
-void zaxpy_(dim_t *n, void *alpha, void *sx, dim_t *incx, void *sy, dim_t *incy);
-
-// dot
-float sdot_(dim_t *n, float *sx, dim_t *incx, float *sy, dim_t *incy);
-double ddot_(dim_t *n, double *sx, dim_t *incx, double *sy, dim_t *incy);
-scomplex cdotc_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
-dcomplex zdotc_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
-#ifndef F77_COMPLEX_RET_INTEL
-scomplex cdotu_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
-dcomplex zdotu_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
-#else
-void cdotu_(scomplex *rho, dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
-void zdotu_(dcomplex *rho, dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
-#endif
-
-// }
-#endif
-
-// LAPACK {
-
-// lacpy
-void slacpy_(char *uplo, dim_t *m, dim_t *n, float *a, inc_t *lda, float *b, inc_t *ldb);
-void dlacpy_(char *uplo, dim_t *m, dim_t *n, double *a, inc_t *lda, double *b, inc_t *ldb);
-void clacpy_(char *uplo, dim_t *m, dim_t *n, void *a, inc_t *lda, void *b, inc_t *ldb);
-void zlacpy_(char *uplo, dim_t *m, dim_t *n, void *a, inc_t *lda, void *b, inc_t *ldb);
-
-// trtri
-void strtri_(char *uplo, char *diag, dim_t *n, float *a, inc_t *lda, int *info);
-void dtrtri_(char *uplo, char *diag, dim_t *n, double *a, inc_t *lda, int *info);
-void ctrtri_(char *uplo, char *diag, dim_t *n, void *a, inc_t *lda, int *info);
-void ztrtri_(char *uplo, char *diag, dim_t *n, void *a, inc_t *lda, int *info);
-
-// }
-
-// PFAPACK77 {
-
-void ssktrf_(char *uplo, char *mode, dim_t *n, float *a, dim_t *lda, int *ipiv, float *work, dim_t *lwork, int *info);
-void dsktrf_(char *uplo, char *mode, dim_t *n, double *a, dim_t *lda, int *ipiv, double *work, dim_t *lwork, int *info);
-void csktrf_(char *uplo, char *mode, dim_t *n, void *a, dim_t *lda, int *ipiv, void *work, dim_t *lwork, int *info);
-void zsktrf_(char *uplo, char *mode, dim_t *n, void *a, dim_t *lda, int *ipiv, void *work, dim_t *lwork, int *info);
-
-void sskpfa_(char *uplo, char *mthd, dim_t *n, float *a, dim_t *lda, float *pfa, int *ipiv, float *work, dim_t *lwork, int *info);
-void dskpfa_(char *uplo, char *mthd, dim_t *n, double *a, dim_t *lda, double *pfa, int *ipiv, double *work, dim_t *lwork, int *info);
-void cskpfa_(char *uplo, char *mthd, dim_t *n, void *a, dim_t *lda, void *pfa, int *ipiv, void *work, dim_t *lwork, void *rwork, int *info);
-void zskpfa_(char *uplo, char *mthd, dim_t *n, void *a, dim_t *lda, void *pfa, int *ipiv, void *work, dim_t *lwork, void *rwork, int *info);
-
-// }
-
-#ifdef __cplusplus
-}
-#endif
-
diff --git a/src/common/colmaj.hh b/src/common/colmaj.hh
deleted file mode 100644
index d9a03c1e..00000000
--- a/src/common/colmaj.hh
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at https://mozilla.org/MPL/2.0/.
- */
-#pragma once
-
-template <typename T>
-struct colmaj
-{
-  T *dat;
-  int ld;
-
-  colmaj() = delete;
-  colmaj(T *dat_, int ld_)
-  : dat(dat_), ld(ld_) { }
-
-  T &operator()(int i, int j)
-  {
-    return dat[ i + j * ld ];
-  }
-  T &operator()()
-  {
-    return dat[ 0 ];
-  }
-};
diff --git a/src/ltl2inv/blalink_gemmt.c b/src/ltl2inv/blalink_gemmt.c
deleted file mode 100644
index fbaee562..00000000
--- a/src/ltl2inv/blalink_gemmt.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at https://mozilla.org/MPL/2.0/.
- */
-#include "blis.h"
-#include "blalink_fort.h"
-
-#if !( defined(BLAS_EXTERNAL) && defined(MKL) )
-// Redefine GEMMT Fortran from BLIS (which is built without BLAS API).
-// Required by Pfapack77.
-#define BLAGEN_MAC(cctype, ctype, cchar) \
-    void cchar##gemmt_(const char *uplo_, \
-                       const char *transa_, \
-                       const char *transb_, \
-                       const int *m, const int *k, \
-                       const ctype *alpha, \
-                       const ctype *a, const int *lda, \
-                       const ctype *b, const int *ldb, \
-                       const ctype *beta, ctype *c, const int *ldc) \
-    { \
-        uplo_t uploc; \
-        trans_t transa; \
-        trans_t transb; \
-        bli_param_map_netlib_to_blis_uplo(*uplo_, &uploc) ; \
-        bli_param_map_netlib_to_blis_trans(*transa_, &transa) ; \
-        bli_param_map_netlib_to_blis_trans(*transb_, &transb) ; \
-        bli_##cchar##gemmt(uploc, \
-                           transa, transb, \
-                           *m, *k, \
-                           alpha, \
-                           a, 1, *lda, \
-                           b, 1, *ldb, \
-                           beta, \
-                           c, 1, *ldc); \
-    }
-BLAGEN_MAC( float,    float,    s )
-BLAGEN_MAC( double,   double,   d )
-BLAGEN_MAC( ccscmplx, scomplex, c )
-BLAGEN_MAC( ccdcmplx, dcomplex, z )
-#undef BLAGEN_MAC
-#endif
diff --git a/src/ltl2inv/blalink_gemmt.hh b/src/ltl2inv/blalink_gemmt.hh
deleted file mode 100644
index 52bb2c45..00000000
--- a/src/ltl2inv/blalink_gemmt.hh
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at https://mozilla.org/MPL/2.0/.
- */
-#pragma once
-#include "blalink.hh"
-
-
-// gemmt is not part of BLAS standard.
-// It's currently know as exposed by BLIS and MKL.
-template <typename T>
-inline void gemmt(uplo_t uploc,
-                  trans_t transa, trans_t transb,
-                  dim_t m, dim_t k,
-                  T alpha,
-                  T *a, inc_t lda,
-                  T *b, inc_t ldb,
-                  T beta,
-                  T *c, inc_t ldc);
-#if !( defined(BLAS_EXTERNAL) && defined(MKL) )
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void gemmt<cctype>(uplo_t uploc, \
-                                          trans_t transa, trans_t transb, \
-                                          dim_t m, dim_t k, \
-                                          cctype alpha, \
-                                          cctype *a, inc_t lda, \
-                                          cctype *b, inc_t ldb, \
-                                          cctype beta, \
-                                          cctype *c, inc_t ldc) \
-    { \
-        bli_##cchar##gemmt(uploc, \
-                           transa, transb, \
-                           m, k, \
-                           (ctype *)&alpha, \
-                           (ctype *)a, 1, lda, \
-                           (ctype *)b, 1, ldb, \
-                           (ctype *)&beta, \
-                           (ctype *)c, 1, ldc); \
-    }
-#else
-#define BLALINK_MAC(cctype, ctype, cchar) \
-    template <> inline void gemmt<cctype>(uplo_t uploc, \
-                                          trans_t transa, trans_t transb, \
-                                          dim_t m, dim_t k, \
-                                          cctype alpha, \
-                                          cctype *a, inc_t lda, \
-                                          cctype *b, inc_t ldb, \
-                                          cctype beta, \
-                                          cctype *c, inc_t ldc) \
-    { \
-        char ul = uplo2char(uploc); \
-        char ta = trans2char(transa); \
-        char tb = trans2char(transb); \
-        cchar##gemmt_(&ul, \
-                      &ta, &tb, \
-                      &m, &k, \
-                      (ctype *)&alpha, \
-                      (ctype *)a, &lda, \
-                      (ctype *)b, &ldb, \
-                      (ctype *)&beta, \
-                      (ctype *)c, &ldc); \
-    }
-extern "C" {
-void sgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, float *alpha,
-             float *a, dim_t *lda, float *b, dim_t *ldb, float *beta, float *c, dim_t *ldc);
-void dgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, double *alpha,
-             double *a, dim_t *lda, double *b, dim_t *ldb, double *beta, double *c, dim_t *ldc);
-void cgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, void *alpha,
-             void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc);
-void zgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, void *alpha,
-             void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc);
-}
-#endif
-BLALINK_MAC( float,    float,    s )
-BLALINK_MAC( double,   double,   d )
-BLALINK_MAC( ccscmplx, scomplex, c )
-BLALINK_MAC( ccdcmplx, dcomplex, z )
-#undef BLALINK_MAC
-
diff --git a/src/ltl2inv/eigen_gemmt.tcc b/src/ltl2inv/eigen_gemmt.tcc
new file mode 100644
index 00000000..4e569df9
--- /dev/null
+++ b/src/ltl2inv/eigen_gemmt.tcc
@@ -0,0 +1,77 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "lapack2eigen.hh"
+#include "ilaenv_lauum.hh"
+
+
+template <typename T>
+inline void gemmt(const char &uploC,
+                  const char &transA,
+                  const char &transB,
+                  const T &alpha,
+                  const l2e::le_mat_t<T> &A_,
+                  const l2e::le_mat_t<T> &B_,
+                  const T &beta,
+                        l2e::le_mat_t<T> &C_)
+{
+    using LProxy = l2e::le_mat_t<T>;
+    using Matrix = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+    const int npanel = ilaenv_lauum<T>(uploC, C_.rows);
+
+    bool conjA = transA == 'C' || transA == 'c';
+    bool conjB = transB == 'C' || transB == 'c';
+
+    const auto &A = (transA == 'T' || transA == 't') ? A_.transpose().to_eigen() : A_.to_eigen();
+    const auto &B = (transB == 'T' || transB == 't') ? B_.transpose().to_eigen() : B_.to_eigen();
+    auto        C =                                                                C_.to_eigen();
+
+    if (uploC == 'L' || uploC == 'l') {
+
+        for (int ipanel = 0; ipanel < A.rows(); ipanel += npanel) {
+            int sz_panel = std::min<int>(npanel, A.rows() - ipanel);
+
+            // Diagonal.
+            const auto &Apanel = A(Eigen::seq(ipanel, ipanel+sz_panel-1), Eigen::all);
+            const auto &Bpanel = B(Eigen::all, Eigen::seq(ipanel, ipanel+sz_panel-1));
+            const auto &Cpanel = C(Eigen::seq(ipanel, ipanel+sz_panel-1),
+                                   Eigen::seq(ipanel, ipanel+sz_panel-1));
+            Matrix Cupdated = Cpanel * beta + Apanel * Bpanel * alpha;
+            LProxy Cpanel_(Cpanel);
+            lacpy(uploC, LProxy(Cupdated), Cpanel_);
+
+            // Off-diagonal.
+            const auto &Aremain = A(Eigen::seq(ipanel+sz_panel, A.rows()-1), Eigen::all);
+            const auto &Bremain = B(Eigen::all, Eigen::seq(ipanel+sz_panel, B.cols()-1));
+            auto        Cremain = C(Eigen::seq(ipanel+sz_panel, C.rows()-1),
+                                    Eigen::seq(ipanel, ipanel+sz_panel-1));
+            Cremain = Cremain * beta + Aremain * Bpanel * alpha;
+        }
+
+    } else { // (uploC == 'U' || uploC == 'u')
+
+        for (int ipanel = (A.rows()-1) / npanel * npanel; ipanel >= 0; ipanel -= npanel) {
+            int sz_panel = std::min<int>(npanel, A.rows() - ipanel);
+
+            // Diagonal.
+            const auto &Apanel = A(Eigen::seq(ipanel, ipanel+sz_panel-1), Eigen::all);
+            const auto &Bpanel = B(Eigen::all, Eigen::seq(ipanel, ipanel+sz_panel-1));
+            const auto &Cpanel = C(Eigen::seq(ipanel, ipanel+sz_panel-1),
+                                   Eigen::seq(ipanel, ipanel+sz_panel-1));
+            Matrix Cupdated = Cpanel * beta + Apanel * Bpanel * alpha;
+            LProxy Cpanel_(Cpanel);
+            lacpy(uploC, LProxy(Cupdated), Cpanel_);
+
+            // Off-diagonal.
+            const auto &Aremain = A(Eigen::seq(0, ipanel-1), Eigen::all);
+            const auto &Bremain = B(Eigen::all, Eigen::seq(0, ipanel-1));
+            auto        Cremain = C(Eigen::seq(0, ipanel-1),
+                                    Eigen::seq(ipanel, ipanel+sz_panel-1));
+            Cremain = Cremain * beta + Aremain * Bpanel * alpha;
+        }
+
+    }
+}
diff --git a/src/ltl2inv/error.hh b/src/ltl2inv/error.hh
new file mode 100644
index 00000000..5c0a06e4
--- /dev/null
+++ b/src/ltl2inv/error.hh
@@ -0,0 +1,42 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include <cstdlib>
+#include <iostream>
+
+
+inline void abort(const char *msg)
+{
+  std::cout << msg << std::endl;
+  abort();
+}
+
+inline void abort(const char *msg0, const char *msg1)
+{
+  std::cout << msg0 << ": " << msg1 << std::endl;
+  abort();
+}
+
+#if (defined(_DEBUG) || !defined(SKIP_CHECKS))
+
+inline void assert_(const bool cond, const char *msg)
+{
+  if (!cond) abort(msg);
+}
+
+inline void assert_(const bool cond, const char *msg0, const char *msg1)
+{
+  if (!cond) abort(msg0, msg1);
+}
+
+#else
+
+inline void assert_(const bool cond, const char *msg) { }
+
+inline void assert_(const bool cond, const char *msg0, const char *msg1) { }
+
+#endif
+
diff --git a/src/ltl2inv/gemmt2blas.cc b/src/ltl2inv/gemmt2blas.cc
new file mode 100644
index 00000000..56cd6fe4
--- /dev/null
+++ b/src/ltl2inv/gemmt2blas.cc
@@ -0,0 +1,35 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#include "lapack2eigen.hh"
+
+
+#if !defined(_BLAS_HAS_GEMMT)
+// Import the customized GEMMT impl. to the BLAS API.
+// Required by Pfapack77.
+#define BLAGEN_MAC(cctype, ctype, cchar) \
+    extern "C" \
+    void cchar##gemmt_(const char *uplo_, \
+                       const char *transa_, \
+                       const char *transb_, \
+                       const int *m, const int *k, \
+                       const ctype *alpha, \
+                       ctype *a, const int *lda, \
+                       ctype *b, const int *ldb, \
+                       const ctype *beta, ctype *c, const int *ldc) \
+    { \
+        bool tA = (*transa_ == 'T' || *transa_ == 't'); \
+        bool tB = (*transb_ == 'T' || *transb_ == 't'); \
+        l2e::le_mat_t<cctype> A(tA ? *k : *m, tA ? *m : *k, 1, *lda, (cctype *)a); \
+        l2e::le_mat_t<cctype> B(tB ? *k : *m, tB ? *m : *k, 1, *ldb, (cctype *)b); \
+        l2e::le_mat_t<cctype> C(*m, *m, 1, *ldc, (cctype *)c); \
+        gemmt<cctype>(*uplo_, *transa_, *transb_, *(cctype *)alpha, A, B, *(cctype *)beta, C); \
+    }
+BLAGEN_MAC( float,    float,    s )
+BLAGEN_MAC( double,   double,   d )
+BLAGEN_MAC( ccscmplx, scomplex, c )
+BLAGEN_MAC( ccdcmplx, dcomplex, z )
+#undef BLAGEN_MAC
+#endif
diff --git a/src/ltl2inv/ilaenv_lauum.cc b/src/ltl2inv/ilaenv_lauum.cc
index f150384f..6dcb8b90 100644
--- a/src/ltl2inv/ilaenv_lauum.cc
+++ b/src/ltl2inv/ilaenv_lauum.cc
@@ -3,17 +3,19 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
-#include "blalink.hh"
 #include "ilaenv.h"
 #include "ilaenv_lauum.hh"
+#include <complex>
+using ccscmplx = std::complex<float>;
+using ccdcmplx = std::complex<double>;
+
 
 #define EXPANDMAC(cctype, name) \
-template <> int ilaenv_lauum<cctype>(uplo_t uplo, int n) \
+template <> int ilaenv_lauum<cctype>(const char uplo, const int n) \
 { \
-    char uplo_ = uplo2char(uplo); \
     int ispec = 1; \
     int dummy = 0; \
-    return ilaenv_(&ispec, #name, &uplo_, &n, &dummy, &dummy, &dummy); \
+    return ilaenv_(&ispec, #name, &uplo, &n, &dummy, &dummy, &dummy); \
 }
 EXPANDMAC( float,    SLAUUM )
 EXPANDMAC( double,   DLAUUM )
diff --git a/src/ltl2inv/ilaenv_lauum.hh b/src/ltl2inv/ilaenv_lauum.hh
index 68d89b57..46a59b0f 100644
--- a/src/ltl2inv/ilaenv_lauum.hh
+++ b/src/ltl2inv/ilaenv_lauum.hh
@@ -4,8 +4,7 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "blalink.hh"
 
 template <typename T>
-int ilaenv_lauum(uplo_t uplo, int n);
+int ilaenv_lauum(const char uplo, const int n);
 
diff --git a/src/ltl2inv/invert.tcc b/src/ltl2inv/invert.tcc
index 0b11edf1..4fd24cda 100644
--- a/src/ltl2inv/invert.tcc
+++ b/src/ltl2inv/invert.tcc
@@ -4,15 +4,15 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "colmaj.hh"
-#include "blalink.hh"
+#include "lapack2eigen.hh"
 #include "trmmt.tcc"
+#include "error.hh"
 
-template <typename T>
-void sktdsmx(int n, T *vT, T *B_, int ldB, T *C_, int ldC)
+template <typename Vec, typename Mat>
+void sktdsmx(const Vec &vT, const Mat &B, Mat &C)
 {
-  colmaj<T> B(B_, ldB);
-  colmaj<T> C(C_, ldC);
+  int n = B.rows();
+  assert_(B.cols() == n && C.cols() == n && C.rows() == n, "sktdsmx: Dimension mismatch.");
 
   for (int j = 0; j < n; ++j)
     C(1, j) = B(0, j) / -vT[0];
@@ -27,13 +27,13 @@ void sktdsmx(int n, T *vT, T *B_, int ldB, T *C_, int ldC)
       C(i-1, j) = (B(i, j) + C(i+1, j) * vT[i]) / vT[i-1];
 }
 
-template <typename T>
-void ltl2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM)
+template <typename iVec, typename Vec, typename Mat>
+void ltl2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M)
 {
-  colmaj<T> A(A_, ldA);
-  colmaj<T> M(M_, ldM);
-
-  int npanel_trmm = ilaenv_lauum<T>(BLIS_LOWER, n);
+  using namespace Eigen;
+  using T = typename Mat::Scalar;
+  int n = A.rows();
+  int npanel_trmm = ilaenv_lauum<T>('L', n);
   int full = npanel_trmm <= 1 || npanel_trmm >= n;
 
   // Set M.
@@ -41,15 +41,21 @@ void ltl2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM)
     for (int i = 0; i < n; ++i)
       M(i, j) = T(!(i - j));
 
-  trtri(BLIS_LOWER, BLIS_UNIT_DIAG, n-1, &A(1, 0), ldA);
-  lacpy(BLIS_LOWER, n-2, n-2, &A(2, 0), ldA, &M(2, 1), ldM);
+  l2e::le_mat_t<T> A_(A);
+  l2e::le_mat_t<T> M_(M);
+  l2e::le_mat_t<T> A_Lx(A(seq(1, n-1), seq(0, n-2)));
+  l2e::le_mat_t<T> A_L(A(seq(2, n-1), seq(0, n-3)));
+  l2e::le_mat_t<T> M_L(M(seq(2, n-1), seq(1, n-2)));
+
+  trtri('L', 'U', A_Lx);
+  lacpy('L', A_L, M_L);
 
   for (int i = 0; i < n-1; ++i)
     vT[i] = A(i+1, i);
-  sktdsmx<T>(n, vT, &M(0, 0), ldM, &A(0, 0), ldA);
+  sktdsmx(vT, M, A);
 
   if (!full) {
-    trmmt(BLIS_LOWER, BLIS_UNIT_DIAG, n, T(1.0), M, A);
+    trmmt('L', 'U', T(1.0), M, A);
     for (int j = 0; j < n; ++j) {
       A(j, j) = 0.0;
       for (int i = 0; i < j; ++i)
@@ -60,25 +66,24 @@ void ltl2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM)
   // In-place permute columns.
   for (int j = n-1; j >= 0; --j)
     if (iPiv[j]-1 != j)
-      swap(n, &A(0, j), 1, &A(0, iPiv[j]-1), 1);
+      A(Eigen::all, j).swap(A(Eigen::all, iPiv[j] - 1));
 
   if (full)
-    trmm(BLIS_LEFT, BLIS_LOWER, BLIS_TRANSPOSE, BLIS_UNIT_DIAG,
-         n, n, T(1.0), &M(0, 0), ldM, &A(0, 0), ldA);
+    A = M.template triangularView<UnitLower>().transpose() * A;
 
   // In-place permute rows.
   for (int i = n-1; i >= 0; --i)
     if (iPiv[i]-1 != i)
-      swap(n, &A(i, 0), ldA, &A(iPiv[i]-1, 0), ldA);
+      A(i, Eigen::all).swap(A(iPiv[i] - 1, Eigen::all));
 }
 
-template <typename T>
-void utu2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM)
+template <typename iVec, typename Vec, typename Mat>
+void utu2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M)
 {
-  colmaj<T> A(A_, ldA);
-  colmaj<T> M(M_, ldM);
-
-  int npanel_trmm = ilaenv_lauum<T>(BLIS_UPPER, n);
+  using namespace Eigen;
+  using T = typename Mat::Scalar;
+  int n = A.rows();
+  int npanel_trmm = ilaenv_lauum<T>('U', n);
   int full = npanel_trmm <= 1 || npanel_trmm >= n;
 
   // Set M.
@@ -86,15 +91,21 @@ void utu2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM)
     for (int i = 0; i < n; ++i)
       M(i, j) = T(!(i - j));
 
-  trtri(BLIS_UPPER, BLIS_UNIT_DIAG, n-1, &A(0, 1), ldA);
-  lacpy(BLIS_UPPER, n-2, n-2, &A(0, 2), ldA, &M(0, 1), ldM);
+  l2e::le_mat_t<T> A_(A);
+  l2e::le_mat_t<T> M_(M);
+  l2e::le_mat_t<T> A_Ux(A(seq(0, n-2), seq(1, n-1)));
+  l2e::le_mat_t<T> A_U(A(seq(0, n-3), seq(2, n-1)));
+  l2e::le_mat_t<T> M_U(M(seq(0, n-3), seq(1, n-2)));
+
+  trtri('U', 'U', A_Ux);
+  lacpy('U', A_U, M_U);
 
   for (int i = 0; i < n-1; ++i)
     vT[i] = -A(i, i+1);
-  sktdsmx<T>(n, vT, &M(0, 0), ldM, &A(0, 0), ldA);
+  sktdsmx(vT, M, A);
 
   if (!full) {
-    trmmt(BLIS_UPPER, BLIS_UNIT_DIAG, n, T(1.0), M, A);
+    trmmt('U', 'U', T(1.0), M, A);
       for (int j = 0; j < n; ++j) {
         A(j, j) = 0.0;
         for (int i = j + 1; i < n; ++i)
@@ -105,26 +116,27 @@ void utu2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM)
   // In-place permute columns.
   for (int j = 0; j < n; ++j)
     if (iPiv[j]-1 != j)
-      swap(n, &A(0, j), 1, &A(0, iPiv[j]-1), 1);
+      A(Eigen::all, j).swap(A(Eigen::all, iPiv[j]-1));
 
   if (full)
-    trmm(BLIS_LEFT, BLIS_UPPER, BLIS_TRANSPOSE, BLIS_UNIT_DIAG,
-         n, n, T(1.0), &M(0, 0), ldM, &A(0, 0), ldA);
+    A = M.template triangularView<UnitUpper>().transpose() * A;
 
   // In-place permute rows.
   for (int i = 0; i < n; ++i)
     if (iPiv[i]-1 != i)
-      swap(n, &A(i, 0), ldA, &A(iPiv[i]-1, 0), ldA);
+      A(i, Eigen::all).swap(A(iPiv[i]-1, Eigen::all));
 }
 
-template <typename T>
-void ltl2inv(uplo_t uplo, int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM)
+template <typename iVec, typename Vec, typename Mat>
+void ltl2inv(const char uplo, Mat &A, const iVec &iPiv, Vec &vT, Mat &M)
 {
   switch (uplo) {
-  case BLIS_LOWER:
-    ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); break;
-  case BLIS_UPPER:
-    utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); break;
+  case 'l':
+  case 'L':
+    ltl2inv(A, iPiv, vT, M); break;
+  case 'u':
+  case 'U':
+    utu2inv(A, iPiv, vT, M); break;
   default:
     break;
   }
diff --git a/src/ltl2inv/lapack2eigen.hh b/src/ltl2inv/lapack2eigen.hh
new file mode 100644
index 00000000..1d850068
--- /dev/null
+++ b/src/ltl2inv/lapack2eigen.hh
@@ -0,0 +1,272 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include <Eigen/Dense>
+#include <cstdint>
+#ifndef EIGEN_USE_LAPACKE // This lets Eigen/Dense include LAPACK definitions.
+#include <Eigen/src/misc/lapacke.h>
+#endif
+#ifdef _BLIS
+#define BLIS_DISABLE_BLAS // Screen BLAS definition even if BLIS' built with them.
+#include "blis/blis.h"
+#else
+using Eigen::scomplex;
+using Eigen::dcomplex;
+#endif
+#include "error.hh"
+#include "fortran_pfapack.h"
+using f77_int = lapack_int;
+using ccscmplx = std::complex<float>;
+using ccdcmplx = std::complex<double>;
+
+namespace l2e
+{
+
+template <typename T>
+struct le_mat_t
+{
+    const f77_int rows, cols; ///< Matrix size.
+    const f77_int rs, cs; ///< Strides.
+
+    T *const data;
+
+protected:
+    template <typename Mat>
+    static f77_int _rs(const Mat &mat)
+    {
+        f77_int rs = 0;
+        if (mat.rows() > 1)
+            rs = &(mat(1, 0)) - &(mat(0, 0));
+        return rs;
+    }
+    template <typename Mat>
+    static f77_int _cs(const Mat &mat)
+    {
+        f77_int cs = 0;
+        if (mat.cols() > 1)
+            cs = &(mat(0, 1)) - &(mat(0, 0));
+        return cs;
+    }
+
+public:
+    le_mat_t(const f77_int &rows_, const f77_int &cols_,
+             const f77_int &rs_, const f77_int &cs_, T *const data_)
+    : rows(rows_), cols(cols_), rs(rs_), cs(cs_), data(data_) { }
+
+    template <int RowsAtCompilation, int ColsAtCompilation>
+    le_mat_t(Eigen::Matrix<T, RowsAtCompilation, ColsAtCompilation> &mat)
+    : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { }
+
+    template <typename Mat, int BlockRowsAtCompil, int BlockColsAtCompil, bool BlockProps>
+    le_mat_t(Eigen::Block<Mat, BlockRowsAtCompil, BlockColsAtCompil, BlockProps> mat) // TODO: Check for data copying.
+    : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { }
+
+    template <int RowsAtCompilation, int ColsAtCompilation, int Alignment, typename Stride>
+    le_mat_t(Eigen::Map<Eigen::Matrix<T, RowsAtCompilation, ColsAtCompilation>, Alignment, Stride> &mat)
+    : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { }
+
+    template <int Alignment = 0>
+    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>, Alignment,
+               Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic> > to_eigen() const
+    {
+        using matrix_t = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+        using stride_t = Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>;
+        return Eigen::Map<matrix_t, Alignment, stride_t>(data, rows, cols, stride_t(cs, rs));
+    }
+
+    le_mat_t<T> transpose() const
+    { return le_mat_t<T>(cols, rows, cs, rs, data); }
+};
+
+}
+
+
+// ---- BLAS/LAPACK routines used in our settings. -----
+
+
+// [LAPACK] lacpy
+#define BLALINK_PARAMS(T) const char &uploA, \
+                          const l2e::le_mat_t<T> &A, \
+                                l2e::le_mat_t<T> &B
+template <typename T>
+inline void lacpy( BLALINK_PARAMS(T) );
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline void lacpy<T>( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1 && B.rs == 1, "BLAS requires column-major."); \
+        /* Eigen-vendored lapack.h misses const qualifier. Do force conversion here. */ \
+        cchar##lacpy_((char *)&uploA, (f77_int *)&A.rows, (f77_int *)&A.cols, \
+                      (ctype *)A.data, (f77_int *)&A.cs, \
+                      (ctype *)B.data, (f77_int *)&B.cs); \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, _Complex float,  c )
+BLALINK_MAC( ccdcmplx, _Complex double, z )
+#undef BLALINK_MAC
+#undef BLALINK_PARAMS
+
+
+// [LAPACK] trtri
+#define BLALINK_PARAMS(T) const char &uploA, \
+                          const char &diagA, \
+                          l2e::le_mat_t<T> &A
+template <typename T>
+inline f77_int trtri( BLALINK_PARAMS(T) );
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline f77_int trtri( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1, "BLAS requires column-major."); \
+        f77_int info; \
+        /* Eigen-vendored lapack.h misses const qualifier. Do force conversion here. */ \
+        cchar##trtri_((char *)&uploA, (char *)&diagA, (f77_int *)&A.rows, \
+                      (ctype *)A.data, (f77_int *)&A.cs, &info); \
+        return info; \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, _Complex float,  c )
+BLALINK_MAC( ccdcmplx, _Complex double, z )
+#undef BLALINK_MAC
+#undef BLALINK_PARAMS
+
+
+// [PFAPACK] sktrf
+#define BLALINK_PARAMS(T) const char &uplo, \
+                          const char &mode, \
+                          l2e::le_mat_t<T> &A, \
+                          f77_int *ipiv, \
+                          T *work, f77_int lwork
+template <typename T>
+inline f77_int sktrf( BLALINK_PARAMS(T) );
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline f77_int sktrf( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1, "BLAS requires column-major."); \
+        f77_int info; \
+        cchar##sktrf_(&uplo, &mode, &A.rows, (ctype *)A.data, &A.cs, ipiv, work, &lwork, &info); \
+        return info; \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, floatcmplx,  c )
+BLALINK_MAC( ccdcmplx, doublecmplx, z )
+#undef BLALINK_MAC
+#undef BLALINK_PARAMS
+
+
+// [PFAPACK] skpfa
+#define BLALINK_PARAMS(T) const char &uplo, \
+                          const char &mthd, \
+                          l2e::le_mat_t<T> &A, \
+                          T *pfa, f77_int *ipiv, \
+                          T *work, f77_int lwork
+template <typename T>
+inline f77_int skpfa( BLALINK_PARAMS(T) );
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline f77_int skpfa( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1, "BLAS requires column-major."); \
+        int info; \
+        cchar##skpfa_(&uplo, &mthd, &A.rows, (ctype *)A.data, &A.cs, pfa, ipiv, work, &lwork, &info); \
+        return info; \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+#undef BLALINK_MAC
+#define BLALINK_MAC(T, ctype, rtype, cchar) \
+    template <> inline f77_int skpfa( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1, "BLAS requires column-major."); \
+        rtype *rwork = 0; \
+        if (mthd != 'p' && mthd != 'P') \
+            rwork = new rtype[lwork]; \
+        int info; \
+        cchar##skpfa_(&uplo, &mthd, &A.rows, A.data, &A.cs, pfa, ipiv, work, &lwork, rwork, &info); \
+        delete[] rwork; \
+        return info; \
+    }
+BLALINK_MAC( ccscmplx, floatcmplx,  float,  c )
+BLALINK_MAC( ccdcmplx, doublecmplx, double, z )
+#undef BLALINK_MAC
+#undef BLALINK_PARAMS
+
+
+// [Non-standard] gemmt
+#define BLALINK_PARAMS(T) const char &uploC, \
+                          const char &transA, \
+                          const char &transB, \
+                          const T &alpha, \
+                          const l2e::le_mat_t<T> &A, \
+                          const l2e::le_mat_t<T> &B, \
+                          const T &beta, \
+                                l2e::le_mat_t<T> &C
+template <typename T>
+inline void gemmt( BLALINK_PARAMS(T) );
+#if !defined(_BLIS) && !defined(_BLAS_HAS_GEMMT)
+  #include "eigen_gemmt.tcc"
+#else
+#ifdef _BLIS
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline void gemmt<T>( BLALINK_PARAMS(T) ) \
+    { \
+        uplo_t bli_uploC; \
+        trans_t bli_transA, bli_transB; \
+        bli_param_map_netlib_to_blis_uplo( uploC, &bli_uploC ); \
+        bli_param_map_netlib_to_blis_trans( transA, &bli_transA ); \
+        bli_param_map_netlib_to_blis_trans( transB, &bli_transB ); \
+        bli_##cchar##gemmt(bli_uploC, \
+                           bli_transA, \
+                           bli_transB, \
+                           (transA == 'T' || transA == 't') ? A.cols : A.rows, \
+                           (transA == 'T' || transA == 't') ? A.rows : A.cols, \
+                           (const ctype *)&alpha, \
+                           (const ctype *)A.data, A.rs, A.cs, \
+                           (const ctype *)B.data, B.rs, B.cs, \
+                           (const ctype *)&beta, \
+                           (ctype       *)C.data, C.rs, C.cs); \
+    }
+#else
+  // ifdef _BLAS_HAS_GEMMT
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline void gemmt<T>( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1 && B.rs == 1 && C.rs == 1, "BLAS requires column-major."); \
+        cchar##gemmt_(&uploC, \
+                      &transA, &transB, \
+                      (transA == 'T' || transA == 't') ? &A.cols : &A.rows, \
+                      (transA == 'T' || transA == 't') ? &A.rows : &A.cols, \
+                      (const ctype *)&alpha, \
+                      (const ctype *)A.data, &A.cs, \
+                      (const ctype *)B.data, &B.cs, \
+                      (const ctype *)&beta, \
+                      (ctype       *)C.data, &C.cs); \
+    }
+extern "C" {
+#define GEMMT_F77_PARAMS(T) const char *uploC, \
+                            const char *transA, \
+                            const char *transB, \
+                            const f77_int *m, \
+                            const f77_int *k, \
+                            const T *alpha, \
+                            const T *A, const f77_int *ldA, \
+                            const T *B, const f77_int *ldB, \
+                            const T *beta, \
+                                  T *C, const f77_int *ldC
+void sgemmt_( GEMMT_F77_PARAMS(float)    );
+void dgemmt_( GEMMT_F77_PARAMS(double)   );
+void cgemmt_( GEMMT_F77_PARAMS(scomplex) );
+void zgemmt_( GEMMT_F77_PARAMS(dcomplex) );
+#undef GEMMT_F77_PARAMS
+}
+#endif
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+#endif
+#undef BLALINK_PARAMS
diff --git a/src/ltl2inv/ltl2inv.cc b/src/ltl2inv/ltl2inv.cc
index 19f10256..e7ef84fc 100644
--- a/src/ltl2inv/ltl2inv.cc
+++ b/src/ltl2inv/ltl2inv.cc
@@ -10,24 +10,42 @@
 
 extern "C" {
 
-void ltl2inv_s(int n, float    *A_, int ldA, int *iPiv, float    *vT, float    *M_, int ldM) { ltl2inv<float   >(n, A_, ldA, iPiv, vT, M_, ldM); }
-void ltl2inv_d(int n, double   *A_, int ldA, int *iPiv, double   *vT, double   *M_, int ldM) { ltl2inv<double  >(n, A_, ldA, iPiv, vT, M_, ldM); }
-void ltl2inv_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *vT, ccscmplx *M_, int ldM) { ltl2inv<ccscmplx>(n, A_, ldA, iPiv, vT, M_, ldM); }
-void ltl2inv_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *vT, ccdcmplx *M_, int ldM) { ltl2inv<ccdcmplx>(n, A_, ldA, iPiv, vT, M_, ldM); }
-void utu2inv_s(int n, float    *A_, int ldA, int *iPiv, float    *vT, float    *M_, int ldM) { utu2inv<float   >(n, A_, ldA, iPiv, vT, M_, ldM); }
-void utu2inv_d(int n, double   *A_, int ldA, int *iPiv, double   *vT, double   *M_, int ldM) { utu2inv<double  >(n, A_, ldA, iPiv, vT, M_, ldM); }
-void utu2inv_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *vT, ccscmplx *M_, int ldM) { utu2inv<ccscmplx>(n, A_, ldA, iPiv, vT, M_, ldM); }
-void utu2inv_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *vT, ccdcmplx *M_, int ldM) { utu2inv<ccdcmplx>(n, A_, ldA, iPiv, vT, M_, ldM); }
-
-
-void ltl2pfa_s(int n, float    *A_, int ldA, int *iPiv, float    *Pfa) { *Pfa = ltl2pfa<float   >(n, A_, ldA, iPiv); }
-void ltl2pfa_d(int n, double   *A_, int ldA, int *iPiv, double   *Pfa) { *Pfa = ltl2pfa<double  >(n, A_, ldA, iPiv); }
-void ltl2pfa_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *Pfa) { *Pfa = ltl2pfa<ccscmplx>(n, A_, ldA, iPiv); }
-void ltl2pfa_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *Pfa) { *Pfa = ltl2pfa<ccdcmplx>(n, A_, ldA, iPiv); }
-void utu2pfa_s(int n, float    *A_, int ldA, int *iPiv, float    *Pfa) { *Pfa = utu2pfa<float   >(n, A_, ldA, iPiv); }
-void utu2pfa_d(int n, double   *A_, int ldA, int *iPiv, double   *Pfa) { *Pfa = utu2pfa<double  >(n, A_, ldA, iPiv); }
-void utu2pfa_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *Pfa) { *Pfa = utu2pfa<ccscmplx>(n, A_, ldA, iPiv); }
-void utu2pfa_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *Pfa) { *Pfa = utu2pfa<ccdcmplx>(n, A_, ldA, iPiv); }
+#define GENDEF(schema, cchar, ctype, cctype) \
+void schema ##2inv_## cchar(int n, cctype *A_, int ldA, int *iPiv, cctype *vT, cctype *M_, int ldM) \
+{ \
+  using namespace Eigen; \
+  Map<Matrix<cctype, Dynamic, Dynamic>, 0, OuterStride<> > A(A_, n, n, OuterStride<>(ldA)); \
+  Map<Matrix<cctype, Dynamic, Dynamic>, 0, OuterStride<> > M(M_, n, n, OuterStride<>(ldM)); \
+  Map<VectorXi> Piv(iPiv, n); \
+  Map<Vector<cctype, Dynamic>> T(vT, n); \
+  schema ##2inv(A, Piv, vT, M); \
+}
+GENDEF(ltl, s, float,    float)
+GENDEF(ltl, d, double,   double)
+GENDEF(ltl, c, ccscmplx, ccscmplx)
+GENDEF(ltl, z, ccdcmplx, ccdcmplx)
+GENDEF(utu, s, float,    float)
+GENDEF(utu, d, double,   double)
+GENDEF(utu, c, ccscmplx, ccscmplx)
+GENDEF(utu, z, ccdcmplx, ccdcmplx)
+
+#undef GENDEF
+#define GENDEF(schema, cchar, ctype, cctype) \
+void schema ##2pfa_## cchar(int n, cctype *A_, int ldA, int *iPiv, cctype *Pfa) \
+{ \
+  using namespace Eigen; \
+  Map<Matrix<cctype, Dynamic, Dynamic>, 0, OuterStride<> > A(A_, n, n, OuterStride<>(ldA)); \
+  Map<VectorXi> Piv(iPiv, n); \
+  *Pfa = schema ##2pfa(A, Piv); \
+}
+GENDEF(ltl, s, float,    float)
+GENDEF(ltl, d, double,   double)
+GENDEF(ltl, c, ccscmplx, ccscmplx)
+GENDEF(ltl, z, ccdcmplx, ccdcmplx)
+GENDEF(utu, s, float,    float)
+GENDEF(utu, d, double,   double)
+GENDEF(utu, c, ccscmplx, ccscmplx)
+GENDEF(utu, z, ccdcmplx, ccdcmplx)
 
 }
 
diff --git a/src/ltl2inv/makefile_ltl2inv b/src/ltl2inv/makefile_ltl2inv
index 8c76afb1..2b9d4aeb 100644
--- a/src/ltl2inv/makefile_ltl2inv
+++ b/src/ltl2inv/makefile_ltl2inv
@@ -1,9 +1,8 @@
 include ../make.sys
 
-OBJ = ltl2inv.o blalink_gemmt.o ilaenv_lauum.o
-SRC = ltl2inv.cc blalink_gemmt.c ilaenv_lauum.cc
-HDR = invert.tcc pfaffian.tcc \
-	  ../common/blalink.hh ../common/blalink_fort.h ../common/colmaj.hh
+OBJ = ilaenv_lauum.o gemmt2blas.o ltl2inv.o
+SRC = ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc
+HDR = ilaenv.h ilaenv_lauum.hh lapack2eigen.hh error.hh eigen_gemmt.tcc invert.tcc pfaffian.tcc trmmt.tcc
 
 ltl2inv.a: $(OBJ)
 	$(AR) rvu $@ $(OBJ)
@@ -17,6 +16,7 @@ SUFFIXES: .o .c
 	$(CC) $(OPTION) $(CFLAGS) -c $< -o $@ \
 		-Wno-incompatible-pointer-types-discards-qualifiers \
 		-I../common \
+		-I../pfapack/c_interface \
 		-I$(BLIS_ROOT)/include \
 		-I$(BLIS_ROOT)/include/blis
 
@@ -25,5 +25,6 @@ SUFFIXES: .o .cc
 .cc.o: $(HDR)
 	$(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \
 		-I../common \
+		-I../pfapack/c_interface \
 		-I$(BLIS_ROOT)/include \
 		-I$(BLIS_ROOT)/include/blis
diff --git a/src/ltl2inv/pfaffian.tcc b/src/ltl2inv/pfaffian.tcc
index ed028524..51777f0b 100644
--- a/src/ltl2inv/pfaffian.tcc
+++ b/src/ltl2inv/pfaffian.tcc
@@ -4,47 +4,83 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "colmaj.hh"
-#include "blalink.hh"
+#include "lapack2eigen.hh"
 
-template <typename T>
-T ltl2pfa(int n, T *A_, int ldA, int *iPiv)
+
+template <typename T_>
+struct SignedLogScale
+{
+  using T = T_;
+
+  T_  val;
+  int sgn;
+};
+
+
+template <typename Mat, typename Vec,
+          typename Amp = typename Mat::Scalar>
+SignedLogScale<Amp> utu2logpfa(const Mat &A, const Vec &iPiv)
+{
+  SignedLogScale<Amp> logpfa{ 0.0, 1 };
+
+  for (int i = 0; i < A.rows(); i += 2) {
+    logpfa.val += std::log(std::abs(Amp( A(i, i+1) )));
+    logpfa.sgn *= A(i, i+1) > 0 ? 1 : -1;
+
+    if (iPiv(i)  -1 != i  ) logpfa.sgn *= -1;
+    if (iPiv(i+1)-1 != i+1) logpfa.sgn *= -1;
+  }
+  return logpfa;
+}
+
+// Additional parameter for return type deduction.
+template <typename Mat, typename Vec, typename Ret>
+Ret utu2logpfa(const Mat &A, const Vec &iPiv, Ret &ret)
+{ return utu2logpfa<Mat, Vec, typename Ret::T>(A, iPiv); }
+
+
+
+template <typename Mat, typename Vec,
+          typename Amp = typename Mat::Scalar>
+Amp ltl2pfa(const Mat &A, const Vec &iPiv)
 {
-  T pfa = 1.0;
-  colmaj<T> A(A_, ldA);
+  Amp pfa = 1.0;
 
-  for (int i = 0; i < n; i += 2) {
+  for (int i = 0; i < A.rows(); i += 2) {
     pfa *= -A(i+1, i);
 
-    if (iPiv[i  ]-1 != i  ) pfa = -pfa;
-    if (iPiv[i+1]-1 != i+1) pfa = -pfa;
+    if (iPiv(i)  -1 != i  ) pfa = -pfa;
+    if (iPiv(i+1)-1 != i+1) pfa = -pfa;
   }
   return pfa;
 }
 
-template <typename T>
-T utu2pfa(int n, T *A_, int ldA, int *iPiv)
+template <typename Mat, typename Vec,
+          typename Amp = typename Mat::Scalar>
+Amp utu2pfa(const Mat &A, const Vec &iPiv)
 {
-  T pfa = 1.0;
-  colmaj<T> A(A_, ldA);
+  Amp pfa = 1.0;
 
-  for (int i = 0; i < n; i += 2) {
+  for (int i = 0; i < A.rows(); i += 2) {
     pfa *= A(i, i+1);
 
-    if (iPiv[i  ]-1 != i  ) pfa = -pfa;
-    if (iPiv[i+1]-1 != i+1) pfa = -pfa;
+    if (iPiv(i)  -1 != i  ) pfa = -pfa;
+    if (iPiv(i+1)-1 != i+1) pfa = -pfa;
   }
   return pfa;
 }
 
-template <typename T>
-T ltl2pfa(uplo_t uplo, int n, T *A_, int ldA, int *iPiv)
+template <typename Mat, typename Vec,
+          typename Amp = typename Mat::Scalar>
+Amp ltl2pfa(const char &uplo, const Mat &A, const Vec &iPiv)
 {
   switch (uplo) {
-  case BLIS_LOWER:
-    return ltl2pfa(n, A_, ldA, iPiv);
-  case BLIS_UPPER:
-    return utu2pfa(n, A_, ldA, iPiv);
+  case 'l':
+  case 'L':
+    return ltl2pfa(A, iPiv);
+  case 'u':
+  case 'U':
+    return utu2pfa(A, iPiv);
   default:
     return 0.0;
   }
diff --git a/src/ltl2inv/testinv.cc b/src/ltl2inv/testinv.cc
deleted file mode 100644
index c7cbc358..00000000
--- a/src/ltl2inv/testinv.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <chrono>
-
-#include "colmaj.hh"
-#include "invert.tcc"
-
-struct info_t {
-    double err;
-    long long tim;
-};
-
-template <typename T>
-info_t test_decomp2inv(uplo_t uplo, int n)
-{
-    using namespace std::chrono;
-
-    colmaj<T> A(new T[n * n], n);
-    colmaj<T> M(new T[n * n], n);
-    colmaj<T> X(new T[n * n], n);
-    int *ipiv = new int[n];
-    T *vT = new T[n-1];
-
-    assert(&A(0, 0));
-    assert(&M(0, 0));
-    assert(&X(0, 0));
-    assert(ipiv);
-    assert(vT);
-
-    for (int j = 0; j < n; ++j) {
-        A(j, j) = 0.0;
-        X(j, j) = 0.0;
-
-        for (int i = j+1; i < n; ++i) {
-            A(i, j) = rand();
-            A(i, j) /= RAND_MAX;
-
-            A(j, i) = -A(i, j);
-            switch (uplo) {
-            case BLIS_LOWER:
-                X(i, j) = A(i, j); break;
-            case BLIS_UPPER:
-                X(j, i) = A(j, i); break;
-            default:
-                break;
-            }
-        }
-    }
-
-    auto start = high_resolution_clock::now();
-
-    sktrf<T>(uplo, n, &X(0, 0), n, ipiv, &M(0, 0), n*n);
-    switch (uplo) {
-    case BLIS_LOWER:
-        ltl2inv<T>(n, &X(0, 0), n, ipiv, vT, &M(0, 0), n); break;
-    case BLIS_UPPER:
-        utu2inv<T>(n, &X(0, 0), n, ipiv, vT, &M(0, 0), n); break;
-    default:
-        break;
-    }
-
-    auto tim = duration_cast<nanoseconds>(high_resolution_clock::now() - start).count();
-
-    gemm<T>(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE,
-            n, n, n,
-            1.0,
-            &A(0, 0), n,
-            &X(0, 0), n,
-            0.0,
-            &M(0, 0), n);
-
-#ifdef VERBOSE
-    printf("iPiv = [ ");
-    for (int i = 0; i < n; ++i)
-        printf("%d ", ipiv[i]);
-    printf("]\n");
-    printf("vT = [ ");
-    for (int i = 0; i < n-1; ++i)
-        printf("%f ", vT[i]);
-    printf("]\n");
-
-    printf("A*inv(A) =\n");
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < n; ++j)
-            printf("%10.6f ", M(i, j));
-        printf("\n");
-    }
-#endif
-
-    double err = 0.0;
-    for (int j = 0; j < n; ++j)
-        for (int i = 0; i < n; ++i)
-            err += std::abs(M(i, j) - double(!(i - j)));
-    err /= n; // sqrt(n*n);
-
-    delete[] &A(0, 0);
-    delete[] &M(0, 0);
-    delete[] &X(0, 0);
-    delete[] ipiv;
-    delete[] vT;
-
-    return info_t{ err, tim };
-}
-
-int main(void)
-{
-    info_t info; int n;
-    n= 10; info = test_decomp2inv<double>  (BLIS_LOWER, n); printf("double   n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim);
-    n= 10; info = test_decomp2inv<float>   (BLIS_LOWER, n); printf("float    n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim);
-    n= 10; info = test_decomp2inv<ccdcmplx>(BLIS_LOWER, n); printf("dcomplex n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(10 ,3)/info.tim);
-    n=200; info = test_decomp2inv<double>  (BLIS_LOWER, n); printf("double   n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim);
-    n=200; info = test_decomp2inv<float>   (BLIS_LOWER, n); printf("float    n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim);
-    n=200; info = test_decomp2inv<ccdcmplx>(BLIS_LOWER, n); printf("dcomplex n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(200,3)/info.tim);
-
-    n= 10; info = test_decomp2inv<double>  (BLIS_UPPER, n); printf("double   n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim);
-    n= 10; info = test_decomp2inv<float>   (BLIS_UPPER, n); printf("float    n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim);
-    n= 10; info = test_decomp2inv<ccdcmplx>(BLIS_UPPER, n); printf("dcomplex n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(10 ,3)/info.tim);
-    n=200; info = test_decomp2inv<double>  (BLIS_UPPER, n); printf("double   n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim);
-    n=200; info = test_decomp2inv<float>   (BLIS_UPPER, n); printf("float    n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim);
-    n=200; info = test_decomp2inv<ccdcmplx>(BLIS_UPPER, n); printf("dcomplex n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(200,3)/info.tim);
-
-    return 0;
-}
-
diff --git a/src/ltl2inv/trmmt.tcc b/src/ltl2inv/trmmt.tcc
index 79bc7170..78399b55 100644
--- a/src/ltl2inv/trmmt.tcc
+++ b/src/ltl2inv/trmmt.tcc
@@ -4,31 +4,39 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "colmaj.hh"
-#include "blalink.hh"
+#include "lapack2eigen.hh"
 #include "ilaenv_lauum.hh"
 
 
-template <typename T>
-inline void trmmt(uplo_t uploab, diag_t diaga, int n, T alpha, colmaj<T> &A, colmaj<T> &B)
+template <typename Mat>
+inline void trmmt(const char &uploAB, const char &diagA, const typename Mat::Scalar &alpha, Mat &A, Mat &B)
 {
-  int npanel = ilaenv_lauum<T>(uploab, n);
+  using namespace Eigen;
+  using T = typename Mat::Scalar;
+  int n = A.rows();
+  int npanel = ilaenv_lauum<T>(uploAB, n);
 
-  if (uploab == BLIS_UPPER) {
+  if (uploAB == 'U' || uploAB == 'u') {
     // A upper => A' lower -> write to UPPER part of B.
     for (int j = 0; j < n; j += npanel) {
       int nloc = std::min<int>(npanel, n - j);
 
-      trmm<T>(BLIS_LEFT, BLIS_UPPER, BLIS_TRANSPOSE, diaga,
-              j+nloc, nloc, alpha, &A(0, 0), A.ld, &B(0, j), B.ld);
+      auto Bslice = B(seq(0, j+nloc-1), seq(j, j+nloc-1));
+      if (diagA == 'U' || diagA == 'u')
+        Bslice = A(seq(0, j+nloc-1), seq(0, j+nloc-1)).template triangularView<UnitUpper>().transpose() * Bslice * alpha;
+      else
+        Bslice = A(seq(0, j+nloc-1), seq(0, j+nloc-1)).template triangularView<Upper>().transpose() * Bslice * alpha;
     }
   } else {
     // A lower => A' upper -> write to LOWER part of B.
     for (int j = 0; j < n; j += npanel) {
       int nloc = std::min<int>(npanel, n - j);
 
-      trmm<T>(BLIS_LEFT, BLIS_LOWER, BLIS_TRANSPOSE, diaga,
-              n-j, nloc, alpha, &A(j, j), A.ld, &B(j, j), B.ld);
+      auto Bslice = B(seq(j, n-1), seq(j, j+nloc-1));
+      if (diagA == 'U' || diagA == 'u')
+        Bslice = A(seq(j, n-1), seq(j, n-1)).template triangularView<UnitLower>().transpose() * Bslice * alpha;
+      else
+        Bslice = A(seq(j, n-1), seq(j, n-1)).template triangularView<Lower>().transpose() * Bslice * alpha;
     }
   }
 }

From a07f273686140735d46fe6ed2bd7699443d5ee4a Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 11:35:15 +0900
Subject: [PATCH 02/39] ltl2inv via Eig Build Sys

---
 src/ltl2inv/CMakeLists.txt   | 11 +++++++----
 src/ltl2inv/makefile_ltl2inv |  2 --
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/ltl2inv/CMakeLists.txt b/src/ltl2inv/CMakeLists.txt
index 8aa7141c..04fe165e 100644
--- a/src/ltl2inv/CMakeLists.txt
+++ b/src/ltl2inv/CMakeLists.txt
@@ -1,5 +1,5 @@
 # include guard
-cmake_minimum_required(VERSION 2.8.0 )
+cmake_minimum_required(VERSION 3.0)
 
 if(${CMAKE_PROJECT_NAME} STREQUAL "Project")
   message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.")
@@ -8,9 +8,12 @@ endif(${CMAKE_PROJECT_NAME} STREQUAL "Project")
 if("${ARCHITECTURE}" STREQUAL "x86_64")
   add_definitions(-DF77_COMPLEX_RET_INTEL)
 endif()
-include_directories(../common)
+include_directories(../pfapack/c_interface)
 
-# TODO: Move blalink_gemmt.c to other subprojects?
-add_library(ltl2inv STATIC ltl2inv.cc blalink_gemmt.c ilaenv_lauum.cc)
+# Require Eigen
+find_package(Eigen3 3.3 REQUIRED NO_MODULE)
+include_directories(${EIGEN3_INCLUDE_DIR})
+
+add_library(ltl2inv STATIC ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc)
 target_compile_definitions(ltl2inv PRIVATE -D_CC_IMPL)
 
diff --git a/src/ltl2inv/makefile_ltl2inv b/src/ltl2inv/makefile_ltl2inv
index 2b9d4aeb..d57c5fcd 100644
--- a/src/ltl2inv/makefile_ltl2inv
+++ b/src/ltl2inv/makefile_ltl2inv
@@ -15,7 +15,6 @@ SUFFIXES: .o .c
 .c.o: $(HDR)
 	$(CC) $(OPTION) $(CFLAGS) -c $< -o $@ \
 		-Wno-incompatible-pointer-types-discards-qualifiers \
-		-I../common \
 		-I../pfapack/c_interface \
 		-I$(BLIS_ROOT)/include \
 		-I$(BLIS_ROOT)/include/blis
@@ -24,7 +23,6 @@ SUFFIXES: .o .cc
 
 .cc.o: $(HDR)
 	$(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \
-		-I../common \
 		-I../pfapack/c_interface \
 		-I$(BLIS_ROOT)/include \
 		-I$(BLIS_ROOT)/include/blis

From 24700aae2c3be25b22b411d68a7a9fe782b1ea95 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 14:48:31 +0900
Subject: [PATCH 03/39] Port from libVMC + Fixes for Complex Nums.

---
 src/ltl2inv/invert.tcc           |  16 +-
 src/ltl2inv/trmmt.tcc            |   4 +-
 src/pfupdates/CMakeLists.txt     |   2 +-
 src/pfupdates/config_manager.tcc | 279 ++++++++++++
 src/pfupdates/makefile_pfupdates |   8 +-
 src/pfupdates/orbital_mat.tcc    |  53 ++-
 src/pfupdates/pf_interface.cc    |  38 +-
 src/pfupdates/pf_interface.h     |   8 +-
 src/pfupdates/skmv.tcc           |  51 +--
 src/pfupdates/skr2k.tcc          |  36 +-
 src/pfupdates/skslc.tcc          |  25 +-
 src/pfupdates/testupdate.cc      |  70 ---
 src/pfupdates/updated_tdi.tcc    | 736 ++++++++++++++++++-------------
 13 files changed, 854 insertions(+), 472 deletions(-)
 create mode 100644 src/pfupdates/config_manager.tcc
 delete mode 100644 src/pfupdates/testupdate.cc

diff --git a/src/ltl2inv/invert.tcc b/src/ltl2inv/invert.tcc
index 4fd24cda..ba3bfd8a 100644
--- a/src/ltl2inv/invert.tcc
+++ b/src/ltl2inv/invert.tcc
@@ -8,8 +8,8 @@
 #include "trmmt.tcc"
 #include "error.hh"
 
-template <typename Vec, typename Mat>
-void sktdsmx(const Vec &vT, const Mat &B, Mat &C)
+template <typename Vec, typename Mat, typename Mat_>
+void sktdsmx(const Vec &vT, const Mat &B, Mat_ &C)
 {
   int n = B.rows();
   assert_(B.cols() == n && C.cols() == n && C.rows() == n, "sktdsmx: Dimension mismatch.");
@@ -27,8 +27,8 @@ void sktdsmx(const Vec &vT, const Mat &B, Mat &C)
       C(i-1, j) = (B(i, j) + C(i+1, j) * vT[i]) / vT[i-1];
 }
 
-template <typename iVec, typename Vec, typename Mat>
-void ltl2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M)
+template <typename iVec, typename Vec, typename Mat, typename Mat_>
+void ltl2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M)
 {
   using namespace Eigen;
   using T = typename Mat::Scalar;
@@ -77,8 +77,8 @@ void ltl2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M)
       A(i, Eigen::all).swap(A(iPiv[i] - 1, Eigen::all));
 }
 
-template <typename iVec, typename Vec, typename Mat>
-void utu2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M)
+template <typename iVec, typename Vec, typename Mat, typename Mat_>
+void utu2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M)
 {
   using namespace Eigen;
   using T = typename Mat::Scalar;
@@ -127,8 +127,8 @@ void utu2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat &M)
       A(i, Eigen::all).swap(A(iPiv[i]-1, Eigen::all));
 }
 
-template <typename iVec, typename Vec, typename Mat>
-void ltl2inv(const char uplo, Mat &A, const iVec &iPiv, Vec &vT, Mat &M)
+template <typename iVec, typename Vec, typename Mat, typename Mat_>
+void ltl2inv(const char uplo, Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M)
 {
   switch (uplo) {
   case 'l':
diff --git a/src/ltl2inv/trmmt.tcc b/src/ltl2inv/trmmt.tcc
index 78399b55..db60335a 100644
--- a/src/ltl2inv/trmmt.tcc
+++ b/src/ltl2inv/trmmt.tcc
@@ -8,8 +8,8 @@
 #include "ilaenv_lauum.hh"
 
 
-template <typename Mat>
-inline void trmmt(const char &uploAB, const char &diagA, const typename Mat::Scalar &alpha, Mat &A, Mat &B)
+template <typename Mat, typename Mat_>
+inline void trmmt(const char &uploAB, const char &diagA, const typename Mat::Scalar &alpha, Mat &A, Mat_ &B)
 {
   using namespace Eigen;
   using T = typename Mat::Scalar;
diff --git a/src/pfupdates/CMakeLists.txt b/src/pfupdates/CMakeLists.txt
index 1a06ef74..11b0202b 100644
--- a/src/pfupdates/CMakeLists.txt
+++ b/src/pfupdates/CMakeLists.txt
@@ -8,8 +8,8 @@ endif(${CMAKE_PROJECT_NAME} STREQUAL "Project")
 if(${ARCHITECTURE} STREQUAL "x86_64")
   add_definitions(-DF77_COMPLEX_RET_INTEL)
 endif()
-include_directories(../common)
 include_directories(../ltl2inv)
+include_directories(../pfapack/c_interface)
 
 add_library(pfupdates STATIC pf_interface.cc)
 target_compile_definitions(pfupdates PRIVATE -D_CC_IMPL)
diff --git a/src/pfupdates/config_manager.tcc b/src/pfupdates/config_manager.tcc
new file mode 100644
index 00000000..44eb1f07
--- /dev/null
+++ b/src/pfupdates/config_manager.tcc
@@ -0,0 +1,279 @@
+/**
+ * \copyright Copyright (c) Dept. Phys., Univ. Tokyo
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "error.hh"
+#include <random>
+#include <vector>
+#include <cstdint>
+
+namespace vmc
+{
+struct config_manager
+{
+  using index_t = int32_t;
+  using base_t = std::vector<index_t>;
+
+  const index_t nsites;
+  index_t norbs()
+  { return nsites * 2; }
+
+  const  base_t &config_base()          const { return _cfg_base; }
+  const index_t  config_base(index_t i) const { return _cfg_base.at(i); }
+  const  base_t &   from_idx()          const { return _from_idx; }
+  const index_t     from_idx(index_t i) const { return _from_idx.at(i); }
+  const  base_t &     to_orb()          const { return _to_orb; }
+  const index_t       to_orb(index_t i) const { return _to_orb.at(i); }
+  const  base_t &     config()          const { return _cfg; }
+  const index_t       config(index_t i) const { return _cfg.at(i); }
+  const  base_t &        num()          const { return _num; }
+  const index_t          num(index_t i) const { return _num.at(i); }
+  const  base_t &        inv()          const { return _inv; }
+  const index_t          inv(index_t i) const { return _inv.at(i); }
+  const index_t         spin(index_t i) const { return _num.at(i) == 1 ? 1 : _num.at(i) == 2 ? -1 : 0; }
+
+  void reserve_space(index_t mmax)
+  {
+    _from_idx.reserve(mmax * sizeof(index_t));
+    _to_orb.reserve(mmax * sizeof(index_t));
+  }
+
+  void attach_config(const base_t &cfg)
+  {
+    _cfg_base = cfg;
+    _cfg = cfg;
+    _from_idx.clear();
+    _to_orb.clear();
+
+    // Reset _num.
+    for (index_t i = 0; i < nsites; ++i)
+      _num.at(i) = 0;
+
+    // Generate num.
+    for (index_t i = 0; i < cfg.size(); ++i) {
+      // 0b01 for up, 0b10 for down, 0b11 for double occupation.
+      index_t xi = cfg.at(i) % nsites;
+      index_t si = cfg.at(i) / nsites;
+      _num.at(xi) += (si + 1);
+    }
+
+    // Generate inv.
+    for (index_t i = 0; i < _inv.size(); ++i)
+      _inv.at(i) = -1;
+    for (index_t i = 0; i < cfg.size(); ++i)
+      _inv.at(cfg.at(i)) = i;
+  }
+
+  void merge_config()
+  {
+    // OR: cfg_base = cfg;
+    for (index_t j = 0; j < _from_idx.size(); ++j)
+      _cfg_base.at(_from_idx.at(j)) = _to_orb.at(j);
+    _from_idx.clear();
+    _to_orb.clear();
+  }
+
+  void push_update(index_t osi, index_t msj)
+  {
+    for (auto i = _from_idx.begin(); i != _from_idx.end(); ++i)
+      assert_(*i != msj, typeid(*this).name(), "Consecutive hopping unsupported. Please merge manually before proceeding.");
+    index_t osj = _cfg.at(msj);
+    index_t xj = osj % nsites;
+    index_t sj = osj / nsites;
+    index_t xi = osi % nsites;
+    index_t si = osi / nsites;
+    assert_(_inv.at(osj) == msj && _inv.at(osi) == -1, typeid(*this).name(), "Inv error @ push.");
+
+    _from_idx.push_back(msj);
+    _to_orb.push_back(osi);
+    _cfg.at(msj) = osi;
+    // Update num.
+    _num.at(xj) -= sj + 1;
+    _num.at(xi) += si + 1;
+    // Update inv.
+    _inv.at(osj) = -1;
+    _inv.at(osi) = msj;
+  }
+
+  void pop_update()
+  {
+    assert_(_from_idx.size(), typeid(*this).name(), "Trying to pop empty updates.");
+    index_t msj = _from_idx.back();
+    index_t osj = _cfg_base.at(msj);
+    index_t xj  = osj % nsites;
+    index_t sj  = osj / nsites;
+    index_t osi = _cfg.at(msj);
+    index_t xi  = osi % nsites;
+    index_t si  = osi / nsites;
+    assert_(_inv.at(osi) == msj && _inv.at(osj) == -1, typeid(*this).name(), "Inv error @ pop.");
+
+    _cfg.at(msj) = osj;
+    // Update num.
+    _num.at(xi) -= si + 1;
+    _num.at(xj) += sj + 1;
+    // Update inv.
+    _inv.at(osi) = -1;
+    _inv.at(osj) = msj;
+    _from_idx.pop_back();
+    _to_orb.pop_back();
+  }
+
+  void check()
+  {
+    assert_(_cfg.size() <= norbs(), typeid(*this).name(), "More Fermions than orbitals.");
+
+    base_t num(nsites, 0);
+    for (index_t i = 0; i < num.size(); ++i) {
+      index_t xi = _cfg.at(i) % nsites;
+      index_t si = _cfg.at(i) / nsites;
+
+      assert_(!(num.at(xi) & (si + 1)), typeid(*this).name(), "More than two Fermions occupying the same orbital.");
+      num.at(xi) += (si + 1);
+    }
+    for (index_t i = 0; i < nsites; ++i)
+      assert_(num.at(i) == _num.at(i), typeid(*this).name(), "Found broken member: _num.");
+
+    base_t cfg_curr = _cfg_base;
+    for (index_t j = 0; j < _from_idx.size(); ++j)
+      cfg_curr.at(_from_idx.at(j)) = _to_orb.at(j);
+    for (index_t i = 0; i < cfg_curr.size(); ++i)
+      assert_(cfg_curr.at(i) == _cfg.at(i), typeid(*this).name(), "Found broken member: _cfg.");
+    // Check inv.
+  }
+
+  config_manager(index_t nsites_)
+  : nsites(nsites_), _num(nsites_), _inv(norbs()) {  }
+
+  config_manager(index_t nsites_, base_t cfg)
+  : nsites(nsites_), _num(nsites_), _inv(norbs())
+  { attach_config(cfg); }
+
+  template <typename rng_t>
+  static base_t singlet_config(unsigned nsite, unsigned nsinglet, rng_t &rng)
+  {
+    using dist_t = std::uniform_int_distribution<unsigned>;
+
+    bool allow_doublon = nsinglet * 2 > nsite;
+    base_t marks(nsite, 0);
+    base_t cfg(0);
+
+    // Spin up.
+    for (int i = 0; i < nsinglet; ++i) {
+      int iseq = dist_t(0, nsite - i - 1)(rng);
+
+      int isite;
+      int isite_c = 0;
+      do {
+        // Find the next available site.
+        while (marks.at(isite_c)) { isite_c++; }
+        isite = isite_c;
+        isite_c++;
+      } while (iseq-- > 0);
+
+      marks.at(isite) = 1;
+      cfg.push_back(isite);
+    }
+
+    // Spin down.
+    for (int i = 0; i < nsinglet; ++i) {
+      int iseq;
+      if (allow_doublon)
+        iseq = dist_t(0, nsite - i - 1)(rng);
+      else
+        iseq = dist_t(0, nsite - i - 1 - nsinglet)(rng);
+
+      int isite;
+      int isite_c = 0;
+      do {
+        // Find the next unoccupied site / orb.
+        if (allow_doublon)
+          while (marks.at(isite_c) > 1) { isite_c++; }
+        else
+          while (marks.at(isite_c) > 0) { isite_c++; }
+        isite = isite_c;
+        isite_c++;
+      } while (iseq-- > 0);
+
+      marks.at(isite) = 2;
+      cfg.push_back(isite + nsite);
+    }
+    return cfg;
+  }
+
+  template <typename rng_t>
+  std::pair<int, int> propose_hop(rng_t &rng) const
+  {
+    int loop_breaker = 0;
+    int xi;
+    int msj = std::uniform_int_distribution<unsigned>(0, _cfg.size() - 1)(rng);
+    std::uniform_int_distribution<unsigned> select_site(0, nsites - 1);
+    int sj = msj / nsites;
+
+    do {
+      assert_(loop_breaker++ < 10000000, typeid(*this).name(), "Too many loops.");
+      xi = select_site(rng);
+    } while (_num.at(xi) & (sj + 1));
+
+    return std::make_pair(xi + sj * nsites, msj);
+  }
+
+  template <typename rng_t>
+  std::tuple<int, int, int, int> propose_exc(rng_t &rng) const
+  {
+    using dist_t = std::uniform_int_distribution<unsigned>;
+
+    int nup = 0;
+    int ndn = 0;
+    int loop_breaker = 0;
+    int msj, xj = 0;
+    int msl, xl = 0;
+
+    // Count # of spin-up/down sites.
+    for (int i = 0; i < _num.size(); ++i)
+      if (num(i) == 1)
+        nup++;
+      else if (num(i) == 2)
+        ndn++;
+
+    // Pick a singly occupied spin-up site.
+    int iseq_up = dist_t(0, nup - 1)(rng);
+    while (true) {
+      while (num(xj) != 1) ++xj;
+      if (--iseq_up < 0)
+        break;
+      else
+        ++xj;
+    }
+
+    // Pick a singly occupied spin-down site.
+    int iseq_dn = dist_t(0, ndn - 1)(rng);
+    while (true) {
+      while (num(xl) != 2) ++xl;
+      if (--iseq_dn < 0)
+        break;
+      else
+        ++xl;
+    }
+
+    // Reverse lookup config indices.
+    msj = inv(xj);
+    msl = inv(xl + nsites);
+    assert_(msj >= 0 && msl >= 0, typeid(*this).name(), "ExchangeConfig error.");
+
+    return std::make_tuple(xl, msj, xj + nsites, msl);
+  }
+
+private:
+  base_t _cfg_base;
+  base_t _from_idx;
+  base_t _to_orb;
+  base_t _cfg;
+  base_t _num;
+  base_t _inv;
+};
+}
+
diff --git a/src/pfupdates/makefile_pfupdates b/src/pfupdates/makefile_pfupdates
index bd8502e3..ab9be063 100644
--- a/src/pfupdates/makefile_pfupdates
+++ b/src/pfupdates/makefile_pfupdates
@@ -2,12 +2,14 @@ include ../make.sys
 
 OBJ = pfupdates.o
 SRC = pf_interface.cc
-HDR = pf_interface.h orbital_mat.tcc skmv.tcc skr2k.tcc skslc.tcc updated_tdi.tcc \
-	  ../common/blalink.hh ../common/blalink_fort.h ../common/colmaj.hh
+HDR = pf_interface.h \
+		orbital_mat.tcc skmv.tcc skr2k.tcc skslc.tcc updated_tdi.tcc \
+		../ltl2inv/eigen_gemmt.tcc ../ltl2inv/invert.tcc ../ltl2inv/pfaffian.tcc ../ltl2inv/trmmt.tcc \
+		../ltl2inv/error.hh ../ltl2inv/ilaenv_lauum.hh ../ltl2inv/lapack2eigen.hh
 
 $(OBJ): $(SRC) $(HDR) 
 	$(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \
-		-I../common -I../ltl2inv \
+		-I../pfapack/c_interface -I../ltl2inv \
 		-I$(BLIS_ROOT)/include \
 		-I$(BLIS_ROOT)/include/blis
 
diff --git a/src/pfupdates/orbital_mat.tcc b/src/pfupdates/orbital_mat.tcc
index 3e7ccf76..ddb4ce6c 100644
--- a/src/pfupdates/orbital_mat.tcc
+++ b/src/pfupdates/orbital_mat.tcc
@@ -6,33 +6,36 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "blis.h"
-#include "colmaj.hh"
+#include <Eigen/Dense>
 #include <random>
-template <typename T>
-using matrix_t = colmaj<T>;
 
-template <typename T>
+namespace vmc
+{
+namespace orbital
+{
+template <typename matrix_t_>
 struct orbital_mat
 {
-  const uplo_t uplo;
-  const dim_t nsite;
-  matrix_t<T> X; ///< nsite*nsite.
+  using matrix_t = matrix_t_;
+  using T = typename matrix_t::Scalar;
+  using index_t = int32_t;
 
-  orbital_mat(uplo_t uplo_, dim_t nsite_, T *X_, inc_t ldX)
-      : uplo(uplo_), nsite(nsite_),
-        X(X_, ldX) { }
+  const char uplo;
+  const index_t norb_;
+  matrix_t &X; ///< norb*norb.
 
-  orbital_mat(uplo_t uplo_, dim_t nsite_, matrix_t<T> &X_)
-  : uplo(uplo_), nsite(nsite_), X(X_) { }
+  orbital_mat(char uplo_, index_t norb__, matrix_t &X_)
+  : uplo(uplo_), norb_(norb__), X(X_) { }
 
-  void randomize(double amplitude, unsigned seed) { 
+  virtual index_t norb() { return norb_; }
+
+  virtual void randomize(double amplitude, unsigned seed) {
     using namespace std;
     mt19937_64 rng(seed);
     uniform_real_distribution<double> dist(-0.1, 1.0);
 
-    for (dim_t j = 0; j < nsite; ++j) {
-      for (dim_t i = 0; i < j; ++i) {
+    for (index_t j = 0; j < norb_; ++j) {
+      for (index_t i = 0; i < j; ++i) {
         X(i, j) = T(dist(rng)) * amplitude;
         X(j, i) = -X(i, j);
       }
@@ -42,18 +45,28 @@ struct orbital_mat
 
   void randomize(double amplitude) { randomize(amplitude, 511); }
 
-  T operator()(dim_t osi, dim_t osj) {
+  virtual index_t pack_idx(index_t osi, index_t osj) {
+    abort();
+  }
+
+  virtual void unpack_idx(index_t &osi, index_t &osj, index_t idx) {
+    abort();
+  }
+
+  virtual T operator()(index_t osi, index_t osj) {
     if (osi == osj)
       return T(0.0);
 
     switch (uplo) {
-    case BLIS_UPPER:
+    case 'U':
+    case 'u':
       if (osi < osj)
         return X(osi, osj);
       else
         return -X(osj, osi);
     
-    case BLIS_LOWER:
+    case 'L':
+    case 'l':
       if (osi > osj)
         return X(osi, osj);
       else
@@ -64,3 +77,5 @@ struct orbital_mat
     }
   }
 };
+}
+}
diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc
index a95efe01..fcdcfaca 100644
--- a/src/pfupdates/pf_interface.cc
+++ b/src/pfupdates/pf_interface.cc
@@ -4,6 +4,7 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #include "updated_tdi.tcc"
+#include "orbital_mat.tcc"
 // In implementation, this should appear AFTER blis.h.
 #include "pf_interface.h"
 
@@ -16,8 +17,15 @@
 #error "Valid non-preprocessor _pragma() not found."
 #endif
 
-#define orbv( i, ctype ) ( (orbital_mat<ctype> *)orbv[i] )
-#define objv( i, ctype ) ( (updated_tdi<ctype> *)objv[i] )
+using namespace Eigen;
+using namespace vmc::orbital;
+template <typename T>
+using matrix_t = Map<Matrix<T, Dynamic, Dynamic>, 0, OuterStride<> >;
+
+#define matv( i, ctype ) ( (                        matrix_t<ctype>     *)matv[i] )
+#define mapv( i, ctype ) ( (                        matrix_t<ctype>     *)mapv[i] )
+#define orbv( i, ctype ) ( (            orbital_mat<matrix_t<ctype> >   *)orbv[i] )
+#define objv( i, ctype ) ( (updated_tdi<orbital_mat<matrix_t<ctype> > > *)objv[i] )
 
 #define EXPANDNAME( funcname, cblachar ) funcname##_##cblachar
 
@@ -35,19 +43,23 @@
       int32_t  *elespn, \
       uint64_t  mmax, \
       void     *objv[], \
-      void     *orbv[] ) \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] ) \
 { \
   OMP_PARALLEL_FOR_SHARED \
   for (int iqp = 0; iqp < num_qp; ++iqp) { \
-    orbv[iqp] = new orbital_mat<ctype>( \
-        BLIS_UPPER, norbs, orbmat_base + iqp * orbmat_stride, norbs); \
-    objv[iqp] = new updated_tdi<ctype>( \
-        *orbv(iqp, ctype), nelec, invmat_base + iqp * invmat_stride, nelec, mmax); \
+    mapv[iqp] = new matrix_t<ctype>(orbmat_base + iqp * orbmat_stride, norbs, norbs, OuterStride<>(norbs)); \
+    matv[iqp] = new matrix_t<ctype>(invmat_base + iqp * invmat_stride, nelec, nelec, OuterStride<>(nelec)); \
+    orbv[iqp] = new orbital_mat<matrix_t<ctype>>('U', norbs, *mapv(iqp, ctype) ); \
+    objv[iqp] = new updated_tdi<orbital_mat<matrix_t<ctype>>> \
+        ( *orbv(iqp, ctype), *matv(iqp, ctype), nelec, mmax); \
 \
     /* Compute initial. */ \
-    auto &cfg_i = objv(iqp, ctype)->elem_cfg; \
+    vmc::config_manager::base_t cfg_i(nelec); \
     for (int msi = 0; msi < nelec; ++msi) \
       cfg_i.at(msi) = eleidx[msi] + elespn[msi]*nsite; \
+    objv(iqp, ctype)->attach_config(cfg_i); \
     objv(iqp, ctype)->initialize(); \
   } \
 }
@@ -61,11 +73,15 @@ GENIMPL( ccdcmplx, z )
   void EXPANDNAME( updated_tdi_v_free, cblachar ) \
     ( uint64_t  num_qp, \
       void     *objv[], \
-      void     *orbv[] ) \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] ) \
 { \
   for (int iqp = 0; iqp < num_qp; ++iqp) { \
     delete orbv(iqp, ctype); \
     delete objv(iqp, ctype); \
+    delete matv(iqp, ctype); \
+    delete mapv(iqp, ctype); \
   } \
 }
 // GENIMPL( float,    s )
@@ -82,7 +98,7 @@ GENIMPL( ccdcmplx, z )
 { \
   /* Minus sign appears from that SlaterElm is in fact interpreted as row-major. */ \
   for (int iqp = 0; iqp < num_qp; ++iqp) \
-    pfav[iqp] = -objv(iqp, ctype)->get_Pfa(); \
+    pfav[iqp] = -objv(iqp, ctype)->get_amplitude(); \
 }
 // GENIMPL( float,    s )
 GENIMPL( double,   d )
@@ -120,7 +136,7 @@ GENIMPL( ccdcmplx, z )
 { \
   OMP_PARALLEL_FOR_SHARED \
   for (int iqp = 0; iqp < num_qp; ++iqp) { \
-    auto &from_i = objv(iqp, ctype)->from_idx; \
+    const auto &from_i = objv(iqp, ctype)->get_config_manager().from_idx(); \
     if (from_i.size() > objv(iqp, ctype)->mmax - 2) \
       objv(iqp, ctype)->merge_updates(); \
     else \
diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h
index 8593a12c..903a90ef 100644
--- a/src/pfupdates/pf_interface.h
+++ b/src/pfupdates/pf_interface.h
@@ -38,7 +38,9 @@ extern "C" {
       int32_t  *elespn, \
       uint64_t  mmax, \
       void     *objv[], \
-      void     *orbv[] );
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] );
 
 // GENDEF( float,    s )
 GENDEF( double,   d )
@@ -50,7 +52,9 @@ GENDEF( ccdcmplx, z )
    void EXPANDNAME( updated_tdi_v_free, cblachar ) \
     ( uint64_t  num_qp, \
       void     *objv[], \
-      void     *orbv[] );
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] );
 
 // GENDEF( float,    s )
 GENDEF( double,   d )
diff --git a/src/pfupdates/skmv.tcc b/src/pfupdates/skmv.tcc
index 2abc2c6d..31a0947c 100644
--- a/src/pfupdates/skmv.tcc
+++ b/src/pfupdates/skmv.tcc
@@ -6,50 +6,39 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "colmaj.hh"
-#include "blalink.hh"
-#include <vector>
+#include "lapack2eigen.hh"
 
-template <typename T>
-void skmv(uplo_t uploA,
-          dim_t m,
-          T alpha,
-          T *_A, inc_t ldA,
-          T *X,
-          T *Y) {
-  using namespace std;
-  colmaj<T> A(_A, ldA);
+template <typename Mat, typename Vec>
+Eigen::Matrix<typename Mat::Scalar, Eigen::Dynamic, 1>
+  skmv(const char &uploA,
+       const typename Mat::Scalar &alpha,
+       const Mat &A,
+       const Vec &X) {
+  using namespace Eigen;
 
+  Eigen::Matrix<typename Mat::Scalar, Eigen::Dynamic, 1> Y;
   // Way 1:
-  // skmm(BLIS_LEFT, uploA, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, m, 1,
-  //      T(1.0), &A(0, 0), A.ld, &X[0], ldA, T(0.0), &Y[0], ldA);
-
-  // Way 2:
-  vector<T> Z(m);
-  for (dim_t i = 0; i < m; ++i) {
-    Z[i] = X[i];
-    Y[i] = X[i];
+  if (uploA == 'U' || uploA == 'u') {
+    Y = A.template triangularView<StrictlyUpper>() * X - A.template triangularView<StrictlyUpper>().transpose() * X;
+  } else {
+    Y = A.template triangularView<StrictlyLower>() * X - A.template triangularView<StrictlyLower>().transpose() * X;
   }
-  trmv(uploA, BLIS_NO_TRANSPOSE,
-       m, alpha, &A(0, 0), A.ld, &Y[0], 1);
-  trmv(uploA, BLIS_TRANSPOSE,
-       m, alpha, &A(0, 0), A.ld, &Z[0], 1);
-  for (dim_t i = 0; i < m; ++i)
-    Y[i] -= Z[i];
+  Y *= alpha;
 
-  // Way 3:
-  // for (dim_t i = 0; i < m; ++i)
+  // Way 2:
+  // for (int i = 0; i < m; ++i)
   //   Y[i] = 0.0;
-  // for (dim_t j = 0; j < m; ++j) {
+  // for (int j = 0; j < m; ++j) {
   //   T x_j = X[j];
-  //   for (dim_t i = 0; i < j; ++i) {
+  //   for (int i = 0; i < j; ++i) {
   //     T A_ij = A(i, j);
   //     Y[i] += A_ij * x_j;
   //     Y[j] -= A_ij * X[i];
   //   }
   // }
-  // for (dim_t i = 0; i < m; ++i)
+  // for (int i = 0; i < m; ++i)
   //   Y[i] *= alpha;
   
+  return Y;
 }
 
diff --git a/src/pfupdates/skr2k.tcc b/src/pfupdates/skr2k.tcc
index 0b8cdcb6..a6bba999 100644
--- a/src/pfupdates/skr2k.tcc
+++ b/src/pfupdates/skr2k.tcc
@@ -7,29 +7,29 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
-#include "blalink.hh"
-#include "blalink_gemmt.hh"
-#include "colmaj.hh"
+#pragma once
+#include "lapack2eigen.hh"
 #include <iostream>
 
 
-template <typename T>
-inline void skr2k(uplo_t uploc,
-                  trans_t transab,
-                  dim_t m, dim_t k,
-                  T alpha,
-                  T *_A, inc_t ldA,
-                  T *_B, inc_t ldB,
-                  T beta,
-                  T *_C, inc_t ldC)
+template <typename Mat0, typename Mat1, typename Mat2>
+inline void skr2k(const char &uploC,
+                  const char &transAB,
+                  const typename Mat2::Scalar &alpha,
+                  const Mat0 &A,
+                  const Mat1 &B,
+                  const typename Mat2::Scalar &beta,
+                        Mat2 &C)
 {
-    T one = 1.0;
-    if (transab == BLIS_NO_TRANSPOSE) {
-        gemmt<T>(uploc, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, m, k, alpha, _A, ldA, _B, ldB, beta, _C, ldC);
-        gemmt<T>(uploc, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, m, k,-alpha, _B, ldB, _A, ldA, one , _C, ldC);
+    using T = typename Mat0::Scalar;
+
+    l2e::le_mat_t<T> C_(C);
+    if (transAB == 'N' || transAB == 'n') {
+        gemmt<T>(uploC, 'N', 'T',  alpha, l2e::le_mat_t<T>(A), l2e::le_mat_t<T>(B), beta, C_);
+        gemmt<T>(uploC, 'N', 'T', -alpha, l2e::le_mat_t<T>(B), l2e::le_mat_t<T>(A), T(1), C_);
     } else {
-        gemmt<T>(uploc, BLIS_TRANSPOSE, BLIS_NO_TRANSPOSE, m, k, alpha, _A, ldA, _B, ldB, beta, _C, ldC);
-        gemmt<T>(uploc, BLIS_TRANSPOSE, BLIS_NO_TRANSPOSE, m, k,-alpha, _B, ldB, _A, ldA, one , _C, ldC);
+        gemmt<T>(uploC, 'T', 'N',  alpha, l2e::le_mat_t<T>(A), l2e::le_mat_t<T>(B), beta, C_);
+        gemmt<T>(uploC, 'T', 'N', -alpha, l2e::le_mat_t<T>(B), l2e::le_mat_t<T>(A), T(1), C_);
     }
 }
 
diff --git a/src/pfupdates/skslc.tcc b/src/pfupdates/skslc.tcc
index 0fd5a4cf..084c965d 100644
--- a/src/pfupdates/skslc.tcc
+++ b/src/pfupdates/skslc.tcc
@@ -6,28 +6,29 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
-#include "colmaj.hh"
+#pragma once
+#include "lapack2eigen.hh"
 
-template<typename T>
-inline signed skslc(uplo_t uploA,
-                    unsigned n,
-                    unsigned i,
-                    T *x,
-                    T *_A, unsigned ldA)
+template<typename Mat>
+inline Eigen::Matrix<typename Mat::Scalar, Eigen::Dynamic, 1>
+  skslc(const char &uploA, unsigned i, const Mat &A)
 {
-    colmaj<T> A(_A, ldA);
+    Eigen::Matrix<typename Mat::Scalar, Eigen::Dynamic, 1> x(A.rows());
+
     x[i] = 0.0;
     switch (uploA) {
-    case BLIS_UPPER:
+    case 'U':
+    case 'u':
         for (unsigned j = 0; j < i; ++j)
             x[j] =  A(j, i);
-        for (unsigned j = i+1; j < n; ++j)
+        for (unsigned j = i+1; j < A.rows(); ++j)
             x[j] = -A(i, j);
-        return 0;
+        break;
 
     default:
         std::cerr << "SKSLC: Lower triangular storage not implemented. Sorry." << std::endl;
-        return err_info(Pfaffine_NOT_IMPLEMNTED, 0);
+        break;;
     }
+    return x;
 }
 
diff --git a/src/pfupdates/testupdate.cc b/src/pfupdates/testupdate.cc
deleted file mode 100644
index 8f1c237b..00000000
--- a/src/pfupdates/testupdate.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * \copyright Copyright (c) Dept. Phys., Univ. Tokyo
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at https://mozilla.org/MPL/2.0/.
- */
-#include "updated_tdi.tcc"
-#include <cstdio>
-#include <chrono>
-#include <vector>
-
-struct info_t {
-    double err;
-    long long tim;
-};
-
-template <typename T>
-info_t test_tdi_update(dim_t nsite, std::vector<dim_t> cfg, std::vector<dim_t> from, std::vector<dim_t> to)
-{
-    dim_t nelec = cfg.size();
-    dim_t mmax = from.size();
-    T err = 0;
-    long long tim;
-
-    T *orb__ = new T[nsite * nsite];
-    colmaj<T> orb_(orb__, nsite);
-    orbital_mat<T> orb(BLIS_UPPER, nsite, orb_);
-    orb.randomize(0.6, 1234);
-
-    T *mat_ = new T[nelec * nelec];
-    updated_tdi<T> tdi(orb, cfg, mat_, nelec, mmax);
-
-    for (dim_t i = 0; i < mmax; ++i) {
-        if (from.at(i) < 0)
-            tdi.pop_update(true);
-        else
-            tdi.push_update_safe(to.at(i), from[i], true);
-
-        std::vector<dim_t> cfg2(tdi.elem_cfg);
-        // Apply hopping.
-        for (dim_t j = 0; j < tdi.from_idx.size(); ++j)
-          cfg2.at(tdi.from_idx.at(j)) = tdi.to_site.at(j);
-
-        T *mat2_ = new T[nelec * nelec];
-        T pfa2 = updated_tdi<T>(orb, cfg2, mat2_, nelec, 1).Pfa;
-
-        double err_ = std::abs(pfa2 - tdi.Pfa * tdi.PfaRatio);
-        printf("Pfa = %e; Pfa diff = %e\n", tdi.Pfa, err_);
-        err += err_;
-
-        delete[] mat2_;
-    }
-
-    delete[] mat_;
-    delete[] orb__;
-
-    return info_t{ err, tim };
-}
-
-int main(void)
-{
-    test_tdi_update<double>(200,
-        std::vector<dim_t>{ 8, 63, 20, 53, 57, 14, 7, 22, 32, 67, 56, 66, 46, 16, 18, 26, 70, 28, 51, 64, 17, 68, 49, 4, 48, 41, 43, 21, 19, 45, 58, 11, 1, 24, 2, 69, 12, 23, 30, 10, 65, 3, 31, 55, 52, 37, 42, 34, 25, 60, 59, 50, 40, 9, 44, 54, 6, 36, 13, 29, 27, 38, 61, 33, 47, 5, 15, 39, 62, 35 },
-        std::vector<dim_t>{  0,  39,  49,  10,  60,   3 },
-        std::vector<dim_t>{ 99, 101, 120, 199, 133, 177 });
-
-    return 0;
-}
-
diff --git a/src/pfupdates/updated_tdi.tcc b/src/pfupdates/updated_tdi.tcc
index 40a338af..5eda1701 100644
--- a/src/pfupdates/updated_tdi.tcc
+++ b/src/pfupdates/updated_tdi.tcc
@@ -6,9 +6,7 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "orbital_mat.tcc"
-#include "blalink.hh"
-#include "blalink_gemmt.hh"
+#include "config_manager.tcc"
 #include "pfaffian.tcc"
 #include "invert.tcc"
 #include "skr2k.tcc"
@@ -17,52 +15,72 @@
 #include <iostream>
 #include <vector>
 
-template <typename T> struct updated_tdi {
-  orbital_mat<T> &Xij;
-  uplo_t uplo; ///< Can be switched only with update_uplo().
-  const dim_t nelec;
-  const dim_t mmax;
-  dim_t nq_updated; // Update of Q(:, i) can be delayed against U.
-
-  // Performance tuning constants.
+namespace vmc
+{
+namespace orbital
+{
+template <typename orbital_t_,
+          typename amp_t_ = typename orbital_t_::T>
+struct updated_tdi {
+  using orbital_t = orbital_t_;
+  using index_t = vmc::config_manager::index_t;
+  using amp_t = amp_t_;
+  using T = typename orbital_t::T;
+  using invmat_t = typename orbital_t::matrix_t; ///< Just for mVMC's C wrapper. Mathematically not so good.
+  using matrix_t = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+  using submat_t = Eigen::Block<matrix_t, Eigen::Dynamic, Eigen::Dynamic, true>;
+  using vector_t = Eigen::Matrix<T, Eigen::Dynamic, 1>;
+  using idxvec_t = Eigen::VectorXi;
+
+  orbital_t &Xij;
+  const index_t nelec; ///< Total Fermionic number / Matrix size.
+  const index_t mmax;  ///< Max number of updates contained.
   const bool single_hop_alpha; ///< Whether to simplify k=1 situations.
 
+private:
+  char uplo; ///< Can be switched only with update_uplo().
+  index_t nq_updated; // Update of Q(:, i) can be delayed against U.
+
   // Matrix M.
-  matrix_t<T> M;
+  invmat_t &M; ///< (nelec, nelec)
 
   // Updator blocks U, inv(M)U and inv(M)V.
   // Note that V is not stored.
-  matrix_t<T> U; ///< Serves also as B*inv
-  matrix_t<T> Q; ///< Contains the whole B buffer. Assert![ Q = M U ]
-  matrix_t<T> P; ///< Q(0, mmax)
+  matrix_t U; ///< Serves also as B*inv
+  matrix_t Q; ///< Contains the whole B buffer. Assert![ Q = M U ]
+  // This member causes pointer problem on mixed precision.
+  // (Which was fixed by constructing Block objects on-the-fly.)
+  // TODO: Check its reason.
+  // submat_t P; ///< &Q(0, mmax).
 
   // Central update buffer and its blocks.
-  matrix_t<T> W;
-  matrix_t<T> UMU, UMV, VMV;
-  matrix_t<T> Cp;       ///< C+BMB = [ W -I; I 0 ] + [ UMU UMV; -UMV VMV ]
-  matrix_t<T> Gc;       ///< Used to be Gaussian vectors. Now just scratchpads.
-  signed *const cPov; ///< Pivot when tri-diagonalizing C.
-  T Pfa;
-  T PfaRatio; //< Updated Pfaffian / Base Pfaffian.
-  // TODO: Maybe one should log all Pfaffian histories.
-
-  std::vector<dim_t> elem_cfg;
-  std::vector<dim_t> from_idx;
-  std::vector<dim_t> to_site;
+  matrix_t W;
+  matrix_t UMU, UMV, VMV;
+  matrix_t Cp;       ///< C+BMB = [ W -I; I 0 ] + [ UMU UMV; -UMV VMV ]
+  matrix_t Gc;       ///< Used to be Gaussian vectors. Now just scratchpads.
+  idxvec_t cPiv;     ///< Pivot when tri-diagonalizing C.
+  amp_t Pfa;
+  amp_t PfaRatio; //< Updated Pfaffian / Base Pfaffian.
+  // TODO: Maybe one should log partially the computed Pfaffian history.
+
+  // Configuration.
+  vmc::config_manager elem_cfg;
+
+public:
 
   void initialize() {
     using namespace std;
-    auto &cfg = elem_cfg;
-    matrix_t<T> G(new T[nelec * nelec], nelec);
-    from_idx.clear();
-    to_site.clear();
-    from_idx.reserve(mmax * sizeof(dim_t));
-    to_site.reserve(mmax * sizeof(dim_t));
+    elem_cfg.merge_config();
+    elem_cfg.reserve_space(mmax);
+    const auto &cfg = elem_cfg.config_base();
+    matrix_t G(nelec, nelec);
+    nq_updated = 0;
 
     switch (uplo) {
-    case BLIS_UPPER:
-      for (dim_t j = 0; j < nelec; ++j) {
-        for (dim_t i = 0; i < j; ++i) {
+    case 'U':
+    case 'u':
+      for (index_t j = 0; j < nelec; ++j) {
+        for (index_t i = 0; i < j; ++i) {
           M(i, j) = Xij(cfg.at(i), cfg.at(j));
         }
         M(j, j) = T(0.0);
@@ -70,104 +88,103 @@ template <typename T> struct updated_tdi {
       break;
 
     default:
-      cerr << "updated_tdi<T>: BLIS_LOWER is not supported. The error is fetal."
+      cerr << "updated_tdi: Lower triangular is not implemented yet."
            << endl;
     }
 
     // Allocate scratchpad.
-    T *vT = new T[nelec-1];
-    signed *iPiv = new signed[nelec];
+    vector_t vT(nelec - 1);
+    idxvec_t iPiv(nelec);
+    l2e::le_mat_t<T> M_(M);
 
     // Perform LTL factorization regardless of `uplo`.
-    signed info = sktrf(uplo, nelec, &M(0, 0), M.ld, iPiv, &G(0, 0), G.ld * nelec);
-    Pfa = ltl2pfa(uplo, nelec, &M(0, 0), M.ld, iPiv);
-    ltl2inv(uplo, nelec, &M(0, 0), M.ld, iPiv, vT, &G(0, 0), G.ld);
-#ifdef _DEBUG
-    cerr << "SKPFA+INV: n=" << nelec << " info=" << info << endl;
-#endif
-
-    delete[] vT;
-    delete[] iPiv;
-    delete[](&G(0, 0));
+    signed info = sktrf(uplo, 'N', M_, &iPiv(0), &G(0, 0), G.size());
+    Pfa = ltl2pfa<matrix_t, idxvec_t, amp_t>(uplo, M, iPiv);
+    PfaRatio = 1.0; // Reset Pfaffian ratio.
+    ltl2inv(uplo, M, iPiv, vT, G);
   }
 
-  ~updated_tdi() {
-    delete[](&U(0, 0));
-    delete[](&Q(0, 0));
-
-    delete[](&Cp(0, 0));
-    delete[](&Gc(0, 0));
+  updated_tdi(orbital_t &Xij_, invmat_t &M_, index_t nelec_, index_t mmax_, std::vector<index_t> cfg)
+      : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), single_hop_alpha(true),
+        M(M_), U(nelec, mmax * 2), Q(nelec, mmax * 2),
+        W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax),
+        // Cp, Gc and cPiv are dynamically allocated.
+        Pfa(0.0), PfaRatio(1.0), elem_cfg(Xij_.norb()/2, cfg), uplo('U')
+  { initialize(); }
 
-    delete[](&W(0, 0));
-    delete[](&UMU(0, 0));
-    delete[](&UMV(0, 0));
-    delete[](&VMV(0, 0));
-
-    delete[] cPov;
+  // Unsafe construction without initializaion.
+  updated_tdi(orbital_t &Xij_, invmat_t &M_, index_t nelec_, index_t mmax_)
+      : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), single_hop_alpha(true),
+        M(M_), U(nelec, mmax * 2), Q(nelec, mmax * 2),
+        W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax),
+        Pfa(0.0), PfaRatio(1.0), elem_cfg(Xij_.norb()/2), uplo('U') { }
+
+  // Copy construction could be unsafe since it does not always contain initializaion.
+  // M is automatically allocated in this case.
+  updated_tdi(const updated_tdi<orbital_t> &tdi_) = delete;
+    /*
+      : Xij(tdi_.Xij), nelec(tdi_.nelec), mmax(tdi_.mmax), nq_updated(0), single_hop_alpha(true),
+        M(new invmat_t(...)), U(nelec, mmax * 2), Q(nelec, mmax * 2),
+        W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax),
+        Pfa(0.0), PfaRatio(1.0), elem_cfg(tdi_.Xij.norb()/2, tdi_.get_config()), uplo('U') {
+    if (elem_cfg.config().size() == nelec &&
+        elem_cfg.config(0) != elem_cfg.config(1) && // Short-circuit complicated check.
+        is_perm(elem_cfg.config()))
+      initialize();
+  } */
+
+  bool is_perm(const vmc::config_manager::base_t &cfg) const
+  {
+    std::vector<bool> occupied(Xij.norb(), false);
+    for (const auto &xi : cfg) {
+      if (xi > Xij.norb())
+        return false;
+      if (occupied.at(xi))
+        return false;
+      occupied.at(xi) = true;
+    }
+    return true;
   }
 
-  updated_tdi(orbital_mat<T> &Xij_, std::vector<dim_t> &cfg, T *M_, inc_t ldM,
-              dim_t mmax_)
-      : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0),
-        single_hop_alpha(true), M(M_, ldM),
-        U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec),
-        P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax),
-        UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax),
-        VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax),
-        Gc(new T[2 * mmax * 2 * mmax], 2 * mmax),
-        cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(cfg),
-        from_idx(0), to_site(0), uplo(BLIS_UPPER) {
-    initialize();
+  amp_t get_amplitude() { return Pfa * get_amplitude_ratio(); }
+  amp_t get_amplitude_ratio() {
+    if (PfaRatio == 0.0)
+      update_pfaffian_ratio();
+    return PfaRatio;
   }
+  int max_updates() const { return mmax; }
+  int num_updates() const { return elem_cfg.from_idx().size(); }
 
-  /*
-  updated_tdi(orbital_mat<T> &Xij_, std::vector<dim_t> &cfg, matrix_t<T> &M_,
-              dim_t mmax_)
-      : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0),
-        single_hop_alpha(true), M(M_),
-        U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec),
-        P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax),
-        UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax),
-        VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax),
-        Gc(new T[2 * mmax * 2 * mmax], 2 * mmax),
-        cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(cfg),
-        from_idx(0), to_site(0), uplo(BLIS_UPPER) {
-    initialize();
-  }*/
+  const vmc::config_manager::base_t get_config() const
+  { return elem_cfg.config(); }
 
-  // Unsafe construction without initializaion.
-  updated_tdi(orbital_mat<T> &Xij_, dim_t nelec_, T *M_, inc_t ldM, dim_t mmax_) 
-      : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0),
-        single_hop_alpha(true), M(M_, ldM),
-        U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec),
-        P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax),
-        UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax),
-        VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax),
-        Gc(new T[2 * mmax * 2 * mmax], 2 * mmax),
-        cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(nelec, 0),
-        from_idx(0), to_site(0), uplo(BLIS_UPPER) { }
-
-  T get_Pfa() { return Pfa * PfaRatio; }
+  const vmc::config_manager::base_t &get_config_base() const
+  { return elem_cfg.config_base(); }
+
+  const vmc::config_manager &get_config_manager() const
+  { return elem_cfg; }
+
+  void attach_config(const vmc::config_manager::base_t &cfg)
+  {
+    assert_(cfg.size() == nelec, typeid(*this).name(), "Invalid config feeded.");
+    elem_cfg.attach_config(cfg);
+  }
 
   void assemble_C_BMB() {
     using namespace std;
-    dim_t k = from_idx.size();
+    using namespace Eigen;
+    index_t k = elem_cfg.from_idx().size();
+    if (!k)
+      return;
 
     // Assemble (half of) C+BMB buffer.
+    Cp = matrix_t(2 * k, 2 * k);
     switch (uplo) {
-    case BLIS_UPPER:
-      for (dim_t j = 0; j < k; ++j)
-        for (dim_t i = 0; i < j; ++i)
-          Cp(i, j) = W(i, j) + UMU(i, j);
-      for (dim_t j = 0; j < k; ++j) {
-        for (dim_t i = 0; i < k; ++i)
-          Cp(i, j + k) = +UMV(i, j);
-
-        Cp(j, j + k) -= T(1.0);
-      }
-      for (dim_t j = 0; j < k; ++j)
-        for (dim_t i = 0; i < j; ++i)
-          Cp(i + k, j + k) = VMV(i, j);
+    case 'U':
+    case 'u':
+      // Left-lower block skipped due to uplo='U'.
+      Cp << W(seq(0, k-1), seq(0, k-1)) + UMU(seq(0, k-1), seq(0, k-1)), UMV(seq(0, k-1), seq(0, k-1)) - matrix_t::Identity(k, k),
+            matrix_t::Zero(k, k),                                        VMV(seq(0, k-1), seq(0, k-1));
       break;
 
     default:
@@ -177,12 +194,15 @@ template <typename T> struct updated_tdi {
   }
 
   // Update Q rows buffer.
-  bool require_Q(bool all) {
-    const dim_t k = from_idx.size();
-    const dim_t n = nelec;
-    dim_t k_cal;
+  bool require_Q(bool require_all) {
+    using namespace Eigen;
+    using namespace l2e;
+
+    const index_t k = elem_cfg.from_idx().size();
+    const index_t n = nelec;
+    index_t k_cal;
     // Require all K columns instead of first K-1.
-    if (all)
+    if (require_all)
       k_cal = k;
     else
       k_cal = k - 1;
@@ -193,12 +213,12 @@ template <typename T> struct updated_tdi {
 
     for (; nq_updated < k_cal; ++nq_updated) {
       // Update single column. Use SKMV.
-      skmv(uplo, n, T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), &Q(0, nq_updated));
+      Q(all, nq_updated) = skmv(uplo, T(1.0), M, U(all, nq_updated));
     } /* else {
       // Update multiple columns. Use SKMM.
-      skmm(BLIS_LEFT, uplo, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, n, k_cal - nq_updated,
-           T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), U.ld,
-           T(0.0), &Q(0, nq_updated), Q.ld);
+      skmm('L', uplo, 'N', 'N',
+           T(1.0), le_mat_t<T>(M), le_mat_t<T>U(all, seq(nq_updated, k_cal-1)),
+           T(0.0), le_mat_t<T>(Q(all, seq(nq_updated, k_cal-1))));
     }
     nq_updated = k_cal;
     */
@@ -208,18 +228,22 @@ template <typename T> struct updated_tdi {
   // UMU needs to be recalculated for hopping change.
   // TODO: Find some way to avoid recalculating UQ?
   bool require_UMU() {
-    const dim_t k = from_idx.size();
-    const dim_t n = nelec;
+    using namespace Eigen;
 
-    for (dim_t o = 0; o < k; ++o)
-      for (dim_t l = 0; l < o; ++l)
+    const index_t k = elem_cfg.from_idx().size();
+    const index_t n = nelec;
+
+    for (index_t o = 0; o < k; ++o)
+      for (index_t l = 0; l < o; ++l)
         switch (uplo) {
-        case BLIS_UPPER:
-          UMU(l, o) = -dot(n, &U(0, o), 1, &Q(0, l), 1);
+        case 'U':
+        case 'u':
+          UMU(l, o) = -(U(all, o).transpose() * Q(all, l))(0, 0);
           break;
-        case BLIS_LOWER:
+        case 'L':
+        case 'l':
           // Always assume l < o to calculate one less column of Q.
-          UMU(o, l) = dot(n, &U(0, o), 1, &Q(0, l), 1);
+          UMU(o, l) = (U(all, o).transpose() * Q(all, l))(0, 0);
           break;
         default:
           break;
@@ -227,101 +251,95 @@ template <typename T> struct updated_tdi {
     return true;
   }
 
-  // Inverts a configuration from fermion index to site index.
-  void config_to_site(std::vector<dim_t> &cfg, std::vector<signed> &site, bool init) {
-    using namespace std;
-
-    if (site.size() != Xij.nsite || cfg.size() != nelec) {
-      cerr << "updated_tdi<T>::inv_config:"
-           << " Invalid config / invert config buffer." << endl;
-      return;
-    }
-
-    if (init)
-      for (dim_t i = 0; i < site.size(); ++i)
-        site.at(i) = -1;
-    for (dim_t i = 0; i < cfg.size(); ++i)
-      site.at(cfg.at(i)) = i;
-
-    return;
-  }
-
   // Update osi <- os[msj].
   // i.e. c+_i c_{x_j}.
-  void push_update(dim_t osi, dim_t msj, bool compute_pfa) {
+  void push_update(index_t osi, index_t msj, bool compute_pfa) {
     using namespace std;
+    using namespace Eigen;
 
     // This is the k-th hopping.
-    dim_t n = nelec;
-    dim_t k = from_idx.size();
-    dim_t osj = elem_cfg.at(msj);
+    index_t n = nelec;
+    index_t k = elem_cfg.from_idx().size();
+    index_t osj = elem_cfg.config_base(msj);
 
-    // TODO: Check bounds, duplicates, etc.
-    from_idx.push_back(msj);
-    to_site.push_back(osi);
+    elem_cfg.push_update(osi, msj);
 
-    // TODO: Check for hopping-backs. 
+    // TODO: Check for hopping-backs.
     // This can only be handled by cancellation.
     // Singularity will emerge otherwise.
 
-    // Speed-up lookup of Xij.
-    std::vector<signed> sitecfg(Xij.nsite, -1);
-    config_to_site(elem_cfg, sitecfg, false);
-    for (dim_t i = 0; i < Xij.nsite; ++i)
-      // U(i, k) = Xij(elem_cfg.at(i), osi) - Xij(elem_cfg.at(i), osj);
-      if (sitecfg.at(i) >= 0)
-        // Direct access: special case when installed into VMC.
-        U(sitecfg.at(i), k) = Xij.X(i, osi) - Xij.X(i, osj);
+    // TODO: Speed-up lookup of Xij.
+    for (index_t i = 0; i < n; ++i)
+      U(i, k) = Xij(elem_cfg.config_base(i), osi) - Xij(elem_cfg.config_base(i), osj);
+    // for (index_t i = 0; i < Xij.norb(); ++i)
+    //   if (elem_cfg.inv(i) >= 0)
+    //     U(elem_cfg.inv(i), k) = Xij(i, osi) - Xij(i, osj);
+
+    // P matrix from latter half of Q (or rigorously speaking, B matrix).
+    submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1)));
 
-    skslc(uplo, n, msj, &P(0, k), &M(0, 0), M.ld);
+    P(all, k) = skslc(uplo, msj, M);
 
     // Updated the already logged U.
-    for (dim_t l = 0; l < k; ++l) {
-      dim_t msl = from_idx.at(l);
+    for (index_t l = 0; l < k; ++l) {
+      index_t msl = elem_cfg.from_idx().at(l);
       T U_jl_ = U(msj, l); ///< Backup this value before change.
 
       // Write updates.
-      U(msl, k) = Xij(to_site.at(l), osi) - Xij(elem_cfg.at(msl), osj);
-      U(msj, l) = Xij(osi, to_site.at(l)) - Xij(osj, elem_cfg.at(msl));
+      U(msl, k) = Xij(elem_cfg.to_orb().at(l), osi) - Xij(elem_cfg.config_base(msl), osj);
+      U(msj, l) = Xij(osi, elem_cfg.to_orb().at(l)) - Xij(osj, elem_cfg.config_base(msl));
 
       // Change of Q[:, l] induced by change of U[msj, l];
       // Utilize that P[:, k] = M[:, msj] ///< M is skew-symmetric.
-      axpy(n, U(msj, l) - U_jl_, &P(0, k), 1, &Q(0, l), 1);
+      Q(all, l) += P(all, k) * (U(msj, l) - U_jl_);
 
       // Change of UMV[l, k] induced by change of U[msj, l].
-      for (dim_t o = 0; o < k; ++o)
+      for (index_t o = 0; o < k; ++o)
         UMV(l, o) += (U(msj, l) - U_jl_) * P(msj, o);
     }
 
     // Update UMV and VMV. UMU is updated elsewhere.
     switch (uplo) {
-    case BLIS_UPPER:
-      for (dim_t l = 0; l < k; ++l)
-        W(l, k) = -Xij(to_site.at(l), osi) + Xij(elem_cfg.at(from_idx.at(l)), osj);
-
-      for (dim_t l = 0; l < k; ++l) {
-        UMV(l, k) = dot(n, &U(0, l), 1, &P(0, k), 1);
-        UMV(k, l) = dot(n, &U(0, k), 1, &P(0, l), 1);
+    case 'U':
+    case 'u':
+      for (index_t l = 0; l < k; ++l)
+        W(l, k) = -Xij(elem_cfg.to_orb(l), osi) + Xij(elem_cfg.config_base(elem_cfg.from_idx(l)), osj);
+
+      for (index_t l = 0; l < k; ++l) {
+        UMV(l, k) = (U(all, l).transpose() * P(all, k))(0, 0);
+        UMV(k, l) = (U(all, k).transpose() * P(all, l))(0, 0);
       }
-      UMV(k, k) = dot(n, &U(0, k), 1, &P(0, k), 1);
-      for (dim_t l = 0; l < k; ++l)
+      UMV(k, k) = (U(all, k).transpose() * P(all, k))(0, 0);
+      for (index_t l = 0; l < k; ++l)
         VMV(l, k) /* -VMV(k, l) */ = -P(msj, l); ///< P(from_idx.at(l), k);
       break;
 
     default:
-      cerr << "updated_tdi<T>::push_update:"
+      cerr << "updated_tdi::push_update:"
            << " Only upper-triangular storage is supported." << endl;
     }
 
     if (compute_pfa) {
-      // NOTE: Update k to be new size.
-      k += 1;
+      // NOTE: Update k to be new size?
+      // k += 1;
 
-      // Calculate unupdated columns of U.
+      update_pfaffian_ratio();
+    } else
+      // Set to 0.0 to denote dirty.
+      PfaRatio = 0.0;
+  }
+
+  void update_pfaffian_ratio(void)
+  {
+    index_t k = elem_cfg.from_idx().size();
+    submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1)));
+    {
+      // All compute_pfa requires first K-1 of Q.
       require_Q(false);
 
+      // Recompute UMU.
       require_UMU();
-      // Assemble (half of) C+BMB buffer.
+      // Reassemble C and scratchpads.
       assemble_C_BMB();
 
       // If it's the first update Pfafian can be directly read out.
@@ -331,67 +349,76 @@ template <typename T> struct updated_tdi {
       }
 
       // Compute pfaffian.
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &PfaRatio, cPov, &Gc(0, 0), Gc.ld * 2 * k);
-#ifdef _DEBUG
-      cerr << "SKPFA: info=" << info << endl;
-#endif
+      l2e::le_mat_t<T> Cp_(Cp);
+      Gc = matrix_t::Zero(2*k, 2*k);
+      cPiv = idxvec_t::Zero(2*k);
+      // Use not skpfa<T> for mixed precision.
+      signed info = sktrf(uplo, 'P', Cp_, &cPiv(0), &Gc(0, 0), Gc.size());
+      PfaRatio = ltl2pfa<matrix_t, idxvec_t, amp_t>(uplo, Cp, cPiv);
+
       // Pfaffian of C = [ W -I; I 0 ].
       PfaRatio *= pow(-1.0, k * (k + 1) / 2);
+    }
+  }
 
-    } else
-      // Set to 0.0 to denote dirty.
-      PfaRatio = 0.0;
+  bool check_update_safety(const std::vector<index_t> &ms, bool merge_error = false)
+  {
+    if (elem_cfg.from_idx().size() + ms.size() > mmax)
+      return false;
+
+    // Check if alrady hopped out.
+    for (index_t l = 0; l < elem_cfg.from_idx().size(); ++l)
+      for (index_t j = 0; j < ms.size(); ++j)
+        if (ms.at(j) == elem_cfg.from_idx().at(l)) {
+          assert_(!merge_error, typeid(*this).name(), "Conflict on non-mergable push.");
+          return false;
+        }
+    return true;
   }
 
-  void push_update(dim_t osi, dim_t msj) { push_update(osi, msj, true); }
+  void push_update(index_t osi, index_t msj) { push_update(osi, msj, true); }
 
   // Check duplicate and push.
-  void push_update_safe(dim_t osi, dim_t msj, bool compute_pfa) {
-    if (from_idx.size() >= mmax)
+  void push_update_safe(index_t osi, index_t msj, bool compute_pfa, bool merge_error = false) {
+    if (!check_update_safety({ msj }))
       merge_updates();
 
-    // Check if already hopped out.
-    for (dim_t l = 0; l < from_idx.size(); ++l)
-      if (msj == from_idx.at(l)) {
-        merge_updates(); 
-        break;
-      }
     push_update(osi, msj, compute_pfa);
   }
 
-  void push_update_safe(dim_t osi, dim_t msj) {
+  void push_update_safe(index_t osi, index_t msj) {
     push_update_safe(osi, msj, true);
   }
 
   void pop_update(bool compute_pfa) {
     using namespace std;
+    using namespace Eigen;
 
     // Pop out.
-    dim_t msj_ = from_idx.at(from_idx.size() - 1);
-    dim_t osi_ = to_site.at(to_site.size() - 1);
-    dim_t osj_ = elem_cfg.at(msj_);
-    from_idx.pop_back();
-    to_site.pop_back();
+    index_t msj_ = elem_cfg.from_idx().back();
+    index_t osi_ = elem_cfg.to_orb().back();
+    index_t osj_ = elem_cfg.config_base(msj_);
+    elem_cfg.pop_update();
 
     // New update size.
-    dim_t k = from_idx.size();
+    index_t k = elem_cfg.from_idx().size();
+    submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1)));
 
     // Revert U[:, 1:k-1] update.
-    for (dim_t l = 0; l < k; ++l) {
-      dim_t msl = from_idx.at(l);
+    for (index_t l = 0; l < k; ++l) {
+      index_t msl = elem_cfg.from_idx(l);
       T U_jl_ = U(msj_, l);
 
       // Revert U rows. (final column is popped out).
-      U(msj_, l) = Xij(osj_, to_site.at(l)) - Xij(osj_, elem_cfg.at(msl));
+      U(msj_, l) = Xij(osj_, elem_cfg.to_orb(l)) - Xij(osj_, elem_cfg.config_base(msl));
 
       // Change of Q[:, l] induced by change of U[msj, l];
       T U_diff_msj = U(msj_, l) - U_jl_;
       switch (uplo) {
-      case BLIS_UPPER:
-        for (dim_t i = 0; i < msj_; ++i)
-          Q(i, l) += M(i, msj_) * U_diff_msj;
-        for (dim_t i = msj_ + 1; i < nelec; ++i)
-          Q(i, l) -= M(msj_, i) * U_diff_msj;
+      case 'U':
+      case 'u':
+        Q(seq(0, msj_-1), l) += M(seq(0, msj_-1), msj_) * U_diff_msj;
+        Q(seq(msj_+1, nelec-1), l) -= M(msj_, seq(msj_+1, nelec-1)) * U_diff_msj;
         break;
 
       default:
@@ -400,7 +427,7 @@ template <typename T> struct updated_tdi {
       }
 
       // Change of UMV[l, k] induced by change of U[msj, l].
-      for (dim_t o = 0; o < k; ++o)
+      for (index_t o = 0; o < k; ++o)
         UMV(l, o) += (U(msj_, l) - U_jl_) * P(msj_, o);
     }
     // Pop out outdated Q.
@@ -408,19 +435,10 @@ template <typename T> struct updated_tdi {
       nq_updated = k;
 
     // Compute new (previous, in fact) Pfaffian.
-    if (!k)
+    if (k == 0)
       PfaRatio = 1.0;
     else if (compute_pfa) {
-      // All compute_pfa requires first K-1 of Q.
-      require_Q(false);
-
-      // Recompute UMU.
-      require_UMU();
-      // Reassemble C and scratchpads.
-      assemble_C_BMB();
-
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &PfaRatio, cPov, &Gc(0, 0), Gc.ld * 2 * k);
-      PfaRatio *= pow(-1.0, k * (k + 1) / 2);
+      update_pfaffian_ratio();
 
     } else
       // Set to 0.0 to denote dirty.
@@ -430,20 +448,20 @@ template <typename T> struct updated_tdi {
   void merge_updates() {
     using namespace std;
 
-    dim_t n = nelec;
-    dim_t k = from_idx.size();
+    index_t n = nelec;
+    index_t k = elem_cfg.from_idx().size();
     if (k == 0)
       return;
 
     // Allocate scratchpad.
-    T *vT = new T[2 * k - 1];
+    vector_t vT(2 * k - 1);
 
     // Update whole Q.
     require_Q(true);
 
     if (k == 1) {
       // M is dirty.
-      if (PfaRatio == T(0.0)) {
+      if (PfaRatio == 0.0) {
         require_UMU();
         assemble_C_BMB();
         PfaRatio = -Cp(0, 1);
@@ -455,34 +473,38 @@ template <typename T> struct updated_tdi {
       // Redo the tridiagonal factorization.
       require_UMU();
       assemble_C_BMB();
-      signed info = sktrf(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov, &Gc(0, 0), Gc.ld * 2 * k);
+
+      l2e::le_mat_t<T> Cp_(Cp);
+      Gc = matrix_t::Zero(2*k, 2*k);
+      cPiv = idxvec_t::Zero(2*k);
+      signed info = sktrf(uplo, 'N', Cp_, &cPiv(0), &Gc(0, 0), 2*k * 2*k);
       // PfaRatio is dirty.
-      if (PfaRatio == T(0.0))
-        PfaRatio = ltl2pfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov) * T(pow(-1.0, k * (k + 1) / 2));
-      ltl2inv(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov, vT, &Gc(0, 0), Gc.ld);
+      if (PfaRatio == 0.0)
+        PfaRatio = ltl2pfa<matrix_t, idxvec_t, amp_t>(uplo, Cp, cPiv) *
+          T(pow(-1.0, k * (k + 1) / 2));
+      ltl2inv(uplo, Cp, cPiv, vT, Gc);
     }
-    inv_update(k, Cp);
+    assert_(Cp.rows() == 2 * k, typeid(*this).name(), "Cp is bad.");
+    inv_update(Cp);
 
     // Apply hopping.
-    for (int j = 0; j < k; ++j)
-      elem_cfg.at(from_idx.at(j)) = to_site.at(j);
-    from_idx.clear();
-    to_site.clear();
+    elem_cfg.merge_config();
     nq_updated = 0;
     Pfa *= PfaRatio;
     PfaRatio = 1.0;
-
-    delete[] vT;
   }
 
-  void inv_update(dim_t k, matrix_t<T> &C) {
-    matrix_t<T> ABC(&U(0, 0), U.ld); ///< Use U as inv(A)*B*upper(C) buffer.
-    matrix_t<T> AB(&Q(0, 0), Q.ld);
+  void inv_update(matrix_t &C) {
+    using namespace Eigen;
+    const index_t k = C.rows() / 2;
+    submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1)));
+
+    auto ABC = U(all, seq(0, 2 * k - 1)); ///< Use U as inv(A)*B*upper(C) buffer.
+    auto AB  = Q(all, seq(0, 2 * k - 1));
 
     if (k == 1 && single_hop_alpha) {
       // k == 1 requires no copying
-      skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 1, C(0, 1), &Q(0, 0), Q.ld,
-            &P(0, 0), P.ld, T(1.0), &M(0, 0), M.ld);
+      skr2k(uplo, 'N', C(0, 1), Q(all, 0), P(all, 0), T(1.0), M);
       // See below for reason of calling this procedule.
       // update_uplo(uplo);
       return;
@@ -490,39 +512,15 @@ template <typename T> struct updated_tdi {
 
     // Close empty space between Q and P.
     if (k != mmax)
-      for (dim_t j = 0; j < k; ++j)
-        memcpy(&AB(0, k + j), &P(0, j), nelec * sizeof(T));
-
-    # if 0
-    // Copy AB to ABC for TRMM interface.
-    for (dim_t j = 0; j < 2 * k - 1; ++j)
-      // inv(A)*U  [ 0 + + +
-      //             0 0 + +
-      //             0 0 0 +
-      //             0 0 0 0 ] => AB[:, 0:2] -> ABC[:, 1:3]
-      memcpy(&ABC(0, j + 1), &AB(0, j), nelec * sizeof(T));
-    trmm(BLIS_RIGHT, BLIS_UPPER, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0),
-         &C(0, 1), C.ld, &ABC(0, 1), ABC.ld);
-
-    // Update: write to M.
-    skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &ABC(0, 1), ABC.ld,
-          &AB(0, 1), AB.ld, T(1.0), &M(0, 0), M.ld);
-    #else
-    gemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE,
-         nelec, 2 * k, 2 * k,
-         T(1.0),
-         &AB(), AB.ld,
-         &C(), C.ld,
-         T(0.0),
-         &ABC(), ABC.ld);
-    gemmt(uplo, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE,
-          nelec, 2 * k,
-          T(1.0),
-          &ABC(), ABC.ld,
-          &AB(), AB.ld,
-          T(1.0),
-          &M(), M.ld);
-    #endif
+      for (index_t j = 0; j < k; ++j)
+        AB(all, k + j) = P(all, j);
+
+    l2e::le_mat_t<T> M_(M);
+    ABC = AB * C;
+    gemmt<T>(uplo, 'N', 'T', T(1.0),
+             l2e::le_mat_t<T>(ABC),
+             l2e::le_mat_t<T>(AB),
+             T(1.0), M_);
 
     // Identity update to complete antisymmetric matrix.
     // This called due to lack of skmm support at the moment.
@@ -532,15 +530,19 @@ template <typename T> struct updated_tdi {
   /**
    * Complete antisymmetric matrix.
    */
-  void skcomplete(uplo_t uplo_, dim_t n, matrix_t<T> &A) {
-    for (dim_t j = 0; j < n; ++j) {
-      for (dim_t i = 0; i < j; ++i) {
+  static void skcomplete(const char &uplo_, matrix_t &A) {
+    const index_t n = A.rows();
+
+    for (index_t j = 0; j < n; ++j) {
+      for (index_t i = 0; i < j; ++i) {
         switch (uplo_) {
-        case BLIS_UPPER:
+        case 'U':
+        case 'u':
           A(j, i) = -A(i, j);
           break;
 
-        case BLIS_LOWER:
+        case 'L':
+        case 'l':
           A(i, j) = -A(j, i);
           break;
 
@@ -552,15 +554,159 @@ template <typename T> struct updated_tdi {
     }
   }
 
-  void update_uplo(uplo_t uplo_new) {
-    skcomplete(uplo /* NOTE: old uplo */, nelec, M);
-    if (from_idx.size()) {
-      skcomplete(uplo, from_idx.size(), UMU);
-      skcomplete(uplo, from_idx.size(), VMV);
-      skcomplete(uplo, from_idx.size(), W);
+  void update_uplo(const char &uplo_new) {
+    using namespace Eigen;
+
+    skcomplete(uplo /* NOTE: Old uplo. */, M);
+    const int k = elem_cfg.from_idx().size();
+    if (k) {
+      skcomplete(uplo, UMU(seq(0, k-1), seq(0, k-1)));
+      skcomplete(uplo, VMV(seq(0, k-1), seq(0, k-1)));
+      skcomplete(uplo,   W(seq(0, k-1), seq(0, k-1)));
     }
 
     uplo = uplo_new;
   }
 
-}; 
+  Eigen::Vector<amp_t, Eigen::Dynamic> batch_query_amplitudes(int N, idxvec_t to_orbs, idxvec_t from_ids) {
+    using namespace Eigen;
+    using ampvec_t = Eigen::Vector<amp_t, Eigen::Dynamic>;
+
+    bool upper = (uplo == 'U' || uplo == 'u');
+    int k0 = from_ids.size();
+    int k1 = k0 / N;
+    assert_(k0 % N == 0, typeid(*this).name(), "Misaligned batch query.");
+    ampvec_t result(k1);
+
+    // Works w/ merged M only.
+    merge_updates();
+    // Operates on complete M.
+    skcomplete(uplo, M);
+
+    matrix_t UbB;
+    if (N > 1)
+      UbB = matrix_t::Zero(nelec, k1 * (N-1));
+    matrix_t UbF= matrix_t::Zero(nelec, k1);
+    matrix_t Pb = matrix_t::Zero(nelec, k0);
+
+    // Grouped comput. of Ub.
+    for (index_t l1 = 0; l1 < k1; ++l1) {
+      for (index_t l = l1 * N; l < (l1 + 1) * N; ++l)
+        elem_cfg.push_update(to_orbs(l), from_ids(l));
+
+      int osi, osj;
+
+      for (index_t l2 = 0; l2 < N-1; ++l2) {
+        index_t l = l1 * N + l2;
+        index_t lB = l1 * (N-1) + l2;
+
+        osi = to_orbs(l);
+        osj = elem_cfg.config_base(from_ids(l));
+
+        for (index_t i = 0; i < nelec; ++i) {
+          index_t osx = i == from_ids(l) ? elem_cfg.config_base(i) : elem_cfg.config(i);
+          UbB(i, lB) = Xij(osx, osi) - Xij(elem_cfg.config_base(i), osj);
+        }
+      }
+
+      // Final column in the grouped U.
+      {
+        index_t l = l1 * N + N - 1;
+        osi = to_orbs(l);
+        osj = elem_cfg.config_base(from_ids(l));
+
+        for (index_t i = 0; i < nelec; ++i) {
+          index_t osx = i == from_ids(l) ? elem_cfg.config_base(i) : elem_cfg.config(i);
+          UbF(i, l1) = Xij(osx, osi) - Xij(elem_cfg.config_base(i), osj);
+        }
+      }
+
+      for (index_t ll = 0; ll < N; ++ll)
+        elem_cfg.pop_update();
+    }
+
+    // Pb needs no grouping.
+    for (index_t l = 0; l < k0; ++l)
+      Pb(all, l) = M(all, from_ids(l));
+
+    matrix_t QbB;
+    if (N > 1)
+      // Require Q.
+      QbB = M * UbB;
+
+    // Batch assemble C.
+    matrix_t Cb = matrix_t::Zero(2*N, 2*N * k1);
+    for (index_t l1 = 0; l1 < k1; ++l1) {
+      auto CCur = Cb(all, seq(2*N * l1, 2*N * (l1+1) - 1));
+      auto UCur = UbB(all, seq((N-1) * l1, (N-1) * (l1+1) - 1));
+      auto QCur = QbB(all, seq((N-1) * l1, (N-1) * (l1+1) - 1));
+      auto PCur = Pb(all, seq(N * l1, N * (l1+1) - 1));
+
+      // UMU + W.
+      for (index_t o = 0; o < N-1; ++o)
+        for (index_t l = 0; l < o; ++l)
+          if (upper)
+            CCur(l, o) = -UCur(all, o).transpose() * QCur(all, l) -
+              Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) +
+              Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
+                  elem_cfg.config_base(from_ids(l1 * N + o)));
+          else
+            CCur(o, l) =  UCur(all, o).transpose() * QCur(all, l) -
+              Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) +
+              Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
+                  elem_cfg.config_base(from_ids(l1 * N + o)));
+      {
+        for (index_t l = 0; l < N-1; ++l)
+          if (upper)
+            CCur(l, N-1) = -UbF(all, l1).transpose() * QCur(all, l) -
+              Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) +
+              Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
+                  elem_cfg.config_base(from_ids(l1 * N + N-1)));
+          else
+            CCur(N-1, l) =  UbF(all, l1).transpose() * QCur(all, l) -
+              Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) +
+              Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
+                  elem_cfg.config_base(from_ids(l1 * N + N-1)));
+      }
+
+      // UMV
+      if (upper) {
+        if (N > 1)
+          CCur(seq(0, N - 2), seq(N, 2*N - 1)) = UCur.transpose() * PCur;
+        CCur(N - 1, seq(N, 2*N - 1)) = UbF(all, l1).transpose() * PCur;
+      } else
+        assert_(false, typeid(*this).name(), "Lower diag not implemented.");
+      // -I
+      CCur(seq(0, N - 1), seq(N, 2*N - 1)) -= matrix_t::Identity(N, N);
+
+      // VMV
+      for (index_t o = 0; o < N; ++o)
+        for (index_t l = 0; l < o; ++l)
+          if (upper)
+            CCur(N + l, N + o) = -PCur(from_ids(l1 * N + o), l);
+          else
+            CCur(N + o, N + l) =  PCur(from_ids(l1 * N + o), l);
+    }
+
+    // Batch Pfaffian call.
+    matrix_t Gc = matrix_t::Zero(2*N, 2*N);
+    idxvec_t cPiv = idxvec_t::Zero(2*N);
+    for (index_t l1 = 0; l1 < k1; ++l1) {
+      auto CCur = Cb(all, seq(2*N * l1, 2*N * (l1+1) - 1));
+      if (N == 1) {
+        result(l1) = -CCur(0, 1) + T(1.0);
+        continue;
+      }
+      l2e::le_mat_t<T> C_(CCur);
+      signed info = sktrf(uplo, 'P', C_, &cPiv(0), &Gc(0, 0), Gc.size());
+      result(l1) = ltl2pfa<matrix_t, idxvec_t, amp_t>(uplo, CCur, cPiv);
+    }
+    if (N > 1)
+      result = result * pow(-1.0, N * (N + 1) / 2);
+
+    return result;
+  }
+
+};
+}
+}

From 26ae54635beaa07e3b172dc4bcdbbd8fe9736637 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 14:54:55 +0900
Subject: [PATCH 04/39] Adapt mVMC for Modified PfUpdates Interface

---
 src/mVMC/vmcmake.c          | 14 ++++++++------
 src/mVMC/vmcmake_fsz.c      | 14 ++++++++------
 src/mVMC/vmcmake_fsz_real.c | 14 ++++++++------
 src/mVMC/vmcmake_real.c     | 14 ++++++++------
 4 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/src/mVMC/vmcmake.c b/src/mVMC/vmcmake.c
index 57ab2dd2..aa6e22ca 100644
--- a/src/mVMC/vmcmake.c
+++ b/src/mVMC/vmcmake.c
@@ -74,6 +74,8 @@ void VMCMakeSample(MPI_Comm comm) {
   // TODO: Compute from qpStart to qpEnd to support loop splitting.
   void *pfOrbital[NQPFull];
   void *pfUpdator[NQPFull];
+  void *pfMat[NQPFull];
+  void *pfMap[NQPFull];
   // Read block size from input.
   const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE");
   if (optBlockSize)
@@ -94,7 +96,7 @@ void VMCMakeSample(MPI_Comm comm) {
                        InvM, Nsize*Nsize,
                        TmpEleIdx, EleSpn,
                        NBlockUpdateSize,
-                       pfUpdator, pfOrbital);
+                       pfUpdator, pfOrbital, pfMat, pfMap);
   updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
   CalculateMAll_fcmp(TmpEleIdx,qpStart,qpEnd);
@@ -108,13 +110,13 @@ void VMCMakeSample(MPI_Comm comm) {
                       qpStart,qpEnd,comm);
 #ifdef _pf_block_update
     // Clear and reinitialize.
-    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize,
                          SlaterElm, Nsite2*Nsite2,
                          InvM, Nsize*Nsize,
                          TmpEleIdx, EleSpn,
                          NBlockUpdateSize,
-                         pfUpdator, pfOrbital);
+                         pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
     CalculateMAll_fcmp(TmpEleIdx,qpStart,qpEnd);
@@ -276,13 +278,13 @@ void VMCMakeSample(MPI_Comm comm) {
         StartTimer(34);
 #ifdef _pf_block_update
         // Clear and reinitialize.
-        updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+        updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize,
                              SlaterElm, Nsite2*Nsite2,
                              InvM, Nsize*Nsize,
                              TmpEleIdx, EleSpn,
                              NBlockUpdateSize,
-                             pfUpdator, pfOrbital);
+                             pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
         CalculateMAll_fcmp(TmpEleIdx,qpStart,qpEnd);
@@ -309,7 +311,7 @@ void VMCMakeSample(MPI_Comm comm) {
 
 #ifdef _pf_block_update
   // Free-up updator space.
-  updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+  updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
 #endif
 
   return;
diff --git a/src/mVMC/vmcmake_fsz.c b/src/mVMC/vmcmake_fsz.c
index 404318e4..801ba523 100644
--- a/src/mVMC/vmcmake_fsz.c
+++ b/src/mVMC/vmcmake_fsz.c
@@ -83,6 +83,8 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
   // TODO: Compute from qpStart to qpEnd to support loop splitting.
   void *pfOrbital[NQPFull];
   void *pfUpdator[NQPFull];
+  void *pfMat[NQPFull];
+  void *pfMap[NQPFull];
   // Read block size from input.
   const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE");
   if (optBlockSize)
@@ -100,7 +102,7 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
                        InvM, Nsize*Nsize,
                        TmpEleIdx, TmpEleSpn,
                        NBlockUpdateSize,
-                       pfUpdator, pfOrbital);
+                       pfUpdator, pfOrbital, pfMat, pfMap);
   updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
   CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -117,13 +119,13 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
 
 #ifdef _pf_block_update
     // Clear and reinitialize.
-    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize,
                          SlaterElm, Nsite2*Nsite2,
                          InvM, Nsize*Nsize,
                          TmpEleIdx, TmpEleSpn,
                          NBlockUpdateSize,
-                         pfUpdator, pfOrbital);
+                         pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
     CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -382,13 +384,13 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
         StartTimer(34);
 #ifdef _pf_block_update
         // Clear and reinitialize.
-        updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+        updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize,
                              SlaterElm, Nsite2*Nsite2,
                              InvM, Nsize*Nsize,
                              TmpEleIdx, TmpEleSpn,
                              NBlockUpdateSize,
-                             pfUpdator, pfOrbital);
+                             pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
         CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -426,7 +428,7 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
 
 #ifdef _pf_block_update
   // Free-up updator space.
-  updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+  updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
 #endif
 
   return;
diff --git a/src/mVMC/vmcmake_fsz_real.c b/src/mVMC/vmcmake_fsz_real.c
index b76e6505..f3dda1d6 100644
--- a/src/mVMC/vmcmake_fsz_real.c
+++ b/src/mVMC/vmcmake_fsz_real.c
@@ -79,6 +79,8 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) {
   // TODO: Compute from qpStart to qpEnd to support loop splitting.
   void *pfOrbital[NQPFull];
   void *pfUpdator[NQPFull];
+  void *pfMat[NQPFull];
+  void *pfMap[NQPFull];
   // TODO: Make it input parameter.
   if (NExUpdatePath == 0)
     NBlockUpdateSize = 4;
@@ -91,7 +93,7 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) {
                        InvM_real, Nsize*Nsize,
                        TmpEleIdx, TmpEleSpn,
                        NBlockUpdateSize,
-                       pfUpdator, pfOrbital);
+                       pfUpdator, pfOrbital, pfMat, pfMap);
   updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
   CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -108,13 +110,13 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) {
 
 #ifdef _pf_block_update
     // Clear and reinitialize.
-    updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+    updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize,
                          SlaterElm_real, Nsite2*Nsite2,
                          InvM_real, Nsize*Nsize,
                          TmpEleIdx, TmpEleSpn,
                          NBlockUpdateSize,
-                         pfUpdator, pfOrbital);
+                         pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
     CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -368,13 +370,13 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) {
         StartTimer(34);
 #ifdef _pf_block_update
         // Clear and reinitialize.
-        updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+        updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize,
                              SlaterElm_real, Nsite2*Nsite2,
                              InvM_real, Nsize*Nsize,
                              TmpEleIdx, TmpEleSpn,
                              NBlockUpdateSize,
-                             pfUpdator, pfOrbital);
+                             pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
         CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -412,7 +414,7 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) {
 
 #ifdef _pf_block_update
   // Free-up updator space.
-  updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+  updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
 #endif
 
   return;
diff --git a/src/mVMC/vmcmake_real.c b/src/mVMC/vmcmake_real.c
index 6c9bde7d..f567c04f 100644
--- a/src/mVMC/vmcmake_real.c
+++ b/src/mVMC/vmcmake_real.c
@@ -76,6 +76,8 @@ void VMCMakeSample_real(MPI_Comm comm) {
   // TODO: Compute from qpStart to qpEnd to support loop splitting.
   void *pfOrbital[NQPFull];
   void *pfUpdator[NQPFull];
+  void *pfMat[NQPFull];
+  void *pfMap[NQPFull];
   // Read block size from input.
   const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE");
   if (optBlockSize)
@@ -96,7 +98,7 @@ void VMCMakeSample_real(MPI_Comm comm) {
                        InvM_real, Nsize*Nsize,
                        TmpEleIdx, EleSpn,
                        NBlockUpdateSize,
-                       pfUpdator, pfOrbital);
+                       pfUpdator, pfOrbital, pfMat, pfMap);
   updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
   CalculateMAll_real(TmpEleIdx, qpStart, qpEnd);
@@ -110,13 +112,13 @@ void VMCMakeSample_real(MPI_Comm comm) {
                       qpStart, qpEnd, comm);
 #ifdef _pf_block_update
     // Clear and reinitialize.
-    updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+    updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize,
                          SlaterElm_real, Nsite2*Nsite2,
                          InvM_real, Nsize*Nsize,
                          TmpEleIdx, EleSpn,
                          NBlockUpdateSize,
-                         pfUpdator, pfOrbital);
+                         pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
     CalculateMAll_real(TmpEleIdx, qpStart, qpEnd);
@@ -283,13 +285,13 @@ void VMCMakeSample_real(MPI_Comm comm) {
         StartTimer(34);
 #ifdef _pf_block_update
         // Clear and reinitialize.
-        updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+        updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize,
                              SlaterElm_real, Nsite2*Nsite2,
                              InvM_real, Nsize*Nsize,
                              TmpEleIdx, EleSpn,
                              NBlockUpdateSize,
-                             pfUpdator, pfOrbital);
+                             pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
         CalculateMAll_real(TmpEleIdx, qpStart, qpEnd);
@@ -316,7 +318,7 @@ void VMCMakeSample_real(MPI_Comm comm) {
 
 #ifdef _pf_block_update
   // Free-up updator space.
-  updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+  updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
 #endif
 
   return;

From 01b2b6061dbfcdf76d37474ad0d8cd35786e0711 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 15:10:08 +0900
Subject: [PATCH 05/39] Fix CI

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 7d2322e3..e7b4a465 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -12,7 +12,7 @@ jobs:
       - name: apt
         run: |
           sudo apt update
-          sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev
+          sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev libeigen3-dev
 
       - name: pip
         run: |

From 75efd6954804c36ab4975b9664425f6cb88ee0c1 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 15:29:58 +0900
Subject: [PATCH 06/39] Fix CI

---
 .github/workflows/main.yml | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index e7b4a465..85f74a72 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -12,13 +12,19 @@ jobs:
       - name: apt
         run: |
           sudo apt update
-          sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev libeigen3-dev
+          sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev wget
 
       - name: pip
         run: |
           python -m pip install numpy
           python3 -m pip install numpy
 
+      - name: eigen
+        working-directory: ${{runner.workspace}}
+        run: |
+          wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz
+          tar -zxvf eigen-3.4.0.tar.gz
+
       - name: make workspace
         run: cmake -E make_directory ${{runner.workspace}}/build
 
@@ -26,7 +32,7 @@ jobs:
         working-directory: ${{runner.workspace}}/build
         shell: bash
         run: |
-          cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE
+          cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH=${{runner.workspace}}/eigen-3.4.0 -DEigen3_DIR=${{runner.workspace}}/eigen-3.4.0/cmake $GITHUB_WORKSPACE
 
       - name: build
         working-directory: ${{runner.workspace}}/build

From d50eac07207317dffe964798a55186ae6201526f Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 15:38:13 +0900
Subject: [PATCH 07/39] Fix CI

---
 .github/workflows/main.yml | 7 ++++++-
 src/ltl2inv/CMakeLists.txt | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 85f74a72..b2dcbf1e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -24,6 +24,11 @@ jobs:
         run: |
           wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz
           tar -zxvf eigen-3.4.0.tar.gz
+          mkdir eigen-build
+          cd eigen-build
+          cmake ../eigen-3.4.0
+          make -j4
+          sudo make install
 
       - name: make workspace
         run: cmake -E make_directory ${{runner.workspace}}/build
@@ -32,7 +37,7 @@ jobs:
         working-directory: ${{runner.workspace}}/build
         shell: bash
         run: |
-          cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH=${{runner.workspace}}/eigen-3.4.0 -DEigen3_DIR=${{runner.workspace}}/eigen-3.4.0/cmake $GITHUB_WORKSPACE
+          cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE
 
       - name: build
         working-directory: ${{runner.workspace}}/build
diff --git a/src/ltl2inv/CMakeLists.txt b/src/ltl2inv/CMakeLists.txt
index 04fe165e..1c704a0d 100644
--- a/src/ltl2inv/CMakeLists.txt
+++ b/src/ltl2inv/CMakeLists.txt
@@ -11,7 +11,7 @@ endif()
 include_directories(../pfapack/c_interface)
 
 # Require Eigen
-find_package(Eigen3 3.3 REQUIRED NO_MODULE)
+find_package(Eigen3 3.4 REQUIRED NO_MODULE)
 include_directories(${EIGEN3_INCLUDE_DIR})
 
 add_library(ltl2inv STATIC ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc)

From 5766c58ea33eeea0644f955fd202dbe4a74b7988 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 16:20:33 +0900
Subject: [PATCH 08/39] Upgrade BLIS

---
 .gitmodules | 2 +-
 src/blis    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 44e5b4ac..825489fb 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "src/blis"]
 	path = src/blis
-	url = https://github.com/xrq-phys/blis
+	url = https://github.com/flame/blis
 	branch = mvmc
 [submodule "src/pfapack"]
 	path = src/pfapack
diff --git a/src/blis b/src/blis
index a32257ee..aeb5f0cc 160000
--- a/src/blis
+++ b/src/blis
@@ -1 +1 @@
-Subproject commit a32257eeab2e9946e71546a05a1847a39341ec6b
+Subproject commit aeb5f0cc19665456e990a7ffccdb09da2e3f504b

From 8334613de8afe5c31d6ba0ea4784afd57240a427 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 18:09:36 +0900
Subject: [PATCH 09/39] Upgrade CI BLIS

Don't use the artifacts.
---
 .github/workflows/main.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index b2dcbf1e..8a67bd55 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -30,6 +30,20 @@ jobs:
           make -j4
           sudo make install
 
+      - name: blis
+        working-directory: ${{runner.workspace}}
+        run: |
+          mkdir blis
+          cd blis
+          git init
+          git remote add origin https://github.com/flame/blis
+          git fetch --depth 1 origin e3d352f1fcc93e6a46fde1aa4a7f0a18fb27bd42
+          git checkout FETCH_HEAD
+          ./configure -t none -p ${{runner.workspace}}/blis-inst x86_64
+          make -j8 install
+          cd ${{runner.workspace}}/blis-inst
+          tar -zcvf ${{runner.workspace}}/libblis_artifact.tar.gz include lib
+
       - name: make workspace
         run: cmake -E make_directory ${{runner.workspace}}/build
 
@@ -38,6 +52,9 @@ jobs:
         shell: bash
         run: |
           cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE
+          # Override downloaded artifact.
+          # TODO: Rebuild the artifacts.
+          mv ${{runner.workspace}}/libblis_artifact.tar.gz .
 
       - name: build
         working-directory: ${{runner.workspace}}/build

From 0375f1c889ace9ff7d86a87c44f55845869495e4 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 18:53:06 +0900
Subject: [PATCH 10/39] Fix: Use BLIS GEMMT

---
 CMakeLists.txt               |  1 +
 download_blis_artifact.cmake |  1 +
 mVMCconfig.sh                | 12 ++++++------
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3257d79d..1099e4ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,6 +137,7 @@ enable_language(CXX)
 set(CMAKE_CXX_STANDARD 11)
 
 if(Require_BLIS)
+  add_definitions(-D_BLIS)
   include("download_blis_artifact.cmake")
 else(Require_BLIS)
   # Use bundled blis.h
diff --git a/download_blis_artifact.cmake b/download_blis_artifact.cmake
index f6274bcc..ec9fc8a7 100644
--- a/download_blis_artifact.cmake
+++ b/download_blis_artifact.cmake
@@ -53,6 +53,7 @@ add_custom_target(blis_target
   DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/lib/libblis.a)
 add_custom_target(blis_include
   DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/include/blis/blis.h)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/blis)
 
 add_library(blis STATIC IMPORTED GLOBAL)
diff --git a/mVMCconfig.sh b/mVMCconfig.sh
index 0b7e1d62..cfc7453a 100755
--- a/mVMCconfig.sh
+++ b/mVMCconfig.sh
@@ -41,7 +41,7 @@ FFLAGS = -DNDEBUG -DFUJITSU -Kfast
 CFLAGS = -DNDEBUG -Ofast -fopenmp
 CXXFLAGS = -DNDEBUG -Ofast -fopenmp
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += -DBLAS_EXTERNAL
+CXXFLAGS += -DBLAS_EXTERNAL -D_BLIS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.so -SSL2 -lm -lpthread
@@ -125,7 +125,7 @@ FFLAGS = -O3
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3 -fPIC
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS +=
+CXXFLAGS += -D_BLIS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread
@@ -149,7 +149,7 @@ FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3 -fPIC
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS +=
+CXXFLAGS += -D_BLIS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread
@@ -173,7 +173,7 @@ FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL
+CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -D_BLIS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lm -lpthread
@@ -197,7 +197,7 @@ FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS +=
+CXXFLAGS += -D_BLIS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread
@@ -221,7 +221,7 @@ FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple.
+CXXFLAGS += -D_BLIS # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple.
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread

From 2687fd407c12931432c2b7c3af1394b6ad6b0494 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 01:38:04 +0900
Subject: [PATCH 11/39] First Step: Extract PfM from GF2 for FCMP Exchange Only

---
 src/mVMC/calham.c | 34 ++++++++++++++++++++++++++++++----
 src/mVMC/locgrn.c | 26 ++++++++++++++++++++++----
 2 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c
index d35445eb..0b1fb3b2 100644
--- a/src/mVMC/calham.c
+++ b/src/mVMC/calham.c
@@ -67,6 +67,11 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
   double complex *myBuffer;
   double complex myEnergy;
 
+  int *lazy_info = malloc(7 * sizeof(int) * NExchangeCoupling * 2);
+  double complex *lazy_ip = malloc(sizeof(double) * NExchangeCoupling * 2);
+  double complex *lazy_pfa = malloc(sizeof(double) * NExchangeCoupling * 2);
+  memset(lazy_info, 0, 7 * NExchangeCoupling * 2);
+
   RequestWorkSpaceThreadInt(Nsize+Nsite2+NProj);
   RequestWorkSpaceThreadComplex(NQPFull+2*Nsize);
   /* GreenFunc1: NQPFull, GreenFunc2: NQPFull+2*Nsize */
@@ -82,7 +87,7 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
                NCoulombInter, CoulombInter, ParaCoulombInter, NHundCoupling, HundCoupling, ParaHundCoupling, \
                NTransfer, Transfer, ParaTransfer, NPairHopping, PairHopping, ParaPairHopping, \
                NExchangeCoupling, ExchangeCoupling, ParaExchangeCoupling, NInterAll, InterAll, ParaInterAll, n0, n1) \
-  shared(eleCfg, eleProjCnt, eleIdx, eleNum) reduction(+:e)
+  shared(eleCfg, eleProjCnt, eleIdx, eleNum, lazy_info) reduction(+:e)
   {
     myEleIdx = GetWorkSpaceThreadInt(Nsize);
     myEleNum = GetWorkSpaceThreadInt(Nsite2);
@@ -158,9 +163,27 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
       ri = ExchangeCoupling[idx][0];
       rj = ExchangeCoupling[idx][1];
     
-      tmp =  GreenFunc2(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
-      tmp += GreenFunc2(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
-      myEnergy += ParaExchangeCoupling[idx] * tmp;
+      lazy_ip[idx * 2]     = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info + 7 * (idx * 2));
+      lazy_ip[idx * 2 + 1] = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info + 7 * (idx * 2 + 1));
+      if ( !lazy_info[7 * (idx * 2)     + 6] ) myEnergy += lazy_ip[idx * 2];
+      if ( !lazy_info[7 * (idx * 2 + 1) + 6] ) myEnergy += lazy_ip[idx * 2 + 1];
+    }
+    #pragma omp barrier
+    #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
+    for(idx=0;idx<NExchangeCoupling*2;idx++) {
+      int *lazy_info_loc = lazy_info + 7 * idx;
+      if ( lazy_info_loc[6] ) {
+        int msj = lazy_info_loc[2];
+        int mtl = lazy_info_loc[3];
+        rj = myEleIdx[msj];
+        rl = myEleIdx[mtl];
+        myEleIdx[msj] = lazy_info_loc[4];
+        myEleIdx[mtl] = lazy_info_loc[5];
+        CalculateNewPfMTwo_fcmp(lazy_info_loc[1], mtl/Ne, lazy_info_loc[0], msj/Ne, myBuffer, myEleIdx, 0, NQPFull, myBuffer+NQPFull);
+        myEnergy += CalculateIP_fcmp(myBuffer, 0, NQPFull, MPI_COMM_SELF) * lazy_ip[idx];
+        myEleIdx[msj] = rj;
+        myEleIdx[mtl] = rl;
+      }
     }
 
     /* Inter All */
@@ -182,6 +205,9 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
 
     e += myEnergy;
   }
+  free(lazy_info);
+  free(lazy_ip);
+  free(lazy_pfa);
 
   ReleaseWorkSpaceThreadInt();
   ReleaseWorkSpaceThreadComplex();
diff --git a/src/mVMC/locgrn.c b/src/mVMC/locgrn.c
index b1fdd7c5..2ba2e25e 100644
--- a/src/mVMC/locgrn.c
+++ b/src/mVMC/locgrn.c
@@ -73,10 +73,10 @@ double complex GreenFunc1(const int ri, const int rj, const int s, const double
 
 /* Calculate 2-body Green function <psi|CisAjsCktAlt|x>/<psi|x> */
 /* buffer size = NQPFull+2*Nsize */
-double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl,
+double complex GreenFunc2_(const int ri, const int rj, const int rk, const int rl,
                   const int s, const int t, const double complex ip,
                   int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,
-                  int *projCntNew, double complex *buffer) {
+                  int *projCntNew, double complex *buffer, int *lazy_buffer) {
   double complex z;
   int mj,msj,ml,mtl;
   int rsi,rsj,rtk,rtl;
@@ -145,8 +145,20 @@ double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl
   z = ProjRatio(projCntNew,eleProjCnt);
 
   /* calculate Pfaffian */
-  CalculateNewPfMTwo_fcmp(ml, t, mj, s, pfMNew, eleIdx, 0, NQPFull, bufV);
-  z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  if (!lazy_buffer) {
+    CalculateNewPfMTwo_fcmp(ml, t, mj, s, pfMNew, eleIdx, 0, NQPFull, bufV);
+    z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  } else {
+    // Save invocation hook instead of calling.
+    // REMEMBER THE ORDER OF S/T, ML, MJ.
+    lazy_buffer[0] = mj;
+    lazy_buffer[1] = ml;
+    lazy_buffer[2] = msj; //< to edit to ri, revert to rj
+    lazy_buffer[3] = mtl; //< to edit to rk, revert to rl
+    lazy_buffer[4] = ri;
+    lazy_buffer[5] = rk;
+    lazy_buffer[6] = 1; //< "is delayed" flag..
+  }
 
   /* revert hopping */
   eleIdx[mtl] = rl;
@@ -159,6 +171,12 @@ double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl
   return conj(z/ip);//TBC
 }
 
+double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl,
+                  const int s, const int t, const double complex ip,
+                  int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,
+                  int *projCntNew, double complex *buffer) {
+  return GreenFunc2_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, projCntNew, buffer, 0);
+}
 
 // ignore GreenFuncN: to be added
 

From 78d3c0c309c66d0afe90fe366ef6151dc1eebff7 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Fri, 10 Feb 2023 22:53:10 +0900
Subject: [PATCH 12/39] Fix Malloc/Memset/Conj Typos

---
 src/mVMC/calham.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c
index 0b1fb3b2..91e2fc8c 100644
--- a/src/mVMC/calham.c
+++ b/src/mVMC/calham.c
@@ -68,9 +68,9 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
   double complex myEnergy;
 
   int *lazy_info = malloc(7 * sizeof(int) * NExchangeCoupling * 2);
-  double complex *lazy_ip = malloc(sizeof(double) * NExchangeCoupling * 2);
-  double complex *lazy_pfa = malloc(sizeof(double) * NExchangeCoupling * 2);
-  memset(lazy_info, 0, 7 * NExchangeCoupling * 2);
+  double complex *lazy_ip = malloc(sizeof(double complex) * NExchangeCoupling * 2);
+  double complex *lazy_pfa = malloc(sizeof(double complex) * NExchangeCoupling * 2);
+  memset(lazy_info, 0, 7 * NExchangeCoupling * 2 * sizeof(int));
 
   RequestWorkSpaceThreadInt(Nsize+Nsite2+NProj);
   RequestWorkSpaceThreadComplex(NQPFull+2*Nsize);
@@ -180,7 +180,7 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
         myEleIdx[msj] = lazy_info_loc[4];
         myEleIdx[mtl] = lazy_info_loc[5];
         CalculateNewPfMTwo_fcmp(lazy_info_loc[1], mtl/Ne, lazy_info_loc[0], msj/Ne, myBuffer, myEleIdx, 0, NQPFull, myBuffer+NQPFull);
-        myEnergy += CalculateIP_fcmp(myBuffer, 0, NQPFull, MPI_COMM_SELF) * lazy_ip[idx];
+        myEnergy += conj(CalculateIP_fcmp(myBuffer, 0, NQPFull, MPI_COMM_SELF)) * lazy_ip[idx];
         myEleIdx[msj] = rj;
         myEleIdx[mtl] = rl;
       }

From d6ce91898803c5f31978f39283862125e028ff8e Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 11 Feb 2023 00:44:28 +0900
Subject: [PATCH 13/39] Env Adjustments

- __cplusplus
- Include in common hdr.
- Compile PfUpdates regardless of whether PFAFFIAN_BLOCKED
  This is consistent w/ traditional Makefile.
---
 CMakeLists.txt               | 8 ++++----
 src/mVMC/include/vmcmain.h   | 1 +
 src/mVMC/vmcmake.c           | 5 -----
 src/mVMC/vmcmake_real.c      | 5 -----
 src/pfupdates/pf_interface.h | 4 ++--
 5 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1099e4ca..fe3208ed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -146,11 +146,11 @@ endif(Require_BLIS)
 
 if(PFAFFIAN_BLOCKED)
   add_definitions(-D_pf_block_update)
-  add_subdirectory(src/pfupdates)
-  if(Require_BLIS)
-    add_dependencies(pfupdates blis_include)
-  endif(Require_BLIS)
 endif(PFAFFIAN_BLOCKED)
+add_subdirectory(src/pfupdates)
+if(Require_BLIS)
+  add_dependencies(pfupdates blis_include)
+endif(Require_BLIS)
 
 if (Document)
   add_subdirectory(doc)
diff --git a/src/mVMC/include/vmcmain.h b/src/mVMC/include/vmcmain.h
index 090c7b69..cf642973 100644
--- a/src/mVMC/include/vmcmain.h
+++ b/src/mVMC/include/vmcmain.h
@@ -72,6 +72,7 @@ extern int omp_get_thread_num(void);
 #else
  #include "../stcopt_pdposv.c"
 #endif
+#include "../../pfupdates/pf_interface.h"
 
 #include "../stcopt_cg.c"
 
diff --git a/src/mVMC/vmcmake.c b/src/mVMC/vmcmake.c
index aa6e22ca..349c1301 100644
--- a/src/mVMC/vmcmake.c
+++ b/src/mVMC/vmcmake.c
@@ -36,11 +36,6 @@ along with this program. If not, see http://www.gnu.org/licenses/.
 #include "splitloop.h"
 #include "qp.h"
 
-#ifdef _pf_block_update
-// Block-update extension.
-#include "../pfupdates/pf_interface.h"
-#endif
-
 void VMCMakeSample(MPI_Comm comm) {
   int outStep,nOutStep;
   int inStep,nInStep;
diff --git a/src/mVMC/vmcmake_real.c b/src/mVMC/vmcmake_real.c
index f567c04f..7c0d57c4 100644
--- a/src/mVMC/vmcmake_real.c
+++ b/src/mVMC/vmcmake_real.c
@@ -37,11 +37,6 @@ which follows "The BSD 3-Clause License".
 #include "splitloop.h"
 #include "vmcmake.h"
 
-#ifdef _pf_block_update
-// Block-update extension.
-#include "../pfupdates/pf_interface.h"
-#endif
-
 void VMCMakeSample_real(MPI_Comm comm) {
   int outStep, nOutStep;
   int inStep, nInStep;
diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h
index 903a90ef..62bcee3f 100644
--- a/src/pfupdates/pf_interface.h
+++ b/src/pfupdates/pf_interface.h
@@ -12,7 +12,7 @@
 #include "stdint.h"
 
 // Redefine double complex for (future) non-C99 compatibility.
-#ifndef _CC_IMPL
+#ifndef __cplusplus
 typedef _Complex double ccdcmplx;
 typedef _Complex float  ccscmplx;
 #else
@@ -118,7 +118,7 @@ GENDEF( ccdcmplx, z )
 
 #undef EXPANDNAME
 
-#ifdef _CC_IMPL
+#ifdef __cplusplus
 }
 #endif
 

From f345f2506887fadfa7335608ce9695ffc92fe01c Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 11 Feb 2023 20:28:46 +0900
Subject: [PATCH 14/39] Not-working Batch GF2

- OMP sync problem unresolved
- More problems pending
---
 src/mVMC/calham.c             |  72 +++++++++++++++--------
 src/mVMC/locgrn.c             |  20 +++----
 src/pfupdates/pf_interface.cc | 107 ++++++++++++++++++++++++++++++++--
 src/pfupdates/pf_interface.h  |  44 ++++++++++++++
 src/pfupdates/updated_tdi.tcc |  22 +++++--
 5 files changed, 219 insertions(+), 46 deletions(-)

diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c
index 91e2fc8c..a2e5c841 100644
--- a/src/mVMC/calham.c
+++ b/src/mVMC/calham.c
@@ -67,10 +67,16 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
   double complex *myBuffer;
   double complex myEnergy;
 
-  int *lazy_info = malloc(7 * sizeof(int) * NExchangeCoupling * 2);
-  double complex *lazy_ip = malloc(sizeof(double complex) * NExchangeCoupling * 2);
-  double complex *lazy_pfa = malloc(sizeof(double complex) * NExchangeCoupling * 2);
-  memset(lazy_info, 0, 7 * NExchangeCoupling * 2 * sizeof(int));
+  int *lazy_info = malloc(    sizeof(int) * NExchangeCoupling * 2 * 2);
+  int *lazy_rsi  = malloc(2 * sizeof(int) * NExchangeCoupling * 2);
+  int *lazy_msj  = malloc(2 * sizeof(int) * NExchangeCoupling * 2);
+  double complex *lazy_ip  = malloc(sizeof(double complex) * NExchangeCoupling * 2);
+  double complex *lazy_pfa = malloc(sizeof(double complex) * NExchangeCoupling * 2 * NQPFull);
+  memset(lazy_info,                          0, sizeof(int) * NExchangeCoupling * 2);
+  memset(lazy_info + NExchangeCoupling * 2, -1, sizeof(int) * NExchangeCoupling * 2);
+
+  for (int mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;
+  for (int mi=Ne;mi<Ne*2;mi++) EleSpn[mi] = 1;
 
   RequestWorkSpaceThreadInt(Nsize+Nsite2+NProj);
   RequestWorkSpaceThreadComplex(NQPFull+2*Nsize);
@@ -94,6 +100,20 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
     myProjCntNew = GetWorkSpaceThreadInt(NProj);
     myBuffer = GetWorkSpaceThreadComplex(NQPFull+2*Nsize);
 
+    void *pfOrbital[NQPFull];
+    void *pfUpdator[NQPFull];
+    void *pfMat[NQPFull];
+    void *pfMap[NQPFull];
+
+    // Attaching thread-private objects to thread-shared InvM.
+    // These objects no long need mutating states in this use. Just functor-like stuff.
+    updated_tdi_v_seq_init_precomp_z(NQPFull, Nsite, Nsite2, Nsize,
+                                     SlaterElm, Nsite2*Nsite2,
+                                     InvM, Nsize*Nsize,
+                                     eleIdx, EleSpn,
+                                     2 /* GF @ measure: 2 at max. */, PfM,
+                                     pfUpdator, pfOrbital, pfMat, pfMap);
+
     #pragma loop noalias
     for(idx=0;idx<Nsize;idx++) myEleIdx[idx] = eleIdx[idx];
     #pragma loop noalias
@@ -163,28 +183,29 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
       ri = ExchangeCoupling[idx][0];
       rj = ExchangeCoupling[idx][1];
     
-      lazy_ip[idx * 2]     = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info + 7 * (idx * 2));
-      lazy_ip[idx * 2 + 1] = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info + 7 * (idx * 2 + 1));
-      if ( !lazy_info[7 * (idx * 2)     + 6] ) myEnergy += lazy_ip[idx * 2];
-      if ( !lazy_info[7 * (idx * 2 + 1) + 6] ) myEnergy += lazy_ip[idx * 2 + 1];
+      int *lazy_info_even = lazy_info + idx * 2;
+      int *lazy_info_odd  = lazy_info + idx * 2 + 1;
+      int *lazy_rsi_even  = lazy_rsi + (idx * 2    ) * 2;
+      int *lazy_rsi_odd   = lazy_rsi + (idx * 2 + 1) * 2;
+      int *lazy_msj_even  = lazy_msj + (idx * 2    ) * 2;
+      int *lazy_msj_odd   = lazy_msj + (idx * 2 + 1) * 2;
+
+      lazy_ip[idx * 2]     = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_even, lazy_rsi_even, lazy_msj_even);
+      lazy_ip[idx * 2 + 1] = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_odd, lazy_rsi_odd, lazy_msj_odd);
+      if ( !*lazy_info_even ) myEnergy += lazy_ip[idx * 2];
+      if ( !*lazy_info_odd  ) myEnergy += lazy_ip[idx * 2 + 1];
     }
     #pragma omp barrier
+    updated_tdi_v_omp_proc_batch_greentwo_z(NQPFull, NExchangeCoupling*2,
+                                            lazy_info, lazy_info + NExchangeCoupling*2,
+                                            lazy_rsi, lazy_msj,
+                                            lazy_pfa,
+                                            pfUpdator, pfOrbital, pfMat, pfMap);
+    #pragma omp barrier
     #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
-    for(idx=0;idx<NExchangeCoupling*2;idx++) {
-      int *lazy_info_loc = lazy_info + 7 * idx;
-      if ( lazy_info_loc[6] ) {
-        int msj = lazy_info_loc[2];
-        int mtl = lazy_info_loc[3];
-        rj = myEleIdx[msj];
-        rl = myEleIdx[mtl];
-        myEleIdx[msj] = lazy_info_loc[4];
-        myEleIdx[mtl] = lazy_info_loc[5];
-        CalculateNewPfMTwo_fcmp(lazy_info_loc[1], mtl/Ne, lazy_info_loc[0], msj/Ne, myBuffer, myEleIdx, 0, NQPFull, myBuffer+NQPFull);
-        myEnergy += conj(CalculateIP_fcmp(myBuffer, 0, NQPFull, MPI_COMM_SELF)) * lazy_ip[idx];
-        myEleIdx[msj] = rj;
-        myEleIdx[mtl] = rl;
-      }
-    }
+    for(idx=0;idx<NExchangeCoupling*2;idx++)
+      if ( lazy_info[idx] )
+        myEnergy += conj(CalculateIP_fcmp(lazy_pfa + idx * NQPFull, 0, NQPFull, MPI_COMM_SELF)) * lazy_ip[idx];
 
     /* Inter All */
     #pragma omp for private(idx,ri,rj,s,rk,rl,t) schedule(dynamic) nowait
@@ -204,8 +225,13 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
     {StopTimer(72);}
 
     e += myEnergy;
+
+    // Freeing within OMP region is thread-private.
+    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
   }
   free(lazy_info);
+  free(lazy_rsi);
+  free(lazy_msj);
   free(lazy_ip);
   free(lazy_pfa);
 
diff --git a/src/mVMC/locgrn.c b/src/mVMC/locgrn.c
index 2ba2e25e..058962aa 100644
--- a/src/mVMC/locgrn.c
+++ b/src/mVMC/locgrn.c
@@ -76,7 +76,8 @@ double complex GreenFunc1(const int ri, const int rj, const int s, const double
 double complex GreenFunc2_(const int ri, const int rj, const int rk, const int rl,
                   const int s, const int t, const double complex ip,
                   int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,
-                  int *projCntNew, double complex *buffer, int *lazy_buffer) {
+                  int *projCntNew, double complex *buffer,
+                  int *lazy_info, int *lazy_rsi, int *lazy_msj) {
   double complex z;
   int mj,msj,ml,mtl;
   int rsi,rsj,rtk,rtl;
@@ -145,19 +146,16 @@ double complex GreenFunc2_(const int ri, const int rj, const int rk, const int r
   z = ProjRatio(projCntNew,eleProjCnt);
 
   /* calculate Pfaffian */
-  if (!lazy_buffer) {
+  if (!lazy_info) {
     CalculateNewPfMTwo_fcmp(ml, t, mj, s, pfMNew, eleIdx, 0, NQPFull, bufV);
     z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
   } else {
     // Save invocation hook instead of calling.
-    // REMEMBER THE ORDER OF S/T, ML, MJ.
-    lazy_buffer[0] = mj;
-    lazy_buffer[1] = ml;
-    lazy_buffer[2] = msj; //< to edit to ri, revert to rj
-    lazy_buffer[3] = mtl; //< to edit to rk, revert to rl
-    lazy_buffer[4] = ri;
-    lazy_buffer[5] = rk;
-    lazy_buffer[6] = 1; //< "is delayed" flag..
+    lazy_msj[0] = msj; //< to edit to ri, revert to rj
+    lazy_msj[1] = mtl; //< to edit to rk, revert to rl
+    lazy_rsi[0] = rsi;
+    lazy_rsi[1] = rtk;
+    lazy_info[0] = 1; //< "is delayed" flag..
   }
 
   /* revert hopping */
@@ -175,7 +173,7 @@ double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl
                   const int s, const int t, const double complex ip,
                   int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,
                   int *projCntNew, double complex *buffer) {
-  return GreenFunc2_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, projCntNew, buffer, 0);
+  return GreenFunc2_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, projCntNew, buffer, 0, 0, 0);
 }
 
 // ignore GreenFuncN: to be added
diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc
index fcdcfaca..26815109 100644
--- a/src/pfupdates/pf_interface.cc
+++ b/src/pfupdates/pf_interface.cc
@@ -7,14 +7,15 @@
 #include "orbital_mat.tcc"
 // In implementation, this should appear AFTER blis.h.
 #include "pf_interface.h"
+#include <omp.h>
 
 // Non-# Pragma support differs from compiler to compiler
 #if defined(__INTEL_COMPILER)
-#define OMP_PARALLEL_FOR_SHARED __pragma(omp parallel for default(shared))
+#define _ccPragma(_1) __pragma(_1)
 #elif defined(__GNUC__)
-#define OMP_PARALLEL_FOR_SHARED _Pragma("omp parallel for default(shared)")
+#define _ccPragma(_1) _Pragma(#_1)
 #else
-#error "Valid non-preprocessor _pragma() not found."
+#error "Valid non-preprocessor _Pragma() not found."
 #endif
 
 using namespace Eigen;
@@ -29,6 +30,46 @@ using matrix_t = Map<Matrix<T, Dynamic, Dynamic>, 0, OuterStride<> >;
 
 #define EXPANDNAME( funcname, cblachar ) funcname##_##cblachar
 
+#define GENIMPL( ctype, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_seq_init_precomp, cblachar ) \
+    ( uint64_t  num_qp, \
+      uint64_t  nsite, \
+      uint64_t  norbs, \
+      uint64_t  nelec, \
+      ctype    *orbmat_base, \
+      int64_t   orbmat_stride, \
+      ctype    *invmat_base, \
+      int64_t   invmat_stride, \
+      int32_t  *eleidx, \
+      int32_t  *elespn, \
+      uint64_t  mmax, \
+      ctype     pfav[], \
+      void     *objv[], \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] ) \
+{ \
+  for (int iqp = 0; iqp < num_qp; ++iqp) { \
+    mapv[iqp] = new matrix_t<ctype>(orbmat_base + iqp * orbmat_stride, norbs, norbs, OuterStride<>(norbs)); \
+    matv[iqp] = new matrix_t<ctype>(invmat_base + iqp * invmat_stride, nelec, nelec, OuterStride<>(nelec)); \
+    orbv[iqp] = new orbital_mat<matrix_t<ctype>>('U', norbs, *mapv(iqp, ctype) ); \
+    objv[iqp] = new updated_tdi<orbital_mat<matrix_t<ctype>>> \
+        ( *orbv(iqp, ctype), *matv(iqp, ctype), nelec, mmax); \
+\
+    /* Compute initial. */ \
+    vmc::config_manager::base_t cfg_i(nelec); \
+    for (int msi = 0; msi < nelec; ++msi) \
+      cfg_i.at(msi) = eleidx[msi] + elespn[msi]*nsite; \
+    objv(iqp, ctype)->attach_config(cfg_i); \
+    objv(iqp, ctype)->initialize_precomputed(pfav[iqp]); \
+  } \
+}
+// GENIMPL( float,    s )
+GENIMPL( double,   d )
+// GENIMPL( ccscmplx, c )
+GENIMPL( ccdcmplx, z )
+#undef GENIMPL
+
 #define GENIMPL( ctype, cblachar ) \
   void EXPANDNAME( updated_tdi_v_init, cblachar ) \
     ( uint64_t  num_qp, \
@@ -47,7 +88,7 @@ using matrix_t = Map<Matrix<T, Dynamic, Dynamic>, 0, OuterStride<> >;
       void     *matv[], \
       void     *mapv[] ) \
 { \
-  OMP_PARALLEL_FOR_SHARED \
+  _ccPragma(omp parallel for default(shared)) \
   for (int iqp = 0; iqp < num_qp; ++iqp) { \
     mapv[iqp] = new matrix_t<ctype>(orbmat_base + iqp * orbmat_stride, norbs, norbs, OuterStride<>(norbs)); \
     matv[iqp] = new matrix_t<ctype>(invmat_base + iqp * invmat_stride, nelec, nelec, OuterStride<>(nelec)); \
@@ -114,7 +155,7 @@ GENIMPL( ccdcmplx, z )
       int64_t   cal_pfa, \
       void     *objv[] ) \
 { \
-  OMP_PARALLEL_FOR_SHARED \
+  _ccPragma(omp parallel for default(shared)) \
   for (int iqp = 0; iqp < num_qp; ++iqp) \
     objv(iqp, ctype)->push_update_safe(osi, msj, cal_pfa!=0); \
 }
@@ -134,7 +175,7 @@ GENIMPL( ccdcmplx, z )
       int64_t   cal_pfa, \
       void     *objv[] ) \
 { \
-  OMP_PARALLEL_FOR_SHARED \
+  _ccPragma(omp parallel for default(shared)) \
   for (int iqp = 0; iqp < num_qp; ++iqp) { \
     const auto &from_i = objv(iqp, ctype)->get_config_manager().from_idx(); \
     if (from_i.size() > objv(iqp, ctype)->mmax - 2) \
@@ -172,5 +213,59 @@ GENIMPL( double,   d )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
+#define GENIMPL( ctype, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_omp_proc_batch_greentwo, cblachar ) \
+    ( uint64_t  num_qp, \
+      int       num_gf, \
+      int       needs_comput[], \
+      int       unpack_idx[], \
+      int       to_orbs[], \
+      int       from_ids[], \
+      ctype     pfav[], \
+      void     *objv[], \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] ) \
+{ \
+  int ith = omp_get_thread_num(); \
+  int nth = omp_get_num_threads(); \
+\
+  int gf_count = 0; \
+  for (int i = 0; i < num_gf; ++i) \
+    if ( needs_comput[i] ) { \
+      if ( ith == 0 ) { \
+        /* Pack info space. */ \
+        to_orbs[gf_count * 2    ] = to_orbs[i * 2]; \
+        to_orbs[gf_count * 2 + 1] = to_orbs[i * 2 + 1]; \
+        from_ids[gf_count * 2    ] = from_ids[i * 2]; \
+        from_ids[gf_count * 2 + 1] = from_ids[i * 2 + 1]; \
+        unpack_idx[gf_count] = i; \
+      } \
+      /* Count Green's funcs. */ \
+      gf_count++; \
+    } \
+\
+  int gf_cnt_thread = (gf_count + nth - 1) / nth; \
+  int gfStart = gf_cnt_thread * ith; \
+  int gfEnd = std::min(gf_count, gfStart + gf_cnt_thread); \
+\
+  Eigen::Map<VectorXi>  to_orbs_thread( to_orbs + gfStart * 2, (gfEnd - gfStart) * 2); \
+  Eigen::Map<VectorXi> from_ids_thread(from_ids + gfStart * 2, (gfEnd - gfStart) * 2); \
+\
+  for (int iqp = 0; iqp < num_qp; ++iqp) { \
+    Eigen::Vector<ctype, Eigen::Dynamic> pfaBatch = \
+      objv(iqp, ctype)->batch_query_amplitudes(2, to_orbs_thread, from_ids_thread); \
+\
+    /* Unpack computed Pfaffians. */ \
+    for (int igf = gfStart; igf < gfEnd; ++igf) \
+      pfav[unpack_idx[igf] * num_qp + iqp] = pfaBatch[igf - gfStart]; \
+  } \
+}
+// GENIMPL( float,    s )
+GENIMPL( double,   d )
+// GENIMPL( ccscmplx, c )
+GENIMPL( ccdcmplx, z )
+#undef GENIMPL
+
 #undef EXPANDNAME
 
diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h
index 62bcee3f..7a858868 100644
--- a/src/pfupdates/pf_interface.h
+++ b/src/pfupdates/pf_interface.h
@@ -24,6 +24,31 @@ extern "C" {
 
 #define EXPANDNAME( funcname, cblachar ) funcname##_##cblachar
 
+#define GENDEF( ctype, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_seq_init_precomp, cblachar ) \
+    ( uint64_t  num_qp, \
+      uint64_t  nsite, \
+      uint64_t  norbs, \
+      uint64_t  nelec, \
+      ctype    *orbmat_base, \
+      int64_t   orbmat_stride, \
+      ctype    *invmat_base, \
+      int64_t   invmat_stride, \
+      int32_t  *eleidx, \
+      int32_t  *elespn, \
+      uint64_t  mmax, \
+      ctype     pfav[], \
+      void     *objv[], \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] );
+// GENDEF( float,    s )
+GENDEF( double,   d )
+// GENDEF( ccscmplx, c )
+GENDEF( ccdcmplx, z )
+#undef GENDEF
+
+
 #define GENDEF( ctype, cblachar ) \
    void EXPANDNAME( updated_tdi_v_init, cblachar ) \
     ( uint64_t  num_qp, \
@@ -116,6 +141,25 @@ GENDEF( double,   d )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
+#define GENDEF( ctype, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_omp_proc_batch_greentwo, cblachar ) \
+    ( uint64_t  num_qp, \
+      int       num_gf, \
+      int       needs_comput[], \
+      int       unpack_idx[], \
+      int       to_orbs[], \
+      int       from_ids[], \
+      ctype     pfav[], \
+      void     *objv[], \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] );
+// GENDEF( float,    s )
+GENDEF( double,   d )
+// GENDEF( ccscmplx, c )
+GENDEF( ccdcmplx, z )
+#undef GENDEF
+
 #undef EXPANDNAME
 
 #ifdef __cplusplus
diff --git a/src/pfupdates/updated_tdi.tcc b/src/pfupdates/updated_tdi.tcc
index 5eda1701..6705a2fb 100644
--- a/src/pfupdates/updated_tdi.tcc
+++ b/src/pfupdates/updated_tdi.tcc
@@ -68,6 +68,14 @@ private:
 
 public:
 
+  void initialize_precomputed(T Pfa_) {
+    elem_cfg.merge_config();
+    elem_cfg.reserve_space(mmax);
+    nq_updated = 0;
+    Pfa = Pfa_;
+    PfaRatio = 1.0;
+  }
+
   void initialize() {
     using namespace std;
     elem_cfg.merge_config();
@@ -530,7 +538,8 @@ public:
   /**
    * Complete antisymmetric matrix.
    */
-  static void skcomplete(const char &uplo_, matrix_t &A) {
+  template <typename Mat>
+  static void skcomplete(const char &uplo_, Mat &A) {
     const index_t n = A.rows();
 
     for (index_t j = 0; j < n; ++j) {
@@ -568,7 +577,8 @@ public:
     uplo = uplo_new;
   }
 
-  Eigen::Vector<amp_t, Eigen::Dynamic> batch_query_amplitudes(int N, idxvec_t to_orbs, idxvec_t from_ids) {
+  Eigen::Vector<amp_t, Eigen::Dynamic> batch_query_amplitudes
+      (int N, const Eigen::Map<idxvec_t> &to_orbs, const Eigen::Map<idxvec_t> &from_ids) {
     using namespace Eigen;
     using ampvec_t = Eigen::Vector<amp_t, Eigen::Dynamic>;
 
@@ -646,24 +656,24 @@ public:
       for (index_t o = 0; o < N-1; ++o)
         for (index_t l = 0; l < o; ++l)
           if (upper)
-            CCur(l, o) = -UCur(all, o).transpose() * QCur(all, l) -
+            CCur(l, o) = -(UCur(all, o).transpose() * QCur(all, l))(0,0) -
               Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) +
               Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
                   elem_cfg.config_base(from_ids(l1 * N + o)));
           else
-            CCur(o, l) =  UCur(all, o).transpose() * QCur(all, l) -
+            CCur(o, l) =  (UCur(all, o).transpose() * QCur(all, l))(0,0) -
               Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) +
               Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
                   elem_cfg.config_base(from_ids(l1 * N + o)));
       {
         for (index_t l = 0; l < N-1; ++l)
           if (upper)
-            CCur(l, N-1) = -UbF(all, l1).transpose() * QCur(all, l) -
+            CCur(l, N-1) = -(UbF(all, l1).transpose() * QCur(all, l))(0,0) -
               Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) +
               Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
                   elem_cfg.config_base(from_ids(l1 * N + N-1)));
           else
-            CCur(N-1, l) =  UbF(all, l1).transpose() * QCur(all, l) -
+            CCur(N-1, l) =  (UbF(all, l1).transpose() * QCur(all, l))(0,0) -
               Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) +
               Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
                   elem_cfg.config_base(from_ids(l1 * N + N-1)));

From 880612760e776fe9d8762e69d37cff0fe0e9f67c Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 11 Feb 2023 20:38:41 +0900
Subject: [PATCH 15/39] Fix: Batch GF2 Forgot to Convert AmpRatio to Amp

---
 src/pfupdates/pf_interface.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc
index 26815109..0ccb2218 100644
--- a/src/pfupdates/pf_interface.cc
+++ b/src/pfupdates/pf_interface.cc
@@ -255,6 +255,7 @@ GENIMPL( ccdcmplx, z )
   for (int iqp = 0; iqp < num_qp; ++iqp) { \
     Eigen::Vector<ctype, Eigen::Dynamic> pfaBatch = \
       objv(iqp, ctype)->batch_query_amplitudes(2, to_orbs_thread, from_ids_thread); \
+    pfaBatch *= objv(iqp, ctype)->get_amplitude(); \
 \
     /* Unpack computed Pfaffians. */ \
     for (int igf = gfStart; igf < gfEnd; ++igf) \

From e77f0ae041b33f589afe2102e3c9aa0b82cd8269 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 11 Feb 2023 21:48:48 +0900
Subject: [PATCH 16/39] Fix: Batch GF2 OMP

- Not tested for NQP>1
---
 src/pfupdates/pf_interface.cc | 52 +++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc
index 0ccb2218..223946f0 100644
--- a/src/pfupdates/pf_interface.cc
+++ b/src/pfupdates/pf_interface.cc
@@ -227,30 +227,46 @@ GENIMPL( ccdcmplx, z )
       void     *matv[], \
       void     *mapv[] ) \
 { \
-  int ith = omp_get_thread_num(); \
-  int nth = omp_get_num_threads(); \
+  const int ith = omp_get_thread_num(); \
+  const int nth = omp_get_num_threads(); \
 \
+  /* Count Green's funcs. */ \
   int gf_count = 0; \
   for (int i = 0; i < num_gf; ++i) \
+    if ( needs_comput[i] ) \
+      gf_count++; \
+\
+  /* Partitioning. */ \
+  int gfCountThread = (gf_count + nth - 1) / nth; \
+  const int gfStart = gfCountThread * ith; \
+  gfCountThread = std::min(gf_count - gfStart, gfCountThread); \
+\
+  /* Pack info space. */ \
+  gf_count = -gfStart; \
+  int *to_orbs_thread_, *from_ids_thread_, *unpack_idx_thread; \
+  for (int i = 0; i < num_gf && gf_count < gfCountThread; ++i) \
     if ( needs_comput[i] ) { \
-      if ( ith == 0 ) { \
-        /* Pack info space. */ \
-        to_orbs[gf_count * 2    ] = to_orbs[i * 2]; \
-        to_orbs[gf_count * 2 + 1] = to_orbs[i * 2 + 1]; \
-        from_ids[gf_count * 2    ] = from_ids[i * 2]; \
-        from_ids[gf_count * 2 + 1] = from_ids[i * 2 + 1]; \
-        unpack_idx[gf_count] = i; \
+      if ( gf_count >= 0 ) { \
+        if ( gf_count == 0 ) { \
+          /* Pin down scratchpad spots. */ \
+          to_orbs_thread_   = to_orbs  + i * 2; \
+          from_ids_thread_  = from_ids + i * 2; \
+          unpack_idx_thread = unpack_idx + i; \
+          unpack_idx_thread[0] = i; \
+        } else { \
+          /* 0th idx already in the place. Pack from 1st. */ \
+          to_orbs_thread_[gf_count * 2    ] = to_orbs[i * 2]; \
+          to_orbs_thread_[gf_count * 2 + 1] = to_orbs[i * 2 + 1]; \
+          from_ids_thread_[gf_count * 2    ] = from_ids[i * 2]; \
+          from_ids_thread_[gf_count * 2 + 1] = from_ids[i * 2 + 1]; \
+          unpack_idx_thread[gf_count] = i; \
+        } \
       } \
-      /* Count Green's funcs. */ \
       gf_count++; \
     } \
 \
-  int gf_cnt_thread = (gf_count + nth - 1) / nth; \
-  int gfStart = gf_cnt_thread * ith; \
-  int gfEnd = std::min(gf_count, gfStart + gf_cnt_thread); \
-\
-  Eigen::Map<VectorXi>  to_orbs_thread( to_orbs + gfStart * 2, (gfEnd - gfStart) * 2); \
-  Eigen::Map<VectorXi> from_ids_thread(from_ids + gfStart * 2, (gfEnd - gfStart) * 2); \
+  Eigen::Map<VectorXi>  to_orbs_thread( to_orbs_thread_, gfCountThread * 2); \
+  Eigen::Map<VectorXi> from_ids_thread(from_ids_thread_, gfCountThread * 2); \
 \
   for (int iqp = 0; iqp < num_qp; ++iqp) { \
     Eigen::Vector<ctype, Eigen::Dynamic> pfaBatch = \
@@ -258,8 +274,8 @@ GENIMPL( ccdcmplx, z )
     pfaBatch *= objv(iqp, ctype)->get_amplitude(); \
 \
     /* Unpack computed Pfaffians. */ \
-    for (int igf = gfStart; igf < gfEnd; ++igf) \
-      pfav[unpack_idx[igf] * num_qp + iqp] = pfaBatch[igf - gfStart]; \
+    for (int igf = 0; igf < gfCountThread; ++igf) \
+      pfav[unpack_idx_thread[igf] * num_qp + iqp] = pfaBatch[igf]; \
   } \
 }
 // GENIMPL( float,    s )

From 5389c0ce32272163f4d41fa0081837e13aec88f9 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 11 Feb 2023 22:14:39 +0900
Subject: [PATCH 17/39] Fix: CMake Eigen

---
 CMakeLists.txt             | 4 ++++
 src/ltl2inv/CMakeLists.txt | 4 ----
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe3208ed..f69b4ac9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,6 +136,10 @@ add_subdirectory("${STDFACE_DIR}")
 enable_language(CXX)
 set(CMAKE_CXX_STANDARD 11)
 
+# Require Eigen
+find_package(Eigen3 3.4 REQUIRED NO_MODULE)
+include_directories(${EIGEN3_INCLUDE_DIR})
+
 if(Require_BLIS)
   add_definitions(-D_BLIS)
   include("download_blis_artifact.cmake")
diff --git a/src/ltl2inv/CMakeLists.txt b/src/ltl2inv/CMakeLists.txt
index 1c704a0d..165ea01c 100644
--- a/src/ltl2inv/CMakeLists.txt
+++ b/src/ltl2inv/CMakeLists.txt
@@ -10,10 +10,6 @@ if("${ARCHITECTURE}" STREQUAL "x86_64")
 endif()
 include_directories(../pfapack/c_interface)
 
-# Require Eigen
-find_package(Eigen3 3.4 REQUIRED NO_MODULE)
-include_directories(${EIGEN3_INCLUDE_DIR})
-
 add_library(ltl2inv STATIC ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc)
 target_compile_definitions(ltl2inv PRIVATE -D_CC_IMPL)
 

From c987aeeee312214f875dd51bfc64ed50c16e0e4d Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 11 Feb 2023 22:23:56 +0900
Subject: [PATCH 18/39] Eigen Should Offload MMuls to BLAS

We often have the best impl of BLAS.
---
 CMakeLists.txt |  2 ++
 mVMCconfig.sh  | 16 ++++++++--------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f69b4ac9..76b217fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -139,6 +139,8 @@ set(CMAKE_CXX_STANDARD 11)
 # Require Eigen
 find_package(Eigen3 3.4 REQUIRED NO_MODULE)
 include_directories(${EIGEN3_INCLUDE_DIR})
+# We surely want to offload mmuls to BLAS.
+add_definitions(-DEIGEN_USE_BLAS)
 
 if(Require_BLIS)
   add_definitions(-D_BLIS)
diff --git a/mVMCconfig.sh b/mVMCconfig.sh
index cfc7453a..1e0d7a57 100755
--- a/mVMCconfig.sh
+++ b/mVMCconfig.sh
@@ -41,7 +41,7 @@ FFLAGS = -DNDEBUG -DFUJITSU -Kfast
 CFLAGS = -DNDEBUG -Ofast -fopenmp
 CXXFLAGS = -DNDEBUG -Ofast -fopenmp
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += -DBLAS_EXTERNAL -D_BLIS
+CXXFLAGS += -DBLAS_EXTERNAL -D_BLIS -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.so -SSL2 -lm -lpthread
@@ -75,7 +75,7 @@ FFLAGS = -O3 -implicitnone
 CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas
 CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL
+CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -mkl=sequential -lm -lpthread
@@ -101,7 +101,7 @@ FFLAGS = -O3 -implicitnone
 CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas
 CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL
+CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -mkl=sequential -lm -lpthread
@@ -125,7 +125,7 @@ FFLAGS = -O3
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3 -fPIC
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += -D_BLIS
+CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread
@@ -149,7 +149,7 @@ FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3 -fPIC
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += -D_BLIS
+CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread
@@ -173,7 +173,7 @@ FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -D_BLIS
+CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -D_BLIS -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lm -lpthread
@@ -197,7 +197,7 @@ FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += -D_BLIS
+CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread
@@ -221,7 +221,7 @@ FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
 CFLAGS += -D_lapack -D_pf_block_update
-CXXFLAGS += -D_BLIS # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple.
+CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple.
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread

From 32887d03e6e46e2297c79407671598b96beb6184 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 11 Feb 2023 22:25:57 +0900
Subject: [PATCH 19/39] Fix: Include <tuple>

---
 src/pfupdates/config_manager.tcc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pfupdates/config_manager.tcc b/src/pfupdates/config_manager.tcc
index 44eb1f07..978cbcf5 100644
--- a/src/pfupdates/config_manager.tcc
+++ b/src/pfupdates/config_manager.tcc
@@ -9,6 +9,7 @@
 #include "error.hh"
 #include <random>
 #include <vector>
+#include <tuple>
 #include <cstdint>
 
 namespace vmc

From 88c18843de06a4dfc6d93dbd15d65200fb204a38 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 11 Feb 2023 23:05:59 +0900
Subject: [PATCH 20/39] Fix: Link PfUpdates Always

---
 src/mVMC/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/mVMC/CMakeLists.txt b/src/mVMC/CMakeLists.txt
index 44ae92cf..dd898044 100644
--- a/src/mVMC/CMakeLists.txt
+++ b/src/mVMC/CMakeLists.txt
@@ -28,9 +28,7 @@ add_executable(vmc.out ${SOURCES_vmcmain} ${SOURCES_sfmt})
 target_link_libraries(vmc.out StdFace)
 target_link_libraries(vmc.out pfapack)
 target_link_libraries(vmc.out ltl2inv)
-if(PFAFFIAN_BLOCKED)
-  target_link_libraries(vmc.out pfupdates)
-endif(PFAFFIAN_BLOCKED)
+target_link_libraries(vmc.out pfupdates)
 if(Require_BLIS)
   target_link_libraries(vmc.out blis pthread)
 endif(Require_BLIS)

From e34ed8d05fae3815111abe0eb48e1255271bd03b Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 11 Feb 2023 23:52:55 +0900
Subject: [PATCH 21/39] Batch GF2 Parallel in QP

This is faster than partitioning the batch since it retains thickness.
---
 src/mVMC/calham.c             | 32 ++++++++++++++++++++++----
 src/pfupdates/pf_interface.cc | 43 ++++++++++++++++++++++++++++++++++-
 src/pfupdates/pf_interface.h  | 16 ++++++++-----
 3 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c
index a2e5c841..442c280f 100644
--- a/src/mVMC/calham.c
+++ b/src/mVMC/calham.c
@@ -196,11 +196,33 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
       if ( !*lazy_info_odd  ) myEnergy += lazy_ip[idx * 2 + 1];
     }
     #pragma omp barrier
-    updated_tdi_v_omp_proc_batch_greentwo_z(NQPFull, NExchangeCoupling*2,
-                                            lazy_info, lazy_info + NExchangeCoupling*2,
-                                            lazy_rsi, lazy_msj,
-                                            lazy_pfa,
-                                            pfUpdator, pfOrbital, pfMat, pfMap);
+#if 1
+    int num_qp_var0 = 0;
+    // Pack lazy info.
+    for (idx=0; idx<NExchangeCoupling*2; ++idx)
+      if (lazy_info[idx]) {
+        if (omp_get_thread_num() == 0) {
+          lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
+          lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
+          lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
+          lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
+          lazy_info[NExchangeCoupling*2 + num_qp_var0] = idx; \
+        }
+        num_qp_var0++;
+      }
+    #pragma omp barrier
+    updated_tdi_v_omp_var0_proc_batch_greentwo_z(NQPFull, num_qp_var0,
+                                                 NULL, lazy_info + NExchangeCoupling*2,
+                                                 lazy_rsi, lazy_msj,
+                                                 lazy_pfa,
+                                                 pfUpdator, pfOrbital, pfMat, pfMap);
+#else
+    updated_tdi_v_omp_var1_proc_batch_greentwo_z(NQPFull, NExchangeCoupling*2,
+                                                 lazy_info, lazy_info + NExchangeCoupling*2,
+                                                 lazy_rsi, lazy_msj,
+                                                 lazy_pfa,
+                                                 pfUpdator, pfOrbital, pfMat, pfMap);
+#endif
     #pragma omp barrier
     #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
     for(idx=0;idx<NExchangeCoupling*2;idx++)
diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc
index 223946f0..575ca625 100644
--- a/src/pfupdates/pf_interface.cc
+++ b/src/pfupdates/pf_interface.cc
@@ -214,7 +214,48 @@ GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
 #define GENIMPL( ctype, cblachar ) \
-  void EXPANDNAME( updated_tdi_v_omp_proc_batch_greentwo, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_omp_var0_proc_batch_greentwo, cblachar ) \
+    ( uint64_t  num_qp, \
+      int       num_gf, \
+      int       needs_comput[], /* Already packed. var0 only does unpacking. */ \
+      int       unpack_idx[],   /* Already packed. var0 only does unpacking. */ \
+      int       to_orbs_[],     /* Already packed. var0 only does unpacking. */ \
+      int       from_ids_[],    /* Already packed. var0 only does unpacking. */ \
+      ctype     pfav[], \
+      void     *objv[], \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] ) \
+{ \
+  const int ith = omp_get_thread_num(); \
+  const int nth = omp_get_num_threads(); \
+\
+  /* Partitioning. */ \
+  const int qpCountThread = (num_qp + nth - 1) / nth; \
+  const int qpStart = qpCountThread * ith; \
+  const int qpEnd = std::min(int(num_qp), qpCountThread + qpStart); \
+\
+  Eigen::Map<VectorXi>  to_orbs( to_orbs_, num_gf * 2); \
+  Eigen::Map<VectorXi> from_ids(from_ids_, num_gf * 2); \
+\
+  for (int iqp = qpStart; iqp < qpEnd; ++iqp) { \
+    Eigen::Vector<ctype, Eigen::Dynamic> pfaBatch = \
+      objv(iqp, ctype)->batch_query_amplitudes(2, to_orbs, from_ids); \
+    pfaBatch *= objv(iqp, ctype)->get_amplitude(); \
+\
+    /* Unpack computed Pfaffians. */ \
+    for (int igf = 0; igf < num_gf; ++igf) \
+      pfav[unpack_idx[igf] * num_qp + iqp] = pfaBatch[igf]; \
+  } \
+}
+// GENIMPL( float,    s )
+GENIMPL( double,   d )
+// GENIMPL( ccscmplx, c )
+GENIMPL( ccdcmplx, z )
+#undef GENIMPL
+
+#define GENIMPL( ctype, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_omp_var1_proc_batch_greentwo, cblachar ) \
     ( uint64_t  num_qp, \
       int       num_gf, \
       int       needs_comput[], \
diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h
index 7a858868..49ad0252 100644
--- a/src/pfupdates/pf_interface.h
+++ b/src/pfupdates/pf_interface.h
@@ -141,8 +141,8 @@ GENDEF( double,   d )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
-#define GENDEF( ctype, cblachar ) \
-  void EXPANDNAME( updated_tdi_v_omp_proc_batch_greentwo, cblachar ) \
+#define GENDEF( ctype, infix, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_omp_## infix ##_proc_batch_greentwo, cblachar ) \
     ( uint64_t  num_qp, \
       int       num_gf, \
       int       needs_comput[], \
@@ -154,10 +154,14 @@ GENDEF( ccdcmplx, z )
       void     *orbv[], \
       void     *matv[], \
       void     *mapv[] );
-// GENDEF( float,    s )
-GENDEF( double,   d )
-// GENDEF( ccscmplx, c )
-GENDEF( ccdcmplx, z )
+// GENDEF( float, var0,    s )
+// GENDEF( float, var1,    s )
+GENDEF( double, var0,   d )
+GENDEF( double, var1,   d )
+// GENDEF( ccscmplx, var0, c )
+// GENDEF( ccscmplx, var1, c )
+GENDEF( ccdcmplx, var0, z )
+GENDEF( ccdcmplx, var1, z )
 #undef GENDEF
 
 #undef EXPANDNAME

From 932b1417ec67f48fa6277b471681bfe463f41ff0 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Tue, 14 Feb 2023 18:41:51 +0900
Subject: [PATCH 22/39] Silent Bracket Warning

---
 src/mVMC/vmcmake.c      | 3 ++-
 src/mVMC/vmcmake_fsz.c  | 3 ++-
 src/mVMC/vmcmake_real.c | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/mVMC/vmcmake.c b/src/mVMC/vmcmake.c
index 349c1301..fa35d7a5 100644
--- a/src/mVMC/vmcmake.c
+++ b/src/mVMC/vmcmake.c
@@ -76,11 +76,12 @@ void VMCMakeSample(MPI_Comm comm) {
   if (optBlockSize)
     NBlockUpdateSize = atoi(optBlockSize);
   // Fall back to default if input is invalid.
-  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100)
+  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) {
     if (NExUpdatePath == 0)
       NBlockUpdateSize = 4;
     else
       NBlockUpdateSize = 20;
+  }
 
   // Set one universal EleSpn.
   for (mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;
diff --git a/src/mVMC/vmcmake_fsz.c b/src/mVMC/vmcmake_fsz.c
index 801ba523..ea0d44af 100644
--- a/src/mVMC/vmcmake_fsz.c
+++ b/src/mVMC/vmcmake_fsz.c
@@ -90,11 +90,12 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
   if (optBlockSize)
     NBlockUpdateSize = atoi(optBlockSize);
   // Fall back to default if input is invalid.
-  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100)
+  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) {
     if (NExUpdatePath == 0)
       NBlockUpdateSize = 4;
     else
       NBlockUpdateSize = 20;
+  }
 
   // Initialize with free spin configuration.
   updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize,
diff --git a/src/mVMC/vmcmake_real.c b/src/mVMC/vmcmake_real.c
index 7c0d57c4..3570b956 100644
--- a/src/mVMC/vmcmake_real.c
+++ b/src/mVMC/vmcmake_real.c
@@ -78,11 +78,12 @@ void VMCMakeSample_real(MPI_Comm comm) {
   if (optBlockSize)
     NBlockUpdateSize = atoi(optBlockSize);
   // Fall back to default if input is invalid.
-  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100)
+  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) {
     if (NExUpdatePath == 0)
       NBlockUpdateSize = 2;
     else
       NBlockUpdateSize = 20;
+  }
 
   // Set one universal EleSpn.
   for (mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;

From e41ba7175f62c246dd8e8f9b296fff6894510d0d Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Mon, 22 May 2023 18:17:40 +0900
Subject: [PATCH 23/39] Batch GF2 for all Ham in complex cSz

---
 src/mVMC/calham.c | 114 +++++++++++++++++++++++++++++-----------------
 1 file changed, 72 insertions(+), 42 deletions(-)

diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c
index 442c280f..d698f0a9 100644
--- a/src/mVMC/calham.c
+++ b/src/mVMC/calham.c
@@ -67,13 +67,14 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
   double complex *myBuffer;
   double complex myEnergy;
 
-  int *lazy_info = malloc(    sizeof(int) * NExchangeCoupling * 2 * 2);
-  int *lazy_rsi  = malloc(2 * sizeof(int) * NExchangeCoupling * 2);
-  int *lazy_msj  = malloc(2 * sizeof(int) * NExchangeCoupling * 2);
-  double complex *lazy_ip  = malloc(sizeof(double complex) * NExchangeCoupling * 2);
-  double complex *lazy_pfa = malloc(sizeof(double complex) * NExchangeCoupling * 2 * NQPFull);
-  memset(lazy_info,                          0, sizeof(int) * NExchangeCoupling * 2);
-  memset(lazy_info + NExchangeCoupling * 2, -1, sizeof(int) * NExchangeCoupling * 2);
+  const int nHamiltonianTwo = NPairHopping + NExchangeCoupling * 2 + NInterAll;
+  int *lazy_info = malloc(    sizeof(int) * nHamiltonianTwo * 2);
+  int *lazy_rsi  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  int *lazy_msj  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  double complex *lazy_ip  = malloc(sizeof(double complex) * nHamiltonianTwo);
+  double complex *lazy_pfa = malloc(sizeof(double complex) * nHamiltonianTwo * NQPFull);
+  memset(lazy_info,                    0, sizeof(int) * nHamiltonianTwo);
+  memset(lazy_info + nHamiltonianTwo, -1, sizeof(int) * nHamiltonianTwo);
 
   for (int mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;
   for (int mi=Ne;mi<Ne*2;mi++) EleSpn[mi] = 1;
@@ -167,15 +168,29 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
     #pragma omp master
     {StopTimer(71);StartTimer(72);}
 
+    /****************/
+    /* 2-body terms */
+    /****************/
+    int noffset_lazy = 0, noffset_rsij = 0;
+
     /* Pair Hopping */
     #pragma omp for private(idx,ri,rj) schedule(dynamic) nowait
     for(idx=0;idx<NPairHopping;idx++) {
       ri = PairHopping[idx][0];
       rj = PairHopping[idx][1];
-    
-      myEnergy += ParaPairHopping[idx]
-        * GreenFunc2(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double complex *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
+      *lazy_ip_loc = ParaPairHopping[idx] *
+        GreenFunc2_(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer,
+                    lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
     }
+    noffset_lazy += NPairHopping;
+    noffset_rsij += NPairHopping * 2;
 
     /* Exchange Coupling */
     #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
@@ -183,66 +198,81 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
       ri = ExchangeCoupling[idx][0];
       rj = ExchangeCoupling[idx][1];
     
-      int *lazy_info_even = lazy_info + idx * 2;
-      int *lazy_info_odd  = lazy_info + idx * 2 + 1;
-      int *lazy_rsi_even  = lazy_rsi + (idx * 2    ) * 2;
-      int *lazy_rsi_odd   = lazy_rsi + (idx * 2 + 1) * 2;
-      int *lazy_msj_even  = lazy_msj + (idx * 2    ) * 2;
-      int *lazy_msj_odd   = lazy_msj + (idx * 2 + 1) * 2;
-
-      lazy_ip[idx * 2]     = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_even, lazy_rsi_even, lazy_msj_even);
-      lazy_ip[idx * 2 + 1] = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_odd, lazy_rsi_odd, lazy_msj_odd);
-      if ( !*lazy_info_even ) myEnergy += lazy_ip[idx * 2];
-      if ( !*lazy_info_odd  ) myEnergy += lazy_ip[idx * 2 + 1];
+      int *lazy_info_even = lazy_info + noffset_lazy +  idx * 2;
+      int *lazy_info_odd  = lazy_info + noffset_lazy +  idx * 2 + 1;
+      int *lazy_rsi_even  = lazy_rsi  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_rsi_odd   = lazy_rsi  + noffset_rsij + (idx * 2 + 1) * 2;
+      int *lazy_msj_even  = lazy_msj  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_msj_odd   = lazy_msj  + noffset_rsij + (idx * 2 + 1) * 2;
+      double complex *lazy_ip_even = lazy_ip + noffset_lazy + idx * 2;
+      double complex *lazy_ip_odd  = lazy_ip + noffset_lazy + idx * 2 + 1;
+
+      *lazy_ip_even = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_even, lazy_rsi_even, lazy_msj_even);
+      *lazy_ip_odd  = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_odd, lazy_rsi_odd, lazy_msj_odd);
+      if ( !*lazy_info_even ) myEnergy += *lazy_ip_even;
+      if ( !*lazy_info_odd  ) myEnergy += *lazy_ip_odd;
+    }
+    noffset_lazy += 2 * NExchangeCoupling;
+    noffset_rsij += 2 * NExchangeCoupling * 2;
+
+    /* Inter All */
+    #pragma omp for private(idx,ri,rj,s,rk,rl,t) schedule(dynamic) nowait
+    for(idx=0;idx<NInterAll;idx++) {
+      ri = InterAll[idx][0];
+      rj = InterAll[idx][2];
+      s  = InterAll[idx][3];
+      rk = InterAll[idx][4];
+      rl = InterAll[idx][6];
+      t  = InterAll[idx][7];
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double complex *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
+      *lazy_ip_loc = ParaInterAll[idx] *
+        GreenFunc2_(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer,
+                    lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
     }
+    noffset_lazy += NInterAll;
+    noffset_rsij += NInterAll * 2;
+
+    /* Batch-compute 2-body Green's functions. */
     #pragma omp barrier
-#if 1
+#if 0
     int num_qp_var0 = 0;
     // Pack lazy info.
-    for (idx=0; idx<NExchangeCoupling*2; ++idx)
+    for (idx=0; idx<nHamiltonianTwo; ++idx)
       if (lazy_info[idx]) {
         if (omp_get_thread_num() == 0) {
           lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
           lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
           lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
           lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
-          lazy_info[NExchangeCoupling*2 + num_qp_var0] = idx; \
+          lazy_info[nHamiltonianTwo + num_qp_var0] = idx; \
         }
         num_qp_var0++;
       }
     #pragma omp barrier
     updated_tdi_v_omp_var0_proc_batch_greentwo_z(NQPFull, num_qp_var0,
-                                                 NULL, lazy_info + NExchangeCoupling*2,
+                                                 NULL, lazy_info + nHamiltonianTwo,
                                                  lazy_rsi, lazy_msj,
                                                  lazy_pfa,
                                                  pfUpdator, pfOrbital, pfMat, pfMap);
 #else
-    updated_tdi_v_omp_var1_proc_batch_greentwo_z(NQPFull, NExchangeCoupling*2,
-                                                 lazy_info, lazy_info + NExchangeCoupling*2,
+    updated_tdi_v_omp_var1_proc_batch_greentwo_z(NQPFull, nHamiltonianTwo,
+                                                 lazy_info, lazy_info + nHamiltonianTwo,
                                                  lazy_rsi, lazy_msj,
                                                  lazy_pfa,
                                                  pfUpdator, pfOrbital, pfMat, pfMap);
 #endif
     #pragma omp barrier
     #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
-    for(idx=0;idx<NExchangeCoupling*2;idx++)
+    for(idx=0;idx<nHamiltonianTwo;idx++)
       if ( lazy_info[idx] )
         myEnergy += conj(CalculateIP_fcmp(lazy_pfa + idx * NQPFull, 0, NQPFull, MPI_COMM_SELF)) * lazy_ip[idx];
 
-    /* Inter All */
-    #pragma omp for private(idx,ri,rj,s,rk,rl,t) schedule(dynamic) nowait
-    for(idx=0;idx<NInterAll;idx++) {
-      ri = InterAll[idx][0];
-      rj = InterAll[idx][2];
-      s  = InterAll[idx][3];
-      rk = InterAll[idx][4];
-      rl = InterAll[idx][6];
-      t  = InterAll[idx][7];
-      
-      myEnergy += ParaInterAll[idx]
-        * GreenFunc2(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
-    }
-
     #pragma omp master
     {StopTimer(72);}
 

From 72baf9b55e1116de08eb9e3eb05fb48426e706da Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Mon, 22 May 2023 23:54:24 +0900
Subject: [PATCH 24/39] Batch GF2 var 1/2 Heuristics

Based on Nelec ~ 200.
---
 src/mVMC/calham.c | 52 +++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c
index d698f0a9..a44bf73b 100644
--- a/src/mVMC/calham.c
+++ b/src/mVMC/calham.c
@@ -240,33 +240,33 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
 
     /* Batch-compute 2-body Green's functions. */
     #pragma omp barrier
-#if 0
-    int num_qp_var0 = 0;
-    // Pack lazy info.
-    for (idx=0; idx<nHamiltonianTwo; ++idx)
-      if (lazy_info[idx]) {
-        if (omp_get_thread_num() == 0) {
-          lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
-          lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
-          lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
-          lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
-          lazy_info[nHamiltonianTwo + num_qp_var0] = idx; \
+    if ( Nsize <= 200 ) { // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2.
+      int num_qp_var0 = 0;
+      // Pack lazy info.
+      for (idx=0; idx<nHamiltonianTwo; ++idx)
+        if (lazy_info[idx]) {
+          if (omp_get_thread_num() == 0) {
+            lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
+            lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
+            lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
+            lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
+            lazy_info[nHamiltonianTwo + num_qp_var0] = idx; \
+          }
+          num_qp_var0++;
         }
-        num_qp_var0++;
-      }
-    #pragma omp barrier
-    updated_tdi_v_omp_var0_proc_batch_greentwo_z(NQPFull, num_qp_var0,
-                                                 NULL, lazy_info + nHamiltonianTwo,
-                                                 lazy_rsi, lazy_msj,
-                                                 lazy_pfa,
-                                                 pfUpdator, pfOrbital, pfMat, pfMap);
-#else
-    updated_tdi_v_omp_var1_proc_batch_greentwo_z(NQPFull, nHamiltonianTwo,
-                                                 lazy_info, lazy_info + nHamiltonianTwo,
-                                                 lazy_rsi, lazy_msj,
-                                                 lazy_pfa,
-                                                 pfUpdator, pfOrbital, pfMat, pfMap);
-#endif
+      #pragma omp barrier
+      updated_tdi_v_omp_var0_proc_batch_greentwo_z(NQPFull, num_qp_var0,
+                                                   NULL, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    } else {
+      updated_tdi_v_omp_var1_proc_batch_greentwo_z(NQPFull, nHamiltonianTwo,
+                                                   lazy_info, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    }
     #pragma omp barrier
     #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
     for(idx=0;idx<nHamiltonianTwo;idx++)

From 21caa1b2a33b1324113c8ca26a0ccae06924dd71 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Tue, 23 May 2023 20:49:26 +0900
Subject: [PATCH 25/39] Batch GF2 for all Ham in complex fSz

---
 src/mVMC/calham_fsz.c | 135 ++++++++++++++++++++++++++++++++++++------
 src/mVMC/locgrn_fsz.c |  50 +++++++++++++---
 2 files changed, 159 insertions(+), 26 deletions(-)

diff --git a/src/mVMC/calham_fsz.c b/src/mVMC/calham_fsz.c
index 10239e14..c3935ada 100644
--- a/src/mVMC/calham_fsz.c
+++ b/src/mVMC/calham_fsz.c
@@ -2,7 +2,7 @@
 mVMC - A numerical solver package for a wide range of quantum lattice models based on many-variable Variational Monte Carlo method
 Copyright (C) 2016 The University of Tokyo, All rights reserved.
 
-his program is developed based on the mVMC-mini program
+This program is developed based on the mVMC-mini program
 (https://github.com/fiber-miniapp/mVMC-mini)
 which follows "The BSD 3-Clause License".
 
@@ -57,6 +57,15 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co
   double complex *myBuffer;
   double complex myEnergy;
 
+  const int nHamiltonianTwo = NPairHopping + NExchangeCoupling * 2 + NInterAll;
+  int *lazy_info = malloc(    sizeof(int) * nHamiltonianTwo * 2);
+  int *lazy_rsi  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  int *lazy_msj  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  double complex *lazy_ip  = malloc(sizeof(double complex) * nHamiltonianTwo);
+  double complex *lazy_pfa = malloc(sizeof(double complex) * nHamiltonianTwo * NQPFull);
+  memset(lazy_info,                    0, sizeof(int) * nHamiltonianTwo);
+  memset(lazy_info + nHamiltonianTwo, -1, sizeof(int) * nHamiltonianTwo);
+
   RequestWorkSpaceThreadInt(Nsize+Nsize+Nsite2+NProj);
   RequestWorkSpaceThreadComplex(NQPFull+2*Nsize);
   /* GreenFunc1: NQPFull, GreenFunc2: NQPFull+2*Nsize */
@@ -66,7 +75,7 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co
   private(myEleIdx,myEleNum,myProjCntNew,myBuffer,myEnergy,idx)  \
   reduction(+:e)
   */
-#pragma omp parallel default(none)                                      \
+#pragma omp parallel default(shared)                                      \
   private(myEleIdx,myEleSpn,myEleNum,myProjCntNew,myBuffer,myEnergy, idx, ri, rj, rk, rl, s, t,u,v) \
   firstprivate(ip, Nsize, Nsite2, NProj, NQPFull, NCoulombIntra, CoulombIntra, ParaCoulombIntra, \
                NCoulombInter, CoulombInter, ParaCoulombInter, NHundCoupling, HundCoupling, ParaHundCoupling, \
@@ -80,6 +89,20 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co
     myProjCntNew = GetWorkSpaceThreadInt(NProj);
     myBuffer = GetWorkSpaceThreadComplex(NQPFull+2*Nsize);
 
+    void *pfOrbital[NQPFull];
+    void *pfUpdator[NQPFull];
+    void *pfMat[NQPFull];
+    void *pfMap[NQPFull];
+
+    // Attaching thread-private objects to thread-shared InvM.
+    // These objects no long need mutating states in this use. Just functor-like stuff.
+    updated_tdi_v_seq_init_precomp_z(NQPFull, Nsite, Nsite2, Nsize,
+                                     SlaterElm, Nsite2*Nsite2,
+                                     InvM, Nsize*Nsize,
+                                     eleIdx, eleSpn,
+                                     2 /* GF @ measure: 2 at max. */, PfM,
+                                     pfUpdator, pfOrbital, pfMat, pfMap);
+
     #pragma loop noalias
     for(idx=0;idx<Nsize;idx++) myEleIdx[idx] = eleIdx[idx];
     #pragma loop noalias
@@ -140,29 +163,52 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co
     #pragma omp master
     {StopTimer(71);StartTimer(72);}
 
+    /****************/
+    /* 2-body terms */
+    /****************/
+    int noffset_lazy = 0, noffset_rsij = 0;
+
     /* Pair Hopping */
     #pragma omp for private(idx,ri,rj) schedule(dynamic) nowait
     for(idx=0;idx<NPairHopping;idx++) {
       ri = PairHopping[idx][0];
       rj = PairHopping[idx][1];
-    
-      myEnergy += ParaPairHopping[idx]
-        * GreenFunc2_fsz(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double complex *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
+      *lazy_ip_loc = ParaPairHopping[idx] *
+        GreenFunc2_fsz_(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer,
+                        lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+        if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
     }
+    noffset_lazy += NPairHopping;
+    noffset_rsij += NPairHopping * 2;
 
     /* Exchange Coupling */
     #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
     for(idx=0;idx<NExchangeCoupling;idx++) {
       ri = ExchangeCoupling[idx][0];
       rj = ExchangeCoupling[idx][1];
-    
-      tmp =  GreenFunc2_fsz(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
-      //printf("idx=%d ri=%d rj=%d:  tmp=%lf \n",idx,ri,rj,creal(tmp));
-      tmp += GreenFunc2_fsz(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
-      //tmp =  GreenFunc2_fsz2(ri,rj,rj,ri,0,0,1,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
-      //tmp += GreenFunc2_fsz2(ri,rj,rj,ri,1,1,0,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
-      myEnergy += ParaExchangeCoupling[idx] * tmp;
+
+      int *lazy_info_even = lazy_info + noffset_lazy +  idx * 2;
+      int *lazy_info_odd  = lazy_info + noffset_lazy +  idx * 2 + 1;
+      int *lazy_rsi_even  = lazy_rsi  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_rsi_odd   = lazy_rsi  + noffset_rsij + (idx * 2 + 1) * 2;
+      int *lazy_msj_even  = lazy_msj  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_msj_odd   = lazy_msj  + noffset_rsij + (idx * 2 + 1) * 2;
+      double complex *lazy_ip_even = lazy_ip + noffset_lazy + idx * 2;
+      double complex *lazy_ip_odd  = lazy_ip + noffset_lazy + idx * 2 + 1;
+
+      *lazy_ip_even = ParaExchangeCoupling[idx] * GreenFunc2_fsz_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer, lazy_info_even, lazy_rsi_even, lazy_msj_even);
+      *lazy_ip_odd  = ParaExchangeCoupling[idx] * GreenFunc2_fsz_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer, lazy_info_odd, lazy_rsi_odd, lazy_msj_odd);
+      if ( !*lazy_info_even ) myEnergy += *lazy_ip_even;
+      if ( !*lazy_info_odd  ) myEnergy += *lazy_ip_odd;
     }
+    noffset_lazy += 2 * NExchangeCoupling;
+    noffset_rsij += 2 * NExchangeCoupling * 2;
 
     /* Inter All */
     #pragma omp for private(idx,ri,rj,s,rk,rl,t) schedule(dynamic) nowait
@@ -175,21 +221,74 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co
       u  = InterAll[idx][5];
       rl = InterAll[idx][6];
       v  = InterAll[idx][7];
-      
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double complex *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
       if(s==t && u==v){
-        myEnergy += ParaInterAll[idx]
-          * GreenFunc2_fsz(ri,rj,rk,rl,s,u,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
+        *lazy_ip_loc = ParaInterAll[idx] *
+          GreenFunc2_fsz_(ri,rj,rk,rl,s,u,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer,
+                          lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
       }else{
-        myEnergy += ParaInterAll[idx]
-          * GreenFunc2_fsz2(ri,rj,rk,rl,s,t,u,v,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
-      } 
+        *lazy_ip_loc = ParaInterAll[idx] *
+          GreenFunc2_fsz2_(ri,rj,rk,rl,s,t,u,v,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer,
+                           lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      }
+      if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
+    }
+    noffset_lazy += NInterAll;
+    noffset_rsij += NInterAll * 2;
+
+    /* Batch-compute 2-body Green's functions. */
+    #pragma omp barrier
+    if ( Nsize <= 200 ) { // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2.
+      int num_qp_var0 = 0;
+      // Pack lazy info.
+      for (idx=0; idx<nHamiltonianTwo; ++idx)
+        if (lazy_info[idx]) {
+          if (omp_get_thread_num() == 0) {
+            lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
+            lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
+            lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
+            lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
+            lazy_info[nHamiltonianTwo + num_qp_var0] = idx; \
+          }
+          num_qp_var0++;
+        }
+      #pragma omp barrier
+      updated_tdi_v_omp_var0_proc_batch_greentwo_z(NQPFull, num_qp_var0,
+                                                   NULL, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    } else {
+      updated_tdi_v_omp_var1_proc_batch_greentwo_z(NQPFull, nHamiltonianTwo,
+                                                   lazy_info, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
     }
+    #pragma omp barrier
+    #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
+    for(idx=0;idx<nHamiltonianTwo;idx++)
+      if ( lazy_info[idx] )
+        myEnergy += conj(CalculateIP_fcmp(lazy_pfa + idx * NQPFull, 0, NQPFull, MPI_COMM_SELF)) * lazy_ip[idx];
 
     #pragma omp master
     {StopTimer(72);}
 
     e += myEnergy;
+
+    // Freeing within OMP region is thread-private.
+    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
   }
+  free(lazy_info);
+  free(lazy_rsi);
+  free(lazy_msj);
+  free(lazy_ip);
+  free(lazy_pfa);
 
   ReleaseWorkSpaceThreadInt();
   ReleaseWorkSpaceThreadComplex();
diff --git a/src/mVMC/locgrn_fsz.c b/src/mVMC/locgrn_fsz.c
index a686906f..6ca1bb2d 100644
--- a/src/mVMC/locgrn_fsz.c
+++ b/src/mVMC/locgrn_fsz.c
@@ -125,10 +125,11 @@ double complex GreenFunc1_fsz2(const int ri, const int rj, const int s,const int
 
 /* Calculate 2-body Green function <psi|CisAjsCktAlt|x>/<psi|x> */
 /* buffer size = NQPFull+2*Nsize */
-double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const int rl,
+double complex GreenFunc2_fsz_(const int ri, const int rj, const int rk, const int rl,
                   const int s, const int t, const double complex ip,
                   int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn,
-                  int *projCntNew, double complex *buffer) {
+                  int *projCntNew, double complex *buffer,
+                  int *lazy_info, int *lazy_rsi, int *lazy_msj) {
   double complex z;
   int mj,msj,ml,mtl;
   int rsi,rsj,rtk,rtl;
@@ -199,8 +200,17 @@ double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const in
   z = ProjRatio(projCntNew,eleProjCnt);
 
   /* calculate Pfaffian */
-  CalculateNewPfMTwo_fsz(ml, t, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV);
-  z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  if (!lazy_info) {
+    CalculateNewPfMTwo_fsz(ml, t, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV);
+    z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  } else {
+    // Save invocation hook instead of calling.
+    lazy_msj[0] = msj; //< to edit to ri, revert to rj
+    lazy_msj[1] = mtl; //< to edit to rk, revert to rl
+    lazy_rsi[0] = rsi;
+    lazy_rsi[1] = rtk;
+    lazy_info[0] = 1; //< "is delayed" flag..
+  }
 
   /* revert hopping */
   eleIdx[mtl] = rl;
@@ -215,12 +225,21 @@ double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const in
   return conj(z/ip);//TBC
 }
 
+double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const int rl,
+                  const int s, const int t, const double complex ip,
+                  int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn,
+                  int *projCntNew, double complex *buffer) {
+  return GreenFunc2_fsz_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, eleSpn,
+                         projCntNew, buffer, 0, 0, 0);
+}
+
 /* Calculate 2-body Green function <psi|CisAjtCkuAlv|x>/<psi|x> */
 /* buffer size = NQPFull+2*Nsize */
-double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const int rl,
+double complex GreenFunc2_fsz2_(const int ri, const int rj, const int rk, const int rl,
                   const int s, const int t,const int u,const int v, const double complex ip,
                   int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn,
-                  int *projCntNew, double complex *buffer) {
+                  int *projCntNew, double complex *buffer,
+                  int *lazy_info, int *lazy_rsi, int *lazy_msj) {
   double complex z;
   //int mj,msj,ml,mtl;
   int mj,ml; // mj: (rj,t) -> (ri,s) ml:(rl,v) -> (rk,u)
@@ -325,8 +344,16 @@ double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const i
   z = ProjRatio(projCntNew,eleProjCnt);
 
   /* calculate Pfaffian */
-  CalculateNewPfMTwo_fsz(ml, u, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); // ml -> rk,u; mj -> ri,s
-  z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  if (!lazy_info) {
+    CalculateNewPfMTwo_fsz(ml, u, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); // ml -> rk,u; mj -> ri,s
+    z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  } else {
+    lazy_msj[0] = mj;
+    lazy_msj[1] = ml;
+    lazy_rsi[0] = XI;
+    lazy_rsi[1] = XK;
+    lazy_info[0] = 1;
+  }
 
   /* revert hopping */
   eleIdx[ml]  = rl;  // ml : (rl,v)
@@ -342,6 +369,13 @@ double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const i
 }
 
 
+double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const int rl,
+                  const int s, const int t,const int u,const int v, const double complex ip,
+                  int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn,
+                  int *projCntNew, double complex *buffer) {
+  return GreenFunc2_fsz2_(ri, rj, rk, rl, s, t, u, v, ip,
+                          eleIdx, eleCfg, eleNum, eleProjCnt, eleSpn, projCntNew, buffer, 0, 0, 0);
+}
 
 // ignore GreenFuncN: to be added
 

From aedba0832418d8e9678d5db75459948d68ab5535 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Tue, 1 Aug 2023 18:24:18 +0900
Subject: [PATCH 26/39] Upgrade PFAPACK77 w/ 2-Step Algos

---
 src/pfapack | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pfapack b/src/pfapack
index ebaa11b7..af8471ae 160000
--- a/src/pfapack
+++ b/src/pfapack
@@ -1 +1 @@
-Subproject commit ebaa11b722dbb5f1ddc62da607592d2e301c0c4a
+Subproject commit af8471aeaf1ff1c07aef6be196ca2ddbb09726d2

From 01824c2e3c986e56f64309f789aa67082f809749 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Mon, 6 Nov 2023 17:17:38 +0900
Subject: [PATCH 27/39] Adopt Fix in PFAPACK77 2-Step

---
 src/pfapack | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pfapack b/src/pfapack
index af8471ae..7826344a 160000
--- a/src/pfapack
+++ b/src/pfapack
@@ -1 +1 @@
-Subproject commit af8471aeaf1ff1c07aef6be196ca2ddbb09726d2
+Subproject commit 7826344af031cfe554d1e26f0e7d0b21cb510c9c

From f40bd6bd4a277ae09dde187c286cdfae8246ac0c Mon Sep 17 00:00:00 2001
From: Ruqing Xu <rxu.sol@gmail.com>
Date: Sat, 29 Jun 2024 17:28:00 +0900
Subject: [PATCH 28/39] Batched GF2 for Real non-fsz

---
 src/mVMC/calham_real.c | 131 ++++++++++++++++++++++++++++++++++-------
 src/mVMC/locgrn_real.c |  25 ++++++--
 2 files changed, 132 insertions(+), 24 deletions(-)

diff --git a/src/mVMC/calham_real.c b/src/mVMC/calham_real.c
index 660aa2d4..08149c56 100644
--- a/src/mVMC/calham_real.c
+++ b/src/mVMC/calham_real.c
@@ -51,6 +51,18 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg
   double  *myBuffer;
   double  myEnergy;
 
+  const int nHamiltonianTwo = NPairHopping + NExchangeCoupling * 2 + NInterAll;
+  int *lazy_info = malloc(    sizeof(int) * nHamiltonianTwo * 2);
+  int *lazy_rsi  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  int *lazy_msj  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  double *lazy_ip  = malloc(sizeof(double) * nHamiltonianTwo);
+  double *lazy_pfa = malloc(sizeof(double) * nHamiltonianTwo * NQPFull);
+  memset(lazy_info,                    0, sizeof(int) * nHamiltonianTwo);
+  memset(lazy_info + nHamiltonianTwo, -1, sizeof(int) * nHamiltonianTwo);
+
+  for (int mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;
+  for (int mi=Ne;mi<Ne*2;mi++) EleSpn[mi] = 1;
+
   RequestWorkSpaceThreadInt(Nsize+Nsite2+NProj);
   RequestWorkSpaceThreadDouble(NQPFull+2*Nsize);
   /* GreenFunc1: NQPFull, GreenFunc2: NQPFull+2*Nsize */
@@ -66,13 +78,27 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg
     NCoulombInter, CoulombInter, ParaCoulombInter, NHundCoupling, HundCoupling, ParaHundCoupling,    \
     NTransfer, Transfer, ParaTransfer, NPairHopping, PairHopping, ParaPairHopping,    \
     NExchangeCoupling, ExchangeCoupling, ParaExchangeCoupling, NInterAll, InterAll, ParaInterAll, n0, n1)\
-    shared(eleCfg, eleProjCnt, eleIdx, eleNum) reduction(+:e)
+    shared(eleCfg, eleProjCnt, eleIdx, eleNum, lazy_info) reduction(+:e)
   {
     myEleIdx = GetWorkSpaceThreadInt(Nsize);
     myEleNum = GetWorkSpaceThreadInt(Nsite2);
     myProjCntNew = GetWorkSpaceThreadInt(NProj);
     myBuffer = GetWorkSpaceThreadDouble(NQPFull+2*Nsize);
 
+    void *pfOrbital[NQPFull];
+    void *pfUpdator[NQPFull];
+    void *pfMat[NQPFull];
+    void *pfMap[NQPFull];
+
+    // Attaching thread-private objects to thread-shared InvM.
+    // These objects no long need mutating states in this use. Just functor-like stuff.
+    updated_tdi_v_seq_init_precomp_d(NQPFull, Nsite, Nsite2, Nsize,
+                                     SlaterElm_real, Nsite2*Nsite2,
+                                     InvM_real, Nsize*Nsize,
+                                     eleIdx, EleSpn,
+                                     2 /* GF @ measure: 2 at max. */, PfM_real,
+                                     pfUpdator, pfOrbital, pfMat, pfMap);
+
     #pragma loop noalias
     for(idx=0;idx<Nsize;idx++) myEleIdx[idx] = eleIdx[idx];
     #pragma loop noalias
@@ -145,36 +171,54 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg
 #pragma omp master
     printf("    Debug: PairHopping\n");
 #endif
+
+    /****************/
+    /* 2-body terms */
+    /****************/
+    int noffset_lazy = 0, noffset_rsij = 0;
+
     /* Pair Hopping */
     #pragma omp for private(idx,ri,rj) schedule(dynamic) nowait
     for(idx=0;idx<NPairHopping;idx++) {
       ri = PairHopping[idx][0];
       rj = PairHopping[idx][1];
-    
-      myEnergy += ParaPairHopping[idx]
-        * GreenFunc2_real(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
+      *lazy_ip_loc = ParaPairHopping[idx]
+        * GreenFunc2_real_(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer,
+                           lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
     }
+    noffset_lazy += NPairHopping;
+    noffset_rsij += NPairHopping * 2;
 
-#ifdef _DEBUG
-//#pragma omp master
-    //printf("    Debug: ExchangeCoupling, NExchangeCoupling=%d\n", NExchangeCoulpling);
-#endif
     /* Exchange Coupling */
     #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
     for(idx=0;idx<NExchangeCoupling;idx++) {
       ri = ExchangeCoupling[idx][0];
       rj = ExchangeCoupling[idx][1];
     
-      tmp =  GreenFunc2_real(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
-      tmp += GreenFunc2_real(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
-      myEnergy += ParaExchangeCoupling[idx] * tmp;
-      //printf("XDEBUG: idx=%d tmp=%lf %lf\n",idx,tmp,ParaExchangeCoupling[idx]);
+      int *lazy_info_even = lazy_info + noffset_lazy +  idx * 2;
+      int *lazy_info_odd  = lazy_info + noffset_lazy +  idx * 2 + 1;
+      int *lazy_rsi_even  = lazy_rsi  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_rsi_odd   = lazy_rsi  + noffset_rsij + (idx * 2 + 1) * 2;
+      int *lazy_msj_even  = lazy_msj  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_msj_odd   = lazy_msj  + noffset_rsij + (idx * 2 + 1) * 2;
+      double *lazy_ip_even = lazy_ip + noffset_lazy + idx * 2;
+      double *lazy_ip_odd  = lazy_ip + noffset_lazy + idx * 2 + 1;
+
+      *lazy_ip_even = ParaExchangeCoupling[idx] * GreenFunc2_real_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_even, lazy_rsi_even, lazy_msj_even);
+      *lazy_ip_odd  = ParaExchangeCoupling[idx] * GreenFunc2_real_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_odd, lazy_rsi_odd, lazy_msj_odd);
+      if ( !*lazy_info_even ) myEnergy += *lazy_ip_even;
+      if ( !*lazy_info_odd  ) myEnergy += *lazy_ip_odd;
     }
-    
-#ifdef _DEBUG
-#pragma omp master
-    printf("    Debug: InterAll, NInterAll=%d\n", NInterAll);
-#endif
+    noffset_lazy += 2 * NExchangeCoupling;
+    noffset_rsij += 2 * NExchangeCoupling * 2;
+
     /* Inter All */
     #pragma omp for private(idx,ri,rj,s,rk,rl,t) schedule(dynamic) nowait
     for(idx=0;idx<NInterAll;idx++) {
@@ -184,10 +228,57 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg
       rk = InterAll[idx][4];
       rl = InterAll[idx][6];
       t  = InterAll[idx][7];
-      
-      myEnergy += ParaInterAll[idx]
-        * GreenFunc2_real(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
+      *lazy_ip_loc = ParaInterAll[idx]
+        * GreenFunc2_real_(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer,
+                           lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
     }
+    noffset_lazy += NInterAll;
+    noffset_rsij += NInterAll * 2;
+
+    /* Batch-compute 2-body Green's functions. */
+    #pragma omp barrier
+    // if ( Nsize <= 100 ) // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2.
+    if ( 1 )
+    {
+      int num_qp_var0 = 0;
+      // Pack lazy info.
+      for (idx=0; idx<nHamiltonianTwo; ++idx)
+        if (lazy_info[idx]) {
+          if (omp_get_thread_num() == 0) {
+            lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
+            lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
+            lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
+            lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
+            lazy_info[nHamiltonianTwo + num_qp_var0] = idx; \
+          }
+          num_qp_var0++;
+        }
+      fprintf(stdout, "GF2: %d / %d\n", num_qp_var0, nHamiltonianTwo);
+      #pragma omp barrier
+      updated_tdi_v_omp_var0_proc_batch_greentwo_d(NQPFull, num_qp_var0,
+                                                   NULL, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    } else {
+      updated_tdi_v_omp_var1_proc_batch_greentwo_d(NQPFull, nHamiltonianTwo,
+                                                   lazy_info, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    }
+    #pragma omp barrier
+    #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
+    for(idx=0;idx<nHamiltonianTwo;idx++)
+      if ( lazy_info[idx] )
+        myEnergy += CalculateIP_real(lazy_pfa + idx * NQPFull, 0, NQPFull, MPI_COMM_SELF) * lazy_ip[idx];
 
     #pragma omp master
     {StopTimer(72);}
diff --git a/src/mVMC/locgrn_real.c b/src/mVMC/locgrn_real.c
index 82cc4037..4c6a0e1f 100644
--- a/src/mVMC/locgrn_real.c
+++ b/src/mVMC/locgrn_real.c
@@ -77,10 +77,11 @@ double  GreenFunc1_real(const int ri, const int rj, const int s, const double ip
 
 /* Calculate 2-body Green function <psi|CisAjsCktAlt|x>/<psi|x> */
 /* buffer size = NQPFull+2*Nsize */
-double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl,
+double GreenFunc2_real_(const int ri, const int rj, const int rk, const int rl,
                   const int s, const int t, const double ip,
                   int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,
-                  int *projCntNew, double *buffer) {
+                  int *projCntNew, double *buffer,
+                  int *lazy_info, int *lazy_rsi, int *lazy_msj) {
   double z;
   int mj,msj,ml,mtl;
   int rsi,rsj,rtk,rtl;
@@ -149,8 +150,17 @@ double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl,
   z = ProjRatio(projCntNew,eleProjCnt);
 
   /* calculate Pfaffian */
-  CalculateNewPfMTwo_real(ml, t, mj, s, pfMNew_real, eleIdx, 0, NQPFull, bufV);
-  z *= CalculateIP_real(pfMNew_real, 0, NQPFull, MPI_COMM_SELF);
+  if (!lazy_info) {
+    CalculateNewPfMTwo_real(ml, t, mj, s, pfMNew_real, eleIdx, 0, NQPFull, bufV);
+    z *= CalculateIP_real(pfMNew_real, 0, NQPFull, MPI_COMM_SELF);
+  } else {
+    // Save invocation hook instead of calling.
+    lazy_msj[0] = msj; //< to edit to ri, revert to rj
+    lazy_msj[1] = mtl; //< to edit to rk, revert to rl
+    lazy_rsi[0] = rsi;
+    lazy_rsi[1] = rtk;
+    lazy_info[0] = 1; //< "is delayed" flag..
+  }
 
   /* revert hopping */
   eleIdx[mtl] = rl;
@@ -163,6 +173,13 @@ double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl,
   return z/ip;//TBC
 }
 
+double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl,
+                  const int s, const int t, const double ip,
+                  int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,
+                  int *projCntNew, double *buffer) {
+  return GreenFunc2_real_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, projCntNew, buffer, 0, 0, 0);
+}
+
 
 /// Calculate n-body Green function
 /// <phi| c1 a1 c2 a2 ... cn an |x>

From 8b831ea853cbba32a15e90b20b5b500aef23e908 Mon Sep 17 00:00:00 2001
From: Ruqing Xu <rxu.sol@gmail.com>
Date: Sat, 29 Jun 2024 17:30:06 +0900
Subject: [PATCH 29/39] Heuristics

---
 src/mVMC/calham_real.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/mVMC/calham_real.c b/src/mVMC/calham_real.c
index 08149c56..f9111197 100644
--- a/src/mVMC/calham_real.c
+++ b/src/mVMC/calham_real.c
@@ -244,8 +244,7 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg
 
     /* Batch-compute 2-body Green's functions. */
     #pragma omp barrier
-    // if ( Nsize <= 100 ) // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2.
-    if ( 1 )
+    if ( Nsize <= 400 ) // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2.
     {
       int num_qp_var0 = 0;
       // Pack lazy info.

From 5b474aa9c42ecb729af94e6131dd65a8ca09ab9c Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 29 Jun 2024 17:38:50 +0900
Subject: [PATCH 30/39] Remove clutter print

---
 src/mVMC/calham_real.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mVMC/calham_real.c b/src/mVMC/calham_real.c
index f9111197..fc2e76b9 100644
--- a/src/mVMC/calham_real.c
+++ b/src/mVMC/calham_real.c
@@ -259,7 +259,6 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg
           }
           num_qp_var0++;
         }
-      fprintf(stdout, "GF2: %d / %d\n", num_qp_var0, nHamiltonianTwo);
       #pragma omp barrier
       updated_tdi_v_omp_var0_proc_batch_greentwo_d(NQPFull, num_qp_var0,
                                                    NULL, lazy_info + nHamiltonianTwo,

From fa841f48b51bc7b7d3ed43ca7161673c70227af8 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sat, 29 Jun 2024 17:53:53 +0900
Subject: [PATCH 31/39] real calc use makeInitialSample: non-fsz

---
 src/mVMC/vmcmake_real.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mVMC/vmcmake_real.c b/src/mVMC/vmcmake_real.c
index 3570b956..9b73521f 100644
--- a/src/mVMC/vmcmake_real.c
+++ b/src/mVMC/vmcmake_real.c
@@ -61,7 +61,7 @@ void VMCMakeSample_real(MPI_Comm comm) {
 
   StartTimer(30);
   if (BurnFlag == 0) {
-    makeInitialSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt,
+    makeInitialSample_real(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt,
                       qpStart, qpEnd, comm);
   } else {
     copyFromBurnSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt);
@@ -104,7 +104,7 @@ void VMCMakeSample_real(MPI_Comm comm) {
 
   if (!isfinite(logIpOld)) {
     if (rank == 0) fprintf(stderr, "waring: VMCMakeSample remakeSample logIpOld=%e\n", creal(logIpOld)); //TBC
-    makeInitialSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt,
+    makeInitialSample_real(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt,
                       qpStart, qpEnd, comm);
 #ifdef _pf_block_update
     // Clear and reinitialize.

From 5a43674237eed281c2e6d6fba21f572497e18d7f Mon Sep 17 00:00:00 2001
From: Ruqing Xu <rxu.sol@gmail.com>
Date: Sun, 30 Jun 2024 20:14:45 +0900
Subject: [PATCH 32/39] Use Direct-blocked+Left-looking Pfapack

- Strange (Complex&ABI suspected) error for skip=2 aggregate-block
- Direct-block + left-looking is faster
---
 src/mVMC/matrix.c | 26 ++++++++++++++------------
 src/pfapack       |  2 +-
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/mVMC/matrix.c b/src/mVMC/matrix.c
index c91ba200..646c8f18 100644
--- a/src/mVMC/matrix.c
+++ b/src/mVMC/matrix.c
@@ -162,15 +162,15 @@ int calculateMAll_child_fsz(const int *eleIdx,const int *eleSpn, const int qpSta
     }
   }
 
-  M_ZSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info);
+  M_ZSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info);
   if(info!=0) return info;
 
-  utu2pfa_z(n, invM, lda, iwork, &pfaff);
+  ltl2pfa_z(n, invM, lda, iwork, &pfaff);
   if(!isfinite(creal(pfaff) + cimag(pfaff))) return qpidx+1;
   PfM[qpidx] = pfaff;
 
   /* Calculate inverse. */
-  utu2inv_z(n, invM, lda, iwork, work, bufM, lda);
+  ltl2inv_z(n, invM, lda, iwork, work, bufM, lda);
 
   /* InvM -> InvM(T) -> -InvM */
   M_ZSCAL(&nsq, &minus_one, invM, &one);
@@ -261,15 +261,15 @@ int calculateMAll_child_fsz_real(const int *eleIdx,const int *eleSpn, const int
   }
 
   /* Calculate Pf M */
-  M_DSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info);
+  M_DSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info);
   if(info!=0) return info;
 
-  utu2pfa_d(n, invM, lda, iwork, &pfaff);
+  ltl2pfa_d(n, invM, lda, iwork, &pfaff);
   if(!isfinite(pfaff)) return qpidx+1;
   PfM_real[qpidx] = pfaff;
 
   /* Calculate inverse. */
-  utu2inv_d(n, invM, lda, iwork, work, bufM, lda);
+  ltl2inv_d(n, invM, lda, iwork, work, bufM, lda);
 
   /* InvM -> InvM(T) -> -InvM */
   M_DSCAL(&nsq, &minus_one, invM, &one);
@@ -369,15 +369,15 @@ int calculateMAll_child_fcmp(const int *eleIdx, const int qpStart, const int qpE
   }
 
   /* Calculate Pf M */
-  M_ZSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info);
+  M_ZSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info);
   if(info!=0) return info;
 
-  utu2pfa_z(n, invM, lda, iwork, &pfaff);
+  ltl2pfa_z(n, invM, lda, iwork, &pfaff);
   if(!isfinite(creal(pfaff) + cimag(pfaff))) return qpidx+1;
   PfM[qpidx] = pfaff;
 
   /* Calculate inverse. */
-  utu2inv_z(n, invM, lda, iwork, work, bufM, lda);
+  ltl2inv_z(n, invM, lda, iwork, work, bufM, lda);
 
   /* mVMC's handling InvM as row-major,
    * i.e. InvM needs a transpose, InvM -> -InvM according antisymmetric properties. */
@@ -596,15 +596,17 @@ int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpE
   }
 
   /* calculate Pf M */
-  M_DSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info);
+  M_DSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info);
+  // printf("M_DSKTRF info:%d\n", info);
   if(info!=0) return info;
 
-  utu2pfa_d(n, invM, lda, iwork, &pfaff);
+  ltl2pfa_d(n, invM, lda, iwork, &pfaff);
+  // printf("pfaff:%e\n", pfaff);
   if(!isfinite(pfaff)) return qpidx+1;
   PfM_real[qpidx] = pfaff;
 
   /* Compute inverse */
-  utu2inv_d(n, invM, lda, iwork, work, bufM, lda);
+  ltl2inv_d(n, invM, lda, iwork, work, bufM, lda);
 
   // InvM -> InvM' = -InvM
   // TODO: Include this in ltl2inv
diff --git a/src/pfapack b/src/pfapack
index 7826344a..f541a51a 160000
--- a/src/pfapack
+++ b/src/pfapack
@@ -1 +1 @@
-Subproject commit 7826344af031cfe554d1e26f0e7d0b21cb510c9c
+Subproject commit f541a51a3a6968b84117cf12f69b0776053a79e3

From 27dab862991c1bd3628295622d7907b9d55de8f4 Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Sun, 30 Jun 2024 20:16:42 +0900
Subject: [PATCH 33/39] Correct para amp

See TorchVMC dev notes
---
 src/mVMC/parameter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mVMC/parameter.c b/src/mVMC/parameter.c
index 9167c941..fc0b59ef 100644
--- a/src/mVMC/parameter.c
+++ b/src/mVMC/parameter.c
@@ -29,7 +29,7 @@ along with this program. If not, see http://www.gnu.org/licenses/.
 #ifndef _SRC_PARAMETER
 #define _SRC_PARAMETER
 
-#define D_AmpMax             4.0
+#define D_AmpMax             0.5
 
 /* initialize variational parameters */
 void InitParameter() {

From 65fed488cd7d678d8235de45d0a234079fad5bbb Mon Sep 17 00:00:00 2001
From: RuQing Xu <ruqing.xu@phys.s.u-tokyo.ac.jp>
Date: Mon, 1 Jul 2024 00:12:35 +0900
Subject: [PATCH 34/39] Batched GF2 for non-fsz

---
 src/mVMC/calgrn.c | 86 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 5 deletions(-)

diff --git a/src/mVMC/calgrn.c b/src/mVMC/calgrn.c
index 8d13215b..e81503da 100644
--- a/src/mVMC/calgrn.c
+++ b/src/mVMC/calgrn.c
@@ -38,6 +38,17 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in
   int *myEleIdx, *myEleNum, *myProjCntNew;
   double complex *myBuffer;
 
+  int *lazy_info = malloc(    sizeof(int) * NCisAjsCktAltDC * 2);
+  int *lazy_rsi  = malloc(2 * sizeof(int) * NCisAjsCktAltDC);
+  int *lazy_msj  = malloc(2 * sizeof(int) * NCisAjsCktAltDC);
+  double complex *lazy_ip  = malloc(sizeof(double complex) * NCisAjsCktAltDC);
+  double complex *lazy_pfa = malloc(sizeof(double complex) * NCisAjsCktAltDC * NQPFull);
+  memset(lazy_info,                    0, sizeof(int) * NCisAjsCktAltDC);
+  memset(lazy_info + NCisAjsCktAltDC, -1, sizeof(int) * NCisAjsCktAltDC);
+
+  for (int mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;
+  for (int mi=Ne;mi<Ne*2;mi++) EleSpn[mi] = 1;
+
   RequestWorkSpaceThreadInt(Nsize+Nsite2+NProj);
   RequestWorkSpaceThreadComplex(NQPFull+2*Nsize);
   /* GreenFunc1: NQPFull, GreenFunc2: NQPFull+2*Nsize */
@@ -50,6 +61,20 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in
     myProjCntNew = GetWorkSpaceThreadInt(NProj);
     myBuffer = GetWorkSpaceThreadComplex(NQPFull+2*Nsize);
 
+    void *pfOrbital[NQPFull];
+    void *pfUpdator[NQPFull];
+    void *pfMat[NQPFull];
+    void *pfMap[NQPFull];
+
+    // Attaching thread-private objects to thread-shared InvM.
+    // These objects no long need mutating states in this use. Just functor-like stuff.
+    updated_tdi_v_seq_init_precomp_z(NQPFull, Nsite, Nsite2, Nsize,
+                                     SlaterElm, Nsite2*Nsite2,
+                                     InvM, Nsize*Nsize,
+                                     eleIdx, EleSpn,
+                                     2 /* GF @ measure: 2 at max. */, PfM,
+                                     pfUpdator, pfOrbital, pfMat, pfMap);
+
     #pragma loop noalias
     for(idx=0;idx<Nsize;idx++) myEleIdx[idx] = eleIdx[idx];
     #pragma loop noalias
@@ -70,7 +95,7 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in
     #pragma omp master
     {StopTimer(50);StartTimer(51);}
     
-    #pragma omp for private(idx,ri,rj,s,rk,rl,t,tmp) schedule(dynamic)
+    #pragma omp for private(idx,ri,rj,s,rk,rl,t) schedule(dynamic)
     for(idx=0;idx<NCisAjsCktAltDC;idx++) {
       ri = CisAjsCktAltDCIdx[idx][0];
       rj = CisAjsCktAltDCIdx[idx][2];
@@ -79,11 +104,57 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in
       rl = CisAjsCktAltDCIdx[idx][6];
       t  = CisAjsCktAltDCIdx[idx][5];
 
-      tmp = GreenFunc2(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,
-                       myProjCntNew,myBuffer);
-      PhysCisAjsCktAltDC[idx] += w*tmp;
+      int *lazy_info_loc = lazy_info + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + idx * 2;
+      double complex *lazy_ip_loc = lazy_ip + idx;
+
+      *lazy_ip_loc = w *
+        GreenFunc2_(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer,
+          lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      if ( !*lazy_info_loc ) {
+        PhysCisAjsCktAltDC[idx] += *lazy_ip_loc;
+      } /* else {
+        // Reverse lookup for the idx
+        *lazy_info_loc = idx + 1;
+      } */
     }
-    
+
+    /* Batch-compute 2-body Green's functions. */
+    #pragma omp barrier
+    if ( Nsize <= 200 ) { // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2.
+      int num_qp_var0 = 0;
+      // Pack lazy info.
+      for (idx=0; idx<NCisAjsCktAltDC; ++idx)
+        if (lazy_info[idx]) {
+          if (omp_get_thread_num() == 0) {
+            lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
+            lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
+            lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
+            lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
+            lazy_info[NCisAjsCktAltDC + num_qp_var0] = idx; \
+          }
+          num_qp_var0++;
+        }
+      #pragma omp barrier
+      updated_tdi_v_omp_var0_proc_batch_greentwo_z(NQPFull, num_qp_var0,
+                                                   NULL, lazy_info + NCisAjsCktAltDC,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    } else {
+      updated_tdi_v_omp_var1_proc_batch_greentwo_z(NQPFull, NCisAjsCktAltDC,
+                                                   lazy_info, lazy_info + NCisAjsCktAltDC,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    }
+    #pragma omp barrier
+    #pragma omp for private(idx) schedule(dynamic) nowait
+    for(idx=0;idx<NCisAjsCktAltDC;idx++)
+      if ( lazy_info[idx] )
+        PhysCisAjsCktAltDC[idx] += conj(CalculateIP_fcmp(lazy_pfa + idx * NQPFull, 0, NQPFull, MPI_COMM_SELF)) * lazy_ip[idx];
+
     #pragma omp master
     {StopTimer(51);StartTimer(52);}
 
@@ -105,6 +176,11 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in
     #pragma omp master
     {StopTimer(53);}
   }
+  free(lazy_info);
+  free(lazy_rsi);
+  free(lazy_msj);
+  free(lazy_ip);
+  free(lazy_pfa);
 
   ReleaseWorkSpaceThreadInt();
   ReleaseWorkSpaceThreadComplex();

From 7fa0c68b5bd5fdccc56cab85a12889b2fa04ac56 Mon Sep 17 00:00:00 2001
From: Ruqing Xu <ruqingx@nvidia.com>
Date: Wed, 2 Oct 2024 22:53:15 +0900
Subject: [PATCH 35/39] Merge .github/workflows changes ahead of #64

s.t. testing goes ahead of actual merging
---
 .github/workflows/deploy_docs.yml | 16 +++++++++++-----
 .github/workflows/main.yml        |  8 +++-----
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml
index 02ab63ca..9c6855af 100644
--- a/.github/workflows/deploy_docs.yml
+++ b/.github/workflows/deploy_docs.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - master
+      - develop
       - gh_actions  # test branch
       - '!gh-pages'
     tags: '*'
@@ -13,21 +14,21 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
     - name: Inject slug/short variables
-      uses: rlespinasse/github-slug-action@v3.x
+      uses: rlespinasse/github-slug-action@v4.x
 
     - name: Checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
       with:
         path: main
 
     - name: Checkout gh-pages
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
       with:
         ref: gh-pages
         path: gh-pages
 
     - name: Set up Python
-      uses: actions/setup-python@v1
+      uses: actions/setup-python@v4
       with:
         python-version: 3.8
 
@@ -42,7 +43,7 @@ jobs:
         cmake -E make_directory build
         cd build
         cmake -DDocument=ON -DENABLE_MPI=OFF ../
-        make doc-ja-html doc-en-html
+        make doc-ja-html doc-en-html tutorial-ja-html
 
     - name: Deploy Configuration
       run: |
@@ -63,6 +64,11 @@ jobs:
             mkdir -p "gh-pages/doc/${TARGET_NAME}"
             cp -r "main/build/doc/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/${lang}"
           done
+          for lang in ja; do
+            rm -rf "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}"
+            mkdir -p "gh-pages/doc/${TARGET_NAME}"/tutorial/
+            cp -r "main/build/doc/tutorial/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}"
+          done
           cd gh-pages
           git config --local user.name "${GIT_USER}"
           git config --local user.email "${GIT_EMAIL}"
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index b49b05fe..4f66d3a6 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -2,6 +2,7 @@ name: CI
 
 on:
   push:
+  pull_request:
   schedule:
     - cron: '0 0 1,15 * *' # JST 9:00 on 1st and 15th every month
 
@@ -32,7 +33,7 @@ jobs:
       - name: brew
         if: ${{ runner.os == 'macOS' }}
         run: |
-          brew install openmpi scalapack libomp
+          brew install openmpi scalapack libomp blis
 
       - name: Setup Python
         uses: actions/setup-python@v5
@@ -52,13 +53,10 @@ jobs:
         run: |
           if [ ${{ runner.os }} = "macOS" ] ; then
             # CONFIG=apple requires gfortran but macOS runner has not, but gfortran-11, 12, ...
-            ln -s `which gfortran-11` gfortran
-            env PATH=`pwd`:$PATH cmake -DCONFIG=apple -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE
+            cmake -DCONFIG=apple -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_Fortran_COMPILER=gfortran-14 $GITHUB_WORKSPACE
           else
             cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE
           fi
-        env:
-          HOMEBREW_PREFIX: /opt/homebrew
 
       - name: build
         working-directory: ${{runner.workspace}}/build

From 10bdc6c59396646628c28e1f7b6afd221d03cb93 Mon Sep 17 00:00:00 2001
From: Ruqing Xu <ruqingx@nvidia.com>
Date: Wed, 2 Oct 2024 23:02:34 +0900
Subject: [PATCH 36/39] Merge upstream config changes

---
 .github/workflows/deploy_docs.yml | 16 +++++---
 .github/workflows/main.yml        | 68 ++++++++++++++++---------------
 config/apple.cmake                | 12 +++---
 config/gcc.cmake                  |  1 +
 4 files changed, 54 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml
index 02ab63ca..9c6855af 100644
--- a/.github/workflows/deploy_docs.yml
+++ b/.github/workflows/deploy_docs.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - master
+      - develop
       - gh_actions  # test branch
       - '!gh-pages'
     tags: '*'
@@ -13,21 +14,21 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
     - name: Inject slug/short variables
-      uses: rlespinasse/github-slug-action@v3.x
+      uses: rlespinasse/github-slug-action@v4.x
 
     - name: Checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
       with:
         path: main
 
     - name: Checkout gh-pages
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
       with:
         ref: gh-pages
         path: gh-pages
 
     - name: Set up Python
-      uses: actions/setup-python@v1
+      uses: actions/setup-python@v4
       with:
         python-version: 3.8
 
@@ -42,7 +43,7 @@ jobs:
         cmake -E make_directory build
         cd build
         cmake -DDocument=ON -DENABLE_MPI=OFF ../
-        make doc-ja-html doc-en-html
+        make doc-ja-html doc-en-html tutorial-ja-html
 
     - name: Deploy Configuration
       run: |
@@ -63,6 +64,11 @@ jobs:
             mkdir -p "gh-pages/doc/${TARGET_NAME}"
             cp -r "main/build/doc/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/${lang}"
           done
+          for lang in ja; do
+            rm -rf "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}"
+            mkdir -p "gh-pages/doc/${TARGET_NAME}"/tutorial/
+            cp -r "main/build/doc/tutorial/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}"
+          done
           cd gh-pages
           git config --local user.name "${GIT_USER}"
           git config --local user.email "${GIT_EMAIL}"
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 8a67bd55..4f66d3a6 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,48 +1,48 @@
 name: CI
 
-on: [push]
+on:
+  push:
+  pull_request:
+  schedule:
+    - cron: '0 0 1,15 * *' # JST 9:00 on 1st and 15th every month
 
 jobs:
   ctest:
-    runs-on: ubuntu-20.04
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      matrix:
+        os: ["ubuntu-22.04", "ubuntu-20.04", "macos-latest"]
+        ompsize: [1, 4]
+        exclude:
+          - os: "macos-latest"
+            ompsize: 4 # OMP on macOS is too slow
+      fail-fast: false
+    env:
+      OMP_NUM_THREADS: ${{ matrix.ompsize }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: apt
+        if: ${{ runner.os == 'Linux' }}
         run: |
           sudo apt update
-          sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev wget
+          sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev
 
-      - name: pip
+      - name: brew
+        if: ${{ runner.os == 'macOS' }}
         run: |
-          python -m pip install numpy
-          python3 -m pip install numpy
+          brew install openmpi scalapack libomp blis
 
-      - name: eigen
-        working-directory: ${{runner.workspace}}
-        run: |
-          wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz
-          tar -zxvf eigen-3.4.0.tar.gz
-          mkdir eigen-build
-          cd eigen-build
-          cmake ../eigen-3.4.0
-          make -j4
-          sudo make install
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
 
-      - name: blis
-        working-directory: ${{runner.workspace}}
+      - name: pip
         run: |
-          mkdir blis
-          cd blis
-          git init
-          git remote add origin https://github.com/flame/blis
-          git fetch --depth 1 origin e3d352f1fcc93e6a46fde1aa4a7f0a18fb27bd42
-          git checkout FETCH_HEAD
-          ./configure -t none -p ${{runner.workspace}}/blis-inst x86_64
-          make -j8 install
-          cd ${{runner.workspace}}/blis-inst
-          tar -zcvf ${{runner.workspace}}/libblis_artifact.tar.gz include lib
+          python3 -m pip install numpy
 
       - name: make workspace
         run: cmake -E make_directory ${{runner.workspace}}/build
@@ -51,10 +51,12 @@ jobs:
         working-directory: ${{runner.workspace}}/build
         shell: bash
         run: |
-          cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE
-          # Override downloaded artifact.
-          # TODO: Rebuild the artifacts.
-          mv ${{runner.workspace}}/libblis_artifact.tar.gz .
+          if [ ${{ runner.os }} = "macOS" ] ; then
+            # CONFIG=apple requires gfortran but macOS runner has not, but gfortran-11, 12, ...
+            cmake -DCONFIG=apple -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_Fortran_COMPILER=gfortran-14 $GITHUB_WORKSPACE
+          else
+            cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE
+          fi
 
       - name: build
         working-directory: ${{runner.workspace}}/build
diff --git a/config/apple.cmake b/config/apple.cmake
index 16ff1130..37a7a290 100644
--- a/config/apple.cmake
+++ b/config/apple.cmake
@@ -1,16 +1,18 @@
 # for Apple clang compiler
 # additional libomp and gfortran installation required
 # mac computers are suggested to use this configuration for better performance
+
+if(NOT DEFINED ENV{HOMEBREW_PREFIX})
+  message(FATAL "Homebrew is not installed. Please install Homebrew first.")
+endif()
+
 set(CMAKE_C_COMPILER "clang" CACHE STRING "" FORCE)
 set(CMAKE_CXX_COMPILER "clang++" CACHE STRING "" FORCE)
 set(CMAKE_C_FLAGS_DEBUG "-g -O0 -Wall  -Wformat -Werror=format-security")
 set(CMAKE_C_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas -Wno-logical-not-parentheses")
-set(CMAKE_Fortran_COMPILER "gfortran" CACHE STRING "" FORCE)
-add_definitions(-DF77_COMPLEX_RET_INTEL)
 
 # OpenMP with libomp
-set(CMAKE_EXE_LINKER_FLAGS "-L/usr/local/lib -lomp") 
-set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp" CACHE STRING "" FORCE) 
+set(CMAKE_EXE_LINKER_FLAGS "-L$ENV{HOMEBREW_PREFIX}/opt/libomp/lib -lomp") 
+set(OpenMP_C_FLAGS "-I$ENV{HOMEBREW_PREFIX}/opt/libomp/include -Xpreprocessor -fopenmp" CACHE STRING "" FORCE) 
 set(OpenMP_C_LIB_NAMES "")
 set(CMAKE_OSX_SYSROOT "")
-
diff --git a/config/gcc.cmake b/config/gcc.cmake
index 751912d7..8a15d2dc 100644
--- a/config/gcc.cmake
+++ b/config/gcc.cmake
@@ -3,4 +3,5 @@ set(CMAKE_C_COMPILER "gcc" CACHE STRING "" FORCE)
 set(CMAKE_CXX_COMPILER "g++" CACHE STRING "" FORCE)
 set(CMAKE_C_FLAGS_DEBUG "-g -O0 -Wall  -Wformat -Werror=format-security")
 set(CMAKE_C_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas ")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas ")
 set(CMAKE_Fortran_COMPILER "gfortran" CACHE STRING "" FORCE)

From 1c190121b6d4e297a5750729689aacffe0cbfa41 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Thu, 3 Oct 2024 17:16:10 +0900
Subject: [PATCH 37/39] Update main.yml

Add Eigen3 to CI deps
---
 .github/workflows/main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 4f66d3a6..25c18f04 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -28,12 +28,12 @@ jobs:
         if: ${{ runner.os == 'Linux' }}
         run: |
           sudo apt update
-          sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev
+          sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev libeigen3-dev
 
       - name: brew
         if: ${{ runner.os == 'macOS' }}
         run: |
-          brew install openmpi scalapack libomp blis
+          brew install openmpi scalapack libomp blis eigen
 
       - name: Setup Python
         uses: actions/setup-python@v5

From 805b19d10a5537a9effe28614b874a5507cc3249 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Thu, 3 Oct 2024 17:29:37 +0900
Subject: [PATCH 38/39] Update download_blis_artifact.cmake

Use BLIS artifact from Julia binary builders
---
 download_blis_artifact.cmake | 28 ++++++----------------------
 1 file changed, 6 insertions(+), 22 deletions(-)

diff --git a/download_blis_artifact.cmake b/download_blis_artifact.cmake
index ec9fc8a7..b2033f92 100644
--- a/download_blis_artifact.cmake
+++ b/download_blis_artifact.cmake
@@ -4,20 +4,8 @@ if(NOT DEFINED BLIS_ARTIFACT_CONFIG)
     COMMAND tr -d '\n'
     OUTPUT_VARIABLE ARCHITECTURE)
   if(${ARCHITECTURE} STREQUAL "x86_64")
-    execute_process(
-      COMMAND cat /proc/cpuinfo
-      COMMAND grep Intel
-      COMMAND head -1
-      COMMAND tr -d '\n'
-      OUTPUT_VARIABLE UARCH_INTEL)
-    if(NOT "${UARCH_INTEL} " STREQUAL " ")
-      set(BLIS_ARTIFACT_CONFIG "intel64")
-    else()
-      set(BLIS_ARTIFACT_CONFIG "amd64")
-    endif()
-  elseif(${ARCHITECTURE} STREQUAL "aarch64" OR
-      ${ARCHITECTURE} STREQUAL "arm64")
-      # Currently SVE requires being set manually.
+      set(BLIS_ARTIFACT_CONFIG "x86_64")
+  elseif(${ARCHITECTURE} STREQUAL "aarch64" OR ${ARCHITECTURE} STREQUAL "arm64")
       set(BLIS_ARTIFACT_CONFIG "arm64")
   else()
     message(FATAL_ERROR "Failed to recognize architecture ${ARCHITECTURE}.")
@@ -27,14 +15,10 @@ endif(NOT DEFINED BLIS_ARTIFACT_CONFIG)
 message(STATUS "Downloading BLIS artifact...")
 
 set(BLIS_ARCHIVE ${CMAKE_CURRENT_BINARY_DIR}/libblis_artifact.tar.gz)
-if(${BLIS_ARTIFACT_CONFIG} STREQUAL "intel64")
-  set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_intel64_gcc.tar.gz)
-elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "amd64")
-  set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_amd64_gcc.tar.gz)
-elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "arm64")
-  set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_cortexa57_aarch64-linux-gnu-gcc.tar.gz)
-elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "armsve")
-  set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_armsve_aarch64-linux-gnu-gcc-10.tar.gz)
+if(${BLIS_ARTIFACT_CONFIG} STREQUAL "x86_64")
+  set(BLIS_ARTIFACT_URL https://github.com/JuliaBinaryWrappers/blis_jll.jl/releases/download/blis-v0.9.0%2B4/blis.v0.9.0.x86_64-linux-gnu.tar.gz)
+elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "arm64" OR ${BLIS_ARTIFACT_CONFIG} STREQUAL "armsve")
+  set(BLIS_ARTIFACT_URL https://github.com/JuliaBinaryWrappers/blis_jll.jl/releases/download/blis-v0.9.0%2B4/blis.v0.9.0.aarch64-linux-gnu.tar.gz)
 elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "a64fx")
   set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_a64fx_aarch64-linux-gnu-gcc-10.tar.gz)
 else()

From 3203b346aab774b72806965ebb4599641e402a89 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Thu, 3 Oct 2024 23:10:28 +0900
Subject: [PATCH 39/39] Update CMakeLists.txt

${ARCHITECTURE} might be empty
---
 src/pfupdates/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pfupdates/CMakeLists.txt b/src/pfupdates/CMakeLists.txt
index 11b0202b..a7ef83a4 100644
--- a/src/pfupdates/CMakeLists.txt
+++ b/src/pfupdates/CMakeLists.txt
@@ -5,7 +5,7 @@ if(${CMAKE_PROJECT_NAME} STREQUAL "Project")
   message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.")
 endif(${CMAKE_PROJECT_NAME} STREQUAL "Project")
 
-if(${ARCHITECTURE} STREQUAL "x86_64")
+if("${ARCHITECTURE}" STREQUAL "x86_64")
   add_definitions(-DF77_COMPLEX_RET_INTEL)
 endif()
 include_directories(../ltl2inv)