diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml
index 02ab63ca..9c6855af 100644
--- a/.github/workflows/deploy_docs.yml
+++ b/.github/workflows/deploy_docs.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - master
+      - develop
       - gh_actions  # test branch
       - '!gh-pages'
     tags: '*'
@@ -13,21 +14,21 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
     - name: Inject slug/short variables
-      uses: rlespinasse/github-slug-action@v3.x
+      uses: rlespinasse/github-slug-action@v4.x
 
     - name: Checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
       with:
         path: main
 
     - name: Checkout gh-pages
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
       with:
         ref: gh-pages
         path: gh-pages
 
     - name: Set up Python
-      uses: actions/setup-python@v1
+      uses: actions/setup-python@v4
       with:
         python-version: 3.8
 
@@ -42,7 +43,7 @@ jobs:
         cmake -E make_directory build
         cd build
         cmake -DDocument=ON -DENABLE_MPI=OFF ../
-        make doc-ja-html doc-en-html
+        make doc-ja-html doc-en-html tutorial-ja-html
 
     - name: Deploy Configuration
       run: |
@@ -63,6 +64,11 @@ jobs:
             mkdir -p "gh-pages/doc/${TARGET_NAME}"
             cp -r "main/build/doc/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/${lang}"
           done
+          for lang in ja; do
+            rm -rf "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}"
+            mkdir -p "gh-pages/doc/${TARGET_NAME}"/tutorial/
+            cp -r "main/build/doc/tutorial/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}"
+          done
           cd gh-pages
           git config --local user.name "${GIT_USER}"
           git config --local user.email "${GIT_EMAIL}"
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index b49b05fe..25c18f04 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -2,6 +2,7 @@ name: CI
 
 on:
   push:
+  pull_request:
   schedule:
     - cron: '0 0 1,15 * *' # JST 9:00 on 1st and 15th every month
 
@@ -27,12 +28,12 @@ jobs:
         if: ${{ runner.os == 'Linux' }}
         run: |
           sudo apt update
-          sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev
+          sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev libeigen3-dev
 
       - name: brew
         if: ${{ runner.os == 'macOS' }}
         run: |
-          brew install openmpi scalapack libomp
+          brew install openmpi scalapack libomp blis eigen
 
       - name: Setup Python
         uses: actions/setup-python@v5
@@ -52,13 +53,10 @@ jobs:
         run: |
           if [ ${{ runner.os }} = "macOS" ] ; then
             # CONFIG=apple requires gfortran but macOS runner has not, but gfortran-11, 12, ...
-            ln -s `which gfortran-11` gfortran
-            env PATH=`pwd`:$PATH cmake -DCONFIG=apple -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE
+            cmake -DCONFIG=apple -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_Fortran_COMPILER=gfortran-14 $GITHUB_WORKSPACE
           else
             cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE
           fi
-        env:
-          HOMEBREW_PREFIX: /opt/homebrew
 
       - name: build
         working-directory: ${{runner.workspace}}/build
diff --git a/.gitmodules b/.gitmodules
index 99e901f0..825489fb 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,10 +1,6 @@
-[submodule "src/pfaffine"]
-	path = src/pfaffine
-	url = https://github.com/xrq-phys/Pfaffine
-	branch = blis
 [submodule "src/blis"]
 	path = src/blis
-	url = https://github.com/xrq-phys/blis
+	url = https://github.com/flame/blis
 	branch = mvmc
 [submodule "src/pfapack"]
 	path = src/pfapack
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70851aa8..76b217fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,9 @@
-cmake_minimum_required(VERSION 2.8.0 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
 project(mVMC NONE)
 
 option(USE_SCALAPACK "Use Scalapack" OFF)
 option(PFAFFIAN_BLOCKED "Use blocked-update Pfaffian to speed up." OFF)
+option(USE_GEMMT "Use GEMMT. Recommended regardless blocked-Pfaffian-update." ON)
 
 add_definitions(-D_mVMC)
 if(CONFIG)
@@ -17,11 +18,10 @@ message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
 option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 
 option(GIT_SUBMODULE_UPDATE "execute `git submodule update` automatically" ON)
-enable_language(C Fortran)
-if(PFAFFIAN_BLOCKED)
-  enable_language(CXX)
-  set(CMAKE_CXX_STANDARD 11)
-endif(PFAFFIAN_BLOCKED)
+
+# First, enables C language only.
+# External packages only use their C API.
+enable_language(C)
 
 set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
 set(CMAKE_SKIP_BUILD_RPATH FALSE)
@@ -29,6 +29,7 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 set(CMAKE_MACOSX_RPATH 1)
 
+# TODO: Is this really needed?
 if("${CMAKE_BUILD_TYPE}" MATCHES "Debug")
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_DEBUG}")
 else()
@@ -37,16 +38,19 @@ else()
 endif()
 
 if(CMAKE_C_COMPILER_ID STREQUAL "Intel")
+  # TODO: Really needs separation?
   if("${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "15.0.0.20140528")
      set(OMP_FLAG_Intel "-openmp")
   else()
     set(OMP_FLAG_Intel "-qopenmp")
   endif()
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OMP_FLAG_Intel}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OMP_FLAG_Intel}")
 else()
   find_package(OpenMP)
   if(OPENMP_FOUND)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
   endif(OPENMP_FOUND)	
 endif()
 
@@ -57,6 +61,9 @@ if(MPI_C_FOUND)
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_C_LINK_FLAGS}")
 endif(MPI_C_FOUND)
 
+# BLAS/LAPACK/Pfapack77 needs Fortran interface.
+enable_language(Fortran)
+
 find_package(LAPACK)
 if(USE_SCALAPACK MATCHES OFF)
   if(LAPACK_FOUND)
@@ -64,6 +71,34 @@ if(USE_SCALAPACK MATCHES OFF)
   endif(LAPACK_FOUND)
 endif()
 
+if(PFAFFIAN_BLOCKED
+    OR USE_GEMMT)
+  set(Require_BLIS ON)
+endif()
+
+if(BLA_VENDOR MATCHES "Intel10*"
+    OR BLAS_LIBRARIES MATCHES "/*mkl*")
+  # Don't require BLIS when MKL is used.
+  add_definitions(-DMKL)
+  add_definitions(-DBLAS_EXTERNAL)
+  add_definitions(-DF77_COMPLEX_RET_INTEL)
+  set(Require_BLIS OFF)
+elseif(BLA_VENDOR MATCHES "FLA*"
+    OR BLAS_LIBRARIES MATCHES "/*blis*")
+  # Skip extra BLIS if it's already the BLAS vendor.
+  list(GET BLAS_LIBRARIES 0 BLIS_FIRST_LIB)
+  get_filename_component(BLIS_LIB_DIR ${BLIS_FIRST_LIB} DIRECTORY)
+  include_directories(${BLIS_LIB_DIR}/../include)
+  include_directories(${BLIS_LIB_DIR}/../include/blis)
+  set(Require_BLIS OFF)
+else()
+  # BLAS vendor preference:
+  # External > BLIS > Reference
+  if(DEFINED BLA_VENDOR)
+    add_definitions(-DBLAS_EXTERNAL)
+  endif()
+endif()
+
 # Build and enable tests
 # testing setup
 # enable_testing() must be called in the top-level CMakeLists.txt before any add_subdirectory() is called.
@@ -97,13 +132,31 @@ if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
 endif()
 add_subdirectory("${STDFACE_DIR}")
 
+# C++ support for pfupdates & ltl2inv.
+enable_language(CXX)
+set(CMAKE_CXX_STANDARD 11)
+
+# Require Eigen
+find_package(Eigen3 3.4 REQUIRED NO_MODULE)
+include_directories(${EIGEN3_INCLUDE_DIR})
+# We surely want to offload mmuls to BLAS.
+add_definitions(-DEIGEN_USE_BLAS)
+
+if(Require_BLIS)
+  add_definitions(-D_BLIS)
+  include("download_blis_artifact.cmake")
+else(Require_BLIS)
+  # Use bundled blis.h
+  include_directories(src/common/deps)
+endif(Require_BLIS)
+
 if(PFAFFIAN_BLOCKED)
   add_definitions(-D_pf_block_update)
-  include("download_blis_artifact.cmake")
-  # Must set BLIS artifact BEFORE adding pfupdates target.
-  add_subdirectory(src/pfupdates)
-  add_dependencies(pfupdates blis_include)
 endif(PFAFFIAN_BLOCKED)
+add_subdirectory(src/pfupdates)
+if(Require_BLIS)
+  add_dependencies(pfupdates blis_include)
+endif(Require_BLIS)
 
 if (Document)
   add_subdirectory(doc)
@@ -111,5 +164,9 @@ endif(Document)
 
 add_subdirectory(src/ComplexUHF)
 add_subdirectory(src/pfapack/fortran)
+add_subdirectory(src/ltl2inv)
+  if(Require_BLIS)
+    add_dependencies(ltl2inv blis_include)
+  endif(Require_BLIS)
 add_subdirectory(src/mVMC)
 add_subdirectory(tool)
diff --git a/config/aocc.cmake b/config/aocc.cmake
index a53a783a..258d9772 100644
--- a/config/aocc.cmake
+++ b/config/aocc.cmake
@@ -9,6 +9,9 @@ set(CMAKE_C_FLAGS_RELEASE "-Wno-unknown-pragmas -O3 -DNDEBUG -DHAVE_SSE2" CACHE
 set(CMAKE_Fortran_COMPILER "flang" CACHE STRING "" FORCE)
 set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -DNDEBUG -DHAVE_SSE2" CACHE STRING "" FORCE)
 
+# OpenMP, libatomic
+set(CMAKE_EXE_LINKER_FLAGS "-fopenmp -latomic") 
+
 # for AOCL
 set(BLA_VENDOR "FLAME" CACHE STRING "" FORCE)
 
diff --git a/config/apple.cmake b/config/apple.cmake
index 73082407..37a7a290 100644
--- a/config/apple.cmake
+++ b/config/apple.cmake
@@ -2,7 +2,7 @@
 # additional libomp and gfortran installation required
 # mac computers are suggested to use this configuration for better performance
 
-if(NOT $ENV{HOMEBREW_PREFIX})
+if(NOT DEFINED ENV{HOMEBREW_PREFIX})
   message(FATAL "Homebrew is not installed. Please install Homebrew first.")
 endif()
 
@@ -10,7 +10,6 @@ set(CMAKE_C_COMPILER "clang" CACHE STRING "" FORCE)
 set(CMAKE_CXX_COMPILER "clang++" CACHE STRING "" FORCE)
 set(CMAKE_C_FLAGS_DEBUG "-g -O0 -Wall  -Wformat -Werror=format-security")
 set(CMAKE_C_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas -Wno-logical-not-parentheses")
-# set(CMAKE_Fortran_COMPILER "gfortran" CACHE STRING "" FORCE)
 
 # OpenMP with libomp
 set(CMAKE_EXE_LINKER_FLAGS "-L$ENV{HOMEBREW_PREFIX}/opt/libomp/lib -lomp") 
diff --git a/config/gcc.cmake b/config/gcc.cmake
index 751912d7..8a15d2dc 100644
--- a/config/gcc.cmake
+++ b/config/gcc.cmake
@@ -3,4 +3,5 @@ set(CMAKE_C_COMPILER "gcc" CACHE STRING "" FORCE)
 set(CMAKE_CXX_COMPILER "g++" CACHE STRING "" FORCE)
 set(CMAKE_C_FLAGS_DEBUG "-g -O0 -Wall  -Wformat -Werror=format-security")
 set(CMAKE_C_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas ")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas ")
 set(CMAKE_Fortran_COMPILER "gfortran" CACHE STRING "" FORCE)
diff --git a/download_blis_artifact.cmake b/download_blis_artifact.cmake
index f6274bcc..b2033f92 100644
--- a/download_blis_artifact.cmake
+++ b/download_blis_artifact.cmake
@@ -4,20 +4,8 @@ if(NOT DEFINED BLIS_ARTIFACT_CONFIG)
     COMMAND tr -d '\n'
     OUTPUT_VARIABLE ARCHITECTURE)
   if(${ARCHITECTURE} STREQUAL "x86_64")
-    execute_process(
-      COMMAND cat /proc/cpuinfo
-      COMMAND grep Intel
-      COMMAND head -1
-      COMMAND tr -d '\n'
-      OUTPUT_VARIABLE UARCH_INTEL)
-    if(NOT "${UARCH_INTEL} " STREQUAL " ")
-      set(BLIS_ARTIFACT_CONFIG "intel64")
-    else()
-      set(BLIS_ARTIFACT_CONFIG "amd64")
-    endif()
-  elseif(${ARCHITECTURE} STREQUAL "aarch64" OR
-      ${ARCHITECTURE} STREQUAL "arm64")
-      # Currently SVE requires being set manually.
+      set(BLIS_ARTIFACT_CONFIG "x86_64")
+  elseif(${ARCHITECTURE} STREQUAL "aarch64" OR ${ARCHITECTURE} STREQUAL "arm64")
       set(BLIS_ARTIFACT_CONFIG "arm64")
   else()
     message(FATAL_ERROR "Failed to recognize architecture ${ARCHITECTURE}.")
@@ -27,14 +15,10 @@ endif(NOT DEFINED BLIS_ARTIFACT_CONFIG)
 message(STATUS "Downloading BLIS artifact...")
 
 set(BLIS_ARCHIVE ${CMAKE_CURRENT_BINARY_DIR}/libblis_artifact.tar.gz)
-if(${BLIS_ARTIFACT_CONFIG} STREQUAL "intel64")
-  set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_intel64_gcc.tar.gz)
-elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "amd64")
-  set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_amd64_gcc.tar.gz)
-elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "arm64")
-  set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_cortexa57_aarch64-linux-gnu-gcc.tar.gz)
-elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "armsve")
-  set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_armsve_aarch64-linux-gnu-gcc-10.tar.gz)
+if(${BLIS_ARTIFACT_CONFIG} STREQUAL "x86_64")
+  set(BLIS_ARTIFACT_URL https://github.com/JuliaBinaryWrappers/blis_jll.jl/releases/download/blis-v0.9.0%2B4/blis.v0.9.0.x86_64-linux-gnu.tar.gz)
+elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "arm64" OR ${BLIS_ARTIFACT_CONFIG} STREQUAL "armsve")
+  set(BLIS_ARTIFACT_URL https://github.com/JuliaBinaryWrappers/blis_jll.jl/releases/download/blis-v0.9.0%2B4/blis.v0.9.0.aarch64-linux-gnu.tar.gz)
 elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "a64fx")
   set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_a64fx_aarch64-linux-gnu-gcc-10.tar.gz)
 else()
@@ -53,6 +37,7 @@ add_custom_target(blis_target
   DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/lib/libblis.a)
 add_custom_target(blis_include
   DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/include/blis/blis.h)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/blis)
 
 add_library(blis STATIC IMPORTED GLOBAL)
diff --git a/mVMCconfig.sh b/mVMCconfig.sh
index f731ccc7..1e0d7a57 100755
--- a/mVMCconfig.sh
+++ b/mVMCconfig.sh
@@ -40,8 +40,8 @@ F90 = frt
 FFLAGS = -DNDEBUG -DFUJITSU -Kfast
 CFLAGS = -DNDEBUG -Ofast -fopenmp
 CXXFLAGS = -DNDEBUG -Ofast -fopenmp
-CFLAGS += -D_lapack -D_pf_block_update # -D_pfaffine
-CXXFLAGS += -DBLAS_EXTERNAL
+CFLAGS += -D_lapack -D_pf_block_update
+CXXFLAGS += -DBLAS_EXTERNAL -D_BLIS -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.so -SSL2 -lm -lpthread
@@ -74,8 +74,8 @@ F90 = ifort
 FFLAGS = -O3 -implicitnone
 CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas
 CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
-CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL
+CFLAGS += -D_lapack -D_pf_block_update
+CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -mkl=sequential -lm -lpthread
@@ -100,8 +100,8 @@ F90 = ifort
 FFLAGS = -O3 -implicitnone
 CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas
 CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
-CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL
+CFLAGS += -D_lapack -D_pf_block_update
+CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -mkl=sequential -lm -lpthread
@@ -124,8 +124,8 @@ F90 = flang
 FFLAGS = -O3
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3 -fPIC
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
-CXXFLAGS +=
+CFLAGS += -D_lapack -D_pf_block_update
+CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread
@@ -148,8 +148,8 @@ F90 = gfortran
 FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3 -fPIC
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
-CXXFLAGS +=
+CFLAGS += -D_lapack -D_pf_block_update
+CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread
@@ -172,8 +172,8 @@ F90 = gfortran
 FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
-CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL
+CFLAGS += -D_lapack -D_pf_block_update
+CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -D_BLIS -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lm -lpthread
@@ -196,8 +196,8 @@ F90 = gfortran
 FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
-CXXFLAGS +=
+CFLAGS += -D_lapack -D_pf_block_update
+CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread
@@ -220,8 +220,8 @@ F90 = gfortran
 FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
-CXXFLAGS += # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple.
+CFLAGS += -D_lapack -D_pf_block_update
+CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple.
 
 BLIS_ROOT = ${PWD}/src/blis-install
 LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread
@@ -242,7 +242,6 @@ EOF
     cat src/make.sys
     cat src/make-ext.sys
     ln -s $PWD/src/make.sys src/StdFace/make.sys;
-    ln -s $PWD/src/make.sys src/pfaffine/make.inc;
     ln -s $PWD/src/make.sys src/pfapack/fortran/make.inc;
 
     cat > makefile <<EOF
diff --git a/src/blis b/src/blis
index a32257ee..aeb5f0cc 160000
--- a/src/blis
+++ b/src/blis
@@ -1 +1 @@
-Subproject commit a32257eeab2e9946e71546a05a1847a39341ec6b
+Subproject commit aeb5f0cc19665456e990a7ffccdb09da2e3f504b
diff --git a/src/common/deps/blis.h b/src/common/deps/blis.h
new file mode 100644
index 00000000..0627afe2
--- /dev/null
+++ b/src/common/deps/blis.h
@@ -0,0 +1,552 @@
+/*
+ * blis.h
+   This header is cut out from BLIS v0.8.0,
+    containing necessary type definitions & macros.
+   User can use this file to avert BLIS dependency
+    if their BLAS library vendors a GEMMT implementation.
+   Here begins 3-clause BSD-style license of BLIS:
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016, Hewlett Packard Enterprise Development LP
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_H
+#define BLIS_H
+
+
+// Allow C++ users to include this header file in their source code. However,
+// we make the extern "C" conditional on whether we're using a C++ compiler,
+// since regular C compilers don't understand the extern "C" construct.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS
+// YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES.
+
+#include <stdio.h> // skipped
+#include <stdlib.h> // skipped
+#include <math.h> // skipped
+#include <string.h> // skipped
+#include <stdarg.h> // skipped
+#include <float.h> // skipped
+#include <errno.h> // skipped
+#include <ctype.h> // skipped
+#include <assert.h> // skipped
+
+// -- STATIC INLINE FUNCTIONS --------------------------------------------------
+
+// C and C++ have different semantics for defining "inline" functions. In C,
+// the keyword phrase "static inline" accomplishes this, though the "inline"
+// is optional. In C++, the "inline" keyword is required and obviates "static"
+// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
+// be #included by a source file that is compiled with C++.
+#ifdef __cplusplus
+  #define BLIS_INLINE inline
+#else
+  //#define BLIS_INLINE static inline
+  #define BLIS_INLINE static
+#endif
+
+// -- Common BLIS definitions --
+
+// begin bli_type_defs.h
+
+
+#ifndef BLIS_TYPE_DEFS_H
+#define BLIS_TYPE_DEFS_H
+
+
+//
+// -- BLIS basic types ---------------------------------------------------------
+//
+
+#ifdef __cplusplus
+  // For C++, include stdint.h.
+#include <stdint.h> // skipped
+#elif __STDC_VERSION__ >= 199901L
+  // For C99 (or later), include stdint.h.
+#include <stdint.h> // skipped
+#include <stdbool.h> // skipped
+#else
+  // When stdint.h is not available, manually typedef the types we will use.
+  #ifdef _WIN32
+    typedef          __int32  int32_t;
+    typedef unsigned __int32 uint32_t;
+    typedef          __int64  int64_t;
+    typedef unsigned __int64 uint64_t;
+  #else
+    #error "Attempting to compile on pre-C99 system without stdint.h."
+  #endif
+#endif
+
+// -- General-purpose integers --
+
+// If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits.
+// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
+// systems where the BLIS integer size would have been automatically selected
+// to be 32 bits, since explicit selection of 32 bits is prohibited at
+// configure-time (and explicit or automatic selection of 64 bits is fine
+// and would have had the same result).
+#if BLIS_BLAS_INT_SIZE == 64
+  #undef  BLIS_INT_TYPE_SIZE
+  #define BLIS_INT_TYPE_SIZE 64
+#endif
+
+// Define integer types depending on what size integer was requested.
+#if   BLIS_INT_TYPE_SIZE == 32
+typedef           int32_t  gint_t;
+typedef          uint32_t guint_t;
+#elif BLIS_INT_TYPE_SIZE == 64
+typedef           int64_t  gint_t;
+typedef          uint64_t guint_t;
+#else
+typedef   signed long int  gint_t;
+typedef unsigned long int guint_t;
+#endif
+
+// -- Boolean type --
+
+// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
+//typedef bool bool_t;
+
+// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
+// define these macros in terms of true and false, respectively, which are
+// defined by C99 in stdbool.h.
+#ifndef TRUE
+  #define TRUE  true
+#endif
+
+#ifndef FALSE
+  #define FALSE false
+#endif
+
+// -- Special-purpose integers --
+
+// This cpp guard provides a temporary hack to allow libflame
+// interoperability with BLIS.
+#ifndef _DEFINED_DIM_T
+#define _DEFINED_DIM_T
+typedef   gint_t dim_t;      // dimension type
+#endif
+typedef   gint_t inc_t;      // increment/stride type
+typedef   gint_t doff_t;     // diagonal offset type
+typedef  guint_t siz_t;      // byte size type
+typedef uint32_t objbits_t;  // object information bit field
+
+// -- Real types --
+
+// Define the number of floating-point types supported, and the size of the
+// largest type.
+#define BLIS_NUM_FP_TYPES   4
+#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)
+
+// There are some places where we need to use sizeof() inside of a C
+// preprocessor #if conditional, and so here we define the various sizes
+// for those purposes.
+#define BLIS_SIZEOF_S      4  // sizeof(float)
+#define BLIS_SIZEOF_D      8  // sizeof(double)
+#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
+#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)
+
+// -- Complex types --
+
+#ifdef BLIS_ENABLE_C99_COMPLEX
+
+	#if __STDC_VERSION__ >= 199901L
+#include <complex.h> // skipped
+
+		// Typedef official complex types to BLIS complex type names.
+		typedef  float complex scomplex;
+		typedef double complex dcomplex;
+	#else
+		#error "Configuration requested C99 complex types, but C99 does not appear to be supported."
+	#endif
+
+#else // ifndef BLIS_ENABLE_C99_COMPLEX
+
+	// This cpp guard provides a temporary hack to allow libflame
+	// interoperability with BLIS.
+	#ifndef _DEFINED_SCOMPLEX
+	#define _DEFINED_SCOMPLEX
+	typedef struct
+	{
+		float  real;
+		float  imag;
+	} scomplex;
+	#endif
+
+	// This cpp guard provides a temporary hack to allow libflame
+	// interoperability with BLIS.
+	#ifndef _DEFINED_DCOMPLEX
+	#define _DEFINED_DCOMPLEX
+	typedef struct
+	{
+		double real;
+		double imag;
+	} dcomplex;
+	#endif
+
+#endif // BLIS_ENABLE_C99_COMPLEX
+
+// -- Atom type --
+
+// Note: atom types are used to hold "bufferless" scalar object values. Note
+// that it needs to be as large as the largest possible scalar value we might
+// want to hold. Thus, for now, it is a dcomplex.
+typedef dcomplex atom_t;
+
+// -- Fortran-77 types --
+
+// Note: These types are typically only used by BLAS compatibility layer, but
+// we must define them even when the compatibility layer isn't being built
+// because they also occur in bli_slamch() and bli_dlamch().
+
+// Define f77_int depending on what size of integer was requested.
+#if   BLIS_BLAS_INT_TYPE_SIZE == 32
+typedef int32_t   f77_int;
+#elif BLIS_BLAS_INT_TYPE_SIZE == 64
+typedef int64_t   f77_int;
+#else
+typedef long int  f77_int;
+#endif
+
+typedef char      f77_char;
+typedef float     f77_float;
+typedef double    f77_double;
+typedef scomplex  f77_scomplex;
+typedef dcomplex  f77_dcomplex;
+
+// -- Void function pointer types --
+
+// Note: This type should be used in any situation where the address of a
+// *function* will be conveyed or stored prior to it being typecast back
+// to the correct function type. It does not need to be used when conveying
+// or storing the address of *data* (such as an array of float or double).
+
+//typedef void (*void_fp)( void );
+typedef void* void_fp;
+
+
+//
+// -- BLIS info bit field offsets ----------------------------------------------
+//
+
+
+
+// info
+#define BLIS_DATATYPE_SHIFT                0
+#define   BLIS_DOMAIN_SHIFT                0
+#define   BLIS_PRECISION_SHIFT             1
+#define BLIS_CONJTRANS_SHIFT               3
+#define   BLIS_TRANS_SHIFT                 3
+#define   BLIS_CONJ_SHIFT                  4
+#define BLIS_UPLO_SHIFT                    5
+#define   BLIS_UPPER_SHIFT                 5
+#define   BLIS_DIAG_SHIFT                  6
+#define   BLIS_LOWER_SHIFT                 7
+#define BLIS_UNIT_DIAG_SHIFT               8
+#define BLIS_INVERT_DIAG_SHIFT             9
+#define BLIS_TARGET_DT_SHIFT               10
+#define   BLIS_TARGET_DOMAIN_SHIFT         10
+#define   BLIS_TARGET_PREC_SHIFT           11
+#define BLIS_EXEC_DT_SHIFT                 13
+#define   BLIS_EXEC_DOMAIN_SHIFT           13
+#define   BLIS_EXEC_PREC_SHIFT             14
+#define BLIS_PACK_SCHEMA_SHIFT             16
+#define   BLIS_PACK_RC_SHIFT               16
+#define   BLIS_PACK_PANEL_SHIFT            17
+#define   BLIS_PACK_FORMAT_SHIFT           18
+#define   BLIS_PACK_SHIFT                  22
+#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
+#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
+#define BLIS_PACK_BUFFER_SHIFT             25
+#define BLIS_STRUC_SHIFT                   27
+#define BLIS_COMP_DT_SHIFT                 29
+#define   BLIS_COMP_DOMAIN_SHIFT           29
+#define   BLIS_COMP_PREC_SHIFT             30
+
+// info2
+#define BLIS_SCALAR_DT_SHIFT                0
+#define   BLIS_SCALAR_DOMAIN_SHIFT          0
+#define   BLIS_SCALAR_PREC_SHIFT            1
+
+//
+// -- BLIS info bit field masks ------------------------------------------------
+//
+
+// info
+#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
+#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
+#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
+#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
+#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
+#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
+#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
+#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
+#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
+#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
+#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
+#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
+#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
+#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
+#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
+#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
+#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
+#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
+#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
+#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
+#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
+#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
+#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
+#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
+#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
+#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
+#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
+#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )
+
+// info2
+#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
+#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
+#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )
+
+
+//
+// -- BLIS enumerated type value definitions -----------------------------------
+//
+
+#define BLIS_BITVAL_REAL                      0x0
+#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
+#define BLIS_BITVAL_SINGLE_PREC               0x0
+#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
+#define   BLIS_BITVAL_FLOAT_TYPE              0x0
+#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT  
+#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
+#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
+#define   BLIS_BITVAL_INT_TYPE                0x04
+#define   BLIS_BITVAL_CONST_TYPE              0x05
+#define BLIS_BITVAL_NO_TRANS                  0x0
+#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
+#define BLIS_BITVAL_NO_CONJ                   0x0
+#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
+#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
+#define BLIS_BITVAL_ZEROS                     0x0 
+#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
+#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
+#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS  
+#define BLIS_BITVAL_NONUNIT_DIAG              0x0
+#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
+#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
+#define BLIS_BITVAL_NOT_PACKED                0x0
+#define   BLIS_BITVAL_4MI                   ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_3MI                   ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_4MS                   ( 0x3  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_3MS                   ( 0x4  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_RO                    ( 0x5  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_IO                    ( 0x6  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_RPI                   ( 0x7  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_1E                    ( 0x8  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_1R                    ( 0x9  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
+#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
+#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_RO  ( BLIS_PACK_BIT | BLIS_BITVAL_RO  | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_RO  ( BLIS_PACK_BIT | BLIS_BITVAL_RO  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_IO  ( BLIS_PACK_BIT | BLIS_BITVAL_IO  | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_IO  ( BLIS_PACK_BIT | BLIS_BITVAL_IO  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
+#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
+#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
+#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
+#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
+#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_BITVAL_GENERAL                   0x0
+#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
+#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
+#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )
+
+
+//
+// -- BLIS enumerated type definitions -----------------------------------------
+//
+
+// -- Operational parameter types --
+
+typedef enum
+{
+	BLIS_NO_TRANSPOSE      = 0x0,
+	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
+	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
+	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
+} trans_t;
+
+typedef enum
+{
+	BLIS_NO_CONJUGATE      = 0x0,
+	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
+} conj_t;
+
+typedef enum
+{
+	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
+	BLIS_LOWER             = BLIS_BITVAL_LOWER,
+	BLIS_UPPER             = BLIS_BITVAL_UPPER,
+	BLIS_DENSE             = BLIS_BITVAL_DENSE
+} uplo_t;
+
+typedef enum
+{
+	BLIS_LEFT              = 0x0,
+	BLIS_RIGHT
+} side_t;
+
+typedef enum
+{
+	BLIS_NONUNIT_DIAG      = 0x0,
+	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
+} diag_t;
+
+typedef enum
+{
+	BLIS_NO_INVERT_DIAG    = 0x0,
+	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
+} invdiag_t;
+
+typedef enum
+{
+	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
+	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
+	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
+	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
+} struc_t;
+
+
+// -- Data type --
+
+typedef enum
+{
+	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
+	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
+	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
+	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
+	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
+	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
+	BLIS_DT_LO             = BLIS_FLOAT,
+	BLIS_DT_HI             = BLIS_DCOMPLEX
+} num_t;
+
+typedef enum
+{
+	BLIS_REAL              = BLIS_BITVAL_REAL,
+	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
+} dom_t;
+
+typedef enum
+{
+	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
+	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
+} prec_t;
+
+#endif
+// end bli_type_defs.h
+
+// begin bli_param_map.h
+BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side )
+{
+	if      ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT;
+	else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT;
+	else
+	{
+		*blis_side = BLIS_LEFT;
+	}
+}
+
+BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo )
+{
+	if      ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER;
+	else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER;
+	else
+	{
+		*blis_uplo = BLIS_LOWER;
+	}
+}
+
+BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans )
+{
+	if      ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE;
+	else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE;
+	else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE;
+	else
+	{
+		*blis_trans = BLIS_NO_TRANSPOSE;
+	}
+}
+
+BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag )
+{
+	if      ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG;
+	else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG;
+	else
+	{
+		*blis_diag = BLIS_NONUNIT_DIAG;
+	}
+}
+// end bli_param_map.h
+
+// End extern "C" construct block.
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/ltl2inv/CMakeLists.txt b/src/ltl2inv/CMakeLists.txt
new file mode 100644
index 00000000..165ea01c
--- /dev/null
+++ b/src/ltl2inv/CMakeLists.txt
@@ -0,0 +1,15 @@
+# include guard
+cmake_minimum_required(VERSION 3.0)
+
+if(${CMAKE_PROJECT_NAME} STREQUAL "Project")
+  message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.")
+endif(${CMAKE_PROJECT_NAME} STREQUAL "Project")
+
+if("${ARCHITECTURE}" STREQUAL "x86_64")
+  add_definitions(-DF77_COMPLEX_RET_INTEL)
+endif()
+include_directories(../pfapack/c_interface)
+
+add_library(ltl2inv STATIC ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc)
+target_compile_definitions(ltl2inv PRIVATE -D_CC_IMPL)
+
diff --git a/src/ltl2inv/eigen_gemmt.tcc b/src/ltl2inv/eigen_gemmt.tcc
new file mode 100644
index 00000000..4e569df9
--- /dev/null
+++ b/src/ltl2inv/eigen_gemmt.tcc
@@ -0,0 +1,77 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "lapack2eigen.hh"
+#include "ilaenv_lauum.hh"
+
+
+template <typename T>
+inline void gemmt(const char &uploC,
+                  const char &transA,
+                  const char &transB,
+                  const T &alpha,
+                  const l2e::le_mat_t<T> &A_,
+                  const l2e::le_mat_t<T> &B_,
+                  const T &beta,
+                        l2e::le_mat_t<T> &C_)
+{
+    using LProxy = l2e::le_mat_t<T>;
+    using Matrix = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+    const int npanel = ilaenv_lauum<T>(uploC, C_.rows);
+
+    bool conjA = transA == 'C' || transA == 'c';
+    bool conjB = transB == 'C' || transB == 'c';
+
+    const auto &A = (transA == 'T' || transA == 't') ? A_.transpose().to_eigen() : A_.to_eigen();
+    const auto &B = (transB == 'T' || transB == 't') ? B_.transpose().to_eigen() : B_.to_eigen();
+    auto        C =                                                                C_.to_eigen();
+
+    if (uploC == 'L' || uploC == 'l') {
+
+        for (int ipanel = 0; ipanel < A.rows(); ipanel += npanel) {
+            int sz_panel = std::min<int>(npanel, A.rows() - ipanel);
+
+            // Diagonal.
+            const auto &Apanel = A(Eigen::seq(ipanel, ipanel+sz_panel-1), Eigen::all);
+            const auto &Bpanel = B(Eigen::all, Eigen::seq(ipanel, ipanel+sz_panel-1));
+            const auto &Cpanel = C(Eigen::seq(ipanel, ipanel+sz_panel-1),
+                                   Eigen::seq(ipanel, ipanel+sz_panel-1));
+            Matrix Cupdated = Cpanel * beta + Apanel * Bpanel * alpha;
+            LProxy Cpanel_(Cpanel);
+            lacpy(uploC, LProxy(Cupdated), Cpanel_);
+
+            // Off-diagonal.
+            const auto &Aremain = A(Eigen::seq(ipanel+sz_panel, A.rows()-1), Eigen::all);
+            const auto &Bremain = B(Eigen::all, Eigen::seq(ipanel+sz_panel, B.cols()-1));
+            auto        Cremain = C(Eigen::seq(ipanel+sz_panel, C.rows()-1),
+                                    Eigen::seq(ipanel, ipanel+sz_panel-1));
+            Cremain = Cremain * beta + Aremain * Bpanel * alpha;
+        }
+
+    } else { // (uploC == 'U' || uploC == 'u')
+
+        for (int ipanel = (A.rows()-1) / npanel * npanel; ipanel >= 0; ipanel -= npanel) {
+            int sz_panel = std::min<int>(npanel, A.rows() - ipanel);
+
+            // Diagonal.
+            const auto &Apanel = A(Eigen::seq(ipanel, ipanel+sz_panel-1), Eigen::all);
+            const auto &Bpanel = B(Eigen::all, Eigen::seq(ipanel, ipanel+sz_panel-1));
+            const auto &Cpanel = C(Eigen::seq(ipanel, ipanel+sz_panel-1),
+                                   Eigen::seq(ipanel, ipanel+sz_panel-1));
+            Matrix Cupdated = Cpanel * beta + Apanel * Bpanel * alpha;
+            LProxy Cpanel_(Cpanel);
+            lacpy(uploC, LProxy(Cupdated), Cpanel_);
+
+            // Off-diagonal.
+            const auto &Aremain = A(Eigen::seq(0, ipanel-1), Eigen::all);
+            const auto &Bremain = B(Eigen::all, Eigen::seq(0, ipanel-1));
+            auto        Cremain = C(Eigen::seq(0, ipanel-1),
+                                    Eigen::seq(ipanel, ipanel+sz_panel-1));
+            Cremain = Cremain * beta + Aremain * Bpanel * alpha;
+        }
+
+    }
+}
diff --git a/src/ltl2inv/error.hh b/src/ltl2inv/error.hh
new file mode 100644
index 00000000..5c0a06e4
--- /dev/null
+++ b/src/ltl2inv/error.hh
@@ -0,0 +1,42 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include <cstdlib>
+#include <iostream>
+
+
+inline void abort(const char *msg)
+{
+  std::cout << msg << std::endl;
+  abort();
+}
+
+inline void abort(const char *msg0, const char *msg1)
+{
+  std::cout << msg0 << ": " << msg1 << std::endl;
+  abort();
+}
+
+#if (defined(_DEBUG) || !defined(SKIP_CHECKS))
+
+inline void assert_(const bool cond, const char *msg)
+{
+  if (!cond) abort(msg);
+}
+
+inline void assert_(const bool cond, const char *msg0, const char *msg1)
+{
+  if (!cond) abort(msg0, msg1);
+}
+
+#else
+
+inline void assert_(const bool cond, const char *msg) { }
+
+inline void assert_(const bool cond, const char *msg0, const char *msg1) { }
+
+#endif
+
diff --git a/src/ltl2inv/gemmt2blas.cc b/src/ltl2inv/gemmt2blas.cc
new file mode 100644
index 00000000..56cd6fe4
--- /dev/null
+++ b/src/ltl2inv/gemmt2blas.cc
@@ -0,0 +1,35 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#include "lapack2eigen.hh"
+
+
+#if !defined(_BLAS_HAS_GEMMT)
+// Import the customized GEMMT impl. to the BLAS API.
+// Required by Pfapack77.
+#define BLAGEN_MAC(cctype, ctype, cchar) \
+    extern "C" \
+    void cchar##gemmt_(const char *uplo_, \
+                       const char *transa_, \
+                       const char *transb_, \
+                       const int *m, const int *k, \
+                       const ctype *alpha, \
+                       ctype *a, const int *lda, \
+                       ctype *b, const int *ldb, \
+                       const ctype *beta, ctype *c, const int *ldc) \
+    { \
+        bool tA = (*transa_ == 'T' || *transa_ == 't'); \
+        bool tB = (*transb_ == 'T' || *transb_ == 't'); \
+        l2e::le_mat_t<cctype> A(tA ? *k : *m, tA ? *m : *k, 1, *lda, (cctype *)a); \
+        l2e::le_mat_t<cctype> B(tB ? *k : *m, tB ? *m : *k, 1, *ldb, (cctype *)b); \
+        l2e::le_mat_t<cctype> C(*m, *m, 1, *ldc, (cctype *)c); \
+        gemmt<cctype>(*uplo_, *transa_, *transb_, *(cctype *)alpha, A, B, *(cctype *)beta, C); \
+    }
+BLAGEN_MAC( float,    float,    s )
+BLAGEN_MAC( double,   double,   d )
+BLAGEN_MAC( ccscmplx, scomplex, c )
+BLAGEN_MAC( ccdcmplx, dcomplex, z )
+#undef BLAGEN_MAC
+#endif
diff --git a/src/ltl2inv/ilaenv.h b/src/ltl2inv/ilaenv.h
new file mode 100644
index 00000000..add0a1ed
--- /dev/null
+++ b/src/ltl2inv/ilaenv.h
@@ -0,0 +1,15 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ilaenv_(const int *ispec, const char *name, const char *opts, const int *n1, const int *n2, const int *n3, const int *n4);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/ltl2inv/ilaenv_lauum.cc b/src/ltl2inv/ilaenv_lauum.cc
new file mode 100644
index 00000000..6dcb8b90
--- /dev/null
+++ b/src/ltl2inv/ilaenv_lauum.cc
@@ -0,0 +1,25 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#include "ilaenv.h"
+#include "ilaenv_lauum.hh"
+#include <complex>
+using ccscmplx = std::complex<float>;
+using ccdcmplx = std::complex<double>;
+
+
+#define EXPANDMAC(cctype, name) \
+template <> int ilaenv_lauum<cctype>(const char uplo, const int n) \
+{ \
+    int ispec = 1; \
+    int dummy = 0; \
+    return ilaenv_(&ispec, #name, &uplo, &n, &dummy, &dummy, &dummy); \
+}
+EXPANDMAC( float,    SLAUUM )
+EXPANDMAC( double,   DLAUUM )
+EXPANDMAC( ccscmplx, CLAUUM )
+EXPANDMAC( ccdcmplx, ZLAUUM )
+#undef EXPANDMAC
+
diff --git a/src/ltl2inv/ilaenv_lauum.hh b/src/ltl2inv/ilaenv_lauum.hh
new file mode 100644
index 00000000..46a59b0f
--- /dev/null
+++ b/src/ltl2inv/ilaenv_lauum.hh
@@ -0,0 +1,10 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+
+template <typename T>
+int ilaenv_lauum(const char uplo, const int n);
+
diff --git a/src/ltl2inv/invert.tcc b/src/ltl2inv/invert.tcc
new file mode 100644
index 00000000..ba3bfd8a
--- /dev/null
+++ b/src/ltl2inv/invert.tcc
@@ -0,0 +1,144 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "lapack2eigen.hh"
+#include "trmmt.tcc"
+#include "error.hh"
+
+template <typename Vec, typename Mat, typename Mat_>
+void sktdsmx(const Vec &vT, const Mat &B, Mat_ &C)
+{
+  int n = B.rows();
+  assert_(B.cols() == n && C.cols() == n && C.rows() == n, "sktdsmx: Dimension mismatch.");
+
+  for (int j = 0; j < n; ++j)
+    C(1, j) = B(0, j) / -vT[0];
+  for (int i = 2; i < n; i += 2)
+    for (int j = 0; j < n; ++j)
+      C(i+1, j) = (B(i, j) - C(i-1, j) * vT[i-1]) / -vT[i];
+
+  for (int j = 0; j < n; ++j)
+    C(n-2, j) = B(n-1, j) / vT[n-2];
+  for (int i = n-3; i >= 0; i -= 2)
+    for (int j = 0; j < n; ++j)
+      C(i-1, j) = (B(i, j) + C(i+1, j) * vT[i]) / vT[i-1];
+}
+
+template <typename iVec, typename Vec, typename Mat, typename Mat_>
+void ltl2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M)
+{
+  using namespace Eigen;
+  using T = typename Mat::Scalar;
+  int n = A.rows();
+  int npanel_trmm = ilaenv_lauum<T>('L', n);
+  int full = npanel_trmm <= 1 || npanel_trmm >= n;
+
+  // Set M.
+  for (int j = 0; j < n; ++j)
+    for (int i = 0; i < n; ++i)
+      M(i, j) = T(!(i - j));
+
+  l2e::le_mat_t<T> A_(A);
+  l2e::le_mat_t<T> M_(M);
+  l2e::le_mat_t<T> A_Lx(A(seq(1, n-1), seq(0, n-2)));
+  l2e::le_mat_t<T> A_L(A(seq(2, n-1), seq(0, n-3)));
+  l2e::le_mat_t<T> M_L(M(seq(2, n-1), seq(1, n-2)));
+
+  trtri('L', 'U', A_Lx);
+  lacpy('L', A_L, M_L);
+
+  for (int i = 0; i < n-1; ++i)
+    vT[i] = A(i+1, i);
+  sktdsmx(vT, M, A);
+
+  if (!full) {
+    trmmt('L', 'U', T(1.0), M, A);
+    for (int j = 0; j < n; ++j) {
+      A(j, j) = 0.0;
+      for (int i = 0; i < j; ++i)
+        A(i, j) = -A(j, i);
+    }
+  }
+
+  // In-place permute columns.
+  for (int j = n-1; j >= 0; --j)
+    if (iPiv[j]-1 != j)
+      A(Eigen::all, j).swap(A(Eigen::all, iPiv[j] - 1));
+
+  if (full)
+    A = M.template triangularView<UnitLower>().transpose() * A;
+
+  // In-place permute rows.
+  for (int i = n-1; i >= 0; --i)
+    if (iPiv[i]-1 != i)
+      A(i, Eigen::all).swap(A(iPiv[i] - 1, Eigen::all));
+}
+
+template <typename iVec, typename Vec, typename Mat, typename Mat_>
+void utu2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M)
+{
+  using namespace Eigen;
+  using T = typename Mat::Scalar;
+  int n = A.rows();
+  int npanel_trmm = ilaenv_lauum<T>('U', n);
+  int full = npanel_trmm <= 1 || npanel_trmm >= n;
+
+  // Set M.
+  for (int j = 0; j < n; ++j)
+    for (int i = 0; i < n; ++i)
+      M(i, j) = T(!(i - j));
+
+  l2e::le_mat_t<T> A_(A);
+  l2e::le_mat_t<T> M_(M);
+  l2e::le_mat_t<T> A_Ux(A(seq(0, n-2), seq(1, n-1)));
+  l2e::le_mat_t<T> A_U(A(seq(0, n-3), seq(2, n-1)));
+  l2e::le_mat_t<T> M_U(M(seq(0, n-3), seq(1, n-2)));
+
+  trtri('U', 'U', A_Ux);
+  lacpy('U', A_U, M_U);
+
+  for (int i = 0; i < n-1; ++i)
+    vT[i] = -A(i, i+1);
+  sktdsmx(vT, M, A);
+
+  if (!full) {
+    trmmt('U', 'U', T(1.0), M, A);
+      for (int j = 0; j < n; ++j) {
+        A(j, j) = 0.0;
+        for (int i = j + 1; i < n; ++i)
+          A(i, j) = -A(j, i);
+      }
+  }
+
+  // In-place permute columns.
+  for (int j = 0; j < n; ++j)
+    if (iPiv[j]-1 != j)
+      A(Eigen::all, j).swap(A(Eigen::all, iPiv[j]-1));
+
+  if (full)
+    A = M.template triangularView<UnitUpper>().transpose() * A;
+
+  // In-place permute rows.
+  for (int i = 0; i < n; ++i)
+    if (iPiv[i]-1 != i)
+      A(i, Eigen::all).swap(A(iPiv[i]-1, Eigen::all));
+}
+
+template <typename iVec, typename Vec, typename Mat, typename Mat_>
+void ltl2inv(const char uplo, Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M)
+{
+  switch (uplo) {
+  case 'l':
+  case 'L':
+    ltl2inv(A, iPiv, vT, M); break;
+  case 'u':
+  case 'U':
+    utu2inv(A, iPiv, vT, M); break;
+  default:
+    break;
+  }
+}
+
diff --git a/src/ltl2inv/lapack2eigen.hh b/src/ltl2inv/lapack2eigen.hh
new file mode 100644
index 00000000..1d850068
--- /dev/null
+++ b/src/ltl2inv/lapack2eigen.hh
@@ -0,0 +1,272 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include <Eigen/Dense>
+#include <cstdint>
+#ifndef EIGEN_USE_LAPACKE // This lets Eigen/Dense include LAPACK definitions.
+#include <Eigen/src/misc/lapacke.h>
+#endif
+#ifdef _BLIS
+#define BLIS_DISABLE_BLAS // Screen BLAS definition even if BLIS' built with them.
+#include "blis/blis.h"
+#else
+using Eigen::scomplex;
+using Eigen::dcomplex;
+#endif
+#include "error.hh"
+#include "fortran_pfapack.h"
+using f77_int = lapack_int;
+using ccscmplx = std::complex<float>;
+using ccdcmplx = std::complex<double>;
+
+namespace l2e
+{
+
+template <typename T>
+struct le_mat_t
+{
+    const f77_int rows, cols; ///< Matrix size.
+    const f77_int rs, cs; ///< Strides.
+
+    T *const data;
+
+protected:
+    template <typename Mat>
+    static f77_int _rs(const Mat &mat)
+    {
+        f77_int rs = 0;
+        if (mat.rows() > 1)
+            rs = &(mat(1, 0)) - &(mat(0, 0));
+        return rs;
+    }
+    template <typename Mat>
+    static f77_int _cs(const Mat &mat)
+    {
+        f77_int cs = 0;
+        if (mat.cols() > 1)
+            cs = &(mat(0, 1)) - &(mat(0, 0));
+        return cs;
+    }
+
+public:
+    le_mat_t(const f77_int &rows_, const f77_int &cols_,
+             const f77_int &rs_, const f77_int &cs_, T *const data_)
+    : rows(rows_), cols(cols_), rs(rs_), cs(cs_), data(data_) { }
+
+    template <int RowsAtCompilation, int ColsAtCompilation>
+    le_mat_t(Eigen::Matrix<T, RowsAtCompilation, ColsAtCompilation> &mat)
+    : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { }
+
+    template <typename Mat, int BlockRowsAtCompil, int BlockColsAtCompil, bool BlockProps>
+    le_mat_t(Eigen::Block<Mat, BlockRowsAtCompil, BlockColsAtCompil, BlockProps> mat) // TODO: Check for data copying.
+    : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { }
+
+    template <int RowsAtCompilation, int ColsAtCompilation, int Alignment, typename Stride>
+    le_mat_t(Eigen::Map<Eigen::Matrix<T, RowsAtCompilation, ColsAtCompilation>, Alignment, Stride> &mat)
+    : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { }
+
+    template <int Alignment = 0>
+    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>, Alignment,
+               Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic> > to_eigen() const
+    {
+        using matrix_t = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+        using stride_t = Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>;
+        return Eigen::Map<matrix_t, Alignment, stride_t>(data, rows, cols, stride_t(cs, rs));
+    }
+
+    le_mat_t<T> transpose() const
+    { return le_mat_t<T>(cols, rows, cs, rs, data); }
+};
+
+}
+
+
+// ---- BLAS/LAPACK routines used in our settings. -----
+
+
+// [LAPACK] lacpy
+#define BLALINK_PARAMS(T) const char &uploA, \
+                          const l2e::le_mat_t<T> &A, \
+                                l2e::le_mat_t<T> &B
+template <typename T>
+inline void lacpy( BLALINK_PARAMS(T) );
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline void lacpy<T>( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1 && B.rs == 1, "BLAS requires column-major."); \
+        /* Eigen-vendored lapack.h misses const qualifier. Do force conversion here. */ \
+        cchar##lacpy_((char *)&uploA, (f77_int *)&A.rows, (f77_int *)&A.cols, \
+                      (ctype *)A.data, (f77_int *)&A.cs, \
+                      (ctype *)B.data, (f77_int *)&B.cs); \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, _Complex float,  c )
+BLALINK_MAC( ccdcmplx, _Complex double, z )
+#undef BLALINK_MAC
+#undef BLALINK_PARAMS
+
+
+// [LAPACK] trtri
+#define BLALINK_PARAMS(T) const char &uploA, \
+                          const char &diagA, \
+                          l2e::le_mat_t<T> &A
+template <typename T>
+inline f77_int trtri( BLALINK_PARAMS(T) );
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline f77_int trtri( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1, "BLAS requires column-major."); \
+        f77_int info; \
+        /* Eigen-vendored lapack.h misses const qualifier. Do force conversion here. */ \
+        cchar##trtri_((char *)&uploA, (char *)&diagA, (f77_int *)&A.rows, \
+                      (ctype *)A.data, (f77_int *)&A.cs, &info); \
+        return info; \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, _Complex float,  c )
+BLALINK_MAC( ccdcmplx, _Complex double, z )
+#undef BLALINK_MAC
+#undef BLALINK_PARAMS
+
+
+// [PFAPACK] sktrf
+#define BLALINK_PARAMS(T) const char &uplo, \
+                          const char &mode, \
+                          l2e::le_mat_t<T> &A, \
+                          f77_int *ipiv, \
+                          T *work, f77_int lwork
+template <typename T>
+inline f77_int sktrf( BLALINK_PARAMS(T) );
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline f77_int sktrf( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1, "BLAS requires column-major."); \
+        f77_int info; \
+        cchar##sktrf_(&uplo, &mode, &A.rows, (ctype *)A.data, &A.cs, ipiv, work, &lwork, &info); \
+        return info; \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, floatcmplx,  c )
+BLALINK_MAC( ccdcmplx, doublecmplx, z )
+#undef BLALINK_MAC
+#undef BLALINK_PARAMS
+
+
+// [PFAPACK] skpfa
+#define BLALINK_PARAMS(T) const char &uplo, \
+                          const char &mthd, \
+                          l2e::le_mat_t<T> &A, \
+                          T *pfa, f77_int *ipiv, \
+                          T *work, f77_int lwork
+template <typename T>
+inline f77_int skpfa( BLALINK_PARAMS(T) );
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline f77_int skpfa( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1, "BLAS requires column-major."); \
+        int info; \
+        cchar##skpfa_(&uplo, &mthd, &A.rows, (ctype *)A.data, &A.cs, pfa, ipiv, work, &lwork, &info); \
+        return info; \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+#undef BLALINK_MAC
+#define BLALINK_MAC(T, ctype, rtype, cchar) \
+    template <> inline f77_int skpfa( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1, "BLAS requires column-major."); \
+        rtype *rwork = 0; \
+        if (mthd != 'p' && mthd != 'P') \
+            rwork = new rtype[lwork]; \
+        int info; \
+        cchar##skpfa_(&uplo, &mthd, &A.rows, A.data, &A.cs, pfa, ipiv, work, &lwork, rwork, &info); \
+        delete[] rwork; \
+        return info; \
+    }
+BLALINK_MAC( ccscmplx, floatcmplx,  float,  c )
+BLALINK_MAC( ccdcmplx, doublecmplx, double, z )
+#undef BLALINK_MAC
+#undef BLALINK_PARAMS
+
+
+// [Non-standard] gemmt
+#define BLALINK_PARAMS(T) const char &uploC, \
+                          const char &transA, \
+                          const char &transB, \
+                          const T &alpha, \
+                          const l2e::le_mat_t<T> &A, \
+                          const l2e::le_mat_t<T> &B, \
+                          const T &beta, \
+                                l2e::le_mat_t<T> &C
+template <typename T>
+inline void gemmt( BLALINK_PARAMS(T) );
+#if !defined(_BLIS) && !defined(_BLAS_HAS_GEMMT)
+  #include "eigen_gemmt.tcc"
+#else
+#ifdef _BLIS
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline void gemmt<T>( BLALINK_PARAMS(T) ) \
+    { \
+        uplo_t bli_uploC; \
+        trans_t bli_transA, bli_transB; \
+        bli_param_map_netlib_to_blis_uplo( uploC, &bli_uploC ); \
+        bli_param_map_netlib_to_blis_trans( transA, &bli_transA ); \
+        bli_param_map_netlib_to_blis_trans( transB, &bli_transB ); \
+        bli_##cchar##gemmt(bli_uploC, \
+                           bli_transA, \
+                           bli_transB, \
+                           (transA == 'T' || transA == 't') ? A.cols : A.rows, \
+                           (transA == 'T' || transA == 't') ? A.rows : A.cols, \
+                           (const ctype *)&alpha, \
+                           (const ctype *)A.data, A.rs, A.cs, \
+                           (const ctype *)B.data, B.rs, B.cs, \
+                           (const ctype *)&beta, \
+                           (ctype       *)C.data, C.rs, C.cs); \
+    }
+#else
+  // ifdef _BLAS_HAS_GEMMT
+#define BLALINK_MAC(T, ctype, cchar) \
+    template <> inline void gemmt<T>( BLALINK_PARAMS(T) ) \
+    { \
+        assert_(A.rs == 1 && B.rs == 1 && C.rs == 1, "BLAS requires column-major."); \
+        cchar##gemmt_(&uploC, \
+                      &transA, &transB, \
+                      (transA == 'T' || transA == 't') ? &A.cols : &A.rows, \
+                      (transA == 'T' || transA == 't') ? &A.rows : &A.cols, \
+                      (const ctype *)&alpha, \
+                      (const ctype *)A.data, &A.cs, \
+                      (const ctype *)B.data, &B.cs, \
+                      (const ctype *)&beta, \
+                      (ctype       *)C.data, &C.cs); \
+    }
+extern "C" {
+#define GEMMT_F77_PARAMS(T) const char *uploC, \
+                            const char *transA, \
+                            const char *transB, \
+                            const f77_int *m, \
+                            const f77_int *k, \
+                            const T *alpha, \
+                            const T *A, const f77_int *ldA, \
+                            const T *B, const f77_int *ldB, \
+                            const T *beta, \
+                                  T *C, const f77_int *ldC
+void sgemmt_( GEMMT_F77_PARAMS(float)    );
+void dgemmt_( GEMMT_F77_PARAMS(double)   );
+void cgemmt_( GEMMT_F77_PARAMS(scomplex) );
+void zgemmt_( GEMMT_F77_PARAMS(dcomplex) );
+#undef GEMMT_F77_PARAMS
+}
+#endif
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+#endif
+#undef BLALINK_PARAMS
diff --git a/src/ltl2inv/ltl2inv.cc b/src/ltl2inv/ltl2inv.cc
new file mode 100644
index 00000000..e7ef84fc
--- /dev/null
+++ b/src/ltl2inv/ltl2inv.cc
@@ -0,0 +1,51 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#include "pfaffian.tcc"
+#include "invert.tcc"
+
+
+
+extern "C" {
+
+#define GENDEF(schema, cchar, ctype, cctype) \
+void schema ##2inv_## cchar(int n, cctype *A_, int ldA, int *iPiv, cctype *vT, cctype *M_, int ldM) \
+{ \
+  using namespace Eigen; \
+  Map<Matrix<cctype, Dynamic, Dynamic>, 0, OuterStride<> > A(A_, n, n, OuterStride<>(ldA)); \
+  Map<Matrix<cctype, Dynamic, Dynamic>, 0, OuterStride<> > M(M_, n, n, OuterStride<>(ldM)); \
+  Map<VectorXi> Piv(iPiv, n); \
+  Map<Vector<cctype, Dynamic>> T(vT, n); \
+  schema ##2inv(A, Piv, vT, M); \
+}
+GENDEF(ltl, s, float,    float)
+GENDEF(ltl, d, double,   double)
+GENDEF(ltl, c, ccscmplx, ccscmplx)
+GENDEF(ltl, z, ccdcmplx, ccdcmplx)
+GENDEF(utu, s, float,    float)
+GENDEF(utu, d, double,   double)
+GENDEF(utu, c, ccscmplx, ccscmplx)
+GENDEF(utu, z, ccdcmplx, ccdcmplx)
+
+#undef GENDEF
+#define GENDEF(schema, cchar, ctype, cctype) \
+void schema ##2pfa_## cchar(int n, cctype *A_, int ldA, int *iPiv, cctype *Pfa) \
+{ \
+  using namespace Eigen; \
+  Map<Matrix<cctype, Dynamic, Dynamic>, 0, OuterStride<> > A(A_, n, n, OuterStride<>(ldA)); \
+  Map<VectorXi> Piv(iPiv, n); \
+  *Pfa = schema ##2pfa(A, Piv); \
+}
+GENDEF(ltl, s, float,    float)
+GENDEF(ltl, d, double,   double)
+GENDEF(ltl, c, ccscmplx, ccscmplx)
+GENDEF(ltl, z, ccdcmplx, ccdcmplx)
+GENDEF(utu, s, float,    float)
+GENDEF(utu, d, double,   double)
+GENDEF(utu, c, ccscmplx, ccscmplx)
+GENDEF(utu, z, ccdcmplx, ccdcmplx)
+
+}
+
diff --git a/src/ltl2inv/makefile_ltl2inv b/src/ltl2inv/makefile_ltl2inv
new file mode 100644
index 00000000..d57c5fcd
--- /dev/null
+++ b/src/ltl2inv/makefile_ltl2inv
@@ -0,0 +1,28 @@
+include ../make.sys
+
+OBJ = ilaenv_lauum.o gemmt2blas.o ltl2inv.o
+SRC = ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc
+HDR = ilaenv.h ilaenv_lauum.hh lapack2eigen.hh error.hh eigen_gemmt.tcc invert.tcc pfaffian.tcc trmmt.tcc
+
+ltl2inv.a: $(OBJ)
+	$(AR) rvu $@ $(OBJ)
+
+clean:
+	rm -f $(OBJ)
+
+SUFFIXES: .o .c
+
+.c.o: $(HDR)
+	$(CC) $(OPTION) $(CFLAGS) -c $< -o $@ \
+		-Wno-incompatible-pointer-types-discards-qualifiers \
+		-I../pfapack/c_interface \
+		-I$(BLIS_ROOT)/include \
+		-I$(BLIS_ROOT)/include/blis
+
+SUFFIXES: .o .cc
+
+.cc.o: $(HDR)
+	$(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \
+		-I../pfapack/c_interface \
+		-I$(BLIS_ROOT)/include \
+		-I$(BLIS_ROOT)/include/blis
diff --git a/src/ltl2inv/pfaffian.tcc b/src/ltl2inv/pfaffian.tcc
new file mode 100644
index 00000000..51777f0b
--- /dev/null
+++ b/src/ltl2inv/pfaffian.tcc
@@ -0,0 +1,88 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "lapack2eigen.hh"
+
+
+template <typename T_>
+struct SignedLogScale
+{
+  using T = T_;
+
+  T_  val;
+  int sgn;
+};
+
+
+template <typename Mat, typename Vec,
+          typename Amp = typename Mat::Scalar>
+SignedLogScale<Amp> utu2logpfa(const Mat &A, const Vec &iPiv)
+{
+  SignedLogScale<Amp> logpfa{ 0.0, 1 };
+
+  for (int i = 0; i < A.rows(); i += 2) {
+    logpfa.val += std::log(std::abs(Amp( A(i, i+1) )));
+    logpfa.sgn *= A(i, i+1) > 0 ? 1 : -1;
+
+    if (iPiv(i)  -1 != i  ) logpfa.sgn *= -1;
+    if (iPiv(i+1)-1 != i+1) logpfa.sgn *= -1;
+  }
+  return logpfa;
+}
+
+// Additional parameter for return type deduction.
+template <typename Mat, typename Vec, typename Ret>
+Ret utu2logpfa(const Mat &A, const Vec &iPiv, Ret &ret)
+{ return utu2logpfa<Mat, Vec, typename Ret::T>(A, iPiv); }
+
+
+
+template <typename Mat, typename Vec,
+          typename Amp = typename Mat::Scalar>
+Amp ltl2pfa(const Mat &A, const Vec &iPiv)
+{
+  Amp pfa = 1.0;
+
+  for (int i = 0; i < A.rows(); i += 2) {
+    pfa *= -A(i+1, i);
+
+    if (iPiv(i)  -1 != i  ) pfa = -pfa;
+    if (iPiv(i+1)-1 != i+1) pfa = -pfa;
+  }
+  return pfa;
+}
+
+template <typename Mat, typename Vec,
+          typename Amp = typename Mat::Scalar>
+Amp utu2pfa(const Mat &A, const Vec &iPiv)
+{
+  Amp pfa = 1.0;
+
+  for (int i = 0; i < A.rows(); i += 2) {
+    pfa *= A(i, i+1);
+
+    if (iPiv(i)  -1 != i  ) pfa = -pfa;
+    if (iPiv(i+1)-1 != i+1) pfa = -pfa;
+  }
+  return pfa;
+}
+
+template <typename Mat, typename Vec,
+          typename Amp = typename Mat::Scalar>
+Amp ltl2pfa(const char &uplo, const Mat &A, const Vec &iPiv)
+{
+  switch (uplo) {
+  case 'l':
+  case 'L':
+    return ltl2pfa(A, iPiv);
+  case 'u':
+  case 'U':
+    return utu2pfa(A, iPiv);
+  default:
+    return 0.0;
+  }
+}
+
diff --git a/src/ltl2inv/trmmt.tcc b/src/ltl2inv/trmmt.tcc
new file mode 100644
index 00000000..db60335a
--- /dev/null
+++ b/src/ltl2inv/trmmt.tcc
@@ -0,0 +1,43 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "lapack2eigen.hh"
+#include "ilaenv_lauum.hh"
+
+
+template <typename Mat, typename Mat_>
+inline void trmmt(const char &uploAB, const char &diagA, const typename Mat::Scalar &alpha, Mat &A, Mat_ &B)
+{
+  using namespace Eigen;
+  using T = typename Mat::Scalar;
+  int n = A.rows();
+  int npanel = ilaenv_lauum<T>(uploAB, n);
+
+  if (uploAB == 'U' || uploAB == 'u') {
+    // A upper => A' lower -> write to UPPER part of B.
+    for (int j = 0; j < n; j += npanel) {
+      int nloc = std::min<int>(npanel, n - j);
+
+      auto Bslice = B(seq(0, j+nloc-1), seq(j, j+nloc-1));
+      if (diagA == 'U' || diagA == 'u')
+        Bslice = A(seq(0, j+nloc-1), seq(0, j+nloc-1)).template triangularView<UnitUpper>().transpose() * Bslice * alpha;
+      else
+        Bslice = A(seq(0, j+nloc-1), seq(0, j+nloc-1)).template triangularView<Upper>().transpose() * Bslice * alpha;
+    }
+  } else {
+    // A lower => A' upper -> write to LOWER part of B.
+    for (int j = 0; j < n; j += npanel) {
+      int nloc = std::min<int>(npanel, n - j);
+
+      auto Bslice = B(seq(j, n-1), seq(j, j+nloc-1));
+      if (diagA == 'U' || diagA == 'u')
+        Bslice = A(seq(j, n-1), seq(j, n-1)).template triangularView<UnitLower>().transpose() * Bslice * alpha;
+      else
+        Bslice = A(seq(j, n-1), seq(j, n-1)).template triangularView<Lower>().transpose() * Bslice * alpha;
+    }
+  }
+}
+
diff --git a/src/mVMC/CMakeLists.txt b/src/mVMC/CMakeLists.txt
index f80471ca..dd898044 100644
--- a/src/mVMC/CMakeLists.txt
+++ b/src/mVMC/CMakeLists.txt
@@ -27,9 +27,11 @@ target_link_libraries(vmcdry.out StdFace m)
 add_executable(vmc.out ${SOURCES_vmcmain} ${SOURCES_sfmt})
 target_link_libraries(vmc.out StdFace)
 target_link_libraries(vmc.out pfapack)
-if(PFAFFIAN_BLOCKED)
-  target_link_libraries(vmc.out pfupdates blis pthread)
-endif(PFAFFIAN_BLOCKED)
+target_link_libraries(vmc.out ltl2inv)
+target_link_libraries(vmc.out pfupdates)
+if(Require_BLIS)
+  target_link_libraries(vmc.out blis pthread)
+endif(Require_BLIS)
 target_link_libraries(vmc.out ${LAPACK_LIBRARIES} m)
 
 if(USE_SCALAPACK)
diff --git a/src/mVMC/calgrn.c b/src/mVMC/calgrn.c
index 8d13215b..e81503da 100644
--- a/src/mVMC/calgrn.c
+++ b/src/mVMC/calgrn.c
@@ -38,6 +38,17 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in
   int *myEleIdx, *myEleNum, *myProjCntNew;
   double complex *myBuffer;
 
+  int *lazy_info = malloc(    sizeof(int) * NCisAjsCktAltDC * 2);
+  int *lazy_rsi  = malloc(2 * sizeof(int) * NCisAjsCktAltDC);
+  int *lazy_msj  = malloc(2 * sizeof(int) * NCisAjsCktAltDC);
+  double complex *lazy_ip  = malloc(sizeof(double complex) * NCisAjsCktAltDC);
+  double complex *lazy_pfa = malloc(sizeof(double complex) * NCisAjsCktAltDC * NQPFull);
+  memset(lazy_info,                    0, sizeof(int) * NCisAjsCktAltDC);
+  memset(lazy_info + NCisAjsCktAltDC, -1, sizeof(int) * NCisAjsCktAltDC);
+
+  for (int mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;
+  for (int mi=Ne;mi<Ne*2;mi++) EleSpn[mi] = 1;
+
   RequestWorkSpaceThreadInt(Nsize+Nsite2+NProj);
   RequestWorkSpaceThreadComplex(NQPFull+2*Nsize);
   /* GreenFunc1: NQPFull, GreenFunc2: NQPFull+2*Nsize */
@@ -50,6 +61,20 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in
     myProjCntNew = GetWorkSpaceThreadInt(NProj);
     myBuffer = GetWorkSpaceThreadComplex(NQPFull+2*Nsize);
 
+    void *pfOrbital[NQPFull];
+    void *pfUpdator[NQPFull];
+    void *pfMat[NQPFull];
+    void *pfMap[NQPFull];
+
+    // Attaching thread-private objects to thread-shared InvM.
+    // These objects no long need mutating states in this use. Just functor-like stuff.
+    updated_tdi_v_seq_init_precomp_z(NQPFull, Nsite, Nsite2, Nsize,
+                                     SlaterElm, Nsite2*Nsite2,
+                                     InvM, Nsize*Nsize,
+                                     eleIdx, EleSpn,
+                                     2 /* GF @ measure: 2 at max. */, PfM,
+                                     pfUpdator, pfOrbital, pfMat, pfMap);
+
     #pragma loop noalias
     for(idx=0;idx<Nsize;idx++) myEleIdx[idx] = eleIdx[idx];
     #pragma loop noalias
@@ -70,7 +95,7 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in
     #pragma omp master
     {StopTimer(50);StartTimer(51);}
     
-    #pragma omp for private(idx,ri,rj,s,rk,rl,t,tmp) schedule(dynamic)
+    #pragma omp for private(idx,ri,rj,s,rk,rl,t) schedule(dynamic)
     for(idx=0;idx<NCisAjsCktAltDC;idx++) {
       ri = CisAjsCktAltDCIdx[idx][0];
       rj = CisAjsCktAltDCIdx[idx][2];
@@ -79,11 +104,57 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in
       rl = CisAjsCktAltDCIdx[idx][6];
       t  = CisAjsCktAltDCIdx[idx][5];
 
-      tmp = GreenFunc2(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,
-                       myProjCntNew,myBuffer);
-      PhysCisAjsCktAltDC[idx] += w*tmp;
+      int *lazy_info_loc = lazy_info + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + idx * 2;
+      double complex *lazy_ip_loc = lazy_ip + idx;
+
+      *lazy_ip_loc = w *
+        GreenFunc2_(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer,
+          lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      if ( !*lazy_info_loc ) {
+        PhysCisAjsCktAltDC[idx] += *lazy_ip_loc;
+      } /* else {
+        // Reverse lookup for the idx
+        *lazy_info_loc = idx + 1;
+      } */
     }
-    
+
+    /* Batch-compute 2-body Green's functions. */
+    #pragma omp barrier
+    if ( Nsize <= 200 ) { // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2.
+      int num_qp_var0 = 0;
+      // Pack lazy info.
+      for (idx=0; idx<NCisAjsCktAltDC; ++idx)
+        if (lazy_info[idx]) {
+          if (omp_get_thread_num() == 0) {
+            lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
+            lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
+            lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
+            lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
+            lazy_info[NCisAjsCktAltDC + num_qp_var0] = idx; \
+          }
+          num_qp_var0++;
+        }
+      #pragma omp barrier
+      updated_tdi_v_omp_var0_proc_batch_greentwo_z(NQPFull, num_qp_var0,
+                                                   NULL, lazy_info + NCisAjsCktAltDC,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    } else {
+      updated_tdi_v_omp_var1_proc_batch_greentwo_z(NQPFull, NCisAjsCktAltDC,
+                                                   lazy_info, lazy_info + NCisAjsCktAltDC,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    }
+    #pragma omp barrier
+    #pragma omp for private(idx) schedule(dynamic) nowait
+    for(idx=0;idx<NCisAjsCktAltDC;idx++)
+      if ( lazy_info[idx] )
+        PhysCisAjsCktAltDC[idx] += conj(CalculateIP_fcmp(lazy_pfa + idx * NQPFull, 0, NQPFull, MPI_COMM_SELF)) * lazy_ip[idx];
+
     #pragma omp master
     {StopTimer(51);StartTimer(52);}
 
@@ -105,6 +176,11 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in
     #pragma omp master
     {StopTimer(53);}
   }
+  free(lazy_info);
+  free(lazy_rsi);
+  free(lazy_msj);
+  free(lazy_ip);
+  free(lazy_pfa);
 
   ReleaseWorkSpaceThreadInt();
   ReleaseWorkSpaceThreadComplex();
diff --git a/src/mVMC/calham.c b/src/mVMC/calham.c
index d35445eb..a44bf73b 100644
--- a/src/mVMC/calham.c
+++ b/src/mVMC/calham.c
@@ -67,6 +67,18 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
   double complex *myBuffer;
   double complex myEnergy;
 
+  const int nHamiltonianTwo = NPairHopping + NExchangeCoupling * 2 + NInterAll;
+  int *lazy_info = malloc(    sizeof(int) * nHamiltonianTwo * 2);
+  int *lazy_rsi  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  int *lazy_msj  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  double complex *lazy_ip  = malloc(sizeof(double complex) * nHamiltonianTwo);
+  double complex *lazy_pfa = malloc(sizeof(double complex) * nHamiltonianTwo * NQPFull);
+  memset(lazy_info,                    0, sizeof(int) * nHamiltonianTwo);
+  memset(lazy_info + nHamiltonianTwo, -1, sizeof(int) * nHamiltonianTwo);
+
+  for (int mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;
+  for (int mi=Ne;mi<Ne*2;mi++) EleSpn[mi] = 1;
+
   RequestWorkSpaceThreadInt(Nsize+Nsite2+NProj);
   RequestWorkSpaceThreadComplex(NQPFull+2*Nsize);
   /* GreenFunc1: NQPFull, GreenFunc2: NQPFull+2*Nsize */
@@ -82,13 +94,27 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
                NCoulombInter, CoulombInter, ParaCoulombInter, NHundCoupling, HundCoupling, ParaHundCoupling, \
                NTransfer, Transfer, ParaTransfer, NPairHopping, PairHopping, ParaPairHopping, \
                NExchangeCoupling, ExchangeCoupling, ParaExchangeCoupling, NInterAll, InterAll, ParaInterAll, n0, n1) \
-  shared(eleCfg, eleProjCnt, eleIdx, eleNum) reduction(+:e)
+  shared(eleCfg, eleProjCnt, eleIdx, eleNum, lazy_info) reduction(+:e)
   {
     myEleIdx = GetWorkSpaceThreadInt(Nsize);
     myEleNum = GetWorkSpaceThreadInt(Nsite2);
     myProjCntNew = GetWorkSpaceThreadInt(NProj);
     myBuffer = GetWorkSpaceThreadComplex(NQPFull+2*Nsize);
 
+    void *pfOrbital[NQPFull];
+    void *pfUpdator[NQPFull];
+    void *pfMat[NQPFull];
+    void *pfMap[NQPFull];
+
+    // Attaching thread-private objects to thread-shared InvM.
+    // These objects no long need mutating states in this use. Just functor-like stuff.
+    updated_tdi_v_seq_init_precomp_z(NQPFull, Nsite, Nsite2, Nsize,
+                                     SlaterElm, Nsite2*Nsite2,
+                                     InvM, Nsize*Nsize,
+                                     eleIdx, EleSpn,
+                                     2 /* GF @ measure: 2 at max. */, PfM,
+                                     pfUpdator, pfOrbital, pfMat, pfMap);
+
     #pragma loop noalias
     for(idx=0;idx<Nsize;idx++) myEleIdx[idx] = eleIdx[idx];
     #pragma loop noalias
@@ -142,15 +168,29 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
     #pragma omp master
     {StopTimer(71);StartTimer(72);}
 
+    /****************/
+    /* 2-body terms */
+    /****************/
+    int noffset_lazy = 0, noffset_rsij = 0;
+
     /* Pair Hopping */
     #pragma omp for private(idx,ri,rj) schedule(dynamic) nowait
     for(idx=0;idx<NPairHopping;idx++) {
       ri = PairHopping[idx][0];
       rj = PairHopping[idx][1];
-    
-      myEnergy += ParaPairHopping[idx]
-        * GreenFunc2(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double complex *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
+      *lazy_ip_loc = ParaPairHopping[idx] *
+        GreenFunc2_(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer,
+                    lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
     }
+    noffset_lazy += NPairHopping;
+    noffset_rsij += NPairHopping * 2;
 
     /* Exchange Coupling */
     #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
@@ -158,10 +198,22 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
       ri = ExchangeCoupling[idx][0];
       rj = ExchangeCoupling[idx][1];
     
-      tmp =  GreenFunc2(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
-      tmp += GreenFunc2(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
-      myEnergy += ParaExchangeCoupling[idx] * tmp;
+      int *lazy_info_even = lazy_info + noffset_lazy +  idx * 2;
+      int *lazy_info_odd  = lazy_info + noffset_lazy +  idx * 2 + 1;
+      int *lazy_rsi_even  = lazy_rsi  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_rsi_odd   = lazy_rsi  + noffset_rsij + (idx * 2 + 1) * 2;
+      int *lazy_msj_even  = lazy_msj  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_msj_odd   = lazy_msj  + noffset_rsij + (idx * 2 + 1) * 2;
+      double complex *lazy_ip_even = lazy_ip + noffset_lazy + idx * 2;
+      double complex *lazy_ip_odd  = lazy_ip + noffset_lazy + idx * 2 + 1;
+
+      *lazy_ip_even = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_even, lazy_rsi_even, lazy_msj_even);
+      *lazy_ip_odd  = ParaExchangeCoupling[idx] * GreenFunc2_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_odd, lazy_rsi_odd, lazy_msj_odd);
+      if ( !*lazy_info_even ) myEnergy += *lazy_ip_even;
+      if ( !*lazy_info_odd  ) myEnergy += *lazy_ip_odd;
     }
+    noffset_lazy += 2 * NExchangeCoupling;
+    noffset_rsij += 2 * NExchangeCoupling * 2;
 
     /* Inter All */
     #pragma omp for private(idx,ri,rj,s,rk,rl,t) schedule(dynamic) nowait
@@ -172,16 +224,68 @@ double complex CalculateHamiltonian(const double complex ip, int *eleIdx, const
       rk = InterAll[idx][4];
       rl = InterAll[idx][6];
       t  = InterAll[idx][7];
-      
-      myEnergy += ParaInterAll[idx]
-        * GreenFunc2(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double complex *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
+      *lazy_ip_loc = ParaInterAll[idx] *
+        GreenFunc2_(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer,
+                    lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
+    }
+    noffset_lazy += NInterAll;
+    noffset_rsij += NInterAll * 2;
+
+    /* Batch-compute 2-body Green's functions. */
+    #pragma omp barrier
+    if ( Nsize <= 200 ) { // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2.
+      int num_qp_var0 = 0;
+      // Pack lazy info.
+      for (idx=0; idx<nHamiltonianTwo; ++idx)
+        if (lazy_info[idx]) {
+          if (omp_get_thread_num() == 0) {
+            lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
+            lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
+            lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
+            lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
+            lazy_info[nHamiltonianTwo + num_qp_var0] = idx; \
+          }
+          num_qp_var0++;
+        }
+      #pragma omp barrier
+      updated_tdi_v_omp_var0_proc_batch_greentwo_z(NQPFull, num_qp_var0,
+                                                   NULL, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    } else {
+      updated_tdi_v_omp_var1_proc_batch_greentwo_z(NQPFull, nHamiltonianTwo,
+                                                   lazy_info, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
     }
+    #pragma omp barrier
+    #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
+    for(idx=0;idx<nHamiltonianTwo;idx++)
+      if ( lazy_info[idx] )
+        myEnergy += conj(CalculateIP_fcmp(lazy_pfa + idx * NQPFull, 0, NQPFull, MPI_COMM_SELF)) * lazy_ip[idx];
 
     #pragma omp master
     {StopTimer(72);}
 
     e += myEnergy;
+
+    // Freeing within OMP region is thread-private.
+    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
   }
+  free(lazy_info);
+  free(lazy_rsi);
+  free(lazy_msj);
+  free(lazy_ip);
+  free(lazy_pfa);
 
   ReleaseWorkSpaceThreadInt();
   ReleaseWorkSpaceThreadComplex();
diff --git a/src/mVMC/calham_fsz.c b/src/mVMC/calham_fsz.c
index 10239e14..c3935ada 100644
--- a/src/mVMC/calham_fsz.c
+++ b/src/mVMC/calham_fsz.c
@@ -2,7 +2,7 @@
 mVMC - A numerical solver package for a wide range of quantum lattice models based on many-variable Variational Monte Carlo method
 Copyright (C) 2016 The University of Tokyo, All rights reserved.
 
-his program is developed based on the mVMC-mini program
+This program is developed based on the mVMC-mini program
 (https://github.com/fiber-miniapp/mVMC-mini)
 which follows "The BSD 3-Clause License".
 
@@ -57,6 +57,15 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co
   double complex *myBuffer;
   double complex myEnergy;
 
+  const int nHamiltonianTwo = NPairHopping + NExchangeCoupling * 2 + NInterAll;
+  int *lazy_info = malloc(    sizeof(int) * nHamiltonianTwo * 2);
+  int *lazy_rsi  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  int *lazy_msj  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  double complex *lazy_ip  = malloc(sizeof(double complex) * nHamiltonianTwo);
+  double complex *lazy_pfa = malloc(sizeof(double complex) * nHamiltonianTwo * NQPFull);
+  memset(lazy_info,                    0, sizeof(int) * nHamiltonianTwo);
+  memset(lazy_info + nHamiltonianTwo, -1, sizeof(int) * nHamiltonianTwo);
+
   RequestWorkSpaceThreadInt(Nsize+Nsize+Nsite2+NProj);
   RequestWorkSpaceThreadComplex(NQPFull+2*Nsize);
   /* GreenFunc1: NQPFull, GreenFunc2: NQPFull+2*Nsize */
@@ -66,7 +75,7 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co
   private(myEleIdx,myEleNum,myProjCntNew,myBuffer,myEnergy,idx)  \
   reduction(+:e)
   */
-#pragma omp parallel default(none)                                      \
+#pragma omp parallel default(shared)                                      \
   private(myEleIdx,myEleSpn,myEleNum,myProjCntNew,myBuffer,myEnergy, idx, ri, rj, rk, rl, s, t,u,v) \
   firstprivate(ip, Nsize, Nsite2, NProj, NQPFull, NCoulombIntra, CoulombIntra, ParaCoulombIntra, \
                NCoulombInter, CoulombInter, ParaCoulombInter, NHundCoupling, HundCoupling, ParaHundCoupling, \
@@ -80,6 +89,20 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co
     myProjCntNew = GetWorkSpaceThreadInt(NProj);
     myBuffer = GetWorkSpaceThreadComplex(NQPFull+2*Nsize);
 
+    void *pfOrbital[NQPFull];
+    void *pfUpdator[NQPFull];
+    void *pfMat[NQPFull];
+    void *pfMap[NQPFull];
+
+    // Attaching thread-private objects to thread-shared InvM.
+    // These objects no long need mutating states in this use. Just functor-like stuff.
+    updated_tdi_v_seq_init_precomp_z(NQPFull, Nsite, Nsite2, Nsize,
+                                     SlaterElm, Nsite2*Nsite2,
+                                     InvM, Nsize*Nsize,
+                                     eleIdx, eleSpn,
+                                     2 /* GF @ measure: 2 at max. */, PfM,
+                                     pfUpdator, pfOrbital, pfMat, pfMap);
+
     #pragma loop noalias
     for(idx=0;idx<Nsize;idx++) myEleIdx[idx] = eleIdx[idx];
     #pragma loop noalias
@@ -140,29 +163,52 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co
     #pragma omp master
     {StopTimer(71);StartTimer(72);}
 
+    /****************/
+    /* 2-body terms */
+    /****************/
+    int noffset_lazy = 0, noffset_rsij = 0;
+
     /* Pair Hopping */
     #pragma omp for private(idx,ri,rj) schedule(dynamic) nowait
     for(idx=0;idx<NPairHopping;idx++) {
       ri = PairHopping[idx][0];
       rj = PairHopping[idx][1];
-    
-      myEnergy += ParaPairHopping[idx]
-        * GreenFunc2_fsz(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double complex *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
+      *lazy_ip_loc = ParaPairHopping[idx] *
+        GreenFunc2_fsz_(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer,
+                        lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+        if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
     }
+    noffset_lazy += NPairHopping;
+    noffset_rsij += NPairHopping * 2;
 
     /* Exchange Coupling */
     #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
     for(idx=0;idx<NExchangeCoupling;idx++) {
       ri = ExchangeCoupling[idx][0];
       rj = ExchangeCoupling[idx][1];
-    
-      tmp =  GreenFunc2_fsz(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
-      //printf("idx=%d ri=%d rj=%d:  tmp=%lf \n",idx,ri,rj,creal(tmp));
-      tmp += GreenFunc2_fsz(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
-      //tmp =  GreenFunc2_fsz2(ri,rj,rj,ri,0,0,1,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
-      //tmp += GreenFunc2_fsz2(ri,rj,rj,ri,1,1,0,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
-      myEnergy += ParaExchangeCoupling[idx] * tmp;
+
+      int *lazy_info_even = lazy_info + noffset_lazy +  idx * 2;
+      int *lazy_info_odd  = lazy_info + noffset_lazy +  idx * 2 + 1;
+      int *lazy_rsi_even  = lazy_rsi  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_rsi_odd   = lazy_rsi  + noffset_rsij + (idx * 2 + 1) * 2;
+      int *lazy_msj_even  = lazy_msj  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_msj_odd   = lazy_msj  + noffset_rsij + (idx * 2 + 1) * 2;
+      double complex *lazy_ip_even = lazy_ip + noffset_lazy + idx * 2;
+      double complex *lazy_ip_odd  = lazy_ip + noffset_lazy + idx * 2 + 1;
+
+      *lazy_ip_even = ParaExchangeCoupling[idx] * GreenFunc2_fsz_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer, lazy_info_even, lazy_rsi_even, lazy_msj_even);
+      *lazy_ip_odd  = ParaExchangeCoupling[idx] * GreenFunc2_fsz_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer, lazy_info_odd, lazy_rsi_odd, lazy_msj_odd);
+      if ( !*lazy_info_even ) myEnergy += *lazy_ip_even;
+      if ( !*lazy_info_odd  ) myEnergy += *lazy_ip_odd;
     }
+    noffset_lazy += 2 * NExchangeCoupling;
+    noffset_rsij += 2 * NExchangeCoupling * 2;
 
     /* Inter All */
     #pragma omp for private(idx,ri,rj,s,rk,rl,t) schedule(dynamic) nowait
@@ -175,21 +221,74 @@ double complex CalculateHamiltonian_fsz(const double complex ip, int *eleIdx, co
       u  = InterAll[idx][5];
       rl = InterAll[idx][6];
       v  = InterAll[idx][7];
-      
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double complex *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
       if(s==t && u==v){
-        myEnergy += ParaInterAll[idx]
-          * GreenFunc2_fsz(ri,rj,rk,rl,s,u,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
+        *lazy_ip_loc = ParaInterAll[idx] *
+          GreenFunc2_fsz_(ri,rj,rk,rl,s,u,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer,
+                          lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
       }else{
-        myEnergy += ParaInterAll[idx]
-          * GreenFunc2_fsz2(ri,rj,rk,rl,s,t,u,v,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer);
-      } 
+        *lazy_ip_loc = ParaInterAll[idx] *
+          GreenFunc2_fsz2_(ri,rj,rk,rl,s,t,u,v,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myEleSpn,myProjCntNew,myBuffer,
+                           lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      }
+      if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
+    }
+    noffset_lazy += NInterAll;
+    noffset_rsij += NInterAll * 2;
+
+    /* Batch-compute 2-body Green's functions. */
+    #pragma omp barrier
+    if ( Nsize <= 200 ) { // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2.
+      int num_qp_var0 = 0;
+      // Pack lazy info.
+      for (idx=0; idx<nHamiltonianTwo; ++idx)
+        if (lazy_info[idx]) {
+          if (omp_get_thread_num() == 0) {
+            lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
+            lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
+            lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
+            lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
+            lazy_info[nHamiltonianTwo + num_qp_var0] = idx; \
+          }
+          num_qp_var0++;
+        }
+      #pragma omp barrier
+      updated_tdi_v_omp_var0_proc_batch_greentwo_z(NQPFull, num_qp_var0,
+                                                   NULL, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    } else {
+      updated_tdi_v_omp_var1_proc_batch_greentwo_z(NQPFull, nHamiltonianTwo,
+                                                   lazy_info, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
     }
+    #pragma omp barrier
+    #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
+    for(idx=0;idx<nHamiltonianTwo;idx++)
+      if ( lazy_info[idx] )
+        myEnergy += conj(CalculateIP_fcmp(lazy_pfa + idx * NQPFull, 0, NQPFull, MPI_COMM_SELF)) * lazy_ip[idx];
 
     #pragma omp master
     {StopTimer(72);}
 
     e += myEnergy;
+
+    // Freeing within OMP region is thread-private.
+    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
   }
+  free(lazy_info);
+  free(lazy_rsi);
+  free(lazy_msj);
+  free(lazy_ip);
+  free(lazy_pfa);
 
   ReleaseWorkSpaceThreadInt();
   ReleaseWorkSpaceThreadComplex();
diff --git a/src/mVMC/calham_real.c b/src/mVMC/calham_real.c
index 660aa2d4..fc2e76b9 100644
--- a/src/mVMC/calham_real.c
+++ b/src/mVMC/calham_real.c
@@ -51,6 +51,18 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg
   double  *myBuffer;
   double  myEnergy;
 
+  const int nHamiltonianTwo = NPairHopping + NExchangeCoupling * 2 + NInterAll;
+  int *lazy_info = malloc(    sizeof(int) * nHamiltonianTwo * 2);
+  int *lazy_rsi  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  int *lazy_msj  = malloc(2 * sizeof(int) * nHamiltonianTwo);
+  double *lazy_ip  = malloc(sizeof(double) * nHamiltonianTwo);
+  double *lazy_pfa = malloc(sizeof(double) * nHamiltonianTwo * NQPFull);
+  memset(lazy_info,                    0, sizeof(int) * nHamiltonianTwo);
+  memset(lazy_info + nHamiltonianTwo, -1, sizeof(int) * nHamiltonianTwo);
+
+  for (int mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;
+  for (int mi=Ne;mi<Ne*2;mi++) EleSpn[mi] = 1;
+
   RequestWorkSpaceThreadInt(Nsize+Nsite2+NProj);
   RequestWorkSpaceThreadDouble(NQPFull+2*Nsize);
   /* GreenFunc1: NQPFull, GreenFunc2: NQPFull+2*Nsize */
@@ -66,13 +78,27 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg
     NCoulombInter, CoulombInter, ParaCoulombInter, NHundCoupling, HundCoupling, ParaHundCoupling,    \
     NTransfer, Transfer, ParaTransfer, NPairHopping, PairHopping, ParaPairHopping,    \
     NExchangeCoupling, ExchangeCoupling, ParaExchangeCoupling, NInterAll, InterAll, ParaInterAll, n0, n1)\
-    shared(eleCfg, eleProjCnt, eleIdx, eleNum) reduction(+:e)
+    shared(eleCfg, eleProjCnt, eleIdx, eleNum, lazy_info) reduction(+:e)
   {
     myEleIdx = GetWorkSpaceThreadInt(Nsize);
     myEleNum = GetWorkSpaceThreadInt(Nsite2);
     myProjCntNew = GetWorkSpaceThreadInt(NProj);
     myBuffer = GetWorkSpaceThreadDouble(NQPFull+2*Nsize);
 
+    void *pfOrbital[NQPFull];
+    void *pfUpdator[NQPFull];
+    void *pfMat[NQPFull];
+    void *pfMap[NQPFull];
+
+    // Attaching thread-private objects to thread-shared InvM.
+    // These objects no long need mutating states in this use. Just functor-like stuff.
+    updated_tdi_v_seq_init_precomp_d(NQPFull, Nsite, Nsite2, Nsize,
+                                     SlaterElm_real, Nsite2*Nsite2,
+                                     InvM_real, Nsize*Nsize,
+                                     eleIdx, EleSpn,
+                                     2 /* GF @ measure: 2 at max. */, PfM_real,
+                                     pfUpdator, pfOrbital, pfMat, pfMap);
+
     #pragma loop noalias
     for(idx=0;idx<Nsize;idx++) myEleIdx[idx] = eleIdx[idx];
     #pragma loop noalias
@@ -145,36 +171,54 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg
 #pragma omp master
     printf("    Debug: PairHopping\n");
 #endif
+
+    /****************/
+    /* 2-body terms */
+    /****************/
+    int noffset_lazy = 0, noffset_rsij = 0;
+
     /* Pair Hopping */
     #pragma omp for private(idx,ri,rj) schedule(dynamic) nowait
     for(idx=0;idx<NPairHopping;idx++) {
       ri = PairHopping[idx][0];
       rj = PairHopping[idx][1];
-    
-      myEnergy += ParaPairHopping[idx]
-        * GreenFunc2_real(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
+      *lazy_ip_loc = ParaPairHopping[idx]
+        * GreenFunc2_real_(ri,rj,ri,rj,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer,
+                           lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
     }
+    noffset_lazy += NPairHopping;
+    noffset_rsij += NPairHopping * 2;
 
-#ifdef _DEBUG
-//#pragma omp master
-    //printf("    Debug: ExchangeCoupling, NExchangeCoupling=%d\n", NExchangeCoulpling);
-#endif
     /* Exchange Coupling */
     #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
     for(idx=0;idx<NExchangeCoupling;idx++) {
       ri = ExchangeCoupling[idx][0];
       rj = ExchangeCoupling[idx][1];
     
-      tmp =  GreenFunc2_real(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
-      tmp += GreenFunc2_real(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
-      myEnergy += ParaExchangeCoupling[idx] * tmp;
-      //printf("XDEBUG: idx=%d tmp=%lf %lf\n",idx,tmp,ParaExchangeCoupling[idx]);
+      int *lazy_info_even = lazy_info + noffset_lazy +  idx * 2;
+      int *lazy_info_odd  = lazy_info + noffset_lazy +  idx * 2 + 1;
+      int *lazy_rsi_even  = lazy_rsi  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_rsi_odd   = lazy_rsi  + noffset_rsij + (idx * 2 + 1) * 2;
+      int *lazy_msj_even  = lazy_msj  + noffset_rsij + (idx * 2    ) * 2;
+      int *lazy_msj_odd   = lazy_msj  + noffset_rsij + (idx * 2 + 1) * 2;
+      double *lazy_ip_even = lazy_ip + noffset_lazy + idx * 2;
+      double *lazy_ip_odd  = lazy_ip + noffset_lazy + idx * 2 + 1;
+
+      *lazy_ip_even = ParaExchangeCoupling[idx] * GreenFunc2_real_(ri,rj,rj,ri,0,1,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_even, lazy_rsi_even, lazy_msj_even);
+      *lazy_ip_odd  = ParaExchangeCoupling[idx] * GreenFunc2_real_(ri,rj,rj,ri,1,0,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer, lazy_info_odd, lazy_rsi_odd, lazy_msj_odd);
+      if ( !*lazy_info_even ) myEnergy += *lazy_ip_even;
+      if ( !*lazy_info_odd  ) myEnergy += *lazy_ip_odd;
     }
-    
-#ifdef _DEBUG
-#pragma omp master
-    printf("    Debug: InterAll, NInterAll=%d\n", NInterAll);
-#endif
+    noffset_lazy += 2 * NExchangeCoupling;
+    noffset_rsij += 2 * NExchangeCoupling * 2;
+
     /* Inter All */
     #pragma omp for private(idx,ri,rj,s,rk,rl,t) schedule(dynamic) nowait
     for(idx=0;idx<NInterAll;idx++) {
@@ -184,10 +228,55 @@ double CalculateHamiltonian_real(const double ip, int *eleIdx, const int *eleCfg
       rk = InterAll[idx][4];
       rl = InterAll[idx][6];
       t  = InterAll[idx][7];
-      
-      myEnergy += ParaInterAll[idx]
-        * GreenFunc2_real(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer);
+
+      int *lazy_info_loc = lazy_info + noffset_lazy + idx;
+      int *lazy_rsi_loc  = lazy_rsi  + noffset_rsij + idx * 2;
+      int *lazy_msj_loc  = lazy_msj  + noffset_rsij + idx * 2;
+      double *lazy_ip_loc = lazy_ip + noffset_lazy + idx;
+
+      *lazy_ip_loc = ParaInterAll[idx]
+        * GreenFunc2_real_(ri,rj,rk,rl,s,t,ip,myEleIdx,eleCfg,myEleNum,eleProjCnt,myProjCntNew,myBuffer,
+                           lazy_info_loc, lazy_rsi_loc, lazy_msj_loc);
+      if ( !*lazy_info_loc ) myEnergy += *lazy_ip_loc;
     }
+    noffset_lazy += NInterAll;
+    noffset_rsij += NInterAll * 2;
+
+    /* Batch-compute 2-body Green's functions. */
+    #pragma omp barrier
+    if ( Nsize <= 400 ) // Heuristics: Huge Nelec seems to cause parallelize-over-nGF spill L2.
+    {
+      int num_qp_var0 = 0;
+      // Pack lazy info.
+      for (idx=0; idx<nHamiltonianTwo; ++idx)
+        if (lazy_info[idx]) {
+          if (omp_get_thread_num() == 0) {
+            lazy_rsi[num_qp_var0 * 2    ] = lazy_rsi[idx * 2]; \
+            lazy_rsi[num_qp_var0 * 2 + 1] = lazy_rsi[idx * 2 + 1]; \
+            lazy_msj[num_qp_var0 * 2    ] = lazy_msj[idx * 2]; \
+            lazy_msj[num_qp_var0 * 2 + 1] = lazy_msj[idx * 2 + 1]; \
+            lazy_info[nHamiltonianTwo + num_qp_var0] = idx; \
+          }
+          num_qp_var0++;
+        }
+      #pragma omp barrier
+      updated_tdi_v_omp_var0_proc_batch_greentwo_d(NQPFull, num_qp_var0,
+                                                   NULL, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    } else {
+      updated_tdi_v_omp_var1_proc_batch_greentwo_d(NQPFull, nHamiltonianTwo,
+                                                   lazy_info, lazy_info + nHamiltonianTwo,
+                                                   lazy_rsi, lazy_msj,
+                                                   lazy_pfa,
+                                                   pfUpdator, pfOrbital, pfMat, pfMap);
+    }
+    #pragma omp barrier
+    #pragma omp for private(idx,ri,rj,tmp) schedule(dynamic) nowait
+    for(idx=0;idx<nHamiltonianTwo;idx++)
+      if ( lazy_info[idx] )
+        myEnergy += CalculateIP_real(lazy_pfa + idx * NQPFull, 0, NQPFull, MPI_COMM_SELF) * lazy_ip[idx];
 
     #pragma omp master
     {StopTimer(72);}
diff --git a/src/mVMC/include/blas_externs.h b/src/mVMC/include/blas_externs.h
index c1df0682..7679cac4 100644
--- a/src/mVMC/include/blas_externs.h
+++ b/src/mVMC/include/blas_externs.h
@@ -46,15 +46,11 @@ along with this program. If not, see http://www.gnu.org/licenses/.
 #define M_ZGETRI zgetri_
 #define M_ZPOSV  zposv_
 
-// Skew-symmetric LAPACK-level routines:
-// Vendor switch: PFAPACK77 or Pfaffine
-#ifdef _pfaffine
-#define M_DSKPFA m_dskpfa_
-#define M_ZSKPFA m_zskpfa_
-#else
+// Pfapack
 #define M_DSKPFA dskpfa_
 #define M_ZSKPFA zskpfa_
-#endif
+#define M_DSKTRF dsktrf_
+#define M_ZSKTRF zsktrf_
 
 // pBLAS
 #define M_PDGEMV  pdgemv_
@@ -116,11 +112,21 @@ extern int M_DSKPFA(const char *uplo, const char *mthd, const int *n,
                     double *work, const int *lwork, int *info);
 extern int M_ZSKPFA(const char *uplo, const char *mthd, const int *n,
                     double complex *a, const int *lda, double complex *pfaff, int *iwork,
-                    double complex *work, const int *lwork,
-#ifndef _pfaffine
-                    double *rwork,
-#endif
-                    int *info);
+                    double complex *work, const int *lwork, double *rwork, int *info);
+extern int M_DSKTRF(const char *uplo, const char *mode, const int *n,
+                    double *a, const int *lda, int *ipiv,
+                    double *work, const int *lwork, int *info);
+extern int M_ZSKTRF(const char *uplo, const char *mode, const int *n,
+                    double complex *a, const int *lda, int *ipiv,
+                    double complex *work, const int *lwork, int *info);
+extern void ltl2pfa_d(int n, double *A_, int ldA, int *iPiv, double *Pfa);
+extern void ltl2inv_d(int n, double *A_, int ldA, int *iPiv, double *vT, double *M_, int ldM);
+extern void ltl2pfa_z(int n, double complex *A_, int ldA, int *iPiv, double complex *Pfa);
+extern void ltl2inv_z(int n, double complex *A_, int ldA, int *iPiv, double complex *vT, double complex *M_, int ldM);
+extern void utu2pfa_d(int n, double *A_, int ldA, int *iPiv, double *Pfa);
+extern void utu2inv_d(int n, double *A_, int ldA, int *iPiv, double *vT, double *M_, int ldM);
+extern void utu2pfa_z(int n, double complex *A_, int ldA, int *iPiv, double complex *Pfa);
+extern void utu2inv_z(int n, double complex *A_, int ldA, int *iPiv, double complex *vT, double complex *M_, int ldM);
 
 // pBLAS
 extern void M_PDGEMV(const char *trans, const int *m, const int *n,
diff --git a/src/mVMC/include/vmcmain.h b/src/mVMC/include/vmcmain.h
index 090c7b69..cf642973 100644
--- a/src/mVMC/include/vmcmain.h
+++ b/src/mVMC/include/vmcmain.h
@@ -72,6 +72,7 @@ extern int omp_get_thread_num(void);
 #else
  #include "../stcopt_pdposv.c"
 #endif
+#include "../../pfupdates/pf_interface.h"
 
 #include "../stcopt_cg.c"
 
diff --git a/src/mVMC/locgrn.c b/src/mVMC/locgrn.c
index 0f8837c3..058962aa 100644
--- a/src/mVMC/locgrn.c
+++ b/src/mVMC/locgrn.c
@@ -73,10 +73,11 @@ double complex GreenFunc1(const int ri, const int rj, const int s, const double
 
 /* Calculate 2-body Green function <psi|CisAjsCktAlt|x>/<psi|x> */
 /* buffer size = NQPFull+2*Nsize */
-double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl,
+double complex GreenFunc2_(const int ri, const int rj, const int rk, const int rl,
                   const int s, const int t, const double complex ip,
                   int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,
-                  int *projCntNew, double complex *buffer) {
+                  int *projCntNew, double complex *buffer,
+                  int *lazy_info, int *lazy_rsi, int *lazy_msj) {
   double complex z;
   int mj,msj,ml,mtl;
   int rsi,rsj,rtk,rtl;
@@ -145,8 +146,17 @@ double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl
   z = ProjRatio(projCntNew,eleProjCnt);
 
   /* calculate Pfaffian */
-  CalculateNewPfMTwo_fcmp(ml, t, mj, s, pfMNew, eleIdx, 0, NQPFull, bufV);
-  z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  if (!lazy_info) {
+    CalculateNewPfMTwo_fcmp(ml, t, mj, s, pfMNew, eleIdx, 0, NQPFull, bufV);
+    z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  } else {
+    // Save invocation hook instead of calling.
+    lazy_msj[0] = msj; //< to edit to ri, revert to rj
+    lazy_msj[1] = mtl; //< to edit to rk, revert to rl
+    lazy_rsi[0] = rsi;
+    lazy_rsi[1] = rtk;
+    lazy_info[0] = 1; //< "is delayed" flag..
+  }
 
   /* revert hopping */
   eleIdx[mtl] = rl;
@@ -159,6 +169,12 @@ double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl
   return conj(z/ip);//TBC
 }
 
+double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl,
+                  const int s, const int t, const double complex ip,
+                  int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,
+                  int *projCntNew, double complex *buffer) {
+  return GreenFunc2_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, projCntNew, buffer, 0, 0, 0);
+}
 
 // ignore GreenFuncN: to be added
 
@@ -392,12 +408,7 @@ double complex calculateNewPfMN_child(const int qpidx, const int n, const int *m
   /* calculate Pf M */
   //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
   //M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); //TBC
-#ifdef _pfaffine
-  info = 1; // Skip inverse.
-  M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info);
-#else
   M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
   sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0;
 
   return sgn * pfaff * PfM[qpidx];
diff --git a/src/mVMC/locgrn_fsz.c b/src/mVMC/locgrn_fsz.c
index a686906f..6ca1bb2d 100644
--- a/src/mVMC/locgrn_fsz.c
+++ b/src/mVMC/locgrn_fsz.c
@@ -125,10 +125,11 @@ double complex GreenFunc1_fsz2(const int ri, const int rj, const int s,const int
 
 /* Calculate 2-body Green function <psi|CisAjsCktAlt|x>/<psi|x> */
 /* buffer size = NQPFull+2*Nsize */
-double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const int rl,
+double complex GreenFunc2_fsz_(const int ri, const int rj, const int rk, const int rl,
                   const int s, const int t, const double complex ip,
                   int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn,
-                  int *projCntNew, double complex *buffer) {
+                  int *projCntNew, double complex *buffer,
+                  int *lazy_info, int *lazy_rsi, int *lazy_msj) {
   double complex z;
   int mj,msj,ml,mtl;
   int rsi,rsj,rtk,rtl;
@@ -199,8 +200,17 @@ double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const in
   z = ProjRatio(projCntNew,eleProjCnt);
 
   /* calculate Pfaffian */
-  CalculateNewPfMTwo_fsz(ml, t, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV);
-  z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  if (!lazy_info) {
+    CalculateNewPfMTwo_fsz(ml, t, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV);
+    z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  } else {
+    // Save invocation hook instead of calling.
+    lazy_msj[0] = msj; //< to edit to ri, revert to rj
+    lazy_msj[1] = mtl; //< to edit to rk, revert to rl
+    lazy_rsi[0] = rsi;
+    lazy_rsi[1] = rtk;
+    lazy_info[0] = 1; //< "is delayed" flag..
+  }
 
   /* revert hopping */
   eleIdx[mtl] = rl;
@@ -215,12 +225,21 @@ double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const in
   return conj(z/ip);//TBC
 }
 
+double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const int rl,
+                  const int s, const int t, const double complex ip,
+                  int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn,
+                  int *projCntNew, double complex *buffer) {
+  return GreenFunc2_fsz_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, eleSpn,
+                         projCntNew, buffer, 0, 0, 0);
+}
+
 /* Calculate 2-body Green function <psi|CisAjtCkuAlv|x>/<psi|x> */
 /* buffer size = NQPFull+2*Nsize */
-double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const int rl,
+double complex GreenFunc2_fsz2_(const int ri, const int rj, const int rk, const int rl,
                   const int s, const int t,const int u,const int v, const double complex ip,
                   int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn,
-                  int *projCntNew, double complex *buffer) {
+                  int *projCntNew, double complex *buffer,
+                  int *lazy_info, int *lazy_rsi, int *lazy_msj) {
   double complex z;
   //int mj,msj,ml,mtl;
   int mj,ml; // mj: (rj,t) -> (ri,s) ml:(rl,v) -> (rk,u)
@@ -325,8 +344,16 @@ double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const i
   z = ProjRatio(projCntNew,eleProjCnt);
 
   /* calculate Pfaffian */
-  CalculateNewPfMTwo_fsz(ml, u, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); // ml -> rk,u; mj -> ri,s
-  z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  if (!lazy_info) {
+    CalculateNewPfMTwo_fsz(ml, u, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); // ml -> rk,u; mj -> ri,s
+    z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF);
+  } else {
+    lazy_msj[0] = mj;
+    lazy_msj[1] = ml;
+    lazy_rsi[0] = XI;
+    lazy_rsi[1] = XK;
+    lazy_info[0] = 1;
+  }
 
   /* revert hopping */
   eleIdx[ml]  = rl;  // ml : (rl,v)
@@ -342,6 +369,13 @@ double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const i
 }
 
 
+double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const int rl,
+                  const int s, const int t,const int u,const int v, const double complex ip,
+                  int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn,
+                  int *projCntNew, double complex *buffer) {
+  return GreenFunc2_fsz2_(ri, rj, rk, rl, s, t, u, v, ip,
+                          eleIdx, eleCfg, eleNum, eleProjCnt, eleSpn, projCntNew, buffer, 0, 0, 0);
+}
 
 // ignore GreenFuncN: to be added
 
diff --git a/src/mVMC/locgrn_real.c b/src/mVMC/locgrn_real.c
index 4991a54e..4c6a0e1f 100644
--- a/src/mVMC/locgrn_real.c
+++ b/src/mVMC/locgrn_real.c
@@ -77,10 +77,11 @@ double  GreenFunc1_real(const int ri, const int rj, const int s, const double ip
 
 /* Calculate 2-body Green function <psi|CisAjsCktAlt|x>/<psi|x> */
 /* buffer size = NQPFull+2*Nsize */
-double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl,
+double GreenFunc2_real_(const int ri, const int rj, const int rk, const int rl,
                   const int s, const int t, const double ip,
                   int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,
-                  int *projCntNew, double *buffer) {
+                  int *projCntNew, double *buffer,
+                  int *lazy_info, int *lazy_rsi, int *lazy_msj) {
   double z;
   int mj,msj,ml,mtl;
   int rsi,rsj,rtk,rtl;
@@ -149,8 +150,17 @@ double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl,
   z = ProjRatio(projCntNew,eleProjCnt);
 
   /* calculate Pfaffian */
-  CalculateNewPfMTwo_real(ml, t, mj, s, pfMNew_real, eleIdx, 0, NQPFull, bufV);
-  z *= CalculateIP_real(pfMNew_real, 0, NQPFull, MPI_COMM_SELF);
+  if (!lazy_info) {
+    CalculateNewPfMTwo_real(ml, t, mj, s, pfMNew_real, eleIdx, 0, NQPFull, bufV);
+    z *= CalculateIP_real(pfMNew_real, 0, NQPFull, MPI_COMM_SELF);
+  } else {
+    // Save invocation hook instead of calling.
+    lazy_msj[0] = msj; //< to edit to ri, revert to rj
+    lazy_msj[1] = mtl; //< to edit to rk, revert to rl
+    lazy_rsi[0] = rsi;
+    lazy_rsi[1] = rtk;
+    lazy_info[0] = 1; //< "is delayed" flag..
+  }
 
   /* revert hopping */
   eleIdx[mtl] = rl;
@@ -163,6 +173,13 @@ double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl,
   return z/ip;//TBC
 }
 
+double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl,
+                  const int s, const int t, const double ip,
+                  int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,
+                  int *projCntNew, double *buffer) {
+  return GreenFunc2_real_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, projCntNew, buffer, 0, 0, 0);
+}
+
 
 /// Calculate n-body Green function
 /// <phi| c1 a1 c2 a2 ... cn an |x>
@@ -397,9 +414,6 @@ double calculateNewPfMN_real_child(const int qpidx, const int n, const int *msa,
   }
 
   /* calculate Pf M */
-  // TODO: Maybe block updating is faster than recalculation of Pfaffian.
-  //       Maybe I'll also give support for this in Pfaffine.
-  info = 1; // Skip inverse.
   M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
 
   sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0;
diff --git a/src/mVMC/makefile_src b/src/mVMC/makefile_src
index 52ac8076..bf9eae6f 100644
--- a/src/mVMC/makefile_src
+++ b/src/mVMC/makefile_src
@@ -1,8 +1,8 @@
 include ../make.sys
 
 PFAPACK = ../pfapack/fortran/libpfapack.a
-PFAFFINE = ../pfaffine/src/libpfaffine.a
 PFUPDATES = ../pfupdates/pfupdates.o
+LTL2INV = ../ltl2inv/ltl2inv.a
 SFMT = ../sfmt/SFMT.o
 STDFACE = ../StdFace/src/libStdFace.a
 OPTION = -D_mpi_use
@@ -19,7 +19,7 @@ OBJS = \
 	splitloop.o \
 	vmcmain.o \
 	$(PFAPACK) $(SFMT) $(STDFACE) \
-	$(PFUPDATES) $(PFAFFINE)
+	$(PFUPDATES) $(LTL2INV)
 
 SOURCES = \
 average.c \
@@ -107,7 +107,7 @@ HEADERS = \
 
 all : 
 	$(MAKE) -C ../pfapack/fortran                libpfapack.a
-	$(MAKE) -C ../pfaffine/src                   libpfaffine.a
+	$(MAKE) -C ../ltl2inv     -f makefile_ltl2inv
 	$(MAKE) -C ../pfupdates   -f makefile_pfupdates
 	$(MAKE) -C ../sfmt        -f makefile_sfmt
 	$(MAKE) -C ../StdFace/src -f makefile_StdFace libStdFace.a
@@ -130,7 +130,7 @@ clean :
 	rm -f *.o vmc.out vmcdry.out
 	$(MAKE) -C ../sfmt            -f makefile_sfmt      clean
 	$(MAKE) -C ../pfapack/fortran -f makefile           clean
-	$(MAKE) -C ../pfaffine        -f makefile           clean
+	$(MAKE) -C ../ltl2inv         -f makefile_ltl2inv   clean
 	$(MAKE) -C ../pfupdates       -f makefile_pfupdates clean
 	$(MAKE) -C ../StdFace/src     -f makefile_StdFace   clean
 	$(MAKE) -C ../ComplexUHF      -f makefile_uhf       clean
diff --git a/src/mVMC/matrix.c b/src/mVMC/matrix.c
index a86330ca..646c8f18 100644
--- a/src/mVMC/matrix.c
+++ b/src/mVMC/matrix.c
@@ -32,8 +32,6 @@ along with this program. If not, see http://www.gnu.org/licenses/.
 #include "./include/global.h"
 #include "workspace.c"
 
-//int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx,
-//                        double *bufM, int *iwork, double *work, int lwork);
 #define D_PfLimit 1.0e-100
 
 int getLWork() {
@@ -69,11 +67,7 @@ int getLWork_fcmp() {
   lwork=-1;
   M_ZGETRI(&n, &a, &lda, &iwork, &optSize1, &lwork, &info);
   lwork=-1;
-#ifdef _pfaffine
-  M_ZSKPFA(&uplo, &mthd, &n, &a, &lda, &pfaff, &iwork, &optSize2, &lwork/*, &rwork*/, &info);
-#else
   M_ZSKPFA(&uplo, &mthd, &n, &a, &lda, &pfaff, &iwork, &optSize2, &lwork, &rwork, &info);
-#endif
 
   lwork = (creal(optSize1)>creal(optSize2)) ? (int)creal(optSize1) : (int)creal(optSize2);
   return lwork;
@@ -135,7 +129,6 @@ int calculateMAll_child_fsz(const int *eleIdx,const int *eleSpn, const int qpSta
   int msi,msj;
   int rsi,rsj;
 
-  char uplo='U', mthd='P';
   int m,n,nsq,one,lda,info=0;
   //int nspn = 2*Ne+2*Nsite+2*Nsite+NProj; this is useful?
   double complex pfaff,minus_one;
@@ -147,60 +140,40 @@ int calculateMAll_child_fsz(const int *eleIdx,const int *eleSpn, const int qpSta
   const double complex *sltE_i;
 
   double complex *invM = InvM + qpidx*Nsize*Nsize;
-  double complex *invM_i;
-
-  double complex *bufM_i, *bufM_i2;
+  double complex *invM_i, *invM_i2;
 
   m=n=lda=Nsize;
   nsq=n*n;
   one=1;
   minus_one=-1.0;
 
-  /* store bufM */
-  /* Note that bufM is column-major and skew-symmetric. */
-  /* bufM[msj][msi] = -sltE[rsi][rsj] */
+  /* store invM */
+  /* Note that invM is column-major and skew-symmetric. */
+  /* invM[msj][msi] = -sltE[rsi][rsj] */
 #pragma loop noalias
   for(msi=0;msi<nsize;msi++) {
     rsi = eleIdx[msi] + eleSpn[msi]*Nsite;//fsz
-    bufM_i = bufM + msi*Nsize;
+    invM_i = invM + msi*Nsize;
     sltE_i = sltE + rsi*Nsite2;
 #pragma loop norecurrence
     for(msj=0;msj<nsize;msj++) {
       rsj = eleIdx[msj] + eleSpn[msj]*Nsite;//fsz
-      bufM_i[msj] = -sltE_i[rsj];
+      invM_i[msj] = -sltE_i[rsj];
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-  M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work/*, rwork*/, &lwork, &info);
-#else
-  /* Pfaffian/inverse computed separately. */
-  /* Copy bufM to invM before using bufM to compute Pfaffian. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = bufM[msi];
-
-  /* Calculate Pf M */
-  M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
-
+  M_ZSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info);
   if(info!=0) return info;
+
+  ltl2pfa_z(n, invM, lda, iwork, &pfaff);
   if(!isfinite(creal(pfaff) + cimag(pfaff))) return qpidx+1;
   PfM[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* For fused Pfaffian/inverse, inv(M) is already stored in bufM.
-   * Now to transpose (.* -1) it to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Calculate inverse. */
-  M_ZGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
-  M_ZGETRI(&n, invM, &lda, iwork, work, &lwork, &info);
+  ltl2inv_z(n, invM, lda, iwork, work, bufM, lda);
 
   /* InvM -> InvM(T) -> -InvM */
   M_ZSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
@@ -254,7 +227,6 @@ int calculateMAll_child_fsz_real(const int *eleIdx,const int *eleSpn, const int
   int msi,msj;
   int rsi,rsj;
 
-  char uplo='U', mthd='P';
   int m,n,lda,one,nsq,info=0;
   //int nspn = 2*Ne+2*Nsite+2*Nsite+NProj; this is useful?
   double pfaff,minus_one;
@@ -266,61 +238,41 @@ int calculateMAll_child_fsz_real(const int *eleIdx,const int *eleSpn, const int
   const double *sltE_i;
 
   double *invM = InvM_real + qpidx*Nsize*Nsize;
-  double *invM_i;
-
-  double *bufM_i, *bufM_i2;
+  double *invM_i, *invM_i2;
 
   m=n=lda=Nsize;
   nsq=n*n;
   one=1;
   minus_one=-1.0;
 
-  /* store bufM */
-  /* Note that bufM is column-major and skew-symmetric. */
-  /* bufM[msj][msi] = -sltE[rsi][rsj] */
+  /* store invM */
+  /* Note that invM is column-major and skew-symmetric. */
+  /* invM[msj][msi] = -sltE[rsi][rsj] */
 #pragma loop noalias
   for(msi=0;msi<nsize;msi++) {
     rsi = eleIdx[msi] + eleSpn[msi]*Nsite;//fsz
-    bufM_i = bufM + msi*Nsize;
+    invM_i = invM + msi*Nsize;
     sltE_i = sltE + rsi*Nsite2;
 #pragma loop norecurrence
     for(msj=0;msj<nsize;msj++) {
       rsj = eleIdx[msj] + eleSpn[msj]*Nsite;//fsz
-      bufM_i[msj] = -sltE_i[rsj];
+      invM_i[msj] = -sltE_i[rsj];
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-#else
-  /* Pfaffian/inverse computed separately. */
-  /* Copy bufM to invM before using bufM to compute Pfaffian. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = bufM[msi];
-#endif
-
   /* Calculate Pf M */
-  M_DSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, &info);
-
+  M_DSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info);
   if(info!=0) return info;
+
+  ltl2pfa_d(n, invM, lda, iwork, &pfaff);
   if(!isfinite(pfaff)) return qpidx+1;
   PfM_real[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* inv(M) already stored in bufM.
-   * Transpose (.* -1) to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Calculate inverse. */
-  M_DGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
-  if(info!=0) return info;
-  M_DGETRI(&n, invM, &lda, iwork, work, &lwork, &info);
-  if(info!=0) return info;
+  ltl2inv_d(n, invM, lda, iwork, work, bufM, lda);
 
   /* InvM -> InvM(T) -> -InvM */
   M_DSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
@@ -384,7 +336,6 @@ int calculateMAll_child_fcmp(const int *eleIdx, const int qpStart, const int qpE
   int msi,msj;
   int rsi,rsj;
 
-  char uplo='U', mthd='P';
   int m,n,nsq,lda,one,info=0;
   double complex pfaff,minus_one;
 
@@ -395,61 +346,42 @@ int calculateMAll_child_fcmp(const int *eleIdx, const int qpStart, const int qpE
   const double complex *sltE_i;
 
   double complex *invM = InvM + qpidx*Nsize*Nsize;
-  double complex *invM_i;
-
-  double complex *bufM_i, *bufM_i2;
+  double complex *invM_i, *invM_i2;
 
   m=n=lda=Nsize;
   nsq=n*n;
   one=1;
   minus_one=-1.0;
 
-  /* store bufM */
-  /* Note that bufM is column-major and skew-symmetric. */
-  /* bufM[msj][msi] = -sltE[rsi][rsj] */
+  /* store invM */
+  /* Note that invM is column-major and skew-symmetric. */
+  /* invM[msj][msi] = -sltE[rsi][rsj] */
 #pragma loop noalias
   for(msi=0;msi<nsize;msi++) {
     rsi = eleIdx[msi] + (msi/Ne)*Nsite;
-    bufM_i = bufM + msi*Nsize;
+    invM_i = invM + msi*Nsize;
     sltE_i = sltE + rsi*Nsite2;
 #pragma loop norecurrence
     for(msj=0;msj<nsize;msj++) {
       rsj = eleIdx[msj] + (msj/Ne)*Nsite;
-      bufM_i[msj] = -sltE_i[rsj];
+      invM_i[msj] = -sltE_i[rsj];
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-  M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work/*, rwork*/, &lwork, &info);
-#else
-  /* Pfaffian/inverse computed separately. */
-  /* Copy bufM to invM before using bufM to compute Pfaffian. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = bufM[msi];
-
   /* Calculate Pf M */
-  M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
-
+  M_ZSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info);
   if(info!=0) return info;
+
+  ltl2pfa_z(n, invM, lda, iwork, &pfaff);
   if(!isfinite(creal(pfaff) + cimag(pfaff))) return qpidx+1;
   PfM[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* inv(M) already stored in bufM.
-   * Transpose (.* -1) to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Calculate inverse. */
-  M_ZGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
-  M_ZGETRI(&n, invM, &lda, iwork, work, &lwork, &info);
+  ltl2inv_z(n, invM, lda, iwork, work, bufM, lda);
 
   /* mVMC's handling InvM as row-major,
    * i.e. InvM needs a transpose, InvM -> -InvM according antisymmetric properties. */
   M_ZSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
@@ -553,10 +485,6 @@ int calculateMAll_BF_fcmp_child(
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-  M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info);
-#else
   /* Pfaffian/inverse computed separately. */
   /* Copy bufM to invM before using bufM to compute Pfaffian. */
   for(msi=0;msi<nsize*nsize;msi++)
@@ -564,18 +492,11 @@ int calculateMAll_BF_fcmp_child(
 
   /* Calculate Pf M */
   M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
 
   if(info!=0) return info;
   if(!(isfinite(creal(pfaff)) && isfinite(cimag(pfaff)))) return qpidx+1;
   PfM[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* inv(M) already stored in bufM.
-   * Transpose (.* -1) to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Calculate inverse. */
   M_ZGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
   if(info!=0) return info;
@@ -584,7 +505,6 @@ int calculateMAll_BF_fcmp_child(
 
   /* InvM -> InvM(T) -> -InvM */
   M_ZSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
@@ -634,8 +554,6 @@ int CalculateMAll_real(const int *eleIdx, const int qpStart, const int qpEnd) {
   return info;
 }
 
-//int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx,
-//                        double *bufM, int *iwork, double *work, int lwork) {
 int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx,
     double *bufM, int *iwork, double *work, int lwork, double* PfM_real, double *InvM_real) {
 #pragma procedure serial
@@ -654,60 +572,45 @@ int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpE
   const double *sltE_i;
 
   double *invM = InvM_real + qpidx*Nsize*Nsize;
-  double *invM_i;
-
-  double *bufM_i, *bufM_i2;
+  double *invM_i, *invM_i2;
 
   m=n=lda=Nsize;
   nsq=n*n;
   one=1;
   minus_one=-1.0;
 
-  /* store bufM */
-  /* Note that bufM is column-major and skew-symmetric. */
-  /* bufM[msj][msi] = -sltE[rsi][rsj] */
+  /* store invM */
+  /* Note that invM is column-major and skew-symmetric. */
+  /* invM[msj][msi] = -sltE[rsi][rsj] */
 #pragma loop noalias
   for(msi=0;msi<nsize;msi++) {
     rsi = eleIdx[msi] + (msi/Ne)*Nsite;
-    bufM_i = bufM + msi*Nsize;
+    invM_i = invM + msi*Nsize;
     sltE_i = sltE + rsi*Nsite2;
 #pragma loop norecurrence
     for(msj=0;msj<nsize;msj++) {
       rsj = eleIdx[msj] + (msj/Ne)*Nsite;
-      bufM_i[msj] = -sltE_i[rsj];
+      invM_i[msj] = -sltE_i[rsj];
 
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-#else
-  /* Pfaffian/inverse computed separately. */
-  /* Copy bufM to invM before using bufM to compute Pfaffian. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = bufM[msi];
-#endif
-
   /* calculate Pf M */
-  M_DSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, &info);
-
+  M_DSKTRF("L", "L", &n, invM, &lda, iwork, bufM, &nsq, &info);
+  // printf("M_DSKTRF info:%d\n", info);
   if(info!=0) return info;
+
+  ltl2pfa_d(n, invM, lda, iwork, &pfaff);
+  // printf("pfaff:%e\n", pfaff);
   if(!isfinite(pfaff)) return qpidx+1;
   PfM_real[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* inv(M) already stored in bufM.
-   * Transpose (.* -1) to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Compute inverse */
-  M_DGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
-  M_DGETRI(&n, invM, &lda, iwork, work, &lwork, &info);
+  ltl2inv_d(n, invM, lda, iwork, work, bufM, lda);
 
   // InvM -> InvM' = -InvM
+  // TODO: Include this in ltl2inv
   M_DSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
@@ -793,14 +696,10 @@ int calculateMAll_BF_real_child(const int *eleIdx, const int qpStart, const int
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-#else
   /* Pfaffian/inverse computed separately. */
   /* Copy bufM to invM before using bufM to compute Pfaffian. */
   for(msi=0;msi<nsize*nsize;msi++)
     invM[msi] = bufM[msi];
-#endif
 
   /* Calculate Pf M */
   M_DSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, &info);
@@ -809,12 +708,6 @@ int calculateMAll_BF_real_child(const int *eleIdx, const int qpStart, const int
   if(!isfinite(pfaff)) return qpidx+1;
   PfM_real[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* inv(M) already stored in bufM.
-   * Transpose (.* -1) to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Compute inverse. */
   M_DGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
   if(info!=0) return info;
@@ -823,7 +716,6 @@ int calculateMAll_BF_real_child(const int *eleIdx, const int qpStart, const int
 
   // InvM -> InvM' = -InvM
   M_DSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
diff --git a/src/mVMC/parameter.c b/src/mVMC/parameter.c
index 9167c941..fc0b59ef 100644
--- a/src/mVMC/parameter.c
+++ b/src/mVMC/parameter.c
@@ -29,7 +29,7 @@ along with this program. If not, see http://www.gnu.org/licenses/.
 #ifndef _SRC_PARAMETER
 #define _SRC_PARAMETER
 
-#define D_AmpMax             4.0
+#define D_AmpMax             0.5
 
 /* initialize variational parameters */
 void InitParameter() {
diff --git a/src/mVMC/pfupdate.c b/src/mVMC/pfupdate.c
index 42b82b35..5dc67cec 100644
--- a/src/mVMC/pfupdate.c
+++ b/src/mVMC/pfupdate.c
@@ -352,12 +352,7 @@ double complex calculateNewPfMBFN4_child(const int qpidx, const int n, const int
 
   /* Calculate Pf M */
   //TODO: CHECK rwork is real <-- [R-Xu] Not needed anymore?
-#ifdef _pfaffine
-  info = 1; // Skipping inverse.
-  M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info);
-#else
   M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
   //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
 
   sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0;
@@ -593,12 +588,7 @@ double complex updateMAll_BF_fcmp_child(
   }
 
   /* Calculate Pf M */
-#ifdef _pfaffine
-  info = 1; // Skip Pfaffine inverse.
-  M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info);
-#else
   M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
   //M_DSKPFA(&uplo, &mthd, &nn, invM, &lda, &pfaff, iwork, work, &lwork, &info);
   //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
 
diff --git a/src/mVMC/pfupdate_real.c b/src/mVMC/pfupdate_real.c
index 2b25909d..690b7c86 100644
--- a/src/mVMC/pfupdate_real.c
+++ b/src/mVMC/pfupdate_real.c
@@ -361,8 +361,6 @@ double calculateNewPfMBFN4_real_child(const int qpidx, const int n, const int *m
   }
 
   /* calculate Pf M */
-  //M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-  info = 1; // Skip Pfaffine's inverse.
   M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
 
   sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0;
@@ -547,7 +545,7 @@ double updateMAll_BF_real_child(const int qpidx, const int n, const int *msa,
   // NOTE: This routines is not called in current version (i.e. lcoal coverage 0),
   //       hence cannot be tested. I'm keeping GETRF and GETRI here.
   // TODO: If anyone's interested in utilizing this routine, she or he might want to
-  //       utilize inverse feature of Pfaffine skpfa.
+  //       utilize DSKTRF+LTL2INV
   m=nn=lda=n2;
   //M_ZGETRF(&m, &nn, invMat, &lda, iwork2, &info); /* ipiv = iwork */
   M_DGETRF(&m, &nn, invMat, &lda, iwork2, &info); /* ipiv = iwork */
@@ -601,8 +599,6 @@ double updateMAll_BF_real_child(const int qpidx, const int n, const int *msa,
   /* calculate Pf M */
   //M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
   //M_DSKPFA(&uplo, &mthd, &nn, invM, &lda, &pfaff, iwork, work, &lwork, &info);
-  // As stated above, I'm now only setting Pfaffine to skip optinal inverse.
-  info = 1; // This is parameter for skipping inversion.
   M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
 
   sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0;
diff --git a/src/mVMC/vmcmake.c b/src/mVMC/vmcmake.c
index 57ab2dd2..fa35d7a5 100644
--- a/src/mVMC/vmcmake.c
+++ b/src/mVMC/vmcmake.c
@@ -36,11 +36,6 @@ along with this program. If not, see http://www.gnu.org/licenses/.
 #include "splitloop.h"
 #include "qp.h"
 
-#ifdef _pf_block_update
-// Block-update extension.
-#include "../pfupdates/pf_interface.h"
-#endif
-
 void VMCMakeSample(MPI_Comm comm) {
   int outStep,nOutStep;
   int inStep,nInStep;
@@ -74,16 +69,19 @@ void VMCMakeSample(MPI_Comm comm) {
   // TODO: Compute from qpStart to qpEnd to support loop splitting.
   void *pfOrbital[NQPFull];
   void *pfUpdator[NQPFull];
+  void *pfMat[NQPFull];
+  void *pfMap[NQPFull];
   // Read block size from input.
   const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE");
   if (optBlockSize)
     NBlockUpdateSize = atoi(optBlockSize);
   // Fall back to default if input is invalid.
-  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100)
+  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) {
     if (NExUpdatePath == 0)
       NBlockUpdateSize = 4;
     else
       NBlockUpdateSize = 20;
+  }
 
   // Set one universal EleSpn.
   for (mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;
@@ -94,7 +92,7 @@ void VMCMakeSample(MPI_Comm comm) {
                        InvM, Nsize*Nsize,
                        TmpEleIdx, EleSpn,
                        NBlockUpdateSize,
-                       pfUpdator, pfOrbital);
+                       pfUpdator, pfOrbital, pfMat, pfMap);
   updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
   CalculateMAll_fcmp(TmpEleIdx,qpStart,qpEnd);
@@ -108,13 +106,13 @@ void VMCMakeSample(MPI_Comm comm) {
                       qpStart,qpEnd,comm);
 #ifdef _pf_block_update
     // Clear and reinitialize.
-    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize,
                          SlaterElm, Nsite2*Nsite2,
                          InvM, Nsize*Nsize,
                          TmpEleIdx, EleSpn,
                          NBlockUpdateSize,
-                         pfUpdator, pfOrbital);
+                         pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
     CalculateMAll_fcmp(TmpEleIdx,qpStart,qpEnd);
@@ -276,13 +274,13 @@ void VMCMakeSample(MPI_Comm comm) {
         StartTimer(34);
 #ifdef _pf_block_update
         // Clear and reinitialize.
-        updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+        updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize,
                              SlaterElm, Nsite2*Nsite2,
                              InvM, Nsize*Nsize,
                              TmpEleIdx, EleSpn,
                              NBlockUpdateSize,
-                             pfUpdator, pfOrbital);
+                             pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
         CalculateMAll_fcmp(TmpEleIdx,qpStart,qpEnd);
@@ -309,7 +307,7 @@ void VMCMakeSample(MPI_Comm comm) {
 
 #ifdef _pf_block_update
   // Free-up updator space.
-  updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+  updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
 #endif
 
   return;
diff --git a/src/mVMC/vmcmake_fsz.c b/src/mVMC/vmcmake_fsz.c
index 404318e4..ea0d44af 100644
--- a/src/mVMC/vmcmake_fsz.c
+++ b/src/mVMC/vmcmake_fsz.c
@@ -83,16 +83,19 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
   // TODO: Compute from qpStart to qpEnd to support loop splitting.
   void *pfOrbital[NQPFull];
   void *pfUpdator[NQPFull];
+  void *pfMat[NQPFull];
+  void *pfMap[NQPFull];
   // Read block size from input.
   const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE");
   if (optBlockSize)
     NBlockUpdateSize = atoi(optBlockSize);
   // Fall back to default if input is invalid.
-  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100)
+  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) {
     if (NExUpdatePath == 0)
       NBlockUpdateSize = 4;
     else
       NBlockUpdateSize = 20;
+  }
 
   // Initialize with free spin configuration.
   updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize,
@@ -100,7 +103,7 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
                        InvM, Nsize*Nsize,
                        TmpEleIdx, TmpEleSpn,
                        NBlockUpdateSize,
-                       pfUpdator, pfOrbital);
+                       pfUpdator, pfOrbital, pfMat, pfMap);
   updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
   CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -117,13 +120,13 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
 
 #ifdef _pf_block_update
     // Clear and reinitialize.
-    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+    updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize,
                          SlaterElm, Nsite2*Nsite2,
                          InvM, Nsize*Nsize,
                          TmpEleIdx, TmpEleSpn,
                          NBlockUpdateSize,
-                         pfUpdator, pfOrbital);
+                         pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
     CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -382,13 +385,13 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
         StartTimer(34);
 #ifdef _pf_block_update
         // Clear and reinitialize.
-        updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+        updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize,
                              SlaterElm, Nsite2*Nsite2,
                              InvM, Nsize*Nsize,
                              TmpEleIdx, TmpEleSpn,
                              NBlockUpdateSize,
-                             pfUpdator, pfOrbital);
+                             pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator);
 #else
         CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -426,7 +429,7 @@ void VMCMakeSample_fsz(MPI_Comm comm) {
 
 #ifdef _pf_block_update
   // Free-up updator space.
-  updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital);
+  updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
 #endif
 
   return;
diff --git a/src/mVMC/vmcmake_fsz_real.c b/src/mVMC/vmcmake_fsz_real.c
index b76e6505..f3dda1d6 100644
--- a/src/mVMC/vmcmake_fsz_real.c
+++ b/src/mVMC/vmcmake_fsz_real.c
@@ -79,6 +79,8 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) {
   // TODO: Compute from qpStart to qpEnd to support loop splitting.
   void *pfOrbital[NQPFull];
   void *pfUpdator[NQPFull];
+  void *pfMat[NQPFull];
+  void *pfMap[NQPFull];
   // TODO: Make it input parameter.
   if (NExUpdatePath == 0)
     NBlockUpdateSize = 4;
@@ -91,7 +93,7 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) {
                        InvM_real, Nsize*Nsize,
                        TmpEleIdx, TmpEleSpn,
                        NBlockUpdateSize,
-                       pfUpdator, pfOrbital);
+                       pfUpdator, pfOrbital, pfMat, pfMap);
   updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
   CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -108,13 +110,13 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) {
 
 #ifdef _pf_block_update
     // Clear and reinitialize.
-    updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+    updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize,
                          SlaterElm_real, Nsite2*Nsite2,
                          InvM_real, Nsize*Nsize,
                          TmpEleIdx, TmpEleSpn,
                          NBlockUpdateSize,
-                         pfUpdator, pfOrbital);
+                         pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
     CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -368,13 +370,13 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) {
         StartTimer(34);
 #ifdef _pf_block_update
         // Clear and reinitialize.
-        updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+        updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize,
                              SlaterElm_real, Nsite2*Nsite2,
                              InvM_real, Nsize*Nsize,
                              TmpEleIdx, TmpEleSpn,
                              NBlockUpdateSize,
-                             pfUpdator, pfOrbital);
+                             pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
         CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd);
@@ -412,7 +414,7 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) {
 
 #ifdef _pf_block_update
   // Free-up updator space.
-  updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+  updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
 #endif
 
   return;
diff --git a/src/mVMC/vmcmake_real.c b/src/mVMC/vmcmake_real.c
index 6c9bde7d..9b73521f 100644
--- a/src/mVMC/vmcmake_real.c
+++ b/src/mVMC/vmcmake_real.c
@@ -37,11 +37,6 @@ which follows "The BSD 3-Clause License".
 #include "splitloop.h"
 #include "vmcmake.h"
 
-#ifdef _pf_block_update
-// Block-update extension.
-#include "../pfupdates/pf_interface.h"
-#endif
-
 void VMCMakeSample_real(MPI_Comm comm) {
   int outStep, nOutStep;
   int inStep, nInStep;
@@ -66,7 +61,7 @@ void VMCMakeSample_real(MPI_Comm comm) {
 
   StartTimer(30);
   if (BurnFlag == 0) {
-    makeInitialSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt,
+    makeInitialSample_real(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt,
                       qpStart, qpEnd, comm);
   } else {
     copyFromBurnSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt);
@@ -76,16 +71,19 @@ void VMCMakeSample_real(MPI_Comm comm) {
   // TODO: Compute from qpStart to qpEnd to support loop splitting.
   void *pfOrbital[NQPFull];
   void *pfUpdator[NQPFull];
+  void *pfMat[NQPFull];
+  void *pfMap[NQPFull];
   // Read block size from input.
   const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE");
   if (optBlockSize)
     NBlockUpdateSize = atoi(optBlockSize);
   // Fall back to default if input is invalid.
-  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100)
+  if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) {
     if (NExUpdatePath == 0)
       NBlockUpdateSize = 2;
     else
       NBlockUpdateSize = 20;
+  }
 
   // Set one universal EleSpn.
   for (mi=0; mi<Ne;  mi++) EleSpn[mi] = 0;
@@ -96,7 +94,7 @@ void VMCMakeSample_real(MPI_Comm comm) {
                        InvM_real, Nsize*Nsize,
                        TmpEleIdx, EleSpn,
                        NBlockUpdateSize,
-                       pfUpdator, pfOrbital);
+                       pfUpdator, pfOrbital, pfMat, pfMap);
   updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
   CalculateMAll_real(TmpEleIdx, qpStart, qpEnd);
@@ -106,17 +104,17 @@ void VMCMakeSample_real(MPI_Comm comm) {
 
   if (!isfinite(logIpOld)) {
     if (rank == 0) fprintf(stderr, "waring: VMCMakeSample remakeSample logIpOld=%e\n", creal(logIpOld)); //TBC
-    makeInitialSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt,
+    makeInitialSample_real(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt,
                       qpStart, qpEnd, comm);
 #ifdef _pf_block_update
     // Clear and reinitialize.
-    updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+    updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize,
                          SlaterElm_real, Nsite2*Nsite2,
                          InvM_real, Nsize*Nsize,
                          TmpEleIdx, EleSpn,
                          NBlockUpdateSize,
-                         pfUpdator, pfOrbital);
+                         pfUpdator, pfOrbital, pfMat, pfMap);
     updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
     CalculateMAll_real(TmpEleIdx, qpStart, qpEnd);
@@ -283,13 +281,13 @@ void VMCMakeSample_real(MPI_Comm comm) {
         StartTimer(34);
 #ifdef _pf_block_update
         // Clear and reinitialize.
-        updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+        updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize,
                              SlaterElm_real, Nsite2*Nsite2,
                              InvM_real, Nsize*Nsize,
                              TmpEleIdx, EleSpn,
                              NBlockUpdateSize,
-                             pfUpdator, pfOrbital);
+                             pfUpdator, pfOrbital, pfMat, pfMap);
         updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator);
 #else
         CalculateMAll_real(TmpEleIdx, qpStart, qpEnd);
@@ -316,7 +314,7 @@ void VMCMakeSample_real(MPI_Comm comm) {
 
 #ifdef _pf_block_update
   // Free-up updator space.
-  updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital);
+  updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap);
 #endif
 
   return;
diff --git a/src/pfaffine b/src/pfaffine
deleted file mode 160000
index ebbb37ee..00000000
--- a/src/pfaffine
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ebbb37ee4ac9b08b0907e81c5b593df5c975514a
diff --git a/src/pfapack b/src/pfapack
index b655435f..f541a51a 160000
--- a/src/pfapack
+++ b/src/pfapack
@@ -1 +1 @@
-Subproject commit b655435f2351c1f18bad0bfb7107822f505332ec
+Subproject commit f541a51a3a6968b84117cf12f69b0776053a79e3
diff --git a/src/pfupdates/CMakeLists.txt b/src/pfupdates/CMakeLists.txt
index 5f313232..a7ef83a4 100644
--- a/src/pfupdates/CMakeLists.txt
+++ b/src/pfupdates/CMakeLists.txt
@@ -5,14 +5,12 @@ if(${CMAKE_PROJECT_NAME} STREQUAL "Project")
   message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.")
 endif(${CMAKE_PROJECT_NAME} STREQUAL "Project")
 
-add_definitions(-DBLAS_EXTERNAL)
-add_definitions(-DF77_COMPLEX_RET_INTEL)
-include_directories(../pfaffine/src)
+if("${ARCHITECTURE}" STREQUAL "x86_64")
+  add_definitions(-DF77_COMPLEX_RET_INTEL)
+endif()
+include_directories(../ltl2inv)
+include_directories(../pfapack/c_interface)
 
-add_library(pfupdates STATIC
-  pf_interface.cc
-  ../pfaffine/src/skpfa.cc
-  ../pfaffine/src/sktdf.cc
-  ../pfaffine/src/sktdi.cc)
+add_library(pfupdates STATIC pf_interface.cc)
 target_compile_definitions(pfupdates PRIVATE -D_CC_IMPL)
 
diff --git a/src/pfupdates/config_manager.tcc b/src/pfupdates/config_manager.tcc
new file mode 100644
index 00000000..978cbcf5
--- /dev/null
+++ b/src/pfupdates/config_manager.tcc
@@ -0,0 +1,280 @@
+/**
+ * \copyright Copyright (c) Dept. Phys., Univ. Tokyo
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "error.hh"
+#include <random>
+#include <vector>
+#include <tuple>
+#include <cstdint>
+
+namespace vmc
+{
+struct config_manager
+{
+  using index_t = int32_t;
+  using base_t = std::vector<index_t>;
+
+  const index_t nsites;
+  index_t norbs()
+  { return nsites * 2; }
+
+  const  base_t &config_base()          const { return _cfg_base; }
+  const index_t  config_base(index_t i) const { return _cfg_base.at(i); }
+  const  base_t &   from_idx()          const { return _from_idx; }
+  const index_t     from_idx(index_t i) const { return _from_idx.at(i); }
+  const  base_t &     to_orb()          const { return _to_orb; }
+  const index_t       to_orb(index_t i) const { return _to_orb.at(i); }
+  const  base_t &     config()          const { return _cfg; }
+  const index_t       config(index_t i) const { return _cfg.at(i); }
+  const  base_t &        num()          const { return _num; }
+  const index_t          num(index_t i) const { return _num.at(i); }
+  const  base_t &        inv()          const { return _inv; }
+  const index_t          inv(index_t i) const { return _inv.at(i); }
+  const index_t         spin(index_t i) const { return _num.at(i) == 1 ? 1 : _num.at(i) == 2 ? -1 : 0; }
+
+  void reserve_space(index_t mmax)
+  {
+    _from_idx.reserve(mmax * sizeof(index_t));
+    _to_orb.reserve(mmax * sizeof(index_t));
+  }
+
+  void attach_config(const base_t &cfg)
+  {
+    _cfg_base = cfg;
+    _cfg = cfg;
+    _from_idx.clear();
+    _to_orb.clear();
+
+    // Reset _num.
+    for (index_t i = 0; i < nsites; ++i)
+      _num.at(i) = 0;
+
+    // Generate num.
+    for (index_t i = 0; i < cfg.size(); ++i) {
+      // 0b01 for up, 0b10 for down, 0b11 for double occupation.
+      index_t xi = cfg.at(i) % nsites;
+      index_t si = cfg.at(i) / nsites;
+      _num.at(xi) += (si + 1);
+    }
+
+    // Generate inv.
+    for (index_t i = 0; i < _inv.size(); ++i)
+      _inv.at(i) = -1;
+    for (index_t i = 0; i < cfg.size(); ++i)
+      _inv.at(cfg.at(i)) = i;
+  }
+
+  void merge_config()
+  {
+    // OR: cfg_base = cfg;
+    for (index_t j = 0; j < _from_idx.size(); ++j)
+      _cfg_base.at(_from_idx.at(j)) = _to_orb.at(j);
+    _from_idx.clear();
+    _to_orb.clear();
+  }
+
+  void push_update(index_t osi, index_t msj)
+  {
+    for (auto i = _from_idx.begin(); i != _from_idx.end(); ++i)
+      assert_(*i != msj, typeid(*this).name(), "Consecutive hopping unsupported. Please merge manually before proceeding.");
+    index_t osj = _cfg.at(msj);
+    index_t xj = osj % nsites;
+    index_t sj = osj / nsites;
+    index_t xi = osi % nsites;
+    index_t si = osi / nsites;
+    assert_(_inv.at(osj) == msj && _inv.at(osi) == -1, typeid(*this).name(), "Inv error @ push.");
+
+    _from_idx.push_back(msj);
+    _to_orb.push_back(osi);
+    _cfg.at(msj) = osi;
+    // Update num.
+    _num.at(xj) -= sj + 1;
+    _num.at(xi) += si + 1;
+    // Update inv.
+    _inv.at(osj) = -1;
+    _inv.at(osi) = msj;
+  }
+
+  void pop_update()
+  {
+    assert_(_from_idx.size(), typeid(*this).name(), "Trying to pop empty updates.");
+    index_t msj = _from_idx.back();
+    index_t osj = _cfg_base.at(msj);
+    index_t xj  = osj % nsites;
+    index_t sj  = osj / nsites;
+    index_t osi = _cfg.at(msj);
+    index_t xi  = osi % nsites;
+    index_t si  = osi / nsites;
+    assert_(_inv.at(osi) == msj && _inv.at(osj) == -1, typeid(*this).name(), "Inv error @ pop.");
+
+    _cfg.at(msj) = osj;
+    // Update num.
+    _num.at(xi) -= si + 1;
+    _num.at(xj) += sj + 1;
+    // Update inv.
+    _inv.at(osi) = -1;
+    _inv.at(osj) = msj;
+    _from_idx.pop_back();
+    _to_orb.pop_back();
+  }
+
+  void check()
+  {
+    assert_(_cfg.size() <= norbs(), typeid(*this).name(), "More Fermions than orbitals.");
+
+    base_t num(nsites, 0);
+    for (index_t i = 0; i < num.size(); ++i) {
+      index_t xi = _cfg.at(i) % nsites;
+      index_t si = _cfg.at(i) / nsites;
+
+      assert_(!(num.at(xi) & (si + 1)), typeid(*this).name(), "More than two Fermions occupying the same orbital.");
+      num.at(xi) += (si + 1);
+    }
+    for (index_t i = 0; i < nsites; ++i)
+      assert_(num.at(i) == _num.at(i), typeid(*this).name(), "Found broken member: _num.");
+
+    base_t cfg_curr = _cfg_base;
+    for (index_t j = 0; j < _from_idx.size(); ++j)
+      cfg_curr.at(_from_idx.at(j)) = _to_orb.at(j);
+    for (index_t i = 0; i < cfg_curr.size(); ++i)
+      assert_(cfg_curr.at(i) == _cfg.at(i), typeid(*this).name(), "Found broken member: _cfg.");
+    // Check inv.
+  }
+
+  config_manager(index_t nsites_)
+  : nsites(nsites_), _num(nsites_), _inv(norbs()) {  }
+
+  config_manager(index_t nsites_, base_t cfg)
+  : nsites(nsites_), _num(nsites_), _inv(norbs())
+  { attach_config(cfg); }
+
+  template <typename rng_t>
+  static base_t singlet_config(unsigned nsite, unsigned nsinglet, rng_t &rng)
+  {
+    using dist_t = std::uniform_int_distribution<unsigned>;
+
+    bool allow_doublon = nsinglet * 2 > nsite;
+    base_t marks(nsite, 0);
+    base_t cfg(0);
+
+    // Spin up.
+    for (int i = 0; i < nsinglet; ++i) {
+      int iseq = dist_t(0, nsite - i - 1)(rng);
+
+      int isite;
+      int isite_c = 0;
+      do {
+        // Find the next available site.
+        while (marks.at(isite_c)) { isite_c++; }
+        isite = isite_c;
+        isite_c++;
+      } while (iseq-- > 0);
+
+      marks.at(isite) = 1;
+      cfg.push_back(isite);
+    }
+
+    // Spin down.
+    for (int i = 0; i < nsinglet; ++i) {
+      int iseq;
+      if (allow_doublon)
+        iseq = dist_t(0, nsite - i - 1)(rng);
+      else
+        iseq = dist_t(0, nsite - i - 1 - nsinglet)(rng);
+
+      int isite;
+      int isite_c = 0;
+      do {
+        // Find the next unoccupied site / orb.
+        if (allow_doublon)
+          while (marks.at(isite_c) > 1) { isite_c++; }
+        else
+          while (marks.at(isite_c) > 0) { isite_c++; }
+        isite = isite_c;
+        isite_c++;
+      } while (iseq-- > 0);
+
+      marks.at(isite) = 2;
+      cfg.push_back(isite + nsite);
+    }
+    return cfg;
+  }
+
+  template <typename rng_t>
+  std::pair<int, int> propose_hop(rng_t &rng) const
+  {
+    int loop_breaker = 0;
+    int xi;
+    int msj = std::uniform_int_distribution<unsigned>(0, _cfg.size() - 1)(rng);
+    std::uniform_int_distribution<unsigned> select_site(0, nsites - 1);
+    int sj = msj / nsites;
+
+    do {
+      assert_(loop_breaker++ < 10000000, typeid(*this).name(), "Too many loops.");
+      xi = select_site(rng);
+    } while (_num.at(xi) & (sj + 1));
+
+    return std::make_pair(xi + sj * nsites, msj);
+  }
+
+  template <typename rng_t>
+  std::tuple<int, int, int, int> propose_exc(rng_t &rng) const
+  {
+    using dist_t = std::uniform_int_distribution<unsigned>;
+
+    int nup = 0;
+    int ndn = 0;
+    int loop_breaker = 0;
+    int msj, xj = 0;
+    int msl, xl = 0;
+
+    // Count # of spin-up/down sites.
+    for (int i = 0; i < _num.size(); ++i)
+      if (num(i) == 1)
+        nup++;
+      else if (num(i) == 2)
+        ndn++;
+
+    // Pick a singly occupied spin-up site.
+    int iseq_up = dist_t(0, nup - 1)(rng);
+    while (true) {
+      while (num(xj) != 1) ++xj;
+      if (--iseq_up < 0)
+        break;
+      else
+        ++xj;
+    }
+
+    // Pick a singly occupied spin-down site.
+    int iseq_dn = dist_t(0, ndn - 1)(rng);
+    while (true) {
+      while (num(xl) != 2) ++xl;
+      if (--iseq_dn < 0)
+        break;
+      else
+        ++xl;
+    }
+
+    // Reverse lookup config indices.
+    msj = inv(xj);
+    msl = inv(xl + nsites);
+    assert_(msj >= 0 && msl >= 0, typeid(*this).name(), "ExchangeConfig error.");
+
+    return std::make_tuple(xl, msj, xj + nsites, msl);
+  }
+
+private:
+  base_t _cfg_base;
+  base_t _from_idx;
+  base_t _to_orb;
+  base_t _cfg;
+  base_t _num;
+  base_t _inv;
+};
+}
+
diff --git a/src/pfupdates/makefile_pfupdates b/src/pfupdates/makefile_pfupdates
index ee946fc0..ab9be063 100644
--- a/src/pfupdates/makefile_pfupdates
+++ b/src/pfupdates/makefile_pfupdates
@@ -1,16 +1,15 @@
 include ../make.sys
 
-ifeq ($(BLIS_ROOT),)
-	BLIS_ROOT = ../pfaffine/src/dep
-endif
-
 OBJ = pfupdates.o
 SRC = pf_interface.cc
-HDR = pf_interface.h orbital_mat.tcc skmv.tcc updated_tdi.tcc
+HDR = pf_interface.h \
+		orbital_mat.tcc skmv.tcc skr2k.tcc skslc.tcc updated_tdi.tcc \
+		../ltl2inv/eigen_gemmt.tcc ../ltl2inv/invert.tcc ../ltl2inv/pfaffian.tcc ../ltl2inv/trmmt.tcc \
+		../ltl2inv/error.hh ../ltl2inv/ilaenv_lauum.hh ../ltl2inv/lapack2eigen.hh
 
 $(OBJ): $(SRC) $(HDR) 
 	$(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \
-		-I../pfaffine/src \
+		-I../pfapack/c_interface -I../ltl2inv \
 		-I$(BLIS_ROOT)/include \
 		-I$(BLIS_ROOT)/include/blis
 
diff --git a/src/pfupdates/orbital_mat.tcc b/src/pfupdates/orbital_mat.tcc
index cb8a6117..ddb4ce6c 100644
--- a/src/pfupdates/orbital_mat.tcc
+++ b/src/pfupdates/orbital_mat.tcc
@@ -6,52 +6,36 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "blis.h"
-#include "colmaj.tcc"
+#include <Eigen/Dense>
 #include <random>
-#ifdef UseBoost
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/vector.hpp>
-#include <boost/numeric/ublas/io.hpp>
-template <typename T>
-using matrix_t = boost::numeric::ublas::matrix<T, boost::numeric::ublas::column_major>;
-template <typename T>
-using vector_t = boost::numeric::ublas::vector<T>;
-#else
-template <typename T>
-using matrix_t = colmaj<T>;
-#endif
 
-template <typename T>
+namespace vmc
+{
+namespace orbital
+{
+template <typename matrix_t_>
 struct orbital_mat
 {
-  const uplo_t uplo;
-  const dim_t nsite;
-  matrix_t<T> X; ///< nsite*nsite.
+  using matrix_t = matrix_t_;
+  using T = typename matrix_t::Scalar;
+  using index_t = int32_t;
 
-  orbital_mat(uplo_t uplo_, dim_t nsite_, T *X_, inc_t ldX)
-      : uplo(uplo_), nsite(nsite_),
-  #ifdef UseBoost
-        X(nsite, nsite) {
-    colmaj<T> X_tmp(X_, ldX);
-    for (dim_t j = 0; j < nsite; ++j)
-      for (dim_t i = 0; i < nsite; ++i)
-        X(i, j) = X_tmp(i, j);
-  }
-  #else
-        X(X_, ldX) { }
-  #endif
+  const char uplo;
+  const index_t norb_;
+  matrix_t &X; ///< norb*norb.
+
+  orbital_mat(char uplo_, index_t norb__, matrix_t &X_)
+  : uplo(uplo_), norb_(norb__), X(X_) { }
 
-  orbital_mat(uplo_t uplo_, dim_t nsite_, matrix_t<T> &X_)
-  : uplo(uplo_), nsite(nsite_), X(X_) { }
+  virtual index_t norb() { return norb_; }
 
-  void randomize(double amplitude, unsigned seed) { 
+  virtual void randomize(double amplitude, unsigned seed) {
     using namespace std;
     mt19937_64 rng(seed);
     uniform_real_distribution<double> dist(-0.1, 1.0);
 
-    for (dim_t j = 0; j < nsite; ++j) {
-      for (dim_t i = 0; i < j; ++i) {
+    for (index_t j = 0; j < norb_; ++j) {
+      for (index_t i = 0; i < j; ++i) {
         X(i, j) = T(dist(rng)) * amplitude;
         X(j, i) = -X(i, j);
       }
@@ -61,18 +45,28 @@ struct orbital_mat
 
   void randomize(double amplitude) { randomize(amplitude, 511); }
 
-  T operator()(dim_t osi, dim_t osj) {
+  virtual index_t pack_idx(index_t osi, index_t osj) {
+    abort();
+  }
+
+  virtual void unpack_idx(index_t &osi, index_t &osj, index_t idx) {
+    abort();
+  }
+
+  virtual T operator()(index_t osi, index_t osj) {
     if (osi == osj)
       return T(0.0);
 
     switch (uplo) {
-    case BLIS_UPPER:
+    case 'U':
+    case 'u':
       if (osi < osj)
         return X(osi, osj);
       else
         return -X(osj, osi);
     
-    case BLIS_LOWER:
+    case 'L':
+    case 'l':
       if (osi > osj)
         return X(osi, osj);
       else
@@ -83,3 +77,5 @@ struct orbital_mat
     }
   }
 };
+}
+}
diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc
index a19596a4..575ca625 100644
--- a/src/pfupdates/pf_interface.cc
+++ b/src/pfupdates/pf_interface.cc
@@ -4,23 +4,72 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #include "updated_tdi.tcc"
+#include "orbital_mat.tcc"
 // In implementation, this should appear AFTER blis.h.
 #include "pf_interface.h"
+#include <omp.h>
 
 // Non-# Pragma support differs from compiler to compiler
 #if defined(__INTEL_COMPILER)
-#define OMP_PARALLEL_FOR_SHARED __pragma(omp parallel for default(shared))
+#define _ccPragma(_1) __pragma(_1)
 #elif defined(__GNUC__)
-#define OMP_PARALLEL_FOR_SHARED _Pragma("omp parallel for default(shared)")
+#define _ccPragma(_1) _Pragma(#_1)
 #else
-#error "Valid non-preprocessor _pragma() not found."
+#error "Valid non-preprocessor _Pragma() not found."
 #endif
 
-#define orbv( i, ctype ) ( (orbital_mat<ctype> *)orbv[i] )
-#define objv( i, ctype ) ( (updated_tdi<ctype> *)objv[i] )
+using namespace Eigen;
+using namespace vmc::orbital;
+template <typename T>
+using matrix_t = Map<Matrix<T, Dynamic, Dynamic>, 0, OuterStride<> >;
+
+#define matv( i, ctype ) ( (                        matrix_t<ctype>     *)matv[i] )
+#define mapv( i, ctype ) ( (                        matrix_t<ctype>     *)mapv[i] )
+#define orbv( i, ctype ) ( (            orbital_mat<matrix_t<ctype> >   *)orbv[i] )
+#define objv( i, ctype ) ( (updated_tdi<orbital_mat<matrix_t<ctype> > > *)objv[i] )
 
 #define EXPANDNAME( funcname, cblachar ) funcname##_##cblachar
 
+#define GENIMPL( ctype, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_seq_init_precomp, cblachar ) \
+    ( uint64_t  num_qp, \
+      uint64_t  nsite, \
+      uint64_t  norbs, \
+      uint64_t  nelec, \
+      ctype    *orbmat_base, \
+      int64_t   orbmat_stride, \
+      ctype    *invmat_base, \
+      int64_t   invmat_stride, \
+      int32_t  *eleidx, \
+      int32_t  *elespn, \
+      uint64_t  mmax, \
+      ctype     pfav[], \
+      void     *objv[], \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] ) \
+{ \
+  for (int iqp = 0; iqp < num_qp; ++iqp) { \
+    mapv[iqp] = new matrix_t<ctype>(orbmat_base + iqp * orbmat_stride, norbs, norbs, OuterStride<>(norbs)); \
+    matv[iqp] = new matrix_t<ctype>(invmat_base + iqp * invmat_stride, nelec, nelec, OuterStride<>(nelec)); \
+    orbv[iqp] = new orbital_mat<matrix_t<ctype>>('U', norbs, *mapv(iqp, ctype) ); \
+    objv[iqp] = new updated_tdi<orbital_mat<matrix_t<ctype>>> \
+        ( *orbv(iqp, ctype), *matv(iqp, ctype), nelec, mmax); \
+\
+    /* Compute initial. */ \
+    vmc::config_manager::base_t cfg_i(nelec); \
+    for (int msi = 0; msi < nelec; ++msi) \
+      cfg_i.at(msi) = eleidx[msi] + elespn[msi]*nsite; \
+    objv(iqp, ctype)->attach_config(cfg_i); \
+    objv(iqp, ctype)->initialize_precomputed(pfav[iqp]); \
+  } \
+}
+// GENIMPL( float,    s )
+GENIMPL( double,   d )
+// GENIMPL( ccscmplx, c )
+GENIMPL( ccdcmplx, z )
+#undef GENIMPL
+
 #define GENIMPL( ctype, cblachar ) \
   void EXPANDNAME( updated_tdi_v_init, cblachar ) \
     ( uint64_t  num_qp, \
@@ -35,25 +84,29 @@
       int32_t  *elespn, \
       uint64_t  mmax, \
       void     *objv[], \
-      void     *orbv[] ) \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] ) \
 { \
-  OMP_PARALLEL_FOR_SHARED \
+  _ccPragma(omp parallel for default(shared)) \
   for (int iqp = 0; iqp < num_qp; ++iqp) { \
-    orbv[iqp] = new orbital_mat<ctype>( \
-        BLIS_UPPER, norbs, orbmat_base + iqp * orbmat_stride, norbs); \
-    objv[iqp] = new updated_tdi<ctype>( \
-        *orbv(iqp, ctype), nelec, invmat_base + iqp * invmat_stride, nelec, mmax); \
+    mapv[iqp] = new matrix_t<ctype>(orbmat_base + iqp * orbmat_stride, norbs, norbs, OuterStride<>(norbs)); \
+    matv[iqp] = new matrix_t<ctype>(invmat_base + iqp * invmat_stride, nelec, nelec, OuterStride<>(nelec)); \
+    orbv[iqp] = new orbital_mat<matrix_t<ctype>>('U', norbs, *mapv(iqp, ctype) ); \
+    objv[iqp] = new updated_tdi<orbital_mat<matrix_t<ctype>>> \
+        ( *orbv(iqp, ctype), *matv(iqp, ctype), nelec, mmax); \
 \
     /* Compute initial. */ \
-    auto &cfg_i = objv(iqp, ctype)->elem_cfg; \
+    vmc::config_manager::base_t cfg_i(nelec); \
     for (int msi = 0; msi < nelec; ++msi) \
       cfg_i.at(msi) = eleidx[msi] + elespn[msi]*nsite; \
+    objv(iqp, ctype)->attach_config(cfg_i); \
     objv(iqp, ctype)->initialize(); \
   } \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
@@ -61,16 +114,20 @@ GENIMPL( ccdcmplx, z )
   void EXPANDNAME( updated_tdi_v_free, cblachar ) \
     ( uint64_t  num_qp, \
       void     *objv[], \
-      void     *orbv[] ) \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] ) \
 { \
   for (int iqp = 0; iqp < num_qp; ++iqp) { \
     delete orbv(iqp, ctype); \
     delete objv(iqp, ctype); \
+    delete matv(iqp, ctype); \
+    delete mapv(iqp, ctype); \
   } \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
@@ -82,11 +139,11 @@ GENIMPL( ccdcmplx, z )
 { \
   /* Minus sign appears from that SlaterElm is in fact interpreted as row-major. */ \
   for (int iqp = 0; iqp < num_qp; ++iqp) \
-    pfav[iqp] = -objv(iqp, ctype)->get_Pfa(); \
+    pfav[iqp] = -objv(iqp, ctype)->get_amplitude(); \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
@@ -98,13 +155,13 @@ GENIMPL( ccdcmplx, z )
       int64_t   cal_pfa, \
       void     *objv[] ) \
 { \
-  OMP_PARALLEL_FOR_SHARED \
+  _ccPragma(omp parallel for default(shared)) \
   for (int iqp = 0; iqp < num_qp; ++iqp) \
     objv(iqp, ctype)->push_update_safe(osi, msj, cal_pfa!=0); \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
@@ -118,9 +175,9 @@ GENIMPL( ccdcmplx, z )
       int64_t   cal_pfa, \
       void     *objv[] ) \
 { \
-  OMP_PARALLEL_FOR_SHARED \
+  _ccPragma(omp parallel for default(shared)) \
   for (int iqp = 0; iqp < num_qp; ++iqp) { \
-    auto &from_i = objv(iqp, ctype)->from_idx; \
+    const auto &from_i = objv(iqp, ctype)->get_config_manager().from_idx(); \
     if (from_i.size() > objv(iqp, ctype)->mmax - 2) \
       objv(iqp, ctype)->merge_updates(); \
     else \
@@ -134,9 +191,9 @@ GENIMPL( ccdcmplx, z )
     objv(iqp, ctype)->push_update(osk, msl, cal_pfa!=0); \
   } \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
@@ -150,9 +207,121 @@ GENIMPL( ccdcmplx, z )
     objv(iqp, ctype)->pop_update(cal_pfa!=0); \
   } \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
+GENIMPL( double,   d )
+// GENIMPL( ccscmplx, c )
+GENIMPL( ccdcmplx, z )
+#undef GENIMPL
+
+#define GENIMPL( ctype, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_omp_var0_proc_batch_greentwo, cblachar ) \
+    ( uint64_t  num_qp, \
+      int       num_gf, \
+      int       needs_comput[], /* Already packed. var0 only does unpacking. */ \
+      int       unpack_idx[],   /* Already packed. var0 only does unpacking. */ \
+      int       to_orbs_[],     /* Already packed. var0 only does unpacking. */ \
+      int       from_ids_[],    /* Already packed. var0 only does unpacking. */ \
+      ctype     pfav[], \
+      void     *objv[], \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] ) \
+{ \
+  const int ith = omp_get_thread_num(); \
+  const int nth = omp_get_num_threads(); \
+\
+  /* Partitioning. */ \
+  const int qpCountThread = (num_qp + nth - 1) / nth; \
+  const int qpStart = qpCountThread * ith; \
+  const int qpEnd = std::min(int(num_qp), qpCountThread + qpStart); \
+\
+  Eigen::Map<VectorXi>  to_orbs( to_orbs_, num_gf * 2); \
+  Eigen::Map<VectorXi> from_ids(from_ids_, num_gf * 2); \
+\
+  for (int iqp = qpStart; iqp < qpEnd; ++iqp) { \
+    Eigen::Vector<ctype, Eigen::Dynamic> pfaBatch = \
+      objv(iqp, ctype)->batch_query_amplitudes(2, to_orbs, from_ids); \
+    pfaBatch *= objv(iqp, ctype)->get_amplitude(); \
+\
+    /* Unpack computed Pfaffians. */ \
+    for (int igf = 0; igf < num_gf; ++igf) \
+      pfav[unpack_idx[igf] * num_qp + iqp] = pfaBatch[igf]; \
+  } \
+}
+// GENIMPL( float,    s )
+GENIMPL( double,   d )
+// GENIMPL( ccscmplx, c )
+GENIMPL( ccdcmplx, z )
+#undef GENIMPL
+
+#define GENIMPL( ctype, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_omp_var1_proc_batch_greentwo, cblachar ) \
+    ( uint64_t  num_qp, \
+      int       num_gf, \
+      int       needs_comput[], \
+      int       unpack_idx[], \
+      int       to_orbs[], \
+      int       from_ids[], \
+      ctype     pfav[], \
+      void     *objv[], \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] ) \
+{ \
+  const int ith = omp_get_thread_num(); \
+  const int nth = omp_get_num_threads(); \
+\
+  /* Count Green's funcs. */ \
+  int gf_count = 0; \
+  for (int i = 0; i < num_gf; ++i) \
+    if ( needs_comput[i] ) \
+      gf_count++; \
+\
+  /* Partitioning. */ \
+  int gfCountThread = (gf_count + nth - 1) / nth; \
+  const int gfStart = gfCountThread * ith; \
+  gfCountThread = std::min(gf_count - gfStart, gfCountThread); \
+\
+  /* Pack info space. */ \
+  gf_count = -gfStart; \
+  int *to_orbs_thread_, *from_ids_thread_, *unpack_idx_thread; \
+  for (int i = 0; i < num_gf && gf_count < gfCountThread; ++i) \
+    if ( needs_comput[i] ) { \
+      if ( gf_count >= 0 ) { \
+        if ( gf_count == 0 ) { \
+          /* Pin down scratchpad spots. */ \
+          to_orbs_thread_   = to_orbs  + i * 2; \
+          from_ids_thread_  = from_ids + i * 2; \
+          unpack_idx_thread = unpack_idx + i; \
+          unpack_idx_thread[0] = i; \
+        } else { \
+          /* 0th idx already in the place. Pack from 1st. */ \
+          to_orbs_thread_[gf_count * 2    ] = to_orbs[i * 2]; \
+          to_orbs_thread_[gf_count * 2 + 1] = to_orbs[i * 2 + 1]; \
+          from_ids_thread_[gf_count * 2    ] = from_ids[i * 2]; \
+          from_ids_thread_[gf_count * 2 + 1] = from_ids[i * 2 + 1]; \
+          unpack_idx_thread[gf_count] = i; \
+        } \
+      } \
+      gf_count++; \
+    } \
+\
+  Eigen::Map<VectorXi>  to_orbs_thread( to_orbs_thread_, gfCountThread * 2); \
+  Eigen::Map<VectorXi> from_ids_thread(from_ids_thread_, gfCountThread * 2); \
+\
+  for (int iqp = 0; iqp < num_qp; ++iqp) { \
+    Eigen::Vector<ctype, Eigen::Dynamic> pfaBatch = \
+      objv(iqp, ctype)->batch_query_amplitudes(2, to_orbs_thread, from_ids_thread); \
+    pfaBatch *= objv(iqp, ctype)->get_amplitude(); \
+\
+    /* Unpack computed Pfaffians. */ \
+    for (int igf = 0; igf < gfCountThread; ++igf) \
+      pfav[unpack_idx_thread[igf] * num_qp + iqp] = pfaBatch[igf]; \
+  } \
+}
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h
index b9f4cbf2..49ad0252 100644
--- a/src/pfupdates/pf_interface.h
+++ b/src/pfupdates/pf_interface.h
@@ -12,7 +12,7 @@
 #include "stdint.h"
 
 // Redefine double complex for (future) non-C99 compatibility.
-#ifndef _CC_IMPL
+#ifndef __cplusplus
 typedef _Complex double ccdcmplx;
 typedef _Complex float  ccscmplx;
 #else
@@ -24,6 +24,31 @@ extern "C" {
 
 #define EXPANDNAME( funcname, cblachar ) funcname##_##cblachar
 
+#define GENDEF( ctype, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_seq_init_precomp, cblachar ) \
+    ( uint64_t  num_qp, \
+      uint64_t  nsite, \
+      uint64_t  norbs, \
+      uint64_t  nelec, \
+      ctype    *orbmat_base, \
+      int64_t   orbmat_stride, \
+      ctype    *invmat_base, \
+      int64_t   invmat_stride, \
+      int32_t  *eleidx, \
+      int32_t  *elespn, \
+      uint64_t  mmax, \
+      ctype     pfav[], \
+      void     *objv[], \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] );
+// GENDEF( float,    s )
+GENDEF( double,   d )
+// GENDEF( ccscmplx, c )
+GENDEF( ccdcmplx, z )
+#undef GENDEF
+
+
 #define GENDEF( ctype, cblachar ) \
    void EXPANDNAME( updated_tdi_v_init, cblachar ) \
     ( uint64_t  num_qp, \
@@ -38,11 +63,13 @@ extern "C" {
       int32_t  *elespn, \
       uint64_t  mmax, \
       void     *objv[], \
-      void     *orbv[] );
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
@@ -50,11 +77,13 @@ GENDEF( ccdcmplx, z )
    void EXPANDNAME( updated_tdi_v_free, cblachar ) \
     ( uint64_t  num_qp, \
       void     *objv[], \
-      void     *orbv[] );
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
@@ -64,9 +93,9 @@ GENDEF( ccdcmplx, z )
       ctype     pfav[], \
       void     *objv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
@@ -78,9 +107,9 @@ GENDEF( ccdcmplx, z )
       int64_t   cal_pfa, \
       void     *objv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
@@ -94,9 +123,9 @@ GENDEF( ccdcmplx, z )
       int64_t   cal_pfa, \
       void     *objv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
@@ -106,15 +135,38 @@ GENDEF( ccdcmplx, z )
       int64_t   cal_pfa, \
       void     *objv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
+#define GENDEF( ctype, infix, cblachar ) \
+  void EXPANDNAME( updated_tdi_v_omp_## infix ##_proc_batch_greentwo, cblachar ) \
+    ( uint64_t  num_qp, \
+      int       num_gf, \
+      int       needs_comput[], \
+      int       unpack_idx[], \
+      int       to_orbs[], \
+      int       from_ids[], \
+      ctype     pfav[], \
+      void     *objv[], \
+      void     *orbv[], \
+      void     *matv[], \
+      void     *mapv[] );
+// GENDEF( float, var0,    s )
+// GENDEF( float, var1,    s )
+GENDEF( double, var0,   d )
+GENDEF( double, var1,   d )
+// GENDEF( ccscmplx, var0, c )
+// GENDEF( ccscmplx, var1, c )
+GENDEF( ccdcmplx, var0, z )
+GENDEF( ccdcmplx, var1, z )
+#undef GENDEF
+
 #undef EXPANDNAME
 
-#ifdef _CC_IMPL
+#ifdef __cplusplus
 }
 #endif
 
diff --git a/src/pfupdates/skmv.tcc b/src/pfupdates/skmv.tcc
index 5f484c53..31a0947c 100644
--- a/src/pfupdates/skmv.tcc
+++ b/src/pfupdates/skmv.tcc
@@ -6,50 +6,39 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "colmaj.tcc"
-#include "blalink.hh"
-#include <vector>
+#include "lapack2eigen.hh"
 
-template <typename T>
-void skmv(uplo_t uploA,
-          dim_t m,
-          T alpha,
-          T *_A, inc_t ldA,
-          T *X,
-          T *Y) {
-  using namespace std;
-  colmaj<T> A(_A, ldA);
+template <typename Mat, typename Vec>
+Eigen::Matrix<typename Mat::Scalar, Eigen::Dynamic, 1>
+  skmv(const char &uploA,
+       const typename Mat::Scalar &alpha,
+       const Mat &A,
+       const Vec &X) {
+  using namespace Eigen;
 
+  Eigen::Matrix<typename Mat::Scalar, Eigen::Dynamic, 1> Y;
   // Way 1:
-  // skmm(BLIS_LEFT, uploA, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, m, 1,
-  //      T(1.0), &A(0, 0), A.ld, &X[0], ldA, T(0.0), &Y[0], ldA);
-
-  // Way 2:
-  vector<T> Z(m);
-  for (dim_t i = 0; i < m; ++i) {
-    Z[i] = X[i];
-    Y[i] = X[i];
+  if (uploA == 'U' || uploA == 'u') {
+    Y = A.template triangularView<StrictlyUpper>() * X - A.template triangularView<StrictlyUpper>().transpose() * X;
+  } else {
+    Y = A.template triangularView<StrictlyLower>() * X - A.template triangularView<StrictlyLower>().transpose() * X;
   }
-  trmv(uploA, BLIS_NO_TRANSPOSE,
-       m, alpha, &A(0, 0), A.ld, &Y[0], 1);
-  trmv(uploA, BLIS_TRANSPOSE,
-       m, alpha, &A(0, 0), A.ld, &Z[0], 1);
-  for (dim_t i = 0; i < m; ++i)
-    Y[i] -= Z[i];
+  Y *= alpha;
 
-  // Way 3:
-  // for (dim_t i = 0; i < m; ++i)
+  // Way 2:
+  // for (int i = 0; i < m; ++i)
   //   Y[i] = 0.0;
-  // for (dim_t j = 0; j < m; ++j) {
+  // for (int j = 0; j < m; ++j) {
   //   T x_j = X[j];
-  //   for (dim_t i = 0; i < j; ++i) {
+  //   for (int i = 0; i < j; ++i) {
   //     T A_ij = A(i, j);
   //     Y[i] += A_ij * x_j;
   //     Y[j] -= A_ij * X[i];
   //   }
   // }
-  // for (dim_t i = 0; i < m; ++i)
+  // for (int i = 0; i < m; ++i)
   //   Y[i] *= alpha;
   
+  return Y;
 }
 
diff --git a/src/pfupdates/skr2k.tcc b/src/pfupdates/skr2k.tcc
new file mode 100644
index 00000000..a6bba999
--- /dev/null
+++ b/src/pfupdates/skr2k.tcc
@@ -0,0 +1,35 @@
+/**
+ * \file skr2k.tcc
+ * Dispatcher and minimal implementation for SKR2k.
+ * Minimal implementation is used when M is extremely small.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "lapack2eigen.hh"
+#include <iostream>
+
+
+template <typename Mat0, typename Mat1, typename Mat2>
+inline void skr2k(const char &uploC,
+                  const char &transAB,
+                  const typename Mat2::Scalar &alpha,
+                  const Mat0 &A,
+                  const Mat1 &B,
+                  const typename Mat2::Scalar &beta,
+                        Mat2 &C)
+{
+    using T = typename Mat0::Scalar;
+
+    l2e::le_mat_t<T> C_(C);
+    if (transAB == 'N' || transAB == 'n') {
+        gemmt<T>(uploC, 'N', 'T',  alpha, l2e::le_mat_t<T>(A), l2e::le_mat_t<T>(B), beta, C_);
+        gemmt<T>(uploC, 'N', 'T', -alpha, l2e::le_mat_t<T>(B), l2e::le_mat_t<T>(A), T(1), C_);
+    } else {
+        gemmt<T>(uploC, 'T', 'N',  alpha, l2e::le_mat_t<T>(A), l2e::le_mat_t<T>(B), beta, C_);
+        gemmt<T>(uploC, 'T', 'N', -alpha, l2e::le_mat_t<T>(B), l2e::le_mat_t<T>(A), T(1), C_);
+    }
+}
+
diff --git a/src/pfupdates/skslc.tcc b/src/pfupdates/skslc.tcc
new file mode 100644
index 00000000..084c965d
--- /dev/null
+++ b/src/pfupdates/skslc.tcc
@@ -0,0 +1,34 @@
+/**
+ * \file skslc.tcc
+ * Get a slice from antisymmetric matrix.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "lapack2eigen.hh"
+
+template<typename Mat>
+inline Eigen::Matrix<typename Mat::Scalar, Eigen::Dynamic, 1>
+  skslc(const char &uploA, unsigned i, const Mat &A)
+{
+    Eigen::Matrix<typename Mat::Scalar, Eigen::Dynamic, 1> x(A.rows());
+
+    x[i] = 0.0;
+    switch (uploA) {
+    case 'U':
+    case 'u':
+        for (unsigned j = 0; j < i; ++j)
+            x[j] =  A(j, i);
+        for (unsigned j = i+1; j < A.rows(); ++j)
+            x[j] = -A(i, j);
+        break;
+
+    default:
+        std::cerr << "SKSLC: Lower triangular storage not implemented. Sorry." << std::endl;
+        break;;
+    }
+    return x;
+}
+
diff --git a/src/pfupdates/updated_tdi.tcc b/src/pfupdates/updated_tdi.tcc
index 63bc035f..6705a2fb 100644
--- a/src/pfupdates/updated_tdi.tcc
+++ b/src/pfupdates/updated_tdi.tcc
@@ -6,69 +6,89 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "orbital_mat.tcc"
-#include "blalink.hh"
-#include "skpfa.hh"
-#include "sktdi.hh"
+#include "config_manager.tcc"
+#include "pfaffian.tcc"
+#include "invert.tcc"
 #include "skr2k.tcc"
 #include "skslc.tcc"
 #include "skmv.tcc"
-#include "optpanel.hh"
 #include <iostream>
 #include <vector>
 
-template <typename T> struct updated_tdi {
-  orbital_mat<T> &Xij;
-  uplo_t uplo; ///< Can be switched only with update_uplo().
-  const dim_t nelec;
-  const dim_t mmax;
-  dim_t nq_updated; // Update of Q(:, i) can be delayed against U.
-
-  // Performance tuning constants.
+namespace vmc
+{
+namespace orbital
+{
+template <typename orbital_t_,
+          typename amp_t_ = typename orbital_t_::T>
+struct updated_tdi {
+  using orbital_t = orbital_t_;
+  using index_t = vmc::config_manager::index_t;
+  using amp_t = amp_t_;
+  using T = typename orbital_t::T;
+  using invmat_t = typename orbital_t::matrix_t; ///< Just for mVMC's C wrapper. Mathematically not so good.
+  using matrix_t = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+  using submat_t = Eigen::Block<matrix_t, Eigen::Dynamic, Eigen::Dynamic, true>;
+  using vector_t = Eigen::Matrix<T, Eigen::Dynamic, 1>;
+  using idxvec_t = Eigen::VectorXi;
+
+  orbital_t &Xij;
+  const index_t nelec; ///< Total Fermionic number / Matrix size.
+  const index_t mmax;  ///< Max number of updates contained.
   const bool single_hop_alpha; ///< Whether to simplify k=1 situations.
-  const dim_t npanel_big;
-  const dim_t npanel_sub;
+
+private:
+  char uplo; ///< Can be switched only with update_uplo().
+  index_t nq_updated; // Update of Q(:, i) can be delayed against U.
 
   // Matrix M.
-  matrix_t<T> M;
+  invmat_t &M; ///< (nelec, nelec)
 
   // Updator blocks U, inv(M)U and inv(M)V.
   // Note that V is not stored.
-  matrix_t<T> U; ///< Serves also as B*inv
-  matrix_t<T> Q; ///< Contains the whole B buffer. Assert![ Q = M U ]
-  matrix_t<T> P; ///< Q(0, mmax)
+  matrix_t U; ///< Serves also as B*inv
+  matrix_t Q; ///< Contains the whole B buffer. Assert![ Q = M U ]
+  // This member causes pointer problem on mixed precision.
+  // (Which was fixed by constructing Block objects on-the-fly.)
+  // TODO: Check its reason.
+  // submat_t P; ///< &Q(0, mmax).
 
   // Central update buffer and its blocks.
-  matrix_t<T> W;
-  matrix_t<T> UMU, UMV, VMV;
-  matrix_t<T> Cp;       ///< C+BMB = [ W -I; I 0 ] + [ UMU UMV; -UMV VMV ]
-  matrix_t<T> Gc;       ///< Gaussian vectors when tri-diagonalizing C.
-  signed *const cPov; ///< Pivot when tri-diagonalizing C.
-  T Pfa;
-  T PfaRatio; //< Updated Pfaffian / Base Pfaffian.
-  // TODO: Maybe one should log all Pfaffian histories.
-
-  std::vector<dim_t> elem_cfg;
-  std::vector<dim_t> from_idx;
-  std::vector<dim_t> to_site;
+  matrix_t W;
+  matrix_t UMU, UMV, VMV;
+  matrix_t Cp;       ///< C+BMB = [ W -I; I 0 ] + [ UMU UMV; -UMV VMV ]
+  matrix_t Gc;       ///< Used to be Gaussian vectors. Now just scratchpads.
+  idxvec_t cPiv;     ///< Pivot when tri-diagonalizing C.
+  amp_t Pfa;
+  amp_t PfaRatio; //< Updated Pfaffian / Base Pfaffian.
+  // TODO: Maybe one should log partially the computed Pfaffian history.
+
+  // Configuration.
+  vmc::config_manager elem_cfg;
+
+public:
+
+  void initialize_precomputed(T Pfa_) {
+    elem_cfg.merge_config();
+    elem_cfg.reserve_space(mmax);
+    nq_updated = 0;
+    Pfa = Pfa_;
+    PfaRatio = 1.0;
+  }
 
   void initialize() {
     using namespace std;
-    auto &cfg = elem_cfg;
-    #ifdef UseBoost
-    matrix_t<T> G(nelec, nelec);
-    #else
-    matrix_t<T> G(new T[nelec * nelec], nelec);
-    #endif
-    from_idx.clear();
-    to_site.clear();
-    from_idx.reserve(mmax * sizeof(dim_t));
-    to_site.reserve(mmax * sizeof(dim_t));
+    elem_cfg.merge_config();
+    elem_cfg.reserve_space(mmax);
+    const auto &cfg = elem_cfg.config_base();
+    matrix_t G(nelec, nelec);
+    nq_updated = 0;
 
     switch (uplo) {
-    case BLIS_UPPER:
-      for (dim_t j = 0; j < nelec; ++j) {
-        for (dim_t i = 0; i < j; ++i) {
+    case 'U':
+    case 'u':
+      for (index_t j = 0; j < nelec; ++j) {
+        for (index_t i = 0; i < j; ++i) {
           M(i, j) = Xij(cfg.at(i), cfg.at(j));
         }
         M(j, j) = T(0.0);
@@ -76,146 +96,103 @@ template <typename T> struct updated_tdi {
       break;
 
     default:
-      cerr << "updated_tdi<T>: BLIS_LOWER is not supported. The error is fetal."
+      cerr << "updated_tdi: Lower triangular is not implemented yet."
            << endl;
     }
 
     // Allocate scratchpad.
-    signed *iPivFull = new signed[nelec + 1];
-    dim_t lwork = nelec * npanel_big;
-    T *pfwork = new T[lwork];
-
-    #ifdef UseBoost
-    signed info = skpfa(uplo, nelec, &M(0, 0), M.size1(), &G(0, 0), G.size1(), iPivFull,
-                        true, &Pfa, pfwork, lwork);
-    #else
-    signed info = skpfa(uplo, nelec, &M(0, 0), M.ld, &G(0, 0), G.ld, iPivFull,
-                        true, &Pfa, pfwork, lwork);
-    #endif
-#ifdef _DEBUG
-    cerr << "SKPFA+INV: n=" << nelec << " info=" << info << endl;
-#endif
-
-    delete[] iPivFull;
-    delete[] pfwork;
-    #ifndef UseBoost
-    delete[](&G(0, 0));
-    #endif
+    vector_t vT(nelec - 1);
+    idxvec_t iPiv(nelec);
+    l2e::le_mat_t<T> M_(M);
+
+    // Perform LTL factorization regardless of `uplo`.
+    signed info = sktrf(uplo, 'N', M_, &iPiv(0), &G(0, 0), G.size());
+    Pfa = ltl2pfa<matrix_t, idxvec_t, amp_t>(uplo, M, iPiv);
+    PfaRatio = 1.0; // Reset Pfaffian ratio.
+    ltl2inv(uplo, M, iPiv, vT, G);
   }
 
-  ~updated_tdi() {
-    #ifndef UseBoost
-    delete[](&U(0, 0));
-    delete[](&Q(0, 0));
+  updated_tdi(orbital_t &Xij_, invmat_t &M_, index_t nelec_, index_t mmax_, std::vector<index_t> cfg)
+      : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), single_hop_alpha(true),
+        M(M_), U(nelec, mmax * 2), Q(nelec, mmax * 2),
+        W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax),
+        // Cp, Gc and cPiv are dynamically allocated.
+        Pfa(0.0), PfaRatio(1.0), elem_cfg(Xij_.norb()/2, cfg), uplo('U')
+  { initialize(); }
 
-    delete[](&Cp(0, 0));
-    delete[](&Gc(0, 0));
-
-    delete[](&W(0, 0));
-    delete[](&UMU(0, 0));
-    delete[](&UMV(0, 0));
-    delete[](&VMV(0, 0));
-    #endif
-
-    delete[] cPov;
+  // Unsafe construction without initializaion.
+  updated_tdi(orbital_t &Xij_, invmat_t &M_, index_t nelec_, index_t mmax_)
+      : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), single_hop_alpha(true),
+        M(M_), U(nelec, mmax * 2), Q(nelec, mmax * 2),
+        W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax),
+        Pfa(0.0), PfaRatio(1.0), elem_cfg(Xij_.norb()/2), uplo('U') { }
+
+  // Copy construction could be unsafe since it does not always contain initializaion.
+  // M is automatically allocated in this case.
+  updated_tdi(const updated_tdi<orbital_t> &tdi_) = delete;
+    /*
+      : Xij(tdi_.Xij), nelec(tdi_.nelec), mmax(tdi_.mmax), nq_updated(0), single_hop_alpha(true),
+        M(new invmat_t(...)), U(nelec, mmax * 2), Q(nelec, mmax * 2),
+        W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax),
+        Pfa(0.0), PfaRatio(1.0), elem_cfg(tdi_.Xij.norb()/2, tdi_.get_config()), uplo('U') {
+    if (elem_cfg.config().size() == nelec &&
+        elem_cfg.config(0) != elem_cfg.config(1) && // Short-circuit complicated check.
+        is_perm(elem_cfg.config()))
+      initialize();
+  } */
+
+  bool is_perm(const vmc::config_manager::base_t &cfg) const
+  {
+    std::vector<bool> occupied(Xij.norb(), false);
+    for (const auto &xi : cfg) {
+      if (xi > Xij.norb())
+        return false;
+      if (occupied.at(xi))
+        return false;
+      occupied.at(xi) = true;
+    }
+    return true;
   }
 
-  updated_tdi(orbital_mat<T> &Xij_, std::vector<dim_t> &cfg, T *M_, inc_t ldM,
-              dim_t mmax_)
-      : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0),
-        single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4),
-    #ifdef UseBoost
-        M(nelec, nelec),
-        U(nelec, mmax * 2), Q(nelec, mmax * 2),
-        P(nelec, mmax), W(mmax, mmax),
-        UMU(mmax, mmax), UMV(mmax, mmax),
-        VMV(mmax, mmax), Cp(2 * mmax, 2 * mmax),
-        Gc(2 * mmax, 2 * mmax),
-    #else
-        M(M_, ldM),
-        U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec),
-        P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax),
-        UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax),
-        VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax),
-        Gc(new T[2 * mmax * 2 * mmax], 2 * mmax),
-    #endif
-        cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(cfg),
-        from_idx(0), to_site(0), uplo(BLIS_UPPER) {
-    #ifdef UseBoost
-    colmaj<T> M_tmp(M_, ldM);
-    for (dim_t j = 0; j < nelec; ++j)
-      for (dim_t i = 0; i < nelec; ++i)
-        M(i, j) = M_tmp(i, j);
-    #endif
-    initialize();
+  amp_t get_amplitude() { return Pfa * get_amplitude_ratio(); }
+  amp_t get_amplitude_ratio() {
+    if (PfaRatio == 0.0)
+      update_pfaffian_ratio();
+    return PfaRatio;
   }
+  int max_updates() const { return mmax; }
+  int num_updates() const { return elem_cfg.from_idx().size(); }
 
-  /*
-  updated_tdi(orbital_mat<T> &Xij_, std::vector<dim_t> &cfg, matrix_t<T> &M_,
-              dim_t mmax_)
-      : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0),
-        single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4),
-        M(M_),
-        U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec),
-        P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax),
-        UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax),
-        VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax),
-        Gc(new T[2 * mmax * 2 * mmax], 2 * mmax),
-        cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(cfg),
-        from_idx(0), to_site(0), uplo(BLIS_UPPER) {
-    initialize();
-  }*/
+  const vmc::config_manager::base_t get_config() const
+  { return elem_cfg.config(); }
 
-  // Unsafe construction without initializaion.
-  updated_tdi(orbital_mat<T> &Xij_, dim_t nelec_, T *M_, inc_t ldM, dim_t mmax_) 
-      : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0),
-        single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4),
-    #ifdef UseBoost
-        M(nelec, nelec),
-        U(nelec, mmax * 2), Q(nelec, mmax * 2),
-        P(nelec, mmax), W(mmax, mmax),
-        UMU(mmax, mmax), UMV(mmax, mmax),
-        VMV(mmax, mmax), Cp(2 * mmax, 2 * mmax),
-        Gc(2 * mmax, 2 * mmax),
-    #else
-        M(M_, ldM),
-        U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec),
-        P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax),
-        UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax),
-        VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax),
-        Gc(new T[2 * mmax * 2 * mmax], 2 * mmax),
-    #endif
-        cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(nelec, 0),
-        from_idx(0), to_site(0), uplo(BLIS_UPPER) { 
-    #ifdef UseBoost
-    colmaj<T> M_tmp(M_, ldM);
-    for (dim_t j = 0; j < nelec; ++j)
-      for (dim_t i = 0; i < nelec; ++i)
-        M(i, j) = M_tmp(i, j);
-    #endif
-  }
+  const vmc::config_manager::base_t &get_config_base() const
+  { return elem_cfg.config_base(); }
+
+  const vmc::config_manager &get_config_manager() const
+  { return elem_cfg; }
 
-  T get_Pfa() { return Pfa * PfaRatio; }
+  void attach_config(const vmc::config_manager::base_t &cfg)
+  {
+    assert_(cfg.size() == nelec, typeid(*this).name(), "Invalid config feeded.");
+    elem_cfg.attach_config(cfg);
+  }
 
   void assemble_C_BMB() {
     using namespace std;
-    dim_t k = from_idx.size();
+    using namespace Eigen;
+    index_t k = elem_cfg.from_idx().size();
+    if (!k)
+      return;
 
     // Assemble (half of) C+BMB buffer.
+    Cp = matrix_t(2 * k, 2 * k);
     switch (uplo) {
-    case BLIS_UPPER:
-      for (dim_t j = 0; j < k; ++j)
-        for (dim_t i = 0; i < j; ++i)
-          Cp(i, j) = W(i, j) + UMU(i, j);
-      for (dim_t j = 0; j < k; ++j) {
-        for (dim_t i = 0; i < k; ++i)
-          Cp(i, j + k) = +UMV(i, j);
-
-        Cp(j, j + k) -= T(1.0);
-      }
-      for (dim_t j = 0; j < k; ++j)
-        for (dim_t i = 0; i < j; ++i)
-          Cp(i + k, j + k) = VMV(i, j);
+    case 'U':
+    case 'u':
+      // Left-lower block skipped due to uplo='U'.
+      Cp << W(seq(0, k-1), seq(0, k-1)) + UMU(seq(0, k-1), seq(0, k-1)), UMV(seq(0, k-1), seq(0, k-1)) - matrix_t::Identity(k, k),
+            matrix_t::Zero(k, k),                                        VMV(seq(0, k-1), seq(0, k-1));
       break;
 
     default:
@@ -225,12 +202,15 @@ template <typename T> struct updated_tdi {
   }
 
   // Update Q rows buffer.
-  bool require_Q(bool all) {
-    const dim_t k = from_idx.size();
-    const dim_t n = nelec;
-    dim_t k_cal;
+  bool require_Q(bool require_all) {
+    using namespace Eigen;
+    using namespace l2e;
+
+    const index_t k = elem_cfg.from_idx().size();
+    const index_t n = nelec;
+    index_t k_cal;
     // Require all K columns instead of first K-1.
-    if (all)
+    if (require_all)
       k_cal = k;
     else
       k_cal = k - 1;
@@ -241,22 +221,12 @@ template <typename T> struct updated_tdi {
 
     for (; nq_updated < k_cal; ++nq_updated) {
       // Update single column. Use SKMV.
-      #ifdef UseBoost
-      skmv(uplo, n, T(1.0), &M(0, 0), M.size1(), &U(0, nq_updated), &Q(0, nq_updated));
-      #else
-      skmv(uplo, n, T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), &Q(0, nq_updated));
-      #endif
+      Q(all, nq_updated) = skmv(uplo, T(1.0), M, U(all, nq_updated));
     } /* else {
       // Update multiple columns. Use SKMM.
-      #ifdef UseBoost
-      skmm(BLIS_LEFT, uplo, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, n, k_cal - nq_updated,
-           T(1.0), &M(0, 0), M.size1(), &U(0, nq_updated), U.size1(),
-           T(0.0), &Q(0, nq_updated), Q.size1());
-      #else
-      skmm(BLIS_LEFT, uplo, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, n, k_cal - nq_updated,
-           T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), U.ld,
-           T(0.0), &Q(0, nq_updated), Q.ld);
-      #endif
+      skmm('L', uplo, 'N', 'N',
+           T(1.0), le_mat_t<T>(M), le_mat_t<T>U(all, seq(nq_updated, k_cal-1)),
+           T(0.0), le_mat_t<T>(Q(all, seq(nq_updated, k_cal-1))));
     }
     nq_updated = k_cal;
     */
@@ -266,199 +236,197 @@ template <typename T> struct updated_tdi {
   // UMU needs to be recalculated for hopping change.
   // TODO: Find some way to avoid recalculating UQ?
   bool require_UMU() {
-    const dim_t k = from_idx.size();
-    const dim_t n = nelec;
+    using namespace Eigen;
 
-    for (dim_t o = 0; o < k; ++o)
-      for (dim_t l = 0; l < o; ++l)
+    const index_t k = elem_cfg.from_idx().size();
+    const index_t n = nelec;
+
+    for (index_t o = 0; o < k; ++o)
+      for (index_t l = 0; l < o; ++l)
         switch (uplo) {
-        case BLIS_UPPER:
-          UMU(l, o) = -dot(n, &U(0, o), 1, &Q(0, l), 1);
+        case 'U':
+        case 'u':
+          UMU(l, o) = -(U(all, o).transpose() * Q(all, l))(0, 0);
           break;
-        case BLIS_LOWER:
+        case 'L':
+        case 'l':
           // Always assume l < o to calculate one less column of Q.
-          UMU(o, l) = dot(n, &U(0, o), 1, &Q(0, l), 1);
+          UMU(o, l) = (U(all, o).transpose() * Q(all, l))(0, 0);
+          break;
+        default:
           break;
         }
     return true;
   }
 
-  // Inverts a configuration from fermion index to site index.
-  void config_to_site(std::vector<dim_t> &cfg, std::vector<signed> &site, bool init) {
-    using namespace std;
-
-    if (site.size() != Xij.nsite || cfg.size() != nelec) {
-      cerr << "updated_tdi<T>::inv_config:"
-           << " Invalid config / invert config buffer." << endl;
-      return;
-    }
-
-    if (init)
-      for (dim_t i = 0; i < site.size(); ++i)
-        site.at(i) = -1;
-    for (dim_t i = 0; i < cfg.size(); ++i)
-      site.at(cfg.at(i)) = i;
-
-    return;
-  }
-
   // Update osi <- os[msj].
   // i.e. c+_i c_{x_j}.
-  void push_update(dim_t osi, dim_t msj, bool compute_pfa) {
+  void push_update(index_t osi, index_t msj, bool compute_pfa) {
     using namespace std;
+    using namespace Eigen;
 
     // This is the k-th hopping.
-    dim_t n = nelec;
-    dim_t k = from_idx.size();
-    dim_t osj = elem_cfg.at(msj);
+    index_t n = nelec;
+    index_t k = elem_cfg.from_idx().size();
+    index_t osj = elem_cfg.config_base(msj);
 
-    // TODO: Check bounds, duplicates, etc.
-    from_idx.push_back(msj);
-    to_site.push_back(osi);
+    elem_cfg.push_update(osi, msj);
 
-    // TODO: Check for hopping-backs. 
+    // TODO: Check for hopping-backs.
     // This can only be handled by cancellation.
     // Singularity will emerge otherwise.
 
-    // Speed-up lookup of Xij.
-    std::vector<signed> sitecfg(Xij.nsite, -1);
-    config_to_site(elem_cfg, sitecfg, false);
-    for (dim_t i = 0; i < Xij.nsite; ++i)
-      // U(i, k) = Xij(elem_cfg.at(i), osi) - Xij(elem_cfg.at(i), osj);
-      if (sitecfg.at(i) >= 0)
-        // Direct access: special case when installed into VMC.
-        U(sitecfg.at(i), k) = Xij.X(i, osi) - Xij.X(i, osj);
+    // TODO: Speed-up lookup of Xij.
+    for (index_t i = 0; i < n; ++i)
+      U(i, k) = Xij(elem_cfg.config_base(i), osi) - Xij(elem_cfg.config_base(i), osj);
+    // for (index_t i = 0; i < Xij.norb(); ++i)
+    //   if (elem_cfg.inv(i) >= 0)
+    //     U(elem_cfg.inv(i), k) = Xij(i, osi) - Xij(i, osj);
+
+    // P matrix from latter half of Q (or rigorously speaking, B matrix).
+    submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1)));
 
-    skslc(uplo, n, msj, &P(0, k), &M(0, 0), M.ld);
+    P(all, k) = skslc(uplo, msj, M);
 
     // Updated the already logged U.
-    for (dim_t l = 0; l < k; ++l) {
-      dim_t msl = from_idx.at(l);
+    for (index_t l = 0; l < k; ++l) {
+      index_t msl = elem_cfg.from_idx().at(l);
       T U_jl_ = U(msj, l); ///< Backup this value before change.
 
       // Write updates.
-      U(msl, k) = Xij(to_site.at(l), osi) - Xij(elem_cfg.at(msl), osj);
-      U(msj, l) = Xij(osi, to_site.at(l)) - Xij(osj, elem_cfg.at(msl));
+      U(msl, k) = Xij(elem_cfg.to_orb().at(l), osi) - Xij(elem_cfg.config_base(msl), osj);
+      U(msj, l) = Xij(osi, elem_cfg.to_orb().at(l)) - Xij(osj, elem_cfg.config_base(msl));
 
       // Change of Q[:, l] induced by change of U[msj, l];
       // Utilize that P[:, k] = M[:, msj] ///< M is skew-symmetric.
-      axpy(n, U(msj, l) - U_jl_, &P(0, k), 1, &Q(0, l), 1);
+      Q(all, l) += P(all, k) * (U(msj, l) - U_jl_);
 
       // Change of UMV[l, k] induced by change of U[msj, l].
-      for (dim_t o = 0; o < k; ++o)
+      for (index_t o = 0; o < k; ++o)
         UMV(l, o) += (U(msj, l) - U_jl_) * P(msj, o);
     }
 
     // Update UMV and VMV. UMU is updated elsewhere.
     switch (uplo) {
-    case BLIS_UPPER:
-      for (dim_t l = 0; l < k; ++l)
-        W(l, k) = -Xij(to_site.at(l), osi) + Xij(elem_cfg.at(from_idx.at(l)), osj);
-
-      for (dim_t l = 0; l < k; ++l) {
-        UMV(l, k) = dot(n, &U(0, l), 1, &P(0, k), 1);
-        UMV(k, l) = dot(n, &U(0, k), 1, &P(0, l), 1);
+    case 'U':
+    case 'u':
+      for (index_t l = 0; l < k; ++l)
+        W(l, k) = -Xij(elem_cfg.to_orb(l), osi) + Xij(elem_cfg.config_base(elem_cfg.from_idx(l)), osj);
+
+      for (index_t l = 0; l < k; ++l) {
+        UMV(l, k) = (U(all, l).transpose() * P(all, k))(0, 0);
+        UMV(k, l) = (U(all, k).transpose() * P(all, l))(0, 0);
       }
-      UMV(k, k) = dot(n, &U(0, k), 1, &P(0, k), 1);
-      for (dim_t l = 0; l < k; ++l)
+      UMV(k, k) = (U(all, k).transpose() * P(all, k))(0, 0);
+      for (index_t l = 0; l < k; ++l)
         VMV(l, k) /* -VMV(k, l) */ = -P(msj, l); ///< P(from_idx.at(l), k);
       break;
 
     default:
-      cerr << "updated_tdi<T>::push_update:"
+      cerr << "updated_tdi::push_update:"
            << " Only upper-triangular storage is supported." << endl;
     }
 
     if (compute_pfa) {
-      // NOTE: Update k to be new size.
-      k += 1;
-      // Allocate scratchpad.
-      dim_t lwork = 2 * k * npanel_sub;
-      T *pfwork = new T[lwork];
+      // NOTE: Update k to be new size?
+      // k += 1;
+
+      update_pfaffian_ratio();
+    } else
+      // Set to 0.0 to denote dirty.
+      PfaRatio = 0.0;
+  }
 
-      // Calculate unupdated columns of U.
+  void update_pfaffian_ratio(void)
+  {
+    index_t k = elem_cfg.from_idx().size();
+    submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1)));
+    {
+      // All compute_pfa requires first K-1 of Q.
       require_Q(false);
 
+      // Recompute UMU.
       require_UMU();
-      // Assemble (half of) C+BMB buffer.
+      // Reassemble C and scratchpads.
       assemble_C_BMB();
 
       // If it's the first update Pfafian can be directly read out.
       if (k == 1) {
         PfaRatio = -UMV(0, 0) + T(1.0);
-        delete[] pfwork;
         return;
       }
 
       // Compute pfaffian.
-      #ifdef UseBoost
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #else
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #endif
-#ifdef _DEBUG
-      cerr << "SKPFA: info=" << info << endl;
-#endif
+      l2e::le_mat_t<T> Cp_(Cp);
+      Gc = matrix_t::Zero(2*k, 2*k);
+      cPiv = idxvec_t::Zero(2*k);
+      // Use not skpfa<T> for mixed precision.
+      signed info = sktrf(uplo, 'P', Cp_, &cPiv(0), &Gc(0, 0), Gc.size());
+      PfaRatio = ltl2pfa<matrix_t, idxvec_t, amp_t>(uplo, Cp, cPiv);
+
       // Pfaffian of C = [ W -I; I 0 ].
       PfaRatio *= pow(-1.0, k * (k + 1) / 2);
+    }
+  }
 
-      delete[] pfwork;
-    } else
-      // Set to 0.0 to denote dirty.
-      PfaRatio = 0.0;
+  bool check_update_safety(const std::vector<index_t> &ms, bool merge_error = false)
+  {
+    if (elem_cfg.from_idx().size() + ms.size() > mmax)
+      return false;
+
+    // Check if alrady hopped out.
+    for (index_t l = 0; l < elem_cfg.from_idx().size(); ++l)
+      for (index_t j = 0; j < ms.size(); ++j)
+        if (ms.at(j) == elem_cfg.from_idx().at(l)) {
+          assert_(!merge_error, typeid(*this).name(), "Conflict on non-mergable push.");
+          return false;
+        }
+    return true;
   }
 
-  void push_update(dim_t osi, dim_t msj) { push_update(osi, msj, true); }
+  void push_update(index_t osi, index_t msj) { push_update(osi, msj, true); }
 
   // Check duplicate and push.
-  void push_update_safe(dim_t osi, dim_t msj, bool compute_pfa) {
-    if (from_idx.size() >= mmax)
+  void push_update_safe(index_t osi, index_t msj, bool compute_pfa, bool merge_error = false) {
+    if (!check_update_safety({ msj }))
       merge_updates();
 
-    // Check if already hopped out.
-    for (dim_t l = 0; l < from_idx.size(); ++l)
-      if (msj == from_idx.at(l)) {
-        merge_updates(); 
-        break;
-      }
     push_update(osi, msj, compute_pfa);
   }
 
-  void push_update_safe(dim_t osi, dim_t msj) {
+  void push_update_safe(index_t osi, index_t msj) {
     push_update_safe(osi, msj, true);
   }
 
   void pop_update(bool compute_pfa) {
     using namespace std;
+    using namespace Eigen;
 
     // Pop out.
-    dim_t msj_ = from_idx.at(from_idx.size() - 1);
-    dim_t osi_ = to_site.at(to_site.size() - 1);
-    dim_t osj_ = elem_cfg.at(msj_);
-    from_idx.pop_back();
-    to_site.pop_back();
+    index_t msj_ = elem_cfg.from_idx().back();
+    index_t osi_ = elem_cfg.to_orb().back();
+    index_t osj_ = elem_cfg.config_base(msj_);
+    elem_cfg.pop_update();
 
     // New update size.
-    dim_t k = from_idx.size();
+    index_t k = elem_cfg.from_idx().size();
+    submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1)));
 
     // Revert U[:, 1:k-1] update.
-    for (dim_t l = 0; l < k; ++l) {
-      dim_t msl = from_idx.at(l);
+    for (index_t l = 0; l < k; ++l) {
+      index_t msl = elem_cfg.from_idx(l);
       T U_jl_ = U(msj_, l);
 
       // Revert U rows. (final column is popped out).
-      U(msj_, l) = Xij(osj_, to_site.at(l)) - Xij(osj_, elem_cfg.at(msl));
+      U(msj_, l) = Xij(osj_, elem_cfg.to_orb(l)) - Xij(osj_, elem_cfg.config_base(msl));
 
       // Change of Q[:, l] induced by change of U[msj, l];
       T U_diff_msj = U(msj_, l) - U_jl_;
       switch (uplo) {
-      case BLIS_UPPER:
-        for (dim_t i = 0; i < msj_; ++i)
-          Q(i, l) += M(i, msj_) * U_diff_msj;
-        for (dim_t i = msj_ + 1; i < nelec; ++i)
-          Q(i, l) -= M(msj_, i) * U_diff_msj;
+      case 'U':
+      case 'u':
+        Q(seq(0, msj_-1), l) += M(seq(0, msj_-1), msj_) * U_diff_msj;
+        Q(seq(msj_+1, nelec-1), l) -= M(msj_, seq(msj_+1, nelec-1)) * U_diff_msj;
         break;
 
       default:
@@ -467,7 +435,7 @@ template <typename T> struct updated_tdi {
       }
 
       // Change of UMV[l, k] induced by change of U[msj, l].
-      for (dim_t o = 0; o < k; ++o)
+      for (index_t o = 0; o < k; ++o)
         UMV(l, o) += (U(msj_, l) - U_jl_) * P(msj_, o);
     }
     // Pop out outdated Q.
@@ -475,27 +443,11 @@ template <typename T> struct updated_tdi {
       nq_updated = k;
 
     // Compute new (previous, in fact) Pfaffian.
-    if (compute_pfa) {
-      // All compute_pfa requires first K-1 of Q.
-      require_Q(false);
-
-      // Recompute UMU.
-      require_UMU();
-      // Reassemble C and scratchpads.
-      assemble_C_BMB();
-      dim_t lwork = 2 * k * npanel_sub;
-      T *pfwork = new T[lwork];
-
-      #ifdef UseBoost
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #else
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #endif
-      PfaRatio *= pow(-1.0, k * (k + 1) / 2);
+    if (k == 0)
+      PfaRatio = 1.0;
+    else if (compute_pfa) {
+      update_pfaffian_ratio();
 
-      delete[] pfwork;
     } else
       // Set to 0.0 to denote dirty.
       PfaRatio = 0.0;
@@ -504,128 +456,79 @@ template <typename T> struct updated_tdi {
   void merge_updates() {
     using namespace std;
 
-    dim_t n = nelec;
-    dim_t k = from_idx.size();
+    index_t n = nelec;
+    index_t k = elem_cfg.from_idx().size();
     if (k == 0)
       return;
 
     // Allocate scratchpad.
-    dim_t lwork = 2 * k * npanel_sub;
-    T *pfwork = new T[lwork];
-    
+    vector_t vT(2 * k - 1);
+
     // Update whole Q.
     require_Q(true);
 
-    // If M is dirty, redo the tridiagonal factorization.
-    if (PfaRatio == T(0.0)) {
-      require_UMU();
-      assemble_C_BMB();
-      #ifdef UseBoost
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #else
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #endif
-      PfaRatio *= pow(-1.0, k * (k + 1) / 2);
-    }
-
     if (k == 1) {
+      // M is dirty.
+      if (PfaRatio == 0.0) {
+        require_UMU();
+        assemble_C_BMB();
+        PfaRatio = -Cp(0, 1);
+      }
       // Trivial inverse.
       Cp(0, 1) = T(-1.0) / Cp(0, 1);
       Cp(1, 0) = T(-1.0) / Cp(1, 0);
-    } else
-      #ifdef UseBoost
-      signed info = sktdi(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov,
-                          pfwork, lwork);
-      #else
-      signed info = sktdi(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov,
-                          pfwork, lwork);
-      #endif
-    inv_update(k, Cp);
+    } else {
+      // Redo the tridiagonal factorization.
+      require_UMU();
+      assemble_C_BMB();
+
+      l2e::le_mat_t<T> Cp_(Cp);
+      Gc = matrix_t::Zero(2*k, 2*k);
+      cPiv = idxvec_t::Zero(2*k);
+      signed info = sktrf(uplo, 'N', Cp_, &cPiv(0), &Gc(0, 0), 2*k * 2*k);
+      // PfaRatio is dirty.
+      if (PfaRatio == 0.0)
+        PfaRatio = ltl2pfa<matrix_t, idxvec_t, amp_t>(uplo, Cp, cPiv) *
+          T(pow(-1.0, k * (k + 1) / 2));
+      ltl2inv(uplo, Cp, cPiv, vT, Gc);
+    }
+    assert_(Cp.rows() == 2 * k, typeid(*this).name(), "Cp is bad.");
+    inv_update(Cp);
 
     // Apply hopping.
-    for (int j = 0; j < k; ++j)
-      elem_cfg.at(from_idx.at(j)) = to_site.at(j);
-    from_idx.clear();
-    to_site.clear();
+    elem_cfg.merge_config();
     nq_updated = 0;
     Pfa *= PfaRatio;
     PfaRatio = 1.0;
-
-    delete[] pfwork;
   }
 
-  void inv_update(dim_t k, matrix_t<T> &C) {
-    #ifdef UseBoost
-    matrix_t<T> &ABC = U; ///< Use U as inv(A)*B*upper(C) buffer.
-    matrix_t<T> &AB = Q;
-    #else
-    matrix_t<T> ABC(&U(0, 0), U.ld); ///< Use U as inv(A)*B*upper(C) buffer.
-    matrix_t<T> AB(&Q(0, 0), Q.ld);
-    #endif
+  void inv_update(matrix_t &C) {
+    using namespace Eigen;
+    const index_t k = C.rows() / 2;
+    submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1)));
+
+    auto ABC = U(all, seq(0, 2 * k - 1)); ///< Use U as inv(A)*B*upper(C) buffer.
+    auto AB  = Q(all, seq(0, 2 * k - 1));
 
     if (k == 1 && single_hop_alpha) {
       // k == 1 requires no copying
-      #ifdef UseBoost
-      skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 1, C(0, 1), &Q(0, 0), Q.size1(),
-            &P(0, 0), P.size1(), T(1.0), &M(0, 0), M.size1());
-      #else
-      skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 1, C(0, 1), &Q(0, 0), Q.ld,
-            &P(0, 0), P.ld, T(1.0), &M(0, 0), M.ld);
-      #endif
+      skr2k(uplo, 'N', C(0, 1), Q(all, 0), P(all, 0), T(1.0), M);
       // See below for reason of calling this procedule.
       // update_uplo(uplo);
       return;
     }
 
     // Close empty space between Q and P.
-    #ifndef UseBoost
     if (k != mmax)
-    #endif
-      for (dim_t j = 0; j < k; ++j)
-        memcpy(&AB(0, k + j), &P(0, j), nelec * sizeof(T));
-
-    # if 0
-    // Copy AB to ABC for TRMM interface.
-    for (dim_t j = 0; j < 2 * k - 1; ++j)
-      // inv(A)*U  [ 0 + + +
-      //             0 0 + +
-      //             0 0 0 +
-      //             0 0 0 0 ] => AB[:, 0:2] -> ABC[:, 1:3]
-      memcpy(&ABC(0, j + 1), &AB(0, j), nelec * sizeof(T));
-    #ifdef UseBoost
-    trmm(BLIS_RIGHT, BLIS_UPPER, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0),
-         &C(0, 1), C.size1(), &ABC(0, 1), ABC.size1());
-    #else
-    trmm(BLIS_RIGHT, BLIS_UPPER, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0),
-         &C(0, 1), C.ld, &ABC(0, 1), ABC.ld);
-    #endif
-
-    // Update: write to M.
-    #ifdef UseBoost
-    skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &ABC(0, 1), ABC.size1(),
-          &AB(0, 1), AB.size1(), T(1.0), &M(0, 0), M.size1());
-    #else
-    skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &ABC(0, 1), ABC.ld,
-          &AB(0, 1), AB.ld, T(1.0), &M(0, 0), M.ld);
-    #endif
-    #else
-    gemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE,
-         nelec, 2 * k, 2 * k,
-         T(1.0),
-         &AB(), AB.ld,
-         &C(), C.ld,
-         T(0.0),
-         &ABC(), ABC.ld);
-    gemmt(uplo, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE,
-          nelec, 2 * k,
-          T(1.0),
-          &ABC(), ABC.ld,
-          &AB(), AB.ld,
-          T(1.0),
-          &M(), M.ld);
-    #endif
+      for (index_t j = 0; j < k; ++j)
+        AB(all, k + j) = P(all, j);
+
+    l2e::le_mat_t<T> M_(M);
+    ABC = AB * C;
+    gemmt<T>(uplo, 'N', 'T', T(1.0),
+             l2e::le_mat_t<T>(ABC),
+             l2e::le_mat_t<T>(AB),
+             T(1.0), M_);
 
     // Identity update to complete antisymmetric matrix.
     // This called due to lack of skmm support at the moment.
@@ -635,15 +538,20 @@ template <typename T> struct updated_tdi {
   /**
    * Complete antisymmetric matrix.
    */
-  void skcomplete(uplo_t uplo_, dim_t n, matrix_t<T> &A) {
-    for (dim_t j = 0; j < n; ++j) {
-      for (dim_t i = 0; i < j; ++i) {
+  template <typename Mat>
+  static void skcomplete(const char &uplo_, Mat &A) {
+    const index_t n = A.rows();
+
+    for (index_t j = 0; j < n; ++j) {
+      for (index_t i = 0; i < j; ++i) {
         switch (uplo_) {
-        case BLIS_UPPER:
+        case 'U':
+        case 'u':
           A(j, i) = -A(i, j);
           break;
 
-        case BLIS_LOWER:
+        case 'L':
+        case 'l':
           A(i, j) = -A(j, i);
           break;
 
@@ -655,15 +563,160 @@ template <typename T> struct updated_tdi {
     }
   }
 
-  void update_uplo(uplo_t uplo_new) {
-    skcomplete(uplo /* NOTE: old uplo */, nelec, M);
-    if (from_idx.size()) {
-      skcomplete(uplo, from_idx.size(), UMU);
-      skcomplete(uplo, from_idx.size(), VMV);
-      skcomplete(uplo, from_idx.size(), W);
+  void update_uplo(const char &uplo_new) {
+    using namespace Eigen;
+
+    skcomplete(uplo /* NOTE: Old uplo. */, M);
+    const int k = elem_cfg.from_idx().size();
+    if (k) {
+      skcomplete(uplo, UMU(seq(0, k-1), seq(0, k-1)));
+      skcomplete(uplo, VMV(seq(0, k-1), seq(0, k-1)));
+      skcomplete(uplo,   W(seq(0, k-1), seq(0, k-1)));
     }
 
     uplo = uplo_new;
   }
 
-}; 
+  Eigen::Vector<amp_t, Eigen::Dynamic> batch_query_amplitudes
+      (int N, const Eigen::Map<idxvec_t> &to_orbs, const Eigen::Map<idxvec_t> &from_ids) {
+    using namespace Eigen;
+    using ampvec_t = Eigen::Vector<amp_t, Eigen::Dynamic>;
+
+    bool upper = (uplo == 'U' || uplo == 'u');
+    int k0 = from_ids.size();
+    int k1 = k0 / N;
+    assert_(k0 % N == 0, typeid(*this).name(), "Misaligned batch query.");
+    ampvec_t result(k1);
+
+    // Works w/ merged M only.
+    merge_updates();
+    // Operates on complete M.
+    skcomplete(uplo, M);
+
+    matrix_t UbB;
+    if (N > 1)
+      UbB = matrix_t::Zero(nelec, k1 * (N-1));
+    matrix_t UbF= matrix_t::Zero(nelec, k1);
+    matrix_t Pb = matrix_t::Zero(nelec, k0);
+
+    // Grouped comput. of Ub.
+    for (index_t l1 = 0; l1 < k1; ++l1) {
+      for (index_t l = l1 * N; l < (l1 + 1) * N; ++l)
+        elem_cfg.push_update(to_orbs(l), from_ids(l));
+
+      int osi, osj;
+
+      for (index_t l2 = 0; l2 < N-1; ++l2) {
+        index_t l = l1 * N + l2;
+        index_t lB = l1 * (N-1) + l2;
+
+        osi = to_orbs(l);
+        osj = elem_cfg.config_base(from_ids(l));
+
+        for (index_t i = 0; i < nelec; ++i) {
+          index_t osx = i == from_ids(l) ? elem_cfg.config_base(i) : elem_cfg.config(i);
+          UbB(i, lB) = Xij(osx, osi) - Xij(elem_cfg.config_base(i), osj);
+        }
+      }
+
+      // Final column in the grouped U.
+      {
+        index_t l = l1 * N + N - 1;
+        osi = to_orbs(l);
+        osj = elem_cfg.config_base(from_ids(l));
+
+        for (index_t i = 0; i < nelec; ++i) {
+          index_t osx = i == from_ids(l) ? elem_cfg.config_base(i) : elem_cfg.config(i);
+          UbF(i, l1) = Xij(osx, osi) - Xij(elem_cfg.config_base(i), osj);
+        }
+      }
+
+      for (index_t ll = 0; ll < N; ++ll)
+        elem_cfg.pop_update();
+    }
+
+    // Pb needs no grouping.
+    for (index_t l = 0; l < k0; ++l)
+      Pb(all, l) = M(all, from_ids(l));
+
+    matrix_t QbB;
+    if (N > 1)
+      // Require Q.
+      QbB = M * UbB;
+
+    // Batch assemble C.
+    matrix_t Cb = matrix_t::Zero(2*N, 2*N * k1);
+    for (index_t l1 = 0; l1 < k1; ++l1) {
+      auto CCur = Cb(all, seq(2*N * l1, 2*N * (l1+1) - 1));
+      auto UCur = UbB(all, seq((N-1) * l1, (N-1) * (l1+1) - 1));
+      auto QCur = QbB(all, seq((N-1) * l1, (N-1) * (l1+1) - 1));
+      auto PCur = Pb(all, seq(N * l1, N * (l1+1) - 1));
+
+      // UMU + W.
+      for (index_t o = 0; o < N-1; ++o)
+        for (index_t l = 0; l < o; ++l)
+          if (upper)
+            CCur(l, o) = -(UCur(all, o).transpose() * QCur(all, l))(0,0) -
+              Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) +
+              Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
+                  elem_cfg.config_base(from_ids(l1 * N + o)));
+          else
+            CCur(o, l) =  (UCur(all, o).transpose() * QCur(all, l))(0,0) -
+              Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) +
+              Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
+                  elem_cfg.config_base(from_ids(l1 * N + o)));
+      {
+        for (index_t l = 0; l < N-1; ++l)
+          if (upper)
+            CCur(l, N-1) = -(UbF(all, l1).transpose() * QCur(all, l))(0,0) -
+              Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) +
+              Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
+                  elem_cfg.config_base(from_ids(l1 * N + N-1)));
+          else
+            CCur(N-1, l) =  (UbF(all, l1).transpose() * QCur(all, l))(0,0) -
+              Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) +
+              Xij(elem_cfg.config_base(from_ids(l1 * N + l)),
+                  elem_cfg.config_base(from_ids(l1 * N + N-1)));
+      }
+
+      // UMV
+      if (upper) {
+        if (N > 1)
+          CCur(seq(0, N - 2), seq(N, 2*N - 1)) = UCur.transpose() * PCur;
+        CCur(N - 1, seq(N, 2*N - 1)) = UbF(all, l1).transpose() * PCur;
+      } else
+        assert_(false, typeid(*this).name(), "Lower diag not implemented.");
+      // -I
+      CCur(seq(0, N - 1), seq(N, 2*N - 1)) -= matrix_t::Identity(N, N);
+
+      // VMV
+      for (index_t o = 0; o < N; ++o)
+        for (index_t l = 0; l < o; ++l)
+          if (upper)
+            CCur(N + l, N + o) = -PCur(from_ids(l1 * N + o), l);
+          else
+            CCur(N + o, N + l) =  PCur(from_ids(l1 * N + o), l);
+    }
+
+    // Batch Pfaffian call.
+    matrix_t Gc = matrix_t::Zero(2*N, 2*N);
+    idxvec_t cPiv = idxvec_t::Zero(2*N);
+    for (index_t l1 = 0; l1 < k1; ++l1) {
+      auto CCur = Cb(all, seq(2*N * l1, 2*N * (l1+1) - 1));
+      if (N == 1) {
+        result(l1) = -CCur(0, 1) + T(1.0);
+        continue;
+      }
+      l2e::le_mat_t<T> C_(CCur);
+      signed info = sktrf(uplo, 'P', C_, &cPiv(0), &Gc(0, 0), Gc.size());
+      result(l1) = ltl2pfa<matrix_t, idxvec_t, amp_t>(uplo, CCur, cPiv);
+    }
+    if (N > 1)
+      result = result * pow(-1.0, N * (N + 1) / 2);
+
+    return result;
+  }
+
+};
+}
+}
diff --git a/tool/CMakeLists.txt b/tool/CMakeLists.txt
index ae13a4f8..4c225b44 100644
--- a/tool/CMakeLists.txt
+++ b/tool/CMakeLists.txt
@@ -6,9 +6,8 @@ if(${CMAKE_PROJECT_NAME} STREQUAL "Project")
   message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.")
 endif(${CMAKE_PROJECT_NAME} STREQUAL "Project")
 
-add_library(key2lower STATIC key2lower.c)
 add_executable(greenr2k greenr2k.F90)
-target_link_libraries(greenr2k key2lower ${LAPACK_LIBRARIES})
+target_link_libraries(greenr2k ${LAPACK_LIBRARIES})
 
 install(TARGETS greenr2k RUNTIME DESTINATION bin)
 #
diff --git a/tool/greenr2k.F90 b/tool/greenr2k.F90
index eba207f0..1472978f 100644
--- a/tool/greenr2k.F90
+++ b/tool/greenr2k.F90
@@ -75,15 +75,21 @@ MODULE fourier_routine
   !
   IMPLICIT NONE
   !
-  INTERFACE
-     SUBROUTINE key2lower(key) BIND(c)
-       USE,INTRINSIC :: iso_c_binding
-       CHARACTER(KIND=C_CHAR) :: key(*)
-     END SUBROUTINE key2lower
-  END INTERFACE
-  !
 CONTAINS
 !
+SUBROUTINE key2lower(key)
+  CHARACTER(*) :: key
+  !
+  INTEGER :: ii, acode
+  !
+  DO ii = 1, LEN(TRIM(key))
+     acode = IACHAR(key(ii:ii))
+     IF(65 <= acode .AND. acode <= 90) THEN
+        key(ii:ii) = ACHAR(acode + 32)
+     END IF
+  END DO
+END SUBROUTINE key2lower
+!
 ! Read from HPhi/mVMC input files
 !
 SUBROUTINE read_filename()
@@ -461,10 +467,10 @@ END SUBROUTINE set_kpoints
 SUBROUTINE read_corrindx()
   !
   USE fourier_val, ONLY : file_one, file_two, ncor1, ncor2, ncor, indx, &
-  &                       calctype, nr, rindx, orb, norb
+  &                       calctype, nr, rindx, orb, norb, irv
   IMPLICIT NONE
   !
-  INTEGER :: fi = 10, itmp(8), icor
+  INTEGER :: fi = 10, itmp(8), icor, ir, iorb, jorb
   CHARACTER(100) :: ctmp
   !
   WRITE(*,*) 
@@ -489,7 +495,14 @@ SUBROUTINE read_corrindx()
   !
   ncor(1:2) = 0
   DO icor = 1, ncor1
+     !
      READ(fi,*) itmp(1:4)
+     !
+     IF(ANY(irv(1:3,1,rindx(itmp(1)+1)) /= 0)) THEN
+        WRITE(*,'(a,i0,a)') "       REMARK : The left operator at # ", icor, " is not at R=0."
+        CYCLE
+     END IF
+     !
      IF(itmp(2) == 0 .AND. itmp(4) == 0) THEN
         !
         ! Up-Up correlation
@@ -527,6 +540,11 @@ SUBROUTINE read_corrindx()
      READ(fi,*) itmp(1:8)
      !
      IF(itmp(1) == itmp(3) .AND. itmp(5) == itmp(7)) THEN
+        !
+        IF(ANY(irv(1:3,1,rindx(itmp(1)+1)) /= 0)) THEN
+           WRITE(*,'(a,i0,a)') "       REMARK : The left operator at # ", icor, " is not at R=0."
+           CYCLE
+        END IF
         !
         IF(itmp(2) == 0 .AND. itmp(4) == 0) THEN
            !
@@ -577,6 +595,11 @@ SUBROUTINE read_corrindx()
      END IF
      !
      IF(calctype == 4 .AND. (itmp(1) == itmp(7) .AND. itmp(3) == itmp(5))) THEN
+        !
+        IF(ANY(irv(1:3,1,rindx(itmp(1)+1)) /= 0)) THEN
+           WRITE(*,'(a,i0,a)') "       REMARK : The left operator at # ", icor, " is not at R=0."
+           CYCLE
+        END IF
         !
         ! S+S- & S-S+ for mVMC
         !
@@ -596,6 +619,35 @@ SUBROUTINE read_corrindx()
      !
   END DO
   !
+  IF(COUNT(indx(1:nr,3:8,1:norb,1:norb) == 0) /= 0) THEN
+     WRITE(*,*) "ERROR! The following correlation function is missed:"
+     WRITE(*,*) "R,   kind,   orb1,   orb2"
+     DO icor = 3, 8
+        DO ir = 1, nr
+           DO iorb = 1, norb
+              DO jorb = 1, norb
+                 IF(indx(ir,icor,iorb,jorb) == 0) THEN
+                    IF(icor == 3) THEN
+                       WRITE(*,'(i0,a,i0,2x,i0)') ir, " Up-Up-Up-Up         ", iorb, jorb
+                    ELSE IF(icor == 4) THEN
+                       WRITE(*,'(i0,a,i0,2x,i0)') ir, " Up-Up-Down-Down     ", iorb, jorb
+                    ELSE IF(icor == 5) THEN
+                       WRITE(*,'(i0,a,i0,2x,i0)') ir, " Down-Down-Up-Up     ", iorb, jorb
+                    ELSE IF(icor == 6) THEN
+                       WRITE(*,'(i0,a,i0,2x,i0)') ir, " Down-Down-Down-Down ", iorb, jorb
+                    ELSE IF(icor == 7) THEN
+                       WRITE(*,'(i0,a,i0,2x,i0)') ir, " Up-Down-Down-Up     ", iorb, jorb
+                    ELSE
+                       WRITE(*,'(i0,a,i0,2x,i0)') ir, " Down-Up-Up-Down     ", iorb, jorb
+                    END IF
+                 END IF
+              END DO
+           END DO
+        END DO
+     END DO
+     STOP "Missing indices for the Green function."
+  END IF
+  !
   WRITE(*,*) "    Number of UpUpUpUp         Index : ", COUNT(indx(1:nr, 3, 1:norb, 1:norb) /= 0)
   WRITE(*,*) "    Number of UpUpDownDown     Index : ", COUNT(indx(1:nr, 4, 1:norb, 1:norb) /= 0)
   WRITE(*,*) "    Number of DownDownUpUp     Index : ", COUNT(indx(1:nr, 5, 1:norb, 1:norb) /= 0)
@@ -861,7 +913,13 @@ SUBROUTINE output_cor()
         END DO
         !
         DO ik = 1, ikk
-           WRITE(fo,'(1000e15.5)') xk(ik), cor_ave(ik,1:6, 1:norb, 1:norb), cor_err(ik,1:6, 1:norb, 1:norb)
+           WRITE(fo,'(e15.5)',advance="no") xk(ik)
+           DO iorb = 1, norb
+              DO jorb = 1, norb
+                 WRITE(fo,'(24e15.5)',advance="no") cor_ave(ik,1:6, jorb, iorb), cor_err(ik,1:6, jorb, iorb)
+              END DO
+           END DO
+           WRITE(fo,*)
         END DO
         !
         CLOSE(fo)
diff --git a/tool/key2lower.c b/tool/key2lower.c
deleted file mode 100644
index f416eade..00000000
--- a/tool/key2lower.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <string.h>
-#include <ctype.h>
-
-void key2lower(char *key){
-  unsigned int ii;
-  for (ii = 0; ii < strlen(key); ii++) key[ii] = tolower(key[ii]);
-}
diff --git a/tool/makefile_tool b/tool/makefile_tool
index 1d2a177b..e89c7a9a 100644
--- a/tool/makefile_tool
+++ b/tool/makefile_tool
@@ -4,10 +4,10 @@ include ../src/make.sys
 .SUFFIXES : .o .F90
 .SUFFIXES : .o .c
 
-all:greenr2k
+all: greenr2k
 
-greenr2k:greenr2k.o key2lower.o
-	$(F90) greenr2k.o key2lower.o $(LIBS) -o $@
+greenr2k: greenr2k.o
+	$(F90) greenr2k.o $(LIBS) -o $@
 
 .F90.o:
 	$(F90) -c $< $(FFLAGS)