diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml index 02ab63ca..9c6855af 100644 --- a/.github/workflows/deploy_docs.yml +++ b/.github/workflows/deploy_docs.yml @@ -4,6 +4,7 @@ on: push: branches: - master + - develop - gh_actions # test branch - '!gh-pages' tags: '*' @@ -13,21 +14,21 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Inject slug/short variables - uses: rlespinasse/github-slug-action@v3.x + uses: rlespinasse/github-slug-action@v4.x - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: path: main - name: Checkout gh-pages - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: ref: gh-pages path: gh-pages - name: Set up Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v4 with: python-version: 3.8 @@ -42,7 +43,7 @@ jobs: cmake -E make_directory build cd build cmake -DDocument=ON -DENABLE_MPI=OFF ../ - make doc-ja-html doc-en-html + make doc-ja-html doc-en-html tutorial-ja-html - name: Deploy Configuration run: | @@ -63,6 +64,11 @@ jobs: mkdir -p "gh-pages/doc/${TARGET_NAME}" cp -r "main/build/doc/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/${lang}" done + for lang in ja; do + rm -rf "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}" + mkdir -p "gh-pages/doc/${TARGET_NAME}"/tutorial/ + cp -r "main/build/doc/tutorial/${lang}/source/html" "gh-pages/doc/${TARGET_NAME}/tutorial/${lang}" + done cd gh-pages git config --local user.name "${GIT_USER}" git config --local user.email "${GIT_EMAIL}" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b49b05fe..25c18f04 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,6 +2,7 @@ name: CI on: push: + pull_request: schedule: - cron: '0 0 1,15 * *' # JST 9:00 on 1st and 15th every month @@ -27,12 +28,12 @@ jobs: if: ${{ runner.os == 'Linux' }} run: | sudo apt update - sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev + sudo apt install liblapack-dev openmpi-bin libopenmpi-dev libscalapack-openmpi-dev libeigen3-dev - name: brew if: ${{ runner.os == 'macOS' }} run: | - brew install openmpi scalapack libomp + brew install openmpi scalapack libomp blis eigen - name: Setup Python uses: actions/setup-python@v5 @@ -52,13 +53,10 @@ jobs: run: | if [ ${{ runner.os }} = "macOS" ] ; then # CONFIG=apple requires gfortran but macOS runner has not, but gfortran-11, 12, ... - ln -s `which gfortran-11` gfortran - env PATH=`pwd`:$PATH cmake -DCONFIG=apple -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE + cmake -DCONFIG=apple -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_Fortran_COMPILER=gfortran-14 $GITHUB_WORKSPACE else cmake -DCMAKE_VERBOSE_MAKEFILE=ON $GITHUB_WORKSPACE fi - env: - HOMEBREW_PREFIX: /opt/homebrew - name: build working-directory: ${{runner.workspace}}/build diff --git a/.gitmodules b/.gitmodules index 99e901f0..825489fb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,10 +1,6 @@ -[submodule "src/pfaffine"] - path = src/pfaffine - url = https://github.com/xrq-phys/Pfaffine - branch = blis [submodule "src/blis"] path = src/blis - url = https://github.com/xrq-phys/blis + url = https://github.com/flame/blis branch = mvmc [submodule "src/pfapack"] path = src/pfapack diff --git a/CMakeLists.txt b/CMakeLists.txt index 70851aa8..76b217fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,9 @@ -cmake_minimum_required(VERSION 2.8.0 FATAL_ERROR) +cmake_minimum_required(VERSION 3.0 FATAL_ERROR) project(mVMC NONE) option(USE_SCALAPACK "Use Scalapack" OFF) option(PFAFFIAN_BLOCKED "Use blocked-update Pfaffian to speed up." OFF) +option(USE_GEMMT "Use GEMMT. Recommended regardless blocked-Pfaffian-update." ON) add_definitions(-D_mVMC) if(CONFIG) @@ -17,11 +18,10 @@ message(STATUS "Build type: " ${CMAKE_BUILD_TYPE}) option(BUILD_SHARED_LIBS "Build shared libraries" ON) option(GIT_SUBMODULE_UPDATE "execute `git submodule update` automatically" ON) -enable_language(C Fortran) -if(PFAFFIAN_BLOCKED) - enable_language(CXX) - set(CMAKE_CXX_STANDARD 11) -endif(PFAFFIAN_BLOCKED) + +# First, enables C language only. +# External packages only use their C API. +enable_language(C) set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") set(CMAKE_SKIP_BUILD_RPATH FALSE) @@ -29,6 +29,7 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) set(CMAKE_MACOSX_RPATH 1) +# TODO: Is this really needed? if("${CMAKE_BUILD_TYPE}" MATCHES "Debug") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_DEBUG}") else() @@ -37,16 +38,19 @@ else() endif() if(CMAKE_C_COMPILER_ID STREQUAL "Intel") + # TODO: Really needs separation? if("${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "15.0.0.20140528") set(OMP_FLAG_Intel "-openmp") else() set(OMP_FLAG_Intel "-qopenmp") endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OMP_FLAG_Intel}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OMP_FLAG_Intel}") else() find_package(OpenMP) if(OPENMP_FOUND) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") endif(OPENMP_FOUND) endif() @@ -57,6 +61,9 @@ if(MPI_C_FOUND) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_C_LINK_FLAGS}") endif(MPI_C_FOUND) +# BLAS/LAPACK/Pfapack77 needs Fortran interface. +enable_language(Fortran) + find_package(LAPACK) if(USE_SCALAPACK MATCHES OFF) if(LAPACK_FOUND) @@ -64,6 +71,34 @@ if(USE_SCALAPACK MATCHES OFF) endif(LAPACK_FOUND) endif() +if(PFAFFIAN_BLOCKED + OR USE_GEMMT) + set(Require_BLIS ON) +endif() + +if(BLA_VENDOR MATCHES "Intel10*" + OR BLAS_LIBRARIES MATCHES "/*mkl*") + # Don't require BLIS when MKL is used. + add_definitions(-DMKL) + add_definitions(-DBLAS_EXTERNAL) + add_definitions(-DF77_COMPLEX_RET_INTEL) + set(Require_BLIS OFF) +elseif(BLA_VENDOR MATCHES "FLA*" + OR BLAS_LIBRARIES MATCHES "/*blis*") + # Skip extra BLIS if it's already the BLAS vendor. + list(GET BLAS_LIBRARIES 0 BLIS_FIRST_LIB) + get_filename_component(BLIS_LIB_DIR ${BLIS_FIRST_LIB} DIRECTORY) + include_directories(${BLIS_LIB_DIR}/../include) + include_directories(${BLIS_LIB_DIR}/../include/blis) + set(Require_BLIS OFF) +else() + # BLAS vendor preference: + # External > BLIS > Reference + if(DEFINED BLA_VENDOR) + add_definitions(-DBLAS_EXTERNAL) + endif() +endif() + # Build and enable tests # testing setup # enable_testing() must be called in the top-level CMakeLists.txt before any add_subdirectory() is called. @@ -97,13 +132,31 @@ if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") endif() add_subdirectory("${STDFACE_DIR}") +# C++ support for pfupdates & ltl2inv. +enable_language(CXX) +set(CMAKE_CXX_STANDARD 11) + +# Require Eigen +find_package(Eigen3 3.4 REQUIRED NO_MODULE) +include_directories(${EIGEN3_INCLUDE_DIR}) +# We surely want to offload mmuls to BLAS. +add_definitions(-DEIGEN_USE_BLAS) + +if(Require_BLIS) + add_definitions(-D_BLIS) + include("download_blis_artifact.cmake") +else(Require_BLIS) + # Use bundled blis.h + include_directories(src/common/deps) +endif(Require_BLIS) + if(PFAFFIAN_BLOCKED) add_definitions(-D_pf_block_update) - include("download_blis_artifact.cmake") - # Must set BLIS artifact BEFORE adding pfupdates target. - add_subdirectory(src/pfupdates) - add_dependencies(pfupdates blis_include) endif(PFAFFIAN_BLOCKED) +add_subdirectory(src/pfupdates) +if(Require_BLIS) + add_dependencies(pfupdates blis_include) +endif(Require_BLIS) if (Document) add_subdirectory(doc) @@ -111,5 +164,9 @@ endif(Document) add_subdirectory(src/ComplexUHF) add_subdirectory(src/pfapack/fortran) +add_subdirectory(src/ltl2inv) + if(Require_BLIS) + add_dependencies(ltl2inv blis_include) + endif(Require_BLIS) add_subdirectory(src/mVMC) add_subdirectory(tool) diff --git a/config/aocc.cmake b/config/aocc.cmake index a53a783a..258d9772 100644 --- a/config/aocc.cmake +++ b/config/aocc.cmake @@ -9,6 +9,9 @@ set(CMAKE_C_FLAGS_RELEASE "-Wno-unknown-pragmas -O3 -DNDEBUG -DHAVE_SSE2" CACHE set(CMAKE_Fortran_COMPILER "flang" CACHE STRING "" FORCE) set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -DNDEBUG -DHAVE_SSE2" CACHE STRING "" FORCE) +# OpenMP, libatomic +set(CMAKE_EXE_LINKER_FLAGS "-fopenmp -latomic") + # for AOCL set(BLA_VENDOR "FLAME" CACHE STRING "" FORCE) diff --git a/config/apple.cmake b/config/apple.cmake index 73082407..37a7a290 100644 --- a/config/apple.cmake +++ b/config/apple.cmake @@ -2,7 +2,7 @@ # additional libomp and gfortran installation required # mac computers are suggested to use this configuration for better performance -if(NOT $ENV{HOMEBREW_PREFIX}) +if(NOT DEFINED ENV{HOMEBREW_PREFIX}) message(FATAL "Homebrew is not installed. Please install Homebrew first.") endif() @@ -10,7 +10,6 @@ set(CMAKE_C_COMPILER "clang" CACHE STRING "" FORCE) set(CMAKE_CXX_COMPILER "clang++" CACHE STRING "" FORCE) set(CMAKE_C_FLAGS_DEBUG "-g -O0 -Wall -Wformat -Werror=format-security") set(CMAKE_C_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas -Wno-logical-not-parentheses") -# set(CMAKE_Fortran_COMPILER "gfortran" CACHE STRING "" FORCE) # OpenMP with libomp set(CMAKE_EXE_LINKER_FLAGS "-L$ENV{HOMEBREW_PREFIX}/opt/libomp/lib -lomp") diff --git a/config/gcc.cmake b/config/gcc.cmake index 751912d7..8a15d2dc 100644 --- a/config/gcc.cmake +++ b/config/gcc.cmake @@ -3,4 +3,5 @@ set(CMAKE_C_COMPILER "gcc" CACHE STRING "" FORCE) set(CMAKE_CXX_COMPILER "g++" CACHE STRING "" FORCE) set(CMAKE_C_FLAGS_DEBUG "-g -O0 -Wall -Wformat -Werror=format-security") set(CMAKE_C_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas ") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas ") set(CMAKE_Fortran_COMPILER "gfortran" CACHE STRING "" FORCE) diff --git a/download_blis_artifact.cmake b/download_blis_artifact.cmake index f6274bcc..b2033f92 100644 --- a/download_blis_artifact.cmake +++ b/download_blis_artifact.cmake @@ -4,20 +4,8 @@ if(NOT DEFINED BLIS_ARTIFACT_CONFIG) COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE) if(${ARCHITECTURE} STREQUAL "x86_64") - execute_process( - COMMAND cat /proc/cpuinfo - COMMAND grep Intel - COMMAND head -1 - COMMAND tr -d '\n' - OUTPUT_VARIABLE UARCH_INTEL) - if(NOT "${UARCH_INTEL} " STREQUAL " ") - set(BLIS_ARTIFACT_CONFIG "intel64") - else() - set(BLIS_ARTIFACT_CONFIG "amd64") - endif() - elseif(${ARCHITECTURE} STREQUAL "aarch64" OR - ${ARCHITECTURE} STREQUAL "arm64") - # Currently SVE requires being set manually. + set(BLIS_ARTIFACT_CONFIG "x86_64") + elseif(${ARCHITECTURE} STREQUAL "aarch64" OR ${ARCHITECTURE} STREQUAL "arm64") set(BLIS_ARTIFACT_CONFIG "arm64") else() message(FATAL_ERROR "Failed to recognize architecture ${ARCHITECTURE}.") @@ -27,14 +15,10 @@ endif(NOT DEFINED BLIS_ARTIFACT_CONFIG) message(STATUS "Downloading BLIS artifact...") set(BLIS_ARCHIVE ${CMAKE_CURRENT_BINARY_DIR}/libblis_artifact.tar.gz) -if(${BLIS_ARTIFACT_CONFIG} STREQUAL "intel64") - set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_intel64_gcc.tar.gz) -elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "amd64") - set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_amd64_gcc.tar.gz) -elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "arm64") - set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_cortexa57_aarch64-linux-gnu-gcc.tar.gz) -elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "armsve") - set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_armsve_aarch64-linux-gnu-gcc-10.tar.gz) +if(${BLIS_ARTIFACT_CONFIG} STREQUAL "x86_64") + set(BLIS_ARTIFACT_URL https://github.com/JuliaBinaryWrappers/blis_jll.jl/releases/download/blis-v0.9.0%2B4/blis.v0.9.0.x86_64-linux-gnu.tar.gz) +elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "arm64" OR ${BLIS_ARTIFACT_CONFIG} STREQUAL "armsve") + set(BLIS_ARTIFACT_URL https://github.com/JuliaBinaryWrappers/blis_jll.jl/releases/download/blis-v0.9.0%2B4/blis.v0.9.0.aarch64-linux-gnu.tar.gz) elseif(${BLIS_ARTIFACT_CONFIG} STREQUAL "a64fx") set(BLIS_ARTIFACT_URL https://github.com/xrq-phys/blis/releases/download/sv0.8.1+arm/libblis_a64fx_aarch64-linux-gnu-gcc-10.tar.gz) else() @@ -53,6 +37,7 @@ add_custom_target(blis_target DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/lib/libblis.a) add_custom_target(blis_include DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/include/blis/blis.h) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/blis) add_library(blis STATIC IMPORTED GLOBAL) diff --git a/mVMCconfig.sh b/mVMCconfig.sh index f731ccc7..1e0d7a57 100755 --- a/mVMCconfig.sh +++ b/mVMCconfig.sh @@ -40,8 +40,8 @@ F90 = frt FFLAGS = -DNDEBUG -DFUJITSU -Kfast CFLAGS = -DNDEBUG -Ofast -fopenmp CXXFLAGS = -DNDEBUG -Ofast -fopenmp -CFLAGS += -D_lapack -D_pf_block_update # -D_pfaffine -CXXFLAGS += -DBLAS_EXTERNAL +CFLAGS += -D_lapack -D_pf_block_update +CXXFLAGS += -DBLAS_EXTERNAL -D_BLIS -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.so -SSL2 -lm -lpthread @@ -74,8 +74,8 @@ F90 = ifort FFLAGS = -O3 -implicitnone CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine -CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL +CFLAGS += -D_lapack -D_pf_block_update +CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -mkl=sequential -lm -lpthread @@ -100,8 +100,8 @@ F90 = ifort FFLAGS = -O3 -implicitnone CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine -CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL +CFLAGS += -D_lapack -D_pf_block_update +CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -mkl=sequential -lm -lpthread @@ -124,8 +124,8 @@ F90 = flang FFLAGS = -O3 CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -fPIC -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine -CXXFLAGS += +CFLAGS += -D_lapack -D_pf_block_update +CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread @@ -148,8 +148,8 @@ F90 = gfortran FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -fPIC -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine -CXXFLAGS += +CFLAGS += -D_lapack -D_pf_block_update +CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = -lflame \$(BLIS_ROOT)/lib/libblis.a -lm -lpthread @@ -172,8 +172,8 @@ F90 = gfortran FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine -CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL +CFLAGS += -D_lapack -D_pf_block_update +CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL -D_BLIS -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lm -lpthread @@ -196,8 +196,8 @@ F90 = gfortran FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine -CXXFLAGS += +CFLAGS += -D_lapack -D_pf_block_update +CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread @@ -220,8 +220,8 @@ F90 = gfortran FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine -CXXFLAGS += # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple. +CFLAGS += -D_lapack -D_pf_block_update +CXXFLAGS += -D_BLIS -DEIGEN_USE_BLAS # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple. BLIS_ROOT = ${PWD}/src/blis-install LIBS = \$(BLIS_ROOT)/lib/libblis.a -llapack -lm -lpthread @@ -242,7 +242,6 @@ EOF cat src/make.sys cat src/make-ext.sys ln -s $PWD/src/make.sys src/StdFace/make.sys; - ln -s $PWD/src/make.sys src/pfaffine/make.inc; ln -s $PWD/src/make.sys src/pfapack/fortran/make.inc; cat > makefile < // skipped +#include // skipped +#include // skipped +#include // skipped +#include // skipped +#include // skipped +#include // skipped +#include // skipped +#include // skipped + +// -- STATIC INLINE FUNCTIONS -------------------------------------------------- + +// C and C++ have different semantics for defining "inline" functions. In C, +// the keyword phrase "static inline" accomplishes this, though the "inline" +// is optional. In C++, the "inline" keyword is required and obviates "static" +// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may +// be #included by a source file that is compiled with C++. +#ifdef __cplusplus + #define BLIS_INLINE inline +#else + //#define BLIS_INLINE static inline + #define BLIS_INLINE static +#endif + +// -- Common BLIS definitions -- + +// begin bli_type_defs.h + + +#ifndef BLIS_TYPE_DEFS_H +#define BLIS_TYPE_DEFS_H + + +// +// -- BLIS basic types --------------------------------------------------------- +// + +#ifdef __cplusplus + // For C++, include stdint.h. +#include // skipped +#elif __STDC_VERSION__ >= 199901L + // For C99 (or later), include stdint.h. +#include // skipped +#include // skipped +#else + // When stdint.h is not available, manually typedef the types we will use. + #ifdef _WIN32 + typedef __int32 int32_t; + typedef unsigned __int32 uint32_t; + typedef __int64 int64_t; + typedef unsigned __int64 uint64_t; + #else + #error "Attempting to compile on pre-C99 system without stdint.h." + #endif +#endif + +// -- General-purpose integers -- + +// If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. +// NOTE: This cpp guard will only meaningfully change BLIS's behavior on +// systems where the BLIS integer size would have been automatically selected +// to be 32 bits, since explicit selection of 32 bits is prohibited at +// configure-time (and explicit or automatic selection of 64 bits is fine +// and would have had the same result). +#if BLIS_BLAS_INT_SIZE == 64 + #undef BLIS_INT_TYPE_SIZE + #define BLIS_INT_TYPE_SIZE 64 +#endif + +// Define integer types depending on what size integer was requested. +#if BLIS_INT_TYPE_SIZE == 32 +typedef int32_t gint_t; +typedef uint32_t guint_t; +#elif BLIS_INT_TYPE_SIZE == 64 +typedef int64_t gint_t; +typedef uint64_t guint_t; +#else +typedef signed long int gint_t; +typedef unsigned long int guint_t; +#endif + +// -- Boolean type -- + +// NOTE: bool_t is no longer used and has been replaced with C99's bool type. +//typedef bool bool_t; + +// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we +// define these macros in terms of true and false, respectively, which are +// defined by C99 in stdbool.h. +#ifndef TRUE + #define TRUE true +#endif + +#ifndef FALSE + #define FALSE false +#endif + +// -- Special-purpose integers -- + +// This cpp guard provides a temporary hack to allow libflame +// interoperability with BLIS. +#ifndef _DEFINED_DIM_T +#define _DEFINED_DIM_T +typedef gint_t dim_t; // dimension type +#endif +typedef gint_t inc_t; // increment/stride type +typedef gint_t doff_t; // diagonal offset type +typedef guint_t siz_t; // byte size type +typedef uint32_t objbits_t; // object information bit field + +// -- Real types -- + +// Define the number of floating-point types supported, and the size of the +// largest type. +#define BLIS_NUM_FP_TYPES 4 +#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) + +// There are some places where we need to use sizeof() inside of a C +// preprocessor #if conditional, and so here we define the various sizes +// for those purposes. +#define BLIS_SIZEOF_S 4 // sizeof(float) +#define BLIS_SIZEOF_D 8 // sizeof(double) +#define BLIS_SIZEOF_C 8 // sizeof(scomplex) +#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) + +// -- Complex types -- + +#ifdef BLIS_ENABLE_C99_COMPLEX + + #if __STDC_VERSION__ >= 199901L +#include // skipped + + // Typedef official complex types to BLIS complex type names. + typedef float complex scomplex; + typedef double complex dcomplex; + #else + #error "Configuration requested C99 complex types, but C99 does not appear to be supported." + #endif + +#else // ifndef BLIS_ENABLE_C99_COMPLEX + + // This cpp guard provides a temporary hack to allow libflame + // interoperability with BLIS. + #ifndef _DEFINED_SCOMPLEX + #define _DEFINED_SCOMPLEX + typedef struct + { + float real; + float imag; + } scomplex; + #endif + + // This cpp guard provides a temporary hack to allow libflame + // interoperability with BLIS. + #ifndef _DEFINED_DCOMPLEX + #define _DEFINED_DCOMPLEX + typedef struct + { + double real; + double imag; + } dcomplex; + #endif + +#endif // BLIS_ENABLE_C99_COMPLEX + +// -- Atom type -- + +// Note: atom types are used to hold "bufferless" scalar object values. Note +// that it needs to be as large as the largest possible scalar value we might +// want to hold. Thus, for now, it is a dcomplex. +typedef dcomplex atom_t; + +// -- Fortran-77 types -- + +// Note: These types are typically only used by BLAS compatibility layer, but +// we must define them even when the compatibility layer isn't being built +// because they also occur in bli_slamch() and bli_dlamch(). + +// Define f77_int depending on what size of integer was requested. +#if BLIS_BLAS_INT_TYPE_SIZE == 32 +typedef int32_t f77_int; +#elif BLIS_BLAS_INT_TYPE_SIZE == 64 +typedef int64_t f77_int; +#else +typedef long int f77_int; +#endif + +typedef char f77_char; +typedef float f77_float; +typedef double f77_double; +typedef scomplex f77_scomplex; +typedef dcomplex f77_dcomplex; + +// -- Void function pointer types -- + +// Note: This type should be used in any situation where the address of a +// *function* will be conveyed or stored prior to it being typecast back +// to the correct function type. It does not need to be used when conveying +// or storing the address of *data* (such as an array of float or double). + +//typedef void (*void_fp)( void ); +typedef void* void_fp; + + +// +// -- BLIS info bit field offsets ---------------------------------------------- +// + + + +// info +#define BLIS_DATATYPE_SHIFT 0 +#define BLIS_DOMAIN_SHIFT 0 +#define BLIS_PRECISION_SHIFT 1 +#define BLIS_CONJTRANS_SHIFT 3 +#define BLIS_TRANS_SHIFT 3 +#define BLIS_CONJ_SHIFT 4 +#define BLIS_UPLO_SHIFT 5 +#define BLIS_UPPER_SHIFT 5 +#define BLIS_DIAG_SHIFT 6 +#define BLIS_LOWER_SHIFT 7 +#define BLIS_UNIT_DIAG_SHIFT 8 +#define BLIS_INVERT_DIAG_SHIFT 9 +#define BLIS_TARGET_DT_SHIFT 10 +#define BLIS_TARGET_DOMAIN_SHIFT 10 +#define BLIS_TARGET_PREC_SHIFT 11 +#define BLIS_EXEC_DT_SHIFT 13 +#define BLIS_EXEC_DOMAIN_SHIFT 13 +#define BLIS_EXEC_PREC_SHIFT 14 +#define BLIS_PACK_SCHEMA_SHIFT 16 +#define BLIS_PACK_RC_SHIFT 16 +#define BLIS_PACK_PANEL_SHIFT 17 +#define BLIS_PACK_FORMAT_SHIFT 18 +#define BLIS_PACK_SHIFT 22 +#define BLIS_PACK_REV_IF_UPPER_SHIFT 23 +#define BLIS_PACK_REV_IF_LOWER_SHIFT 24 +#define BLIS_PACK_BUFFER_SHIFT 25 +#define BLIS_STRUC_SHIFT 27 +#define BLIS_COMP_DT_SHIFT 29 +#define BLIS_COMP_DOMAIN_SHIFT 29 +#define BLIS_COMP_PREC_SHIFT 30 + +// info2 +#define BLIS_SCALAR_DT_SHIFT 0 +#define BLIS_SCALAR_DOMAIN_SHIFT 0 +#define BLIS_SCALAR_PREC_SHIFT 1 + +// +// -- BLIS info bit field masks ------------------------------------------------ +// + +// info +#define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) +#define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) +#define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) +#define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) +#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) +#define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) +#define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) +#define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) +#define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) +#define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) +#define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) +#define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) +#define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) +#define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) +#define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) +#define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) +#define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) +#define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) +#define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) +#define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) +#define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) +#define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) +#define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) +#define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) +#define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) +#define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) +#define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) +#define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) +#define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) + +// info2 +#define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) +#define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) +#define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) + + +// +// -- BLIS enumerated type value definitions ----------------------------------- +// + +#define BLIS_BITVAL_REAL 0x0 +#define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT +#define BLIS_BITVAL_SINGLE_PREC 0x0 +#define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT +#define BLIS_BITVAL_FLOAT_TYPE 0x0 +#define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT +#define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT +#define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) +#define BLIS_BITVAL_INT_TYPE 0x04 +#define BLIS_BITVAL_CONST_TYPE 0x05 +#define BLIS_BITVAL_NO_TRANS 0x0 +#define BLIS_BITVAL_TRANS BLIS_TRANS_BIT +#define BLIS_BITVAL_NO_CONJ 0x0 +#define BLIS_BITVAL_CONJ BLIS_CONJ_BIT +#define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) +#define BLIS_BITVAL_ZEROS 0x0 +#define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) +#define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) +#define BLIS_BITVAL_DENSE BLIS_UPLO_BITS +#define BLIS_BITVAL_NONUNIT_DIAG 0x0 +#define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT +#define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT +#define BLIS_BITVAL_NOT_PACKED 0x0 +#define BLIS_BITVAL_4MI ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_3MI ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_4MS ( 0x3 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_3MS ( 0x4 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_1E ( 0x8 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_1R ( 0x9 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) +#define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) +#define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 +#define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT +#define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 +#define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT +#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 0x0 +#define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) +#define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) +#define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) +#define BLIS_BITVAL_GENERAL 0x0 +#define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) +#define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) +#define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) + + +// +// -- BLIS enumerated type definitions ----------------------------------------- +// + +// -- Operational parameter types -- + +typedef enum +{ + BLIS_NO_TRANSPOSE = 0x0, + BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, + BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, + BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS +} trans_t; + +typedef enum +{ + BLIS_NO_CONJUGATE = 0x0, + BLIS_CONJUGATE = BLIS_BITVAL_CONJ +} conj_t; + +typedef enum +{ + BLIS_ZEROS = BLIS_BITVAL_ZEROS, + BLIS_LOWER = BLIS_BITVAL_LOWER, + BLIS_UPPER = BLIS_BITVAL_UPPER, + BLIS_DENSE = BLIS_BITVAL_DENSE +} uplo_t; + +typedef enum +{ + BLIS_LEFT = 0x0, + BLIS_RIGHT +} side_t; + +typedef enum +{ + BLIS_NONUNIT_DIAG = 0x0, + BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG +} diag_t; + +typedef enum +{ + BLIS_NO_INVERT_DIAG = 0x0, + BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG +} invdiag_t; + +typedef enum +{ + BLIS_GENERAL = BLIS_BITVAL_GENERAL, + BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, + BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, + BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR +} struc_t; + + +// -- Data type -- + +typedef enum +{ + BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, + BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, + BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, + BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, + BLIS_INT = BLIS_BITVAL_INT_TYPE, + BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, + BLIS_DT_LO = BLIS_FLOAT, + BLIS_DT_HI = BLIS_DCOMPLEX +} num_t; + +typedef enum +{ + BLIS_REAL = BLIS_BITVAL_REAL, + BLIS_COMPLEX = BLIS_BITVAL_COMPLEX +} dom_t; + +typedef enum +{ + BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, + BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC +} prec_t; + +#endif +// end bli_type_defs.h + +// begin bli_param_map.h +BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) +{ + if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; + else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; + else + { + *blis_side = BLIS_LEFT; + } +} + +BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) +{ + if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; + else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; + else + { + *blis_uplo = BLIS_LOWER; + } +} + +BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) +{ + if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; + else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; + else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; + else + { + *blis_trans = BLIS_NO_TRANSPOSE; + } +} + +BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) +{ + if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; + else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; + else + { + *blis_diag = BLIS_NONUNIT_DIAG; + } +} +// end bli_param_map.h + +// End extern "C" construct block. +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/ltl2inv/CMakeLists.txt b/src/ltl2inv/CMakeLists.txt new file mode 100644 index 00000000..165ea01c --- /dev/null +++ b/src/ltl2inv/CMakeLists.txt @@ -0,0 +1,15 @@ +# include guard +cmake_minimum_required(VERSION 3.0) + +if(${CMAKE_PROJECT_NAME} STREQUAL "Project") + message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.") +endif(${CMAKE_PROJECT_NAME} STREQUAL "Project") + +if("${ARCHITECTURE}" STREQUAL "x86_64") + add_definitions(-DF77_COMPLEX_RET_INTEL) +endif() +include_directories(../pfapack/c_interface) + +add_library(ltl2inv STATIC ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc) +target_compile_definitions(ltl2inv PRIVATE -D_CC_IMPL) + diff --git a/src/ltl2inv/eigen_gemmt.tcc b/src/ltl2inv/eigen_gemmt.tcc new file mode 100644 index 00000000..4e569df9 --- /dev/null +++ b/src/ltl2inv/eigen_gemmt.tcc @@ -0,0 +1,77 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "lapack2eigen.hh" +#include "ilaenv_lauum.hh" + + +template +inline void gemmt(const char &uploC, + const char &transA, + const char &transB, + const T &alpha, + const l2e::le_mat_t &A_, + const l2e::le_mat_t &B_, + const T &beta, + l2e::le_mat_t &C_) +{ + using LProxy = l2e::le_mat_t; + using Matrix = Eigen::Matrix; + const int npanel = ilaenv_lauum(uploC, C_.rows); + + bool conjA = transA == 'C' || transA == 'c'; + bool conjB = transB == 'C' || transB == 'c'; + + const auto &A = (transA == 'T' || transA == 't') ? A_.transpose().to_eigen() : A_.to_eigen(); + const auto &B = (transB == 'T' || transB == 't') ? B_.transpose().to_eigen() : B_.to_eigen(); + auto C = C_.to_eigen(); + + if (uploC == 'L' || uploC == 'l') { + + for (int ipanel = 0; ipanel < A.rows(); ipanel += npanel) { + int sz_panel = std::min(npanel, A.rows() - ipanel); + + // Diagonal. + const auto &Apanel = A(Eigen::seq(ipanel, ipanel+sz_panel-1), Eigen::all); + const auto &Bpanel = B(Eigen::all, Eigen::seq(ipanel, ipanel+sz_panel-1)); + const auto &Cpanel = C(Eigen::seq(ipanel, ipanel+sz_panel-1), + Eigen::seq(ipanel, ipanel+sz_panel-1)); + Matrix Cupdated = Cpanel * beta + Apanel * Bpanel * alpha; + LProxy Cpanel_(Cpanel); + lacpy(uploC, LProxy(Cupdated), Cpanel_); + + // Off-diagonal. + const auto &Aremain = A(Eigen::seq(ipanel+sz_panel, A.rows()-1), Eigen::all); + const auto &Bremain = B(Eigen::all, Eigen::seq(ipanel+sz_panel, B.cols()-1)); + auto Cremain = C(Eigen::seq(ipanel+sz_panel, C.rows()-1), + Eigen::seq(ipanel, ipanel+sz_panel-1)); + Cremain = Cremain * beta + Aremain * Bpanel * alpha; + } + + } else { // (uploC == 'U' || uploC == 'u') + + for (int ipanel = (A.rows()-1) / npanel * npanel; ipanel >= 0; ipanel -= npanel) { + int sz_panel = std::min(npanel, A.rows() - ipanel); + + // Diagonal. + const auto &Apanel = A(Eigen::seq(ipanel, ipanel+sz_panel-1), Eigen::all); + const auto &Bpanel = B(Eigen::all, Eigen::seq(ipanel, ipanel+sz_panel-1)); + const auto &Cpanel = C(Eigen::seq(ipanel, ipanel+sz_panel-1), + Eigen::seq(ipanel, ipanel+sz_panel-1)); + Matrix Cupdated = Cpanel * beta + Apanel * Bpanel * alpha; + LProxy Cpanel_(Cpanel); + lacpy(uploC, LProxy(Cupdated), Cpanel_); + + // Off-diagonal. + const auto &Aremain = A(Eigen::seq(0, ipanel-1), Eigen::all); + const auto &Bremain = B(Eigen::all, Eigen::seq(0, ipanel-1)); + auto Cremain = C(Eigen::seq(0, ipanel-1), + Eigen::seq(ipanel, ipanel+sz_panel-1)); + Cremain = Cremain * beta + Aremain * Bpanel * alpha; + } + + } +} diff --git a/src/ltl2inv/error.hh b/src/ltl2inv/error.hh new file mode 100644 index 00000000..5c0a06e4 --- /dev/null +++ b/src/ltl2inv/error.hh @@ -0,0 +1,42 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include +#include + + +inline void abort(const char *msg) +{ + std::cout << msg << std::endl; + abort(); +} + +inline void abort(const char *msg0, const char *msg1) +{ + std::cout << msg0 << ": " << msg1 << std::endl; + abort(); +} + +#if (defined(_DEBUG) || !defined(SKIP_CHECKS)) + +inline void assert_(const bool cond, const char *msg) +{ + if (!cond) abort(msg); +} + +inline void assert_(const bool cond, const char *msg0, const char *msg1) +{ + if (!cond) abort(msg0, msg1); +} + +#else + +inline void assert_(const bool cond, const char *msg) { } + +inline void assert_(const bool cond, const char *msg0, const char *msg1) { } + +#endif + diff --git a/src/ltl2inv/gemmt2blas.cc b/src/ltl2inv/gemmt2blas.cc new file mode 100644 index 00000000..56cd6fe4 --- /dev/null +++ b/src/ltl2inv/gemmt2blas.cc @@ -0,0 +1,35 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#include "lapack2eigen.hh" + + +#if !defined(_BLAS_HAS_GEMMT) +// Import the customized GEMMT impl. to the BLAS API. +// Required by Pfapack77. +#define BLAGEN_MAC(cctype, ctype, cchar) \ + extern "C" \ + void cchar##gemmt_(const char *uplo_, \ + const char *transa_, \ + const char *transb_, \ + const int *m, const int *k, \ + const ctype *alpha, \ + ctype *a, const int *lda, \ + ctype *b, const int *ldb, \ + const ctype *beta, ctype *c, const int *ldc) \ + { \ + bool tA = (*transa_ == 'T' || *transa_ == 't'); \ + bool tB = (*transb_ == 'T' || *transb_ == 't'); \ + l2e::le_mat_t A(tA ? *k : *m, tA ? *m : *k, 1, *lda, (cctype *)a); \ + l2e::le_mat_t B(tB ? *k : *m, tB ? *m : *k, 1, *ldb, (cctype *)b); \ + l2e::le_mat_t C(*m, *m, 1, *ldc, (cctype *)c); \ + gemmt(*uplo_, *transa_, *transb_, *(cctype *)alpha, A, B, *(cctype *)beta, C); \ + } +BLAGEN_MAC( float, float, s ) +BLAGEN_MAC( double, double, d ) +BLAGEN_MAC( ccscmplx, scomplex, c ) +BLAGEN_MAC( ccdcmplx, dcomplex, z ) +#undef BLAGEN_MAC +#endif diff --git a/src/ltl2inv/ilaenv.h b/src/ltl2inv/ilaenv.h new file mode 100644 index 00000000..add0a1ed --- /dev/null +++ b/src/ltl2inv/ilaenv.h @@ -0,0 +1,15 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#ifdef __cplusplus +extern "C" { +#endif + +int ilaenv_(const int *ispec, const char *name, const char *opts, const int *n1, const int *n2, const int *n3, const int *n4); + +#ifdef __cplusplus +} +#endif diff --git a/src/ltl2inv/ilaenv_lauum.cc b/src/ltl2inv/ilaenv_lauum.cc new file mode 100644 index 00000000..6dcb8b90 --- /dev/null +++ b/src/ltl2inv/ilaenv_lauum.cc @@ -0,0 +1,25 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#include "ilaenv.h" +#include "ilaenv_lauum.hh" +#include +using ccscmplx = std::complex; +using ccdcmplx = std::complex; + + +#define EXPANDMAC(cctype, name) \ +template <> int ilaenv_lauum(const char uplo, const int n) \ +{ \ + int ispec = 1; \ + int dummy = 0; \ + return ilaenv_(&ispec, #name, &uplo, &n, &dummy, &dummy, &dummy); \ +} +EXPANDMAC( float, SLAUUM ) +EXPANDMAC( double, DLAUUM ) +EXPANDMAC( ccscmplx, CLAUUM ) +EXPANDMAC( ccdcmplx, ZLAUUM ) +#undef EXPANDMAC + diff --git a/src/ltl2inv/ilaenv_lauum.hh b/src/ltl2inv/ilaenv_lauum.hh new file mode 100644 index 00000000..46a59b0f --- /dev/null +++ b/src/ltl2inv/ilaenv_lauum.hh @@ -0,0 +1,10 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once + +template +int ilaenv_lauum(const char uplo, const int n); + diff --git a/src/ltl2inv/invert.tcc b/src/ltl2inv/invert.tcc new file mode 100644 index 00000000..ba3bfd8a --- /dev/null +++ b/src/ltl2inv/invert.tcc @@ -0,0 +1,144 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "lapack2eigen.hh" +#include "trmmt.tcc" +#include "error.hh" + +template +void sktdsmx(const Vec &vT, const Mat &B, Mat_ &C) +{ + int n = B.rows(); + assert_(B.cols() == n && C.cols() == n && C.rows() == n, "sktdsmx: Dimension mismatch."); + + for (int j = 0; j < n; ++j) + C(1, j) = B(0, j) / -vT[0]; + for (int i = 2; i < n; i += 2) + for (int j = 0; j < n; ++j) + C(i+1, j) = (B(i, j) - C(i-1, j) * vT[i-1]) / -vT[i]; + + for (int j = 0; j < n; ++j) + C(n-2, j) = B(n-1, j) / vT[n-2]; + for (int i = n-3; i >= 0; i -= 2) + for (int j = 0; j < n; ++j) + C(i-1, j) = (B(i, j) + C(i+1, j) * vT[i]) / vT[i-1]; +} + +template +void ltl2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M) +{ + using namespace Eigen; + using T = typename Mat::Scalar; + int n = A.rows(); + int npanel_trmm = ilaenv_lauum('L', n); + int full = npanel_trmm <= 1 || npanel_trmm >= n; + + // Set M. + for (int j = 0; j < n; ++j) + for (int i = 0; i < n; ++i) + M(i, j) = T(!(i - j)); + + l2e::le_mat_t A_(A); + l2e::le_mat_t M_(M); + l2e::le_mat_t A_Lx(A(seq(1, n-1), seq(0, n-2))); + l2e::le_mat_t A_L(A(seq(2, n-1), seq(0, n-3))); + l2e::le_mat_t M_L(M(seq(2, n-1), seq(1, n-2))); + + trtri('L', 'U', A_Lx); + lacpy('L', A_L, M_L); + + for (int i = 0; i < n-1; ++i) + vT[i] = A(i+1, i); + sktdsmx(vT, M, A); + + if (!full) { + trmmt('L', 'U', T(1.0), M, A); + for (int j = 0; j < n; ++j) { + A(j, j) = 0.0; + for (int i = 0; i < j; ++i) + A(i, j) = -A(j, i); + } + } + + // In-place permute columns. + for (int j = n-1; j >= 0; --j) + if (iPiv[j]-1 != j) + A(Eigen::all, j).swap(A(Eigen::all, iPiv[j] - 1)); + + if (full) + A = M.template triangularView().transpose() * A; + + // In-place permute rows. + for (int i = n-1; i >= 0; --i) + if (iPiv[i]-1 != i) + A(i, Eigen::all).swap(A(iPiv[i] - 1, Eigen::all)); +} + +template +void utu2inv(Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M) +{ + using namespace Eigen; + using T = typename Mat::Scalar; + int n = A.rows(); + int npanel_trmm = ilaenv_lauum('U', n); + int full = npanel_trmm <= 1 || npanel_trmm >= n; + + // Set M. + for (int j = 0; j < n; ++j) + for (int i = 0; i < n; ++i) + M(i, j) = T(!(i - j)); + + l2e::le_mat_t A_(A); + l2e::le_mat_t M_(M); + l2e::le_mat_t A_Ux(A(seq(0, n-2), seq(1, n-1))); + l2e::le_mat_t A_U(A(seq(0, n-3), seq(2, n-1))); + l2e::le_mat_t M_U(M(seq(0, n-3), seq(1, n-2))); + + trtri('U', 'U', A_Ux); + lacpy('U', A_U, M_U); + + for (int i = 0; i < n-1; ++i) + vT[i] = -A(i, i+1); + sktdsmx(vT, M, A); + + if (!full) { + trmmt('U', 'U', T(1.0), M, A); + for (int j = 0; j < n; ++j) { + A(j, j) = 0.0; + for (int i = j + 1; i < n; ++i) + A(i, j) = -A(j, i); + } + } + + // In-place permute columns. + for (int j = 0; j < n; ++j) + if (iPiv[j]-1 != j) + A(Eigen::all, j).swap(A(Eigen::all, iPiv[j]-1)); + + if (full) + A = M.template triangularView().transpose() * A; + + // In-place permute rows. + for (int i = 0; i < n; ++i) + if (iPiv[i]-1 != i) + A(i, Eigen::all).swap(A(iPiv[i]-1, Eigen::all)); +} + +template +void ltl2inv(const char uplo, Mat &A, const iVec &iPiv, Vec &vT, Mat_ &M) +{ + switch (uplo) { + case 'l': + case 'L': + ltl2inv(A, iPiv, vT, M); break; + case 'u': + case 'U': + utu2inv(A, iPiv, vT, M); break; + default: + break; + } +} + diff --git a/src/ltl2inv/lapack2eigen.hh b/src/ltl2inv/lapack2eigen.hh new file mode 100644 index 00000000..1d850068 --- /dev/null +++ b/src/ltl2inv/lapack2eigen.hh @@ -0,0 +1,272 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include +#include +#ifndef EIGEN_USE_LAPACKE // This lets Eigen/Dense include LAPACK definitions. +#include +#endif +#ifdef _BLIS +#define BLIS_DISABLE_BLAS // Screen BLAS definition even if BLIS' built with them. +#include "blis/blis.h" +#else +using Eigen::scomplex; +using Eigen::dcomplex; +#endif +#include "error.hh" +#include "fortran_pfapack.h" +using f77_int = lapack_int; +using ccscmplx = std::complex; +using ccdcmplx = std::complex; + +namespace l2e +{ + +template +struct le_mat_t +{ + const f77_int rows, cols; ///< Matrix size. + const f77_int rs, cs; ///< Strides. + + T *const data; + +protected: + template + static f77_int _rs(const Mat &mat) + { + f77_int rs = 0; + if (mat.rows() > 1) + rs = &(mat(1, 0)) - &(mat(0, 0)); + return rs; + } + template + static f77_int _cs(const Mat &mat) + { + f77_int cs = 0; + if (mat.cols() > 1) + cs = &(mat(0, 1)) - &(mat(0, 0)); + return cs; + } + +public: + le_mat_t(const f77_int &rows_, const f77_int &cols_, + const f77_int &rs_, const f77_int &cs_, T *const data_) + : rows(rows_), cols(cols_), rs(rs_), cs(cs_), data(data_) { } + + template + le_mat_t(Eigen::Matrix &mat) + : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { } + + template + le_mat_t(Eigen::Block mat) // TODO: Check for data copying. + : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { } + + template + le_mat_t(Eigen::Map, Alignment, Stride> &mat) + : rows(mat.rows()), cols(mat.cols()), rs(_rs(mat)), cs(_cs(mat)), data(&(mat(0, 0))) { } + + template + Eigen::Map, Alignment, + Eigen::Stride > to_eigen() const + { + using matrix_t = Eigen::Matrix; + using stride_t = Eigen::Stride; + return Eigen::Map(data, rows, cols, stride_t(cs, rs)); + } + + le_mat_t transpose() const + { return le_mat_t(cols, rows, cs, rs, data); } +}; + +} + + +// ---- BLAS/LAPACK routines used in our settings. ----- + + +// [LAPACK] lacpy +#define BLALINK_PARAMS(T) const char &uploA, \ + const l2e::le_mat_t &A, \ + l2e::le_mat_t &B +template +inline void lacpy( BLALINK_PARAMS(T) ); +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline void lacpy( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1 && B.rs == 1, "BLAS requires column-major."); \ + /* Eigen-vendored lapack.h misses const qualifier. Do force conversion here. */ \ + cchar##lacpy_((char *)&uploA, (f77_int *)&A.rows, (f77_int *)&A.cols, \ + (ctype *)A.data, (f77_int *)&A.cs, \ + (ctype *)B.data, (f77_int *)&B.cs); \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, _Complex float, c ) +BLALINK_MAC( ccdcmplx, _Complex double, z ) +#undef BLALINK_MAC +#undef BLALINK_PARAMS + + +// [LAPACK] trtri +#define BLALINK_PARAMS(T) const char &uploA, \ + const char &diagA, \ + l2e::le_mat_t &A +template +inline f77_int trtri( BLALINK_PARAMS(T) ); +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline f77_int trtri( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1, "BLAS requires column-major."); \ + f77_int info; \ + /* Eigen-vendored lapack.h misses const qualifier. Do force conversion here. */ \ + cchar##trtri_((char *)&uploA, (char *)&diagA, (f77_int *)&A.rows, \ + (ctype *)A.data, (f77_int *)&A.cs, &info); \ + return info; \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, _Complex float, c ) +BLALINK_MAC( ccdcmplx, _Complex double, z ) +#undef BLALINK_MAC +#undef BLALINK_PARAMS + + +// [PFAPACK] sktrf +#define BLALINK_PARAMS(T) const char &uplo, \ + const char &mode, \ + l2e::le_mat_t &A, \ + f77_int *ipiv, \ + T *work, f77_int lwork +template +inline f77_int sktrf( BLALINK_PARAMS(T) ); +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline f77_int sktrf( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1, "BLAS requires column-major."); \ + f77_int info; \ + cchar##sktrf_(&uplo, &mode, &A.rows, (ctype *)A.data, &A.cs, ipiv, work, &lwork, &info); \ + return info; \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, floatcmplx, c ) +BLALINK_MAC( ccdcmplx, doublecmplx, z ) +#undef BLALINK_MAC +#undef BLALINK_PARAMS + + +// [PFAPACK] skpfa +#define BLALINK_PARAMS(T) const char &uplo, \ + const char &mthd, \ + l2e::le_mat_t &A, \ + T *pfa, f77_int *ipiv, \ + T *work, f77_int lwork +template +inline f77_int skpfa( BLALINK_PARAMS(T) ); +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline f77_int skpfa( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1, "BLAS requires column-major."); \ + int info; \ + cchar##skpfa_(&uplo, &mthd, &A.rows, (ctype *)A.data, &A.cs, pfa, ipiv, work, &lwork, &info); \ + return info; \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +#undef BLALINK_MAC +#define BLALINK_MAC(T, ctype, rtype, cchar) \ + template <> inline f77_int skpfa( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1, "BLAS requires column-major."); \ + rtype *rwork = 0; \ + if (mthd != 'p' && mthd != 'P') \ + rwork = new rtype[lwork]; \ + int info; \ + cchar##skpfa_(&uplo, &mthd, &A.rows, A.data, &A.cs, pfa, ipiv, work, &lwork, rwork, &info); \ + delete[] rwork; \ + return info; \ + } +BLALINK_MAC( ccscmplx, floatcmplx, float, c ) +BLALINK_MAC( ccdcmplx, doublecmplx, double, z ) +#undef BLALINK_MAC +#undef BLALINK_PARAMS + + +// [Non-standard] gemmt +#define BLALINK_PARAMS(T) const char &uploC, \ + const char &transA, \ + const char &transB, \ + const T &alpha, \ + const l2e::le_mat_t &A, \ + const l2e::le_mat_t &B, \ + const T &beta, \ + l2e::le_mat_t &C +template +inline void gemmt( BLALINK_PARAMS(T) ); +#if !defined(_BLIS) && !defined(_BLAS_HAS_GEMMT) + #include "eigen_gemmt.tcc" +#else +#ifdef _BLIS +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline void gemmt( BLALINK_PARAMS(T) ) \ + { \ + uplo_t bli_uploC; \ + trans_t bli_transA, bli_transB; \ + bli_param_map_netlib_to_blis_uplo( uploC, &bli_uploC ); \ + bli_param_map_netlib_to_blis_trans( transA, &bli_transA ); \ + bli_param_map_netlib_to_blis_trans( transB, &bli_transB ); \ + bli_##cchar##gemmt(bli_uploC, \ + bli_transA, \ + bli_transB, \ + (transA == 'T' || transA == 't') ? A.cols : A.rows, \ + (transA == 'T' || transA == 't') ? A.rows : A.cols, \ + (const ctype *)&alpha, \ + (const ctype *)A.data, A.rs, A.cs, \ + (const ctype *)B.data, B.rs, B.cs, \ + (const ctype *)&beta, \ + (ctype *)C.data, C.rs, C.cs); \ + } +#else + // ifdef _BLAS_HAS_GEMMT +#define BLALINK_MAC(T, ctype, cchar) \ + template <> inline void gemmt( BLALINK_PARAMS(T) ) \ + { \ + assert_(A.rs == 1 && B.rs == 1 && C.rs == 1, "BLAS requires column-major."); \ + cchar##gemmt_(&uploC, \ + &transA, &transB, \ + (transA == 'T' || transA == 't') ? &A.cols : &A.rows, \ + (transA == 'T' || transA == 't') ? &A.rows : &A.cols, \ + (const ctype *)&alpha, \ + (const ctype *)A.data, &A.cs, \ + (const ctype *)B.data, &B.cs, \ + (const ctype *)&beta, \ + (ctype *)C.data, &C.cs); \ + } +extern "C" { +#define GEMMT_F77_PARAMS(T) const char *uploC, \ + const char *transA, \ + const char *transB, \ + const f77_int *m, \ + const f77_int *k, \ + const T *alpha, \ + const T *A, const f77_int *ldA, \ + const T *B, const f77_int *ldB, \ + const T *beta, \ + T *C, const f77_int *ldC +void sgemmt_( GEMMT_F77_PARAMS(float) ); +void dgemmt_( GEMMT_F77_PARAMS(double) ); +void cgemmt_( GEMMT_F77_PARAMS(scomplex) ); +void zgemmt_( GEMMT_F77_PARAMS(dcomplex) ); +#undef GEMMT_F77_PARAMS +} +#endif +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC +#endif +#undef BLALINK_PARAMS diff --git a/src/ltl2inv/ltl2inv.cc b/src/ltl2inv/ltl2inv.cc new file mode 100644 index 00000000..e7ef84fc --- /dev/null +++ b/src/ltl2inv/ltl2inv.cc @@ -0,0 +1,51 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#include "pfaffian.tcc" +#include "invert.tcc" + + + +extern "C" { + +#define GENDEF(schema, cchar, ctype, cctype) \ +void schema ##2inv_## cchar(int n, cctype *A_, int ldA, int *iPiv, cctype *vT, cctype *M_, int ldM) \ +{ \ + using namespace Eigen; \ + Map, 0, OuterStride<> > A(A_, n, n, OuterStride<>(ldA)); \ + Map, 0, OuterStride<> > M(M_, n, n, OuterStride<>(ldM)); \ + Map Piv(iPiv, n); \ + Map> T(vT, n); \ + schema ##2inv(A, Piv, vT, M); \ +} +GENDEF(ltl, s, float, float) +GENDEF(ltl, d, double, double) +GENDEF(ltl, c, ccscmplx, ccscmplx) +GENDEF(ltl, z, ccdcmplx, ccdcmplx) +GENDEF(utu, s, float, float) +GENDEF(utu, d, double, double) +GENDEF(utu, c, ccscmplx, ccscmplx) +GENDEF(utu, z, ccdcmplx, ccdcmplx) + +#undef GENDEF +#define GENDEF(schema, cchar, ctype, cctype) \ +void schema ##2pfa_## cchar(int n, cctype *A_, int ldA, int *iPiv, cctype *Pfa) \ +{ \ + using namespace Eigen; \ + Map, 0, OuterStride<> > A(A_, n, n, OuterStride<>(ldA)); \ + Map Piv(iPiv, n); \ + *Pfa = schema ##2pfa(A, Piv); \ +} +GENDEF(ltl, s, float, float) +GENDEF(ltl, d, double, double) +GENDEF(ltl, c, ccscmplx, ccscmplx) +GENDEF(ltl, z, ccdcmplx, ccdcmplx) +GENDEF(utu, s, float, float) +GENDEF(utu, d, double, double) +GENDEF(utu, c, ccscmplx, ccscmplx) +GENDEF(utu, z, ccdcmplx, ccdcmplx) + +} + diff --git a/src/ltl2inv/makefile_ltl2inv b/src/ltl2inv/makefile_ltl2inv new file mode 100644 index 00000000..d57c5fcd --- /dev/null +++ b/src/ltl2inv/makefile_ltl2inv @@ -0,0 +1,28 @@ +include ../make.sys + +OBJ = ilaenv_lauum.o gemmt2blas.o ltl2inv.o +SRC = ilaenv_lauum.cc gemmt2blas.cc ltl2inv.cc +HDR = ilaenv.h ilaenv_lauum.hh lapack2eigen.hh error.hh eigen_gemmt.tcc invert.tcc pfaffian.tcc trmmt.tcc + +ltl2inv.a: $(OBJ) + $(AR) rvu $@ $(OBJ) + +clean: + rm -f $(OBJ) + +SUFFIXES: .o .c + +.c.o: $(HDR) + $(CC) $(OPTION) $(CFLAGS) -c $< -o $@ \ + -Wno-incompatible-pointer-types-discards-qualifiers \ + -I../pfapack/c_interface \ + -I$(BLIS_ROOT)/include \ + -I$(BLIS_ROOT)/include/blis + +SUFFIXES: .o .cc + +.cc.o: $(HDR) + $(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \ + -I../pfapack/c_interface \ + -I$(BLIS_ROOT)/include \ + -I$(BLIS_ROOT)/include/blis diff --git a/src/ltl2inv/pfaffian.tcc b/src/ltl2inv/pfaffian.tcc new file mode 100644 index 00000000..51777f0b --- /dev/null +++ b/src/ltl2inv/pfaffian.tcc @@ -0,0 +1,88 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "lapack2eigen.hh" + + +template +struct SignedLogScale +{ + using T = T_; + + T_ val; + int sgn; +}; + + +template +SignedLogScale utu2logpfa(const Mat &A, const Vec &iPiv) +{ + SignedLogScale logpfa{ 0.0, 1 }; + + for (int i = 0; i < A.rows(); i += 2) { + logpfa.val += std::log(std::abs(Amp( A(i, i+1) ))); + logpfa.sgn *= A(i, i+1) > 0 ? 1 : -1; + + if (iPiv(i) -1 != i ) logpfa.sgn *= -1; + if (iPiv(i+1)-1 != i+1) logpfa.sgn *= -1; + } + return logpfa; +} + +// Additional parameter for return type deduction. +template +Ret utu2logpfa(const Mat &A, const Vec &iPiv, Ret &ret) +{ return utu2logpfa(A, iPiv); } + + + +template +Amp ltl2pfa(const Mat &A, const Vec &iPiv) +{ + Amp pfa = 1.0; + + for (int i = 0; i < A.rows(); i += 2) { + pfa *= -A(i+1, i); + + if (iPiv(i) -1 != i ) pfa = -pfa; + if (iPiv(i+1)-1 != i+1) pfa = -pfa; + } + return pfa; +} + +template +Amp utu2pfa(const Mat &A, const Vec &iPiv) +{ + Amp pfa = 1.0; + + for (int i = 0; i < A.rows(); i += 2) { + pfa *= A(i, i+1); + + if (iPiv(i) -1 != i ) pfa = -pfa; + if (iPiv(i+1)-1 != i+1) pfa = -pfa; + } + return pfa; +} + +template +Amp ltl2pfa(const char &uplo, const Mat &A, const Vec &iPiv) +{ + switch (uplo) { + case 'l': + case 'L': + return ltl2pfa(A, iPiv); + case 'u': + case 'U': + return utu2pfa(A, iPiv); + default: + return 0.0; + } +} + diff --git a/src/ltl2inv/trmmt.tcc b/src/ltl2inv/trmmt.tcc new file mode 100644 index 00000000..db60335a --- /dev/null +++ b/src/ltl2inv/trmmt.tcc @@ -0,0 +1,43 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "lapack2eigen.hh" +#include "ilaenv_lauum.hh" + + +template +inline void trmmt(const char &uploAB, const char &diagA, const typename Mat::Scalar &alpha, Mat &A, Mat_ &B) +{ + using namespace Eigen; + using T = typename Mat::Scalar; + int n = A.rows(); + int npanel = ilaenv_lauum(uploAB, n); + + if (uploAB == 'U' || uploAB == 'u') { + // A upper => A' lower -> write to UPPER part of B. + for (int j = 0; j < n; j += npanel) { + int nloc = std::min(npanel, n - j); + + auto Bslice = B(seq(0, j+nloc-1), seq(j, j+nloc-1)); + if (diagA == 'U' || diagA == 'u') + Bslice = A(seq(0, j+nloc-1), seq(0, j+nloc-1)).template triangularView().transpose() * Bslice * alpha; + else + Bslice = A(seq(0, j+nloc-1), seq(0, j+nloc-1)).template triangularView().transpose() * Bslice * alpha; + } + } else { + // A lower => A' upper -> write to LOWER part of B. + for (int j = 0; j < n; j += npanel) { + int nloc = std::min(npanel, n - j); + + auto Bslice = B(seq(j, n-1), seq(j, j+nloc-1)); + if (diagA == 'U' || diagA == 'u') + Bslice = A(seq(j, n-1), seq(j, n-1)).template triangularView().transpose() * Bslice * alpha; + else + Bslice = A(seq(j, n-1), seq(j, n-1)).template triangularView().transpose() * Bslice * alpha; + } + } +} + diff --git a/src/mVMC/CMakeLists.txt b/src/mVMC/CMakeLists.txt index f80471ca..dd898044 100644 --- a/src/mVMC/CMakeLists.txt +++ b/src/mVMC/CMakeLists.txt @@ -27,9 +27,11 @@ target_link_libraries(vmcdry.out StdFace m) add_executable(vmc.out ${SOURCES_vmcmain} ${SOURCES_sfmt}) target_link_libraries(vmc.out StdFace) target_link_libraries(vmc.out pfapack) -if(PFAFFIAN_BLOCKED) - target_link_libraries(vmc.out pfupdates blis pthread) -endif(PFAFFIAN_BLOCKED) +target_link_libraries(vmc.out ltl2inv) +target_link_libraries(vmc.out pfupdates) +if(Require_BLIS) + target_link_libraries(vmc.out blis pthread) +endif(Require_BLIS) target_link_libraries(vmc.out ${LAPACK_LIBRARIES} m) if(USE_SCALAPACK) diff --git a/src/mVMC/calgrn.c b/src/mVMC/calgrn.c index 8d13215b..e81503da 100644 --- a/src/mVMC/calgrn.c +++ b/src/mVMC/calgrn.c @@ -38,6 +38,17 @@ void CalculateGreenFunc(const double w, const double complex ip, int *eleIdx, in int *myEleIdx, *myEleNum, *myProjCntNew; double complex *myBuffer; + int *lazy_info = malloc( sizeof(int) * NCisAjsCktAltDC * 2); + int *lazy_rsi = malloc(2 * sizeof(int) * NCisAjsCktAltDC); + int *lazy_msj = malloc(2 * sizeof(int) * NCisAjsCktAltDC); + double complex *lazy_ip = malloc(sizeof(double complex) * NCisAjsCktAltDC); + double complex *lazy_pfa = malloc(sizeof(double complex) * NCisAjsCktAltDC * NQPFull); + memset(lazy_info, 0, sizeof(int) * NCisAjsCktAltDC); + memset(lazy_info + NCisAjsCktAltDC, -1, sizeof(int) * NCisAjsCktAltDC); + + for (int mi=0; mi/ */ /* buffer size = NQPFull+2*Nsize */ -double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl, +double complex GreenFunc2_(const int ri, const int rj, const int rk, const int rl, const int s, const int t, const double complex ip, int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt, - int *projCntNew, double complex *buffer) { + int *projCntNew, double complex *buffer, + int *lazy_info, int *lazy_rsi, int *lazy_msj) { double complex z; int mj,msj,ml,mtl; int rsi,rsj,rtk,rtl; @@ -145,8 +146,17 @@ double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl z = ProjRatio(projCntNew,eleProjCnt); /* calculate Pfaffian */ - CalculateNewPfMTwo_fcmp(ml, t, mj, s, pfMNew, eleIdx, 0, NQPFull, bufV); - z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + if (!lazy_info) { + CalculateNewPfMTwo_fcmp(ml, t, mj, s, pfMNew, eleIdx, 0, NQPFull, bufV); + z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + } else { + // Save invocation hook instead of calling. + lazy_msj[0] = msj; //< to edit to ri, revert to rj + lazy_msj[1] = mtl; //< to edit to rk, revert to rl + lazy_rsi[0] = rsi; + lazy_rsi[1] = rtk; + lazy_info[0] = 1; //< "is delayed" flag.. + } /* revert hopping */ eleIdx[mtl] = rl; @@ -159,6 +169,12 @@ double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl return conj(z/ip);//TBC } +double complex GreenFunc2(const int ri, const int rj, const int rk, const int rl, + const int s, const int t, const double complex ip, + int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt, + int *projCntNew, double complex *buffer) { + return GreenFunc2_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, projCntNew, buffer, 0, 0, 0); +} // ignore GreenFuncN: to be added @@ -392,12 +408,7 @@ double complex calculateNewPfMN_child(const int qpidx, const int n, const int *m /* calculate Pf M */ //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); //M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); //TBC -#ifdef _pfaffine - info = 1; // Skip inverse. - M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info); -#else M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); -#endif sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0; return sgn * pfaff * PfM[qpidx]; diff --git a/src/mVMC/locgrn_fsz.c b/src/mVMC/locgrn_fsz.c index a686906f..6ca1bb2d 100644 --- a/src/mVMC/locgrn_fsz.c +++ b/src/mVMC/locgrn_fsz.c @@ -125,10 +125,11 @@ double complex GreenFunc1_fsz2(const int ri, const int rj, const int s,const int /* Calculate 2-body Green function / */ /* buffer size = NQPFull+2*Nsize */ -double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const int rl, +double complex GreenFunc2_fsz_(const int ri, const int rj, const int rk, const int rl, const int s, const int t, const double complex ip, int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn, - int *projCntNew, double complex *buffer) { + int *projCntNew, double complex *buffer, + int *lazy_info, int *lazy_rsi, int *lazy_msj) { double complex z; int mj,msj,ml,mtl; int rsi,rsj,rtk,rtl; @@ -199,8 +200,17 @@ double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const in z = ProjRatio(projCntNew,eleProjCnt); /* calculate Pfaffian */ - CalculateNewPfMTwo_fsz(ml, t, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); - z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + if (!lazy_info) { + CalculateNewPfMTwo_fsz(ml, t, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); + z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + } else { + // Save invocation hook instead of calling. + lazy_msj[0] = msj; //< to edit to ri, revert to rj + lazy_msj[1] = mtl; //< to edit to rk, revert to rl + lazy_rsi[0] = rsi; + lazy_rsi[1] = rtk; + lazy_info[0] = 1; //< "is delayed" flag.. + } /* revert hopping */ eleIdx[mtl] = rl; @@ -215,12 +225,21 @@ double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const in return conj(z/ip);//TBC } +double complex GreenFunc2_fsz(const int ri, const int rj, const int rk, const int rl, + const int s, const int t, const double complex ip, + int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn, + int *projCntNew, double complex *buffer) { + return GreenFunc2_fsz_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, eleSpn, + projCntNew, buffer, 0, 0, 0); +} + /* Calculate 2-body Green function / */ /* buffer size = NQPFull+2*Nsize */ -double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const int rl, +double complex GreenFunc2_fsz2_(const int ri, const int rj, const int rk, const int rl, const int s, const int t,const int u,const int v, const double complex ip, int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn, - int *projCntNew, double complex *buffer) { + int *projCntNew, double complex *buffer, + int *lazy_info, int *lazy_rsi, int *lazy_msj) { double complex z; //int mj,msj,ml,mtl; int mj,ml; // mj: (rj,t) -> (ri,s) ml:(rl,v) -> (rk,u) @@ -325,8 +344,16 @@ double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const i z = ProjRatio(projCntNew,eleProjCnt); /* calculate Pfaffian */ - CalculateNewPfMTwo_fsz(ml, u, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); // ml -> rk,u; mj -> ri,s - z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + if (!lazy_info) { + CalculateNewPfMTwo_fsz(ml, u, mj, s, pfMNew, eleIdx,eleSpn, 0, NQPFull, bufV); // ml -> rk,u; mj -> ri,s + z *= CalculateIP_fcmp(pfMNew, 0, NQPFull, MPI_COMM_SELF); + } else { + lazy_msj[0] = mj; + lazy_msj[1] = ml; + lazy_rsi[0] = XI; + lazy_rsi[1] = XK; + lazy_info[0] = 1; + } /* revert hopping */ eleIdx[ml] = rl; // ml : (rl,v) @@ -342,6 +369,13 @@ double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const i } +double complex GreenFunc2_fsz2(const int ri, const int rj, const int rk, const int rl, + const int s, const int t,const int u,const int v, const double complex ip, + int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt,int *eleSpn, + int *projCntNew, double complex *buffer) { + return GreenFunc2_fsz2_(ri, rj, rk, rl, s, t, u, v, ip, + eleIdx, eleCfg, eleNum, eleProjCnt, eleSpn, projCntNew, buffer, 0, 0, 0); +} // ignore GreenFuncN: to be added diff --git a/src/mVMC/locgrn_real.c b/src/mVMC/locgrn_real.c index 4991a54e..4c6a0e1f 100644 --- a/src/mVMC/locgrn_real.c +++ b/src/mVMC/locgrn_real.c @@ -77,10 +77,11 @@ double GreenFunc1_real(const int ri, const int rj, const int s, const double ip /* Calculate 2-body Green function / */ /* buffer size = NQPFull+2*Nsize */ -double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl, +double GreenFunc2_real_(const int ri, const int rj, const int rk, const int rl, const int s, const int t, const double ip, int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt, - int *projCntNew, double *buffer) { + int *projCntNew, double *buffer, + int *lazy_info, int *lazy_rsi, int *lazy_msj) { double z; int mj,msj,ml,mtl; int rsi,rsj,rtk,rtl; @@ -149,8 +150,17 @@ double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl, z = ProjRatio(projCntNew,eleProjCnt); /* calculate Pfaffian */ - CalculateNewPfMTwo_real(ml, t, mj, s, pfMNew_real, eleIdx, 0, NQPFull, bufV); - z *= CalculateIP_real(pfMNew_real, 0, NQPFull, MPI_COMM_SELF); + if (!lazy_info) { + CalculateNewPfMTwo_real(ml, t, mj, s, pfMNew_real, eleIdx, 0, NQPFull, bufV); + z *= CalculateIP_real(pfMNew_real, 0, NQPFull, MPI_COMM_SELF); + } else { + // Save invocation hook instead of calling. + lazy_msj[0] = msj; //< to edit to ri, revert to rj + lazy_msj[1] = mtl; //< to edit to rk, revert to rl + lazy_rsi[0] = rsi; + lazy_rsi[1] = rtk; + lazy_info[0] = 1; //< "is delayed" flag.. + } /* revert hopping */ eleIdx[mtl] = rl; @@ -163,6 +173,13 @@ double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl, return z/ip;//TBC } +double GreenFunc2_real(const int ri, const int rj, const int rk, const int rl, + const int s, const int t, const double ip, + int *eleIdx, const int *eleCfg, int *eleNum, const int *eleProjCnt, + int *projCntNew, double *buffer) { + return GreenFunc2_real_(ri, rj, rk, rl, s, t, ip, eleIdx, eleCfg, eleNum, eleProjCnt, projCntNew, buffer, 0, 0, 0); +} + /// Calculate n-body Green function /// @@ -397,9 +414,6 @@ double calculateNewPfMN_real_child(const int qpidx, const int n, const int *msa, } /* calculate Pf M */ - // TODO: Maybe block updating is faster than recalculation of Pfaffian. - // Maybe I'll also give support for this in Pfaffine. - info = 1; // Skip inverse. M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0; diff --git a/src/mVMC/makefile_src b/src/mVMC/makefile_src index 52ac8076..bf9eae6f 100644 --- a/src/mVMC/makefile_src +++ b/src/mVMC/makefile_src @@ -1,8 +1,8 @@ include ../make.sys PFAPACK = ../pfapack/fortran/libpfapack.a -PFAFFINE = ../pfaffine/src/libpfaffine.a PFUPDATES = ../pfupdates/pfupdates.o +LTL2INV = ../ltl2inv/ltl2inv.a SFMT = ../sfmt/SFMT.o STDFACE = ../StdFace/src/libStdFace.a OPTION = -D_mpi_use @@ -19,7 +19,7 @@ OBJS = \ splitloop.o \ vmcmain.o \ $(PFAPACK) $(SFMT) $(STDFACE) \ - $(PFUPDATES) $(PFAFFINE) + $(PFUPDATES) $(LTL2INV) SOURCES = \ average.c \ @@ -107,7 +107,7 @@ HEADERS = \ all : $(MAKE) -C ../pfapack/fortran libpfapack.a - $(MAKE) -C ../pfaffine/src libpfaffine.a + $(MAKE) -C ../ltl2inv -f makefile_ltl2inv $(MAKE) -C ../pfupdates -f makefile_pfupdates $(MAKE) -C ../sfmt -f makefile_sfmt $(MAKE) -C ../StdFace/src -f makefile_StdFace libStdFace.a @@ -130,7 +130,7 @@ clean : rm -f *.o vmc.out vmcdry.out $(MAKE) -C ../sfmt -f makefile_sfmt clean $(MAKE) -C ../pfapack/fortran -f makefile clean - $(MAKE) -C ../pfaffine -f makefile clean + $(MAKE) -C ../ltl2inv -f makefile_ltl2inv clean $(MAKE) -C ../pfupdates -f makefile_pfupdates clean $(MAKE) -C ../StdFace/src -f makefile_StdFace clean $(MAKE) -C ../ComplexUHF -f makefile_uhf clean diff --git a/src/mVMC/matrix.c b/src/mVMC/matrix.c index a86330ca..646c8f18 100644 --- a/src/mVMC/matrix.c +++ b/src/mVMC/matrix.c @@ -32,8 +32,6 @@ along with this program. If not, see http://www.gnu.org/licenses/. #include "./include/global.h" #include "workspace.c" -//int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx, -// double *bufM, int *iwork, double *work, int lwork); #define D_PfLimit 1.0e-100 int getLWork() { @@ -69,11 +67,7 @@ int getLWork_fcmp() { lwork=-1; M_ZGETRI(&n, &a, &lda, &iwork, &optSize1, &lwork, &info); lwork=-1; -#ifdef _pfaffine - M_ZSKPFA(&uplo, &mthd, &n, &a, &lda, &pfaff, &iwork, &optSize2, &lwork/*, &rwork*/, &info); -#else M_ZSKPFA(&uplo, &mthd, &n, &a, &lda, &pfaff, &iwork, &optSize2, &lwork, &rwork, &info); -#endif lwork = (creal(optSize1)>creal(optSize2)) ? (int)creal(optSize1) : (int)creal(optSize2); return lwork; @@ -135,7 +129,6 @@ int calculateMAll_child_fsz(const int *eleIdx,const int *eleSpn, const int qpSta int msi,msj; int rsi,rsj; - char uplo='U', mthd='P'; int m,n,nsq,one,lda,info=0; //int nspn = 2*Ne+2*Nsite+2*Nsite+NProj; this is useful? double complex pfaff,minus_one; @@ -147,60 +140,40 @@ int calculateMAll_child_fsz(const int *eleIdx,const int *eleSpn, const int qpSta const double complex *sltE_i; double complex *invM = InvM + qpidx*Nsize*Nsize; - double complex *invM_i; - - double complex *bufM_i, *bufM_i2; + double complex *invM_i, *invM_i2; m=n=lda=Nsize; nsq=n*n; one=1; minus_one=-1.0; - /* store bufM */ - /* Note that bufM is column-major and skew-symmetric. */ - /* bufM[msj][msi] = -sltE[rsi][rsj] */ + /* store invM */ + /* Note that invM is column-major and skew-symmetric. */ + /* invM[msj][msi] = -sltE[rsi][rsj] */ #pragma loop noalias for(msi=0;msi InvM(T) -> -InvM */ M_ZSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } @@ -254,7 +227,6 @@ int calculateMAll_child_fsz_real(const int *eleIdx,const int *eleSpn, const int int msi,msj; int rsi,rsj; - char uplo='U', mthd='P'; int m,n,lda,one,nsq,info=0; //int nspn = 2*Ne+2*Nsite+2*Nsite+NProj; this is useful? double pfaff,minus_one; @@ -266,61 +238,41 @@ int calculateMAll_child_fsz_real(const int *eleIdx,const int *eleSpn, const int const double *sltE_i; double *invM = InvM_real + qpidx*Nsize*Nsize; - double *invM_i; - - double *bufM_i, *bufM_i2; + double *invM_i, *invM_i2; m=n=lda=Nsize; nsq=n*n; one=1; minus_one=-1.0; - /* store bufM */ - /* Note that bufM is column-major and skew-symmetric. */ - /* bufM[msj][msi] = -sltE[rsi][rsj] */ + /* store invM */ + /* Note that invM is column-major and skew-symmetric. */ + /* invM[msj][msi] = -sltE[rsi][rsj] */ #pragma loop noalias for(msi=0;msi InvM(T) -> -InvM */ M_DSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } @@ -384,7 +336,6 @@ int calculateMAll_child_fcmp(const int *eleIdx, const int qpStart, const int qpE int msi,msj; int rsi,rsj; - char uplo='U', mthd='P'; int m,n,nsq,lda,one,info=0; double complex pfaff,minus_one; @@ -395,61 +346,42 @@ int calculateMAll_child_fcmp(const int *eleIdx, const int qpStart, const int qpE const double complex *sltE_i; double complex *invM = InvM + qpidx*Nsize*Nsize; - double complex *invM_i; - - double complex *bufM_i, *bufM_i2; + double complex *invM_i, *invM_i2; m=n=lda=Nsize; nsq=n*n; one=1; minus_one=-1.0; - /* store bufM */ - /* Note that bufM is column-major and skew-symmetric. */ - /* bufM[msj][msi] = -sltE[rsi][rsj] */ + /* store invM */ + /* Note that invM is column-major and skew-symmetric. */ + /* invM[msj][msi] = -sltE[rsi][rsj] */ #pragma loop noalias for(msi=0;msi -InvM according antisymmetric properties. */ M_ZSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } @@ -553,10 +485,6 @@ int calculateMAll_BF_fcmp_child( } } -#ifdef _pfaffine - info=0; /* Fused Pfaffian/inverse computation. */ - M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info); -#else /* Pfaffian/inverse computed separately. */ /* Copy bufM to invM before using bufM to compute Pfaffian. */ for(msi=0;msi InvM(T) -> -InvM */ M_ZSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } @@ -634,8 +554,6 @@ int CalculateMAll_real(const int *eleIdx, const int qpStart, const int qpEnd) { return info; } -//int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx, -// double *bufM, int *iwork, double *work, int lwork) { int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx, double *bufM, int *iwork, double *work, int lwork, double* PfM_real, double *InvM_real) { #pragma procedure serial @@ -654,60 +572,45 @@ int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpE const double *sltE_i; double *invM = InvM_real + qpidx*Nsize*Nsize; - double *invM_i; - - double *bufM_i, *bufM_i2; + double *invM_i, *invM_i2; m=n=lda=Nsize; nsq=n*n; one=1; minus_one=-1.0; - /* store bufM */ - /* Note that bufM is column-major and skew-symmetric. */ - /* bufM[msj][msi] = -sltE[rsi][rsj] */ + /* store invM */ + /* Note that invM is column-major and skew-symmetric. */ + /* invM[msj][msi] = -sltE[rsi][rsj] */ #pragma loop noalias for(msi=0;msi InvM' = -InvM + // TODO: Include this in ltl2inv M_DSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } @@ -793,14 +696,10 @@ int calculateMAll_BF_real_child(const int *eleIdx, const int qpStart, const int } } -#ifdef _pfaffine - info=0; /* Fused Pfaffian/inverse computation. */ -#else /* Pfaffian/inverse computed separately. */ /* Copy bufM to invM before using bufM to compute Pfaffian. */ for(msi=0;msi InvM' = -InvM M_DSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } diff --git a/src/mVMC/parameter.c b/src/mVMC/parameter.c index 9167c941..fc0b59ef 100644 --- a/src/mVMC/parameter.c +++ b/src/mVMC/parameter.c @@ -29,7 +29,7 @@ along with this program. If not, see http://www.gnu.org/licenses/. #ifndef _SRC_PARAMETER #define _SRC_PARAMETER -#define D_AmpMax 4.0 +#define D_AmpMax 0.5 /* initialize variational parameters */ void InitParameter() { diff --git a/src/mVMC/pfupdate.c b/src/mVMC/pfupdate.c index 42b82b35..5dc67cec 100644 --- a/src/mVMC/pfupdate.c +++ b/src/mVMC/pfupdate.c @@ -352,12 +352,7 @@ double complex calculateNewPfMBFN4_child(const int qpidx, const int n, const int /* Calculate Pf M */ //TODO: CHECK rwork is real <-- [R-Xu] Not needed anymore? -#ifdef _pfaffine - info = 1; // Skipping inverse. - M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info); -#else M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); -#endif //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0; @@ -593,12 +588,7 @@ double complex updateMAll_BF_fcmp_child( } /* Calculate Pf M */ -#ifdef _pfaffine - info = 1; // Skip Pfaffine inverse. - M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info); -#else M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); -#endif //M_DSKPFA(&uplo, &mthd, &nn, invM, &lda, &pfaff, iwork, work, &lwork, &info); //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); diff --git a/src/mVMC/pfupdate_real.c b/src/mVMC/pfupdate_real.c index 2b25909d..690b7c86 100644 --- a/src/mVMC/pfupdate_real.c +++ b/src/mVMC/pfupdate_real.c @@ -361,8 +361,6 @@ double calculateNewPfMBFN4_real_child(const int qpidx, const int n, const int *m } /* calculate Pf M */ - //M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); - info = 1; // Skip Pfaffine's inverse. M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0; @@ -547,7 +545,7 @@ double updateMAll_BF_real_child(const int qpidx, const int n, const int *msa, // NOTE: This routines is not called in current version (i.e. lcoal coverage 0), // hence cannot be tested. I'm keeping GETRF and GETRI here. // TODO: If anyone's interested in utilizing this routine, she or he might want to - // utilize inverse feature of Pfaffine skpfa. + // utilize DSKTRF+LTL2INV m=nn=lda=n2; //M_ZGETRF(&m, &nn, invMat, &lda, iwork2, &info); /* ipiv = iwork */ M_DGETRF(&m, &nn, invMat, &lda, iwork2, &info); /* ipiv = iwork */ @@ -601,8 +599,6 @@ double updateMAll_BF_real_child(const int qpidx, const int n, const int *msa, /* calculate Pf M */ //M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); //M_DSKPFA(&uplo, &mthd, &nn, invM, &lda, &pfaff, iwork, work, &lwork, &info); - // As stated above, I'm now only setting Pfaffine to skip optinal inverse. - info = 1; // This is parameter for skipping inversion. M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0; diff --git a/src/mVMC/vmcmake.c b/src/mVMC/vmcmake.c index 57ab2dd2..fa35d7a5 100644 --- a/src/mVMC/vmcmake.c +++ b/src/mVMC/vmcmake.c @@ -36,11 +36,6 @@ along with this program. If not, see http://www.gnu.org/licenses/. #include "splitloop.h" #include "qp.h" -#ifdef _pf_block_update -// Block-update extension. -#include "../pfupdates/pf_interface.h" -#endif - void VMCMakeSample(MPI_Comm comm) { int outStep,nOutStep; int inStep,nInStep; @@ -74,16 +69,19 @@ void VMCMakeSample(MPI_Comm comm) { // TODO: Compute from qpStart to qpEnd to support loop splitting. void *pfOrbital[NQPFull]; void *pfUpdator[NQPFull]; + void *pfMat[NQPFull]; + void *pfMap[NQPFull]; // Read block size from input. const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE"); if (optBlockSize) NBlockUpdateSize = atoi(optBlockSize); // Fall back to default if input is invalid. - if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) + if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) { if (NExUpdatePath == 0) NBlockUpdateSize = 4; else NBlockUpdateSize = 20; + } // Set one universal EleSpn. for (mi=0; mi 100) + if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) { if (NExUpdatePath == 0) NBlockUpdateSize = 4; else NBlockUpdateSize = 20; + } // Initialize with free spin configuration. updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize, @@ -100,7 +103,7 @@ void VMCMakeSample_fsz(MPI_Comm comm) { InvM, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator); #else CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -117,13 +120,13 @@ void VMCMakeSample_fsz(MPI_Comm comm) { #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize, SlaterElm, Nsite2*Nsite2, InvM, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator); #else CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -382,13 +385,13 @@ void VMCMakeSample_fsz(MPI_Comm comm) { StartTimer(34); #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_z(NQPFull, Nsite, Nsite2, Nsize, SlaterElm, Nsite2*Nsite2, InvM, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_z(NQPFull, PfM, pfUpdator); #else CalculateMAll_fsz(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -426,7 +429,7 @@ void VMCMakeSample_fsz(MPI_Comm comm) { #ifdef _pf_block_update // Free-up updator space. - updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_z(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); #endif return; diff --git a/src/mVMC/vmcmake_fsz_real.c b/src/mVMC/vmcmake_fsz_real.c index b76e6505..f3dda1d6 100644 --- a/src/mVMC/vmcmake_fsz_real.c +++ b/src/mVMC/vmcmake_fsz_real.c @@ -79,6 +79,8 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) { // TODO: Compute from qpStart to qpEnd to support loop splitting. void *pfOrbital[NQPFull]; void *pfUpdator[NQPFull]; + void *pfMat[NQPFull]; + void *pfMap[NQPFull]; // TODO: Make it input parameter. if (NExUpdatePath == 0) NBlockUpdateSize = 4; @@ -91,7 +93,7 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) { InvM_real, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator); #else CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -108,13 +110,13 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) { #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize, SlaterElm_real, Nsite2*Nsite2, InvM_real, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator); #else CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -368,13 +370,13 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) { StartTimer(34); #ifdef _pf_block_update // Clear and reinitialize. - updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_init_d(NQPFull, Nsite, Nsite2, Nsize, SlaterElm_real, Nsite2*Nsite2, InvM_real, Nsize*Nsize, TmpEleIdx, TmpEleSpn, NBlockUpdateSize, - pfUpdator, pfOrbital); + pfUpdator, pfOrbital, pfMat, pfMap); updated_tdi_v_get_pfa_d(NQPFull, PfM_real, pfUpdator); #else CalculateMAll_fsz_real(TmpEleIdx,TmpEleSpn,qpStart,qpEnd); @@ -412,7 +414,7 @@ void VMCMakeSample_fsz_real(MPI_Comm comm) { #ifdef _pf_block_update // Free-up updator space. - updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital); + updated_tdi_v_free_d(NQPFull, pfUpdator, pfOrbital, pfMat, pfMap); #endif return; diff --git a/src/mVMC/vmcmake_real.c b/src/mVMC/vmcmake_real.c index 6c9bde7d..9b73521f 100644 --- a/src/mVMC/vmcmake_real.c +++ b/src/mVMC/vmcmake_real.c @@ -37,11 +37,6 @@ which follows "The BSD 3-Clause License". #include "splitloop.h" #include "vmcmake.h" -#ifdef _pf_block_update -// Block-update extension. -#include "../pfupdates/pf_interface.h" -#endif - void VMCMakeSample_real(MPI_Comm comm) { int outStep, nOutStep; int inStep, nInStep; @@ -66,7 +61,7 @@ void VMCMakeSample_real(MPI_Comm comm) { StartTimer(30); if (BurnFlag == 0) { - makeInitialSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt, + makeInitialSample_real(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt, qpStart, qpEnd, comm); } else { copyFromBurnSample(TmpEleIdx, TmpEleCfg, TmpEleNum, TmpEleProjCnt); @@ -76,16 +71,19 @@ void VMCMakeSample_real(MPI_Comm comm) { // TODO: Compute from qpStart to qpEnd to support loop splitting. void *pfOrbital[NQPFull]; void *pfUpdator[NQPFull]; + void *pfMat[NQPFull]; + void *pfMap[NQPFull]; // Read block size from input. const char *optBlockSize = getenv("VMC_BLOCK_UPDATE_SIZE"); if (optBlockSize) NBlockUpdateSize = atoi(optBlockSize); // Fall back to default if input is invalid. - if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) + if (NBlockUpdateSize < 1 || NBlockUpdateSize > 100) { if (NExUpdatePath == 0) NBlockUpdateSize = 2; else NBlockUpdateSize = 20; + } // Set one universal EleSpn. for (mi=0; mi +#include +#include +#include + +namespace vmc +{ +struct config_manager +{ + using index_t = int32_t; + using base_t = std::vector; + + const index_t nsites; + index_t norbs() + { return nsites * 2; } + + const base_t &config_base() const { return _cfg_base; } + const index_t config_base(index_t i) const { return _cfg_base.at(i); } + const base_t & from_idx() const { return _from_idx; } + const index_t from_idx(index_t i) const { return _from_idx.at(i); } + const base_t & to_orb() const { return _to_orb; } + const index_t to_orb(index_t i) const { return _to_orb.at(i); } + const base_t & config() const { return _cfg; } + const index_t config(index_t i) const { return _cfg.at(i); } + const base_t & num() const { return _num; } + const index_t num(index_t i) const { return _num.at(i); } + const base_t & inv() const { return _inv; } + const index_t inv(index_t i) const { return _inv.at(i); } + const index_t spin(index_t i) const { return _num.at(i) == 1 ? 1 : _num.at(i) == 2 ? -1 : 0; } + + void reserve_space(index_t mmax) + { + _from_idx.reserve(mmax * sizeof(index_t)); + _to_orb.reserve(mmax * sizeof(index_t)); + } + + void attach_config(const base_t &cfg) + { + _cfg_base = cfg; + _cfg = cfg; + _from_idx.clear(); + _to_orb.clear(); + + // Reset _num. + for (index_t i = 0; i < nsites; ++i) + _num.at(i) = 0; + + // Generate num. + for (index_t i = 0; i < cfg.size(); ++i) { + // 0b01 for up, 0b10 for down, 0b11 for double occupation. + index_t xi = cfg.at(i) % nsites; + index_t si = cfg.at(i) / nsites; + _num.at(xi) += (si + 1); + } + + // Generate inv. + for (index_t i = 0; i < _inv.size(); ++i) + _inv.at(i) = -1; + for (index_t i = 0; i < cfg.size(); ++i) + _inv.at(cfg.at(i)) = i; + } + + void merge_config() + { + // OR: cfg_base = cfg; + for (index_t j = 0; j < _from_idx.size(); ++j) + _cfg_base.at(_from_idx.at(j)) = _to_orb.at(j); + _from_idx.clear(); + _to_orb.clear(); + } + + void push_update(index_t osi, index_t msj) + { + for (auto i = _from_idx.begin(); i != _from_idx.end(); ++i) + assert_(*i != msj, typeid(*this).name(), "Consecutive hopping unsupported. Please merge manually before proceeding."); + index_t osj = _cfg.at(msj); + index_t xj = osj % nsites; + index_t sj = osj / nsites; + index_t xi = osi % nsites; + index_t si = osi / nsites; + assert_(_inv.at(osj) == msj && _inv.at(osi) == -1, typeid(*this).name(), "Inv error @ push."); + + _from_idx.push_back(msj); + _to_orb.push_back(osi); + _cfg.at(msj) = osi; + // Update num. + _num.at(xj) -= sj + 1; + _num.at(xi) += si + 1; + // Update inv. + _inv.at(osj) = -1; + _inv.at(osi) = msj; + } + + void pop_update() + { + assert_(_from_idx.size(), typeid(*this).name(), "Trying to pop empty updates."); + index_t msj = _from_idx.back(); + index_t osj = _cfg_base.at(msj); + index_t xj = osj % nsites; + index_t sj = osj / nsites; + index_t osi = _cfg.at(msj); + index_t xi = osi % nsites; + index_t si = osi / nsites; + assert_(_inv.at(osi) == msj && _inv.at(osj) == -1, typeid(*this).name(), "Inv error @ pop."); + + _cfg.at(msj) = osj; + // Update num. + _num.at(xi) -= si + 1; + _num.at(xj) += sj + 1; + // Update inv. + _inv.at(osi) = -1; + _inv.at(osj) = msj; + _from_idx.pop_back(); + _to_orb.pop_back(); + } + + void check() + { + assert_(_cfg.size() <= norbs(), typeid(*this).name(), "More Fermions than orbitals."); + + base_t num(nsites, 0); + for (index_t i = 0; i < num.size(); ++i) { + index_t xi = _cfg.at(i) % nsites; + index_t si = _cfg.at(i) / nsites; + + assert_(!(num.at(xi) & (si + 1)), typeid(*this).name(), "More than two Fermions occupying the same orbital."); + num.at(xi) += (si + 1); + } + for (index_t i = 0; i < nsites; ++i) + assert_(num.at(i) == _num.at(i), typeid(*this).name(), "Found broken member: _num."); + + base_t cfg_curr = _cfg_base; + for (index_t j = 0; j < _from_idx.size(); ++j) + cfg_curr.at(_from_idx.at(j)) = _to_orb.at(j); + for (index_t i = 0; i < cfg_curr.size(); ++i) + assert_(cfg_curr.at(i) == _cfg.at(i), typeid(*this).name(), "Found broken member: _cfg."); + // Check inv. + } + + config_manager(index_t nsites_) + : nsites(nsites_), _num(nsites_), _inv(norbs()) { } + + config_manager(index_t nsites_, base_t cfg) + : nsites(nsites_), _num(nsites_), _inv(norbs()) + { attach_config(cfg); } + + template + static base_t singlet_config(unsigned nsite, unsigned nsinglet, rng_t &rng) + { + using dist_t = std::uniform_int_distribution; + + bool allow_doublon = nsinglet * 2 > nsite; + base_t marks(nsite, 0); + base_t cfg(0); + + // Spin up. + for (int i = 0; i < nsinglet; ++i) { + int iseq = dist_t(0, nsite - i - 1)(rng); + + int isite; + int isite_c = 0; + do { + // Find the next available site. + while (marks.at(isite_c)) { isite_c++; } + isite = isite_c; + isite_c++; + } while (iseq-- > 0); + + marks.at(isite) = 1; + cfg.push_back(isite); + } + + // Spin down. + for (int i = 0; i < nsinglet; ++i) { + int iseq; + if (allow_doublon) + iseq = dist_t(0, nsite - i - 1)(rng); + else + iseq = dist_t(0, nsite - i - 1 - nsinglet)(rng); + + int isite; + int isite_c = 0; + do { + // Find the next unoccupied site / orb. + if (allow_doublon) + while (marks.at(isite_c) > 1) { isite_c++; } + else + while (marks.at(isite_c) > 0) { isite_c++; } + isite = isite_c; + isite_c++; + } while (iseq-- > 0); + + marks.at(isite) = 2; + cfg.push_back(isite + nsite); + } + return cfg; + } + + template + std::pair propose_hop(rng_t &rng) const + { + int loop_breaker = 0; + int xi; + int msj = std::uniform_int_distribution(0, _cfg.size() - 1)(rng); + std::uniform_int_distribution select_site(0, nsites - 1); + int sj = msj / nsites; + + do { + assert_(loop_breaker++ < 10000000, typeid(*this).name(), "Too many loops."); + xi = select_site(rng); + } while (_num.at(xi) & (sj + 1)); + + return std::make_pair(xi + sj * nsites, msj); + } + + template + std::tuple propose_exc(rng_t &rng) const + { + using dist_t = std::uniform_int_distribution; + + int nup = 0; + int ndn = 0; + int loop_breaker = 0; + int msj, xj = 0; + int msl, xl = 0; + + // Count # of spin-up/down sites. + for (int i = 0; i < _num.size(); ++i) + if (num(i) == 1) + nup++; + else if (num(i) == 2) + ndn++; + + // Pick a singly occupied spin-up site. + int iseq_up = dist_t(0, nup - 1)(rng); + while (true) { + while (num(xj) != 1) ++xj; + if (--iseq_up < 0) + break; + else + ++xj; + } + + // Pick a singly occupied spin-down site. + int iseq_dn = dist_t(0, ndn - 1)(rng); + while (true) { + while (num(xl) != 2) ++xl; + if (--iseq_dn < 0) + break; + else + ++xl; + } + + // Reverse lookup config indices. + msj = inv(xj); + msl = inv(xl + nsites); + assert_(msj >= 0 && msl >= 0, typeid(*this).name(), "ExchangeConfig error."); + + return std::make_tuple(xl, msj, xj + nsites, msl); + } + +private: + base_t _cfg_base; + base_t _from_idx; + base_t _to_orb; + base_t _cfg; + base_t _num; + base_t _inv; +}; +} + diff --git a/src/pfupdates/makefile_pfupdates b/src/pfupdates/makefile_pfupdates index ee946fc0..ab9be063 100644 --- a/src/pfupdates/makefile_pfupdates +++ b/src/pfupdates/makefile_pfupdates @@ -1,16 +1,15 @@ include ../make.sys -ifeq ($(BLIS_ROOT),) - BLIS_ROOT = ../pfaffine/src/dep -endif - OBJ = pfupdates.o SRC = pf_interface.cc -HDR = pf_interface.h orbital_mat.tcc skmv.tcc updated_tdi.tcc +HDR = pf_interface.h \ + orbital_mat.tcc skmv.tcc skr2k.tcc skslc.tcc updated_tdi.tcc \ + ../ltl2inv/eigen_gemmt.tcc ../ltl2inv/invert.tcc ../ltl2inv/pfaffian.tcc ../ltl2inv/trmmt.tcc \ + ../ltl2inv/error.hh ../ltl2inv/ilaenv_lauum.hh ../ltl2inv/lapack2eigen.hh $(OBJ): $(SRC) $(HDR) $(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \ - -I../pfaffine/src \ + -I../pfapack/c_interface -I../ltl2inv \ -I$(BLIS_ROOT)/include \ -I$(BLIS_ROOT)/include/blis diff --git a/src/pfupdates/orbital_mat.tcc b/src/pfupdates/orbital_mat.tcc index cb8a6117..ddb4ce6c 100644 --- a/src/pfupdates/orbital_mat.tcc +++ b/src/pfupdates/orbital_mat.tcc @@ -6,52 +6,36 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "blis.h" -#include "colmaj.tcc" +#include #include -#ifdef UseBoost -#include -#include -#include -template -using matrix_t = boost::numeric::ublas::matrix; -template -using vector_t = boost::numeric::ublas::vector; -#else -template -using matrix_t = colmaj; -#endif -template +namespace vmc +{ +namespace orbital +{ +template struct orbital_mat { - const uplo_t uplo; - const dim_t nsite; - matrix_t X; ///< nsite*nsite. + using matrix_t = matrix_t_; + using T = typename matrix_t::Scalar; + using index_t = int32_t; - orbital_mat(uplo_t uplo_, dim_t nsite_, T *X_, inc_t ldX) - : uplo(uplo_), nsite(nsite_), - #ifdef UseBoost - X(nsite, nsite) { - colmaj X_tmp(X_, ldX); - for (dim_t j = 0; j < nsite; ++j) - for (dim_t i = 0; i < nsite; ++i) - X(i, j) = X_tmp(i, j); - } - #else - X(X_, ldX) { } - #endif + const char uplo; + const index_t norb_; + matrix_t &X; ///< norb*norb. + + orbital_mat(char uplo_, index_t norb__, matrix_t &X_) + : uplo(uplo_), norb_(norb__), X(X_) { } - orbital_mat(uplo_t uplo_, dim_t nsite_, matrix_t &X_) - : uplo(uplo_), nsite(nsite_), X(X_) { } + virtual index_t norb() { return norb_; } - void randomize(double amplitude, unsigned seed) { + virtual void randomize(double amplitude, unsigned seed) { using namespace std; mt19937_64 rng(seed); uniform_real_distribution dist(-0.1, 1.0); - for (dim_t j = 0; j < nsite; ++j) { - for (dim_t i = 0; i < j; ++i) { + for (index_t j = 0; j < norb_; ++j) { + for (index_t i = 0; i < j; ++i) { X(i, j) = T(dist(rng)) * amplitude; X(j, i) = -X(i, j); } @@ -61,18 +45,28 @@ struct orbital_mat void randomize(double amplitude) { randomize(amplitude, 511); } - T operator()(dim_t osi, dim_t osj) { + virtual index_t pack_idx(index_t osi, index_t osj) { + abort(); + } + + virtual void unpack_idx(index_t &osi, index_t &osj, index_t idx) { + abort(); + } + + virtual T operator()(index_t osi, index_t osj) { if (osi == osj) return T(0.0); switch (uplo) { - case BLIS_UPPER: + case 'U': + case 'u': if (osi < osj) return X(osi, osj); else return -X(osj, osi); - case BLIS_LOWER: + case 'L': + case 'l': if (osi > osj) return X(osi, osj); else @@ -83,3 +77,5 @@ struct orbital_mat } } }; +} +} diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc index a19596a4..575ca625 100644 --- a/src/pfupdates/pf_interface.cc +++ b/src/pfupdates/pf_interface.cc @@ -4,23 +4,72 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #include "updated_tdi.tcc" +#include "orbital_mat.tcc" // In implementation, this should appear AFTER blis.h. #include "pf_interface.h" +#include // Non-# Pragma support differs from compiler to compiler #if defined(__INTEL_COMPILER) -#define OMP_PARALLEL_FOR_SHARED __pragma(omp parallel for default(shared)) +#define _ccPragma(_1) __pragma(_1) #elif defined(__GNUC__) -#define OMP_PARALLEL_FOR_SHARED _Pragma("omp parallel for default(shared)") +#define _ccPragma(_1) _Pragma(#_1) #else -#error "Valid non-preprocessor _pragma() not found." +#error "Valid non-preprocessor _Pragma() not found." #endif -#define orbv( i, ctype ) ( (orbital_mat *)orbv[i] ) -#define objv( i, ctype ) ( (updated_tdi *)objv[i] ) +using namespace Eigen; +using namespace vmc::orbital; +template +using matrix_t = Map, 0, OuterStride<> >; + +#define matv( i, ctype ) ( ( matrix_t *)matv[i] ) +#define mapv( i, ctype ) ( ( matrix_t *)mapv[i] ) +#define orbv( i, ctype ) ( ( orbital_mat > *)orbv[i] ) +#define objv( i, ctype ) ( (updated_tdi > > *)objv[i] ) #define EXPANDNAME( funcname, cblachar ) funcname##_##cblachar +#define GENIMPL( ctype, cblachar ) \ + void EXPANDNAME( updated_tdi_v_seq_init_precomp, cblachar ) \ + ( uint64_t num_qp, \ + uint64_t nsite, \ + uint64_t norbs, \ + uint64_t nelec, \ + ctype *orbmat_base, \ + int64_t orbmat_stride, \ + ctype *invmat_base, \ + int64_t invmat_stride, \ + int32_t *eleidx, \ + int32_t *elespn, \ + uint64_t mmax, \ + ctype pfav[], \ + void *objv[], \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ) \ +{ \ + for (int iqp = 0; iqp < num_qp; ++iqp) { \ + mapv[iqp] = new matrix_t(orbmat_base + iqp * orbmat_stride, norbs, norbs, OuterStride<>(norbs)); \ + matv[iqp] = new matrix_t(invmat_base + iqp * invmat_stride, nelec, nelec, OuterStride<>(nelec)); \ + orbv[iqp] = new orbital_mat>('U', norbs, *mapv(iqp, ctype) ); \ + objv[iqp] = new updated_tdi>> \ + ( *orbv(iqp, ctype), *matv(iqp, ctype), nelec, mmax); \ +\ + /* Compute initial. */ \ + vmc::config_manager::base_t cfg_i(nelec); \ + for (int msi = 0; msi < nelec; ++msi) \ + cfg_i.at(msi) = eleidx[msi] + elespn[msi]*nsite; \ + objv(iqp, ctype)->attach_config(cfg_i); \ + objv(iqp, ctype)->initialize_precomputed(pfav[iqp]); \ + } \ +} +// GENIMPL( float, s ) +GENIMPL( double, d ) +// GENIMPL( ccscmplx, c ) +GENIMPL( ccdcmplx, z ) +#undef GENIMPL + #define GENIMPL( ctype, cblachar ) \ void EXPANDNAME( updated_tdi_v_init, cblachar ) \ ( uint64_t num_qp, \ @@ -35,25 +84,29 @@ int32_t *elespn, \ uint64_t mmax, \ void *objv[], \ - void *orbv[] ) \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ) \ { \ - OMP_PARALLEL_FOR_SHARED \ + _ccPragma(omp parallel for default(shared)) \ for (int iqp = 0; iqp < num_qp; ++iqp) { \ - orbv[iqp] = new orbital_mat( \ - BLIS_UPPER, norbs, orbmat_base + iqp * orbmat_stride, norbs); \ - objv[iqp] = new updated_tdi( \ - *orbv(iqp, ctype), nelec, invmat_base + iqp * invmat_stride, nelec, mmax); \ + mapv[iqp] = new matrix_t(orbmat_base + iqp * orbmat_stride, norbs, norbs, OuterStride<>(norbs)); \ + matv[iqp] = new matrix_t(invmat_base + iqp * invmat_stride, nelec, nelec, OuterStride<>(nelec)); \ + orbv[iqp] = new orbital_mat>('U', norbs, *mapv(iqp, ctype) ); \ + objv[iqp] = new updated_tdi>> \ + ( *orbv(iqp, ctype), *matv(iqp, ctype), nelec, mmax); \ \ /* Compute initial. */ \ - auto &cfg_i = objv(iqp, ctype)->elem_cfg; \ + vmc::config_manager::base_t cfg_i(nelec); \ for (int msi = 0; msi < nelec; ++msi) \ cfg_i.at(msi) = eleidx[msi] + elespn[msi]*nsite; \ + objv(iqp, ctype)->attach_config(cfg_i); \ objv(iqp, ctype)->initialize(); \ } \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL @@ -61,16 +114,20 @@ GENIMPL( ccdcmplx, z ) void EXPANDNAME( updated_tdi_v_free, cblachar ) \ ( uint64_t num_qp, \ void *objv[], \ - void *orbv[] ) \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ) \ { \ for (int iqp = 0; iqp < num_qp; ++iqp) { \ delete orbv(iqp, ctype); \ delete objv(iqp, ctype); \ + delete matv(iqp, ctype); \ + delete mapv(iqp, ctype); \ } \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL @@ -82,11 +139,11 @@ GENIMPL( ccdcmplx, z ) { \ /* Minus sign appears from that SlaterElm is in fact interpreted as row-major. */ \ for (int iqp = 0; iqp < num_qp; ++iqp) \ - pfav[iqp] = -objv(iqp, ctype)->get_Pfa(); \ + pfav[iqp] = -objv(iqp, ctype)->get_amplitude(); \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL @@ -98,13 +155,13 @@ GENIMPL( ccdcmplx, z ) int64_t cal_pfa, \ void *objv[] ) \ { \ - OMP_PARALLEL_FOR_SHARED \ + _ccPragma(omp parallel for default(shared)) \ for (int iqp = 0; iqp < num_qp; ++iqp) \ objv(iqp, ctype)->push_update_safe(osi, msj, cal_pfa!=0); \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL @@ -118,9 +175,9 @@ GENIMPL( ccdcmplx, z ) int64_t cal_pfa, \ void *objv[] ) \ { \ - OMP_PARALLEL_FOR_SHARED \ + _ccPragma(omp parallel for default(shared)) \ for (int iqp = 0; iqp < num_qp; ++iqp) { \ - auto &from_i = objv(iqp, ctype)->from_idx; \ + const auto &from_i = objv(iqp, ctype)->get_config_manager().from_idx(); \ if (from_i.size() > objv(iqp, ctype)->mmax - 2) \ objv(iqp, ctype)->merge_updates(); \ else \ @@ -134,9 +191,9 @@ GENIMPL( ccdcmplx, z ) objv(iqp, ctype)->push_update(osk, msl, cal_pfa!=0); \ } \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL @@ -150,9 +207,121 @@ GENIMPL( ccdcmplx, z ) objv(iqp, ctype)->pop_update(cal_pfa!=0); \ } \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) +GENIMPL( double, d ) +// GENIMPL( ccscmplx, c ) +GENIMPL( ccdcmplx, z ) +#undef GENIMPL + +#define GENIMPL( ctype, cblachar ) \ + void EXPANDNAME( updated_tdi_v_omp_var0_proc_batch_greentwo, cblachar ) \ + ( uint64_t num_qp, \ + int num_gf, \ + int needs_comput[], /* Already packed. var0 only does unpacking. */ \ + int unpack_idx[], /* Already packed. var0 only does unpacking. */ \ + int to_orbs_[], /* Already packed. var0 only does unpacking. */ \ + int from_ids_[], /* Already packed. var0 only does unpacking. */ \ + ctype pfav[], \ + void *objv[], \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ) \ +{ \ + const int ith = omp_get_thread_num(); \ + const int nth = omp_get_num_threads(); \ +\ + /* Partitioning. */ \ + const int qpCountThread = (num_qp + nth - 1) / nth; \ + const int qpStart = qpCountThread * ith; \ + const int qpEnd = std::min(int(num_qp), qpCountThread + qpStart); \ +\ + Eigen::Map to_orbs( to_orbs_, num_gf * 2); \ + Eigen::Map from_ids(from_ids_, num_gf * 2); \ +\ + for (int iqp = qpStart; iqp < qpEnd; ++iqp) { \ + Eigen::Vector pfaBatch = \ + objv(iqp, ctype)->batch_query_amplitudes(2, to_orbs, from_ids); \ + pfaBatch *= objv(iqp, ctype)->get_amplitude(); \ +\ + /* Unpack computed Pfaffians. */ \ + for (int igf = 0; igf < num_gf; ++igf) \ + pfav[unpack_idx[igf] * num_qp + iqp] = pfaBatch[igf]; \ + } \ +} +// GENIMPL( float, s ) +GENIMPL( double, d ) +// GENIMPL( ccscmplx, c ) +GENIMPL( ccdcmplx, z ) +#undef GENIMPL + +#define GENIMPL( ctype, cblachar ) \ + void EXPANDNAME( updated_tdi_v_omp_var1_proc_batch_greentwo, cblachar ) \ + ( uint64_t num_qp, \ + int num_gf, \ + int needs_comput[], \ + int unpack_idx[], \ + int to_orbs[], \ + int from_ids[], \ + ctype pfav[], \ + void *objv[], \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ) \ +{ \ + const int ith = omp_get_thread_num(); \ + const int nth = omp_get_num_threads(); \ +\ + /* Count Green's funcs. */ \ + int gf_count = 0; \ + for (int i = 0; i < num_gf; ++i) \ + if ( needs_comput[i] ) \ + gf_count++; \ +\ + /* Partitioning. */ \ + int gfCountThread = (gf_count + nth - 1) / nth; \ + const int gfStart = gfCountThread * ith; \ + gfCountThread = std::min(gf_count - gfStart, gfCountThread); \ +\ + /* Pack info space. */ \ + gf_count = -gfStart; \ + int *to_orbs_thread_, *from_ids_thread_, *unpack_idx_thread; \ + for (int i = 0; i < num_gf && gf_count < gfCountThread; ++i) \ + if ( needs_comput[i] ) { \ + if ( gf_count >= 0 ) { \ + if ( gf_count == 0 ) { \ + /* Pin down scratchpad spots. */ \ + to_orbs_thread_ = to_orbs + i * 2; \ + from_ids_thread_ = from_ids + i * 2; \ + unpack_idx_thread = unpack_idx + i; \ + unpack_idx_thread[0] = i; \ + } else { \ + /* 0th idx already in the place. Pack from 1st. */ \ + to_orbs_thread_[gf_count * 2 ] = to_orbs[i * 2]; \ + to_orbs_thread_[gf_count * 2 + 1] = to_orbs[i * 2 + 1]; \ + from_ids_thread_[gf_count * 2 ] = from_ids[i * 2]; \ + from_ids_thread_[gf_count * 2 + 1] = from_ids[i * 2 + 1]; \ + unpack_idx_thread[gf_count] = i; \ + } \ + } \ + gf_count++; \ + } \ +\ + Eigen::Map to_orbs_thread( to_orbs_thread_, gfCountThread * 2); \ + Eigen::Map from_ids_thread(from_ids_thread_, gfCountThread * 2); \ +\ + for (int iqp = 0; iqp < num_qp; ++iqp) { \ + Eigen::Vector pfaBatch = \ + objv(iqp, ctype)->batch_query_amplitudes(2, to_orbs_thread, from_ids_thread); \ + pfaBatch *= objv(iqp, ctype)->get_amplitude(); \ +\ + /* Unpack computed Pfaffians. */ \ + for (int igf = 0; igf < gfCountThread; ++igf) \ + pfav[unpack_idx_thread[igf] * num_qp + iqp] = pfaBatch[igf]; \ + } \ +} +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h index b9f4cbf2..49ad0252 100644 --- a/src/pfupdates/pf_interface.h +++ b/src/pfupdates/pf_interface.h @@ -12,7 +12,7 @@ #include "stdint.h" // Redefine double complex for (future) non-C99 compatibility. -#ifndef _CC_IMPL +#ifndef __cplusplus typedef _Complex double ccdcmplx; typedef _Complex float ccscmplx; #else @@ -24,6 +24,31 @@ extern "C" { #define EXPANDNAME( funcname, cblachar ) funcname##_##cblachar +#define GENDEF( ctype, cblachar ) \ + void EXPANDNAME( updated_tdi_v_seq_init_precomp, cblachar ) \ + ( uint64_t num_qp, \ + uint64_t nsite, \ + uint64_t norbs, \ + uint64_t nelec, \ + ctype *orbmat_base, \ + int64_t orbmat_stride, \ + ctype *invmat_base, \ + int64_t invmat_stride, \ + int32_t *eleidx, \ + int32_t *elespn, \ + uint64_t mmax, \ + ctype pfav[], \ + void *objv[], \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ); +// GENDEF( float, s ) +GENDEF( double, d ) +// GENDEF( ccscmplx, c ) +GENDEF( ccdcmplx, z ) +#undef GENDEF + + #define GENDEF( ctype, cblachar ) \ void EXPANDNAME( updated_tdi_v_init, cblachar ) \ ( uint64_t num_qp, \ @@ -38,11 +63,13 @@ extern "C" { int32_t *elespn, \ uint64_t mmax, \ void *objv[], \ - void *orbv[] ); + void *orbv[], \ + void *matv[], \ + void *mapv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF @@ -50,11 +77,13 @@ GENDEF( ccdcmplx, z ) void EXPANDNAME( updated_tdi_v_free, cblachar ) \ ( uint64_t num_qp, \ void *objv[], \ - void *orbv[] ); + void *orbv[], \ + void *matv[], \ + void *mapv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF @@ -64,9 +93,9 @@ GENDEF( ccdcmplx, z ) ctype pfav[], \ void *objv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF @@ -78,9 +107,9 @@ GENDEF( ccdcmplx, z ) int64_t cal_pfa, \ void *objv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF @@ -94,9 +123,9 @@ GENDEF( ccdcmplx, z ) int64_t cal_pfa, \ void *objv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF @@ -106,15 +135,38 @@ GENDEF( ccdcmplx, z ) int64_t cal_pfa, \ void *objv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF +#define GENDEF( ctype, infix, cblachar ) \ + void EXPANDNAME( updated_tdi_v_omp_## infix ##_proc_batch_greentwo, cblachar ) \ + ( uint64_t num_qp, \ + int num_gf, \ + int needs_comput[], \ + int unpack_idx[], \ + int to_orbs[], \ + int from_ids[], \ + ctype pfav[], \ + void *objv[], \ + void *orbv[], \ + void *matv[], \ + void *mapv[] ); +// GENDEF( float, var0, s ) +// GENDEF( float, var1, s ) +GENDEF( double, var0, d ) +GENDEF( double, var1, d ) +// GENDEF( ccscmplx, var0, c ) +// GENDEF( ccscmplx, var1, c ) +GENDEF( ccdcmplx, var0, z ) +GENDEF( ccdcmplx, var1, z ) +#undef GENDEF + #undef EXPANDNAME -#ifdef _CC_IMPL +#ifdef __cplusplus } #endif diff --git a/src/pfupdates/skmv.tcc b/src/pfupdates/skmv.tcc index 5f484c53..31a0947c 100644 --- a/src/pfupdates/skmv.tcc +++ b/src/pfupdates/skmv.tcc @@ -6,50 +6,39 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "colmaj.tcc" -#include "blalink.hh" -#include +#include "lapack2eigen.hh" -template -void skmv(uplo_t uploA, - dim_t m, - T alpha, - T *_A, inc_t ldA, - T *X, - T *Y) { - using namespace std; - colmaj A(_A, ldA); +template +Eigen::Matrix + skmv(const char &uploA, + const typename Mat::Scalar &alpha, + const Mat &A, + const Vec &X) { + using namespace Eigen; + Eigen::Matrix Y; // Way 1: - // skmm(BLIS_LEFT, uploA, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, m, 1, - // T(1.0), &A(0, 0), A.ld, &X[0], ldA, T(0.0), &Y[0], ldA); - - // Way 2: - vector Z(m); - for (dim_t i = 0; i < m; ++i) { - Z[i] = X[i]; - Y[i] = X[i]; + if (uploA == 'U' || uploA == 'u') { + Y = A.template triangularView() * X - A.template triangularView().transpose() * X; + } else { + Y = A.template triangularView() * X - A.template triangularView().transpose() * X; } - trmv(uploA, BLIS_NO_TRANSPOSE, - m, alpha, &A(0, 0), A.ld, &Y[0], 1); - trmv(uploA, BLIS_TRANSPOSE, - m, alpha, &A(0, 0), A.ld, &Z[0], 1); - for (dim_t i = 0; i < m; ++i) - Y[i] -= Z[i]; + Y *= alpha; - // Way 3: - // for (dim_t i = 0; i < m; ++i) + // Way 2: + // for (int i = 0; i < m; ++i) // Y[i] = 0.0; - // for (dim_t j = 0; j < m; ++j) { + // for (int j = 0; j < m; ++j) { // T x_j = X[j]; - // for (dim_t i = 0; i < j; ++i) { + // for (int i = 0; i < j; ++i) { // T A_ij = A(i, j); // Y[i] += A_ij * x_j; // Y[j] -= A_ij * X[i]; // } // } - // for (dim_t i = 0; i < m; ++i) + // for (int i = 0; i < m; ++i) // Y[i] *= alpha; + return Y; } diff --git a/src/pfupdates/skr2k.tcc b/src/pfupdates/skr2k.tcc new file mode 100644 index 00000000..a6bba999 --- /dev/null +++ b/src/pfupdates/skr2k.tcc @@ -0,0 +1,35 @@ +/** + * \file skr2k.tcc + * Dispatcher and minimal implementation for SKR2k. + * Minimal implementation is used when M is extremely small. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "lapack2eigen.hh" +#include + + +template +inline void skr2k(const char &uploC, + const char &transAB, + const typename Mat2::Scalar &alpha, + const Mat0 &A, + const Mat1 &B, + const typename Mat2::Scalar &beta, + Mat2 &C) +{ + using T = typename Mat0::Scalar; + + l2e::le_mat_t C_(C); + if (transAB == 'N' || transAB == 'n') { + gemmt(uploC, 'N', 'T', alpha, l2e::le_mat_t(A), l2e::le_mat_t(B), beta, C_); + gemmt(uploC, 'N', 'T', -alpha, l2e::le_mat_t(B), l2e::le_mat_t(A), T(1), C_); + } else { + gemmt(uploC, 'T', 'N', alpha, l2e::le_mat_t(A), l2e::le_mat_t(B), beta, C_); + gemmt(uploC, 'T', 'N', -alpha, l2e::le_mat_t(B), l2e::le_mat_t(A), T(1), C_); + } +} + diff --git a/src/pfupdates/skslc.tcc b/src/pfupdates/skslc.tcc new file mode 100644 index 00000000..084c965d --- /dev/null +++ b/src/pfupdates/skslc.tcc @@ -0,0 +1,34 @@ +/** + * \file skslc.tcc + * Get a slice from antisymmetric matrix. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "lapack2eigen.hh" + +template +inline Eigen::Matrix + skslc(const char &uploA, unsigned i, const Mat &A) +{ + Eigen::Matrix x(A.rows()); + + x[i] = 0.0; + switch (uploA) { + case 'U': + case 'u': + for (unsigned j = 0; j < i; ++j) + x[j] = A(j, i); + for (unsigned j = i+1; j < A.rows(); ++j) + x[j] = -A(i, j); + break; + + default: + std::cerr << "SKSLC: Lower triangular storage not implemented. Sorry." << std::endl; + break;; + } + return x; +} + diff --git a/src/pfupdates/updated_tdi.tcc b/src/pfupdates/updated_tdi.tcc index 63bc035f..6705a2fb 100644 --- a/src/pfupdates/updated_tdi.tcc +++ b/src/pfupdates/updated_tdi.tcc @@ -6,69 +6,89 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "orbital_mat.tcc" -#include "blalink.hh" -#include "skpfa.hh" -#include "sktdi.hh" +#include "config_manager.tcc" +#include "pfaffian.tcc" +#include "invert.tcc" #include "skr2k.tcc" #include "skslc.tcc" #include "skmv.tcc" -#include "optpanel.hh" #include #include -template struct updated_tdi { - orbital_mat &Xij; - uplo_t uplo; ///< Can be switched only with update_uplo(). - const dim_t nelec; - const dim_t mmax; - dim_t nq_updated; // Update of Q(:, i) can be delayed against U. - - // Performance tuning constants. +namespace vmc +{ +namespace orbital +{ +template +struct updated_tdi { + using orbital_t = orbital_t_; + using index_t = vmc::config_manager::index_t; + using amp_t = amp_t_; + using T = typename orbital_t::T; + using invmat_t = typename orbital_t::matrix_t; ///< Just for mVMC's C wrapper. Mathematically not so good. + using matrix_t = Eigen::Matrix; + using submat_t = Eigen::Block; + using vector_t = Eigen::Matrix; + using idxvec_t = Eigen::VectorXi; + + orbital_t &Xij; + const index_t nelec; ///< Total Fermionic number / Matrix size. + const index_t mmax; ///< Max number of updates contained. const bool single_hop_alpha; ///< Whether to simplify k=1 situations. - const dim_t npanel_big; - const dim_t npanel_sub; + +private: + char uplo; ///< Can be switched only with update_uplo(). + index_t nq_updated; // Update of Q(:, i) can be delayed against U. // Matrix M. - matrix_t M; + invmat_t &M; ///< (nelec, nelec) // Updator blocks U, inv(M)U and inv(M)V. // Note that V is not stored. - matrix_t U; ///< Serves also as B*inv - matrix_t Q; ///< Contains the whole B buffer. Assert![ Q = M U ] - matrix_t P; ///< Q(0, mmax) + matrix_t U; ///< Serves also as B*inv + matrix_t Q; ///< Contains the whole B buffer. Assert![ Q = M U ] + // This member causes pointer problem on mixed precision. + // (Which was fixed by constructing Block objects on-the-fly.) + // TODO: Check its reason. + // submat_t P; ///< &Q(0, mmax). // Central update buffer and its blocks. - matrix_t W; - matrix_t UMU, UMV, VMV; - matrix_t Cp; ///< C+BMB = [ W -I; I 0 ] + [ UMU UMV; -UMV VMV ] - matrix_t Gc; ///< Gaussian vectors when tri-diagonalizing C. - signed *const cPov; ///< Pivot when tri-diagonalizing C. - T Pfa; - T PfaRatio; //< Updated Pfaffian / Base Pfaffian. - // TODO: Maybe one should log all Pfaffian histories. - - std::vector elem_cfg; - std::vector from_idx; - std::vector to_site; + matrix_t W; + matrix_t UMU, UMV, VMV; + matrix_t Cp; ///< C+BMB = [ W -I; I 0 ] + [ UMU UMV; -UMV VMV ] + matrix_t Gc; ///< Used to be Gaussian vectors. Now just scratchpads. + idxvec_t cPiv; ///< Pivot when tri-diagonalizing C. + amp_t Pfa; + amp_t PfaRatio; //< Updated Pfaffian / Base Pfaffian. + // TODO: Maybe one should log partially the computed Pfaffian history. + + // Configuration. + vmc::config_manager elem_cfg; + +public: + + void initialize_precomputed(T Pfa_) { + elem_cfg.merge_config(); + elem_cfg.reserve_space(mmax); + nq_updated = 0; + Pfa = Pfa_; + PfaRatio = 1.0; + } void initialize() { using namespace std; - auto &cfg = elem_cfg; - #ifdef UseBoost - matrix_t G(nelec, nelec); - #else - matrix_t G(new T[nelec * nelec], nelec); - #endif - from_idx.clear(); - to_site.clear(); - from_idx.reserve(mmax * sizeof(dim_t)); - to_site.reserve(mmax * sizeof(dim_t)); + elem_cfg.merge_config(); + elem_cfg.reserve_space(mmax); + const auto &cfg = elem_cfg.config_base(); + matrix_t G(nelec, nelec); + nq_updated = 0; switch (uplo) { - case BLIS_UPPER: - for (dim_t j = 0; j < nelec; ++j) { - for (dim_t i = 0; i < j; ++i) { + case 'U': + case 'u': + for (index_t j = 0; j < nelec; ++j) { + for (index_t i = 0; i < j; ++i) { M(i, j) = Xij(cfg.at(i), cfg.at(j)); } M(j, j) = T(0.0); @@ -76,146 +96,103 @@ template struct updated_tdi { break; default: - cerr << "updated_tdi: BLIS_LOWER is not supported. The error is fetal." + cerr << "updated_tdi: Lower triangular is not implemented yet." << endl; } // Allocate scratchpad. - signed *iPivFull = new signed[nelec + 1]; - dim_t lwork = nelec * npanel_big; - T *pfwork = new T[lwork]; - - #ifdef UseBoost - signed info = skpfa(uplo, nelec, &M(0, 0), M.size1(), &G(0, 0), G.size1(), iPivFull, - true, &Pfa, pfwork, lwork); - #else - signed info = skpfa(uplo, nelec, &M(0, 0), M.ld, &G(0, 0), G.ld, iPivFull, - true, &Pfa, pfwork, lwork); - #endif -#ifdef _DEBUG - cerr << "SKPFA+INV: n=" << nelec << " info=" << info << endl; -#endif - - delete[] iPivFull; - delete[] pfwork; - #ifndef UseBoost - delete[](&G(0, 0)); - #endif + vector_t vT(nelec - 1); + idxvec_t iPiv(nelec); + l2e::le_mat_t M_(M); + + // Perform LTL factorization regardless of `uplo`. + signed info = sktrf(uplo, 'N', M_, &iPiv(0), &G(0, 0), G.size()); + Pfa = ltl2pfa(uplo, M, iPiv); + PfaRatio = 1.0; // Reset Pfaffian ratio. + ltl2inv(uplo, M, iPiv, vT, G); } - ~updated_tdi() { - #ifndef UseBoost - delete[](&U(0, 0)); - delete[](&Q(0, 0)); + updated_tdi(orbital_t &Xij_, invmat_t &M_, index_t nelec_, index_t mmax_, std::vector cfg) + : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), single_hop_alpha(true), + M(M_), U(nelec, mmax * 2), Q(nelec, mmax * 2), + W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax), + // Cp, Gc and cPiv are dynamically allocated. + Pfa(0.0), PfaRatio(1.0), elem_cfg(Xij_.norb()/2, cfg), uplo('U') + { initialize(); } - delete[](&Cp(0, 0)); - delete[](&Gc(0, 0)); - - delete[](&W(0, 0)); - delete[](&UMU(0, 0)); - delete[](&UMV(0, 0)); - delete[](&VMV(0, 0)); - #endif - - delete[] cPov; + // Unsafe construction without initializaion. + updated_tdi(orbital_t &Xij_, invmat_t &M_, index_t nelec_, index_t mmax_) + : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), single_hop_alpha(true), + M(M_), U(nelec, mmax * 2), Q(nelec, mmax * 2), + W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax), + Pfa(0.0), PfaRatio(1.0), elem_cfg(Xij_.norb()/2), uplo('U') { } + + // Copy construction could be unsafe since it does not always contain initializaion. + // M is automatically allocated in this case. + updated_tdi(const updated_tdi &tdi_) = delete; + /* + : Xij(tdi_.Xij), nelec(tdi_.nelec), mmax(tdi_.mmax), nq_updated(0), single_hop_alpha(true), + M(new invmat_t(...)), U(nelec, mmax * 2), Q(nelec, mmax * 2), + W(mmax, mmax), UMU(mmax, mmax), UMV(mmax, mmax), VMV(mmax, mmax), + Pfa(0.0), PfaRatio(1.0), elem_cfg(tdi_.Xij.norb()/2, tdi_.get_config()), uplo('U') { + if (elem_cfg.config().size() == nelec && + elem_cfg.config(0) != elem_cfg.config(1) && // Short-circuit complicated check. + is_perm(elem_cfg.config())) + initialize(); + } */ + + bool is_perm(const vmc::config_manager::base_t &cfg) const + { + std::vector occupied(Xij.norb(), false); + for (const auto &xi : cfg) { + if (xi > Xij.norb()) + return false; + if (occupied.at(xi)) + return false; + occupied.at(xi) = true; + } + return true; } - updated_tdi(orbital_mat &Xij_, std::vector &cfg, T *M_, inc_t ldM, - dim_t mmax_) - : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0), - single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4), - #ifdef UseBoost - M(nelec, nelec), - U(nelec, mmax * 2), Q(nelec, mmax * 2), - P(nelec, mmax), W(mmax, mmax), - UMU(mmax, mmax), UMV(mmax, mmax), - VMV(mmax, mmax), Cp(2 * mmax, 2 * mmax), - Gc(2 * mmax, 2 * mmax), - #else - M(M_, ldM), - U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec), - P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax), - UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax), - VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax), - Gc(new T[2 * mmax * 2 * mmax], 2 * mmax), - #endif - cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(cfg), - from_idx(0), to_site(0), uplo(BLIS_UPPER) { - #ifdef UseBoost - colmaj M_tmp(M_, ldM); - for (dim_t j = 0; j < nelec; ++j) - for (dim_t i = 0; i < nelec; ++i) - M(i, j) = M_tmp(i, j); - #endif - initialize(); + amp_t get_amplitude() { return Pfa * get_amplitude_ratio(); } + amp_t get_amplitude_ratio() { + if (PfaRatio == 0.0) + update_pfaffian_ratio(); + return PfaRatio; } + int max_updates() const { return mmax; } + int num_updates() const { return elem_cfg.from_idx().size(); } - /* - updated_tdi(orbital_mat &Xij_, std::vector &cfg, matrix_t &M_, - dim_t mmax_) - : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0), - single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4), - M(M_), - U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec), - P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax), - UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax), - VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax), - Gc(new T[2 * mmax * 2 * mmax], 2 * mmax), - cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(cfg), - from_idx(0), to_site(0), uplo(BLIS_UPPER) { - initialize(); - }*/ + const vmc::config_manager::base_t get_config() const + { return elem_cfg.config(); } - // Unsafe construction without initializaion. - updated_tdi(orbital_mat &Xij_, dim_t nelec_, T *M_, inc_t ldM, dim_t mmax_) - : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), - single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4), - #ifdef UseBoost - M(nelec, nelec), - U(nelec, mmax * 2), Q(nelec, mmax * 2), - P(nelec, mmax), W(mmax, mmax), - UMU(mmax, mmax), UMV(mmax, mmax), - VMV(mmax, mmax), Cp(2 * mmax, 2 * mmax), - Gc(2 * mmax, 2 * mmax), - #else - M(M_, ldM), - U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec), - P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax), - UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax), - VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax), - Gc(new T[2 * mmax * 2 * mmax], 2 * mmax), - #endif - cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(nelec, 0), - from_idx(0), to_site(0), uplo(BLIS_UPPER) { - #ifdef UseBoost - colmaj M_tmp(M_, ldM); - for (dim_t j = 0; j < nelec; ++j) - for (dim_t i = 0; i < nelec; ++i) - M(i, j) = M_tmp(i, j); - #endif - } + const vmc::config_manager::base_t &get_config_base() const + { return elem_cfg.config_base(); } + + const vmc::config_manager &get_config_manager() const + { return elem_cfg; } - T get_Pfa() { return Pfa * PfaRatio; } + void attach_config(const vmc::config_manager::base_t &cfg) + { + assert_(cfg.size() == nelec, typeid(*this).name(), "Invalid config feeded."); + elem_cfg.attach_config(cfg); + } void assemble_C_BMB() { using namespace std; - dim_t k = from_idx.size(); + using namespace Eigen; + index_t k = elem_cfg.from_idx().size(); + if (!k) + return; // Assemble (half of) C+BMB buffer. + Cp = matrix_t(2 * k, 2 * k); switch (uplo) { - case BLIS_UPPER: - for (dim_t j = 0; j < k; ++j) - for (dim_t i = 0; i < j; ++i) - Cp(i, j) = W(i, j) + UMU(i, j); - for (dim_t j = 0; j < k; ++j) { - for (dim_t i = 0; i < k; ++i) - Cp(i, j + k) = +UMV(i, j); - - Cp(j, j + k) -= T(1.0); - } - for (dim_t j = 0; j < k; ++j) - for (dim_t i = 0; i < j; ++i) - Cp(i + k, j + k) = VMV(i, j); + case 'U': + case 'u': + // Left-lower block skipped due to uplo='U'. + Cp << W(seq(0, k-1), seq(0, k-1)) + UMU(seq(0, k-1), seq(0, k-1)), UMV(seq(0, k-1), seq(0, k-1)) - matrix_t::Identity(k, k), + matrix_t::Zero(k, k), VMV(seq(0, k-1), seq(0, k-1)); break; default: @@ -225,12 +202,15 @@ template struct updated_tdi { } // Update Q rows buffer. - bool require_Q(bool all) { - const dim_t k = from_idx.size(); - const dim_t n = nelec; - dim_t k_cal; + bool require_Q(bool require_all) { + using namespace Eigen; + using namespace l2e; + + const index_t k = elem_cfg.from_idx().size(); + const index_t n = nelec; + index_t k_cal; // Require all K columns instead of first K-1. - if (all) + if (require_all) k_cal = k; else k_cal = k - 1; @@ -241,22 +221,12 @@ template struct updated_tdi { for (; nq_updated < k_cal; ++nq_updated) { // Update single column. Use SKMV. - #ifdef UseBoost - skmv(uplo, n, T(1.0), &M(0, 0), M.size1(), &U(0, nq_updated), &Q(0, nq_updated)); - #else - skmv(uplo, n, T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), &Q(0, nq_updated)); - #endif + Q(all, nq_updated) = skmv(uplo, T(1.0), M, U(all, nq_updated)); } /* else { // Update multiple columns. Use SKMM. - #ifdef UseBoost - skmm(BLIS_LEFT, uplo, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, n, k_cal - nq_updated, - T(1.0), &M(0, 0), M.size1(), &U(0, nq_updated), U.size1(), - T(0.0), &Q(0, nq_updated), Q.size1()); - #else - skmm(BLIS_LEFT, uplo, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, n, k_cal - nq_updated, - T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), U.ld, - T(0.0), &Q(0, nq_updated), Q.ld); - #endif + skmm('L', uplo, 'N', 'N', + T(1.0), le_mat_t(M), le_mat_tU(all, seq(nq_updated, k_cal-1)), + T(0.0), le_mat_t(Q(all, seq(nq_updated, k_cal-1)))); } nq_updated = k_cal; */ @@ -266,199 +236,197 @@ template struct updated_tdi { // UMU needs to be recalculated for hopping change. // TODO: Find some way to avoid recalculating UQ? bool require_UMU() { - const dim_t k = from_idx.size(); - const dim_t n = nelec; + using namespace Eigen; - for (dim_t o = 0; o < k; ++o) - for (dim_t l = 0; l < o; ++l) + const index_t k = elem_cfg.from_idx().size(); + const index_t n = nelec; + + for (index_t o = 0; o < k; ++o) + for (index_t l = 0; l < o; ++l) switch (uplo) { - case BLIS_UPPER: - UMU(l, o) = -dot(n, &U(0, o), 1, &Q(0, l), 1); + case 'U': + case 'u': + UMU(l, o) = -(U(all, o).transpose() * Q(all, l))(0, 0); break; - case BLIS_LOWER: + case 'L': + case 'l': // Always assume l < o to calculate one less column of Q. - UMU(o, l) = dot(n, &U(0, o), 1, &Q(0, l), 1); + UMU(o, l) = (U(all, o).transpose() * Q(all, l))(0, 0); + break; + default: break; } return true; } - // Inverts a configuration from fermion index to site index. - void config_to_site(std::vector &cfg, std::vector &site, bool init) { - using namespace std; - - if (site.size() != Xij.nsite || cfg.size() != nelec) { - cerr << "updated_tdi::inv_config:" - << " Invalid config / invert config buffer." << endl; - return; - } - - if (init) - for (dim_t i = 0; i < site.size(); ++i) - site.at(i) = -1; - for (dim_t i = 0; i < cfg.size(); ++i) - site.at(cfg.at(i)) = i; - - return; - } - // Update osi <- os[msj]. // i.e. c+_i c_{x_j}. - void push_update(dim_t osi, dim_t msj, bool compute_pfa) { + void push_update(index_t osi, index_t msj, bool compute_pfa) { using namespace std; + using namespace Eigen; // This is the k-th hopping. - dim_t n = nelec; - dim_t k = from_idx.size(); - dim_t osj = elem_cfg.at(msj); + index_t n = nelec; + index_t k = elem_cfg.from_idx().size(); + index_t osj = elem_cfg.config_base(msj); - // TODO: Check bounds, duplicates, etc. - from_idx.push_back(msj); - to_site.push_back(osi); + elem_cfg.push_update(osi, msj); - // TODO: Check for hopping-backs. + // TODO: Check for hopping-backs. // This can only be handled by cancellation. // Singularity will emerge otherwise. - // Speed-up lookup of Xij. - std::vector sitecfg(Xij.nsite, -1); - config_to_site(elem_cfg, sitecfg, false); - for (dim_t i = 0; i < Xij.nsite; ++i) - // U(i, k) = Xij(elem_cfg.at(i), osi) - Xij(elem_cfg.at(i), osj); - if (sitecfg.at(i) >= 0) - // Direct access: special case when installed into VMC. - U(sitecfg.at(i), k) = Xij.X(i, osi) - Xij.X(i, osj); + // TODO: Speed-up lookup of Xij. + for (index_t i = 0; i < n; ++i) + U(i, k) = Xij(elem_cfg.config_base(i), osi) - Xij(elem_cfg.config_base(i), osj); + // for (index_t i = 0; i < Xij.norb(); ++i) + // if (elem_cfg.inv(i) >= 0) + // U(elem_cfg.inv(i), k) = Xij(i, osi) - Xij(i, osj); + + // P matrix from latter half of Q (or rigorously speaking, B matrix). + submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1))); - skslc(uplo, n, msj, &P(0, k), &M(0, 0), M.ld); + P(all, k) = skslc(uplo, msj, M); // Updated the already logged U. - for (dim_t l = 0; l < k; ++l) { - dim_t msl = from_idx.at(l); + for (index_t l = 0; l < k; ++l) { + index_t msl = elem_cfg.from_idx().at(l); T U_jl_ = U(msj, l); ///< Backup this value before change. // Write updates. - U(msl, k) = Xij(to_site.at(l), osi) - Xij(elem_cfg.at(msl), osj); - U(msj, l) = Xij(osi, to_site.at(l)) - Xij(osj, elem_cfg.at(msl)); + U(msl, k) = Xij(elem_cfg.to_orb().at(l), osi) - Xij(elem_cfg.config_base(msl), osj); + U(msj, l) = Xij(osi, elem_cfg.to_orb().at(l)) - Xij(osj, elem_cfg.config_base(msl)); // Change of Q[:, l] induced by change of U[msj, l]; // Utilize that P[:, k] = M[:, msj] ///< M is skew-symmetric. - axpy(n, U(msj, l) - U_jl_, &P(0, k), 1, &Q(0, l), 1); + Q(all, l) += P(all, k) * (U(msj, l) - U_jl_); // Change of UMV[l, k] induced by change of U[msj, l]. - for (dim_t o = 0; o < k; ++o) + for (index_t o = 0; o < k; ++o) UMV(l, o) += (U(msj, l) - U_jl_) * P(msj, o); } // Update UMV and VMV. UMU is updated elsewhere. switch (uplo) { - case BLIS_UPPER: - for (dim_t l = 0; l < k; ++l) - W(l, k) = -Xij(to_site.at(l), osi) + Xij(elem_cfg.at(from_idx.at(l)), osj); - - for (dim_t l = 0; l < k; ++l) { - UMV(l, k) = dot(n, &U(0, l), 1, &P(0, k), 1); - UMV(k, l) = dot(n, &U(0, k), 1, &P(0, l), 1); + case 'U': + case 'u': + for (index_t l = 0; l < k; ++l) + W(l, k) = -Xij(elem_cfg.to_orb(l), osi) + Xij(elem_cfg.config_base(elem_cfg.from_idx(l)), osj); + + for (index_t l = 0; l < k; ++l) { + UMV(l, k) = (U(all, l).transpose() * P(all, k))(0, 0); + UMV(k, l) = (U(all, k).transpose() * P(all, l))(0, 0); } - UMV(k, k) = dot(n, &U(0, k), 1, &P(0, k), 1); - for (dim_t l = 0; l < k; ++l) + UMV(k, k) = (U(all, k).transpose() * P(all, k))(0, 0); + for (index_t l = 0; l < k; ++l) VMV(l, k) /* -VMV(k, l) */ = -P(msj, l); ///< P(from_idx.at(l), k); break; default: - cerr << "updated_tdi::push_update:" + cerr << "updated_tdi::push_update:" << " Only upper-triangular storage is supported." << endl; } if (compute_pfa) { - // NOTE: Update k to be new size. - k += 1; - // Allocate scratchpad. - dim_t lwork = 2 * k * npanel_sub; - T *pfwork = new T[lwork]; + // NOTE: Update k to be new size? + // k += 1; + + update_pfaffian_ratio(); + } else + // Set to 0.0 to denote dirty. + PfaRatio = 0.0; + } - // Calculate unupdated columns of U. + void update_pfaffian_ratio(void) + { + index_t k = elem_cfg.from_idx().size(); + submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1))); + { + // All compute_pfa requires first K-1 of Q. require_Q(false); + // Recompute UMU. require_UMU(); - // Assemble (half of) C+BMB buffer. + // Reassemble C and scratchpads. assemble_C_BMB(); // If it's the first update Pfafian can be directly read out. if (k == 1) { PfaRatio = -UMV(0, 0) + T(1.0); - delete[] pfwork; return; } // Compute pfaffian. - #ifdef UseBoost - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov, - false, &PfaRatio, pfwork, lwork); - #else - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov, - false, &PfaRatio, pfwork, lwork); - #endif -#ifdef _DEBUG - cerr << "SKPFA: info=" << info << endl; -#endif + l2e::le_mat_t Cp_(Cp); + Gc = matrix_t::Zero(2*k, 2*k); + cPiv = idxvec_t::Zero(2*k); + // Use not skpfa for mixed precision. + signed info = sktrf(uplo, 'P', Cp_, &cPiv(0), &Gc(0, 0), Gc.size()); + PfaRatio = ltl2pfa(uplo, Cp, cPiv); + // Pfaffian of C = [ W -I; I 0 ]. PfaRatio *= pow(-1.0, k * (k + 1) / 2); + } + } - delete[] pfwork; - } else - // Set to 0.0 to denote dirty. - PfaRatio = 0.0; + bool check_update_safety(const std::vector &ms, bool merge_error = false) + { + if (elem_cfg.from_idx().size() + ms.size() > mmax) + return false; + + // Check if alrady hopped out. + for (index_t l = 0; l < elem_cfg.from_idx().size(); ++l) + for (index_t j = 0; j < ms.size(); ++j) + if (ms.at(j) == elem_cfg.from_idx().at(l)) { + assert_(!merge_error, typeid(*this).name(), "Conflict on non-mergable push."); + return false; + } + return true; } - void push_update(dim_t osi, dim_t msj) { push_update(osi, msj, true); } + void push_update(index_t osi, index_t msj) { push_update(osi, msj, true); } // Check duplicate and push. - void push_update_safe(dim_t osi, dim_t msj, bool compute_pfa) { - if (from_idx.size() >= mmax) + void push_update_safe(index_t osi, index_t msj, bool compute_pfa, bool merge_error = false) { + if (!check_update_safety({ msj })) merge_updates(); - // Check if already hopped out. - for (dim_t l = 0; l < from_idx.size(); ++l) - if (msj == from_idx.at(l)) { - merge_updates(); - break; - } push_update(osi, msj, compute_pfa); } - void push_update_safe(dim_t osi, dim_t msj) { + void push_update_safe(index_t osi, index_t msj) { push_update_safe(osi, msj, true); } void pop_update(bool compute_pfa) { using namespace std; + using namespace Eigen; // Pop out. - dim_t msj_ = from_idx.at(from_idx.size() - 1); - dim_t osi_ = to_site.at(to_site.size() - 1); - dim_t osj_ = elem_cfg.at(msj_); - from_idx.pop_back(); - to_site.pop_back(); + index_t msj_ = elem_cfg.from_idx().back(); + index_t osi_ = elem_cfg.to_orb().back(); + index_t osj_ = elem_cfg.config_base(msj_); + elem_cfg.pop_update(); // New update size. - dim_t k = from_idx.size(); + index_t k = elem_cfg.from_idx().size(); + submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1))); // Revert U[:, 1:k-1] update. - for (dim_t l = 0; l < k; ++l) { - dim_t msl = from_idx.at(l); + for (index_t l = 0; l < k; ++l) { + index_t msl = elem_cfg.from_idx(l); T U_jl_ = U(msj_, l); // Revert U rows. (final column is popped out). - U(msj_, l) = Xij(osj_, to_site.at(l)) - Xij(osj_, elem_cfg.at(msl)); + U(msj_, l) = Xij(osj_, elem_cfg.to_orb(l)) - Xij(osj_, elem_cfg.config_base(msl)); // Change of Q[:, l] induced by change of U[msj, l]; T U_diff_msj = U(msj_, l) - U_jl_; switch (uplo) { - case BLIS_UPPER: - for (dim_t i = 0; i < msj_; ++i) - Q(i, l) += M(i, msj_) * U_diff_msj; - for (dim_t i = msj_ + 1; i < nelec; ++i) - Q(i, l) -= M(msj_, i) * U_diff_msj; + case 'U': + case 'u': + Q(seq(0, msj_-1), l) += M(seq(0, msj_-1), msj_) * U_diff_msj; + Q(seq(msj_+1, nelec-1), l) -= M(msj_, seq(msj_+1, nelec-1)) * U_diff_msj; break; default: @@ -467,7 +435,7 @@ template struct updated_tdi { } // Change of UMV[l, k] induced by change of U[msj, l]. - for (dim_t o = 0; o < k; ++o) + for (index_t o = 0; o < k; ++o) UMV(l, o) += (U(msj_, l) - U_jl_) * P(msj_, o); } // Pop out outdated Q. @@ -475,27 +443,11 @@ template struct updated_tdi { nq_updated = k; // Compute new (previous, in fact) Pfaffian. - if (compute_pfa) { - // All compute_pfa requires first K-1 of Q. - require_Q(false); - - // Recompute UMU. - require_UMU(); - // Reassemble C and scratchpads. - assemble_C_BMB(); - dim_t lwork = 2 * k * npanel_sub; - T *pfwork = new T[lwork]; - - #ifdef UseBoost - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov, - false, &PfaRatio, pfwork, lwork); - #else - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov, - false, &PfaRatio, pfwork, lwork); - #endif - PfaRatio *= pow(-1.0, k * (k + 1) / 2); + if (k == 0) + PfaRatio = 1.0; + else if (compute_pfa) { + update_pfaffian_ratio(); - delete[] pfwork; } else // Set to 0.0 to denote dirty. PfaRatio = 0.0; @@ -504,128 +456,79 @@ template struct updated_tdi { void merge_updates() { using namespace std; - dim_t n = nelec; - dim_t k = from_idx.size(); + index_t n = nelec; + index_t k = elem_cfg.from_idx().size(); if (k == 0) return; // Allocate scratchpad. - dim_t lwork = 2 * k * npanel_sub; - T *pfwork = new T[lwork]; - + vector_t vT(2 * k - 1); + // Update whole Q. require_Q(true); - // If M is dirty, redo the tridiagonal factorization. - if (PfaRatio == T(0.0)) { - require_UMU(); - assemble_C_BMB(); - #ifdef UseBoost - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov, - false, &PfaRatio, pfwork, lwork); - #else - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov, - false, &PfaRatio, pfwork, lwork); - #endif - PfaRatio *= pow(-1.0, k * (k + 1) / 2); - } - if (k == 1) { + // M is dirty. + if (PfaRatio == 0.0) { + require_UMU(); + assemble_C_BMB(); + PfaRatio = -Cp(0, 1); + } // Trivial inverse. Cp(0, 1) = T(-1.0) / Cp(0, 1); Cp(1, 0) = T(-1.0) / Cp(1, 0); - } else - #ifdef UseBoost - signed info = sktdi(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov, - pfwork, lwork); - #else - signed info = sktdi(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov, - pfwork, lwork); - #endif - inv_update(k, Cp); + } else { + // Redo the tridiagonal factorization. + require_UMU(); + assemble_C_BMB(); + + l2e::le_mat_t Cp_(Cp); + Gc = matrix_t::Zero(2*k, 2*k); + cPiv = idxvec_t::Zero(2*k); + signed info = sktrf(uplo, 'N', Cp_, &cPiv(0), &Gc(0, 0), 2*k * 2*k); + // PfaRatio is dirty. + if (PfaRatio == 0.0) + PfaRatio = ltl2pfa(uplo, Cp, cPiv) * + T(pow(-1.0, k * (k + 1) / 2)); + ltl2inv(uplo, Cp, cPiv, vT, Gc); + } + assert_(Cp.rows() == 2 * k, typeid(*this).name(), "Cp is bad."); + inv_update(Cp); // Apply hopping. - for (int j = 0; j < k; ++j) - elem_cfg.at(from_idx.at(j)) = to_site.at(j); - from_idx.clear(); - to_site.clear(); + elem_cfg.merge_config(); nq_updated = 0; Pfa *= PfaRatio; PfaRatio = 1.0; - - delete[] pfwork; } - void inv_update(dim_t k, matrix_t &C) { - #ifdef UseBoost - matrix_t &ABC = U; ///< Use U as inv(A)*B*upper(C) buffer. - matrix_t &AB = Q; - #else - matrix_t ABC(&U(0, 0), U.ld); ///< Use U as inv(A)*B*upper(C) buffer. - matrix_t AB(&Q(0, 0), Q.ld); - #endif + void inv_update(matrix_t &C) { + using namespace Eigen; + const index_t k = C.rows() / 2; + submat_t P(Q(Eigen::all, Eigen::seq(mmax, mmax * 2 - 1))); + + auto ABC = U(all, seq(0, 2 * k - 1)); ///< Use U as inv(A)*B*upper(C) buffer. + auto AB = Q(all, seq(0, 2 * k - 1)); if (k == 1 && single_hop_alpha) { // k == 1 requires no copying - #ifdef UseBoost - skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 1, C(0, 1), &Q(0, 0), Q.size1(), - &P(0, 0), P.size1(), T(1.0), &M(0, 0), M.size1()); - #else - skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 1, C(0, 1), &Q(0, 0), Q.ld, - &P(0, 0), P.ld, T(1.0), &M(0, 0), M.ld); - #endif + skr2k(uplo, 'N', C(0, 1), Q(all, 0), P(all, 0), T(1.0), M); // See below for reason of calling this procedule. // update_uplo(uplo); return; } // Close empty space between Q and P. - #ifndef UseBoost if (k != mmax) - #endif - for (dim_t j = 0; j < k; ++j) - memcpy(&AB(0, k + j), &P(0, j), nelec * sizeof(T)); - - # if 0 - // Copy AB to ABC for TRMM interface. - for (dim_t j = 0; j < 2 * k - 1; ++j) - // inv(A)*U [ 0 + + + - // 0 0 + + - // 0 0 0 + - // 0 0 0 0 ] => AB[:, 0:2] -> ABC[:, 1:3] - memcpy(&ABC(0, j + 1), &AB(0, j), nelec * sizeof(T)); - #ifdef UseBoost - trmm(BLIS_RIGHT, BLIS_UPPER, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), - &C(0, 1), C.size1(), &ABC(0, 1), ABC.size1()); - #else - trmm(BLIS_RIGHT, BLIS_UPPER, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), - &C(0, 1), C.ld, &ABC(0, 1), ABC.ld); - #endif - - // Update: write to M. - #ifdef UseBoost - skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &ABC(0, 1), ABC.size1(), - &AB(0, 1), AB.size1(), T(1.0), &M(0, 0), M.size1()); - #else - skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &ABC(0, 1), ABC.ld, - &AB(0, 1), AB.ld, T(1.0), &M(0, 0), M.ld); - #endif - #else - gemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, - nelec, 2 * k, 2 * k, - T(1.0), - &AB(), AB.ld, - &C(), C.ld, - T(0.0), - &ABC(), ABC.ld); - gemmt(uplo, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, - nelec, 2 * k, - T(1.0), - &ABC(), ABC.ld, - &AB(), AB.ld, - T(1.0), - &M(), M.ld); - #endif + for (index_t j = 0; j < k; ++j) + AB(all, k + j) = P(all, j); + + l2e::le_mat_t M_(M); + ABC = AB * C; + gemmt(uplo, 'N', 'T', T(1.0), + l2e::le_mat_t(ABC), + l2e::le_mat_t(AB), + T(1.0), M_); // Identity update to complete antisymmetric matrix. // This called due to lack of skmm support at the moment. @@ -635,15 +538,20 @@ template struct updated_tdi { /** * Complete antisymmetric matrix. */ - void skcomplete(uplo_t uplo_, dim_t n, matrix_t &A) { - for (dim_t j = 0; j < n; ++j) { - for (dim_t i = 0; i < j; ++i) { + template + static void skcomplete(const char &uplo_, Mat &A) { + const index_t n = A.rows(); + + for (index_t j = 0; j < n; ++j) { + for (index_t i = 0; i < j; ++i) { switch (uplo_) { - case BLIS_UPPER: + case 'U': + case 'u': A(j, i) = -A(i, j); break; - case BLIS_LOWER: + case 'L': + case 'l': A(i, j) = -A(j, i); break; @@ -655,15 +563,160 @@ template struct updated_tdi { } } - void update_uplo(uplo_t uplo_new) { - skcomplete(uplo /* NOTE: old uplo */, nelec, M); - if (from_idx.size()) { - skcomplete(uplo, from_idx.size(), UMU); - skcomplete(uplo, from_idx.size(), VMV); - skcomplete(uplo, from_idx.size(), W); + void update_uplo(const char &uplo_new) { + using namespace Eigen; + + skcomplete(uplo /* NOTE: Old uplo. */, M); + const int k = elem_cfg.from_idx().size(); + if (k) { + skcomplete(uplo, UMU(seq(0, k-1), seq(0, k-1))); + skcomplete(uplo, VMV(seq(0, k-1), seq(0, k-1))); + skcomplete(uplo, W(seq(0, k-1), seq(0, k-1))); } uplo = uplo_new; } -}; + Eigen::Vector batch_query_amplitudes + (int N, const Eigen::Map &to_orbs, const Eigen::Map &from_ids) { + using namespace Eigen; + using ampvec_t = Eigen::Vector; + + bool upper = (uplo == 'U' || uplo == 'u'); + int k0 = from_ids.size(); + int k1 = k0 / N; + assert_(k0 % N == 0, typeid(*this).name(), "Misaligned batch query."); + ampvec_t result(k1); + + // Works w/ merged M only. + merge_updates(); + // Operates on complete M. + skcomplete(uplo, M); + + matrix_t UbB; + if (N > 1) + UbB = matrix_t::Zero(nelec, k1 * (N-1)); + matrix_t UbF= matrix_t::Zero(nelec, k1); + matrix_t Pb = matrix_t::Zero(nelec, k0); + + // Grouped comput. of Ub. + for (index_t l1 = 0; l1 < k1; ++l1) { + for (index_t l = l1 * N; l < (l1 + 1) * N; ++l) + elem_cfg.push_update(to_orbs(l), from_ids(l)); + + int osi, osj; + + for (index_t l2 = 0; l2 < N-1; ++l2) { + index_t l = l1 * N + l2; + index_t lB = l1 * (N-1) + l2; + + osi = to_orbs(l); + osj = elem_cfg.config_base(from_ids(l)); + + for (index_t i = 0; i < nelec; ++i) { + index_t osx = i == from_ids(l) ? elem_cfg.config_base(i) : elem_cfg.config(i); + UbB(i, lB) = Xij(osx, osi) - Xij(elem_cfg.config_base(i), osj); + } + } + + // Final column in the grouped U. + { + index_t l = l1 * N + N - 1; + osi = to_orbs(l); + osj = elem_cfg.config_base(from_ids(l)); + + for (index_t i = 0; i < nelec; ++i) { + index_t osx = i == from_ids(l) ? elem_cfg.config_base(i) : elem_cfg.config(i); + UbF(i, l1) = Xij(osx, osi) - Xij(elem_cfg.config_base(i), osj); + } + } + + for (index_t ll = 0; ll < N; ++ll) + elem_cfg.pop_update(); + } + + // Pb needs no grouping. + for (index_t l = 0; l < k0; ++l) + Pb(all, l) = M(all, from_ids(l)); + + matrix_t QbB; + if (N > 1) + // Require Q. + QbB = M * UbB; + + // Batch assemble C. + matrix_t Cb = matrix_t::Zero(2*N, 2*N * k1); + for (index_t l1 = 0; l1 < k1; ++l1) { + auto CCur = Cb(all, seq(2*N * l1, 2*N * (l1+1) - 1)); + auto UCur = UbB(all, seq((N-1) * l1, (N-1) * (l1+1) - 1)); + auto QCur = QbB(all, seq((N-1) * l1, (N-1) * (l1+1) - 1)); + auto PCur = Pb(all, seq(N * l1, N * (l1+1) - 1)); + + // UMU + W. + for (index_t o = 0; o < N-1; ++o) + for (index_t l = 0; l < o; ++l) + if (upper) + CCur(l, o) = -(UCur(all, o).transpose() * QCur(all, l))(0,0) - + Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) + + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), + elem_cfg.config_base(from_ids(l1 * N + o))); + else + CCur(o, l) = (UCur(all, o).transpose() * QCur(all, l))(0,0) - + Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + o)) + + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), + elem_cfg.config_base(from_ids(l1 * N + o))); + { + for (index_t l = 0; l < N-1; ++l) + if (upper) + CCur(l, N-1) = -(UbF(all, l1).transpose() * QCur(all, l))(0,0) - + Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) + + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), + elem_cfg.config_base(from_ids(l1 * N + N-1))); + else + CCur(N-1, l) = (UbF(all, l1).transpose() * QCur(all, l))(0,0) - + Xij(to_orbs(l1 * N + l), to_orbs(l1 * N + N-1)) + + Xij(elem_cfg.config_base(from_ids(l1 * N + l)), + elem_cfg.config_base(from_ids(l1 * N + N-1))); + } + + // UMV + if (upper) { + if (N > 1) + CCur(seq(0, N - 2), seq(N, 2*N - 1)) = UCur.transpose() * PCur; + CCur(N - 1, seq(N, 2*N - 1)) = UbF(all, l1).transpose() * PCur; + } else + assert_(false, typeid(*this).name(), "Lower diag not implemented."); + // -I + CCur(seq(0, N - 1), seq(N, 2*N - 1)) -= matrix_t::Identity(N, N); + + // VMV + for (index_t o = 0; o < N; ++o) + for (index_t l = 0; l < o; ++l) + if (upper) + CCur(N + l, N + o) = -PCur(from_ids(l1 * N + o), l); + else + CCur(N + o, N + l) = PCur(from_ids(l1 * N + o), l); + } + + // Batch Pfaffian call. + matrix_t Gc = matrix_t::Zero(2*N, 2*N); + idxvec_t cPiv = idxvec_t::Zero(2*N); + for (index_t l1 = 0; l1 < k1; ++l1) { + auto CCur = Cb(all, seq(2*N * l1, 2*N * (l1+1) - 1)); + if (N == 1) { + result(l1) = -CCur(0, 1) + T(1.0); + continue; + } + l2e::le_mat_t C_(CCur); + signed info = sktrf(uplo, 'P', C_, &cPiv(0), &Gc(0, 0), Gc.size()); + result(l1) = ltl2pfa(uplo, CCur, cPiv); + } + if (N > 1) + result = result * pow(-1.0, N * (N + 1) / 2); + + return result; + } + +}; +} +} diff --git a/tool/CMakeLists.txt b/tool/CMakeLists.txt index ae13a4f8..4c225b44 100644 --- a/tool/CMakeLists.txt +++ b/tool/CMakeLists.txt @@ -6,9 +6,8 @@ if(${CMAKE_PROJECT_NAME} STREQUAL "Project") message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.") endif(${CMAKE_PROJECT_NAME} STREQUAL "Project") -add_library(key2lower STATIC key2lower.c) add_executable(greenr2k greenr2k.F90) -target_link_libraries(greenr2k key2lower ${LAPACK_LIBRARIES}) +target_link_libraries(greenr2k ${LAPACK_LIBRARIES}) install(TARGETS greenr2k RUNTIME DESTINATION bin) # diff --git a/tool/greenr2k.F90 b/tool/greenr2k.F90 index eba207f0..1472978f 100644 --- a/tool/greenr2k.F90 +++ b/tool/greenr2k.F90 @@ -75,15 +75,21 @@ MODULE fourier_routine ! IMPLICIT NONE ! - INTERFACE - SUBROUTINE key2lower(key) BIND(c) - USE,INTRINSIC :: iso_c_binding - CHARACTER(KIND=C_CHAR) :: key(*) - END SUBROUTINE key2lower - END INTERFACE - ! CONTAINS ! +SUBROUTINE key2lower(key) + CHARACTER(*) :: key + ! + INTEGER :: ii, acode + ! + DO ii = 1, LEN(TRIM(key)) + acode = IACHAR(key(ii:ii)) + IF(65 <= acode .AND. acode <= 90) THEN + key(ii:ii) = ACHAR(acode + 32) + END IF + END DO +END SUBROUTINE key2lower +! ! Read from HPhi/mVMC input files ! SUBROUTINE read_filename() @@ -461,10 +467,10 @@ END SUBROUTINE set_kpoints SUBROUTINE read_corrindx() ! USE fourier_val, ONLY : file_one, file_two, ncor1, ncor2, ncor, indx, & - & calctype, nr, rindx, orb, norb + & calctype, nr, rindx, orb, norb, irv IMPLICIT NONE ! - INTEGER :: fi = 10, itmp(8), icor + INTEGER :: fi = 10, itmp(8), icor, ir, iorb, jorb CHARACTER(100) :: ctmp ! WRITE(*,*) @@ -489,7 +495,14 @@ SUBROUTINE read_corrindx() ! ncor(1:2) = 0 DO icor = 1, ncor1 + ! READ(fi,*) itmp(1:4) + ! + IF(ANY(irv(1:3,1,rindx(itmp(1)+1)) /= 0)) THEN + WRITE(*,'(a,i0,a)') " REMARK : The left operator at # ", icor, " is not at R=0." + CYCLE + END IF + ! IF(itmp(2) == 0 .AND. itmp(4) == 0) THEN ! ! Up-Up correlation @@ -527,6 +540,11 @@ SUBROUTINE read_corrindx() READ(fi,*) itmp(1:8) ! IF(itmp(1) == itmp(3) .AND. itmp(5) == itmp(7)) THEN + ! + IF(ANY(irv(1:3,1,rindx(itmp(1)+1)) /= 0)) THEN + WRITE(*,'(a,i0,a)') " REMARK : The left operator at # ", icor, " is not at R=0." + CYCLE + END IF ! IF(itmp(2) == 0 .AND. itmp(4) == 0) THEN ! @@ -577,6 +595,11 @@ SUBROUTINE read_corrindx() END IF ! IF(calctype == 4 .AND. (itmp(1) == itmp(7) .AND. itmp(3) == itmp(5))) THEN + ! + IF(ANY(irv(1:3,1,rindx(itmp(1)+1)) /= 0)) THEN + WRITE(*,'(a,i0,a)') " REMARK : The left operator at # ", icor, " is not at R=0." + CYCLE + END IF ! ! S+S- & S-S+ for mVMC ! @@ -596,6 +619,35 @@ SUBROUTINE read_corrindx() ! END DO ! + IF(COUNT(indx(1:nr,3:8,1:norb,1:norb) == 0) /= 0) THEN + WRITE(*,*) "ERROR! The following correlation function is missed:" + WRITE(*,*) "R, kind, orb1, orb2" + DO icor = 3, 8 + DO ir = 1, nr + DO iorb = 1, norb + DO jorb = 1, norb + IF(indx(ir,icor,iorb,jorb) == 0) THEN + IF(icor == 3) THEN + WRITE(*,'(i0,a,i0,2x,i0)') ir, " Up-Up-Up-Up ", iorb, jorb + ELSE IF(icor == 4) THEN + WRITE(*,'(i0,a,i0,2x,i0)') ir, " Up-Up-Down-Down ", iorb, jorb + ELSE IF(icor == 5) THEN + WRITE(*,'(i0,a,i0,2x,i0)') ir, " Down-Down-Up-Up ", iorb, jorb + ELSE IF(icor == 6) THEN + WRITE(*,'(i0,a,i0,2x,i0)') ir, " Down-Down-Down-Down ", iorb, jorb + ELSE IF(icor == 7) THEN + WRITE(*,'(i0,a,i0,2x,i0)') ir, " Up-Down-Down-Up ", iorb, jorb + ELSE + WRITE(*,'(i0,a,i0,2x,i0)') ir, " Down-Up-Up-Down ", iorb, jorb + END IF + END IF + END DO + END DO + END DO + END DO + STOP "Missing indices for the Green function." + END IF + ! WRITE(*,*) " Number of UpUpUpUp Index : ", COUNT(indx(1:nr, 3, 1:norb, 1:norb) /= 0) WRITE(*,*) " Number of UpUpDownDown Index : ", COUNT(indx(1:nr, 4, 1:norb, 1:norb) /= 0) WRITE(*,*) " Number of DownDownUpUp Index : ", COUNT(indx(1:nr, 5, 1:norb, 1:norb) /= 0) @@ -861,7 +913,13 @@ SUBROUTINE output_cor() END DO ! DO ik = 1, ikk - WRITE(fo,'(1000e15.5)') xk(ik), cor_ave(ik,1:6, 1:norb, 1:norb), cor_err(ik,1:6, 1:norb, 1:norb) + WRITE(fo,'(e15.5)',advance="no") xk(ik) + DO iorb = 1, norb + DO jorb = 1, norb + WRITE(fo,'(24e15.5)',advance="no") cor_ave(ik,1:6, jorb, iorb), cor_err(ik,1:6, jorb, iorb) + END DO + END DO + WRITE(fo,*) END DO ! CLOSE(fo) diff --git a/tool/key2lower.c b/tool/key2lower.c deleted file mode 100644 index f416eade..00000000 --- a/tool/key2lower.c +++ /dev/null @@ -1,7 +0,0 @@ -#include -#include - -void key2lower(char *key){ - unsigned int ii; - for (ii = 0; ii < strlen(key); ii++) key[ii] = tolower(key[ii]); -} diff --git a/tool/makefile_tool b/tool/makefile_tool index 1d2a177b..e89c7a9a 100644 --- a/tool/makefile_tool +++ b/tool/makefile_tool @@ -4,10 +4,10 @@ include ../src/make.sys .SUFFIXES : .o .F90 .SUFFIXES : .o .c -all:greenr2k +all: greenr2k -greenr2k:greenr2k.o key2lower.o - $(F90) greenr2k.o key2lower.o $(LIBS) -o $@ +greenr2k: greenr2k.o + $(F90) greenr2k.o $(LIBS) -o $@ .F90.o: $(F90) -c $< $(FFLAGS)