diff --git a/CMakeLists.txt b/CMakeLists.txt index a7ff612..ab1b3f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,11 @@ cmake_minimum_required (VERSION 3.5) project (qcint C) -set(qcint_VERSION_MAJOR "5") -set(qcint_VERSION_MINOR "4") +set(qcint_VERSION_MAJOR "6") +set(qcint_VERSION_MINOR "0") set(qcint_VERSION_PATCH "0") set(qcint_VERSION_TWEAK "0") set(qcint_VERSION "${qcint_VERSION_MAJOR}.${qcint_VERSION_MINOR}.${qcint_VERSION_PATCH}") -set(qcint_SOVERSION "${qcint_VERSION_MAJOR}") +set(cint_SOVERSION "${qcint_VERSION_MAJOR}") #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O2 -DNDEBUG") if ("${CMAKE_BUILD_TYPE}" STREQUAL "") @@ -71,7 +71,7 @@ set(cintSrc src/g1e.c src/g2e.c src/g2e_simd1.c src/g3c1e.c src/g3c2e.c src/gout2e.c src/misc.c src/optimizer.c src/fmt.c src/rys_wheeler.c src/eigh.c src/rys_roots.c src/find_roots.c - src/polyfits.c src/sr_rys_polyfits.c + src/polyfits.c src/cint1e_a.c src/cint3c1e_a.c src/cint1e_grids.c src/g1e_grids.c src/autocode/breit1.c src/autocode/dkb.c src/autocode/gaunt1.c @@ -84,17 +84,17 @@ set(cintSrc if(WITH_RANGE_COULOMB) # defined in config.h # add_definitions(-DWITH_RANGE_COULOMB) - message("Enabled WITH_RANGE_COULOMB") - if(WITH_POLYNOMIAL_FIT) - add_definitions(-DWITH_POLYNOMIAL_FIT) - message("Enabled WITH_POLYNOMIAL_FIT") - endif(WITH_POLYNOMIAL_FIT) +# message("Enabled WITH_RANGE_COULOMB") endif(WITH_RANGE_COULOMB) +if(WITH_POLYNOMIAL_FIT) + set(cintSrc ${cintSrc} src/sr_rys_polyfits.c) + add_definitions(-DWITH_POLYNOMIAL_FIT) + message("Enabled WITH_POLYNOMIAL_FIT") +endif(WITH_POLYNOMIAL_FIT) + if(WITH_COULOMB_ERF) - set(cintSrc ${cintSrc} src/g2e_coulerf.c src/cint2e_coulerf.c) - add_definitions(-DWITH_COULOMB_ERF) - message("Enabled WITH_COULOMB_ERF") + message("WITH_COULOMB_ERF is deprecated since v6.0") endif(WITH_COULOMB_ERF) if(WITH_F12) @@ -103,14 +103,6 @@ if(WITH_F12) message("Enabled WITH_F12") endif(WITH_F12) -if(WITH_GTG) - set(cintSrc ${cintSrc} src/g2e_gtg.c src/cint2e_gtg.c src/cint3c2e_gtg.c 
- src/cint2c2e_gtg.c) - add_definitions(-DWITH_GTG) - message("Enabled WITH_GTG") - message("Enabled WITH_GTG. Note there are bugs in gtg type integrals") -endif(WITH_GTG) - if(PYPZPX) add_definitions(-DPYPZPX) message("P orbitals convention (py, pz, px)") diff --git a/include/cint.h.in b/include/cint.h.in index 7eeabd9..2c4c918 100644 --- a/include/cint.h.in +++ b/include/cint.h.in @@ -22,7 +22,8 @@ * Parameters and function prototypes for libcint. */ -#define QCINT_VERSION @qcint_VERSION@ +#define QCINT_VERSION @qcint_VERSION@ +#define CINT_SOVERSION @cint_SOVERSION@ #cmakedefine CACHE_SIZE_I8 #ifdef CACHE_SIZE_I8 @@ -203,7 +204,7 @@ typedef struct { union {int nfk; int grids_offset;}; union {int nfl; int ngrids;}; int nf; // = nfi*nfj*nfk*nfl; - int _padding; + int rys_order; // = nrys_roots for regular ERIs. can be nrys_roots/2 for SR ERIs int x_ctr[4]; int gbits; diff --git a/src/cint2c2e.c b/src/cint2c2e.c index a005aab..6cf98aa 100644 --- a/src/cint2c2e.c +++ b/src/cint2c2e.c @@ -21,8 +21,8 @@ #include #include #include -#include #include +#include #include "cint_bas.h" #include "misc.h" #include "g2e.h" @@ -65,15 +65,38 @@ (*(fp2c[i]))(gctr[it], gprim[i], coeff[it]+im, \ ngp[it], x_prim[it], x_ctr[it], \ non0ctr[it][im], non0idx[it]+im*x_ctr[it]); \ + empty_overall = 0; \ + } \ + cum = 0; \ + np2c = 0; + +#define POP_PRIM2CTR_AND_SET0 \ + for (i = 0; i < np2c; i++) { \ + it = shltyp[i]; \ + if (it != SHLTYPi) { \ + im = iprim[i]; \ + (*(fp2c[i]))(gctr[it], gprim[i], coeff[it]+im, \ + ngp[it], x_prim[it], x_ctr[it], \ + non0ctr[it][im], non0idx[it]+im*x_ctr[it]); \ + empty_overall = 0; \ + } else if (fp2c[i] == CINTiprim_to_ctr_0) { \ + double *pout = gctr[it]; \ + for (int k = 0; k < nf; k++) { \ + pout[k] = 0.; \ + } \ + } \ } \ cum = 0; \ np2c = 0; #define PUSH \ if (cum == SIMDD) { \ - (*envs->f_g0_2e)(g, cutoff, &bc, envs, cum); \ - (*envs->f_gout)(gout, g, idx, envs); \ - POP_PRIM2CTR; \ + if ((*envs->f_g0_2e)(g, cutoff, &bc, envs, cum)) { \ +
(*envs->f_gout)(gout, g, idx, envs); \ + POP_PRIM2CTR; \ + } else { \ + POP_PRIM2CTR_AND_SET0; \ + } \ } \ envs->ai[cum] = ai[ip]; \ envs->ak[cum] = ak[kp]; \ @@ -111,26 +134,30 @@ #define RUN_REST \ if (cum == 1) { \ - (*envs->f_g0_2e_simd1)(g, cutoff, &bc, envs, 0); \ - (*envs->f_gout_simd1)(gout, g, idx, envs); \ + if ((*envs->f_g0_2e_simd1)(g, cutoff, &bc, envs, 0)) { \ + (*envs->f_gout_simd1)(gout, g, idx, envs); \ + POP_PRIM2CTR; \ + } else { \ + POP_PRIM2CTR_AND_SET0; \ + } \ } else if (cum > 1) { \ - r1 = MM_SET1(1.); \ - for (i = 0; i < envs->nrys_roots; i++) { \ - MM_STORE(bc.u+i*SIMDD, r1); \ - MM_STORE(bc.w+i*SIMDD, r1); \ + if ((*envs->f_g0_2e)(g, cutoff, &bc, envs, cum)) { \ + (*envs->f_gout)(gout, g, idx, envs); \ + POP_PRIM2CTR; \ + } else { \ + POP_PRIM2CTR_AND_SET0; \ } \ - (*envs->f_g0_2e)(g, cutoff, &bc, envs, cum); \ - (*envs->f_gout)(gout, g, idx, envs); \ - } \ - POP_PRIM2CTR; + } else { \ + assert(np2c == 0); \ + } #define TRANSPOSE(a) \ if (*empty) { \ CINTdmat_transpose(out, a, nf*nc, n_comp); \ + *empty = 0; \ } else { \ CINTdplus_transpose(out, a, nf*nc, n_comp); \ - } \ - *empty = 0; + } int CINT2c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty) { @@ -158,6 +185,7 @@ int CINT2c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty int _empty[2] = {1, 1}; int *iempty = _empty + 0; int *kempty = _empty + 1; + int empty_overall = 1; int ngp[2]; ngp[0] = nf * n_comp; ngp[1] = ngp[0] * i_ctr; @@ -171,7 +199,7 @@ int CINT2c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty if (n_comp == 1) { MALLOC_INSTACK(g1, lenk); gctr[SHLTYPk] = out; - kempty = empty; + *kempty = *empty; } else { MALLOC_INSTACK(gctr[SHLTYPk], lenk); g1 = out; @@ -195,8 +223,7 @@ int CINT2c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty CINTOpt_non0coeff_byshell(non0idxk, non0ctrk, coeff[1], k_prim, k_ctr); int *non0ctr[2] = {non0ctri, non0ctrk}; int *non0idx[2] = {non0idxi, non0idxk}; - 
double common_factor = envs->common_factor * (M_PI*M_PI*M_PI)*2/SQRTPI - * CINTcommon_fac_sp(envs->i_l) * CINTcommon_fac_sp(envs->k_l); + double common_factor = envs->common_factor; INITSIMD; for (kp = 0; kp < k_prim; kp++) { @@ -215,11 +242,12 @@ int CINT2c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty } // end loop k_prim RUN_REST; - if (n_comp > 1 && !*kempty) { + if (n_comp > 1 && !empty_overall) { int nc = i_ctr * k_ctr; TRANSPOSE(gctr[SHLTYPk]); } - return !*empty; + *empty &= empty_overall; + return !empty_overall; } @@ -249,6 +277,7 @@ int CINT2c2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) int _empty[2] = {1, 1}; int *iempty = _empty + 0; int *kempty = _empty + 1; + int empty_overall = 1; int ngp[2]; ngp[0] = nf * n_comp; ngp[1] = ngp[0] * i_ctr; @@ -262,7 +291,7 @@ int CINT2c2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) if (n_comp == 1) { MALLOC_INSTACK(g1, lenk); gctr[SHLTYPk] = out; - kempty = empty; + *kempty = *empty; } else { MALLOC_INSTACK(gctr[SHLTYPk], lenk); g1 = out; @@ -273,8 +302,7 @@ int CINT2c2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) ALIGNMM Rys2eT bc; ALIGNMM double cutoff[SIMDD]; - double common_factor = envs->common_factor * (M_PI*M_PI*M_PI)*2/SQRTPI - * CINTcommon_fac_sp(envs->i_l) * CINTcommon_fac_sp(envs->k_l); + double common_factor = envs->common_factor; CINTOpt *opt = envs->opt; int *idx = opt->index_xyz_array[envs->i_l*LMAX1+envs->k_l]; @@ -299,11 +327,12 @@ int CINT2c2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) } // end loop k_prim RUN_REST; - if (n_comp > 1 && !*kempty) { + if (n_comp > 1 && !empty_overall) { int nc = i_ctr * k_ctr; TRANSPOSE(gctr[SHLTYPk]); } - return !*empty; + *empty &= empty_overall; + return !empty_overall; } void CINTinit_int2c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, @@ -331,7 +360,12 @@ void CINTinit_int2c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, envs->nfk = 
(envs->k_l+1)*(envs->k_l+2)/2; envs->nfl = 1; envs->nf = envs->nfi * envs->nfk; - envs->common_factor = 1; + + envs->ri = env + atm(PTR_COORD, bas(ATOM_OF, i_sh)); + envs->rk = env + atm(PTR_COORD, bas(ATOM_OF, k_sh)); + + envs->common_factor = (M_PI*M_PI*M_PI)*2/SQRTPI + * CINTcommon_fac_sp(envs->i_l) * CINTcommon_fac_sp(envs->k_l); if (env[PTR_EXPCUTOFF] == 0) { envs->expcutoff = EXPCUTOFF; } else { @@ -347,20 +381,21 @@ void CINTinit_int2c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, envs->lj_ceil = 0; envs->lk_ceil = envs->k_l + ng[KINC]; envs->ll_ceil = 0; - - envs->ri = env + atm(PTR_COORD, bas(ATOM_OF, i_sh)); - envs->rk = env + atm(PTR_COORD, bas(ATOM_OF, k_sh)); - - int nroots = (envs->li_ceil + envs->lk_ceil)/2 + 1; - envs->nrys_roots = nroots; - assert(nroots < MXRYSROOTS); + int rys_order =(envs->li_ceil + envs->lk_ceil)/2 + 1; + int nrys_roots = rys_order; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0 && rys_order <= 3) { + nrys_roots *= 2; + } + envs->rys_order = rys_order; + envs->nrys_roots = nrys_roots; int dli = envs->li_ceil + 1; int dlk = envs->lk_ceil + 1; - envs->g_stride_i = nroots; - envs->g_stride_k = nroots * dli; + envs->g_stride_i = nrys_roots; + envs->g_stride_k = nrys_roots * dli; envs->g_stride_l = envs->g_stride_k; - envs->g_size = nroots * dli * dlk; + envs->g_size = nrys_roots * dli * dlk; MM_STORE(envs->aj, MM_SET1(0.)); MM_STORE(envs->al, MM_SET1(0.)); @@ -381,9 +416,13 @@ void CINTinit_int2c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, envs->rx_in_rklrx = envs->rk; envs->rx_in_rijrx = envs->ri; - if (nroots <= 2) { + if (rys_order <= 2) { envs->f_g0_2d4d = &CINTg0_2e_2d4d_unrolled; envs->f_g0_2d4d_simd1 = &CINTg0_2e_2d4d_unrolled_simd1; + if (rys_order != nrys_roots) { + envs->f_g0_2d4d = &CINTsrg0_2e_2d4d_unrolled; + envs->f_g0_2d4d_simd1 = &CINTsrg0_2e_2d4d_unrolled_simd1; + } } else { envs->f_g0_2d4d = &CINTg0_2e_2d; envs->f_g0_2d4d_simd1 = &CINTg0_2e_2d_simd1; diff --git a/src/cint2c2e_gtg.c 
b/src/cint2c2e_gtg.c deleted file mode 100644 index 96ed138..0000000 --- a/src/cint2c2e_gtg.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (C) 2013- Qiming Sun - */ - -#include -#include -#include "cint_bas.h" -#include "g2e.h" -#include "optimizer.h" -#include "cint2e.h" -#include "misc.h" -#include "cart2sph.h" - - -#ifdef WITH_GTG -int CINTg0_2e_gtg(double *g, Rys2eT *bc, CINTEnvVars *envs, int count); -int CINTg0_2e_gtg_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs, int idsimd); - -void CINTinit_int2c2e_gtg_EnvVars(CINTEnvVars *envs, int *ng, int *shls, - int *atm, int natm, int *bas, int nbas, double *env) -{ - envs->natm = natm; - envs->nbas = nbas; - envs->atm = atm; - envs->bas = bas; - envs->env = env; - envs->shls = shls; - - int i_sh = shls[0]; - int k_sh = shls[1]; - envs->i_l = bas(ANG_OF, i_sh); - envs->j_l = 0; - envs->k_l = bas(ANG_OF, k_sh); - envs->l_l = 0; - envs->x_ctr[0] = bas(NCTR_OF, i_sh); - envs->x_ctr[1] = bas(NCTR_OF, k_sh); - envs->x_ctr[2] = 1; - envs->x_ctr[3] = 1; - envs->nfi = (envs->i_l+1)*(envs->i_l+2)/2; - envs->nfj = 1; - envs->nfk = (envs->k_l+1)*(envs->k_l+2)/2; - envs->nfl = 1; - envs->nf = envs->nfi * envs->nfk; - envs->common_factor = SQRTPI * .5; - if (env[PTR_EXPCUTOFF] == 0) { - envs->expcutoff = EXPCUTOFF; - } else { - envs->expcutoff = MAX(MIN_EXPCUTOFF, env[PTR_EXPCUTOFF]); - } - - envs->gbits = ng[GSHIFT]; - envs->ncomp_e1 = ng[POS_E1]; - envs->ncomp_e2 = ng[POS_E2]; - envs->ncomp_tensor = ng[TENSOR]; - - envs->li_ceil = envs->i_l + ng[IINC]; - envs->lj_ceil = 0; - envs->lk_ceil = envs->k_l + ng[KINC]; - envs->ll_ceil = 0; - - envs->ri = env + atm(PTR_COORD, bas(ATOM_OF, i_sh)); - envs->rk = env + atm(PTR_COORD, bas(ATOM_OF, k_sh)); - - envs->nrys_roots = 1; - - int dli = envs->li_ceil + 1; - int dlk = envs->lk_ceil + 1; - envs->g_stride_i = 1; - envs->g_stride_k = dli; - envs->g_stride_l = envs->g_stride_k; - envs->g_size = dli * dlk; - - MM_STORE(envs->aj, MM_SET1(0.)); - MM_STORE(envs->al, MM_SET1(0.)); - 
MM_STORE(envs->rij+0*SIMDD, MM_SET1(envs->ri[0])); - MM_STORE(envs->rij+1*SIMDD, MM_SET1(envs->ri[1])); - MM_STORE(envs->rij+2*SIMDD, MM_SET1(envs->ri[2])); - MM_STORE(envs->rkl+0*SIMDD, MM_SET1(envs->rk[0])); - MM_STORE(envs->rkl+1*SIMDD, MM_SET1(envs->rk[1])); - MM_STORE(envs->rkl+2*SIMDD, MM_SET1(envs->rk[2])); - envs->g2d_ijmax = envs->g_stride_i; - envs->g2d_klmax = envs->g_stride_k; - envs->rkrl[0] = envs->rk[0]; - envs->rkrl[1] = envs->rk[1]; - envs->rkrl[2] = envs->rk[2]; - envs->rirj[0] = envs->ri[0]; - envs->rirj[1] = envs->ri[1]; - envs->rirj[2] = envs->ri[2]; - envs->rx_in_rklrx = envs->rk; - envs->rx_in_rijrx = envs->ri; - - envs->f_g0_2d4d = &CINTg0_2e_2d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_2d_simd1; - envs->f_g0_2e = &CINTg0_2e_gtg; - envs->f_g0_2e_simd1 = &CINTg0_2e_gtg_simd1; - - // for CINTg2c_index_xyz and c2s_sph_1e function - envs->j_l = envs->k_l; - envs->nfj = envs->nfk; - envs->g_stride_j = envs->g_stride_k; -} -#endif - -CACHE_SIZE_T int2c2e_gtg_sph(double *out, int *dims, int *shls, int *atm, int natm, - int *bas, int nbas, double *env, CINTOpt *opt, double *cache) -{ - int ng[] = {0, 0, 0, 0, 0, 1, 1, 1}; - CINTEnvVars envs; - CINTinit_int2c2e_gtg_EnvVars(&envs, ng, shls, atm, natm, bas, nbas, env); - envs.f_gout = &CINTgout2e; - envs.f_gout_simd1 = &CINTgout2e_simd1; - return CINT2c2e_drv(out, dims, &envs, opt, cache, &c2s_sph_1e); -} -void int2c2e_gtg_optimizer(CINTOpt **opt, int *atm, int natm, - int *bas, int nbas, double *env) -{ - int ng[] = {0, 0, 0, 0, 0, 1, 1, 1}; - CINTall_2c2e_gtg_optimizer(opt, ng, atm, natm, bas, nbas, env); -} - -CACHE_SIZE_T int2c2e_gtg_cart(double *out, int *dims, int *shls, int *atm, int natm, - int *bas, int nbas, double *env, CINTOpt *opt, double *cache) -{ - int ng[] = {0, 0, 0, 0, 0, 1, 1, 1}; - CINTEnvVars envs; - CINTinit_int2c2e_gtg_EnvVars(&envs, ng, shls, atm, natm, bas, nbas, env); - envs.f_gout = &CINTgout2e; - envs.f_gout_simd1 = &CINTgout2e_simd1; - return CINT2c2e_drv(out, dims, &envs, 
opt, cache, &c2s_cart_1e); -} - -CACHE_SIZE_T int2c2e_gtg_spinor(double complex *out, int *dims, int *shls, int *atm, int natm, - int *bas, int nbas, double *env, CINTOpt *opt, double *cache) -{ - fprintf(stderr, "int2c2e_gtg_spinor not implemented\n"); -} - -ALL_CINT(int2c2e_gtg) - diff --git a/src/cint2e.c b/src/cint2e.c index 2e8f69e..6c6b4ed 100644 --- a/src/cint2e.c +++ b/src/cint2e.c @@ -22,11 +22,12 @@ #include #include #include +#include #include "cint_bas.h" -#include "misc.h" #include "g2e.h" #include "optimizer.h" #include "cint2e.h" +#include "misc.h" #include "cart2sph.h" #include "c2f.h" @@ -64,14 +65,12 @@ fp2c[np2c] = CINTprim_to_ctr_0; \ shltyp[np2c] = SHLTYP##csymb; \ gprim[np2c] = gctr[SHLTYP##psymb]; \ - gp2c [np2c] = gctr[SHLTYP##csymb]; \ iprim[np2c] = csymb##p; \ np2c++; \ } else if (non0ctr##csymb[csymb##p] > 1) { \ fp2c[np2c] = CINTprim_to_ctr_1; \ shltyp[np2c] = SHLTYP##csymb; \ gprim[np2c] = gctr[SHLTYP##psymb]; \ - gp2c [np2c] = gctr[SHLTYP##csymb]; \ iprim[np2c] = csymb##p; \ np2c++; \ } \ @@ -82,9 +81,29 @@ for (i = 0; i < np2c; i++) { \ it = shltyp[i]; \ im = iprim[i]; \ - (*(fp2c[i]))(gp2c[i], gprim[i], coeff[it]+im, \ + (*(fp2c[i]))(gctr[it], gprim[i], coeff[it]+im, \ ngp[it], x_prim[it], x_ctr[it], \ non0ctr[it][im], non0idx[it]+im*x_ctr[it]); \ + empty_overall = 0; \ + } \ + cum = 0; \ + np2c = 0; + +#define POP_PRIM2CTR_AND_SET0 \ + for (i = 0; i < np2c; i++) { \ + it = shltyp[i]; \ + if (it != SHLTYPi) { \ + im = iprim[i]; \ + (*(fp2c[i]))(gctr[it], gprim[i], coeff[it]+im, \ + ngp[it], x_prim[it], x_ctr[it], \ + non0ctr[it][im], non0idx[it]+im*x_ctr[it]); \ + empty_overall = 0; \ + } else if (fp2c[i] == CINTiprim_to_ctr_0) { \ + double *pout = gctr[it]; \ + for (int k = 0; k < ngp[1]; k++) { \ + pout[k] = 0.; \ + } \ + } \ } \ cum = 0; \ np2c = 0; @@ -99,9 +118,12 @@ #define PUSH(RIJ, RKL) \ if (cum == SIMDD) { \ - (*envs->f_g0_2e)(g, cutoff, &bc, envs, cum); \ - (*envs->f_gout)(gout, g, idx, envs); \ - POP_PRIM2CTR; \ + if 
((*envs->f_g0_2e)(g, cutoff, &bc, envs, cum)) { \ + (*envs->f_gout)(gout, g, idx, envs); \ + POP_PRIM2CTR; \ + } else { \ + POP_PRIM2CTR_AND_SET0; \ + } \ } \ envs->ai[cum] = ai[ip]; \ envs->aj[cum] = aj[jp]; \ @@ -123,9 +145,8 @@ fp2c[np2c] = CINTiprim_to_ctr_1; \ } \ gprim[np2c] = gout + cum * ngp[0]; \ - gp2c [np2c] = gctr[SHLTYPi]; \ iprim[np2c] = ip; \ - shltyp[np2c] = 0; \ + shltyp[np2c] = SHLTYPi; \ cum++; \ np2c++; @@ -139,7 +160,6 @@ } \ int cum = 0; \ int np2c = 0; \ - double *gp2c [SIMDD*4]; \ double *gprim[SIMDD*4]; \ int shltyp[SIMDD*4]; \ int iprim[SIMDD*4]; \ @@ -152,18 +172,22 @@ #define RUN_REST \ if (cum == 1) { \ - (*envs->f_g0_2e_simd1)(g, cutoff, &bc, envs, 0); \ - (*envs->f_gout_simd1)(gout, g, idx, envs); \ + if ((*envs->f_g0_2e_simd1)(g, cutoff, &bc, envs, 0)) { \ + (*envs->f_gout_simd1)(gout, g, idx, envs); \ + POP_PRIM2CTR; \ + } else { \ + POP_PRIM2CTR_AND_SET0; \ + } \ } else if (cum > 1) { \ - r1 = MM_SET1(1.); \ - for (i = 0; i < envs->nrys_roots; i++) { \ - MM_STORE(bc.u+i*SIMDD, r1); \ - MM_STORE(bc.w+i*SIMDD, r1); \ + if ((*envs->f_g0_2e)(g, cutoff, &bc, envs, cum)) { \ + (*envs->f_gout)(gout, g, idx, envs); \ + POP_PRIM2CTR; \ + } else { \ + POP_PRIM2CTR_AND_SET0; \ } \ - (*envs->f_g0_2e)(g, cutoff, &bc, envs, cum); \ - (*envs->f_gout)(gout, g, idx, envs); \ - } \ - POP_PRIM2CTR + } else { \ + assert(np2c == 0); \ + } int CINT2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty) { @@ -194,9 +218,9 @@ int CINT2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty) double *cj = env + bas(PTR_COEFF, j_sh); double *ck = env + bas(PTR_COEFF, k_sh); double *cl = env + bas(PTR_COEFF, l_sh); - double *coeff[4] = {ci, cj, ck, cl}; double expcutoff = envs->expcutoff; double rr_ij = SQUARE(envs->rirj); + double rr_kl = SQUARE(envs->rkrl); double *log_maxci, *log_maxcj, *log_maxck, *log_maxcl; PairData *pdata_base, *pdata_ij; MALLOC_DATA_INSTACK(log_maxci, i_prim+j_prim+k_prim+l_prim); @@ -223,11 +247,47 @@ 
int CINT2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty) int *jempty = _empty + 1; int *kempty = _empty + 2; int *lempty = _empty + 3; + int empty_overall = 1; + int lkl = envs->lk_ceil + envs->ll_ceil; + double akl, ekl, expijkl, ccekl, log_rr_kl, eijcutoff; + + akl = ak[k_prim-1] + al[l_prim-1]; + log_rr_kl = 1.7 - 1.5 * approx_log(akl); + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0) { + // Normally the factor + // (aj*d/aij+theta*R)^li * (ai*d/aij+theta*R)^lj * pi^1.5/aij^{(li+lj+3)/2} + // is a good approximation for polynomial parts in SR-ERIs. + // <~ (aj*d/aij+theta*R)^li * (ai*d/aij+theta*R)^lj * (pi/aij)^1.5 + // <~ (d+theta*R)^li * (d+theta*R)^lj * (pi/aij)^1.5 + if (envs->rys_order > 1) { + double r_guess = 8.; + double omega2 = omega * omega; + int lij = envs->li_ceil + envs->lj_ceil; + if (lij > 0) { + double aij = ai[i_prim-1] + aj[j_prim-1]; + double dist_ij = sqrt(rr_ij); + double theta = omega2 / (omega2 + aij); + expcutoff += lij * approx_log( + (dist_ij+theta*r_guess+1.)/(dist_ij+1.)); + } + if (lkl > 0) { + double theta = omega2 / (omega2 + akl); + log_rr_kl += lkl * approx_log( + sqrt(rr_kl) + theta*r_guess + 1.); + } + } + } else { + if (lkl > 0) { + log_rr_kl += lkl * approx_log(sqrt(rr_kl) + 1.); + } + } int *idx; MALLOC_DATA_INSTACK(idx, nf * 3); CINTg4c_index_xyz(idx, envs); + double *coeff[4] = {ci, cj, ck, cl}; int *non0ctri, *non0ctrj, *non0ctrk, *non0ctrl; int *non0idxi, *non0idxj, *non0idxk, *non0idxl; MALLOC_DATA_INSTACK(non0ctri, i_prim+j_prim+k_prim+l_prim+i_prim*i_ctr+j_prim*j_ctr+k_prim*k_ctr+l_prim*l_ctr); @@ -244,9 +304,7 @@ int CINT2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty) CINTOpt_non0coeff_byshell(non0idxl, non0ctrl, coeff[3], l_prim, l_ctr); int *non0ctr[4] = {non0ctri, non0ctrj, non0ctrk, non0ctrl}; int *non0idx[4] = {non0idxi, non0idxj, non0idxk, non0idxl}; - double common_factor = envs->common_factor * (M_PI*M_PI*M_PI)*2/SQRTPI - * CINTcommon_fac_sp(envs->i_l) 
* CINTcommon_fac_sp(envs->j_l) - * CINTcommon_fac_sp(envs->k_l) * CINTcommon_fac_sp(envs->l_l); + double common_factor = envs->common_factor; size_t ngp[4]; ngp[0] = nf * n_comp; @@ -268,7 +326,7 @@ int CINT2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty) // patch SIMDD*2 for leni, lenj, lenk with s functions MALLOC_INSTACK(g1, lenl+SIMDD*2); bufctr[SHLTYPl] = out; - lempty = empty; + *lempty = *empty; } else { // enlarge sizeo of out by SIMDD*2, for leni, lenj, lenk with s functions cache += SIMDD*2; @@ -285,46 +343,6 @@ int CINT2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty) ALIGNMM Rys2eT bc; ALIGNMM double cutoff[SIMDD]; ALIGNMM double rkl[4]; - double rr_kl = SQUARE(envs->rkrl); - double log_rr_kl, akl, ekl, expijkl, ccekl, eijcutoff; - akl = ak[k_prim-1] + al[l_prim-1]; - log_rr_kl = 1.7 - 1.5 * approx_log(akl); - int lkl = envs->lk_ceil + envs->ll_ceil; -#ifdef WITH_RANGE_COULOMB - double omega = env[PTR_RANGE_OMEGA]; - if (omega < 0) { - // Normally the factor - // (aj*d/aij+theta*R)^li * (ai*d/aij+theta*R)^lj * pi^1.5/aij^{(li+lj+3)/2} - // is a good approximation for polynomial parts in SR-ERIs. 
- // <~ (aj*d/aij+theta*R)^li * (ai*d/aij+theta*R)^lj * (pi/aij)^1.5 - // <~ (d+theta*R)^li * (d+theta*R)^lj * (pi/aij)^1.5 - if (envs->nrys_roots > 1) { - double r_guess = 8.; - double omega2 = omega * omega; - int lij = envs->li_ceil + envs->lj_ceil; - if (lij > 0) { - double aij = ai[i_prim-1] + aj[j_prim-1]; - double dist_ij = sqrt(rr_ij); - double theta = omega2 / (omega2 + aij); - expcutoff += lij * approx_log( - (dist_ij+theta*r_guess+1.)/(dist_ij+1.)); - } - if (lkl > 0) { - double theta = omega2 / (omega2 + akl); - log_rr_kl += lkl * approx_log( - sqrt(rr_kl) + theta*r_guess + 1.); - } - } - } else { - if (lkl > 0) { - log_rr_kl += lkl * approx_log(sqrt(rr_kl) + 1.); - } - } -#else - if (lkl > 0) { - log_rr_kl += lkl * approx_log(sqrt(rr_kl) + 1.); - } -#endif INITSIMD; @@ -370,11 +388,12 @@ k_contracted: ; } // end loop l_prim RUN_REST; - if (n_comp > 1 && !*lempty) { + if (n_comp > 1 && !empty_overall) { int nc = i_ctr * j_ctr * k_ctr * l_ctr; TRANSPOSE(gctr[SHLTYPl]); } - return !*empty; + *empty &= empty_overall; + return !empty_overall; } @@ -411,7 +430,6 @@ int CINT2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) double *cj = env + bas(PTR_COEFF, j_sh); double *ck = env + bas(PTR_COEFF, k_sh); double *cl = env + bas(PTR_COEFF, l_sh); - double *coeff[4] = {ci, cj, ck, cl}; double expcutoff = envs->expcutoff; PairData *_pdata_ij, *_pdata_kl, *pdata_kl, *pdata_ij; if (opt->pairdata != NULL) { @@ -446,6 +464,7 @@ int CINT2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) int *jempty = _empty + 1; int *kempty = _empty + 2; int *lempty = _empty + 3; + int empty_overall = 1; int *idx = opt->index_xyz_array[envs->i_l*LMAX1*LMAX1*LMAX1 +envs->j_l*LMAX1*LMAX1 @@ -475,7 +494,7 @@ int CINT2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) // patch SIMDD*2 for leni, lenj, lenk with s functions MALLOC_INSTACK(g1, lenl+SIMDD*2); bufctr[SHLTYPl] = out; - lempty = empty; + *lempty = *empty; } else { // enlarge out by 
SIMDD*2, for leni, lenj, lenk with s functions cache += SIMDD*2; @@ -489,10 +508,9 @@ int CINT2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) MALLOC_INSTACK(gout, len0+MAX(len0,leng)); g = gout + len0; // for gx, gy, gz + double *coeff[4] = {ci, cj, ck, cl}; ALIGNMM Rys2eT bc; - double common_factor = envs->common_factor * (M_PI*M_PI*M_PI)*2/SQRTPI - * CINTcommon_fac_sp(envs->i_l) * CINTcommon_fac_sp(envs->j_l) - * CINTcommon_fac_sp(envs->k_l) * CINTcommon_fac_sp(envs->l_l); + double common_factor = envs->common_factor; double expijkl, eijcutoff; int *non0ctri = opt->non0ctr[i_sh]; @@ -545,11 +563,12 @@ k_contracted: ; } // end loop l_prim RUN_REST; - if (n_comp > 1 && !*lempty) { + if (n_comp > 1 && !empty_overall) { int nc = i_ctr * j_ctr * k_ctr * l_ctr; TRANSPOSE(gctr[SHLTYPl]); } - return !*empty; + *empty &= empty_overall; + return !empty_overall; } int (*CINT2e_1111_loop)(double *, CINTEnvVars *, double *, int *) = &CINT2e_loop; diff --git a/src/cint2e_coulerf.c b/src/cint2e_coulerf.c deleted file mode 100644 index c6f98e8..0000000 --- a/src/cint2e_coulerf.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Qcint is a general GTO integral library for computational chemistry - * Copyright (C) 2014- Qiming Sun - * - * This file is part of Qcint. - * - * Qcint is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - - -#include -#include "cint_bas.h" -#include "g2e.h" -#include "optimizer.h" -#include "cint2e.h" -#include "cart2sph.h" - -CACHE_SIZE_T int2e_coulerf_sph(double *out, int *dims, int *shls, int *atm, int natm, - int *bas, int nbas, double *env, CINTOpt *opt, double *cache) -{ - int ng[] = {0, 0, 0, 0, 0, 1, 1, 1}; - CINTEnvVars envs; - CINTinit_int2e_coulerf_EnvVars(&envs, ng, shls, atm, natm, bas, nbas, env); - envs.f_gout = &CINTgout2e; - envs.f_gout_simd1 = &CINTgout2e_simd1; - return CINT2e_drv(out, dims, &envs, opt, cache, &c2s_sph_2e1); -} -void int2e_coulerf_optimizer(CINTOpt **opt, int *atm, int natm, - int *bas, int nbas, double *env) -{ - int ng[] = {0, 0, 0, 0, 0, 1, 1, 1}; - CINTall_2e_optimizer(opt, ng, atm, natm, bas, nbas, env); -} - -#define ALL_CINT(NAME) \ -int c##NAME##_sph(double *out, int *shls, int *atm, int natm, \ - int *bas, int nbas, double *env, CINTOpt *opt) { \ - return NAME##_sph(out, NULL, shls, atm, natm, bas, nbas, env, opt, NULL); \ -} \ -void c##NAME##_sph_optimizer(CINTOpt **opt, int *atm, int natm, \ - int *bas, int nbas, double *env) { \ - NAME##_optimizer(opt, atm, natm, bas, nbas, env); \ -} - -ALL_CINT(int2e_coulerf) diff --git a/src/cint2e_gtg.c b/src/cint2e_gtg.c deleted file mode 100644 index 99baf26..0000000 --- a/src/cint2e_gtg.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Qcint is a general GTO integral library for computational chemistry - * Copyright (C) 2014- Qiming Sun - * - * This file is part of Qcint. - * - * Qcint is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - - -#include -#include "cint_bas.h" -#include "g2e.h" -#include "optimizer.h" -#include "cint2e.h" -#include "cart2sph.h" - -CACHE_SIZE_T int2e_gtg_sph(double *out, int *dims, int *shls, int *atm, int natm, - int *bas, int nbas, double *env, CINTOpt *opt, double *cache) -{ - int ng[] = {0, 0, 0, 0, 0, 1, 1, 1}; - CINTEnvVars envs; - CINTinit_int2e_gtg_EnvVars(&envs, ng, shls, atm, natm, bas, nbas, env); - envs.f_gout = &CINTgout2e; - envs.f_gout_simd1 = &CINTgout2e_simd1; - return CINT2e_drv(out, dims, &envs, opt, cache, &c2s_sph_2e1); -} -void int2e_gtg_optimizer(CINTOpt **opt, int *atm, int natm, - int *bas, int nbas, double *env) -{ - int ng[] = {0, 0, 0, 0, 0, 1, 1, 1}; - CINTall_2e_gtg_optimizer(opt, ng, atm, natm, bas, nbas, env); -} - -#define ALL_CINT(NAME) \ -int c##NAME##_sph(double *out, int *shls, int *atm, int natm, \ - int *bas, int nbas, double *env, CINTOpt *opt) { \ - return NAME##_sph(out, NULL, shls, atm, natm, bas, nbas, env, opt, NULL); \ -} \ -void c##NAME##_sph_optimizer(CINTOpt **opt, int *atm, int natm, \ - int *bas, int nbas, double *env) { \ - NAME##_optimizer(opt, atm, natm, bas, nbas, env); \ -} - -ALL_CINT(int2e_gtg); - diff --git a/src/cint3c2e.c b/src/cint3c2e.c index 6e681f5..a19b164 100644 --- a/src/cint3c2e.c +++ b/src/cint3c2e.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "cint_bas.h" #include "misc.h" #include "g2e.h" @@ -63,15 +64,38 @@ (*(fp2c[i]))(gctr[it], gprim[i], coeff[it]+im, \ ngp[it], x_prim[it], x_ctr[it], \ non0ctr[it][im], non0idx[it]+im*x_ctr[it]); \ + empty_overall = 0; \ + } \ + cum = 0; \ + np2c = 0; + +#define POP_PRIM2CTR_AND_SET0 \ + for (i = 0; i < np2c; i++) { \ + it = shltyp[i]; \ + if (it != SHLTYPi) { \ + im = iprim[i]; \ + (*(fp2c[i]))(gctr[it], gprim[i], coeff[it]+im, \ + ngp[it], x_prim[it], x_ctr[it], \ + non0ctr[it][im], non0idx[it]+im*x_ctr[it]); \ + 
empty_overall = 0; \ + } else if (fp2c[i] == CINTiprim_to_ctr_0) { \ + double *pout = gctr[it]; \ + for (int k = 0; k < nf; k++) { \ + pout[k] = 0.; \ + } \ + } \ } \ cum = 0; \ np2c = 0; #define PUSH(RIJ, EXPIJ) \ if (cum == SIMDD) { \ - (*envs->f_g0_2e)(g, cutoff, &bc, envs, cum); \ - (*envs->f_gout)(gout, g, idx, envs); \ - POP_PRIM2CTR; \ + if ((*envs->f_g0_2e)(g, cutoff, &bc, envs, cum)) { \ + (*envs->f_gout)(gout, g, idx, envs); \ + POP_PRIM2CTR; \ + } else { \ + POP_PRIM2CTR_AND_SET0; \ + } \ } \ envs->ai[cum] = ai[ip]; \ envs->aj[cum] = aj[jp]; \ @@ -115,26 +139,30 @@ #define RUN_REST \ if (cum == 1) { \ - (*envs->f_g0_2e_simd1)(g, cutoff, &bc, envs, 0); \ - (*envs->f_gout_simd1)(gout, g, idx, envs); \ + if ((*envs->f_g0_2e_simd1)(g, cutoff, &bc, envs, 0)) { \ + (*envs->f_gout_simd1)(gout, g, idx, envs); \ + POP_PRIM2CTR; \ + } else { \ + POP_PRIM2CTR_AND_SET0; \ + } \ } else if (cum > 1) { \ - r1 = MM_SET1(1.); \ - for (i = 0; i < envs->nrys_roots; i++) { \ - MM_STORE(bc.u+i*SIMDD, r1); \ - MM_STORE(bc.w+i*SIMDD, r1); \ + if ((*envs->f_g0_2e)(g, cutoff, &bc, envs, cum)) { \ + (*envs->f_gout)(gout, g, idx, envs); \ + POP_PRIM2CTR; \ + } else { \ + POP_PRIM2CTR_AND_SET0; \ } \ - (*envs->f_g0_2e)(g, cutoff, &bc, envs, cum); \ - (*envs->f_gout)(gout, g, idx, envs); \ - } \ - POP_PRIM2CTR + } else { \ + assert(np2c == 0); \ + } #define TRANSPOSE(a) \ if (*empty) { \ CINTdmat_transpose(out, a, nf*nc, n_comp); \ + *empty = 0; \ } else { \ CINTdplus_transpose(out, a, nf*nc, n_comp); \ - } \ - *empty = 0; + } int CINT3c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty) { @@ -161,6 +189,7 @@ int CINT3c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty double *coeff[3] = {ci, cj, ck}; double expcutoff = envs->expcutoff; + double rr_ij = SQUARE(envs->rirj); double *log_maxci, *log_maxcj; PairData *pdata_base; MALLOC_DATA_INSTACK(pdata_base, i_prim*j_prim); @@ -170,7 +199,7 @@ int CINT3c2e_loop_nopt(double *out, CINTEnvVars 
*envs, double *cache, int *empty CINTOpt_log_max_pgto_coeff(log_maxcj, cj, j_prim, j_ctr); if (CINTset_pairdata(pdata_base, ai, aj, envs->ri, envs->rj, log_maxci, log_maxcj, envs->li_ceil, envs->lj_ceil, - i_prim, j_prim, SQUARE(envs->rirj), expcutoff, env)) { + i_prim, j_prim, rr_ij, expcutoff, env)) { return 0; } @@ -182,6 +211,25 @@ int CINT3c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty int *iempty = _empty + 0; int *jempty = _empty + 1; int *kempty = _empty + 2; + int empty_overall = 1; + + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0 && envs->rys_order > 1) { + double r_guess = 8.; + double omega2 = omega * omega; + int lij = envs->li_ceil + envs->lj_ceil; + if (lij > 0) { + double dist_ij = sqrt(rr_ij); + double aij = ai[i_prim-1] + aj[j_prim-1]; + double theta = omega2 / (omega2 + aij); + expcutoff += lij * approx_log( + (dist_ij+theta*r_guess+1.)/(dist_ij+1.)); + } + if (envs->lk_ceil > 0) { + double theta = omega2 / (omega2 + ak[k_prim-1]); + expcutoff += envs->lk_ceil * approx_log(theta*r_guess+1.); + } + } int *idx; MALLOC_DATA_INSTACK(idx, nf * 3); @@ -200,9 +248,7 @@ int CINT3c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty CINTOpt_non0coeff_byshell(non0idxk, non0ctrk, coeff[2], k_prim, k_ctr); int *non0ctr[3] = {non0ctri, non0ctrj, non0ctrk}; int *non0idx[3] = {non0idxi, non0idxj, non0idxk}; - double common_factor = envs->common_factor * (M_PI*M_PI*M_PI)*2/SQRTPI - * CINTcommon_fac_sp(envs->i_l) * CINTcommon_fac_sp(envs->j_l) - * CINTcommon_fac_sp(envs->k_l); + double common_factor = envs->common_factor; int ngp[4]; ngp[0] = nf * n_comp; @@ -221,7 +267,7 @@ int CINT3c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty // patch SIMDD for leni, lenj with s functions MALLOC_INSTACK(g1, lenk+SIMDD); gctr[SHLTYPk] = out; - kempty = empty; + *kempty = *empty; } else { // enlarge out by SIMDD, for leni, lenj with s functions cache += SIMDD; @@ -269,11 +315,12 @@ int 
CINT3c2e_loop_nopt(double *out, CINTEnvVars *envs, double *cache, int *empty } // end loop k_prim RUN_REST; - if (n_comp > 1 && !*kempty) { + if (n_comp > 1 && !empty_overall) { int nc = i_ctr * j_ctr * k_ctr; TRANSPOSE(gctr[SHLTYPk]); } - return !*empty; + *empty &= empty_overall; + return !empty_overall; } @@ -306,6 +353,7 @@ int CINT3c2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) double *ck = env + bas(PTR_COEFF, k_sh); double *coeff[3] = {ci, cj, ck}; double expcutoff = envs->expcutoff; + double rr_ij = SQUARE(envs->rirj); PairData *pdata_base, *pdata_ij; if (opt->pairdata != NULL) { pdata_base = opt->pairdata[i_sh*opt->nbas+j_sh]; @@ -315,7 +363,7 @@ int CINT3c2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) MALLOC_DATA_INSTACK(pdata_base, i_prim*j_prim); if (CINTset_pairdata(pdata_base, ai, aj, envs->ri, envs->rj, log_maxci, log_maxcj, envs->li_ceil, envs->lj_ceil, - i_prim, j_prim, SQUARE(envs->rirj), expcutoff, env)) { + i_prim, j_prim, rr_ij, expcutoff, env)) { return 0; } } @@ -328,6 +376,25 @@ int CINT3c2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) int *iempty = _empty + 0; int *jempty = _empty + 1; int *kempty = _empty + 2; + int empty_overall = 1; + + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0 && envs->rys_order > 1) { + double r_guess = 8.; + double omega2 = omega * omega; + int lij = envs->li_ceil + envs->lj_ceil; + if (lij > 0) { + double dist_ij = sqrt(rr_ij); + double aij = ai[i_prim-1] + aj[j_prim-1]; + double theta = omega2 / (omega2 + aij); + expcutoff += lij * approx_log( + (dist_ij+theta*r_guess+1.)/(dist_ij+1.)); + } + if (envs->lk_ceil > 0) { + double theta = omega2 / (omega2 + ak[k_prim-1]); + expcutoff += envs->lk_ceil * approx_log(theta*r_guess+1.); + } + } int *idx = opt->index_xyz_array[envs->i_l*LMAX1*LMAX1 +envs->j_l*LMAX1 @@ -343,9 +410,7 @@ int CINT3c2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) CINTOpt_non0coeff_byshell(non0idxk, non0ctrk, 
ck, k_prim, k_ctr); int *non0ctr[3] = {opt->non0ctr[i_sh], opt->non0ctr[j_sh], non0ctrk}; int *non0idx[3] = {opt->sortedidx[i_sh], opt->sortedidx[j_sh], non0idxk}; - double common_factor = envs->common_factor * (M_PI*M_PI*M_PI)*2/SQRTPI - * CINTcommon_fac_sp(envs->i_l) * CINTcommon_fac_sp(envs->j_l) - * CINTcommon_fac_sp(envs->k_l); + double common_factor = envs->common_factor; int ngp[4]; ngp[0] = nf * n_comp; @@ -365,7 +430,7 @@ int CINT3c2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) // patch SIMDD for leni, lenj with s functions MALLOC_INSTACK(g1, lenk+SIMDD); gctr[SHLTYPk] = out; - kempty = empty; + *kempty = *empty; } else { // enlarge out by SIMDD, for leni, lenj with s functions cache += SIMDD; @@ -412,11 +477,12 @@ int CINT3c2e_loop(double *out, CINTEnvVars *envs, double *cache, int *empty) } // end loop k_prim RUN_REST; - if (n_comp > 1 && !*kempty) { + if (n_comp > 1 && !empty_overall) { int nc = i_ctr * j_ctr * k_ctr; TRANSPOSE(gctr[SHLTYPk]); } - return !*empty; + *empty &= empty_overall; + return !empty_overall; } diff --git a/src/cint3c2e_gtg.c b/src/cint3c2e_gtg.c deleted file mode 100644 index 9808a75..0000000 --- a/src/cint3c2e_gtg.c +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (C) 2013- Qiming Sun - */ - -#include -#include "cint_bas.h" -#include "g2e.h" -#include "optimizer.h" -#include "cint2e.h" -#include "cart2sph.h" - -CACHE_SIZE_T int3c2e_gtg_sph(double *out, int *dims, int *shls, int *atm, int natm, - int *bas, int nbas, double *env, CINTOpt *opt, double *cache) -{ - int ng[] = {0, 0, 0, 0, 0, 1, 1, 1}; - CINTEnvVars envs; - CINTinit_int3c2e_gtg_EnvVars(&envs, ng, shls, atm, natm, bas, nbas, env); - envs.f_gout = &CINTgout2e; - envs.f_gout_simd1 = &CINTgout2e_simd1; - return CINT3c2e_drv(out, dims, &envs, opt, cache, &c2s_sph_3c2e1, 0); -} -void int3c2e_gtg_optimizer(CINTOpt **opt, int *atm, int natm, - int *bas, int nbas, double *env) -{ - int ng[] = {0, 0, 0, 0, 0, 1, 1, 1}; - CINTall_3c2e_gtg_optimizer(opt, ng, 
atm, natm, bas, nbas, env); -} - -#define ALL_CINT(NAME) \ -int c##NAME##_sph(double *out, int *shls, int *atm, int natm, \ - int *bas, int nbas, double *env, CINTOpt *opt) { \ - return NAME##_sph(out, NULL, shls, atm, natm, bas, nbas, env, opt, NULL); \ -} \ -void c##NAME##_sph_optimizer(CINTOpt **opt, int *atm, int natm, \ - int *bas, int nbas, double *env) { \ - NAME##_optimizer(opt, atm, natm, bas, nbas, env); \ -} - -ALL_CINT(int3c2e_gtg) - diff --git a/src/fmt.c b/src/fmt.c index 018d418..33e1f3c 100644 --- a/src/fmt.c +++ b/src/fmt.c @@ -319,7 +319,6 @@ void fmt_erfc_like(double *f, double t, double lower, int m) int i; double lower2 = lower * lower; -#ifdef WITH_RANGE_COULOMB // F[m] < .5*sqrt(pi/t) * erfc(low*tt) if (t * lower2 > ERFC_bound) { for (i = 0; i <= m; i++) { @@ -327,7 +326,6 @@ void fmt_erfc_like(double *f, double t, double lower, int m) } return; } -#endif if (t < TURNOVER_POINT[m]) { fmt1_erfc_like(f, t, lower, m); @@ -357,7 +355,6 @@ void fmt_lerfc_like(long double *f, long double t, long double lower, int m) int i; long double lower2 = lower * lower; -#ifdef WITH_RANGE_COULOMB // F[m] < .5*sqrt(pi/t) * erfc(low*tt) if (t * lower2 > ERFC_bound) { for (i = 0; i <= m; i++) { @@ -365,7 +362,6 @@ void fmt_lerfc_like(long double *f, long double t, long double lower, int m) } return; } -#endif if (t < TURNOVER_POINT[m]) { fmt1_lerfc_like(f, t, lower, m); @@ -477,7 +473,6 @@ void fmt_qerfc_like(__float128 *f, __float128 t, __float128 lower, int m) int i; __float128 lower2 = lower * lower; -#ifdef WITH_RANGE_COULOMB // F[m] < .5*sqrt(pi/t) * erfc(low*tt) if (t * lower2 > ERFC_bound) { for (i = 0; i <= m; i++) { @@ -485,7 +480,6 @@ void fmt_qerfc_like(__float128 *f, __float128 t, __float128 lower, int m) } return; } -#endif if (t < TURNOVER_POINT[m]) { fmt1_qerfc_like(f, t, lower, m); diff --git a/src/g2e.c b/src/g2e.c index ca79b07..7465b70 100644 --- a/src/g2e.c +++ b/src/g2e.c @@ -62,7 +62,15 @@ void CINTinit_int2e_EnvVars(CINTEnvVars *envs, int 
*ng, int *shls, envs->nfk = (envs->k_l+1)*(envs->k_l+2)/2; envs->nfl = (envs->l_l+1)*(envs->l_l+2)/2; envs->nf = envs->nfi * envs->nfk * envs->nfl * envs->nfj; - envs->common_factor = 1; + + envs->ri = env + atm(PTR_COORD, bas(ATOM_OF, i_sh)); + envs->rj = env + atm(PTR_COORD, bas(ATOM_OF, j_sh)); + envs->rk = env + atm(PTR_COORD, bas(ATOM_OF, k_sh)); + envs->rl = env + atm(PTR_COORD, bas(ATOM_OF, l_sh)); + + envs->common_factor = (M_PI*M_PI*M_PI)*2/SQRTPI + * CINTcommon_fac_sp(envs->i_l) * CINTcommon_fac_sp(envs->j_l) + * CINTcommon_fac_sp(envs->k_l) * CINTcommon_fac_sp(envs->l_l); if (env[PTR_EXPCUTOFF] == 0) { envs->expcutoff = EXPCUTOFF; } else { @@ -79,42 +87,19 @@ void CINTinit_int2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, envs->lj_ceil = envs->j_l + ng[JINC]; envs->lk_ceil = envs->k_l + ng[KINC]; envs->ll_ceil = envs->l_l + ng[LINC]; - - envs->ri = env + atm(PTR_COORD, bas(ATOM_OF, i_sh)); - envs->rj = env + atm(PTR_COORD, bas(ATOM_OF, j_sh)); - envs->rk = env + atm(PTR_COORD, bas(ATOM_OF, k_sh)); - envs->rl = env + atm(PTR_COORD, bas(ATOM_OF, l_sh)); - - int nroots = (envs->li_ceil + envs->lj_ceil + - envs->lk_ceil + envs->ll_ceil)/2 + 1; - envs->nrys_roots = nroots; - assert(nroots < MXRYSROOTS); + int rys_order =(envs->li_ceil + envs->lj_ceil + + envs->lk_ceil + envs->ll_ceil)/2 + 1; + int nrys_roots = rys_order; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0 && rys_order <= 3) { + nrys_roots *= 2; + } + envs->rys_order = rys_order; + envs->nrys_roots = nrys_roots; int dli, dlj, dlk, dll; int ibase = envs->li_ceil > envs->lj_ceil; int kbase = envs->lk_ceil > envs->ll_ceil; - if (nroots <= 2) { - envs->f_g0_2d4d = &CINTg0_2e_2d4d_unrolled; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_2d4d_unrolled_simd1; - } else if (kbase) { - if (ibase) { - envs->f_g0_2d4d = &CINTg0_2e_ik2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_ik2d4d_simd1; - } else { - envs->f_g0_2d4d = &CINTg0_2e_kj2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_kj2d4d_simd1; - } - } else { - if 
(ibase) { - envs->f_g0_2d4d = &CINTg0_2e_il2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_il2d4d_simd1; - } else { - envs->f_g0_2d4d = &CINTg0_2e_lj2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_lj2d4d_simd1; - } - } - envs->f_g0_2e = &CINTg0_2e; - envs->f_g0_2e_simd1 = &CINTg0_2e_simd1; if (kbase) { dlk = envs->lk_ceil + envs->ll_ceil + 1; @@ -131,11 +116,11 @@ void CINTinit_int2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, dli = envs->li_ceil + 1; dlj = envs->li_ceil + envs->lj_ceil + 1; } - envs->g_stride_i = nroots; - envs->g_stride_k = nroots * dli; - envs->g_stride_l = nroots * dli * dlk; - envs->g_stride_j = nroots * dli * dlk * dll; - envs->g_size = nroots * dli * dlk * dll * dlj; + envs->g_stride_i = nrys_roots; + envs->g_stride_k = nrys_roots * dli; + envs->g_stride_l = nrys_roots * dli * dlk; + envs->g_stride_j = nrys_roots * dli * dlk * dll; + envs->g_size = nrys_roots * dli * dlk * dll * dlj; if (kbase) { envs->g2d_klmax = envs->g_stride_k; @@ -164,6 +149,33 @@ void CINTinit_int2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, envs->rirj[1] = envs->rj[1] - envs->ri[1]; envs->rirj[2] = envs->rj[2] - envs->ri[2]; } + + if (rys_order <= 2) { + envs->f_g0_2d4d = &CINTg0_2e_2d4d_unrolled; + envs->f_g0_2d4d_simd1 = &CINTg0_2e_2d4d_unrolled_simd1; + if (rys_order != nrys_roots) { + envs->f_g0_2d4d = &CINTsrg0_2e_2d4d_unrolled; + envs->f_g0_2d4d_simd1 = &CINTsrg0_2e_2d4d_unrolled_simd1; + } + } else if (kbase) { + if (ibase) { + envs->f_g0_2d4d = &CINTg0_2e_ik2d4d; + envs->f_g0_2d4d_simd1 = &CINTg0_2e_ik2d4d_simd1; + } else { + envs->f_g0_2d4d = &CINTg0_2e_kj2d4d; + envs->f_g0_2d4d_simd1 = &CINTg0_2e_kj2d4d_simd1; + } + } else { + if (ibase) { + envs->f_g0_2d4d = &CINTg0_2e_il2d4d; + envs->f_g0_2d4d_simd1 = &CINTg0_2e_il2d4d_simd1; + } else { + envs->f_g0_2d4d = &CINTg0_2e_lj2d4d; + envs->f_g0_2d4d_simd1 = &CINTg0_2e_lj2d4d_simd1; + } + } + envs->f_g0_2e = &CINTg0_2e; + envs->f_g0_2e_simd1 = &CINTg0_2e_simd1; } void CINTg4c_index_xyz(int *idx, CINTEnvVars *envs) @@ 
-765,24 +777,8 @@ MM_STORE(gz+n*SIMDD, MM_FMA(rz, MM_LOAD(p1z+n*SIMDD), MM_LOAD(p2z+n*SIMDD))); } /************* some special g0_4d results *************/ /* 4 digits stand for i_ceil, k_ceil, l_ceil, j_ceil */ -static inline void _make_rc(double *rc, double *cx, double *cy, double *cz, double *r) -{ - __MD r0 = MM_SET1(r[0]); - __MD r1 = MM_SET1(r[1]); - __MD r2 = MM_SET1(r[2]); - MM_STORE(rc+0*SIMDD, MM_ADD(r0, MM_LOAD(cx+0*SIMDD))); - MM_STORE(rc+1*SIMDD, MM_ADD(r0, MM_LOAD(cx+1*SIMDD))); - MM_STORE(rc+2*SIMDD, MM_ADD(r1, MM_LOAD(cy+0*SIMDD))); - MM_STORE(rc+3*SIMDD, MM_ADD(r1, MM_LOAD(cy+1*SIMDD))); - MM_STORE(rc+4*SIMDD, MM_ADD(r2, MM_LOAD(cz+0*SIMDD))); - MM_STORE(rc+5*SIMDD, MM_ADD(r2, MM_LOAD(cz+1*SIMDD))); -} - static inline void _g0_2d4d_0000(double *g, Rys2eT *bc, CINTEnvVars *envs) { - //MM_STORE(g+0*SIMDD, MM_SET1(1.)); - //MM_STORE(g+1*SIMDD, MM_SET1(1.)); - //g[2] = w[0]; } static inline void _g0_2d4d_0001(double *g, Rys2eT *bc, CINTEnvVars *envs) @@ -831,29 +827,35 @@ static inline void _g0_2d4d_0003(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cy = bc->c0py; double *cz = bc->c0pz; double *b = bc->b01; + __MD cx0 = MM_LOAD(cx+0*SIMDD); + __MD cx1 = MM_LOAD(cx+1*SIMDD); + __MD cy0 = MM_LOAD(cy+0*SIMDD); + __MD cy1 = MM_LOAD(cy+1*SIMDD); + __MD cz0 = MM_LOAD(cz+0*SIMDD); + __MD cz1 = MM_LOAD(cz+1*SIMDD); __MD b0 = MM_LOAD(b+0*SIMDD); __MD b1 = MM_LOAD(b+1*SIMDD); __MD g16 = MM_LOAD(g+16*SIMDD); __MD g17 = MM_LOAD(g+17*SIMDD); __MD i3 = MM_SET1(3.); - MM_STORE(g+2 *SIMDD, MM_LOAD(cx+0*SIMDD)); - MM_STORE(g+3 *SIMDD, MM_LOAD(cx+1*SIMDD)); - MM_STORE(g+10*SIMDD, MM_LOAD(cy+0*SIMDD)); - MM_STORE(g+11*SIMDD, MM_LOAD(cy+1*SIMDD)); - MM_STORE(g+18*SIMDD, MM_LOAD(cz+0*SIMDD) * g16); - MM_STORE(g+19*SIMDD, MM_LOAD(cz+1*SIMDD) * g17); - MM_STORE(g+4 *SIMDD, MM_LOAD(cx+0*SIMDD) * MM_LOAD(cx+0*SIMDD) + b0); - MM_STORE(g+5 *SIMDD, MM_LOAD(cx+1*SIMDD) * MM_LOAD(cx+1*SIMDD) + b1); - MM_STORE(g+12*SIMDD, MM_LOAD(cy+0*SIMDD) * MM_LOAD(cy+0*SIMDD) + b0); - 
MM_STORE(g+13*SIMDD, MM_LOAD(cy+1*SIMDD) * MM_LOAD(cy+1*SIMDD) + b1); - MM_STORE(g+20*SIMDD,(MM_LOAD(cz+0*SIMDD) * MM_LOAD(cz+0*SIMDD) + b0)* g16); - MM_STORE(g+21*SIMDD,(MM_LOAD(cz+1*SIMDD) * MM_LOAD(cz+1*SIMDD) + b1)* g17); - MM_STORE(g+6 *SIMDD, MM_LOAD(cx+0*SIMDD) *(MM_LOAD(cx+0*SIMDD) * MM_LOAD(cx+0*SIMDD) + i3 * b0)); - MM_STORE(g+7 *SIMDD, MM_LOAD(cx+1*SIMDD) *(MM_LOAD(cx+1*SIMDD) * MM_LOAD(cx+1*SIMDD) + i3 * b1)); - MM_STORE(g+14*SIMDD, MM_LOAD(cy+0*SIMDD) *(MM_LOAD(cy+0*SIMDD) * MM_LOAD(cy+0*SIMDD) + i3 * b0)); - MM_STORE(g+15*SIMDD, MM_LOAD(cy+1*SIMDD) *(MM_LOAD(cy+1*SIMDD) * MM_LOAD(cy+1*SIMDD) + i3 * b1)); - MM_STORE(g+22*SIMDD,(MM_LOAD(cz+0*SIMDD) * MM_LOAD(cz+0*SIMDD) + i3 * b0)* MM_LOAD(g+18*SIMDD)); - MM_STORE(g+23*SIMDD,(MM_LOAD(cz+1*SIMDD) * MM_LOAD(cz+1*SIMDD) + i3 * b1)* MM_LOAD(g+19*SIMDD)); + MM_STORE(g+2 *SIMDD, cx0); + MM_STORE(g+3 *SIMDD, cx1); + MM_STORE(g+10*SIMDD, cy0); + MM_STORE(g+11*SIMDD, cy1); + MM_STORE(g+18*SIMDD, cz0 * g16); + MM_STORE(g+19*SIMDD, cz1 * g17); + MM_STORE(g+4 *SIMDD, cx0 * cx0 + b0); + MM_STORE(g+5 *SIMDD, cx1 * cx1 + b1); + MM_STORE(g+12*SIMDD, cy0 * cy0 + b0); + MM_STORE(g+13*SIMDD, cy1 * cy1 + b1); + MM_STORE(g+20*SIMDD,(cz0 * cz0 + b0)* g16); + MM_STORE(g+21*SIMDD,(cz1 * cz1 + b1)* g17); + MM_STORE(g+6 *SIMDD, cx0 *(cx0 * cx0 + i3 * b0)); + MM_STORE(g+7 *SIMDD, cx1 *(cx1 * cx1 + i3 * b1)); + MM_STORE(g+14*SIMDD, cy0 *(cy0 * cy0 + i3 * b0)); + MM_STORE(g+15*SIMDD, cy1 *(cy1 * cy1 + i3 * b1)); + MM_STORE(g+22*SIMDD,(cz0 * cz0 + i3 * b0)* MM_LOAD(g+18*SIMDD)); + MM_STORE(g+23*SIMDD,(cz1 * cz1 + i3 * b1)* MM_LOAD(g+19*SIMDD)); } static inline void _g0_2d4d_0010(double *g, Rys2eT *bc, CINTEnvVars *envs) @@ -873,20 +875,21 @@ static inline void _g0_2d4d_0011(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cz = bc->c0pz; double *b = bc->b01; double *r = envs->rkrl; - ALIGNMM double rc[6*SIMDD]; - _make_rc(rc, cx, cy, cz, r); - __MD r0 = MM_LOAD(rc+0*SIMDD); - __MD r1 = MM_LOAD(rc+1*SIMDD); - __MD r2 = 
MM_LOAD(rc+2*SIMDD); - __MD r3 = MM_LOAD(rc+3*SIMDD); - __MD r4 = MM_LOAD(rc+4*SIMDD); - __MD r5 = MM_LOAD(rc+5*SIMDD); + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); __MD cx0 = MM_LOAD(cx+0*SIMDD); __MD cx1 = MM_LOAD(cx+1*SIMDD); __MD cy0 = MM_LOAD(cy+0*SIMDD); __MD cy1 = MM_LOAD(cy+1*SIMDD); __MD cz0 = MM_LOAD(cz+0*SIMDD); __MD cz1 = MM_LOAD(cz+1*SIMDD); + __MD r0 = MM_ADD(rx, cx0); + __MD r1 = MM_ADD(rx, cx1); + __MD r2 = MM_ADD(ry, cy0); + __MD r3 = MM_ADD(ry, cy1); + __MD r4 = MM_ADD(rz, cz0); + __MD r5 = MM_ADD(rz, cz1); __MD b0 = MM_LOAD(b+0*SIMDD); __MD b1 = MM_LOAD(b+1*SIMDD); __MD g24 = MM_LOAD(g+24*SIMDD); @@ -918,20 +921,21 @@ static inline void _g0_2d4d_0012(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cz = bc->c0pz; double *b = bc->b01; double *r = envs->rkrl; - ALIGNMM double rc[6*SIMDD]; - _make_rc(rc, cx, cy, cz, r); - __MD r0 = MM_LOAD(rc+0*SIMDD); - __MD r1 = MM_LOAD(rc+1*SIMDD); - __MD r2 = MM_LOAD(rc+2*SIMDD); - __MD r3 = MM_LOAD(rc+3*SIMDD); - __MD r4 = MM_LOAD(rc+4*SIMDD); - __MD r5 = MM_LOAD(rc+5*SIMDD); + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); __MD cx0 = MM_LOAD(cx+0*SIMDD); __MD cx1 = MM_LOAD(cx+1*SIMDD); __MD cy0 = MM_LOAD(cy+0*SIMDD); __MD cy1 = MM_LOAD(cy+1*SIMDD); __MD cz0 = MM_LOAD(cz+0*SIMDD); __MD cz1 = MM_LOAD(cz+1*SIMDD); + __MD r0 = MM_ADD(rx, cx0); + __MD r1 = MM_ADD(rx, cx1); + __MD r2 = MM_ADD(ry, cy0); + __MD r3 = MM_ADD(ry, cy1); + __MD r4 = MM_ADD(rz, cz0); + __MD r5 = MM_ADD(rz, cz1); __MD b0 = MM_LOAD(b+0*SIMDD); __MD b1 = MM_LOAD(b+1*SIMDD); __MD i2 = MM_SET1(2.); @@ -1005,26 +1009,27 @@ static inline void _g0_2d4d_0021(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cy = bc->c0py; double *cz = bc->c0pz; double *b1 = bc->b01; - double *rkl = envs->rkrl; - ALIGNMM double rc[6*SIMDD]; - _make_rc(rc, cx, cy, cz, rkl); - __MD b10 = MM_LOAD(b1+0*SIMDD); - __MD b11 = MM_LOAD(b1+1*SIMDD); - __MD g32 = MM_LOAD(g+32*SIMDD); - __MD g33 = 
MM_LOAD(g+33*SIMDD); - __MD i2 = MM_SET1(2.); - __MD r0 = MM_LOAD(rc+0*SIMDD); - __MD r1 = MM_LOAD(rc+1*SIMDD); - __MD r2 = MM_LOAD(rc+2*SIMDD); - __MD r3 = MM_LOAD(rc+3*SIMDD); - __MD r4 = MM_LOAD(rc+4*SIMDD); - __MD r5 = MM_LOAD(rc+5*SIMDD); + double *r = envs->rkrl; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); __MD s0 = MM_LOAD(cx+0*SIMDD); __MD s1 = MM_LOAD(cx+1*SIMDD); __MD s2 = MM_LOAD(cy+0*SIMDD); __MD s3 = MM_LOAD(cy+1*SIMDD); __MD s4 = MM_LOAD(cz+0*SIMDD); __MD s5 = MM_LOAD(cz+1*SIMDD); + __MD r0 = MM_ADD(rx, s0); + __MD r1 = MM_ADD(rx, s1); + __MD r2 = MM_ADD(ry, s2); + __MD r3 = MM_ADD(ry, s3); + __MD r4 = MM_ADD(rz, s4); + __MD r5 = MM_ADD(rz, s5); + __MD b10 = MM_LOAD(b1+0*SIMDD); + __MD b11 = MM_LOAD(b1+1*SIMDD); + __MD g32 = MM_LOAD(g+32*SIMDD); + __MD g33 = MM_LOAD(g+33*SIMDD); + __MD i2 = MM_SET1(2.); MM_STORE(g+2 *SIMDD, s0); MM_STORE(g+3 *SIMDD, s1); MM_STORE(g+8 *SIMDD, r0); @@ -1218,8 +1223,8 @@ static inline void _g0_2d4d_0110(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b = bc->b00; - __MD b0 = MM_LOAD(b+0*SIMDD); - __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); __MD r0 = MM_LOAD(c0x+0*SIMDD); __MD r1 = MM_LOAD(c0x+1*SIMDD); __MD r2 = MM_LOAD(c0y+0*SIMDD); @@ -1264,57 +1269,76 @@ static inline void _g0_2d4d_0111(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cpz = bc->c0pz; double *b0 = bc->b00; double *b1 = bc->b01; - double *rkl = envs->rkrl; - ALIGNMM double rcp[6*SIMDD]; - _make_rc(rcp, cpx, cpy, cpz, rkl); + double *r = envs->rkrl; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); + __MD cpx0 = MM_LOAD(cpx+0*SIMDD); + __MD cpx1 = MM_LOAD(cpx+1*SIMDD); + __MD cpy0 = MM_LOAD(cpy+0*SIMDD); + __MD cpy1 = MM_LOAD(cpy+1*SIMDD); + __MD cpz0 = MM_LOAD(cpz+0*SIMDD); + __MD cpz1 = MM_LOAD(cpz+1*SIMDD); + __MD rcp0 = MM_ADD(rx, cpx0); + __MD rcp1 = MM_ADD(rx, cpx1); + __MD rcp2 = 
MM_ADD(ry, cpy0); + __MD rcp3 = MM_ADD(ry, cpy1); + __MD rcp4 = MM_ADD(rz, cpz0); + __MD rcp5 = MM_ADD(rz, cpz1); + __MD c0x0 = MM_LOAD(c0x+0*SIMDD); + __MD c0x1 = MM_LOAD(c0x+1*SIMDD); + __MD c0y0 = MM_LOAD(c0y+0*SIMDD); + __MD c0y1 = MM_LOAD(c0y+1*SIMDD); + __MD c0z0 = MM_LOAD(c0z+0*SIMDD); + __MD c0z1 = MM_LOAD(c0z+1*SIMDD); __MD b00 = MM_LOAD(b0+0*SIMDD); __MD b01 = MM_LOAD(b0+1*SIMDD); __MD b10 = MM_LOAD(b1+0*SIMDD); __MD b11 = MM_LOAD(b1+1*SIMDD); __MD g48 = MM_LOAD(g+48*SIMDD); __MD g49 = MM_LOAD(g+49*SIMDD); - MM_STORE(g+2 *SIMDD, MM_LOAD(rcp+0*SIMDD)); - MM_STORE(g+3 *SIMDD, MM_LOAD(rcp+1*SIMDD)); - MM_STORE(g+4 *SIMDD, MM_LOAD(cpx+0*SIMDD)); - MM_STORE(g+5 *SIMDD, MM_LOAD(cpx+1*SIMDD)); - MM_STORE(g+12*SIMDD, MM_LOAD(c0x+0*SIMDD)); - MM_STORE(g+13*SIMDD, MM_LOAD(c0x+1*SIMDD)); - MM_STORE(g+26*SIMDD, MM_LOAD(rcp+2*SIMDD)); - MM_STORE(g+27*SIMDD, MM_LOAD(rcp+3*SIMDD)); - MM_STORE(g+28*SIMDD, MM_LOAD(cpy+0*SIMDD)); - MM_STORE(g+29*SIMDD, MM_LOAD(cpy+1*SIMDD)); - MM_STORE(g+36*SIMDD, MM_LOAD(c0y+0*SIMDD)); - MM_STORE(g+37*SIMDD, MM_LOAD(c0y+1*SIMDD)); - MM_STORE(g+50*SIMDD, MM_LOAD(rcp+4*SIMDD) * g48); - MM_STORE(g+51*SIMDD, MM_LOAD(rcp+5*SIMDD) * g49); - MM_STORE(g+52*SIMDD, MM_LOAD(cpz+0*SIMDD) * g48); - MM_STORE(g+53*SIMDD, MM_LOAD(cpz+1*SIMDD) * g49); - MM_STORE(g+60*SIMDD, MM_LOAD(c0z+0*SIMDD) * g48); - MM_STORE(g+61*SIMDD, MM_LOAD(c0z+1*SIMDD) * g49); - MM_STORE(g+14*SIMDD, MM_LOAD(c0x+0*SIMDD) * MM_LOAD(rcp+0*SIMDD) + b00); - MM_STORE(g+15*SIMDD, MM_LOAD(c0x+1*SIMDD) * MM_LOAD(rcp+1*SIMDD) + b01); - MM_STORE(g+16*SIMDD, MM_LOAD(c0x+0*SIMDD) * MM_LOAD(cpx+0*SIMDD) + b00); - MM_STORE(g+17*SIMDD, MM_LOAD(c0x+1*SIMDD) * MM_LOAD(cpx+1*SIMDD) + b01); - MM_STORE(g+6 *SIMDD, MM_LOAD(cpx+0*SIMDD) * MM_LOAD(rcp+0*SIMDD) + b10); - MM_STORE(g+7 *SIMDD, MM_LOAD(cpx+1*SIMDD) * MM_LOAD(rcp+1*SIMDD) + b11); - MM_STORE(g+30*SIMDD, MM_LOAD(cpy+0*SIMDD) * MM_LOAD(rcp+2*SIMDD) + b10); - MM_STORE(g+31*SIMDD, MM_LOAD(cpy+1*SIMDD) * MM_LOAD(rcp+3*SIMDD) + b11); - 
MM_STORE(g+38*SIMDD, MM_LOAD(c0y+0*SIMDD) * MM_LOAD(rcp+2*SIMDD) + b00); - MM_STORE(g+39*SIMDD, MM_LOAD(c0y+1*SIMDD) * MM_LOAD(rcp+3*SIMDD) + b01); - MM_STORE(g+40*SIMDD, MM_LOAD(c0y+0*SIMDD) * MM_LOAD(cpy+0*SIMDD) + b00); - MM_STORE(g+41*SIMDD, MM_LOAD(c0y+1*SIMDD) * MM_LOAD(cpy+1*SIMDD) + b01); - MM_STORE(g+54*SIMDD,(MM_LOAD(cpz+0*SIMDD) * MM_LOAD(rcp+4*SIMDD) + b10) * g48); - MM_STORE(g+55*SIMDD,(MM_LOAD(cpz+1*SIMDD) * MM_LOAD(rcp+5*SIMDD) + b11) * g49); - MM_STORE(g+62*SIMDD,(MM_LOAD(c0z+0*SIMDD) * MM_LOAD(rcp+4*SIMDD) + b00) * g48); - MM_STORE(g+63*SIMDD,(MM_LOAD(c0z+1*SIMDD) * MM_LOAD(rcp+5*SIMDD) + b01) * g49); - MM_STORE(g+64*SIMDD,(MM_LOAD(c0z+0*SIMDD) * MM_LOAD(cpz+0*SIMDD) + b00) * g48); - MM_STORE(g+65*SIMDD,(MM_LOAD(c0z+1*SIMDD) * MM_LOAD(cpz+1*SIMDD) + b01) * g49); - MM_STORE(g+18*SIMDD, MM_LOAD(c0x+0*SIMDD) * MM_LOAD(g+6 *SIMDD) + b00 * (MM_LOAD(rcp+0*SIMDD) + MM_LOAD(cpx+0*SIMDD))); - MM_STORE(g+19*SIMDD, MM_LOAD(c0x+1*SIMDD) * MM_LOAD(g+7 *SIMDD) + b01 * (MM_LOAD(rcp+1*SIMDD) + MM_LOAD(cpx+1*SIMDD))); - MM_STORE(g+42*SIMDD, MM_LOAD(c0y+0*SIMDD) * MM_LOAD(g+30*SIMDD) + b00 * (MM_LOAD(rcp+2*SIMDD) + MM_LOAD(cpy+0*SIMDD))); - MM_STORE(g+43*SIMDD, MM_LOAD(c0y+1*SIMDD) * MM_LOAD(g+31*SIMDD) + b01 * (MM_LOAD(rcp+3*SIMDD) + MM_LOAD(cpy+1*SIMDD))); - MM_STORE(g+66*SIMDD, MM_LOAD(c0z+0*SIMDD) * MM_LOAD(g+54*SIMDD) + b00 * (MM_LOAD(g+50*SIMDD) + MM_LOAD(g+52*SIMDD))); - MM_STORE(g+67*SIMDD, MM_LOAD(c0z+1*SIMDD) * MM_LOAD(g+55*SIMDD) + b01 * (MM_LOAD(g+51*SIMDD) + MM_LOAD(g+53*SIMDD))); + MM_STORE(g+2 *SIMDD, rcp0); + MM_STORE(g+3 *SIMDD, rcp1); + MM_STORE(g+4 *SIMDD, cpx0); + MM_STORE(g+5 *SIMDD, cpx1); + MM_STORE(g+12*SIMDD, c0x0); + MM_STORE(g+13*SIMDD, c0x1); + MM_STORE(g+26*SIMDD, rcp2); + MM_STORE(g+27*SIMDD, rcp3); + MM_STORE(g+28*SIMDD, cpy0); + MM_STORE(g+29*SIMDD, cpy1); + MM_STORE(g+36*SIMDD, c0y0); + MM_STORE(g+37*SIMDD, c0y1); + MM_STORE(g+50*SIMDD, rcp4 * g48); + MM_STORE(g+51*SIMDD, rcp5 * g49); + MM_STORE(g+52*SIMDD, cpz0 * g48); + 
MM_STORE(g+53*SIMDD, cpz1 * g49); + MM_STORE(g+60*SIMDD, c0z0 * g48); + MM_STORE(g+61*SIMDD, c0z1 * g49); + MM_STORE(g+14*SIMDD, c0x0 * rcp0 + b00); + MM_STORE(g+15*SIMDD, c0x1 * rcp1 + b01); + MM_STORE(g+16*SIMDD, c0x0 * cpx0 + b00); + MM_STORE(g+17*SIMDD, c0x1 * cpx1 + b01); + MM_STORE(g+6 *SIMDD, cpx0 * rcp0 + b10); + MM_STORE(g+7 *SIMDD, cpx1 * rcp1 + b11); + MM_STORE(g+30*SIMDD, cpy0 * rcp2 + b10); + MM_STORE(g+31*SIMDD, cpy1 * rcp3 + b11); + MM_STORE(g+38*SIMDD, c0y0 * rcp2 + b00); + MM_STORE(g+39*SIMDD, c0y1 * rcp3 + b01); + MM_STORE(g+40*SIMDD, c0y0 * cpy0 + b00); + MM_STORE(g+41*SIMDD, c0y1 * cpy1 + b01); + MM_STORE(g+54*SIMDD,(cpz0 * rcp4 + b10) * g48); + MM_STORE(g+55*SIMDD,(cpz1 * rcp5 + b11) * g49); + MM_STORE(g+62*SIMDD,(c0z0 * rcp4 + b00) * g48); + MM_STORE(g+63*SIMDD,(c0z1 * rcp5 + b01) * g49); + MM_STORE(g+64*SIMDD,(c0z0 * cpz0 + b00) * g48); + MM_STORE(g+65*SIMDD,(c0z1 * cpz1 + b01) * g49); + MM_STORE(g+18*SIMDD, c0x0 * MM_LOAD(g+6 *SIMDD) + b00 * (rcp0 + cpx0)); + MM_STORE(g+19*SIMDD, c0x1 * MM_LOAD(g+7 *SIMDD) + b01 * (rcp1 + cpx1)); + MM_STORE(g+42*SIMDD, c0y0 * MM_LOAD(g+30*SIMDD) + b00 * (rcp2 + cpy0)); + MM_STORE(g+43*SIMDD, c0y1 * MM_LOAD(g+31*SIMDD) + b01 * (rcp3 + cpy1)); + MM_STORE(g+66*SIMDD, c0z0 * MM_LOAD(g+54*SIMDD) + b00 * (MM_LOAD(g+50*SIMDD) + MM_LOAD(g+52*SIMDD))); + MM_STORE(g+67*SIMDD, c0z1 * MM_LOAD(g+55*SIMDD) + b01 * (MM_LOAD(g+51*SIMDD) + MM_LOAD(g+53*SIMDD))); } static inline void _g0_2d4d_0120(double *g, Rys2eT *bc, CINTEnvVars *envs) @@ -1417,6 +1441,18 @@ static inline void _g0_2d4d_0201(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cpz = bc->c0pz; double *b0 = bc->b00; double *b1 = bc->b10; + __MD c0x0 = MM_LOAD(c0x+0*SIMDD); + __MD c0x1 = MM_LOAD(c0x+1*SIMDD); + __MD c0y0 = MM_LOAD(c0y+0*SIMDD); + __MD c0y1 = MM_LOAD(c0y+1*SIMDD); + __MD c0z0 = MM_LOAD(c0z+0*SIMDD); + __MD c0z1 = MM_LOAD(c0z+1*SIMDD); + __MD cpx0 = MM_LOAD(cpx+0*SIMDD); + __MD cpx1 = MM_LOAD(cpx+1*SIMDD); + __MD cpy0 = MM_LOAD(cpy+0*SIMDD); + __MD 
cpy1 = MM_LOAD(cpy+1*SIMDD); + __MD cpz0 = MM_LOAD(cpz+0*SIMDD); + __MD cpz1 = MM_LOAD(cpz+1*SIMDD); __MD b00 = MM_LOAD(b0+0*SIMDD); __MD b01 = MM_LOAD(b0+1*SIMDD); __MD b10 = MM_LOAD(b1+0*SIMDD); @@ -1424,36 +1460,36 @@ static inline void _g0_2d4d_0201(double *g, Rys2eT *bc, CINTEnvVars *envs) __MD g24 = MM_LOAD(g+24*SIMDD); __MD g25 = MM_LOAD(g+25*SIMDD); __MD i2 = MM_SET1(2.); - MM_STORE(g+2 *SIMDD, MM_LOAD(cpx+0*SIMDD)); - MM_STORE(g+3 *SIMDD, MM_LOAD(cpx+1*SIMDD)); - MM_STORE(g+4 *SIMDD, MM_LOAD(c0x+0*SIMDD)); - MM_STORE(g+5 *SIMDD, MM_LOAD(c0x+1*SIMDD)); - MM_STORE(g+14*SIMDD, MM_LOAD(cpy+0*SIMDD)); - MM_STORE(g+15*SIMDD, MM_LOAD(cpy+1*SIMDD)); - MM_STORE(g+16*SIMDD, MM_LOAD(c0y+0*SIMDD)); - MM_STORE(g+17*SIMDD, MM_LOAD(c0y+1*SIMDD)); - MM_STORE(g+26*SIMDD, MM_LOAD(cpz+0*SIMDD) * g24); - MM_STORE(g+27*SIMDD, MM_LOAD(cpz+1*SIMDD) * g25); - MM_STORE(g+28*SIMDD, MM_LOAD(c0z+0*SIMDD) * g24); - MM_STORE(g+29*SIMDD, MM_LOAD(c0z+1*SIMDD) * g25); - MM_STORE(g+6 *SIMDD, MM_LOAD(cpx+0*SIMDD) * MM_LOAD(c0x+0*SIMDD) + b00); - MM_STORE(g+7 *SIMDD, MM_LOAD(cpx+1*SIMDD) * MM_LOAD(c0x+1*SIMDD) + b01); - MM_STORE(g+8 *SIMDD, MM_LOAD(c0x+0*SIMDD) * MM_LOAD(c0x+0*SIMDD) + b10); - MM_STORE(g+9 *SIMDD, MM_LOAD(c0x+1*SIMDD) * MM_LOAD(c0x+1*SIMDD) + b11); - MM_STORE(g+18*SIMDD, MM_LOAD(cpy+0*SIMDD) * MM_LOAD(c0y+0*SIMDD) + b00); - MM_STORE(g+19*SIMDD, MM_LOAD(cpy+1*SIMDD) * MM_LOAD(c0y+1*SIMDD) + b01); - MM_STORE(g+20*SIMDD, MM_LOAD(c0y+0*SIMDD) * MM_LOAD(c0y+0*SIMDD) + b10); - MM_STORE(g+21*SIMDD, MM_LOAD(c0y+1*SIMDD) * MM_LOAD(c0y+1*SIMDD) + b11); - MM_STORE(g+30*SIMDD,(MM_LOAD(cpz+0*SIMDD) * MM_LOAD(c0z+0*SIMDD) + b00) * g24); - MM_STORE(g+31*SIMDD,(MM_LOAD(cpz+1*SIMDD) * MM_LOAD(c0z+1*SIMDD) + b01) * g25); - MM_STORE(g+32*SIMDD,(MM_LOAD(c0z+0*SIMDD) * MM_LOAD(c0z+0*SIMDD) + b10) * g24); - MM_STORE(g+33*SIMDD,(MM_LOAD(c0z+1*SIMDD) * MM_LOAD(c0z+1*SIMDD) + b11) * g25); - MM_STORE(g+10*SIMDD, MM_LOAD(cpx+0*SIMDD) * MM_LOAD(g+8 *SIMDD ) + i2 * b00 * MM_LOAD(c0x+0*SIMDD)); - 
MM_STORE(g+11*SIMDD, MM_LOAD(cpx+1*SIMDD) * MM_LOAD(g+9 *SIMDD ) + i2 * b01 * MM_LOAD(c0x+1*SIMDD)); - MM_STORE(g+22*SIMDD, MM_LOAD(cpy+0*SIMDD) * MM_LOAD(g+20*SIMDD ) + i2 * b00 * MM_LOAD(c0y+0*SIMDD)); - MM_STORE(g+23*SIMDD, MM_LOAD(cpy+1*SIMDD) * MM_LOAD(g+21*SIMDD ) + i2 * b01 * MM_LOAD(c0y+1*SIMDD)); - MM_STORE(g+34*SIMDD, MM_LOAD(cpz+0*SIMDD) * MM_LOAD(g+32*SIMDD ) + i2 * b00 * MM_LOAD(g+28*SIMDD)); - MM_STORE(g+35*SIMDD, MM_LOAD(cpz+1*SIMDD) * MM_LOAD(g+33*SIMDD ) + i2 * b01 * MM_LOAD(g+29*SIMDD)); + MM_STORE(g+2 *SIMDD, cpx0); + MM_STORE(g+3 *SIMDD, cpx1); + MM_STORE(g+4 *SIMDD, c0x0); + MM_STORE(g+5 *SIMDD, c0x1); + MM_STORE(g+14*SIMDD, cpy0); + MM_STORE(g+15*SIMDD, cpy1); + MM_STORE(g+16*SIMDD, c0y0); + MM_STORE(g+17*SIMDD, c0y1); + MM_STORE(g+26*SIMDD, cpz0 * g24); + MM_STORE(g+27*SIMDD, cpz1 * g25); + MM_STORE(g+28*SIMDD, c0z0 * g24); + MM_STORE(g+29*SIMDD, c0z1 * g25); + MM_STORE(g+6 *SIMDD, cpx0 * c0x0 + b00); + MM_STORE(g+7 *SIMDD, cpx1 * c0x1 + b01); + MM_STORE(g+8 *SIMDD, c0x0 * c0x0 + b10); + MM_STORE(g+9 *SIMDD, c0x1 * c0x1 + b11); + MM_STORE(g+18*SIMDD, cpy0 * c0y0 + b00); + MM_STORE(g+19*SIMDD, cpy1 * c0y1 + b01); + MM_STORE(g+20*SIMDD, c0y0 * c0y0 + b10); + MM_STORE(g+21*SIMDD, c0y1 * c0y1 + b11); + MM_STORE(g+30*SIMDD,(cpz0 * c0z0 + b00) * g24); + MM_STORE(g+31*SIMDD,(cpz1 * c0z1 + b01) * g25); + MM_STORE(g+32*SIMDD,(c0z0 * c0z0 + b10) * g24); + MM_STORE(g+33*SIMDD,(c0z1 * c0z1 + b11) * g25); + MM_STORE(g+10*SIMDD, cpx0 * MM_LOAD(g+8 *SIMDD ) + i2 * b00 * c0x0); + MM_STORE(g+11*SIMDD, cpx1 * MM_LOAD(g+9 *SIMDD ) + i2 * b01 * c0x1); + MM_STORE(g+22*SIMDD, cpy0 * MM_LOAD(g+20*SIMDD ) + i2 * b00 * c0y0); + MM_STORE(g+23*SIMDD, cpy1 * MM_LOAD(g+21*SIMDD ) + i2 * b01 * c0y1); + MM_STORE(g+34*SIMDD, cpz0 * MM_LOAD(g+32*SIMDD ) + i2 * b00 * MM_LOAD(g+28*SIMDD)); + MM_STORE(g+35*SIMDD, cpz1 * MM_LOAD(g+33*SIMDD ) + i2 * b01 * MM_LOAD(g+29*SIMDD)); } static inline void _g0_2d4d_0210(double *g, Rys2eT *bc, CINTEnvVars *envs) @@ -1465,7 +1501,7 @@ 
static inline void _g0_2d4d_0210(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b0 = bc->b00; - double *b1 = bc->b01; + double *b1 = bc->b10; __MD r0 = MM_LOAD(c0x+0*SIMDD); __MD r1 = MM_LOAD(c0x+1*SIMDD); __MD r2 = MM_LOAD(c0y+0*SIMDD); @@ -1522,29 +1558,35 @@ static inline void _g0_2d4d_0300(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cy = bc->c00y; double *cz = bc->c00z; double *b = bc->b10; + __MD cx0 = MM_LOAD(cx+0*SIMDD); + __MD cx1 = MM_LOAD(cx+1*SIMDD); + __MD cy0 = MM_LOAD(cy+0*SIMDD); + __MD cy1 = MM_LOAD(cy+1*SIMDD); + __MD cz0 = MM_LOAD(cz+0*SIMDD); + __MD cz1 = MM_LOAD(cz+1*SIMDD); __MD b0 = MM_LOAD(b+0*SIMDD); __MD b1 = MM_LOAD(b+1*SIMDD); __MD g16 = MM_LOAD(g+16*SIMDD); __MD g17 = MM_LOAD(g+17*SIMDD); __MD i3 = MM_SET1(3.); - MM_STORE(g+2 *SIMDD, MM_LOAD(cx+0*SIMDD)); - MM_STORE(g+3 *SIMDD, MM_LOAD(cx+1*SIMDD)); - MM_STORE(g+10*SIMDD, MM_LOAD(cy+0*SIMDD)); - MM_STORE(g+11*SIMDD, MM_LOAD(cy+1*SIMDD)); - MM_STORE(g+18*SIMDD, MM_LOAD(cz+0*SIMDD) * g16); - MM_STORE(g+19*SIMDD, MM_LOAD(cz+1*SIMDD) * g17); - MM_STORE(g+4 *SIMDD, MM_LOAD(cx+0*SIMDD) * MM_LOAD(cx+0*SIMDD) + b0); - MM_STORE(g+5 *SIMDD, MM_LOAD(cx+1*SIMDD) * MM_LOAD(cx+1*SIMDD) + b1); - MM_STORE(g+12*SIMDD, MM_LOAD(cy+0*SIMDD) * MM_LOAD(cy+0*SIMDD) + b0); - MM_STORE(g+13*SIMDD, MM_LOAD(cy+1*SIMDD) * MM_LOAD(cy+1*SIMDD) + b1); - MM_STORE(g+20*SIMDD,(MM_LOAD(cz+0*SIMDD) * MM_LOAD(cz+0*SIMDD) + b0)* g16); - MM_STORE(g+21*SIMDD,(MM_LOAD(cz+1*SIMDD) * MM_LOAD(cz+1*SIMDD) + b1)* g17); - MM_STORE(g+6 *SIMDD, MM_LOAD(cx+0*SIMDD) *(MM_LOAD(cx+0*SIMDD) * MM_LOAD(cx+0*SIMDD) + i3 * b0)); - MM_STORE(g+7 *SIMDD, MM_LOAD(cx+1*SIMDD) *(MM_LOAD(cx+1*SIMDD) * MM_LOAD(cx+1*SIMDD) + i3 * b1)); - MM_STORE(g+14*SIMDD, MM_LOAD(cy+0*SIMDD) *(MM_LOAD(cy+0*SIMDD) * MM_LOAD(cy+0*SIMDD) + i3 * b0)); - MM_STORE(g+15*SIMDD, MM_LOAD(cy+1*SIMDD) *(MM_LOAD(cy+1*SIMDD) * MM_LOAD(cy+1*SIMDD) + i3 * b1)); - MM_STORE(g+22*SIMDD,(MM_LOAD(cz+0*SIMDD) * MM_LOAD(cz+0*SIMDD) + 
i3 * b0)* MM_LOAD(g+18*SIMDD)); - MM_STORE(g+23*SIMDD,(MM_LOAD(cz+1*SIMDD) * MM_LOAD(cz+1*SIMDD) + i3 * b1)* MM_LOAD(g+19*SIMDD)); + MM_STORE(g+2 *SIMDD, cx0); + MM_STORE(g+3 *SIMDD, cx1); + MM_STORE(g+10*SIMDD, cy0); + MM_STORE(g+11*SIMDD, cy1); + MM_STORE(g+18*SIMDD, cz0 * g16); + MM_STORE(g+19*SIMDD, cz1 * g17); + MM_STORE(g+4 *SIMDD, cx0 * cx0 + b0); + MM_STORE(g+5 *SIMDD, cx1 * cx1 + b1); + MM_STORE(g+12*SIMDD, cy0 * cy0 + b0); + MM_STORE(g+13*SIMDD, cy1 * cy1 + b1); + MM_STORE(g+20*SIMDD,(cz0 * cz0 + b0)* g16); + MM_STORE(g+21*SIMDD,(cz1 * cz1 + b1)* g17); + MM_STORE(g+6 *SIMDD, cx0 *(cx0 * cx0 + i3 * b0)); + MM_STORE(g+7 *SIMDD, cx1 *(cx1 * cx1 + i3 * b1)); + MM_STORE(g+14*SIMDD, cy0 *(cy0 * cy0 + i3 * b0)); + MM_STORE(g+15*SIMDD, cy1 *(cy1 * cy1 + i3 * b1)); + MM_STORE(g+22*SIMDD,(cz0 * cz0 + i3 * b0)* MM_LOAD(g+18*SIMDD)); + MM_STORE(g+23*SIMDD,(cz1 * cz1 + i3 * b1)* MM_LOAD(g+19*SIMDD)); } static inline void _g0_2d4d_1000(double *g, Rys2eT *bc, CINTEnvVars *envs) @@ -1578,22 +1620,22 @@ static inline void _g0_2d4d_1001(double *g, Rys2eT *bc, CINTEnvVars *envs) __MD py1 = MM_LOAD(cpy+1*SIMDD); __MD pz0 = MM_LOAD(cpz+0*SIMDD); __MD pz1 = MM_LOAD(cpz+1*SIMDD); - __MD b0 = MM_LOAD(b+0*SIMDD); - __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); __MD g16 = MM_LOAD(g+16*SIMDD); __MD g17 = MM_LOAD(g+17*SIMDD); - MM_STORE(g+2 *SIMDD, px0); - MM_STORE(g+3 *SIMDD, px1); - MM_STORE(g+4 *SIMDD, cx0); - MM_STORE(g+5 *SIMDD, cx1); - MM_STORE(g+10*SIMDD, py0); - MM_STORE(g+11*SIMDD, py1); - MM_STORE(g+12*SIMDD, cy0); - MM_STORE(g+13*SIMDD, cy1); - MM_STORE(g+18*SIMDD, MM_MUL(pz0, g16)); - MM_STORE(g+19*SIMDD, MM_MUL(pz1, g17)); - MM_STORE(g+20*SIMDD, MM_MUL(cz0, g16)); - MM_STORE(g+21*SIMDD, MM_MUL(cz1, g17)); + MM_STORE(g+2 *SIMDD, cx0); + MM_STORE(g+3 *SIMDD, cx1); + MM_STORE(g+4 *SIMDD, px0); + MM_STORE(g+5 *SIMDD, px1); + MM_STORE(g+10*SIMDD, cy0); + MM_STORE(g+11*SIMDD, cy1); + MM_STORE(g+12*SIMDD, py0); + 
MM_STORE(g+13*SIMDD, py1); + MM_STORE(g+18*SIMDD, MM_MUL(cz0, g16)); + MM_STORE(g+19*SIMDD, MM_MUL(cz1, g17)); + MM_STORE(g+20*SIMDD, MM_MUL(pz0, g16)); + MM_STORE(g+21*SIMDD, MM_MUL(pz1, g17)); MM_STORE(g+6 *SIMDD, MM_FMA(px0, cx0, b0)); MM_STORE(g+7 *SIMDD, MM_FMA(px1, cx1, b1)); MM_STORE(g+14*SIMDD, MM_FMA(py0, cy0, b0)); @@ -1683,22 +1725,22 @@ static inline void _g0_2d4d_1010(double *g, Rys2eT *bc, CINTEnvVars *envs) __MD py1 = MM_LOAD(cpy+1*SIMDD); __MD pz0 = MM_LOAD(cpz+0*SIMDD); __MD pz1 = MM_LOAD(cpz+1*SIMDD); - __MD b0 = MM_LOAD(b+0*SIMDD); - __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); __MD g16 = MM_LOAD(g+16*SIMDD); __MD g17 = MM_LOAD(g+17*SIMDD); - MM_STORE(g+2 *SIMDD, px0); - MM_STORE(g+3 *SIMDD, px1); - MM_STORE(g+4 *SIMDD, cx0); - MM_STORE(g+5 *SIMDD, cx1); - MM_STORE(g+10*SIMDD, py0); - MM_STORE(g+11*SIMDD, py1); - MM_STORE(g+12*SIMDD, cy0); - MM_STORE(g+13*SIMDD, cy1); - MM_STORE(g+18*SIMDD, MM_MUL(pz0, g16)); - MM_STORE(g+19*SIMDD, MM_MUL(pz1, g17)); - MM_STORE(g+20*SIMDD, MM_MUL(cz0, g16)); - MM_STORE(g+21*SIMDD, MM_MUL(cz1, g17)); + MM_STORE(g+2 *SIMDD, cx0); + MM_STORE(g+3 *SIMDD, cx1); + MM_STORE(g+4 *SIMDD, px0); + MM_STORE(g+5 *SIMDD, px1); + MM_STORE(g+10*SIMDD, cy0); + MM_STORE(g+11*SIMDD, cy1); + MM_STORE(g+12*SIMDD, py0); + MM_STORE(g+13*SIMDD, py1); + MM_STORE(g+18*SIMDD, MM_MUL(cz0, g16)); + MM_STORE(g+19*SIMDD, MM_MUL(cz1, g17)); + MM_STORE(g+20*SIMDD, MM_MUL(pz0, g16)); + MM_STORE(g+21*SIMDD, MM_MUL(pz1, g17)); MM_STORE(g+6 *SIMDD, MM_FMA(px0, cx0, b0)); MM_STORE(g+7 *SIMDD, MM_FMA(px1, cx1, b1)); MM_STORE(g+14*SIMDD, MM_FMA(py0, cy0, b0)); @@ -1717,57 +1759,76 @@ static inline void _g0_2d4d_1011(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cpz = bc->c0pz; double *b0 = bc->b00; double *b1 = bc->b01; - double *rkl = envs->rkrl; - ALIGNMM double rcp[6*SIMDD]; - _make_rc(rcp, cpx, cpy, cpz, rkl); + double *r = envs->rkrl; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD 
rz = MM_SET1(r[2]); + __MD cpx0 = MM_LOAD(cpx+0*SIMDD); + __MD cpx1 = MM_LOAD(cpx+1*SIMDD); + __MD cpy0 = MM_LOAD(cpy+0*SIMDD); + __MD cpy1 = MM_LOAD(cpy+1*SIMDD); + __MD cpz0 = MM_LOAD(cpz+0*SIMDD); + __MD cpz1 = MM_LOAD(cpz+1*SIMDD); + __MD rcp0 = MM_ADD(rx, cpx0); + __MD rcp1 = MM_ADD(rx, cpx1); + __MD rcp2 = MM_ADD(ry, cpy0); + __MD rcp3 = MM_ADD(ry, cpy1); + __MD rcp4 = MM_ADD(rz, cpz0); + __MD rcp5 = MM_ADD(rz, cpz1); + __MD c0x0 = MM_LOAD(c0x+0*SIMDD); + __MD c0x1 = MM_LOAD(c0x+1*SIMDD); + __MD c0y0 = MM_LOAD(c0y+0*SIMDD); + __MD c0y1 = MM_LOAD(c0y+1*SIMDD); + __MD c0z0 = MM_LOAD(c0z+0*SIMDD); + __MD c0z1 = MM_LOAD(c0z+1*SIMDD); __MD b00 = MM_LOAD(b0+0*SIMDD); __MD b01 = MM_LOAD(b0+1*SIMDD); __MD b10 = MM_LOAD(b1+0*SIMDD); __MD b11 = MM_LOAD(b1+1*SIMDD); __MD g48 = MM_LOAD(g+48*SIMDD); __MD g49 = MM_LOAD(g+49*SIMDD); - MM_STORE(g+2 *SIMDD, MM_LOAD(c0x+0*SIMDD)); - MM_STORE(g+3 *SIMDD, MM_LOAD(c0x+1*SIMDD)); - MM_STORE(g+4 *SIMDD, MM_LOAD(rcp+0*SIMDD)); - MM_STORE(g+5 *SIMDD, MM_LOAD(rcp+1*SIMDD)); - MM_STORE(g+8 *SIMDD, MM_LOAD(cpx+0*SIMDD)); - MM_STORE(g+9 *SIMDD, MM_LOAD(cpx+1*SIMDD)); - MM_STORE(g+26*SIMDD, MM_LOAD(c0y+0*SIMDD)); - MM_STORE(g+27*SIMDD, MM_LOAD(c0y+1*SIMDD)); - MM_STORE(g+28*SIMDD, MM_LOAD(rcp+2*SIMDD)); - MM_STORE(g+29*SIMDD, MM_LOAD(rcp+3*SIMDD)); - MM_STORE(g+32*SIMDD, MM_LOAD(cpy+0*SIMDD)); - MM_STORE(g+33*SIMDD, MM_LOAD(cpy+1*SIMDD)); - MM_STORE(g+50*SIMDD, MM_LOAD(c0z+0*SIMDD) * g48); - MM_STORE(g+51*SIMDD, MM_LOAD(c0z+1*SIMDD) * g49); - MM_STORE(g+52*SIMDD, MM_LOAD(rcp+4*SIMDD) * g48); - MM_STORE(g+53*SIMDD, MM_LOAD(rcp+5*SIMDD) * g49); - MM_STORE(g+56*SIMDD, MM_LOAD(cpz+0*SIMDD) * g48); - MM_STORE(g+57*SIMDD, MM_LOAD(cpz+1*SIMDD) * g49); - MM_STORE(g+6 *SIMDD , MM_LOAD(rcp+0*SIMDD) * MM_LOAD(c0x+0*SIMDD) + b00); - MM_STORE(g+7 *SIMDD , MM_LOAD(rcp+1*SIMDD) * MM_LOAD(c0x+1*SIMDD) + b01); - MM_STORE(g+10*SIMDD , MM_LOAD(cpx+0*SIMDD) * MM_LOAD(c0x+0*SIMDD) + b00); - MM_STORE(g+11*SIMDD , MM_LOAD(cpx+1*SIMDD) * MM_LOAD(c0x+1*SIMDD) + 
b01); - MM_STORE(g+12*SIMDD , MM_LOAD(cpx+0*SIMDD) * MM_LOAD(rcp+0*SIMDD) + b10); - MM_STORE(g+13*SIMDD , MM_LOAD(cpx+1*SIMDD) * MM_LOAD(rcp+1*SIMDD) + b11); - MM_STORE(g+30*SIMDD , MM_LOAD(rcp+2*SIMDD) * MM_LOAD(c0y+0*SIMDD) + b00); - MM_STORE(g+31*SIMDD , MM_LOAD(rcp+3*SIMDD) * MM_LOAD(c0y+1*SIMDD) + b01); - MM_STORE(g+34*SIMDD , MM_LOAD(cpy+0*SIMDD) * MM_LOAD(c0y+0*SIMDD) + b00); - MM_STORE(g+35*SIMDD , MM_LOAD(cpy+1*SIMDD) * MM_LOAD(c0y+1*SIMDD) + b01); - MM_STORE(g+36*SIMDD , MM_LOAD(cpy+0*SIMDD) * MM_LOAD(rcp+2*SIMDD) + b10); - MM_STORE(g+37*SIMDD , MM_LOAD(cpy+1*SIMDD) * MM_LOAD(rcp+3*SIMDD) + b11); - MM_STORE(g+54*SIMDD,(MM_LOAD(rcp+4*SIMDD) * MM_LOAD(c0z+0*SIMDD) + b00)* g48); - MM_STORE(g+55*SIMDD,(MM_LOAD(rcp+5*SIMDD) * MM_LOAD(c0z+1*SIMDD) + b01)* g49); - MM_STORE(g+58*SIMDD,(MM_LOAD(cpz+0*SIMDD) * MM_LOAD(c0z+0*SIMDD) + b00)* g48); - MM_STORE(g+59*SIMDD,(MM_LOAD(cpz+1*SIMDD) * MM_LOAD(c0z+1*SIMDD) + b01)* g49); - MM_STORE(g+60*SIMDD,(MM_LOAD(cpz+0*SIMDD) * MM_LOAD(rcp+4*SIMDD) + b10)* g48); - MM_STORE(g+61*SIMDD,(MM_LOAD(cpz+1*SIMDD) * MM_LOAD(rcp+5*SIMDD) + b11)* g49); - MM_STORE(g+14*SIMDD, MM_LOAD(rcp+0*SIMDD) * MM_LOAD(g+10*SIMDD) + b00 * MM_LOAD(cpx+0*SIMDD) + b10 * MM_LOAD(c0x+0*SIMDD)); - MM_STORE(g+15*SIMDD, MM_LOAD(rcp+1*SIMDD) * MM_LOAD(g+11*SIMDD) + b01 * MM_LOAD(cpx+1*SIMDD) + b11 * MM_LOAD(c0x+1*SIMDD)); - MM_STORE(g+38*SIMDD, MM_LOAD(rcp+2*SIMDD) * MM_LOAD(g+34*SIMDD) + b00 * MM_LOAD(cpy+0*SIMDD) + b10 * MM_LOAD(c0y+0*SIMDD)); - MM_STORE(g+39*SIMDD, MM_LOAD(rcp+3*SIMDD) * MM_LOAD(g+35*SIMDD) + b01 * MM_LOAD(cpy+1*SIMDD) + b11 * MM_LOAD(c0y+1*SIMDD)); - MM_STORE(g+62*SIMDD, MM_LOAD(rcp+4*SIMDD) * MM_LOAD(g+58*SIMDD) + b00 * MM_LOAD(g+56*SIMDD) + b10 * MM_LOAD(g+50*SIMDD)); - MM_STORE(g+63*SIMDD, MM_LOAD(rcp+5*SIMDD) * MM_LOAD(g+59*SIMDD) + b01 * MM_LOAD(g+57*SIMDD) + b11 * MM_LOAD(g+51*SIMDD)); + MM_STORE(g+2 *SIMDD, c0x0); + MM_STORE(g+3 *SIMDD, c0x1); + MM_STORE(g+4 *SIMDD, rcp0); + MM_STORE(g+5 *SIMDD, rcp1); + MM_STORE(g+8 *SIMDD, 
cpx0); + MM_STORE(g+9 *SIMDD, cpx1); + MM_STORE(g+26*SIMDD, c0y0); + MM_STORE(g+27*SIMDD, c0y1); + MM_STORE(g+28*SIMDD, rcp2); + MM_STORE(g+29*SIMDD, rcp3); + MM_STORE(g+32*SIMDD, cpy0); + MM_STORE(g+33*SIMDD, cpy1); + MM_STORE(g+50*SIMDD, c0z0 * g48); + MM_STORE(g+51*SIMDD, c0z1 * g49); + MM_STORE(g+52*SIMDD, rcp4 * g48); + MM_STORE(g+53*SIMDD, rcp5 * g49); + MM_STORE(g+56*SIMDD, cpz0 * g48); + MM_STORE(g+57*SIMDD, cpz1 * g49); + MM_STORE(g+6 *SIMDD, rcp0 * c0x0 + b00); + MM_STORE(g+7 *SIMDD, rcp1 * c0x1 + b01); + MM_STORE(g+10*SIMDD, cpx0 * c0x0 + b00); + MM_STORE(g+11*SIMDD, cpx1 * c0x1 + b01); + MM_STORE(g+12*SIMDD, cpx0 * rcp0 + b10); + MM_STORE(g+13*SIMDD, cpx1 * rcp1 + b11); + MM_STORE(g+30*SIMDD, rcp2 * c0y0 + b00); + MM_STORE(g+31*SIMDD, rcp3 * c0y1 + b01); + MM_STORE(g+34*SIMDD, cpy0 * c0y0 + b00); + MM_STORE(g+35*SIMDD, cpy1 * c0y1 + b01); + MM_STORE(g+36*SIMDD, cpy0 * rcp2 + b10); + MM_STORE(g+37*SIMDD, cpy1 * rcp3 + b11); + MM_STORE(g+54*SIMDD,(rcp4 * c0z0 + b00)* g48); + MM_STORE(g+55*SIMDD,(rcp5 * c0z1 + b01)* g49); + MM_STORE(g+58*SIMDD,(cpz0 * c0z0 + b00)* g48); + MM_STORE(g+59*SIMDD,(cpz1 * c0z1 + b01)* g49); + MM_STORE(g+60*SIMDD,(cpz0 * rcp4 + b10)* g48); + MM_STORE(g+61*SIMDD,(cpz1 * rcp5 + b11)* g49); + MM_STORE(g+14*SIMDD, rcp0 * MM_LOAD(g+10*SIMDD) + b00 * cpx0 + b10 * c0x0); + MM_STORE(g+15*SIMDD, rcp1 * MM_LOAD(g+11*SIMDD) + b01 * cpx1 + b11 * c0x1); + MM_STORE(g+38*SIMDD, rcp2 * MM_LOAD(g+34*SIMDD) + b00 * cpy0 + b10 * c0y0); + MM_STORE(g+39*SIMDD, rcp3 * MM_LOAD(g+35*SIMDD) + b01 * cpy1 + b11 * c0y1); + MM_STORE(g+62*SIMDD, rcp4 * MM_LOAD(g+58*SIMDD) + b00 * MM_LOAD(g+56*SIMDD) + b10 * MM_LOAD(g+50*SIMDD)); + MM_STORE(g+63*SIMDD, rcp5 * MM_LOAD(g+59*SIMDD) + b01 * MM_LOAD(g+57*SIMDD) + b11 * MM_LOAD(g+51*SIMDD)); } static inline void _g0_2d4d_1020(double *g, Rys2eT *bc, CINTEnvVars *envs) @@ -1837,20 +1898,21 @@ static inline void _g0_2d4d_1100(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cz = bc->c00z; double *b = bc->b10; double 
*r = envs->rirj; - ALIGNMM double rc[6*SIMDD]; - _make_rc(rc, cx, cy, cz, r); - __MD r0 = MM_LOAD(rc+0*SIMDD); - __MD r1 = MM_LOAD(rc+1*SIMDD); - __MD r2 = MM_LOAD(rc+2*SIMDD); - __MD r3 = MM_LOAD(rc+3*SIMDD); - __MD r4 = MM_LOAD(rc+4*SIMDD); - __MD r5 = MM_LOAD(rc+5*SIMDD); + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); __MD cx0 = MM_LOAD(cx+0*SIMDD); __MD cx1 = MM_LOAD(cx+1*SIMDD); __MD cy0 = MM_LOAD(cy+0*SIMDD); __MD cy1 = MM_LOAD(cy+1*SIMDD); __MD cz0 = MM_LOAD(cz+0*SIMDD); __MD cz1 = MM_LOAD(cz+1*SIMDD); + __MD r0 = MM_ADD(rx, cx0); + __MD r1 = MM_ADD(rx, cx1); + __MD r2 = MM_ADD(ry, cy0); + __MD r3 = MM_ADD(ry, cy1); + __MD r4 = MM_ADD(rz, cz0); + __MD r5 = MM_ADD(rz, cz1); __MD b0 = MM_LOAD(b+0*SIMDD); __MD b1 = MM_LOAD(b+1*SIMDD); __MD g24 = MM_LOAD(g+24*SIMDD); @@ -1885,57 +1947,76 @@ static inline void _g0_2d4d_1101(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cpz = bc->c0pz; double *b0 = bc->b00; double *b1 = bc->b10; - double *rij = envs->rirj; - ALIGNMM double rc0[6*SIMDD]; - _make_rc(rc0, c0x, c0y, c0z, rij); + double *r = envs->rirj; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); + __MD c0x0 = MM_LOAD(c0x+0*SIMDD); + __MD c0x1 = MM_LOAD(c0x+1*SIMDD); + __MD c0y0 = MM_LOAD(c0y+0*SIMDD); + __MD c0y1 = MM_LOAD(c0y+1*SIMDD); + __MD c0z0 = MM_LOAD(c0z+0*SIMDD); + __MD c0z1 = MM_LOAD(c0z+1*SIMDD); + __MD rc00 = MM_ADD(rx, c0x0); + __MD rc01 = MM_ADD(rx, c0x1); + __MD rc02 = MM_ADD(ry, c0y0); + __MD rc03 = MM_ADD(ry, c0y1); + __MD rc04 = MM_ADD(rz, c0z0); + __MD rc05 = MM_ADD(rz, c0z1); + __MD cpx0 = MM_LOAD(cpx+0*SIMDD); + __MD cpx1 = MM_LOAD(cpx+1*SIMDD); + __MD cpy0 = MM_LOAD(cpy+0*SIMDD); + __MD cpy1 = MM_LOAD(cpy+1*SIMDD); + __MD cpz0 = MM_LOAD(cpz+0*SIMDD); + __MD cpz1 = MM_LOAD(cpz+1*SIMDD); __MD b00 = MM_LOAD(b0+0*SIMDD); __MD b01 = MM_LOAD(b0+1*SIMDD); __MD b10 = MM_LOAD(b1+0*SIMDD); __MD b11 = MM_LOAD(b1+1*SIMDD); __MD g48 = MM_LOAD(g+48*SIMDD); __MD g49 = 
MM_LOAD(g+49*SIMDD); - MM_STORE(g+2 *SIMDD, MM_LOAD(rc0+0*SIMDD)); - MM_STORE(g+3 *SIMDD, MM_LOAD(rc0+1*SIMDD)); - MM_STORE(g+4 *SIMDD, MM_LOAD(cpx+0*SIMDD)); - MM_STORE(g+5 *SIMDD, MM_LOAD(cpx+1*SIMDD)); - MM_STORE(g+8 *SIMDD, MM_LOAD(c0x+0*SIMDD)); - MM_STORE(g+9 *SIMDD, MM_LOAD(c0x+1*SIMDD)); - MM_STORE(g+26*SIMDD, MM_LOAD(rc0+2*SIMDD)); - MM_STORE(g+27*SIMDD, MM_LOAD(rc0+3*SIMDD)); - MM_STORE(g+28*SIMDD, MM_LOAD(cpy+0*SIMDD)); - MM_STORE(g+29*SIMDD, MM_LOAD(cpy+1*SIMDD)); - MM_STORE(g+32*SIMDD, MM_LOAD(c0y+0*SIMDD)); - MM_STORE(g+33*SIMDD, MM_LOAD(c0y+1*SIMDD)); - MM_STORE(g+50*SIMDD, MM_LOAD(rc0+4*SIMDD) * g48); - MM_STORE(g+51*SIMDD, MM_LOAD(rc0+5*SIMDD) * g49); - MM_STORE(g+52*SIMDD, MM_LOAD(cpz+0*SIMDD) * g48); - MM_STORE(g+53*SIMDD, MM_LOAD(cpz+1*SIMDD) * g49); - MM_STORE(g+56*SIMDD, MM_LOAD(c0z+0*SIMDD) * g48); - MM_STORE(g+57*SIMDD, MM_LOAD(c0z+1*SIMDD) * g49); - MM_STORE(g+6 *SIMDD, MM_LOAD(cpx+0*SIMDD) * MM_LOAD(rc0+0*SIMDD) + b00); - MM_STORE(g+7 *SIMDD, MM_LOAD(cpx+1*SIMDD) * MM_LOAD(rc0+1*SIMDD) + b01); - MM_STORE(g+10*SIMDD, MM_LOAD(c0x+0*SIMDD) * MM_LOAD(rc0+0*SIMDD) + b10); - MM_STORE(g+11*SIMDD, MM_LOAD(c0x+1*SIMDD) * MM_LOAD(rc0+1*SIMDD) + b11); - MM_STORE(g+12*SIMDD, MM_LOAD(c0x+0*SIMDD) * MM_LOAD(cpx+0*SIMDD) + b00); - MM_STORE(g+13*SIMDD, MM_LOAD(c0x+1*SIMDD) * MM_LOAD(cpx+1*SIMDD) + b01); - MM_STORE(g+30*SIMDD, MM_LOAD(cpy+0*SIMDD) * MM_LOAD(rc0+2*SIMDD) + b00); - MM_STORE(g+31*SIMDD, MM_LOAD(cpy+1*SIMDD) * MM_LOAD(rc0+3*SIMDD) + b01); - MM_STORE(g+34*SIMDD, MM_LOAD(c0y+0*SIMDD) * MM_LOAD(rc0+2*SIMDD) + b10); - MM_STORE(g+35*SIMDD, MM_LOAD(c0y+1*SIMDD) * MM_LOAD(rc0+3*SIMDD) + b11); - MM_STORE(g+36*SIMDD, MM_LOAD(c0y+0*SIMDD) * MM_LOAD(cpy+0*SIMDD) + b00); - MM_STORE(g+37*SIMDD, MM_LOAD(c0y+1*SIMDD) * MM_LOAD(cpy+1*SIMDD) + b01); - MM_STORE(g+54*SIMDD,(MM_LOAD(cpz+0*SIMDD) * MM_LOAD(rc0+4*SIMDD) + b00)* g48); - MM_STORE(g+55*SIMDD,(MM_LOAD(cpz+1*SIMDD) * MM_LOAD(rc0+5*SIMDD) + b01)* g49); - MM_STORE(g+58*SIMDD,(MM_LOAD(c0z+0*SIMDD) * 
MM_LOAD(rc0+4*SIMDD) + b10)* g48); - MM_STORE(g+59*SIMDD,(MM_LOAD(c0z+1*SIMDD) * MM_LOAD(rc0+5*SIMDD) + b11)* g49); - MM_STORE(g+60*SIMDD,(MM_LOAD(c0z+0*SIMDD) * MM_LOAD(cpz+0*SIMDD) + b00)* g48); - MM_STORE(g+61*SIMDD,(MM_LOAD(c0z+1*SIMDD) * MM_LOAD(cpz+1*SIMDD) + b01)* g49); - MM_STORE(g+14*SIMDD, MM_LOAD(cpx+0*SIMDD) * MM_LOAD(g+10*SIMDD ) + b00 *(MM_LOAD(rc0+0*SIMDD) + MM_LOAD(c0x+0*SIMDD))); - MM_STORE(g+15*SIMDD, MM_LOAD(cpx+1*SIMDD) * MM_LOAD(g+11*SIMDD ) + b01 *(MM_LOAD(rc0+1*SIMDD) + MM_LOAD(c0x+1*SIMDD))); - MM_STORE(g+38*SIMDD, MM_LOAD(cpy+0*SIMDD) * MM_LOAD(g+34*SIMDD ) + b00 *(MM_LOAD(rc0+2*SIMDD) + MM_LOAD(c0y+0*SIMDD))); - MM_STORE(g+39*SIMDD, MM_LOAD(cpy+1*SIMDD) * MM_LOAD(g+35*SIMDD ) + b01 *(MM_LOAD(rc0+3*SIMDD) + MM_LOAD(c0y+1*SIMDD))); - MM_STORE(g+62*SIMDD, MM_LOAD(cpz+0*SIMDD) * MM_LOAD(g+58*SIMDD ) + b00 *(MM_LOAD(g+50*SIMDD ) + MM_LOAD(g+56*SIMDD ))); - MM_STORE(g+63*SIMDD, MM_LOAD(cpz+1*SIMDD) * MM_LOAD(g+59*SIMDD ) + b01 *(MM_LOAD(g+51*SIMDD ) + MM_LOAD(g+57*SIMDD ))); + MM_STORE(g+2 *SIMDD, rc00); + MM_STORE(g+3 *SIMDD, rc01); + MM_STORE(g+4 *SIMDD, cpx0); + MM_STORE(g+5 *SIMDD, cpx1); + MM_STORE(g+8 *SIMDD, c0x0); + MM_STORE(g+9 *SIMDD, c0x1); + MM_STORE(g+26*SIMDD, rc02); + MM_STORE(g+27*SIMDD, rc03); + MM_STORE(g+28*SIMDD, cpy0); + MM_STORE(g+29*SIMDD, cpy1); + MM_STORE(g+32*SIMDD, c0y0); + MM_STORE(g+33*SIMDD, c0y1); + MM_STORE(g+50*SIMDD, rc04 * g48); + MM_STORE(g+51*SIMDD, rc05 * g49); + MM_STORE(g+52*SIMDD, cpz0 * g48); + MM_STORE(g+53*SIMDD, cpz1 * g49); + MM_STORE(g+56*SIMDD, c0z0 * g48); + MM_STORE(g+57*SIMDD, c0z1 * g49); + MM_STORE(g+6 *SIMDD, cpx0 * rc00 + b00); + MM_STORE(g+7 *SIMDD, cpx1 * rc01 + b01); + MM_STORE(g+10*SIMDD, c0x0 * rc00 + b10); + MM_STORE(g+11*SIMDD, c0x1 * rc01 + b11); + MM_STORE(g+12*SIMDD, c0x0 * cpx0 + b00); + MM_STORE(g+13*SIMDD, c0x1 * cpx1 + b01); + MM_STORE(g+30*SIMDD, cpy0 * rc02 + b00); + MM_STORE(g+31*SIMDD, cpy1 * rc03 + b01); + MM_STORE(g+34*SIMDD, c0y0 * rc02 + b10); + MM_STORE(g+35*SIMDD, 
c0y1 * rc03 + b11); + MM_STORE(g+36*SIMDD, c0y0 * cpy0 + b00); + MM_STORE(g+37*SIMDD, c0y1 * cpy1 + b01); + MM_STORE(g+54*SIMDD,(cpz0 * rc04 + b00)* g48); + MM_STORE(g+55*SIMDD,(cpz1 * rc05 + b01)* g49); + MM_STORE(g+58*SIMDD,(c0z0 * rc04 + b10)* g48); + MM_STORE(g+59*SIMDD,(c0z1 * rc05 + b11)* g49); + MM_STORE(g+60*SIMDD,(c0z0 * cpz0 + b00)* g48); + MM_STORE(g+61*SIMDD,(c0z1 * cpz1 + b01)* g49); + MM_STORE(g+14*SIMDD, cpx0 * MM_LOAD(g+10*SIMDD ) + b00 *(rc00 + c0x0)); + MM_STORE(g+15*SIMDD, cpx1 * MM_LOAD(g+11*SIMDD ) + b01 *(rc01 + c0x1)); + MM_STORE(g+38*SIMDD, cpy0 * MM_LOAD(g+34*SIMDD ) + b00 *(rc02 + c0y0)); + MM_STORE(g+39*SIMDD, cpy1 * MM_LOAD(g+35*SIMDD ) + b01 *(rc03 + c0y1)); + MM_STORE(g+62*SIMDD, cpz0 * MM_LOAD(g+58*SIMDD ) + b00 *(MM_LOAD(g+50*SIMDD ) + MM_LOAD(g+56*SIMDD ))); + MM_STORE(g+63*SIMDD, cpz1 * MM_LOAD(g+59*SIMDD ) + b01 *(MM_LOAD(g+51*SIMDD ) + MM_LOAD(g+57*SIMDD ))); } static inline void _g0_2d4d_1110(double *g, Rys2eT *bc, CINTEnvVars *envs) @@ -1948,57 +2029,76 @@ static inline void _g0_2d4d_1110(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cpz = bc->c0pz; double *b0 = bc->b00; double *b1 = bc->b10; - double *r0 = envs->rirj; - ALIGNMM double rc0[6*SIMDD]; - _make_rc(rc0, c0x, c0y, c0z, r0); + double *r = envs->rirj; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); + __MD c0x0 = MM_LOAD(c0x+0*SIMDD); + __MD c0x1 = MM_LOAD(c0x+1*SIMDD); + __MD c0y0 = MM_LOAD(c0y+0*SIMDD); + __MD c0y1 = MM_LOAD(c0y+1*SIMDD); + __MD c0z0 = MM_LOAD(c0z+0*SIMDD); + __MD c0z1 = MM_LOAD(c0z+1*SIMDD); + __MD rc00 = MM_ADD(rx, c0x0); + __MD rc01 = MM_ADD(rx, c0x1); + __MD rc02 = MM_ADD(ry, c0y0); + __MD rc03 = MM_ADD(ry, c0y1); + __MD rc04 = MM_ADD(rz, c0z0); + __MD rc05 = MM_ADD(rz, c0z1); + __MD cpx0 = MM_LOAD(cpx+0*SIMDD); + __MD cpx1 = MM_LOAD(cpx+1*SIMDD); + __MD cpy0 = MM_LOAD(cpy+0*SIMDD); + __MD cpy1 = MM_LOAD(cpy+1*SIMDD); + __MD cpz0 = MM_LOAD(cpz+0*SIMDD); + __MD cpz1 = MM_LOAD(cpz+1*SIMDD); __MD b00 = 
MM_LOAD(b0+0*SIMDD); __MD b01 = MM_LOAD(b0+1*SIMDD); __MD b10 = MM_LOAD(b1+0*SIMDD); __MD b11 = MM_LOAD(b1+1*SIMDD); __MD g48 = MM_LOAD(g+48*SIMDD); __MD g49 = MM_LOAD(g+49*SIMDD); - MM_STORE(g+2 *SIMDD, MM_LOAD(rc0+0*SIMDD)); - MM_STORE(g+3 *SIMDD, MM_LOAD(rc0+1*SIMDD)); - MM_STORE(g+4 *SIMDD, MM_LOAD(cpx+0*SIMDD)); - MM_STORE(g+5 *SIMDD, MM_LOAD(cpx+1*SIMDD)); - MM_STORE(g+8 *SIMDD, MM_LOAD(c0x+0*SIMDD)); - MM_STORE(g+9 *SIMDD, MM_LOAD(c0x+1*SIMDD)); - MM_STORE(g+26*SIMDD, MM_LOAD(rc0+2*SIMDD)); - MM_STORE(g+27*SIMDD, MM_LOAD(rc0+3*SIMDD)); - MM_STORE(g+28*SIMDD, MM_LOAD(cpy+0*SIMDD)); - MM_STORE(g+29*SIMDD, MM_LOAD(cpy+1*SIMDD)); - MM_STORE(g+32*SIMDD, MM_LOAD(c0y+0*SIMDD)); - MM_STORE(g+33*SIMDD, MM_LOAD(c0y+1*SIMDD)); - MM_STORE(g+50*SIMDD, MM_LOAD(rc0+4*SIMDD) * g48); - MM_STORE(g+51*SIMDD, MM_LOAD(rc0+5*SIMDD) * g49); - MM_STORE(g+52*SIMDD, MM_LOAD(cpz+0*SIMDD) * g48); - MM_STORE(g+53*SIMDD, MM_LOAD(cpz+1*SIMDD) * g49); - MM_STORE(g+56*SIMDD, MM_LOAD(c0z+0*SIMDD) * g48); - MM_STORE(g+57*SIMDD, MM_LOAD(c0z+1*SIMDD) * g49); - MM_STORE(g+6 *SIMDD, MM_LOAD(cpx+0*SIMDD) * MM_LOAD(rc0+0*SIMDD) + b00); - MM_STORE(g+7 *SIMDD, MM_LOAD(cpx+1*SIMDD) * MM_LOAD(rc0+1*SIMDD) + b01); - MM_STORE(g+10*SIMDD, MM_LOAD(c0x+0*SIMDD) * MM_LOAD(rc0+0*SIMDD) + b10); - MM_STORE(g+11*SIMDD, MM_LOAD(c0x+1*SIMDD) * MM_LOAD(rc0+1*SIMDD) + b11); - MM_STORE(g+12*SIMDD, MM_LOAD(c0x+0*SIMDD) * MM_LOAD(cpx+0*SIMDD) + b00); - MM_STORE(g+13*SIMDD, MM_LOAD(c0x+1*SIMDD) * MM_LOAD(cpx+1*SIMDD) + b01); - MM_STORE(g+30*SIMDD, MM_LOAD(cpy+0*SIMDD) * MM_LOAD(rc0+2*SIMDD) + b00); - MM_STORE(g+31*SIMDD, MM_LOAD(cpy+1*SIMDD) * MM_LOAD(rc0+3*SIMDD) + b01); - MM_STORE(g+34*SIMDD, MM_LOAD(c0y+0*SIMDD) * MM_LOAD(rc0+2*SIMDD) + b10); - MM_STORE(g+35*SIMDD, MM_LOAD(c0y+1*SIMDD) * MM_LOAD(rc0+3*SIMDD) + b11); - MM_STORE(g+36*SIMDD, MM_LOAD(c0y+0*SIMDD) * MM_LOAD(cpy+0*SIMDD) + b00); - MM_STORE(g+37*SIMDD, MM_LOAD(c0y+1*SIMDD) * MM_LOAD(cpy+1*SIMDD) + b01); - MM_STORE(g+54*SIMDD,(MM_LOAD(cpz+0*SIMDD) * 
MM_LOAD(rc0+4*SIMDD) + b00)* g48); - MM_STORE(g+55*SIMDD,(MM_LOAD(cpz+1*SIMDD) * MM_LOAD(rc0+5*SIMDD) + b01)* g49); - MM_STORE(g+58*SIMDD,(MM_LOAD(c0z+0*SIMDD) * MM_LOAD(rc0+4*SIMDD) + b10)* g48); - MM_STORE(g+59*SIMDD,(MM_LOAD(c0z+1*SIMDD) * MM_LOAD(rc0+5*SIMDD) + b11)* g49); - MM_STORE(g+60*SIMDD,(MM_LOAD(c0z+0*SIMDD) * MM_LOAD(cpz+0*SIMDD) + b00)* g48); - MM_STORE(g+61*SIMDD,(MM_LOAD(c0z+1*SIMDD) * MM_LOAD(cpz+1*SIMDD) + b01)* g49); - MM_STORE(g+14*SIMDD, MM_LOAD(cpx+0*SIMDD) * MM_LOAD(g+10*SIMDD ) + b00 *(MM_LOAD(rc0+0*SIMDD) + MM_LOAD(c0x+0*SIMDD))); - MM_STORE(g+15*SIMDD, MM_LOAD(cpx+1*SIMDD) * MM_LOAD(g+11*SIMDD ) + b01 *(MM_LOAD(rc0+1*SIMDD) + MM_LOAD(c0x+1*SIMDD))); - MM_STORE(g+38*SIMDD, MM_LOAD(cpy+0*SIMDD) * MM_LOAD(g+34*SIMDD ) + b00 *(MM_LOAD(rc0+2*SIMDD) + MM_LOAD(c0y+0*SIMDD))); - MM_STORE(g+39*SIMDD, MM_LOAD(cpy+1*SIMDD) * MM_LOAD(g+35*SIMDD ) + b01 *(MM_LOAD(rc0+3*SIMDD) + MM_LOAD(c0y+1*SIMDD))); - MM_STORE(g+62*SIMDD, MM_LOAD(cpz+0*SIMDD) * MM_LOAD(g+58*SIMDD ) + b00 *(MM_LOAD(g+50*SIMDD ) + MM_LOAD(g+56*SIMDD ))); - MM_STORE(g+63*SIMDD, MM_LOAD(cpz+1*SIMDD) * MM_LOAD(g+59*SIMDD ) + b01 *(MM_LOAD(g+51*SIMDD ) + MM_LOAD(g+57*SIMDD ))); + MM_STORE(g+2 *SIMDD, rc00); + MM_STORE(g+3 *SIMDD, rc01); + MM_STORE(g+4 *SIMDD, cpx0); + MM_STORE(g+5 *SIMDD, cpx1); + MM_STORE(g+8 *SIMDD, c0x0); + MM_STORE(g+9 *SIMDD, c0x1); + MM_STORE(g+26*SIMDD, rc02); + MM_STORE(g+27*SIMDD, rc03); + MM_STORE(g+28*SIMDD, cpy0); + MM_STORE(g+29*SIMDD, cpy1); + MM_STORE(g+32*SIMDD, c0y0); + MM_STORE(g+33*SIMDD, c0y1); + MM_STORE(g+50*SIMDD, rc04 * g48); + MM_STORE(g+51*SIMDD, rc05 * g49); + MM_STORE(g+52*SIMDD, cpz0 * g48); + MM_STORE(g+53*SIMDD, cpz1 * g49); + MM_STORE(g+56*SIMDD, c0z0 * g48); + MM_STORE(g+57*SIMDD, c0z1 * g49); + MM_STORE(g+6 *SIMDD, cpx0 * rc00 + b00); + MM_STORE(g+7 *SIMDD, cpx1 * rc01 + b01); + MM_STORE(g+10*SIMDD, c0x0 * rc00 + b10); + MM_STORE(g+11*SIMDD, c0x1 * rc01 + b11); + MM_STORE(g+12*SIMDD, c0x0 * cpx0 + b00); + MM_STORE(g+13*SIMDD, c0x1 * cpx1 + 
b01); + MM_STORE(g+30*SIMDD, cpy0 * rc02 + b00); + MM_STORE(g+31*SIMDD, cpy1 * rc03 + b01); + MM_STORE(g+34*SIMDD, c0y0 * rc02 + b10); + MM_STORE(g+35*SIMDD, c0y1 * rc03 + b11); + MM_STORE(g+36*SIMDD, c0y0 * cpy0 + b00); + MM_STORE(g+37*SIMDD, c0y1 * cpy1 + b01); + MM_STORE(g+54*SIMDD,(cpz0 * rc04 + b00)* g48); + MM_STORE(g+55*SIMDD,(cpz1 * rc05 + b01)* g49); + MM_STORE(g+58*SIMDD,(c0z0 * rc04 + b10)* g48); + MM_STORE(g+59*SIMDD,(c0z1 * rc05 + b11)* g49); + MM_STORE(g+60*SIMDD,(c0z0 * cpz0 + b00)* g48); + MM_STORE(g+61*SIMDD,(c0z1 * cpz1 + b01)* g49); + MM_STORE(g+14*SIMDD, cpx0 * MM_LOAD(g+10*SIMDD) + b00 *(rc00 + c0x0)); + MM_STORE(g+15*SIMDD, cpx1 * MM_LOAD(g+11*SIMDD) + b01 *(rc01 + c0x1)); + MM_STORE(g+38*SIMDD, cpy0 * MM_LOAD(g+34*SIMDD) + b00 *(rc02 + c0y0)); + MM_STORE(g+39*SIMDD, cpy1 * MM_LOAD(g+35*SIMDD) + b01 *(rc03 + c0y1)); + MM_STORE(g+62*SIMDD, cpz0 * MM_LOAD(g+58*SIMDD) + b00 *(MM_LOAD(g+50*SIMDD) + MM_LOAD(g+56*SIMDD))); + MM_STORE(g+63*SIMDD, cpz1 * MM_LOAD(g+59*SIMDD) + b01 *(MM_LOAD(g+51*SIMDD) + MM_LOAD(g+57*SIMDD))); } static inline void _g0_2d4d_1200(double *g, Rys2eT *bc, CINTEnvVars *envs) @@ -2008,20 +2108,21 @@ static inline void _g0_2d4d_1200(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cz = bc->c00z; double *b = bc->b10; double *r = envs->rirj; - ALIGNMM double rc[6*SIMDD]; - _make_rc(rc, cx, cy, cz, r); - __MD r0 = MM_LOAD(rc+0*SIMDD); - __MD r1 = MM_LOAD(rc+1*SIMDD); - __MD r2 = MM_LOAD(rc+2*SIMDD); - __MD r3 = MM_LOAD(rc+3*SIMDD); - __MD r4 = MM_LOAD(rc+4*SIMDD); - __MD r5 = MM_LOAD(rc+5*SIMDD); + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); __MD cx0 = MM_LOAD(cx+0*SIMDD); __MD cx1 = MM_LOAD(cx+1*SIMDD); __MD cy0 = MM_LOAD(cy+0*SIMDD); __MD cy1 = MM_LOAD(cy+1*SIMDD); __MD cz0 = MM_LOAD(cz+0*SIMDD); __MD cz1 = MM_LOAD(cz+1*SIMDD); + __MD r0 = MM_ADD(rx, cx0); + __MD r1 = MM_ADD(rx, cx1); + __MD r2 = MM_ADD(ry, cy0); + __MD r3 = MM_ADD(ry, cy1); + __MD r4 = MM_ADD(rz, cz0); + __MD r5 = MM_ADD(rz, 
cz1); __MD b0 = MM_LOAD(b+0*SIMDD); __MD b1 = MM_LOAD(b+1*SIMDD); __MD i2 = MM_SET1(2.); @@ -2098,7 +2199,7 @@ static inline void _g0_2d4d_2001(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b0 = bc->b00; - double *b1 = bc->b01; + double *b1 = bc->b10; __MD r0 = MM_LOAD(c0x+0*SIMDD); __MD r1 = MM_LOAD(c0x+1*SIMDD); __MD r2 = MM_LOAD(c0y+0*SIMDD); @@ -2158,7 +2259,7 @@ static inline void _g0_2d4d_2010(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b0 = bc->b00; - double *b1 = bc->b01; + double *b1 = bc->b10; __MD r0 = MM_LOAD(c0x+0*SIMDD); __MD r1 = MM_LOAD(c0x+1*SIMDD); __MD r2 = MM_LOAD(c0y+0*SIMDD); @@ -2215,26 +2316,27 @@ static inline void _g0_2d4d_2100(double *g, Rys2eT *bc, CINTEnvVars *envs) double *cy = bc->c00y; double *cz = bc->c00z; double *b1 = bc->b10; - double *rij = envs->rirj; - ALIGNMM double rc[6*SIMDD]; - _make_rc(rc, cx, cy, cz, rij); - __MD b10 = MM_LOAD(b1+0*SIMDD); - __MD b11 = MM_LOAD(b1+1*SIMDD); - __MD g32 = MM_LOAD(g+32*SIMDD); - __MD g33 = MM_LOAD(g+33*SIMDD); - __MD i2 = MM_SET1(2.); - __MD r0 = MM_LOAD(rc+0*SIMDD); - __MD r1 = MM_LOAD(rc+1*SIMDD); - __MD r2 = MM_LOAD(rc+2*SIMDD); - __MD r3 = MM_LOAD(rc+3*SIMDD); - __MD r4 = MM_LOAD(rc+4*SIMDD); - __MD r5 = MM_LOAD(rc+5*SIMDD); + double *r = envs->rirj; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); __MD s0 = MM_LOAD(cx+0*SIMDD); __MD s1 = MM_LOAD(cx+1*SIMDD); __MD s2 = MM_LOAD(cy+0*SIMDD); __MD s3 = MM_LOAD(cy+1*SIMDD); __MD s4 = MM_LOAD(cz+0*SIMDD); __MD s5 = MM_LOAD(cz+1*SIMDD); + __MD r0 = MM_ADD(rx, s0); + __MD r1 = MM_ADD(rx, s1); + __MD r2 = MM_ADD(ry, s2); + __MD r3 = MM_ADD(ry, s3); + __MD r4 = MM_ADD(rz, s4); + __MD r5 = MM_ADD(rz, s5); + __MD b10 = MM_LOAD(b1+0*SIMDD); + __MD b11 = MM_LOAD(b1+1*SIMDD); + __MD g32 = MM_LOAD(g+32*SIMDD); + __MD g33 = MM_LOAD(g+33*SIMDD); + __MD i2 = MM_SET1(2.); MM_STORE(g+2 *SIMDD, s0); MM_STORE(g+3 *SIMDD, 
s1); MM_STORE(g+8 *SIMDD, r0); @@ -2353,94 +2455,2990 @@ void CINTg0_2e_2d4d_unrolled(double *g, Rys2eT *bc, CINTEnvVars *envs) (int)envs->ll_ceil, (int)envs->lj_ceil); } -void CINTg0_2e_lj2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs) +static inline void _srg0_2d4d_0000(double *g, Rys2eT *bc, CINTEnvVars *envs) { - CINTg0_2e_2d(g, bc, envs); - CINTg0_lj_4d(g, envs); } -void CINTg0_2e_kj2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs) + +static inline void _srg0_2d4d_0001(double *g, Rys2eT *bc, CINTEnvVars *envs) { - CINTg0_2e_2d(g, bc, envs); - CINTg0_kj_4d(g, envs); + double *cx = bc->c0px; + double *cy = bc->c0py; + double *cz = bc->c0pz; + MM_STORE(g+2*SIMDD, MM_LOAD(cx+0*SIMDD)); + MM_STORE(g+3*SIMDD, MM_LOAD(cx+1*SIMDD)); + MM_STORE(g+6*SIMDD, MM_LOAD(cy+0*SIMDD)); + MM_STORE(g+7*SIMDD, MM_LOAD(cy+1*SIMDD)); + MM_STORE(g+10*SIMDD, MM_MUL(MM_LOAD(cz+0*SIMDD), MM_LOAD(g+8*SIMDD))); + MM_STORE(g+11*SIMDD, MM_MUL(MM_LOAD(cz+1*SIMDD), MM_LOAD(g+9*SIMDD))); } -void CINTg0_2e_ik2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs) + +static inline void _srg0_2d4d_0002(double *g, Rys2eT *bc, CINTEnvVars *envs) { - CINTg0_2e_2d(g, bc, envs); - CINTg0_ik_4d(g, envs); + double *cx = bc->c0px; + double *cy = bc->c0py; + double *cz = bc->c0pz; + double *b = bc->b01; + __MD cx0 = MM_LOAD(cx+0*SIMDD); + __MD cx1 = MM_LOAD(cx+1*SIMDD); + __MD cx2 = MM_LOAD(cx+2*SIMDD); + __MD cx3 = MM_LOAD(cx+3*SIMDD); + __MD cy0 = MM_LOAD(cy+0*SIMDD); + __MD cy1 = MM_LOAD(cy+1*SIMDD); + __MD cy2 = MM_LOAD(cy+2*SIMDD); + __MD cy3 = MM_LOAD(cy+3*SIMDD); + __MD cz0 = MM_LOAD(cz+0*SIMDD); + __MD cz1 = MM_LOAD(cz+1*SIMDD); + __MD cz2 = MM_LOAD(cz+2*SIMDD); + __MD cz3 = MM_LOAD(cz+3*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD g24 = MM_LOAD(g+24*SIMDD); + __MD g25 = MM_LOAD(g+25*SIMDD); + __MD g26 = MM_LOAD(g+26*SIMDD); + __MD g27 = MM_LOAD(g+27*SIMDD); + MM_STORE(g+4 *SIMDD, cx0); + MM_STORE(g+5 *SIMDD, 
cx1); + MM_STORE(g+6 *SIMDD, cx2); + MM_STORE(g+7 *SIMDD, cx3); + MM_STORE(g+16*SIMDD, cy0); + MM_STORE(g+17*SIMDD, cy1); + MM_STORE(g+18*SIMDD, cy2); + MM_STORE(g+19*SIMDD, cy3); + MM_STORE(g+28*SIMDD, MM_MUL(cz0, g24)); + MM_STORE(g+29*SIMDD, MM_MUL(cz1, g25)); + MM_STORE(g+30*SIMDD, MM_MUL(cz2, g26)); + MM_STORE(g+31*SIMDD, MM_MUL(cz3, g27)); + MM_STORE(g+8 *SIMDD, MM_FMA(cx0, cx0, b0)); + MM_STORE(g+9 *SIMDD, MM_FMA(cx1, cx1, b1)); + MM_STORE(g+10*SIMDD, MM_FMA(cx2, cx2, b2)); + MM_STORE(g+11*SIMDD, MM_FMA(cx3, cx3, b3)); + MM_STORE(g+20*SIMDD, MM_FMA(cy0, cy0, b0)); + MM_STORE(g+21*SIMDD, MM_FMA(cy1, cy1, b1)); + MM_STORE(g+22*SIMDD, MM_FMA(cy2, cy2, b2)); + MM_STORE(g+23*SIMDD, MM_FMA(cy3, cy3, b3)); + MM_STORE(g+32*SIMDD, MM_MUL(MM_FMA(cz0, cz0, b0), g24)); + MM_STORE(g+33*SIMDD, MM_MUL(MM_FMA(cz1, cz1, b1), g25)); + MM_STORE(g+34*SIMDD, MM_MUL(MM_FMA(cz2, cz2, b2), g26)); + MM_STORE(g+35*SIMDD, MM_MUL(MM_FMA(cz3, cz3, b3), g27)); } -void CINTg0_2e_il2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs) + +static inline void _srg0_2d4d_0003(double *g, Rys2eT *bc, CINTEnvVars *envs) { - CINTg0_2e_2d(g, bc, envs); - CINTg0_il_4d(g, envs); + double *cx = bc->c0px; + double *cy = bc->c0py; + double *cz = bc->c0pz; + double *b = bc->b01; + __MD cx0 = MM_LOAD(cx+0*SIMDD); + __MD cx1 = MM_LOAD(cx+1*SIMDD); + __MD cx2 = MM_LOAD(cx+2*SIMDD); + __MD cx3 = MM_LOAD(cx+3*SIMDD); + __MD cy0 = MM_LOAD(cy+0*SIMDD); + __MD cy1 = MM_LOAD(cy+1*SIMDD); + __MD cy2 = MM_LOAD(cy+2*SIMDD); + __MD cy3 = MM_LOAD(cy+3*SIMDD); + __MD cz0 = MM_LOAD(cz+0*SIMDD); + __MD cz1 = MM_LOAD(cz+1*SIMDD); + __MD cz2 = MM_LOAD(cz+2*SIMDD); + __MD cz3 = MM_LOAD(cz+3*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD g32 = MM_LOAD(g+32*SIMDD); + __MD g33 = MM_LOAD(g+33*SIMDD); + __MD g34 = MM_LOAD(g+34*SIMDD); + __MD g35 = MM_LOAD(g+35*SIMDD); + __MD i3 = MM_SET1(3.); + MM_STORE(g+4 *SIMDD, cx0); + MM_STORE(g+5 
*SIMDD, cx1); + MM_STORE(g+6 *SIMDD, cx2); + MM_STORE(g+7 *SIMDD, cx3); + MM_STORE(g+20*SIMDD, cy0); + MM_STORE(g+21*SIMDD, cy1); + MM_STORE(g+22*SIMDD, cy2); + MM_STORE(g+23*SIMDD, cy3); + MM_STORE(g+36*SIMDD, cz0 * g32); + MM_STORE(g+37*SIMDD, cz1 * g33); + MM_STORE(g+38*SIMDD, cz2 * g34); + MM_STORE(g+39*SIMDD, cz3 * g35); + MM_STORE(g+8 *SIMDD, cx0 * cx0 + b0); + MM_STORE(g+9 *SIMDD, cx1 * cx1 + b1); + MM_STORE(g+10*SIMDD, cx2 * cx2 + b2); + MM_STORE(g+11*SIMDD, cx3 * cx3 + b3); + MM_STORE(g+24*SIMDD, cy0 * cy0 + b0); + MM_STORE(g+25*SIMDD, cy1 * cy1 + b1); + MM_STORE(g+26*SIMDD, cy2 * cy2 + b2); + MM_STORE(g+27*SIMDD, cy3 * cy3 + b3); + MM_STORE(g+40*SIMDD,(cz0 * cz0 + b0)* g32); + MM_STORE(g+41*SIMDD,(cz1 * cz1 + b1)* g33); + MM_STORE(g+42*SIMDD,(cz2 * cz2 + b2)* g34); + MM_STORE(g+43*SIMDD,(cz3 * cz3 + b3)* g35); + MM_STORE(g+12*SIMDD, cx0 *(cx0 * cx0 + i3 * b0)); + MM_STORE(g+13*SIMDD, cx1 *(cx1 * cx1 + i3 * b1)); + MM_STORE(g+14*SIMDD, cx2 *(cx2 * cx2 + i3 * b2)); + MM_STORE(g+15*SIMDD, cx3 *(cx3 * cx3 + i3 * b3)); + MM_STORE(g+28*SIMDD, cy0 *(cy0 * cy0 + i3 * b0)); + MM_STORE(g+29*SIMDD, cy1 *(cy1 * cy1 + i3 * b1)); + MM_STORE(g+30*SIMDD, cy2 *(cy2 * cy2 + i3 * b2)); + MM_STORE(g+31*SIMDD, cy3 *(cy3 * cy3 + i3 * b3)); + MM_STORE(g+44*SIMDD,(cz0 * cz0 + i3 * b0)* MM_LOAD(g+36*SIMDD)); + MM_STORE(g+45*SIMDD,(cz1 * cz1 + i3 * b1)* MM_LOAD(g+37*SIMDD)); + MM_STORE(g+46*SIMDD,(cz2 * cz2 + i3 * b2)* MM_LOAD(g+38*SIMDD)); + MM_STORE(g+47*SIMDD,(cz3 * cz3 + i3 * b3)* MM_LOAD(g+39*SIMDD)); } -/* - * g[i,k,l,j] = < ik | lj > = ( i j | k l ) - */ -int CINTg0_2e(double *g, double *cutoff, Rys2eT *bc, CINTEnvVars *envs, int count) +static inline void _srg0_2d4d_0010(double *g, Rys2eT *bc, CINTEnvVars *envs) { - ALIGNMM double aij[SIMDD]; - ALIGNMM double akl[SIMDD]; - ALIGNMM double a0[SIMDD]; - ALIGNMM double a1[SIMDD]; - ALIGNMM double aijkl[SIMDD]; - ALIGNMM double fac1[SIMDD]; - ALIGNMM double x[SIMDD]; - ALIGNMM double rijrkl[SIMDD*3]; - ALIGNMM double 
rijrx[SIMDD*3]; - ALIGNMM double rklrx[SIMDD*3]; - double *rij = envs->rij; - double *rkl = envs->rkl; - double *u = bc->u; - double *w = bc->w; - __MD ra, r0, r1, r2, r3, r4, r5, r6, r7, r8; - int nroots = envs->nrys_roots; - int i; + double *cx = bc->c0px; + double *cy = bc->c0py; + double *cz = bc->c0pz; + MM_STORE(g+2*SIMDD, MM_LOAD(cx+0*SIMDD)); + MM_STORE(g+3*SIMDD, MM_LOAD(cx+1*SIMDD)); + MM_STORE(g+6*SIMDD, MM_LOAD(cy+0*SIMDD)); + MM_STORE(g+7*SIMDD, MM_LOAD(cy+1*SIMDD)); + MM_STORE(g+10*SIMDD, MM_MUL(MM_LOAD(cz+0*SIMDD), MM_LOAD(g+8*SIMDD))); + MM_STORE(g+11*SIMDD, MM_MUL(MM_LOAD(cz+1*SIMDD), MM_LOAD(g+9*SIMDD))); +} - //:for (int k = 0; k < count; k++) { - //: aij[k] = envs->ai[k] + envs->aj[k]; - //: akl[k] = envs->ak[k] + envs->al[k]; - //: aijkl[k] = aij[k] + akl[k]; - //: a1[k] = aij[k] * akl[k]; - //: a0[k] = a1[k] / aijkl[k]; - //: //fac1[k] = sqrt(a0[k] / (a1[k] * a1[k] * a1[k])) * envs->fac[k]; - //: fac1[k] = envs->fac[k] / (sqrt(aijakl[k]) * a1[k]); - //:} - r2 = MM_ADD(MM_LOAD(envs->ai), MM_LOAD(envs->aj)); - r3 = MM_ADD(MM_LOAD(envs->ak), MM_LOAD(envs->al)); - MM_STORE(aij, r2); - MM_STORE(akl, r3); - r1 = MM_MUL(r2, r3); - MM_STORE(a1, r1); - ra = MM_ADD(r2, r3); - MM_STORE(aijkl, ra); - r0 = MM_DIV(r1, ra); - MM_STORE(a0, r0); +static inline void _srg0_2d4d_0011(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cx = bc->c0px; + double *cy = bc->c0py; + double *cz = bc->c0pz; + double *b = bc->b01; + double *r = envs->rkrl; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); + __MD cx0 = MM_LOAD(cx+0*SIMDD); + __MD cx1 = MM_LOAD(cx+1*SIMDD); + __MD cx2 = MM_LOAD(cx+2*SIMDD); + __MD cx3 = MM_LOAD(cx+3*SIMDD); + __MD cy0 = MM_LOAD(cy+0*SIMDD); + __MD cy1 = MM_LOAD(cy+1*SIMDD); + __MD cy2 = MM_LOAD(cy+2*SIMDD); + __MD cy3 = MM_LOAD(cy+3*SIMDD); + __MD cz0 = MM_LOAD(cz+0*SIMDD); + __MD cz1 = MM_LOAD(cz+1*SIMDD); + __MD cz2 = MM_LOAD(cz+2*SIMDD); + __MD cz3 = MM_LOAD(cz+3*SIMDD); + __MD r0 = MM_ADD(rx, cx0); + 
__MD r1 = MM_ADD(rx, cx1); + __MD r2 = MM_ADD(rx, cx2); + __MD r3 = MM_ADD(rx, cx3); + __MD r4 = MM_ADD(ry, cy0); + __MD r5 = MM_ADD(ry, cy1); + __MD r6 = MM_ADD(ry, cy2); + __MD r7 = MM_ADD(ry, cy3); + __MD r8 = MM_ADD(rz, cz0); + __MD r9 = MM_ADD(rz, cz1); + __MD r10= MM_ADD(rz, cz2); + __MD r11= MM_ADD(rz, cz3); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD g48 = MM_LOAD(g+48*SIMDD); + __MD g49 = MM_LOAD(g+49*SIMDD); + __MD g50 = MM_LOAD(g+50*SIMDD); + __MD g51 = MM_LOAD(g+51*SIMDD); + MM_STORE(g+4 *SIMDD, r0); + MM_STORE(g+5 *SIMDD, r1); + MM_STORE(g+6 *SIMDD, r2); + MM_STORE(g+7 *SIMDD, r3); + MM_STORE(g+8 *SIMDD, cx0); + MM_STORE(g+9 *SIMDD, cx1); + MM_STORE(g+10*SIMDD, cx2); + MM_STORE(g+11*SIMDD, cx3); + MM_STORE(g+28*SIMDD, r4); + MM_STORE(g+29*SIMDD, r5); + MM_STORE(g+30*SIMDD, r6); + MM_STORE(g+31*SIMDD, r7); + MM_STORE(g+32*SIMDD, cy0); + MM_STORE(g+33*SIMDD, cy1); + MM_STORE(g+34*SIMDD, cy2); + MM_STORE(g+35*SIMDD, cy3); + MM_STORE(g+52*SIMDD, MM_MUL(r8, g48)); + MM_STORE(g+53*SIMDD, MM_MUL(r9, g49)); + MM_STORE(g+54*SIMDD, MM_MUL(r10,g50)); + MM_STORE(g+55*SIMDD, MM_MUL(r11,g51)); + MM_STORE(g+56*SIMDD, MM_MUL(cz0,g48)); + MM_STORE(g+57*SIMDD, MM_MUL(cz1,g49)); + MM_STORE(g+58*SIMDD, MM_MUL(cz2,g50)); + MM_STORE(g+59*SIMDD, MM_MUL(cz3,g51)); + MM_STORE(g+12*SIMDD, MM_FMA(r0, cx0, b0)); + MM_STORE(g+13*SIMDD, MM_FMA(r1, cx1, b1)); + MM_STORE(g+14*SIMDD, MM_FMA(r2, cx2, b2)); + MM_STORE(g+15*SIMDD, MM_FMA(r3, cx3, b3)); + MM_STORE(g+36*SIMDD, MM_FMA(r4, cy0, b0)); + MM_STORE(g+37*SIMDD, MM_FMA(r5, cy1, b1)); + MM_STORE(g+38*SIMDD, MM_FMA(r6, cy2, b2)); + MM_STORE(g+39*SIMDD, MM_FMA(r7, cy3, b3)); + MM_STORE(g+60*SIMDD, MM_MUL(MM_FMA(r8, cz0, b0), g48)); + MM_STORE(g+61*SIMDD, MM_MUL(MM_FMA(r9, cz1, b1), g49)); + MM_STORE(g+62*SIMDD, MM_MUL(MM_FMA(r10,cz2, b2), g50)); + MM_STORE(g+63*SIMDD, MM_MUL(MM_FMA(r11,cz3, b3), g51)); +} -#ifdef WITH_RANGE_COULOMB -// Not 
recommended to mix range-separated Coulomb with regular Coulomb operator. -// Keep this for backward compatibility to cint2 - const double omega = envs->env[PTR_RANGE_OMEGA]; - ALIGNMM double theta[SIMDD]; - if (omega != 0) { - //:theta = omega * omega / (omega * omega + a0); - r0 = MM_SET1(omega); - r0 = MM_MUL(r0, r0); - r1 = MM_LOAD(a0); - r0 = MM_DIV(r0, MM_ADD(r0, r1)); - MM_STORE(theta, r0); - if (omega > 0) { // long-range part of range-separated Coulomb operator - //:a0 *= theta; - MM_STORE(a0, MM_MUL(r0, r1)); - } +static inline void _srg0_2d4d_0012(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cx = bc->c0px; + double *cy = bc->c0py; + double *cz = bc->c0pz; + double *b = bc->b01; + double *r = envs->rkrl; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); + __MD cx0 = MM_LOAD(cx+0*SIMDD); + __MD cx1 = MM_LOAD(cx+1*SIMDD); + __MD cx2 = MM_LOAD(cx+2*SIMDD); + __MD cx3 = MM_LOAD(cx+3*SIMDD); + __MD cy0 = MM_LOAD(cy+0*SIMDD); + __MD cy1 = MM_LOAD(cy+1*SIMDD); + __MD cy2 = MM_LOAD(cy+2*SIMDD); + __MD cy3 = MM_LOAD(cy+3*SIMDD); + __MD cz0 = MM_LOAD(cz+0*SIMDD); + __MD cz1 = MM_LOAD(cz+1*SIMDD); + __MD cz2 = MM_LOAD(cz+2*SIMDD); + __MD cz3 = MM_LOAD(cz+3*SIMDD); + __MD r0 = MM_ADD(rx, cx0); + __MD r1 = MM_ADD(rx, cx1); + __MD r2 = MM_ADD(rx, cx2); + __MD r3 = MM_ADD(rx, cx3); + __MD r4 = MM_ADD(ry, cy0); + __MD r5 = MM_ADD(ry, cy1); + __MD r6 = MM_ADD(ry, cy2); + __MD r7 = MM_ADD(ry, cy3); + __MD r8 = MM_ADD(rz, cz0); + __MD r9 = MM_ADD(rz, cz1); + __MD r10= MM_ADD(rz, cz2); + __MD r11= MM_ADD(rz, cz3); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD i2 = MM_SET1(2.); + __MD g64 = MM_LOAD(g+64*SIMDD); + __MD g65 = MM_LOAD(g+65*SIMDD); + __MD g66 = MM_LOAD(g+66*SIMDD); + __MD g67 = MM_LOAD(g+67*SIMDD); + MM_STORE(g+4 *SIMDD, r0 ); + MM_STORE(g+5 *SIMDD, r1 ); + MM_STORE(g+6 *SIMDD, r2 ); + MM_STORE(g+7 *SIMDD, r3 ); + MM_STORE(g+8 *SIMDD, 
cx0); + MM_STORE(g+9 *SIMDD, cx1); + MM_STORE(g+10*SIMDD, cx2); + MM_STORE(g+11*SIMDD, cx3); + MM_STORE(g+36*SIMDD, r4 ); + MM_STORE(g+37*SIMDD, r5 ); + MM_STORE(g+38*SIMDD, r6 ); + MM_STORE(g+39*SIMDD, r7 ); + MM_STORE(g+40*SIMDD, cy0); + MM_STORE(g+41*SIMDD, cy1); + MM_STORE(g+42*SIMDD, cy2); + MM_STORE(g+43*SIMDD, cy3); + MM_STORE(g+68*SIMDD, MM_MUL(r8 , g64)); + MM_STORE(g+69*SIMDD, MM_MUL(r9 , g65)); + MM_STORE(g+70*SIMDD, MM_MUL(r10, g66)); + MM_STORE(g+71*SIMDD, MM_MUL(r11, g67)); + MM_STORE(g+72*SIMDD, MM_MUL(cz0, g64)); + MM_STORE(g+73*SIMDD, MM_MUL(cz1, g65)); + MM_STORE(g+74*SIMDD, MM_MUL(cz2, g66)); + MM_STORE(g+75*SIMDD, MM_MUL(cz3, g67)); + MM_STORE(g+12*SIMDD, MM_FMA(r0 , cx0, b0)); + MM_STORE(g+13*SIMDD, MM_FMA(r1 , cx1, b1)); + MM_STORE(g+14*SIMDD, MM_FMA(r2 , cx2, b2)); + MM_STORE(g+15*SIMDD, MM_FMA(r3 , cx3, b3)); + MM_STORE(g+16*SIMDD, MM_FMA(cx0, cx0, b0)); + MM_STORE(g+17*SIMDD, MM_FMA(cx1, cx1, b1)); + MM_STORE(g+18*SIMDD, MM_FMA(cx2, cx2, b2)); + MM_STORE(g+19*SIMDD, MM_FMA(cx3, cx3, b3)); + MM_STORE(g+44*SIMDD, MM_FMA(r4 , cy0, b0)); + MM_STORE(g+45*SIMDD, MM_FMA(r5 , cy1, b1)); + MM_STORE(g+46*SIMDD, MM_FMA(r6 , cy2, b2)); + MM_STORE(g+47*SIMDD, MM_FMA(r7 , cy3, b3)); + MM_STORE(g+48*SIMDD, MM_FMA(cy0, cy0, b0)); + MM_STORE(g+49*SIMDD, MM_FMA(cy1, cy1, b1)); + MM_STORE(g+50*SIMDD, MM_FMA(cy2, cy2, b2)); + MM_STORE(g+51*SIMDD, MM_FMA(cy3, cy3, b3)); + MM_STORE(g+76*SIMDD, MM_MUL(MM_FMA(r8 , cz0, b0), g64)); + MM_STORE(g+77*SIMDD, MM_MUL(MM_FMA(r9 , cz1, b1), g65)); + MM_STORE(g+78*SIMDD, MM_MUL(MM_FMA(r10, cz2, b2), g66)); + MM_STORE(g+79*SIMDD, MM_MUL(MM_FMA(r11, cz3, b3), g67)); + MM_STORE(g+80*SIMDD, MM_MUL(MM_FMA(cz0, cz0, b0), g64)); + MM_STORE(g+81*SIMDD, MM_MUL(MM_FMA(cz1, cz1, b1), g65)); + MM_STORE(g+82*SIMDD, MM_MUL(MM_FMA(cz2, cz2, b2), g66)); + MM_STORE(g+83*SIMDD, MM_MUL(MM_FMA(cz3, cz3, b3), g67)); + MM_STORE(g+20*SIMDD, r0 * MM_LOAD(g+16*SIMDD) + i2 * b0 * cx0 ); + MM_STORE(g+21*SIMDD, r1 * MM_LOAD(g+17*SIMDD) + i2 * b1 * cx1 
); + MM_STORE(g+22*SIMDD, r2 * MM_LOAD(g+18*SIMDD) + i2 * b2 * cx2 ); + MM_STORE(g+23*SIMDD, r3 * MM_LOAD(g+19*SIMDD) + i2 * b3 * cx3 ); + MM_STORE(g+52*SIMDD, r4 * MM_LOAD(g+48*SIMDD) + i2 * b0 * cy0 ); + MM_STORE(g+53*SIMDD, r5 * MM_LOAD(g+49*SIMDD) + i2 * b1 * cy1 ); + MM_STORE(g+54*SIMDD, r6 * MM_LOAD(g+50*SIMDD) + i2 * b2 * cy2 ); + MM_STORE(g+55*SIMDD, r7 * MM_LOAD(g+51*SIMDD) + i2 * b3 * cy3 ); + MM_STORE(g+84*SIMDD, r8 * MM_LOAD(g+80*SIMDD) + i2 * b0 * MM_LOAD(g+72*SIMDD)); + MM_STORE(g+85*SIMDD, r9 * MM_LOAD(g+81*SIMDD) + i2 * b1 * MM_LOAD(g+73*SIMDD)); + MM_STORE(g+86*SIMDD, r10* MM_LOAD(g+82*SIMDD) + i2 * b2 * MM_LOAD(g+74*SIMDD)); + MM_STORE(g+87*SIMDD, r11* MM_LOAD(g+83*SIMDD) + i2 * b3 * MM_LOAD(g+75*SIMDD)); +} + +static inline void _srg0_2d4d_0020(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cx = bc->c0px; + double *cy = bc->c0py; + double *cz = bc->c0pz; + double *b = bc->b01; + __MD r0 = MM_LOAD(cx+0*SIMDD); + __MD r1 = MM_LOAD(cx+1*SIMDD); + __MD r2 = MM_LOAD(cx+2*SIMDD); + __MD r3 = MM_LOAD(cx+3*SIMDD); + __MD r4 = MM_LOAD(cy+0*SIMDD); + __MD r5 = MM_LOAD(cy+1*SIMDD); + __MD r6 = MM_LOAD(cy+2*SIMDD); + __MD r7 = MM_LOAD(cy+3*SIMDD); + __MD r8 = MM_LOAD(cz+0*SIMDD); + __MD r9 = MM_LOAD(cz+1*SIMDD); + __MD r10 = MM_LOAD(cz+2*SIMDD); + __MD r11 = MM_LOAD(cz+3*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD g24 = MM_LOAD(g+24*SIMDD); + __MD g25 = MM_LOAD(g+25*SIMDD); + __MD g26 = MM_LOAD(g+26*SIMDD); + __MD g27 = MM_LOAD(g+27*SIMDD); + MM_STORE(g+4 *SIMDD, r0); + MM_STORE(g+5 *SIMDD, r1); + MM_STORE(g+6 *SIMDD, r2); + MM_STORE(g+7 *SIMDD, r3); + MM_STORE(g+16*SIMDD, r4); + MM_STORE(g+17*SIMDD, r5); + MM_STORE(g+18*SIMDD, r6); + MM_STORE(g+19*SIMDD, r7); + MM_STORE(g+28*SIMDD, MM_MUL(r8 , g24)); + MM_STORE(g+29*SIMDD, MM_MUL(r9 , g25)); + MM_STORE(g+30*SIMDD, MM_MUL(r10, g26)); + MM_STORE(g+31*SIMDD, MM_MUL(r11, g27)); + MM_STORE(g+8 *SIMDD, 
MM_FMA(r0, r0, b0)); + MM_STORE(g+9 *SIMDD, MM_FMA(r1, r1, b1)); + MM_STORE(g+10*SIMDD, MM_FMA(r2, r2, b2)); + MM_STORE(g+11*SIMDD, MM_FMA(r3, r3, b3)); + MM_STORE(g+20*SIMDD, MM_FMA(r4, r4, b0)); + MM_STORE(g+21*SIMDD, MM_FMA(r5, r5, b1)); + MM_STORE(g+22*SIMDD, MM_FMA(r6, r6, b2)); + MM_STORE(g+23*SIMDD, MM_FMA(r7, r7, b3)); + MM_STORE(g+32*SIMDD, MM_MUL(MM_FMA(r8 , r8 , b0), g24)); + MM_STORE(g+33*SIMDD, MM_MUL(MM_FMA(r9 , r9 , b1), g25)); + MM_STORE(g+34*SIMDD, MM_MUL(MM_FMA(r10, r10, b2), g26)); + MM_STORE(g+35*SIMDD, MM_MUL(MM_FMA(r11, r11, b3), g27)); +} + +static inline void _srg0_2d4d_0021(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cx = bc->c0px; + double *cy = bc->c0py; + double *cz = bc->c0pz; + double *b1 = bc->b01; + double *r = envs->rkrl; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); + __MD s0 = MM_LOAD(cx+0*SIMDD); + __MD s1 = MM_LOAD(cx+1*SIMDD); + __MD s2 = MM_LOAD(cx+2*SIMDD); + __MD s3 = MM_LOAD(cx+3*SIMDD); + __MD s4 = MM_LOAD(cy+0*SIMDD); + __MD s5 = MM_LOAD(cy+1*SIMDD); + __MD s6 = MM_LOAD(cy+2*SIMDD); + __MD s7 = MM_LOAD(cy+3*SIMDD); + __MD s8 = MM_LOAD(cz+0*SIMDD); + __MD s9 = MM_LOAD(cz+1*SIMDD); + __MD s10= MM_LOAD(cz+2*SIMDD); + __MD s11= MM_LOAD(cz+3*SIMDD); + __MD r0 = MM_ADD(rx, s0 ); + __MD r1 = MM_ADD(rx, s1 ); + __MD r2 = MM_ADD(rx, s2 ); + __MD r3 = MM_ADD(rx, s3 ); + __MD r4 = MM_ADD(ry, s4 ); + __MD r5 = MM_ADD(ry, s5 ); + __MD r6 = MM_ADD(ry, s6 ); + __MD r7 = MM_ADD(ry, s7 ); + __MD r8 = MM_ADD(rz, s8 ); + __MD r9 = MM_ADD(rz, s9 ); + __MD r10= MM_ADD(rz, s10); + __MD r11= MM_ADD(rz, s11); + __MD b10 = MM_LOAD(b1+0*SIMDD); + __MD b11 = MM_LOAD(b1+1*SIMDD); + __MD b12 = MM_LOAD(b1+2*SIMDD); + __MD b13 = MM_LOAD(b1+3*SIMDD); + __MD g64 = MM_LOAD(g+64*SIMDD); + __MD g65 = MM_LOAD(g+65*SIMDD); + __MD g66 = MM_LOAD(g+66*SIMDD); + __MD g67 = MM_LOAD(g+67*SIMDD); + __MD i2 = MM_SET1(2.); + MM_STORE(g+4 *SIMDD, s0); + MM_STORE(g+5 *SIMDD, s1); + MM_STORE(g+6 *SIMDD, s2); + MM_STORE(g+7 
*SIMDD, s3); + MM_STORE(g+16*SIMDD, r0); + MM_STORE(g+17*SIMDD, r1); + MM_STORE(g+18*SIMDD, r2); + MM_STORE(g+19*SIMDD, r3); + MM_STORE(g+36*SIMDD, s4); + MM_STORE(g+37*SIMDD, s5); + MM_STORE(g+38*SIMDD, s6); + MM_STORE(g+39*SIMDD, s7); + MM_STORE(g+48*SIMDD, r4); + MM_STORE(g+49*SIMDD, r5); + MM_STORE(g+50*SIMDD, r6); + MM_STORE(g+51*SIMDD, r7); + MM_STORE(g+68*SIMDD, s8 * g64); + MM_STORE(g+69*SIMDD, s9 * g65); + MM_STORE(g+70*SIMDD, s10* g66); + MM_STORE(g+71*SIMDD, s11* g67); + MM_STORE(g+80*SIMDD, r8 * g64); + MM_STORE(g+81*SIMDD, r9 * g65); + MM_STORE(g+82*SIMDD, r10* g66); + MM_STORE(g+83*SIMDD, r11* g67); + MM_STORE(g+8 *SIMDD, s0 * s0 + b10); + MM_STORE(g+9 *SIMDD, s1 * s1 + b11); + MM_STORE(g+10*SIMDD, s2 * s2 + b12); + MM_STORE(g+11*SIMDD, s3 * s3 + b13); + MM_STORE(g+20*SIMDD, s0 * r0 + b10); + MM_STORE(g+21*SIMDD, s1 * r1 + b11); + MM_STORE(g+22*SIMDD, s2 * r2 + b12); + MM_STORE(g+23*SIMDD, s3 * r3 + b13); + MM_STORE(g+40*SIMDD, s4 * s4 + b10); + MM_STORE(g+41*SIMDD, s5 * s5 + b11); + MM_STORE(g+42*SIMDD, s6 * s6 + b12); + MM_STORE(g+43*SIMDD, s7 * s7 + b13); + MM_STORE(g+52*SIMDD, s4 * r4 + b10); + MM_STORE(g+53*SIMDD, s5 * r5 + b11); + MM_STORE(g+54*SIMDD, s6 * r6 + b12); + MM_STORE(g+55*SIMDD, s7 * r7 + b13); + MM_STORE(g+72*SIMDD,(s8 * s8 + b10) * g64); + MM_STORE(g+73*SIMDD,(s9 * s9 + b11) * g65); + MM_STORE(g+74*SIMDD,(s10* s10+ b12)* g66); + MM_STORE(g+75*SIMDD,(s11* s11+ b13)* g67); + MM_STORE(g+84*SIMDD,(s8 * r8 + b10) * g64); + MM_STORE(g+85*SIMDD,(s9 * r9 + b11) * g65); + MM_STORE(g+86*SIMDD,(s10* r10+ b12)* g66); + MM_STORE(g+87*SIMDD,(s11* r11+ b13)* g67); + MM_STORE(g+24*SIMDD, r0 * MM_LOAD(g+8 *SIMDD ) + i2 * b10 * s0); + MM_STORE(g+25*SIMDD, r1 * MM_LOAD(g+9 *SIMDD ) + i2 * b11 * s1); + MM_STORE(g+26*SIMDD, r2 * MM_LOAD(g+10*SIMDD ) + i2 * b12 * s2); + MM_STORE(g+27*SIMDD, r3 * MM_LOAD(g+11*SIMDD ) + i2 * b13 * s3); + MM_STORE(g+56*SIMDD, r4 * MM_LOAD(g+40*SIMDD ) + i2 * b10 * s4); + MM_STORE(g+57*SIMDD, r5 * MM_LOAD(g+41*SIMDD ) + i2 * 
b11 * s5); + MM_STORE(g+58*SIMDD, r6 * MM_LOAD(g+42*SIMDD ) + i2 * b12 * s6); + MM_STORE(g+59*SIMDD, r7 * MM_LOAD(g+43*SIMDD ) + i2 * b13 * s7); + MM_STORE(g+88*SIMDD, r8 * MM_LOAD(g+72*SIMDD ) + i2 * b10 * MM_LOAD(g+68*SIMDD )); + MM_STORE(g+89*SIMDD, r9 * MM_LOAD(g+73*SIMDD ) + i2 * b11 * MM_LOAD(g+69*SIMDD )); + MM_STORE(g+90*SIMDD, r10* MM_LOAD(g+74*SIMDD ) + i2 * b12 * MM_LOAD(g+70*SIMDD )); + MM_STORE(g+91*SIMDD, r11* MM_LOAD(g+75*SIMDD ) + i2 * b13 * MM_LOAD(g+71*SIMDD )); +} + +static inline void _srg0_2d4d_0030(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cx = bc->c0px; + double *cy = bc->c0py; + double *cz = bc->c0pz; + double *b = bc->b01; + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD i3 = MM_SET1(3.); + __MD r0 = MM_LOAD(cx+0*SIMDD); + __MD r1 = MM_LOAD(cx+1*SIMDD); + __MD r2 = MM_LOAD(cx+2*SIMDD); + __MD r3 = MM_LOAD(cx+3*SIMDD); + __MD r4 = MM_LOAD(cy+0*SIMDD); + __MD r5 = MM_LOAD(cy+1*SIMDD); + __MD r6 = MM_LOAD(cy+2*SIMDD); + __MD r7 = MM_LOAD(cy+3*SIMDD); + __MD r8 = MM_LOAD(cz+0*SIMDD); + __MD r9 = MM_LOAD(cz+1*SIMDD); + __MD r10 = MM_LOAD(cz+2*SIMDD); + __MD r11 = MM_LOAD(cz+3*SIMDD); + __MD g32 = MM_LOAD(g+32*SIMDD); + __MD g33 = MM_LOAD(g+33*SIMDD); + __MD g34 = MM_LOAD(g+34*SIMDD); + __MD g35 = MM_LOAD(g+35*SIMDD); + MM_STORE(g+4 *SIMDD, r0); + MM_STORE(g+5 *SIMDD, r1); + MM_STORE(g+6 *SIMDD, r2); + MM_STORE(g+7 *SIMDD, r3); + MM_STORE(g+20*SIMDD, r4); + MM_STORE(g+21*SIMDD, r5); + MM_STORE(g+22*SIMDD, r6); + MM_STORE(g+23*SIMDD, r7); + MM_STORE(g+36*SIMDD, r8 * g32); + MM_STORE(g+37*SIMDD, r9 * g33); + MM_STORE(g+38*SIMDD, r10* g34); + MM_STORE(g+39*SIMDD, r11* g35); + MM_STORE(g+8 *SIMDD, r0 * r0 + b0); + MM_STORE(g+9 *SIMDD, r1 * r1 + b1); + MM_STORE(g+10*SIMDD, r2 * r2 + b2); + MM_STORE(g+11*SIMDD, r3 * r3 + b3); + MM_STORE(g+24*SIMDD, r4 * r4 + b0); + MM_STORE(g+25*SIMDD, r5 * r5 + b1); + MM_STORE(g+26*SIMDD, r6 * r6 + b2); + 
MM_STORE(g+27*SIMDD, r7 * r7 + b3); + MM_STORE(g+40*SIMDD,(r8 * r8 + b0)* g32); + MM_STORE(g+41*SIMDD,(r9 * r9 + b1)* g33); + MM_STORE(g+42*SIMDD,(r10* r10+ b2)* g34); + MM_STORE(g+43*SIMDD,(r11* r11+ b3)* g35); + MM_STORE(g+12*SIMDD, r0 *(r0 * r0 + i3 * b0)); + MM_STORE(g+13*SIMDD, r1 *(r1 * r1 + i3 * b1)); + MM_STORE(g+14*SIMDD, r2 *(r2 * r2 + i3 * b2)); + MM_STORE(g+15*SIMDD, r3 *(r3 * r3 + i3 * b3)); + MM_STORE(g+28*SIMDD, r4 *(r4 * r4 + i3 * b0)); + MM_STORE(g+29*SIMDD, r5 *(r5 * r5 + i3 * b1)); + MM_STORE(g+30*SIMDD, r6 *(r6 * r6 + i3 * b2)); + MM_STORE(g+31*SIMDD, r7 *(r7 * r7 + i3 * b3)); + MM_STORE(g+44*SIMDD,(r8 * r8 + i3 * b0) * MM_LOAD(g+36*SIMDD)); + MM_STORE(g+45*SIMDD,(r9 * r9 + i3 * b1) * MM_LOAD(g+37*SIMDD)); + MM_STORE(g+46*SIMDD,(r10* r10+ i3 * b2) * MM_LOAD(g+38*SIMDD)); + MM_STORE(g+47*SIMDD,(r11* r11+ i3 * b3) * MM_LOAD(g+39*SIMDD)); +} + +static inline void _srg0_2d4d_0100(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cx = bc->c00x; + double *cy = bc->c00y; + double *cz = bc->c00z; + MM_STORE(g+2*SIMDD, MM_LOAD(cx+0*SIMDD)); + MM_STORE(g+3*SIMDD, MM_LOAD(cx+1*SIMDD)); + MM_STORE(g+6*SIMDD, MM_LOAD(cy+0*SIMDD)); + MM_STORE(g+7*SIMDD, MM_LOAD(cy+1*SIMDD)); + MM_STORE(g+10*SIMDD, MM_MUL(MM_LOAD(cz+0*SIMDD), MM_LOAD(g+8*SIMDD))); + MM_STORE(g+11*SIMDD, MM_MUL(MM_LOAD(cz+1*SIMDD), MM_LOAD(g+9*SIMDD))); +} + +static inline void _srg0_2d4d_0101(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b = bc->b00; + __MD cx0 = MM_LOAD(c0x+0*SIMDD); + __MD cx1 = MM_LOAD(c0x+1*SIMDD); + __MD cx2 = MM_LOAD(c0x+2*SIMDD); + __MD cx3 = MM_LOAD(c0x+3*SIMDD); + __MD cy0 = MM_LOAD(c0y+0*SIMDD); + __MD cy1 = MM_LOAD(c0y+1*SIMDD); + __MD cy2 = MM_LOAD(c0y+2*SIMDD); + __MD cy3 = MM_LOAD(c0y+3*SIMDD); + __MD cz0 = MM_LOAD(c0z+0*SIMDD); + __MD cz1 = MM_LOAD(c0z+1*SIMDD); + __MD cz2 = 
MM_LOAD(c0z+2*SIMDD); + __MD cz3 = MM_LOAD(c0z+3*SIMDD); + __MD px0 = MM_LOAD(cpx+0*SIMDD); + __MD px1 = MM_LOAD(cpx+1*SIMDD); + __MD px2 = MM_LOAD(cpx+2*SIMDD); + __MD px3 = MM_LOAD(cpx+3*SIMDD); + __MD py0 = MM_LOAD(cpy+0*SIMDD); + __MD py1 = MM_LOAD(cpy+1*SIMDD); + __MD py2 = MM_LOAD(cpy+2*SIMDD); + __MD py3 = MM_LOAD(cpy+3*SIMDD); + __MD pz0 = MM_LOAD(cpz+0*SIMDD); + __MD pz1 = MM_LOAD(cpz+1*SIMDD); + __MD pz2 = MM_LOAD(cpz+2*SIMDD); + __MD pz3 = MM_LOAD(cpz+3*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD g32 = MM_LOAD(g+32*SIMDD); + __MD g33 = MM_LOAD(g+33*SIMDD); + __MD g34 = MM_LOAD(g+34*SIMDD); + __MD g35 = MM_LOAD(g+35*SIMDD); + MM_STORE(g+4 *SIMDD, px0); + MM_STORE(g+5 *SIMDD, px1); + MM_STORE(g+6 *SIMDD, px2); + MM_STORE(g+7 *SIMDD, px3); + MM_STORE(g+8 *SIMDD, cx0); + MM_STORE(g+9 *SIMDD, cx1); + MM_STORE(g+10*SIMDD, cx2); + MM_STORE(g+11*SIMDD, cx3); + MM_STORE(g+20*SIMDD, py0); + MM_STORE(g+21*SIMDD, py1); + MM_STORE(g+22*SIMDD, py2); + MM_STORE(g+23*SIMDD, py3); + MM_STORE(g+24*SIMDD, cy0); + MM_STORE(g+25*SIMDD, cy1); + MM_STORE(g+26*SIMDD, cy2); + MM_STORE(g+27*SIMDD, cy3); + MM_STORE(g+36*SIMDD, MM_MUL(pz0, g32)); + MM_STORE(g+37*SIMDD, MM_MUL(pz1, g33)); + MM_STORE(g+38*SIMDD, MM_MUL(pz2, g34)); + MM_STORE(g+39*SIMDD, MM_MUL(pz3, g35)); + MM_STORE(g+40*SIMDD, MM_MUL(cz0, g32)); + MM_STORE(g+41*SIMDD, MM_MUL(cz1, g33)); + MM_STORE(g+42*SIMDD, MM_MUL(cz2, g34)); + MM_STORE(g+43*SIMDD, MM_MUL(cz3, g35)); + MM_STORE(g+12*SIMDD, MM_FMA(px0, cx0, b0)); + MM_STORE(g+13*SIMDD, MM_FMA(px1, cx1, b1)); + MM_STORE(g+14*SIMDD, MM_FMA(px2, cx2, b2)); + MM_STORE(g+15*SIMDD, MM_FMA(px3, cx3, b3)); + MM_STORE(g+28*SIMDD, MM_FMA(py0, cy0, b0)); + MM_STORE(g+29*SIMDD, MM_FMA(py1, cy1, b1)); + MM_STORE(g+30*SIMDD, MM_FMA(py2, cy2, b2)); + MM_STORE(g+31*SIMDD, MM_FMA(py3, cy3, b3)); + MM_STORE(g+44*SIMDD, MM_MUL(MM_FMA(pz0, cz0, b0), g32)); + MM_STORE(g+45*SIMDD, 
MM_MUL(MM_FMA(pz1, cz1, b1), g33)); + MM_STORE(g+46*SIMDD, MM_MUL(MM_FMA(pz2, cz2, b2), g34)); + MM_STORE(g+47*SIMDD, MM_MUL(MM_FMA(pz3, cz3, b3), g35)); +} + +static inline void _srg0_2d4d_0102(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b0 = bc->b00; + double *b1 = bc->b01; + __MD r0 = MM_LOAD(c0x+0*SIMDD); + __MD r1 = MM_LOAD(c0x+1*SIMDD); + __MD r2 = MM_LOAD(c0x+2*SIMDD); + __MD r3 = MM_LOAD(c0x+3*SIMDD); + __MD r4 = MM_LOAD(c0y+0*SIMDD); + __MD r5 = MM_LOAD(c0y+1*SIMDD); + __MD r6 = MM_LOAD(c0y+2*SIMDD); + __MD r7 = MM_LOAD(c0y+3*SIMDD); + __MD r8 = MM_LOAD(c0z+0*SIMDD); + __MD r9 = MM_LOAD(c0z+1*SIMDD); + __MD r10= MM_LOAD(c0z+2*SIMDD); + __MD r11= MM_LOAD(c0z+3*SIMDD); + __MD s0 = MM_LOAD(cpx+0*SIMDD); + __MD s1 = MM_LOAD(cpx+1*SIMDD); + __MD s2 = MM_LOAD(cpx+2*SIMDD); + __MD s3 = MM_LOAD(cpx+3*SIMDD); + __MD s4 = MM_LOAD(cpy+0*SIMDD); + __MD s5 = MM_LOAD(cpy+1*SIMDD); + __MD s6 = MM_LOAD(cpy+2*SIMDD); + __MD s7 = MM_LOAD(cpy+3*SIMDD); + __MD s8 = MM_LOAD(cpz+0*SIMDD); + __MD s9 = MM_LOAD(cpz+1*SIMDD); + __MD s10= MM_LOAD(cpz+2*SIMDD); + __MD s11= MM_LOAD(cpz+3*SIMDD); + __MD b00 = MM_LOAD(b0+0*SIMDD); + __MD b01 = MM_LOAD(b0+1*SIMDD); + __MD b02 = MM_LOAD(b0+2*SIMDD); + __MD b03 = MM_LOAD(b0+3*SIMDD); + __MD b10 = MM_LOAD(b1+0*SIMDD); + __MD b11 = MM_LOAD(b1+1*SIMDD); + __MD b12 = MM_LOAD(b1+2*SIMDD); + __MD b13 = MM_LOAD(b1+3*SIMDD); + __MD g48 = MM_LOAD(g+48*SIMDD); + __MD g49 = MM_LOAD(g+49*SIMDD); + __MD g50 = MM_LOAD(g+50*SIMDD); + __MD g51 = MM_LOAD(g+51*SIMDD); + MM_STORE(g+4 *SIMDD, s0); + MM_STORE(g+5 *SIMDD, s1); + MM_STORE(g+6 *SIMDD, s2); + MM_STORE(g+7 *SIMDD, s3); + MM_STORE(g+12*SIMDD, r0); + MM_STORE(g+13*SIMDD, r1); + MM_STORE(g+14*SIMDD, r2); + MM_STORE(g+15*SIMDD, r3); + MM_STORE(g+28*SIMDD, s4); + MM_STORE(g+29*SIMDD, s5); + MM_STORE(g+30*SIMDD, s6); + MM_STORE(g+31*SIMDD, 
s7); + MM_STORE(g+36*SIMDD, r4); + MM_STORE(g+37*SIMDD, r5); + MM_STORE(g+38*SIMDD, r6); + MM_STORE(g+39*SIMDD, r7); + MM_STORE(g+52*SIMDD, MM_MUL(s8 , g48)); + MM_STORE(g+53*SIMDD, MM_MUL(s9 , g49)); + MM_STORE(g+54*SIMDD, MM_MUL(s10, g50)); + MM_STORE(g+55*SIMDD, MM_MUL(s11, g51)); + MM_STORE(g+60*SIMDD, MM_MUL(r8 , g48)); + MM_STORE(g+61*SIMDD, MM_MUL(r9 , g49)); + MM_STORE(g+62*SIMDD, MM_MUL(r10, g50)); + MM_STORE(g+63*SIMDD, MM_MUL(r11, g51)); + MM_STORE(g+8 *SIMDD, MM_FMA(s0, s0, b10)); + MM_STORE(g+9 *SIMDD, MM_FMA(s1, s1, b11)); + MM_STORE(g+10*SIMDD, MM_FMA(s2, s2, b12)); + MM_STORE(g+11*SIMDD, MM_FMA(s3, s3, b13)); + MM_STORE(g+16*SIMDD, MM_FMA(r0, s0, b00)); + MM_STORE(g+17*SIMDD, MM_FMA(r1, s1, b01)); + MM_STORE(g+18*SIMDD, MM_FMA(r2, s2, b02)); + MM_STORE(g+19*SIMDD, MM_FMA(r3, s3, b03)); + MM_STORE(g+32*SIMDD, MM_FMA(s4, s4, b10)); + MM_STORE(g+33*SIMDD, MM_FMA(s5, s5, b11)); + MM_STORE(g+34*SIMDD, MM_FMA(s6, s6, b12)); + MM_STORE(g+35*SIMDD, MM_FMA(s7, s7, b13)); + MM_STORE(g+40*SIMDD, MM_FMA(r4, s4, b00)); + MM_STORE(g+41*SIMDD, MM_FMA(r5, s5, b01)); + MM_STORE(g+42*SIMDD, MM_FMA(r6, s6, b02)); + MM_STORE(g+43*SIMDD, MM_FMA(r7, s7, b03)); + MM_STORE(g+56*SIMDD, MM_MUL(MM_FMA(s8 , s8 , b10), g48)); + MM_STORE(g+57*SIMDD, MM_MUL(MM_FMA(s9 , s9 , b11), g49)); + MM_STORE(g+58*SIMDD, MM_MUL(MM_FMA(s10, s10, b12), g50)); + MM_STORE(g+59*SIMDD, MM_MUL(MM_FMA(s11, s11, b13), g51)); + MM_STORE(g+64*SIMDD, MM_MUL(MM_FMA(r8 , s8 , b00), g48)); + MM_STORE(g+65*SIMDD, MM_MUL(MM_FMA(r9 , s9 , b01), g49)); + MM_STORE(g+66*SIMDD, MM_MUL(MM_FMA(r10, s10, b02), g50)); + MM_STORE(g+67*SIMDD, MM_MUL(MM_FMA(r11, s11, b03), g51)); + MM_STORE(g+20*SIMDD, s0 *(MM_LOAD(g+16*SIMDD) + b00) + b10 * r0); + MM_STORE(g+21*SIMDD, s1 *(MM_LOAD(g+17*SIMDD) + b01) + b11 * r1); + MM_STORE(g+22*SIMDD, s2 *(MM_LOAD(g+18*SIMDD) + b02) + b12 * r2); + MM_STORE(g+23*SIMDD, s3 *(MM_LOAD(g+19*SIMDD) + b03) + b13 * r3); + MM_STORE(g+44*SIMDD, s4 *(MM_LOAD(g+40*SIMDD) + b00) + b10 * r4); + 
MM_STORE(g+45*SIMDD, s5 *(MM_LOAD(g+41*SIMDD) + b01) + b11 * r5); + MM_STORE(g+46*SIMDD, s6 *(MM_LOAD(g+42*SIMDD) + b02) + b12 * r6); + MM_STORE(g+47*SIMDD, s7 *(MM_LOAD(g+43*SIMDD) + b03) + b13 * r7); + MM_STORE(g+68*SIMDD, s8 * MM_LOAD(g+64*SIMDD) + b00 * MM_LOAD(g+52*SIMDD) + b10 * MM_LOAD(g+60*SIMDD)); + MM_STORE(g+69*SIMDD, s9 * MM_LOAD(g+65*SIMDD) + b01 * MM_LOAD(g+53*SIMDD) + b11 * MM_LOAD(g+61*SIMDD)); + MM_STORE(g+70*SIMDD, s10* MM_LOAD(g+66*SIMDD) + b02 * MM_LOAD(g+54*SIMDD) + b12 * MM_LOAD(g+62*SIMDD)); + MM_STORE(g+71*SIMDD, s11* MM_LOAD(g+67*SIMDD) + b03 * MM_LOAD(g+55*SIMDD) + b13 * MM_LOAD(g+63*SIMDD)); +} + +static inline void _srg0_2d4d_0110(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b = bc->b00; + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD r0 = MM_LOAD(c0x+0*SIMDD); + __MD r1 = MM_LOAD(c0x+1*SIMDD); + __MD r2 = MM_LOAD(c0x+2*SIMDD); + __MD r3 = MM_LOAD(c0x+3*SIMDD); + __MD r4 = MM_LOAD(c0y+0*SIMDD); + __MD r5 = MM_LOAD(c0y+1*SIMDD); + __MD r6 = MM_LOAD(c0y+2*SIMDD); + __MD r7 = MM_LOAD(c0y+3*SIMDD); + __MD r8 = MM_LOAD(c0z+0*SIMDD); + __MD r9 = MM_LOAD(c0z+1*SIMDD); + __MD r10= MM_LOAD(c0z+2*SIMDD); + __MD r11= MM_LOAD(c0z+3*SIMDD); + __MD s0 = MM_LOAD(cpx+0*SIMDD); + __MD s1 = MM_LOAD(cpx+1*SIMDD); + __MD s2 = MM_LOAD(cpx+2*SIMDD); + __MD s3 = MM_LOAD(cpx+3*SIMDD); + __MD s4 = MM_LOAD(cpy+0*SIMDD); + __MD s5 = MM_LOAD(cpy+1*SIMDD); + __MD s6 = MM_LOAD(cpy+2*SIMDD); + __MD s7 = MM_LOAD(cpy+3*SIMDD); + __MD s8 = MM_LOAD(cpz+0*SIMDD); + __MD s9 = MM_LOAD(cpz+1*SIMDD); + __MD s10= MM_LOAD(cpz+2*SIMDD); + __MD s11= MM_LOAD(cpz+3*SIMDD); + __MD g32 = MM_LOAD(g+32*SIMDD); + __MD g33 = MM_LOAD(g+33*SIMDD); + __MD g34 = MM_LOAD(g+34*SIMDD); + __MD g35 = MM_LOAD(g+35*SIMDD); + MM_STORE(g+4 *SIMDD, s0); 
+ MM_STORE(g+5 *SIMDD, s1); + MM_STORE(g+6 *SIMDD, s2); + MM_STORE(g+7 *SIMDD, s3); + MM_STORE(g+20*SIMDD, s4); + MM_STORE(g+21*SIMDD, s5); + MM_STORE(g+22*SIMDD, s6); + MM_STORE(g+23*SIMDD, s7); + MM_STORE(g+36*SIMDD, s8 * g32); + MM_STORE(g+37*SIMDD, s9 * g33); + MM_STORE(g+38*SIMDD, s10* g34); + MM_STORE(g+39*SIMDD, s11* g35); + MM_STORE(g+8 *SIMDD, r0); + MM_STORE(g+9 *SIMDD, r1); + MM_STORE(g+10*SIMDD, r2); + MM_STORE(g+11*SIMDD, r3); + MM_STORE(g+24*SIMDD, r4); + MM_STORE(g+25*SIMDD, r5); + MM_STORE(g+26*SIMDD, r6); + MM_STORE(g+27*SIMDD, r7); + MM_STORE(g+40*SIMDD, r8 * g32); + MM_STORE(g+41*SIMDD, r9 * g33); + MM_STORE(g+42*SIMDD, r10* g34); + MM_STORE(g+43*SIMDD, r11* g35); + MM_STORE(g+12*SIMDD, s0 * r0 + b0); + MM_STORE(g+13*SIMDD, s1 * r1 + b1); + MM_STORE(g+14*SIMDD, s2 * r2 + b2); + MM_STORE(g+15*SIMDD, s3 * r3 + b3); + MM_STORE(g+28*SIMDD, s4 * r4 + b0); + MM_STORE(g+29*SIMDD, s5 * r5 + b1); + MM_STORE(g+30*SIMDD, s6 * r6 + b2); + MM_STORE(g+31*SIMDD, s7 * r7 + b3); + MM_STORE(g+44*SIMDD,(s8 * r8 + b0) * g32); + MM_STORE(g+45*SIMDD,(s9 * r9 + b1) * g33); + MM_STORE(g+46*SIMDD,(s10* r10+ b2) * g34); + MM_STORE(g+47*SIMDD,(s11* r11+ b3) * g35); +} + +static inline void _srg0_2d4d_0111(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b0 = bc->b00; + double *b1 = bc->b01; + double *r = envs->rkrl; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); + __MD cpx0 = MM_LOAD(cpx+0*SIMDD); + __MD cpx1 = MM_LOAD(cpx+1*SIMDD); + __MD cpx2 = MM_LOAD(cpx+2*SIMDD); + __MD cpx3 = MM_LOAD(cpx+3*SIMDD); + __MD cpy0 = MM_LOAD(cpy+0*SIMDD); + __MD cpy1 = MM_LOAD(cpy+1*SIMDD); + __MD cpy2 = MM_LOAD(cpy+2*SIMDD); + __MD cpy3 = MM_LOAD(cpy+3*SIMDD); + __MD cpz0 = MM_LOAD(cpz+0*SIMDD); + __MD cpz1 = MM_LOAD(cpz+1*SIMDD); + __MD cpz2 = MM_LOAD(cpz+2*SIMDD); + __MD cpz3 = 
MM_LOAD(cpz+3*SIMDD); + __MD rcp0 = MM_ADD(rx, cpx0); + __MD rcp1 = MM_ADD(rx, cpx1); + __MD rcp2 = MM_ADD(rx, cpx2); + __MD rcp3 = MM_ADD(rx, cpx3); + __MD rcp4 = MM_ADD(ry, cpy0); + __MD rcp5 = MM_ADD(ry, cpy1); + __MD rcp6 = MM_ADD(ry, cpy2); + __MD rcp7 = MM_ADD(ry, cpy3); + __MD rcp8 = MM_ADD(rz, cpz0); + __MD rcp9 = MM_ADD(rz, cpz1); + __MD rcp10= MM_ADD(rz, cpz2); + __MD rcp11= MM_ADD(rz, cpz3); + __MD c0x0 = MM_LOAD(c0x+0*SIMDD); + __MD c0x1 = MM_LOAD(c0x+1*SIMDD); + __MD c0x2 = MM_LOAD(c0x+2*SIMDD); + __MD c0x3 = MM_LOAD(c0x+3*SIMDD); + __MD c0y0 = MM_LOAD(c0y+0*SIMDD); + __MD c0y1 = MM_LOAD(c0y+1*SIMDD); + __MD c0y2 = MM_LOAD(c0y+2*SIMDD); + __MD c0y3 = MM_LOAD(c0y+3*SIMDD); + __MD c0z0 = MM_LOAD(c0z+0*SIMDD); + __MD c0z1 = MM_LOAD(c0z+1*SIMDD); + __MD c0z2 = MM_LOAD(c0z+2*SIMDD); + __MD c0z3 = MM_LOAD(c0z+3*SIMDD); + __MD b00 = MM_LOAD(b0+0*SIMDD); + __MD b01 = MM_LOAD(b0+1*SIMDD); + __MD b02 = MM_LOAD(b0+2*SIMDD); + __MD b03 = MM_LOAD(b0+3*SIMDD); + __MD b10 = MM_LOAD(b1+0*SIMDD); + __MD b11 = MM_LOAD(b1+1*SIMDD); + __MD b12 = MM_LOAD(b1+2*SIMDD); + __MD b13 = MM_LOAD(b1+3*SIMDD); + __MD g96 = MM_LOAD(g+96*SIMDD); + __MD g97 = MM_LOAD(g+97*SIMDD); + __MD g98 = MM_LOAD(g+98*SIMDD); + __MD g99 = MM_LOAD(g+99*SIMDD); + MM_STORE(g+4 *SIMDD, rcp0); + MM_STORE(g+5 *SIMDD, rcp1); + MM_STORE(g+6 *SIMDD, rcp2); + MM_STORE(g+7 *SIMDD, rcp3); + MM_STORE(g+8 *SIMDD, cpx0); + MM_STORE(g+9 *SIMDD, cpx1); + MM_STORE(g+10*SIMDD, cpx2); + MM_STORE(g+11*SIMDD, cpx3); + MM_STORE(g+24*SIMDD, c0x0); + MM_STORE(g+25*SIMDD, c0x1); + MM_STORE(g+26*SIMDD, c0x2); + MM_STORE(g+27*SIMDD, c0x3); + MM_STORE(g+52*SIMDD, rcp4); + MM_STORE(g+53*SIMDD, rcp5); + MM_STORE(g+54*SIMDD, rcp6); + MM_STORE(g+55*SIMDD, rcp7); + MM_STORE(g+56*SIMDD, cpy0); + MM_STORE(g+57*SIMDD, cpy1); + MM_STORE(g+58*SIMDD, cpy2); + MM_STORE(g+59*SIMDD, cpy3); + MM_STORE(g+72*SIMDD, c0y0); + MM_STORE(g+73*SIMDD, c0y1); + MM_STORE(g+74*SIMDD, c0y2); + MM_STORE(g+75*SIMDD, c0y3); + MM_STORE(g+100*SIMDD, rcp8 * 
g96); + MM_STORE(g+101*SIMDD, rcp9 * g97); + MM_STORE(g+102*SIMDD, rcp10* g98); + MM_STORE(g+103*SIMDD, rcp11* g99); + MM_STORE(g+104*SIMDD, cpz0 * g96); + MM_STORE(g+105*SIMDD, cpz1 * g97); + MM_STORE(g+106*SIMDD, cpz2 * g98); + MM_STORE(g+107*SIMDD, cpz3 * g99); + MM_STORE(g+120*SIMDD, c0z0 * g96); + MM_STORE(g+121*SIMDD, c0z1 * g97); + MM_STORE(g+122*SIMDD, c0z2 * g98); + MM_STORE(g+123*SIMDD, c0z3 * g99); + MM_STORE(g+28*SIMDD, c0x0 * rcp0 + b00); + MM_STORE(g+29*SIMDD, c0x1 * rcp1 + b01); + MM_STORE(g+30*SIMDD, c0x2 * rcp2 + b02); + MM_STORE(g+31*SIMDD, c0x3 * rcp3 + b03); + MM_STORE(g+32*SIMDD, c0x0 * cpx0 + b00); + MM_STORE(g+33*SIMDD, c0x1 * cpx1 + b01); + MM_STORE(g+34*SIMDD, c0x2 * cpx2 + b02); + MM_STORE(g+35*SIMDD, c0x3 * cpx3 + b03); + MM_STORE(g+12*SIMDD, cpx0 * rcp0 + b10); + MM_STORE(g+13*SIMDD, cpx1 * rcp1 + b11); + MM_STORE(g+14*SIMDD, cpx2 * rcp2 + b12); + MM_STORE(g+15*SIMDD, cpx3 * rcp3 + b13); + MM_STORE(g+60*SIMDD, cpy0 * rcp4 + b10); + MM_STORE(g+61*SIMDD, cpy1 * rcp5 + b11); + MM_STORE(g+62*SIMDD, cpy2 * rcp6 + b12); + MM_STORE(g+63*SIMDD, cpy3 * rcp7 + b13); + MM_STORE(g+76*SIMDD, c0y0 * rcp4 + b00); + MM_STORE(g+77*SIMDD, c0y1 * rcp5 + b01); + MM_STORE(g+78*SIMDD, c0y2 * rcp6 + b02); + MM_STORE(g+79*SIMDD, c0y3 * rcp7 + b03); + MM_STORE(g+80*SIMDD, c0y0 * cpy0 + b00); + MM_STORE(g+81*SIMDD, c0y1 * cpy1 + b01); + MM_STORE(g+82*SIMDD, c0y2 * cpy2 + b02); + MM_STORE(g+83*SIMDD, c0y3 * cpy3 + b03); + MM_STORE(g+108*SIMDD,(cpz0 * rcp8 + b10) * g96); + MM_STORE(g+109*SIMDD,(cpz1 * rcp9 + b11) * g97); + MM_STORE(g+110*SIMDD,(cpz2 * rcp10+ b12) * g98); + MM_STORE(g+111*SIMDD,(cpz3 * rcp11+ b13) * g99); + MM_STORE(g+124*SIMDD,(c0z0 * rcp8 + b00) * g96); + MM_STORE(g+125*SIMDD,(c0z1 * rcp9 + b01) * g97); + MM_STORE(g+126*SIMDD,(c0z2 * rcp10+ b02) * g98); + MM_STORE(g+127*SIMDD,(c0z3 * rcp11+ b03) * g99); + MM_STORE(g+128*SIMDD,(c0z0 * cpz0 + b00) * g96); + MM_STORE(g+129*SIMDD,(c0z1 * cpz1 + b01) * g97); + MM_STORE(g+130*SIMDD,(c0z2 * cpz2 + b02) * 
g98); + MM_STORE(g+131*SIMDD,(c0z3 * cpz3 + b03) * g99); + MM_STORE(g+36*SIMDD , c0x0 * MM_LOAD(g+12 *SIMDD) + b00 * (rcp0 + cpx0)); + MM_STORE(g+37*SIMDD , c0x1 * MM_LOAD(g+13 *SIMDD) + b01 * (rcp1 + cpx1)); + MM_STORE(g+38*SIMDD , c0x2 * MM_LOAD(g+14 *SIMDD) + b02 * (rcp2 + cpx2)); + MM_STORE(g+39*SIMDD , c0x3 * MM_LOAD(g+15 *SIMDD) + b03 * (rcp3 + cpx3)); + MM_STORE(g+84*SIMDD , c0y0 * MM_LOAD(g+60 *SIMDD) + b00 * (rcp4 + cpy0)); + MM_STORE(g+85*SIMDD , c0y1 * MM_LOAD(g+61 *SIMDD) + b01 * (rcp5 + cpy1)); + MM_STORE(g+86*SIMDD , c0y2 * MM_LOAD(g+62 *SIMDD) + b02 * (rcp6 + cpy2)); + MM_STORE(g+87*SIMDD , c0y3 * MM_LOAD(g+63 *SIMDD) + b03 * (rcp7 + cpy3)); + MM_STORE(g+132*SIMDD, c0z0 * MM_LOAD(g+108*SIMDD) + b00 * (MM_LOAD(g+100*SIMDD) + MM_LOAD(g+104*SIMDD))); + MM_STORE(g+133*SIMDD, c0z1 * MM_LOAD(g+109*SIMDD) + b01 * (MM_LOAD(g+101*SIMDD) + MM_LOAD(g+105*SIMDD))); + MM_STORE(g+134*SIMDD, c0z2 * MM_LOAD(g+110*SIMDD) + b02 * (MM_LOAD(g+102*SIMDD) + MM_LOAD(g+106*SIMDD))); + MM_STORE(g+135*SIMDD, c0z3 * MM_LOAD(g+111*SIMDD) + b03 * (MM_LOAD(g+103*SIMDD) + MM_LOAD(g+107*SIMDD))); +} + +static inline void _srg0_2d4d_0120(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b0 = bc->b00; + double *b1 = bc->b01; + __MD r0 = MM_LOAD(c0x+0*SIMDD); + __MD r1 = MM_LOAD(c0x+1*SIMDD); + __MD r2 = MM_LOAD(c0x+2*SIMDD); + __MD r3 = MM_LOAD(c0x+3*SIMDD); + __MD r4 = MM_LOAD(c0y+0*SIMDD); + __MD r5 = MM_LOAD(c0y+1*SIMDD); + __MD r6 = MM_LOAD(c0y+2*SIMDD); + __MD r7 = MM_LOAD(c0y+3*SIMDD); + __MD r8 = MM_LOAD(c0z+0*SIMDD); + __MD r9 = MM_LOAD(c0z+1*SIMDD); + __MD r10= MM_LOAD(c0z+2*SIMDD); + __MD r11= MM_LOAD(c0z+3*SIMDD); + __MD s0 = MM_LOAD(cpx+0*SIMDD); + __MD s1 = MM_LOAD(cpx+1*SIMDD); + __MD s2 = MM_LOAD(cpx+2*SIMDD); + __MD s3 = MM_LOAD(cpx+3*SIMDD); + __MD s4 = MM_LOAD(cpy+0*SIMDD); + __MD s5 = MM_LOAD(cpy+1*SIMDD); 
+ __MD s6 = MM_LOAD(cpy+2*SIMDD); + __MD s7 = MM_LOAD(cpy+3*SIMDD); + __MD s8 = MM_LOAD(cpz+0*SIMDD); + __MD s9 = MM_LOAD(cpz+1*SIMDD); + __MD s10= MM_LOAD(cpz+2*SIMDD); + __MD s11= MM_LOAD(cpz+3*SIMDD); + __MD b00 = MM_LOAD(b0+0*SIMDD); + __MD b01 = MM_LOAD(b0+1*SIMDD); + __MD b02 = MM_LOAD(b0+2*SIMDD); + __MD b03 = MM_LOAD(b0+3*SIMDD); + __MD b10 = MM_LOAD(b1+0*SIMDD); + __MD b11 = MM_LOAD(b1+1*SIMDD); + __MD b12 = MM_LOAD(b1+2*SIMDD); + __MD b13 = MM_LOAD(b1+3*SIMDD); + __MD g48 = MM_LOAD(g+48*SIMDD); + __MD g49 = MM_LOAD(g+49*SIMDD); + __MD g50 = MM_LOAD(g+50*SIMDD); + __MD g51 = MM_LOAD(g+51*SIMDD); + MM_STORE(g+4 *SIMDD, s0); + MM_STORE(g+5 *SIMDD, s1); + MM_STORE(g+6 *SIMDD, s2); + MM_STORE(g+7 *SIMDD, s3); + MM_STORE(g+12*SIMDD, r0); + MM_STORE(g+13*SIMDD, r1); + MM_STORE(g+14*SIMDD, r2); + MM_STORE(g+15*SIMDD, r3); + MM_STORE(g+28*SIMDD, s4); + MM_STORE(g+29*SIMDD, s5); + MM_STORE(g+30*SIMDD, s6); + MM_STORE(g+31*SIMDD, s7); + MM_STORE(g+36*SIMDD, r4); + MM_STORE(g+37*SIMDD, r5); + MM_STORE(g+38*SIMDD, r6); + MM_STORE(g+39*SIMDD, r7); + MM_STORE(g+52*SIMDD, MM_MUL(s8, g48)); + MM_STORE(g+53*SIMDD, MM_MUL(s9, g49)); + MM_STORE(g+54*SIMDD, MM_MUL(s10,g50)); + MM_STORE(g+55*SIMDD, MM_MUL(s11,g51)); + MM_STORE(g+60*SIMDD, MM_MUL(r8, g48)); + MM_STORE(g+61*SIMDD, MM_MUL(r9, g49)); + MM_STORE(g+62*SIMDD, MM_MUL(r10,g50)); + MM_STORE(g+63*SIMDD, MM_MUL(r11,g51)); + MM_STORE(g+8 *SIMDD, MM_FMA(s0, s0, b10)); + MM_STORE(g+9 *SIMDD, MM_FMA(s1, s1, b11)); + MM_STORE(g+10*SIMDD, MM_FMA(s2, s2, b12)); + MM_STORE(g+11*SIMDD, MM_FMA(s3, s3, b13)); + MM_STORE(g+16*SIMDD, MM_FMA(r0, s0, b00)); + MM_STORE(g+17*SIMDD, MM_FMA(r1, s1, b01)); + MM_STORE(g+18*SIMDD, MM_FMA(r2, s2, b02)); + MM_STORE(g+19*SIMDD, MM_FMA(r3, s3, b03)); + MM_STORE(g+32*SIMDD, MM_FMA(s4, s4, b10)); + MM_STORE(g+33*SIMDD, MM_FMA(s5, s5, b11)); + MM_STORE(g+34*SIMDD, MM_FMA(s6, s6, b12)); + MM_STORE(g+35*SIMDD, MM_FMA(s7, s7, b13)); + MM_STORE(g+40*SIMDD, MM_FMA(r4, s4, b00)); + MM_STORE(g+41*SIMDD, 
MM_FMA(r5, s5, b01)); + MM_STORE(g+42*SIMDD, MM_FMA(r6, s6, b02)); + MM_STORE(g+43*SIMDD, MM_FMA(r7, s7, b03)); + MM_STORE(g+56*SIMDD, MM_MUL(MM_FMA(s8 , s8 , b10), g48)); + MM_STORE(g+57*SIMDD, MM_MUL(MM_FMA(s9 , s9 , b11), g49)); + MM_STORE(g+58*SIMDD, MM_MUL(MM_FMA(s10, s10, b12), g50)); + MM_STORE(g+59*SIMDD, MM_MUL(MM_FMA(s11, s11, b13), g51)); + MM_STORE(g+64*SIMDD, MM_MUL(MM_FMA(r8 , s8 , b00), g48)); + MM_STORE(g+65*SIMDD, MM_MUL(MM_FMA(r9 , s9 , b01), g49)); + MM_STORE(g+66*SIMDD, MM_MUL(MM_FMA(r10, s10, b02), g50)); + MM_STORE(g+67*SIMDD, MM_MUL(MM_FMA(r11, s11, b03), g51)); + MM_STORE(g+20*SIMDD, s0 *(MM_LOAD(g+16*SIMDD) + b00) + b10 * r0); + MM_STORE(g+21*SIMDD, s1 *(MM_LOAD(g+17*SIMDD) + b01) + b11 * r1); + MM_STORE(g+22*SIMDD, s2 *(MM_LOAD(g+18*SIMDD) + b02) + b12 * r2); + MM_STORE(g+23*SIMDD, s3 *(MM_LOAD(g+19*SIMDD) + b03) + b13 * r3); + MM_STORE(g+44*SIMDD, s4 *(MM_LOAD(g+40*SIMDD) + b00) + b10 * r4); + MM_STORE(g+45*SIMDD, s5 *(MM_LOAD(g+41*SIMDD) + b01) + b11 * r5); + MM_STORE(g+46*SIMDD, s6 *(MM_LOAD(g+42*SIMDD) + b02) + b12 * r6); + MM_STORE(g+47*SIMDD, s7 *(MM_LOAD(g+43*SIMDD) + b03) + b13 * r7); + MM_STORE(g+68*SIMDD, s8 * MM_LOAD(g+64*SIMDD) + b00 * MM_LOAD(g+52*SIMDD) + b10 * MM_LOAD(g+60*SIMDD)); + MM_STORE(g+69*SIMDD, s9 * MM_LOAD(g+65*SIMDD) + b01 * MM_LOAD(g+53*SIMDD) + b11 * MM_LOAD(g+61*SIMDD)); + MM_STORE(g+70*SIMDD, s10* MM_LOAD(g+66*SIMDD) + b02 * MM_LOAD(g+54*SIMDD) + b12 * MM_LOAD(g+62*SIMDD)); + MM_STORE(g+71*SIMDD, s11* MM_LOAD(g+67*SIMDD) + b03 * MM_LOAD(g+55*SIMDD) + b13 * MM_LOAD(g+63*SIMDD)); +} + +static inline void _srg0_2d4d_0200(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cx = bc->c00x; + double *cy = bc->c00y; + double *cz = bc->c00z; + double *b = bc->b10; + __MD cx0 = MM_LOAD(cx+0*SIMDD); + __MD cx1 = MM_LOAD(cx+1*SIMDD); + __MD cx2 = MM_LOAD(cx+2*SIMDD); + __MD cx3 = MM_LOAD(cx+3*SIMDD); + __MD cy0 = MM_LOAD(cy+0*SIMDD); + __MD cy1 = MM_LOAD(cy+1*SIMDD); + __MD cy2 = MM_LOAD(cy+2*SIMDD); + __MD cy3 = 
MM_LOAD(cy+3*SIMDD); + __MD cz0 = MM_LOAD(cz+0*SIMDD); + __MD cz1 = MM_LOAD(cz+1*SIMDD); + __MD cz2 = MM_LOAD(cz+2*SIMDD); + __MD cz3 = MM_LOAD(cz+3*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD g24 = MM_LOAD(g+24*SIMDD); + __MD g25 = MM_LOAD(g+25*SIMDD); + __MD g26 = MM_LOAD(g+26*SIMDD); + __MD g27 = MM_LOAD(g+27*SIMDD); + MM_STORE(g+4 *SIMDD, cx0); + MM_STORE(g+5 *SIMDD, cx1); + MM_STORE(g+6 *SIMDD, cx2); + MM_STORE(g+7 *SIMDD, cx3); + MM_STORE(g+16*SIMDD, cy0); + MM_STORE(g+17*SIMDD, cy1); + MM_STORE(g+18*SIMDD, cy2); + MM_STORE(g+19*SIMDD, cy3); + MM_STORE(g+28*SIMDD, MM_MUL(cz0, g24)); + MM_STORE(g+29*SIMDD, MM_MUL(cz1, g25)); + MM_STORE(g+30*SIMDD, MM_MUL(cz2, g26)); + MM_STORE(g+31*SIMDD, MM_MUL(cz3, g27)); + MM_STORE(g+8 *SIMDD, MM_FMA(cx0, cx0, b0)); + MM_STORE(g+9 *SIMDD, MM_FMA(cx1, cx1, b1)); + MM_STORE(g+10*SIMDD, MM_FMA(cx2, cx2, b2)); + MM_STORE(g+11*SIMDD, MM_FMA(cx3, cx3, b3)); + MM_STORE(g+20*SIMDD, MM_FMA(cy0, cy0, b0)); + MM_STORE(g+21*SIMDD, MM_FMA(cy1, cy1, b1)); + MM_STORE(g+22*SIMDD, MM_FMA(cy2, cy2, b2)); + MM_STORE(g+23*SIMDD, MM_FMA(cy3, cy3, b3)); + MM_STORE(g+32*SIMDD, MM_MUL(MM_FMA(cz0, cz0, b0), g24)); + MM_STORE(g+33*SIMDD, MM_MUL(MM_FMA(cz1, cz1, b1), g25)); + MM_STORE(g+34*SIMDD, MM_MUL(MM_FMA(cz2, cz2, b2), g26)); + MM_STORE(g+35*SIMDD, MM_MUL(MM_FMA(cz3, cz3, b3), g27)); +} + +static inline void _srg0_2d4d_0201(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b0 = bc->b00; + double *b1 = bc->b10; + __MD c0x0 = MM_LOAD(c0x+0*SIMDD); + __MD c0x1 = MM_LOAD(c0x+1*SIMDD); + __MD c0x2 = MM_LOAD(c0x+2*SIMDD); + __MD c0x3 = MM_LOAD(c0x+3*SIMDD); + __MD c0y0 = MM_LOAD(c0y+0*SIMDD); + __MD c0y1 = MM_LOAD(c0y+1*SIMDD); + __MD c0y2 = MM_LOAD(c0y+2*SIMDD); + __MD c0y3 = 
MM_LOAD(c0y+3*SIMDD); + __MD c0z0 = MM_LOAD(c0z+0*SIMDD); + __MD c0z1 = MM_LOAD(c0z+1*SIMDD); + __MD c0z2 = MM_LOAD(c0z+2*SIMDD); + __MD c0z3 = MM_LOAD(c0z+3*SIMDD); + __MD cpx0 = MM_LOAD(cpx+0*SIMDD); + __MD cpx1 = MM_LOAD(cpx+1*SIMDD); + __MD cpx2 = MM_LOAD(cpx+2*SIMDD); + __MD cpx3 = MM_LOAD(cpx+3*SIMDD); + __MD cpy0 = MM_LOAD(cpy+0*SIMDD); + __MD cpy1 = MM_LOAD(cpy+1*SIMDD); + __MD cpy2 = MM_LOAD(cpy+2*SIMDD); + __MD cpy3 = MM_LOAD(cpy+3*SIMDD); + __MD cpz0 = MM_LOAD(cpz+0*SIMDD); + __MD cpz1 = MM_LOAD(cpz+1*SIMDD); + __MD cpz2 = MM_LOAD(cpz+2*SIMDD); + __MD cpz3 = MM_LOAD(cpz+3*SIMDD); + __MD b00 = MM_LOAD(b0+0*SIMDD); + __MD b01 = MM_LOAD(b0+1*SIMDD); + __MD b02 = MM_LOAD(b0+2*SIMDD); + __MD b03 = MM_LOAD(b0+3*SIMDD); + __MD b10 = MM_LOAD(b1+0*SIMDD); + __MD b11 = MM_LOAD(b1+1*SIMDD); + __MD b12 = MM_LOAD(b1+2*SIMDD); + __MD b13 = MM_LOAD(b1+3*SIMDD); + __MD g48 = MM_LOAD(g+48*SIMDD); + __MD g49 = MM_LOAD(g+49*SIMDD); + __MD g50 = MM_LOAD(g+50*SIMDD); + __MD g51 = MM_LOAD(g+51*SIMDD); + __MD i2 = MM_SET1(2.); + MM_STORE(g+4 *SIMDD, cpx0); + MM_STORE(g+5 *SIMDD, cpx1); + MM_STORE(g+6 *SIMDD, cpx2); + MM_STORE(g+7 *SIMDD, cpx3); + MM_STORE(g+8 *SIMDD, c0x0); + MM_STORE(g+9 *SIMDD, c0x1); + MM_STORE(g+10*SIMDD, c0x2); + MM_STORE(g+11*SIMDD, c0x3); + MM_STORE(g+28*SIMDD, cpy0); + MM_STORE(g+29*SIMDD, cpy1); + MM_STORE(g+30*SIMDD, cpy2); + MM_STORE(g+31*SIMDD, cpy3); + MM_STORE(g+32*SIMDD, c0y0); + MM_STORE(g+33*SIMDD, c0y1); + MM_STORE(g+34*SIMDD, c0y2); + MM_STORE(g+35*SIMDD, c0y3); + MM_STORE(g+52*SIMDD, cpz0 * g48); + MM_STORE(g+53*SIMDD, cpz1 * g49); + MM_STORE(g+54*SIMDD, cpz2 * g50); + MM_STORE(g+55*SIMDD, cpz3 * g51); + MM_STORE(g+56*SIMDD, c0z0 * g48); + MM_STORE(g+57*SIMDD, c0z1 * g49); + MM_STORE(g+58*SIMDD, c0z2 * g50); + MM_STORE(g+59*SIMDD, c0z3 * g51); + MM_STORE(g+12*SIMDD, cpx0 * c0x0 + b00); + MM_STORE(g+13*SIMDD, cpx1 * c0x1 + b01); + MM_STORE(g+14*SIMDD, cpx2 * c0x2 + b02); + MM_STORE(g+15*SIMDD, cpx3 * c0x3 + b03); + MM_STORE(g+16*SIMDD, c0x0 
* c0x0 + b10); + MM_STORE(g+17*SIMDD, c0x1 * c0x1 + b11); + MM_STORE(g+18*SIMDD, c0x2 * c0x2 + b12); + MM_STORE(g+19*SIMDD, c0x3 * c0x3 + b13); + MM_STORE(g+36*SIMDD, cpy0 * c0y0 + b00); + MM_STORE(g+37*SIMDD, cpy1 * c0y1 + b01); + MM_STORE(g+38*SIMDD, cpy2 * c0y2 + b02); + MM_STORE(g+39*SIMDD, cpy3 * c0y3 + b03); + MM_STORE(g+40*SIMDD, c0y0 * c0y0 + b10); + MM_STORE(g+41*SIMDD, c0y1 * c0y1 + b11); + MM_STORE(g+42*SIMDD, c0y2 * c0y2 + b12); + MM_STORE(g+43*SIMDD, c0y3 * c0y3 + b13); + MM_STORE(g+60*SIMDD,(cpz0 * c0z0 + b00) * g48); + MM_STORE(g+61*SIMDD,(cpz1 * c0z1 + b01) * g49); + MM_STORE(g+62*SIMDD,(cpz2 * c0z2 + b02) * g50); + MM_STORE(g+63*SIMDD,(cpz3 * c0z3 + b03) * g51); + MM_STORE(g+64*SIMDD,(c0z0 * c0z0 + b10) * g48); + MM_STORE(g+65*SIMDD,(c0z1 * c0z1 + b11) * g49); + MM_STORE(g+66*SIMDD,(c0z2 * c0z2 + b12) * g50); + MM_STORE(g+67*SIMDD,(c0z3 * c0z3 + b13) * g51); + MM_STORE(g+20*SIMDD, cpx0 * MM_LOAD(g+16*SIMDD) + i2 * b00 * c0x0); + MM_STORE(g+21*SIMDD, cpx1 * MM_LOAD(g+17*SIMDD) + i2 * b01 * c0x1); + MM_STORE(g+22*SIMDD, cpx2 * MM_LOAD(g+18*SIMDD) + i2 * b02 * c0x2); + MM_STORE(g+23*SIMDD, cpx3 * MM_LOAD(g+19*SIMDD) + i2 * b03 * c0x3); + MM_STORE(g+44*SIMDD, cpy0 * MM_LOAD(g+40*SIMDD) + i2 * b00 * c0y0); + MM_STORE(g+45*SIMDD, cpy1 * MM_LOAD(g+41*SIMDD) + i2 * b01 * c0y1); + MM_STORE(g+46*SIMDD, cpy2 * MM_LOAD(g+42*SIMDD) + i2 * b02 * c0y2); + MM_STORE(g+47*SIMDD, cpy3 * MM_LOAD(g+43*SIMDD) + i2 * b03 * c0y3); + MM_STORE(g+68*SIMDD, cpz0 * MM_LOAD(g+64*SIMDD) + i2 * b00 * MM_LOAD(g+56*SIMDD)); + MM_STORE(g+69*SIMDD, cpz1 * MM_LOAD(g+65*SIMDD) + i2 * b01 * MM_LOAD(g+57*SIMDD)); + MM_STORE(g+70*SIMDD, cpz2 * MM_LOAD(g+66*SIMDD) + i2 * b02 * MM_LOAD(g+58*SIMDD)); + MM_STORE(g+71*SIMDD, cpz3 * MM_LOAD(g+67*SIMDD) + i2 * b03 * MM_LOAD(g+59*SIMDD)); +} + +static inline void _srg0_2d4d_0210(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = 
bc->c0py; + double *cpz = bc->c0pz; + double *b0 = bc->b00; + double *b1 = bc->b10; + __MD r0 = MM_LOAD(c0x+0*SIMDD); + __MD r1 = MM_LOAD(c0x+1*SIMDD); + __MD r2 = MM_LOAD(c0x+2*SIMDD); + __MD r3 = MM_LOAD(c0x+3*SIMDD); + __MD r4 = MM_LOAD(c0y+0*SIMDD); + __MD r5 = MM_LOAD(c0y+1*SIMDD); + __MD r6 = MM_LOAD(c0y+2*SIMDD); + __MD r7 = MM_LOAD(c0y+3*SIMDD); + __MD r8 = MM_LOAD(c0z+0*SIMDD); + __MD r9 = MM_LOAD(c0z+1*SIMDD); + __MD r10= MM_LOAD(c0z+2*SIMDD); + __MD r11= MM_LOAD(c0z+3*SIMDD); + __MD s0 = MM_LOAD(cpx+0*SIMDD); + __MD s1 = MM_LOAD(cpx+1*SIMDD); + __MD s2 = MM_LOAD(cpx+2*SIMDD); + __MD s3 = MM_LOAD(cpx+3*SIMDD); + __MD s4 = MM_LOAD(cpy+0*SIMDD); + __MD s5 = MM_LOAD(cpy+1*SIMDD); + __MD s6 = MM_LOAD(cpy+2*SIMDD); + __MD s7 = MM_LOAD(cpy+3*SIMDD); + __MD s8 = MM_LOAD(cpz+0*SIMDD); + __MD s9 = MM_LOAD(cpz+1*SIMDD); + __MD s10= MM_LOAD(cpz+2*SIMDD); + __MD s11= MM_LOAD(cpz+3*SIMDD); + __MD b00 = MM_LOAD(b0+0*SIMDD); + __MD b01 = MM_LOAD(b0+1*SIMDD); + __MD b02 = MM_LOAD(b0+2*SIMDD); + __MD b03 = MM_LOAD(b0+3*SIMDD); + __MD b10 = MM_LOAD(b1+0*SIMDD); + __MD b11 = MM_LOAD(b1+1*SIMDD); + __MD b12 = MM_LOAD(b1+2*SIMDD); + __MD b13 = MM_LOAD(b1+3*SIMDD); + __MD g48 = MM_LOAD(g+48*SIMDD); + __MD g49 = MM_LOAD(g+49*SIMDD); + __MD g50 = MM_LOAD(g+50*SIMDD); + __MD g51 = MM_LOAD(g+51*SIMDD); + MM_STORE(g+4 *SIMDD, s0); + MM_STORE(g+5 *SIMDD, s1); + MM_STORE(g+6 *SIMDD, s2); + MM_STORE(g+7 *SIMDD, s3); + MM_STORE(g+8 *SIMDD, r0); + MM_STORE(g+9 *SIMDD, r1); + MM_STORE(g+10*SIMDD, r2); + MM_STORE(g+11*SIMDD, r3); + MM_STORE(g+28*SIMDD, s4); + MM_STORE(g+29*SIMDD, s5); + MM_STORE(g+30*SIMDD, s6); + MM_STORE(g+31*SIMDD, s7); + MM_STORE(g+32*SIMDD, r4); + MM_STORE(g+33*SIMDD, r5); + MM_STORE(g+34*SIMDD, r6); + MM_STORE(g+35*SIMDD, r7); + MM_STORE(g+52*SIMDD, MM_MUL(s8, g48)); + MM_STORE(g+53*SIMDD, MM_MUL(s9, g49)); + MM_STORE(g+54*SIMDD, MM_MUL(s10,g50)); + MM_STORE(g+55*SIMDD, MM_MUL(s11,g51)); + MM_STORE(g+56*SIMDD, MM_MUL(r8, g48)); + MM_STORE(g+57*SIMDD, MM_MUL(r9, 
g49)); + MM_STORE(g+58*SIMDD, MM_MUL(r10,g50)); + MM_STORE(g+59*SIMDD, MM_MUL(r11,g51)); + MM_STORE(g+12*SIMDD, MM_FMA(s0, r0, b00)); + MM_STORE(g+13*SIMDD, MM_FMA(s1, r1, b01)); + MM_STORE(g+14*SIMDD, MM_FMA(s2, r2, b02)); + MM_STORE(g+15*SIMDD, MM_FMA(s3, r3, b03)); + MM_STORE(g+16*SIMDD, MM_FMA(r0, r0, b10)); + MM_STORE(g+17*SIMDD, MM_FMA(r1, r1, b11)); + MM_STORE(g+18*SIMDD, MM_FMA(r2, r2, b12)); + MM_STORE(g+19*SIMDD, MM_FMA(r3, r3, b13)); + MM_STORE(g+36*SIMDD, MM_FMA(s4, r4, b00)); + MM_STORE(g+37*SIMDD, MM_FMA(s5, r5, b01)); + MM_STORE(g+38*SIMDD, MM_FMA(s6, r6, b02)); + MM_STORE(g+39*SIMDD, MM_FMA(s7, r7, b03)); + MM_STORE(g+40*SIMDD, MM_FMA(r4, r4, b10)); + MM_STORE(g+41*SIMDD, MM_FMA(r5, r5, b11)); + MM_STORE(g+42*SIMDD, MM_FMA(r6, r6, b12)); + MM_STORE(g+43*SIMDD, MM_FMA(r7, r7, b13)); + MM_STORE(g+60*SIMDD, MM_MUL(MM_FMA(s8, r8, b00), g48)); + MM_STORE(g+61*SIMDD, MM_MUL(MM_FMA(s9, r9, b01), g49)); + MM_STORE(g+62*SIMDD, MM_MUL(MM_FMA(s10,r10,b02), g50)); + MM_STORE(g+63*SIMDD, MM_MUL(MM_FMA(s11,r11,b03), g51)); + MM_STORE(g+64*SIMDD, MM_MUL(MM_FMA(r8, r8, b10), g48)); + MM_STORE(g+65*SIMDD, MM_MUL(MM_FMA(r9, r9, b11), g49)); + MM_STORE(g+66*SIMDD, MM_MUL(MM_FMA(r10,r10,b12), g50)); + MM_STORE(g+67*SIMDD, MM_MUL(MM_FMA(r11,r11,b13), g51)); + MM_STORE(g+20*SIMDD, r0 *(MM_LOAD(g+12*SIMDD) + b00) + b10 * s0); + MM_STORE(g+21*SIMDD, r1 *(MM_LOAD(g+13*SIMDD) + b01) + b11 * s1); + MM_STORE(g+22*SIMDD, r2 *(MM_LOAD(g+14*SIMDD) + b02) + b12 * s2); + MM_STORE(g+23*SIMDD, r3 *(MM_LOAD(g+15*SIMDD) + b03) + b13 * s3); + MM_STORE(g+44*SIMDD, r4 *(MM_LOAD(g+36*SIMDD) + b00) + b10 * s4); + MM_STORE(g+45*SIMDD, r5 *(MM_LOAD(g+37*SIMDD) + b01) + b11 * s5); + MM_STORE(g+46*SIMDD, r6 *(MM_LOAD(g+38*SIMDD) + b02) + b12 * s6); + MM_STORE(g+47*SIMDD, r7 *(MM_LOAD(g+39*SIMDD) + b03) + b13 * s7); + MM_STORE(g+68*SIMDD, r8 * MM_LOAD(g+60*SIMDD) + b10 * MM_LOAD(g+52*SIMDD) + b00 * MM_LOAD(g+56*SIMDD)); + MM_STORE(g+69*SIMDD, r9 * MM_LOAD(g+61*SIMDD) + b11 * MM_LOAD(g+53*SIMDD) 
+ b01 * MM_LOAD(g+57*SIMDD)); + MM_STORE(g+70*SIMDD, r10* MM_LOAD(g+62*SIMDD) + b12 * MM_LOAD(g+54*SIMDD) + b02 * MM_LOAD(g+58*SIMDD)); + MM_STORE(g+71*SIMDD, r11* MM_LOAD(g+63*SIMDD) + b13 * MM_LOAD(g+55*SIMDD) + b03 * MM_LOAD(g+59*SIMDD)); +} + +static inline void _srg0_2d4d_0300(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cx = bc->c00x; + double *cy = bc->c00y; + double *cz = bc->c00z; + double *b = bc->b10; + __MD cx0 = MM_LOAD(cx+0*SIMDD); + __MD cx1 = MM_LOAD(cx+1*SIMDD); + __MD cx2 = MM_LOAD(cx+2*SIMDD); + __MD cx3 = MM_LOAD(cx+3*SIMDD); + __MD cy0 = MM_LOAD(cy+0*SIMDD); + __MD cy1 = MM_LOAD(cy+1*SIMDD); + __MD cy2 = MM_LOAD(cy+2*SIMDD); + __MD cy3 = MM_LOAD(cy+3*SIMDD); + __MD cz0 = MM_LOAD(cz+0*SIMDD); + __MD cz1 = MM_LOAD(cz+1*SIMDD); + __MD cz2 = MM_LOAD(cz+2*SIMDD); + __MD cz3 = MM_LOAD(cz+3*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD g32 = MM_LOAD(g+32*SIMDD); + __MD g33 = MM_LOAD(g+33*SIMDD); + __MD g34 = MM_LOAD(g+34*SIMDD); + __MD g35 = MM_LOAD(g+35*SIMDD); + __MD i3 = MM_SET1(3.); + MM_STORE(g+4 *SIMDD, cx0); + MM_STORE(g+5 *SIMDD, cx1); + MM_STORE(g+6 *SIMDD, cx2); + MM_STORE(g+7 *SIMDD, cx3); + MM_STORE(g+20*SIMDD, cy0); + MM_STORE(g+21*SIMDD, cy1); + MM_STORE(g+22*SIMDD, cy2); + MM_STORE(g+23*SIMDD, cy3); + MM_STORE(g+36*SIMDD, cz0 * g32); + MM_STORE(g+37*SIMDD, cz1 * g33); + MM_STORE(g+38*SIMDD, cz2 * g34); + MM_STORE(g+39*SIMDD, cz3 * g35); + MM_STORE(g+8 *SIMDD, cx0 * cx0 + b0); + MM_STORE(g+9 *SIMDD, cx1 * cx1 + b1); + MM_STORE(g+10*SIMDD, cx2 * cx2 + b2); + MM_STORE(g+11*SIMDD, cx3 * cx3 + b3); + MM_STORE(g+24*SIMDD, cy0 * cy0 + b0); + MM_STORE(g+25*SIMDD, cy1 * cy1 + b1); + MM_STORE(g+26*SIMDD, cy2 * cy2 + b2); + MM_STORE(g+27*SIMDD, cy3 * cy3 + b3); + MM_STORE(g+40*SIMDD,(cz0 * cz0 + b0)* g32); + MM_STORE(g+41*SIMDD,(cz1 * cz1 + b1)* g33); + MM_STORE(g+42*SIMDD,(cz2 * cz2 + b2)* g34); + MM_STORE(g+43*SIMDD,(cz3 * cz3 + b3)* g35); 
+ MM_STORE(g+12*SIMDD, cx0 *(cx0 * cx0 + i3 * b0)); + MM_STORE(g+13*SIMDD, cx1 *(cx1 * cx1 + i3 * b1)); + MM_STORE(g+14*SIMDD, cx2 *(cx2 * cx2 + i3 * b2)); + MM_STORE(g+15*SIMDD, cx3 *(cx3 * cx3 + i3 * b3)); + MM_STORE(g+28*SIMDD, cy0 *(cy0 * cy0 + i3 * b0)); + MM_STORE(g+29*SIMDD, cy1 *(cy1 * cy1 + i3 * b1)); + MM_STORE(g+30*SIMDD, cy2 *(cy2 * cy2 + i3 * b2)); + MM_STORE(g+31*SIMDD, cy3 *(cy3 * cy3 + i3 * b3)); + MM_STORE(g+44*SIMDD,(cz0 * cz0 + i3 * b0)* MM_LOAD(g+36*SIMDD)); + MM_STORE(g+45*SIMDD,(cz1 * cz1 + i3 * b1)* MM_LOAD(g+37*SIMDD)); + MM_STORE(g+46*SIMDD,(cz2 * cz2 + i3 * b2)* MM_LOAD(g+38*SIMDD)); + MM_STORE(g+47*SIMDD,(cz3 * cz3 + i3 * b3)* MM_LOAD(g+39*SIMDD)); +} + +static inline void _srg0_2d4d_1000(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cx = bc->c00x; + double *cy = bc->c00y; + double *cz = bc->c00z; + MM_STORE(g+2*SIMDD, MM_LOAD(cx+0*SIMDD)); + MM_STORE(g+3*SIMDD, MM_LOAD(cx+1*SIMDD)); + MM_STORE(g+6*SIMDD, MM_LOAD(cy+0*SIMDD)); + MM_STORE(g+7*SIMDD, MM_LOAD(cy+1*SIMDD)); + MM_STORE(g+10*SIMDD, MM_MUL(MM_LOAD(cz+0*SIMDD), MM_LOAD(g+8*SIMDD))); + MM_STORE(g+11*SIMDD, MM_MUL(MM_LOAD(cz+1*SIMDD), MM_LOAD(g+9*SIMDD))); +} + +static inline void _srg0_2d4d_1001(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b = bc->b00; + __MD cx0 = MM_LOAD(c0x+0*SIMDD); + __MD cx1 = MM_LOAD(c0x+1*SIMDD); + __MD cx2 = MM_LOAD(c0x+2*SIMDD); + __MD cx3 = MM_LOAD(c0x+3*SIMDD); + __MD cy0 = MM_LOAD(c0y+0*SIMDD); + __MD cy1 = MM_LOAD(c0y+1*SIMDD); + __MD cy2 = MM_LOAD(c0y+2*SIMDD); + __MD cy3 = MM_LOAD(c0y+3*SIMDD); + __MD cz0 = MM_LOAD(c0z+0*SIMDD); + __MD cz1 = MM_LOAD(c0z+1*SIMDD); + __MD cz2 = MM_LOAD(c0z+2*SIMDD); + __MD cz3 = MM_LOAD(c0z+3*SIMDD); + __MD px0 = MM_LOAD(cpx+0*SIMDD); + __MD px1 = MM_LOAD(cpx+1*SIMDD); + __MD px2 = MM_LOAD(cpx+2*SIMDD); + __MD px3 = MM_LOAD(cpx+3*SIMDD); + __MD 
py0 = MM_LOAD(cpy+0*SIMDD); + __MD py1 = MM_LOAD(cpy+1*SIMDD); + __MD py2 = MM_LOAD(cpy+2*SIMDD); + __MD py3 = MM_LOAD(cpy+3*SIMDD); + __MD pz0 = MM_LOAD(cpz+0*SIMDD); + __MD pz1 = MM_LOAD(cpz+1*SIMDD); + __MD pz2 = MM_LOAD(cpz+2*SIMDD); + __MD pz3 = MM_LOAD(cpz+3*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD g32 = MM_LOAD(g+32*SIMDD); + __MD g33 = MM_LOAD(g+33*SIMDD); + __MD g34 = MM_LOAD(g+34*SIMDD); + __MD g35 = MM_LOAD(g+35*SIMDD); + MM_STORE(g+4 *SIMDD, cx0); + MM_STORE(g+5 *SIMDD, cx1); + MM_STORE(g+6 *SIMDD, cx2); + MM_STORE(g+7 *SIMDD, cx3); + MM_STORE(g+8 *SIMDD, px0); + MM_STORE(g+9 *SIMDD, px1); + MM_STORE(g+10*SIMDD, px2); + MM_STORE(g+11*SIMDD, px3); + MM_STORE(g+20*SIMDD, cy0); + MM_STORE(g+21*SIMDD, cy1); + MM_STORE(g+22*SIMDD, cy2); + MM_STORE(g+23*SIMDD, cy3); + MM_STORE(g+24*SIMDD, py0); + MM_STORE(g+25*SIMDD, py1); + MM_STORE(g+26*SIMDD, py2); + MM_STORE(g+27*SIMDD, py3); + MM_STORE(g+36*SIMDD, MM_MUL(cz0, g32)); + MM_STORE(g+37*SIMDD, MM_MUL(cz1, g33)); + MM_STORE(g+38*SIMDD, MM_MUL(cz2, g34)); + MM_STORE(g+39*SIMDD, MM_MUL(cz3, g35)); + MM_STORE(g+40*SIMDD, MM_MUL(pz0, g32)); + MM_STORE(g+41*SIMDD, MM_MUL(pz1, g33)); + MM_STORE(g+42*SIMDD, MM_MUL(pz2, g34)); + MM_STORE(g+43*SIMDD, MM_MUL(pz3, g35)); + MM_STORE(g+12*SIMDD, MM_FMA(px0, cx0, b0)); + MM_STORE(g+13*SIMDD, MM_FMA(px1, cx1, b1)); + MM_STORE(g+14*SIMDD, MM_FMA(px2, cx2, b2)); + MM_STORE(g+15*SIMDD, MM_FMA(px3, cx3, b3)); + MM_STORE(g+28*SIMDD, MM_FMA(py0, cy0, b0)); + MM_STORE(g+29*SIMDD, MM_FMA(py1, cy1, b1)); + MM_STORE(g+30*SIMDD, MM_FMA(py2, cy2, b2)); + MM_STORE(g+31*SIMDD, MM_FMA(py3, cy3, b3)); + MM_STORE(g+44*SIMDD, MM_MUL(MM_FMA(pz0, cz0, b0), g32)); + MM_STORE(g+45*SIMDD, MM_MUL(MM_FMA(pz1, cz1, b1), g33)); + MM_STORE(g+46*SIMDD, MM_MUL(MM_FMA(pz2, cz2, b2), g34)); + MM_STORE(g+47*SIMDD, MM_MUL(MM_FMA(pz3, cz3, b3), g35)); +} + +static inline void _srg0_2d4d_1002(double *g, Rys2eT 
*bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b0 = bc->b00; + double *b1 = bc->b01; + __MD r0 = MM_LOAD(c0x+0*SIMDD); + __MD r1 = MM_LOAD(c0x+1*SIMDD); + __MD r2 = MM_LOAD(c0x+2*SIMDD); + __MD r3 = MM_LOAD(c0x+3*SIMDD); + __MD r4 = MM_LOAD(c0y+0*SIMDD); + __MD r5 = MM_LOAD(c0y+1*SIMDD); + __MD r6 = MM_LOAD(c0y+2*SIMDD); + __MD r7 = MM_LOAD(c0y+3*SIMDD); + __MD r8 = MM_LOAD(c0z+0*SIMDD); + __MD r9 = MM_LOAD(c0z+1*SIMDD); + __MD r10= MM_LOAD(c0z+2*SIMDD); + __MD r11= MM_LOAD(c0z+3*SIMDD); + __MD s0 = MM_LOAD(cpx+0*SIMDD); + __MD s1 = MM_LOAD(cpx+1*SIMDD); + __MD s2 = MM_LOAD(cpx+2*SIMDD); + __MD s3 = MM_LOAD(cpx+3*SIMDD); + __MD s4 = MM_LOAD(cpy+0*SIMDD); + __MD s5 = MM_LOAD(cpy+1*SIMDD); + __MD s6 = MM_LOAD(cpy+2*SIMDD); + __MD s7 = MM_LOAD(cpy+3*SIMDD); + __MD s8 = MM_LOAD(cpz+0*SIMDD); + __MD s9 = MM_LOAD(cpz+1*SIMDD); + __MD s10= MM_LOAD(cpz+2*SIMDD); + __MD s11= MM_LOAD(cpz+3*SIMDD); + __MD b00 = MM_LOAD(b0+0*SIMDD); + __MD b01 = MM_LOAD(b0+1*SIMDD); + __MD b02 = MM_LOAD(b0+2*SIMDD); + __MD b03 = MM_LOAD(b0+3*SIMDD); + __MD b10 = MM_LOAD(b1+0*SIMDD); + __MD b11 = MM_LOAD(b1+1*SIMDD); + __MD b12 = MM_LOAD(b1+2*SIMDD); + __MD b13 = MM_LOAD(b1+3*SIMDD); + __MD g48 = MM_LOAD(g+48*SIMDD); + __MD g49 = MM_LOAD(g+49*SIMDD); + __MD g50 = MM_LOAD(g+50*SIMDD); + __MD g51 = MM_LOAD(g+51*SIMDD); + MM_STORE(g+4 *SIMDD, r0); + MM_STORE(g+5 *SIMDD, r1); + MM_STORE(g+6 *SIMDD, r2); + MM_STORE(g+7 *SIMDD, r3); + MM_STORE(g+8 *SIMDD, s0); + MM_STORE(g+9 *SIMDD, s1); + MM_STORE(g+10*SIMDD, s2); + MM_STORE(g+11*SIMDD, s3); + MM_STORE(g+28*SIMDD, r4); + MM_STORE(g+29*SIMDD, r5); + MM_STORE(g+30*SIMDD, r6); + MM_STORE(g+31*SIMDD, r7); + MM_STORE(g+32*SIMDD, s4); + MM_STORE(g+33*SIMDD, s5); + MM_STORE(g+34*SIMDD, s6); + MM_STORE(g+35*SIMDD, s7); + MM_STORE(g+52*SIMDD, MM_MUL(r8, g48)); + MM_STORE(g+53*SIMDD, MM_MUL(r9, g49)); + 
MM_STORE(g+54*SIMDD, MM_MUL(r10,g50)); + MM_STORE(g+55*SIMDD, MM_MUL(r11,g51)); + MM_STORE(g+56*SIMDD, MM_MUL(s8, g48)); + MM_STORE(g+57*SIMDD, MM_MUL(s9, g49)); + MM_STORE(g+58*SIMDD, MM_MUL(s10,g50)); + MM_STORE(g+59*SIMDD, MM_MUL(s11,g51)); + MM_STORE(g+12*SIMDD, MM_FMA(r0, s0, b00)); + MM_STORE(g+13*SIMDD, MM_FMA(r1, s1, b01)); + MM_STORE(g+14*SIMDD, MM_FMA(r2, s2, b02)); + MM_STORE(g+15*SIMDD, MM_FMA(r3, s3, b03)); + MM_STORE(g+16*SIMDD, MM_FMA(s0, s0, b10)); + MM_STORE(g+17*SIMDD, MM_FMA(s1, s1, b11)); + MM_STORE(g+18*SIMDD, MM_FMA(s2, s2, b12)); + MM_STORE(g+19*SIMDD, MM_FMA(s3, s3, b13)); + MM_STORE(g+36*SIMDD, MM_FMA(r4, s4, b00)); + MM_STORE(g+37*SIMDD, MM_FMA(r5, s5, b01)); + MM_STORE(g+38*SIMDD, MM_FMA(r6, s6, b02)); + MM_STORE(g+39*SIMDD, MM_FMA(r7, s7, b03)); + MM_STORE(g+40*SIMDD, MM_FMA(s4, s4, b10)); + MM_STORE(g+41*SIMDD, MM_FMA(s5, s5, b11)); + MM_STORE(g+42*SIMDD, MM_FMA(s6, s6, b12)); + MM_STORE(g+43*SIMDD, MM_FMA(s7, s7, b13)); + MM_STORE(g+60*SIMDD, MM_MUL(MM_FMA(r8, s8, b00), g48)); + MM_STORE(g+61*SIMDD, MM_MUL(MM_FMA(r9, s9, b01), g49)); + MM_STORE(g+62*SIMDD, MM_MUL(MM_FMA(r10,s10,b02), g50)); + MM_STORE(g+63*SIMDD, MM_MUL(MM_FMA(r11,s11,b03), g51)); + MM_STORE(g+64*SIMDD, MM_MUL(MM_FMA(s8, s8, b10), g48)); + MM_STORE(g+65*SIMDD, MM_MUL(MM_FMA(s9, s9, b11), g49)); + MM_STORE(g+66*SIMDD, MM_MUL(MM_FMA(s10,s10,b12), g50)); + MM_STORE(g+67*SIMDD, MM_MUL(MM_FMA(s11,s11,b13), g51)); + MM_STORE(g+20*SIMDD, s0 *(MM_LOAD(g+12*SIMDD) + b00) + b10 * r0); + MM_STORE(g+21*SIMDD, s1 *(MM_LOAD(g+13*SIMDD) + b01) + b11 * r1); + MM_STORE(g+22*SIMDD, s2 *(MM_LOAD(g+14*SIMDD) + b02) + b12 * r2); + MM_STORE(g+23*SIMDD, s3 *(MM_LOAD(g+15*SIMDD) + b03) + b13 * r3); + MM_STORE(g+44*SIMDD, s4 *(MM_LOAD(g+36*SIMDD) + b00) + b10 * r4); + MM_STORE(g+45*SIMDD, s5 *(MM_LOAD(g+37*SIMDD) + b01) + b11 * r5); + MM_STORE(g+46*SIMDD, s6 *(MM_LOAD(g+38*SIMDD) + b02) + b12 * r6); + MM_STORE(g+47*SIMDD, s7 *(MM_LOAD(g+39*SIMDD) + b03) + b13 * r7); + MM_STORE(g+68*SIMDD, s8 * 
MM_LOAD(g+60*SIMDD) + b10 * MM_LOAD(g+52*SIMDD) + b00 * MM_LOAD(g+56*SIMDD)); + MM_STORE(g+69*SIMDD, s9 * MM_LOAD(g+61*SIMDD) + b11 * MM_LOAD(g+53*SIMDD) + b01 * MM_LOAD(g+57*SIMDD)); + MM_STORE(g+70*SIMDD, s10* MM_LOAD(g+62*SIMDD) + b12 * MM_LOAD(g+54*SIMDD) + b02 * MM_LOAD(g+58*SIMDD)); + MM_STORE(g+71*SIMDD, s11* MM_LOAD(g+63*SIMDD) + b13 * MM_LOAD(g+55*SIMDD) + b03 * MM_LOAD(g+59*SIMDD)); +} + +static inline void _srg0_2d4d_1010(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b = bc->b00; + __MD cx0 = MM_LOAD(c0x+0*SIMDD); + __MD cx1 = MM_LOAD(c0x+1*SIMDD); + __MD cx2 = MM_LOAD(c0x+2*SIMDD); + __MD cx3 = MM_LOAD(c0x+3*SIMDD); + __MD cy0 = MM_LOAD(c0y+0*SIMDD); + __MD cy1 = MM_LOAD(c0y+1*SIMDD); + __MD cy2 = MM_LOAD(c0y+2*SIMDD); + __MD cy3 = MM_LOAD(c0y+3*SIMDD); + __MD cz0 = MM_LOAD(c0z+0*SIMDD); + __MD cz1 = MM_LOAD(c0z+1*SIMDD); + __MD cz2 = MM_LOAD(c0z+2*SIMDD); + __MD cz3 = MM_LOAD(c0z+3*SIMDD); + __MD px0 = MM_LOAD(cpx+0*SIMDD); + __MD px1 = MM_LOAD(cpx+1*SIMDD); + __MD px2 = MM_LOAD(cpx+2*SIMDD); + __MD px3 = MM_LOAD(cpx+3*SIMDD); + __MD py0 = MM_LOAD(cpy+0*SIMDD); + __MD py1 = MM_LOAD(cpy+1*SIMDD); + __MD py2 = MM_LOAD(cpy+2*SIMDD); + __MD py3 = MM_LOAD(cpy+3*SIMDD); + __MD pz0 = MM_LOAD(cpz+0*SIMDD); + __MD pz1 = MM_LOAD(cpz+1*SIMDD); + __MD pz2 = MM_LOAD(cpz+2*SIMDD); + __MD pz3 = MM_LOAD(cpz+3*SIMDD); + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD g32 = MM_LOAD(g+32*SIMDD); + __MD g33 = MM_LOAD(g+33*SIMDD); + __MD g34 = MM_LOAD(g+34*SIMDD); + __MD g35 = MM_LOAD(g+35*SIMDD); + MM_STORE(g+4 *SIMDD, cx0); + MM_STORE(g+5 *SIMDD, cx1); + MM_STORE(g+6 *SIMDD, cx2); + MM_STORE(g+7 *SIMDD, cx3); + MM_STORE(g+8 *SIMDD, px0); + MM_STORE(g+9 *SIMDD, px1); + MM_STORE(g+10*SIMDD, px2); + MM_STORE(g+11*SIMDD, px3); + 
MM_STORE(g+20*SIMDD, cy0); + MM_STORE(g+21*SIMDD, cy1); + MM_STORE(g+22*SIMDD, cy2); + MM_STORE(g+23*SIMDD, cy3); + MM_STORE(g+24*SIMDD, py0); + MM_STORE(g+25*SIMDD, py1); + MM_STORE(g+26*SIMDD, py2); + MM_STORE(g+27*SIMDD, py3); + MM_STORE(g+36*SIMDD, MM_MUL(cz0, g32)); + MM_STORE(g+37*SIMDD, MM_MUL(cz1, g33)); + MM_STORE(g+38*SIMDD, MM_MUL(cz2, g34)); + MM_STORE(g+39*SIMDD, MM_MUL(cz3, g35)); + MM_STORE(g+40*SIMDD, MM_MUL(pz0, g32)); + MM_STORE(g+41*SIMDD, MM_MUL(pz1, g33)); + MM_STORE(g+42*SIMDD, MM_MUL(pz2, g34)); + MM_STORE(g+43*SIMDD, MM_MUL(pz3, g35)); + MM_STORE(g+12*SIMDD, MM_FMA(px0, cx0, b0)); + MM_STORE(g+13*SIMDD, MM_FMA(px1, cx1, b1)); + MM_STORE(g+14*SIMDD, MM_FMA(px2, cx2, b2)); + MM_STORE(g+15*SIMDD, MM_FMA(px3, cx3, b3)); + MM_STORE(g+28*SIMDD, MM_FMA(py0, cy0, b0)); + MM_STORE(g+29*SIMDD, MM_FMA(py1, cy1, b1)); + MM_STORE(g+30*SIMDD, MM_FMA(py2, cy2, b2)); + MM_STORE(g+31*SIMDD, MM_FMA(py3, cy3, b3)); + MM_STORE(g+44*SIMDD, MM_MUL(MM_FMA(pz0, cz0, b0), g32)); + MM_STORE(g+45*SIMDD, MM_MUL(MM_FMA(pz1, cz1, b1), g33)); + MM_STORE(g+46*SIMDD, MM_MUL(MM_FMA(pz2, cz2, b2), g34)); + MM_STORE(g+47*SIMDD, MM_MUL(MM_FMA(pz3, cz3, b3), g35)); +} + +static inline void _srg0_2d4d_1011(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b0 = bc->b00; + double *b1 = bc->b01; + double *r = envs->rkrl; + __MD rx = MM_SET1(r[0]); + __MD ry = MM_SET1(r[1]); + __MD rz = MM_SET1(r[2]); + __MD cpx0 = MM_LOAD(cpx+0*SIMDD); + __MD cpx1 = MM_LOAD(cpx+1*SIMDD); + __MD cpx2 = MM_LOAD(cpx+2*SIMDD); + __MD cpx3 = MM_LOAD(cpx+3*SIMDD); + __MD cpy0 = MM_LOAD(cpy+0*SIMDD); + __MD cpy1 = MM_LOAD(cpy+1*SIMDD); + __MD cpy2 = MM_LOAD(cpy+2*SIMDD); + __MD cpy3 = MM_LOAD(cpy+3*SIMDD); + __MD cpz0 = MM_LOAD(cpz+0*SIMDD); + __MD cpz1 = MM_LOAD(cpz+1*SIMDD); + __MD cpz2 = MM_LOAD(cpz+2*SIMDD); + __MD cpz3 = MM_LOAD(cpz+3*SIMDD); 
+ __MD rcp0 = MM_ADD(rx, cpx0); + __MD rcp1 = MM_ADD(rx, cpx1); + __MD rcp2 = MM_ADD(rx, cpx2); + __MD rcp3 = MM_ADD(rx, cpx3); + __MD rcp4 = MM_ADD(ry, cpy0); + __MD rcp5 = MM_ADD(ry, cpy1); + __MD rcp6 = MM_ADD(ry, cpy2); + __MD rcp7 = MM_ADD(ry, cpy3); + __MD rcp8 = MM_ADD(rz, cpz0); + __MD rcp9 = MM_ADD(rz, cpz1); + __MD rcp10= MM_ADD(rz, cpz2); + __MD rcp11= MM_ADD(rz, cpz3); + __MD c0x0 = MM_LOAD(c0x+0*SIMDD); + __MD c0x1 = MM_LOAD(c0x+1*SIMDD); + __MD c0x2 = MM_LOAD(c0x+2*SIMDD); + __MD c0x3 = MM_LOAD(c0x+3*SIMDD); + __MD c0y0 = MM_LOAD(c0y+0*SIMDD); + __MD c0y1 = MM_LOAD(c0y+1*SIMDD); + __MD c0y2 = MM_LOAD(c0y+2*SIMDD); + __MD c0y3 = MM_LOAD(c0y+3*SIMDD); + __MD c0z0 = MM_LOAD(c0z+0*SIMDD); + __MD c0z1 = MM_LOAD(c0z+1*SIMDD); + __MD c0z2 = MM_LOAD(c0z+2*SIMDD); + __MD c0z3 = MM_LOAD(c0z+3*SIMDD); + __MD b00 = MM_LOAD(b0+0*SIMDD); + __MD b01 = MM_LOAD(b0+1*SIMDD); + __MD b02 = MM_LOAD(b0+2*SIMDD); + __MD b03 = MM_LOAD(b0+3*SIMDD); + __MD b10 = MM_LOAD(b1+0*SIMDD); + __MD b11 = MM_LOAD(b1+1*SIMDD); + __MD b12 = MM_LOAD(b1+2*SIMDD); + __MD b13 = MM_LOAD(b1+3*SIMDD); + __MD g96 = MM_LOAD(g+96*SIMDD); + __MD g97 = MM_LOAD(g+97*SIMDD); + __MD g98 = MM_LOAD(g+98*SIMDD); + __MD g99 = MM_LOAD(g+99*SIMDD); + MM_STORE(g+4 *SIMDD, c0x0); + MM_STORE(g+5 *SIMDD, c0x1); + MM_STORE(g+6 *SIMDD, c0x2); + MM_STORE(g+7 *SIMDD, c0x3); + MM_STORE(g+8 *SIMDD, rcp0); + MM_STORE(g+9 *SIMDD, rcp1); + MM_STORE(g+10*SIMDD, rcp2); + MM_STORE(g+11*SIMDD, rcp3); + MM_STORE(g+16*SIMDD, cpx0); + MM_STORE(g+17*SIMDD, cpx1); + MM_STORE(g+18*SIMDD, cpx2); + MM_STORE(g+19*SIMDD, cpx3); + MM_STORE(g+52*SIMDD, c0y0); + MM_STORE(g+53*SIMDD, c0y1); + MM_STORE(g+54*SIMDD, c0y2); + MM_STORE(g+55*SIMDD, c0y3); + MM_STORE(g+56*SIMDD, rcp4); + MM_STORE(g+57*SIMDD, rcp5); + MM_STORE(g+58*SIMDD, rcp6); + MM_STORE(g+59*SIMDD, rcp7); + MM_STORE(g+64*SIMDD, cpy0); + MM_STORE(g+65*SIMDD, cpy1); + MM_STORE(g+66*SIMDD, cpy2); + MM_STORE(g+67*SIMDD, cpy3); + MM_STORE(g+100*SIMDD, c0z0 * g96); + 
MM_STORE(g+101*SIMDD, c0z1 * g97); + MM_STORE(g+102*SIMDD, c0z2 * g98); + MM_STORE(g+103*SIMDD, c0z3 * g99); + MM_STORE(g+104*SIMDD, rcp8 * g96); + MM_STORE(g+105*SIMDD, rcp9 * g97); + MM_STORE(g+106*SIMDD, rcp10* g98); + MM_STORE(g+107*SIMDD, rcp11* g99); + MM_STORE(g+112*SIMDD, cpz0 * g96); + MM_STORE(g+113*SIMDD, cpz1 * g97); + MM_STORE(g+114*SIMDD, cpz2 * g98); + MM_STORE(g+115*SIMDD, cpz3 * g99); + MM_STORE(g+12*SIMDD, rcp0 * c0x0 + b00); + MM_STORE(g+13*SIMDD, rcp1 * c0x1 + b01); + MM_STORE(g+14*SIMDD, rcp2 * c0x2 + b02); + MM_STORE(g+15*SIMDD, rcp3 * c0x3 + b03); + MM_STORE(g+20*SIMDD, cpx0 * c0x0 + b00); + MM_STORE(g+21*SIMDD, cpx1 * c0x1 + b01); + MM_STORE(g+22*SIMDD, cpx2 * c0x2 + b02); + MM_STORE(g+23*SIMDD, cpx3 * c0x3 + b03); + MM_STORE(g+24*SIMDD, cpx0 * rcp0 + b10); + MM_STORE(g+25*SIMDD, cpx1 * rcp1 + b11); + MM_STORE(g+26*SIMDD, cpx2 * rcp2 + b12); + MM_STORE(g+27*SIMDD, cpx3 * rcp3 + b13); + MM_STORE(g+60*SIMDD, rcp4 * c0y0 + b00); + MM_STORE(g+61*SIMDD, rcp5 * c0y1 + b01); + MM_STORE(g+62*SIMDD, rcp6 * c0y2 + b02); + MM_STORE(g+63*SIMDD, rcp7 * c0y3 + b03); + MM_STORE(g+68*SIMDD, cpy0 * c0y0 + b00); + MM_STORE(g+69*SIMDD, cpy1 * c0y1 + b01); + MM_STORE(g+70*SIMDD, cpy2 * c0y2 + b02); + MM_STORE(g+71*SIMDD, cpy3 * c0y3 + b03); + MM_STORE(g+72*SIMDD, cpy0 * rcp4 + b10); + MM_STORE(g+73*SIMDD, cpy1 * rcp5 + b11); + MM_STORE(g+74*SIMDD, cpy2 * rcp6 + b12); + MM_STORE(g+75*SIMDD, cpy3 * rcp7 + b13); + MM_STORE(g+108*SIMDD,(rcp8 * c0z0 + b00)* g96); + MM_STORE(g+109*SIMDD,(rcp9 * c0z1 + b01)* g97); + MM_STORE(g+110*SIMDD,(rcp10* c0z2 + b02)* g98); + MM_STORE(g+111*SIMDD,(rcp11* c0z3 + b03)* g99); + MM_STORE(g+116*SIMDD,(cpz0 * c0z0 + b00)* g96); + MM_STORE(g+117*SIMDD,(cpz1 * c0z1 + b01)* g97); + MM_STORE(g+118*SIMDD,(cpz2 * c0z2 + b02)* g98); + MM_STORE(g+119*SIMDD,(cpz3 * c0z3 + b03)* g99); + MM_STORE(g+120*SIMDD,(cpz0 * rcp8 + b10)* g96); + MM_STORE(g+121*SIMDD,(cpz1 * rcp9 + b11)* g97); + MM_STORE(g+122*SIMDD,(cpz2 * rcp10+ b12)* g98); + 
MM_STORE(g+123*SIMDD,(cpz3 * rcp11+ b13)* g99); + MM_STORE(g+28*SIMDD , rcp0 * MM_LOAD(g+20 *SIMDD) + b00 * cpx0 + b10 * c0x0); + MM_STORE(g+29*SIMDD , rcp1 * MM_LOAD(g+21 *SIMDD) + b01 * cpx1 + b11 * c0x1); + MM_STORE(g+30*SIMDD , rcp2 * MM_LOAD(g+22 *SIMDD) + b02 * cpx2 + b12 * c0x2); + MM_STORE(g+31*SIMDD , rcp3 * MM_LOAD(g+23 *SIMDD) + b03 * cpx3 + b13 * c0x3); + MM_STORE(g+76*SIMDD , rcp4 * MM_LOAD(g+68 *SIMDD) + b00 * cpy0 + b10 * c0y0); + MM_STORE(g+77*SIMDD , rcp5 * MM_LOAD(g+69 *SIMDD) + b01 * cpy1 + b11 * c0y1); + MM_STORE(g+78*SIMDD , rcp6 * MM_LOAD(g+70 *SIMDD) + b02 * cpy2 + b12 * c0y2); + MM_STORE(g+79*SIMDD , rcp7 * MM_LOAD(g+71 *SIMDD) + b03 * cpy3 + b13 * c0y3); + MM_STORE(g+124*SIMDD, rcp8 * MM_LOAD(g+116*SIMDD) + b00 * MM_LOAD(g+112*SIMDD) + b10 * MM_LOAD(g+100*SIMDD)); + MM_STORE(g+125*SIMDD, rcp9 * MM_LOAD(g+117*SIMDD) + b01 * MM_LOAD(g+113*SIMDD) + b11 * MM_LOAD(g+101*SIMDD)); + MM_STORE(g+126*SIMDD, rcp10* MM_LOAD(g+118*SIMDD) + b02 * MM_LOAD(g+114*SIMDD) + b12 * MM_LOAD(g+102*SIMDD)); + MM_STORE(g+127*SIMDD, rcp11* MM_LOAD(g+119*SIMDD) + b03 * MM_LOAD(g+115*SIMDD) + b13 * MM_LOAD(g+103*SIMDD)); +}
+
+/*
+ * Kernel "1020".  Inputs are bc->c00x/y/z, bc->c0px/y/z, bc->b00 and
+ * bc->b01, each read as four chunks at offsets k*SIMDD (k = 0..3), plus
+ * the four chunks g[48..51] (SIMDD units) as found on entry, which scale
+ * every z entry.  All inputs are read before the first store.
+ * envs is not used.
+ */
+static inline void _srg0_2d4d_1020(double *g, Rys2eT *bc, CINTEnvVars *envs)
+{
+        __MD ax[4], ay[4], az[4];       /* chunks of bc->c00x/y/z */
+        __MD px[4], py[4], pz[4];       /* chunks of bc->c0px/y/z */
+        __MD u[4], v[4];                /* chunks of bc->b00 and bc->b01 */
+        __MD w[4];                      /* g[48..51] on entry */
+        int k;
+
+        for (k = 0; k < 4; k++) {
+                ax[k] = MM_LOAD(bc->c00x+k*SIMDD);
+                ay[k] = MM_LOAD(bc->c00y+k*SIMDD);
+                az[k] = MM_LOAD(bc->c00z+k*SIMDD);
+                px[k] = MM_LOAD(bc->c0px+k*SIMDD);
+                py[k] = MM_LOAD(bc->c0py+k*SIMDD);
+                pz[k] = MM_LOAD(bc->c0pz+k*SIMDD);
+                u[k]  = MM_LOAD(bc->b00+k*SIMDD);
+                v[k]  = MM_LOAD(bc->b01+k*SIMDD);
+                w[k]  = MM_LOAD(g+(48+k)*SIMDD);
+        }
+
+        /* chunk k only touches g entries of the form base+k */
+        for (k = 0; k < 4; k++) {
+                MM_STORE(g+(4 +k)*SIMDD, ax[k]);
+                MM_STORE(g+(8 +k)*SIMDD, px[k]);
+                MM_STORE(g+(28+k)*SIMDD, ay[k]);
+                MM_STORE(g+(32+k)*SIMDD, py[k]);
+                MM_STORE(g+(52+k)*SIMDD, MM_MUL(az[k], w[k]));
+                MM_STORE(g+(56+k)*SIMDD, MM_MUL(pz[k], w[k]));
+                MM_STORE(g+(12+k)*SIMDD, MM_FMA(ax[k], px[k], u[k]));
+                MM_STORE(g+(16+k)*SIMDD, MM_FMA(px[k], px[k], v[k]));
+                MM_STORE(g+(36+k)*SIMDD, MM_FMA(ay[k], py[k], u[k]));
+                MM_STORE(g+(40+k)*SIMDD, MM_FMA(py[k], py[k], v[k]));
+                MM_STORE(g+(60+k)*SIMDD, MM_MUL(MM_FMA(az[k], pz[k], u[k]), w[k]));
+                MM_STORE(g+(64+k)*SIMDD, MM_MUL(MM_FMA(pz[k], pz[k], v[k]), w[k]));
+                /* highest-order entries reuse values stored just above */
+                MM_STORE(g+(20+k)*SIMDD, px[k] *(MM_LOAD(g+(12+k)*SIMDD) + u[k]) + v[k] * ax[k]);
+                MM_STORE(g+(44+k)*SIMDD, py[k] *(MM_LOAD(g+(36+k)*SIMDD) + u[k]) + v[k] * ay[k]);
+                MM_STORE(g+(68+k)*SIMDD, pz[k] * MM_LOAD(g+(60+k)*SIMDD) + v[k] * MM_LOAD(g+(52+k)*SIMDD) + u[k] * MM_LOAD(g+(56+k)*SIMDD));
+        }
+}
+
+/*
+ * Kernel "1100".  Inputs are bc->c00x/y/z and bc->b10 (four chunks each),
+ * the three scalars envs->rirj[0..2] broadcast with MM_SET1, and the
+ * entry values of g[48..51], which scale every z entry.
+ */
+static inline void _srg0_2d4d_1100(double *g, Rys2eT *bc, CINTEnvVars *envs)
+{
+        double *rij = envs->rirj;
+        __MD dx = MM_SET1(rij[0]);
+        __MD dy = MM_SET1(rij[1]);
+        __MD dz = MM_SET1(rij[2]);
+        __MD ax[4], ay[4], az[4];       /* chunks of bc->c00x/y/z */
+        __MD tx[4], ty[4], tz[4];       /* rirj component + c00 chunk */
+        __MD v[4];                      /* chunks of bc->b10 */
+        __MD w[4];                      /* g[48..51] on entry */
+        int k;
+
+        for (k = 0; k < 4; k++) {
+                ax[k] = MM_LOAD(bc->c00x+k*SIMDD);
+                ay[k] = MM_LOAD(bc->c00y+k*SIMDD);
+                az[k] = MM_LOAD(bc->c00z+k*SIMDD);
+                tx[k] = MM_ADD(dx, ax[k]);
+                ty[k] = MM_ADD(dy, ay[k]);
+                tz[k] = MM_ADD(dz, az[k]);
+                v[k]  = MM_LOAD(bc->b10+k*SIMDD);
+                w[k]  = MM_LOAD(g+(48+k)*SIMDD);
+        }
+
+        for (k = 0; k < 4; k++) {
+                MM_STORE(g+(4 +k)*SIMDD, tx[k]);
+                MM_STORE(g+(8 +k)*SIMDD, ax[k]);
+                MM_STORE(g+(28+k)*SIMDD, ty[k]);
+                MM_STORE(g+(32+k)*SIMDD, ay[k]);
+                MM_STORE(g+(52+k)*SIMDD, MM_MUL(tz[k], w[k]));
+                MM_STORE(g+(56+k)*SIMDD, MM_MUL(az[k], w[k]));
+                MM_STORE(g+(12+k)*SIMDD, MM_FMA(tx[k], ax[k], v[k]));
+                MM_STORE(g+(36+k)*SIMDD, MM_FMA(ty[k], ay[k], v[k]));
+                MM_STORE(g+(60+k)*SIMDD, MM_MUL(MM_FMA(tz[k], az[k], v[k]), w[k]));
+        }
+}
+
+/*
+ * Kernel "1101".  Inputs are bc->c00x/y/z, bc->c0px/y/z, bc->b00 and
+ * bc->b10 (four chunks each), envs->rirj[0..2] broadcast with MM_SET1,
+ * and the entry values of g[96..99], which scale every z entry.
+ */
+static inline void _srg0_2d4d_1101(double *g, Rys2eT *bc, CINTEnvVars *envs)
+{
+        double *rij = envs->rirj;
+        __MD dx = MM_SET1(rij[0]);
+        __MD dy = MM_SET1(rij[1]);
+        __MD dz = MM_SET1(rij[2]);
+        __MD ax[4], ay[4], az[4];       /* chunks of bc->c00x/y/z */
+        __MD tx[4], ty[4], tz[4];       /* rirj component + c00 chunk */
+        __MD px[4], py[4], pz[4];       /* chunks of bc->c0px/y/z */
+        __MD u[4], v[4];                /* chunks of bc->b00 and bc->b10 */
+        __MD w[4];                      /* g[96..99] on entry */
+        int k;
+
+        for (k = 0; k < 4; k++) {
+                ax[k] = MM_LOAD(bc->c00x+k*SIMDD);
+                ay[k] = MM_LOAD(bc->c00y+k*SIMDD);
+                az[k] = MM_LOAD(bc->c00z+k*SIMDD);
+                tx[k] = MM_ADD(dx, ax[k]);
+                ty[k] = MM_ADD(dy, ay[k]);
+                tz[k] = MM_ADD(dz, az[k]);
+                px[k] = MM_LOAD(bc->c0px+k*SIMDD);
+                py[k] = MM_LOAD(bc->c0py+k*SIMDD);
+                pz[k] = MM_LOAD(bc->c0pz+k*SIMDD);
+                u[k]  = MM_LOAD(bc->b00+k*SIMDD);
+                v[k]  = MM_LOAD(bc->b10+k*SIMDD);
+                w[k]  = MM_LOAD(g+(96+k)*SIMDD);
+        }
+
+        for (k = 0; k < 4; k++) {
+                MM_STORE(g+(4  +k)*SIMDD, tx[k]);
+                MM_STORE(g+(8  +k)*SIMDD, px[k]);
+                MM_STORE(g+(16 +k)*SIMDD, ax[k]);
+                MM_STORE(g+(52 +k)*SIMDD, ty[k]);
+                MM_STORE(g+(56 +k)*SIMDD, py[k]);
+                MM_STORE(g+(64 +k)*SIMDD, ay[k]);
+                MM_STORE(g+(100+k)*SIMDD, tz[k] * w[k]);
+                MM_STORE(g+(104+k)*SIMDD, pz[k] * w[k]);
+                MM_STORE(g+(112+k)*SIMDD, az[k] * w[k]);
+                MM_STORE(g+(12 +k)*SIMDD, px[k] * tx[k] + u[k]);
+                MM_STORE(g+(20 +k)*SIMDD, ax[k] * tx[k] + v[k]);
+                MM_STORE(g+(24 +k)*SIMDD, ax[k] * px[k] + u[k]);
+                MM_STORE(g+(60 +k)*SIMDD, py[k] * ty[k] + u[k]);
+                MM_STORE(g+(68 +k)*SIMDD, ay[k] * ty[k] + v[k]);
+                MM_STORE(g+(72 +k)*SIMDD, ay[k] * py[k] + u[k]);
+                MM_STORE(g+(108+k)*SIMDD,(pz[k] * tz[k] + u[k])* w[k]);
+                MM_STORE(g+(116+k)*SIMDD,(az[k] * tz[k] + v[k])* w[k]);
+                MM_STORE(g+(120+k)*SIMDD,(az[k] * pz[k] + u[k])* w[k]);
+                /* highest-order entries reuse values stored just above */
+                MM_STORE(g+(28 +k)*SIMDD, px[k] * MM_LOAD(g+(20 +k)*SIMDD) + u[k] *(tx[k] + ax[k]));
+                MM_STORE(g+(76 +k)*SIMDD, py[k] * MM_LOAD(g+(68 +k)*SIMDD) + u[k] *(ty[k] + ay[k]));
+                MM_STORE(g+(124+k)*SIMDD, pz[k] * MM_LOAD(g+(116+k)*SIMDD) + u[k] *(MM_LOAD(g+(100+k)*SIMDD) + MM_LOAD(g+(112+k)*SIMDD)));
+        }
+}
+
+/*
+ * Kernel "1110".  Its statements were identical, one for one, to those
+ * of _srg0_2d4d_1101 (same inputs, same g offsets, same expressions), so
+ * it forwards to that kernel instead of repeating them.
+ */
+static inline void _srg0_2d4d_1110(double *g, Rys2eT *bc, CINTEnvVars *envs)
+{
+        _srg0_2d4d_1101(g, bc, envs);
+}
+
+/*
+ * Kernel "1200".  Inputs are bc->c00x/y/z and bc->b10 (four chunks each),
+ * envs->rirj[0..2] broadcast with MM_SET1, and the entry values of
+ * g[64..67], which scale every z entry.
+ */
+static inline void _srg0_2d4d_1200(double *g, Rys2eT *bc, CINTEnvVars *envs)
+{
+        double *rij = envs->rirj;
+        __MD dx = MM_SET1(rij[0]);
+        __MD dy = MM_SET1(rij[1]);
+        __MD dz = MM_SET1(rij[2]);
+        __MD two = MM_SET1(2.);
+        __MD ax[4], ay[4], az[4];       /* chunks of bc->c00x/y/z */
+        __MD tx[4], ty[4], tz[4];       /* rirj component + c00 chunk */
+        __MD v[4];                      /* chunks of bc->b10 */
+        __MD w[4];                      /* g[64..67] on entry */
+        int k;
+
+        for (k = 0; k < 4; k++) {
+                ax[k] = MM_LOAD(bc->c00x+k*SIMDD);
+                ay[k] = MM_LOAD(bc->c00y+k*SIMDD);
+                az[k] = MM_LOAD(bc->c00z+k*SIMDD);
+                tx[k] = MM_ADD(dx, ax[k]);
+                ty[k] = MM_ADD(dy, ay[k]);
+                tz[k] = MM_ADD(dz, az[k]);
+                v[k]  = MM_LOAD(bc->b10+k*SIMDD);
+                w[k]  = MM_LOAD(g+(64+k)*SIMDD);
+        }
+
+        for (k = 0; k < 4; k++) {
+                MM_STORE(g+(4 +k)*SIMDD, tx[k]);
+                MM_STORE(g+(8 +k)*SIMDD, ax[k]);
+                MM_STORE(g+(36+k)*SIMDD, ty[k]);
+                MM_STORE(g+(40+k)*SIMDD, ay[k]);
+                MM_STORE(g+(68+k)*SIMDD, MM_MUL(tz[k], w[k]));
+                MM_STORE(g+(72+k)*SIMDD, MM_MUL(az[k], w[k]));
+                MM_STORE(g+(12+k)*SIMDD, MM_FMA(tx[k], ax[k], v[k]));
+                MM_STORE(g+(16+k)*SIMDD, MM_FMA(ax[k], ax[k], v[k]));
+                MM_STORE(g+(44+k)*SIMDD, MM_FMA(ty[k], ay[k], v[k]));
+                MM_STORE(g+(48+k)*SIMDD, MM_FMA(ay[k], ay[k], v[k]));
+                MM_STORE(g+(76+k)*SIMDD, MM_MUL(MM_FMA(tz[k], az[k], v[k]), w[k]));
+                MM_STORE(g+(80+k)*SIMDD, MM_MUL(MM_FMA(az[k], az[k], v[k]), w[k]));
+                /* highest-order entries reuse values stored just above */
+                MM_STORE(g+(20+k)*SIMDD, tx[k] * MM_LOAD(g+(16+k)*SIMDD) + two * v[k] * ax[k]);
+                MM_STORE(g+(52+k)*SIMDD, ty[k] * MM_LOAD(g+(48+k)*SIMDD) + two * v[k] * ay[k]);
+                MM_STORE(g+(84+k)*SIMDD, tz[k] * MM_LOAD(g+(80+k)*SIMDD) + two * v[k] * MM_LOAD(g+(72+k)*SIMDD));
+        }
+}
+
+/*
+ * Kernel "2000".  Inputs are bc->c00x/y/z and bc->b10 (four chunks each)
+ * and the entry values of g[24..27], which scale every z entry.
+ * envs is not used.
+ */
+static inline void _srg0_2d4d_2000(double *g, Rys2eT *bc, CINTEnvVars *envs)
+{
+        __MD ax[4], ay[4], az[4];       /* chunks of bc->c00x/y/z */
+        __MD v[4];                      /* chunks of bc->b10 */
+        __MD w[4];                      /* g[24..27] on entry */
+        int k;
+
+        for (k = 0; k < 4; k++) {
+                ax[k] = MM_LOAD(bc->c00x+k*SIMDD);
+                ay[k] = MM_LOAD(bc->c00y+k*SIMDD);
+                az[k] = MM_LOAD(bc->c00z+k*SIMDD);
+                v[k]  = MM_LOAD(bc->b10+k*SIMDD);
+                w[k]  = MM_LOAD(g+(24+k)*SIMDD);
+        }
+
+        for (k = 0; k < 4; k++) {
+                MM_STORE(g+(4 +k)*SIMDD, ax[k]);
+                MM_STORE(g+(16+k)*SIMDD, ay[k]);
+                MM_STORE(g+(28+k)*SIMDD, MM_MUL(az[k], w[k]));
+                MM_STORE(g+(8 +k)*SIMDD, MM_FMA(ax[k], ax[k], v[k]));
+                MM_STORE(g+(20+k)*SIMDD, MM_FMA(ay[k], ay[k], v[k]));
+                MM_STORE(g+(32+k)*SIMDD, MM_MUL(MM_FMA(az[k], az[k], v[k]), w[k]));
+        }
+}
+
+/*
+ * Kernel "2001".  Inputs are bc->c00x/y/z, bc->c0px/y/z, bc->b00 and
+ * bc->b10 (four chunks each) and the entry values of g[48..51], which
+ * scale every z entry.  envs is not used.
+ */
+static inline void _srg0_2d4d_2001(double *g, Rys2eT *bc, CINTEnvVars *envs)
+{
+        __MD ax[4], ay[4], az[4];       /* chunks of bc->c00x/y/z */
+        __MD px[4], py[4], pz[4];       /* chunks of bc->c0px/y/z */
+        __MD u[4], v[4];                /* chunks of bc->b00 and bc->b10 */
+        __MD w[4];                      /* g[48..51] on entry */
+        int k;
+
+        for (k = 0; k < 4; k++) {
+                ax[k] = MM_LOAD(bc->c00x+k*SIMDD);
+                ay[k] = MM_LOAD(bc->c00y+k*SIMDD);
+                az[k] = MM_LOAD(bc->c00z+k*SIMDD);
+                px[k] = MM_LOAD(bc->c0px+k*SIMDD);
+                py[k] = MM_LOAD(bc->c0py+k*SIMDD);
+                pz[k] = MM_LOAD(bc->c0pz+k*SIMDD);
+                u[k]  = MM_LOAD(bc->b00+k*SIMDD);
+                v[k]  = MM_LOAD(bc->b10+k*SIMDD);
+                w[k]  = MM_LOAD(g+(48+k)*SIMDD);
+        }
+
+        for (k = 0; k < 4; k++) {
+                MM_STORE(g+(4 +k)*SIMDD, ax[k]);
+                MM_STORE(g+(12+k)*SIMDD, px[k]);
+                MM_STORE(g+(28+k)*SIMDD, ay[k]);
+                MM_STORE(g+(36+k)*SIMDD, py[k]);
+                MM_STORE(g+(52+k)*SIMDD, MM_MUL(az[k], w[k]));
+                MM_STORE(g+(60+k)*SIMDD, MM_MUL(pz[k], w[k]));
+                MM_STORE(g+(8 +k)*SIMDD, MM_FMA(ax[k], ax[k], v[k]));
+                MM_STORE(g+(16+k)*SIMDD, MM_FMA(px[k], ax[k], u[k]));
+                MM_STORE(g+(32+k)*SIMDD, MM_FMA(ay[k], ay[k], v[k]));
+                MM_STORE(g+(40+k)*SIMDD, MM_FMA(py[k], ay[k], u[k]));
+                MM_STORE(g+(56+k)*SIMDD, MM_MUL(MM_FMA(az[k], az[k], v[k]), w[k]));
+                MM_STORE(g+(64+k)*SIMDD, MM_MUL(MM_FMA(pz[k], az[k], u[k]), w[k]));
+                /* highest-order entries reuse values stored just above */
+                MM_STORE(g+(20+k)*SIMDD, ax[k] *(MM_LOAD(g+(16+k)*SIMDD) + u[k]) + v[k] * px[k]);
+                MM_STORE(g+(44+k)*SIMDD, ay[k] *(MM_LOAD(g+(40+k)*SIMDD) + u[k]) + v[k] * py[k]);
+                MM_STORE(g+(68+k)*SIMDD, az[k] * MM_LOAD(g+(64+k)*SIMDD) + u[k] * MM_LOAD(g+(52+k)*SIMDD) + v[k] * MM_LOAD(g+(60+k)*SIMDD));
+        }
+}
+
+/*
+ * Kernel "2010".  Its statements were identical, one for one, to those
+ * of _srg0_2d4d_2001 (same inputs, same g offsets, same expressions), so
+ * it forwards to that kernel instead of repeating them.
+ */
+static inline void _srg0_2d4d_2010(double *g, Rys2eT *bc, CINTEnvVars *envs)
+{
+        _srg0_2d4d_2001(g, bc, envs);
+}
+
+/*
+ * Kernel "2100".  Inputs are bc->c00x/y/z and bc->b10 (four chunks each),
+ * envs->rirj[0..2] broadcast with MM_SET1, and the entry values of
+ * g[64..67], which scale every z entry.
+ */
+static inline void _srg0_2d4d_2100(double *g, Rys2eT *bc, CINTEnvVars *envs)
+{
+        double *rij = envs->rirj;
+        __MD dx = MM_SET1(rij[0]);
+        __MD dy = MM_SET1(rij[1]);
+        __MD dz = MM_SET1(rij[2]);
+        __MD two = MM_SET1(2.);
+        __MD ax[4], ay[4], az[4];       /* chunks of bc->c00x/y/z */
+        __MD tx[4], ty[4], tz[4];       /* rirj component + c00 chunk */
+        __MD v[4];                      /* chunks of bc->b10 */
+        __MD w[4];                      /* g[64..67] on entry */
+        int k;
+
+        for (k = 0; k < 4; k++) {
+                ax[k] = MM_LOAD(bc->c00x+k*SIMDD);
+                ay[k] = MM_LOAD(bc->c00y+k*SIMDD);
+                az[k] = MM_LOAD(bc->c00z+k*SIMDD);
+                tx[k] = MM_ADD(dx, ax[k]);
+                ty[k] = MM_ADD(dy, ay[k]);
+                tz[k] = MM_ADD(dz, az[k]);
+                v[k]  = MM_LOAD(bc->b10+k*SIMDD);
+                w[k]  = MM_LOAD(g+(64+k)*SIMDD);
+        }
+
+        for (k = 0; k < 4; k++) {
+                MM_STORE(g+(4 +k)*SIMDD, ax[k]);
+                MM_STORE(g+(16+k)*SIMDD, tx[k]);
+                MM_STORE(g+(36+k)*SIMDD, ay[k]);
+                MM_STORE(g+(48+k)*SIMDD, ty[k]);
+                MM_STORE(g+(68+k)*SIMDD, az[k] * w[k]);
+                MM_STORE(g+(80+k)*SIMDD, tz[k] * w[k]);
+                MM_STORE(g+(8 +k)*SIMDD, ax[k] * ax[k] + v[k]);
+                MM_STORE(g+(20+k)*SIMDD, ax[k] * tx[k] + v[k]);
+                MM_STORE(g+(40+k)*SIMDD, ay[k] * ay[k] + v[k]);
+                MM_STORE(g+(52+k)*SIMDD, ay[k] * ty[k] + v[k]);
+                MM_STORE(g+(72+k)*SIMDD,(az[k] * az[k] + v[k]) * w[k]);
+                MM_STORE(g+(84+k)*SIMDD,(az[k] * tz[k] + v[k]) * w[k]);
+                /* highest-order entries reuse values stored just above */
+                MM_STORE(g+(24+k)*SIMDD, tx[k] * MM_LOAD(g+(8 +k)*SIMDD) + two * v[k] * ax[k]);
+                MM_STORE(g+(56+k)*SIMDD, ty[k] * MM_LOAD(g+(40+k)*SIMDD) + two * v[k] * ay[k]);
+                MM_STORE(g+(88+k)*SIMDD, tz[k] * MM_LOAD(g+(72+k)*SIMDD) + two * v[k] * MM_LOAD(g+(68+k)*SIMDD));
+        }
+}
+
+static inline void _srg0_2d4d_3000(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cx = bc->c00x; + double *cy = bc->c00y; + double *cz = bc->c00z; + double *b = bc->b10; + __MD b0 = MM_LOAD(b+0*SIMDD); + __MD b1 = MM_LOAD(b+1*SIMDD); + __MD b2 = MM_LOAD(b+2*SIMDD); + __MD b3 = MM_LOAD(b+3*SIMDD); + __MD i3 = MM_SET1(3.); + __MD r0 = MM_LOAD(cx+0*SIMDD); + __MD r1 = MM_LOAD(cx+1*SIMDD); + __MD r2 = MM_LOAD(cx+2*SIMDD); + __MD r3 = MM_LOAD(cx+3*SIMDD); + __MD r4 = MM_LOAD(cy+0*SIMDD); + __MD r5 = MM_LOAD(cy+1*SIMDD); + __MD r6 = MM_LOAD(cy+2*SIMDD); + __MD r7 = MM_LOAD(cy+3*SIMDD); + __MD r8 = MM_LOAD(cz+0*SIMDD); + __MD r9 = MM_LOAD(cz+1*SIMDD); + __MD r10= MM_LOAD(cz+2*SIMDD); + __MD r11= MM_LOAD(cz+3*SIMDD); + __MD g32 = MM_LOAD(g+32*SIMDD); + __MD g33 = MM_LOAD(g+33*SIMDD); + __MD g34 = MM_LOAD(g+34*SIMDD); + __MD g35 = MM_LOAD(g+35*SIMDD); + MM_STORE(g+4 *SIMDD, r0); + MM_STORE(g+5 *SIMDD, r1); + MM_STORE(g+6 *SIMDD, r2); +
MM_STORE(g+7 *SIMDD, r3); + MM_STORE(g+20*SIMDD, r4); + MM_STORE(g+21*SIMDD, r5); + MM_STORE(g+22*SIMDD, r6); + MM_STORE(g+23*SIMDD, r7); + MM_STORE(g+36*SIMDD, r8 * g32); + MM_STORE(g+37*SIMDD, r9 * g33); + MM_STORE(g+38*SIMDD, r10* g34); + MM_STORE(g+39*SIMDD, r11* g35); + MM_STORE(g+8 *SIMDD, r0 * r0 + b0); + MM_STORE(g+9 *SIMDD, r1 * r1 + b1); + MM_STORE(g+10*SIMDD, r2 * r2 + b2); + MM_STORE(g+11*SIMDD, r3 * r3 + b3); + MM_STORE(g+24*SIMDD, r4 * r4 + b0); + MM_STORE(g+25*SIMDD, r5 * r5 + b1); + MM_STORE(g+26*SIMDD, r6 * r6 + b2); + MM_STORE(g+27*SIMDD, r7 * r7 + b3); + MM_STORE(g+40*SIMDD,(r8 * r8 + b0)* g32); + MM_STORE(g+41*SIMDD,(r9 * r9 + b1)* g33); + MM_STORE(g+42*SIMDD,(r10* r10+ b2)* g34); + MM_STORE(g+43*SIMDD,(r11* r11+ b3)* g35); + MM_STORE(g+12*SIMDD, r0 *(r0 * r0 + i3 * b0)); + MM_STORE(g+13*SIMDD, r1 *(r1 * r1 + i3 * b1)); + MM_STORE(g+14*SIMDD, r2 *(r2 * r2 + i3 * b2)); + MM_STORE(g+15*SIMDD, r3 *(r3 * r3 + i3 * b3)); + MM_STORE(g+28*SIMDD, r4 *(r4 * r4 + i3 * b0)); + MM_STORE(g+29*SIMDD, r5 *(r5 * r5 + i3 * b1)); + MM_STORE(g+30*SIMDD, r6 *(r6 * r6 + i3 * b2)); + MM_STORE(g+31*SIMDD, r7 *(r7 * r7 + i3 * b3)); + MM_STORE(g+44*SIMDD,(r8 * r8 + i3 * b0) * MM_LOAD(g+36*SIMDD)); + MM_STORE(g+45*SIMDD,(r9 * r9 + i3 * b1) * MM_LOAD(g+37*SIMDD)); + MM_STORE(g+46*SIMDD,(r10*r10 + i3 * b2) * MM_LOAD(g+38*SIMDD)); + MM_STORE(g+47*SIMDD,(r11*r11 + i3 * b3) * MM_LOAD(g+39*SIMDD)); +} + +void CINTsrg0_2e_2d4d_unrolled(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + int type_ijkl = ((envs->li_ceil << 6) | (envs->lj_ceil << 4) | + (envs->lk_ceil << 2) | (envs->ll_ceil)); + switch (type_ijkl) { + case 0b00000000: _srg0_2d4d_0000(g, bc, envs); return; + case 0b00000001: _srg0_2d4d_0001(g, bc, envs); return; + case 0b00000010: _srg0_2d4d_0002(g, bc, envs); return; + case 0b00000011: _srg0_2d4d_0003(g, bc, envs); return; + case 0b00000100: _srg0_2d4d_0010(g, bc, envs); return; + case 0b00000101: _srg0_2d4d_0011(g, bc, envs); return; + case 0b00000110: 
_srg0_2d4d_0012(g, bc, envs); return; + case 0b00001000: _srg0_2d4d_0020(g, bc, envs); return; + case 0b00001001: _srg0_2d4d_0021(g, bc, envs); return; + case 0b00001100: _srg0_2d4d_0030(g, bc, envs); return; + case 0b00010000: _srg0_2d4d_0100(g, bc, envs); return; + case 0b00010001: _srg0_2d4d_0101(g, bc, envs); return; + case 0b00010010: _srg0_2d4d_0102(g, bc, envs); return; + case 0b00010100: _srg0_2d4d_0110(g, bc, envs); return; + case 0b00010101: _srg0_2d4d_0111(g, bc, envs); return; + case 0b00011000: _srg0_2d4d_0120(g, bc, envs); return; + case 0b00100000: _srg0_2d4d_0200(g, bc, envs); return; + case 0b00100001: _srg0_2d4d_0201(g, bc, envs); return; + case 0b00100100: _srg0_2d4d_0210(g, bc, envs); return; + case 0b00110000: _srg0_2d4d_0300(g, bc, envs); return; + case 0b01000000: _srg0_2d4d_1000(g, bc, envs); return; + case 0b01000001: _srg0_2d4d_1001(g, bc, envs); return; + case 0b01000010: _srg0_2d4d_1002(g, bc, envs); return; + case 0b01000100: _srg0_2d4d_1010(g, bc, envs); return; + case 0b01000101: _srg0_2d4d_1011(g, bc, envs); return; + case 0b01001000: _srg0_2d4d_1020(g, bc, envs); return; + case 0b01010000: _srg0_2d4d_1100(g, bc, envs); return; + case 0b01010001: _srg0_2d4d_1101(g, bc, envs); return; + case 0b01010100: _srg0_2d4d_1110(g, bc, envs); return; + case 0b01100000: _srg0_2d4d_1200(g, bc, envs); return; + case 0b10000000: _srg0_2d4d_2000(g, bc, envs); return; + case 0b10000001: _srg0_2d4d_2001(g, bc, envs); return; + case 0b10000100: _srg0_2d4d_2010(g, bc, envs); return; + case 0b10010000: _srg0_2d4d_2100(g, bc, envs); return; + case 0b11000000: _srg0_2d4d_3000(g, bc, envs); return; } + fprintf(stderr, "Dimension error for CINTg0_2e_lj2d4d: iklj = %d %d %d %d", + (int)envs->li_ceil, (int)envs->lk_ceil, + (int)envs->ll_ceil, (int)envs->lj_ceil); +} - r1 = MM_LOAD(a1); - r0 = MM_DIV(MM_LOAD(a0), MM_MUL(r1, MM_MUL(r1, r1))); +void CINTg0_2e_lj2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + CINTg0_2e_2d(g, bc, envs); + CINTg0_lj_4d(g, envs); 
+} +void CINTg0_2e_kj2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + CINTg0_2e_2d(g, bc, envs); + CINTg0_kj_4d(g, envs); +} +void CINTg0_2e_ik2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + CINTg0_2e_2d(g, bc, envs); + CINTg0_ik_4d(g, envs); +} +void CINTg0_2e_il2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs) +{ + CINTg0_2e_2d(g, bc, envs); + CINTg0_il_4d(g, envs); +} + +/* + * g[i,k,l,j] = < ik | lj > = ( i j | k l ) + */ +int CINTg0_2e(double *g, double *cutoff, Rys2eT *bc, CINTEnvVars *envs, int count) +{ + ALIGNMM double a0[SIMDD]; + ALIGNMM double a1[SIMDD]; + ALIGNMM double fac1[SIMDD]; + ALIGNMM double x[SIMDD]; + ALIGNMM double rijrkl[SIMDD*3]; + ALIGNMM double rijrx[SIMDD*3]; + ALIGNMM double rklrx[SIMDD*3]; + ALIGNMM double u[MXRYSROOTS*SIMDD]; + DEF_GXYZ(double, g, gx, gy, gz); + double *rij = envs->rij; + double *rkl = envs->rkl; + double *w = gz; + __MD ra, r0, r1, r2, r3, r4, r5, r6, r7, r8; + int nroots = envs->nrys_roots; + int i; + + //:for (int k = 0; k < count; k++) { + //: aij[k] = envs->ai[k] + envs->aj[k]; + //: akl[k] = envs->ak[k] + envs->al[k]; + //: aijkl[k] = aij[k] + akl[k]; + //: a1[k] = aij[k] * akl[k]; + //: a0[k] = a1[k] / aijkl[k]; + //: //fac1[k] = sqrt(a0[k] / (a1[k] * a1[k] * a1[k])) * envs->fac[k]; + //: fac1[k] = envs->fac[k] / (sqrt(aijakl[k]) * a1[k]); + //:} + __MD aij = MM_ADD(MM_LOAD(envs->ai), MM_LOAD(envs->aj)); + __MD akl = MM_ADD(MM_LOAD(envs->ak), MM_LOAD(envs->al)); + r1 = MM_MUL(aij, akl); + MM_STORE(a1, r1); + ra = MM_ADD(aij, akl); + r0 = MM_DIV(r1, ra); + MM_STORE(a0, r0); + + r0 = MM_DIV(r0, MM_MUL(r1, MM_MUL(r1, r1))); MM_STORE(fac1, MM_MUL(MM_SQRT(r0), MM_LOAD(envs->fac))); -#else - MM_STORE(fac1, MM_DIV(MM_LOAD(envs->fac), MM_MUL(MM_SQRT(ra), r1))); -#endif r0 = MM_LOAD(rij+0*SIMDD); r1 = MM_LOAD(rij+1*SIMDD); @@ -2475,39 +5473,91 @@ int CINTg0_2e(double *g, double *cutoff, Rys2eT *bc, CINTEnvVars *envs, int coun MM_STORE(rklrx+1*SIMDD, MM_SUB(r4, MM_SET1(envs->rx_in_rklrx[1]))); 
MM_STORE(rklrx+2*SIMDD, MM_SUB(r5, MM_SET1(envs->rx_in_rklrx[2]))); -//ABORT ALIGNMM double erfx[SIMDD]; -//ABORT if (envs->g_size == 1) { -//ABORT // gz = erf(sqrt(rr*aij*akl/(aij+akl)))/sqrt(rr) -//ABORT MM_STORE(erfx, MM_SQRT(MM_LOAD(x))); -//ABORT for (k = 0; k < count; k++) { -//ABORT erfx[k] = erf(erfx[k]); -//ABORT } -//ABORT MM_STORE(gz, MM_DIV(MM_LOAD(erfx), MM_SQRT(MM_LOAD(x)))); -//ABORT return; -//ABORT } -#ifdef WITH_RANGE_COULOMB - if (omega < 0) { - int all_negligible = _CINTsr_rys_roots_batch( - envs, x, theta, u, w, cutoff, count); - if (all_negligible) { - // g still has to be evaluated since iempty (which - // indicates whether g is zero) in cint2e is determined - // before calling the g0_2e function. - for (i = 0; i < envs->g_size * SIMDD * 3; i++) { - g[i] = 0; +// Not recommended to mix range-separated Coulomb with regular Coulomb operator. +// Keep this for backward compatibility to cint2 + const double omega = envs->env[PTR_RANGE_OMEGA]; + ALIGNMM double theta[SIMDD]; + if (omega == 0) { + _CINTrys_roots_batch(nroots, x, u, w, count); + } else if (omega < 0) { + //:theta = omega * omega / (omega * omega + a0); + r0 = MM_SET1(omega); + r0 = MM_MUL(r0, r0); + r1 = MM_LOAD(a0); + r0 = MM_DIV(r0, MM_ADD(r0, r1)); + MM_STORE(theta, r0); + int rorder = envs->rys_order; + int all_negligible = 1; + if (rorder == nroots) { + all_negligible = _CINTsr_rys_roots_batch( + envs, x, theta, u, w, cutoff, count); + if (all_negligible) { + return 0; + } + } else { + r0 = MM_SET1(0.); + for (i = 0; i < nroots; i++) { + MM_STORE(u+i*SIMDD, r0); + MM_STORE(w+i*SIMDD, r0); + } + ALIGNMM double xt[SIMDD]; + __MD rtheta = MM_LOAD(theta); + __MD rtheta1 = -MM_SQRT(rtheta); + MM_STORE(xt, MM_MUL(rtheta, MM_LOAD(x))); + double rt[MXRYSROOTS * 2]; + double *wt = rt + nroots; + int k; + for (i = 0; i < count; i++) { + if (xt[i] < cutoff[i] && xt[i] < EXPCUTOFF_SR) { + CINTrys_roots(rorder, x[i], rt, wt); + CINTrys_roots(rorder, xt[i], rt+rorder, wt+rorder); + for (k = 
0; k < nroots; k++) { + u[k*SIMDD+i] = rt[k]; + w[k*SIMDD+i] = wt[k]; + } + all_negligible = 0; + } + } + if (all_negligible) { + return 0; + } + + if (envs->g_size == 2) { + r0 = MM_LOAD(fac1); + r1 = MM_SET1(1.); + MM_STORE(g+0*SIMDD, r1); + MM_STORE(g+1*SIMDD, r1); + MM_STORE(g+2*SIMDD, r1); + MM_STORE(g+3*SIMDD, r1); + MM_STORE(g+4*SIMDD, MM_LOAD(w ) * r0); + MM_STORE(g+5*SIMDD, MM_LOAD(w+SIMDD) * r0 * rtheta1); + return 1; + } + r1 = MM_SET1(1.); + for (i = rorder; i < nroots; i++) { + r0 = MM_LOAD(u+i*SIMDD); + r2 = r0 * rtheta; + MM_STORE(u+i*SIMDD, MM_DIV(r2, r0+r1-r2)); + MM_STORE(w+i*SIMDD, MM_LOAD(w+i*SIMDD) * rtheta1); } - return 0; } } else { + //:theta = omega * omega / (omega * omega + a0); + r0 = MM_SET1(omega); + r0 = MM_MUL(r0, r0); + r1 = MM_LOAD(a0); + __MD rtheta = MM_DIV(r0, MM_ADD(r0, r1)); + MM_STORE(x, MM_MUL(rtheta, MM_LOAD(x))); + MM_STORE(fac1, MM_MUL(MM_LOAD(fac1), MM_SQRT(rtheta))); _CINTrys_roots_batch(nroots, x, u, w, count); + r1 = MM_SET1(1.); + for (i = 0; i < nroots; i++) { + r0 = MM_LOAD(u+i*SIMDD); + r2 = r0 * rtheta; + MM_STORE(u+i*SIMDD, MM_DIV(r2, r0+r1-r2)); + } } -#else - _CINTrys_roots_batch(nroots, x, u, w, count); -#endif - double *gx = g; - double *gy = gx + envs->g_size * SIMDD; - double *gz = gy + envs->g_size * SIMDD; //:for (i = 0; i < nroots; i++) { //:for (k = 0; k < count; k++) { //: gx[i*SIMDD+k] = 1; @@ -2517,33 +5567,14 @@ int CINTg0_2e(double *g, double *cutoff, Rys2eT *bc, CINTEnvVars *envs, int coun r0 = MM_LOAD(fac1); r1 = MM_SET1(1.); for (i = 0; i < nroots; i++) { - //MM_STORE(gx+i*SIMDD, r1); - //MM_STORE(gy+i*SIMDD, r1); + MM_STORE(gx+i*SIMDD, r1); + MM_STORE(gy+i*SIMDD, r1); MM_STORE(gz+i*SIMDD, MM_MUL(MM_LOAD(w+i*SIMDD), r0)); } if (envs->g_size == 1) { // ssss return 1; } -#ifdef WITH_RANGE_COULOMB - if (omega > 0) { - /* u[:] = tau^2 / (1 - tau^2) - * transform u[:] to theta^-1 tau^2 / (theta^-1 - tau^2) - * so the rest code can be reused. 
- */ - //:for (irys = 0; irys < nroots; irys++) { - //: u[irys] /= u[irys] + 1 - u[irys] * theta; - //:} - r0 = MM_LOAD(theta); - r1 = MM_SET1(1.); - for (i = 0; i < nroots; i++) { - r2 = MM_LOAD(u+i*SIMDD); - r3 = r2 + r1 - r2 * r0; - MM_STORE(u+i*SIMDD, MM_DIV(r2, r3)); - } - } -#endif - double *b00 = bc->b00; double *b10 = bc->b10; double *b01 = bc->b01; @@ -2553,9 +5584,6 @@ int CINTg0_2e(double *g, double *cutoff, Rys2eT *bc, CINTEnvVars *envs, int coun double *c0px = bc->c0px; double *c0py = bc->c0py; double *c0pz = bc->c0pz; - - ALIGNMM double tmp1[MXRYSROOTS*SIMDD]; - ALIGNMM double tmp4[MXRYSROOTS*SIMDD]; //:double u2[SIMDD]; //:double tmp2, tmp3; //:double div[SIMDD]; @@ -2579,49 +5607,39 @@ int CINTg0_2e(double *g, double *cutoff, Rys2eT *bc, CINTEnvVars *envs, int coun //: c0pz[i*SIMDD+k] = rklrx[2*SIMDD+k] + tmp3 * rijrkl[2*SIMDD+k]; //:} } - ra = MM_LOAD(aijkl); + ra = MM_ADD(aij, akl); r0 = MM_LOAD(a0); r1 = MM_LOAD(a1); r2 = MM_SET1(.5); r3 = MM_SET1(1.); - for (i = 0; i < nroots; i++) { - r4 = MM_MUL(r0, MM_LOAD(u+i*SIMDD)); - r5 = MM_DIV(r3, MM_FMA(r4, ra, r1)); - MM_STORE(tmp4+i*SIMDD, MM_MUL(r2, r5)); - MM_STORE(tmp1+i*SIMDD, MM_MUL(r4, r5)); - } - ra = MM_SET1(.5); - r2 = MM_LOAD(akl); - r3 = MM_LOAD(aij); - for (i = 0; i < nroots; i++) { - r0 = MM_MUL(ra, MM_LOAD(tmp1+i*SIMDD)); - MM_STORE(b00+i*SIMDD, r0); - r1 = MM_LOAD(tmp4+i*SIMDD); - MM_STORE(b10+i*SIMDD, MM_FMA(r1, r2, r0)); - MM_STORE(b01+i*SIMDD, MM_FMA(r1, r3, r0)); - } r4 = MM_LOAD(rijrkl+0*SIMDD); r5 = MM_LOAD(rijrkl+1*SIMDD); r6 = MM_LOAD(rijrkl+2*SIMDD); - ra = MM_LOAD(akl); - r1 = MM_LOAD(rijrx+0*SIMDD); - r2 = MM_LOAD(rijrx+1*SIMDD); - r3 = MM_LOAD(rijrx+2*SIMDD); - for (i = 0; i < nroots; i++) { - r0 = MM_MUL(MM_LOAD(tmp1+i*SIMDD), ra); - MM_STORE(c00x+i*SIMDD, MM_FNMA(r0, r4, r1)); - MM_STORE(c00y+i*SIMDD, MM_FNMA(r0, r5, r2)); - MM_STORE(c00z+i*SIMDD, MM_FNMA(r0, r6, r3)); - } - ra = MM_LOAD(aij); - r1 = MM_LOAD(rklrx+0*SIMDD); - r2 = MM_LOAD(rklrx+1*SIMDD); - r3 = 
MM_LOAD(rklrx+2*SIMDD); + __MD _rijrx = MM_LOAD(rijrx+0*SIMDD); + __MD _rijry = MM_LOAD(rijrx+1*SIMDD); + __MD _rijrz = MM_LOAD(rijrx+2*SIMDD); + __MD _rklrx = MM_LOAD(rklrx+0*SIMDD); + __MD _rklry = MM_LOAD(rklrx+1*SIMDD); + __MD _rklrz = MM_LOAD(rklrx+2*SIMDD); + __MD tmp1, tmp2, tmp3, tmp4, tmp5; for (i = 0; i < nroots; i++) { - r0 = MM_MUL(MM_LOAD(tmp1+i*SIMDD), ra); - MM_STORE(c0px+i*SIMDD, MM_FMA (r0, r4, r1)); - MM_STORE(c0py+i*SIMDD, MM_FMA (r0, r5, r2)); - MM_STORE(c0pz+i*SIMDD, MM_FMA (r0, r6, r3)); + tmp1 = MM_MUL(r0, MM_LOAD(u+i*SIMDD)); + tmp5 = MM_DIV(r3, MM_FMA(tmp1, ra, r1)); + tmp1 = MM_MUL(tmp1, tmp5); + tmp4 = MM_MUL(r2, tmp5); + tmp2 = MM_MUL(r2, tmp1); + MM_STORE(b00+i*SIMDD, tmp2); + MM_STORE(b10+i*SIMDD, MM_FMA(tmp4, akl, tmp2)); + MM_STORE(b01+i*SIMDD, MM_FMA(tmp4, aij, tmp2)); + + tmp2 = MM_MUL(tmp1, akl); + tmp3 = MM_MUL(tmp1, aij); + MM_STORE(c00x+i*SIMDD, MM_FNMA(tmp2, r4, _rijrx)); + MM_STORE(c00y+i*SIMDD, MM_FNMA(tmp2, r5, _rijry)); + MM_STORE(c00z+i*SIMDD, MM_FNMA(tmp2, r6, _rijrz)); + MM_STORE(c0px+i*SIMDD, MM_FMA (tmp3, r4, _rklrx)); + MM_STORE(c0py+i*SIMDD, MM_FMA (tmp3, r5, _rklry)); + MM_STORE(c0pz+i*SIMDD, MM_FMA (tmp3, r6, _rklrz)); } (*envs->f_g0_2d4d)(g, bc, envs); diff --git a/src/g2e.h b/src/g2e.h index 421665d..c0a4149 100644 --- a/src/g2e.h +++ b/src/g2e.h @@ -33,8 +33,6 @@ typedef struct { ALIGNMM double b01[SIMDD*MXRYSROOTS]; ALIGNMM double b00[SIMDD*MXRYSROOTS]; ALIGNMM double b10[SIMDD*MXRYSROOTS]; - ALIGNMM double u[MXRYSROOTS*SIMDD]; - ALIGNMM double w[MXRYSROOTS*SIMDD]; } Rys2eT; #endif @@ -46,8 +44,6 @@ void CINTinit_int3c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, int *atm, int natm, int *bas, int nbas, double *env); void CINTinit_int2c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, int *atm, int natm, int *bas, int nbas, double *env); -void CINTinit_int2e_coulerf_EnvVars(CINTEnvVars *envs, int *ng, int *shls, - int *atm, int natm, int *bas, int nbas, double *env); void 
CINTinit_int2e_stg_EnvVars(CINTEnvVars *envs, int *ng, int *shls, int *atm, int natm, int *bas, int nbas, double *env); void CINTinit_int2e_yp_EnvVars(CINTEnvVars *envs, int *ng, int *shls, @@ -61,6 +57,8 @@ void CINTg0_2e_2d(double *g, Rys2eT *bc, CINTEnvVars *envs); void CINTg0_2e_2d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs); void CINTg0_2e_2d4d_unrolled(double *g, Rys2eT *bc, CINTEnvVars *envs); void CINTg0_2e_2d4d_unrolled_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs); +void CINTsrg0_2e_2d4d_unrolled(double *g, Rys2eT *bc, CINTEnvVars *envs); +void CINTsrg0_2e_2d4d_unrolled_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs); void CINTg0_2e_lj2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs); void CINTg0_2e_kj2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs); void CINTg0_2e_il2d4d(double *g, Rys2eT *bc, CINTEnvVars *envs); @@ -74,8 +72,6 @@ void CINTinit_int2e_stg_EnvVars(CINTEnvVars *envs, int *ng, int *shls, int *atm, int natm, int *bas, int nbas, double *env); void CINTinit_int2e_yp_EnvVars(CINTEnvVars *envs, int *ng, int *shls, int *atm, int natm, int *bas, int nbas, double *env); -void CINTinit_int2e_coulerf_EnvVars(CINTEnvVars *envs, int *ng, int *shls, - int *atm, int natm, int *bas, int nbas, double *env); void CINTnabla1i_2e(double *f, double *g, int li, int lj, int lk, int ll, CINTEnvVars *envs); @@ -120,16 +116,6 @@ void CINTinit_int2e_stg_EnvVars(CINTEnvVars *envs, int *ng, int *shls, int *atm, int natm, int *bas, int nbas, double *env); #endif -#ifdef WITH_GTG -void CINTinit_int2e_gtg_EnvVars(CINTEnvVars *envs, int *ng, int *shls, - int *atm, int natm, int *bas, int nbas, double *env); -void CINTinit_int3c2e_gtg_EnvVars(CINTEnvVars *envs, int *ng, int *shls, - int *atm, int natm, int *bas, int nbas, double *env); -void CINTinit_int2c2e_gtg_EnvVars(CINTEnvVars *envs, int *ng, int *shls, - int *atm, int natm, int *bas, int nbas, double *env); -#endif - - #define G2E_D_I(f, g, li, lj, lk, ll) CINTnabla1i_2e(f, g, li, lj, lk, ll, envs) #define G2E_D_J(f, 
g, li, lj, lk, ll) CINTnabla1j_2e(f, g, li, lj, lk, ll, envs) #define G2E_D_K(f, g, li, lj, lk, ll) CINTnabla1k_2e(f, g, li, lj, lk, ll, envs) diff --git a/src/g2e_coulerf.c b/src/g2e_coulerf.c deleted file mode 100644 index cc41e04..0000000 --- a/src/g2e_coulerf.c +++ /dev/null @@ -1,432 +0,0 @@ -/* - * Qcint is a general GTO integral library for computational chemistry - * Copyright (C) 2014- Qiming Sun - * - * This file is part of Qcint. - * - * Qcint is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -/* - * Attenuated coulomb operator exp(-w r_{12}^2) / r_{12} - */ - -#include -#include -#include -#include "config.h" -#include "cint_bas.h" -#include "simd.h" -#include "rys_roots.h" -#include "misc.h" -#include "g2e.h" - -int CINTg0_2e_coulerf(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs, int count); -int CINTg0_2e_coulerf_simd1(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs, int idsimd); - -void CINTinit_int2e_coulerf_EnvVars(CINTEnvVars *envs, int *ng, int *shls, - int *atm, int natm, int *bas, int nbas, double *env) -{ - envs->natm = natm; - envs->nbas = nbas; - envs->atm = atm; - envs->bas = bas; - envs->env = env; - envs->shls = shls; - - int i_sh = shls[0]; - int j_sh = shls[1]; - int k_sh = shls[2]; - int l_sh = shls[3]; - envs->i_l = bas(ANG_OF, i_sh); - envs->j_l = bas(ANG_OF, j_sh); - envs->k_l = bas(ANG_OF, k_sh); - envs->l_l = bas(ANG_OF, l_sh); - envs->x_ctr[0] = bas(NCTR_OF, i_sh); - envs->x_ctr[1] = bas(NCTR_OF, j_sh); - envs->x_ctr[2] = bas(NCTR_OF, k_sh); - envs->x_ctr[3] = bas(NCTR_OF, l_sh); - envs->nfi = (envs->i_l+1)*(envs->i_l+2)/2; - envs->nfj = (envs->j_l+1)*(envs->j_l+2)/2; - envs->nfk = (envs->k_l+1)*(envs->k_l+2)/2; - envs->nfl = (envs->l_l+1)*(envs->l_l+2)/2; - envs->nf = envs->nfi * envs->nfk * envs->nfl * envs->nfj; - envs->common_factor = 1; - if (env[PTR_EXPCUTOFF] == 0) { - envs->expcutoff = EXPCUTOFF; - } else { - envs->expcutoff = MAX(MIN_EXPCUTOFF, env[PTR_EXPCUTOFF]); - } - - envs->gbits = ng[GSHIFT]; - envs->ncomp_e1 = ng[POS_E1]; - envs->ncomp_e2 = ng[POS_E2]; - envs->ncomp_tensor = ng[TENSOR]; - - envs->li_ceil = envs->i_l + ng[IINC]; - envs->lj_ceil = envs->j_l + ng[JINC]; - envs->lk_ceil = envs->k_l + ng[KINC]; - envs->ll_ceil = envs->l_l + ng[LINC]; - - envs->ri = env + atm(PTR_COORD, bas(ATOM_OF, i_sh)); - envs->rj = env + atm(PTR_COORD, bas(ATOM_OF, j_sh)); - envs->rk = env + atm(PTR_COORD, bas(ATOM_OF, k_sh)); - envs->rl = env + atm(PTR_COORD, bas(ATOM_OF, l_sh)); - - int nroots = 
(envs->li_ceil + envs->lj_ceil + - envs->lk_ceil + envs->ll_ceil)/2 + 1; - envs->nrys_roots = nroots; - assert(nroots < MXRYSROOTS); - - int dli, dlj, dlk, dll; - int ibase = envs->li_ceil > envs->lj_ceil; - int kbase = envs->lk_ceil > envs->ll_ceil; - if (nroots <= 2) { // use the fully optimized lj_4d algorithm - ibase = 0; - kbase = 0; - } - if (kbase) { - if (ibase) { - envs->f_g0_2d4d = &CINTg0_2e_ik2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_ik2d4d_simd1; - } else { - envs->f_g0_2d4d = &CINTg0_2e_kj2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_kj2d4d_simd1; - } - } else { - if (ibase) { - envs->f_g0_2d4d = &CINTg0_2e_il2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_il2d4d_simd1; - } else { - envs->f_g0_2d4d = &CINTg0_2e_lj2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_lj2d4d_simd1; - } - } - envs->f_g0_2e = &CINTg0_2e_coulerf; - envs->f_g0_2e_simd1 = &CINTg0_2e_coulerf_simd1; - - if (kbase) { - dlk = envs->lk_ceil + envs->ll_ceil + 1; - dll = envs->ll_ceil + 1; - } else { - dlk = envs->lk_ceil + 1; - dll = envs->lk_ceil + envs->ll_ceil + 1; - } - - if (ibase) { - dli = envs->li_ceil + envs->lj_ceil + 1; - dlj = envs->lj_ceil + 1; - } else { - dli = envs->li_ceil + 1; - dlj = envs->li_ceil + envs->lj_ceil + 1; - } - envs->g_stride_i = nroots; - envs->g_stride_k = nroots * dli; - envs->g_stride_l = nroots * dli * dlk; - envs->g_stride_j = nroots * dli * dlk * dll; - envs->g_size = nroots * dli * dlk * dll * dlj; - - if (kbase) { - envs->g2d_klmax = envs->g_stride_k; - envs->rx_in_rklrx = envs->rk; - envs->rkrl[0] = envs->rk[0] - envs->rl[0]; - envs->rkrl[1] = envs->rk[1] - envs->rl[1]; - envs->rkrl[2] = envs->rk[2] - envs->rl[2]; - } else { - envs->g2d_klmax = envs->g_stride_l; - envs->rx_in_rklrx = envs->rl; - envs->rkrl[0] = envs->rl[0] - envs->rk[0]; - envs->rkrl[1] = envs->rl[1] - envs->rk[1]; - envs->rkrl[2] = envs->rl[2] - envs->rk[2]; - } - - if (ibase) { - envs->g2d_ijmax = envs->g_stride_i; - envs->rx_in_rijrx = envs->ri; - envs->rirj[0] = envs->ri[0] - 
envs->rj[0]; - envs->rirj[1] = envs->ri[1] - envs->rj[1]; - envs->rirj[2] = envs->ri[2] - envs->rj[2]; - } else { - envs->g2d_ijmax = envs->g_stride_j; - envs->rx_in_rijrx = envs->rj; - envs->rirj[0] = envs->rj[0] - envs->ri[0]; - envs->rirj[1] = envs->rj[1] - envs->ri[1]; - envs->rirj[2] = envs->rj[2] - envs->ri[2]; - } -} - -int CINTg0_2e_coulerf(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs, int count) -{ - ALIGNMM double aij[SIMDD]; - ALIGNMM double akl[SIMDD]; - ALIGNMM double a0[SIMDD]; - ALIGNMM double a1[SIMDD]; - ALIGNMM double aijkl[SIMDD]; - ALIGNMM double fac1[SIMDD]; - ALIGNMM double x[SIMDD]; - ALIGNMM double rijrkl[SIMDD*3]; - ALIGNMM double rijrx[SIMDD*3]; - ALIGNMM double rklrx[SIMDD*3]; - double *rij = envs->rij; - double *rkl = envs->rkl; - double *u = bc->u; - double *w = bc->w; - __MD ra, r0, r1, r2, r3, r4, r5, r6, r7, r8; - double omega = envs->env[PTR_RANGE_OMEGA]; - int nroots = envs->nrys_roots; - int i; - - r2 = MM_ADD(MM_LOAD(envs->ai), MM_LOAD(envs->aj)); - r3 = MM_ADD(MM_LOAD(envs->ak), MM_LOAD(envs->al)); - MM_STORE(aij, r2); - MM_STORE(akl, r3); - r1 = MM_MUL(r2, r3); - MM_STORE(a1, r1); - ra = MM_ADD(r2, r3); - MM_STORE(aijkl, ra); - r0 = MM_DIV(r1, ra); - MM_STORE(a0, r0); - - ALIGNMM double theta[SIMDD]; - if (omega > 0) { -// For long-range part of range-separated Coulomb operator - //:theta = omega * omega / (omega * omega + a0); - //:a0 *= theta; - r0 = MM_SET1(omega); - r1 = MM_LOAD(a0); - r0 = MM_MUL(r0, r0); - r0 = MM_DIV(r0, MM_ADD(r0, r1)); - MM_STORE(theta, r0); - MM_STORE(a0, MM_MUL(r0, r1)); - } - r1 = MM_LOAD(a1); - r0 = MM_DIV(MM_LOAD(a0), MM_MUL(r1, MM_MUL(r1, r1))); - MM_STORE(fac1, MM_MUL(MM_SQRT(r0), MM_LOAD(envs->fac))); - - r0 = MM_LOAD(rij+0*SIMDD); - r1 = MM_LOAD(rij+1*SIMDD); - r2 = MM_LOAD(rij+2*SIMDD); - r3 = MM_LOAD(rkl+0*SIMDD); - r4 = MM_LOAD(rkl+1*SIMDD); - r5 = MM_LOAD(rkl+2*SIMDD); - - r6 = MM_SUB(r0, r3); MM_STORE(rijrkl+0*SIMDD, r6); - r7 = MM_SUB(r1, r4); MM_STORE(rijrkl+1*SIMDD, r7); 
- r8 = MM_SUB(r2, r5); MM_STORE(rijrkl+2*SIMDD, r8); - ra = MM_FMA(r6, r6, MM_FMA(r7, r7, MM_MUL(r8, r8))); - MM_STORE(x, MM_MUL(MM_LOAD(a0), ra)); - MM_STORE(rijrx+0*SIMDD, MM_SUB(r0, MM_SET1(envs->rx_in_rijrx[0]))); - MM_STORE(rijrx+1*SIMDD, MM_SUB(r1, MM_SET1(envs->rx_in_rijrx[1]))); - MM_STORE(rijrx+2*SIMDD, MM_SUB(r2, MM_SET1(envs->rx_in_rijrx[2]))); - MM_STORE(rklrx+0*SIMDD, MM_SUB(r3, MM_SET1(envs->rx_in_rklrx[0]))); - MM_STORE(rklrx+1*SIMDD, MM_SUB(r4, MM_SET1(envs->rx_in_rklrx[1]))); - MM_STORE(rklrx+2*SIMDD, MM_SUB(r5, MM_SET1(envs->rx_in_rklrx[2]))); - - _CINTrys_roots_batch(nroots, x, u, w, count); - - double *gx = g; - double *gy = gx + envs->g_size * SIMDD; - double *gz = gy + envs->g_size * SIMDD; - r0 = MM_LOAD(fac1); - r1 = MM_SET1(1.); - for (i = 0; i < nroots; i++) { - //MM_STORE(gx+i*SIMDD, r1); - //MM_STORE(gy+i*SIMDD, r1); - MM_STORE(gz+i*SIMDD, MM_MUL(MM_LOAD(w+i*SIMDD), r0)); - } - if (envs->g_size == 1) { - return 1; - } - - if (omega > 0) { - /* u[:] = tau^2 / (1 - tau^2) - * transform u[:] to theta^-1 tau^2 / (theta^-1 - tau^2) - * so the rest code can be reused. 
- */ - //:for (irys = 0; irys < nroots; irys++) { - //: u[irys] /= u[irys] + 1 - u[irys] * theta; - //:} - r0 = MM_LOAD(theta); - r1 = MM_SET1(1.); - for (i = 0; i < nroots; i++) { - r2 = MM_LOAD(u+i*SIMDD); - r3 = r2 + r1 - r2 * r0; - MM_STORE(u+i*SIMDD, MM_DIV(r2, r3)); - } - } - - double *b00 = bc->b00; - double *b10 = bc->b10; - double *b01 = bc->b01; - double *c00x = bc->c00x; - double *c00y = bc->c00y; - double *c00z = bc->c00z; - double *c0px = bc->c0px; - double *c0py = bc->c0py; - double *c0pz = bc->c0pz; - - ALIGNMM double tmp1[MXRYSROOTS*SIMDD]; - ALIGNMM double tmp4[MXRYSROOTS*SIMDD]; - - ra = MM_LOAD(aijkl); - r0 = MM_LOAD(a0); - r1 = MM_LOAD(a1); - r2 = MM_SET1(.5); - r3 = MM_SET1(2.); - for (i = 0; i < nroots; i++) { - r4 = MM_MUL(r0, MM_LOAD(u+i*SIMDD)); - r5 = MM_DIV(r2, MM_FMA(r4, ra, r1)); - MM_STORE(tmp4+i*SIMDD, r5); - r6 = MM_MUL(MM_MUL(r3, r4), r5); - MM_STORE(tmp1+i*SIMDD, r6); - } - ra = MM_SET1(.5); - r2 = MM_LOAD(akl); - r3 = MM_LOAD(aij); - for (i = 0; i < nroots; i++) { - r0 = MM_MUL(ra, MM_LOAD(tmp1+i*SIMDD)); - MM_STORE(b00+i*SIMDD, r0); - r1 = MM_LOAD(tmp4+i*SIMDD); - MM_STORE(b10+i*SIMDD, MM_FMA(r1, r2, r0)); - MM_STORE(b01+i*SIMDD, MM_FMA(r1, r3, r0)); - } - r4 = MM_LOAD(rijrkl+0*SIMDD); - r5 = MM_LOAD(rijrkl+1*SIMDD); - r6 = MM_LOAD(rijrkl+2*SIMDD); - ra = MM_LOAD(akl); - r1 = MM_LOAD(rijrx+0*SIMDD); - r2 = MM_LOAD(rijrx+1*SIMDD); - r3 = MM_LOAD(rijrx+2*SIMDD); - for (i = 0; i < nroots; i++) { - r0 = MM_MUL(MM_LOAD(tmp1+i*SIMDD), ra); - MM_STORE(c00x+i*SIMDD, MM_FNMA(r0, r4, r1)); - MM_STORE(c00y+i*SIMDD, MM_FNMA(r0, r5, r2)); - MM_STORE(c00z+i*SIMDD, MM_FNMA(r0, r6, r3)); - } - ra = MM_LOAD(aij); - r1 = MM_LOAD(rklrx+0*SIMDD); - r2 = MM_LOAD(rklrx+1*SIMDD); - r3 = MM_LOAD(rklrx+2*SIMDD); - for (i = 0; i < nroots; i++) { - r0 = MM_MUL(MM_LOAD(tmp1+i*SIMDD), ra); - MM_STORE(c0px+i*SIMDD, MM_FMA (r0, r4, r1)); - MM_STORE(c0py+i*SIMDD, MM_FMA (r0, r5, r2)); - MM_STORE(c0pz+i*SIMDD, MM_FMA (r0, r6, r3)); - } - - (*envs->f_g0_2d4d)(g, 
bc, envs); - return 1; -} - -int CINTg0_2e_coulerf_simd1(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs, int idsimd) -{ - double aij, akl, a0, a1, fac1; - double *rij = envs->rij; - double *rkl = envs->rkl; - double rijrkl[3]; - double rijrx[3]; - double rklrx[3]; - double *u = bc->u; - double *w = bc->w; - double omega = envs->env[PTR_RANGE_OMEGA]; - int nroots = envs->nrys_roots; - int i; - - aij = envs->ai[idsimd] + envs->aj[idsimd]; - akl = envs->ak[idsimd] + envs->al[idsimd]; - a1 = aij * akl; - a0 = a1 / (aij + akl); - - double theta = 0; - if (omega > 0) { -// For long-range part of range-separated Coulomb operator - theta = omega * omega / (omega * omega + a0); - a0 *= theta; - } - fac1 = sqrt(a0 / (a1 * a1 * a1)) * envs->fac[idsimd]; - - rijrkl[0] = rij[0*SIMDD+idsimd] - rkl[0*SIMDD+idsimd]; - rijrkl[1] = rij[1*SIMDD+idsimd] - rkl[1*SIMDD+idsimd]; - rijrkl[2] = rij[2*SIMDD+idsimd] - rkl[2*SIMDD+idsimd]; - CINTrys_roots(nroots, a0 * SQUARE(rijrkl), u, w); - - double *gx = g; - double *gy = g + envs->g_size; - double *gz = g + envs->g_size * 2; - for (i = 0; i < nroots; i++) { - gx[i] = 1; - gy[i] = 1; - gz[i] = w[i] * fac1; - } - if (envs->g_size == 1) { - return 1; - } - - if (omega > 0) { - /* u[:] = tau^2 / (1 - tau^2) - * transform u[:] to theta^-1 tau^2 / (theta^-1 - tau^2) - * so the rest code can be reused. 
- */ - for (i = 0; i < nroots; i++) { - u[i] /= u[i] + 1 - u[i] * theta; - } - } - - double u2, div, tmp1, tmp2, tmp3, tmp4; - double *b00 = bc->b00; - double *b10 = bc->b10; - double *b01 = bc->b01; - double *c00x = bc->c00x; - double *c00y = bc->c00y; - double *c00z = bc->c00z; - double *c0px = bc->c0px; - double *c0py = bc->c0py; - double *c0pz = bc->c0pz; - - rijrx[0] = rij[0*SIMDD+idsimd] - envs->rx_in_rijrx[0]; - rijrx[1] = rij[1*SIMDD+idsimd] - envs->rx_in_rijrx[1]; - rijrx[2] = rij[2*SIMDD+idsimd] - envs->rx_in_rijrx[2]; - rklrx[0] = rkl[0*SIMDD+idsimd] - envs->rx_in_rklrx[0]; - rklrx[1] = rkl[1*SIMDD+idsimd] - envs->rx_in_rklrx[1]; - rklrx[2] = rkl[2*SIMDD+idsimd] - envs->rx_in_rklrx[2]; - for (i = 0; i < nroots; i++) { - /* - *t2 = u(i)/(1+u(i)) - *u2 = aij*akl/(aij+akl)*t2/(1-t2) - */ - u2 = a0 * u[i]; - div = 1 / (u2 * (aij + akl) + a1); - tmp1 = u2 * div; - tmp4 = .5 * div; - b00[i] = 0.5 * tmp1; - tmp2 = tmp1 * akl; - tmp3 = tmp1 * aij; - b10[i] = b00[i] + tmp4 * akl; - b01[i] = b00[i] + tmp4 * aij; - c00x[i] = rijrx[0] - tmp2 * rijrkl[0]; - c00y[i] = rijrx[1] - tmp2 * rijrkl[1]; - c00z[i] = rijrx[2] - tmp2 * rijrkl[2]; - c0px[i] = rklrx[0] + tmp3 * rijrkl[0]; - c0py[i] = rklrx[1] + tmp3 * rijrkl[1]; - c0pz[i] = rklrx[2] + tmp3 * rijrkl[2]; - } - - (*envs->f_g0_2d4d_simd1)(g, bc, envs); - return 1; -} - diff --git a/src/g2e_f12.c b/src/g2e_f12.c index 1afbbd0..7ce266c 100644 --- a/src/g2e_f12.c +++ b/src/g2e_f12.c @@ -203,10 +203,10 @@ int CINTg0_2e_yp(double *g, double *cutoff, ALIGNMM double rijrkl[SIMDD*3]; ALIGNMM double rijrx[SIMDD*3]; ALIGNMM double rklrx[SIMDD*3]; + ALIGNMM double u[MXRYSROOTS*SIMDD]; double *rij = envs->rij; double *rkl = envs->rkl; - double *u = bc->u; - double *w = bc->w; + double *w = g + envs->g_size * 2; // ~ gz __MD ra, r0, r1, r2, r3, r4, r5, r6, r7, r8; double zeta = envs->env[PTR_F12_ZETA]; int nroots = envs->nrys_roots; @@ -345,18 +345,18 @@ int CINTg0_2e_yp(double *g, double *cutoff, int CINTg0_2e_yp_simd1(double 
*g, double *cutoff, Rys2eT *bc, CINTEnvVars *envs, int idsimd) { - const double aij = envs->ai[idsimd] + envs->aj[idsimd]; - const double akl = envs->ak[idsimd] + envs->al[idsimd]; - const double zeta = envs->env[PTR_F12_ZETA]; - const int nroots = envs->nrys_roots; + double aij = envs->ai[idsimd] + envs->aj[idsimd]; + double akl = envs->ak[idsimd] + envs->al[idsimd]; + double zeta = envs->env[PTR_F12_ZETA]; + int nroots = envs->nrys_roots; double a0, a1, fac1, x, ua; double *rij = envs->rij; double *rkl = envs->rkl; double rijrkl[3]; double rijrx[3]; double rklrx[3]; - double *u = bc->u; - double *w = bc->w; + ALIGNMM double u[MXRYSROOTS]; + double *w = g + envs->g_size * 2; // ~ gz int i; a1 = aij * akl; @@ -453,10 +453,10 @@ int CINTg0_2e_stg(double *g, double *cutoff, ALIGNMM double rijrkl[SIMDD*3]; ALIGNMM double rijrx[SIMDD*3]; ALIGNMM double rklrx[SIMDD*3]; + ALIGNMM double u[MXRYSROOTS*SIMDD]; double *rij = envs->rij; double *rkl = envs->rkl; - double *u = bc->u; - double *w = bc->w; + double *w = g + envs->g_size * 2; // ~ gz __MD ra, r0, r1, r2, r3, r4, r5, r6, r7, r8; double zeta = envs->env[PTR_F12_ZETA]; int nroots = envs->nrys_roots; @@ -594,18 +594,18 @@ int CINTg0_2e_stg(double *g, double *cutoff, int CINTg0_2e_stg_simd1(double *g, double *cutoff, Rys2eT *bc, CINTEnvVars *envs, int idsimd) { - const double aij = envs->ai[idsimd] + envs->aj[idsimd]; - const double akl = envs->ak[idsimd] + envs->al[idsimd]; - const double zeta = envs->env[PTR_F12_ZETA]; - const int nroots = envs->nrys_roots; + double aij = envs->ai[idsimd] + envs->aj[idsimd]; + double akl = envs->ak[idsimd] + envs->al[idsimd]; + double zeta = envs->env[PTR_F12_ZETA]; + int nroots = envs->nrys_roots; double a0, a1, fac1, x, ua; double *rij = envs->rij; double *rkl = envs->rkl; double rijrkl[3]; double rijrx[3]; double rklrx[3]; - double *u = bc->u; - double *w = bc->w; + ALIGNMM double u[MXRYSROOTS*SIMDD]; + double *w = g + envs->g_size * 2; // ~ gz int i; a1 = aij * akl; diff --git 
a/src/g2e_gtg.c b/src/g2e_gtg.c deleted file mode 100644 index cbc6404..0000000 --- a/src/g2e_gtg.c +++ /dev/null @@ -1,446 +0,0 @@ -/* - * Qcint is a general GTO integral library for computational chemistry - * Copyright (C) 2014- Qiming Sun - * - * This file is part of Qcint. - * - * Qcint is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - - -#include -#include -#include -#include "config.h" -#include "cint_bas.h" -#include "simd.h" -#include "g2e.h" -#include "rys_roots.h" -#include "misc.h" - -void CINTg0_2e_lj2d4d_regular(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs); -void CINTg0_2e_lj2d4d_simd1_regular(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs); -int CINTg0_2e_gtg(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs, int count); -int CINTg0_2e_gtg_simd1(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs, int idsimd); -void CINTg0_lj_4d(double *g, CINTEnvVars *envs); -void CINTg0_lj_4d_simd1(double *g, CINTEnvVars *envs); - -void CINTinit_int2e_gtg_EnvVars(CINTEnvVars *envs, int *ng, int *shls, - int *atm, int natm, int *bas, int nbas, double *env) -{ - envs->natm = natm; - envs->nbas = nbas; - envs->atm = atm; - envs->bas = bas; - envs->env = env; - envs->shls = shls; - - const int i_sh = shls[0]; - const int j_sh = shls[1]; - const int k_sh = shls[2]; - const int l_sh = shls[3]; - envs->i_l = bas(ANG_OF, i_sh); - envs->j_l = bas(ANG_OF, j_sh); - envs->k_l = 
bas(ANG_OF, k_sh); - envs->l_l = bas(ANG_OF, l_sh); - envs->x_ctr[0] = bas(NCTR_OF, i_sh); - envs->x_ctr[1] = bas(NCTR_OF, j_sh); - envs->x_ctr[2] = bas(NCTR_OF, k_sh); - envs->x_ctr[3] = bas(NCTR_OF, l_sh); - envs->nfi = (envs->i_l+1)*(envs->i_l+2)/2; - envs->nfj = (envs->j_l+1)*(envs->j_l+2)/2; - envs->nfk = (envs->k_l+1)*(envs->k_l+2)/2; - envs->nfl = (envs->l_l+1)*(envs->l_l+2)/2; - envs->nf = envs->nfi * envs->nfk * envs->nfl * envs->nfj; - - envs->ri = env + atm(PTR_COORD, bas(ATOM_OF, i_sh)); - envs->rj = env + atm(PTR_COORD, bas(ATOM_OF, j_sh)); - envs->rk = env + atm(PTR_COORD, bas(ATOM_OF, k_sh)); - envs->rl = env + atm(PTR_COORD, bas(ATOM_OF, l_sh)); - - envs->common_factor = SQRTPI * .5; - if (env[PTR_EXPCUTOFF] == 0) { - envs->expcutoff = EXPCUTOFF; - } else { - envs->expcutoff = MAX(MIN_EXPCUTOFF, env[PTR_EXPCUTOFF]); - } - - envs->gbits = ng[GSHIFT]; - envs->ncomp_e1 = ng[POS_E1]; - envs->ncomp_e2 = ng[POS_E2]; - envs->ncomp_tensor = ng[TENSOR]; - - envs->li_ceil = envs->i_l + ng[IINC]; - envs->lj_ceil = envs->j_l + ng[JINC]; - envs->lk_ceil = envs->k_l + ng[KINC]; - envs->ll_ceil = envs->l_l + ng[LINC]; - envs->nrys_roots = 1; - - assert(i_sh < SHLS_MAX); - assert(j_sh < SHLS_MAX); - assert(k_sh < SHLS_MAX); - assert(l_sh < SHLS_MAX); - assert(envs->i_l < ANG_MAX); - assert(envs->j_l < ANG_MAX); - assert(envs->k_l < ANG_MAX); - assert(envs->l_l < ANG_MAX); - assert(bas(ATOM_OF,i_sh) >= 0); - assert(bas(ATOM_OF,j_sh) >= 0); - assert(bas(ATOM_OF,k_sh) >= 0); - assert(bas(ATOM_OF,l_sh) >= 0); - assert(bas(ATOM_OF,i_sh) < natm); - assert(bas(ATOM_OF,j_sh) < natm); - assert(bas(ATOM_OF,k_sh) < natm); - assert(bas(ATOM_OF,l_sh) < natm); - assert(envs->nrys_roots < MXRYSROOTS); - - int dli, dlj, dlk, dll; - int ibase = envs->li_ceil > envs->lj_ceil; - int kbase = envs->lk_ceil > envs->ll_ceil; - - if (kbase) { - dlk = envs->lk_ceil + envs->ll_ceil + 1; - dll = envs->ll_ceil + 1; - } else { - dlk = envs->lk_ceil + 1; - dll = envs->lk_ceil + envs->ll_ceil + 
1; - } - - if (ibase) { - dli = envs->li_ceil + envs->lj_ceil + 1; - dlj = envs->lj_ceil + 1; - } else { - dli = envs->li_ceil + 1; - dlj = envs->li_ceil + envs->lj_ceil + 1; - } - envs->g_stride_i = envs->nrys_roots; - envs->g_stride_k = envs->nrys_roots * dli; - envs->g_stride_l = envs->nrys_roots * dli * dlk; - envs->g_stride_j = envs->nrys_roots * dli * dlk * dll; - envs->g_size = envs->nrys_roots * dli * dlk * dll * dlj; - - if (kbase) { - envs->g2d_klmax = envs->g_stride_k; - envs->rx_in_rklrx = envs->rk; - envs->rkrl[0] = envs->rk[0] - envs->rl[0]; - envs->rkrl[1] = envs->rk[1] - envs->rl[1]; - envs->rkrl[2] = envs->rk[2] - envs->rl[2]; - } else { - envs->g2d_klmax = envs->g_stride_l; - envs->rx_in_rklrx = envs->rl; - envs->rkrl[0] = envs->rl[0] - envs->rk[0]; - envs->rkrl[1] = envs->rl[1] - envs->rk[1]; - envs->rkrl[2] = envs->rl[2] - envs->rk[2]; - } - - if (ibase) { - envs->g2d_ijmax = envs->g_stride_i; - envs->rx_in_rijrx = envs->ri; - envs->rirj[0] = envs->ri[0] - envs->rj[0]; - envs->rirj[1] = envs->ri[1] - envs->rj[1]; - envs->rirj[2] = envs->ri[2] - envs->rj[2]; - } else { - envs->g2d_ijmax = envs->g_stride_j; - envs->rx_in_rijrx = envs->rj; - envs->rirj[0] = envs->rj[0] - envs->ri[0]; - envs->rirj[1] = envs->rj[1] - envs->ri[1]; - envs->rirj[2] = envs->rj[2] - envs->ri[2]; - } - - if (kbase) { - if (ibase) { - envs->f_g0_2d4d = &CINTg0_2e_ik2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_ik2d4d_simd1; - } else { - envs->f_g0_2d4d = &CINTg0_2e_kj2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_kj2d4d_simd1; - } - } else { - if (ibase) { - envs->f_g0_2d4d = &CINTg0_2e_il2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_il2d4d_simd1; - } else { - envs->f_g0_2d4d = &CINTg0_2e_lj2d4d_regular; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_lj2d4d_simd1_regular; - } - } - envs->f_g0_2e = &CINTg0_2e_gtg; - envs->f_g0_2e_simd1 = &CINTg0_2e_gtg_simd1; -} - - -void CINTg0_2e_lj2d4d_regular(double *g, Rys2eT *bc, CINTEnvVars *envs) -{ - CINTg0_2e_2d(g, bc, envs); - CINTg0_lj_4d(g, envs); 
- return; -} - -void CINTg0_2e_lj2d4d_simd1_regular(double *g, Rys2eT *bc, CINTEnvVars *envs) -{ - CINTg0_2e_2d_simd1(g, bc, envs); - CINTg0_lj_4d_simd1(g, envs); - return; -} - -/* - * g[i,k,l,j] = < ik | lj > = ( i j | k l ) - */ -int CINTg0_2e_gtg(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs, int count) -{ - const double zeta = envs->env[PTR_GTG_ZETA]; - double *gx = g; - double *gy = gx + envs->g_size * SIMDD; - double *gz = gy + envs->g_size * SIMDD; - ALIGNMM double aij[SIMDD]; - ALIGNMM double akl[SIMDD]; - ALIGNMM double a0[SIMDD]; - ALIGNMM double a1[SIMDD]; - ALIGNMM double aijkl[SIMDD]; - ALIGNMM double fac1[SIMDD]; - ALIGNMM double x[SIMDD]; - ALIGNMM double t[SIMDD]; - ALIGNMM double rijrkl[SIMDD*3]; - ALIGNMM double rijrx[SIMDD*3]; - ALIGNMM double rklrx[SIMDD*3]; - double *rij = envs->rij; - double *rkl = envs->rkl; - __MD ra, r0, r1, r2, r3, r4, r5, r6, r7, r8; - - //:for (int k = 0; k < count; k++) { - //: aij[k] = envs->ai[k] + envs->aj[k]; - //: akl[k] = envs->ak[k] + envs->al[k]; - //: aijkl[k] = aij[k] + akl[k]; - //: a1[k] = aij[k] * akl[k]; - //: a0[k] = a1[k] / aijkl[k]; - //:} - r2 = MM_ADD(MM_LOAD(envs->ai), MM_LOAD(envs->aj)); - r3 = MM_ADD(MM_LOAD(envs->ak), MM_LOAD(envs->al)); - MM_STORE(aij, r2); - MM_STORE(akl, r3); - r1 = MM_MUL(r2, r3); - MM_STORE(a1, r1); - ra = MM_ADD(r2, r3); - MM_STORE(aijkl, ra); - r0 = MM_DIV(r1, ra); - MM_STORE(a0, r0); - - // t = zeta / (zeta + a0); - // fac1 = (1-t) / a1; - r2 = MM_SET1(zeta); - r2 = MM_DIV(r2, MM_ADD(r2, r0)); - MM_STORE(t, r2); - MM_STORE(fac1, MM_DIV(MM_SUB(MM_SET1(1), r2), r1)); - - r0 = MM_LOAD(rij+0*SIMDD); - r1 = MM_LOAD(rij+1*SIMDD); - r2 = MM_LOAD(rij+2*SIMDD); - r3 = MM_LOAD(rkl+0*SIMDD); - r4 = MM_LOAD(rkl+1*SIMDD); - r5 = MM_LOAD(rkl+2*SIMDD); - - //:for (k = 0; k < count; k++) { - //: rijrkl[0*SIMDD+k] = rij[0*SIMDD+k] - rkl[0*SIMDD+k]; - //: rijrkl[1*SIMDD+k] = rij[1*SIMDD+k] - rkl[1*SIMDD+k]; - //: rijrkl[2*SIMDD+k] = rij[2*SIMDD+k] - rkl[2*SIMDD+k]; - //: 
rijrx[0*SIMDD+k] = rij[0*SIMDD+k] - envs->rx_in_rijrx[0]; - //: rijrx[1*SIMDD+k] = rij[1*SIMDD+k] - envs->rx_in_rijrx[1]; - //: rijrx[2*SIMDD+k] = rij[2*SIMDD+k] - envs->rx_in_rijrx[2]; - //: rklrx[0*SIMDD+k] = rkl[0*SIMDD+k] - envs->rx_in_rklrx[0]; - //: rklrx[1*SIMDD+k] = rkl[1*SIMDD+k] - envs->rx_in_rklrx[1]; - //: rklrx[2*SIMDD+k] = rkl[2*SIMDD+k] - envs->rx_in_rklrx[2]; - //: x[k] = a0[k] *(rijrkl[0*SIMDD+k] * rijrkl[0*SIMDD+k] - //: + rijrkl[1*SIMDD+k] * rijrkl[1*SIMDD+k] - //: + rijrkl[2*SIMDD+k] * rijrkl[2*SIMDD+k]); - //:} - MM_STORE(rijrx+0*SIMDD, MM_SUB(r0, MM_SET1(envs->rx_in_rijrx[0]))); - MM_STORE(rijrx+1*SIMDD, MM_SUB(r1, MM_SET1(envs->rx_in_rijrx[1]))); - MM_STORE(rijrx+2*SIMDD, MM_SUB(r2, MM_SET1(envs->rx_in_rijrx[2]))); - MM_STORE(rklrx+0*SIMDD, MM_SUB(r3, MM_SET1(envs->rx_in_rklrx[0]))); - MM_STORE(rklrx+1*SIMDD, MM_SUB(r4, MM_SET1(envs->rx_in_rklrx[1]))); - MM_STORE(rklrx+2*SIMDD, MM_SUB(r5, MM_SET1(envs->rx_in_rklrx[2]))); - - r6 = MM_SUB(r0, r3); MM_STORE(rijrkl+0*SIMDD, r6); - r7 = MM_SUB(r1, r4); MM_STORE(rijrkl+1*SIMDD, r7); - r8 = MM_SUB(r2, r5); MM_STORE(rijrkl+2*SIMDD, r8); - ra = MM_FMA(r6, r6, MM_FMA(r7, r7, MM_MUL(r8, r8))); - - // x = -x * t - // gz[0] = fac1*sqrt(fac1) * exp(x) * fac; - ra = MM_MUL(MM_LOAD(a0), ra); - ra = MM_MUL(MM_LOAD(t), ra); - MM_STORE(x, -ra); - int k; - for (k = 0; k < count; k++) { - t[k] = exp(x[k]); - } - ra = MM_LOAD(fac1); - r1 = MM_MUL(MM_SQRT(ra), ra); - r2 = MM_MUL(MM_LOAD(t), MM_LOAD(envs->fac)); - MM_STORE(gz, MM_MUL(r1, r2)); - r1 = MM_SET1(1.); - MM_STORE(gx, r1); - MM_STORE(gy, r1); - if (envs->g_size == 1) { - return 1; - } - - double *b00 = bc->b00; - double *b10 = bc->b10; - double *b01 = bc->b01; - double *c00x = bc->c00x; - double *c00y = bc->c00y; - double *c00z = bc->c00z; - double *c0px = bc->c0px; - double *c0py = bc->c0py; - double *c0pz = bc->c0pz; - - ALIGNMM double tmp1[SIMDD]; - ALIGNMM double tmp4[SIMDD]; - //:double u2[SIMDD]; - //:double tmp2, tmp3; - //:double div[SIMDD]; - 
//:for (k = 0; k < count; k++) { - //: div[k] = 1 / (zeta * aijkl[k] + a1[k]); - //: tmp1[k] = zeta * div[k]; - //: tmp4[k] = .5 * div[k]; - //: - //: b00[k] = 0.5 * tmp1[k]; - //: tmp2 = tmp1[k] * akl[k]; - //: tmp3 = tmp1[k] * aij[k]; - //: b10[k] = b00[k] + tmp4[k] * akl[k]; - //: b01[k] = b00[k] + tmp4[k] * aij[k]; - //: c00x[k] = rijrx[0*SIMDD+k] - tmp2 * rijrkl[0*SIMDD+k]; - //: c00y[k] = rijrx[1*SIMDD+k] - tmp2 * rijrkl[1*SIMDD+k]; - //: c00z[k] = rijrx[2*SIMDD+k] - tmp2 * rijrkl[2*SIMDD+k]; - //: c0px[k] = rklrx[0*SIMDD+k] + tmp3 * rijrkl[0*SIMDD+k]; - //: c0py[k] = rklrx[1*SIMDD+k] + tmp3 * rijrkl[1*SIMDD+k]; - //: c0pz[k] = rklrx[2*SIMDD+k] + tmp3 * rijrkl[2*SIMDD+k]; - //:} - - ra = MM_LOAD(aijkl); - r0 = MM_SET1(zeta); - r1 = MM_LOAD(a1); - r2 = MM_SET1(.5); - r3 = MM_SET1(1.); - - r5 = MM_DIV(r3, MM_FMA(r0, ra, r1)); - r1 = MM_MUL(r0, r5); - r4 = MM_MUL(r2, r5); - MM_STORE(tmp1, r1); - MM_STORE(tmp4, r4); - - r0 = MM_MUL(r2, r1); - MM_STORE(b00, r0); - MM_STORE(b10, MM_FMA(r4, MM_LOAD(akl), r0)); - MM_STORE(b01, MM_FMA(r4, MM_LOAD(aij), r0)); - - r4 = MM_LOAD(rijrkl+0*SIMDD); - r5 = MM_LOAD(rijrkl+1*SIMDD); - r6 = MM_LOAD(rijrkl+2*SIMDD); - ra = MM_LOAD(akl); - r1 = MM_LOAD(rijrx+0*SIMDD); - r2 = MM_LOAD(rijrx+1*SIMDD); - r3 = MM_LOAD(rijrx+2*SIMDD); - r0 = MM_MUL(MM_LOAD(tmp1), ra); - MM_STORE(c00x, MM_FNMA(r0, r4, r1)); - MM_STORE(c00y, MM_FNMA(r0, r5, r2)); - MM_STORE(c00z, MM_FNMA(r0, r6, r3)); - - ra = MM_LOAD(aij); - r1 = MM_LOAD(rklrx+0*SIMDD); - r2 = MM_LOAD(rklrx+1*SIMDD); - r3 = MM_LOAD(rklrx+2*SIMDD); - r0 = MM_MUL(MM_LOAD(tmp1), ra); - MM_STORE(c0px, MM_FMA (r0, r4, r1)); - MM_STORE(c0py, MM_FMA (r0, r5, r2)); - MM_STORE(c0pz, MM_FMA (r0, r6, r3)); - - (*envs->f_g0_2d4d)(g, bc, envs); - return 1; -} - -int CINTg0_2e_gtg_simd1(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs, int idsimd) -{ - const double aij = envs->ai[idsimd] + envs->aj[idsimd]; - const double akl = envs->ak[idsimd] + envs->al[idsimd]; - const double zeta = 
envs->env[PTR_GTG_ZETA]; - double *gx = g; - double *gy = gx + envs->g_size; - double *gz = gy + envs->g_size; - double a0, a1, fac1, x, t; - double *rij = envs->rij; - double *rkl = envs->rkl; - double rijrkl[3]; - double rijrx[3]; - double rklrx[3]; - rijrkl[0] = rij[0*SIMDD+idsimd] - rkl[0*SIMDD+idsimd]; - rijrkl[1] = rij[1*SIMDD+idsimd] - rkl[1*SIMDD+idsimd]; - rijrkl[2] = rij[2*SIMDD+idsimd] - rkl[2*SIMDD+idsimd]; - - a1 = aij * akl; - a0 = a1 / (aij + akl); - t = zeta / (zeta + a0); - x = a0 *(rijrkl[0] * rijrkl[0] - + rijrkl[1] * rijrkl[1] - + rijrkl[2] * rijrkl[2]); - fac1 = (1-t) / a1; - gx[0] = 1; - gy[0] = 1; - gz[0] = fac1*sqrt(fac1) * exp(-t * x) * envs->fac[idsimd]; - if (envs->g_size == 1) { - return 1; - } - - double div, tmp1, tmp2, tmp3, tmp4; - double *b00 = bc->b00; - double *b10 = bc->b10; - double *b01 = bc->b01; - double *c00x = bc->c00x; - double *c00y = bc->c00y; - double *c00z = bc->c00z; - double *c0px = bc->c0px; - double *c0py = bc->c0py; - double *c0pz = bc->c0pz; - - rijrx[0] = rij[0*SIMDD+idsimd] - envs->rx_in_rijrx[0]; - rijrx[1] = rij[1*SIMDD+idsimd] - envs->rx_in_rijrx[1]; - rijrx[2] = rij[2*SIMDD+idsimd] - envs->rx_in_rijrx[2]; - rklrx[0] = rkl[0*SIMDD+idsimd] - envs->rx_in_rklrx[0]; - rklrx[1] = rkl[1*SIMDD+idsimd] - envs->rx_in_rklrx[1]; - rklrx[2] = rkl[2*SIMDD+idsimd] - envs->rx_in_rklrx[2]; - div = 1 / (zeta * (aij + akl) + a1); - tmp1 = zeta * div; - tmp2 = tmp1 * akl; - tmp3 = tmp1 * aij; - tmp4 = .5 * div; - b00[0] = 0.5 * tmp1; - b10[0] = b00[0] + tmp4 * akl; - b01[0] = b00[0] + tmp4 * aij; - c00x[0] = rijrx[0] - tmp2 * rijrkl[0]; - c00y[0] = rijrx[1] - tmp2 * rijrkl[1]; - c00z[0] = rijrx[2] - tmp2 * rijrkl[2]; - c0px[0] = rklrx[0] + tmp3 * rijrkl[0]; - c0py[0] = rklrx[1] + tmp3 * rijrkl[1]; - c0pz[0] = rklrx[2] + tmp3 * rijrkl[2]; - - (*envs->f_g0_2d4d_simd1)(g, bc, envs); - return 1; -} diff --git a/src/g2e_simd1.c b/src/g2e_simd1.c index 969b4f3..599b8f3 100644 --- a/src/g2e_simd1.c +++ b/src/g2e_simd1.c @@ -39,12 
+39,12 @@ void CINTg0_2e_2d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) { int nroots = envs->nrys_roots; - int i, m, n; - DEF_GXYZ(double, g, gx, gy, gz); int nmax = envs->li_ceil + envs->lj_ceil; int mmax = envs->lk_ceil + envs->ll_ceil; int dm = envs->g2d_klmax; int dn = envs->g2d_ijmax; + int i, m, n; + DEF_GXYZ(double, g, gx, gy, gz); double *c00x = bc->c00x; double *c00y = bc->c00y; double *c00z = bc->c00z; @@ -57,29 +57,39 @@ void CINTg0_2e_2d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) double *p0x, *p0y, *p0z; double *p1x, *p1y, *p1z; double nb1, mb0; + + for (i = 0; i < nroots; i++) { + gx[i] = 1; + gy[i] = 1; + //gz[i] = w[i]; + } + double r0x, r0y, r0z; double r1x, r1y, r1z; double r2x, r2y, r2z; - if (nmax > 0) { p0x = gx + dn; p0y = gy + dn; p0z = gz + dn; for (i = 0; i < nroots; i++) { + double c00x_i = c00x[i]; + double c00y_i = c00y[i]; + double c00z_i = c00z[i]; + double b10_i = b10[i]; r2x = gx[i]; r2y = gy[i]; r2z = gz[i]; - r0x = c00x[i] * r2x; - r0y = c00y[i] * r2y; - r0z = c00z[i] * r2z; + r0x = c00x_i * r2x; + r0y = c00y_i * r2y; + r0z = c00z_i * r2z; p0x[i] = r0x; p0y[i] = r0y; p0z[i] = r0z; for (n = 1; n < nmax; n++) { - nb1 = n * b10[i]; - r1x = c00x[i] * r0x + nb1 * r2x; - r1y = c00y[i] * r0y + nb1 * r2y; - r1z = c00z[i] * r0z + nb1 * r2z; + nb1 = n * b10_i; + r1x = c00x_i * r0x + nb1 * r2x; + r1y = c00y_i * r0y + nb1 * r2y; + r1z = c00z_i * r0z + nb1 * r2z; p0x[i+n*dn] = r1x; p0y[i+n*dn] = r1y; p0z[i+n*dn] = r1z; @@ -98,20 +108,24 @@ void CINTg0_2e_2d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) p0y = gy + dm; p0z = gz + dm; for (i = 0; i < nroots; i++) { + double c0px_i = c0px[i]; + double c0py_i = c0py[i]; + double c0pz_i = c0pz[i]; + double b01_i = b01[i]; r2x = gx[i]; r2y = gy[i]; r2z = gz[i]; - r0x = c0px[i] * r2x; - r0y = c0py[i] * r2y; - r0z = c0pz[i] * r2z; + r0x = c0px_i * r2x; + r0y = c0py_i * r2y; + r0z = c0pz_i * r2z; p0x[i] = r0x; p0y[i] = r0y; p0z[i] = r0z; for (m = 1; m < mmax; m++) { - mb0 = m * b01[i]; - r1x = 
c0px[i] * r0x + mb0 * r2x; - r1y = c0py[i] * r0y + mb0 * r2y; - r1z = c0pz[i] * r0z + mb0 * r2z; + mb0 = m * b01_i; + r1x = c0px_i * r0x + mb0 * r2x; + r1y = c0py_i * r0y + mb0 * r2y; + r1z = c0pz_i * r0z + mb0 * r2z; p0x[i+m*dm] = r1x; p0y[i+m*dm] = r1y; p0z[i+m*dm] = r1z; @@ -133,9 +147,17 @@ void CINTg0_2e_2d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) p1y = gy + dn; p1z = gz + dn; for (i = 0; i < nroots; i++) { - r1x = c0px[i] * gx[i+dn] + b00[i] * gx[i]; - r1y = c0py[i] * gy[i+dn] + b00[i] * gy[i]; - r1z = c0pz[i] * gz[i+dn] + b00[i] * gz[i]; + double c00x_i = c00x[i]; + double c00y_i = c00y[i]; + double c00z_i = c00z[i]; + double c0px_i = c0px[i]; + double c0py_i = c0py[i]; + double c0pz_i = c0pz[i]; + double b10_i = b10[i]; + double b00_i = b00[i]; + r1x = c0px_i * gx[i+dn] + b00_i * gx[i]; + r1y = c0py_i * gy[i+dn] + b00_i * gy[i]; + r1z = c0pz_i * gz[i+dn] + b00_i * gz[i]; p0x[i] = r1x; p0y[i] = r1y; p0z[i] = r1z; @@ -143,9 +165,9 @@ void CINTg0_2e_2d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) r2y = gy[i+dm]; r2z = gz[i+dm]; for (n = 1; n < nmax; n++) { - r0x = c00x[i] * r1x + n * b10[i] * r2x + b00[i] * gx[i+n*dn]; - r0y = c00y[i] * r1y + n * b10[i] * r2y + b00[i] * gy[i+n*dn]; - r0z = c00z[i] * r1z + n * b10[i] * r2z + b00[i] * gz[i+n*dn]; + r0x = c00x_i * r1x + n * b10_i * r2x + b00_i * gx[i+n*dn]; + r0y = c00y_i * r1y + n * b10_i * r2y + b00_i * gy[i+n*dn]; + r0z = c00z_i * r1z + n * b10_i * r2z + b00_i * gz[i+n*dn]; p0x[i+n*dn] = r0x; p0y[i+n*dn] = r0y; p0z[i+n*dn] = r0z; @@ -160,9 +182,18 @@ void CINTg0_2e_2d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) for (m = 1; m < mmax; m++) { for (i = 0; i < nroots; i++) { - r1x = c0px[i] * p1x[i+m*dm] + m * b01[i] * p1x[i+(m-1)*dm] + b00[i] * gx[i+m*dm]; - r1y = c0py[i] * p1y[i+m*dm] + m * b01[i] * p1y[i+(m-1)*dm] + b00[i] * gy[i+m*dm]; - r1z = c0pz[i] * p1z[i+m*dm] + m * b01[i] * p1z[i+(m-1)*dm] + b00[i] * gz[i+m*dm]; + double c00x_i = c00x[i]; + double c00y_i = c00y[i]; + double c00z_i = c00z[i]; 
+ double c0px_i = c0px[i]; + double c0py_i = c0py[i]; + double c0pz_i = c0pz[i]; + double b10_i = b10[i]; + double b01_i = b01[i]; + double b00_i = b00[i]; + r1x = c0px_i * p1x[i+m*dm] + m * b01_i * p1x[i+(m-1)*dm] + b00_i * gx[i+m*dm]; + r1y = c0py_i * p1y[i+m*dm] + m * b01_i * p1y[i+(m-1)*dm] + b00_i * gy[i+m*dm]; + r1z = c0pz_i * p1z[i+m*dm] + m * b01_i * p1z[i+(m-1)*dm] + b00_i * gz[i+m*dm]; p0x[i+m*dm] = r1x; p0y[i+m*dm] = r1y; p0z[i+m*dm] = r1z; @@ -170,11 +201,11 @@ void CINTg0_2e_2d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) r2x = gx[i+(m+1)*dm]; r2y = gy[i+(m+1)*dm]; r2z = gz[i+(m+1)*dm]; - mb0 = (m + 1) * b00[i]; + mb0 = (m + 1) * b00_i; for (n = 1; n < nmax; n++) { - r0x = c00x[i] * r1x + n * b10[i] * r2x + mb0 * gx[i+m*dm+n*dn]; - r0y = c00y[i] * r1y + n * b10[i] * r2y + mb0 * gy[i+m*dm+n*dn]; - r0z = c00z[i] * r1z + n * b10[i] * r2z + mb0 * gz[i+m*dm+n*dn]; + r0x = c00x_i * r1x + n * b10_i * r2x + mb0 * gx[i+m*dm+n*dn]; + r0y = c00y_i * r1y + n * b10_i * r2y + mb0 * gy[i+m*dm+n*dn]; + r0z = c00z_i * r1z + n * b10_i * r2z + mb0 * gz[i+m*dm+n*dn]; p0x[i+m*dm+n*dn] = r0x; p0y[i+m*dm+n*dn] = r0y; p0z[i+m*dm+n*dn] = r0z; @@ -215,8 +246,12 @@ void CINTg0_lj_4d_simd1(double *g, CINTEnvVars *envs) double *rkrl = envs->rkrl; DEF_GXYZ(double, g, gx, gy, gz); double *p1x, *p1y, *p1z, *p2x, *p2y, *p2z; + double rx, ry, rz; // g(i,...,j) = rirj * g(i-1,...,j) + g(i-1,...,j+1) + rx = rirj[0]; + ry = rirj[1]; + rz = rirj[2]; p1x = gx - di; p1y = gy - di; p1z = gz - di; @@ -228,13 +263,16 @@ void CINTg0_lj_4d_simd1(double *g, CINTEnvVars *envs) for (l = 0; l <= mmax; l++) { ptr = j*dj + l*dl + i*di; for (n = ptr; n < ptr+nroots; n++) { - gx[n] = rirj[0] * p1x[n] + p2x[n]; - gy[n] = rirj[1] * p1y[n] + p2y[n]; - gz[n] = rirj[2] * p1z[n] + p2z[n]; + gx[n] = rx * p1x[n] + p2x[n]; + gy[n] = ry * p1y[n] + p2y[n]; + gz[n] = rz * p1z[n] + p2z[n]; } } } } // g(...,k,l,..) = rkrl * g(...,k-1,l,..) + g(...,k-1,l+1,..) 
+ rx = rkrl[0]; + ry = rkrl[1]; + rz = rkrl[2]; p1x = gx - dk; p1y = gy - dk; p1z = gz - dk; @@ -246,9 +284,9 @@ void CINTg0_lj_4d_simd1(double *g, CINTEnvVars *envs) for (l = 0; l <= mmax-k; l++) { ptr = j*dj + l*dl + k*dk; for (n = ptr; n < ptr+dk; n++) { - gx[n] = rkrl[0] * p1x[n] + p2x[n]; - gy[n] = rkrl[1] * p1y[n] + p2y[n]; - gz[n] = rkrl[2] * p1z[n] + p2z[n]; + gx[n] = rx * p1x[n] + p2x[n]; + gy[n] = ry * p1y[n] + p2y[n]; + gz[n] = rz * p1z[n] + p2z[n]; } } } } } @@ -274,8 +312,12 @@ void CINTg0_kj_4d_simd1(double *g, CINTEnvVars *envs) double *rkrl = envs->rkrl; DEF_GXYZ(double, g, gx, gy, gz); double *p1x, *p1y, *p1z, *p2x, *p2y, *p2z; + double rx, ry, rz; // g(i,...,j) = rirj * g(i-1,...,j) + g(i-1,...,j+1) + rx = rirj[0]; + ry = rirj[1]; + rz = rirj[2]; p1x = gx - di; p1y = gy - di; p1z = gz - di; @@ -287,13 +329,16 @@ void CINTg0_kj_4d_simd1(double *g, CINTEnvVars *envs) for (k = 0; k <= mmax; k++) { ptr = j*dj + k*dk + i*di; for (n = ptr; n < ptr+nroots; n++) { - gx[n] = rirj[0] * p1x[n] + p2x[n]; - gy[n] = rirj[1] * p1y[n] + p2y[n]; - gz[n] = rirj[2] * p1z[n] + p2z[n]; + gx[n] = rx * p1x[n] + p2x[n]; + gy[n] = ry * p1y[n] + p2y[n]; + gz[n] = rz * p1z[n] + p2z[n]; } } } } // g(...,k,l,..) = rkrl * g(...,k,l-1,..) + g(...,k+1,l-1,..) + rx = rkrl[0]; + ry = rkrl[1]; + rz = rkrl[2]; p1x = gx - dl; p1y = gy - dl; p1z = gz - dl; @@ -305,9 +350,9 @@ void CINTg0_kj_4d_simd1(double *g, CINTEnvVars *envs) for (k = 0; k <= mmax-l; k++) { ptr = j*dj + l*dl + k*dk; for (n = ptr; n < ptr+dk; n++) { - gx[n] = rkrl[0] * p1x[n] + p2x[n]; - gy[n] = rkrl[1] * p1y[n] + p2y[n]; - gz[n] = rkrl[2] * p1z[n] + p2z[n]; + gx[n] = rx * p1x[n] + p2x[n]; + gy[n] = ry * p1y[n] + p2y[n]; + gz[n] = rz * p1z[n] + p2z[n]; } } } } } @@ -333,8 +378,12 @@ void CINTg0_il_4d_simd1(double *g, CINTEnvVars *envs) double *rkrl = envs->rkrl; DEF_GXYZ(double, g, gx, gy, gz); double *p1x, *p1y, *p1z, *p2x, *p2y, *p2z; + double rx, ry, rz; // g(...,k,l,..) = rkrl * g(...,k-1,l,..) 
+ g(...,k-1,l+1,..) + rx = rkrl[0]; + ry = rkrl[1]; + rz = rkrl[2]; p1x = gx - dk; p1y = gy - dk; p1z = gz - dk; @@ -346,13 +395,16 @@ void CINTg0_il_4d_simd1(double *g, CINTEnvVars *envs) for (i = 0; i <= nmax; i++) { ptr = l*dl + k*dk + i*di; for (n = ptr; n < ptr+nroots; n++) { - gx[n] = rkrl[0] * p1x[n] + p2x[n]; - gy[n] = rkrl[1] * p1y[n] + p2y[n]; - gz[n] = rkrl[2] * p1z[n] + p2z[n]; + gx[n] = rx * p1x[n] + p2x[n]; + gy[n] = ry * p1y[n] + p2y[n]; + gz[n] = rz * p1z[n] + p2z[n]; } } } } // g(i,...,j) = rirj * g(i,...,j-1) + g(i+1,...,j-1) + rx = rirj[0]; + ry = rirj[1]; + rz = rirj[2]; p1x = gx - dj; p1y = gy - dj; p1z = gz - dj; @@ -364,9 +416,9 @@ void CINTg0_il_4d_simd1(double *g, CINTEnvVars *envs) for (k = 0; k <= lk; k++) { ptr = j*dj + l*dl + k*dk; for (n = ptr; n < ptr+dk-di*j; n++) { - gx[n] = rirj[0] * p1x[n] + p2x[n]; - gy[n] = rirj[1] * p1y[n] + p2y[n]; - gz[n] = rirj[2] * p1z[n] + p2z[n]; + gx[n] = rx * p1x[n] + p2x[n]; + gy[n] = ry * p1y[n] + p2y[n]; + gz[n] = rz * p1z[n] + p2z[n]; } } } } } @@ -392,8 +444,12 @@ void CINTg0_ik_4d_simd1(double *g, CINTEnvVars *envs) double *rkrl = envs->rkrl; DEF_GXYZ(double, g, gx, gy, gz); double *p1x, *p1y, *p1z, *p2x, *p2y, *p2z; + double rx, ry, rz; // g(...,k,l,..) = rkrl * g(...,k,l-1,..) + g(...,k+1,l-1,..) 
+ rx = rkrl[0]; + ry = rkrl[1]; + rz = rkrl[2]; p1x = gx - dl; p1y = gy - dl; p1z = gz - dl; @@ -407,14 +463,17 @@ void CINTg0_ik_4d_simd1(double *g, CINTEnvVars *envs) for (i = 0; i <= nmax; i++) { ptr = l*dl + k*dk + i*di; for (n = ptr; n < ptr+nroots; n++) { - gx[n] = rkrl[0] * p1x[n] + p2x[n]; - gy[n] = rkrl[1] * p1y[n] + p2y[n]; - gz[n] = rkrl[2] * p1z[n] + p2z[n]; + gx[n] = rx * p1x[n] + p2x[n]; + gy[n] = ry * p1y[n] + p2y[n]; + gz[n] = rz * p1z[n] + p2z[n]; } } } } // g(i,...,j) = rirj * g(i,...,j-1) + g(i+1,...,j-1) + rx = rirj[0]; + ry = rirj[1]; + rz = rirj[2]; p1x = gx - dj; p1y = gy - dj; p1z = gz - dj; @@ -426,17 +485,17 @@ void CINTg0_ik_4d_simd1(double *g, CINTEnvVars *envs) for (k = 0; k <= lk; k++) { ptr = j*dj + l*dl + k*dk; for (n = ptr; n < ptr+dk-di*j; n++) { - gx[n] = rirj[0] * p1x[n] + p2x[n]; - gy[n] = rirj[1] * p1y[n] + p2y[n]; - gz[n] = rirj[2] * p1z[n] + p2z[n]; + gx[n] = rx * p1x[n] + p2x[n]; + gy[n] = ry * p1y[n] + p2y[n]; + gz[n] = rz * p1z[n] + p2z[n]; } } } } } static inline void _g0_2d4d_0000(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) { - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; //g[2] = w[0]; } @@ -445,9 +504,9 @@ static inline void _g0_2d4d_0001(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpx = bc->c0px; double *cpy = bc->c0py; double *cpz = bc->c0pz; - //g[0] = 1; + g[0] = 1; g[1] = cpx[0]; - //g[2] = 1; + g[2] = 1; g[3] = cpy[0]; //g[4] = w[0]; g[5] = cpz[0] * g[4]; @@ -459,14 +518,14 @@ static inline void _g0_2d4d_0002(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b01 = bc->b01; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = cpx[0]; g[3] = cpx[1]; g[4] = cpx[0] * cpx[0] + b01[0]; g[5] = cpx[1] * cpx[1] + b01[1]; - //g[6] = 1; - //g[7] = 1; + g[6] = 1; + g[7] = 1; g[8] = cpy[0]; g[9] = cpy[1]; g[10] = cpy[0] * cpy[0] + b01[0]; @@ -485,16 +544,16 @@ static inline void _g0_2d4d_0003(double *restrict g, Rys2eT *bc, CINTEnvVars *en 
double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b01 = bc->b01; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = cpx[0]; g[3] = cpx[1]; g[4] = cpx[0] * cpx[0] + b01[0]; g[5] = cpx[1] * cpx[1] + b01[1]; g[6] = cpx[0] * (g[4] + 2 * b01[0]); g[7] = cpx[1] * (g[5] + 2 * b01[1]); - //g[8] = 1; - //g[9] = 1; + g[8] = 1; + g[9] = 1; g[10] = cpy[0]; g[11] = cpy[1]; g[12] = cpy[0] * cpy[0] + b01[0]; @@ -516,9 +575,9 @@ static inline void _g0_2d4d_0010(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpx = bc->c0px; double *cpy = bc->c0py; double *cpz = bc->c0pz; - //g[0] = 1; + g[0] = 1; g[1] = cpx[0]; - //g[2] = 1; + g[2] = 1; g[3] = cpy[0]; //g[4] = w[0]; g[5] = cpz[0] * g[4]; @@ -533,16 +592,16 @@ static inline void _g0_2d4d_0011(double *restrict g, Rys2eT *bc, CINTEnvVars *en double xkxl = envs->rkrl[0]; double ykyl = envs->rkrl[1]; double zkzl = envs->rkrl[2]; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[4] = cpx[0]; g[5] = cpx[1]; g[6] = cpx[0] * (xkxl + cpx[0]) + b01[0]; g[7] = cpx[1] * (xkxl + cpx[1]) + b01[1]; g[2] = xkxl + cpx[0]; g[3] = xkxl + cpx[1]; - //g[12] = 1; - //g[13] = 1; + g[12] = 1; + g[13] = 1; g[16] = cpy[0]; g[17] = cpy[1]; g[18] = cpy[0] * (ykyl + cpy[0]) + b01[0]; @@ -568,8 +627,8 @@ static inline void _g0_2d4d_0012(double *restrict g, Rys2eT *bc, CINTEnvVars *en double xkxl = envs->rkrl[0]; double ykyl = envs->rkrl[1]; double zkzl = envs->rkrl[2]; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[4] = cpx[0]; g[5] = cpx[1]; g[8] = cpx[0] * cpx[0] + b01[0]; @@ -580,8 +639,8 @@ static inline void _g0_2d4d_0012(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[7] = cpx[1] * (xkxl + cpx[1]) + b01[1]; g[2] = xkxl + cpx[0]; g[3] = xkxl + cpx[1]; - //g[16] = 1; - //g[17] = 1; + g[16] = 1; + g[17] = 1; g[20] = cpy[0]; g[21] = cpy[1]; g[24] = cpy[0] * cpy[0] + b01[0]; @@ -612,14 +671,14 @@ static inline void _g0_2d4d_0020(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpy = bc->c0py; double *cpz = bc->c0pz; 
double *b01 = bc->b01; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = cpx[0]; g[3] = cpx[1]; g[4] = cpx[0] * cpx[0] + b01[0]; g[5] = cpx[1] * cpx[1] + b01[1]; - //g[6] = 1; - //g[7] = 1; + g[6] = 1; + g[7] = 1; g[8] = cpy[0]; g[9] = cpy[1]; g[10] = cpy[0] * cpy[0] + b01[0]; @@ -641,8 +700,8 @@ static inline void _g0_2d4d_0021(double *restrict g, Rys2eT *bc, CINTEnvVars *en double xkxl = envs->rkrl[0]; double ykyl = envs->rkrl[1]; double zkzl = envs->rkrl[2]; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = cpx[0]; g[3] = cpx[1]; g[4] = cpx[0] * cpx[0] + b01[0]; @@ -653,8 +712,8 @@ static inline void _g0_2d4d_0021(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[11] = cpx[1] * (xkxl + cpx[1]) + b01[1]; g[12] = g[4] * (xkxl + cpx[0]) + cpx[0] * 2 * b01[0]; g[13] = g[5] * (xkxl + cpx[1]) + cpx[1] * 2 * b01[1]; - //g[16] = 1; - //g[17] = 1; + g[16] = 1; + g[17] = 1; g[18] = cpy[0]; g[19] = cpy[1]; g[20] = cpy[0] * cpy[0] + b01[0]; @@ -685,16 +744,16 @@ static inline void _g0_2d4d_0030(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b01 = bc->b01; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = cpx[0]; g[3] = cpx[1]; g[4] = cpx[0] * cpx[0] + b01[0]; g[5] = cpx[1] * cpx[1] + b01[1]; g[6] = cpx[0] * (g[4] + 2 * b01[0]); g[7] = cpx[1] * (g[5] + 2 * b01[1]); - //g[8] = 1; - //g[9] = 1; + g[8] = 1; + g[9] = 1; g[10] = cpy[0]; g[11] = cpy[1]; g[12] = cpy[0] * cpy[0] + b01[0]; @@ -716,9 +775,9 @@ static inline void _g0_2d4d_0100(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *c0x = bc->c00x; double *c0y = bc->c00y; double *c0z = bc->c00z; - //g[0] = 1; + g[0] = 1; g[1] = c0x[0]; - //g[2] = 1; + g[2] = 1; g[3] = c0y[0]; //g[4] = w[0]; g[5] = c0z[0] * g[4]; @@ -733,16 +792,16 @@ static inline void _g0_2d4d_0101(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b00 = bc->b00; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; 
g[2] = cpx[0]; g[3] = cpx[1]; g[4] = c0x[0]; g[5] = c0x[1]; g[6] = cpx[0] * c0x[0] + b00[0]; g[7] = cpx[1] * c0x[1] + b00[1]; - //g[8] = 1; - //g[9] = 1; + g[8] = 1; + g[9] = 1; g[10] = cpy[0]; g[11] = cpy[1]; g[12] = c0y[0]; @@ -769,8 +828,8 @@ static inline void _g0_2d4d_0102(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpz = bc->c0pz; double *b00 = bc->b00; double *b01 = bc->b01; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = cpx[0]; g[3] = cpx[1]; g[6] = c0x[0]; @@ -781,8 +840,8 @@ static inline void _g0_2d4d_0102(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[9] = cpx[1] * c0x[1] + b00[1]; g[10] = cpx[0] * (g[8] + b00[0]) + b01[0] * c0x[0]; g[11] = cpx[1] * (g[9] + b00[1]) + b01[1] * c0x[1]; - //g[12] = 1; - //g[13] = 1; + g[12] = 1; + g[13] = 1; g[14] = cpy[0]; g[15] = cpy[1]; g[18] = c0y[0]; @@ -816,16 +875,16 @@ static inline void _g0_2d4d_0110(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b00 = bc->b00; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = cpx[0]; g[3] = cpx[1]; g[4] = c0x[0]; g[5] = c0x[1]; g[6] = cpx[0] * c0x[0] + b00[0]; g[7] = cpx[1] * c0x[1] + b00[1]; - //g[8] = 1; - //g[9] = 1; + g[8] = 1; + g[9] = 1; g[10] = cpy[0]; g[11] = cpy[1]; g[12] = c0y[0]; @@ -855,8 +914,8 @@ static inline void _g0_2d4d_0111(double *restrict g, Rys2eT *bc, CINTEnvVars *en double xkxl = envs->rkrl[0]; double ykyl = envs->rkrl[1]; double zkzl = envs->rkrl[2]; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[12] = c0x[0]; g[13] = c0x[1]; g[4] = cpx[0]; @@ -871,8 +930,8 @@ static inline void _g0_2d4d_0111(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[3] = xkxl + cpx[1]; g[14] = c0x[0] * (xkxl + cpx[0]) + b00[0]; g[15] = c0x[1] * (xkxl + cpx[1]) + b00[1]; - //g[24] = 1; - //g[25] = 1; + g[24] = 1; + g[25] = 1; g[36] = c0y[0]; g[37] = c0y[1]; g[28] = cpy[0]; @@ -915,8 +974,8 @@ static inline void _g0_2d4d_0120(double *restrict g, Rys2eT *bc, CINTEnvVars *en double 
*cpz = bc->c0pz; double *b00 = bc->b00; double *b01 = bc->b01; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = cpx[0]; g[3] = cpx[1]; g[6] = c0x[0]; @@ -927,8 +986,8 @@ static inline void _g0_2d4d_0120(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[9] = cpx[1] * c0x[1] + b00[1]; g[10] = cpx[0] * (g[8] + b00[0]) + b01[0] * c0x[0]; g[11] = cpx[1] * (g[9] + b00[1]) + b01[1] * c0x[1]; - //g[12] = 1; - //g[13] = 1; + g[12] = 1; + g[13] = 1; g[14] = cpy[0]; g[15] = cpy[1]; g[18] = c0y[0]; @@ -959,14 +1018,14 @@ static inline void _g0_2d4d_0200(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *c0y = bc->c00y; double *c0z = bc->c00z; double *b10 = bc->b10; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = c0x[0] * c0x[0] + b10[0]; g[5] = c0x[1] * c0x[1] + b10[1]; - //g[6] = 1; - //g[7] = 1; + g[6] = 1; + g[7] = 1; g[8] = c0y[0]; g[9] = c0y[1]; g[10] = c0y[0] * c0y[0] + b10[0]; @@ -989,8 +1048,8 @@ static inline void _g0_2d4d_0201(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpz = bc->c0pz; double *b00 = bc->b00; double *b10 = bc->b10; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[4] = c0x[0]; g[5] = c0x[1]; g[8] = c0x[0] * c0x[0] + b10[0]; @@ -1001,8 +1060,8 @@ static inline void _g0_2d4d_0201(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[7] = cpx[1] * c0x[1] + b00[1]; g[10] = c0x[0] * (g[6] + b00[0]) + b10[0] * cpx[0]; g[11] = c0x[1] * (g[7] + b00[1]) + b10[1] * cpx[1]; - //g[12] = 1; - //g[13] = 1; + g[12] = 1; + g[13] = 1; g[16] = c0y[0]; g[17] = c0y[1]; g[20] = c0y[0] * c0y[0] + b10[0]; @@ -1037,8 +1096,8 @@ static inline void _g0_2d4d_0210(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpz = bc->c0pz; double *b00 = bc->b00; double *b10 = bc->b10; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = cpx[0]; g[3] = cpx[1]; g[4] = c0x[0]; @@ -1049,8 +1108,8 @@ static inline void _g0_2d4d_0210(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[9] = c0x[1] * c0x[1] + b10[1]; 
g[10] = c0x[0] * (g[6] + b00[0]) + b10[0] * cpx[0]; g[11] = c0x[1] * (g[7] + b00[1]) + b10[1] * cpx[1]; - //g[12] = 1; - //g[13] = 1; + g[12] = 1; + g[13] = 1; g[14] = cpy[0]; g[15] = cpy[1]; g[16] = c0y[0]; @@ -1084,16 +1143,16 @@ static inline void _g0_2d4d_0300(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b10 = bc->b10; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = c0x[0] * c0x[0] + b10[0]; g[5] = c0x[1] * c0x[1] + b10[1]; g[6] = c0x[0] * (g[4] + 2 * b10[0]); g[7] = c0x[1] * (g[5] + 2 * b10[1]); - //g[8] = 1; - //g[9] = 1; + g[8] = 1; + g[9] = 1; g[10] = c0y[0]; g[11] = c0y[1]; g[12] = c0y[0] * c0y[0] + b10[0]; @@ -1115,9 +1174,9 @@ static inline void _g0_2d4d_1000(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *c0x = bc->c00x; double *c0y = bc->c00y; double *c0z = bc->c00z; - //g[0] = 1; + g[0] = 1; g[1] = c0x[0]; - //g[2] = 1; + g[2] = 1; g[3] = c0y[0]; //g[4] = w[0]; g[5] = c0z[0] * g[4]; @@ -1132,16 +1191,16 @@ static inline void _g0_2d4d_1001(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b00 = bc->b00; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = cpx[0]; g[5] = cpx[1]; g[6] = cpx[0] * c0x[0] + b00[0]; g[7] = cpx[1] * c0x[1] + b00[1]; - //g[8] = 1; - //g[9] = 1; + g[8] = 1; + g[9] = 1; g[10] = c0y[0]; g[11] = c0y[1]; g[12] = cpy[0]; @@ -1168,8 +1227,8 @@ static inline void _g0_2d4d_1002(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpz = bc->c0pz; double *b00 = bc->b00; double *b01 = bc->b01; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = cpx[0]; @@ -1180,8 +1239,8 @@ static inline void _g0_2d4d_1002(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[9] = cpx[1] * cpx[1] + b01[1]; g[10] = cpx[0] * (g[6] + b00[0]) + b01[0] * c0x[0]; g[11] = cpx[1] * (g[7] + b00[1]) + b01[1] * c0x[1]; - //g[12] = 1; - 
//g[13] = 1; + g[12] = 1; + g[13] = 1; g[14] = c0y[0]; g[15] = c0y[1]; g[16] = cpy[0]; @@ -1215,16 +1274,16 @@ static inline void _g0_2d4d_1010(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpy = bc->c0py; double *cpz = bc->c0pz; double *b00 = bc->b00; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = cpx[0]; g[5] = cpx[1]; g[6] = cpx[0] * c0x[0] + b00[0]; g[7] = cpx[1] * c0x[1] + b00[1]; - //g[8] = 1; - //g[9] = 1; + g[8] = 1; + g[9] = 1; g[10] = c0y[0]; g[11] = c0y[1]; g[12] = cpy[0]; @@ -1254,8 +1313,8 @@ static inline void _g0_2d4d_1011(double *restrict g, Rys2eT *bc, CINTEnvVars *en double xkxl = envs->rkrl[0]; double ykyl = envs->rkrl[1]; double zkzl = envs->rkrl[2]; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[8] = cpx[0]; @@ -1270,8 +1329,8 @@ static inline void _g0_2d4d_1011(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[5] = xkxl + cpx[1]; g[6] = c0x[0] * (xkxl + cpx[0]) + b00[0]; g[7] = c0x[1] * (xkxl + cpx[1]) + b00[1]; - //g[24] = 1; - //g[25] = 1; + g[24] = 1; + g[25] = 1; g[26] = c0y[0]; g[27] = c0y[1]; g[32] = cpy[0]; @@ -1314,8 +1373,8 @@ static inline void _g0_2d4d_1020(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpz = bc->c0pz; double *b00 = bc->b00; double *b01 = bc->b01; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = cpx[0]; @@ -1326,8 +1385,8 @@ static inline void _g0_2d4d_1020(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[9] = cpx[1] * cpx[1] + b01[1]; g[10] = cpx[0] * (g[6] + b00[0]) + b01[0] * c0x[0]; g[11] = cpx[1] * (g[7] + b00[1]) + b01[1] * c0x[1]; - //g[12] = 1; - //g[13] = 1; + g[12] = 1; + g[13] = 1; g[14] = c0y[0]; g[15] = c0y[1]; g[16] = cpy[0]; @@ -1361,16 +1420,16 @@ static inline void _g0_2d4d_1100(double *restrict g, Rys2eT *bc, CINTEnvVars *en double xixj = envs->rirj[0]; double yiyj = envs->rirj[1]; double zizj = envs->rirj[2]; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; 
g[4] = c0x[0]; g[5] = c0x[1]; g[6] = c0x[0] * (xixj + c0x[0]) + b10[0]; g[7] = c0x[1] * (xixj + c0x[1]) + b10[1]; g[2] = xixj + c0x[0]; g[3] = xixj + c0x[1]; - //g[12] = 1; - //g[13] = 1; + g[12] = 1; + g[13] = 1; g[16] = c0y[0]; g[17] = c0y[1]; g[18] = c0y[0] * (yiyj + c0y[0]) + b10[0]; @@ -1400,8 +1459,8 @@ static inline void _g0_2d4d_1101(double *restrict g, Rys2eT *bc, CINTEnvVars *en double xixj = envs->rirj[0]; double yiyj = envs->rirj[1]; double zizj = envs->rirj[2]; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[8] = c0x[0]; g[9] = c0x[1]; g[4] = cpx[0]; @@ -1416,8 +1475,8 @@ static inline void _g0_2d4d_1101(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[15] = g[13] * (xixj + c0x[1]) + c0x[1] * b00[1] + b10[1] * cpx[1]; g[6] = cpx[0] * (xixj + c0x[0]) + b00[0]; g[7] = cpx[1] * (xixj + c0x[1]) + b00[1]; - //g[24] = 1; - //g[25] = 1; + g[24] = 1; + g[25] = 1; g[32] = c0y[0]; g[33] = c0y[1]; g[28] = cpy[0]; @@ -1463,8 +1522,8 @@ static inline void _g0_2d4d_1110(double *restrict g, Rys2eT *bc, CINTEnvVars *en double xixj = envs->rirj[0]; double yiyj = envs->rirj[1]; double zizj = envs->rirj[2]; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[8] = c0x[0]; g[9] = c0x[1]; g[4] = cpx[0]; @@ -1479,8 +1538,8 @@ static inline void _g0_2d4d_1110(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[15] = g[13] * (xixj + c0x[1]) + c0x[1] * b00[1] + b10[1] * cpx[1]; g[6] = cpx[0] * (xixj + c0x[0]) + b00[0]; g[7] = cpx[1] * (xixj + c0x[1]) + b00[1]; - //g[24] = 1; - //g[25] = 1; + g[24] = 1; + g[25] = 1; g[32] = c0y[0]; g[33] = c0y[1]; g[28] = cpy[0]; @@ -1522,8 +1581,8 @@ static inline void _g0_2d4d_1200(double *restrict g, Rys2eT *bc, CINTEnvVars *en double xixj = envs->rirj[0]; double yiyj = envs->rirj[1]; double zizj = envs->rirj[2]; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[4] = c0x[0]; g[5] = c0x[1]; g[8] = c0x[0] * c0x[0] + b10[0]; @@ -1534,8 +1593,8 @@ static inline void _g0_2d4d_1200(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[7] = 
c0x[1] * (xixj + c0x[1]) + b10[1]; g[2] = xixj + c0x[0]; g[3] = xixj + c0x[1]; - //g[16] = 1; - //g[17] = 1; + g[16] = 1; + g[17] = 1; g[20] = c0y[0]; g[21] = c0y[1]; g[24] = c0y[0] * c0y[0] + b10[0]; @@ -1566,14 +1625,14 @@ static inline void _g0_2d4d_2000(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *c0y = bc->c00y; double *c0z = bc->c00z; double *b10 = bc->b10; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = c0x[0] * c0x[0] + b10[0]; g[5] = c0x[1] * c0x[1] + b10[1]; - //g[6] = 1; - //g[7] = 1; + g[6] = 1; + g[7] = 1; g[8] = c0y[0]; g[9] = c0y[1]; g[10] = c0y[0] * c0y[0] + b10[0]; @@ -1596,8 +1655,8 @@ static inline void _g0_2d4d_2001(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpz = bc->c0pz; double *b00 = bc->b00; double *b10 = bc->b10; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = c0x[0] * c0x[0] + b10[0]; @@ -1608,8 +1667,8 @@ static inline void _g0_2d4d_2001(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[9] = cpx[1] * c0x[1] + b00[1]; g[10] = c0x[0] * (g[8] + b00[0]) + b10[0] * cpx[0]; g[11] = c0x[1] * (g[9] + b00[1]) + b10[1] * cpx[1]; - //g[12] = 1; - //g[13] = 1; + g[12] = 1; + g[13] = 1; g[14] = c0y[0]; g[15] = c0y[1]; g[16] = c0y[0] * c0y[0] + b10[0]; @@ -1644,8 +1703,8 @@ static inline void _g0_2d4d_2010(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *cpz = bc->c0pz; double *b00 = bc->b00; double *b10 = bc->b10; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = c0x[0] * c0x[0] + b10[0]; @@ -1656,8 +1715,8 @@ static inline void _g0_2d4d_2010(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[9] = cpx[1] * c0x[1] + b00[1]; g[10] = c0x[0] * (g[8] + b00[0]) + b10[0] * cpx[0]; g[11] = c0x[1] * (g[9] + b00[1]) + b10[1] * cpx[1]; - //g[12] = 1; - //g[13] = 1; + g[12] = 1; + g[13] = 1; g[14] = c0y[0]; g[15] = c0y[1]; g[16] = c0y[0] * c0y[0] + b10[0]; @@ -1691,8 +1750,8 @@ static inline void 
_g0_2d4d_2100(double *restrict g, Rys2eT *bc, CINTEnvVars *en double xixj = envs->rirj[0]; double yiyj = envs->rirj[1]; double zizj = envs->rirj[2]; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = c0x[0] * c0x[0] + b10[0]; @@ -1703,8 +1762,8 @@ static inline void _g0_2d4d_2100(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[11] = c0x[1] * (xixj + c0x[1]) + b10[1]; g[8] = xixj + c0x[0]; g[9] = xixj + c0x[1]; - //g[16] = 1; - //g[17] = 1; + g[16] = 1; + g[17] = 1; g[18] = c0y[0]; g[19] = c0y[1]; g[20] = c0y[0] * c0y[0] + b10[0]; @@ -1735,16 +1794,16 @@ static inline void _g0_2d4d_3000(double *restrict g, Rys2eT *bc, CINTEnvVars *en double *c0y = bc->c00y; double *c0z = bc->c00z; double *b10 = bc->b10; - //g[0] = 1; - //g[1] = 1; + g[0] = 1; + g[1] = 1; g[2] = c0x[0]; g[3] = c0x[1]; g[4] = c0x[0] * c0x[0] + b10[0]; g[5] = c0x[1] * c0x[1] + b10[1]; g[6] = c0x[0] * (g[4] + 2 * b10[0]); g[7] = c0x[1] * (g[5] + 2 * b10[1]); - //g[8] = 1; - //g[9] = 1; + g[8] = 1; + g[9] = 1; g[10] = c0y[0]; g[11] = c0y[1]; g[12] = c0y[0] * c0y[0] + b10[0]; @@ -1761,7 +1820,7 @@ static inline void _g0_2d4d_3000(double *restrict g, Rys2eT *bc, CINTEnvVars *en g[23] = c0z[1] * g[21] + 2 * b10[1] * g[19]; } -void CINTg0_2e_2d4d_unrolled_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) +void CINTg0_2e_2d4d_unrolled_simd1(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) { int type_ijkl = ((envs->li_ceil << 6) | (envs->lj_ceil << 4) | (envs->lk_ceil << 2) | (envs->ll_ceil)); @@ -1807,146 +1866,2481 @@ void CINTg0_2e_2d4d_unrolled_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) (int)envs->ll_ceil, (int)envs->lj_ceil); } -void CINTg0_2e_lj2d4d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) +static inline void _srg0_2d4d_0000(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) { - CINTg0_2e_2d_simd1(g, bc, envs); - CINTg0_lj_4d_simd1(g, envs); + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + //g[4] = w[0]; + //g[5] = w[1]; } -void 
CINTg0_2e_kj2d4d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) +static inline void _srg0_2d4d_0001(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) { - CINTg0_2e_2d_simd1(g, bc, envs); - CINTg0_kj_4d_simd1(g, envs); + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + g[0] = 1; + g[1] = 1; + g[2] = cpx[0]; + g[3] = cpx[1]; + g[4] = 1; + g[5] = 1; + g[6] = cpy[0]; + g[7] = cpy[1]; + //g[8] = w[0]; + //g[9] = w[0]; + g[10] = cpz[0] * g[8]; + g[11] = cpz[1] * g[9]; } -void CINTg0_2e_ik2d4d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) + +static inline void _srg0_2d4d_0002(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) { - CINTg0_2e_2d_simd1(g, bc, envs); - CINTg0_ik_4d_simd1(g, envs); + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b01 = bc->b01; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[8] = cpx[0] * cpx[0] + b01[0]; + g[9] = cpx[1] * cpx[1] + b01[1]; + g[10] = cpx[2] * cpx[2] + b01[2]; + g[11] = cpx[3] * cpx[3] + b01[3]; + g[12] = 1; + g[13] = 1; + g[14] = 1; + g[15] = 1; + g[16] = cpy[0]; + g[17] = cpy[1]; + g[18] = cpy[2]; + g[19] = cpy[3]; + g[20] = cpy[0] * cpy[0] + b01[0]; + g[21] = cpy[1] * cpy[1] + b01[1]; + g[22] = cpy[2] * cpy[2] + b01[2]; + g[23] = cpy[3] * cpy[3] + b01[3]; + //g[24] = w[0]; + //g[25] = w[0]; + //g[26] = w[1]; + //g[27] = w[1]; + g[28] = cpz[0] * g[24]; + g[29] = cpz[1] * g[25]; + g[30] = cpz[2] * g[26]; + g[31] = cpz[3] * g[27]; + g[32] = cpz[0] * g[28] + b01[0] * g[24]; + g[33] = cpz[1] * g[29] + b01[1] * g[25]; + g[34] = cpz[2] * g[30] + b01[2] * g[26]; + g[35] = cpz[3] * g[31] + b01[3] * g[27]; } -void CINTg0_2e_il2d4d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) + +static inline void _srg0_2d4d_0003(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) { - CINTg0_2e_2d_simd1(g, bc, envs); - CINTg0_il_4d_simd1(g, envs); + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double 
*cpz = bc->c0pz; + double *b01 = bc->b01; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[8] = cpx[0] * cpx[0] + b01[0]; + g[9] = cpx[1] * cpx[1] + b01[1]; + g[10] = cpx[2] * cpx[2] + b01[2]; + g[11] = cpx[3] * cpx[3] + b01[3]; + g[12] = cpx[0] * (g[8] + 2 * b01[0]); + g[13] = cpx[1] * (g[9] + 2 * b01[1]); + g[14] = cpx[2] * (g[10] + 2 * b01[2]); + g[15] = cpx[3] * (g[11] + 2 * b01[3]); + g[16] = 1; + g[17] = 1; + g[18] = 1; + g[19] = 1; + g[20] = cpy[0]; + g[21] = cpy[1]; + g[22] = cpy[2]; + g[23] = cpy[3]; + g[24] = cpy[0] * cpy[0] + b01[0]; + g[25] = cpy[1] * cpy[1] + b01[1]; + g[26] = cpy[2] * cpy[2] + b01[2]; + g[27] = cpy[3] * cpy[3] + b01[3]; + g[28] = cpy[0] * (g[24] + 2 * b01[0]); + g[29] = cpy[1] * (g[25] + 2 * b01[1]); + g[30] = cpy[2] * (g[26] + 2 * b01[2]); + g[31] = cpy[3] * (g[27] + 2 * b01[3]); + //g[32] = w[0]; + //g[33] = w[0]; + //g[34] = w[1]; + //g[35] = w[1]; + g[36] = cpz[0] * g[32]; + g[37] = cpz[1] * g[33]; + g[38] = cpz[2] * g[34]; + g[39] = cpz[3] * g[35]; + g[40] = cpz[0] * g[36] + b01[0] * g[32]; + g[41] = cpz[1] * g[37] + b01[1] * g[33]; + g[42] = cpz[2] * g[38] + b01[2] * g[34]; + g[43] = cpz[3] * g[39] + b01[3] * g[35]; + g[44] = cpz[0] * g[40] + 2 * b01[0] * g[36]; + g[45] = cpz[1] * g[41] + 2 * b01[1] * g[37]; + g[46] = cpz[2] * g[42] + 2 * b01[2] * g[38]; + g[47] = cpz[3] * g[43] + 2 * b01[3] * g[39]; } -/* - * g[i,k,l,j] = < ik | lj > = ( i j | k l ) - */ -int CINTg0_2e_simd1(double *g, double *cutoff, - Rys2eT *bc, CINTEnvVars *envs, int idsimd) +static inline void _srg0_2d4d_0010(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) { - int nroots = envs->nrys_roots; - double aij, akl, a0, a1, fac1; - double x; - double *rij = envs->rij; - double *rkl = envs->rkl; - double rijrkl[3]; - double rijrx[3]; - double rklrx[3]; - double *u = bc->u; - double *w = bc->w; - - int i; - aij = envs->ai[idsimd] + envs->aj[idsimd]; - akl = envs->ak[idsimd] + 
envs->al[idsimd]; - a1 = aij * akl; - a0 = a1 / (aij + akl); -#ifdef WITH_RANGE_COULOMB - const double omega = envs->env[PTR_RANGE_OMEGA]; - double theta = 1; - if (omega != 0) { - theta = omega * omega / (omega * omega + a0); - if (omega > 0) { // long-range part of range-separated Coulomb - a0 *= theta; - } - } -#endif - fac1 = sqrt(a0 / (a1 * a1 * a1)) * envs->fac[idsimd]; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + g[0] = 1; + g[1] = 1; + g[2] = cpx[0]; + g[3] = cpx[1]; + g[4] = 1; + g[5] = 1; + g[6] = cpy[0]; + g[7] = cpy[1]; + //g[8] = w[0]; + //g[9] = w[0]; + g[10] = cpz[0] * g[8]; + g[11] = cpz[1] * g[9]; +} - rijrkl[0] = rij[0*SIMDD+idsimd] - rkl[0*SIMDD+idsimd]; - rijrkl[1] = rij[1*SIMDD+idsimd] - rkl[1*SIMDD+idsimd]; - rijrkl[2] = rij[2*SIMDD+idsimd] - rkl[2*SIMDD+idsimd]; - x = a0 * SQUARE(rijrkl); +static inline void _srg0_2d4d_0011(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b01 = bc->b01; + double xkxl = envs->rkrl[0]; + double ykyl = envs->rkrl[1]; + double zkzl = envs->rkrl[2]; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[8] = cpx[0]; + g[9] = cpx[1]; + g[10] = cpx[2]; + g[11] = cpx[3]; + g[12] = cpx[0] * (xkxl + cpx[0]) + b01[0]; + g[13] = cpx[1] * (xkxl + cpx[1]) + b01[1]; + g[14] = cpx[2] * (xkxl + cpx[2]) + b01[2]; + g[15] = cpx[3] * (xkxl + cpx[3]) + b01[3]; + g[4] = xkxl + cpx[0]; + g[5] = xkxl + cpx[1]; + g[6] = xkxl + cpx[2]; + g[7] = xkxl + cpx[3]; + g[24] = 1; + g[25] = 1; + g[26] = 1; + g[27] = 1; + g[32] = cpy[0]; + g[33] = cpy[1]; + g[34] = cpy[2]; + g[35] = cpy[3]; + g[36] = cpy[0] * (ykyl + cpy[0]) + b01[0]; + g[37] = cpy[1] * (ykyl + cpy[1]) + b01[1]; + g[38] = cpy[2] * (ykyl + cpy[2]) + b01[2]; + g[39] = cpy[3] * (ykyl + cpy[3]) + b01[3]; + g[28] = ykyl + cpy[0]; + g[29] = ykyl + cpy[1]; + g[30] = ykyl + cpy[2]; + g[31] = ykyl + cpy[3]; + //g[48] = w[0]; + //g[49] = w[0]; + //g[50] = w[1]; + 
//g[51] = w[1]; + g[56] = cpz[0] * g[48]; + g[57] = cpz[1] * g[49]; + g[58] = cpz[2] * g[50]; + g[59] = cpz[3] * g[51]; + g[60] = g[56] * (zkzl + cpz[0]) + b01[0] * g[48]; + g[61] = g[57] * (zkzl + cpz[1]) + b01[1] * g[49]; + g[62] = g[58] * (zkzl + cpz[2]) + b01[2] * g[50]; + g[63] = g[59] * (zkzl + cpz[3]) + b01[3] * g[51]; + g[52] = g[48] * (zkzl + cpz[0]); + g[53] = g[49] * (zkzl + cpz[1]); + g[54] = g[50] * (zkzl + cpz[2]); + g[55] = g[51] * (zkzl + cpz[3]); +} -#ifdef WITH_RANGE_COULOMB - if (omega < 0) { // short-range part of range-separated Coulomb - // very small erfc() leads to ~0 weights. They can cause - // numerical issue in sr_rys_roots. - if (theta * x > cutoff[idsimd] || theta * x > EXPCUTOFF_SR) { - for (i = 0; i < envs->g_size * 3; i++) { - g[i] = 0; - } - return 0; - } - CINTsr_rys_roots(nroots, x, sqrt(theta), u, w); - } else { - CINTrys_roots(nroots, x, u, w); - } -#else - CINTrys_roots(nroots, x, u, w); -#endif +static inline void _srg0_2d4d_0012(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b01 = bc->b01; + double xkxl = envs->rkrl[0]; + double ykyl = envs->rkrl[1]; + double zkzl = envs->rkrl[2]; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[8] = cpx[0]; + g[9] = cpx[1]; + g[10] = cpx[2]; + g[11] = cpx[3]; + g[16] = cpx[0] * cpx[0] + b01[0]; + g[17] = cpx[1] * cpx[1] + b01[1]; + g[18] = cpx[2] * cpx[2] + b01[2]; + g[19] = cpx[3] * cpx[3] + b01[3]; + g[20] = g[16] * (xkxl + cpx[0]) + cpx[0] * 2 * b01[0]; + g[21] = g[17] * (xkxl + cpx[1]) + cpx[1] * 2 * b01[1]; + g[22] = g[18] * (xkxl + cpx[2]) + cpx[2] * 2 * b01[2]; + g[23] = g[19] * (xkxl + cpx[3]) + cpx[3] * 2 * b01[3]; + g[12] = cpx[0] * (xkxl + cpx[0]) + b01[0]; + g[13] = cpx[1] * (xkxl + cpx[1]) + b01[1]; + g[14] = cpx[2] * (xkxl + cpx[2]) + b01[2]; + g[15] = cpx[3] * (xkxl + cpx[3]) + b01[3]; + g[4] = xkxl + cpx[0]; + g[5] = xkxl + cpx[1]; + g[6] = xkxl + cpx[2]; + g[7] = xkxl + cpx[3]; 
+ g[32] = 1; + g[33] = 1; + g[34] = 1; + g[35] = 1; + g[40] = cpy[0]; + g[41] = cpy[1]; + g[42] = cpy[2]; + g[43] = cpy[3]; + g[48] = cpy[0] * cpy[0] + b01[0]; + g[49] = cpy[1] * cpy[1] + b01[1]; + g[50] = cpy[2] * cpy[2] + b01[2]; + g[51] = cpy[3] * cpy[3] + b01[3]; + g[52] = g[48] * (ykyl + cpy[0]) + cpy[0] * 2 * b01[0]; + g[53] = g[49] * (ykyl + cpy[1]) + cpy[1] * 2 * b01[1]; + g[54] = g[50] * (ykyl + cpy[2]) + cpy[2] * 2 * b01[2]; + g[55] = g[51] * (ykyl + cpy[3]) + cpy[3] * 2 * b01[3]; + g[44] = cpy[0] * (ykyl + cpy[0]) + b01[0]; + g[45] = cpy[1] * (ykyl + cpy[1]) + b01[1]; + g[46] = cpy[2] * (ykyl + cpy[2]) + b01[2]; + g[47] = cpy[3] * (ykyl + cpy[3]) + b01[3]; + g[36] = ykyl + cpy[0]; + g[37] = ykyl + cpy[1]; + g[38] = ykyl + cpy[2]; + g[39] = ykyl + cpy[3]; + //g[64] = w[0]; + //g[65] = w[0]; + //g[66] = w[1]; + //g[67] = w[1]; + g[72] = cpz[0] * g[64]; + g[73] = cpz[1] * g[65]; + g[74] = cpz[2] * g[66]; + g[75] = cpz[3] * g[67]; + g[80] = cpz[0] * g[72] + b01[0] * g[64]; + g[81] = cpz[1] * g[73] + b01[1] * g[65]; + g[82] = cpz[2] * g[74] + b01[2] * g[66]; + g[83] = cpz[3] * g[75] + b01[3] * g[67]; + g[84] = g[80] * (zkzl + cpz[0]) + 2 * b01[0] * g[72]; + g[85] = g[81] * (zkzl + cpz[1]) + 2 * b01[1] * g[73]; + g[86] = g[82] * (zkzl + cpz[2]) + 2 * b01[2] * g[74]; + g[87] = g[83] * (zkzl + cpz[3]) + 2 * b01[3] * g[75]; + g[76] = g[72] * (zkzl + cpz[0]) + b01[0] * g[64]; + g[77] = g[73] * (zkzl + cpz[1]) + b01[1] * g[65]; + g[78] = g[74] * (zkzl + cpz[2]) + b01[2] * g[66]; + g[79] = g[75] * (zkzl + cpz[3]) + b01[3] * g[67]; + g[68] = g[64] * (zkzl + cpz[0]); + g[69] = g[65] * (zkzl + cpz[1]); + g[70] = g[66] * (zkzl + cpz[2]); + g[71] = g[67] * (zkzl + cpz[3]); +} - double *gx = g; - double *gy = g + envs->g_size; - double *gz = g + envs->g_size * 2; - for (i = 0; i < nroots; i++) { - gx[i] = 1; - gy[i] = 1; - gz[i] = w[i] * fac1; - } - if (envs->g_size == 1) { - return 1; - } +static inline void _srg0_2d4d_0020(double *restrict g, Rys2eT *bc, CINTEnvVars 
*envs) +{ + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b01 = bc->b01; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[8] = cpx[0] * cpx[0] + b01[0]; + g[9] = cpx[1] * cpx[1] + b01[1]; + g[10] = cpx[2] * cpx[2] + b01[2]; + g[11] = cpx[3] * cpx[3] + b01[3]; + g[12] = 1; + g[13] = 1; + g[14] = 1; + g[15] = 1; + g[16] = cpy[0]; + g[17] = cpy[1]; + g[18] = cpy[2]; + g[19] = cpy[3]; + g[20] = cpy[0] * cpy[0] + b01[0]; + g[21] = cpy[1] * cpy[1] + b01[1]; + g[22] = cpy[2] * cpy[2] + b01[2]; + g[23] = cpy[3] * cpy[3] + b01[3]; + //g[24] = w[0]; + //g[25] = w[0]; + //g[26] = w[1]; + //g[27] = w[1]; + g[28] = cpz[0] * g[24]; + g[29] = cpz[1] * g[25]; + g[30] = cpz[2] * g[26]; + g[31] = cpz[3] * g[27]; + g[32] = cpz[0] * g[28] + b01[0] * g[24]; + g[33] = cpz[1] * g[29] + b01[1] * g[25]; + g[34] = cpz[2] * g[30] + b01[2] * g[26]; + g[35] = cpz[3] * g[31] + b01[3] * g[27]; +} -#ifdef WITH_RANGE_COULOMB - if (omega > 0) { - /* u[:] = tau^2 / (1 - tau^2) - * transform u[:] to theta^-1 tau^2 / (theta^-1 - tau^2) - * so the rest code can be reused. 
- */ - for (i = 0; i < nroots; i++) { - u[i] /= u[i] + 1 - u[i] * theta; - } - } -#endif +static inline void _srg0_2d4d_0021(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b01 = bc->b01; + double xkxl = envs->rkrl[0]; + double ykyl = envs->rkrl[1]; + double zkzl = envs->rkrl[2]; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[8] = cpx[0] * cpx[0] + b01[0]; + g[9] = cpx[1] * cpx[1] + b01[1]; + g[10] = cpx[2] * cpx[2] + b01[2]; + g[11] = cpx[3] * cpx[3] + b01[3]; + g[16] = xkxl + cpx[0]; + g[17] = xkxl + cpx[1]; + g[18] = xkxl + cpx[2]; + g[19] = xkxl + cpx[3]; + g[20] = cpx[0] * (xkxl + cpx[0]) + b01[0]; + g[21] = cpx[1] * (xkxl + cpx[1]) + b01[1]; + g[22] = cpx[2] * (xkxl + cpx[2]) + b01[2]; + g[23] = cpx[3] * (xkxl + cpx[3]) + b01[3]; + g[24] = g[8] * (xkxl + cpx[0]) + cpx[0] * 2 * b01[0]; + g[25] = g[9] * (xkxl + cpx[1]) + cpx[1] * 2 * b01[1]; + g[26] = g[10] * (xkxl + cpx[2]) + cpx[2] * 2 * b01[2]; + g[27] = g[11] * (xkxl + cpx[3]) + cpx[3] * 2 * b01[3]; + g[32] = 1; + g[33] = 1; + g[34] = 1; + g[35] = 1; + g[36] = cpy[0]; + g[37] = cpy[1]; + g[38] = cpy[2]; + g[39] = cpy[3]; + g[40] = cpy[0] * cpy[0] + b01[0]; + g[41] = cpy[1] * cpy[1] + b01[1]; + g[42] = cpy[2] * cpy[2] + b01[2]; + g[43] = cpy[3] * cpy[3] + b01[3]; + g[48] = ykyl + cpy[0]; + g[49] = ykyl + cpy[1]; + g[50] = ykyl + cpy[2]; + g[51] = ykyl + cpy[3]; + g[52] = cpy[0] * (ykyl + cpy[0]) + b01[0]; + g[53] = cpy[1] * (ykyl + cpy[1]) + b01[1]; + g[54] = cpy[2] * (ykyl + cpy[2]) + b01[2]; + g[55] = cpy[3] * (ykyl + cpy[3]) + b01[3]; + g[56] = g[40] * (ykyl + cpy[0]) + cpy[0] * 2 * b01[0]; + g[57] = g[41] * (ykyl + cpy[1]) + cpy[1] * 2 * b01[1]; + g[58] = g[42] * (ykyl + cpy[2]) + cpy[2] * 2 * b01[2]; + g[59] = g[43] * (ykyl + cpy[3]) + cpy[3] * 2 * b01[3]; + //g[64] = w[0]; + //g[65] = w[0]; + //g[66] = w[1]; + //g[67] = w[1]; + g[68] = 
cpz[0] * g[64]; + g[69] = cpz[1] * g[65]; + g[70] = cpz[2] * g[66]; + g[71] = cpz[3] * g[67]; + g[72] = cpz[0] * g[68] + b01[0] * g[64]; + g[73] = cpz[1] * g[69] + b01[1] * g[65]; + g[74] = cpz[2] * g[70] + b01[2] * g[66]; + g[75] = cpz[3] * g[71] + b01[3] * g[67]; + g[80] = g[64] * (zkzl + cpz[0]); + g[81] = g[65] * (zkzl + cpz[1]); + g[82] = g[66] * (zkzl + cpz[2]); + g[83] = g[67] * (zkzl + cpz[3]); + g[84] = g[68] * (zkzl + cpz[0]) + b01[0] * g[64]; + g[85] = g[69] * (zkzl + cpz[1]) + b01[1] * g[65]; + g[86] = g[70] * (zkzl + cpz[2]) + b01[2] * g[66]; + g[87] = g[71] * (zkzl + cpz[3]) + b01[3] * g[67]; + g[88] = g[72] * (zkzl + cpz[0]) + 2 * b01[0] * g[68]; + g[89] = g[73] * (zkzl + cpz[1]) + 2 * b01[1] * g[69]; + g[90] = g[74] * (zkzl + cpz[2]) + 2 * b01[2] * g[70]; + g[91] = g[75] * (zkzl + cpz[3]) + 2 * b01[3] * g[71]; +} - double u2, div, tmp1, tmp2, tmp3, tmp4; - double *b00 = bc->b00; - double *b10 = bc->b10; +static inline void _srg0_2d4d_0030(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; double *b01 = bc->b01; - double *c00x = bc->c00x; - double *c00y = bc->c00y; - double *c00z = bc->c00z; - double *c0px = bc->c0px; - double *c0py = bc->c0py; - double *c0pz = bc->c0pz; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[8] = cpx[0] * cpx[0] + b01[0]; + g[9] = cpx[1] * cpx[1] + b01[1]; + g[10] = cpx[2] * cpx[2] + b01[2]; + g[11] = cpx[3] * cpx[3] + b01[3]; + g[12] = cpx[0] * (g[8] + 2 * b01[0]); + g[13] = cpx[1] * (g[9] + 2 * b01[1]); + g[14] = cpx[2] * (g[10] + 2 * b01[2]); + g[15] = cpx[3] * (g[11] + 2 * b01[3]); + g[16] = 1; + g[17] = 1; + g[18] = 1; + g[19] = 1; + g[20] = cpy[0]; + g[21] = cpy[1]; + g[22] = cpy[2]; + g[23] = cpy[3]; + g[24] = cpy[0] * cpy[0] + b01[0]; + g[25] = cpy[1] * cpy[1] + b01[1]; + g[26] = cpy[2] * cpy[2] + b01[2]; + g[27] = cpy[3] * cpy[3] + b01[3]; + g[28] = cpy[0] * (g[24] 
+ 2 * b01[0]); + g[29] = cpy[1] * (g[25] + 2 * b01[1]); + g[30] = cpy[2] * (g[26] + 2 * b01[2]); + g[31] = cpy[3] * (g[27] + 2 * b01[3]); + //g[32] = w[0]; + //g[33] = w[0]; + //g[34] = w[1]; + //g[35] = w[1]; + g[36] = cpz[0] * g[32]; + g[37] = cpz[1] * g[33]; + g[38] = cpz[2] * g[34]; + g[39] = cpz[3] * g[35]; + g[40] = cpz[0] * g[36] + b01[0] * g[32]; + g[41] = cpz[1] * g[37] + b01[1] * g[33]; + g[42] = cpz[2] * g[38] + b01[2] * g[34]; + g[43] = cpz[3] * g[39] + b01[3] * g[35]; + g[44] = cpz[0] * g[40] + 2 * b01[0] * g[36]; + g[45] = cpz[1] * g[41] + 2 * b01[1] * g[37]; + g[46] = cpz[2] * g[42] + 2 * b01[2] * g[38]; + g[47] = cpz[3] * g[43] + 2 * b01[3] * g[39]; +} - rijrx[0] = rij[0*SIMDD+idsimd] - envs->rx_in_rijrx[0]; - rijrx[1] = rij[1*SIMDD+idsimd] - envs->rx_in_rijrx[1]; - rijrx[2] = rij[2*SIMDD+idsimd] - envs->rx_in_rijrx[2]; - rklrx[0] = rkl[0*SIMDD+idsimd] - envs->rx_in_rklrx[0]; - rklrx[1] = rkl[1*SIMDD+idsimd] - envs->rx_in_rklrx[1]; - rklrx[2] = rkl[2*SIMDD+idsimd] - envs->rx_in_rklrx[2]; - for (i = 0; i < nroots; i++) { +static inline void _srg0_2d4d_0100(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + g[0] = 1; + g[1] = 1; + g[2] = c0x[0]; + g[3] = c0x[1]; + g[4] = 1; + g[5] = 1; + g[6] = c0y[0]; + g[7] = c0y[1]; + //g[8] = w[0]; + //g[9] = w[0]; + g[10] = c0z[0] * g[8]; + g[11] = c0z[1] * g[9]; +} + +static inline void _srg0_2d4d_0101(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[8] = c0x[0]; + g[9] = c0x[1]; + g[10] = c0x[2]; + g[11] = c0x[3]; + g[12] = cpx[0] * c0x[0] + b00[0]; + g[13] = cpx[1] * c0x[1] + b00[1]; + g[14] = cpx[2] * c0x[2] + b00[2]; + g[15] = 
cpx[3] * c0x[3] + b00[3]; + g[16] = 1; + g[17] = 1; + g[18] = 1; + g[19] = 1; + g[20] = cpy[0]; + g[21] = cpy[1]; + g[22] = cpy[2]; + g[23] = cpy[3]; + g[24] = c0y[0]; + g[25] = c0y[1]; + g[26] = c0y[2]; + g[27] = c0y[3]; + g[28] = cpy[0] * c0y[0] + b00[0]; + g[29] = cpy[1] * c0y[1] + b00[1]; + g[30] = cpy[2] * c0y[2] + b00[2]; + g[31] = cpy[3] * c0y[3] + b00[3]; + //g[32] = w[0]; + //g[33] = w[0]; + //g[34] = w[1]; + //g[35] = w[1]; + g[36] = cpz[0] * g[32]; + g[37] = cpz[1] * g[33]; + g[38] = cpz[2] * g[34]; + g[39] = cpz[3] * g[35]; + g[40] = c0z[0] * g[32]; + g[41] = c0z[1] * g[33]; + g[42] = c0z[2] * g[34]; + g[43] = c0z[3] * g[35]; + g[44] = cpz[0] * g[40] + b00[0] * g[32]; + g[45] = cpz[1] * g[41] + b00[1] * g[33]; + g[46] = cpz[2] * g[42] + b00[2] * g[34]; + g[47] = cpz[3] * g[43] + b00[3] * g[35]; +} + +static inline void _srg0_2d4d_0102(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b01 = bc->b01; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[12] = c0x[0]; + g[13] = c0x[1]; + g[14] = c0x[2]; + g[15] = c0x[3]; + g[8] = cpx[0] * cpx[0] + b01[0]; + g[9] = cpx[1] * cpx[1] + b01[1]; + g[10] = cpx[2] * cpx[2] + b01[2]; + g[11] = cpx[3] * cpx[3] + b01[3]; + g[16] = cpx[0] * c0x[0] + b00[0]; + g[17] = cpx[1] * c0x[1] + b00[1]; + g[18] = cpx[2] * c0x[2] + b00[2]; + g[19] = cpx[3] * c0x[3] + b00[3]; + g[20] = cpx[0] * (g[16] + b00[0]) + b01[0] * c0x[0]; + g[21] = cpx[1] * (g[17] + b00[1]) + b01[1] * c0x[1]; + g[22] = cpx[2] * (g[18] + b00[2]) + b01[2] * c0x[2]; + g[23] = cpx[3] * (g[19] + b00[3]) + b01[3] * c0x[3]; + g[24] = 1; + g[25] = 1; + g[26] = 1; + g[27] = 1; + g[28] = cpy[0]; + g[29] = cpy[1]; + g[30] = cpy[2]; + g[31] = cpy[3]; + g[36] = c0y[0]; + g[37] = c0y[1]; + g[38] = c0y[2]; + g[39] 
= c0y[3]; + g[32] = cpy[0] * cpy[0] + b01[0]; + g[33] = cpy[1] * cpy[1] + b01[1]; + g[34] = cpy[2] * cpy[2] + b01[2]; + g[35] = cpy[3] * cpy[3] + b01[3]; + g[40] = cpy[0] * c0y[0] + b00[0]; + g[41] = cpy[1] * c0y[1] + b00[1]; + g[42] = cpy[2] * c0y[2] + b00[2]; + g[43] = cpy[3] * c0y[3] + b00[3]; + g[44] = cpy[0] * (g[40] + b00[0]) + b01[0] * c0y[0]; + g[45] = cpy[1] * (g[41] + b00[1]) + b01[1] * c0y[1]; + g[46] = cpy[2] * (g[42] + b00[2]) + b01[2] * c0y[2]; + g[47] = cpy[3] * (g[43] + b00[3]) + b01[3] * c0y[3]; + //g[48] = w[0]; + //g[49] = w[0]; + //g[50] = w[1]; + //g[51] = w[1]; + g[52] = cpz[0] * g[48]; + g[53] = cpz[1] * g[49]; + g[54] = cpz[2] * g[50]; + g[55] = cpz[3] * g[51]; + g[60] = c0z[0] * g[48]; + g[61] = c0z[1] * g[49]; + g[62] = c0z[2] * g[50]; + g[63] = c0z[3] * g[51]; + g[56] = cpz[0] * g[52] + b01[0] * g[48]; + g[57] = cpz[1] * g[53] + b01[1] * g[49]; + g[58] = cpz[2] * g[54] + b01[2] * g[50]; + g[59] = cpz[3] * g[55] + b01[3] * g[51]; + g[64] = cpz[0] * g[60] + b00[0] * g[48]; + g[65] = cpz[1] * g[61] + b00[1] * g[49]; + g[66] = cpz[2] * g[62] + b00[2] * g[50]; + g[67] = cpz[3] * g[63] + b00[3] * g[51]; + g[68] = cpz[0] * g[64] + b01[0] * g[60] + b00[0] * g[52]; + g[69] = cpz[1] * g[65] + b01[1] * g[61] + b00[1] * g[53]; + g[70] = cpz[2] * g[66] + b01[2] * g[62] + b00[2] * g[54]; + g[71] = cpz[3] * g[67] + b01[3] * g[63] + b00[3] * g[55]; +} + +static inline void _srg0_2d4d_0110(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[8] = c0x[0]; + g[9] = c0x[1]; + g[10] = c0x[2]; + g[11] = c0x[3]; + g[12] = cpx[0] * c0x[0] + b00[0]; + g[13] = cpx[1] * c0x[1] + b00[1]; + g[14] = cpx[2] * c0x[2] + b00[2]; + g[15] = cpx[3] * c0x[3] + b00[3]; + g[16] = 
1; + g[17] = 1; + g[18] = 1; + g[19] = 1; + g[20] = cpy[0]; + g[21] = cpy[1]; + g[22] = cpy[2]; + g[23] = cpy[3]; + g[24] = c0y[0]; + g[25] = c0y[1]; + g[26] = c0y[2]; + g[27] = c0y[3]; + g[28] = cpy[0] * c0y[0] + b00[0]; + g[29] = cpy[1] * c0y[1] + b00[1]; + g[30] = cpy[2] * c0y[2] + b00[2]; + g[31] = cpy[3] * c0y[3] + b00[3]; + //g[32] = w[0]; + //g[33] = w[0]; + //g[34] = w[1]; + //g[35] = w[1]; + g[36] = cpz[0] * g[32]; + g[37] = cpz[1] * g[33]; + g[38] = cpz[2] * g[34]; + g[39] = cpz[3] * g[35]; + g[40] = c0z[0] * g[32]; + g[41] = c0z[1] * g[33]; + g[42] = c0z[2] * g[34]; + g[43] = c0z[3] * g[35]; + g[44] = cpz[0] * g[40] + b00[0] * g[32]; + g[45] = cpz[1] * g[41] + b00[1] * g[33]; + g[46] = cpz[2] * g[42] + b00[2] * g[34]; + g[47] = cpz[3] * g[43] + b00[3] * g[35]; +} + +static inline void _srg0_2d4d_0111(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b01 = bc->b01; + double xkxl = envs->rkrl[0]; + double ykyl = envs->rkrl[1]; + double zkzl = envs->rkrl[2]; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[24] = c0x[0]; + g[25] = c0x[1]; + g[26] = c0x[2]; + g[27] = c0x[3]; + g[8] = cpx[0]; + g[9] = cpx[1]; + g[10] = cpx[2]; + g[11] = cpx[3]; + g[32] = cpx[0] * c0x[0] + b00[0]; + g[33] = cpx[1] * c0x[1] + b00[1]; + g[34] = cpx[2] * c0x[2] + b00[2]; + g[35] = cpx[3] * c0x[3] + b00[3]; + g[12] = cpx[0] * (xkxl + cpx[0]) + b01[0]; + g[13] = cpx[1] * (xkxl + cpx[1]) + b01[1]; + g[14] = cpx[2] * (xkxl + cpx[2]) + b01[2]; + g[15] = cpx[3] * (xkxl + cpx[3]) + b01[3]; + g[36] = g[32] * (xkxl + cpx[0]) + cpx[0] * b00[0] + b01[0] * c0x[0]; + g[37] = g[33] * (xkxl + cpx[1]) + cpx[1] * b00[1] + b01[1] * c0x[1]; + g[38] = g[34] * (xkxl + cpx[2]) + cpx[2] * b00[2] + b01[2] * c0x[2]; + g[39] = g[35] * (xkxl + cpx[3]) + cpx[3] * b00[3] + b01[3] * c0x[3]; + g[4] = xkxl + cpx[0]; + 
g[5] = xkxl + cpx[1]; + g[6] = xkxl + cpx[2]; + g[7] = xkxl + cpx[3]; + g[28] = c0x[0] * (xkxl + cpx[0]) + b00[0]; + g[29] = c0x[1] * (xkxl + cpx[1]) + b00[1]; + g[30] = c0x[2] * (xkxl + cpx[2]) + b00[2]; + g[31] = c0x[3] * (xkxl + cpx[3]) + b00[3]; + g[48] = 1; + g[49] = 1; + g[50] = 1; + g[51] = 1; + g[72] = c0y[0]; + g[73] = c0y[1]; + g[74] = c0y[2]; + g[75] = c0y[3]; + g[56] = cpy[0]; + g[57] = cpy[1]; + g[58] = cpy[2]; + g[59] = cpy[3]; + g[80] = cpy[0] * c0y[0] + b00[0]; + g[81] = cpy[1] * c0y[1] + b00[1]; + g[82] = cpy[2] * c0y[2] + b00[2]; + g[83] = cpy[3] * c0y[3] + b00[3]; + g[60] = cpy[0] * (ykyl + cpy[0]) + b01[0]; + g[61] = cpy[1] * (ykyl + cpy[1]) + b01[1]; + g[62] = cpy[2] * (ykyl + cpy[2]) + b01[2]; + g[63] = cpy[3] * (ykyl + cpy[3]) + b01[3]; + g[84] = g[80] * (ykyl + cpy[0]) + cpy[0] * b00[0] + b01[0] * c0y[0]; + g[85] = g[81] * (ykyl + cpy[1]) + cpy[1] * b00[1] + b01[1] * c0y[1]; + g[86] = g[82] * (ykyl + cpy[2]) + cpy[2] * b00[2] + b01[2] * c0y[2]; + g[87] = g[83] * (ykyl + cpy[3]) + cpy[3] * b00[3] + b01[3] * c0y[3]; + g[52] = ykyl + cpy[0]; + g[53] = ykyl + cpy[1]; + g[54] = ykyl + cpy[2]; + g[55] = ykyl + cpy[3]; + g[76] = c0y[0] * (ykyl + cpy[0]) + b00[0]; + g[77] = c0y[1] * (ykyl + cpy[1]) + b00[1]; + g[78] = c0y[2] * (ykyl + cpy[2]) + b00[2]; + g[79] = c0y[3] * (ykyl + cpy[3]) + b00[3]; + //g[96] = w[0]; + //g[97] = w[0]; + //g[98] = w[1]; + //g[99] = w[1]; + g[120] = c0z[0] * g[96]; + g[121] = c0z[1] * g[97]; + g[122] = c0z[2] * g[98]; + g[123] = c0z[3] * g[99]; + g[104] = cpz[0] * g[96]; + g[105] = cpz[1] * g[97]; + g[106] = cpz[2] * g[98]; + g[107] = cpz[3] * g[99]; + g[128] = cpz[0] * g[120] + b00[0] * g[96]; + g[129] = cpz[1] * g[121] + b00[1] * g[97]; + g[130] = cpz[2] * g[122] + b00[2] * g[98]; + g[131] = cpz[3] * g[123] + b00[3] * g[99]; + g[108] = g[104] * (zkzl + cpz[0]) + b01[0] * g[96]; + g[109] = g[105] * (zkzl + cpz[1]) + b01[1] * g[97]; + g[110] = g[106] * (zkzl + cpz[2]) + b01[2] * g[98]; + g[111] = g[107] * (zkzl + cpz[3]) 
+ b01[3] * g[99]; + g[132] = g[128] * (zkzl + cpz[0]) + b01[0] * g[120] + b00[0] * g[104]; + g[133] = g[129] * (zkzl + cpz[1]) + b01[1] * g[121] + b00[1] * g[105]; + g[134] = g[130] * (zkzl + cpz[2]) + b01[2] * g[122] + b00[2] * g[106]; + g[135] = g[131] * (zkzl + cpz[3]) + b01[3] * g[123] + b00[3] * g[107]; + g[100] = g[96] * (zkzl + cpz[0]); + g[101] = g[97] * (zkzl + cpz[1]); + g[102] = g[98] * (zkzl + cpz[2]); + g[103] = g[99] * (zkzl + cpz[3]); + g[124] = g[120] * (zkzl + cpz[0]) + b00[0] * g[96]; + g[125] = g[121] * (zkzl + cpz[1]) + b00[1] * g[97]; + g[126] = g[122] * (zkzl + cpz[2]) + b00[2] * g[98]; + g[127] = g[123] * (zkzl + cpz[3]) + b00[3] * g[99]; +} + +static inline void _srg0_2d4d_0120(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b01 = bc->b01; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[12] = c0x[0]; + g[13] = c0x[1]; + g[14] = c0x[2]; + g[15] = c0x[3]; + g[8] = cpx[0] * cpx[0] + b01[0]; + g[9] = cpx[1] * cpx[1] + b01[1]; + g[10] = cpx[2] * cpx[2] + b01[2]; + g[11] = cpx[3] * cpx[3] + b01[3]; + g[16] = cpx[0] * c0x[0] + b00[0]; + g[17] = cpx[1] * c0x[1] + b00[1]; + g[18] = cpx[2] * c0x[2] + b00[2]; + g[19] = cpx[3] * c0x[3] + b00[3]; + g[20] = cpx[0] * (g[16] + b00[0]) + b01[0] * c0x[0]; + g[21] = cpx[1] * (g[17] + b00[1]) + b01[1] * c0x[1]; + g[22] = cpx[2] * (g[18] + b00[2]) + b01[2] * c0x[2]; + g[23] = cpx[3] * (g[19] + b00[3]) + b01[3] * c0x[3]; + g[24] = 1; + g[25] = 1; + g[26] = 1; + g[27] = 1; + g[28] = cpy[0]; + g[29] = cpy[1]; + g[30] = cpy[2]; + g[31] = cpy[3]; + g[36] = c0y[0]; + g[37] = c0y[1]; + g[38] = c0y[2]; + g[39] = c0y[3]; + g[32] = cpy[0] * cpy[0] + b01[0]; + g[33] = cpy[1] * cpy[1] + b01[1]; + g[34] = cpy[2] * cpy[2] + b01[2]; + g[35] = cpy[3] * cpy[3] + 
b01[3]; + g[40] = cpy[0] * c0y[0] + b00[0]; + g[41] = cpy[1] * c0y[1] + b00[1]; + g[42] = cpy[2] * c0y[2] + b00[2]; + g[43] = cpy[3] * c0y[3] + b00[3]; + g[44] = cpy[0] * (g[40] + b00[0]) + b01[0] * c0y[0]; + g[45] = cpy[1] * (g[41] + b00[1]) + b01[1] * c0y[1]; + g[46] = cpy[2] * (g[42] + b00[2]) + b01[2] * c0y[2]; + g[47] = cpy[3] * (g[43] + b00[3]) + b01[3] * c0y[3]; + //g[48] = w[0]; + //g[49] = w[0]; + //g[50] = w[1]; + //g[51] = w[1]; + g[52] = cpz[0] * g[48]; + g[53] = cpz[1] * g[49]; + g[54] = cpz[2] * g[50]; + g[55] = cpz[3] * g[51]; + g[60] = c0z[0] * g[48]; + g[61] = c0z[1] * g[49]; + g[62] = c0z[2] * g[50]; + g[63] = c0z[3] * g[51]; + g[56] = cpz[0] * g[52] + b01[0] * g[48]; + g[57] = cpz[1] * g[53] + b01[1] * g[49]; + g[58] = cpz[2] * g[54] + b01[2] * g[50]; + g[59] = cpz[3] * g[55] + b01[3] * g[51]; + g[64] = cpz[0] * g[60] + b00[0] * g[48]; + g[65] = cpz[1] * g[61] + b00[1] * g[49]; + g[66] = cpz[2] * g[62] + b00[2] * g[50]; + g[67] = cpz[3] * g[63] + b00[3] * g[51]; + g[68] = cpz[0] * g[64] + b01[0] * g[60] + b00[0] * g[52]; + g[69] = cpz[1] * g[65] + b01[1] * g[61] + b00[1] * g[53]; + g[70] = cpz[2] * g[66] + b01[2] * g[62] + b00[2] * g[54]; + g[71] = cpz[3] * g[67] + b01[3] * g[63] + b00[3] * g[55]; +} + +static inline void _srg0_2d4d_0200(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *b10 = bc->b10; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = c0x[0] * c0x[0] + b10[0]; + g[9] = c0x[1] * c0x[1] + b10[1]; + g[10] = c0x[2] * c0x[2] + b10[2]; + g[11] = c0x[3] * c0x[3] + b10[3]; + g[12] = 1; + g[13] = 1; + g[14] = 1; + g[15] = 1; + g[16] = c0y[0]; + g[17] = c0y[1]; + g[18] = c0y[2]; + g[19] = c0y[3]; + g[20] = c0y[0] * c0y[0] + b10[0]; + g[21] = c0y[1] * c0y[1] + b10[1]; + g[22] = c0y[2] * c0y[2] + b10[2]; + g[23] = c0y[3] * c0y[3] + b10[3]; + //g[24] = w[0]; + //g[25] = w[0]; + 
//g[26] = w[1]; + //g[27] = w[1]; + g[28] = c0z[0] * g[24]; + g[29] = c0z[1] * g[25]; + g[30] = c0z[2] * g[26]; + g[31] = c0z[3] * g[27]; + g[32] = c0z[0] * g[28] + b10[0] * g[24]; + g[33] = c0z[1] * g[29] + b10[1] * g[25]; + g[34] = c0z[2] * g[30] + b10[2] * g[26]; + g[35] = c0z[3] * g[31] + b10[3] * g[27]; +} + +static inline void _srg0_2d4d_0201(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b10 = bc->b10; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[8] = c0x[0]; + g[9] = c0x[1]; + g[10] = c0x[2]; + g[11] = c0x[3]; + g[16] = c0x[0] * c0x[0] + b10[0]; + g[17] = c0x[1] * c0x[1] + b10[1]; + g[18] = c0x[2] * c0x[2] + b10[2]; + g[19] = c0x[3] * c0x[3] + b10[3]; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[12] = cpx[0] * c0x[0] + b00[0]; + g[13] = cpx[1] * c0x[1] + b00[1]; + g[14] = cpx[2] * c0x[2] + b00[2]; + g[15] = cpx[3] * c0x[3] + b00[3]; + g[20] = c0x[0] * (g[12] + b00[0]) + b10[0] * cpx[0]; + g[21] = c0x[1] * (g[13] + b00[1]) + b10[1] * cpx[1]; + g[22] = c0x[2] * (g[14] + b00[2]) + b10[2] * cpx[2]; + g[23] = c0x[3] * (g[15] + b00[3]) + b10[3] * cpx[3]; + g[24] = 1; + g[25] = 1; + g[26] = 1; + g[27] = 1; + g[32] = c0y[0]; + g[33] = c0y[1]; + g[34] = c0y[2]; + g[35] = c0y[3]; + g[40] = c0y[0] * c0y[0] + b10[0]; + g[41] = c0y[1] * c0y[1] + b10[1]; + g[42] = c0y[2] * c0y[2] + b10[2]; + g[43] = c0y[3] * c0y[3] + b10[3]; + g[28] = cpy[0]; + g[29] = cpy[1]; + g[30] = cpy[2]; + g[31] = cpy[3]; + g[36] = cpy[0] * c0y[0] + b00[0]; + g[37] = cpy[1] * c0y[1] + b00[1]; + g[38] = cpy[2] * c0y[2] + b00[2]; + g[39] = cpy[3] * c0y[3] + b00[3]; + g[44] = c0y[0] * (g[36] + b00[0]) + b10[0] * cpy[0]; + g[45] = c0y[1] * (g[37] + b00[1]) + b10[1] * cpy[1]; + g[46] = c0y[2] * (g[38] + b00[2]) + b10[2] * cpy[2]; + g[47] = c0y[3] * (g[39] + b00[3]) + b10[3] 
* cpy[3]; + //g[48] = w[0]; + //g[49] = w[0]; + //g[50] = w[1]; + //g[51] = w[1]; + g[56] = c0z[0] * g[48]; + g[57] = c0z[1] * g[49]; + g[58] = c0z[2] * g[50]; + g[59] = c0z[3] * g[51]; + g[64] = c0z[0] * g[56] + b10[0] * g[48]; + g[65] = c0z[1] * g[57] + b10[1] * g[49]; + g[66] = c0z[2] * g[58] + b10[2] * g[50]; + g[67] = c0z[3] * g[59] + b10[3] * g[51]; + g[52] = cpz[0] * g[48]; + g[53] = cpz[1] * g[49]; + g[54] = cpz[2] * g[50]; + g[55] = cpz[3] * g[51]; + g[60] = cpz[0] * g[56] + b00[0] * g[48]; + g[61] = cpz[1] * g[57] + b00[1] * g[49]; + g[62] = cpz[2] * g[58] + b00[2] * g[50]; + g[63] = cpz[3] * g[59] + b00[3] * g[51]; + g[68] = c0z[0] * g[60] + b10[0] * g[52] + b00[0] * g[56]; + g[69] = c0z[1] * g[61] + b10[1] * g[53] + b00[1] * g[57]; + g[70] = c0z[2] * g[62] + b10[2] * g[54] + b00[2] * g[58]; + g[71] = c0z[3] * g[63] + b10[3] * g[55] + b00[3] * g[59]; +} + +static inline void _srg0_2d4d_0210(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b10 = bc->b10; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = cpx[0]; + g[5] = cpx[1]; + g[6] = cpx[2]; + g[7] = cpx[3]; + g[8] = c0x[0]; + g[9] = c0x[1]; + g[10] = c0x[2]; + g[11] = c0x[3]; + g[12] = cpx[0] * c0x[0] + b00[0]; + g[13] = cpx[1] * c0x[1] + b00[1]; + g[14] = cpx[2] * c0x[2] + b00[2]; + g[15] = cpx[3] * c0x[3] + b00[3]; + g[16] = c0x[0] * c0x[0] + b10[0]; + g[17] = c0x[1] * c0x[1] + b10[1]; + g[18] = c0x[2] * c0x[2] + b10[2]; + g[19] = c0x[3] * c0x[3] + b10[3]; + g[20] = c0x[0] * (g[12] + b00[0]) + b10[0] * cpx[0]; + g[21] = c0x[1] * (g[13] + b00[1]) + b10[1] * cpx[1]; + g[22] = c0x[2] * (g[14] + b00[2]) + b10[2] * cpx[2]; + g[23] = c0x[3] * (g[15] + b00[3]) + b10[3] * cpx[3]; + g[24] = 1; + g[25] = 1; + g[26] = 1; + g[27] = 1; + g[28] = cpy[0]; + g[29] = cpy[1]; + g[30] = cpy[2]; + g[31] = cpy[3]; + 
g[32] = c0y[0]; + g[33] = c0y[1]; + g[34] = c0y[2]; + g[35] = c0y[3]; + g[36] = cpy[0] * c0y[0] + b00[0]; + g[37] = cpy[1] * c0y[1] + b00[1]; + g[38] = cpy[2] * c0y[2] + b00[2]; + g[39] = cpy[3] * c0y[3] + b00[3]; + g[40] = c0y[0] * c0y[0] + b10[0]; + g[41] = c0y[1] * c0y[1] + b10[1]; + g[42] = c0y[2] * c0y[2] + b10[2]; + g[43] = c0y[3] * c0y[3] + b10[3]; + g[44] = c0y[0] * (g[36] + b00[0]) + b10[0] * cpy[0]; + g[45] = c0y[1] * (g[37] + b00[1]) + b10[1] * cpy[1]; + g[46] = c0y[2] * (g[38] + b00[2]) + b10[2] * cpy[2]; + g[47] = c0y[3] * (g[39] + b00[3]) + b10[3] * cpy[3]; + //g[48] = w[0]; + //g[49] = w[0]; + //g[50] = w[1]; + //g[51] = w[1]; + g[52] = cpz[0] * g[48]; + g[53] = cpz[1] * g[49]; + g[54] = cpz[2] * g[50]; + g[55] = cpz[3] * g[51]; + g[56] = c0z[0] * g[48]; + g[57] = c0z[1] * g[49]; + g[58] = c0z[2] * g[50]; + g[59] = c0z[3] * g[51]; + g[60] = cpz[0] * g[56] + b00[0] * g[48]; + g[61] = cpz[1] * g[57] + b00[1] * g[49]; + g[62] = cpz[2] * g[58] + b00[2] * g[50]; + g[63] = cpz[3] * g[59] + b00[3] * g[51]; + g[64] = c0z[0] * g[56] + b10[0] * g[48]; + g[65] = c0z[1] * g[57] + b10[1] * g[49]; + g[66] = c0z[2] * g[58] + b10[2] * g[50]; + g[67] = c0z[3] * g[59] + b10[3] * g[51]; + g[68] = c0z[0] * g[60] + b10[0] * g[52] + b00[0] * g[56]; + g[69] = c0z[1] * g[61] + b10[1] * g[53] + b00[1] * g[57]; + g[70] = c0z[2] * g[62] + b10[2] * g[54] + b00[2] * g[58]; + g[71] = c0z[3] * g[63] + b10[3] * g[55] + b00[3] * g[59]; +} + +static inline void _srg0_2d4d_0300(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b10 = bc->b10; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = c0x[0] * c0x[0] + b10[0]; + g[9] = c0x[1] * c0x[1] + b10[1]; + g[10] = c0x[2] * c0x[2] + b10[2]; + g[11] = c0x[3] * c0x[3] + b10[3]; + g[12] = c0x[0] * 
(g[8] + 2 * b10[0]); + g[13] = c0x[1] * (g[9] + 2 * b10[1]); + g[14] = c0x[2] * (g[10] + 2 * b10[2]); + g[15] = c0x[3] * (g[11] + 2 * b10[3]); + g[16] = 1; + g[17] = 1; + g[18] = 1; + g[19] = 1; + g[20] = c0y[0]; + g[21] = c0y[1]; + g[22] = c0y[2]; + g[23] = c0y[3]; + g[24] = c0y[0] * c0y[0] + b10[0]; + g[25] = c0y[1] * c0y[1] + b10[1]; + g[26] = c0y[2] * c0y[2] + b10[2]; + g[27] = c0y[3] * c0y[3] + b10[3]; + g[28] = c0y[0] * (g[24] + 2 * b10[0]); + g[29] = c0y[1] * (g[25] + 2 * b10[1]); + g[30] = c0y[2] * (g[26] + 2 * b10[2]); + g[31] = c0y[3] * (g[27] + 2 * b10[3]); + //g[32] = w[0]; + //g[33] = w[0]; + //g[34] = w[1]; + //g[35] = w[1]; + g[36] = c0z[0] * g[32]; + g[37] = c0z[1] * g[33]; + g[38] = c0z[2] * g[34]; + g[39] = c0z[3] * g[35]; + g[40] = c0z[0] * g[36] + b10[0] * g[32]; + g[41] = c0z[1] * g[37] + b10[1] * g[33]; + g[42] = c0z[2] * g[38] + b10[2] * g[34]; + g[43] = c0z[3] * g[39] + b10[3] * g[35]; + g[44] = c0z[0] * g[40] + 2 * b10[0] * g[36]; + g[45] = c0z[1] * g[41] + 2 * b10[1] * g[37]; + g[46] = c0z[2] * g[42] + 2 * b10[2] * g[38]; + g[47] = c0z[3] * g[43] + 2 * b10[3] * g[39]; +} + +static inline void _srg0_2d4d_1000(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + g[0] = 1; + g[1] = 1; + g[2] = c0x[0]; + g[3] = c0x[1]; + g[4] = 1; + g[5] = 1; + g[6] = c0y[0]; + g[7] = c0y[1]; + //g[8] = w[0]; + //g[9] = w[0]; + g[10] = c0z[0] * g[8]; + g[11] = c0z[1] * g[9]; +} + +static inline void _srg0_2d4d_1001(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = cpx[0]; + g[9] = cpx[1]; + g[10] = cpx[2]; + g[11] = cpx[3]; + g[12] = cpx[0] * c0x[0] + b00[0]; + g[13] = cpx[1] 
* c0x[1] + b00[1]; + g[14] = cpx[2] * c0x[2] + b00[2]; + g[15] = cpx[3] * c0x[3] + b00[3]; + g[16] = 1; + g[17] = 1; + g[18] = 1; + g[19] = 1; + g[20] = c0y[0]; + g[21] = c0y[1]; + g[22] = c0y[2]; + g[23] = c0y[3]; + g[24] = cpy[0]; + g[25] = cpy[1]; + g[26] = cpy[2]; + g[27] = cpy[3]; + g[28] = cpy[0] * c0y[0] + b00[0]; + g[29] = cpy[1] * c0y[1] + b00[1]; + g[30] = cpy[2] * c0y[2] + b00[2]; + g[31] = cpy[3] * c0y[3] + b00[3]; + //g[32] = w[0]; + //g[33] = w[0]; + //g[34] = w[1]; + //g[35] = w[1]; + g[36] = c0z[0] * g[32]; + g[37] = c0z[1] * g[33]; + g[38] = c0z[2] * g[34]; + g[39] = c0z[3] * g[35]; + g[40] = cpz[0] * g[32]; + g[41] = cpz[1] * g[33]; + g[42] = cpz[2] * g[34]; + g[43] = cpz[3] * g[35]; + g[44] = cpz[0] * g[36] + b00[0] * g[32]; + g[45] = cpz[1] * g[37] + b00[1] * g[33]; + g[46] = cpz[2] * g[38] + b00[2] * g[34]; + g[47] = cpz[3] * g[39] + b00[3] * g[35]; +} + +static inline void _srg0_2d4d_1002(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b01 = bc->b01; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = cpx[0]; + g[9] = cpx[1]; + g[10] = cpx[2]; + g[11] = cpx[3]; + g[12] = cpx[0] * c0x[0] + b00[0]; + g[13] = cpx[1] * c0x[1] + b00[1]; + g[14] = cpx[2] * c0x[2] + b00[2]; + g[15] = cpx[3] * c0x[3] + b00[3]; + g[16] = cpx[0] * cpx[0] + b01[0]; + g[17] = cpx[1] * cpx[1] + b01[1]; + g[18] = cpx[2] * cpx[2] + b01[2]; + g[19] = cpx[3] * cpx[3] + b01[3]; + g[20] = cpx[0] * (g[12] + b00[0]) + b01[0] * c0x[0]; + g[21] = cpx[1] * (g[13] + b00[1]) + b01[1] * c0x[1]; + g[22] = cpx[2] * (g[14] + b00[2]) + b01[2] * c0x[2]; + g[23] = cpx[3] * (g[15] + b00[3]) + b01[3] * c0x[3]; + g[24] = 1; + g[25] = 1; + g[26] = 1; + g[27] = 1; + g[28] = c0y[0]; + g[29] = c0y[1]; + g[30] = c0y[2]; + g[31] = 
c0y[3]; + g[32] = cpy[0]; + g[33] = cpy[1]; + g[34] = cpy[2]; + g[35] = cpy[3]; + g[36] = cpy[0] * c0y[0] + b00[0]; + g[37] = cpy[1] * c0y[1] + b00[1]; + g[38] = cpy[2] * c0y[2] + b00[2]; + g[39] = cpy[3] * c0y[3] + b00[3]; + g[40] = cpy[0] * cpy[0] + b01[0]; + g[41] = cpy[1] * cpy[1] + b01[1]; + g[42] = cpy[2] * cpy[2] + b01[2]; + g[43] = cpy[3] * cpy[3] + b01[3]; + g[44] = cpy[0] * (g[36] + b00[0]) + b01[0] * c0y[0]; + g[45] = cpy[1] * (g[37] + b00[1]) + b01[1] * c0y[1]; + g[46] = cpy[2] * (g[38] + b00[2]) + b01[2] * c0y[2]; + g[47] = cpy[3] * (g[39] + b00[3]) + b01[3] * c0y[3]; + //g[48] = w[0]; + //g[49] = w[0]; + //g[50] = w[1]; + //g[51] = w[1]; + g[52] = c0z[0] * g[48]; + g[53] = c0z[1] * g[49]; + g[54] = c0z[2] * g[50]; + g[55] = c0z[3] * g[51]; + g[56] = cpz[0] * g[48]; + g[57] = cpz[1] * g[49]; + g[58] = cpz[2] * g[50]; + g[59] = cpz[3] * g[51]; + g[60] = cpz[0] * g[52] + b00[0] * g[48]; + g[61] = cpz[1] * g[53] + b00[1] * g[49]; + g[62] = cpz[2] * g[54] + b00[2] * g[50]; + g[63] = cpz[3] * g[55] + b00[3] * g[51]; + g[64] = cpz[0] * g[56] + b01[0] * g[48]; + g[65] = cpz[1] * g[57] + b01[1] * g[49]; + g[66] = cpz[2] * g[58] + b01[2] * g[50]; + g[67] = cpz[3] * g[59] + b01[3] * g[51]; + g[68] = cpz[0] * g[60] + b01[0] * g[52] + b00[0] * g[56]; + g[69] = cpz[1] * g[61] + b01[1] * g[53] + b00[1] * g[57]; + g[70] = cpz[2] * g[62] + b01[2] * g[54] + b00[2] * g[58]; + g[71] = cpz[3] * g[63] + b01[3] * g[55] + b00[3] * g[59]; +} + +static inline void _srg0_2d4d_1010(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = cpx[0]; + g[9] = cpx[1]; + g[10] = cpx[2]; + g[11] = cpx[3]; + g[12] = cpx[0] * c0x[0] + b00[0]; + g[13] = cpx[1] * c0x[1] + b00[1]; + g[14] = 
cpx[2] * c0x[2] + b00[2]; + g[15] = cpx[3] * c0x[3] + b00[3]; + g[16] = 1; + g[17] = 1; + g[18] = 1; + g[19] = 1; + g[20] = c0y[0]; + g[21] = c0y[1]; + g[22] = c0y[2]; + g[23] = c0y[3]; + g[24] = cpy[0]; + g[25] = cpy[1]; + g[26] = cpy[2]; + g[27] = cpy[3]; + g[28] = cpy[0] * c0y[0] + b00[0]; + g[29] = cpy[1] * c0y[1] + b00[1]; + g[30] = cpy[2] * c0y[2] + b00[2]; + g[31] = cpy[3] * c0y[3] + b00[3]; + //g[32] = w[0]; + //g[33] = w[0]; + //g[34] = w[1]; + //g[35] = w[1]; + g[36] = c0z[0] * g[32]; + g[37] = c0z[1] * g[33]; + g[38] = c0z[2] * g[34]; + g[39] = c0z[3] * g[35]; + g[40] = cpz[0] * g[32]; + g[41] = cpz[1] * g[33]; + g[42] = cpz[2] * g[34]; + g[43] = cpz[3] * g[35]; + g[44] = cpz[0] * g[36] + b00[0] * g[32]; + g[45] = cpz[1] * g[37] + b00[1] * g[33]; + g[46] = cpz[2] * g[38] + b00[2] * g[34]; + g[47] = cpz[3] * g[39] + b00[3] * g[35]; +} + +static inline void _srg0_2d4d_1011(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b01 = bc->b01; + double xkxl = envs->rkrl[0]; + double ykyl = envs->rkrl[1]; + double zkzl = envs->rkrl[2]; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[16] = cpx[0]; + g[17] = cpx[1]; + g[18] = cpx[2]; + g[19] = cpx[3]; + g[20] = cpx[0] * c0x[0] + b00[0]; + g[21] = cpx[1] * c0x[1] + b00[1]; + g[22] = cpx[2] * c0x[2] + b00[2]; + g[23] = cpx[3] * c0x[3] + b00[3]; + g[24] = cpx[0] * (xkxl + cpx[0]) + b01[0]; + g[25] = cpx[1] * (xkxl + cpx[1]) + b01[1]; + g[26] = cpx[2] * (xkxl + cpx[2]) + b01[2]; + g[27] = cpx[3] * (xkxl + cpx[3]) + b01[3]; + g[28] = g[20] * (xkxl + cpx[0]) + cpx[0] * b00[0] + b01[0] * c0x[0]; + g[29] = g[21] * (xkxl + cpx[1]) + cpx[1] * b00[1] + b01[1] * c0x[1]; + g[30] = g[22] * (xkxl + cpx[2]) + cpx[2] * b00[2] + b01[2] * c0x[2]; + g[31] = g[23] * (xkxl + 
cpx[3]) + cpx[3] * b00[3] + b01[3] * c0x[3]; + g[8] = xkxl + cpx[0]; + g[9] = xkxl + cpx[1]; + g[10] = xkxl + cpx[2]; + g[11] = xkxl + cpx[3]; + g[12] = c0x[0] * (xkxl + cpx[0]) + b00[0]; + g[13] = c0x[1] * (xkxl + cpx[1]) + b00[1]; + g[14] = c0x[2] * (xkxl + cpx[2]) + b00[2]; + g[15] = c0x[3] * (xkxl + cpx[3]) + b00[3]; + g[48] = 1; + g[49] = 1; + g[50] = 1; + g[51] = 1; + g[52] = c0y[0]; + g[53] = c0y[1]; + g[54] = c0y[2]; + g[55] = c0y[3]; + g[64] = cpy[0]; + g[65] = cpy[1]; + g[66] = cpy[2]; + g[67] = cpy[3]; + g[68] = cpy[0] * c0y[0] + b00[0]; + g[69] = cpy[1] * c0y[1] + b00[1]; + g[70] = cpy[2] * c0y[2] + b00[2]; + g[71] = cpy[3] * c0y[3] + b00[3]; + g[72] = cpy[0] * (ykyl + cpy[0]) + b01[0]; + g[73] = cpy[1] * (ykyl + cpy[1]) + b01[1]; + g[74] = cpy[2] * (ykyl + cpy[2]) + b01[2]; + g[75] = cpy[3] * (ykyl + cpy[3]) + b01[3]; + g[76] = g[68] * (ykyl + cpy[0]) + cpy[0] * b00[0] + b01[0] * c0y[0]; + g[77] = g[69] * (ykyl + cpy[1]) + cpy[1] * b00[1] + b01[1] * c0y[1]; + g[78] = g[70] * (ykyl + cpy[2]) + cpy[2] * b00[2] + b01[2] * c0y[2]; + g[79] = g[71] * (ykyl + cpy[3]) + cpy[3] * b00[3] + b01[3] * c0y[3]; + g[56] = ykyl + cpy[0]; + g[57] = ykyl + cpy[1]; + g[58] = ykyl + cpy[2]; + g[59] = ykyl + cpy[3]; + g[60] = c0y[0] * (ykyl + cpy[0]) + b00[0]; + g[61] = c0y[1] * (ykyl + cpy[1]) + b00[1]; + g[62] = c0y[2] * (ykyl + cpy[2]) + b00[2]; + g[63] = c0y[3] * (ykyl + cpy[3]) + b00[3]; + //g[96] = w[0]; + //g[97] = w[0]; + //g[98] = w[1]; + //g[99] = w[1]; + g[100] = c0z[0] * g[96]; + g[101] = c0z[1] * g[97]; + g[102] = c0z[2] * g[98]; + g[103] = c0z[3] * g[99]; + g[112] = cpz[0] * g[96]; + g[113] = cpz[1] * g[97]; + g[114] = cpz[2] * g[98]; + g[115] = cpz[3] * g[99]; + g[116] = cpz[0] * g[100] + b00[0] * g[96]; + g[117] = cpz[1] * g[101] + b00[1] * g[97]; + g[118] = cpz[2] * g[102] + b00[2] * g[98]; + g[119] = cpz[3] * g[103] + b00[3] * g[99]; + g[120] = g[112] * (zkzl + cpz[0]) + b01[0] * g[96]; + g[121] = g[113] * (zkzl + cpz[1]) + b01[1] * g[97]; + g[122] = 
g[114] * (zkzl + cpz[2]) + b01[2] * g[98]; + g[123] = g[115] * (zkzl + cpz[3]) + b01[3] * g[99]; + g[124] = g[116] * (zkzl + cpz[0]) + b01[0] * g[100] + b00[0] * g[112]; + g[125] = g[117] * (zkzl + cpz[1]) + b01[1] * g[101] + b00[1] * g[113]; + g[126] = g[118] * (zkzl + cpz[2]) + b01[2] * g[102] + b00[2] * g[114]; + g[127] = g[119] * (zkzl + cpz[3]) + b01[3] * g[103] + b00[3] * g[115]; + g[104] = g[96] * (zkzl + cpz[0]); + g[105] = g[97] * (zkzl + cpz[1]); + g[106] = g[98] * (zkzl + cpz[2]); + g[107] = g[99] * (zkzl + cpz[3]); + g[108] = g[100] * (zkzl + cpz[0]) + b00[0] * g[96]; + g[109] = g[101] * (zkzl + cpz[1]) + b00[1] * g[97]; + g[110] = g[102] * (zkzl + cpz[2]) + b00[2] * g[98]; + g[111] = g[103] * (zkzl + cpz[3]) + b00[3] * g[99]; +} + +static inline void _srg0_2d4d_1020(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b01 = bc->b01; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = cpx[0]; + g[9] = cpx[1]; + g[10] = cpx[2]; + g[11] = cpx[3]; + g[12] = cpx[0] * c0x[0] + b00[0]; + g[13] = cpx[1] * c0x[1] + b00[1]; + g[14] = cpx[2] * c0x[2] + b00[2]; + g[15] = cpx[3] * c0x[3] + b00[3]; + g[16] = cpx[0] * cpx[0] + b01[0]; + g[17] = cpx[1] * cpx[1] + b01[1]; + g[18] = cpx[2] * cpx[2] + b01[2]; + g[19] = cpx[3] * cpx[3] + b01[3]; + g[20] = cpx[0] * (g[12] + b00[0]) + b01[0] * c0x[0]; + g[21] = cpx[1] * (g[13] + b00[1]) + b01[1] * c0x[1]; + g[22] = cpx[2] * (g[14] + b00[2]) + b01[2] * c0x[2]; + g[23] = cpx[3] * (g[15] + b00[3]) + b01[3] * c0x[3]; + g[24] = 1; + g[25] = 1; + g[26] = 1; + g[27] = 1; + g[28] = c0y[0]; + g[29] = c0y[1]; + g[30] = c0y[2]; + g[31] = c0y[3]; + g[32] = cpy[0]; + g[33] = cpy[1]; + g[34] = cpy[2]; + g[35] = cpy[3]; + g[36] = cpy[0] * c0y[0] + b00[0]; + g[37] = cpy[1] * 
c0y[1] + b00[1]; + g[38] = cpy[2] * c0y[2] + b00[2]; + g[39] = cpy[3] * c0y[3] + b00[3]; + g[40] = cpy[0] * cpy[0] + b01[0]; + g[41] = cpy[1] * cpy[1] + b01[1]; + g[42] = cpy[2] * cpy[2] + b01[2]; + g[43] = cpy[3] * cpy[3] + b01[3]; + g[44] = cpy[0] * (g[36] + b00[0]) + b01[0] * c0y[0]; + g[45] = cpy[1] * (g[37] + b00[1]) + b01[1] * c0y[1]; + g[46] = cpy[2] * (g[38] + b00[2]) + b01[2] * c0y[2]; + g[47] = cpy[3] * (g[39] + b00[3]) + b01[3] * c0y[3]; + //g[48] = w[0]; + //g[49] = w[0]; + //g[50] = w[1]; + //g[51] = w[1]; + g[52] = c0z[0] * g[48]; + g[53] = c0z[1] * g[49]; + g[54] = c0z[2] * g[50]; + g[55] = c0z[3] * g[51]; + g[56] = cpz[0] * g[48]; + g[57] = cpz[1] * g[49]; + g[58] = cpz[2] * g[50]; + g[59] = cpz[3] * g[51]; + g[60] = cpz[0] * g[52] + b00[0] * g[48]; + g[61] = cpz[1] * g[53] + b00[1] * g[49]; + g[62] = cpz[2] * g[54] + b00[2] * g[50]; + g[63] = cpz[3] * g[55] + b00[3] * g[51]; + g[64] = cpz[0] * g[56] + b01[0] * g[48]; + g[65] = cpz[1] * g[57] + b01[1] * g[49]; + g[66] = cpz[2] * g[58] + b01[2] * g[50]; + g[67] = cpz[3] * g[59] + b01[3] * g[51]; + g[68] = cpz[0] * g[60] + b01[0] * g[52] + b00[0] * g[56]; + g[69] = cpz[1] * g[61] + b01[1] * g[53] + b00[1] * g[57]; + g[70] = cpz[2] * g[62] + b01[2] * g[54] + b00[2] * g[58]; + g[71] = cpz[3] * g[63] + b01[3] * g[55] + b00[3] * g[59]; +} + +static inline void _srg0_2d4d_1100(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *b10 = bc->b10; + double xixj = envs->rirj[0]; + double yiyj = envs->rirj[1]; + double zizj = envs->rirj[2]; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[8] = c0x[0]; + g[9] = c0x[1]; + g[10] = c0x[2]; + g[11] = c0x[3]; + g[12] = c0x[0] * (xixj + c0x[0]) + b10[0]; + g[13] = c0x[1] * (xixj + c0x[1]) + b10[1]; + g[14] = c0x[2] * (xixj + c0x[2]) + b10[2]; + g[15] = c0x[3] * (xixj + c0x[3]) + b10[3]; + g[4] = xixj + c0x[0]; + g[5] = xixj + c0x[1]; + g[6] = xixj + c0x[2]; + g[7] = xixj + 
c0x[3]; + g[24] = 1; + g[25] = 1; + g[26] = 1; + g[27] = 1; + g[32] = c0y[0]; + g[33] = c0y[1]; + g[34] = c0y[2]; + g[35] = c0y[3]; + g[36] = c0y[0] * (yiyj + c0y[0]) + b10[0]; + g[37] = c0y[1] * (yiyj + c0y[1]) + b10[1]; + g[38] = c0y[2] * (yiyj + c0y[2]) + b10[2]; + g[39] = c0y[3] * (yiyj + c0y[3]) + b10[3]; + g[28] = yiyj + c0y[0]; + g[29] = yiyj + c0y[1]; + g[30] = yiyj + c0y[2]; + g[31] = yiyj + c0y[3]; + //g[48] = w[0]; + //g[49] = w[0]; + //g[50] = w[1]; + //g[51] = w[1]; + g[56] = c0z[0] * g[48]; + g[57] = c0z[1] * g[49]; + g[58] = c0z[2] * g[50]; + g[59] = c0z[3] * g[51]; + g[60] = g[56] * (zizj + c0z[0]) + b10[0] * g[48]; + g[61] = g[57] * (zizj + c0z[1]) + b10[1] * g[49]; + g[62] = g[58] * (zizj + c0z[2]) + b10[2] * g[50]; + g[63] = g[59] * (zizj + c0z[3]) + b10[3] * g[51]; + g[52] = g[48] * (zizj + c0z[0]); + g[53] = g[49] * (zizj + c0z[1]); + g[54] = g[50] * (zizj + c0z[2]); + g[55] = g[51] * (zizj + c0z[3]); +} + +static inline void _srg0_2d4d_1101(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b10 = bc->b10; + double xixj = envs->rirj[0]; + double yiyj = envs->rirj[1]; + double zizj = envs->rirj[2]; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[16] = c0x[0]; + g[17] = c0x[1]; + g[18] = c0x[2]; + g[19] = c0x[3]; + g[8] = cpx[0]; + g[9] = cpx[1]; + g[10] = cpx[2]; + g[11] = cpx[3]; + g[24] = cpx[0] * c0x[0] + b00[0]; + g[25] = cpx[1] * c0x[1] + b00[1]; + g[26] = cpx[2] * c0x[2] + b00[2]; + g[27] = cpx[3] * c0x[3] + b00[3]; + g[20] = c0x[0] * (xixj + c0x[0]) + b10[0]; + g[21] = c0x[1] * (xixj + c0x[1]) + b10[1]; + g[22] = c0x[2] * (xixj + c0x[2]) + b10[2]; + g[23] = c0x[3] * (xixj + c0x[3]) + b10[3]; + g[4] = xixj + c0x[0]; + g[5] = xixj + c0x[1]; + g[6] = xixj + c0x[2]; + g[7] = xixj + c0x[3]; + g[28] = g[24] * (xixj + c0x[0]) + c0x[0] * b00[0] + 
b10[0] * cpx[0]; + g[29] = g[25] * (xixj + c0x[1]) + c0x[1] * b00[1] + b10[1] * cpx[1]; + g[30] = g[26] * (xixj + c0x[2]) + c0x[2] * b00[2] + b10[2] * cpx[2]; + g[31] = g[27] * (xixj + c0x[3]) + c0x[3] * b00[3] + b10[3] * cpx[3]; + g[12] = cpx[0] * (xixj + c0x[0]) + b00[0]; + g[13] = cpx[1] * (xixj + c0x[1]) + b00[1]; + g[14] = cpx[2] * (xixj + c0x[2]) + b00[2]; + g[15] = cpx[3] * (xixj + c0x[3]) + b00[3]; + g[48] = 1; + g[49] = 1; + g[50] = 1; + g[51] = 1; + g[64] = c0y[0]; + g[65] = c0y[1]; + g[66] = c0y[2]; + g[67] = c0y[3]; + g[56] = cpy[0]; + g[57] = cpy[1]; + g[58] = cpy[2]; + g[59] = cpy[3]; + g[72] = cpy[0] * c0y[0] + b00[0]; + g[73] = cpy[1] * c0y[1] + b00[1]; + g[74] = cpy[2] * c0y[2] + b00[2]; + g[75] = cpy[3] * c0y[3] + b00[3]; + g[68] = c0y[0] * (yiyj + c0y[0]) + b10[0]; + g[69] = c0y[1] * (yiyj + c0y[1]) + b10[1]; + g[70] = c0y[2] * (yiyj + c0y[2]) + b10[2]; + g[71] = c0y[3] * (yiyj + c0y[3]) + b10[3]; + g[52] = yiyj + c0y[0]; + g[53] = yiyj + c0y[1]; + g[54] = yiyj + c0y[2]; + g[55] = yiyj + c0y[3]; + g[76] = g[72] * (yiyj + c0y[0]) + c0y[0] * b00[0] + b10[0] * cpy[0]; + g[77] = g[73] * (yiyj + c0y[1]) + c0y[1] * b00[1] + b10[1] * cpy[1]; + g[78] = g[74] * (yiyj + c0y[2]) + c0y[2] * b00[2] + b10[2] * cpy[2]; + g[79] = g[75] * (yiyj + c0y[3]) + c0y[3] * b00[3] + b10[3] * cpy[3]; + g[60] = cpy[0] * (yiyj + c0y[0]) + b00[0]; + g[61] = cpy[1] * (yiyj + c0y[1]) + b00[1]; + g[62] = cpy[2] * (yiyj + c0y[2]) + b00[2]; + g[63] = cpy[3] * (yiyj + c0y[3]) + b00[3]; + //g[96] = w[0]; + //g[97] = w[0]; + //g[98] = w[1]; + //g[99] = w[1]; + g[112] = c0z[0] * g[96]; + g[113] = c0z[1] * g[97]; + g[114] = c0z[2] * g[98]; + g[115] = c0z[3] * g[99]; + g[104] = cpz[0] * g[96]; + g[105] = cpz[1] * g[97]; + g[106] = cpz[2] * g[98]; + g[107] = cpz[3] * g[99]; + g[120] = cpz[0] * g[112] + b00[0] * g[96]; + g[121] = cpz[1] * g[113] + b00[1] * g[97]; + g[122] = cpz[2] * g[114] + b00[2] * g[98]; + g[123] = cpz[3] * g[115] + b00[3] * g[99]; + g[116] = g[112] * (zizj + c0z[0]) + 
b10[0] * g[96]; + g[117] = g[113] * (zizj + c0z[1]) + b10[1] * g[97]; + g[118] = g[114] * (zizj + c0z[2]) + b10[2] * g[98]; + g[119] = g[115] * (zizj + c0z[3]) + b10[3] * g[99]; + g[100] = g[96] * (zizj + c0z[0]); + g[101] = g[97] * (zizj + c0z[1]); + g[102] = g[98] * (zizj + c0z[2]); + g[103] = g[99] * (zizj + c0z[3]); + g[124] = g[120] * (zizj + c0z[0]) + b10[0] * g[104] + b00[0] * g[112]; + g[125] = g[121] * (zizj + c0z[1]) + b10[1] * g[105] + b00[1] * g[113]; + g[126] = g[122] * (zizj + c0z[2]) + b10[2] * g[106] + b00[2] * g[114]; + g[127] = g[123] * (zizj + c0z[3]) + b10[3] * g[107] + b00[3] * g[115]; + g[108] = zizj * g[104] + cpz[0] * g[112] + b00[0] * g[96]; + g[109] = zizj * g[105] + cpz[1] * g[113] + b00[1] * g[97]; + g[110] = zizj * g[106] + cpz[2] * g[114] + b00[2] * g[98]; + g[111] = zizj * g[107] + cpz[3] * g[115] + b00[3] * g[99]; +} + +static inline void _srg0_2d4d_1110(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b10 = bc->b10; + double xixj = envs->rirj[0]; + double yiyj = envs->rirj[1]; + double zizj = envs->rirj[2]; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[16] = c0x[0]; + g[17] = c0x[1]; + g[18] = c0x[2]; + g[19] = c0x[3]; + g[8] = cpx[0]; + g[9] = cpx[1]; + g[10] = cpx[2]; + g[11] = cpx[3]; + g[24] = cpx[0] * c0x[0] + b00[0]; + g[25] = cpx[1] * c0x[1] + b00[1]; + g[26] = cpx[2] * c0x[2] + b00[2]; + g[27] = cpx[3] * c0x[3] + b00[3]; + g[20] = c0x[0] * (xixj + c0x[0]) + b10[0]; + g[21] = c0x[1] * (xixj + c0x[1]) + b10[1]; + g[22] = c0x[2] * (xixj + c0x[2]) + b10[2]; + g[23] = c0x[3] * (xixj + c0x[3]) + b10[3]; + g[4] = xixj + c0x[0]; + g[5] = xixj + c0x[1]; + g[6] = xixj + c0x[2]; + g[7] = xixj + c0x[3]; + g[28] = g[24] * (xixj + c0x[0]) + c0x[0] * b00[0] + b10[0] * cpx[0]; + g[29] = g[25] * (xixj + c0x[1]) + c0x[1] * b00[1] + b10[1] * 
cpx[1]; + g[30] = g[26] * (xixj + c0x[2]) + c0x[2] * b00[2] + b10[2] * cpx[2]; + g[31] = g[27] * (xixj + c0x[3]) + c0x[3] * b00[3] + b10[3] * cpx[3]; + g[12] = cpx[0] * (xixj + c0x[0]) + b00[0]; + g[13] = cpx[1] * (xixj + c0x[1]) + b00[1]; + g[14] = cpx[2] * (xixj + c0x[2]) + b00[2]; + g[15] = cpx[3] * (xixj + c0x[3]) + b00[3]; + g[48] = 1; + g[49] = 1; + g[50] = 1; + g[51] = 1; + g[64] = c0y[0]; + g[65] = c0y[1]; + g[66] = c0y[2]; + g[67] = c0y[3]; + g[56] = cpy[0]; + g[57] = cpy[1]; + g[58] = cpy[2]; + g[59] = cpy[3]; + g[72] = cpy[0] * c0y[0] + b00[0]; + g[73] = cpy[1] * c0y[1] + b00[1]; + g[74] = cpy[2] * c0y[2] + b00[2]; + g[75] = cpy[3] * c0y[3] + b00[3]; + g[68] = c0y[0] * (yiyj + c0y[0]) + b10[0]; + g[69] = c0y[1] * (yiyj + c0y[1]) + b10[1]; + g[70] = c0y[2] * (yiyj + c0y[2]) + b10[2]; + g[71] = c0y[3] * (yiyj + c0y[3]) + b10[3]; + g[52] = yiyj + c0y[0]; + g[53] = yiyj + c0y[1]; + g[54] = yiyj + c0y[2]; + g[55] = yiyj + c0y[3]; + g[76] = g[72] * (yiyj + c0y[0]) + c0y[0] * b00[0] + b10[0] * cpy[0]; + g[77] = g[73] * (yiyj + c0y[1]) + c0y[1] * b00[1] + b10[1] * cpy[1]; + g[78] = g[74] * (yiyj + c0y[2]) + c0y[2] * b00[2] + b10[2] * cpy[2]; + g[79] = g[75] * (yiyj + c0y[3]) + c0y[3] * b00[3] + b10[3] * cpy[3]; + g[60] = cpy[0] * (yiyj + c0y[0]) + b00[0]; + g[61] = cpy[1] * (yiyj + c0y[1]) + b00[1]; + g[62] = cpy[2] * (yiyj + c0y[2]) + b00[2]; + g[63] = cpy[3] * (yiyj + c0y[3]) + b00[3]; + //g[96] = w[0]; + //g[97] = w[0]; + //g[98] = w[1]; + //g[99] = w[1]; + g[112] = c0z[0] * g[96]; + g[113] = c0z[1] * g[97]; + g[114] = c0z[2] * g[98]; + g[115] = c0z[3] * g[99]; + g[104] = cpz[0] * g[96]; + g[105] = cpz[1] * g[97]; + g[106] = cpz[2] * g[98]; + g[107] = cpz[3] * g[99]; + g[120] = cpz[0] * g[112] + b00[0] * g[96]; + g[121] = cpz[1] * g[113] + b00[1] * g[97]; + g[122] = cpz[2] * g[114] + b00[2] * g[98]; + g[123] = cpz[3] * g[115] + b00[3] * g[99]; + g[116] = g[112] * (zizj + c0z[0]) + b10[0] * g[96]; + g[117] = g[113] * (zizj + c0z[1]) + b10[1] * g[97]; + g[118] 
= g[114] * (zizj + c0z[2]) + b10[2] * g[98]; + g[119] = g[115] * (zizj + c0z[3]) + b10[3] * g[99]; + g[100] = g[96] * (zizj + c0z[0]); + g[101] = g[97] * (zizj + c0z[1]); + g[102] = g[98] * (zizj + c0z[2]); + g[103] = g[99] * (zizj + c0z[3]); + g[124] = g[120] * (zizj + c0z[0]) + b10[0] * g[104] + b00[0] * g[112]; + g[125] = g[121] * (zizj + c0z[1]) + b10[1] * g[105] + b00[1] * g[113]; + g[126] = g[122] * (zizj + c0z[2]) + b10[2] * g[106] + b00[2] * g[114]; + g[127] = g[123] * (zizj + c0z[3]) + b10[3] * g[107] + b00[3] * g[115]; + g[108] = zizj * g[104] + cpz[0] * g[112] + b00[0] * g[96]; + g[109] = zizj * g[105] + cpz[1] * g[113] + b00[1] * g[97]; + g[110] = zizj * g[106] + cpz[2] * g[114] + b00[2] * g[98]; + g[111] = zizj * g[107] + cpz[3] * g[115] + b00[3] * g[99]; +} + +static inline void _srg0_2d4d_1200(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *b10 = bc->b10; + double xixj = envs->rirj[0]; + double yiyj = envs->rirj[1]; + double zizj = envs->rirj[2]; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[8] = c0x[0]; + g[9] = c0x[1]; + g[10] = c0x[2]; + g[11] = c0x[3]; + g[16] = c0x[0] * c0x[0] + b10[0]; + g[17] = c0x[1] * c0x[1] + b10[1]; + g[18] = c0x[2] * c0x[2] + b10[2]; + g[19] = c0x[3] * c0x[3] + b10[3]; + g[20] = g[16] * (xixj + c0x[0]) + c0x[0] * 2 * b10[0]; + g[21] = g[17] * (xixj + c0x[1]) + c0x[1] * 2 * b10[1]; + g[22] = g[18] * (xixj + c0x[2]) + c0x[2] * 2 * b10[2]; + g[23] = g[19] * (xixj + c0x[3]) + c0x[3] * 2 * b10[3]; + g[12] = c0x[0] * (xixj + c0x[0]) + b10[0]; + g[13] = c0x[1] * (xixj + c0x[1]) + b10[1]; + g[14] = c0x[2] * (xixj + c0x[2]) + b10[2]; + g[15] = c0x[3] * (xixj + c0x[3]) + b10[3]; + g[4] = xixj + c0x[0]; + g[5] = xixj + c0x[1]; + g[6] = xixj + c0x[2]; + g[7] = xixj + c0x[3]; + g[32] = 1; + g[33] = 1; + g[34] = 1; + g[35] = 1; + g[40] = c0y[0]; + g[41] = c0y[1]; + g[42] = c0y[2]; + g[43] = c0y[3]; + g[48] = c0y[0] * c0y[0] + b10[0]; + 
g[49] = c0y[1] * c0y[1] + b10[1]; + g[50] = c0y[2] * c0y[2] + b10[2]; + g[51] = c0y[3] * c0y[3] + b10[3]; + g[52] = g[48] * (yiyj + c0y[0]) + c0y[0] * 2 * b10[0]; + g[53] = g[49] * (yiyj + c0y[1]) + c0y[1] * 2 * b10[1]; + g[54] = g[50] * (yiyj + c0y[2]) + c0y[2] * 2 * b10[2]; + g[55] = g[51] * (yiyj + c0y[3]) + c0y[3] * 2 * b10[3]; + g[44] = c0y[0] * (yiyj + c0y[0]) + b10[0]; + g[45] = c0y[1] * (yiyj + c0y[1]) + b10[1]; + g[46] = c0y[2] * (yiyj + c0y[2]) + b10[2]; + g[47] = c0y[3] * (yiyj + c0y[3]) + b10[3]; + g[36] = yiyj + c0y[0]; + g[37] = yiyj + c0y[1]; + g[38] = yiyj + c0y[2]; + g[39] = yiyj + c0y[3]; + //g[64] = w[0]; + //g[65] = w[0]; + //g[66] = w[1]; + //g[67] = w[1]; + g[72] = c0z[0] * g[64]; + g[73] = c0z[1] * g[65]; + g[74] = c0z[2] * g[66]; + g[75] = c0z[3] * g[67]; + g[80] = c0z[0] * g[72] + b10[0] * g[64]; + g[81] = c0z[1] * g[73] + b10[1] * g[65]; + g[82] = c0z[2] * g[74] + b10[2] * g[66]; + g[83] = c0z[3] * g[75] + b10[3] * g[67]; + g[84] = g[80] * (zizj + c0z[0]) + 2 * b10[0] * g[72]; + g[85] = g[81] * (zizj + c0z[1]) + 2 * b10[1] * g[73]; + g[86] = g[82] * (zizj + c0z[2]) + 2 * b10[2] * g[74]; + g[87] = g[83] * (zizj + c0z[3]) + 2 * b10[3] * g[75]; + g[76] = g[72] * (zizj + c0z[0]) + b10[0] * g[64]; + g[77] = g[73] * (zizj + c0z[1]) + b10[1] * g[65]; + g[78] = g[74] * (zizj + c0z[2]) + b10[2] * g[66]; + g[79] = g[75] * (zizj + c0z[3]) + b10[3] * g[67]; + g[68] = g[64] * (zizj + c0z[0]); + g[69] = g[65] * (zizj + c0z[1]); + g[70] = g[66] * (zizj + c0z[2]); + g[71] = g[67] * (zizj + c0z[3]); +} + +static inline void _srg0_2d4d_2000(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *b10 = bc->b10; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = c0x[0] * c0x[0] + b10[0]; + g[9] = c0x[1] * c0x[1] + b10[1]; + g[10] = c0x[2] * c0x[2] + b10[2]; + g[11] = c0x[3] * c0x[3] + b10[3]; + g[12] = 
1; + g[13] = 1; + g[14] = 1; + g[15] = 1; + g[16] = c0y[0]; + g[17] = c0y[1]; + g[18] = c0y[2]; + g[19] = c0y[3]; + g[20] = c0y[0] * c0y[0] + b10[0]; + g[21] = c0y[1] * c0y[1] + b10[1]; + g[22] = c0y[2] * c0y[2] + b10[2]; + g[23] = c0y[3] * c0y[3] + b10[3]; + //g[24] = w[0]; + //g[25] = w[0]; + //g[26] = w[1]; + //g[27] = w[1]; + g[28] = c0z[0] * g[24]; + g[29] = c0z[1] * g[25]; + g[30] = c0z[2] * g[26]; + g[31] = c0z[3] * g[27]; + g[32] = c0z[0] * g[28] + b10[0] * g[24]; + g[33] = c0z[1] * g[29] + b10[1] * g[25]; + g[34] = c0z[2] * g[30] + b10[2] * g[26]; + g[35] = c0z[3] * g[31] + b10[3] * g[27]; +} + +static inline void _srg0_2d4d_2001(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b10 = bc->b10; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = c0x[0] * c0x[0] + b10[0]; + g[9] = c0x[1] * c0x[1] + b10[1]; + g[10] = c0x[2] * c0x[2] + b10[2]; + g[11] = c0x[3] * c0x[3] + b10[3]; + g[12] = cpx[0]; + g[13] = cpx[1]; + g[14] = cpx[2]; + g[15] = cpx[3]; + g[16] = cpx[0] * c0x[0] + b00[0]; + g[17] = cpx[1] * c0x[1] + b00[1]; + g[18] = cpx[2] * c0x[2] + b00[2]; + g[19] = cpx[3] * c0x[3] + b00[3]; + g[20] = c0x[0] * (g[16] + b00[0]) + b10[0] * cpx[0]; + g[21] = c0x[1] * (g[17] + b00[1]) + b10[1] * cpx[1]; + g[22] = c0x[2] * (g[18] + b00[2]) + b10[2] * cpx[2]; + g[23] = c0x[3] * (g[19] + b00[3]) + b10[3] * cpx[3]; + g[24] = 1; + g[25] = 1; + g[26] = 1; + g[27] = 1; + g[28] = c0y[0]; + g[29] = c0y[1]; + g[30] = c0y[2]; + g[31] = c0y[3]; + g[32] = c0y[0] * c0y[0] + b10[0]; + g[33] = c0y[1] * c0y[1] + b10[1]; + g[34] = c0y[2] * c0y[2] + b10[2]; + g[35] = c0y[3] * c0y[3] + b10[3]; + g[36] = cpy[0]; + g[37] = cpy[1]; + g[38] = cpy[2]; + g[39] = cpy[3]; + g[40] = cpy[0] * c0y[0] + b00[0]; + g[41] = cpy[1] * 
c0y[1] + b00[1]; + g[42] = cpy[2] * c0y[2] + b00[2]; + g[43] = cpy[3] * c0y[3] + b00[3]; + g[44] = c0y[0] * (g[40] + b00[0]) + b10[0] * cpy[0]; + g[45] = c0y[1] * (g[41] + b00[1]) + b10[1] * cpy[1]; + g[46] = c0y[2] * (g[42] + b00[2]) + b10[2] * cpy[2]; + g[47] = c0y[3] * (g[43] + b00[3]) + b10[3] * cpy[3]; + //g[48] = w[0]; + //g[49] = w[0]; + //g[50] = w[1]; + //g[51] = w[1]; + g[52] = c0z[0] * g[48]; + g[53] = c0z[1] * g[49]; + g[54] = c0z[2] * g[50]; + g[55] = c0z[3] * g[51]; + g[56] = c0z[0] * g[52] + b10[0] * g[48]; + g[57] = c0z[1] * g[53] + b10[1] * g[49]; + g[58] = c0z[2] * g[54] + b10[2] * g[50]; + g[59] = c0z[3] * g[55] + b10[3] * g[51]; + g[60] = cpz[0] * g[48]; + g[61] = cpz[1] * g[49]; + g[62] = cpz[2] * g[50]; + g[63] = cpz[3] * g[51]; + g[64] = cpz[0] * g[52] + b00[0] * g[48]; + g[65] = cpz[1] * g[53] + b00[1] * g[49]; + g[66] = cpz[2] * g[54] + b00[2] * g[50]; + g[67] = cpz[3] * g[55] + b00[3] * g[51]; + g[68] = c0z[0] * g[64] + b10[0] * g[60] + b00[0] * g[52]; + g[69] = c0z[1] * g[65] + b10[1] * g[61] + b00[1] * g[53]; + g[70] = c0z[2] * g[66] + b10[2] * g[62] + b00[2] * g[54]; + g[71] = c0z[3] * g[67] + b10[3] * g[63] + b00[3] * g[55]; +} + +static inline void _srg0_2d4d_2010(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *cpx = bc->c0px; + double *cpy = bc->c0py; + double *cpz = bc->c0pz; + double *b00 = bc->b00; + double *b10 = bc->b10; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = c0x[0] * c0x[0] + b10[0]; + g[9] = c0x[1] * c0x[1] + b10[1]; + g[10] = c0x[2] * c0x[2] + b10[2]; + g[11] = c0x[3] * c0x[3] + b10[3]; + g[12] = cpx[0]; + g[13] = cpx[1]; + g[14] = cpx[2]; + g[15] = cpx[3]; + g[16] = cpx[0] * c0x[0] + b00[0]; + g[17] = cpx[1] * c0x[1] + b00[1]; + g[18] = cpx[2] * c0x[2] + b00[2]; + g[19] = cpx[3] * c0x[3] + b00[3]; + g[20] = c0x[0] * (g[16] + b00[0]) + b10[0] * 
cpx[0]; + g[21] = c0x[1] * (g[17] + b00[1]) + b10[1] * cpx[1]; + g[22] = c0x[2] * (g[18] + b00[2]) + b10[2] * cpx[2]; + g[23] = c0x[3] * (g[19] + b00[3]) + b10[3] * cpx[3]; + g[24] = 1; + g[25] = 1; + g[26] = 1; + g[27] = 1; + g[28] = c0y[0]; + g[29] = c0y[1]; + g[30] = c0y[2]; + g[31] = c0y[3]; + g[32] = c0y[0] * c0y[0] + b10[0]; + g[33] = c0y[1] * c0y[1] + b10[1]; + g[34] = c0y[2] * c0y[2] + b10[2]; + g[35] = c0y[3] * c0y[3] + b10[3]; + g[36] = cpy[0]; + g[37] = cpy[1]; + g[38] = cpy[2]; + g[39] = cpy[3]; + g[40] = cpy[0] * c0y[0] + b00[0]; + g[41] = cpy[1] * c0y[1] + b00[1]; + g[42] = cpy[2] * c0y[2] + b00[2]; + g[43] = cpy[3] * c0y[3] + b00[3]; + g[44] = c0y[0] * (g[40] + b00[0]) + b10[0] * cpy[0]; + g[45] = c0y[1] * (g[41] + b00[1]) + b10[1] * cpy[1]; + g[46] = c0y[2] * (g[42] + b00[2]) + b10[2] * cpy[2]; + g[47] = c0y[3] * (g[43] + b00[3]) + b10[3] * cpy[3]; + //g[48] = w[0]; + //g[49] = w[0]; + //g[50] = w[1]; + //g[51] = w[1]; + g[52] = c0z[0] * g[48]; + g[53] = c0z[1] * g[49]; + g[54] = c0z[2] * g[50]; + g[55] = c0z[3] * g[51]; + g[56] = c0z[0] * g[52] + b10[0] * g[48]; + g[57] = c0z[1] * g[53] + b10[1] * g[49]; + g[58] = c0z[2] * g[54] + b10[2] * g[50]; + g[59] = c0z[3] * g[55] + b10[3] * g[51]; + g[60] = cpz[0] * g[48]; + g[61] = cpz[1] * g[49]; + g[62] = cpz[2] * g[50]; + g[63] = cpz[3] * g[51]; + g[64] = cpz[0] * g[52] + b00[0] * g[48]; + g[65] = cpz[1] * g[53] + b00[1] * g[49]; + g[66] = cpz[2] * g[54] + b00[2] * g[50]; + g[67] = cpz[3] * g[55] + b00[3] * g[51]; + g[68] = c0z[0] * g[64] + b10[0] * g[60] + b00[0] * g[52]; + g[69] = c0z[1] * g[65] + b10[1] * g[61] + b00[1] * g[53]; + g[70] = c0z[2] * g[66] + b10[2] * g[62] + b00[2] * g[54]; + g[71] = c0z[3] * g[67] + b10[3] * g[63] + b00[3] * g[55]; +} + +static inline void _srg0_2d4d_2100(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *b10 = bc->b10; + double xixj = envs->rirj[0]; + double yiyj = 
envs->rirj[1]; + double zizj = envs->rirj[2]; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = c0x[0] * c0x[0] + b10[0]; + g[9] = c0x[1] * c0x[1] + b10[1]; + g[10] = c0x[2] * c0x[2] + b10[2]; + g[11] = c0x[3] * c0x[3] + b10[3]; + g[24] = g[8] * (xixj + c0x[0]) + c0x[0] * 2 * b10[0]; + g[25] = g[9] * (xixj + c0x[1]) + c0x[1] * 2 * b10[1]; + g[26] = g[10] * (xixj + c0x[2]) + c0x[2] * 2 * b10[2]; + g[27] = g[11] * (xixj + c0x[3]) + c0x[3] * 2 * b10[3]; + g[20] = c0x[0] * (xixj + c0x[0]) + b10[0]; + g[21] = c0x[1] * (xixj + c0x[1]) + b10[1]; + g[22] = c0x[2] * (xixj + c0x[2]) + b10[2]; + g[23] = c0x[3] * (xixj + c0x[3]) + b10[3]; + g[16] = xixj + c0x[0]; + g[17] = xixj + c0x[1]; + g[18] = xixj + c0x[2]; + g[19] = xixj + c0x[3]; + g[32] = 1; + g[33] = 1; + g[34] = 1; + g[35] = 1; + g[36] = c0y[0]; + g[37] = c0y[1]; + g[38] = c0y[2]; + g[39] = c0y[3]; + g[40] = c0y[0] * c0y[0] + b10[0]; + g[41] = c0y[1] * c0y[1] + b10[1]; + g[42] = c0y[2] * c0y[2] + b10[2]; + g[43] = c0y[3] * c0y[3] + b10[3]; + g[56] = g[40] * (yiyj + c0y[0]) + c0y[0] * 2 * b10[0]; + g[57] = g[41] * (yiyj + c0y[1]) + c0y[1] * 2 * b10[1]; + g[58] = g[42] * (yiyj + c0y[2]) + c0y[2] * 2 * b10[2]; + g[59] = g[43] * (yiyj + c0y[3]) + c0y[3] * 2 * b10[3]; + g[52] = c0y[0] * (yiyj + c0y[0]) + b10[0]; + g[53] = c0y[1] * (yiyj + c0y[1]) + b10[1]; + g[54] = c0y[2] * (yiyj + c0y[2]) + b10[2]; + g[55] = c0y[3] * (yiyj + c0y[3]) + b10[3]; + g[48] = yiyj + c0y[0]; + g[49] = yiyj + c0y[1]; + g[50] = yiyj + c0y[2]; + g[51] = yiyj + c0y[3]; + //g[64] = w[0]; + //g[65] = w[0]; + //g[66] = w[1]; + //g[67] = w[1]; + g[68] = c0z[0] * g[64]; + g[69] = c0z[1] * g[65]; + g[70] = c0z[2] * g[66]; + g[71] = c0z[3] * g[67]; + g[72] = c0z[0] * g[68] + b10[0] * g[64]; + g[73] = c0z[1] * g[69] + b10[1] * g[65]; + g[74] = c0z[2] * g[70] + b10[2] * g[66]; + g[75] = c0z[3] * g[71] + b10[3] * g[67]; + g[88] = g[72] * (zizj + c0z[0]) + 2 * b10[0] * g[68]; + g[89] = 
g[73] * (zizj + c0z[1]) + 2 * b10[1] * g[69]; + g[90] = g[74] * (zizj + c0z[2]) + 2 * b10[2] * g[70]; + g[91] = g[75] * (zizj + c0z[3]) + 2 * b10[3] * g[71]; + g[84] = g[68] * (zizj + c0z[0]) + b10[0] * g[64]; + g[85] = g[69] * (zizj + c0z[1]) + b10[1] * g[65]; + g[86] = g[70] * (zizj + c0z[2]) + b10[2] * g[66]; + g[87] = g[71] * (zizj + c0z[3]) + b10[3] * g[67]; + g[80] = g[64] * (zizj + c0z[0]); + g[81] = g[65] * (zizj + c0z[1]); + g[82] = g[66] * (zizj + c0z[2]); + g[83] = g[67] * (zizj + c0z[3]); +} + +static inline void _srg0_2d4d_3000(double *restrict g, Rys2eT *bc, CINTEnvVars *envs) +{ + double *c0x = bc->c00x; + double *c0y = bc->c00y; + double *c0z = bc->c00z; + double *b10 = bc->b10; + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] = c0x[0]; + g[5] = c0x[1]; + g[6] = c0x[2]; + g[7] = c0x[3]; + g[8] = c0x[0] * c0x[0] + b10[0]; + g[9] = c0x[1] * c0x[1] + b10[1]; + g[10] = c0x[2] * c0x[2] + b10[2]; + g[11] = c0x[3] * c0x[3] + b10[3]; + g[12] = c0x[0] * (g[8] + 2 * b10[0]); + g[13] = c0x[1] * (g[9] + 2 * b10[1]); + g[14] = c0x[2] * (g[10] + 2 * b10[2]); + g[15] = c0x[3] * (g[11] + 2 * b10[3]); + g[16] = 1; + g[17] = 1; + g[18] = 1; + g[19] = 1; + g[20] = c0y[0]; + g[21] = c0y[1]; + g[22] = c0y[2]; + g[23] = c0y[3]; + g[24] = c0y[0] * c0y[0] + b10[0]; + g[25] = c0y[1] * c0y[1] + b10[1]; + g[26] = c0y[2] * c0y[2] + b10[2]; + g[27] = c0y[3] * c0y[3] + b10[3]; + g[28] = c0y[0] * (g[24] + 2 * b10[0]); + g[29] = c0y[1] * (g[25] + 2 * b10[1]); + g[30] = c0y[2] * (g[26] + 2 * b10[2]); + g[31] = c0y[3] * (g[27] + 2 * b10[3]); + //g[32] = w[0]; + //g[33] = w[0]; + //g[34] = w[1]; + //g[35] = w[1]; + g[36] = c0z[0] * g[32]; + g[37] = c0z[1] * g[33]; + g[38] = c0z[2] * g[34]; + g[39] = c0z[3] * g[35]; + g[40] = c0z[0] * g[36] + b10[0] * g[32]; + g[41] = c0z[1] * g[37] + b10[1] * g[33]; + g[42] = c0z[2] * g[38] + b10[2] * g[34]; + g[43] = c0z[3] * g[39] + b10[3] * g[35]; + g[44] = c0z[0] * g[40] + 2 * b10[0] * g[36]; + g[45] = c0z[1] * g[41] + 2 * b10[1] * g[37]; + 
g[46] = c0z[2] * g[42] + 2 * b10[2] * g[38]; + g[47] = c0z[3] * g[43] + 2 * b10[3] * g[39]; +} + +void CINTsrg0_2e_2d4d_unrolled_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) /* dispatch to the unrolled _srg0_2d4d_<ijkl> kernel selected by li_ceil, lj_ceil, lk_ceil, ll_ceil */ +{ + int type_ijkl = ((envs->li_ceil << 6) | (envs->lj_ceil << 4) | + (envs->lk_ceil << 2) | (envs->ll_ceil)); /* key: li<<6 | lj<<4 | lk<<2 | ll (distinct only while each value fits in 2 bits) */ + switch (type_ijkl) { + case 0b00000000: _srg0_2d4d_0000(g, bc, envs); return; + case 0b00000001: _srg0_2d4d_0001(g, bc, envs); return; + case 0b00000010: _srg0_2d4d_0002(g, bc, envs); return; + case 0b00000011: _srg0_2d4d_0003(g, bc, envs); return; + case 0b00000100: _srg0_2d4d_0010(g, bc, envs); return; + case 0b00000101: _srg0_2d4d_0011(g, bc, envs); return; + case 0b00000110: _srg0_2d4d_0012(g, bc, envs); return; + case 0b00001000: _srg0_2d4d_0020(g, bc, envs); return; + case 0b00001001: _srg0_2d4d_0021(g, bc, envs); return; + case 0b00001100: _srg0_2d4d_0030(g, bc, envs); return; + case 0b00010000: _srg0_2d4d_0100(g, bc, envs); return; + case 0b00010001: _srg0_2d4d_0101(g, bc, envs); return; + case 0b00010010: _srg0_2d4d_0102(g, bc, envs); return; + case 0b00010100: _srg0_2d4d_0110(g, bc, envs); return; + case 0b00010101: _srg0_2d4d_0111(g, bc, envs); return; + case 0b00011000: _srg0_2d4d_0120(g, bc, envs); return; + case 0b00100000: _srg0_2d4d_0200(g, bc, envs); return; + case 0b00100001: _srg0_2d4d_0201(g, bc, envs); return; + case 0b00100100: _srg0_2d4d_0210(g, bc, envs); return; + case 0b00110000: _srg0_2d4d_0300(g, bc, envs); return; + case 0b01000000: _srg0_2d4d_1000(g, bc, envs); return; + case 0b01000001: _srg0_2d4d_1001(g, bc, envs); return; + case 0b01000010: _srg0_2d4d_1002(g, bc, envs); return; + case 0b01000100: _srg0_2d4d_1010(g, bc, envs); return; + case 0b01000101: _srg0_2d4d_1011(g, bc, envs); return; + case 0b01001000: _srg0_2d4d_1020(g, bc, envs); return; + case 0b01010000: _srg0_2d4d_1100(g, bc, envs); return; + case 0b01010001: _srg0_2d4d_1101(g, bc, envs); return; + case 0b01010100: _srg0_2d4d_1110(g, bc, envs); return; + case 0b01100000: 
_srg0_2d4d_1200(g, bc, envs); return; + case 0b10000000: _srg0_2d4d_2000(g, bc, envs); return; + case 0b10000001: _srg0_2d4d_2001(g, bc, envs); return; + case 0b10000100: _srg0_2d4d_2010(g, bc, envs); return; + case 0b10010000: _srg0_2d4d_2100(g, bc, envs); return; + case 0b11000000: _srg0_2d4d_3000(g, bc, envs); return; + } + fprintf(stderr, "Dimension error for CINTsrg0_2e_2d4d_unrolled_simd1: iklj = %d %d %d %d\n", + (int)envs->li_ceil, (int)envs->lk_ceil, + (int)envs->ll_ceil, (int)envs->lj_ceil); /* no case matched: only the diagnostic is printed, g is not written here, and the function returns normally */ +} + +void CINTg0_2e_lj2d4d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) /* runs CINTg0_2e_2d_simd1, then CINTg0_lj_4d_simd1, on g */ +{ + CINTg0_2e_2d_simd1(g, bc, envs); + CINTg0_lj_4d_simd1(g, envs); +} + +void CINTg0_2e_kj2d4d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) /* runs CINTg0_2e_2d_simd1, then CINTg0_kj_4d_simd1, on g */ +{ + CINTg0_2e_2d_simd1(g, bc, envs); + CINTg0_kj_4d_simd1(g, envs); +} +void CINTg0_2e_ik2d4d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) /* runs CINTg0_2e_2d_simd1, then CINTg0_ik_4d_simd1, on g */ +{ + CINTg0_2e_2d_simd1(g, bc, envs); + CINTg0_ik_4d_simd1(g, envs); +} +void CINTg0_2e_il2d4d_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs) /* runs CINTg0_2e_2d_simd1, then CINTg0_il_4d_simd1, on g */ +{ + CINTg0_2e_2d_simd1(g, bc, envs); + CINTg0_il_4d_simd1(g, envs); +} + +/* + * g[i,k,l,j] = < ik | lj > = ( i j | k l ) + */ +int CINTg0_2e_simd1(double *g, double *cutoff, + Rys2eT *bc, CINTEnvVars *envs, int idsimd) +{ + ALIGNMM double u[MXRYSROOTS]; + DEF_GXYZ(double, g, gx, gy, gz); + int irys; + int nroots = envs->nrys_roots; + double aij = envs->ai[idsimd] + envs->aj[idsimd]; + double akl = envs->ak[idsimd] + envs->al[idsimd]; + double a0, a1, fac1, x; + double *rij = envs->rij; + double *rkl = envs->rkl; + double *w = g + envs->g_size * 2; // ~ gz + double xij_kl = rij[0*SIMDD+idsimd] - rkl[0*SIMDD+idsimd]; + double yij_kl = rij[1*SIMDD+idsimd] - rkl[1*SIMDD+idsimd]; + double zij_kl = rij[2*SIMDD+idsimd] - rkl[2*SIMDD+idsimd]; + double rr = xij_kl * xij_kl + yij_kl * yij_kl + zij_kl * zij_kl; + + a1 = aij * akl; + a0 = a1 / (aij + akl); + fac1 = sqrt(a0 / (a1 * a1 * a1)) * envs->fac[idsimd]; + x = a0 * rr; + const double omega = envs->env[PTR_RANGE_OMEGA]; + double theta = 0; + 
if (omega == 0.) { + CINTrys_roots(nroots, x, u, w); + } else if (omega < 0.) { + // short-range part of range-separated Coulomb + theta = omega * omega / (omega * omega + a0); + // very small erfc() leads to ~0 weights. They can cause + // numerical issue in sr_rys_roots. + if (theta * x > cutoff[idsimd] || theta * x > EXPCUTOFF_SR) { + return 0; + } + int rorder = envs->rys_order; + if (rorder == nroots) { + CINTsr_rys_roots(nroots, x, sqrt(theta), u, w); + } else { + double sqrt_theta = -sqrt(theta); + CINTrys_roots(rorder, x, u, w); + CINTrys_roots(rorder, theta*x, u+rorder, w+rorder); + if (envs->g_size == 2) { + g[0] = 1; + g[1] = 1; + g[2] = 1; + g[3] = 1; + g[4] *= fac1; + g[5] *= fac1 * sqrt_theta; + return 1; + } + for (irys = rorder; irys < nroots; irys++) { + double ut = u[irys] * theta; + u[irys] = ut / (u[irys]+1.-ut); + w[irys] *= sqrt_theta; + } + } + } else { // omega > 0. + // long-range part of range-separated Coulomb + theta = omega * omega / (omega * omega + a0); + x *= theta; + fac1 *= sqrt(theta); + CINTrys_roots(nroots, x, u, w); + for (irys = 0; irys < nroots; irys++) { + double ut = u[irys] * theta; + u[irys] = ut / (u[irys]+1.-ut); + } + } + if (envs->g_size == 1) { + g[0] = 1; + g[1] = 1; + g[2] *= fac1; + return 1; + } + + double u2, tmp1, tmp2, tmp3, tmp4, tmp5; + double rijrx = rij[0*SIMDD+idsimd] - envs->rx_in_rijrx[0]; + double rijry = rij[1*SIMDD+idsimd] - envs->rx_in_rijrx[1]; + double rijrz = rij[2*SIMDD+idsimd] - envs->rx_in_rijrx[2]; + double rklrx = rkl[0*SIMDD+idsimd] - envs->rx_in_rklrx[0]; + double rklry = rkl[1*SIMDD+idsimd] - envs->rx_in_rklrx[1]; + double rklrz = rkl[2*SIMDD+idsimd] - envs->rx_in_rklrx[2]; + double *b00 = bc->b00; + double *b10 = bc->b10; + double *b01 = bc->b01; + double *c00x = bc->c00x; + double *c00y = bc->c00y; + double *c00z = bc->c00z; + double *c0px = bc->c0px; + double *c0py = bc->c0py; + double *c0pz = bc->c0pz; + + for (irys = 0; irys < nroots; irys++) { /* - *t2 = u(i)/(1+u(i)) + *u(irys) = 
t2/(1-t2) + *t2 = u(irys)/(1+u(irys)) *u2 = aij*akl/(aij+akl)*t2/(1-t2) */ - u2 = a0 * u[i]; - div = 1 / (u2 * (aij + akl) + a1); - tmp1 = u2 * div; - tmp4 = .5 * div; - b00[i] = 0.5 * tmp1; + u2 = a0 * u[irys]; + tmp4 = .5 / (u2 * (aij + akl) + a1); + tmp5 = u2 * tmp4; + tmp1 = 2. * tmp5; tmp2 = tmp1 * akl; tmp3 = tmp1 * aij; - b10[i] = b00[i] + tmp4 * akl; - b01[i] = b00[i] + tmp4 * aij; - c00x[i] = rijrx[0] - tmp2 * rijrkl[0]; - c00y[i] = rijrx[1] - tmp2 * rijrkl[1]; - c00z[i] = rijrx[2] - tmp2 * rijrkl[2]; - c0px[i] = rklrx[0] + tmp3 * rijrkl[0]; - c0py[i] = rklrx[1] + tmp3 * rijrkl[1]; - c0pz[i] = rklrx[2] + tmp3 * rijrkl[2]; + b00[irys] = tmp5; + b10[irys] = tmp5 + tmp4 * akl; + b01[irys] = tmp5 + tmp4 * aij; + c00x[irys] = rijrx - tmp2 * xij_kl; + c00y[irys] = rijry - tmp2 * yij_kl; + c00z[irys] = rijrz - tmp2 * zij_kl; + c0px[irys] = rklrx + tmp3 * xij_kl; + c0py[irys] = rklry + tmp3 * yij_kl; + c0pz[irys] = rklrz + tmp3 * zij_kl; + w[irys] *= fac1; } (*envs->f_g0_2d4d_simd1)(g, bc, envs); diff --git a/src/g3c2e.c b/src/g3c2e.c index 8e99004..f2221a2 100644 --- a/src/g3c2e.c +++ b/src/g3c2e.c @@ -54,7 +54,14 @@ void CINTinit_int3c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, envs->nfk = (envs->k_l+1)*(envs->k_l+2)/2; envs->nfl = 1; envs->nf = envs->nfi * envs->nfk * envs->nfj; - envs->common_factor = 1; + + envs->ri = env + atm(PTR_COORD, bas(ATOM_OF, i_sh)); + envs->rj = env + atm(PTR_COORD, bas(ATOM_OF, j_sh)); + envs->rk = env + atm(PTR_COORD, bas(ATOM_OF, k_sh)); + + envs->common_factor = (M_PI*M_PI*M_PI)*2/SQRTPI + * CINTcommon_fac_sp(envs->i_l) * CINTcommon_fac_sp(envs->j_l) + * CINTcommon_fac_sp(envs->k_l); if (env[PTR_EXPCUTOFF] == 0) { envs->expcutoff = EXPCUTOFF; } else { @@ -70,14 +77,14 @@ void CINTinit_int3c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, envs->lj_ceil = envs->j_l + ng[JINC]; envs->lk_ceil = 0; envs->ll_ceil = envs->k_l + ng[KINC]; - - envs->ri = env + atm(PTR_COORD, bas(ATOM_OF, i_sh)); - envs->rj = env + atm(PTR_COORD, 
bas(ATOM_OF, j_sh)); - envs->rk = env + atm(PTR_COORD, bas(ATOM_OF, k_sh)); - - int nroots = (envs->li_ceil + envs->lj_ceil + envs->ll_ceil)/2 + 1; - envs->nrys_roots = nroots; - assert(nroots < MXRYSROOTS); + int rys_order =(envs->li_ceil + envs->lj_ceil + envs->ll_ceil)/2 + 1; + int nrys_roots = rys_order; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0 && rys_order <= 3) { + nrys_roots *= 2; + } + envs->rys_order = rys_order; + envs->nrys_roots = nrys_roots; int dli, dlj, dlk; int ibase = envs->li_ceil > envs->lj_ceil; @@ -91,11 +98,11 @@ void CINTinit_int3c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, } dlk = envs->ll_ceil + 1; - envs->g_stride_i = nroots; - envs->g_stride_k = nroots * dli; - envs->g_stride_l = nroots * dli; - envs->g_stride_j = nroots * dli * dlk; - envs->g_size = nroots * dli * dlk * dlj; + envs->g_stride_i = nrys_roots; + envs->g_stride_k = nrys_roots * dli; + envs->g_stride_l = nrys_roots * dli; + envs->g_stride_j = nrys_roots * dli * dlk; + envs->g_size = nrys_roots * dli * dlk * dlj; MM_STORE(envs->al, MM_SET1(0.)); MM_STORE(envs->rkl+0*SIMDD, MM_SET1(envs->rk[0])); @@ -122,9 +129,13 @@ void CINTinit_int3c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, envs->rirj[2] = envs->rj[2] - envs->ri[2]; } - if (nroots <= 2) { + if (rys_order <= 2) { envs->f_g0_2d4d = &CINTg0_2e_2d4d_unrolled; envs->f_g0_2d4d_simd1 = &CINTg0_2e_2d4d_unrolled_simd1; + if (rys_order != nrys_roots) { + envs->f_g0_2d4d = &CINTsrg0_2e_2d4d_unrolled; + envs->f_g0_2d4d_simd1 = &CINTsrg0_2e_2d4d_unrolled_simd1; + } } else if (ibase) { envs->f_g0_2d4d = &CINTg0_2e_il2d4d; envs->f_g0_2d4d_simd1 = &CINTg0_2e_il2d4d_simd1; @@ -136,113 +147,3 @@ void CINTinit_int3c2e_EnvVars(CINTEnvVars *envs, int *ng, int *shls, envs->f_g0_2e_simd1 = &CINTg0_2e_simd1; } - -#ifdef WITH_GTG -void CINTg0_2e_lj2d4d_regular(double *g, Rys2eT *bc, CINTEnvVars *envs); -void CINTg0_2e_lj2d4d_simd1_regular(double *g, Rys2eT *bc, CINTEnvVars *envs); -int CINTg0_2e_gtg(double *g, Rys2eT *bc, 
CINTEnvVars *envs, int count); -int CINTg0_2e_gtg_simd1(double *g, Rys2eT *bc, CINTEnvVars *envs, int idsimd); - -void CINTinit_int3c2e_gtg_EnvVars(CINTEnvVars *envs, int *ng, int *shls, - int *atm, int natm, int *bas, int nbas, double *env) -{ - envs->natm = natm; - envs->nbas = nbas; - envs->atm = atm; - envs->bas = bas; - envs->env = env; - envs->shls = shls; - - int i_sh = shls[0]; - int j_sh = shls[1]; - int k_sh = shls[2]; - envs->i_l = bas(ANG_OF, i_sh); - envs->j_l = bas(ANG_OF, j_sh); - envs->k_l = bas(ANG_OF, k_sh); - envs->l_l = 0; - envs->x_ctr[0] = bas(NCTR_OF, i_sh); - envs->x_ctr[1] = bas(NCTR_OF, j_sh); - envs->x_ctr[2] = bas(NCTR_OF, k_sh); - envs->x_ctr[3] = 1; - envs->nfi = (envs->i_l+1)*(envs->i_l+2)/2; - envs->nfj = (envs->j_l+1)*(envs->j_l+2)/2; - envs->nfk = (envs->k_l+1)*(envs->k_l+2)/2; - envs->nfl = 1; - envs->nf = envs->nfi * envs->nfk * envs->nfj; - envs->common_factor = SQRTPI * .5; - if (env[PTR_EXPCUTOFF] == 0) { - envs->expcutoff = EXPCUTOFF; - } else { - // +1 to ensure accuracy. 
See comments in libcint/cint2e.c - envs->expcutoff = MAX(MIN_EXPCUTOFF, env[PTR_EXPCUTOFF]); - } - - envs->gbits = ng[GSHIFT]; - envs->ncomp_e1 = ng[POS_E1]; - envs->ncomp_e2 = ng[POS_E2]; - envs->ncomp_tensor = ng[TENSOR]; - - envs->li_ceil = envs->i_l + ng[IINC]; - envs->lj_ceil = envs->j_l + ng[JINC]; - envs->lk_ceil = 0; - envs->ll_ceil = envs->k_l + ng[KINC]; - - envs->ri = env + atm(PTR_COORD, bas(ATOM_OF, i_sh)); - envs->rj = env + atm(PTR_COORD, bas(ATOM_OF, j_sh)); - envs->rk = env + atm(PTR_COORD, bas(ATOM_OF, k_sh)); - envs->nrys_roots = 1; - - int dli, dlj, dlk; - int ibase = envs->li_ceil > envs->lj_ceil; - - if (ibase) { - dli = envs->li_ceil + envs->lj_ceil + 1; - dlj = envs->lj_ceil + 1; - } else { - dli = envs->li_ceil + 1; - dlj = envs->li_ceil + envs->lj_ceil + 1; - } - dlk = envs->ll_ceil + 1; - - envs->g_stride_i = 1; - envs->g_stride_k = dli; - envs->g_stride_l = dli; - envs->g_stride_j = dli * dlk; - envs->g_size = dli * dlk * dlj; - - MM_STORE(envs->al, MM_SET1(0.)); - MM_STORE(envs->rkl+0*SIMDD, MM_SET1(envs->rk[0])); - MM_STORE(envs->rkl+1*SIMDD, MM_SET1(envs->rk[1])); - MM_STORE(envs->rkl+2*SIMDD, MM_SET1(envs->rk[2])); - envs->g2d_klmax = envs->g_stride_k; - envs->rkrl[0] = envs->rk[0]; - envs->rkrl[1] = envs->rk[1]; - envs->rkrl[2] = envs->rk[2]; - // in g0_2d rklrx = rkl - rx = 0 => rkl = rx - envs->rx_in_rklrx = envs->rk; - - if (ibase) { - envs->g2d_ijmax = envs->g_stride_i; - envs->rx_in_rijrx = envs->ri; - envs->rirj[0] = envs->ri[0] - envs->rj[0]; - envs->rirj[1] = envs->ri[1] - envs->rj[1]; - envs->rirj[2] = envs->ri[2] - envs->rj[2]; - } else { - envs->g2d_ijmax = envs->g_stride_j; - envs->rx_in_rijrx = envs->rj; - envs->rirj[0] = envs->rj[0] - envs->ri[0]; - envs->rirj[1] = envs->rj[1] - envs->ri[1]; - envs->rirj[2] = envs->rj[2] - envs->ri[2]; - } - - if (ibase) { - envs->f_g0_2d4d = &CINTg0_2e_il2d4d; - envs->f_g0_2d4d_simd1 = &CINTg0_2e_il2d4d_simd1; - } else { - envs->f_g0_2d4d = &CINTg0_2e_lj2d4d_regular; - 
envs->f_g0_2d4d_simd1 = &CINTg0_2e_lj2d4d_simd1_regular; - } - envs->f_g0_2e = &CINTg0_2e_gtg; - envs->f_g0_2e_simd1 = &CINTg0_2e_gtg_simd1; -} -#endif diff --git a/src/gout2e.c b/src/gout2e.c index f2e2950..c981e7f 100644 --- a/src/gout2e.c +++ b/src/gout2e.c @@ -27,12 +27,12 @@ #if (SIMDD == 8) void CINTgout2e(double *gout, double *g, int *idx, CINTEnvVars *envs) { + int nrys_roots = envs->nrys_roots; int nf = envs->nf; - if (nf == 1) { + if (nf == 1 && nrys_roots == 1) { double *gz = g + envs->g_size * 2 * SIMDD; MM_STORE(gout, MM_LOAD(gz)); } else { - int nrys_roots = envs->nrys_roots; int i, n; double *gx, *gy, *gz; double *hx, *hy, *hz; @@ -224,12 +224,12 @@ void CINTgout2e(double *gout, double *g, int *idx, CINTEnvVars *envs) void CINTgout2e(double *gout, double *g, int *idx, CINTEnvVars *envs) { + int nrys_roots = envs->nrys_roots; int nf = envs->nf; - if (nf == 1) { + if (nf == 1 && nrys_roots == 1) { double *gz = g + envs->g_size * 2 * SIMDD; MM_STORE(gout, MM_LOAD(gz)); } else { - int nrys_roots = envs->nrys_roots; int i, n; double *gx, *gy, *gz; double *hx, *hy, *hz; @@ -528,12 +528,12 @@ void CINTgout2e(double *gout, double *g, int *idx, CINTEnvVars *envs) void CINTgout2e(double *gout, double *g, int *idx, CINTEnvVars *envs) { + int nrys_roots = envs->nrys_roots; int nf = envs->nf; - if (nf == 1) { + if (nf == 1 && nrys_roots == 1) { double *gz = g + envs->g_size * 2 * SIMDD; MM_STORE(gout, MM_LOAD(gz)); } else { - int nrys_roots = envs->nrys_roots; int i, n; double *gx, *gy, *gz; double *hx, *hy, *hz; diff --git a/src/optimizer.c b/src/optimizer.c index 9d85ccd..30fef5f 100644 --- a/src/optimizer.c +++ b/src/optimizer.c @@ -260,38 +260,6 @@ void CINTall_2e_stg_optimizer(CINTOpt **opt, int *ng, } #endif -#ifdef WITH_GTG -void CINTall_2e_gtg_optimizer(CINTOpt **opt, int *ng, - int *atm, int natm, int *bas, int nbas, double *env) -{ - CINTinit_2e_optimizer(opt, atm, natm, bas, nbas, env); - CINTOpt_setij(*opt, ng, atm, natm, bas, nbas, env); - 
CINTOpt_set_non0coeff(*opt, atm, natm, bas, nbas, env); - gen_idx(*opt, &CINTinit_int2e_gtg_EnvVars, &CINTg4c_index_xyz, - 4, 6, ng, atm, natm, bas, nbas, env); -} - -void CINTall_3c2e_gtg_optimizer(CINTOpt **opt, int *ng, - int *atm, int natm, int *bas, int nbas, double *env) -{ - CINTinit_2e_optimizer(opt, atm, natm, bas, nbas, env); - CINTOpt_setij(*opt, ng, atm, natm, bas, nbas, env); - CINTOpt_set_non0coeff(*opt, atm, natm, bas, nbas, env); - gen_idx(*opt, &CINTinit_int3c2e_gtg_EnvVars, &CINTg4c_index_xyz, - 3, 12, ng, atm, natm, bas, nbas, env); -} - -void CINTall_2c2e_gtg_optimizer(CINTOpt **opt, int *ng, - int *atm, int natm, int *bas, int nbas, double *env) -{ - CINTinit_2e_optimizer(opt, atm, natm, bas, nbas, env); - CINTOpt_set_log_maxc(*opt, atm, natm, bas, nbas, env); - CINTOpt_set_non0coeff(*opt, atm, natm, bas, nbas, env); - gen_idx(*opt, &CINTinit_int2c2e_gtg_EnvVars, &CINTg2c_index_xyz, - 2, ANG_MAX, ng, atm, natm, bas, nbas, env); -} -#endif - void CINTOpt_log_max_pgto_coeff(double *log_maxc, double *coeff, int nprim, int nctr) { int i, ip; @@ -350,7 +318,6 @@ int CINTset_pairdata(PairData *pairdata, double *ai, double *aj, double *ri, dou int lij = li_ceil + lj_ceil; if (lij > 0) { double dist_ij = sqrt(rr_ij); -#ifdef WITH_RANGE_COULOMB double omega = env[PTR_RANGE_OMEGA]; if (omega < 0) { double r_guess = 8.; @@ -360,9 +327,6 @@ int CINTset_pairdata(PairData *pairdata, double *ai, double *aj, double *ri, dou } else { log_rr_ij += lij * approx_log(dist_ij + 1.); } -#else - log_rr_ij += lij * approx_log(dist_ij + 1.); -#endif } PairData *pdata; diff --git a/src/optimizer.h b/src/optimizer.h index 2cd1087..e15802a 100644 --- a/src/optimizer.h +++ b/src/optimizer.h @@ -65,15 +65,6 @@ void CINTall_2e_stg_optimizer(CINTOpt **opt, int *ng, int *atm, int natm, int *bas, int nbas, double *env); #endif -#ifdef WITH_GTG -void CINTall_2e_gtg_optimizer(CINTOpt **opt, int *ng, - int *atm, int natm, int *bas, int nbas, double *env); -void 
CINTall_3c2e_gtg_optimizer(CINTOpt **opt, int *ng, - int *atm, int natm, int *bas, int nbas, double *env); -void CINTall_2c2e_gtg_optimizer(CINTOpt **opt, int *ng, - int *atm, int natm, int *bas, int nbas, double *env); -#endif - #ifndef HAVE_DEFINED_APPROX_LOG #define HAVE_DEFINED_APPROX_LOG #ifdef __X86__ diff --git a/src/rys_roots.c b/src/rys_roots.c index a676afd..d189c7d 100644 --- a/src/rys_roots.c +++ b/src/rys_roots.c @@ -77,6 +77,13 @@ void _CINTrys_roots_batch(int nroots, double *x, double *u, double *w, int count double roots[MXRYSROOTS * 2]; double *weights = roots + nroots; int i, k; + if (count < SIMDD) { + __MD r0 = MM_SET1(0.); + for (i = 0; i < nroots; i++) { + MM_STORE(u+i*SIMDD, r0); + MM_STORE(w+i*SIMDD, r0); + } + } for (i = 0; i < count; i++) { CINTrys_roots(nroots, x[i], roots, weights); for (k = 0; k < nroots; k++) { @@ -96,7 +103,7 @@ void CINTrys_roots(int nroots, double x, double *u, double *w) w[i] = POLY_SMALLX_W0[off+i] + POLY_SMALLX_W1[off+i] * x; } return; - } else if (x >= 50.) 
{ + } else if (x >= 35+nroots*5) { int off = nroots * (nroots - 1) / 2; int i; double rt; @@ -141,7 +148,6 @@ void CINTrys_roots(int nroots, double x, double *u, double *w) } } -#ifdef WITH_RANGE_COULOMB /* * lower is the lower bound of the sr integral */ @@ -295,28 +301,29 @@ int _CINTsr_rys_roots_batch(CINTEnvVars *envs, double *x, double *theta, double *weights = roots + nroots; int i, k; int all_negligible = 1; - double xt; - + ALIGNMM double xt[SIMDD]; + ALIGNMM double sqrt_theta[SIMDD]; + __MD rtheta = MM_LOAD(theta); + __MD r0 = MM_SET1(0.); + MM_STORE(sqrt_theta, MM_SQRT(rtheta)); + MM_STORE(xt, MM_MUL(MM_LOAD(x), rtheta)); + for (i = 0; i < nroots; i++) { + MM_STORE(u+i*SIMDD, r0); + MM_STORE(w+i*SIMDD, r0); + } for (i = 0; i < count; i++) { // very small erfc() leads to ~0 weights - xt = x[i] * theta[i]; - if (xt < cutoff[i] && xt < EXPCUTOFF_SR) { - all_negligible = 0; - CINTsr_rys_roots(nroots, x[i], sqrt(theta[i]), roots, weights); + if (xt[i] < cutoff[i] && xt[i] < EXPCUTOFF_SR) { + CINTsr_rys_roots(nroots, x[i], sqrt_theta[i], roots, weights); for (k = 0; k < nroots; k++) { u[k*SIMDD+i] = roots[k]; w[k*SIMDD+i] = weights[k]; } - } else { - for (k = 0; k < nroots; k++) { - u[k*SIMDD+i] = 0; - w[k*SIMDD+i] = 0; - } + all_negligible = 0; } } return all_negligible; } -#endif // WITH_RANGE_COULOMB static int rys_root1(double X, double *roots, double *weights) {