From 3f787eb25d13947ef3becc6cab8aaf6b3d1c516f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jehammond@nvidia.com>
Date: Mon, 3 May 2021 17:42:44 -0700
Subject: [PATCH 01/45] declaring ddot in util.fh causes problems with cublas
 module

Signed-off-by: Jeff Hammond <jehammond@nvidia.com>
---
 src/ddscf/fast/potential.F               |  2 ++
 src/ddscf/riscf_trans.F                  |  3 +++
 src/driver/opt_drv.F                     | 13 +++++++++++++
 src/geom/geom.F                          |  6 ++++++
 src/geom/geom_hnd.F                      |  6 +++++-
 src/hessian/analytic/dft/xc_d2expl.F     |  2 ++
 src/mcscf/detci/detci_dens.F             |  8 +++++++-
 src/mp2_grad/mp2_back_transform.F        |  4 ++++
 src/mp2_grad/mp2_make_tuhf.F             |  2 ++
 src/mp2_grad/mp2_read_tiajb.F            |  2 ++
 src/nwdft/coulomb/dft_fitcd.F            |  4 ++++
 src/nwdft/coulomb/dft_fitvc.F            |  2 ++
 src/nwdft/dftgrad/dftg_cdfit.F           |  2 ++
 src/nwdft/dftgrad/dftg_gridv0b.F         |  3 +++
 src/nwdft/grid/grid_quadv0b.F            |  3 +++
 src/nwdft/so_dft/dft_scf_utils_so.F      |  2 ++
 src/nwdft/so_dft/sym_mo_adapt_so.F       |  2 ++
 src/nwdft/xc/xc_eval_fnl.F               |  2 ++
 src/nwdft/xc/xc_tabcd.F                  |  2 ++
 src/nwpw/band/minimizer/c_bybminimize.F  |  2 ++
 src/nwpw/band/minimizer/c_bybminimize2.F |  2 ++
 src/nwpw/nwpwlib/ion/ion.F               |  2 ++
 src/optim/mepgs/mepgs_drv.F              | 14 ++++++++++++++
 src/optim/neb/neb_drv.F                  |  2 ++
 src/optim/string/string.F                |  3 +++
 src/optim/tropt/tropt_drv.F              |  8 ++++++++
 src/symmetry/sym_bs_ir_id.F              |  3 +++
 src/symmetry/sym_bs_irrep.F              |  3 +++
 src/symmetry/sym_mo_adapt.F              |  2 ++
 src/tce/ddotfile.F                       |  6 ++++++
 src/tce/tce_diagnose_t1.F                |  2 ++
 src/tce/tce_mbpt2.F                      |  2 ++
 src/tce/tce_residual_t1.F                |  4 ++++
 src/tce/tce_residual_t2.F                |  6 ++++++
 src/tce/tce_residual_t3.F                |  4 ++++
 src/tce/tce_residual_t3a.F               |  2 ++
 src/tce/tce_residual_t4.F                |  4 ++++
 src/util/util.fh                         |  2 --
 src/vib/vib_eckart.F                     |  4 ++++
 src/vib/vib_nmass.F                      |  4 ++++
 src/vib/vib_tors.F                       |  2 ++
 41 files changed, 149 insertions(+), 4 deletions(-)

diff --git a/src/ddscf/fast/potential.F b/src/ddscf/fast/potential.F
index 62e597a57de..52ef519ac0d 100644
--- a/src/ddscf/fast/potential.F
+++ b/src/ddscf/fast/potential.F
@@ -20,6 +20,8 @@ double precision function potential(basis, g_dens, x, y, z)
       integer l_buf, l_scr
       integer k_buf, k_scr
       double precision pot, r(3)
+      double precision ddot
+      external ddot
 c
       r(1) = x
       r(2) = y
diff --git a/src/ddscf/riscf_trans.F b/src/ddscf/riscf_trans.F
index cc4e6536de0..6f5b03cc10e 100644
--- a/src/ddscf/riscf_trans.F
+++ b/src/ddscf/riscf_trans.F
@@ -41,6 +41,9 @@ subroutine riscf_trans_int (g_three, ao_basis, nsh, nbf, nsqhalf,
       integer nxtask
       external nxtask
 
+      double precision ddot
+      external ddot
+
       nproc = ga_nnodes()
 
       if ( odisk ) then
diff --git a/src/driver/opt_drv.F b/src/driver/opt_drv.F
index c53d09578e7..4449c0b03aa 100644
--- a/src/driver/opt_drv.F
+++ b/src/driver/opt_drv.F
@@ -1424,6 +1424,8 @@ subroutine driver_hessian_update(geom,rtdb)
       integer istrss
       double precision strss1,strss2,dum1,dum2,dum3,dum4
       logical redo_hessian
+      double precision ddot
+      external ddot
 
 ! This is a statement function
       ind(i,j) = k_hess + i + (j-1)*nvar - 1
@@ -1651,6 +1653,8 @@ subroutine driver_opt_search_dir(geom)
       logical ophigh
       logical geom_lattice_get
       external geom_lattice_get
+      double precision ddot
+      external ddot
 c
       ophigh = util_print('high', print_high)
 c     
@@ -2001,6 +2005,8 @@ subroutine driver_line_search1(rtdb,geom)
       double precision e0, e1, e2p, dsgrad
       double precision hess, a0, a1, a2
       double precision driver_energy_step
+      double precision ddot
+      external ddot
 c
       dsgrad = ddot(nvar, ds, 1, g, 1)
       if (dsgrad*alpha .ge. 0d0) then
@@ -2154,6 +2160,8 @@ subroutine driver_line_search2(rtdb,geom)
       integer i, j
       double precision walka(5), walke(5),dum
       logical success
+      double precision ddot
+      external ddot
 c
       dsgrad = ddot(nvar, ds, 1, g, 1)
       if (dsgrad*alpha .ge. 0d0) then
@@ -2763,6 +2771,8 @@ subroutine driver_cart_pmat(rtdb, geom)
       logical omm
       logical task_qmmm
       logical opt_geom_cart_coords_get
+      double precision ddot
+      external ddot
       ind(i,j) = k_pmat + i-1 + (j-1)*ncart
 c     
 c     FRACTIONAL?
@@ -3118,6 +3128,9 @@ subroutine driver_sad_search_dir(rtdb,geom,istep)
 c
       logical geom_print_zmatrix, omatchneg
       save saddir, evalp
+
+      double precision ddot
+      external ddot
 c     
       omatchneg = .true.
       ovtol     = 0.7d0
diff --git a/src/geom/geom.F b/src/geom/geom.F
index bc49298b44b..e0136f3df9a 100644
--- a/src/geom/geom.F
+++ b/src/geom/geom.F
@@ -555,6 +555,8 @@ logical function geom_rtdb_load(rtdb, geom, name)
       logical geom_check_handle, geom_rtdb_in, geom_get_user_scale
       external geom_check_handle, geom_rtdb_in, geom_get_user_scale
       logical getsym
+      double precision ddot
+      external ddot
 c     
       geom_rtdb_load = geom_check_handle(geom, 'geom_rtdb_load')
       if (.not. geom_rtdb_load) return
@@ -948,6 +950,8 @@ subroutine geom_compute_values(geom)
       logical geom_tag_to_element
       external geom_tag_to_element
       logical is_atom
+      double precision ddot
+      external ddot
       is_atom(i) = (.not. inp_compare(.false., 'bq', tags(i,geom)(1:2)))
 c
       e = 0.0d0
@@ -2697,6 +2701,8 @@ logical function geom_print(geom)
      $     geom_print_zmatrix, geom_any_finuc
       double precision deter3
       external         deter3
+      double precision ddot
+      external ddot
 c
       if (.not. geom_check_handle(geom, 'geom_print')) then
          geom_print = .false.
diff --git a/src/geom/geom_hnd.F b/src/geom/geom_hnd.F
index 5da6d2171f6..89fb1e6b83f 100644
--- a/src/geom/geom_hnd.F
+++ b/src/geom/geom_hnd.F
@@ -4522,6 +4522,8 @@ SUBROUTINE HND_TFTR(H,F,Q,T,IA,M,N,NDIM)
       integer IA(1)
       integer ij,j,ik,max,i,k
       double precision small,zero,dum,qij,hij
+      double precision ddot
+      external ddot
       DATA SMALL /1.0D-11/
       DATA ZERO  /0.0D+00/
       IJ = 0
@@ -5387,7 +5389,9 @@ double precision function out_of_plane_angle(a,b,c)
 c     containing a and b.  The sign is determined in a left-handed
 c     sense ... so that if a=x, b=y then for c=+z angle=+90.
 c
-      double precision ddot, d(3), abc, theta
+      double precision d(3), abc, theta
+      double precision ddot
+      external ddot
 c
       call cross_product(a,b,d)
       abc = ddot(3,c,1,d,1)/sqrt(
diff --git a/src/hessian/analytic/dft/xc_d2expl.F b/src/hessian/analytic/dft/xc_d2expl.F
index cfb4f8c4d86..ca881c8e744 100644
--- a/src/hessian/analytic/dft/xc_d2expl.F
+++ b/src/hessian/analytic/dft/xc_d2expl.F
@@ -186,6 +186,8 @@ Subroutine xc_d2expl(tol_rho, scr,
       double precision duefac
       double precision dabsmax
       external dabsmax
+      double precision ddot
+      external ddot
 c
 c     d2Exc             /    d2fxc    dp(i) dp(j)          / dfxc  d2p(i)
 c     ----- =  sum sum  | ----------- ----- -----  +  sum  | ----- ------
diff --git a/src/mcscf/detci/detci_dens.F b/src/mcscf/detci/detci_dens.F
index e01f5e34e49..3a78e019ec9 100644
--- a/src/mcscf/detci/detci_dens.F
+++ b/src/mcscf/detci/detci_dens.F
@@ -33,13 +33,15 @@ subroutine detci_onepdm( norb, nsym, nela, nelb, nstra, nstrb,
       integer g_dtmp
       integer ii, jj, iex, ph
       double precision xx
+      double precision ddot
+      external ddot
 c
 c
 c
       myid = ga_nodeid()
 *ga:1:0
       if (.not.(ga_create(MT_DBL, norb, norb, 'd', norb, 0, g_dtmp )))
-     $   call errquit('detci_twopdm: cannot create global',0, GA_ERR)
+     $   call errquit('detci_onepdm: cannot create global',0, GA_ERR)
       call ga_zero(g_dtmp)
       call ga_distribution( g_civec, myid, rlo, rhi, cilo, cihi )
       if (((cilo.ne.0).and.(cihi.ne.-1)).and.
@@ -162,6 +164,8 @@ subroutine detci_twopdm( norb, nsym, nela, nelb, nstra, nstrb,
       integer l_t, k_t, l_s, k_s, lds
 **      integer g_dentmp
       integer myid, nn, rlo, rhi, cilo, cihi, dilo, dihi
+      double precision ddot
+      external ddot
 c
 c
       myid = ga_nodeid()
@@ -425,6 +429,8 @@ subroutine detci_twopdm_ab( norb, nsym, nela, nelb, nstra, nstrb,
       double precision tx
       integer nxtask
       external nxtask
+      double precision ddot
+      external ddot
 c      
 c
 c   Initialize parallel stuff
diff --git a/src/mp2_grad/mp2_back_transform.F b/src/mp2_grad/mp2_back_transform.F
index da6f683f003..77d6cd21574 100644
--- a/src/mp2_grad/mp2_back_transform.F
+++ b/src/mp2_grad/mp2_back_transform.F
@@ -808,6 +808,8 @@ subroutine mp2_nonsep(
      $     ylo_cur, yhi_cur, udim_cur, vdim_cur, xdim_cur, ydim_cur
       logical status, odebug, odoit, sym_shell_quartet, oenergy
       external sym_shell_quartet
+      double precision ddot
+      external ddot
 c     
       odebug = util_print('mp2_backt', print_debug)
       oenergy = util_print('backtenergy', print_debug)
@@ -1122,6 +1124,8 @@ subroutine mp2_copyback0(dowork,g_a,g_a_trans,
       integer t_ilo, t_ihi, t_jlo, t_jhi
       integer nsegs,iseg_in
       double precision tr,tr_tr
+      double precision ddot
+      external ddot
 c
       call ga_distribution(g_a_trans, ga_nodeid(), 
      T     t_ilo, t_ihi, t_jlo, t_jhi)
diff --git a/src/mp2_grad/mp2_make_tuhf.F b/src/mp2_grad/mp2_make_tuhf.F
index aa4931d4564..3ed99890a59 100644
--- a/src/mp2_grad/mp2_make_tuhf.F
+++ b/src/mp2_grad/mp2_make_tuhf.F
@@ -51,6 +51,8 @@ subroutine mp2_make_tuhf(nbf,noa_lo,noa_hi,nva_lo,nva_hi,
       logical otdebug
 c
       integer l_ia_uv, k_ia_uv, l_tmp, k_tmp, l_ia_jb, k_ia_jb
+      double precision ddot
+      external ddot
 c
 #include "bitops.fh"
 c
diff --git a/src/mp2_grad/mp2_read_tiajb.F b/src/mp2_grad/mp2_read_tiajb.F
index bb3ca7246c8..3a96f3486cb 100644
--- a/src/mp2_grad/mp2_read_tiajb.F
+++ b/src/mp2_grad/mp2_read_tiajb.F
@@ -12,6 +12,8 @@ subroutine mp2_read_tijab(nv_lo, nv_hi, irs, symia,
       integer tunit
       double precision tunitptr
       double precision t(*)
+      double precision ddot
+      external ddot
 c
 c     Read t(j,b,i,a) all j, b for given i, a taking into
 c     account symmetry blocking
diff --git a/src/nwdft/coulomb/dft_fitcd.F b/src/nwdft/coulomb/dft_fitcd.F
index 6ceb7b43c7e..cd6889272f4 100644
--- a/src/nwdft/coulomb/dft_fitcd.F
+++ b/src/nwdft/coulomb/dft_fitcd.F
@@ -59,6 +59,8 @@ Subroutine dft_fitcd(nfit,CD_coef, i3c_ERI, Ecoul1,
       integer LU,ierr,ilo,ihi,jlo,jhi,nnii
       integer adrc,ldc,iptr,intdum
       character*255 errmsg
+      double precision ddot
+      external ddot
       
 c     
 c     Fit electronic charge density. The fitting coefficients are obtained by 
@@ -383,6 +385,8 @@ Subroutine mull_pop_fit(basis, natoms, nshells, nbf_cd,
       double precision cd_coef(nbf_cd), cgtf(nbf_cd)
       character*1 shell_labels(nshells), ang_mom_label(11)
       logical oprint_mull_fit
+      double precision ddot
+      external ddot
 c
 #include "bas.fh"
 #include "mafdecls.fh"
diff --git a/src/nwdft/coulomb/dft_fitvc.F b/src/nwdft/coulomb/dft_fitvc.F
index f2d03ca1982..12d52bdbee6 100644
--- a/src/nwdft/coulomb/dft_fitvc.F
+++ b/src/nwdft/coulomb/dft_fitvc.F
@@ -57,6 +57,8 @@ Subroutine dft_fitvc(CD_coef, i3c_ERI, Ecoul2,  g_vc,
       integer k_at,l_at,atom_c_in,atom_d_in
       logical v_nonzero
       external nxtask
+      double precision ddot
+      external ddot
 c
       if(dermat) call errquit(
      C     'fitvc: dermat not coded yet',0,0)
diff --git a/src/nwdft/dftgrad/dftg_cdfit.F b/src/nwdft/dftgrad/dftg_cdfit.F
index ed52e51aa30..93adc88e375 100644
--- a/src/nwdft/dftgrad/dftg_cdfit.F
+++ b/src/nwdft/dftgrad/dftg_cdfit.F
@@ -120,6 +120,8 @@ Subroutine dftg_cdfit_gen(geom, AO_bas_han, CD_bas_han,
 c
       double precision dabsmax
       external nxtask,schwarz_shell,dabsmax
+      double precision ddot
+      external ddot
       nproc  = ga_nnodes()
       me = ga_nodeid()
 c
diff --git a/src/nwdft/dftgrad/dftg_gridv0b.F b/src/nwdft/dftgrad/dftg_gridv0b.F
index 2bc6394e83c..6425a6f70ff 100644
--- a/src/nwdft/dftgrad/dftg_gridv0b.F
+++ b/src/nwdft/dftgrad/dftg_gridv0b.F
@@ -158,6 +158,9 @@ subroutine dftg_gridv0b(nqpts,rad,ictr_buf,iga_dens,
 c dVxc*P contribution.
       logical do_tddftvxc
       logical ldew2 ! Prevent weighting of derivative matrices
+
+      double precision ddot
+      external ddot
 c     
 c           Evaluate the AO basis set at each of the quad. points.
 c           allocate arrays for exponents and contraction coefficients
diff --git a/src/nwdft/grid/grid_quadv0b.F b/src/nwdft/grid/grid_quadv0b.F
index 5583a2e961e..a1037354f5c 100644
--- a/src/nwdft/grid/grid_quadv0b.F
+++ b/src/nwdft/grid/grid_quadv0b.F
@@ -101,6 +101,9 @@ subroutine grid_quadv0b(
       double precision StericEnergy_qm, StericEnergy_fde
       double precision StericEnergy_tot
 
+      double precision ddot
+      external ddot
+
       pname = 'grid_quadv0b: '
       mbf_fde = 0
 !      npol = 0
diff --git a/src/nwdft/so_dft/dft_scf_utils_so.F b/src/nwdft/so_dft/dft_scf_utils_so.F
index 4c9ad6ff8c8..0dad21f99f4 100644
--- a/src/nwdft/so_dft/dft_scf_utils_so.F
+++ b/src/nwdft/so_dft/dft_scf_utils_so.F
@@ -85,6 +85,8 @@ subroutine diag_fock(nbf_mo,ia,g_fockso,ibuff,g_moso,iwork,irwork,
       integer info
 c
       integer i,j,i1
+      double precision ddot
+      external ddot
 c
 c     Prepare arrays for diagonalization
 c
diff --git a/src/nwdft/so_dft/sym_mo_adapt_so.F b/src/nwdft/so_dft/sym_mo_adapt_so.F
index eea60379d26..eec941ac15e 100644
--- a/src/nwdft/so_dft/sym_mo_adapt_so.F
+++ b/src/nwdft/so_dft/sym_mo_adapt_so.F
@@ -62,6 +62,8 @@ subroutine sym_movecs_adapt_so(basis, thresh, g_vecs, irs, nmixed)
       integer l_u,k_u
       logical sym_char_table_so
       external sym_char_table_so
+      double precision ddot
+      external ddot
 c
       logical odebug            ! True if debugging
       logical owarn             ! True if to print warning messages
diff --git a/src/nwdft/xc/xc_eval_fnl.F b/src/nwdft/xc/xc_eval_fnl.F
index ba491d83902..b81915a0b13 100644
--- a/src/nwdft/xc/xc_eval_fnl.F
+++ b/src/nwdft/xc/xc_eval_fnl.F
@@ -98,6 +98,8 @@ Subroutine xc_eval_fnl(rho, delrho, Amat, Amat2, Cmat, Cmat2,
       double precision eps,dumd
       integer nx,nc,dumi
       parameter (eps=1.e-8)
+      double precision ddot
+      external ddot
 c
 c     Initialize the XC potential and energy sampling matrices.
 c
diff --git a/src/nwdft/xc/xc_tabcd.F b/src/nwdft/xc/xc_tabcd.F
index c457761ec28..aae03ad0688 100644
--- a/src/nwdft/xc/xc_tabcd.F
+++ b/src/nwdft/xc/xc_tabcd.F
@@ -209,6 +209,8 @@ Subroutine xc_tabcd(what,l3d_dum,
       data nbhandl2 /0./
       save nbhandl1
       save nbhandl2
+      double precision ddot
+      external ddot
 c
 c         0: l3d=.f.    & n3d=1
 ccc     rhs: l3d=.true. & n3d=3
diff --git a/src/nwpw/band/minimizer/c_bybminimize.F b/src/nwpw/band/minimizer/c_bybminimize.F
index 21b60b2c8c0..4f085641d70 100644
--- a/src/nwpw/band/minimizer/c_bybminimize.F
+++ b/src/nwpw/band/minimizer/c_bybminimize.F
@@ -86,6 +86,8 @@ subroutine c_bybminimize(E,deltae,deltac,current_iteration,
       external cpsi_1_noupdate_energy,cpsi_eigenvalue,ion_disp_energy
       logical  ion_disp_on
       external ion_disp_on
+      double precision ddot
+      external ddot
    
 
       Ein = E(1)
diff --git a/src/nwpw/band/minimizer/c_bybminimize2.F b/src/nwpw/band/minimizer/c_bybminimize2.F
index 0b21cf07a97..190d6b6c69a 100644
--- a/src/nwpw/band/minimizer/c_bybminimize2.F
+++ b/src/nwpw/band/minimizer/c_bybminimize2.F
@@ -106,6 +106,8 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
       external control_ks_algorithm
       integer  control_ks_maxit_orb,control_ks_maxit_orbs
       external control_ks_maxit_orb,control_ks_maxit_orbs
+      double precision ddot
+      external ddot
 
       Ein = E(1)
       call Parallel_taskid(taskid)
diff --git a/src/nwpw/nwpwlib/ion/ion.F b/src/nwpw/nwpwlib/ion/ion.F
index c8d93364850..00ed875e88f 100644
--- a/src/nwpw/nwpwlib/ion/ion.F
+++ b/src/nwpw/nwpwlib/ion/ion.F
@@ -4797,6 +4797,8 @@ subroutine ion_eckart(oprint,NAT,NAT3,removerotation,
       double precision test_norm
       integer i, j, k, l, m, n, mu, nu, indx, iatom, iaxis, itemp
       integer nhess, nhesst,IMAX
+      double precision ddot
+      external ddot
 C****
       nhess = nat3*nat3
       nhesst =  nat3*(nat3+1)/2 ! dimension of lower triangular hessian
diff --git a/src/optim/mepgs/mepgs_drv.F b/src/optim/mepgs/mepgs_drv.F
index 4e5f71883ac..8cd5e12b5f9 100644
--- a/src/optim/mepgs/mepgs_drv.F
+++ b/src/optim/mepgs/mepgs_drv.F
@@ -430,6 +430,8 @@ subroutine mepgs_cent(rtdb, geom, geoma, pgref, sfactor, string)
       logical gsopt_geom_cart_coords_get
       logical gsopt_geom_cart_coords_set
       logical ophigh
+      double precision ddot
+      external ddot
 c
       ophigh = util_print('high', print_high)
 CCCCCCCCCCCCCCCCCCCCCC
@@ -511,6 +513,8 @@ double precision function mepgs_cosang(avec,bvec,angle)
       double precision avec(nvar), bvec(nvar)
 c
       double precision ctheta, factor(2)
+      double precision ddot
+      external ddot
 c
       mepgs_cosang = 0.0
 c
@@ -555,6 +559,8 @@ subroutine mepgs_hessian_update()
       double precision dsds, dshds, dsdg
       integer l_hess, k_hess, i, j
       integer ind
+      double precision ddot
+      external ddot
       ind(i,j) = k_hess + i + (j-1)*nvar - 1
 c
       if (.not. ma_push_get(mt_dbl, nvar**2, 'hess',
@@ -2188,6 +2194,8 @@ subroutine gsopt_hessian_update()
       double precision dsds, dshds, dsdg
       integer l_hess, k_hess, i, j
       integer ind
+      double precision ddot
+      external ddot
       ind(i,j) = k_hess + i + (j-1)*nvar - 1
 c
       if (alpha .eq. 0d0) call errquit
@@ -2462,6 +2470,8 @@ subroutine gsopt_cart_pmat(rtdb, geom)
       integer ind
       logical task_qmmm
       logical gsopt_geom_cart_coords_get
+      double precision ddot
+      external ddot
       ind(i,j) = k_pmat + i-1 + (j-1)*ncart
 c     
 c     FRACTIONAL?
@@ -2656,6 +2666,8 @@ subroutine gsopt_compute_info()
       double precision desphere(max_nvar)
       double precision zeta(max_nvar)
       double precision norm
+      double precision ddot
+      external ddot
 c
 c     Compute stuff used for printing and convergence tests
 c
@@ -3058,6 +3070,8 @@ subroutine gsopt_pickstp(rtdb, geom, istep)
       double precision trustds  ! restriction of step in opt. variable
       logical ophigh
       logical gsopt_geom_cart_coords_get
+      double precision ddot
+      external ddot
 c
 c     get the hessian and gradient with appropriate projectors
 c     applied following peng, ayala, schlegel and frisch so that
diff --git a/src/optim/neb/neb_drv.F b/src/optim/neb/neb_drv.F
index c6a2f578630..e69f709c609 100644
--- a/src/optim/neb/neb_drv.F
+++ b/src/optim/neb/neb_drv.F
@@ -52,6 +52,8 @@ logical function neb(rtdb)
       external energy_bead_list
       character*7 bead_index_name
       external    bead_index_name
+      double precision ddot
+      external ddot
 
 
       oprint = ga_nodeid() .eq. 0
diff --git a/src/optim/string/string.F b/src/optim/string/string.F
index e45ff408343..d17f976a0c4 100644
--- a/src/optim/string/string.F
+++ b/src/optim/string/string.F
@@ -467,6 +467,9 @@ subroutine zts_meps(maxit,nbeads,tol,stepsize,string_algorithm,
       character*7 bead_index_name
       external    bead_index_name
 
+      double precision ddot
+      external ddot
+
 ! Setup the problem
 ! "Secret" options with defaults that generally do not need changed
       if (.not. rtdb_get(rtdb, 'string:linopt', mt_log,1,linopt))
diff --git a/src/optim/tropt/tropt_drv.F b/src/optim/tropt/tropt_drv.F
index 332d50dc157..b369854d6d8 100644
--- a/src/optim/tropt/tropt_drv.F
+++ b/src/optim/tropt/tropt_drv.F
@@ -1392,6 +1392,8 @@ subroutine tropt_hessian_update()
       double precision dsds, dshds, dsdg
       integer l_hess, k_hess, i, j
       integer ind
+      double precision ddot
+      external ddot
       ind(i,j) = k_hess + i + (j-1)*nvar - 1
 c
       if (alpha .eq. 0d0) call errquit
@@ -1781,6 +1783,8 @@ subroutine tropt_cart_pmat(rtdb, geom)
       integer ind
       logical task_qmmm
       logical tropt_geom_cart_coords_get
+      double precision ddot
+      external ddot
       ind(i,j) = k_pmat + i-1 + (j-1)*ncart
 c     
 c     FRACTIONAL?
@@ -2515,6 +2519,8 @@ subroutine tropt_pickstp(rtdb,geom,istep)
       double precision trustds  ! restriction of step in opt. variable
       logical geom_print_zmatrix
       logical ophigh
+      double precision ddot
+      external ddot
 c
 c     get the hessian and gradient with appropriate projectors
 c     applied following peng, ayala, schlegel and frisch so that
@@ -2929,6 +2935,8 @@ subroutine tropt_etaylor(epredict)
       integer l_hess, k_hess
       double precision epredict
       double precision gamma(nvar)
+      double precision ddot
+      external ddot
 !
 !     *** calculate predicted energy change ***
 !
diff --git a/src/symmetry/sym_bs_ir_id.F b/src/symmetry/sym_bs_ir_id.F
index 0c86e6c8add..917ee9b021f 100644
--- a/src/symmetry/sym_bs_ir_id.F
+++ b/src/symmetry/sym_bs_ir_id.F
@@ -47,6 +47,9 @@ subroutine sym_bas_irrep_id(basis, oprint, mbf, njr,
  
       integer ncent_unique
       integer centlist(100), jc
+c
+      double precision ddot
+      external ddot
 c
 c     Get basis and geom info
 c     
diff --git a/src/symmetry/sym_bs_irrep.F b/src/symmetry/sym_bs_irrep.F
index c89c695fae7..f2ad6f3c8ba 100644
--- a/src/symmetry/sym_bs_irrep.F
+++ b/src/symmetry/sym_bs_irrep.F
@@ -40,6 +40,9 @@ subroutine sym_bas_irreps(basis, oprint, nbf_per_ir)
 c
       double precision sym_trace_bas_op
       external sym_trace_bas_op
+c
+      double precision ddot
+      external ddot
 c
 c     Get basis and geom info
 c     
diff --git a/src/symmetry/sym_mo_adapt.F b/src/symmetry/sym_mo_adapt.F
index 8c06df96f55..efd6357d31e 100644
--- a/src/symmetry/sym_mo_adapt.F
+++ b/src/symmetry/sym_mo_adapt.F
@@ -57,6 +57,8 @@ subroutine sym_movecs_adapt(basis, thresh, g_vecs, irs, nmixed)
       double precision v(maxireps), vnorm
       integer idamax
       external idamax
+      double precision ddot
+      external ddot
 c
       logical odebug            ! True if debugging
       logical owarn             ! True if to print warning messages
diff --git a/src/tce/ddotfile.F b/src/tce/ddotfile.F
index 40a0428e9b9..2da5f52a19f 100644
--- a/src/tce/ddotfile.F
+++ b/src/tce/ddotfile.F
@@ -29,6 +29,8 @@ double precision function ddotfile(d_1,d_2,size)
       EXTERNAL NXTASK
 cc      external nxtask
       logical noloadbalance
+      double precision ddot
+      external ddot
 c
 c     new
 c
@@ -143,6 +145,8 @@ double precision function ddotfile_1(d_1,d_2,size)
       INTEGER NXTASK
       EXTERNAL NXTASK
       logical noloadbalance
+      double precision ddot
+      external ddot
 c
 c     new
 c
@@ -267,6 +271,8 @@ double precision function ddotfile_2(d_1,d_2,size)
       INTEGER NXTASK
       EXTERNAL NXTASK
       logical noloadbalance
+      double precision ddot
+      external ddot
 c
 c     new
 c
diff --git a/src/tce/tce_diagnose_t1.F b/src/tce/tce_diagnose_t1.F
index 15c4c0c23f0..b4444cdb989 100644
--- a/src/tce/tce_diagnose_t1.F
+++ b/src/tce/tce_diagnose_t1.F
@@ -21,6 +21,8 @@ subroutine tce_diagnose_t1(d_r1,k_r1_offset,residual)
       INTEGER NXTASK
       EXTERNAL NXTASK
       logical nodezero
+      double precision ddot
+      external ddot
 c
 c     =====================
 c     Zero scratch residual
diff --git a/src/tce/tce_mbpt2.F b/src/tce/tce_mbpt2.F
index 9ff749dd271..fbf1e8eceed 100644
--- a/src/tce/tce_mbpt2.F
+++ b/src/tce/tce_mbpt2.F
@@ -35,6 +35,8 @@ subroutine tce_mbpt2(d_mo2e,k_2e_offset,
       double precision cpu
       double precision wall
       logical nodezero
+      double precision ddot
+      external ddot
 c
       nodezero=(ga_nodeid().eq.0)
       cpu=-util_cpusec()
diff --git a/src/tce/tce_residual_t1.F b/src/tce/tce_residual_t1.F
index a6ccae7ff1c..2b42a9835af 100644
--- a/src/tce/tce_residual_t1.F
+++ b/src/tce/tce_residual_t1.F
@@ -27,6 +27,8 @@ subroutine tce_residual_t1(d_r1,k_r1_offset,residual)
       INTEGER NXTASK
       EXTERNAL NXTASK
       logical nodezero
+      double precision ddot
+      external ddot
 c
 c     new
 c
@@ -130,6 +132,8 @@ subroutine tce_residual_tr1(d_r1,k_r1_offset,residual)
       INTEGER NXTASK
       EXTERNAL NXTASK
       logical nodezero
+      double precision ddot
+      external ddot
 c
 c     new
 c
diff --git a/src/tce/tce_residual_t2.F b/src/tce/tce_residual_t2.F
index 938f386dd5b..0c105123c4f 100644
--- a/src/tce/tce_residual_t2.F
+++ b/src/tce/tce_residual_t2.F
@@ -30,6 +30,8 @@ subroutine tce_residual_t2(d_r2,k_r2_offset,residual)
       INTEGER NXTASK
       EXTERNAL NXTASK
       logical nodezero
+      double precision ddot
+      external ddot
 c
 c     new
 c
@@ -139,6 +141,8 @@ subroutine tce_residual_tr2(d_r2,k_r2_offset,residual)
       INTEGER NXTASK
       EXTERNAL NXTASK
       logical nodezero
+      double precision ddot
+      external ddot
 c
 c     new
 c
@@ -252,6 +256,8 @@ subroutine tce_residual_t2a(d_r2,k_r2_offset,residual)
       INTEGER NXTASK
       EXTERNAL NXTASK
       logical nodezero
+      double precision ddot
+      external ddot
 c
 c     =====================
 c     Zero scratch residual
diff --git a/src/tce/tce_residual_t3.F b/src/tce/tce_residual_t3.F
index 34c671092cc..5c9d2389587 100644
--- a/src/tce/tce_residual_t3.F
+++ b/src/tce/tce_residual_t3.F
@@ -30,6 +30,8 @@ subroutine tce_residual_t3(d_r3,k_r3_offset,residual)
       integer nxtask
       external nxtask
       logical nodezero
+      double precision ddot
+      external ddot
 c
 c     new
 c
@@ -173,6 +175,8 @@ subroutine tce_residual_tr3(d_r3,k_r3_offset,residual)
       integer nxtask
       external nxtask
       logical nodezero
+      double precision ddot
+      external ddot
 c
 c     new
 c
diff --git a/src/tce/tce_residual_t3a.F b/src/tce/tce_residual_t3a.F
index 7a32c4728dd..0353ca44a58 100644
--- a/src/tce/tce_residual_t3a.F
+++ b/src/tce/tce_residual_t3a.F
@@ -31,6 +31,8 @@ subroutine tce_residual_t3a(d_r3,k_r3_offset,residual)
       external nxtask
       logical nodezero
       logical acolo
+      double precision ddot
+      external ddot
 c
 c     new
 c
diff --git a/src/tce/tce_residual_t4.F b/src/tce/tce_residual_t4.F
index 090fb7cd38c..86413aea57d 100644
--- a/src/tce/tce_residual_t4.F
+++ b/src/tce/tce_residual_t4.F
@@ -32,6 +32,8 @@ subroutine tce_residual_t4(d_r4,k_r4_offset,residual)
       integer nxtask
       external nxtask
       logical nodezero
+      double precision ddot
+      external ddot
 c
 c     new
 c
@@ -208,6 +210,8 @@ subroutine tce_residual_tr4(d_r4,k_r4_offset,residual)
       integer nxtask
       external nxtask
       logical nodezero
+      double precision ddot
+      external ddot
 c
 c     new
 c
diff --git a/src/util/util.fh b/src/util/util.fh
index 6c01182d3ec..95d91b1de95 100644
--- a/src/util/util.fh
+++ b/src/util/util.fh
@@ -2,7 +2,6 @@
 c      
 C$Id$
       logical util_print
-      double precision ddot
       double precision util_cpusec
       double precision util_wallsec
       double precision util_random
@@ -12,7 +11,6 @@ C$Id$
       logical util_nwchemrc_get
       logical util_module_avail
       external util_print
-      external ddot
       external util_cpusec
       external util_wallsec
       external util_random
diff --git a/src/vib/vib_eckart.F b/src/vib/vib_eckart.F
index 1d316b3ab36..42b431b632f 100644
--- a/src/vib/vib_eckart.F
+++ b/src/vib/vib_eckart.F
@@ -22,6 +22,8 @@ SUBROUTINE vib_eckart( HESS, HESSP, HESST, COORD, VC ,
       DOUBLE PRECISION UNIVEC(3), TEST(6,6), VNORM, temp, dotval, rnorm
       double precision test_norm
       integer i, j, k, l, m, n, mu, nu, indx, iatom, iaxis, itemp
+      double precision ddot
+      external ddot
 C****
 C**** construct translation unit vectors;  these are stored in the
 C**** first three columns of array VC, the rotation vectors will
@@ -200,6 +202,8 @@ SUBROUTINE vib_eckart_trans( HESS, HESSP, HESST, COORD, VC ,
       DOUBLE PRECISION UNIVEC(3), TEST(3,3), VNORM, temp, dotval, rnorm
       double precision test_norm
       integer i, j, k, l, m, n, mu, nu, indx, iatom, iaxis, itemp
+      double precision ddot
+      external ddot
 C****
 C**** construct translation unit vectors;  these are stored in the
 C**** first three columns of array VC, the rotation vectors will
diff --git a/src/vib/vib_nmass.F b/src/vib/vib_nmass.F
index ea81a2d2b5c..fed5956d4d8 100644
--- a/src/vib/vib_nmass.F
+++ b/src/vib/vib_nmass.F
@@ -39,6 +39,8 @@ subroutine vib_vecnormal(vectors,nvec)
 c
       double precision vnorm
       integer col
+      double precision ddot
+      external ddot
       do col = 1,nvec
         vnorm = ddot(nvec,vectors(1,col),1,vectors(1,col),1)
         vnorm = sqrt(1.0d00/vnorm)
@@ -60,6 +62,8 @@ subroutine vib_vecphase(coord,vectors,nvec)
 c
       double precision vnorm
       integer col
+      double precision ddot
+      external ddot
       do col = 1,nvec
         vnorm = ddot(nvec,coord,1,vectors(1,col),1)
         if (vnorm.lt.0.0d0) then
diff --git a/src/vib/vib_tors.F b/src/vib/vib_tors.F
index cbacf329fc8..d0e4cb9fb2c 100644
--- a/src/vib/vib_tors.F
+++ b/src/vib/vib_tors.F
@@ -32,6 +32,8 @@ SUBROUTINE vib_TORS(EQVAL,NOINT,I,J,K,L,C,B,NDIM)
       double precision dotpj, dotpk, sinpj, sinpk
       double precision smi, smj, sml, sense, f1, f2, dot
       integer m, nocol1, nocol2, nocol3, nocol4
+      double precision ddot
+      external ddot
 C
 C
 C

From a367b13b5aed4722faaf0d768b60015f90c6640e Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Tue, 4 May 2021 10:26:37 -0700
Subject: [PATCH 02/45] declare ddot in ga_mix.F after its removal from util.fh

---
 src/util/ga_mix.F | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/util/ga_mix.F b/src/util/ga_mix.F
index 58cd548332b..7e29313ff68 100644
--- a/src/util/ga_mix.F
+++ b/src/util/ga_mix.F
@@ -5,6 +5,8 @@ subroutine ga_mix(g_a, n, nvec, b, ld)
 #include "global.fh"
 #include "mafdecls.fh"
 #include "util.fh"
+      external ddot
+      double precision ddot
       integer g_a
       integer n, nvec, ld
       double precision b(ld,nvec)

From 90414b378c3697ea97ab21688e14203943ac87dd Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Wed, 5 May 2021 10:07:16 -0700
Subject: [PATCH 03/45] use curl when available

---
 src/nwpw/nwpwlib/nwpwxc/build_dftd3a.sh | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/nwpw/nwpwlib/nwpwxc/build_dftd3a.sh b/src/nwpw/nwpwlib/nwpwxc/build_dftd3a.sh
index 84e78e67fcd..45cf56a1951 100755
--- a/src/nwpw/nwpwlib/nwpwxc/build_dftd3a.sh
+++ b/src/nwpw/nwpwlib/nwpwxc/build_dftd3a.sh
@@ -1,10 +1,17 @@
 #!/usr/bin/env bash
 rm -f dftd3.f nwpwxc_vdw3a.F
-if [[ -f "dftd3.tgz" ]]; then
-    echo "using existing" dftd3.tgz
+URL="https://www.chemie.uni-bonn.de/pctc/mulliken-center/software/dft-d3/"
+TGZ=dftd3.tgz
+if [[ -f "$TGZ" ]]; then
+    echo "using existing" "$TGZ"
 else
-    echo "downloading"  dftd3.tgz
-    wget https://www.chemie.uni-bonn.de/pctc/mulliken-center/software/dft-d3/dftd3.tgz
+    echo "downloading"  "$TGZ"
+    CURL_YES=`curl  -O 2>&1 | head -1  | awk ' /URL/ {print "Y";exit};{print "N"}'`
+    if [ $CURL_YES = "Y" ];	then
+	curl -L "$URL"/"$TGZ" -o "$TGZ"
+    else
+	wget "$URL"/"$TGZ"
+    fi
 fi    
 tar xzf dftd3.tgz dftd3.f
 mv dftd3.f nwpwxc_vdw3a.F

From ff287e202bca309e49af1a640a3053c81c200d6b Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Wed, 5 May 2021 12:16:55 -0700
Subject: [PATCH 04/45] silence compiler warnings

---
 src/nwdft/xc/xc_cr2scan.F  |  5 +++++
 src/nwdft/xc/xc_cr2scanl.F | 19 +++++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/nwdft/xc/xc_cr2scan.F b/src/nwdft/xc/xc_cr2scan.F
index 88f8b06ec90..02112b67052 100644
--- a/src/nwdft/xc/xc_cr2scan.F
+++ b/src/nwdft/xc/xc_cr2scan.F
@@ -160,6 +160,11 @@ Subroutine xc_cr2scan(tol_rho, cfac, rho, delrho, Amat,
            ds = 1d0
            dx = 1d0
            gc = 1d0
+           zeta=0d0
+           opz=1d0
+           omz=1d0
+           opz23=1d0
+           omz23=1d0
          else
            zeta = (rho(n,2) - rho(n,3))/ntot      
            if (zeta.lt.-1d0) zeta=-1d0
diff --git a/src/nwdft/xc/xc_cr2scanl.F b/src/nwdft/xc/xc_cr2scanl.F
index 2e174c9746f..8f8f3892ccb 100644
--- a/src/nwdft/xc/xc_cr2scanl.F
+++ b/src/nwdft/xc/xc_cr2scanl.F
@@ -162,6 +162,12 @@ Subroutine xc_cr2scanl(tol_rho, cfac, rho, delrho, laprho, Amat,
      &         (delrho(n,3,1)+delrho(n,3,2))**2
          end if
 c         
+         dtdnb = 0d0
+         dtdgb = 0d0
+         dtdlb = 0d0
+         dtdna = 0d0
+         dtdga = 0d0
+         dtdla = 0d0
          if (ipol.eq.1) then
            pa = dn2/(4d0*ckf2*n83)
            qa = laprho(n,1)/(4d0*ckf2*n53)
@@ -192,10 +198,6 @@ Subroutine xc_cr2scanl(tol_rho, cfac, rho, delrho, laprho, Amat,
      &                      F53*dfsdqa*qa/rho(n,2))
              dtdga = tuega*dfsdpa/(ckf2*(2d0*rho(n,2))**F83)
              dtdla = tuega*dfsdqa/(2d0*ckf2*(2d0*rho(n,2))**F53)
-           else
-             dtdna = 0d0
-             dtdga = 0d0
-             dtdla = 0d0
            endif
            if (rho(n,3).gt.tol_rho) then
              pb = (delrho(n,1,2)**2 +
@@ -207,10 +209,6 @@ Subroutine xc_cr2scanl(tol_rho, cfac, rho, delrho, laprho, Amat,
      &                      F53*dfsdqb*qb/rho(n,3))
              dtdgb = tuegb*dfsdpb/(ckf2*(2d0*rho(n,3))**F83)
              dtdlb = tuegb*dfsdqb/(2d0*ckf2*(2d0*rho(n,3))**F53)
-           else
-             dtdnb = 0d0
-             dtdgb = 0d0
-             dtdlb = 0d0
            endif
          endif
 
@@ -229,6 +227,11 @@ Subroutine xc_cr2scanl(tol_rho, cfac, rho, delrho, laprho, Amat,
            ds = 1d0
            dx = 1d0
            gc = 1d0
+           zeta=0d0
+           opz=1d0
+           omz=1d0
+           opz23=1d0
+           omz23=1d0
          else
            zeta = (rho(n,2) - rho(n,3))/ntot      
            if (zeta.lt.-1d0) zeta=-1d0

From ac743fa11fb9cd5b71d28be51d2746069db98d98 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Wed, 5 May 2021 17:34:52 -0700
Subject: [PATCH 05/45] added 5 download attempts

---
 src/nwpw/nwpwlib/nwpwxc/build_dftd3a.sh | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/nwpw/nwpwlib/nwpwxc/build_dftd3a.sh b/src/nwpw/nwpwlib/nwpwxc/build_dftd3a.sh
index 45cf56a1951..e27626d73c4 100755
--- a/src/nwpw/nwpwlib/nwpwxc/build_dftd3a.sh
+++ b/src/nwpw/nwpwlib/nwpwxc/build_dftd3a.sh
@@ -7,12 +7,18 @@ if [[ -f "$TGZ" ]]; then
 else
     echo "downloading"  "$TGZ"
     CURL_YES=`curl  -O 2>&1 | head -1  | awk ' /URL/ {print "Y";exit};{print "N"}'`
-    if [ $CURL_YES = "Y" ];	then
-	curl -L "$URL"/"$TGZ" -o "$TGZ"
+    tries=0 ; until [ "$tries" -ge 5 ] ; do
+    if [ $CURL_YES = "Y" ]; then
+	curl -L "$URL"/"$TGZ" -o "$TGZ" && break
     else
-	wget "$URL"/"$TGZ"
+	wget "$URL"/"$TGZ" && break
     fi
-fi    
+    tries=$((tries+1)) ; echo attempt no.  $tries    ; sleep 5 ;  done
+fi
+if [[ ! -f "$TGZ" ]]; then
+    echo "download failed"
+    exit 1
+fi
 tar xzf dftd3.tgz dftd3.f
 mv dftd3.f nwpwxc_vdw3a.F
 patch -p0 < nwpwxc_vdw3a.patch

From a21982ca65871e8bce37d7ec0f0108f9faff3156 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Thu, 6 May 2021 10:37:29 -0700
Subject: [PATCH 06/45] increased buffer size to 1024
 https://github.com/conda-forge/staged-recipes/pull/14725#commitcomment-50480065

---
 src/config/depend.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/config/depend.c b/src/config/depend.c
index 38d07115730..32efadcb7b6 100644
--- a/src/config/depend.c
+++ b/src/config/depend.c
@@ -8,6 +8,7 @@
 #include <errno.h>
 #include <string.h>
 #include <unistd.h>
+#define MAXBUF 1024
 
 extern char *strdup(const char *);
 char *makefile;
@@ -15,7 +16,7 @@ char backup[] = "makefile.bak";
 
 void copy_truncate_makefile(const char *backup)
 {
-    char buf[8192];
+    char buf[32*MAXBUF];
     FILE *in;
     int i, j, ninbuf;
     char line[] = 
@@ -121,7 +122,6 @@ void skip_white_space(FILE *file)
 
 char *include_directive(FILE *file)
 {
-#define MAXBUF 256
     char tmp[MAXBUF];
     int n = 0;
     int i;
@@ -330,7 +330,7 @@ Original code:
 	
     while (nincfile--) {
         char *incname = incfiles[nincfile];
-        char path[256];
+        char path[MAXBUF];
 
         for (i=0; i<nincdir; i++) {
             (void) sprintf(path, "%s/%s", incdirlist[i], incname);

From dc5f73a64a3b6596746e8648f24df838e7a36872 Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Thu, 6 May 2021 14:04:51 -0700
Subject: [PATCH 07/45] Updated control_Ep and control_Sp to avoid using rtdb
 on the fly...EJB

---
 src/nwpw/nwpwlib/control/control.F  | 37 +++++++----------------------
 src/nwpw/nwpwlib/control/control.fh |  4 ++--
 2 files changed, 10 insertions(+), 31 deletions(-)

diff --git a/src/nwpw/nwpwlib/control/control.F b/src/nwpw/nwpwlib/control/control.F
index 5785f1840a4..4c392110865 100644
--- a/src/nwpw/nwpwlib/control/control.F
+++ b/src/nwpw/nwpwlib/control/control.F
@@ -1233,6 +1233,12 @@ logical function control_read(code_in,rtdb)
       if (.not.btdb_get(rtdb,'nwpw:attenuation',mt_dbl,1,attenuation))
      >  attenuation = 0.5d0
 
+*     **** set preconditioning parameters Ep,Sp ****
+      if (.not.btdb_get(rtdb,'nwpw:Eprecondition',mt_dbl,1,Ep))
+     >   Ep = 20.0d0
+      if (.not.btdb_get(rtdb,'nwpw:Sprecondition',mt_dbl,1,Sp))
+     >   Sp = 200.0d0
+
 *     **** set out of time variables ****
       est_step_time   = -1
       est_finish_time = -1
@@ -4178,7 +4184,6 @@ subroutine control_mullikenparameters(atom,rcut,lmbda)
          
 
 
-
 *     ***************************
 *     *                	   	*
 *     *        control_Ep	*
@@ -4187,20 +4192,7 @@ subroutine control_mullikenparameters(atom,rcut,lmbda)
       real*8 function control_Ep()
       implicit none
 
-#include "bafdecls.fh"
-#include "btdb.fh"
-
-*     **** control_rtdb common block ****
-      integer rtdb
-      common / control_rtdb1 / rtdb
-
-      real*8 Ep
-
-      if (.not.btdb_get(rtdb,'nwpw:Eprecondition',
-     >                  mt_dbl,1,Ep)) 
-     >  then
-         Ep = 20.0d0
-      end if
+#include "control.fh"
 
       control_Ep = Ep
       return
@@ -4216,20 +4208,7 @@ real*8 function control_Ep()
       real*8 function control_Sp()
       implicit none
 
-#include "bafdecls.fh"
-#include "btdb.fh"
-
-*     **** control_rtdb common block ****
-      integer rtdb
-      common / control_rtdb1 / rtdb
-
-      real*8 Sp
-      
-      if (.not.btdb_get(rtdb,'nwpw:Sprecondition',
-     >                  mt_dbl,1,Sp))
-     >  then
-         Sp = 200.0d0
-      end if
+#include "control.fh"
 
       control_Sp = Sp
       return
diff --git a/src/nwpw/nwpwlib/control/control.fh b/src/nwpw/nwpwlib/control/control.fh
index 58e785b2f45..c35b7618eeb 100644
--- a/src/nwpw/nwpwlib/control/control.fh
+++ b/src/nwpw/nwpwlib/control/control.fh
@@ -15,7 +15,7 @@
       real*8      tolerances(3),scaling(2),sa_decay(2)
       real*8      time_step,fake_mass,ks_alpha,fractional_alpha
       real*8      ecut,wcut,rcut
-      real*8      cpu1_time,cpu2_time,attenuation
+      real*8      cpu1_time,cpu2_time,attenuation,Ep,Sp
       real*8      bo_time_step,bo_fake_mass,kerker_g0
       real*8      smooth_cutoff_values(2)
       integer     bo_steps(2),bo_algorithm
@@ -35,7 +35,7 @@
      >                         scaling,sa_decay,smooth_cutoff_values,
      >                         time_step,fake_mass,ks_alpha,
      >                         fractional_alpha,
-     >                         ecut,wcut,rcut,attenuation,
+     >                         ecut,wcut,rcut,attenuation,Ep,Sp,
      >                         bo_time_step,bo_fake_mass,kerker_g0,
      >                         bo_steps,bo_algorithm,
      >                         mapping,mapping1d,np_dimensions,

From 4f6ec0b7b8eba18b7db8c09fced0eb3fe1fdac29 Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Thu, 6 May 2021 14:29:02 -0700
Subject: [PATCH 08/45] ...EJB

---
 src/nwpw/band/lib/ke/cke.F | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/nwpw/band/lib/ke/cke.F b/src/nwpw/band/lib/ke/cke.F
index 2f3951ce72b..7a65c11bb4a 100644
--- a/src/nwpw/band/lib/ke/cke.F
+++ b/src/nwpw/band/lib/ke/cke.F
@@ -102,11 +102,12 @@ subroutine cke_init()
      >                           dbl_mb(tg_indx+(nb-1)*npack1))
          end do
       end if
-      
+
       value =           BA_pop_stack(tmp2(2))
       value = value.and.BA_pop_stack(tmp1(2))
       if (.not. value)
      > call errquit('cke_init:popping stack memory',0,MA_ERR)
+
       return
       end
 

From 7cfcb81a605409f87266135fc35070c147c58648 Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Thu, 6 May 2021 14:30:53 -0700
Subject: [PATCH 09/45] ...EJB

---
 src/nwpw/pspw/lib/psi/psi.F | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/nwpw/pspw/lib/psi/psi.F b/src/nwpw/pspw/lib/psi/psi.F
index 3d0912e2e2e..8fabb8df945 100644
--- a/src/nwpw/pspw/lib/psi/psi.F
+++ b/src/nwpw/pspw/lib/psi/psi.F
@@ -304,8 +304,9 @@ subroutine psi_minimize_f_orb()
      >                               0.001d0,ii,error_out,e0)
           !write(*,*) "e0:",ii,l,e0,error_out
           l = l+1
-          if ((error_out.gt.maxerror).and.(l.le.4)) go to 3
-          if ((error_out.gt.maxerror).and.(l2.le.1)) then
+          if ((error_out.gt.maxerror).and.(l.le.(1+(l2-1)*3))) go to 3
+          if (((error_out.gt.maxerror).or.(e0.gt.4.0d0))
+     >        .and.(l2.le.1)) then
            call Pack_c_Zero(1,dcpl_mb(psi1(1) +(ii-1)*npack1))
            call Pack_c_setzero(1,1.0d0,dcpl_mb(psi1(1) +(ii-1)*npack1))
            go to 2
@@ -2785,7 +2786,8 @@ subroutine psi_minimize_virtual()
      >                               0.001d0,ii,error_out,e0)
           l  = l+1
           if ((error_out.gt.maxerror).and.(l.le.(1+(l2-1)*3))) go to 3
-          if ((error_out.gt.maxerror).and.(l2.le.1)) then
+          if (((error_out.gt.maxerror).or.(e0.gt.4.0d0))
+     >        .and.(l2.le.1)) then
             call Pack_c_Zero(1,
      >               dcpl_mb(psi1_excited(1) +(ii-1)*npack1))
             call Pack_c_setzero(1,1.0d0,

From f935ff14c65dbc9564ba4c85bea8ddd10b7e15d1 Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Thu, 6 May 2021 16:00:20 -0700
Subject: [PATCH 10/45] ...EJB

---
 src/nwpw/band/lib/psi/cpsi_KS.F | 120 +++++++++++++++++++++++++++-----
 1 file changed, 101 insertions(+), 19 deletions(-)

diff --git a/src/nwpw/band/lib/psi/cpsi_KS.F b/src/nwpw/band/lib/psi/cpsi_KS.F
index 3e464ec36d3..c918b2157de 100644
--- a/src/nwpw/band/lib/psi/cpsi_KS.F
+++ b/src/nwpw/band/lib/psi/cpsi_KS.F
@@ -2650,22 +2650,23 @@ subroutine cpsi_minimize_virtual()
       implicit none
 
 #include "bafdecls.fh"
+#include "errquit.fh"
 #include "cpsi_common.fh"
       
       !*** local variables ***
-      integer maxit_orb,taskid_k
+      integer maxit_orb
       integer ii,l,l2,nb,epsi_ptr,eig_ptr
       real*8  sum,maxerror,error_out,e0
 
       !*** external functions ***
       integer  cpsi_data_get_ptr
-      real*8   control_tole
       external cpsi_data_get_ptr
+      real*8   control_tole
       external control_tole
 
+
       maxit_orb=120
       maxerror = control_tole()
-      call Parallel3d_taskid_k(taskid_k)
 
       do nb=1,nbrillq
       do ii=1,(ne_excited(1)+ne_excited(2))
@@ -2678,10 +2679,7 @@ subroutine cpsi_minimize_virtual()
          call cpsi_project_out_virtual1(nb,ii,dbl_mb(epsi_ptr))
 
          !*** normalize ****
-         call Cram_cc_dot(nb,
-     >            dbl_mb(epsi_ptr),
-     >            dbl_mb(epsi_ptr),
-     >            sum)
+         call Cram_cc_dot(nb,dbl_mb(epsi_ptr),dbl_mb(epsi_ptr),sum)
          sum = 1.0d0/dsqrt(sum)
          call Cram_c_SMul1(nb,sum,dbl_mb(epsi_ptr))
 
@@ -2689,13 +2687,17 @@ subroutine cpsi_minimize_virtual()
          !*** minimize orbital ****
           l = 0
  3        call cpsi_KS_update_virtual(maxit_orb,
-     >                               maxerror,
-     >                               0.001d0,nb,ii,error_out,e0)
+     >                                maxerror,
+     >                                0.001d0,nb,ii,error_out,e0)
           l = l+1
-          if ((error_out.gt.maxerror).and.(l.le.(1+(l2-1)*3))) go to 3
-          if ((error_out.gt.maxerror).and.(l2.le.1)) then
-            call Cram_c_Zero(nb,dbl_mb(epsi_ptr))
-            call Cram_c_setzero(1,1.0d0,dbl_mb(epsi_ptr))
+
+          if ((error_out.gt.maxerror).and.(l.le.(1+(l2-1)*3))) then
+             go to 3
+          end if
+
+          if (((error_out.gt.maxerror).or.(e0.gt.4.0d0))
+     >        .and.(l2.le.1)) then
+            call cpsi_corrector_orb(nb,dbl_mb(epsi_ptr))
             go to 2
           end if 
 
@@ -2703,11 +2705,54 @@ subroutine cpsi_minimize_virtual()
 
       end do
       end do
+
+
       call cpsi_sort_virtual()
 
+
       return
       end
 *
+
+*     ***********************************
+*     *                                 *
+*     *       cpsi_corrector_orb        *
+*     *                                 *
+*     ***********************************
+      subroutine cpsi_corrector_orb(nb,orb)
+      implicit none
+      integer nb
+      complex*16 orb(*)
+
+#include "bafdecls.fh"
+#include "errquit.fh"
+#include "cpsi_common.fh"
+
+      !***** local variables ****
+      integer k,g(2)
+      real*8  sum
+
+      if (.not. BA_push_get(mt_dcpl,nfft3d,'g',g(2),g(1)))
+     >   call errquit('cpsi_corrector_orb:out stack',0,MA_ERR)
+
+      do k=1,nfft3d
+        dcpl_mb(g(1)+k-1)=dcmplx(0.1d0,0.002d0*dsin(0.001d0*k))
+      end do
+      call C3dB_rc_fft3f(1,dcpl_mb(g(1)))
+      call Cram_c_pack(nb,dcpl_mb(g(1)))
+      call Cram_cc_dot(nb,dcpl_mb(g(1)),dcpl_mb(g(1)),sum)
+      sum = 1.0d0/dsqrt(sum)
+      call Cram_c_SMul1(nb,sum,dcpl_mb(g(1)))
+
+      call Parallel_shared_vector_zero(.true.,2*npack1,orb)
+      call Cram_c_Copy(nb,dcpl_mb(g(1)),orb)
+
+      if (.not.BA_pop_stack(g(2)))
+     >   call errquit('cpsi_corrector_orb:pop stack',1,MA_ERR)
+
+      return
+      end 
+
 c      subroutine cpsi_check_orthodebug(nb,i,Horb)
 c      implicit none
 c      integer nb,i
@@ -2942,23 +2987,37 @@ subroutine cpsi_KS_update_virtual(maxiteration,
 
 #include "bafdecls.fh"
 #include "errquit.fh"
+#include "util.fh"
+#include "stdio.fh"
 #include "cpsi_common.fh"
 
 
 *     **** local variables ****
-      logical value,done,oneloop
-      integer it
+c      integer MASTER,taskid
+c      parameter (MASTER=0)
+c      logical oprint
+
+      logical value,done,oneloop,precondition
+      integer it,pit
       real*8 e0,eold,percent_error,error0,de0,lmbda_r0,lmbda_r1
-      real*8 theta
+      real*8 theta,ep,sp
       integer r1(2),t0(2),t(2),g(2)
       integer psi_ptr
 
 *     **** external functions ****
-      integer  cpsi_data_get_ptr
-      external cpsi_data_get_ptr
+      integer  cpsi_data_get_ptr,Pneb_convert_nb
+      external cpsi_data_get_ptr,Pneb_convert_nb
+      logical  control_print
+      external control_print
+      real*8   control_Ep,control_Sp
+      external control_Ep,control_Sp
 
       psi_ptr=cpsi_data_get_ptr(psi1_excited_tag,nb,i)
 
+c      call Parallel3d_taskid_i(taskid)
+c      call Parallel3d_taskid_k(taskid_k)
+c      oprint= ((taskid.eq.MASTER).and.control_print(print_medium))
+
       lmbda_r0 = 1.0d0
 
       value = BA_push_get(mt_dcpl,npack1,'t0',t0(2),t0(1))
@@ -2971,11 +3030,15 @@ subroutine cpsi_KS_update_virtual(maxiteration,
       if (.not. value) call errquit(
      >     'cpsi_KS_update_virtual: out of stack memory',0, MA_ERR)
 
+      ep = control_Ep()
+      sp = control_Sp()
+      precondition = .true.
       done = .false.
       error0 = 0.0d0
       e0 = 0.0d0
       theta = -3.14159d0/600.0d0
       it = 0
+      pit = 0
  2    continue
  
          it = it + 1
@@ -2990,17 +3053,25 @@ subroutine cpsi_KS_update_virtual(maxiteration,
 
          percent_error=0.0d0
          if(error0.ne.0.0d0)
-     A      percent_error = dabs(e0-eold)/error0
+     >      percent_error = dabs(e0-eold)/error0
 
          done = ((it.gt.maxiteration)
      >           .or.
      >           (dabs(e0-eold).lt.maxerror))
 
+         precondition = (dabs(e0-eold).gt.(sp*maxerror))
          if (done) go to 4
 
          call Cram_c_Copy(nb,dcpl_mb(g(1)),dcpl_mb(r1(1)))
          call Cram_cc_daxpy(nb,e0,dbl_mb(psi_ptr),dcpl_mb(r1(1)))
 
+*        **** preconditioning ****
+         if (precondition) then
+            pit = pit + 1
+            call cke_Precondition(nb,npack1,1,
+     >                            dbl_mb(psi_ptr),
+     >                            dcpl_mb(g(1)))
+         end if
 
 *        *** determine conjuagate direction ***
          call Cram_cc_dot(nb,dcpl_mb(r1(1)),
@@ -3051,6 +3122,17 @@ subroutine cpsi_KS_update_virtual(maxiteration,
       if (.not.value) call errquit(
      >     'cpsi_KS_update_virtual: popping stack memory',1,MA_ERR)
 
+c      if (oprint) then
+c         write(luout,921) Pneb_convert_nb(nb),nb,i,-e0,
+c     >                    dabs(e0-eold),it,pit,ep,sp
+c  921 format(5x,"nb",I4,1x,"nbq",I4,1x,"orbital",I4," current e=",E10.3,
+c     >       " (error=",E9.3,")",
+c     >       " iterations",I4,"(",I4,
+c     >       " preconditioned, Ep,Sp=",F5.1,F7.1,")")
+c      end if
+
+
+
       error_out = dabs(e0-eold)
       e0 = -e0
       return

From cb94bc12a3dc913b042942ef4aae426f0927f0fc Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Thu, 6 May 2021 19:55:34 -0700
Subject: [PATCH 11/45] debugging fractional with minimizer 8....EJB

---
 src/nwpw/band/lib/psi/cpsi.F             |  1 +
 src/nwpw/band/minimizer/c_bybminimize2.F | 12 ++-
 src/nwpw/band/minimizer/c_cgsd_energy.F  | 95 ++++++++++++++++++++----
 3 files changed, 92 insertions(+), 16 deletions(-)

diff --git a/src/nwpw/band/lib/psi/cpsi.F b/src/nwpw/band/lib/psi/cpsi.F
index 1856d87d3ae..6a79d74f2d9 100644
--- a/src/nwpw/band/lib/psi/cpsi.F
+++ b/src/nwpw/band/lib/psi/cpsi.F
@@ -2526,6 +2526,7 @@ subroutine cpsi_1define_occupation(initial_alpha,use_hml)
       end
 
 
+
 c  set nwpw:fractional_smeartype 1 #0-none, 1-Fermi-Dirac, 2-Gaussian, 3-Hermite
 c                                   4-Marzari-Vanderbilt
 
diff --git a/src/nwpw/band/minimizer/c_bybminimize2.F b/src/nwpw/band/minimizer/c_bybminimize2.F
index 190d6b6c69a..f0b4e47378c 100644
--- a/src/nwpw/band/minimizer/c_bybminimize2.F
+++ b/src/nwpw/band/minimizer/c_bybminimize2.F
@@ -175,7 +175,7 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
 *     **** set the initial density ****
       if (current_iteration.eq.1) then
          Enew  = cpsi_1energy()
-         !write(*,*) "Enew=",Enew+eion,Enew
+         if (oprint) write(*,*) "Enew=",Enew+eion,Enew
          alpha = control_ks_alpha()
          deltae = -9232323299.0d0
          ks_deltae = tole
@@ -198,9 +198,12 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
             call cpsi_2to1()
          end if
          if (control_fractional()) then
-            call cpsi_1define_occupation(0.0d0,.false.)
+            !call cpsi_1define_occupation(0.0d0,.false.)
+            call cpsi_1define_occupation(-1.0d0,.false.)
             Enew = Enew + cpsi_smearcorrection()
          end if
+         if (oprint) write(*,*) "THIRD EIGPRINT"
+         call cpsi_printeig_debug()
       else
          call cpsi_get_density(1,dbl_mb(rho_in(1)))
          call cpsi_get_density(1,dbl_mb(rho_out(1)))
@@ -270,6 +273,9 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
       if (control_fractional()) Enew = Enew + cpsi_smearcorrection()
       deltae = Enew-Eold
 
+       if (oprint) write(*,*) "Fourth EIGPRINT"
+       call cpsi_printeig_debug()
+
 
       call cpsi_get_density(1,dbl_mb(rho_in(1)))
 
@@ -466,6 +472,8 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
          E(1)  = E(1) + E(33)
       end if
 
+       if (oprint) write(*,*) "FINAL c_bybminmize2 EIGPRINT"
+       call cpsi_printeig_debug()
 
       return
       end
diff --git a/src/nwpw/band/minimizer/c_cgsd_energy.F b/src/nwpw/band/minimizer/c_cgsd_energy.F
index ed634c90a44..e3d8beac409 100644
--- a/src/nwpw/band/minimizer/c_cgsd_energy.F
+++ b/src/nwpw/band/minimizer/c_cgsd_energy.F
@@ -128,27 +128,29 @@ real*8 function c_cgsd_energy(newpsi)
       if ((minimizer.eq.5).or.(minimizer.eq.8)) it_out = 1
 
       if (newpsi) then
-         if (minimizer.ne.4) call c_sdminimize(10)
+         if (minimizer.lt.4) call c_sdminimize(10)
          call c_bybminimize0()
          if (control_fractional()) then
-            Enew  = cpsi_1energy()
+            Enew  = cpsi_1energy() + ewald_e()
+            if (mprint) write(*,*) "Enew+ewald=",Enew
             call cpsi_1gen_hml()
-            !write(*,*) "Start ENEW = ",enew + ewald_e()
-            !if (minimizer.ne.4) then
-               call cpsi_diagonalize_hml()
+            call cpsi_diagonalize_hml()
+
+             if (mprint) write(*,*) "First EIG OCC:"
+             call cpsi_printeig_debug()
+
                call cpsi_1rotate2()
                call cpsi_2to1()
                call cpsi_1define_occupation(-1.0d0,.false.)
-            !else
-            !   call cpsi_1define_occupation(-1.0d0,.true.)
-            !end if
+             if (mprint) write(*,*) "Second EIG OCC:"
+             call cpsi_printeig_debug()
          end if
       end if
 
    2  continue
          icount = icount + 1
          if (stalled) then
-           if (minimizer.ne.4) call c_sdminimize(0)
+           if (minimizer.lt.4) call c_sdminimize(0)
            bfgscount = 0
          end if
 
@@ -247,11 +249,14 @@ real*8 function c_cgsd_energy(newpsi)
 
 
 *     **** diagonalize hamiltonian and rotate psi ****
-      call cpsi_1gen_hml()
-      call cpsi_diagonalize_hml()
-      if (.not.control_fractional()) then
-         call cpsi_1rotate2()
-         call cpsi_2to1()
+      !**** NEED TO CHECK THIS LOGIC AGAIN ****
+      if (minimizer.ne.8) then
+         call cpsi_1gen_hml()
+         call cpsi_diagonalize_hml()
+         if (.not.control_fractional()) then
+            call cpsi_1rotate2()
+            call cpsi_2to1()
+         end if
       end if
       
 
@@ -434,3 +439,65 @@ real*8 function c_cgsd_energy(newpsi)
       end
 
 
+
+      subroutine cpsi_printeig_debug()
+      implicit none
+
+#include "stdio.fh"
+#include "util.fh"
+
+      integer MASTER,taskid
+      parameter (MASTER=0)
+
+      logical mprint
+      integer nb,i
+      real*8 f0,f1,f2,f3,f4,f5,f6
+
+      logical  control_print
+      external control_print
+      integer  brillioun_nbrillioun,cpsi_ne
+      external brillioun_nbrillioun,cpsi_ne
+      real*8   brillioun_weight_brdcst
+      real*8   brillioun_ks_brdcst
+      real*8   brillioun_k_brdcst
+      real*8   cpsi_eigenvalue_brdcst
+      real*8   cpsi_occupation_brdcst
+      external brillioun_weight_brdcst
+      external brillioun_ks_brdcst
+      external brillioun_k_brdcst
+      external cpsi_eigenvalue_brdcst
+      external cpsi_occupation_brdcst
+
+
+      call Parallel_taskid(taskid)
+
+      mprint = (taskid.eq.MASTER).and.control_print(print_medium)
+
+          do nb=1,brillioun_nbrillioun()
+             f0 = brillioun_weight_brdcst(nb)
+             f1 = brillioun_ks_brdcst(1,nb)
+             f2 = brillioun_ks_brdcst(2,nb)
+             f3 = brillioun_ks_brdcst(3,nb)
+             f4 = brillioun_k_brdcst(1,nb)
+             f5 = brillioun_k_brdcst(2,nb)
+             f6 = brillioun_k_brdcst(3,nb)
+             if (mprint) then
+               write(luout,1508) nb,f0,f1,f2,f3,f4,f5,f6
+               write(luout,1500)
+             end if
+             do i=0,cpsi_ne(1)-1
+               f1 = cpsi_eigenvalue_brdcst(nb,1,cpsi_ne(1)-i)
+               f2 = cpsi_occupation_brdcst(nb,1,cpsi_ne(1)-i)
+               if (mprint) write(luout,1510) f1,f1*27.2116d0,f2
+             end do
+          end do
+
+ 1500 FORMAT(/' orbital energies:')
+ 1508 FORMAT(/' Brillouin zone point: ',i6,
+     >       /'    weight=',f10.6,
+     >       /'    k     =<',3f8.3,'> . <b1,b2,b3> ',
+     >       /'          =<',3f8.3,'>')
+ 1510 FORMAT(4(E18.7,' (',F8.3,'eV) occ=',F5.3))
+
+      return
+      end

From ece96d259a984e48562a539cfa50bb90032159a8 Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Fri, 7 May 2021 10:18:28 -0700
Subject: [PATCH 12/45] debugging fractional occupation...EJB

---
 src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F | 29 +++++++++++++++++---
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F b/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F
index e37c3541e97..be0c7687604 100644
--- a/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F
+++ b/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F
@@ -115,12 +115,14 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
 #include "nwpw_scf_mixing.fh"
 
 *     **** local variables ****
-      logical value
+      integer MASTER,taskid
+      parameter (MASTER=0)
+      logical value,oprint
       integer i,j,info,k,ipiv,ms,shift
       integer rr_ptr,ss_ptr,tt_ptr,ff_ptr
       integer V0,V1,Vout0,Vout1,F0,F1,Vbar0,Vbar1
       integer dV,U,dF,dFi
-      real*8 sum0,sum1,beta,p00,p01,p11,alpha1,r00
+      real*8 sum0,sum1,beta,p00,p01,p11,alpha1,r00,small
       real*8 BB(40,40),BBB(40,40)
 
 *     **** external functions ****
@@ -133,6 +135,8 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
 c         alpha1 = 1.0d0
 c      end if
       alpha1 = alpha 
+      call Parallel_taskid(taskid)
+      oprint = (taskid.eq.MASTER)
 
 *     **** simple mixing ****
       if (algorithm.eq.0) then
@@ -213,6 +217,8 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
 !$OMP MASTER
            scf_error = dsqrt(scf_error)          
 !$OMP END MASTER
+           if (oprint)
+     >      write(*,*) "Broyden SCF MIXING: scf_error=",scf_error
 
        
            !**** Beta = <F1|F1-F0>/<F1-F0/F1-F0> ****
@@ -224,6 +230,8 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
            call D3dB_SumAll(sum0)
            call D3dB_SumAll(sum1)
            beta = sum0/sum1
+
+           if (oprint) write(*,*) "Broyden SCF MIXING: betar=",beta
            
  
 
@@ -325,6 +333,8 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
 !$OMP MASTER
            scf_error = dsqrt(scf_error)          
 !$OMP END MASTER
+           if (oprint) 
+     >     write(*,*) "Johnson SCF MIXING: m,scf_error=",m,scf_error
 
            !*** dF = dF(m-1), U = U(m-1) ***
            call nwpw_list_ptr(1,(5+m-1),dF)
@@ -362,11 +372,20 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
            
            call Parallel_shared_vector_zero(.true.,max_list*max_list,B)
 !$OMP MASTER
+
+           if (oprint)
+     >     write(*,*) "Johnson SCF MIXING small:",small
+           small = 0.0d0
            do i=1,m-1   
            do j=1,m-1   
               B(i,j) = A(i,j)
+              small = small + dabs(A(i,j))
            end do
+           if (oprint)
+     >     write(*,*) "Johnson SCF MIXING C,B:",C(i),(B(i,j),j=1,m-1)
            end do
+           small = small/dble(m-1)**2
+           
 
            do i=1,m-1
            do j=1,m-1
@@ -374,7 +393,7 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
            end do
            end do
            do i=1,m-1
-              Binv(i,i) = 1.0d0
+              Binv(i,i) = 1.0d0*small
            end do
 
            call DGESV((m-1),(m-1),
@@ -388,8 +407,10 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
            do i=1,m-1
               d(i) = 0.0d0
               do j=1,m-1
-                 d(i) = d(i) - c(j)*Binv(j,i)
+                 d(i) = d(i) - (c(j)/small)*Binv(j,i)
               end do
+       if (oprint)
+     > write(*,*) "Johnson SCF MIXING d,Binv:",d(i),(Binv(j,i),j=1,m-1)
            end do
 !$OMP END MASTER
 

From cae5a62c7987214d813b0d6e56d06e7f49f4c46e Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Fri, 7 May 2021 10:48:47 -0700
Subject: [PATCH 13/45] ipiv defined incorrectly...EJB

---
 src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F  | 7 ++++---
 src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.fh | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F b/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F
index be0c7687604..6fdbf0e96f5 100644
--- a/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F
+++ b/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F
@@ -118,13 +118,14 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
       integer MASTER,taskid
       parameter (MASTER=0)
       logical value,oprint
-      integer i,j,info,k,ipiv,ms,shift
+      integer i,j,info,k,ms,shift
       integer rr_ptr,ss_ptr,tt_ptr,ff_ptr
       integer V0,V1,Vout0,Vout1,F0,F1,Vbar0,Vbar1
       integer dV,U,dF,dFi
       real*8 sum0,sum1,beta,p00,p01,p11,alpha1,r00,small
       real*8 BB(40,40),BBB(40,40)
 
+
 *     **** external functions ****
       real*8   ddot
       external ddot
@@ -373,8 +374,6 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
            call Parallel_shared_vector_zero(.true.,max_list*max_list,B)
 !$OMP MASTER
 
-           if (oprint)
-     >     write(*,*) "Johnson SCF MIXING small:",small
            small = 0.0d0
            do i=1,m-1   
            do j=1,m-1   
@@ -386,6 +385,8 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
            end do
            small = small/dble(m-1)**2
            
+           if (oprint)
+     >     write(*,*) "Johnson SCF MIXING small:",small
 
            do i=1,m-1
            do j=1,m-1
diff --git a/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.fh b/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.fh
index 74bdfb7191a..c34e25fe7b1 100644
--- a/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.fh
+++ b/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.fh
@@ -3,11 +3,12 @@ c     $Id$
 c
       integer max_list
       parameter (max_list=40)
+      integer ipiv(max_list)
       real*8  c(max_list),d(max_list),B(max_list,max_list),alpha
       real*8  A(max_list,max_list),w(max_list),w0
       real*8  Binv(max_list,max_list)
       integer n2ft3d,npack1,neall,nsize,max_m,m,algorithm,ispin
       common / nwpw_scf_mixing_block / A,B,Binv,c,d,w,w0,alpha,
-     >                             n2ft3d,npack1,neall,
+     >                             ipiv,n2ft3d,npack1,neall,
      >                             nsize,max_m,m,algorithm,ispin
 

From 4aaaf394e8c036aaa03d0653e0c7e33eb6a4baef Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Fri, 7 May 2021 10:57:45 -0700
Subject: [PATCH 14/45] removed debug print statements

---
 src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F | 21 ++------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F b/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F
index 6fdbf0e96f5..e8295348d35 100644
--- a/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F
+++ b/src/nwpw/nwpwlib/utilities/nwpw_scf_mixing.F
@@ -115,9 +115,7 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
 #include "nwpw_scf_mixing.fh"
 
 *     **** local variables ****
-      integer MASTER,taskid
-      parameter (MASTER=0)
-      logical value,oprint
+      logical value
       integer i,j,info,k,ms,shift
       integer rr_ptr,ss_ptr,tt_ptr,ff_ptr
       integer V0,V1,Vout0,Vout1,F0,F1,Vbar0,Vbar1
@@ -136,8 +134,7 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
 c         alpha1 = 1.0d0
 c      end if
       alpha1 = alpha 
-      call Parallel_taskid(taskid)
-      oprint = (taskid.eq.MASTER)
+
 
 *     **** simple mixing ****
       if (algorithm.eq.0) then
@@ -218,8 +215,6 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
 !$OMP MASTER
            scf_error = dsqrt(scf_error)          
 !$OMP END MASTER
-           if (oprint)
-     >      write(*,*) "Broyden SCF MIXING: scf_error=",scf_error
 
        
            !**** Beta = <F1|F1-F0>/<F1-F0/F1-F0> ****
@@ -232,9 +227,6 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
            call D3dB_SumAll(sum1)
            beta = sum0/sum1
 
-           if (oprint) write(*,*) "Broyden SCF MIXING: betar=",beta
-           
- 
 
            !**** Vbar1 = (1-Beta)*Vout1 + Beta*Vout0 ****
            call Parallel_shared_vector_copy(.true.,nsize,
@@ -334,8 +326,6 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
 !$OMP MASTER
            scf_error = dsqrt(scf_error)          
 !$OMP END MASTER
-           if (oprint) 
-     >     write(*,*) "Johnson SCF MIXING: m,scf_error=",m,scf_error
 
            !*** dF = dF(m-1), U = U(m-1) ***
            call nwpw_list_ptr(1,(5+m-1),dF)
@@ -380,14 +370,9 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
               B(i,j) = A(i,j)
               small = small + dabs(A(i,j))
            end do
-           if (oprint)
-     >     write(*,*) "Johnson SCF MIXING C,B:",C(i),(B(i,j),j=1,m-1)
            end do
            small = small/dble(m-1)**2
            
-           if (oprint)
-     >     write(*,*) "Johnson SCF MIXING small:",small
-
            do i=1,m-1
            do j=1,m-1
               Binv(i,j) = 0.0d0
@@ -410,8 +395,6 @@ subroutine nwpw_scf_mixing(vout,vnew,deltae,scf_error)
               do j=1,m-1
                  d(i) = d(i) - (c(j)/small)*Binv(j,i)
               end do
-       if (oprint)
-     > write(*,*) "Johnson SCF MIXING d,Binv:",d(i),(Binv(j,i),j=1,m-1)
            end do
 !$OMP END MASTER
 

From 56e450a790ce454a3d841e11ae1c052df32d2c12 Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Fri, 7 May 2021 11:05:44 -0700
Subject: [PATCH 15/45] removed debug print statements...EJB

---
 src/nwpw/band/minimizer/c_bybminimize2.F | 10 +---------
 src/nwpw/band/minimizer/c_cgsd_energy.F  |  7 -------
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/src/nwpw/band/minimizer/c_bybminimize2.F b/src/nwpw/band/minimizer/c_bybminimize2.F
index f0b4e47378c..e887cd24253 100644
--- a/src/nwpw/band/minimizer/c_bybminimize2.F
+++ b/src/nwpw/band/minimizer/c_bybminimize2.F
@@ -175,7 +175,6 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
 *     **** set the initial density ****
       if (current_iteration.eq.1) then
          Enew  = cpsi_1energy()
-         if (oprint) write(*,*) "Enew=",Enew+eion,Enew
          alpha = control_ks_alpha()
          deltae = -9232323299.0d0
          ks_deltae = tole
@@ -202,8 +201,7 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
             call cpsi_1define_occupation(-1.0d0,.false.)
             Enew = Enew + cpsi_smearcorrection()
          end if
-         if (oprint) write(*,*) "THIRD EIGPRINT"
-         call cpsi_printeig_debug()
+        
       else
          call cpsi_get_density(1,dbl_mb(rho_in(1)))
          call cpsi_get_density(1,dbl_mb(rho_out(1)))
@@ -273,9 +271,6 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
       if (control_fractional()) Enew = Enew + cpsi_smearcorrection()
       deltae = Enew-Eold
 
-       if (oprint) write(*,*) "Fourth EIGPRINT"
-       call cpsi_printeig_debug()
-
 
       call cpsi_get_density(1,dbl_mb(rho_in(1)))
 
@@ -472,9 +467,6 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
          E(1)  = E(1) + E(33)
       end if
 
-       if (oprint) write(*,*) "FINAL c_bybminmize2 EIGPRINT"
-       call cpsi_printeig_debug()
-
       return
       end
  
diff --git a/src/nwpw/band/minimizer/c_cgsd_energy.F b/src/nwpw/band/minimizer/c_cgsd_energy.F
index e3d8beac409..6c59c1084d5 100644
--- a/src/nwpw/band/minimizer/c_cgsd_energy.F
+++ b/src/nwpw/band/minimizer/c_cgsd_energy.F
@@ -132,18 +132,11 @@ real*8 function c_cgsd_energy(newpsi)
          call c_bybminimize0()
          if (control_fractional()) then
             Enew  = cpsi_1energy() + ewald_e()
-            if (mprint) write(*,*) "Enew+ewald=",Enew
             call cpsi_1gen_hml()
             call cpsi_diagonalize_hml()
-
-             if (mprint) write(*,*) "First EIG OCC:"
-             call cpsi_printeig_debug()
-
                call cpsi_1rotate2()
                call cpsi_2to1()
                call cpsi_1define_occupation(-1.0d0,.false.)
-             if (mprint) write(*,*) "Second EIG OCC:"
-             call cpsi_printeig_debug()
          end if
       end if
 

From 4b2b1af52ebc503a26e83e458dcd7d2849bb463b Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Fri, 7 May 2021 12:55:13 -0700
Subject: [PATCH 16/45] script to download tarballs for sites without network
 connection [ci skip]

---
 contrib/getfiles.nwchem | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 contrib/getfiles.nwchem

diff --git a/contrib/getfiles.nwchem b/contrib/getfiles.nwchem
new file mode 100644
index 00000000000..3476308a26a
--- /dev/null
+++ b/contrib/getfiles.nwchem
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Pre-fetch the dftd3, OpenBLAS and ScaLAPACK archives into the source tree for hosts without network access.
+# $Id$
+cd "${NWCHEM_TOP:?NWCHEM_TOP must be set}/src/nwpw/nwpwlib/nwpwxc/" || exit 1  # never rm/download in the wrong directory
+rm -f dftd3.tgz
+wget https://www.chemie.uni-bonn.de/pctc/mulliken-center/software/dft-d3/dftd3.tgz
+cd "$NWCHEM_TOP/src/libext/openblas" || exit 1
+VERSION=0.3.13
+rm -rf OpenBLAS*gz
+curl -L https://github.com/xianyi/OpenBLAS/archive/v${VERSION}.tar.gz -o OpenBLAS-${VERSION}.tar.gz
+cd "$NWCHEM_TOP/src/libext/scalapack" || exit 1
+COMMIT=bc6cad585362aa58e05186bb85d4b619080c45a9
+rm -f scalapack-$COMMIT.zip
+curl -L https://github.com/Reference-ScaLAPACK/scalapack/archive/$COMMIT.zip -o scalapack-$COMMIT.zip
+

From c8f49127fce5fa9c54047efdca8e3308bb248d3b Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Sun, 9 May 2021 14:00:09 -0700
Subject: [PATCH 17/45] ...EJB

---
 src/nwpw/pspw/cgsd/bybminimize2.F    | 11 ++-----
 src/nwpw/pspw/cgsd/cgsd_energy.F     | 46 ++++++++++++++++++++++++++--
 src/nwpw/pspw/makepsi/wvfnc_adjust.F | 16 +++++++---
 3 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/src/nwpw/pspw/cgsd/bybminimize2.F b/src/nwpw/pspw/cgsd/bybminimize2.F
index c79f62610a2..26dc8cf9e88 100644
--- a/src/nwpw/pspw/cgsd/bybminimize2.F
+++ b/src/nwpw/pspw/cgsd/bybminimize2.F
@@ -220,12 +220,8 @@ subroutine bybminimize2(E,deltae,deltac,current_iteration,
 
          ks_deltae = tole
          value = .false.
+
          if (rho_read) call rho_2to1()
-c         if (control_fractional()) then
-c            if (control_use_fractional_rho()) then
-c               value = psi_try_read_density(1)
-c            end if
-c         end if
          call electron_gen_vall()
          call psi_get_density(1,dbl_mb(rho_in(1)))
          call psi_get_density(1,dbl_mb(rho_out(1)))
@@ -311,6 +307,7 @@ subroutine bybminimize2(E,deltae,deltac,current_iteration,
 
 !$OMP BARRIER
 
+
       e00 = psi_1energy()
 !$OMP MASTER
       Eold_shared = Enew_shared
@@ -320,13 +317,11 @@ subroutine bybminimize2(E,deltae,deltac,current_iteration,
       deltae = Enew_shared - Eold_shared
 !$OMP END MASTER
 
+
       !call electron_gen_vall()
       call psi_get_density(1,dbl_mb(rho_in(1)))
 
 *     **** compute deltaV ****
-c      call dcopy(ispin*n2ft3d,
-c     >           dbl_mb(rho_in(1)),1,
-c     >           dbl_mb(rho_junk(1)),1)
       call Parallel_shared_vector_copy(.true.,ispin*n2ft3d,
      >                                 dbl_mb(rho_in(1)), 
      >                                 dbl_mb(rho_junk(1)))
diff --git a/src/nwpw/pspw/cgsd/cgsd_energy.F b/src/nwpw/pspw/cgsd/cgsd_energy.F
index 881684fd2dc..0c2aa75424d 100644
--- a/src/nwpw/pspw/cgsd/cgsd_energy.F
+++ b/src/nwpw/pspw/cgsd/cgsd_energy.F
@@ -154,8 +154,8 @@ real*8 function cgsd_energy(newpsi)
             call psi_diagonalize_hml_assending()
             call psi_1rotate2()
             call psi_2to1()
-            !call psi_1define_occupation(-1.0d0,.false.)
-            call psi_1define_occupation(1.0d0,.false.)
+            call psi_1define_occupation(-1.0d0,.false.)
+            !call psi_1define_occupation(1.0d0,.false.)
          end if
       end if
 
@@ -828,6 +828,48 @@ real*8 function cgsd_energy(newpsi)
    
       end
 
+
+      subroutine psi_printeigs_debug()
+*     Debug aid: the MASTER task writes each eigenvalue to luout, raw
+*     and multiplied by 27.2116d0 (labelled eV), with its occupation.
+      implicit none
+
+#include "stdio.fh"
+
+      integer MASTER,me
+      parameter (MASTER=0)
+      integer k,nextra
+      real*8 toev
+      parameter (toev=27.2116d0)
+
+      integer  psi_ne
+      real*8   psi_eigenvalue,psi_occupation
+      external psi_ne,psi_eigenvalue,psi_occupation
+
+      nextra = psi_ne(1)-psi_ne(2)
+      call Parallel_taskid(me)
+      if (me.ne.MASTER) return
+
+*     **** index-1 entries 1..nextra are written one per line ****
+      do k=1,nextra
+         write(luout,1511) psi_eigenvalue(1,k),
+     >                     psi_eigenvalue(1,k)*toev,
+     >                     psi_occupation(1,k)
+      end do
+*     **** later index-1 entries share a line with index-2 entry k ****
+      do k=1,psi_ne(2)
+         write(luout,1511) psi_eigenvalue(1,k+nextra),
+     >                     psi_eigenvalue(1,k+nextra)*toev,
+     >                     psi_occupation(1,k+nextra),
+     >                     psi_eigenvalue(2,k),
+     >                     psi_eigenvalue(2,k)*toev,
+     >                     psi_occupation(2,k)
+      end do
+      return
+ 1511 FORMAT(2(E18.7,' (',F8.3,'eV)  occ=',F5.3))
+      end
+         
+
 *     *******************************
 *     *				    *
 *     *	    cgsd_energy_gradient    *
diff --git a/src/nwpw/pspw/makepsi/wvfnc_adjust.F b/src/nwpw/pspw/makepsi/wvfnc_adjust.F
index b3db6244d04..d2754abd2a7 100644
--- a/src/nwpw/pspw/makepsi/wvfnc_adjust.F
+++ b/src/nwpw/pspw/makepsi/wvfnc_adjust.F
@@ -22,7 +22,7 @@ subroutine wvfnc_adjust(wavefunction_filename,ispin,nein)
       parameter (MASTER=0)
 
       integer NMAX
-      integer filling(2)
+      integer filling(2),irm_excited
       integer fractional_orbitals(2),ne(2)
       character*255 new_filename,old_filename,emo_filename
 
@@ -78,7 +78,7 @@ subroutine wvfnc_adjust(wavefunction_filename,ispin,nein)
      >                     ispin,
      >                     ne,
      >                     fractional,
-     >                     fractional_orbitals)
+     >                     fractional_orbitals,irm_excited)
 
         !*** remove temporary wvfnc_adjust file ***
         call util_file_unlink(old_filename)
@@ -92,6 +92,9 @@ subroutine wvfnc_adjust(wavefunction_filename,ispin,nein)
       end if
       call ga_sync()
 
+      call Parallel_Brdcst_ivalue(MASTER,irm_excited)
+      if (irm_excited.eq.1) call control_unset_excited_ne()     !*** remove excited_ne from rtdb ***
+
       return
       end
 
@@ -103,7 +106,7 @@ subroutine sub_wvfnc_adjust(NMAX,filling,
      >                         ispin,
      >                         ne,
      >                         fractional,
-     >                         frac_orb)
+     >                         frac_orb,irm_excited)
       implicit none
       integer NMAX
       integer filling(4,NMAX,2)
@@ -113,6 +116,7 @@ subroutine sub_wvfnc_adjust(NMAX,filling,
       integer      ispin,ne(2)
       logical      fractional
       integer      frac_orb(2)
+      integer      irm_excited
 
 #include "bafdecls.fh"
 #include "errquit.fh"
@@ -138,6 +142,7 @@ subroutine sub_wvfnc_adjust(NMAX,filling,
       double precision GCDOTC,util_random
       external         GCDOTC,util_random
  
+      irm_excited = 0
 
       p = util_random(5291999) !*** initialize the random sequence ****
 
@@ -362,7 +367,10 @@ subroutine sub_wvfnc_adjust(NMAX,filling,
  
       if (emo_found) then
          call closefile(3)
-         if (emo_used) call util_file_unlink(emo_filename) !*** remove emo_filename ***
+         if (emo_used) then
+            call util_file_unlink(emo_filename) !*** remove emo_filename ***
+            irm_excited = 1
+         end if 
       end if
       call closefile(5)
       call closefile(6)

From 2ada604988552b16956e8d625fb6082e717cf262 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Mon, 10 May 2021 16:15:04 -0700
Subject: [PATCH 18/45] introduce icds keyword for SMD non-aqueous solvents
 https://github.com/nwchemgit/nwchem/issues/363

---
 src/solvation/cosmo_initialize.F | 1 +
 src/solvation/cosmo_input.F      | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/src/solvation/cosmo_initialize.F b/src/solvation/cosmo_initialize.F
index 78297283caf..f0211b0cff6 100644
--- a/src/solvation/cosmo_initialize.F
+++ b/src/solvation/cosmo_initialize.F
@@ -346,6 +346,7 @@ subroutine cosmo_initialize(rtdb,geom,basis,oprint)
       status = rtdb_get(rtdb,'cosmo:solg',mt_dbl,1,solg)
       status = rtdb_get(rtdb,'cosmo:solh',mt_dbl,1,solh)
       status = rtdb_get(rtdb,'cosmo:soln',mt_dbl,1,soln)
+      status = rtdb_get(rtdb,'cosmo:icds',mt_int,1,icds)
 c
 c     set sola,solb,solc,solg,solh,soln,icds parameters (either from
 c     solv_data or user-defined)
diff --git a/src/solvation/cosmo_input.F b/src/solvation/cosmo_input.F
index 0c4e2b73e22..3447e9a80d9 100644
--- a/src/solvation/cosmo_input.F
+++ b/src/solvation/cosmo_input.F
@@ -106,6 +106,7 @@ subroutine cosmo_input(rtdb)
       double precision soln
       double precision pol_cosmo_vem(2)
       double precision polgs_cosmo_vem, poles_cosmo_vem
+      integer icds
 c
 c     vem model parameters
 c
@@ -295,6 +296,10 @@ subroutine cosmo_input(rtdb)
         status = inp_f(soln)
         if (.not. rtdb_put(rtdb,'cosmo:soln',mt_dbl,1,soln))
      $    call errquit('cosmo_input: rtdb put failed',911,RTDB_ERR)
+      else if(inp_compare(.false.,'icds',field)) then
+        status = inp_i(icds)
+        if (.not. rtdb_put(rtdb,'cosmo:icds',mt_int,1,icds))
+     $    call errquit('cosmo_input: rtdb put failed',912,RTDB_ERR)
 c
 c <-- MN solvation models
 c

From f2f5cc62cbc94b30ba2cc93342373f5115c878f2 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Tue, 11 May 2021 10:18:44 -0700
Subject: [PATCH 19/45] fortran flags for nvfortran/pgf90. might fix
 https://github.com/edoapra/simint-generator/issues/4

---
 src/NWints/simint/libsimint_source/build_simint.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/NWints/simint/libsimint_source/build_simint.sh b/src/NWints/simint/libsimint_source/build_simint.sh
index fd6a4ce08ae..4119c11fc9a 100755
--- a/src/NWints/simint/libsimint_source/build_simint.sh
+++ b/src/NWints/simint/libsimint_source/build_simint.sh
@@ -167,6 +167,8 @@ elif  [ ${FC} == xlf ] || [ ${FC} == xlf_r ] || [ ${FC} == xlf90 ]|| [ ${FC} ==
     Fortran_FLAGS=" -qintsize=8 -qextname -qpreprocess"
 elif  [ ${FC} == ifort ]; then
     Fortran_FLAGS="-i8 -fpp"
+elif  [ ${FC} == nvfortran ] || [ ${FC} == pgf90 ] ; then
+    Fortran_FLAGS="-i8 -cpp"
 fi
 if [[ -z "${SIMINT_BUILD_TYPE}" ]]; then
     SIMINT_BUILD_TYPE=Release

From 5e9dea6a2f227206efe69edcfd9350a8f24455f5 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 11 May 2021 10:58:45 -0700
Subject: [PATCH 20/45] add test for presence of curl and patch

Signed-off-by: Jeff Hammond <jeff.science@gmail.com>
---
 src/NWints/simint/libsimint_source/build_simint.sh | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/NWints/simint/libsimint_source/build_simint.sh b/src/NWints/simint/libsimint_source/build_simint.sh
index 4119c11fc9a..925447dbc85 100755
--- a/src/NWints/simint/libsimint_source/build_simint.sh
+++ b/src/NWints/simint/libsimint_source/build_simint.sh
@@ -12,6 +12,16 @@ if  [ -z "$(command -v python3)" ]; then
     echo please install python3
     exit 1
 fi
+if  [ -z "$(command -v curl)" ]; then
+    echo curl not installed
+    echo please install curl
+    exit 1
+fi
+if  [ -z "$(command -v patch)" ]; then
+    echo patch not installed
+    echo please install patch
+    exit 1
+fi
 UNAME_S=$(uname -s)
 if [[ ${UNAME_S} == Linux ]]; then
     CPU_FLAGS=$(cat /proc/cpuinfo | grep flags |tail -n 1)

From 3b0903232c4f7afeffcaf9ac68a47af95507bffa Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Tue, 11 May 2021 12:10:22 -0700
Subject: [PATCH 21/45] adding diis histories count

---
 src/nwpw/band/minimizer/band_minimizer.F |  8 ++++++--
 src/nwpw/band/minimizer/c_bybminimize.F  |  5 ++++-
 src/nwpw/band/minimizer/c_bybminimize2.F |  6 +++++-
 src/nwpw/nwpwlib/control/control.F       | 23 +++++++++++++++++++++++
 src/nwpw/nwpwlib/control/control.fh      |  2 ++
 src/nwpw/pspw/cgsd/bybminimize.F         |  5 ++++-
 src/nwpw/pspw/cgsd/bybminimize2.F        |  5 ++++-
 src/nwpw/pspw/cgsd/cgsdv5.F              | 10 ++++++----
 8 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/src/nwpw/band/minimizer/band_minimizer.F b/src/nwpw/band/minimizer/band_minimizer.F
index 57d24691110..a13c588bf5f 100644
--- a/src/nwpw/band/minimizer/band_minimizer.F
+++ b/src/nwpw/band/minimizer/band_minimizer.F
@@ -148,6 +148,8 @@ logical function band_minimizer(rtdb,flag)
       external control_fractional_smeartype
       real*8   control_fractional_kT,control_fractional_alpha
       external control_fractional_kT,control_fractional_alpha
+      integer  control_diis_histories
+      external control_diis_histories
 
 c      character*255 cpsp_comment,comment
 c      external      cpsp_comment
@@ -546,9 +548,10 @@ logical function band_minimizer(rtdb,flag)
            if (control_scf_algorithm().eq.0)
      >       write(luout,1293) "simple mixing"
            if (control_scf_algorithm().eq.1)
-     >       write(luout,1293) "Anderson potential mixing"
+     >       write(luout,1293) "Broyden mixing"
            if (control_scf_algorithm().eq.2)
-     >       write(luout,1293) "Johnson-Pulay mixing"
+     >       write(luout,1289) "Johnson-Pulay mixing",
+     >                         control_diis_histories()
            if (control_scf_algorithm().eq.3)
      >       write(luout,1293) "Anderson density mixing"
            if (minimizer.eq.5) write(luout,1296) "potential"
@@ -855,6 +858,7 @@ logical function band_minimizer(rtdb,flag)
  1280 FORMAT(5X, ' time step=',F10.2,5X,'fictitious mass=',F10.1)
  1281 FORMAT(5X, ' maximum iterations =',I10,
      >           ' ( ',I4,' inner ',I6,' outer )')
+ 1289 FORMAT(5X, ' scf algorithm        = ',A,' (',I2,' histories)')
  1290 FORMAT(5X, ' tolerance=',E9.3,' (energy)',E12.3,
      &        ' (density)')
  1291 FORMAT(/' Kohn-Sham scf parameters:')
diff --git a/src/nwpw/band/minimizer/c_bybminimize.F b/src/nwpw/band/minimizer/c_bybminimize.F
index 4f085641d70..335a0a116e8 100644
--- a/src/nwpw/band/minimizer/c_bybminimize.F
+++ b/src/nwpw/band/minimizer/c_bybminimize.F
@@ -89,6 +89,8 @@ subroutine c_bybminimize(E,deltae,deltac,current_iteration,
       double precision ddot
       external ddot
    
+      integer   control_diis_histories
+      external  control_diis_histories
 
       Ein = E(1)
       call Parallel_taskid(taskid)
@@ -169,7 +171,8 @@ subroutine c_bybminimize(E,deltae,deltac,current_iteration,
   
 *     **** iniitialize SCF Mixing ****    
       call nwpw_scf_mixing_init(control_scf_algorithm(),alpha,
-     >                5,ispin,2*nfft3d,dcpl_mb(vall_out(1)))
+     >                          control_diis_histories(),
+     >                          ispin,2*nfft3d,dcpl_mb(vall_out(1)))
 
 
 *     ***** diis loop ****
diff --git a/src/nwpw/band/minimizer/c_bybminimize2.F b/src/nwpw/band/minimizer/c_bybminimize2.F
index e887cd24253..669c6d24480 100644
--- a/src/nwpw/band/minimizer/c_bybminimize2.F
+++ b/src/nwpw/band/minimizer/c_bybminimize2.F
@@ -109,6 +109,9 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
       double precision ddot
       external ddot
 
+      integer  control_diis_histories
+      external control_diis_histories
+
       Ein = E(1)
       call Parallel_taskid(taskid)
       oprint = (taskid.eq.MASTER).and.control_print(print_medium)
@@ -225,7 +228,8 @@ subroutine c_bybminimize2(E,deltae,deltac,current_iteration,
   
 *     **** iniitialize SCF Mixing ****    
       call nwpw_scf_mixing_init(control_scf_algorithm(),alpha,
-     >                5,ispin,nfft3d,dbl_mb(rho_out(1)))
+     >                          control_diis_histories(),
+     >                          ispin,nfft3d,dbl_mb(rho_out(1)))
 
 
 *     **** iniitialize blocked cg ****
diff --git a/src/nwpw/nwpwlib/control/control.F b/src/nwpw/nwpwlib/control/control.F
index 4c392110865..6696692e619 100644
--- a/src/nwpw/nwpwlib/control/control.F
+++ b/src/nwpw/nwpwlib/control/control.F
@@ -512,6 +512,10 @@ logical function control_read(code_in,rtdb)
      >                  mt_int,1,scf_algorithm))
      >   scf_algorithm = 3
 
+      if (.not.btdb_get(rtdb,'nwpw:diis_histories',
+     >                  mt_int,1,diis_histories))
+     >   diis_histories = 15
+
       if (.not.btdb_get(rtdb,'nwpw:ks_algorithm',
      >                  mt_int,1,ks_algorithm))
      >   ks_algorithm = 0
@@ -767,6 +771,11 @@ logical function control_read(code_in,rtdb)
       if (.not.btdb_get(rtdb,'nwpw:scf_algorithm',
      >                  mt_int,1,scf_algorithm))
      >   scf_algorithm = 3
+
+      if (.not.btdb_get(rtdb,'nwpw:diis_histories',
+     >                  mt_int,1,diis_histories))
+     >   diis_histories = 15
+
       if (.not.btdb_get(rtdb,'nwpw:ks_algorithm',
      >                  mt_int,1,ks_algorithm))
      >   ks_algorithm = 0
@@ -2360,6 +2369,20 @@ integer function control_scf_algorithm()
       return
       end
 
+*     ***********************************
+*     *                                 *
+*     *       control_diis_histories    *
+*     *                                 *
+*     ***********************************
+      function control_diis_histories()
+      implicit none
+      integer control_diis_histories
+#include "control.fh"
+*     diis_histories is filled in by control_read (15 if not in rtdb)
+      control_diis_histories = diis_histories
+      return
+      end
+
 *     ***********************************
 *     *                                 *
 *     *         control_ks_algorithm    *
diff --git a/src/nwpw/nwpwlib/control/control.fh b/src/nwpw/nwpwlib/control/control.fh
index c35b7618eeb..e0f09fef919 100644
--- a/src/nwpw/nwpwlib/control/control.fh
+++ b/src/nwpw/nwpwlib/control/control.fh
@@ -30,6 +30,7 @@
       logical     two_comp_ppot,frozen,pio,fast_erf,fmm,smooth_cutoff
       logical     hess_model,periodic_dipole,precondition
       integer     maxit_orb,maxit_orbs,scf_algorithm,ks_algorithm
+      integer     diis_histories
       integer     symm_number,minimizer
       common / control_block / unita,unita_frozen,tolerances,
      >                         scaling,sa_decay,smooth_cutoff_values,
@@ -44,6 +45,7 @@
      >                         code,
      >                         ispin,multiplicity,
      >                         maxit_orb,maxit_orbs,scf_algorithm,
+     >                         diis_histories,
      >                         ks_algorithm,minimizer,
      >                         symm_number,
      >                         move,frac_coord,SA,fei,fei_quench,
diff --git a/src/nwpw/pspw/cgsd/bybminimize.F b/src/nwpw/pspw/cgsd/bybminimize.F
index 3f448e6922f..d3b6b9a0191 100644
--- a/src/nwpw/pspw/cgsd/bybminimize.F
+++ b/src/nwpw/pspw/cgsd/bybminimize.F
@@ -122,6 +122,8 @@ subroutine bybminimize(E,deltae,deltac,current_iteration,
 
       integer  control_ks_maxit_orb,control_ks_maxit_orbs
       external control_ks_maxit_orb,control_ks_maxit_orbs
+      integer  control_diis_histories
+      external control_diis_histories
 
    
       Ein = E(1)
@@ -226,7 +228,8 @@ subroutine bybminimize(E,deltae,deltac,current_iteration,
   
 *     **** iniitialize SCF Mixing ****    
       call nwpw_scf_mixing_init(control_scf_algorithm(),alpha,
-     >                5,ispin,n2ft3d,dbl_mb(vall_out(1)))
+     >                          control_diis_histories(),
+     >                          ispin,n2ft3d,dbl_mb(vall_out(1)))
 
 *     **** iniitialize RMM-DIIS ****
       if (control_ks_algorithm().eq.1) call pspw_rmmdiis_init(5)
diff --git a/src/nwpw/pspw/cgsd/bybminimize2.F b/src/nwpw/pspw/cgsd/bybminimize2.F
index 26dc8cf9e88..9ca93d52f4b 100644
--- a/src/nwpw/pspw/cgsd/bybminimize2.F
+++ b/src/nwpw/pspw/cgsd/bybminimize2.F
@@ -147,6 +147,8 @@ subroutine bybminimize2(E,deltae,deltac,current_iteration,
       integer  control_ks_maxit_orb,control_ks_maxit_orbs
       external control_ks_maxit_orb,control_ks_maxit_orbs
 
+      integer   control_diis_histories
+      external  control_diis_histories
 
       Ein = E(1)
       call Parallel_taskid(taskid)
@@ -262,7 +264,8 @@ subroutine bybminimize2(E,deltae,deltac,current_iteration,
   
 *     **** iniitialize SCF Mixing ****    
       call nwpw_scf_mixing_init(control_scf_algorithm(),alpha,
-     >                5,ispin,n2ft3d,dbl_mb(rho_out(1)))
+     >                          control_diis_histories(),
+     >                          ispin,n2ft3d,dbl_mb(rho_out(1)))
 
 *     **** iniitialize RMM-DIIS ****
       if (control_ks_algorithm().eq.1) call pspw_rmmdiis_init(5)
diff --git a/src/nwpw/pspw/cgsd/cgsdv5.F b/src/nwpw/pspw/cgsd/cgsdv5.F
index d2a660a2128..9a6586c804c 100644
--- a/src/nwpw/pspw/cgsd/cgsdv5.F
+++ b/src/nwpw/pspw/cgsd/cgsdv5.F
@@ -132,8 +132,8 @@ logical function cgsdv5(rtdb,flag)
       external  pspw_charge_found,ion_q_FixIon
       integer  control_minimizer,control_scf_algorithm
       external control_minimizer,control_scf_algorithm
-      integer  control_ks_algorithm
-      external control_ks_algorithm
+      integer  control_ks_algorithm,control_diis_histories
+      external control_ks_algorithm,control_diis_histories
       real*8   control_ks_alpha,control_kerker_g0
       external control_ks_alpha,control_kerker_g0
       logical  control_print,control_balance
@@ -586,9 +586,10 @@ logical function cgsdv5(rtdb,flag)
            if (control_scf_algorithm().eq.0) 
      >       write(luout,1293) "simple mixing"
            if (control_scf_algorithm().eq.1) 
-     >       write(luout,1293) "Anderson potential mixing"
+     >       write(luout,1293) "Broyden mixing"
            if (control_scf_algorithm().eq.2) 
-     >       write(luout,1293) "Johnson-Pulay mixing"
+     >       write(luout,1289) "Johnson-Pulay mixing",
+     >                         control_diis_histories()
            if (control_scf_algorithm().eq.3) 
      >       write(luout,1293) "Anderson density mixing"
            if (minimizer.eq.5) write(luout,1296) "potential"
@@ -924,6 +925,7 @@ logical function cgsdv5(rtdb,flag)
  1280 FORMAT(5X, ' time step=',F10.2,5X,'fictitious mass=',F10.1)
  1281 FORMAT(5X, ' maximum iterations =',I10,
      >           ' ( ',I4,' inner ',I6,' outer )')
+ 1289 FORMAT(5X, ' SCF algorithm        = ',A,' (',I2,' histories)')
  1290 FORMAT(5X, ' tolerance=',E9.3,' (energy)',E12.3,
      &        ' (density)')
  1291 FORMAT(/' Kohn-Sham scf parameters:')

From 16e698507cbef34dab990b3eebd20356f64c6531 Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Tue, 11 May 2021 12:20:46 -0700
Subject: [PATCH 22/45] Added diis_histories option to scf keyword...EJB

---
 src/nwpw/nwpw_input.F | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/nwpw/nwpw_input.F b/src/nwpw/nwpw_input.F
index e75e23bc714..59ec09e3b7d 100644
--- a/src/nwpw/nwpw_input.F
+++ b/src/nwpw/nwpw_input.F
@@ -1155,6 +1155,14 @@ subroutine nwpw_input(rtdb)
      >                   4800,RTDB_ERR)
           if(.not.rtdb_put(rtdb,'nwpw:ks_maxit_orbs',mt_int,1,nx))
      >    call errquit('nwpw_input: error writing to rtdb',
+     >                 4800,RTDB_ERR)
+       end if
+       if (inp_compare(.false.,zone_name,'diis_histories')) then
+          if (.not.inp_i(nx)) 
+     >      call errquit('nwpw_input: error reading diis histories',
+     >                   4800,RTDB_ERR)
+          if(.not.rtdb_put(rtdb,'nwpw:diis_histories',mt_int,1,nx))
+     >    call errquit('nwpw_input: error writing to rtdb',
      >                 4800,RTDB_ERR)
        end if
 

From 2d4b7c7b705407fd737e4f904e7cfbe1b30d8773 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Tue, 11 May 2021 12:21:47 -0700
Subject: [PATCH 23/45] use curl or wget

Signed-off-by: Jeff Hammond <jeff.science@gmail.com>
---
 .../simint/libsimint_source/build_simint.sh    | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/NWints/simint/libsimint_source/build_simint.sh b/src/NWints/simint/libsimint_source/build_simint.sh
index 925447dbc85..ddff453720a 100755
--- a/src/NWints/simint/libsimint_source/build_simint.sh
+++ b/src/NWints/simint/libsimint_source/build_simint.sh
@@ -12,9 +12,9 @@ if  [ -z "$(command -v python3)" ]; then
     echo please install python3
     exit 1
 fi
-if  [ -z "$(command -v curl)" ]; then
-    echo curl not installed
-    echo please install curl
+if  [ -z "$(command -v curl)" ] && [ -z "$(command -v wget)" ]; then
+    echo curl and wget not installed
+    echo please install curl or wget
     exit 1
 fi
 if  [ -z "$(command -v patch)" ]; then
@@ -60,8 +60,16 @@ fi
 PERMUTE_SLOW=${SIMINT_MAXAM}
 GITHUB_USERID=edoapra
 rm -rf simint.l${SIMINT_MAXAM}_p${PERMUTE_SLOW}_d${DERIVE}* *-chem-simint-generator-?????? simint-chem-simint-generator.tar.gz simint_lib
-curl -L https://github.com/${GITHUB_USERID}/simint-generator/tarball/master -o simint-chem-simint-generator.tar.gz
-#curl -LJ https://github.com/simint-chem/simint-generator/tarball/master -o simint-chem-simint-generator.tar.gz
+
+GITHUB_URL=https://github.com/${GITHUB_USERID}/simint-generator/tarball/master
+#GITHUB_URL=https://github.com/simint-chem/simint-generator/tarball/master
+TAR_NAME=simint-chem-simint-generator.tar.gz
+if  [ ! -z "$(command -v curl)" ] ; then
+    curl -L "${GITHUB_URL}" -o "${TAR_NAME}"
+else
+    wget -O "${TAR_NAME}" "${GITHUB_URL}"
+fi
+
 tar xzf simint-chem-simint-generator.tar.gz
 cd *-simint-generator-???????
 rm -f generator_types.patch

From a1a55750b4fd51144ead3d6a262e345f18287e90 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Tue, 11 May 2021 10:44:31 -0700
Subject: [PATCH 24/45] use gcc and g++ with nvfortran/pgf90

---
 src/NWints/simint/libsimint_source/build_simint.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/NWints/simint/libsimint_source/build_simint.sh b/src/NWints/simint/libsimint_source/build_simint.sh
index 4119c11fc9a..75450607414 100755
--- a/src/NWints/simint/libsimint_source/build_simint.sh
+++ b/src/NWints/simint/libsimint_source/build_simint.sh
@@ -169,6 +169,8 @@ elif  [ ${FC} == ifort ]; then
     Fortran_FLAGS="-i8 -fpp"
 elif  [ ${FC} == nvfortran ] || [ ${FC} == pgf90 ] ; then
     Fortran_FLAGS="-i8 -cpp"
+    CC=gcc
+    CXX=g++
 fi
 if [[ -z "${SIMINT_BUILD_TYPE}" ]]; then
     SIMINT_BUILD_TYPE=Release

From 08a9df852022fe2124b9a8db27683097c8e682c5 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Tue, 11 May 2021 10:43:28 -0700
Subject: [PATCH 25/45] new nvfortran simint step

---
 .github/workflows/github_actions.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/github_actions.yml b/.github/workflows/github_actions.yml
index 5db0bbe4a4b..88eef200c18 100644
--- a/.github/workflows/github_actions.yml
+++ b/.github/workflows/github_actions.yml
@@ -93,6 +93,14 @@ jobs:
             nwchem_modules: "nwdft solvation driver"
             fc: gfortran-10
             use_simint: 1
+          - os: ubuntu-20.04
+            experimental: true
+            mpi_impl: mpich
+            armci_network: mpi-ts
+            nwchem_modules: "tinyqmpw python"
+            fc: nvfortran
+            cc: gcc
+            use_simint: 1
           - os: ubuntu-20.04
             experimental: true 
             mpi_impl: mpich
@@ -143,6 +151,7 @@ jobs:
         NWCHEM_MODULES: ${{ matrix.nwchem_modules }}
         USE_SIMINT: ${{ matrix.use_simint }}
         FC: ${{ matrix.fc }}
+        CC: ${{ matrix.cc }}
         COMEX_MAX_NB_OUTSTANDING: 4
         SIMINT_MAXAM: 3
         SIMINT_VECTOR: avx2

From 9e6e98aef00ce233fa88af0299f4b91353553153 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Tue, 11 May 2021 11:04:40 -0700
Subject: [PATCH 26/45] remove CC definition from matrix

---
 .github/workflows/github_actions.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/github_actions.yml b/.github/workflows/github_actions.yml
index 88eef200c18..7b038f94ea9 100644
--- a/.github/workflows/github_actions.yml
+++ b/.github/workflows/github_actions.yml
@@ -151,7 +151,6 @@ jobs:
         NWCHEM_MODULES: ${{ matrix.nwchem_modules }}
         USE_SIMINT: ${{ matrix.use_simint }}
         FC: ${{ matrix.fc }}
-        CC: ${{ matrix.cc }}
         COMEX_MAX_NB_OUTSTANDING: 4
         SIMINT_MAXAM: 3
         SIMINT_VECTOR: avx2

From abb3207305d418acf76785dc220301bb345a1ce3 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Tue, 11 May 2021 15:40:41 -0700
Subject: [PATCH 27/45] large speedup of simint build by setting build=release
 for generator

---
 src/NWints/simint/libsimint_source/build_simint.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/NWints/simint/libsimint_source/build_simint.sh b/src/NWints/simint/libsimint_source/build_simint.sh
index 251bf9b5f5e..7de2434a837 100755
--- a/src/NWints/simint/libsimint_source/build_simint.sh
+++ b/src/NWints/simint/libsimint_source/build_simint.sh
@@ -130,7 +130,10 @@ if [[ ${CMAKE_VER} -lt 3 ]]; then
     echo define the CMAKE env. variable
     exit 1
 fi
-$CMAKE ../
+if [[ -z "${SIMINT_BUILD_TYPE}" ]]; then
+    SIMINT_BUILD_TYPE=Release
+fi
+$CMAKE  -DCMAKE_BUILD_TYPE="${SIMINT_BUILD_TYPE}"  ../
 make -j2
 cd ..
 #./create.py -g build/generator/ostei -l 6 -p 4 -d 1 simint.l6_p4_d1
@@ -190,9 +193,6 @@ elif  [ ${FC} == nvfortran ] || [ ${FC} == pgf90 ] ; then
     CC=gcc
     CXX=g++
 fi
-if [[ -z "${SIMINT_BUILD_TYPE}" ]]; then
-    SIMINT_BUILD_TYPE=Release
-fi
 echo Fortran_FLAGS equal "$Fortran_FLAGS"
 FC="${FC}" CXX="${CXX}" $CMAKE \
  -DCMAKE_BUILD_TYPE="${SIMINT_BUILD_TYPE}" -DSIMINT_VECTOR=${VEC}  \

From 0ba6d634982f8264eed2bf3cadc5e3d18a0a7307 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Wed, 12 May 2021 09:57:24 -0700
Subject: [PATCH 28/45] missing arguments spotted by Michael.Klemm@amd.com

---
 src/optim/neb/neb_utils.F | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/optim/neb/neb_utils.F b/src/optim/neb/neb_utils.F
index 9912a32a303..5cb778a10ab 100644
--- a/src/optim/neb/neb_utils.F
+++ b/src/optim/neb/neb_utils.F
@@ -2201,7 +2201,7 @@ real*8 function neb_line_energy(bead_list,kbeads,alpha,opt)
      >                          dbl_mb(c1(1)),
      >                          dbl_mb(e1(1)),
      >                          dbl_mb(t1(1)),
-     >                          dbl_mb(g1(1)))
+     >                          dbl_mb(g1(1)),.false.,.false.)
            shift = (m-1)*ng
            call dcopy(ng,dbl_mb(c1(1)),1,dbl_mb(cs(1)+shift),1)
            call dcopy(ng,dbl_mb(g1(1)),1,dbl_mb(gs(1)+shift),1)

From 07473ed86a778169cffcdbe690ddda7329addfa2 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Wed, 12 May 2021 16:50:50 -0700
Subject: [PATCH 29/45] riscv64 options

---
 src/config/makefile.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/config/makefile.h b/src/config/makefile.h
index e8b11758542..73781e20a86 100644
--- a/src/config/makefile.h
+++ b/src/config/makefile.h
@@ -1807,6 +1807,13 @@ endif
          FFLAGS_FORGA   = -mabi=64
          CFLAGS_FORGA   = -mabi=64
        endif
+       ifeq ($(_CPU),riscv64)
+         DONTHAVEM64OPT=Y
+         COPTIONS   =  -march=rv64gc -mabi=lp64d
+         FOPTIONS   =  -march=rv64gc -mabi=lp64d
+         FFLAGS_FORGA   = -march=rv64gc -mabi=lp64d
+         CFLAGS_FORGA   = -march=rv64gc -mabi=lp64d
+       endif
       ifeq ($(_CC),gcc)
        ifneq ($(DONTHAVEM64OPT),Y)
          COPTIONS   = -m64

From 624ce28d5e07328412cd9cd0aff3263b1683f17e Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Wed, 12 May 2021 17:29:35 -0700
Subject: [PATCH 30/45] default to peigs_CPU=PENTIUM if peigs_CPU is not
 defined

---
 src/peigs/DEFS | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/peigs/DEFS b/src/peigs/DEFS
index 325ed4e575b..9c0859f46b7 100644
--- a/src/peigs/DEFS
+++ b/src/peigs/DEFS
@@ -583,6 +583,9 @@ ifeq ($(FC),xlf)
     endif
   endif
 
+  ifndef peigs_CPU
+        peigs_CPU  = PENTIUM
+  endif
 endif#end of LINUX64
 
 ifeq ($(peigs_TARGET),cray-sv2)

From 0a3d926025979528107cf669f291218880d7e830 Mon Sep 17 00:00:00 2001
From: Eric Bylaska <bylaska@gmail.com>
Date: Thu, 13 May 2021 08:11:43 -0700
Subject: [PATCH 31/45] debugging pspw fractional optimizer...EJB

---
 src/nwpw/pspw/cgsd/bybminimize2.F |  4 +++-
 src/nwpw/pspw/cgsd/cgsd_energy.F  | 19 +++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/nwpw/pspw/cgsd/bybminimize2.F b/src/nwpw/pspw/cgsd/bybminimize2.F
index 9ca93d52f4b..026f0643284 100644
--- a/src/nwpw/pspw/cgsd/bybminimize2.F
+++ b/src/nwpw/pspw/cgsd/bybminimize2.F
@@ -239,7 +239,8 @@ subroutine bybminimize2(E,deltae,deltac,current_iteration,
          if (dohfx) call psi_1genrho()
          if (control_fractional()) then
             call psi_1assending_occupation()
-            call psi_1define_occupation(0.0d0,.false.)
+            !call psi_1define_occupation(0.0d0,.false.)
+            call psi_1define_occupation(-1.0d0,.false.)
             Enew1 = Enew1 + psi_smearcorrection()
          end if
 !$OMP MASTER
@@ -302,6 +303,7 @@ subroutine bybminimize2(E,deltae,deltac,current_iteration,
       call psi_1rotate2()
       call psi_2to1()
 
+
 !$OMP BARRIER
 
 *     **** define fractional occupation ****
diff --git a/src/nwpw/pspw/cgsd/cgsd_energy.F b/src/nwpw/pspw/cgsd/cgsd_energy.F
index 0c2aa75424d..91bc9f1eef8 100644
--- a/src/nwpw/pspw/cgsd/cgsd_energy.F
+++ b/src/nwpw/pspw/cgsd/cgsd_energy.F
@@ -146,17 +146,16 @@ real*8 function cgsd_energy(newpsi)
       if (minimizer.eq.8) it_out = 1
       if ((newpsi).or.(nwpw_cosmo_firsttime())) then
          call pspw_Lin_HFX_reset()
-         call sdminimize(15)
+         if (minimizer.lt.4) call sdminimize(15)
          call bybminimize0()
-         if (control_fractional()) then
-            call psi_1toelectron()
-            call psi_1gen_hml()
-            call psi_diagonalize_hml_assending()
-            call psi_1rotate2()
-            call psi_2to1()
-            call psi_1define_occupation(-1.0d0,.false.)
-            !call psi_1define_occupation(1.0d0,.false.)
-         end if
+c         if (control_fractional()) then
+c            call psi_1toelectron()
+c            call psi_1gen_hml()
+c            call psi_diagonalize_hml_assending()
+c            call psi_1rotate2()
+c            call psi_2to1()
+c            call psi_1define_occupation(-1.0d0,.false.)
+c         end if
       end if
 
    2  continue

From 56875e03d10ae8b48c554a567dbc7cf13d89bcef Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Thu, 13 May 2021 10:31:36 -0700
Subject: [PATCH 32/45] fix for Simint memory requirements
 https://github.com/nwchemgit/nwchem/issues/372

---
 src/NWints/simint/source/nwcsim_facef90.F | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/NWints/simint/source/nwcsim_facef90.F b/src/NWints/simint/source/nwcsim_facef90.F
index 26520868c11..46a2cf3f40a 100644
--- a/src/NWints/simint/source/nwcsim_facef90.F
+++ b/src/NWints/simint/source/nwcsim_facef90.F
@@ -102,6 +102,7 @@ subroutine nwcsim_init(rtdb,nbas,bases,num_der)
      &              call errquit(pname//'Exiting ',8, BASIS_ERR)
                if (.not. geom_cent_get(geom, iat, tag,
      &              coord, q))call errquit
+     &              (pname//'Exiting ',9, GEOM_ERR)
                nwcsim_noshell(bas)=nwcsim_noshell(bas)+1
                call simint_initialize_shell(
      S              smnt_sh(nwcsim_noshell(bas),bas))
@@ -124,10 +125,10 @@ subroutine nwcsim_init(rtdb,nbas,bases,num_der)
 c
 c     memory allocation
 c
-         isz_2e4c = max(isz_2e4c,
-     S        simint_eri_worksize(num_der, max_ang))
+C simint_eri_workmem gives the minimum size of the workspace required in bytes
          mem_2e4c = max(mem_2e4c,
-     S        simint_eri_workmem(num_der, max_ang))
+     S        simint_eri_workmem(num_der, max_ang))/
+     M        MA_sizeof(MT_INT,1,MT_BYTE)
       enddo ! basis loop
       endif
 c
@@ -136,16 +137,16 @@ subroutine nwcsim_init(rtdb,nbas,bases,num_der)
 c     SIMINT_PRIM_SCREEN_STAT needs 4 more doubles
 c      isz_2e4c        = isz_2e4c + 4
 c      
-      call util_align(isz_2e4c,SIMINT_SIMD_LEN)
-      call util_align(mem_2e4c,SIMINT_SIMD_LEN)
 c
-      iszb_2e4c=isz_2e4c
+c      iszb_2e4c=isz_2e4c
       
       if(num_der.eq.1) then
          memb_2e4c = mem_2e4c + mem_2e4c/5
       else
          memb_2e4c = mem_2e4c + mem_2e4c/10 ! +10% to be safe
       endif
+      call util_align(isz_2e4c,SIMINT_SIMD_LEN)
+      call util_align(mem_2e4c,SIMINT_SIMD_LEN)
       call util_align(memb_2e4c,SIMINT_SIMD_LEN)
       mem_2e3c = mem_2e4c
       mem_2e2c = mem_2e4c

From 2707ea08b695731c903bf76c2d96136feb97e6e7 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Thu, 13 May 2021 10:48:48 -0700
Subject: [PATCH 33/45] refresh github action cache to test latest simint
 commits

---
 .github/workflows/github_actions.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/github_actions.yml b/.github/workflows/github_actions.yml
index 7b038f94ea9..6a72d8628e0 100644
--- a/.github/workflows/github_actions.yml
+++ b/.github/workflows/github_actions.yml
@@ -174,7 +174,7 @@ jobs:
         with:
           path: |
             ~/cache
-          key: ${{ matrix.os }}-${{ matrix.mpi_impl}}-${{ matrix.fc}}-xcode${{ matrix.xcode_version}}-simint${{ matrix.use_simint}}-blas${{ matrix.blas}}-simd${{ steps.get-simd.outputs.simd }}-nwchemcache-v004
+          key: ${{ matrix.os }}-${{ matrix.mpi_impl}}-${{ matrix.fc}}-xcode${{ matrix.xcode_version}}-simint${{ matrix.use_simint}}-blas${{ matrix.blas}}-simd${{ steps.get-simd.outputs.simd }}-nwchemcache-v005
       - name: build environment
         run: |
           pwd

From 05908c3366b8a314af4c3e06333230593a76010a Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Thu, 13 May 2021 13:32:28 -0700
Subject: [PATCH 34/45] added ifort simint step

---
 .github/workflows/github_actions.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/github_actions.yml b/.github/workflows/github_actions.yml
index 6a72d8628e0..475c3418fd6 100644
--- a/.github/workflows/github_actions.yml
+++ b/.github/workflows/github_actions.yml
@@ -97,10 +97,18 @@ jobs:
             experimental: true
             mpi_impl: mpich
             armci_network: mpi-ts
-            nwchem_modules: "tinyqmpw python"
+            nwchem_modules: "nwdft solvation driver"
             fc: nvfortran
             cc: gcc
             use_simint: 1
+          - os: ubuntu-20.04
+            experimental: true
+            mpi_impl: intel
+            armci_network: mpi-ts
+            nwchem_modules: "nwdft solvation driver"
+            fc: ifort
+            cc: icc
+            use_simint: 1
           - os: ubuntu-20.04
             experimental: true 
             mpi_impl: mpich

From 6ff10eeed4e0d3c14406329ae3701d10f610bd55 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Thu, 13 May 2021 13:32:28 -0700
Subject: [PATCH 35/45] require gcc6 for skylake

---
 src/NWints/simint/libsimint_source/build_simint.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/NWints/simint/libsimint_source/build_simint.sh b/src/NWints/simint/libsimint_source/build_simint.sh
index 7de2434a837..079cc076dc2 100755
--- a/src/NWints/simint/libsimint_source/build_simint.sh
+++ b/src/NWints/simint/libsimint_source/build_simint.sh
@@ -51,6 +51,19 @@ else
     VEC=scalar
 fi
 echo VEC $VEC
+if [[ "${VEC}" == "avx512" ]]; then
+if [[   -z "${CC}" ]]; then
+    CC=cc
+fi
+let GCCVERSIONGT5=$(expr `${CC} -dumpversion | cut -f1 -d.` \> 5)
+    if [[ ${GCCVERSIONGT5} != 1 ]]; then
+	echo
+	echo you have gcc version $(${CC} -dumpversion | cut -f1 -d.)
+	echo gcc version 6 and later needed for skylake
+	echo
+	exit 1
+    fi
+fi
 SRC_HOME=`pwd`
 DERIV=1
 if [[  -z "${SIMINT_MAXAM}" ]]; then

From 5d09697f4967485a69c990732319635366445347 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Thu, 13 May 2021 15:24:05 -0700
Subject: [PATCH 36/45] changes for l=7 aka k functions

---
 src/basis/bas_input.F | 4 ++--
 src/basis/basis.F     | 3 ++-
 src/util/nwc_const.fh | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/basis/bas_input.F b/src/basis/bas_input.F
index d521d3995d8..8c9f610ff02 100644
--- a/src/basis/bas_input.F
+++ b/src/basis/bas_input.F
@@ -358,7 +358,7 @@ subroutine bas_input_body(basis, osegment, oIs_rel, oHas_Star)
       integer nltypes           ! No. of known angular momentum types
       integer nsptypes          ! No. of known sp type shells
       integer nopts             ! No. of options
-      parameter (nltypes = 7, nsptypes = 3, nopts = 6)
+      parameter (nltypes = 8, nsptypes = 3, nopts = 6)
       character*1 ltypes(nltypes)
       character*2 sptypes(nsptypes)
       character*8 opts(nopts)
@@ -374,7 +374,7 @@ subroutine bas_input_body(basis, osegment, oIs_rel, oHas_Star)
 cc AJL/Begin/SPIN-POLARISED ECPs
       integer channel           ! Both (Default)=0; Alpha=1; Beta=2
 cc AJL/End
-      data ltypes /'s','p','d','f','g','h','i'/
+      data ltypes /'s','p','d','f','g','h','i','k'/
       data sptypes / 'sp', 'l ', 'ul'/
       data spvalues/  -1 , -1 ,   -1 /
       data opts / 'except', 'library', 'file', 'rel', 'nelec' ,
diff --git a/src/basis/basis.F b/src/basis/basis.F
index 8a4a6b9670f..a6571e3de55 100644
--- a/src/basis/basis.F
+++ b/src/basis/basis.F
@@ -658,7 +658,7 @@ logical function bas_print(basisin)
       integer atn, len_tag, len_ele
       character*2 symbol
       character*16 element
-      character*3 ctype(0:6),cltype(2)
+      character*3 ctype(0:7),cltype(2)
       character*3 shell_type
 *. . . . . . . . . . . ! Room for tag+space+(+element+) = 16+1+1+16+1
       character*35 buffer  
@@ -683,6 +683,7 @@ logical function bas_print(basisin)
       ctype(4)='G'
       ctype(5)='H'
       ctype(6)='I'
+      ctype(7)='K'
       cltype(1)='SP'
       cltype(2)='SPD'
       bas_print = .true.
diff --git a/src/util/nwc_const.fh b/src/util/nwc_const.fh
index 3aa4a44b9ca..b7263626150 100644
--- a/src/util/nwc_const.fh
+++ b/src/util/nwc_const.fh
@@ -49,9 +49,9 @@
       parameter (nw_max_coor = 3*nw_max_atom)
 *------------------------------------------------------------------------------
 * Maximum angular momentum (union of all integral functionality)
-*                                 0=S, 1=P, 2=D, 3=F, 4=G, 5=H, 6=I
+*                                 0=S, 1=P, 2=D, 3=F, 4=G, 5=H, 6=I, 7=K
       integer nw_max_angular
-      parameter (nw_max_angular = 6)
+      parameter (nw_max_angular = 7)
 *------------------------------------------------------------------------------
 * Maximum number of primitive gaussians in a segmented shell 
 *....................................................... or general contraction

From d06274c2b40654e984a21a7b10fc057fa97ad8b9 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Thu, 13 May 2021 19:51:11 -0700
Subject: [PATCH 37/45] switch to OpenBLAS 0.3.15. fix avx512 detection with
 icc

---
 src/libext/openblas/build_openblas.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/libext/openblas/build_openblas.sh b/src/libext/openblas/build_openblas.sh
index a73c9aa7a23..5aa104e9b60 100755
--- a/src/libext/openblas/build_openblas.sh
+++ b/src/libext/openblas/build_openblas.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 set -v
 arch=`uname -m`
-VERSION=0.3.13
+VERSION=0.3.15
 #COMMIT=974acb39ff86121a5a94be4853f58bd728b56b81
 BRANCH=develop
 #if [ -f  OpenBLAS-${VERSION}.tar.gz ]; then
@@ -98,6 +98,11 @@ if [[ "$FORCETARGET" == *"SKYLAKEX"* ]]; then
 	exit 1
     fi
 fi
+#this fixes avx512 detection for icc
+if [[ "${CC}" == "icc" ]]; then
+    FORCETARGET+=HOSTCC=\"icc -xhost\"
+fi
+
 #disable threading for ppc64le since it uses OPENMP
 echo arch is "$arch"
 if [[ "$arch" == "ppc64le" ]]; then

From 9f062ecb156365c0d5548343ad21e4585026368b Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Thu, 13 May 2021 19:57:35 -0700
Subject: [PATCH 38/45] fixed check for existing openblas tarball

---
 src/libext/openblas/build_openblas.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/libext/openblas/build_openblas.sh b/src/libext/openblas/build_openblas.sh
index 5aa104e9b60..b9557867df8 100755
--- a/src/libext/openblas/build_openblas.sh
+++ b/src/libext/openblas/build_openblas.sh
@@ -4,10 +4,10 @@ arch=`uname -m`
 VERSION=0.3.15
 #COMMIT=974acb39ff86121a5a94be4853f58bd728b56b81
 BRANCH=develop
-#if [ -f  OpenBLAS-${VERSION}.tar.gz ]; then
-#    echo "using existing"  OpenBLAS-${VERSION}.tar.gz
-if [ -f  OpenBLAS-$COMMIT.zip ]; then
-    echo "using existing"  OpenBLAS-${COMMIT}.zip
+if [ -f  OpenBLAS-${VERSION}.tar.gz ]; then
+    echo "using existing"  OpenBLAS-${VERSION}.tar.gz
+#if [ -f  OpenBLAS-$COMMIT.zip ]; then
+#    echo "using existing"  OpenBLAS-${COMMIT}.zip
 else
     rm -rf OpenBLAS*
 #    curl -L https://github.com/xianyi/OpenBLAS/archive/$COMMIT.zip -o OpenBLAS-$COMMIT.zip

From bbb4773f42fb0429b53ad1ba7b0a60954baab093 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Fri, 14 May 2021 10:31:00 -0700
Subject: [PATCH 39/45] scalapack size bug fix. should address the mac
 accelerate failures

---
 src/libext/scalapack/build_scalapa.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/libext/scalapack/build_scalapa.sh b/src/libext/scalapack/build_scalapa.sh
index 3ed6245f353..90016eeda86 100755
--- a/src/libext/scalapack/build_scalapa.sh
+++ b/src/libext/scalapack/build_scalapa.sh
@@ -83,7 +83,7 @@ if [[ "$BLAS_SIZE" != "$SCALAPACK_SIZE"  ]] ; then
     exit 1
 fi
 
-if [[  -z "${SCALAPCK_SIZE}" ]]; then
+if [[  -z "${SCALAPACK_SIZE}" ]]; then
    SCALAPACK_SIZE=8
 fi
 if [[ "$BLAS_SIZE" == 4 ]] && [[ -z "$USE_64TO32"   ]] ; then
@@ -141,7 +141,7 @@ GOTCLANG=$( mpicc -dM -E - </dev/null 2> /dev/null |grep __clang__|head -1|cut -
 if [[ ${GOTCLANG} == "1" ]] ; then
     C_FLAGS=" -Wno-error=implicit-function-declaration "
 fi
-
+echo "SCALAPACK_SIZE" is $SCALAPACK_SIZE
 if [[  "$SCALAPACK_SIZE" == 8 ]] ; then
     GFORTRAN_EXTRA=$(echo $FC | cut -c 1-8)
     if  [[ ${FC} == gfortran ]] || [[ ${FC} == f95 ]] || [[ ${GFORTRAN_EXTRA} == gfortran ]] ; then
@@ -153,6 +153,7 @@ if [[  "$SCALAPACK_SIZE" == 8 ]] ; then
     fi
     C_FLAGS+=" -DInt=long"
 fi
+echo compiling with CC=mpicc  FC=$MPIF90 CFLAGS="$C_FLAGS" FFLAGS="$Fortran_FLAGS" $CMAKE -Wno-dev ../ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_FLAGS="$C_FLAGS"  -DCMAKE_Fortran_FLAGS="$Fortran_FLAGS" -DTEST_SCALAPACK=OFF  -DBUILD_TESTING=OFF -DBUILD_SHARED_LIBS=OFF  -DBLAS_openblas_LIBRARY="$BLASOPT"  -DBLAS_LIBRARIES="$BLASOPT"  -DLAPACK_openblas_LIBRARY="$BLASOPT"  -DLAPACK_LIBRARIES="$BLASOPT" 
 CC=mpicc  FC=$MPIF90 CFLAGS="$C_FLAGS" FFLAGS="$Fortran_FLAGS" $CMAKE -Wno-dev ../ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_FLAGS="$C_FLAGS"  -DCMAKE_Fortran_FLAGS="$Fortran_FLAGS" -DTEST_SCALAPACK=OFF  -DBUILD_TESTING=OFF -DBUILD_SHARED_LIBS=OFF  -DBLAS_openblas_LIBRARY="$BLASOPT"  -DBLAS_LIBRARIES="$BLASOPT"  -DLAPACK_openblas_LIBRARY="$BLASOPT"  -DLAPACK_LIBRARIES="$BLASOPT" 
 make V=0 -j3 scalapack/fast
 mkdir -p ../../../lib

From 367ce443496367e20cc150b58d5384cbe74407a8 Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Fri, 14 May 2021 11:45:42 -0700
Subject: [PATCH 40/45] fix wrong position of parenthesis
 https://github.com/nwchemgit/nwchem/issues/372#issuecomment-841131707

---
 src/NWints/simint/source/nwcsim_facef90.F | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/NWints/simint/source/nwcsim_facef90.F b/src/NWints/simint/source/nwcsim_facef90.F
index 46a2cf3f40a..2b479ee684a 100644
--- a/src/NWints/simint/source/nwcsim_facef90.F
+++ b/src/NWints/simint/source/nwcsim_facef90.F
@@ -125,10 +125,8 @@ subroutine nwcsim_init(rtdb,nbas,bases,num_der)
 c
 c     memory allocation
 c
-C simint_eri_workmem gives the minimum size of the workspace required in bytes
          mem_2e4c = max(mem_2e4c,
-     S        simint_eri_workmem(num_der, max_ang))/
-     M        MA_sizeof(MT_INT,1,MT_BYTE)
+     S        simint_eri_worksize(num_der, max_ang))
       enddo ! basis loop
       endif
 c

From 4ccdc364cb069dcef6867d0d3ab0f9b447db6320 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 2 May 2021 16:46:21 -0700
Subject: [PATCH 41/45] use do concurrent in 64-to-32 copy

---
 src/ccsd/convert_single_double.F | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/ccsd/convert_single_double.F b/src/ccsd/convert_single_double.F
index fe598ceb89d..140276e3168 100644
--- a/src/ccsd/convert_single_double.F
+++ b/src/ccsd/convert_single_double.F
@@ -9,10 +9,9 @@ subroutine copy_32_to_64(n, a32, a64)
       real(kind=dp), intent(out) :: a64(n)
       !
       integer :: i
-      !$OMP SIMD
-      do i=1, n
+      do concurrent (i=1:n)
           a64(i) = real(a32(i), kind=dp)
-      enddo
+      end do
       end subroutine
 
       subroutine copy_64_to_32(n, a64, a32)
@@ -26,10 +25,9 @@ subroutine copy_64_to_32(n, a64, a32)
       real(kind=sp), intent(out) :: a32(n)
       !
       integer :: i
-      !$OMP SIMD
-      do i=1, n
+      do concurrent (i=1:n)
           a32(i) = real(a64(i), kind=sp)
-      enddo
+      end do
       end subroutine
 
       subroutine add_32_to_64(n, a32, a64)
@@ -43,10 +41,9 @@ subroutine add_32_to_64(n, a32, a64)
       real(kind=dp), intent(inout) :: a64(n)
       !
       integer :: i
-      !$OMP SIMD
-      do i=1, n
+      do concurrent (i=1:n)
           a64(i) = a64(i) + real(a32(i), kind=dp)
-      enddo
+      end do
       end subroutine
 
       subroutine add_64_to_32(n, a64, a32)
@@ -60,8 +57,7 @@ subroutine add_64_to_32(n, a64, a32)
       real(kind=sp), intent(inout) :: a32(n)
       !
       integer :: i
-      !$OMP SIMD
-      do i=1, n
+      do concurrent (i=1:n)
           a32(i) = a32(i) + real(a64(i), kind=sp)
-      enddo
+      end do
       end subroutine

From b081e18b64bf7f9e3abeedc870bb8c6eee89fe45 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sun, 2 May 2021 17:09:25 -0700
Subject: [PATCH 42/45] NVIDIA GPU version of CCSD(T) semidirect (trpdrv)

at build time, you must specify USE_OPENACC_TRPDRV=1 and NWCHEM_LINK_CUDA=1.
the former turns on the OpenACC+CUBLAS compilation in TRPDRV.
the latter turns on the use of CUDA managed memory in GA.

for now, you must specify DEV_GA=1 at build time, until https://github.com/GlobalArrays/ga/pull/210 is part of GA release version NWChem uses.

at run time, you must specify MA_USE_CUDA_MEM=1
this causes MA to use CUDA managed memory, without which the code will crash, if the OS does not support system unified memory.

this code does not yet deal with how MPI and multi-GPU systems interact.
---
 src/ccsd/GNUmakefile           |   6 +
 src/ccsd/aoccsd2.F             |  29 ++
 src/ccsd/ccsd_trpdrv_openacc.F | 572 +++++++++++++++++++++++++++++++++
 src/config/makefile.h          |   4 +
 src/tools/GNUmakefile          |   4 +
 5 files changed, 615 insertions(+)
 create mode 100644 src/ccsd/ccsd_trpdrv_openacc.F

diff --git a/src/ccsd/GNUmakefile b/src/ccsd/GNUmakefile
index cecc25724b1..4516406a42f 100644
--- a/src/ccsd/GNUmakefile
+++ b/src/ccsd/GNUmakefile
@@ -109,6 +109,12 @@ ifeq ($(HAVE_SET_GA_PROPERTY),Y)
       LIB_DEFINES += -DHAVE_SET_GA_PROPERTY
 endif
 
+ifdef USE_OPENACC_TRPDRV
+  OBJ_OPTIMIZE += ccsd_trpdrv_openacc.o
+  USES_BLAS    += ccsd_trpdrv_openacc.F
+  FOPTIONS += -acc -gpu=managed -cuda -cudalib=cublas
+endif
+
 
 ifeq ($(ARMCI_NETWORK),MPI-PR)
    LIB_DEFINES += -DACC_STRIPS
diff --git a/src/ccsd/aoccsd2.F b/src/ccsd/aoccsd2.F
index 6829c5c51bf..87e258f12cd 100644
--- a/src/ccsd/aoccsd2.F
+++ b/src/ccsd/aoccsd2.F
@@ -16,6 +16,7 @@ subroutine aoccsd(basis,ncor,nocc,nvir,ndel,nact,nbf,maxit,
       logical oconverged, occd, use_trpdrv_nb
       logical use_trpdrv_omp, use_trpdrv_bgp2
       logical use_trpdrv_omp_mp
+      logical use_trpdrv_openacc
       logical use_trpdrv_offload
 c
 #include "ccsd_len.fh"
@@ -717,6 +718,9 @@ subroutine ccsd_iterdrv2(rtdb,basis,nsh,ncor,nocc,nvir,nact,nbf,
       if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_omp_mp', mt_log, 1,
      1                   use_trpdrv_omp_mp))
      2    use_trpdrv_omp_mp=.false.
+      if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_openacc', mt_log, 1,
+     1                   use_trpdrv_openacc))
+     2    use_trpdrv_openacc=.false. ! default off if key not in rtdb
       if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_offload', mt_log, 1,
      1                   use_trpdrv_offload))
      2    use_trpdrv_offload=.false.
@@ -976,6 +980,31 @@ subroutine ccsd_iterdrv2(rtdb,basis,nsh,ncor,nocc,nvir,nact,nbf,
      $        dbl_mb(k_trp_Kka), dbl_mb(k_trp_Jij), dbl_mb(k_trp_Jkj),
      $        dbl_mb(k_trp_Kij), dbl_mb(k_trp_Kkj), dbl_mb(k_trp_Dja),
      $        dbl_mb(k_trp_Djka), dbl_mb(k_trp_Djia))
+c
+         else if (use_trpdrv_openacc) then
+#if defined(USE_OPENACC_TRPDRV)
+#ifndef USE_F90_ALLOCATABLE
+#error You must set USE_F90_ALLOCATABLE if USE_OPENACC_TRPDRV is set!
+#endif
+         if (iam.eq.0.and.oprint) then
+            write(luout,1808) nvpass,util_wallsec()
+            call util_flush(luout)
+         endif
+ 1808    format(' commencing triples evaluation - openacc version',i8,
+     I        ' at ',f20.2,' secs')
+         call ccsd_trpdrv_openacc(dbl_mb(k_t1),
+     $        f1n,f1t,f2n,f2t,f3n,f3t,f4n,f4t,
+     $        eorb,g_objo,g_objv,g_coul,g_exch,ncor,nocc,nvir,iprt,
+     $        empt(1),empt(2),oseg_lo,oseg_hi,kchunk,
+     $        dbl_mb(k_trp_Tij), dbl_mb(k_trp_Tkj), dbl_mb(k_trp_Tia),
+     $        dbl_mb(k_trp_Tka), dbl_mb(k_trp_Xia), dbl_mb(k_trp_Xka),
+     $        dbl_mb(k_trp_Jia), dbl_mb(k_trp_Jka), dbl_mb(k_trp_Kia),
+     $        dbl_mb(k_trp_Kka), dbl_mb(k_trp_Jij), dbl_mb(k_trp_Jkj),
+     $        dbl_mb(k_trp_Kij), dbl_mb(k_trp_Kkj), dbl_mb(k_trp_Dja),
+     $        dbl_mb(k_trp_Djka), dbl_mb(k_trp_Djia))
+#else
+         call errquit('aoccsd: trpdrv_openacc disabled ',0,0)
+#endif
 c
          elseif (use_trpdrv_omp_mp) then
 #ifndef TRPMIXP_OFF
diff --git a/src/ccsd/ccsd_trpdrv_openacc.F b/src/ccsd/ccsd_trpdrv_openacc.F
new file mode 100644
index 00000000000..4a174d5d883
--- /dev/null
+++ b/src/ccsd/ccsd_trpdrv_openacc.F
@@ -0,0 +1,572 @@
+      subroutine ccsd_trpdrv_openacc(t1,
+     &     f1n,f1t,f2n,f2t,f3n,f3t,f4n,f4t,eorb,
+     &     g_objo,g_objv,g_coul,g_exch,
+     &     ncor,nocc,nvir,iprt,emp4,emp5,
+     &     oseg_lo,oseg_hi, kchunk,
+     &     Tij, Tkj, Tia, Tka, Xia, Xka, Jia, Jka, Kia, Kka,
+     &     Jij, Jkj, Kij, Kkj, Dja, Djka, Djia)
+      use iso_fortran_env
+      use cudafor
+      use cublas
+      implicit none
+!  CCSD(T) triples driver: cuBLAS DGEMMs plus an OpenACC reduction.
+#include "errquit.fh"
+#include "global.fh"
+#include "ccsd_len.fh"
+#include "ccsdps.fh"
+#include "util.fh"
+#include "msgids.fh"
+#include "yflop.fh"
+!  T*/X*/J*/K*/D* and f* arguments are work buffers overwritten here.
+      double precision, intent(inout) :: emp4,emp5
+      double precision, intent(in) :: t1(*)
+      integer, intent(in) :: ncor,nocc,nvir
+      integer, intent(in) :: iprt
+      integer, intent(in) :: g_objo,g_objv,g_coul,g_exch
+      integer, intent(in) :: oseg_lo,oseg_hi, kchunk
+      double precision, intent(inout), managed :: f1n(nvir,nvir)
+      double precision, intent(inout), managed :: f2n(nvir,nvir)
+      double precision, intent(inout), managed :: f3n(nvir,nvir)
+      double precision, intent(inout), managed :: f4n(nvir,nvir)
+      double precision, intent(inout), managed :: f1t(nvir,nvir)
+      double precision, intent(inout), managed :: f2t(nvir,nvir)
+      double precision, intent(inout), managed :: f3t(nvir,nvir)
+      double precision, intent(inout), managed :: f4t(nvir,nvir)
+      double precision, intent(in), managed :: eorb(*)
+      double precision, intent(inout), managed :: Tij(*), Tkj(*)
+      double precision, intent(inout), managed :: Tia(*), Tka(*)
+      double precision, intent(inout), managed :: Xia(*), Xka(*)
+      double precision, intent(inout), managed :: Jia(*), Jka(*)
+      double precision, intent(inout), managed :: Jij(*), Jkj(*)
+      double precision, intent(inout), managed :: Kia(*), Kka(*)
+      double precision, intent(inout), managed :: Kij(*), Kkj(*)
+      double precision, intent(inout), managed :: Dja(*),Djka(*),Djia(*)
+! used to make inline threaded tengy correct - for now
+! it is correct that dint[cx]1 are paired with t1v2 and vice versa
+! in the inlined tengy loops.  see ccsd_tengy in ccsd_trpdrv.F for
+! verification of the i-k and k-i pairing of these.
+      double precision, allocatable, managed :: dintc1(:),dintc2(:)
+      double precision, allocatable, managed :: dintx1(:),dintx2(:)
+      double precision, allocatable, managed :: t1v1(:),t1v2(:)
+      integer :: alloc_error, err
+!
+      double precision :: emp4i,emp5i,emp4k,emp5k
+      double precision :: eaijk,denom
+      integer :: inode,next,nodes,iam
+      integer :: a,b,c,i,j,k,akold,av
+      ! chunking is the loop blocking size in the loop nest
+      ! formerly associated with the tengy routine.
+      ! we have not explored this parameter space but 32 is
+      ! optimal for TLB blocking in matrix transpose on most
+      ! architectures (especially x86).
+      integer, parameter :: chunking = 32
+      integer :: bb,cc
+      integer :: klo, khi
+      integer nxtask
+      external nxtask
+      double precision perfm_flop,tzero,flopzero,t_flops,agg_flops
+      external perfm_flop
+!
+!  Dependencies (global array, local array, handle):
+!
+!  These are waited on first
+!
+!      g_objv, Dja,  nbh_objv1
+!      g_objv, Djka(1+(k-klo)*nvir), nbh_objv4(k)
+!      g_objv, Djia, nbh_objv5
+!
+!  These are waited on later
+!
+!      g_objv, Tka,  nbh_objv2
+!      g_objv, Xka,  nbh_objv3
+!      g_objv, Tia,  nbh_objv6
+!      g_objv, Xia,  nbh_objv7
+!      g_objo, Tkj,  nbh_objo1
+!      g_objo, Jkj,  nbh_objo2
+!      g_objo, Kkj,  nbh_objo3
+!      g_objo, Tij,  nbh_objo4
+!      g_objo, Jij,  nbh_objo5
+!      g_objo, Kij,  nbh_objo6
+!      g_exch, Kka,  nbh_exch1
+!      g_exch, Kia,  nbh_exch2
+!      g_coul, Jka,  nbh_coul1
+!      g_coul, Jia,  nbh_coul2
+!
+!  non-blocking handles
+!
+      integer nbh_objv1,nbh_objv2,nbh_objv3
+      integer nbh_objv5,nbh_objv6,nbh_objv7
+      integer nbh_objv4(nocc)
+!
+      integer nbh_objo1,nbh_objo2,nbh_objo3
+      integer nbh_objo4,nbh_objo5,nbh_objo6
+!
+      integer nbh_exch1,nbh_exch2,nbh_coul1,nbh_coul2
+      integer n_progr,pct_progr
+      parameter(n_progr=20)
+      logical i_progr(n_progr+1)
+!  one cuBLAS handle per CUDA stream; shi indexes both arrays
+      integer(INT32) :: shi
+      type(cublasHandle) :: handle(8)
+      integer(kind=cuda_stream_kind) :: stream(8)
+      double precision :: tt0, tt1
+!
+      integer(INT32) :: nv4, no4 ! cublasDgemm requires 32-bit integers
+      integer(INT32) :: cu_op_n, cu_op_t
+      cu_op_n = CUBLAS_OP_N ! 0
+      cu_op_t = CUBLAS_OP_T ! 1
+!
+      if (ga_nodeid().eq.0) then
+        write(6,99)
+      endif
+   99 format(2x,'Using OpenACC and cuBLAS offload in CCSD(T)')
+      tzero=util_wallsec()
+      flopzero=perfm_flop()
+!
+! CUDA stuff
+!
+      tt0 = util_wallsec()
+      do shi=1,8
+        err = cudaStreamCreate(stream(shi))
+        if (err.ne.0) call errquit('cudaStreamCreate',err,UNKNOWN_ERR)
+        err = cublasCreate(handle(shi))
+        if (err.ne.0) call errquit('cublasCreate',err,UNKNOWN_ERR)
+        err = cublasSetStream(handle(shi), stream(shi))
+        if (err.ne.0) call errquit('cublasSetStream',err,UNKNOWN_ERR)
+      end do
+      tt1 = util_wallsec()
+      if (ga_nodeid().eq.0) then
+        write(6,500) tt1-tt0
+  500   format('CU init took ',e15.5,' seconds')
+      endif
+!
+      allocate( dintc1(1:nvir), stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('dintc1',1,MA_ERR)
+      allocate( dintx1(1:nvir), stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('dintx1',2,MA_ERR)
+      allocate( t1v1(1:nvir), stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('t1v1',3,MA_ERR)
+      allocate( dintc2(1:nvir), stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('dintc2',4,MA_ERR)
+      allocate( dintx2(1:nvir), stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('dintx2',5,MA_ERR)
+      allocate( t1v2(1:nvir), stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('t1v2',6,MA_ERR)
+!
+      nodes = ga_nnodes()
+      iam = ga_nodeid()
+!
+!      call ga_sync() ! ga_sync called just before trpdrv in aoccsd2
+!
+      if (occsdps) then
+         call pstat_on(ps_trpdrv)
+      else
+         call qenter('trpdrv',0)
+      endif
+       do klo=1,n_progr+1
+          i_progr(klo)=.true.
+       enddo
+      inode=-1
+      next=nxtask(nodes, 1)
+      do klo = 1, nocc, kchunk
+         akold=0
+         khi = min(nocc, klo+kchunk-1)
+         do a=oseg_lo,oseg_hi
+            av=a-ncor-nocc
+            do j=1,nocc
+               inode=inode+1
+               if (inode.eq.next)then
+
+                  call ga_nbget(g_objv,1+(j-1)*lnov,j*lnov,av,av,Dja,
+     &                          lnov,nbh_objv1)
+                  do k = klo, khi
+                     call ga_nbget(g_objv,1+(j-1)*nvir+(k-1)*lnov,
+     &                    j*nvir+(k-1)*lnov,av,av,
+     &                    Djka(1+(k-klo)*nvir),nvir,nbh_objv4(k))
+                  enddo
+                  call ga_nbget(g_objo,(klo-1)*lnvv+1,khi*lnvv,j,j,Tkj,
+     &                          (khi-klo+1)*lnvv,nbh_objo1)
+                  call ga_nbget(g_objo,lnovv+(klo-1)*lnov+1,
+     &                          lnovv+khi*lnov,j,j,Jkj,
+     &                          (khi-klo+1)*lnov,nbh_objo2)
+                  call ga_nbget(g_objo,lnovv+lnoov+(klo-1)*lnov+1,
+     &                          lnovv+lnoov+khi*lnov,j,j,Kkj,
+     &                          (khi-klo+1)*lnov,nbh_objo3)
+                  if (akold .ne. a) then
+                     akold = a
+                     call ga_nbget(g_coul,1,lnvv,(a-oseg_lo)*nocc+klo,
+     &                    (a-oseg_lo)*nocc+khi,Jka,lnvv,nbh_coul1)
+                     call ga_nbget(g_exch,1,lnvv,(a-oseg_lo)*nocc+klo,
+     &                    (a-oseg_lo)*nocc+khi,Kka,lnvv,nbh_exch1)
+                     call ga_nbget(g_objv,1+lnoov+(klo-1)*lnov,
+     &                    lnoov+khi*lnov,av,av,Tka,(khi-klo+1)*lnov,
+     &                    nbh_objv2)
+                     call ga_nbget(g_objv,1+2*lnoov+(klo-1)*lnov,
+     &                    2*lnoov+khi*lnov,av,av,Xka,(khi-klo+1)*lnov,
+     &                    nbh_objv3)
+                  endif
+
+                  do i=1,nocc
+
+                     call ga_nbget(g_objv,1+(j-1)*nvir+(i-1)*lnov,
+     &                    j*nvir+(i-1)*lnov,av,av,Djia,nvir,nbh_objv5)
+                     call ga_nbget(g_objo,(i-1)*lnvv+1,i*lnvv,j,j,Tij,
+     &                    lnvv,nbh_objo4)
+                     call ga_nbget(g_objo,lnovv+(i-1)*lnov+1,
+     &                    lnovv+i*lnov,j,j,Jij,lnov,nbh_objo5)
+                     call ga_nbget(g_objo,lnovv+lnoov+(i-1)*lnov+1,
+     &                    lnovv+lnoov+i*lnov,j,j,Kij,lnov,nbh_objo6)
+                     call ga_nbget(g_coul,1,lnvv,(a-oseg_lo)*nocc+i,
+     &                    (a-oseg_lo)*nocc+i,Jia,lnvv,nbh_coul2)
+                     call ga_nbget(g_exch,1,lnvv,(a-oseg_lo)*nocc+i,
+     &                    (a-oseg_lo)*nocc+i,Kia,lnvv,nbh_exch2)
+                     call ga_nbget(g_objv,1+lnoov+(i-1)*lnov,
+     &                    lnoov+i*lnov,av,av,Tia,lnov,nbh_objv6)
+                     call ga_nbget(g_objv,1+2*lnoov+(i-1)*lnov,
+     &                    2*lnoov+i*lnov,av,av,Xia,lnov,nbh_objv7)
+
+!                     call dcopy(nvir,t1((i-1)*nvir+1),1,t1v2,1)
+                     t1v2(:) = t1((i-1)*nvir+1:i*nvir)
+                     call ga_nbwait(nbh_objv1) ! Dja
+!                     call dcopy(nvir,Dja(1+(i-1)*nvir),1,dintc1,1)
+                     dintc1(:) = Dja(1+(i-1)*nvir:i*nvir)
+                     call ga_nbwait(nbh_objv5) ! Djia
+!                     call dcopy(nvir,Djia,1,dintx1,1)
+                     dintx1(:) = Djia(1:nvir)
+
+                     do k=klo,min(khi,i)
+
+                        !call dcopy(nvir,t1((k-1)*nvir+1),1,t1v1,1)
+                        t1v1(:) = t1((k-1)*nvir+1:k*nvir)
+                        !call dcopy(nvir,Dja(1+(k-1)*nvir),1,dintc2,1)
+                        dintc2(:) = Dja(1+(k-1)*nvir:k*nvir)
+                        call ga_nbwait(nbh_objv4(k)) ! Djka
+                        !call dcopy(nvir,Djka(1+(k-klo)*nvir),1,dintx2,1)
+                        dintx2(:) = Djka(1+(k-klo)*nvir:(k-klo+1)*nvir)
+                        emp4i = 0.0d0
+                        emp5i = 0.0d0
+                        emp4k = 0.0d0
+                        emp5k = 0.0d0
+                        if (occsdps) then
+                           call pstat_on(ps_doxxx)
+                        else
+                           call qenter('doxxx',0)
+                        endif
+!
+!  These are the input dependencies for the DGEMM calls below.
+!  We wait on all of them here because GA is not even remotely thread-safe.
+!  All of these are independent of k, so we wait on them only
+!  at the first trip of the loop.
+!
+                        if (k.eq.klo) then
+                            call ga_nbwait(nbh_objv2)
+                            call ga_nbwait(nbh_objv3)
+                            call ga_nbwait(nbh_objv6)
+                            call ga_nbwait(nbh_objv7)
+                            call ga_nbwait(nbh_objo1)
+                            call ga_nbwait(nbh_objo2)
+                            call ga_nbwait(nbh_objo3)
+                            call ga_nbwait(nbh_objo4)
+                            call ga_nbwait(nbh_objo5)
+                            call ga_nbwait(nbh_objo6)
+                            call ga_nbwait(nbh_exch1)
+                            call ga_nbwait(nbh_exch2)
+                            call ga_nbwait(nbh_coul1)
+                            call ga_nbwait(nbh_coul2)
+                        endif
+
+                        nv4 = nvir ! no possibility of overflow
+                        no4 = nocc
+
+                        err = cublasDgemm_v2(handle(1),
+     &                        cu_op_n,cu_op_t,
+     &                        nv4,nv4,nv4,1.0d0,
+     &                        Jia,nv4,Tkj(1+(k-klo)*lnvv),nv4,0.0d0,
+     &                        f1n,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                        err = cublasDgemm_v2(handle(1),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,no4,-1.0d0,
+     &                        Tia,nv4,Kkj(1+(k-klo)*lnov),no4,1.0d0,
+     &                        f1n,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                       
+                        err = cublasDgemm_v2(handle(2),
+     &                        cu_op_n,cu_op_t,
+     &                        nv4,nv4,nv4,1.0d0,
+     &                        Kia,nv4,Tkj(1+(k-klo)*lnvv),nv4,0.0d0,
+     &                        f2n,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                        err = cublasDgemm_v2(handle(2),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,no4,-1.0d0,
+     &                        Xia,nv4,Kkj(1+(k-klo)*lnov),no4,1.0d0,
+     &                        f2n,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                       
+                        err = cublasDgemm_v2(handle(3),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,nv4,1.0d0,
+     &                        Jia,nv4,Tkj(1+(k-klo)*lnvv),nv4,0.0d0,
+     &                        f3n,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                        err = cublasDgemm_v2(handle(3),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,no4,-1.0d0,
+     &                        Tia,nv4,Jkj(1+(k-klo)*lnov),no4,1.0d0,
+     &                        f3n,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                       
+                        err = cublasDgemm_v2(handle(4),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,nv4,1.0d0,
+     &                        Kia,nv4,Tkj(1+(k-klo)*lnvv),nv4,0.0d0,
+     &                        f4n,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                        err = cublasDgemm_v2(handle(4),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,no4,-1.0d0,
+     &                        Xia,nv4,Jkj(1+(k-klo)*lnov),no4,1.0d0,
+     &                        f4n,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                       
+                        err = cublasDgemm_v2(handle(5),
+     &                        cu_op_n,cu_op_t,
+     &                        nv4,nv4,nv4,1.0d0,
+     &                        Jka(1+(k-klo)*lnvv),nv4,Tij,nv4,0.0d0,
+     &                        f1t,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                        err = cublasDgemm_v2(handle(5),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,no4,-1.0d0,
+     &                        Tka(1+(k-klo)*lnov),nv4,Kij,no4,1.0d0,
+     &                        f1t,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                       
+                        err = cublasDgemm_v2(handle(6),
+     &                        cu_op_n,cu_op_t,
+     &                        nv4,nv4,nv4,1.0d0,
+     &                        Kka(1+(k-klo)*lnvv),nv4,Tij,nv4,0.0d0,
+     &                        f2t,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                        err = cublasDgemm_v2(handle(6),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,no4,-1.0d0,
+     &                        Xka(1+(k-klo)*lnov),nv4,Kij,no4,1.0d0,
+     &                        f2t,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                       
+                        err = cublasDgemm_v2(handle(7),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,nv4,1.0d0,
+     &                        Jka(1+(k-klo)*lnvv),nv4,Tij,nv4,0.0d0,
+     &                        f3t,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                        err = cublasDgemm_v2(handle(7),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,no4,-1.0d0,
+     &                        Tka(1+(k-klo)*lnov),nv4,Jij,no4,1.0d0,
+     &                        f3t,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                       
+                        err = cublasDgemm_v2(handle(8),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,nv4,1.0d0,
+     &                        Kka(1+(k-klo)*lnvv),nv4,Tij,nv4,0.0d0,
+     &                        f4t,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+                        err = cublasDgemm_v2(handle(8),
+     &                        cu_op_n,cu_op_n,
+     &                        nv4,nv4,no4,-1.0d0,
+     &                        Xka(1+(k-klo)*lnov),nv4,Jij,no4,1.0d0,
+     &                        f4t,nv4)
+                        if (err.ne.0) then
+                          call errquit('cublasDgemm_v2',err,UNKNOWN_ERR)
+                        endif
+
+! this is necessary if OpenACC is not used below
+!                        err = cudaDeviceSynchronize()
+! all 8 streams must drain before the OpenACC loop reads f1n..f4t
+                        do shi=1,8
+                          err = cudaStreamSynchronize(stream(shi))
+                          if (err.ne.0) then
+                            call errquit('cudaStreamSynchronize',err,
+     &                                   UNKNOWN_ERR)
+                          endif
+                        end do
+
+                        if (occsdps) then
+                           call pstat_off(ps_doxxx)
+                           call pstat_on(ps_tengy)
+                        else
+                           call qexit('doxxx',0)
+                           call qenter('tengy',0)
+                        endif
+
+                        eaijk=eorb(a) - (  eorb(ncor+i)
+     &                                    +eorb(ncor+j)
+     &                                    +eorb(ncor+k) )
+#ifdef USE_YFLOP
+      flops_ycount = flops_ycount + nvir*nvir*(
+     D                       3 + 2*(
+     E                       12 +
+     E                       11 +
+     E                       11 ) +
+     5                       2*27 )
+#endif
+
+!$acc parallel loop collapse(2) private(denom) 
+!$acc&         reduction(+:emp4i,emp4k,emp5i,emp5k)
+           do b=1,nvir
+             do c=1,nvir
+                   denom=-1.0d0/( eorb(ncor+nocc+b)
+     &                           +eorb(ncor+nocc+c)+eaijk )
+                   emp4i=emp4i+denom*
+     &                  (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3n(b,c)+f4n(c,b))*
+     &                  (f1t(b,c)-2*f2t(b,c)-2*f3t(b,c)+f4t(b,c))
+     &                        -denom*
+     &                  (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3n(c,b))*
+     &                  (2*f1t(b,c)-f2t(b,c)-f3t(b,c)+2*f4t(b,c))
+     &                        +3*denom*(
+     &                  f1n(b,c)*(f1n(b,c)+f3n(c,b)+2*f4t(c,b))+
+     &                  f2n(b,c)*f2t(c,b)+f3n(b,c)*f4t(b,c))
+                   emp4k=emp4k+denom*
+     &                  (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3t(b,c)+f4t(c,b))*
+     &                  (f1n(b,c)-2*f2n(b,c)-2*f3n(b,c)+f4n(b,c))
+     &                        -denom*
+     &                  (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3t(c,b))*
+     &                  (2*f1n(b,c)-f2n(b,c)-f3n(b,c)+2*f4n(b,c))
+     &                        +3*denom*(
+     &                  f1t(b,c)*(f1t(b,c)+f3t(c,b)+2*f4n(c,b))+
+     &                  f2t(b,c)*f2n(c,b)+f3t(b,c)*f4n(b,c))
+                   emp5i=emp5i+denom*t1v1(b)*dintx1(c)*
+     &                 (    f1t(b,c)+f2n(b,c)+f4n(c,b)
+     &                  -2*(f3t(b,c)+f4n(b,c)+f2n(c,b)+
+     &                      f1n(b,c)+f2t(b,c)+f3n(c,b))
+     &                  +4*(f3n(b,c)+f4t(b,c)+f1n(c,b)))
+     &                        +denom*t1v1(b)*dintc1(c)*
+     &                 (     f1n(b,c)+f4n(b,c)+f1t(c,b)
+     &                   -2*(f2n(b,c)+f3n(b,c)+f2t(c,b)))
+                   emp5k=emp5k+denom*t1v2(b)*dintx2(c)*
+     &                 (    f1n(b,c)+f2t(b,c)+f4t(c,b)
+     &                  -2*(f3n(b,c)+f4t(b,c)+f2t(c,b)+
+     &                      f1t(b,c)+f2n(b,c)+f3t(c,b))
+     &                  +4*(f3t(b,c)+f4n(b,c)+f1t(c,b)))
+     &                        +denom*t1v2(b)*dintc2(c)*
+     &                 (     f1t(b,c)+f4t(b,c)+f1n(c,b)
+     &                   -2*(f2t(b,c)+f3t(b,c)+f2n(c,b)))
+             end do
+           end do
+                         if (occsdps) then
+                            call pstat_off(ps_tengy)
+                         else
+                            call qexit('tengy',0)
+                         endif
+
+                         emp4 = emp4 + emp4i
+                         emp5 = emp5 + emp5i
+                         if (i.ne.k) then
+                             emp4 = emp4 + emp4k
+                             emp5 = emp5 + emp5k
+                         end if ! (i.ne.k)
+                     end do    ! k
+                  end do       ! i
+                  if (iprt.gt.50)then
+                     write(6,1234)iam,a,j,emp4,emp5
+ 1234                format(' iam aijk',3i5,2e15.5)
+                  end if
+                  next=nxtask(nodes, 1)
+            if(ga_nodeid().eq.0) then
+               pct_progr=(a-(ncor+nocc)+((klo-1)/kchunk)*nvir)*n_progr/
+     /          ((nocc/kchunk)*nvir)+1
+               if(i_progr(pct_progr)) then
+                  i_progr(pct_progr)=.false.
+               write(6,4321) ' ccsd(t): done ',
+     A              a-(ncor+nocc)+((klo-1)/kchunk)*nvir,
+     O              ' out of ',(nocc/kchunk)*nvir,
+     O              ' progress: ',
+     O              ((a-(ncor+nocc)+((klo-1)/kchunk)*nvir)*100)/
+     D              ((nocc/kchunk)*nvir),
+     P            '%, Gflops=',(perfm_flop()-flopzero)/
+     D              (util_wallsec()-tzero),
+     P                 ' at ',(util_wallsec()-tzero),' secs'
+               call util_flush(6)
+ 4321          format(a,i8,a,i8,a,i3,a,1pg11.4,a,0pf10.1,a)
+               endif
+            endif
+               end if
+            end do
+         end do
+      end do
+      call ga_sync()
+      next=nxtask(-nodes, 1)
+      t_flops=util_wallsec()-tzero
+      agg_flops=perfm_flop()-flopzero
+      call ga_dgop(msg_cc_diis1,agg_flops,1, '+')
+      if(ga_nodeid().eq.0) then
+         write(6,4322) ' ccsd(t): 100% done, Aggregate Gflops=',
+     P        agg_flops/t_flops,
+     P                 ' in ',t_flops,' secs'
+ 4322    format(a,1pg11.4,a,0pf10.1,a)
+         call util_flush(6)
+      endif
+      call ga_sync()
+      if (occsdps) then
+         call pstat_off(ps_trpdrv)
+      else
+         call qexit('trpdrv',0)
+      endif
+!
+      deallocate( dintc1, stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('dintc1',11,MA_ERR)
+      deallocate( dintx1, stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('dintx1',12,MA_ERR)
+      deallocate( t1v1, stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('t1v1',13,MA_ERR)
+      deallocate( dintc2, stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('dintc2',14,MA_ERR)
+      deallocate( dintx2, stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('dintx2',15,MA_ERR)
+      deallocate( t1v2, stat=alloc_error)
+      if (alloc_error.ne.0) call errquit('t1v2',16,MA_ERR)
+!
+! CUDA stuff
+!
+      do shi=1,8
+        err = cublasDestroy(handle(shi))
+        if (err.ne.0) call errquit('cublasDestroy',err,UNKNOWN_ERR)
+        err = cudaStreamDestroy(stream(shi))
+        if (err.ne.0) call errquit('cudaStreamDestroy',err,UNKNOWN_ERR)
+      end do
+!
+      end
diff --git a/src/config/makefile.h b/src/config/makefile.h
index 73781e20a86..e1b2d86b255 100644
--- a/src/config/makefile.h
+++ b/src/config/makefile.h
@@ -2827,6 +2827,10 @@ else
     CORE_LIBS += $(BLASOPT)
 endif
 
+ifdef NWCHEM_LINK_CUDA
+CORE_LIBS += -stdpar -acc -gpu=managed -cuda -cudalib=cublas
+endif
+
 ifdef BLASOPT
 BLAS_SUPPLIED=Y
 endif
diff --git a/src/tools/GNUmakefile b/src/tools/GNUmakefile
index dfa715b99eb..3e495e274f0 100644
--- a/src/tools/GNUmakefile
+++ b/src/tools/GNUmakefile
@@ -630,6 +630,10 @@ ifndef ARMCI_NETWORK
     ARMCI_NETWORK=MPI-TS
     MAYBE_ARMCI = --with-mpi-ts
 endif
+# CUDA UM support
+ifdef NWCHEM_LINK_CUDA
+    MAYBE_ARMCI +=  --enable-cuda-mem
+endif
 #
 # Apparently weak bindings do not work with CYGWIN64 at the moment. There seems
 # to be an issue with the COFF object format that gets in the way (with ELF

From 9034ac7a73b43eea51b4986ad0c34a6b7183b446 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 14 May 2021 15:56:18 -0700
Subject: [PATCH 43/45] stdpar not required

---
 src/config/makefile.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/config/makefile.h b/src/config/makefile.h
index e1b2d86b255..f924f75db85 100644
--- a/src/config/makefile.h
+++ b/src/config/makefile.h
@@ -2828,7 +2828,7 @@ else
 endif
 
 ifdef NWCHEM_LINK_CUDA
-CORE_LIBS += -stdpar -acc -gpu=managed -cuda -cudalib=cublas
+CORE_LIBS += -acc -gpu=managed -cuda -cudalib=cublas
 endif
 
 ifdef BLASOPT

From 8047d5119d98342c9bf5d7745a18488fa4e2ec5a Mon Sep 17 00:00:00 2001
From: edoapra <edoardo.apra@gmail.com>
Date: Fri, 14 May 2021 17:27:12 -0700
Subject: [PATCH 44/45] update for building on cray

---
 src/libext/scalapack/build_scalapa.sh | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/libext/scalapack/build_scalapa.sh b/src/libext/scalapack/build_scalapa.sh
index 90016eeda86..6932d298f1a 100755
--- a/src/libext/scalapack/build_scalapa.sh
+++ b/src/libext/scalapack/build_scalapa.sh
@@ -17,8 +17,19 @@ get_cmake38(){
 	fi
 
 }
+MPICC=mpicc
 if [[ "$FC" = "ftn"  ]] ; then
     MPIF90="ftn"
+    # ugly hack to get mpicc on cray
+    if [[  -z "${INTEL_PATH}" ]]; then
+	echo
+	echo Intel compilers not loaded
+	echo please execute "module load intel" for building Scalapack
+	echo
+	exit 1
+    else
+	MPICC=$INTEL_PATH/linux/mpi/intel64/bin/mpicc
+    fi
 else
     if ! [ -x "$(command -v mpif90)" ]; then
 	echo
@@ -137,7 +148,7 @@ fi
 #    Fortran_FLAGS+=-I"$NWCHEM_TOP"/src/libext/include
 #fi
 #fix for clang 12 error in implicit-function-declaration
-GOTCLANG=$( mpicc -dM -E - </dev/null 2> /dev/null |grep __clang__|head -1|cut -c19)
+GOTCLANG=$( "$MPICC" -dM -E - </dev/null 2> /dev/null |grep __clang__|head -1|cut -c19)
 if [[ ${GOTCLANG} == "1" ]] ; then
     C_FLAGS=" -Wno-error=implicit-function-declaration "
 fi
@@ -153,8 +164,15 @@ if [[  "$SCALAPACK_SIZE" == 8 ]] ; then
     fi
     C_FLAGS+=" -DInt=long"
 fi
-echo compiling with CC=mpicc  FC=$MPIF90 CFLAGS="$C_FLAGS" FFLAGS="$Fortran_FLAGS" $CMAKE -Wno-dev ../ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_FLAGS="$C_FLAGS"  -DCMAKE_Fortran_FLAGS="$Fortran_FLAGS" -DTEST_SCALAPACK=OFF  -DBUILD_TESTING=OFF -DBUILD_SHARED_LIBS=OFF  -DBLAS_openblas_LIBRARY="$BLASOPT"  -DBLAS_LIBRARIES="$BLASOPT"  -DLAPACK_openblas_LIBRARY="$BLASOPT"  -DLAPACK_LIBRARIES="$BLASOPT" 
-CC=mpicc  FC=$MPIF90 CFLAGS="$C_FLAGS" FFLAGS="$Fortran_FLAGS" $CMAKE -Wno-dev ../ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_FLAGS="$C_FLAGS"  -DCMAKE_Fortran_FLAGS="$Fortran_FLAGS" -DTEST_SCALAPACK=OFF  -DBUILD_TESTING=OFF -DBUILD_SHARED_LIBS=OFF  -DBLAS_openblas_LIBRARY="$BLASOPT"  -DBLAS_LIBRARIES="$BLASOPT"  -DLAPACK_openblas_LIBRARY="$BLASOPT"  -DLAPACK_LIBRARIES="$BLASOPT" 
+if [[ "$CRAY_CPU_TARGET" == "mic-knl" ]]; then
+    module swap craype-mic-knl craype-haswell
+    KNL_SWAP=1
+fi
+echo compiling with CC="$MPICC"  FC=$MPIF90 CFLAGS="$C_FLAGS" FFLAGS="$Fortran_FLAGS" $CMAKE -Wno-dev ../ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_FLAGS="$C_FLAGS"  -DCMAKE_Fortran_FLAGS="$Fortran_FLAGS" -DTEST_SCALAPACK=OFF  -DBUILD_TESTING=OFF -DBUILD_SHARED_LIBS=OFF  -DBLAS_openblas_LIBRARY="$BLASOPT"  -DBLAS_LIBRARIES="$BLASOPT"  -DLAPACK_openblas_LIBRARY="$BLASOPT"  -DLAPACK_LIBRARIES="$BLASOPT"
+CC="$MPICC"  FC=$MPIF90 CFLAGS="$C_FLAGS" FFLAGS="$Fortran_FLAGS" $CMAKE -Wno-dev ../ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_FLAGS="$C_FLAGS"  -DCMAKE_Fortran_FLAGS="$Fortran_FLAGS" -DTEST_SCALAPACK=OFF  -DBUILD_TESTING=OFF -DBUILD_SHARED_LIBS=OFF  -DBLAS_openblas_LIBRARY="$BLASOPT"  -DBLAS_LIBRARIES="$BLASOPT"  -DLAPACK_openblas_LIBRARY="$BLASOPT"  -DLAPACK_LIBRARIES="$BLASOPT"
 make V=0 -j3 scalapack/fast
 mkdir -p ../../../lib
 cp lib/libscalapack.a ../../../lib/libnwc_scalapack.a
+if [[ "$KNL_SWAP" == "1" ]]; then
+    module swap  craype-haswell craype-mic-knl
+fi

From 760e87875134392cf1aba3e2ebcaced1d90fbaa0 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Fri, 14 May 2021 18:39:18 -0700
Subject: [PATCH 45/45] fix bug: openacc was not initialized

---
 src/ccsd/aoccsd2.F | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ccsd/aoccsd2.F b/src/ccsd/aoccsd2.F
index 87e258f12cd..82516c0f0e0 100644
--- a/src/ccsd/aoccsd2.F
+++ b/src/ccsd/aoccsd2.F
@@ -720,7 +720,7 @@ subroutine ccsd_iterdrv2(rtdb,basis,nsh,ncor,nocc,nvir,nact,nbf,
      2    use_trpdrv_omp_mp=.false.
       if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_openacc', mt_log, 1,
      1                   use_trpdrv_openacc))
-     2    use_trpdrv_offload=.false.
+     2    use_trpdrv_openacc=.false.
       if (.not. rtdb_get(rtdb, 'ccsd:use_trpdrv_offload', mt_log, 1,
      1                   use_trpdrv_offload))
      2    use_trpdrv_offload=.false.