diff --git a/examples/pbc/27-multigrid.py b/examples/pbc/27-multigrid.py
index f1b1f85a95..6809f33e3d 100644
--- a/examples/pbc/27-multigrid.py
+++ b/examples/pbc/27-multigrid.py
@@ -31,9 +31,9 @@
 #
 # There are two ways to enable multigrid numerical integration
 #
-# Method 1: use multigrid.multigrid function to update SCF object
+# Method 1: use multigrid.multigrid_fftdf function to update SCF object
 #
-mf = multigrid.multigrid(mf)
+mf = multigrid.multigrid_fftdf(mf)
 mf.kernel()
 
 #
diff --git a/examples/pbc/27-multigrid2.py b/examples/pbc/27-multigrid2.py
new file mode 100644
index 0000000000..d73cd8fe50
--- /dev/null
+++ b/examples/pbc/27-multigrid2.py
@@ -0,0 +1,238 @@
+#from os.path import expanduser
+#home_dir = expanduser("~")
+#f = open(home_dir+'/.pyscf_conf.py', 'a')
+# Use FFTW as the FFT engine; this requires building PySCF with the FFTW library enabled:
+# cmake -DENABLE_FFTW=ON -DBUILD_FFTW=ON
+#f.write('pbc_tools_pbc_fft_engine=\'FFTW\'')
+#f.close()
+
+import numpy
+import pyscf
+from pyscf import lib
+from pyscf import pbc
+from pyscf.pbc import gto as pbcgto
+from pyscf.pbc import dft as pbcdft
+from pyscf.pbc.dft import multigrid
+
+cell=pbcgto.Cell()
+
+# Geometry: 64 water molecules (64 O + 128 H atoms listed below) in a cubic box of edge `boxlen`
+boxlen=12.4138  # NOTE(review): presumably Angstrom, since no `unit` is set on the cell -- confirm
+cell.a=numpy.array([[boxlen,0.0,0.0],[0.0,boxlen,0.0],[0.0,0.0,boxlen]])
+cell.atom="""
+O      12.235322       1.376642      10.869880
+O       6.445390       3.706940       8.650794
+O       0.085977       2.181322       8.276663
+O      12.052554       2.671366       2.147199
+O      12.250036       4.190930      12.092014
+O       7.187422       0.959062       4.733469
+O       8.346457       7.210040       4.667644
+O      12.361546      11.527875       8.106887
+O       3.299984       4.440816       9.193275
+O       2.855829       3.759909       6.552815
+O       1.392494       6.362753       0.586172
+O       1.858645       8.694013       2.068738
+O       3.770231      12.094519       8.652183
+O       6.432508       3.669828       2.772418
+O       1.998724       1.820217       4.876440
+O       8.248581       2.404730       6.931303
+O       5.753814       3.360029      12.461534
+O      11.322212       5.649239       2.236798
+O       4.277318       2.113956      10.590808
+O       5.405015       3.349247       5.484702
+O       6.493278      11.869958       0.684912
+O       3.275250       2.346576       2.425241
+O       7.981003       6.352512       7.507970
+O       5.985990       6.512854      12.194648
+O      10.636714      11.856872      12.209540
+O       9.312283       3.670384       3.508594
+O       1.106885       5.830301       6.638695
+O       8.008007       3.326363      10.869818
+O      12.403000       9.687405      11.761901
+O       4.219782       7.085315       8.153470
+O       3.781557       8.203821      11.563272
+O      11.088898       4.532081       7.809475
+O      10.387548       8.408890       1.017882
+O       1.979016       6.418091      10.374159
+O       4.660547       0.549666       5.617403
+O       8.745880      12.256257       8.089383
+O       2.662041      10.489890       0.092980
+O       7.241661      10.471815       4.226946
+O       2.276827       0.276647      10.810417
+O       8.887733       0.946877       1.333885
+O       1.943554       8.088552       7.567650
+O       9.667942       8.056759       9.868847
+O      10.905491       8.339638       6.484782
+O       3.507733       4.862402       1.557439
+O       8.010457       8.642846      12.055969
+O       8.374446      10.035932       6.690309
+O       5.635247       6.076875       5.563993
+O      11.728434       1.601906       5.079475
+O       9.771134       9.814114       3.548703
+O       3.944355      10.563450       4.687536
+O       0.890357       6.382287       4.065806
+O       6.862447       6.425182       2.488202
+O       3.813963       6.595122       3.762649
+O       6.562448       8.295463       8.807182
+O       9.809455       0.143325       3.886553
+O       4.117074      11.661225       2.221679
+O       5.295317       8.735561       2.763183
+O       9.971999       5.379339       5.340378
+O      12.254708       8.643874       3.957116
+O       2.344274      10.761274       6.829162
+O       7.013416       0.643488      10.518797
+O       5.152349      10.233624      10.359388
+O      11.184278       5.884064      10.298279
+O      12.252335       8.974142       9.070831
+H      12.415139       2.233125      11.257611
+H      11.922476       1.573799       9.986994
+H       5.608192       3.371543       8.971482
+H       6.731226       3.060851       8.004962
+H      -0.169205       1.565594       7.589645
+H      -0.455440       2.954771       8.118939
+H      12.125168       2.826463       1.205443
+H      12.888828       2.969761       2.504745
+H      11.553255       4.386613      11.465566
+H      12.818281       4.960808      12.067151
+H       7.049495       1.772344       4.247898
+H       6.353019       0.798145       5.174047
+H       7.781850       7.384852       5.420566
+H       9.103203       6.754017       5.035898
+H      12.771232      11.788645       8.931744
+H      12.018035      10.650652       8.276334
+H       3.557245       3.792529       9.848846
+H       2.543844       4.884102       9.577958
+H       2.320235       4.521250       6.329813
+H       2.872128       3.749963       7.509824
+H       1.209685       7.121391       1.140501
+H       2.238885       6.038801       0.894245
+H       2.763109       8.856353       2.336735
+H       1.329379       9.047369       2.783755
+H       4.315639      11.533388       9.203449
+H       3.098742      12.433043       9.244412
+H       5.987369       3.448974       3.590530
+H       5.813096       3.419344       2.086985
+H       1.057126       1.675344       4.969379
+H       2.248496       2.292119       5.670892
+H       8.508264       1.653337       7.464411
+H       8.066015       2.034597       6.067646
+H       5.197835       2.915542      11.821572
+H       6.630900       3.329981      12.079371
+H      10.788986       6.436672       2.127933
+H      11.657923       5.463602       1.359832
+H       3.544476       1.634958      10.977765
+H       4.755770       1.455054      10.087655
+H       4.465371       3.375459       5.665294
+H       5.682663       4.264430       5.524498
+H       6.174815      11.778676       1.582954
+H       5.713640      12.089924       0.174999
+H       3.476076       1.498708       2.028983
+H       2.730229       2.134295       3.182949
+H       7.119624       5.936450       7.474030
+H       8.536492       5.799405       6.958665
+H       5.909499       5.717477      11.667621
+H       6.125402       6.196758      13.087330
+H      11.203499      12.513536      11.804844
+H      10.260930      12.300153      12.970145
+H       9.985036       3.927685       2.878172
+H       8.545584       3.468329       2.972331
+H       1.399882       6.620092       7.093246
+H       0.963561       6.112523       5.735345
+H       8.067363       3.674002       9.979955
+H       8.000737       2.375959      10.756190
+H      11.821629      10.402510      12.020482
+H      12.206854       8.983242      12.379892
+H       3.461473       7.606485       7.889688
+H       3.844478       6.304711       8.560946
+H       3.179884       7.585614      11.148494
+H       4.401957       7.652030      12.039573
+H      11.573777       5.053211       7.169515
+H      10.342076       4.186083       7.320831
+H      10.065640       8.919194       1.760981
+H       9.629585       8.322499       0.439729
+H       1.396302       6.546079       9.625630
+H       1.405516       6.479759      11.138049
+H       4.024008       1.232518       5.405828
+H       4.736858       0.579881       6.571077
+H       9.452293      12.313381       8.732772
+H       8.976559      11.502788       7.545965
+H       1.834701      10.012311       0.153462
+H       3.295197       9.836403      -0.204175
+H       7.056724      11.401702       4.095264
+H       6.499038      10.020287       3.825865
+H       1.365541       0.487338      11.013887
+H       2.501591      -0.428131      11.417871
+H       8.644279       1.812362       1.005409
+H       8.142674       0.388030       1.112955
+H       1.272659       8.365063       8.191888
+H       2.142485       8.877768       7.063867
+H       8.961493       7.826192       9.265523
+H       9.227102       8.487654      10.601118
+H      10.150144       7.758934       6.392768
+H      10.596082       9.187988       6.167290
+H       3.463106       4.096188       2.129414
+H       3.919461       4.539801       0.755791
+H       7.418998       9.394959      12.028876
+H       7.430413       7.883095      12.106546
+H       7.972905      10.220334       5.841196
+H       7.675111       9.631498       7.203725
+H       5.332446       6.381336       6.419473
+H       5.000025       6.434186       4.943466
+H      11.575078       2.271167       4.412540
+H      11.219802       0.847030       4.783357
+H       8.865342       9.721516       3.843998
+H      10.000732      10.719285       3.758898
+H       3.186196      10.476397       5.265333
+H       4.407331      11.335128       5.013723
+H       0.558187       7.255936       3.859331
+H       0.341672       5.789383       3.552346
+H       7.459933       6.526049       3.229193
+H       6.696228       5.483739       2.440372
+H       3.864872       6.313007       2.849385
+H       2.876419       6.621201       3.953862
+H       5.631529       8.079145       8.753997
+H       7.003296       7.568245       8.367822
+H       9.615413       0.527902       3.031755
+H       8.962985       0.109366       4.332162
+H       3.825854      11.139182       1.474087
+H       4.063988      11.063232       2.967211
+H       5.784391       7.914558       2.708486
+H       4.780461       8.655167       3.566110
+H      10.880659       5.444664       5.046607
+H       9.593331       4.687991       4.797350
+H      11.562317       8.960134       3.376765
+H      11.926084       8.816948       4.839320
+H       2.856874      11.297981       7.433660
+H       1.492332      11.195517       6.786033
+H       7.145820       0.090200       9.749009
+H       7.227275       0.077690      11.260665
+H       4.662021       9.538430      10.798155
+H       5.994537       9.833472      10.142985
+H      10.544299       6.595857      10.301445
+H      11.281750       5.653082       9.374494
+H      12.103020       8.841164      10.006916
+H      11.491592       8.576221       8.647557
+"""
+cell.basis = 'gth-tzv2p'  # basis set name; used together with the 'gth-pade' pseudopotential below
+cell.ke_cutoff = 200  # kinetic energy cutoff in a.u.
+cell.max_memory = 8000 # in MB
+cell.precision = 1e-6 # integral precision
+cell.pseudo = 'gth-pade'
+cell.verbose = 4
+cell.use_loose_rcut = True # integral screening based on shell radii
+cell.use_particle_mesh_ewald = True # use particle mesh ewald for nuclear repulsion
+cell.build()
+# Optional: build a supercell by replicating the unit cell:  cell = pbc.tools.super_cell(cell, [1,2,2])
+
+mf=pbcdft.RKS(cell)
+# Alternative functional:  mf.xc = "LDA, VWN"
+mf.xc = "PBE,PBE"
+mf.init_guess = 'atom' # atom guess is fast
+mf.with_df = multigrid.MultiGridFFTDF2(cell)  # replace the default density-fitting object with the multigrid one
+mf.with_df.ngrids = 4 # number of sets of grid points
+mf.kernel()
+
+# Nuclear gradients of the converged RKS calculation above
+from pyscf.pbc.grad import rks as rks_grad
+grad = rks_grad.Gradients(mf)
+g = grad.kernel()
diff --git a/pyscf/gto/mole.py b/pyscf/gto/mole.py
index 4e06980ffb..28d8fd444d 100644
--- a/pyscf/gto/mole.py
+++ b/pyscf/gto/mole.py
@@ -61,6 +61,7 @@
 NUC_MOD_OF = 2
 PTR_ZETA   = 3
 PTR_FRAC_CHARGE = 4
+PTR_RADIUS = 5
 ATM_SLOTS  = 6
 ATOM_OF    = 0
 ANG_OF     = 1
@@ -2412,6 +2413,15 @@ def ms(self, x):
         else:
             self.spin = int(round(2*x, 4))
 
+    @property
+    def enuc(self):
+        '''Nuclear repulsion energy, computed lazily with energy_nuc() and cached until reset to None.'''
+        if getattr(self, '_enuc', None) is None:  # also covers objects on which _enuc was never assigned
+            self._enuc = self.energy_nuc()
+        return self._enuc
+    @enuc.setter
+    def enuc(self, enuc):  # assign None to invalidate the cache (build() and set_geom_() do this)
+        self._enuc = enuc
 
     copy = copy
 
@@ -2576,6 +2586,9 @@ def build(self, dump_input=True, parse_arg=ARGPARSE,
             # number of electrons are consistent.
             self.nelec
 
+        # reset nuclear energy
+        self.enuc = None
+
         if not self.magmom:
             self.magmom = [0,] * self.natm
         elif len(self.magmom) != self.natm:
@@ -2784,7 +2797,7 @@ def dump_input(self):
 
         if self.verbose >= logger.INFO:
             self.stdout.write('\n')
-            logger.info(self, 'nuclear repulsion = %.15g', self.energy_nuc())
+            logger.info(self, 'nuclear repulsion = %.15g', self.enuc)
             if self.symmetry:
                 if self.topgroup == self.groupname:
                     logger.info(self, 'point group symmetry = %s', self.topgroup)
@@ -3050,6 +3063,9 @@ def set_geom_(self, atoms_or_coords, unit=None, symmetry=None,
             mol.symmetry = symmetry
             mol.build(False, False)
 
+        # reset nuclear energy
+        mol.enuc = None
+
         if mol.verbose >= logger.INFO:
             logger.info(mol, 'New geometry')
             for ia, atom in enumerate(mol._atom):
@@ -3542,7 +3558,9 @@ def intor_by_shell(self, intor, shells, comp=None, grids=None):
 
     eval_ao = eval_gto = eval_gto
 
-    energy_nuc = get_enuc = energy_nuc
+    energy_nuc = energy_nuc
+    def get_enuc(self, *args, **kwargs):  # bare call returns the cached self.enuc; any arguments are forwarded to energy_nuc, as the former alias did
+        return self.energy_nuc(*args, **kwargs) if (args or kwargs) else self.enuc
 
     def get_ao_indices(self, bas_list, ao_loc=None):
         '''
diff --git a/pyscf/gto/moleintor.py b/pyscf/gto/moleintor.py
index e3d661f1e0..4c6a4ce8cf 100644
--- a/pyscf/gto/moleintor.py
+++ b/pyscf/gto/moleintor.py
@@ -429,6 +429,7 @@ def _get_intor_and_comp(intor_name, comp=None):
     'int2c2e_ip1ip2'            : (9, 9),
     'int2c2e_ipip1'             : (9, 9),
     'int3c1e'                   : (1, 1),
+    'int3c1e_ip1'               : (3, 3),
     'int3c1e_p2'                : (1, 1),
     'int3c1e_iprinv'            : (3, 3),
     'int2c2e'                   : (1, 1),
diff --git a/pyscf/lib/CMakeLists.txt b/pyscf/lib/CMakeLists.txt
index 1dc076da21..4b7236535e 100644
--- a/pyscf/lib/CMakeLists.txt
+++ b/pyscf/lib/CMakeLists.txt
@@ -136,6 +136,9 @@ else ()
   set(CMAKE_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/deps/lib:\$ORIGIN/deps/lib64")
 endif ()
 
+option(ENABLE_FFTW "Using fftw3" OFF)
+option(BUILD_FFTW "Building fftw3" OFF)
+
 add_subdirectory(np_helper)
 add_subdirectory(gto)
 add_subdirectory(vhf)
@@ -198,6 +201,12 @@ option(ENABLE_XCFUN "Using xcfun for XC functional library" ON)
 option(BUILD_LIBXC "Download and build libxc library" ON)
 option(BUILD_XCFUN "Download and build xcfun library" ON)
 
+option(ENABLE_LIBXSMM "Using libxsmm" OFF)  # compile the dft library with -DHAVE_LIBXSMM and link against xsmm
+option(BUILD_LIBXSMM "Building libxsmm" OFF)  # additionally download and build libxsmm as an external project
+if(APPLE)  # libxsmm support is forced off on Apple platforms regardless of the option above
+    set(ENABLE_LIBXSMM OFF)
+endif()
+
 if(NOT DISABLE_DFT)
 add_subdirectory(dft)
 
@@ -237,8 +246,39 @@ if(ENABLE_XCFUN AND BUILD_XCFUN)
   add_dependencies(xcfun_itrf libxcfun)
   add_dependencies(dft libxcfun)
 endif() # ENABLE_XCFUN
+
+if(ENABLE_LIBXSMM AND BUILD_LIBXSMM)  # download and build libxsmm only when both options are ON
+  if(NOT EXISTS "${PROJECT_SOURCE_DIR}/deps/include/libxsmm.h")  # skip when the header is already present under deps/
+    ExternalProject_Add(libxsmm
+      GIT_REPOSITORY https://github.com/hfp/libxsmm.git
+      GIT_TAG 1.17
+      PREFIX ${PROJECT_BINARY_DIR}/deps
+      INSTALL_DIR ${PROJECT_SOURCE_DIR}/deps
+      CONFIGURE_COMMAND ""  # no configure step; the build is driven by make alone
+      BUILD_IN_SOURCE True
+      BUILD_COMMAND make -j4 PREFIX=<INSTALL_DIR> CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} STATIC=0 MALLOC=0 INTRINSICS=2 install
+      INSTALL_COMMAND ""  # nothing left to do: BUILD_COMMAND already runs the `install` target
+    )
+    add_dependencies(dft libxsmm)  # libxsmm must be built before the dft target
+  endif()
+endif()
 endif() # DISABLE_DFT
 
+if(ENABLE_FFTW AND BUILD_FFTW)  # download and build FFTW only when both options are ON
+#  Disabled guard (libfftw3 is declared even when the header already exists):  if(NOT EXISTS "${PROJECT_SOURCE_DIR}/deps/include/fftw3.h")
+    ExternalProject_Add(libfftw3
+      URL https://www.fftw.org/fftw-3.3.10.tar.gz
+      PREFIX ${PROJECT_BINARY_DIR}/deps
+      INSTALL_DIR ${PROJECT_SOURCE_DIR}/deps
+      BUILD_IN_SOURCE True
+      CONFIGURE_COMMAND ./configure --enable-static=no --enable-shared=yes --enable-threads CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} prefix=<INSTALL_DIR>
+      BUILD_COMMAND make -j4 install  # builds and installs into INSTALL_DIR in one step
+    )
+    add_dependencies(fft libfftw3)  # FFTW must be built before the fft target
+    add_dependencies(pbc libfftw3)  # ... and before the pbc target
+#  endif()  (closes the disabled guard above)
+endif()
+
 if(EXISTS "${PROJECT_SOURCE_DIR}/cmake.user.inc")
   include("${PROJECT_SOURCE_DIR}/cmake.user.inc")
 endif()
diff --git a/pyscf/lib/dft/CMakeLists.txt b/pyscf/lib/dft/CMakeLists.txt
index 6b01b7eca0..c7263183c8 100644
--- a/pyscf/lib/dft/CMakeLists.txt
+++ b/pyscf/lib/dft/CMakeLists.txt
@@ -15,14 +15,19 @@
 add_library(dft SHARED
   CxLebedevGrid.c grid_basis.c nr_numint.c r_numint.c
   numint_uniform_grid.c xc_deriv.c nr_numint_sparse.c
-  )
-add_dependencies(dft cgto cvhf np_helper)
+  multigrid.c grid_common.c grid_collocate.c grid_integrate.c utils.c
+)
+add_dependencies(dft cgto cvhf np_helper pbc)
 
 set_target_properties(dft PROPERTIES
   LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
 
-target_link_libraries(dft cvhf cgto cint np_helper ${BLAS_LIBRARIES} ${OPENMP_C_PROPERTIES})
-
+if(ENABLE_LIBXSMM)  # with libxsmm: define HAVE_LIBXSMM for the C sources and add xsmm to the link line
+  add_definitions(-DHAVE_LIBXSMM)
+  target_link_libraries(dft cvhf cgto cint np_helper pbc xsmm ${BLAS_LIBRARIES} ${OPENMP_C_PROPERTIES})
+else()  # without libxsmm: same link line minus xsmm
+  target_link_libraries(dft cvhf cgto cint np_helper pbc ${BLAS_LIBRARIES} ${OPENMP_C_PROPERTIES})
+endif()
 
 if(ENABLE_LIBXC)
 add_library(xc_itrf SHARED libxc_itrf.c)
@@ -37,4 +42,3 @@ set_target_properties(xcfun_itrf PROPERTIES
   LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
 target_link_libraries(xcfun_itrf xcfun ${OPENMP_C_PROPERTIES})
 endif()
-
diff --git a/pyscf/lib/dft/grid_collocate.c b/pyscf/lib/dft/grid_collocate.c
new file mode 100644
index 0000000000..33842191d3
--- /dev/null
+++ b/pyscf/lib/dft/grid_collocate.c
@@ -0,0 +1,655 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <complex.h>
+#include "config.h"
+#include "vhf/fblas.h"
+#include "np_helper/np_helper.h"
+#include "dft/multigrid.h"
+#include "dft/grid_common.h"
+#include "dft/utils.h"
+
+#define MAX_THREADS     256
+#define PTR_RADIUS        5
+
+static void transform_dm(double* dm_cart, double* dm,
+                         double* ish_contr_coeff, double* jsh_contr_coeff,
+                         int* ish_ao_loc, int* jsh_ao_loc,
+                         int* ish_bas, int* jsh_bas, int ish, int jsh,
+                         int ish0, int jsh0, int naoj, double* cache)
+{   /* Transform the (ish, jsh) block of dm with the two per-shell coefficient matrices and store it in dm_cart. */
+    int i0 = ish_ao_loc[ish] - ish_ao_loc[ish0];    /* AO offsets of the two shells, relative to the */
+    int i1 = ish_ao_loc[ish+1] - ish_ao_loc[ish0];  /* first shell of each slice (ish0 / jsh0) */
+    int j0 = jsh_ao_loc[jsh] - jsh_ao_loc[jsh0];
+    int j1 = jsh_ao_loc[jsh+1] - jsh_ao_loc[jsh0];
+    /* nrow x ncol block of dm starting at (i0, j0); naoj is the row stride of dm */
+    int nrow = i1 - i0;
+    int ncol = j1 - j0;
+    double* pdm = dm + ((size_t)naoj) * i0 + j0;
+    /* number of primitive Cartesian functions per shell: nprim * ncart(l) */
+    int l_i = ish_bas[ANG_OF+ish*BAS_SLOTS];
+    int ncart_i = _LEN_CART[l_i];
+    int nprim_i = ish_bas[NPRIM_OF+ish*BAS_SLOTS];
+    int nao_i = nprim_i*ncart_i;
+    int l_j = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+    int ncart_j = _LEN_CART[l_j];
+    int nprim_j = jsh_bas[NPRIM_OF+jsh*BAS_SLOTS];
+    int nao_j = nprim_j*ncart_j;
+    /* NOTE(review): the dgemm calls below are assumed to follow the column-major BLAS dgemm argument order -- confirm */
+    const char TRANS_T = 'T';
+    const char TRANS_N = 'N';
+    const double D1 = 1;
+    const double D0 = 0;
+    // Overall: einsum("pi,ij,qj->pq", coeff_i, dm, coeff_j); step 1 applies coeff_j into cache, step 2 applies coeff_i
+    dgemm_wrapper(TRANS_T, TRANS_N, nao_j, nrow, ncol,
+           D1, jsh_contr_coeff, ncol, pdm, naoj, D0, cache, nao_j);
+    dgemm_wrapper(TRANS_N, TRANS_N, nao_j, nao_i, nrow,
+           D1, cache, nao_j, ish_contr_coeff, nrow, D0, dm_cart, nao_j);
+}
+
+
+static void add_rho_submesh(double* rho, double* pqr,
+                            int* mesh_lb, int* mesh_ub, int* submesh_lb,
+                            const int* mesh, const int* submesh)
+{   /* Add a box of the local buffer pqr (origin submesh_lb) onto the box [mesh_lb, mesh_ub) of rho. */
+    const int x0 = mesh_lb[0];
+    const int y0 = mesh_lb[1];
+    const int z0 = mesh_lb[2];
+    /* extent of the box along each axis, in grid points */
+    const int nx = mesh_ub[0] - x0;
+    const int ny = mesh_ub[1] - y0;
+    const int nz = mesh_ub[2] - z0;
+    /* where the same box starts inside pqr */
+    const int x0_sub = submesh_lb[0];
+    const int y0_sub = submesh_lb[1];
+    const int z0_sub = submesh_lb[2];
+    /* both arrays are indexed with z fastest: offset = (ix * n_y + iy) * n_z + iz */
+    const size_t mesh_yz = ((size_t) mesh[1]) * mesh[2];
+    const size_t submesh_yz = ((size_t) submesh[1]) * submesh[2];
+
+    int ix, iy, iz;
+    for (ix = 0; ix < nx; ix++) {
+        double* __restrict ptr_rho = rho + (ix + x0) * mesh_yz + y0 * mesh[2] + z0;
+        double* __restrict ptr_pqr = pqr + (ix + x0_sub) * submesh_yz + y0_sub * submesh[2] + z0_sub;
+        for (iy = 0; iy < ny; iy++) {
+            #pragma omp simd
+            for (iz = 0; iz < nz; iz++) {
+                ptr_rho[iz] += ptr_pqr[iz];
+            }
+            ptr_rho += mesh[2];      /* advance one y-row in each array */
+            ptr_pqr += submesh[2];
+        }
+    }
+}
+
+
+static void _orth_rho(double *rho, double *dm_xyz,
+                      double fac, int topl,
+                      int *mesh, int *grid_slice,
+                      double *xs_exp, double *ys_exp, double *zs_exp,
+                      double *cache)
+{   /* Expand dm_xyz (l1^3 coefficients, l1 = topl + 1) on the grid box grid_slice, scaled by fac, and add it to rho. */
+    const int l1 = topl + 1;
+    const int l1l1 = l1 * l1;
+    const int nx0 = grid_slice[0];  /* grid_slice = {x0, x1, y0, y1, z0, z1}: half-open index ranges */
+    const int nx1 = grid_slice[1];
+    const int ny0 = grid_slice[2];
+    const int ny1 = grid_slice[3];
+    const int nz0 = grid_slice[4];
+    const int nz1 = grid_slice[5];
+    const int ngridx = nx1 - nx0;
+    const int ngridy = ny1 - ny0;
+    const int ngridz = nz1 - nz0;
+    if (ngridx == 0 || ngridy == 0 || ngridz == 0) {
+        return;  /* empty box: nothing to add */
+    }
+    /* NOTE(review): the dgemm calls below are assumed to follow the column-major BLAS dgemm argument order -- confirm */
+    const char TRANS_N = 'N';
+    const char TRANS_T = 'T';
+    const double D0 = 0;
+    const double D1 = 1;
+    const int xcols = ngridy * ngridz;
+    double *xyr = cache;                /* l1l1 * ngridz doubles */
+    double *xqr = xyr + l1l1 * ngridz;  /* l1 * xcols doubles */
+    double *pqr = xqr + l1 * xcols;     /* densities on the local box; the last dgemm writes ngridx * xcols values */
+    int ix, iy, iz, l;
+    /* step 1: contract one polynomial index of dm_xyz with zs_exp (also applies fac) */
+    dgemm_wrapper(TRANS_N, TRANS_N, ngridz, l1l1, l1,
+                  fac, zs_exp, ngridz, dm_xyz, l1,
+                  D0, xyr, ngridz);
+    for (l = 0; l <= topl; l++) {  /* step 2: contract with ys_exp, one slab of xyr per l */
+        dgemm_wrapper(TRANS_N, TRANS_T, ngridz, ngridy, l1,
+                      D1, xyr+l*l1*ngridz, ngridz, ys_exp, ngridy,
+                      D0, xqr+l*xcols, ngridz);
+    }
+    dgemm_wrapper(TRANS_N, TRANS_T, xcols, ngridx, l1,
+                  D1, xqr, xcols, xs_exp, ngridx,
+                  D0, pqr, xcols);  /* step 3: contract the remaining index with xs_exp */
+    /* Scatter pqr into rho piece by piece; start indices are wrapped into [0, mesh) with modulo(). */
+    const int submesh[3] = {ngridx, ngridy, ngridz};
+    int lb[3], ub[3];
+    for (ix = 0; ix < ngridx;) {
+        lb[0] = modulo(ix + nx0, mesh[0]);
+        ub[0] = get_upper_bound(lb[0], mesh[0], ix, ngridx);  /* presumably the end of the piece starting at lb -- confirm */
+        for (iy = 0; iy < ngridy;) {
+            lb[1] = modulo(iy + ny0, mesh[1]);
+            ub[1] = get_upper_bound(lb[1], mesh[1], iy, ngridy);
+            for (iz = 0; iz < ngridz;) {
+                lb[2] = modulo(iz + nz0, mesh[2]);
+                ub[2] = get_upper_bound(lb[2], mesh[2], iz, ngridz);
+                int lb_sub[3] = {ix, iy, iz};  /* origin of this piece inside pqr */
+                add_rho_submesh(rho, pqr, lb, ub, lb_sub, mesh, submesh);
+                iz += ub[2] - lb[2];  /* advance by the length of the piece just added */
+            }
+            iy += ub[1] - lb[1];
+        }
+        ix += ub[0] - lb[0];
+    }
+}
+
+
+void make_rho_lda_orth(double *rho, double *dm, int comp,
+                       int li, int lj, double ai, double aj,
+                       double *ri, double *rj, double fac, double cutoff,
+                       int dimension, double* dh, double *a, double *b,
+                       int *mesh, double *cache)
+{       /* Add the density of one primitive pair (li, ai, ri) x (lj, aj, rj), weighted by dm and fac, to rho. */
+        int topl = li + lj;  /* highest polynomial order of the pair product */
+        int l1 = topl + 1;
+        int l1l1l1 = l1 * l1 * l1;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+        /* a zero data_size means there is nothing to do for this pair; comp, dimension, a and b are not used here */
+        if (data_size == 0) {
+                return;
+        }
+        cache += data_size;  /* skip the part of cache consumed by init_orth_data */
+        /* dm_xyz: zero-initialised l1 x l1 x l1 coefficient array carved out of cache */
+        double *dm_xyz = cache;
+        cache += l1l1l1;
+        memset(dm_xyz, 0, l1l1l1*sizeof(double));
+        /* fill dm_xyz from dm, then expand it on the grid; the rest of cache is scratch for both calls */
+        _dm_to_dm_xyz(dm_xyz, dm, li, lj, ri, rj, cache);
+
+        _orth_rho(rho, dm_xyz, fac, topl, mesh, grid_slice,
+                  xs_exp, ys_exp, zs_exp, cache);
+}
+
+
+static void _apply_rho(void (*eval_rho)(), double *rho, double *dm,
+                       PGFPair* pgfpair, int comp, int dimension,
+                       double* dh, double *a, double *b, int *mesh,
+                       double* ish_gto_norm, double* jsh_gto_norm,
+                       int *ish_atm, int *ish_bas, double *ish_env,
+                       int *jsh_atm, int *jsh_bas, double *jsh_env,
+                       double* Ls, double *cache)
+{       /* Screen one primitive Gaussian pair and, if it survives, call eval_rho to add its density to rho. */
+        int ish = pgfpair->ish;
+        int jsh = pgfpair->jsh;
+        int ipgf = pgfpair->ipgf;
+        int jpgf = pgfpair->jpgf;
+        int iL = pgfpair->iL;
+        double cutoff = pgfpair->radius;  /* forwarded to eval_rho as its cutoff argument */
+        /* shell centres looked up through the atm/bas/env tables; the j centre is shifted by the 3-vector Ls[iL] */
+        double *ri = ish_env + ish_atm[PTR_COORD+ish_bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS];
+        double *rj = jsh_env + jsh_atm[PTR_COORD+jsh_bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];
+        double *rL = Ls + iL*3;  /* presumably a lattice translation vector -- confirm */
+        double rjL[3];
+        rjL[0] = rj[0] + rL[0];
+        rjL[1] = rj[1] + rL[1];
+        rjL[2] = rj[2] + rL[2];
+        /* angular momenta, primitive exponents and per-primitive normalisation factors */
+        const int li = ish_bas[ANG_OF+ish*BAS_SLOTS];
+        const int lj = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+        double ai = ish_env[ish_bas[PTR_EXP+ish*BAS_SLOTS]+ipgf];
+        double aj = jsh_env[jsh_bas[PTR_EXP+jsh*BAS_SLOTS]+jpgf];
+        double ci = ish_gto_norm[ipgf];
+        double cj = jsh_gto_norm[jpgf];
+        double aij = ai + aj;
+        double rrij = CINTsquare_dist(ri, rjL);
+        double eij = (ai * aj / aij) * rrij;  /* exponent of the Gaussian-product prefactor exp(-eij) */
+        if (eij > EIJCUTOFF) {
+                return;  /* prefactor too small: skip the pair */
+        }
+        double fac = exp(-eij) * ci * cj * CINTcommon_fac_sp(li) * CINTcommon_fac_sp(lj);
+        if (fac < ish_env[PTR_EXPDROP] && fac < jsh_env[PTR_EXPDROP]) {
+                return;  /* below the drop threshold stored in both env arrays */
+        }
+        /* NOTE(review): eval_rho has an unprototyped signature; the arguments below match make_rho_lda_orth */
+        (*eval_rho)(rho, dm, comp, li, lj, ai, aj, ri, rjL,
+                    fac, cutoff, dimension, dh, a, b, mesh, cache);
+}
+
+
+static size_t _rho_cache_size(int l, int nprim, int nctr, int* mesh, double radius, double* dh)
+{   /* Number of doubles of per-thread scratch that grid_collocate_drv allocates for one grid level. */
+    const int ncart = _LEN_CART[l];
+    const int l1 = 2 * l + 1;
+    const int l1l1 = l1 * l1;
+    const int max_mesh = MAX(MAX(mesh[0], mesh[1]), mesh[2]);
+    const size_t mesh_size = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+    const size_t nmx = get_max_num_grid_orth(dh, radius);
+
+    size_t total = 1000000;                         /* constant extra margin */
+    total += (nprim * ncart) * (nprim * ncart);     /* dm_cart */
+    total += ncart * ncart;                         /* dm_pgf */
+    total += nctr * ncart * nprim * ncart;          /* transform_dm */
+    total += l1 * (mesh[0] + mesh[1] + mesh[2]);    /* xs_exp, ys_exp, zs_exp */
+    total += l1l1 * l1;                             /* dm_xyz */
+    total += 3 * (ncart + l1);                      /* _dm_to_dm_xyz */
+
+    /* orth_components and _orth_rho workspaces: only the larger of the two is reserved */
+    const size_t need_components = l1 * nmx + nmx;
+    size_t need_orth_rho;
+    if (nmx < max_mesh) {
+        need_orth_rho = l1l1*nmx + l1*nmx*nmx + nmx*nmx*nmx;
+    } else {
+        need_orth_rho = l1l1*mesh[2] + l1*mesh[1]*mesh[2] + mesh_size;
+    }
+    return total + MAX(need_orth_rho, need_components);
+}
+
+
+static size_t _rho_core_cache_size(int* mesh, double radius, double* dh)
+{   /* Scratch size in doubles for l = 0: the _rho_cache_size formula without its dm buffers and constant margin. */
+    const int l = 0;
+    const int l1 = 1;
+    const int l1l1 = l1 * l1;
+    const int max_mesh = MAX(MAX(mesh[0], mesh[1]), mesh[2]);
+    const size_t mesh_size = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+    const size_t nmx = get_max_num_grid_orth(dh, radius);
+
+    size_t total = 0;
+    total += l1 * (mesh[0] + mesh[1] + mesh[2]);    /* labelled xs_exp, ys_exp, zs_exp in _rho_cache_size */
+    total += l1l1 * l1;                             /* labelled dm_xyz there */
+    total += 3 * (_LEN_CART[l] + l1);               /* labelled _dm_to_dm_xyz there */
+
+    /* only the larger of the two workspaces below is reserved */
+    const size_t need_components = l1 * nmx + nmx;
+    size_t need_orth_rho;
+    if (nmx < max_mesh) {
+        need_orth_rho = l1l1*nmx + l1*nmx*nmx + nmx*nmx*nmx;
+    } else {
+        need_orth_rho = l1l1*mesh[2] + l1*mesh[1]*mesh[2] + mesh_size;
+    }
+    return total + MAX(need_orth_rho, need_components);
+}
+
+
+void grid_collocate_drv(void (*eval_rho)(), RS_Grid** rs_rho, double* dm, TaskList** task_list,
+                        int comp, int hermi, int *shls_slice, int* ish_ao_loc, int* jsh_ao_loc,
+                        int dimension, double* Ls, double* a, double* b,
+                        int* ish_atm, int* ish_bas, double* ish_env,
+                        int* jsh_atm, int* jsh_bas, double* jsh_env, int cart)
+{   /* Driver: for each grid level of the task list, add the density of all its primitive-pair tasks to rs_rho->data[level]. */
+    TaskList* tl = *task_list;
+    GridLevel_Info* gridlevel_info = tl->gridlevel_info;
+    int nlevels = gridlevel_info->nlevels;
+
+    assert (comp == (*rs_rho)->comp);
+    /* shls_slice = {ish0, ish1, jsh0, jsh1}: half-open shell ranges on the i and j side */
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+    /* naoj: number of AOs in the j slice; transform_dm uses it as the row stride of dm */
+    /* (the pair count nish*njsh and the i-side AO count are not needed here) */
+    const int naoj = jsh_ao_loc[jsh1] - jsh_ao_loc[jsh0];
+    /* per-shell coefficient matrices and normalisation factors; with hermi == 1 the j side reuses the i-side tables */
+    double **gto_norm_i = (double**) malloc(sizeof(double*) * nish);
+    double **cart2sph_coeff_i = (double**) malloc(sizeof(double*) * nish);
+    get_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1, ish_bas, ish_env, cart);
+    double **gto_norm_j = gto_norm_i;
+    double **cart2sph_coeff_j = cart2sph_coeff_i;
+    if (hermi != 1) {
+        gto_norm_j = (double**) malloc(sizeof(double*) * njsh);
+        cart2sph_coeff_j = (double**) malloc(sizeof(double*) * njsh);
+        get_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1, jsh_bas, jsh_env, cart);
+    }
+    /* maxima over the shell ranges, used only to size the per-thread scratch buffer */
+    int ish_lmax = get_lmax(ish0, ish1, ish_bas);
+    int jsh_lmax = ish_lmax;
+    if (hermi != 1) {
+        jsh_lmax = get_lmax(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nprim_max = get_nprim_max(ish0, ish1, ish_bas);
+    int jsh_nprim_max = ish_nprim_max;
+    if (hermi != 1) {
+        jsh_nprim_max = get_nprim_max(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nctr_max = get_nctr_max(ish0, ish1, ish_bas);
+    int jsh_nctr_max = ish_nctr_max;
+    if (hermi != 1) {
+        jsh_nctr_max = get_nctr_max(jsh0, jsh1, jsh_bas);
+    } 
+
+    int ilevel;
+    int *mesh;
+    double max_radius;
+    double *rho, *rhobufs[MAX_THREADS];  /* one accumulation buffer pointer per thread */
+    Task* task;
+    size_t ntasks;
+    PGFPair** pgfpairs;
+    for (ilevel = 0; ilevel < nlevels; ilevel++) {
+        task = (tl->tasks)[ilevel];
+        ntasks = task->ntasks;
+        if (ntasks <= 0) {
+            continue;  /* no primitive pairs assigned to this level */
+        }
+        pgfpairs = task->pgfpairs;
+        max_radius = task->radius;
+        /* output array and mesh dimensions of this level */
+        rho = (*rs_rho)->data[ilevel];
+        mesh = gridlevel_info->mesh + ilevel*3;
+
+        double dh[9];
+        get_grid_spacing(dh, a, mesh);
+        /* task_loc holds (start, end) task-index pairs, one pair per block; nblock counts its entries (the loop below steps by 2) */
+        int *task_loc;
+        int nblock = get_task_loc(&task_loc, pgfpairs, ntasks, ish0, ish1, jsh0, jsh1, hermi);
+
+        size_t cache_size = _rho_cache_size(MAX(ish_lmax,jsh_lmax), 
+                                            MAX(ish_nprim_max, jsh_nprim_max),
+                                            MAX(ish_nctr_max, jsh_nctr_max), mesh, max_radius, dh);
+        size_t ngrids = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+
+#pragma omp parallel
+{   /* NOTE(review): the malloc/calloc results in this region are used without a NULL check */
+    PGFPair *pgfpair = NULL;
+    int iblock, itask, ish, jsh;
+    double *ptr_gto_norm_i, *ptr_gto_norm_j;
+    double *cache0 = malloc(sizeof(double) * cache_size);  /* per-thread scratch: dm_cart | dm_pgf | cache */
+    double *dm_cart = cache0;
+    double *dm_pgf = cache0 + ish_nprim_max*_LEN_CART[ish_lmax]*jsh_nprim_max*_LEN_CART[jsh_lmax];
+    double *cache = dm_pgf + _LEN_CART[ish_lmax]*_LEN_CART[jsh_lmax]; 
+    /* thread 0 accumulates directly into rho; every other thread uses a zeroed private copy */
+    int thread_id = omp_get_thread_num();
+    double *rho_priv;
+    if (thread_id == 0) {
+        rho_priv = rho;
+    } else {
+        rho_priv = calloc(comp*ngrids, sizeof(double));
+    }
+    rhobufs[thread_id] = rho_priv;  /* NOTE(review): no check that the thread count stays within MAX_THREADS (256) */
+
+    #pragma omp for schedule(dynamic)
+    for (iblock = 0; iblock < nblock; iblock+=2) {
+        itask = task_loc[iblock];
+        pgfpair = pgfpairs[itask];  /* the block's first task supplies (ish, jsh); dm_cart is built once per block */
+        ish = pgfpair->ish;
+        jsh = pgfpair->jsh;
+        ptr_gto_norm_i = gto_norm_i[ish];  /* NOTE(review): indexed by ish/jsh as stored in the task, while the tables */
+        ptr_gto_norm_j = gto_norm_j[jsh];  /* hold nish/njsh entries -- confirm this is safe when ish0 or jsh0 is non-zero */
+        transform_dm(dm_cart, dm, cart2sph_coeff_i[ish],
+                     cart2sph_coeff_j[jsh], ish_ao_loc, jsh_ao_loc,
+                     ish_bas, jsh_bas, ish, jsh, ish0, jsh0, naoj, cache);
+        for (; itask < task_loc[iblock+1]; itask++) {
+            pgfpair = pgfpairs[itask];
+            get_dm_pgfpair(dm_pgf, dm_cart, pgfpair, ish_bas, jsh_bas, hermi);
+            _apply_rho(eval_rho, rho_priv, dm_pgf, pgfpair, comp, dimension, dh, a, b, mesh,
+                       ptr_gto_norm_i, ptr_gto_norm_j, ish_atm, ish_bas, ish_env,
+                       jsh_atm, jsh_bas, jsh_env, Ls, cache);
+        }
+    }
+    /* presumably sums all per-thread buffers into rhobufs[0], i.e. rho -- confirm against NPomp_dsum_reduce_inplace */
+    free(cache0);
+    NPomp_dsum_reduce_inplace(rhobufs, comp*ngrids);
+    if (thread_id != 0) {
+        free(rho_priv);
+    }
+}
+    if (task_loc) {
+        free(task_loc);
+    }
+    } // loop ilevel
+
+    del_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1);
+    if (hermi != 1) {
+        del_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1);
+    }
+}
+
+/* Drive eval_rho over every entry of bas (prefactor -charge*coeff, both centres on the owning atom), targeting rho. */
+void build_core_density(void (*eval_rho)(), double* rho,
+                        int* atm, int* bas, int nbas, double* env,
+                        int* mesh, int dimension, double* a, double* b, double max_radius)
+{
+    size_t ngrids;
+    ngrids = ((size_t) mesh[0]) * mesh[1] * mesh[2]; // number of points in the mesh
+    // dh receives the entries of a divided by the mesh size of their row (see get_grid_spacing)
+    double dh[9];
+    get_grid_spacing(dh, a, mesh);
+    // one target buffer per thread, indexed by thread id -- NOTE(review): assumes at most MAX_THREADS threads
+    double *rhobufs[MAX_THREADS];
+    size_t cache_size =  _rho_core_cache_size(mesh, max_radius, dh);
+
+#pragma omp parallel
+{
+    int ia, ib;
+    double alpha, coeff, charge, rad, fac;
+    double dm[] = {1.0}; // single-element matrix handed to eval_rho
+    double *r0;
+    double *cache = (double*) malloc(sizeof(double) * cache_size); // per-thread scratch
+    // thread 0 works directly on the caller's rho; every other thread gets a zeroed private grid
+    int thread_id = omp_get_thread_num();
+    double *rho_priv;
+    if (thread_id == 0) {
+        rho_priv = rho;
+    } else {
+        rho_priv = calloc(ngrids, sizeof(double));
+    }
+    rhobufs[thread_id] = rho_priv;
+
+    #pragma omp for schedule(static)
+    for (ib = 0; ib < nbas; ib++) {
+        ia = bas[ib*BAS_SLOTS+ATOM_OF];              // atom this entry belongs to
+        alpha = env[bas[ib*BAS_SLOTS+PTR_EXP]];      // only the first exponent is read
+        coeff = env[bas[ib*BAS_SLOTS+PTR_COEFF]];    // only the first coefficient is read
+        charge = (double)atm[ia*ATM_SLOTS+CHARGE_OF];
+        r0 = env + atm[ia*ATM_SLOTS+PTR_COORD];      // coordinates of that atom
+        fac = -charge * coeff;
+        rad = env[atm[ia*ATM_SLOTS+PTR_RADIUS]];     // per-atom radius read from env
+        eval_rho(rho_priv, dm, 1, 0, 0, alpha, 0., r0, r0, // NOTE(review): 1, 0, 0, 0. presumably mean comp=1, li=lj=0, aj=0 -- confirm
+                 fac, rad, dimension, dh, a, b, mesh, cache);
+    }
+    free(cache);
+    // NOTE(review): presumably sums every thread's buffer into rhobufs[0] (the caller's rho) -- confirm in np_helper
+    NPomp_dsum_reduce_inplace(rhobufs, ngrids);
+    if (thread_id != 0) {
+        free(rho_priv);
+    }
+}
+}
+
+
+/* Screen one primitive Gaussian pair and, if it survives, build its per-axis Gaussian tables in cache.
+ * Nothing is returned: the tables, grid_slice and data_size are dropped when the function exits. */
+static void make_pgfparis_orth(
+            PGFPair* pgfpair, int comp, int dimension,
+            double* dh, double *a, double *b, int *mesh,
+            double* ish_gto_norm, double* jsh_gto_norm,
+            int *ish_atm, int *ish_bas, double *ish_env,
+            int *jsh_atm, int *jsh_bas, double *jsh_env,
+            double* Ls, double *cache)
+{
+        int ish = pgfpair->ish;
+        int jsh = pgfpair->jsh;
+        int ipgf = pgfpair->ipgf;
+        int jpgf = pgfpair->jpgf;
+        int iL = pgfpair->iL;
+        double cutoff = pgfpair->radius;
+        // centres of the two shells; rjL is the second centre translated by lattice vector number iL
+        double *ri = ish_env + ish_atm[PTR_COORD+ish_bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS];
+        double *rj = jsh_env + jsh_atm[PTR_COORD+jsh_bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];
+        double *rL = Ls + iL*3;
+        double rjL[3];
+        rjL[0] = rj[0] + rL[0];
+        rjL[1] = rj[1] + rL[1];
+        rjL[2] = rj[2] + rL[2];
+        // angular momenta, exponents and normalisation factors of the two primitives
+        const int li = ish_bas[ANG_OF+ish*BAS_SLOTS];
+        const int lj = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+        double ai = ish_env[ish_bas[PTR_EXP+ish*BAS_SLOTS]+ipgf];
+        double aj = jsh_env[jsh_bas[PTR_EXP+jsh*BAS_SLOTS]+jpgf];
+        double ci = ish_gto_norm[ipgf];
+        double cj = jsh_gto_norm[jpgf];
+        double aij = ai + aj;
+        double rrij = CINTsquare_dist(ri, rjL);
+        double eij = (ai * aj / aij) * rrij;
+        if (eij > EIJCUTOFF) { // the Gaussian-product factor exp(-eij) is treated as negligible
+                return;
+        }
+        double fac = exp(-eij) * ci * cj * CINTcommon_fac_sp(li) * CINTcommon_fac_sp(lj);
+        if (fac < ish_env[PTR_EXPDROP] && fac < jsh_env[PTR_EXPDROP]) {
+                return;
+        }
+        // the tables must describe the same periodic image that was screened above: pass rjL, not rj
+        int topl = li + lj;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rjL, cache);
+}
+
+/* Walk every grid level of *task_list and run make_pgfparis_orth on each primitive-pair task; per-pair results are discarded (NOTE(review): looks like a timing/diagnostic driver -- confirm). */
+void eval_pgfpairs(TaskList** task_list,
+                    int comp, int hermi, int *shls_slice, int* ish_ao_loc, int* jsh_ao_loc,
+                    int dimension, double* Ls, double* a, double* b,
+                    int* ish_atm, int* ish_bas, double* ish_env,
+                    int* jsh_atm, int* jsh_bas, double* jsh_env, int cart)
+{
+    TaskList* tl = *task_list;
+    GridLevel_Info* gridlevel_info = tl->gridlevel_info;
+    int nlevels = gridlevel_info->nlevels;
+
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    // The per-shell tables are indexed with the absolute shell number, both by
+    // get_cart2sph_coeff()/del_cart2sph_coeff() and by the task loop below, so
+    // they need ish1 (resp. jsh1) slots.  Sizing them ish1-ish0 overflows the
+    // allocation whenever the slice does not start at shell 0.  Slots below
+    // ish0/jsh0 are never written, read or freed.
+
+    double **gto_norm_i = (double**) malloc(sizeof(double*) * ish1);
+    double **cart2sph_coeff_i = (double**) malloc(sizeof(double*) * ish1);
+    get_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1, ish_bas, ish_env, cart);
+    double **gto_norm_j = gto_norm_i;
+    double **cart2sph_coeff_j = cart2sph_coeff_i;
+    if (hermi != 1) {
+        gto_norm_j = (double**) malloc(sizeof(double*) * jsh1);
+        cart2sph_coeff_j = (double**) malloc(sizeof(double*) * jsh1);
+        get_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1, jsh_bas, jsh_env, cart);
+    }
+    // largest l / primitive count / contraction count per side; only used to size the scratch buffer
+    int ish_lmax = get_lmax(ish0, ish1, ish_bas);
+    int jsh_lmax = ish_lmax;
+    if (hermi != 1) {
+        jsh_lmax = get_lmax(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nprim_max = get_nprim_max(ish0, ish1, ish_bas);
+    int jsh_nprim_max = ish_nprim_max;
+    if (hermi != 1) {
+        jsh_nprim_max = get_nprim_max(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nctr_max = get_nctr_max(ish0, ish1, ish_bas);
+    int jsh_nctr_max = ish_nctr_max;
+    if (hermi != 1) {
+        jsh_nctr_max = get_nctr_max(jsh0, jsh1, jsh_bas);
+    }
+
+    int ilevel;
+    int *mesh;
+    double max_radius;
+    Task* task;
+    size_t ntasks;
+    PGFPair** pgfpairs;
+    for (ilevel = 0; ilevel < nlevels; ilevel++) {
+        task = (tl->tasks)[ilevel];
+        ntasks = task->ntasks;
+        if (ntasks == 0) { // ntasks is unsigned: "empty" is the only case to skip
+            continue;
+        }
+        pgfpairs = task->pgfpairs;
+        max_radius = task->radius;
+
+        mesh = gridlevel_info->mesh + ilevel*3;
+
+        double dh[9];
+        get_grid_spacing(dh, a, mesh);
+        // task_loc is consumed below as consecutive [start, end) offsets into pgfpairs, two ints per block
+        int *task_loc;
+        int nblock = get_task_loc(&task_loc, pgfpairs, ntasks, ish0, ish1, jsh0, jsh1, hermi);
+
+        size_t cache_size = _rho_cache_size(MAX(ish_lmax,jsh_lmax),
+                                            MAX(ish_nprim_max, jsh_nprim_max),
+                                            MAX(ish_nctr_max, jsh_nctr_max), mesh, max_radius, dh);
+        // every thread allocates its own scratch of cache_size doubles inside the parallel region
+
+#pragma omp parallel
+{
+    PGFPair *pgfpair = NULL;
+    int iblock, itask, ish, jsh;
+    double *ptr_gto_norm_i, *ptr_gto_norm_j;
+    double *cache = malloc(sizeof(double) * cache_size);
+
+    #pragma omp for schedule(dynamic)
+    for (iblock = 0; iblock < nblock; iblock+=2) {
+        itask = task_loc[iblock];
+        pgfpair = pgfpairs[itask];
+        ish = pgfpair->ish; // the shell pair is read once per block from its first task
+        jsh = pgfpair->jsh;
+        ptr_gto_norm_i = gto_norm_i[ish];
+        ptr_gto_norm_j = gto_norm_j[jsh];
+        for (; itask < task_loc[iblock+1]; itask++) {
+            pgfpair = pgfpairs[itask];
+            make_pgfparis_orth(pgfpair, comp, dimension, dh, a, b, mesh,
+                               ptr_gto_norm_i, ptr_gto_norm_j, ish_atm, ish_bas, ish_env,
+                               jsh_atm, jsh_bas, jsh_env, Ls, cache);
+        }
+    }
+
+    free(cache);
+}
+    if (task_loc) {
+        free(task_loc);
+    }
+    } // loop ilevel
+
+    del_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1);
+    if (hermi != 1) {
+        del_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1);
+    }
+}
diff --git a/pyscf/lib/dft/grid_common.c b/pyscf/lib/dft/grid_common.c
new file mode 100644
index 0000000000..f7e198ab17
--- /dev/null
+++ b/pyscf/lib/dft/grid_common.c
@@ -0,0 +1,660 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include "config.h"
+#include "cint.h"
+#include "vhf/fblas.h"
+#include "np_helper/np_helper.h"
+#include "dft/multigrid.h"
+#include "dft/grid_common.h"
+
+#define EXPMIN         -700
+
+/* Largest angular momentum (ANG_OF slot) among shells ish0 <= ish < ish1 of bas; 0 for an empty range. */
+int get_lmax(int ish0, int ish1, int* bas)
+{
+    int k, l, best = 0;
+    for (k = ish0; k < ish1; k++) {
+        l = bas[ANG_OF+k*BAS_SLOTS];
+        best = (best > l) ? best : l;
+    }
+    return best;
+}
+
+/* Largest primitive count (NPRIM_OF slot) among shells ish0 <= ish < ish1 of bas; never less than 1. */
+int get_nprim_max(int ish0, int ish1, int* bas)
+{
+    int k, n, best = 1;
+    for (k = ish0; k < ish1; k++) {
+        n = bas[NPRIM_OF+k*BAS_SLOTS];
+        best = (best > n) ? best : n;
+    }
+    return best;
+}
+
+/* Largest contraction count (NCTR_OF slot) among shells ish0 <= ish < ish1 of bas; never less than 1. */
+int get_nctr_max(int ish0, int ish1, int* bas)
+{
+    int k, n, best = 1;
+    for (k = ish0; k < ish1; k++) {
+        n = bas[NCTR_OF+k*BAS_SLOTS];
+        best = (best > n) ? best : n;
+    }
+    return best;
+}
+
+/* For every shell in [ish0, ish1) allocate and fill gto_norm[ish] (one norm per primitive) and contr_coeff[ish] (contraction coefficients combined with the per-l table c2s). */
+void get_cart2sph_coeff(double** contr_coeff, double** gto_norm, 
+                        int ish0, int ish1, int* bas, double* env, int cart)
+{
+    int l;
+    int lmax = get_lmax(ish0, ish1, bas);
+    int nprim, ncart, nsph, nctr;
+    int ptr_exp, ptr_coeff;
+    int ish, ipgf, ic, i, j;
+    // c2s[l]: an ncart x ncart identity when l <= 1 or cart == 1, otherwise what CINTc2s_ket_sph makes of that identity
+    double **c2s = (double**) malloc(sizeof(double*) * (lmax+1));
+    for (l = 0; l <= lmax; l++) {
+        ncart = _LEN_CART[l];
+        if (l <= 1 || cart == 1) {
+            c2s[l] = (double*) calloc(ncart*ncart, sizeof(double));
+            for (i = 0; i < ncart; i++) {
+                c2s[l][i*ncart + i] = 1;
+            }
+        }
+        else {
+            nsph = 2*l + 1;
+            c2s[l] = (double*) calloc(nsph*ncart, sizeof(double));
+            double* gcart = (double*) calloc(ncart*ncart, sizeof(double)); // temporary identity fed to the transform
+            for (i = 0; i < ncart; i++) {
+                gcart[i*ncart + i] = 1;
+            }
+            CINTc2s_ket_sph(c2s[l], ncart, gcart, l);
+            free(gcart);
+        }
+    }
+
+#pragma omp parallel private (ish, ipgf, ic, i, j, l,\
+                              ncart, nsph, nprim, nctr,\
+                              ptr_exp, ptr_coeff)
+{
+    #pragma omp for schedule(dynamic) 
+    for (ish = ish0; ish < ish1; ish++) {
+        l = bas[ANG_OF+ish*BAS_SLOTS];
+        ncart = _LEN_CART[l];
+        nsph = cart == 1 ? ncart : 2*l+1;
+        nprim = bas[NPRIM_OF+ish*BAS_SLOTS];
+        nctr = bas[NCTR_OF+ish*BAS_SLOTS];
+        // gto_norm[ish][ipgf] = CINTgto_norm(l, exponent of primitive ipgf)
+        ptr_exp = bas[PTR_EXP+ish*BAS_SLOTS];
+        gto_norm[ish] = (double*) malloc(sizeof(double) * nprim); // NOTE: slot is the absolute shell number, so the caller's arrays need ish1 entries
+        for (ipgf = 0; ipgf < nprim; ipgf++) {
+            gto_norm[ish][ipgf] = CINTgto_norm(l, env[ptr_exp+ipgf]);
+        }
+        // buf[ic*nprim+ipgf] = env[ptr_coeff+ic*nprim+ipgf] / gto_norm[ish][ipgf]; each daxpy handles the nctr entries of one primitive (stride nprim)
+        ptr_coeff = bas[PTR_COEFF+ish*BAS_SLOTS];
+        double *buf = (double*) calloc(nctr*nprim, sizeof(double));
+        for (ipgf = 0; ipgf < nprim; ipgf++) {
+            double inv_norm = 1./gto_norm[ish][ipgf];
+            daxpy_(&nctr, &inv_norm, env+ptr_coeff+ipgf, &nprim, buf+ipgf, &nprim);
+        }
+        // contr_coeff[ish] is written contiguously in the order [ipgf][i (cartesian)][ic][j]: buf[ic][ipgf] * c2s[l][j*ncart+i]
+        contr_coeff[ish] = (double*) malloc(sizeof(double) * nprim*ncart*nctr*nsph);
+        double* ptr_contr_coeff = contr_coeff[ish];
+        for (ipgf = 0; ipgf < nprim; ipgf++) {
+            for (i = 0; i < ncart; i++) {
+                for (ic = 0; ic < nctr; ic++) {
+                    for (j = 0; j < nsph; j++) {
+                        *ptr_contr_coeff = buf[ic*nprim+ipgf] * c2s[l][j*ncart+i];
+                        ptr_contr_coeff += 1;
+                    }
+                }
+            }
+        }
+        free(buf);
+    }
+}
+    // the per-l tables were only needed while building contr_coeff
+    for (l = 0; l <= lmax; l++) {
+        free(c2s[l]);
+    }
+    free(c2s);
+}
+
+/* Free gto_norm[k] and contr_coeff[k] for every shell k in [ish0, ish1),
+ * then the two pointer arrays themselves. */
+void del_cart2sph_coeff(double** contr_coeff, double** gto_norm, int ish0, int ish1)
+{
+    int k = ish0;
+    // free(NULL) is a no-op, so NULL entries need no explicit test
+    while (k < ish1) {
+        free(contr_coeff[k]);
+        free(gto_norm[k]);
+        k++;
+    }
+    // the per-shell buffers are gone; release the arrays that pointed at them
+    free(contr_coeff);
+    free(gto_norm);
+}
+
+/* 2*ceil(radius/h) + 1, with h the smallest of the three diagonal entries dh[0], dh[4], dh[8]. */
+int get_max_num_grid_orth(double* dh, double radius)
+{
+    double h = MIN(dh[0], dh[4]);
+    h = MIN(h, dh[8]);
+    return 2 * (int) ceil(radius / h) + 1;
+}
+
+/* Row-wise scaling of the row-major 3x3 matrix a: dh[3*i+j] = a[3*i+j] / mesh[i]. */
+void get_grid_spacing(double* dh, double* a, int* mesh)
+{
+    int k;
+    // k runs over the nine flat entries; k/3 is the row and therefore
+    // selects the mesh size that divides this entry
+    for (k = 0; k < 9; k++) {
+        dh[k] = a[k] / mesh[k/3];
+    }
+}
+
+/* Tabulate (x - xi)^l * exp(-aij*(x - xij)^2), l = 0..topl, on the 1D grid x = k*dx around the pair centre xij; returns the row length of the table (0 if nothing was built). */
+int orth_components(double *xs_exp, int* bounds, double dx, double radius,
+                    double xi, double xj, double ai, double aj,
+                    int nx_per_cell, int topl, double *cache)
+{
+    double aij = ai + aj;
+    double xij = (ai * xi + aj * xj) / aij;          // exponent-weighted centre of the pair
+    int x0_latt = (int) floor((xij - radius) / dx);  // first lattice index of the window
+    int x1_latt = (int) ceil((xij + radius) / dx);   // upper end; the loops below stop before it
+    int xij_latt = rint(xij / dx);                   // lattice point nearest to the centre ...
+    xij_latt = MAX(xij_latt, x0_latt);               // ... clamped to the window
+    xij_latt = MIN(xij_latt, x1_latt);
+    bounds[0] = x0_latt;
+    bounds[1] = x1_latt;
+    int ngridx = x1_latt - x0_latt;
+    // the recurrence starts at xij_latt; if the exponent there is already below EXPMIN no table is built
+    double base_x = dx * xij_latt;
+    double x0xij = base_x - xij;
+    double _x0x0 = -aij * x0xij * x0xij;
+    if (_x0x0 < EXPMIN) {
+        return 0;
+    }
+    // cache starts with ngridx coordinates (gridx); when the window spans a full period or more, the raw table also lives in cache and is folded into xs_exp at the end
+    double *gridx = cache;
+    double *xs_all = xs_exp;
+    if (ngridx >= nx_per_cell) {
+        xs_all = gridx + ngridx;
+    }
+    // Gaussian on an equidistant grid by recurrence: g(n+1) = g(n)*r(n), r(n+1) = r(n)*exp(-2*aij*dx*dx)
+    double _dxdx = -aij * dx * dx;
+    double _x0dx = -2 * aij * x0xij * dx;
+    double exp_dxdx = exp(_dxdx);
+    double exp_2dxdx = exp_dxdx * exp_dxdx;
+    double exp_x0dx = exp(_x0dx + _dxdx);
+    double exp_x0x0 = exp(_x0x0);
+    // sweep upwards from the starting point (row l = 0 of the table) ...
+    int i;
+    int istart = xij_latt - x0_latt;
+    for (i = istart; i < ngridx; i++) {
+        xs_all[i] = exp_x0x0;
+        exp_x0x0 *= exp_x0dx;
+        exp_x0dx *= exp_2dxdx;
+    }
+    // ... then downwards, with the sign of the linear term flipped
+    exp_x0dx = exp(_dxdx - _x0dx);
+    exp_x0x0 = exp(_x0x0);
+    for (i = istart-1; i >= 0; i--) {
+        exp_x0x0 *= exp_x0dx;
+        exp_x0dx *= exp_2dxdx;
+        xs_all[i] = exp_x0x0;
+    }
+    // higher rows: row l = row (l-1) times (x - xi); rows are ngridx apart
+    if (topl > 0) {
+        double x0xi = x0_latt * dx - xi;
+        for (i = 0; i < ngridx; i++) {
+            gridx[i] = x0xi + i * dx;
+        }
+        int l;
+        double *px0;
+        for (l = 1; l <= topl; l++) {
+            px0 = xs_all + (l-1) * ngridx;
+            for (i = 0; i < ngridx; i++) {
+                px0[ngridx+i] = px0[i] * gridx[i];
+            }
+        }
+    }
+
+    // add up contributions from all images to the reference image
+    if (ngridx >= nx_per_cell) {
+        memset(xs_exp, 0, (topl+1)*nx_per_cell*sizeof(double));
+        int ix, l, lb, ub, size_x;
+        for (ix = 0; ix < ngridx; ix++) {
+            lb = modulo(ix + x0_latt, nx_per_cell);               // position of this point inside the cell
+            ub = get_upper_bound(lb, nx_per_cell, ix, ngridx);    // end of the contiguous run starting there
+            size_x = ub - lb;
+            double* __restrict ptr_xs_exp = xs_exp + lb;
+            double* __restrict ptr_xs_all = xs_all + ix;
+            for (l = 0; l <= topl; l++) {
+                #pragma omp simd
+                for (i = 0; i < size_x; i++) {
+                    ptr_xs_exp[i] += ptr_xs_all[i];
+                }
+                ptr_xs_exp += nx_per_cell;
+                ptr_xs_all += ngridx;
+            }
+            ix += size_x - 1; // skip the rest of this run; the loop header adds the final +1
+        }
+        // after folding, the table covers exactly one period with row stride nx_per_cell
+        bounds[0] = 0;
+        bounds[1] = nx_per_cell;
+        ngridx = nx_per_cell;
+    }
+    return ngridx;
+}
+
+/* Image-aware variant of the 1D table builder: fills xs_exp (row stride nx_per_cell) with (x - xi)^l * exp(-aij*(x - xij)^2), l = 0..topl, and reports the touched images and grid slice; returns the number of grid points, 0 if none. */
+int _orth_components(double *xs_exp, int *img_slice, int *grid_slice,
+                     double a, double b, double cutoff,
+                     double xi, double xj, double ai, double aj,
+                     int periodic, int nx_per_cell, int topl, double *cache)
+{
+    double aij = ai + aj;
+    double xij = (ai * xi + aj * xj) / aij;           // exponent-weighted centre of the pair
+    double heights_inv = b;                           // b converts a length into a fraction of the cell (x * b)
+    double xij_frac = xij * heights_inv;
+    double edge0 = xij_frac - cutoff * heights_inv;   // window [edge0, edge1] in fractional coordinates
+    double edge1 = xij_frac + cutoff * heights_inv;
+    // a window of zero width contains no grid points
+    if (edge0 == edge1) {
+        return 0;
+    }
+    // range of periodic images touched by the window; exactly one image when the axis is not periodic
+    int nimg0 = 0;
+    int nimg1 = 1;
+    if (periodic) {
+        nimg0 = (int) floor(edge0);
+        nimg1 = (int) ceil(edge1);
+    }
+
+    int nimg = nimg1 - nimg0;
+    // nmx0/nmx1: grid indices at which the first touched image starts / the last one ends; nmx points in between
+    int nmx0 = nimg0 * nx_per_cell;
+    int nmx1 = nimg1 * nx_per_cell;
+    int nmx = nmx1 - nmx0;
+    // nx0/nx1: window edges as grid indices on the unfolded lattice ...
+    int nx0 = (int) floor(edge0 * nx_per_cell);
+    int nx1 = (int) ceil(edge1 * nx_per_cell);
+    // ... and the same edges counted from the start of the first touched image
+    int nx0_edge = nx0 - nmx0;
+    int nx1_edge = nx1 - nmx0;
+    // on a periodic axis the reported slice is folded into one cell; an end on the cell boundary is stored as nx_per_cell, not 0
+    if (periodic) {
+        nx0 = nx0_edge % nx_per_cell;
+        nx1 = nx1_edge % nx_per_cell;
+        if (nx1 == 0) {
+            nx1 = nx_per_cell;
+        }
+    }
+    assert(nx0 == nx0_edge); // i.e. the window starts inside the first touched image
+
+    img_slice[0] = nimg0;
+    img_slice[1] = nimg1;
+    grid_slice[0] = nx0;
+    grid_slice[1] = nx1;
+
+    int ngridx = _num_grids_on_x(nimg, nx0, nx1, nx_per_cell);
+    if (ngridx == 0) {
+        return 0;
+    }
+
+    int i, m, l;
+    double *px0;
+    // cache = [gridx (nmx doubles) | xs_all]; with a single image the table is written straight into xs_exp
+    double *gridx = cache;
+    double *xs_all = cache + nmx;
+    if (nimg == 1) {
+        xs_all = xs_exp;
+    }
+    // the recurrence starts at the grid point nearest to xij, clamped to the window
+    int grid_close_to_xij = rint(xij_frac * nx_per_cell) - nmx0;
+    grid_close_to_xij = MIN(grid_close_to_xij, nx1_edge);
+    grid_close_to_xij = MAX(grid_close_to_xij, nx0_edge);
+
+    double img0_x = a * nimg0;   // coordinate at which the first touched image starts (a is used as the cell length)
+    double dx = a / nx_per_cell;
+    double base_x = img0_x + dx * grid_close_to_xij;
+    double x0xij = base_x - xij;
+    double _x0x0 = -aij * x0xij * x0xij;
+    if (_x0x0 < EXPMIN) {
+        return 0;
+    }
+    // Gaussian on an equidistant grid by recurrence: g(n+1) = g(n)*r(n), r(n+1) = r(n)*exp(-2*aij*dx*dx)
+    double _dxdx = -aij * dx * dx;
+    double _x0dx = -2 * aij * x0xij * dx;
+    double exp_dxdx = exp(_dxdx);
+    double exp_2dxdx = exp_dxdx * exp_dxdx;
+    double exp_x0dx = exp(_x0dx + _dxdx);
+    double exp_x0x0 = exp(_x0x0);
+    // sweep upwards from the starting point (row l = 0 of the table) ...
+    for (i = grid_close_to_xij; i < nx1_edge; i++) {
+        xs_all[i] = exp_x0x0;
+        exp_x0x0 *= exp_x0dx;
+        exp_x0dx *= exp_2dxdx;
+    }
+    // ... then downwards, with the sign of the linear term flipped
+    exp_x0dx = exp(_dxdx - _x0dx);
+    exp_x0x0 = exp(_x0x0);
+    for (i = grid_close_to_xij-1; i >= nx0_edge; i--) {
+        exp_x0x0 *= exp_x0dx;
+        exp_x0dx *= exp_2dxdx;
+        xs_all[i] = exp_x0x0;
+    }
+    // higher rows: row l = row (l-1) times (x - xi); rows are nmx apart
+    if (topl > 0) {
+        double x0xi = img0_x - xi;
+        for (i = nx0_edge; i < nx1_edge; i++) {
+            gridx[i] = x0xi + i * dx;
+        }
+        for (l = 1; l <= topl; l++) {
+            px0 = xs_all + (l-1) * nmx;
+            for (i = nx0_edge; i < nx1_edge; i++) {
+                px0[nmx+i] = px0[i] * gridx[i];
+            }
+        }
+    }
+    // several images: fold the long table (row stride nmx) into a single cell (row stride nx_per_cell)
+    int idx1;
+    if (nimg > 1) {
+        for (l = 0; l <= topl; l++) {
+            px0 = xs_all + l * nmx;
+            for (i = nx0; i < nx_per_cell; i++) {                // first image: only points from nx0 on were computed
+                xs_exp[l*nx_per_cell+i] = px0[i];
+            }
+            memset(xs_exp+l*nx_per_cell, 0, nx0*sizeof(double)); // points before nx0 receive nothing from the first image
+            for (m = 1; m < nimg; m++) {
+                px0 = xs_all + l * nmx + m*nx_per_cell;
+                idx1 = (m == nimg - 1) ? nx1 : nx_per_cell;      // the last image contributes only up to nx1
+                for (i = 0; i < idx1; i++) {
+                    xs_exp[l*nx_per_cell+i] += px0[i];
+                }
+            }
+        }
+    }
+    return ngridx;
+}
+
+/* Place three tables of (topl+1)*mesh[axis] doubles at the start of cache, publish them through
+ * xs_exp/ys_exp/zs_exp and fill them one axis at a time with orth_components.
+ * Returns the number of doubles reserved, or 0 as soon as an axis reports no grid points. */
+int init_orth_data(double **xs_exp, double **ys_exp, double **zs_exp,
+                   int *grid_slice, double* dh, int* mesh, int topl, double radius,
+                   double ai, double aj, double *ri, double *rj, double *cache)
+{
+    const int nrow = topl + 1;
+    const int data_size = nrow * (mesh[0] + mesh[1] + mesh[2]);
+    double *table[3];
+    double *scratch;
+    int axis;
+
+    // consecutive tables: x first, then y, then z
+    table[0] = cache;
+    table[1] = table[0] + nrow * mesh[0];
+    table[2] = table[1] + nrow * mesh[1];
+    *xs_exp = table[0];
+    *ys_exp = table[1];
+    *zs_exp = table[2];
+    // everything behind the tables is scratch, reused by each axis in turn
+    scratch = cache + data_size;
+
+    // dh[0], dh[4], dh[8]: diagonal of the 3x3 spacing matrix
+    for (axis = 0; axis < 3; axis++) {
+        if (orth_components(table[axis], grid_slice + 2*axis, dh[4*axis], radius,
+                            ri[axis], rj[axis], ai, aj, mesh[axis], topl, scratch) == 0) {
+            return 0;
+        }
+    }
+    return data_size;
+}
+
+/* Place three tables of (topl+1)*mesh[axis] doubles at the start of cache, publish them through
+ * xs_exp/ys_exp/zs_exp and fill them one axis at a time with _orth_components, which also
+ * fills img_slice and grid_slice.  Axis k is flagged periodic when dimension > k.
+ * Returns the number of doubles reserved, or 0 as soon as an axis reports no grid points. */
+int _init_orth_data(double **xs_exp, double **ys_exp, double **zs_exp,
+                    int *img_slice, int *grid_slice, int *mesh,
+                    int topl, int dimension, double cutoff,
+                    double ai, double aj, double *ri, double *rj,
+                    double *a, double *b, double *cache)
+{
+        const int nrow = topl + 1;
+        const int data_size = nrow * (mesh[0] + mesh[1] + mesh[2]);
+        double *table[3];
+        double *scratch;
+        int axis, npts;
+
+        // consecutive tables: x first, then y, then z
+        table[0] = cache;
+        table[1] = table[0] + nrow * mesh[0];
+        table[2] = table[1] + nrow * mesh[1];
+        *xs_exp = table[0];
+        *ys_exp = table[1];
+        *zs_exp = table[2];
+        // everything behind the tables is scratch, reused by each axis in turn
+        scratch = cache + data_size;
+
+        // entries 0, 4, 8 are the diagonals of the 3x3 arrays a and b
+        for (axis = 0; axis < 3; axis++) {
+                npts = _orth_components(table[axis], img_slice + 2*axis, grid_slice + 2*axis,
+                                        a[4*axis], b[4*axis], cutoff, ri[axis], rj[axis], ai, aj,
+                                        (dimension > axis), mesh[axis], topl, scratch);
+                if (npts == 0) {
+                        return 0;
+                }
+        }
+        return data_size;
+}
+
+/* Tabulate binomial-expansion coefficients for the three Cartesian axes.
+ * For axis d (0, 1, 2) the block coeff + d*_LEN_CART[lmax] holds, for l = 0..lmax packed back to
+ * back, the l+1 values BINOMIAL(l, k) * rij[d]^(l-k), k = 0..l.
+ * cache receives the powers of rij and must provide 3*(lmax+1) doubles. */
+void _get_dm_to_dm_xyz_coeff(double* coeff, double* rij, int lmax, double* cache)
+{
+    const int npow = lmax + 1;
+    const int stride = _LEN_CART[lmax];   // length of one axis' block inside coeff
+    int axis, l, k;
+    double *pw, *row;
+
+    // phase 1: powers rij[axis]^0 .. rij[axis]^lmax, one run of npow doubles per axis
+    for (axis = 0; axis < 3; axis++) {
+        pw = cache + axis * npow;
+        pw[0] = 1.0;
+        for (k = 1; k <= lmax; k++) {
+            pw[k] = pw[k-1] * rij[axis];
+        }
+    }
+
+    // phase 2: coefficient rows; row l has l+1 entries and directly follows row l-1
+    for (axis = 0; axis < 3; axis++) {
+        pw = cache + axis * npow;
+        row = coeff + axis * stride;
+        for (l = 0; l <= lmax; l++) {
+            for (k = 0; k <= l; k++) {
+                row[k] = BINOMIAL(l, k) * pw[l-k];
+            }
+            // advance to the start of the next row
+            row += l + 1;
+        }
+    }
+    // all powers are computed before any coefficient is written, as two separate passes
+}
+
+/* Scatter the Cartesian (li, lj) block dm into the cube dm_xyz of edge li+lj+1: every dm element is added to dm_xyz[lx][ly][lz] with lx = lx_i+jx etc., weighted by the tabulated binomial coefficients in rij = ri - rj. */
+void _dm_to_dm_xyz(double* dm_xyz, double* dm, int li, int lj, double* ri, double* rj, double* cache)
+{
+    int lx, ly, lz;
+    int lx_i, ly_i, lz_i;
+    int lx_j, ly_j, lz_j;
+    int jx, jy, jz;
+    double rij[3];
+    // displacement between the two centres
+    rij[0] = ri[0] - rj[0];
+    rij[1] = ri[1] - rj[1];
+    rij[2] = ri[2] - rj[2];
+    // dm_xyz is addressed as an l1 x l1 x l1 array; it is only accumulated into here, never cleared
+    int l1 = li + lj + 1;
+    int l1l1 = l1 * l1;
+    double *coeff = cache;
+    int dj = _LEN_CART[lj];
+    cache += 3 * dj;
+    // coeff takes the first 3*dj doubles of cache (x, y, z blocks); the remainder is scratch for the helper
+    _get_dm_to_dm_xyz_coeff(coeff, rij, lj, cache);
+
+    double cx, cxy, cxyz;
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    double *pdm = dm;
+    for (lx_i = li; lx_i >= 0; lx_i--) {                  // i-side powers: x descending, then y descending, z = remainder
+        for (ly_i = li-lx_i; ly_i >= 0; ly_i--) {
+            lz_i = li - lx_i - ly_i;
+            for (lx_j = lj; lx_j >= 0; lx_j--) {          // j-side powers in the same order
+                for (ly_j = lj-lx_j; ly_j >= 0; ly_j--) {
+                    lz_j = lj - lx_j - ly_j;
+                    for (jx = 0; jx <= lx_j; jx++) {
+                        cx = pcx[jx+_LEN_CART0[lx_j]];    // NOTE(review): _LEN_CART0[l] presumably is the offset of row l in the packed table -- confirm
+                        lx = lx_i + jx;
+                        for (jy = 0; jy <= ly_j; jy++) {
+                            cxy = cx * pcy[jy+_LEN_CART0[ly_j]];
+                            ly = ly_i + jy;
+                            for (jz = 0; jz <= lz_j; jz++) {
+                                cxyz = cxy * pcz[jz+_LEN_CART0[lz_j]];
+                                lz = lz_i + jz;
+                                dm_xyz[lx*l1l1+ly*l1+lz] += cxyz * pdm[0];
+                            }
+                        }
+                    }
+                    pdm += 1; // dm is consumed sequentially: i component slowest, j component fastest
+                }
+            }
+        }
+    }
+}
+
+/* Gather counterpart of the scatter above: every element of the Cartesian (li, lj) block dm accumulates the entries dm_xyz[lx][ly][lz] (cube of edge li+lj+1) weighted by the tabulated binomial coefficients in rij = ri - rj. */
+void _dm_xyz_to_dm(double* dm_xyz, double* dm, int li, int lj, double* ri, double* rj, double* cache)
+{
+    int lx, ly, lz;
+    int lx_i, ly_i, lz_i;
+    int lx_j, ly_j, lz_j;
+    int jx, jy, jz;
+    double rij[3];
+    // displacement between the two centres
+    rij[0] = ri[0] - rj[0];
+    rij[1] = ri[1] - rj[1];
+    rij[2] = ri[2] - rj[2];
+    // dm_xyz is addressed as an l1 x l1 x l1 array and is only read here
+    int l1 = li + lj + 1;
+    int l1l1 = l1 * l1;
+    double *coeff = cache;
+    int dj = _LEN_CART[lj];
+    cache += 3 * dj;
+    // coeff takes the first 3*dj doubles of cache (x, y, z blocks); the remainder is scratch for the helper
+    _get_dm_to_dm_xyz_coeff(coeff, rij, lj, cache);
+
+    double cx, cy, cz;
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    double *pdm = dm;
+    for (lx_i = li; lx_i >= 0; lx_i--) {                  // i-side powers: x descending, then y descending, z = remainder
+        for (ly_i = li-lx_i; ly_i >= 0; ly_i--) {
+            lz_i = li - lx_i - ly_i;
+            for (lx_j = lj; lx_j >= 0; lx_j--) {          // j-side powers in the same order
+                for (ly_j = lj-lx_j; ly_j >= 0; ly_j--) {
+                    lz_j = lj - lx_j - ly_j;
+                    for (jx = 0; jx <= lx_j; jx++) {
+                        cx = pcx[jx+_LEN_CART0[lx_j]];    // NOTE(review): _LEN_CART0[l] presumably is the offset of row l in the packed table -- confirm
+                        lx = lx_i + jx;
+                        for (jy = 0; jy <= ly_j; jy++) {
+                            cy = pcy[jy+_LEN_CART0[ly_j]];
+                            ly = ly_i + jy;
+                            for (jz = 0; jz <= lz_j; jz++) {
+                                cz = pcz[jz+_LEN_CART0[lz_j]];
+                                lz = lz_i + jz;
+                                pdm[0] += cx*cy*cz * dm_xyz[lx*l1l1+ly*l1+lz]; // dm is accumulated into, not overwritten
+                            }
+                        }
+                    }
+                    pdm += 1; // dm is visited sequentially: i component slowest, j component fastest
+                }
+            }
+        }
+    }
+}
+
+/* Extract the density-matrix block of one primitive Gaussian pair.
+ *
+ * dm_cart is read as a row-major matrix with nprim_j*dj columns, di and dj being the
+ * Cartesian dimensions (_LEN_CART) of shells ish and jsh.  The di x dj block whose top-left
+ * element sits at row ipgf*di, column jpgf*dj is copied into the dense array dm_pgf.
+ *
+ * With hermi == 1 every element of the copy is doubled when the two shells differ; a block
+ * with ish == jsh is returned unchanged.
+ * NOTE(review): the doubling presumably compensates for visiting only one of (ish, jsh) and
+ * (jsh, ish) -- confirm against the code that builds the task list.
+ */
+void get_dm_pgfpair(double* dm_pgf, double* dm_cart,
+                    PGFPair* pgfpair, int* ish_bas, int* jsh_bas, int hermi)
+{
+    // identify the pair
+    const int ish = pgfpair->ish;
+    const int jsh = pgfpair->jsh;
+    const int ipgf = pgfpair->ipgf;
+    const int jpgf = pgfpair->jpgf;
+
+    // block dimensions
+    const int li = ish_bas[ANG_OF+ish*BAS_SLOTS];
+    const int lj = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+    const int di = _LEN_CART[li];
+    const int dj = _LEN_CART[lj];
+
+    // length of one dm_cart row and position of the block inside dm_cart
+    const int row_len = jsh_bas[NPRIM_OF+jsh*BAS_SLOTS] * dj;
+    const double *src = dm_cart + (ipgf*di*row_len + jpgf*dj);
+
+    int r, c;
+    for (r = 0; r < di; r++) {
+        for (c = 0; c < dj; c++) {
+            dm_pgf[r*dj+c] = src[r*row_len+c];
+        }
+    }
+
+    if (hermi != 1 || ish == jsh) {
+        return;
+    }
+
+    // symmetric case, distinct shells: scale the whole dense block by two
+    const int nelem = di * dj;
+    for (r = 0; r < nelem; r++) {
+        dm_pgf[r] *= 2;
+    }
+}
diff --git a/pyscf/lib/dft/grid_common.h b/pyscf/lib/dft/grid_common.h
new file mode 100644
index 0000000000..36dc7e3655
--- /dev/null
+++ b/pyscf/lib/dft/grid_common.h
@@ -0,0 +1,109 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ */
+
+#ifndef HAVE_DEFINED_GRID_COMMON_H
+#define HAVE_DEFINED_GRID_COMMON_H
+
+#include "cint.h"
+
+#define EIJCUTOFF        60
+#define PTR_EXPDROP      16
+
+extern double CINTsquare_dist(const double *r1, const double *r2);
+extern double CINTcommon_fac_sp(int l);
+
+int get_lmax(int ish0, int ish1, int* bas);
+int get_nprim_max(int ish0, int ish1, int* bas);
+int get_nctr_max(int ish0, int ish1, int* bas);
+void get_cart2sph_coeff(double** contr_coeff, double** gto_norm,
+                        int ish0, int ish1, int* bas, double* env, int cart);
+void del_cart2sph_coeff(double** contr_coeff, double** gto_norm, int ish0, int ish1);
+/* Non-zero when nx0 does not exceed nx1; nx_per_cell is accepted but not used. */
+static inline int _has_overlap(int nx0, int nx1, int nx_per_cell)
+{
+    return !(nx1 < nx0);
+}
+/* Number of grid points along one axis for a slice with bounds nx0, nx1 spread over nimgx images. */
+static inline int _num_grids_on_x(int nimgx, int nx0, int nx1, int nx_per_cell)
+{
+    // a single image: plain difference of the bounds
+    if (nimgx == 1) {
+        return nx1 - nx0;
+    }
+    // two images and nx0 > nx1: the slice wraps once around the cell
+    if (nimgx == 2 && !_has_overlap(nx0, nx1, nx_per_cell)) {
+        return nx1 - nx0 + nx_per_cell;
+    }
+    return nx_per_cell; // every other case counts the full cell
+}
+
+/* Fill xmap[0..ngridx) with grid indices.
+ *   nimgx == 1        : xmap[k] = nx0 + k
+ *   is_x_split        : xmap[k] = k for k < nx1, then xmap[k] = k + (nx0 - nx1)
+ *   any other case    : xmap[k] = k */
+static inline void _get_grid_mapping(int* xmap, int nx0, int nx1, int ngridx, int nimgx, bool is_x_split)
+{
+    int k;
+    const int offset = (nimgx == 1) ? nx0 : 0;
+    if (nimgx == 1 || !is_x_split) {
+        for (k = 0; k < ngridx; k++) {
+            xmap[k] = k + offset;
+        }
+        return;
+    }
+    // split slice: leading piece first, then the piece that resumes at nx0
+    for (k = 0; k < nx1; k++) {
+        xmap[k] = k;
+    }
+    for (k = nx1; k < ngridx; k++) {
+        xmap[k] = k + (nx0 - nx1);
+    }
+}
+/* Remainder of i by n shifted into [0, n) for positive n, so negative i wraps around. */
+static inline int modulo(int i, int n)
+{
+    const int r = i % n;
+    return (r + n) % n;
+}
+/* x0 plus the smaller of: distance from x0 to nx_per_cell, and ngridx - ix. */
+static inline int get_upper_bound(int x0, int nx_per_cell, int ix, int ngridx)
+{
+    const int span = MIN(nx_per_cell - x0, ngridx - ix);
+    return x0 + span;
+}
+
+int _orth_components(double *xs_exp, int *img_slice, int *grid_slice,
+                     double a, double b, double cutoff,
+                     double xi, double xj, double ai, double aj,
+                     int periodic, int nx_per_cell, int topl, double *cache);
+int _init_orth_data(double **xs_exp, double **ys_exp, double **zs_exp,
+                    int *img_slice, int *grid_slice, int *mesh,
+                    int topl, int dimension, double cutoff,
+                    double ai, double aj, double *ri, double *rj,
+                    double *a, double *b, double *cache);
+
+int init_orth_data(double **xs_exp, double **ys_exp, double **zs_exp,
+                   int *grid_slice, double* dh, int* mesh, int topl, double radius,
+                   double ai, double aj, double *ri, double *rj, double *cache);
+void get_grid_spacing(double* dh, double* a, int* mesh);
+
+void _get_dm_to_dm_xyz_coeff(double* coeff, double* rij, int lmax, double* cache);
+void _dm_to_dm_xyz(double* dm_xyz, double* dm, int li, int lj, double* ri, double* rj, double* cache);
+void _dm_xyz_to_dm(double* dm_xyz, double* dm, int li, int lj, double* ri, double* rj, double* cache);
+void get_dm_pgfpair(double* dm_pgf, double* dm_cart,
+                    PGFPair* pgfpair, int* ish_bas, int* jsh_bas, int hermi);
+int get_max_num_grid_orth(double* dh, double radius);
+#endif
diff --git a/pyscf/lib/dft/grid_integrate.c b/pyscf/lib/dft/grid_integrate.c
new file mode 100644
index 0000000000..9cabe864cb
--- /dev/null
+++ b/pyscf/lib/dft/grid_integrate.c
@@ -0,0 +1,1358 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "config.h"
+#include "vhf/fblas.h"
+#include "np_helper/np_helper.h"
+#include "dft/multigrid.h"
+#include "dft/grid_common.h"
+#include "dft/utils.h"
+
+#define PTR_RADIUS      5
+
+/* Contract the primitive-Cartesian block dm_cart of shell pair (ish, jsh) with both coefficient matrices and store the result in dm. */
+void transform_dm_inverse(double* dm_cart, double* dm, int comp,
+                          double* ish_contr_coeff, double* jsh_contr_coeff,
+                          int* ish_ao_loc, int* jsh_ao_loc,
+                          int* ish_bas, int* jsh_bas, int ish, int jsh,
+                          int ish0, int jsh0, int naoi, int naoj, double* cache)
+{
+    int i0 = ish_ao_loc[ish] - ish_ao_loc[ish0]; /* AO offsets are measured from the first shells ish0 / jsh0 */
+    int i1 = ish_ao_loc[ish+1] - ish_ao_loc[ish0];
+    int j0 = jsh_ao_loc[jsh] - jsh_ao_loc[jsh0];
+    int j1 = jsh_ao_loc[jsh+1] - jsh_ao_loc[jsh0];
+
+    int nrow = i1 - i0; /* AOs of shell ish according to ish_ao_loc */
+    int ncol = j1 - j0; /* AOs of shell jsh according to jsh_ao_loc */
+    double* pdm = dm + ((size_t)naoj) * i0 + j0; /* element (i0, j0) of dm; consecutive rows are naoj apart */
+
+    int l_i = ish_bas[ANG_OF+ish*BAS_SLOTS];
+    int ncart_i = _LEN_CART[l_i];
+    int nprim_i = ish_bas[NPRIM_OF+ish*BAS_SLOTS];
+    int nao_i = nprim_i*ncart_i; /* rows of dm_cart: primitives times Cartesian components of ish */
+    int l_j = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+    int ncart_j = _LEN_CART[l_j];
+    int nprim_j = jsh_bas[NPRIM_OF+jsh*BAS_SLOTS];
+    int nao_j = nprim_j*ncart_j; /* columns of dm_cart */
+
+    const char TRANS_T = 'T';
+    const char TRANS_N = 'N';
+    const double D1 = 1;
+    const double D0 = 0;
+    double *buf = cache; /* half-transformed block, nao_i * ncol doubles */
+
+    int ic;
+    for (ic = 0; ic < comp; ic++) {
+        //einsum("pi,pq,qj->ij", coeff_i, dm_cart, coeff_j)
+        dgemm_(&TRANS_N, &TRANS_N, &ncol, &nao_i, &nao_j,
+               &D1, jsh_contr_coeff, &ncol, dm_cart, &nao_j, &D0, buf, &ncol); /* buf[p,j] = sum_q dm_cart[p,q] * coeff_j[q,j] */
+        dgemm_(&TRANS_N, &TRANS_T, &ncol, &nrow, &nao_i,
+               &D1, buf, &ncol, ish_contr_coeff, &nrow, &D0, pdm, &naoj); /* beta = 0: the (i, j) block of dm is overwritten */
+        pdm += ((size_t)naoi) * naoj; /* next component of dm */
+        dm_cart += nao_i * nao_j;     /* next component of dm_cart */
+    }
+}
+
+/* Copy the (ish, jsh) block of each component to its transposed position: mat[c][j][i] = mat[c][i][j]. */
+static void fill_tril(double* mat, int comp, int* ish_ao_loc, int* jsh_ao_loc,
+                      int ish, int jsh, int ish0, int jsh0, int naoi, int naoj)
+{
+    /* AO offsets and extents of the two shells, measured from the first shell of each range */
+    const int row0 = ish_ao_loc[ish] - ish_ao_loc[ish0];
+    const int col0 = jsh_ao_loc[jsh] - jsh_ao_loc[jsh0];
+    const int nrow = (ish_ao_loc[ish+1] - ish_ao_loc[ish0]) - row0;
+    const int ncol = (jsh_ao_loc[jsh+1] - jsh_ao_loc[jsh0]) - col0;
+    const size_t comp_stride = ((size_t)naoi) * naoj;
+
+    /* Source and destination both use naoj as the row stride; */
+    /* NOTE(review): presumably each component is square (naoi == naoj) - confirm with the caller. */
+    double *src = mat + row0*((size_t)naoj) + col0;
+    double *dst = mat + col0*((size_t)naoj) + row0;
+    int c, r, k;
+
+    for (c = 0; c < comp; c++, src += comp_stride, dst += comp_stride) {
+        for (r = 0; r < nrow; r++) {
+            for (k = 0; k < ncol; k++) {
+                dst[k*naoj+r] = src[r*naoj+k];
+            }
+        }
+    }
+}
+/* Integrate weights over one box [mesh_lb, mesh_ub) of the mesh: out[lx*l1l1 + ly*l1 + lz] += fac * sum_xyz xs*ys*zs*weights. */
+/* The index comments below assume dgemm_wrapper has the column-major semantics of BLAS dgemm. */
+static void integrate_submesh(double* out, double* weights,
+                              double* xs_exp, double* ys_exp, double* zs_exp,
+                              double fac, int topl,
+                              int* mesh_lb, int* mesh_ub, int* submesh_lb,
+                              const int* mesh, const int* submesh, double* cache)
+{
+    const int l1 = topl + 1;
+    const int l1l1 = l1 * l1;
+    const int x0 = mesh_lb[0]; /* first point of the box, in full-mesh indices */
+    const int y0 = mesh_lb[1];
+    const int z0 = mesh_lb[2];
+
+    const int nx = mesh_ub[0] - x0; /* box extent; mesh_ub is exclusive */
+    const int ny = mesh_ub[1] - y0;
+    const int nz = mesh_ub[2] - z0;
+
+    const int x0_sub = submesh_lb[0]; /* where the same box starts inside the xs/ys/zs tables */
+    const int y0_sub = submesh_lb[1];
+    const int z0_sub = submesh_lb[2];
+
+    const size_t mesh_yz = ((size_t) mesh[1]) * mesh[2]; /* doubles per x-plane of weights */
+
+    const char TRANS_N = 'N';
+    const char TRANS_T = 'T';
+    const double D0 = 0;
+    const double D1 = 1;
+
+    double *lzlyx = cache;           /* nx blocks of l1*l1 doubles, one per x-plane */
+    double *zly = lzlyx + l1l1 * nx; /* nz*l1 scratch, reused for every plane */
+    double *ptr_weights = weights + x0 * mesh_yz + y0 * mesh[2] + z0;
+
+    int ix;
+    for (ix = 0; ix < nx; ix++) {
+        dgemm_wrapper(TRANS_N, TRANS_N, nz, l1, ny, /* zly[ly][z] = sum_y weights[y][z] * ys_exp[ly][y] */
+                      D1, ptr_weights, mesh[2], ys_exp+y0_sub, submesh[1],
+                      D0, zly, nz);
+        dgemm_wrapper(TRANS_T, TRANS_N, l1, l1, nz, /* lzlyx[ix][ly][lz] = sum_z zs_exp[lz][z] * zly[ly][z] */
+                      D1, zs_exp+z0_sub, submesh[2], zly, nz,
+                      D0, lzlyx+l1l1*ix, l1);
+        ptr_weights += mesh_yz; /* next x-plane */
+    }
+    dgemm_wrapper(TRANS_N, TRANS_N, l1l1, l1, nx, /* beta = 1: out[lx][ly][lz] += fac * sum_x xs_exp[lx][x] * lzlyx[x][ly][lz] */
+                  fac, lzlyx, l1l1, xs_exp+x0_sub, submesh[0],
+                  D1, out, l1l1);
+}
+
+/* Integrate weights over the box described by grid_slice, cutting it into pieces that do not cross a mesh boundary. */
+static void _orth_ints(double *out, double *weights, int topl, double fac,
+                       double *xs_exp, double *ys_exp, double *zs_exp,
+                       int *grid_slice, int *mesh, double *cache)
+{
+    /* NOTE: out is accumulated, it is not reset here. */
+    /* grid_slice = {x_begin, x_end, y_begin, y_end, z_begin, z_end}; indices are wrapped into the mesh with modulo(). */
+    const int start[3] = {grid_slice[0], grid_slice[2], grid_slice[4]};
+    const int submesh[3] = {grid_slice[1] - grid_slice[0],
+                            grid_slice[3] - grid_slice[2],
+                            grid_slice[5] - grid_slice[4]};
+    if (submesh[0] == 0 || submesh[1] == 0 || submesh[2] == 0) {
+        return;
+    }
+
+    /* pos[] is the position inside the slice; [lb, ub) is the matching wrapped range inside the mesh. */
+    /* Every pass of a loop consumes one contiguous run along its axis. */
+    int lb[3], ub[3], pos[3];
+
+    pos[0] = 0;
+    while (pos[0] < submesh[0]) {
+        lb[0] = modulo(pos[0] + start[0], mesh[0]);
+        ub[0] = get_upper_bound(lb[0], mesh[0], pos[0], submesh[0]);
+        pos[1] = 0;
+        while (pos[1] < submesh[1]) {
+            lb[1] = modulo(pos[1] + start[1], mesh[1]);
+            ub[1] = get_upper_bound(lb[1], mesh[1], pos[1], submesh[1]);
+            pos[2] = 0;
+            while (pos[2] < submesh[2]) {
+                lb[2] = modulo(pos[2] + start[2], mesh[2]);
+                ub[2] = get_upper_bound(lb[2], mesh[2], pos[2], submesh[2]);
+                integrate_submesh(out, weights, xs_exp, ys_exp, zs_exp, fac, topl,
+                                  lb, ub, pos, mesh, submesh, cache);
+                pos[2] += ub[2] - lb[2];
+            }
+            pos[1] += ub[1] - lb[1];
+        }
+        pos[0] += ub[0] - lb[0];
+    }
+}
+
+/* Body of _vrho_loop_ip1_{x,y,z}: X is the axis along which primitive i is differentiated, Y and Z the other two. Adds to pv1[0]. */
+#define VRHO_LOOP_IP1(X, Y, Z) \
+    int lx, ly, lz; \
+    int jx, jy, jz; \
+    int l##X##_i_m1 = l##X##_i - 1; \
+    int l##X##_i_p1 = l##X##_i + 1; \
+    double cx, cy, cz, cfac; \
+    double fac_i = -2.0 * ai; /* factor of the raised-power term; matches d/dx of x^l * exp(-ai*x^2) */ \
+    for (j##Y = 0; j##Y <= l##Y##_j; j##Y++) { \
+        c##Y = pc##Y[j##Y+_LEN_CART0[l##Y##_j]]; /* entry j of the coefficient row for angular momentum l_j */ \
+        l##Y = l##Y##_i + j##Y; \
+        for (j##Z = 0; j##Z <= l##Z##_j; j##Z++) { \
+            c##Z = pc##Z[j##Z+_LEN_CART0[l##Z##_j]]; \
+            l##Z = l##Z##_i + j##Z; \
+            cfac = c##Y * c##Z; \
+            for (j##X = 0; j##X <= l##X##_j; j##X++) { \
+                if (l##X##_i > 0) { /* lowered-power term exists only for a non-zero exponent */ \
+                    c##X = pc##X[j##X+_LEN_CART0[l##X##_j]] * l##X##_i; \
+                    l##X = l##X##_i_m1 + j##X; \
+                    pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+                } \
+                c##X = pc##X[j##X+_LEN_CART0[l##X##_j]] * fac_i; \
+                l##X = l##X##_i_p1 + j##X; \
+                pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+            } \
+        } \
+    }
+
+/* VRHO_LOOP_IP1 with x as the differentiated axis; accumulates one matrix element into pv1[0]. aj is unused. */
+static void _vrho_loop_ip1_x(double* pv1, double* v1_xyz,
+                             double* pcx, double* pcy, double* pcz,
+                             double ai, double aj,
+                             int lx_i, int ly_i, int lz_i,
+                             int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VRHO_LOOP_IP1(x,y,z);
+}
+
+/* VRHO_LOOP_IP1 with y as the differentiated axis; accumulates one matrix element into pv1[0]. aj is unused. */
+static void _vrho_loop_ip1_y(double* pv1, double* v1_xyz,
+                             double* pcx, double* pcy, double* pcz,
+                             double ai, double aj,
+                             int lx_i, int ly_i, int lz_i,
+                             int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VRHO_LOOP_IP1(y,x,z);
+}
+
+/* VRHO_LOOP_IP1 with z as the differentiated axis; accumulates one matrix element into pv1[0]. aj is unused. */
+static void _vrho_loop_ip1_z(double* pv1, double* v1_xyz,
+                             double* pcx, double* pcy, double* pcz,
+                             double ai, double aj,
+                             int lx_i, int ly_i, int lz_i,
+                             int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VRHO_LOOP_IP1(z,x,y);
+}
+
+/* Body of _vsigma_loop_{x,y,z}: adds to pv1[0] the terms with the X-derivative on primitive j, then those with it on primitive i. */
+#define VSIGMA_LOOP(X, Y, Z) \
+    int lx, ly, lz; \
+    int jx, jy, jz; \
+    int l##X##_i_m1 = l##X##_i - 1; \
+    int l##X##_i_p1 = l##X##_i + 1; \
+    int l##X##_j_m1 = l##X##_j - 1; \
+    int l##X##_j_p1 = l##X##_j + 1; \
+    double cx, cy, cz, cfac; \
+    double fac_i = -2.0 * ai; \
+    double fac_j = -2.0 * aj; \
+    for (j##Y = 0; j##Y <= l##Y##_j; j##Y++) { \
+        c##Y = pc##Y[j##Y+_LEN_CART0[l##Y##_j]]; \
+        l##Y = l##Y##_i + j##Y; \
+        for (j##Z = 0; j##Z <= l##Z##_j; j##Z++) { \
+            c##Z = pc##Z[j##Z+_LEN_CART0[l##Z##_j]]; \
+            l##Z = l##Z##_i + j##Z; \
+            cfac = c##Y * c##Z; \
+            for (j##X = 0; j##X <= l##X##_j_m1; j##X++) { /* j side, power lowered: coefficient row l_j-1 times l_j; empty when l_j == 0 */ \
+                c##X = pc##X[j##X+_LEN_CART0[l##X##_j_m1]] * l##X##_j; \
+                l##X = l##X##_i + j##X; \
+                pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+            } \
+            for (j##X = 0; j##X <= l##X##_j_p1; j##X++) { /* j side, power raised: coefficient row l_j+1 times -2*aj */ \
+                c##X = pc##X[j##X+_LEN_CART0[l##X##_j_p1]] * fac_j; \
+                l##X = l##X##_i + j##X; \
+                pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+            } \
+            for (j##X = 0; j##X <= l##X##_j; j##X++) { /* i side: same two terms as VRHO_LOOP_IP1 */ \
+                if (l##X##_i > 0) { \
+                    c##X = pc##X[j##X+_LEN_CART0[l##X##_j]] * l##X##_i; \
+                    l##X = l##X##_i_m1 + j##X; \
+                    pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+                } \
+                c##X = pc##X[j##X+_LEN_CART0[l##X##_j]] * fac_i; \
+                l##X = l##X##_i_p1 + j##X; \
+                pv1[0] += c##X * cfac * v1_xyz[lx*l1l1+ly*l1+lz]; \
+            } \
+        } \
+    }
+
+/* VSIGMA_LOOP with x as the differentiated axis; accumulates one matrix element into pv1[0]. */
+static void _vsigma_loop_x(double* pv1, double* v1_xyz,
+                           double* pcx, double* pcy, double* pcz,
+                           double ai, double aj,
+                           int lx_i, int ly_i, int lz_i,
+                           int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VSIGMA_LOOP(x,y,z);
+}
+
+/* VSIGMA_LOOP with y as the differentiated axis; accumulates one matrix element into pv1[0]. */
+static void _vsigma_loop_y(double* pv1, double* v1_xyz,
+                           double* pcx, double* pcy, double* pcz,
+                           double ai, double aj,
+                           int lx_i, int ly_i, int lz_i,
+                           int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VSIGMA_LOOP(y,x,z);
+}
+
+/* VSIGMA_LOOP with z as the differentiated axis; accumulates one matrix element into pv1[0]. */
+static void _vsigma_loop_z(double* pv1, double* v1_xyz,
+                           double* pcx, double* pcy, double* pcz,
+                           double ai, double aj,
+                           int lx_i, int ly_i, int lz_i,
+                           int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    VSIGMA_LOOP(z,x,y);
+}
+
+/* Walk all Cartesian component pairs of (li, lj) and let _v1_loop accumulate one element of v1 per pair from v1_xyz. */
+static void _v1_xyz_to_v1(void (*_v1_loop)(double*, double*, double*, double*, double*,
+                                           double, double, int, int, int,
+                                           int, int, int, int, int),
+                          double* v1_xyz, double* v1,
+                          int li, int lj, double ai, double aj,
+                          double* ri, double* rj, double* cache)
+{
+    /* _v1_loop carries a full prototype: an empty "()" list is obsolescent and means "(void)" in C23, */
+    /* which would reject both the 15-argument call below and the _vrho/_vsigma loop functions passed in. */
+    int lx_i, ly_i, lz_i, lx_j, ly_j, lz_j;
+    double rij[3] = {ri[0] - rj[0], ri[1] - rj[1], ri[2] - rj[2]};
+
+    int l1 = li + lj + 2; /* row length of v1_xyz along each axis */
+    int l1l1 = l1 * l1;
+    double *coeff = cache; /* three coefficient tables (x, y, z) of dj doubles each */
+    int dj = _LEN_CART[lj+1];
+    cache += 3 * dj;
+
+    _get_dm_to_dm_xyz_coeff(coeff, rij, lj+1, cache); /* lj+1: the loops read coefficient rows up to l_j + 1 */
+
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    double *pv1 = v1;
+    for (lx_i = li; lx_i >= 0; lx_i--) {
+        for (ly_i = li-lx_i; ly_i >= 0; ly_i--) {
+            lz_i = li - lx_i - ly_i;
+            for (lx_j = lj; lx_j >= 0; lx_j--) {
+                for (ly_j = lj-lx_j; ly_j >= 0; ly_j--) {
+                    lz_j = lj - lx_j - ly_j;
+                    _v1_loop(pv1, v1_xyz, pcx, pcy, pcz, ai, aj,
+                             lx_i, ly_i, lz_i, lx_j, ly_j, lz_j, l1, l1l1);
+                    pv1 += 1; /* one output element per (i, j) component pair */
+                }
+            }
+        }
+    }
+}
+
+/*
+#define SUM_NABLA_I \
+        if (lx_i > 0) { \
+            pv1[0] += lx_i * cxyzj * v1x[(lx-1)*l1l1+ly*l1+lz]; \
+        } \
+        pv1[0] += fac_i * cxyzj * v1x[(lx+1)*l1l1+ly*l1+lz]; \
+        if (ly_i > 0) { \
+            pv1[0] += ly_i * cxyzj * v1y[lx*l1l1+(ly-1)*l1+lz]; \
+        } \
+        pv1[0] += fac_i * cxyzj * v1y[lx*l1l1+(ly+1)*l1+lz]; \
+        if (lz_i > 0) { \
+            pv1[0] += lz_i * cxyzj * v1z[lx*l1l1+ly*l1+lz-1]; \
+        } \
+        pv1[0] += fac_i * cxyzj * v1z[lx*l1l1+ly*l1+lz+1];
+*/
+/*
+static void _vsigma_loop_ip1ip2_x(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    int lx_j_m1 = lx_j - 1;
+    int lx_j_p1 = lx_j + 1;
+    double cxj, cyj, czj, cyzj, cxyzj;
+    double fac_i = -2.0 * ai;
+    double fac_j = -2.0 * aj;
+
+    for (jy = 0; jy <= ly_j; jy++) {
+        cyj = pcy[jy+_LEN_CART0[ly_j]];
+        ly = ly_i + jy;
+        for (jz = 0; jz <= lz_j; jz++) {
+            czj = pcz[jz+_LEN_CART0[lz_j]];
+            lz = lz_i + jz;
+            cyzj = cyj * czj;
+            for (jx = 0; jx <= lx_j_m1; jx++) {
+                cxj = pcx[jx+_LEN_CART0[lx_j_m1]] * lx_j;
+                cxyzj = cxj * cyzj;
+                lx = lx_i + jx;
+                SUM_NABLA_I;
+            }
+            for (jx = 0; jx <= lx_j_p1; jx++) {
+                cxj = pcx[jx+_LEN_CART0[lx_j_p1]] * fac_j;
+                cxyzj = cxj * cyzj;
+                lx = lx_i + jx;
+                SUM_NABLA_I;
+            }
+        }
+    }
+}
+*/
+/* Locals shared by _vsigma_loop_ip1ip2_{x,y,z}. The last line's backslash continues onto the next line, which must stay empty. */
+#define COMMON_INIT(x) \
+    int l##x##_i; /* working copy of the i exponent along axis x; the caller sets it to l_i0 + 1 and l_i0 - 1 */ \
+    int lx, ly, lz; \
+    int jx, jy, jz; \
+    int lx_j_m1 = lx_j - 1; \
+    int lx_j_p1 = lx_j + 1; \
+    int ly_j_m1 = ly_j - 1; \
+    int ly_j_p1 = ly_j + 1; \
+    int lz_j_m1 = lz_j - 1; \
+    int lz_j_p1 = lz_j + 1; \
+    double ci; /* factor from differentiating primitive i, read by SUM_NABLA_J */ \
+    double cxj, cyj, czj; \
+    double cyzj, cxzj, cxyj, cxyzj; \
+    double fac_i = -2.0 * ai; \
+    double fac_j = -2.0 * aj; \
+
+/* Adds to pv1[0] the terms with primitive j differentiated along axis x, read from table v1<x> and scaled by ci. Needs COMMON_INIT locals. */
+#define SUM_NABLA_J(x, y, z) \
+    for (j##y = 0; j##y <= l##y##_j; j##y++) { \
+        c##y##j = pc##y[j##y+_LEN_CART0[l##y##_j]]; \
+        l##y = l##y##_i + j##y; \
+        for (j##z = 0; j##z <= l##z##_j; j##z++) { \
+            c##z##j = pc##z[j##z+_LEN_CART0[l##z##_j]]; \
+            l##z = l##z##_i + j##z; \
+            c##y##z##j = c##y##j * c##z##j; \
+            for (j##x = 0; j##x <= l##x##_j_m1; j##x++) { /* lowered power: coefficient row l_j-1 times l_j */ \
+                c##x##j = pc##x[j##x+_LEN_CART0[l##x##_j_m1]] * l##x##_j; \
+                cxyzj = c##x##j * c##y##z##j; \
+                l##x = l##x##_i + j##x; \
+                pv1[0] += ci * cxyzj * v1##x[lx*l1l1+ly*l1+lz]; \
+            } \
+            for (j##x = 0; j##x <= l##x##_j_p1; j##x++) { /* raised power: coefficient row l_j+1 times -2*aj */ \
+                c##x##j = pc##x[j##x+_LEN_CART0[l##x##_j_p1]] * fac_j; \
+                cxyzj = c##x##j * c##y##z##j; \
+                l##x = l##x##_i + j##x; \
+                pv1[0] += ci * cxyzj * v1##x[lx*l1l1+ly*l1+lz]; \
+            } \
+        } \
+    }
+
+/* Adds to pv1[0]: x-derivative of primitive i (exponent lx_i0) times the sum over axes of (derivative of primitive j) * v1{x,y,z}. */
+static void _vsigma_loop_ip1ip2_x(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i0, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    COMMON_INIT(x);
+
+    lx_i = lx_i0 + 1; /* raised-power term, factor -2*ai */
+    ci = fac_i;
+    SUM_NABLA_J(x,y,z);
+    SUM_NABLA_J(y,x,z);
+    SUM_NABLA_J(z,x,y);
+
+    if (lx_i0 > 0) {
+        lx_i = lx_i0 - 1; /* lowered-power term, factor lx_i0 */
+        ci = lx_i0;
+        SUM_NABLA_J(x,y,z);
+        SUM_NABLA_J(y,x,z);
+        SUM_NABLA_J(z,x,y);
+    }
+}
+
+/*
+static void _vsigma_loop_ip1ip2_y(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    int ly_j_m1 = ly_j - 1;
+    int ly_j_p1 = ly_j + 1;
+    double cxj, cyj, czj, cxzj, cxyzj;
+    double fac_i = -2.0 * ai;
+    double fac_j = -2.0 * aj;
+
+    for (jx = 0; jx <= lx_j; jx++) {
+        cxj = pcx[jx+_LEN_CART0[lx_j]];
+        lx = lx_i + jx;
+        for (jz = 0; jz <= lz_j; jz++) {
+            czj = pcz[jz+_LEN_CART0[lz_j]];
+            lz = lz_i + jz;
+            cxzj = cxj * czj;
+            for (jy = 0; jy <= ly_j_m1; jy++) {
+                cyj = pcy[jy+_LEN_CART0[ly_j_m1]] * ly_j;
+                cxyzj = cyj * cxzj;
+                ly = ly_i + jy;
+                SUM_NABLA_I;
+            }
+            for (jy = 0; jy <= ly_j_p1; jy++) {
+                cyj = pcy[jy+_LEN_CART0[ly_j_p1]] * fac_j;
+                cxyzj = cyj * cxzj;
+                ly = ly_i + jy;
+                SUM_NABLA_I;
+            }
+        }
+    }
+}
+*/
+/* Adds to pv1[0]: y-derivative of primitive i (exponent ly_i0) times the sum over axes of (derivative of primitive j) * v1{x,y,z}. */
+static void _vsigma_loop_ip1ip2_y(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i0, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    COMMON_INIT(y);
+
+    ly_i = ly_i0 + 1; /* raised-power term, factor -2*ai */
+    ci = fac_i;
+    SUM_NABLA_J(x,y,z);
+    SUM_NABLA_J(y,x,z);
+    SUM_NABLA_J(z,x,y);
+
+    if (ly_i0 > 0) {
+        ly_i = ly_i0 - 1; /* lowered-power term, factor ly_i0 */
+        ci = ly_i0;
+        SUM_NABLA_J(x,y,z);
+        SUM_NABLA_J(y,x,z);
+        SUM_NABLA_J(z,x,y);
+    }
+}
+
+
+/*
+static void _vsigma_loop_ip1ip2_z(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    int lz_j_m1 = lz_j - 1;
+    int lz_j_p1 = lz_j + 1;
+    double cxj, cyj, czj, cxyj, cxyzj;
+    double fac_i = -2.0 * ai;
+    double fac_j = -2.0 * aj;
+
+    for (jx = 0; jx <= lx_j; jx++) {
+        cxj = pcx[jx+_LEN_CART0[lx_j]];
+        lx = lx_i + jx;
+        for (jy = 0; jy <= ly_j; jy++) {
+            cyj = pcy[jy+_LEN_CART0[ly_j]];
+            ly = ly_i + jy;
+            cxyj = cxj * cyj;
+            for (jz = 0; jz <= lz_j_m1; jz++) {
+                czj = pcz[jz+_LEN_CART0[lz_j_m1]] * lz_j;
+                cxyzj = czj * cxyj;
+                lz = lz_i + jz;
+                SUM_NABLA_I;
+            }
+            for (jz = 0; jz <= lz_j_p1; jz++) {
+                czj = pcz[jz+_LEN_CART0[lz_j_p1]] * fac_j;
+                cxyzj = czj * cxyj;
+                lz = lz_i + jz;
+                SUM_NABLA_I;
+            }
+        }
+    }
+}
+*/
+/* Adds to pv1[0]: z-derivative of primitive i (exponent lz_i0) times the sum over axes of (derivative of primitive j) * v1{x,y,z}. */
+static void _vsigma_loop_ip1ip2_z(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i0,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    COMMON_INIT(z);
+
+    lz_i = lz_i0 + 1; /* raised-power term, factor -2*ai */
+    ci = fac_i;
+    SUM_NABLA_J(x,y,z);
+    SUM_NABLA_J(y,x,z);
+    SUM_NABLA_J(z,x,y);
+
+    if (lz_i0 > 0) {
+        lz_i = lz_i0 - 1; /* lowered-power term, factor lz_i0 */
+        ci = lz_i0;
+        SUM_NABLA_J(x,y,z);
+        SUM_NABLA_J(y,x,z);
+        SUM_NABLA_J(z,x,y);
+    }
+}
+
+/* Walk all Cartesian component pairs of (li, lj) and let _v1_loop accumulate one element of v1 per pair from v1x, v1y, v1z. */
+static void _vsigma_ip1ip2(void (*_v1_loop)(double*, double*, double*, double*,
+                                            double*, double*, double*, double, double,
+                                            int, int, int, int, int, int, int, int),
+                           double* v1x, double* v1y, double* v1z, double* v1,
+                           int li, int lj, double ai, double aj,
+                           double* ri, double* rj, double* cache)
+{
+    /* _v1_loop carries a full prototype: an empty "()" list is obsolescent and means "(void)" in C23, */
+    /* which would reject both the 17-argument call below and the _vsigma_loop_ip1ip2_* functions passed in. */
+    int lx_i, ly_i, lz_i, lx_j, ly_j, lz_j;
+    double rij[3] = {ri[0] - rj[0], ri[1] - rj[1], ri[2] - rj[2]};
+
+    int topl = li + lj + 2; /* highest power read from v1x/v1y/v1z */
+    int l1 = topl + 1;
+    int l1l1 = l1 * l1;
+    double *coeff = cache; /* three coefficient tables (x, y, z) of dj doubles each */
+    int dj = _LEN_CART[lj+1];
+    cache += 3 * dj;
+
+    _get_dm_to_dm_xyz_coeff(coeff, rij, lj+1, cache); /* lj+1: the loops read coefficient rows up to l_j + 1 */
+
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    double *pv1 = v1;
+    /* v1 is filled in the nested order lx_i, ly_i, lx_j, ly_j, each running downwards. */
+    for (lx_i = li; lx_i >= 0; lx_i--) {
+        for (ly_i = li-lx_i; ly_i >= 0; ly_i--) {
+            lz_i = li - lx_i - ly_i;
+            for (lx_j = lj; lx_j >= 0; lx_j--) {
+                for (ly_j = lj-lx_j; ly_j >= 0; ly_j--) {
+                    lz_j = lj - lx_j - ly_j;
+                    _v1_loop(pv1, v1x, v1y, v1z, pcx, pcy, pcz, ai, aj,
+                             lx_i, ly_i, lz_i, lx_j, ly_j, lz_j, l1, l1l1);
+                    pv1 += 1; /* one output element per (i, j) component pair */
+                }
+            }
+        }
+    }
+}
+
+/* Adds to pv1[0] the second derivatives d2/dxdx, d2/dxdy, d2/dxdz of primitive i paired with v1x, v1y, v1z; j is not differentiated. */
+static void _vsigma_loop_lap1_x(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    double cxj, cyj, czj, cxyj, cxyzj;
+    double fac_x;
+    double fac_i = -2.0 * ai; /* factor of every raised-power term */
+
+    for (jx = 0; jx <= lx_j; jx++) {
+        cxj = pcx[jx+_LEN_CART0[lx_j]];
+        lx = lx_i + jx;
+        for (jy = 0; jy <= ly_j; jy++) {
+            cyj = pcy[jy+_LEN_CART0[ly_j]];
+            ly = ly_i + jy;
+            cxyj = cxj * cyj;
+            for (jz = 0; jz <= lz_j; jz++) {
+                czj = pcz[jz+_LEN_CART0[lz_j]];
+                lz = lz_i + jz;
+                cxyzj = cxyj * czj;
+
+                fac_x = lx_i + 1; /* x-derivative of the raised-power term, same power: -2*ai*(lx_i+1) */
+                pv1[0] += fac_x * fac_i * cxyzj * v1x[lx*l1l1+ly*l1+lz];
+                if (lx_i - 1 > 0) { /* lx_i*(lx_i-1) term, two powers down; only for lx_i >= 2 */
+                    fac_x = lx_i - 1;
+                    pv1[0] += fac_x * lx_i * cxyzj * v1x[(lx-2)*l1l1+ly*l1+lz];
+                }
+
+                if (lx_i > 0) { /* x power lowered by one, combined with the y and z derivatives */
+                    fac_x = lx_i;
+                    if (ly_i > 0) {
+                        pv1[0] += fac_x * ly_i * cxyzj * v1y[(lx-1)*l1l1+(ly-1)*l1+lz];
+                    }
+                    pv1[0] += fac_x * fac_i * cxyzj * v1y[(lx-1)*l1l1+(ly+1)*l1+lz];
+
+                    if (lz_i > 0) {
+                        pv1[0] += fac_x * lz_i * cxyzj * v1z[(lx-1)*l1l1+ly*l1+lz-1];
+                    }
+                    pv1[0] += fac_x * fac_i * cxyzj * v1z[(lx-1)*l1l1+ly*l1+lz+1];
+                }
+
+                fac_x = fac_i; /* x power raised by one, combined with the x, y and z derivatives */
+                if (lx_i > 0) {
+                    pv1[0] += fac_x * lx_i * cxyzj * v1x[lx*l1l1+ly*l1+lz];
+                }
+                pv1[0] += fac_x * fac_i * cxyzj * v1x[(lx+2)*l1l1+ly*l1+lz];
+
+                if (ly_i > 0) {
+                    pv1[0] += fac_x * ly_i * cxyzj * v1y[(lx+1)*l1l1+(ly-1)*l1+lz];
+                }
+                pv1[0] += fac_x * fac_i * cxyzj * v1y[(lx+1)*l1l1+(ly+1)*l1+lz];
+
+                if (lz_i > 0) {
+                    pv1[0] += fac_x * lz_i * cxyzj * v1z[(lx+1)*l1l1+ly*l1+lz-1];
+                }
+                pv1[0] += fac_x * fac_i * cxyzj * v1z[(lx+1)*l1l1+ly*l1+lz+1];
+            }
+        }
+    }
+}
+
+/* Adds to pv1[0] the second derivatives d2/dydx, d2/dydy, d2/dydz of primitive i paired with v1x, v1y, v1z; j is not differentiated. */
+static void _vsigma_loop_lap1_y(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    double cxj, cyj, czj, cxyj, cxyzj;
+    double fac_y;
+    double fac_i = -2.0 * ai; /* factor of every raised-power term */
+
+    for (jx = 0; jx <= lx_j; jx++) {
+        cxj = pcx[jx+_LEN_CART0[lx_j]];
+        lx = lx_i + jx;
+        for (jy = 0; jy <= ly_j; jy++) {
+            cyj = pcy[jy+_LEN_CART0[ly_j]];
+            ly = ly_i + jy;
+            cxyj = cxj * cyj;
+            for (jz = 0; jz <= lz_j; jz++) {
+                czj = pcz[jz+_LEN_CART0[lz_j]];
+                lz = lz_i + jz;
+                cxyzj = cxyj * czj;
+
+                fac_y = ly_i + 1; /* y-derivative of the raised-power term, same power: -2*ai*(ly_i+1) */
+                pv1[0] += fac_y * fac_i * cxyzj * v1y[lx*l1l1+ly*l1+lz];
+                if (ly_i - 1 > 0) { /* ly_i*(ly_i-1) term, two powers down; only for ly_i >= 2 */
+                    fac_y = ly_i - 1;
+                    pv1[0] += fac_y * ly_i * cxyzj * v1y[lx*l1l1+(ly-2)*l1+lz];
+                }
+
+                if (ly_i > 0) { /* y power lowered by one, combined with the x and z derivatives */
+                    fac_y = ly_i;
+                    if (lx_i > 0) {
+                        pv1[0] += fac_y * lx_i * cxyzj * v1x[(lx-1)*l1l1+(ly-1)*l1+lz];
+                    }
+                    pv1[0] += fac_y * fac_i * cxyzj * v1x[(lx+1)*l1l1+(ly-1)*l1+lz];
+
+                    if (lz_i > 0) {
+                        pv1[0] += fac_y * lz_i * cxyzj * v1z[lx*l1l1+(ly-1)*l1+lz-1];
+                    }
+                    pv1[0] += fac_y * fac_i * cxyzj * v1z[lx*l1l1+(ly-1)*l1+lz+1];
+                }
+
+                fac_y = fac_i; /* y power raised by one, combined with the x, y and z derivatives */
+                if (lx_i > 0) {
+                    pv1[0] += fac_y * lx_i * cxyzj * v1x[(lx-1)*l1l1+(ly+1)*l1+lz];
+                }
+                pv1[0] += fac_y * fac_i * cxyzj * v1x[(lx+1)*l1l1+(ly+1)*l1+lz];
+
+                if (ly_i > 0) {
+                    pv1[0] += fac_y * ly_i * cxyzj * v1y[lx*l1l1+ly*l1+lz];
+                }
+                pv1[0] += fac_y * fac_i * cxyzj * v1y[lx*l1l1+(ly+2)*l1+lz];
+
+                if (lz_i > 0) {
+                    pv1[0] += fac_y * lz_i * cxyzj * v1z[lx*l1l1+(ly+1)*l1+lz-1];
+                }
+                pv1[0] += fac_y * fac_i * cxyzj * v1z[lx*l1l1+(ly+1)*l1+lz+1];
+            }
+        }
+    }
+}
+
+/* Adds to pv1[0] the second derivatives d2/dzdx, d2/dzdy, d2/dzdz of primitive i paired with v1x, v1y, v1z; j is not differentiated. */
+static void _vsigma_loop_lap1_z(double* pv1, double* v1x, double* v1y, double* v1z,
+                       double* pcx, double* pcy, double* pcz,
+                       double ai, double aj,
+                       int lx_i, int ly_i, int lz_i,
+                       int lx_j, int ly_j, int lz_j, int l1, int l1l1)
+{
+    int lx, ly, lz;
+    int jx, jy, jz;
+    double cxj, cyj, czj, cxyj, cxyzj;
+    double fac_z;
+    double fac_i = -2.0 * ai; /* factor of every raised-power term */
+
+    for (jx = 0; jx <= lx_j; jx++) {
+        cxj = pcx[jx+_LEN_CART0[lx_j]];
+        lx = lx_i + jx;
+        for (jy = 0; jy <= ly_j; jy++) {
+            cyj = pcy[jy+_LEN_CART0[ly_j]];
+            ly = ly_i + jy;
+            cxyj = cxj * cyj;
+            for (jz = 0; jz <= lz_j; jz++) {
+                czj = pcz[jz+_LEN_CART0[lz_j]];
+                lz = lz_i + jz;
+                cxyzj = cxyj * czj;
+
+                fac_z = lz_i + 1; /* z-derivative of the raised-power term, same power: -2*ai*(lz_i+1) */
+                pv1[0] += fac_z * fac_i * cxyzj * v1z[lx*l1l1+ly*l1+lz];
+                if (lz_i - 1 > 0) { /* lz_i*(lz_i-1) term, two powers down; only for lz_i >= 2 */
+                    fac_z = lz_i - 1;
+                    pv1[0] += fac_z * lz_i * cxyzj * v1z[lx*l1l1+ly*l1+lz-2];
+                }
+
+                if (lz_i > 0) { /* z power lowered by one, combined with the x and y derivatives */
+                    fac_z = lz_i;
+                    if (lx_i > 0) {
+                        pv1[0] += fac_z * lx_i * cxyzj * v1x[(lx-1)*l1l1+ly*l1+lz-1];
+                    }
+                    pv1[0] += fac_z * fac_i * cxyzj * v1x[(lx+1)*l1l1+ly*l1+lz-1];
+
+                    if (ly_i > 0) {
+                        pv1[0] += fac_z * ly_i * cxyzj * v1y[lx*l1l1+(ly-1)*l1+lz-1];
+                    }
+                    pv1[0] += fac_z * fac_i * cxyzj * v1y[lx*l1l1+(ly+1)*l1+lz-1];
+                }
+
+                fac_z = fac_i; /* z power raised by one, combined with the x, y and z derivatives */
+                if (lx_i > 0) {
+                    pv1[0] += fac_z * lx_i * cxyzj * v1x[(lx-1)*l1l1+ly*l1+lz+1];
+                }
+                pv1[0] += fac_z * fac_i * cxyzj * v1x[(lx+1)*l1l1+ly*l1+lz+1];
+
+                if (ly_i > 0) {
+                    pv1[0] += fac_z * ly_i * cxyzj * v1y[lx*l1l1+(ly-1)*l1+lz+1];
+                }
+                pv1[0] += fac_z * fac_i * cxyzj * v1y[lx*l1l1+(ly+1)*l1+lz+1];
+
+                if (lz_i > 0) {
+                    pv1[0] += fac_z * lz_i * cxyzj * v1z[lx*l1l1+ly*l1+lz];
+                }
+                pv1[0] += fac_z * fac_i * cxyzj * v1z[lx*l1l1+ly*l1+lz+2];
+            }
+        }
+    }
+}
+
+/* Walk all Cartesian component pairs of (li, lj) and let _v1_loop accumulate one element of v1 per pair from v1x, v1y, v1z. */
+static void _vsigma_lap1(void (*_v1_loop)(double*, double*, double*, double*,
+                                          double*, double*, double*, double, double,
+                                          int, int, int, int, int, int, int, int),
+                         double* v1x, double* v1y, double* v1z, double* v1,
+                         int li, int lj, double ai, double aj,
+                         double* ri, double* rj, double* cache)
+{
+    /* _v1_loop carries a full prototype: an empty "()" list is obsolescent and means "(void)" in C23, */
+    /* which would reject both the 17-argument call below and the _vsigma_loop_lap1_* functions passed in. */
+    int lx_i, ly_i, lz_i, lx_j, ly_j, lz_j;
+    double rij[3] = {ri[0] - rj[0], ri[1] - rj[1], ri[2] - rj[2]};
+
+    int topl = li + lj + 2; /* highest power read from v1x/v1y/v1z */
+    int l1 = topl + 1;
+    int l1l1 = l1 * l1;
+    double *coeff = cache; /* three coefficient tables (x, y, z) of dj doubles each */
+    int dj = _LEN_CART[lj];
+    cache += 3 * dj;
+
+    _get_dm_to_dm_xyz_coeff(coeff, rij, lj, cache); /* lj suffices: the lap1 loops never differentiate primitive j */
+
+    double *pcx = coeff;
+    double *pcy = pcx + dj;
+    double *pcz = pcy + dj;
+    double *pv1 = v1;
+    /* v1 is filled in the nested order lx_i, ly_i, lx_j, ly_j, each running downwards. */
+    for (lx_i = li; lx_i >= 0; lx_i--) {
+        for (ly_i = li-lx_i; ly_i >= 0; ly_i--) {
+            lz_i = li - lx_i - ly_i;
+            for (lx_j = lj; lx_j >= 0; lx_j--) {
+                for (ly_j = lj-lx_j; ly_j >= 0; ly_j--) {
+                    lz_j = lj - lx_j - ly_j;
+                    _v1_loop(pv1, v1x, v1y, v1z, pcx, pcy, pcz, ai, aj,
+                             lx_i, ly_i, lz_i, lx_j, ly_j, lz_j, l1, l1l1);
+                    pv1 += 1; /* one output element per (i, j) component pair */
+                }
+            }
+        }
+    }
+}
+
+/* GGA block for one primitive pair: plain integral of the first grid in weights plus gradient terms from the next three. Returns 0 if skipped, else 1. */
+int eval_mat_gga_orth(double *weights, double *out, int comp,
+                      int li, int lj, double ai, double aj,
+                      double *ri, double *rj, double fac, double cutoff,
+                      int dimension, double* dh, double *a, double *b,
+                      int *mesh, double *cache)
+{
+        int topl = li + lj + 1; /* one above li+lj: the gradient loops read powers raised by one */
+        int l1 = topl+1;
+        int l1l1l1 = l1 * l1 * l1;
+        double *mat_xyz = cache; /* scratch for the polynomial integrals, l1^3 doubles */
+        cache += l1l1l1;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+        if (data_size == 0) { /* nothing was set up for this pair: leave out untouched */
+                return 0;
+        }
+        cache += data_size; /* step over what init_orth_data stored in cache */
+
+        size_t ngrids = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+        double *vx = weights + ngrids; /* weights holds four consecutive grids of ngrids doubles */
+        double *vy = vx + ngrids;
+        double *vz = vy + ngrids;
+
+        memset(mat_xyz, 0, l1l1l1*sizeof(double)); /* _orth_ints accumulates, so clear before every pass */
+        _orth_ints(mat_xyz, weights, li+lj, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache); /* the plain term only needs powers up to li+lj */
+        _dm_xyz_to_dm(mat_xyz, out, li, lj, ri, rj, cache); /* NOTE(review): overwrite vs. accumulate is decided in _dm_xyz_to_dm - confirm */
+
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, vx, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _v1_xyz_to_v1(_vsigma_loop_x, mat_xyz, out, li, lj, ai, aj, ri, rj, cache); /* adds the x-gradient term to out */
+
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, vy, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _v1_xyz_to_v1(_vsigma_loop_y, mat_xyz, out, li, lj, ai, aj, ri, rj, cache); /* adds the y-gradient term to out */
+
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, vz, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _v1_xyz_to_v1(_vsigma_loop_z, mat_xyz, out, li, lj, ai, aj, ri, rj, cache); /* adds the z-gradient term to out */
+
+        return 1;
+}
+
+/* Orthogonal-mesh evaluator for the pair (li, ai, ri) x (lj, aj, rj) using a single grid array: _orth_ints on `weights` at order li+lj, then _dm_xyz_to_dm into `out`. Returns 0 if init_orth_data reports no data, else 1. comp, dimension, a and b are not read here. */
+int eval_mat_lda_orth(double *weights, double *out, int comp,
+                      int li, int lj, double ai, double aj,
+                      double *ri, double *rj, double fac, double cutoff,
+                      int dimension, double* dh, double *a, double *b,
+                      int *mesh, double *cache)
+{
+        int topl = li + lj;
+        int l1 = topl+1;
+        int l1l1l1 = l1*l1*l1;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+
+        if (data_size == 0) {
+                return 0;
+        }
+        cache += data_size;  /* step past the amount init_orth_data reported */
+
+        double *dm_xyz = cache;  /* next l1^3 doubles of scratch hold the _orth_ints result */
+        cache += l1l1l1;
+
+        memset(dm_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(dm_xyz, weights, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+
+        _dm_xyz_to_dm(dm_xyz, out, li, lj, ri, rj, cache);
+        return 1;
+}
+
+/* Orthogonal-mesh evaluator producing three output blocks (x, y, z kernels) from one grid array: a single _orth_ints pass at order li+lj+1 feeds three _v1_xyz_to_v1 calls. Returns 0 if init_orth_data reports no data, else 1. comp, dimension, a and b are not read here. */
+int eval_mat_lda_orth_ip1(double *weights, double *out, int comp,
+                          int li, int lj, double ai, double aj,
+                          double *ri, double *rj, double fac, double cutoff,
+                          int dimension, double* dh, double *a, double *b,
+                          int *mesh, double *cache)
+{
+        int dij = _LEN_CART[li] * _LEN_CART[lj];  /* length of one output block */
+        int topl = li + lj + 1;
+        int l1 = topl+1;
+        int l1l1l1 = l1*l1*l1;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+        if (data_size == 0) {
+                return 0;
+        }
+        cache += data_size;  /* step past the amount init_orth_data reported */
+
+        double *mat_xyz = cache;  /* next l1^3 doubles of scratch */
+        cache += l1l1l1;
+        double *pout_x = out;  /* out is laid out as three consecutive dij-long blocks */
+        double *pout_y = pout_x + dij;
+        double *pout_z = pout_y + dij;
+
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, weights, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _v1_xyz_to_v1(_vrho_loop_ip1_x, mat_xyz, pout_x, li, lj, ai, aj, ri, rj, cache);  /* the same mat_xyz is reused by all three kernels */
+        _v1_xyz_to_v1(_vrho_loop_ip1_y, mat_xyz, pout_y, li, lj, ai, aj, ri, rj, cache);
+        _v1_xyz_to_v1(_vrho_loop_ip1_z, mat_xyz, pout_z, li, lj, ai, aj, ri, rj, cache);
+        return 1;
+}
+
+/* Orthogonal-mesh evaluator producing three output blocks (x, y, z) from four stacked grid arrays: a "vrho" stage on `weights` at order li+lj+1, then a "vsigma" stage on vx/vy/vz at order li+lj+2 via _vsigma_ip1ip2 and _vsigma_lap1. Returns 0 if init_orth_data reports no data, else 1. comp, dimension, a and b are not read here. */
+int eval_mat_gga_orth_ip1(double *weights, double *out, int comp,
+                          int li, int lj, double ai, double aj,
+                          double *ri, double *rj, double fac, double cutoff,
+                          int dimension, double* dh, double *a, double *b,
+                          int *mesh, double *cache)
+{
+        int dij = _LEN_CART[li] * _LEN_CART[lj];  /* length of one output block */
+        int topl = li + lj + 2;
+        int l1 = topl+1;
+        int l1l1l1 = l1*l1*l1;
+        int grid_slice[6];
+        double *xs_exp, *ys_exp, *zs_exp;
+
+        int data_size = init_orth_data(&xs_exp, &ys_exp, &zs_exp,
+                                       grid_slice, dh, mesh, topl, cutoff,
+                                       ai, aj, ri, rj, cache);
+        if (data_size == 0) {
+                return 0;
+        }
+        cache += data_size;  /* step past the amount init_orth_data reported */
+
+        double *mat_xyz = cache;
+        double *mat_x = mat_xyz;  /* mat_x aliases mat_xyz: the vrho result is overwritten by the vsigma stage */
+        double *mat_y = mat_x + l1l1l1;
+        double *mat_z = mat_y + l1l1l1;
+        cache += l1l1l1*3;  /* three l1^3 buffers are taken from scratch */
+        double *pout_x = out;  /* out is laid out as three consecutive dij-long blocks */
+        double *pout_y = pout_x + dij;
+        double *pout_z = pout_y + dij;
+
+        size_t ngrids = ((size_t)mesh[0]) * mesh[1] * mesh[2];
+        double *vx = weights + ngrids;  /* weights is read as four consecutive ngrids-long arrays */
+        double *vy = vx + ngrids;
+        double *vz = vy + ngrids;
+
+        //vrho part
+        memset(mat_xyz, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_xyz, weights, topl-1, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+        _v1_xyz_to_v1(_vrho_loop_ip1_x, mat_xyz, pout_x, li, lj, ai, aj, ri, rj, cache);  /* must run before mat_x is re-zeroed below */
+        _v1_xyz_to_v1(_vrho_loop_ip1_y, mat_xyz, pout_y, li, lj, ai, aj, ri, rj, cache);
+        _v1_xyz_to_v1(_vrho_loop_ip1_z, mat_xyz, pout_z, li, lj, ai, aj, ri, rj, cache);
+
+        //vsigma part
+        memset(mat_x, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_x, vx, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+
+        memset(mat_y, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_y, vy, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+
+        memset(mat_z, 0, l1l1l1*sizeof(double));
+        _orth_ints(mat_z, vz, topl, fac, xs_exp, ys_exp, zs_exp,
+                   grid_slice, mesh, cache);
+
+        _vsigma_ip1ip2(_vsigma_loop_ip1ip2_x, mat_x, mat_y, mat_z,  /* all three buffers feed every output block */
+                       pout_x, li, lj, ai, aj, ri, rj, cache);
+        _vsigma_ip1ip2(_vsigma_loop_ip1ip2_y, mat_x, mat_y, mat_z,
+                       pout_y, li, lj, ai, aj, ri, rj, cache);
+        _vsigma_ip1ip2(_vsigma_loop_ip1ip2_z, mat_x, mat_y, mat_z,
+                       pout_z, li, lj, ai, aj, ri, rj, cache);
+
+        _vsigma_lap1(_vsigma_loop_lap1_x, mat_x, mat_y, mat_z,
+                     pout_x, li, lj, ai, aj, ri, rj, cache);
+        _vsigma_lap1(_vsigma_loop_lap1_y, mat_x, mat_y, mat_z,
+                     pout_y, li, lj, ai, aj, ri, rj, cache);
+        _vsigma_lap1(_vsigma_loop_lap1_z, mat_x, mat_y, mat_z,
+                     pout_z, li, lj, ai, aj, ri, rj, cache);
+        return 1;
+}
+
+/* Process one PGFPair task: screen the primitive pair, call eval_ints into a zeroed scratch block, and add that block into `mat` at the (ipgf, jpgf) sub-block of each of the `comp` components. Returns early, leaving `mat` untouched, when either screening test rejects the pair. */
+void _apply_ints(int (*eval_ints)(), double *weights, double *mat,
+                        PGFPair* pgfpair, int comp, double fac, int dimension,
+                        double* dh, double *a, double *b, int *mesh,
+                        double* ish_gto_norm, double* jsh_gto_norm,
+                        int *ish_atm, int *ish_bas, double *ish_env,
+                        int *jsh_atm, int *jsh_bas, double *jsh_env,
+                        double* Ls, double *cache)
+{
+    int i_sh = pgfpair->ish;
+    int j_sh = pgfpair->jsh;
+    int ipgf = pgfpair->ipgf;
+    int jpgf = pgfpair->jpgf;
+    int iL = pgfpair->iL;
+    double cutoff = pgfpair->radius;  /* forwarded to eval_ints as its cutoff argument */
+
+    int li = ish_bas[ANG_OF+i_sh*BAS_SLOTS];
+    int lj = jsh_bas[ANG_OF+j_sh*BAS_SLOTS];
+    int di = _LEN_CART[li];
+    int dj = _LEN_CART[lj];
+
+    int ish_nprim = ish_bas[NPRIM_OF+i_sh*BAS_SLOTS];
+    int jsh_nprim = jsh_bas[NPRIM_OF+j_sh*BAS_SLOTS];
+    int naoi = ish_nprim * di;  /* rows of one component of mat */
+    int naoj = jsh_nprim * dj;  /* row stride of mat */
+
+    double *ri = ish_env + ish_atm[PTR_COORD+ish_bas[ATOM_OF+i_sh*BAS_SLOTS]*ATM_SLOTS];
+    double *rj = jsh_env + jsh_atm[PTR_COORD+jsh_bas[ATOM_OF+j_sh*BAS_SLOTS]*ATM_SLOTS];
+    double *rL = Ls + iL*3;  /* Ls is read as consecutive 3-vectors, selected by iL */
+    double rjL[3];
+    rjL[0] = rj[0] + rL[0];  /* j centre shifted by the selected vector */
+    rjL[1] = rj[1] + rL[1];
+    rjL[2] = rj[2] + rL[2];
+
+    double ai = ish_env[ish_bas[PTR_EXP+i_sh*BAS_SLOTS]+ipgf];
+    double aj = jsh_env[jsh_bas[PTR_EXP+j_sh*BAS_SLOTS]+jpgf];
+    double ci = ish_gto_norm[ipgf];
+    double cj = jsh_gto_norm[jpgf];
+    double aij = ai + aj;
+    double rrij = CINTsquare_dist(ri, rjL);  /* presumably the squared distance between ri and rjL -- confirm against libcint */
+    double eij = (ai * aj / aij) * rrij;
+    if (eij > EIJCUTOFF) {
+        return;
+    }
+    fac *= exp(-eij) * ci * cj * CINTcommon_fac_sp(li) * CINTcommon_fac_sp(lj);
+    if (fac < ish_env[PTR_EXPDROP] && fac < jsh_env[PTR_EXPDROP]) {  /* skipped only when below both thresholds */
+        return;
+    }
+
+    double *out = cache;  /* first comp*di*dj doubles of scratch receive the eval_ints result */
+    memset(out, 0, comp*di*dj*sizeof(double));
+    cache += comp * di * dj;
+
+    int value = (*eval_ints)(weights, out, comp, li, lj, ai, aj, ri, rjL,
+                             fac, cutoff, dimension, dh, a, b, mesh, cache);
+
+    double *pmat = mat + ipgf*di*naoj + jpgf*dj;  /* top-left element of the (ipgf, jpgf) sub-block */
+    if (value != 0) {  /* the eval_mat_* functions in this file return 0 when there is nothing to add */
+        int i, j, ic;
+        for (ic = 0; ic < comp; ic++) {
+            for (i = 0; i < di; i++) {
+                #pragma omp simd
+                for (j = 0; j < dj; j++) {
+                    pmat[i*naoj+j] += out[i*dj+j];
+                } 
+            }
+            pmat += naoi * naoj;  /* next component of mat */
+            out += di * dj;  /* next component of the scratch result */
+        }
+    }
+}
+
+/* Size, in doubles, of the scratch buffer that grid_integrate_drv allocates in each thread; built from the largest angular momentum `l`, the largest nprim/nctr, the mesh and the grid count returned by get_max_num_grid_orth(dh, radius). */
+static size_t _ints_cache_size(int l, int nprim, int nctr, int* mesh, double radius, double* dh, int comp)
+{
+    const size_t ngrid_max = get_max_num_grid_orth(dh, radius);
+    const int mesh_max = MAX(MAX(mesh[0], mesh[1]), mesh[2]);
+
+    /* comp == 3 reserves one extra polynomial order */
+    const int n1 = (comp == 3) ? 2 * l + 2 : 2 * l + 1;
+    const int n2 = n1 * n1;
+    const int nc = _LEN_CART[n1]; /* looked up at n1 rather than l, for headroom */
+
+    /* two alternative scratch estimates; only the larger one is counted */
+    const size_t need_components = n1 * ngrid_max + ngrid_max;
+    size_t need_ints;
+    if (ngrid_max < mesh_max) {
+        need_ints = (n1 + n2) * ngrid_max;
+    } else {
+        need_ints = n1*mesh[2] + n2*mesh[0];
+    }
+
+    size_t total = MAX(need_components, need_ints);
+    total += comp * nprim * nprim * nc * nc;       /* dm_cart */
+    total += comp * nc * nc;                       /* out */
+    total += n1 * (mesh[0] + mesh[1] + mesh[2]);   /* xs_exp, ys_exp, zs_exp */
+    total += n2 * n1;                              /* dm_xyz */
+    total += 3 * (nc + n1);                        /* _dm_xyz_to_dm */
+    total += nctr * nc * nprim * nc;
+
+    /* NOTE(review): eval_mat_gga_orth_ip1 takes three (li+lj+3)^3 buffers from the cache, */
+    /* while the dm_xyz term above is a single n1^3 with n1 <= 2*l+2; presumably the */
+    /* nc-based terms supply the slack -- confirm against init_orth_data and _orth_ints. */
+    return total;
+}
+
+/* Size, in doubles, of the per-thread scratch buffer used by int_gauss_charge_v_rs; same terms as the l-dependent sizing but with the angular momentum fixed at 0 and no dm_cart/out terms. */
+static size_t _ints_core_cache_size(int* mesh, double radius, double* dh, int comp)
+{
+    const size_t ngrid_max = get_max_num_grid_orth(dh, radius);
+    const int mesh_max = MAX(MAX(mesh[0], mesh[1]), mesh[2]);
+
+    /* order count for l = 0: 1, or 2 when comp == 3 */
+    const int n1 = (comp == 3) ? 2 : 1;
+    const int n2 = n1 * n1;
+    const int nc = _LEN_CART[n1];
+
+    /* two alternative scratch estimates; only the larger one is counted */
+    const size_t need_components = n1 * ngrid_max + ngrid_max;
+    size_t need_ints;
+    if (ngrid_max < mesh_max) {
+        need_ints = (n1 + n2) * ngrid_max;
+    } else {
+        need_ints = n1*mesh[2] + n2*mesh[0];
+    }
+
+    size_t total = MAX(need_components, need_ints);
+    total += n1 * (mesh[0] + mesh[1] + mesh[2]);   /* per-axis tables, n1 entries per mesh point */
+    total += n2 * n1;                              /* n1^3 polynomial buffer */
+    total += 3 * (nc + n1);
+
+    return total;
+}
+
+/* Driver for one grid level of a TaskList: for every task block reported by get_task_loc, accumulate all of its PGFPair tasks through _apply_ints into a per-thread buffer, then pass that buffer to transform_dm_inverse (and fill_tril when hermi == 1 and ish != jsh) to update `mat`. Returns immediately when the level has no tasks. */
+void grid_integrate_drv(int (*eval_ints)(), double* mat, double* weights, TaskList** task_list,
+                        int comp, int hermi, int grid_level, 
+                        int *shls_slice, int* ish_ao_loc, int* jsh_ao_loc,
+                        int dimension, double* Ls, double* a, double* b,
+                        int* ish_atm, int* ish_bas, double* ish_env,
+                        int* jsh_atm, int* jsh_bas, double* jsh_env, int cart)
+{
+    TaskList* tl = *task_list;
+    GridLevel_Info* gridlevel_info = tl->gridlevel_info;
+    Task *task = (tl->tasks)[grid_level];
+    int ntasks = task->ntasks;
+    if (ntasks <= 0) {  /* nothing has been allocated yet, so a plain return is safe */
+        return;
+    }
+    double max_radius = task->radius;
+    PGFPair **pgfpairs = task->pgfpairs;
+    int* mesh = gridlevel_info->mesh + grid_level*3;  /* three mesh dimensions stored per grid level */
+
+    double dh[9];
+    get_grid_spacing(dh, a, mesh);
+
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+    //const int nijsh = nish * njsh;
+    const int naoi = ish_ao_loc[ish1] - ish_ao_loc[ish0];
+    const int naoj = jsh_ao_loc[jsh1] - jsh_ao_loc[jsh0];
+
+    int ish_lmax = get_lmax(ish0, ish1, ish_bas);
+    int jsh_lmax = ish_lmax;  /* hermi == 1 reuses every i-side quantity for the j side */
+    if (hermi != 1) {
+        jsh_lmax = get_lmax(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nprim_max = get_nprim_max(ish0, ish1, ish_bas);
+    int jsh_nprim_max = ish_nprim_max;
+    if (hermi != 1) {
+        jsh_nprim_max = get_nprim_max(jsh0, jsh1, jsh_bas);
+    }
+
+    int ish_nctr_max = get_nctr_max(ish0, ish1, ish_bas);
+    int jsh_nctr_max = ish_nctr_max;
+    if (hermi != 1) {
+        jsh_nctr_max = get_nctr_max(jsh0, jsh1, jsh_bas);
+    }
+
+    double **gto_norm_i = (double**) malloc(sizeof(double*) * nish);  /* NOTE(review): malloc results in this function are not checked */
+    double **cart2sph_coeff_i = (double**) malloc(sizeof(double*) * nish);
+    get_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1, ish_bas, ish_env, cart);
+    double **gto_norm_j = gto_norm_i;  /* aliases the i tables unless hermi != 1 */
+    double **cart2sph_coeff_j = cart2sph_coeff_i;
+    if (hermi != 1) {
+        gto_norm_j = (double**) malloc(sizeof(double*) * njsh);
+        cart2sph_coeff_j = (double**) malloc(sizeof(double*) * njsh);
+        get_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1, jsh_bas, jsh_env, cart);
+    }
+
+    int *task_loc;  /* filled in by get_task_loc; consumed below as (start, end) pairs of task indices */
+    int nblock = get_task_loc(&task_loc, pgfpairs, ntasks, ish0, ish1, jsh0, jsh1, hermi);
+
+    size_t cache_size = _ints_cache_size(MAX(ish_lmax,jsh_lmax),
+                                         MAX(ish_nprim_max, jsh_nprim_max),
+                                         MAX(ish_nctr_max, jsh_nctr_max), 
+                                         mesh, max_radius, dh, comp);
+
+#pragma omp parallel
+{
+    int ish, jsh, itask, iblock;
+    int li, lj, ish_nprim, jsh_nprim;
+    PGFPair *pgfpair = NULL;
+    double *ptr_gto_norm_i, *ptr_gto_norm_j;
+    double *cache0 = malloc(sizeof(double) * cache_size);  /* one scratch buffer per thread, freed after the loop */
+    double *dm_cart = cache0;  /* accumulation buffer sits at the front of the scratch */
+    int len_dm_cart = comp*ish_nprim_max*_LEN_CART[ish_lmax]*jsh_nprim_max*_LEN_CART[jsh_lmax];
+    double *cache = dm_cart + len_dm_cart;  /* remainder goes to _apply_ints and transform_dm_inverse */
+
+    #pragma omp for schedule(dynamic)
+    for (iblock = 0; iblock < nblock; iblock+=2) {
+        itask = task_loc[iblock];
+        pgfpair = pgfpairs[itask];  /* the first task of the block supplies ish/jsh for the whole block */
+        ish = pgfpair->ish;
+        jsh = pgfpair->jsh;
+        ptr_gto_norm_i = gto_norm_i[ish];  /* NOTE(review): tables hold nish/njsh entries but are indexed by pgfpair->ish/jsh -- confirm the index base matches get_cart2sph_coeff */
+        ptr_gto_norm_j = gto_norm_j[jsh];
+        li = ish_bas[ANG_OF+ish*BAS_SLOTS];
+        lj = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+        ish_nprim = ish_bas[NPRIM_OF+ish*BAS_SLOTS];
+        jsh_nprim = jsh_bas[NPRIM_OF+jsh*BAS_SLOTS];
+        len_dm_cart = comp*ish_nprim*_LEN_CART[li]*jsh_nprim*_LEN_CART[lj];  /* only the part used by this shell pair is cleared */
+        memset(dm_cart, 0, len_dm_cart * sizeof(double));
+        for (; itask < task_loc[iblock+1]; itask++) {
+            pgfpair = pgfpairs[itask];
+            _apply_ints(eval_ints, weights, dm_cart, pgfpair, comp, 1.0, dimension, dh, a, b, mesh,
+                        ptr_gto_norm_i, ptr_gto_norm_j, ish_atm, ish_bas, ish_env,
+                        jsh_atm, jsh_bas, jsh_env, Ls, cache);
+        }
+        transform_dm_inverse(dm_cart, mat, comp,
+                             cart2sph_coeff_i[ish], cart2sph_coeff_j[jsh],
+                             ish_ao_loc, jsh_ao_loc, ish_bas, jsh_bas,
+                             ish, jsh, ish0, jsh0, naoi, naoj, cache);
+        if (hermi == 1 && ish != jsh) {
+            fill_tril(mat, comp, ish_ao_loc, jsh_ao_loc,
+                      ish, jsh, ish0, jsh0, naoi, naoj);
+        }
+    }
+    free(cache0);
+}
+
+    if (task_loc) {
+        free(task_loc);
+    }
+    del_cart2sph_coeff(cart2sph_coeff_i, gto_norm_i, ish0, ish1);
+    if (hermi != 1) {  /* the j tables are separate allocations only in this case */
+        del_cart2sph_coeff(cart2sph_coeff_j, gto_norm_j, jsh0, jsh1);
+    }
+}
+
+/* For every entry of `bas`, call eval_ints with li = lj = 0, exponents (alpha, 0.0), both centres at the entry's atom, prefactor -charge*coeff and the atom's PTR_RADIUS value as cutoff; results go to out + ia*comp. The return value of eval_ints is ignored. */
+void int_gauss_charge_v_rs(int (*eval_ints)(), double* out, double* v_rs, int comp,
+                           int* atm, int* bas, int nbas, double* env,
+                           int* mesh, int dimension, double* a, double* b, double max_radius)
+{
+    double dh[9];
+    get_grid_spacing(dh, a, mesh);
+
+    size_t cache_size = _ints_core_cache_size(mesh, max_radius, dh, comp);
+
+#pragma omp parallel
+{
+    int ia, ib;
+    double alpha, coeff, charge, rad, fac;
+    double *r0;
+    double *cache = (double*) malloc(sizeof(double) * cache_size);  /* per-thread scratch; NOTE(review): result is not checked */
+    #pragma omp for schedule(static)
+    for (ib = 0; ib < nbas; ib++) {
+        ia = bas[ib*BAS_SLOTS+ATOM_OF];
+        alpha = env[bas[ib*BAS_SLOTS+PTR_EXP]];  /* only the first exponent of the entry is read */
+        coeff = env[bas[ib*BAS_SLOTS+PTR_COEFF]];  /* only the first coefficient of the entry is read */
+        charge = (double)atm[ia*ATM_SLOTS+CHARGE_OF];
+        r0 = env + atm[ia*ATM_SLOTS+PTR_COORD];
+        fac = -charge * coeff;
+        rad = env[atm[ia*ATM_SLOTS+PTR_RADIUS]];
+        (*eval_ints)(v_rs, out+ia*comp, comp, 0, 0, alpha, 0.0, r0, r0, 
+                     fac, rad, dimension, dh, a, b, mesh, cache);  /* NOTE(review): output slot is per atom; presumably one bas entry per atom, otherwise threads could share a slot -- confirm */
+    }
+    free(cache);
+}
+}
diff --git a/pyscf/lib/dft/libxc_itrf.c b/pyscf/lib/dft/libxc_itrf.c
index 76d7497980..badeab597a 100644
--- a/pyscf/lib/dft/libxc_itrf.c
+++ b/pyscf/lib/dft/libxc_itrf.c
@@ -15,6 +15,7 @@
  *
  * Authors: Qiming Sun <osirpt.sun@gmail.com>
  *          Susi Lehtola <susi.lehtola@gmail.com>
+ *          Xing Zhang <zhangxing.nju@gmail.com>
  *
  * libxc from
  * http://www.tddft.org/programs/octopus/wiki/index.php/Libxc:manual
@@ -24,7 +25,10 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <xc.h>
+#include "config.h"
 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+#define MAX_THREADS     256
 
 // TODO: register python signal
 #define raise_error     return
@@ -83,13 +87,13 @@
  * In spin restricted case (spin == 1), rho_u is assumed to be the
  * spin-free quantities, rho_d is not used.
  */
-static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
+static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np, int ld_rho_u)
 {
         int i;
         double *sigma, *tau;
         double *gxu, *gyu, *gzu, *gxd, *gyd, *gzd;
         double *tau_u, *tau_d;
-        double *rho_d = rho_u + np * nvar;
+        double *rho_d = rho_u + ld_rho_u * nvar;
 
         switch (nvar) {
         case LDA_NVAR:
@@ -107,12 +111,12 @@ static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
         case GGA_NVAR:
                 if (spin == 1) {
                         sigma = rho + np * 2;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
-                        gxd = rho_d + np;
-                        gyd = rho_d + np * 2;
-                        gzd = rho_d + np * 3;
+                        gxu = rho_u + ld_rho_u;
+                        gyu = rho_u + ld_rho_u * 2;
+                        gzu = rho_u + ld_rho_u * 3;
+                        gxd = rho_d + ld_rho_u;
+                        gyd = rho_d + ld_rho_u * 2;
+                        gzd = rho_d + ld_rho_u * 3;
                         for (i = 0; i < np; i++) {
                                 rho[i*2+0] = rho_u[i];
                                 rho[i*2+1] = rho_d[i];
@@ -122,9 +126,9 @@ static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
                         }
                 } else {
                         sigma = rho + np;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
+                        gxu = rho_u + ld_rho_u;
+                        gyu = rho_u + ld_rho_u * 2;
+                        gzu = rho_u + ld_rho_u * 3;
                         for (i = 0; i < np; i++) {
                                 rho[i] = rho_u[i];
                                 sigma[i] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
@@ -135,14 +139,14 @@ static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
                 if (spin == 1) {
                         sigma = rho + np * 2;
                         tau = sigma + np * 3;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
-                        gxd = rho_d + np;
-                        gyd = rho_d + np * 2;
-                        gzd = rho_d + np * 3;
-                        tau_u  = rho_u + np * 4;
-                        tau_d  = rho_d + np * 4;
+                        gxu = rho_u + ld_rho_u;
+                        gyu = rho_u + ld_rho_u * 2;
+                        gzu = rho_u + ld_rho_u * 3;
+                        gxd = rho_d + ld_rho_u;
+                        gyd = rho_d + ld_rho_u * 2;
+                        gzd = rho_d + ld_rho_u * 3;
+                        tau_u  = rho_u + ld_rho_u * 4;
+                        tau_d  = rho_d + ld_rho_u * 4;
                         for (i = 0; i < np; i++) {
                                 rho[i*2+0] = rho_u[i];
                                 rho[i*2+1] = rho_d[i];
@@ -157,10 +161,10 @@ static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
                 } else {
                         sigma = rho + np;
                         tau  = sigma + np;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
-                        tau_u = rho_u + np * 4;
+                        gxu = rho_u + ld_rho_u;
+                        gyu = rho_u + ld_rho_u * 2;
+                        gzu = rho_u + ld_rho_u * 3;
+                        tau_u = rho_u + ld_rho_u * 4;
                         for (i = 0; i < np; i++) {
                                 rho[i] = rho_u[i];
                                 sigma[i] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
@@ -171,7 +175,7 @@ static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
         }
 }
 static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
-                     double *rho, double *exc)
+                     double *rho, double *exc, int offset, int blksize)
 {
         double *sigma, *tau;
         double *lapl = rho;
@@ -266,6 +270,21 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                         if (deriv > 3) {
                                 v4rho4 = v3rho3 + np * 4;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho += offset * 2;
+                        }
+                        if (deriv > 1) {
+                                v2rho2 += offset * 3;
+                        }
+                        if (deriv > 2) {
+                                v3rho3 += offset * 4;
+                        }
+                        if (deriv > 3) {
+                                v4rho4 += offset * 5;
+                        }
                 } else {
                         if (deriv > 0) {
                                 vrho = exc + np;
@@ -279,15 +298,30 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                         if (deriv > 3) {
                                 v4rho4 = v3rho3 + np;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho += offset;
+                        }
+                        if (deriv > 1) {
+                                v2rho2 += offset;
+                        }
+                        if (deriv > 2) {
+                                v3rho3 += offset;
+                        }
+                        if (deriv > 3) {
+                                v4rho4 += offset;
+                        }
                 }
-                xc_lda(func_x, np, rho, exc, vrho, v2rho2, v3rho3, v4rho4);
+                xc_lda(func_x, blksize, rho, exc, vrho, v2rho2, v3rho3, v4rho4);
                 break;
         case XC_FAMILY_GGA:
 #ifdef XC_FAMILY_HYB_GGA
         case XC_FAMILY_HYB_GGA:
 #endif
                 if (spin == 1) {
-                        sigma = rho + np * 2;
+                        sigma = rho + blksize * 2;
                         if (deriv > 0) {
                                 vrho = exc + np;
                                 vsigma = vrho + np * 2;
@@ -310,8 +344,33 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                                 v4rhosigma3  = v4rho2sigma2 + np * 3*6 ;
                                 v4sigma4     = v4rhosigma3  + np * 2*10;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho += offset * 2;
+                                vsigma += offset * 3;
+                        }
+                        if (deriv > 1) {
+                                v2rho2 += offset * 3;
+                                v2rhosigma += offset * 6;
+                                v2sigma2 += offset * 6;
+                        }
+                        if (deriv > 2) {
+                                v3rho3 += offset * 4;
+                                v3rho2sigma += offset * 9;
+                                v3rhosigma2 += offset * 12;
+                                v3sigma3 += offset * 10;
+                        }
+                        if (deriv > 3) {
+                                v4rho4 += offset * 5;
+                                v4rho3sigma += offset * 4*3;
+                                v4rho2sigma2 += offset * 3*6;
+                                v4rhosigma3 += offset * 2*10;
+                                v4sigma4 += offset * 15;
+                        }
                 } else {
-                        sigma = rho + np;
+                        sigma = rho + blksize;
                         if (deriv > 0) {
                                 vrho = exc + np;
                                 vsigma = vrho + np;
@@ -334,8 +393,33 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                                 v4rhosigma3  = v4rho2sigma2 + np;
                                 v4sigma4     = v4rhosigma3  + np;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho += offset;
+                                vsigma += offset;
+                        }
+                        if (deriv > 1) {
+                                v2rho2 += offset;
+                                v2rhosigma += offset;
+                                v2sigma2 += offset;
+                        }
+                        if (deriv > 2) {
+                                v3rho3 += offset;
+                                v3rho2sigma += offset;
+                                v3rhosigma2 += offset;
+                                v3sigma3 += offset;
+                        }
+                        if (deriv > 3) {
+                                v4rho4 += offset;
+                                v4rho3sigma += offset;
+                                v4rho2sigma2 += offset;
+                                v4rhosigma3 += offset;
+                                v4sigma4 += offset;
+                        }
                 }
-                xc_gga(func_x, np, rho, sigma,
+                xc_gga(func_x, blksize, rho, sigma,
                        exc, vrho, vsigma,
                        v2rho2, v2rhosigma, v2sigma2,
                        v3rho3, v3rho2sigma, v3rhosigma2, v3sigma3,
@@ -346,8 +430,8 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
         case XC_FAMILY_HYB_MGGA:
 #endif
                 if (spin == 1) {
-                        sigma = rho + np * 2;
-                        tau = sigma + np * 3;
+                        sigma = rho + blksize * 2;
+                        tau = sigma + blksize * 3;
                         if (deriv > 0) {
                                 vrho = exc + np;
                                 vsigma = vrho + np * 2;
@@ -390,9 +474,54 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                                 v4sigmatau3    = v4sigma2tau2   + np * 6*3  ;
                                 v4tau4         = v4sigmatau3    + np * 3*4  ;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho   += offset * 2;
+                                vsigma += offset * 3;
+                                vtau   += offset * 2;
+                        }
+                        if (deriv > 1) {
+                                v2rho2      += offset * 3;
+                                v2rhosigma  += offset * 6;
+                                v2sigma2    += offset * 6;
+                                v2rhotau    += offset * 4;
+                                v2sigmatau  += offset * 6;
+                                v2tau2      += offset * 3;
+                        }
+                        if (deriv > 2) {
+                                v3rho3         += offset * 4 ;
+                                v3rho2sigma    += offset * 9 ;
+                                v3rhosigma2    += offset * 12;
+                                v3sigma3       += offset * 10;
+                                v3rho2tau      += offset * 6 ;
+                                v3rhosigmatau  += offset * 12;
+                                v3rhotau2      += offset * 6 ;
+                                v3sigma2tau    += offset * 12;
+                                v3sigmatau2    += offset * 9 ;
+                                v3tau3         += offset * 4 ;
+                        }
+                        if (deriv > 3) {
+                                v4rho4         += offset * 5    ;
+                                v4rho3sigma    += offset * 4*3  ;
+                                v4rho2sigma2   += offset * 3*6  ;
+                                v4rhosigma3    += offset * 2*10 ;
+                                v4sigma4       += offset * 15   ;
+                                v4rho3tau      += offset * 4*2  ;
+                                v4rho2sigmatau += offset * 3*3*2;
+                                v4rho2tau2     += offset * 3*3  ;
+                                v4rhosigma2tau += offset * 2*6*2;
+                                v4rhosigmatau2 += offset * 2*3*3;
+                                v4rhotau3      += offset * 2*4  ;
+                                v4sigma3tau    += offset * 10*2 ;
+                                v4sigma2tau2   += offset * 6*3  ;
+                                v4sigmatau3    += offset * 3*4  ;
+                                v4tau4         += offset * 5    ;
+                        }
                 } else {
-                        sigma = rho + np;
-                        tau = sigma + np;
+                        sigma = rho + blksize;
+                        tau = sigma + blksize;
                         if (deriv > 0) {
                                 vrho = exc + np;
                                 vsigma = vrho + np;
@@ -435,8 +564,53 @@ static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
                                 v4sigmatau3    = v4sigma2tau2   + np;
                                 v4tau4         = v4sigmatau3    + np;
                         }
+
+                        // set offset
+                        exc += offset;
+                        if (deriv > 0) {
+                                vrho   += offset;
+                                vsigma += offset;
+                                vtau   += offset;
+                        }
+                        if (deriv > 1) {
+                                v2rho2      += offset;
+                                v2rhosigma  += offset;
+                                v2sigma2    += offset;
+                                v2rhotau    += offset;
+                                v2sigmatau  += offset;
+                                v2tau2      += offset;
+                        }
+                        if (deriv > 2) {
+                                v3rho3         += offset;
+                                v3rho2sigma    += offset;
+                                v3rhosigma2    += offset;
+                                v3sigma3       += offset;
+                                v3rho2tau      += offset;
+                                v3rhosigmatau  += offset;
+                                v3rhotau2      += offset;
+                                v3sigma2tau    += offset;
+                                v3sigmatau2    += offset;
+                                v3tau3         += offset;
+                        }
+                        if (deriv > 3) {
+                                v4rho4         += offset;
+                                v4rho3sigma    += offset;
+                                v4rho2sigma2   += offset;
+                                v4rhosigma3    += offset;
+                                v4sigma4       += offset;
+                                v4rho3tau      += offset;
+                                v4rho2sigmatau += offset;
+                                v4rho2tau2     += offset;
+                                v4rhosigma2tau += offset;
+                                v4rhosigmatau2 += offset;
+                                v4rhotau3      += offset;
+                                v4sigma3tau    += offset;
+                                v4sigma2tau2   += offset;
+                                v4sigmatau3    += offset;
+                                v4tau4         += offset;
+                        }
                 }
-                xc_mgga(func_x, np, rho, sigma, lapl, tau,
+                xc_mgga(func_x, blksize, rho, sigma, lapl, tau,
                      exc, vrho, vsigma, vlapl, vtau,
                      v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2,
                      v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2,
@@ -705,6 +879,7 @@ static void axpy(double *dst, double *src, double fac,
 {
         int i, j;
         for (j = 0; j < nsrc; j++) {
+                #pragma omp parallel for schedule(static)
                 for (i = 0; i < np; i++) {
                         dst[j*np+i] += fac * src[i*nsrc+j];
                 }
@@ -760,6 +935,7 @@ static void merge_xc(double *dst, double *ebuf, double fac,
                         pout = dst + offsets1[order] * np;
                         pin = ebuf + offsets0[order] * np;
                         nsrc = offsets0[order+1] - offsets0[order];
+                        #pragma omp parallel for schedule(static)
                         for (i = 0; i < np * nsrc; i++) {
                                 pout[i] += fac * pin[i];
                         }
@@ -802,10 +978,36 @@ void LIBXC_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
 {
         assert(deriv <= 4);
         double *ebuf = malloc(sizeof(double) * np * outlen);
-        double *rho = malloc(sizeof(double) * np * 7);
-        _eval_rho(rho, rho_u, spin, nvar, np);
-        int nspin = spin + 1;
 
+        double *rhobufs[MAX_THREADS];
+        int offsets[MAX_THREADS+1];
+#pragma omp parallel
+{
+        int iblk = omp_get_thread_num();
+        int nblk = omp_get_num_threads();
+        assert(nblk <= MAX_THREADS);
+
+        int blksize = np / nblk;
+        int ioff = iblk * blksize;
+        int np_mod = np % nblk;
+        if (iblk < np_mod) {
+            blksize += 1;
+        }
+        if (np_mod > 0) {
+            ioff += MIN(iblk, np_mod);
+        }
+        offsets[iblk] = ioff;
+        if (iblk == nblk-1) {
+            offsets[nblk] = np;
+            assert(ioff + blksize == np);
+        }
+
+        double *rho_priv = malloc(sizeof(double) * blksize * 7);
+        rhobufs[iblk] = rho_priv;
+        _eval_rho(rho_priv, rho_u+ioff, spin, nvar, blksize, np);
+}
+
+        int nspin = spin + 1;
         int i, j;
         xc_func_type func;
         for (i = 0; i < nfn; i++) {
@@ -857,13 +1059,25 @@ void LIBXC_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
 #if defined XC_SET_RELATIVITY
                 xc_lda_x_set_params(&func, relativity);
 #endif
-                _eval_xc(&func, spin, deriv, np, rho, ebuf);
+
+#pragma omp parallel
+{
+                int iblk = omp_get_thread_num();
+                int offset = offsets[iblk];
+                int blksize = offsets[iblk+1] - offset;
+                _eval_xc(&func, spin, deriv, np, rhobufs[iblk], ebuf, offset, blksize);
+}
+
                 merge_xc(output, ebuf, fac[i],
                          spin, deriv, nvar, np, outlen, func.info->family);
                 xc_func_end(&func);
         }
         free(ebuf);
-        free(rho);
+#pragma omp parallel
+{
+        int iblk = omp_get_thread_num();
+        free(rhobufs[iblk]);
+}
 }
 
 int LIBXC_max_deriv_order(int xc_id)
diff --git a/pyscf/lib/dft/multigrid.c b/pyscf/lib/dft/multigrid.c
new file mode 100644
index 0000000000..593aedf1b8
--- /dev/null
+++ b/pyscf/lib/dft/multigrid.c
@@ -0,0 +1,744 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include "config.h"
+#include "cint.h"
+#include "pbc/neighbor_list.h"
+#include "pbc/cell.h"
+#include "dft/multigrid.h"
+
+#define SQUARE(r)       (r[0]*r[0]+r[1]*r[1]+r[2]*r[2]) // squared length of a 3-component vector
+#define MAX(x, y) (((x) > (y)) ? (x) : (y)) // note: the selected argument is evaluated twice
+#define BUF_SIZE 2000 // initial number of PGFPair slots allocated by init_task
+#define ADD_SIZE 1000 // slots added by update_task_list each time the buffer is full
+#define RZERO 1e-6 // threshold below which a displacement component or radius counts as zero
+
+const int _LEN_CART[] = {
+    1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, 136
+};
+
+const int _LEN_CART0[] = {
+    0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120
+};
+
+const int _BINOMIAL_COEF[] = {
+    1,
+    1,   1,
+    1,   2,   1,
+    1,   3,   3,   1,
+    1,   4,   6,   4,   1,
+    1,   5,  10,  10,   5,   1,
+    1,   6,  15,  20,  15,   6,   1,
+    1,   7,  21,  35,  35,  21,   7,   1,
+    1,   8,  28,  56,  70,  56,  28,   8,   1,
+    1,   9,  36,  84, 126, 126,  84,  36,   9,   1,
+    1,  10,  45, 120, 210, 252, 210, 120,  45,  10,   1,
+    1,  11,  55, 165, 330, 462, 462, 330, 165,  55,  11,   1,
+    1,  12,  66, 220, 495, 792, 924, 792, 495, 220,  66,  12,   1,
+    1,  13,  78, 286, 715,1287,1716,1716,1287, 715, 286,  78,  13,   1,
+    1,  14,  91, 364,1001,2002,3003,3432,3003,2002,1001, 364,  91,  14,   1,
+    1,  15, 105, 455,1365,3003,5005,6435,6435,5005,3003,1365, 455, 105,  15,   1,
+};
+
+double CINTsquare_dist(const double *r1, const double *r2);
+
+void init_gridlevel_info(GridLevel_Info** gridlevel_info,
+                         double* cutoff, int* mesh, int nlevels, double rel_cutoff)
+{
+    /* Allocate a GridLevel_Info that keeps its own copies of cutoff[nlevels]
+     * and mesh[nlevels*3], then hand it back through *gridlevel_info. */
+    GridLevel_Info* info = (GridLevel_Info*) malloc(sizeof(GridLevel_Info));
+    info->cutoff = (double*) malloc(sizeof(double) * nlevels);
+    info->mesh = (int*) malloc(sizeof(int) * nlevels * 3);
+    info->nlevels = nlevels;
+    info->rel_cutoff = rel_cutoff;
+    int lvl, k;
+    for (lvl = 0; lvl < nlevels; lvl++) {
+        info->cutoff[lvl] = cutoff[lvl];
+        for (k = 0; k < 3; k++) { info->mesh[lvl*3+k] = mesh[lvl*3+k]; }
+    }
+    *gridlevel_info = info;
+}
+
+
+void init_rs_grid(RS_Grid** rs_grid, GridLevel_Info** gridlevel_info, int comp)
+{
+    /* Create an RS_Grid that stores the *gridlevel_info pointer (no copy) and
+     * owns one zero-filled buffer of comp * mesh_x*mesh_y*mesh_z doubles per level. */
+    RS_Grid* rg = (RS_Grid*) malloc(sizeof(RS_Grid));
+    GridLevel_Info* gl_info = *gridlevel_info;
+    int *mesh = gl_info->mesh;
+    int i, nlevels = gl_info->nlevels;
+    rg->nlevels = nlevels;
+    rg->gridlevel_info = gl_info;
+    rg->comp = comp;
+    rg->data = (double**)malloc(sizeof(double*) * nlevels);
+    for (i = 0; i < nlevels; i++) {
+        // widen before multiplying: the int product mesh_x*mesh_y*mesh_z can overflow
+        size_t ngrid = (size_t)mesh[i*3] * mesh[i*3+1] * mesh[i*3+2];
+        (rg->data)[i] = calloc(comp*ngrid, sizeof(double));
+    }
+    *rs_grid = rg;
+}
+
+
+void del_rs_grid(RS_Grid** rs_grid)
+{
+    /* Free every level buffer, the table and the struct; gridlevel_info is only unlinked. */
+    RS_Grid* grid = *rs_grid;
+    if (grid == NULL) {
+        return;
+    }
+    double** levels = grid->data;
+    if (levels != NULL) {
+        int lvl;
+        for (lvl = 0; lvl < grid->nlevels; lvl++) {
+            free(levels[lvl]); // free(NULL) is a no-op, so no per-level guard is needed
+        }
+        free(levels);
+    }
+    grid->gridlevel_info = NULL;
+    free(grid);
+    *rs_grid = NULL;
+}
+
+
+void del_gridlevel_info(GridLevel_Info** gridlevel_info)
+{
+    /* Free the cutoff and mesh arrays and the struct itself, then clear the
+     * caller's pointer. A NULL *gridlevel_info is accepted and left untouched. */
+    GridLevel_Info* info = *gridlevel_info;
+    if (info == NULL) {
+        return;
+    }
+
+    // free() ignores NULL, so the members need no separate guards
+    free(info->cutoff);
+    free(info->mesh);
+    free(info);
+    *gridlevel_info = NULL;
+}
+
+
+void init_pgfpair(PGFPair** pair_info,
+                  int ish, int ipgf, int jsh, int jpgf, int iL, double radius)
+{
+    /* Heap-allocate one PGFPair, fill every field from the arguments and
+     * return it through *pair_info. */
+    PGFPair *pair = (PGFPair*) malloc(sizeof(PGFPair));
+    *pair = (PGFPair) {
+        .ish = ish, .ipgf = ipgf, .jsh = jsh, .jpgf = jpgf,
+        .iL = iL, .radius = radius,
+    };
+    *pair_info = pair;
+}
+
+
+bool pgfpairs_with_same_shells(PGFPair *pair1, PGFPair *pair2)
+{
+    /* True only when both pairs are non-NULL and share the same ish and the
+     * same jsh; primitive indices, image index and radius are not compared. */
+    if (pair1 == NULL || pair2 == NULL) {
+        return false;
+    }
+    bool same_ish = (pair1->ish == pair2->ish);
+    return same_ish && pair1->jsh == pair2->jsh;
+}
+
+
+double pgfpair_radius(int la, int lb, double zeta, double zetb, double* ra, double* rab, double precision)
+{
+    /* Radius for the product of two primitive Gaussians (angular momenta la/lb,
+     * exponents zeta/zetb) centred at ra and ra+rab: the product is expanded
+     * in powers of r about rp and the largest pgf_rcut() result is returned. */
+    double radius = 0;
+    double zetp = zeta + zetb;
+    double eps = precision * precision;
+
+    // Same-centre shortcut; compare magnitudes so negative components are not taken as zero.
+    if (fabs(rab[0]) < RZERO && fabs(rab[1]) < RZERO && fabs(rab[2]) < RZERO) {
+        return pgf_rcut(la+lb, zetp, 1., eps, radius);
+    }
+
+    double prefactor = exp(-zeta*zetb/zetp*SQUARE(rab));
+    double rb[3], rp[3];
+    int lxa, lxb, i;
+    for (i = 0; i < 3; i++) {
+        rb[i] = ra[i] + rab[i];
+        rp[i] = ra[i] + zetb/zetp*rab[i]; // exponent-weighted centre of the pair
+    }
+    double rad_a = sqrt(CINTsquare_dist(ra, rp));
+    double rad_b = sqrt(CINTsquare_dist(rb, rp));
+
+    int lmax = la + lb;
+    double coef[lmax+1], rap[la+1], rbp[lb+1]; // rap[k] = rad_a^k, rbp[k] = rad_b^k
+    for (i = 0; i <= lmax; i++) {
+        coef[i] = 0;
+    }
+    rap[0] = 1.;
+    for (i = 1; i <= la; i++) {
+        rap[i] = rap[i-1] * rad_a;
+    }
+    rbp[0] = 1.;
+    for (i = 1; i <= lb; i++) {
+        rbp[i] = rbp[i-1] * rad_b;
+    }
+
+    // coef[p] = coefficient of r^p in (r + rad_a)^la * (r + rad_b)^lb
+    for (lxa = 0; lxa <= la; lxa++) {
+        for (lxb = 0; lxb <= lb; lxb++) {
+            coef[lxa+lxb] += BINOMIAL(la, lxa) * BINOMIAL(lb, lxb) * rap[la-lxa] * rbp[lb-lxb];
+        }
+    }
+
+    for (i = 0; i <= lmax; i++){
+        coef[i] *= prefactor;
+        // call once: MAX() is a macro and would expand the pgf_rcut call twice
+        double rcut = pgf_rcut(i, zetp, coef[i], eps, radius);
+        radius = MAX(radius, rcut);
+    }
+    return radius;
+}
+
+
+void del_pgfpair(PGFPair** pair_info)
+{
+    /* Free the PGFPair behind *pair_info and reset the slot to NULL.
+     * An already-empty slot is left as it is. */
+    if (*pair_info == NULL) {
+        return;
+    }
+    free(*pair_info);
+    *pair_info = NULL;
+}
+
+
+// Unlink the PGFPair: clear the caller's slot without freeing the object it pointed to.
+void nullify_pgfpair(PGFPair** pair_info)
+{
+    *pair_info = NULL;
+}
+
+
+void init_task(Task** task)
+{
+    /* Allocate an empty Task with BUF_SIZE NULL pair slots; radius is left unset. */
+    Task *fresh = (Task*) malloc(sizeof(Task));
+    size_t k;
+    fresh->ntasks = 0;
+    fresh->buf_size = BUF_SIZE;
+    fresh->pgfpairs = (PGFPair**) malloc(sizeof(PGFPair*) * fresh->buf_size);
+    for (k = 0; k < fresh->buf_size; k++) { fresh->pgfpairs[k] = NULL; }
+    *task = fresh;
+}
+
+
+void del_task(Task** task)
+{
+    /* Destroy a Task together with the first ntasks PGFPair objects it holds,
+     * then clear *task. A NULL *task is ignored. */
+    Task *doomed = *task;
+    if (doomed == NULL) {
+        return;
+    }
+    if (doomed->pgfpairs != NULL) {
+        PGFPair **slot = doomed->pgfpairs, **stop = slot + doomed->ntasks;
+        while (slot != stop) { del_pgfpair(slot++); }
+        free(doomed->pgfpairs);
+    }
+    free(doomed);
+    *task = NULL;
+}
+
+
+void nullify_task(Task** task)
+{
+    /* Dismantle a Task without freeing the PGFPair objects it references:
+     * only the slot array and the Task struct are released. */
+    Task *shell = *task;
+    if (shell == NULL) {
+        return;
+    }
+    if (shell->pgfpairs != NULL) {
+        size_t k;
+        for (k = 0; k < shell->ntasks; k++) { shell->pgfpairs[k] = NULL; }
+        free(shell->pgfpairs);
+    }
+    free(shell);
+    *task = NULL;
+}
+
+
+void init_task_list(TaskList** task_list, GridLevel_Info* gridlevel_info, int nlevels, int hermi)
+{
+    /* Allocate a TaskList with one fresh Task per level; gridlevel_info is stored, not copied. */
+    TaskList* list = (TaskList*) malloc(sizeof(TaskList));
+    int lvl;
+    list->tasks = (Task**) malloc(sizeof(Task*)*nlevels);
+    for (lvl = 0; lvl < nlevels; lvl++) { init_task(&list->tasks[lvl]); }
+    list->gridlevel_info = gridlevel_info;
+    list->hermi = hermi;
+    list->nlevels = nlevels;
+    *task_list = list;
+}
+
+
+void del_task_list(TaskList** task_list)
+{
+    /* Tear down a TaskList completely: its GridLevel_Info, every per-level
+     * Task (including the PGFPair objects, via del_task) and the list itself.
+     * *task_list is NULL afterwards; a NULL *task_list is ignored. */
+    TaskList *list = *task_list;
+    if (list == NULL) {
+        return;
+    }
+
+    // both helpers tolerate an empty slot and reset it to NULL themselves
+    del_gridlevel_info(&list->gridlevel_info);
+    if (list->tasks != NULL) {
+        int lvl;
+        for (lvl = 0; lvl < list->nlevels; lvl++) {
+            del_task(&list->tasks[lvl]);
+        }
+        free(list->tasks);
+    }
+    free(list);
+    *task_list = NULL;
+}
+
+
+void nullify_task_list(TaskList** task_list)
+{
+    /* Release only the containers of a TaskList (Task structs, their slot
+     * arrays and the list); the GridLevel_Info and the PGFPair objects are
+     * unlinked, not freed. *task_list is NULL afterwards. */
+    TaskList *list = *task_list;
+    if (list == NULL) {
+        return;
+    }
+
+    list->gridlevel_info = NULL;
+    if (list->tasks != NULL) {
+        int lvl;
+        for (lvl = 0; lvl < list->nlevels; lvl++) {
+            nullify_task(&list->tasks[lvl]); // returns immediately for an empty slot
+        }
+        free(list->tasks);
+    }
+    free(list);
+    *task_list = NULL;
+}
+
+
+void update_task_list(TaskList** task_list, int grid_level,
+                      int ish, int ipgf, int jsh, int jpgf, int iL, double radius)
+{
+    /* Append a new PGFPair to the task of the given grid level, growing the
+     * slot array by ADD_SIZE entries whenever it is full. */
+    Task *bucket = (*task_list)->tasks[grid_level];
+    size_t slot = bucket->ntasks++;
+    if (bucket->ntasks > bucket->buf_size) {
+        bucket->buf_size += ADD_SIZE;
+        bucket->pgfpairs = (PGFPair**) realloc(bucket->pgfpairs, sizeof(PGFPair*) * bucket->buf_size);
+    }
+    init_pgfpair(bucket->pgfpairs + slot, ish, ipgf, jsh, jpgf, iL, radius);
+}
+
+
+void merge_task_list(TaskList** task_list, TaskList** task_list_loc)
+{
+    /* Append, level by level, the pair pointers of *task_list_loc to
+     * *task_list. Only pointers are copied, so both lists reference the same
+     * PGFPair objects afterwards; the destination array is resized to fit. */
+    TaskList* dst_list = *task_list;
+    TaskList* src_list = *task_list_loc;
+    int lvl, k;
+    for (lvl = 0; lvl < dst_list->nlevels; lvl++) {
+        Task *dst = dst_list->tasks[lvl];
+        Task *src = src_list->tasks[lvl];
+        int first_new = dst->ntasks;
+        int n_new = src->ntasks;
+        dst->buf_size = dst->ntasks = dst->ntasks + n_new;
+        dst->pgfpairs = (PGFPair**) realloc(dst->pgfpairs, sizeof(PGFPair*) * dst->buf_size);
+        for (k = 0; k < n_new; k++) {
+            dst->pgfpairs[first_new + k] = src->pgfpairs[k];
+        }
+    }
+}
+
+
+int get_grid_level(GridLevel_Info* gridlevel_info, double alpha)
+{
+    /* Return the index of the first level whose cutoff is at least
+     * alpha * rel_cutoff; if no level qualifies, fall back to the last
+     * index, nlevels - 1 (the most dense grid). */
+    const int nlevels = gridlevel_info->nlevels;
+    const double *cutoff = gridlevel_info->cutoff;
+    const double needed_cutoff = alpha * gridlevel_info->rel_cutoff;
+    int lvl;
+    for (lvl = 0; lvl < nlevels; lvl++) {
+        if (cutoff[lvl] >= needed_cutoff) { return lvl; }
+    }
+    return nlevels - 1;
+}
+
+
+void build_task_list(TaskList** task_list, NeighborList** neighbor_list,
+                     GridLevel_Info** gridlevel_info,
+                     int* ish_atm, int* ish_bas, double* ish_env,
+                     double* ish_rcut, double** ipgf_rcut,
+                     int* jsh_atm, int* jsh_bas, double* jsh_env,
+                     double* jsh_rcut, double** jpgf_rcut,
+                     int nish, int njsh, double* Ls, double precision, int hermi)
+{
+    /* Build *task_list: for each (ish, jsh) shell pair and lattice image listed
+     * in the neighbor list, add one task per primitive pair that passes the
+     * rcut-vs-distance screening, filed under get_grid_level(alpha_i+alpha_j).
+     * Threads fill private lists that are merged in a critical section, and
+     * each level's Task.radius is set to the largest pair radius seen. */
+    GridLevel_Info *gl_info = *gridlevel_info;
+    int ilevel, nlevels = gl_info->nlevels;
+    init_task_list(task_list, gl_info, nlevels, hermi);
+    double max_radius[nlevels];
+    for (ilevel = 0; ilevel < nlevels; ilevel++) { max_radius[ilevel] = 0; } // VLA: no initializer
+    NeighborList *nl0 = *neighbor_list;
+
+#pragma omp parallel private(ilevel)
+{
+    double max_radius_loc[nlevels];
+    for (ilevel = 0; ilevel < nlevels; ilevel++) { max_radius_loc[ilevel] = 0; }
+    TaskList** task_list_loc = (TaskList**) malloc(sizeof(TaskList*));
+    init_task_list(task_list_loc, gl_info, nlevels, hermi);
+    NeighborPair *np0_ij;
+    int ish, jsh, li, lj, ipgf, jpgf, nipgf, njpgf;
+    int iL, iL_idx, ish_atm_id, jsh_atm_id, ish_alpha_of, jsh_alpha_of;
+    double ipgf_alpha, jpgf_alpha, dij, radius;
+    double *ish_ratm, *jsh_ratm, *rL;
+    double rij[3];
+
+    #pragma omp for schedule(dynamic)
+    for (ish = 0; ish < nish; ish++) {
+        li = ish_bas[ANG_OF+ish*BAS_SLOTS];
+        nipgf = ish_bas[NPRIM_OF+ish*BAS_SLOTS];
+        ish_atm_id = ish_bas[ish*BAS_SLOTS+ATOM_OF];
+        ish_ratm = ish_env + ish_atm[ish_atm_id*ATM_SLOTS+PTR_COORD];
+        ish_alpha_of = ish_bas[PTR_EXP+ish*BAS_SLOTS];
+        for (jsh = 0; jsh < njsh; jsh++) {
+            if (hermi == 1 && jsh < ish) { // hermitian case: only jsh >= ish is generated
+                continue;
+            }
+            np0_ij = (nl0->pairs)[ish*njsh + jsh];
+            if (np0_ij->nimgs > 0) {
+                lj = jsh_bas[ANG_OF+jsh*BAS_SLOTS];
+                njpgf = jsh_bas[NPRIM_OF+jsh*BAS_SLOTS];
+                jsh_atm_id = jsh_bas[jsh*BAS_SLOTS+ATOM_OF];
+                jsh_ratm = jsh_env + jsh_atm[jsh_atm_id*ATM_SLOTS+PTR_COORD];
+                jsh_alpha_of = jsh_bas[PTR_EXP+jsh*BAS_SLOTS];
+
+                for (iL_idx = 0; iL_idx < np0_ij->nimgs; iL_idx++){
+                    iL = (np0_ij->Ls_list)[iL_idx];
+                    rL = Ls + iL*3;
+                    rij[0] = jsh_ratm[0] + rL[0] - ish_ratm[0];
+                    rij[1] = jsh_ratm[1] + rL[1] - ish_ratm[1];
+                    rij[2] = jsh_ratm[2] + rL[2] - ish_ratm[2];
+                    dij = sqrt(SQUARE(rij));
+
+                    for (ipgf = 0; ipgf < nipgf; ipgf++) {
+                        if (ipgf_rcut[ish][ipgf] + jsh_rcut[jsh] < dij) {
+                            continue;
+                        }
+                        ipgf_alpha = ish_env[ish_alpha_of+ipgf];
+                        for (jpgf = 0; jpgf < njpgf; jpgf++) {
+                            //if (hermi == 1 && ish == jsh && jpgf < ipgf) {
+                            //    continue;
+                            //}
+                            if (ipgf_rcut[ish][ipgf] + jpgf_rcut[jsh][jpgf] < dij) {
+                                continue;
+                            }
+                            jpgf_alpha = jsh_env[jsh_alpha_of+jpgf];
+                            ilevel = get_grid_level(gl_info, ipgf_alpha+jpgf_alpha);
+                            radius = pgfpair_radius(li, lj, ipgf_alpha, jpgf_alpha, ish_ratm, rij, precision);
+                            if (radius < RZERO) {
+                                continue;
+                            }
+                            max_radius_loc[ilevel] = MAX(radius, max_radius_loc[ilevel]);
+                            update_task_list(task_list_loc, ilevel, ish, ipgf, jsh, jpgf, iL, radius);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    #pragma omp critical
+    merge_task_list(task_list, task_list_loc);
+
+    nullify_task_list(task_list_loc); // the PGFPair objects now belong to *task_list
+    free(task_list_loc);
+
+    #pragma omp critical
+    for (ilevel = 0; ilevel < nlevels; ilevel++) {
+        max_radius[ilevel] = MAX(max_radius[ilevel], max_radius_loc[ilevel]);
+    }
+}
+
+    for (ilevel = 0; ilevel < nlevels; ilevel++) {
+        Task *t0 = ((*task_list)->tasks)[ilevel];
+        t0->radius = max_radius[ilevel];
+    }
+}
+
+
+int get_task_loc(int** task_loc, PGFPair** pgfpairs, int ntasks,
+                 int ish0, int ish1, int jsh0, int jsh1, int hermi)
+{
+    /* Group consecutive selected tasks sharing (ish, jsh) into half-open index
+     * ranges: *task_loc receives a heap array [start0, end0, start1, end1, ...]
+     * and its length in ints is returned. Tasks outside [ish0,ish1) x [jsh0,jsh1),
+     * or with jsh < ish when hermi == 1, are skipped. No match: NULL and 0. */
+    int n = 0;
+    int ish_prev = -1, jsh_prev = -1;
+    int itask, ish, jsh;
+    int *buf = (int*)malloc(sizeof(int) * ntasks*2);
+    for (itask = 0; itask < ntasks; itask++) {
+        ish = pgfpairs[itask]->ish;
+        jsh = pgfpairs[itask]->jsh;
+        if (ish < ish0 || ish >= ish1 || jsh < jsh0 || jsh >= jsh1) {
+            continue;
+        }
+        if (hermi == 1 && jsh < ish) {
+            continue;
+        }
+        if (ish != ish_prev || jsh != jsh_prev) { // a new shell pair opens a new range
+            buf[n] = itask;
+            n += 2;
+            ish_prev = ish;
+            jsh_prev = jsh;
+        }
+        buf[n-1] = itask+1; // extend the current range past this task
+    }
+    if (n == 0) {
+        // realloc(buf, 0) is implementation-defined; release explicitly instead
+        free(buf);
+        *task_loc = NULL;
+        return 0;
+    }
+    *task_loc = (int*)realloc(buf, sizeof(int) * n);
+    return n;
+}
+
+
+void gradient_gs(double complex* out, double complex* f_gs, double* Gv,
+                 int n, size_t ng)
+{
+    /* For each of the n functions stored back to back in f_gs (ng values each)
+     * write i*Gv[g][c]*f[g] for c = 0, 1, 2 as three consecutive ng-long
+     * arrays, so every function fills 3*ng entries of out. */
+    int ifn;
+    for (ifn = 0; ifn < n; ifn++) {
+        double complex *f = f_gs + (size_t)ifn * ng;
+        double complex *dst = out + (size_t)ifn * 3 * ng;
+        size_t igrid;
+
+        #pragma omp parallel for schedule(static)
+        for (igrid = 0; igrid < ng; igrid++) {
+            const double re = creal(f[igrid]);
+            const double im = cimag(f[igrid]);
+            int c;
+            // i*g*(re + i*im) = i*g*re - g*im
+            for (c = 0; c < 3; c++) {
+                const double g = Gv[igrid*3 + c];
+                dst[c*ng + igrid] = g * re * _Complex_I - g * im;
+            }
+        }
+    }
+}
+
+/*
+int get_task_loc_diff_ish(int** task_loc, PGFPair** pgfpairs, int ntasks,
+                          int ish0, int ish1)
+{
+    int n = -2;
+    int ish_prev = -1;
+    int itask, ish;
+    int *buf = (int*)malloc(sizeof(int) * ntasks*2);
+    PGFPair *pgfpair;
+    for(itask = 0; itask < ntasks; itask++){
+        pgfpair = pgfpairs[itask];
+        ish = pgfpair->ish;
+        if (ish < ish0 || ish >= ish1) {
+            continue;
+        }
+
+        if (ish != ish_prev) {
+            n += 2;
+            buf[n] = itask;
+            ish_prev = ish;
+        }
+        if (ish == ish_prev) {
+            buf[n+1] = itask+1;
+        }
+    }
+    n += 2;
+    *task_loc = (int*)realloc(buf, sizeof(int) * n);
+    return n;
+}
+*/
+
+/*
+typedef struct Task_Index_struct {
+    int ntasks;
+    int bufsize;
+    int* task_index;
+} Task_Index;
+
+
+void init_task_index(Task_Index* task_idx)
+{
+    task_idx->ntasks = 0;
+    task_idx->bufsize = 10;
+    task_idx->task_index = (int*)malloc(sizeof(int) * task_idx->bufsize);
+}
+
+
+void update_task_index(Task_Index* task_idx, int itask)
+{
+    task_idx->ntasks += 1;
+    if (task_idx->bufsize < task_idx->ntasks) {
+        task_idx->bufsize += 10;
+        task_idx->task_index = (int*)realloc(task_idx->task_index, sizeof(int) * task_idx->bufsize);
+    }
+    task_idx->task_index[task_idx->ntasks-1] = itask;
+}
+
+
+void del_task_index(Task_Index* task_idx)
+{
+    if (!task_idx) {
+        return;
+    }
+    if (task_idx->task_index) {
+        free(task_idx->task_index);
+    }
+    task_idx->ntasks = 0;
+    task_idx->bufsize = 0;
+}
+
+
+typedef struct Shlpair_Task_Index_struct {
+    int nish;
+    int njsh;
+    int ish0;
+    int jsh0;
+    Task_Index *task_index;
+} Shlpair_Task_Index;
+
+
+void init_shlpair_task_index(Shlpair_Task_Index* shlpair_task_idx,
+                             int ish0, int jsh0, int nish, int njsh)
+{
+    shlpair_task_idx->ish0 = ish0;
+    shlpair_task_idx->jsh0 = jsh0;
+    shlpair_task_idx->nish = nish;
+    shlpair_task_idx->njsh = njsh;
+    shlpair_task_idx->task_index = (Task_Index*)malloc(sizeof(Task_Index)*nish*njsh);
+
+    int ijsh;
+    for (ijsh = 0; ijsh < nish*njsh; ijsh++) {
+        init_task_index(shlpair_task_idx->task_index + ijsh);
+    }
+}
+
+
+void update_shlpair_task_index(Shlpair_Task_Index* shlpair_task_idx,
+                               int ish, int jsh, int itask)
+{
+    int ish0 = shlpair_task_idx->ish0;
+    int jsh0 = shlpair_task_idx->jsh0;
+    int njsh = shlpair_task_idx->njsh;
+    int ioff = ish - ish0;
+    int joff = jsh - jsh0;
+
+    update_task_index(shlpair_task_idx->task_index + ioff*njsh+joff, itask);
+}
+
+
+int get_task_index(Shlpair_Task_Index* shlpair_task_idx, int** idx, int ish, int jsh)
+{
+    int ish0 = shlpair_task_idx->ish0;
+    int jsh0 = shlpair_task_idx->jsh0;
+    int njsh = shlpair_task_idx->njsh;
+    int ioff = ish - ish0;
+    int joff = jsh - jsh0;
+    Task_Index *task_idx = shlpair_task_idx->task_index + ioff*njsh+joff;
+    int ntasks = task_idx->ntasks;
+    *idx = task_idx->task_index;
+    return ntasks;
+}
+
+
+void del_shlpair_task_index(Shlpair_Task_Index* shlpair_task_idx)
+{
+    if (!shlpair_task_idx) {
+        return;
+    }
+
+    int nish = shlpair_task_idx->nish;
+    int njsh = shlpair_task_idx->njsh;
+    int ijsh;
+    for (ijsh = 0; ijsh < nish*njsh; ijsh++) {
+        del_task_index(shlpair_task_idx->task_index + ijsh);
+    }
+    free(shlpair_task_idx->task_index);
+}
+
+
+Shlpair_Task_Index* get_shlpair_task_index(PGFPair** pgfpairs, int ntasks,
+            int ish0, int ish1, int jsh0, int jsh1, int hermi)
+{
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+
+    Shlpair_Task_Index* shlpair_task_idx = (Shlpair_Task_Index*) malloc(sizeof(Shlpair_Task_Index));
+    init_shlpair_task_index(shlpair_task_idx, ish0, jsh0, nish, njsh);
+
+    int itask;
+    int ish, jsh;
+    PGFPair *pgfpair = NULL;
+    for(itask = 0; itask < ntasks; itask++){
+        pgfpair = pgfpairs[itask];
+        ish = pgfpair->ish;
+        if (ish < ish0 || ish >= ish1) {
+            continue;
+        }
+        jsh = pgfpair->jsh;
+        if (jsh < jsh0 || jsh >= jsh1) {
+            continue;
+        }
+        if (hermi == 1 && jsh < ish) {
+            continue;
+        }
+        update_shlpair_task_index(shlpair_task_idx, ish, jsh, itask);
+    }
+    return shlpair_task_idx;
+}
+*/
diff --git a/pyscf/lib/dft/multigrid.h b/pyscf/lib/dft/multigrid.h
new file mode 100644
index 0000000000..e691a3ce12
--- /dev/null
+++ b/pyscf/lib/dft/multigrid.h
@@ -0,0 +1,72 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#ifndef HAVE_DEFINED_MULTIGRID_H
+#define HAVE_DEFINED_MULTIGRID_H
+
+#include <stdbool.h>
+#include <stddef.h> // size_t (Task.buf_size, Task.ntasks)
+
+#define BINOMIAL(n, i)  (_BINOMIAL_COEF[_LEN_CART0[(n)]+(i)]) // C(n, i); table rows cover n <= 15
+
+extern const int _LEN_CART[];
+extern const int _LEN_CART0[];      // _LEN_CART0[n] is the offset of row n in _BINOMIAL_COEF
+extern const int _BINOMIAL_COEF[];  // Pascal's triangle, rows stored back to back
+
+typedef struct GridLevel_Info_struct {
+    int nlevels;        // number of grid levels
+    double rel_cutoff;  // factor applied to an exponent sum in get_grid_level()
+    double *cutoff;     // cutoff[nlevels]
+    int *mesh;          // mesh[nlevels*3], three ints per level
+} GridLevel_Info;
+
+typedef struct RS_Grid_struct {
+    int nlevels;
+    GridLevel_Info* gridlevel_info; // only unlinked (not freed) by del_rs_grid()
+    int comp;                       // number of components stored per grid point
+    double** data;                  // data[level]: comp * mesh_x*mesh_y*mesh_z doubles
+} RS_Grid;
+
+typedef struct PGFPair_struct {
+    int ish;        // shell index of the first primitive
+    int ipgf;       // primitive index within shell ish
+    int jsh;        // shell index of the second primitive
+    int jpgf;       // primitive index within shell jsh
+    int iL;         // index of the lattice image (a row of the Ls array)
+    double radius;  // value returned by pgfpair_radius() for this pair
+} PGFPair;
+
+bool pgfpairs_with_same_shells(PGFPair*, PGFPair*);
+
+typedef struct Task_struct {
+    size_t buf_size;    // allocated length of pgfpairs
+    size_t ntasks;      // number of slots of pgfpairs in use
+    PGFPair** pgfpairs;
+    double radius;      // largest pair radius on this level, set by build_task_list()
+} Task;
+
+typedef struct TaskList_struct {
+    int nlevels;
+    int hermi;                      // 1: only pairs with jsh >= ish are listed
+    GridLevel_Info* gridlevel_info; // freed by del_task_list(), unlinked by nullify_task_list()
+    Task** tasks;                   // one Task per grid level
+} TaskList;
+
+int get_task_loc(int** task_loc, PGFPair** pgfpairs, int ntasks,
+                 int ish0, int ish1, int jsh0, int jsh1, int hermi);
+#endif
diff --git a/pyscf/lib/dft/utils.c b/pyscf/lib/dft/utils.c
new file mode 100644
index 0000000000..04ef8e5b2f
--- /dev/null
+++ b/pyscf/lib/dft/utils.c
@@ -0,0 +1,62 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <complex.h>
+#include "config.h"
+#include "vhf/fblas.h"
+#if defined(HAVE_LIBXSMM)
+#include "libxsmm.h"
+#endif
+
+
+void dgemm_wrapper(const char transa, const char transb,
+                   const int m, const int n, const int k,
+                   const double alpha, const double* a, const int lda,
+                   const double* b, const int ldb,
+                   const double beta, double* c, const int ldc)
+{   // dgemm front end: tries a libxsmm kernel first (when compiled in), otherwise calls dgemm_
+#if defined(HAVE_LIBXSMM)
+    if (transa == 'N') { // the libxsmm path is only attempted when A is not transposed
+        //libxsmm_dgemm(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+        int prefetch = LIBXSMM_PREFETCH_AUTO;
+        int flags = transb != 'T' ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B;
+        libxsmm_dmmfunction kernel = libxsmm_dmmdispatch(m, n, k, &lda, &ldb, &ldc,
+                                                         &alpha, &beta, &flags, &prefetch);
+        if (kernel) { // no kernel returned: fall through to the BLAS call below
+            kernel(a,b,c,a,b,c);
+            return;
+        }
+    }
+#endif
+    dgemm_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+
+void get_gga_vrho_gs(double complex *out, double complex *vrho_gs, double complex *vsigma1_gs,
+                     double *Gv, double weight, int ngrid)
+{
+    /* out[i] = weight * (vrho_gs[i] - 2i * sum_c Gv[i*3+c] * vsigma1_gs[c*ngrid+i]),
+     * i.e. the three vsigma1_gs components are stored ngrid entries apart. */
+    const double complex fac = -2. * _Complex_I;
+    const double complex *vx = vsigma1_gs, *vy = vx + ngrid, *vz = vx + 2 * ngrid;
+    int i;
+    #pragma omp parallel for simd schedule(static)
+    for (i = 0; i < ngrid; i++) {
+        double complex dot = Gv[i*3] * vx[i] + Gv[i*3+1] * vy[i] + Gv[i*3+2] * vz[i];
+        out[i] = (dot * fac + vrho_gs[i]) * weight;
+    }
+}
diff --git a/pyscf/lib/dft/utils.h b/pyscf/lib/dft/utils.h
new file mode 100644
index 0000000000..1c85ff1fdc
--- /dev/null
+++ b/pyscf/lib/dft/utils.h
@@ -0,0 +1,27 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#ifndef HAVE_DEFINED_GRID_UTILS_H
+#define HAVE_DEFINED_GRID_UTILS_H
+
+extern void dgemm_wrapper(const char transa, const char transb,
+                   const int m, const int n, const int k,
+                   const double alpha, const double* a, const int lda,
+                   const double* b, const int ldb,
+                   const double beta, double* c, const int ldc);
+#endif
diff --git a/pyscf/lib/np_helper/np_helper.h b/pyscf/lib/np_helper/np_helper.h
index 2c8227c03d..3ed8d05574 100644
--- a/pyscf/lib/np_helper/np_helper.h
+++ b/pyscf/lib/np_helper/np_helper.h
@@ -61,3 +61,10 @@ void NPdset0(double *p, const size_t n);
 void NPzset0(double complex *p, const size_t n);
 void NPdcopy(double *out, const double *in, const size_t n);
 void NPzcopy(double complex *out, const double complex *in, const size_t n);
+
+void NPdgemm(const char trans_a, const char trans_b,
+             const int m, const int n, const int k,
+             const int lda, const int ldb, const int ldc,
+             const int offseta, const int offsetb, const int offsetc,
+             double *a, double *b, double *c,
+             const double alpha, const double beta);
diff --git a/pyscf/lib/numpy_helper.py b/pyscf/lib/numpy_helper.py
index 406fa54e20..58508d9f8b 100644
--- a/pyscf/lib/numpy_helper.py
+++ b/pyscf/lib/numpy_helper.py
@@ -1116,6 +1116,16 @@ def expm(a):
         y, buf = buf, y
     return y
 
+def ndarray_pointer_2d(array):
+    '''Return a ctypes void pointer to a table holding the start address of every row
+    of a C-contiguous 2D array (one numpy.uintp entry per row).'''
+    assert array.ndim == 2
+    assert array.flags.c_contiguous
+
+    ptr = (array.ctypes.data +  # address of the first element
+           numpy.arange(array.shape[0])*array.strides[0]).astype(numpy.uintp)  # plus row index times row stride in bytes
+    ptr = ptr.ctypes.data_as(ctypes.c_void_p)  # pointer to the temporary address table, not to `array` itself
+    return ptr
 
 class NPArrayWithTag(numpy.ndarray):
     # Initialize kwargs in function tag_array
diff --git a/pyscf/lib/pbc/CMakeLists.txt b/pyscf/lib/pbc/CMakeLists.txt
index 6d185fdf85..636cb75451 100644
--- a/pyscf/lib/pbc/CMakeLists.txt
+++ b/pyscf/lib/pbc/CMakeLists.txt
@@ -13,10 +13,20 @@
 # limitations under the License.
 
 add_library(pbc SHARED ft_ao.c fill_ints.c fill_ints_sr.c optimizer.c grid_ao.c
-  nr_direct.c symmetry.c inner_dot.c cint2e.c cint3c2e.c nr_ecp.c transform_mo.c)
+  nr_direct.c symmetry.c inner_dot.c cint2e.c cint3c2e.c nr_ecp.c transform_mo.c
+  neighbor_list.c cell.c pp.c hf_grad.c fill_ints_screened.c)
 add_dependencies(pbc cgto cvhf np_helper)
 
 set_target_properties(pbc PROPERTIES
   LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR})
 
 target_link_libraries(pbc cgto cint cvhf np_helper ${BLAS_LIBRARIES} ${OPENMP_C_PROPERTIES})
+
+if(ENABLE_FFTW)  # the fft shared library is only built when ENABLE_FFTW is set
+add_library(fft SHARED fft.c)
+set_target_properties(fft PROPERTIES
+  LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}
+  COMPILE_FLAGS ${OpenMP_C_FLAGS}
+  LINK_FLAGS ${OpenMP_C_FLAGS})
+target_link_libraries(fft fftw3_threads fftw3 ${BLAS_LIBRARIES})  # links both the threaded and the base FFTW libraries
+endif()
diff --git a/pyscf/lib/pbc/cell.c b/pyscf/lib/pbc/cell.c
new file mode 100644
index 0000000000..20bb96e72c
--- /dev/null
+++ b/pyscf/lib/pbc/cell.c
@@ -0,0 +1,280 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <complex.h>
+#include "config.h"
+#include "cint.h"
+#include "pbc/cell.h"
+#include "np_helper/np_helper.h"
+
+#define SQUARE(r) (r[0]*r[0]+r[1]*r[1]+r[2]*r[2])
+/* Radius r with coeff * r^(l+2) * exp(-alpha*r^2) = precision, found by fixed-point iteration from max(r0, rmin+eps). */
+double pgf_rcut(int l, double alpha, double coeff, double precision, double r0)
+{
+    l += 2;  /* every use of l below refers to the shifted power l+2 */
+
+    double rcut;
+    double rmin = sqrt(.5 * l / alpha) * 2.;  /* twice the position of the maximum of r^l * exp(-alpha*r^2) */
+    double gmax = coeff * pow(rmin, l) * exp(-alpha * rmin * rmin);  /* value of the bounded function at rmin */
+    if (gmax < precision) {  /* already below the threshold at rmin: return it without iterating */
+        return rmin;
+    }
+
+    double eps = MIN(rmin/10, RCUT_EPS);  /* convergence tolerance on successive rcut values */
+    double c = log(coeff / precision);
+    double rcut_last;
+    rcut = MAX(r0, rmin+eps);  /* initial guess is never smaller than rmin+eps */
+
+    int i;
+    for (i = 0; i < RCUT_MAX_CYCLE; i++) {
+        rcut_last = rcut;
+        rcut = sqrt((l*log(rcut) + c) / alpha);  /* rearranged from log(coeff) + l*log(r) - alpha*r^2 = log(precision) */
+        if (fabs(rcut - rcut_last) < eps) {
+            break;
+        }
+    }
+    if (i == RCUT_MAX_CYCLE) {  /* loop ended without the break: warn, but still return the last iterate */
+        //printf("r0 = %.6e, l = %d, alpha = %.6e, coeff = %.6e, precision=%.6e\n", r0, l, alpha, coeff, precision);
+        fprintf(stderr, "pgf_rcut did not converge in %d cycles: %.6f > %.6f.\n",
+                RCUT_MAX_CYCLE, fabs(rcut - rcut_last), eps);
+    }
+    return rcut; 
+}
+/* For each shell ib: cutoff radius of every primitive (optionally stored in ptr_pgf_rcut[ib][p]) and their maximum in shell_radius[ib]. */
+void rcut_by_shells(double* shell_radius, double** ptr_pgf_rcut, 
+                    int* bas, double* env, int nbas, 
+                    double r0, double precision)
+{
+#pragma omp parallel
+{
+    int ib, ic, p;  /* declared inside the parallel region, hence private to each thread */
+    #pragma omp for schedule(static)
+    for (ib = 0; ib < nbas; ib ++) {
+        int l = bas[ANG_OF+ib*BAS_SLOTS];          /* angular momentum of the shell */
+        int nprim = bas[NPRIM_OF+ib*BAS_SLOTS];    /* number of primitives */
+        int ptr_exp = bas[PTR_EXP+ib*BAS_SLOTS];   /* offset of the exponents in env */
+        int nctr = bas[NCTR_OF+ib*BAS_SLOTS];      /* number of contractions */
+        int ptr_c = bas[PTR_COEFF+ib*BAS_SLOTS];   /* offset of the coefficients in env */
+        double rcut_max = 0, rcut;
+        for (p = 0; p < nprim; p++) {
+            double alpha = env[ptr_exp+p];
+            double cmax = 0;
+            for (ic = 0; ic < nctr; ic++) {  /* largest |coefficient| of primitive p over all contractions */
+                cmax = MAX(fabs(env[ptr_c+ic*nprim+p]), cmax);
+            }
+            rcut = pgf_rcut(l, alpha, cmax, precision, r0);
+            if (ptr_pgf_rcut) {  /* per-primitive output is optional: a NULL table skips it */
+                ptr_pgf_rcut[ib][p] = rcut;
+            }
+            rcut_max = MAX(rcut, rcut_max);
+        }
+        shell_radius[ib] = rcut_max;
+    }
+}
+}
+
+
+/* Phase factors for every (atom, G-vector) pair:
+ *   out_real[ia*ngrid + g] =  cos(R_ia . G_g)
+ *   out_imag[ia*ngrid + g] = -sin(R_ia . G_g)
+ * where R_ia = coords[3*ia .. 3*ia+2] and G_g = Gv[3*g .. 3*g+2].
+ * The loop over atoms is split statically over OpenMP threads. */
+static void get_SI_real_imag(double* out_real, double* out_imag,
+                             double* coords, double* Gv,
+                             int natm, size_t ngrid)
+{
+    int iatm;
+    #pragma omp parallel for schedule(static)
+    for (iatm = 0; iatm < natm; iatm++) {
+        const double *R = coords + iatm * 3;
+        double *re_row = out_real + iatm * ngrid;
+        double *im_row = out_imag + iatm * ngrid;
+        size_t g;
+        for (g = 0; g < ngrid; g++) {
+            const double *G = Gv + g * 3;
+            /* plain left-to-right dot product of the atom position with G */
+            const double phase = R[0] * G[0] + R[1] * G[1] + R[2] * G[2];
+            re_row[g] = cos(phase);
+            im_row[g] = -sin(phase);
+        }
+    }
+}
+
+/* Gv[((x*mesh[1]+y)*mesh[2]+z)*3 + c] = rx[x]*b[c] + ry[y]*b[3+c] + rz[z]*b[6+c] for c = 0,1,2. */
+void get_Gv(double* Gv, double* rx, double* ry, double* rz, int* mesh, double* b)
+{
+#pragma omp parallel
+{
+    int x, y, z;  /* declared inside the parallel region, hence private to each thread */
+    double *pGv;
+    #pragma omp for schedule(dynamic)
+    for (x = 0; x < mesh[0]; x++) {
+        pGv = Gv + x * (size_t)mesh[1] * mesh[2] * 3;  /* start of slab x; size_t cast keeps the offset from overflowing int */
+        for (y = 0; y < mesh[1]; y++) {
+        for (z = 0; z < mesh[2]; z++) {
+            pGv[0]  = rx[x] * b[0];
+            pGv[0] += ry[y] * b[3];
+            pGv[0] += rz[z] * b[6];
+            pGv[1]  = rx[x] * b[1];
+            pGv[1] += ry[y] * b[4];
+            pGv[1] += rz[z] * b[7];
+            pGv[2]  = rx[x] * b[2];
+            pGv[2] += ry[y] * b[5];
+            pGv[2] += rz[z] * b[8];
+            pGv += 3;  /* advance to the next grid point of this slab */
+        }}
+    }
+}
+}
+
+/* Reciprocal-space Ewald contribution to the nuclear gradient.  For every atom ia, adds to
+ * out[3*ia .. 3*ia+2] (the caller initialises out) the sum over grid points with |G|^2 >= 1e-12 of
+ *   coulG * charges[ia] * (SI_imag[ia,G]*ZSI_real[G] - SI_real[ia,G]*ZSI_imag[G]) * G,
+ * coulG = 4*pi*weights/|G|^2 * exp(-|G|^2/(4*ew_eta^2)); ZSI = sum_ia charges[ia]*SI[ia,:],
+ * assuming NPdgemm follows the BLAS dgemm conventions.
+ * NOTE(review): allocations are unchecked and ngrid is narrowed to int in the NPdgemm calls. */
+void ewald_gs_nuc_grad(double* out, double* Gv, double* charges, double* coords,
+                       double ew_eta, double weights, int natm, size_t ngrid)
+{
+    double *SI_real = (double*) malloc(natm*ngrid*sizeof(double));
+    double *SI_imag = (double*) malloc(natm*ngrid*sizeof(double));
+    get_SI_real_imag(SI_real, SI_imag, coords, Gv, natm, ngrid);
+
+    double *ZSI_real = calloc(ngrid, sizeof(double));
+    double *ZSI_imag = calloc(ngrid, sizeof(double));
+
+    NPdgemm('N', 'N', ngrid, 1, natm,
+            ngrid, natm, ngrid, 0, 0, 0,
+            SI_real, charges, ZSI_real, 1., 0.);
+    NPdgemm('N', 'N', ngrid, 1, natm,
+            ngrid, natm, ngrid, 0, 0, 0,
+            SI_imag, charges, ZSI_imag, 1., 0.);
+
+    const double fac = 4. * M_PI * weights;
+    const double fac1 = 4. * ew_eta * ew_eta;
+    int ia;
+    #pragma omp parallel for schedule(static)
+    for (ia = 0; ia < natm; ia++) {
+        const double charge_i = charges[ia];
+        const double *pSI_real = SI_real + ia * ngrid;
+        const double *pSI_imag = SI_imag + ia * ngrid;
+        /* Partial sums go through a declared reduction and every temporary is loop-local,
+         * so the simd loop has no scalar shared between lanes and no carried update of out[]. */
+        double gx = 0., gy = 0., gz = 0.;
+        size_t i;
+        #pragma omp simd reduction(+:gx,gy,gz)
+        for (i = 0; i < ngrid; i++) {
+            const double *pGv = Gv + i*3;
+            const double G2 = SQUARE(pGv);
+            if (G2 < 1e-12) {continue;}  /* skip the G = 0 term */
+            const double coulG = fac / G2 * exp(-G2 / fac1);
+            double tmp = coulG * charge_i;
+            tmp *= (pSI_imag[i] * ZSI_real[i] - pSI_real[i] * ZSI_imag[i]);
+            gx += tmp * pGv[0];
+            gy += tmp * pGv[1];
+            gz += tmp * pGv[2];
+        }
+        out[ia*3]   += gx;
+        out[ia*3+1] += gy;
+        out[ia*3+2] += gz;
+    }
+    free(SI_real); free(SI_imag);
+    free(ZSI_real); free(ZSI_imag);
+}
+
+/* Real-space Ewald sum: *ewovrl = 1/2 * sum_{i,j,L} q_i*q_j*erfc(beta*r)/r with r = |r_j + L - r_i| and 1e-10 < r < rcut. */
+void get_ewald_direct(double* ewovrl, double* chargs, double* coords, double* Ls,
+                      double beta, double rcut, int natm, int nL)
+{
+    *ewovrl = 0.0;  /* the result is overwritten, not accumulated into */
+
+    #pragma omp parallel
+    {
+        int i, j, l;
+        double *ri, *rj, *rL;
+        double rij[3];
+        double r, qi, qj;
+        double e_loc = 0.0;  /* per-thread partial sum */
+        #pragma omp for schedule(static)
+        for (i = 0; i < natm; i++) {
+            ri = coords + i*3;
+            qi = chargs[i];
+            for (j = 0; j < natm; j++) {
+                rj = coords + j*3;
+                qj = chargs[j];
+                for (l = 0; l < nL; l++) {
+                    rL = Ls + l*3;  /* lattice translation l */
+                    rij[0] = rj[0] + rL[0] - ri[0];
+                    rij[1] = rj[1] + rL[1] - ri[1];
+                    rij[2] = rj[2] + rL[2] - ri[2];
+                    r = sqrt(SQUARE(rij));
+                    if (r > 1e-10 && r < rcut) {  /* drops zero-distance terms and everything beyond rcut */
+                        e_loc += qi * qj * erfc(beta * r) / r;
+                    }
+                }
+            }
+        }
+        e_loc *= 0.5;  /* the loops visit every ordered pair (i,j), so halve the sum */
+
+        #pragma omp critical
+        *ewovrl += e_loc;  /* serialised merge of the per-thread partial sums */
+    }
+}
+
+/* Real-space Ewald gradient: subtracts q_i*q_j*(erfc(beta*r)/r^3 + 2*beta/sqrt(pi)*exp(-beta^2*r^2)/r^2) * (r_i - r_j + L) from out[3*i..3*i+2]. */
+void get_ewald_direct_nuc_grad(double* out, double* chargs, double* coords, double* Ls,
+                               double beta, double rcut, int natm, int nL)
+{
+    double fac = 2. * beta / sqrt(M_PI);
+    double beta2 = beta * beta;
+
+    #pragma omp parallel
+    {
+        int i, j, l;
+        double *ri, *rj, *rL, *pout;
+        double rij[3];
+        double r, r2, qi, qj, tmp;
+        #pragma omp for schedule(static)
+        for (i = 0; i < natm; i++) {
+            pout = out + i*3;  /* each atom i is handled by one thread, so these updates do not collide */
+            ri = coords + i*3;
+            qi = chargs[i];
+            for (j = 0; j < natm; j++) {
+                rj = coords + j*3;
+                qj = chargs[j];
+                for (l = 0; l < nL; l++) {
+                    rL = Ls + l*3;  /* lattice translation l */
+                    rij[0] = ri[0] - rj[0] + rL[0];
+                    rij[1] = ri[1] - rj[1] + rL[1];
+                    rij[2] = ri[2] - rj[2] + rL[2];
+                    r2 = SQUARE(rij);
+                    r = sqrt(r2);
+                    if (r > 1e-10 && r < rcut) {  /* drops zero-distance terms and everything beyond rcut */
+                        tmp  = qi * qj * (erfc(beta * r) / (r2 * r) + fac * exp(-beta2 * r2) / r2);
+                        pout[0] -= tmp * rij[0];  /* out is accumulated into: the caller must initialise it */
+                        pout[1] -= tmp * rij[1];
+                        pout[2] -= tmp * rij[2];
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/pyscf/lib/pbc/cell.h b/pyscf/lib/pbc/cell.h
new file mode 100644
index 0000000000..bec26bb2ea
--- /dev/null
+++ b/pyscf/lib/pbc/cell.h
@@ -0,0 +1,29 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#ifndef HAVE_DEFINED_CELL_H
+#define HAVE_DEFINED_CELL_H
+
+#define RCUT_MAX_CYCLE 10
+#define RCUT_EPS 1e-3
+
+double pgf_rcut(int l, double alpha, double coeff, double precision, double r0);
+void rcut_by_shells(double* shell_radius, double** ptr_pgf_rcut,
+                    int* bas, double* env, int nbas,
+                    double r0, double precision);
+#endif
diff --git a/pyscf/lib/pbc/fft.c b/pyscf/lib/pbc/fft.c
new file mode 100644
index 0000000000..3affbb9a02
--- /dev/null
+++ b/pyscf/lib/pbc/fft.c
@@ -0,0 +1,147 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdio.h>
+#include <complex.h>
+#include <fft.h>
+#include "config.h"
+
+#define BLKSIZE 128
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+/* Build (without executing) a rank-dimensional real-to-complex FFTW plan over `mesh`. */
+fftw_plan fft_create_r2c_plan(double* in, complex double* out, int rank, int* mesh)
+{
+    /* FFTW_ESTIMATE is the planner flag passed for this plan */
+    return fftw_plan_dft_r2c(rank, mesh, in, out, FFTW_ESTIMATE);
+}
+
+/* Build (without executing) a rank-dimensional complex-to-real FFTW plan over `mesh`. */
+fftw_plan fft_create_c2r_plan(complex double* in, double* out, int rank, int* mesh)
+{
+    /* FFTW_ESTIMATE is the planner flag passed for this plan */
+    return fftw_plan_dft_c2r(rank, mesh, in, out, FFTW_ESTIMATE);
+}
+
+void fft_execute(fftw_plan p)
+{
+    fftw_execute(p);  /* thin exported wrapper: execute a previously created plan */
+}
+
+void fft_destroy_plan(fftw_plan p)
+{
+    fftw_destroy_plan(p);  /* thin exported wrapper: release a plan created by the fft_create_* functions */
+}
+/* Complex DFT of `in` into `out`: (rank-1)-dimensional transforms of each mesh[0] slab, then 1D transforms along the first axis. */
+void _complex_fft(complex double* in, complex double* out, int* mesh, int rank, int sign)
+{
+    int i;
+    int nx = mesh[0];
+    int nyz = 1;  /* number of points in one slab: product of mesh[1..rank-1] */
+    for (i = 1; i < rank; i++) {
+        nyz *= mesh[i];
+    }
+    int nmax = nyz / BLKSIZE * BLKSIZE;  /* largest multiple of BLKSIZE not exceeding nyz */
+    fftw_plan p_2d = fftw_plan_dft(rank-1, mesh+1, in, out, sign, FFTW_ESTIMATE);  /* transform over the trailing dimensions of one slab */
+    int nn[BLKSIZE] = {nx};  /* nn[0] = nx; the remaining entries are zero-initialised */
+    fftw_plan p_3d_x = fftw_plan_many_dft(1, nn, BLKSIZE,
+                                          out, NULL, nyz, 1,
+                                          out, NULL, nyz, 1,
+                                          sign, FFTW_ESTIMATE);  /* BLKSIZE in-place length-nx transforms, stride nyz, dist 1 */
+
+    #pragma omp parallel private(i)
+    {
+        int off;
+        #pragma omp for schedule(dynamic)
+        for (i = 0; i < nx; i++) {  /* one slab per iteration */
+            off = i * nyz;
+            fftw_execute_dft(p_2d, in+off, out+off);
+        }
+
+        #pragma omp for schedule(dynamic)
+        for (i = 0; i < nmax; i+=BLKSIZE) {  /* one batch of BLKSIZE columns per iteration, applied in place on out */
+            fftw_execute_dft(p_3d_x, out+i, out+i);
+        }
+    }
+    fftw_destroy_plan(p_2d);
+    fftw_destroy_plan(p_3d_x);
+    
+    int nres = nyz - nmax;  /* columns left over after the full batches */
+    if (nres > 0) {
+        fftw_plan p_3d_x = fftw_plan_many_dft(1, nn, nres,
+                                          out+nmax, NULL, nyz, 1,
+                                          out+nmax, NULL, nyz, 1,
+                                          sign, FFTW_ESTIMATE);  /* shadows the outer plan, which was destroyed above */
+        fftw_execute(p_3d_x);  /* the remainder is processed outside the parallel region */
+        fftw_destroy_plan(p_3d_x);
+    }
+}
+
+void fft(complex double* in, complex double* out, int* mesh, int rank)
+{
+    _complex_fft(in, out, mesh, rank, FFTW_FORWARD);  /* forward complex transform; no scaling is applied */
+}
+/* Backward complex transform of `in` into `out`, followed by division of every output element by the number of mesh points. */
+void ifft(complex double* in, complex double* out, int* mesh, int rank)
+{
+    _complex_fft(in, out, mesh, rank, FFTW_BACKWARD);
+    size_t i, n = 1;
+    for (i = 0; i < rank; i++) {  /* n = product of mesh[0..rank-1] */
+        n *= mesh[i];
+    }
+    double fac = 1. / (double)n;  /* normalisation applied after the backward transform */
+    #pragma omp parallel for schedule(static)
+    for (i = 0; i < n; i++) {
+        out[i] *= fac;
+    }
+}
+/* One-shot real-to-complex transform: plan, execute and destroy the plan in a single call; no scaling is applied. */
+void rfft(double* in, complex double* out, int* mesh, int rank)
+{
+    fftw_plan p = fftw_plan_dft_r2c(rank, mesh, in, out, FFTW_ESTIMATE); 
+    fftw_execute(p);
+    fftw_destroy_plan(p);
+}
+/* One-shot complex-to-real transform of `in` into `out`, followed by division by the number of real mesh points. */
+void irfft(complex double* in, double* out, int* mesh, int rank)
+{
+    fftw_plan p = fftw_plan_dft_c2r(rank, mesh, in, out, FFTW_ESTIMATE);
+    fftw_execute(p);  /* NOTE(review): FFTW documents that c2r transforms may overwrite their input -- confirm callers do not reuse `in` */
+    fftw_destroy_plan(p);
+    size_t i, n = 1;
+    for (i = 0; i < rank; i++) {  /* n = product of mesh[0..rank-1] */
+        n *= mesh[i];
+    }
+    double fac = 1. / (double)n;  /* normalisation applied after the transform */
+    #pragma omp parallel for schedule(static)
+    for (i = 0; i < n; i++) {
+        out[i] *= fac;
+    }
+}
+
+/* Widen a real array to complex: out[k] = in[k] with zero imaginary part,
+ * for k in [0, n).  The loop is split statically over OpenMP threads. */
+void _copy_d2z(double complex *out, const double *in, const size_t n)
+{
+    size_t k;
+
+    #pragma omp parallel for schedule(static)
+    for (k = 0; k < n; k++) {
+        out[k] = in[k] + 0*_Complex_I;
+    }
+}
diff --git a/pyscf/lib/pbc/fft.h b/pyscf/lib/pbc/fft.h
new file mode 100644
index 0000000000..edc5382f7e
--- /dev/null
+++ b/pyscf/lib/pbc/fft.h
@@ -0,0 +1,26 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <fftw3.h>
+
+#define FFT_PLAN fftw_plan
+
+FFT_PLAN fft_create_r2c_plan(double* in, complex double* out, int rank, int* mesh);
+FFT_PLAN fft_create_c2r_plan(complex double* in, double* out, int rank, int* mesh);
+void fft_execute(FFT_PLAN p);
+void fft_destroy_plan(FFT_PLAN p);
diff --git a/pyscf/lib/pbc/fill_ints.c b/pyscf/lib/pbc/fill_ints.c
index 36c853724c..95857b19ee 100644
--- a/pyscf/lib/pbc/fill_ints.c
+++ b/pyscf/lib/pbc/fill_ints.c
@@ -1260,9 +1260,9 @@ static void shift_bas(double *env_loc, double *env, double *Ls, int ptr, int iL)
         env_loc[ptr+2] = env[ptr+2] + Ls[iL*3+2];
 }
 
-static void sort2c_ks1(double complex *out, double *bufr, double *bufi,
-                       int *shls_slice, int *ao_loc, int nkpts, int comp,
-                       int jsh, int msh0, int msh1)
+void sort2c_ks1(double complex *out, double *bufr, double *bufi,
+                int *shls_slice, int *ao_loc, int nkpts, int comp,
+                int jsh, int msh0, int msh1)
 {
         const int ish0 = shls_slice[0];
         const int ish1 = shls_slice[1];
diff --git a/pyscf/lib/pbc/fill_ints.h b/pyscf/lib/pbc/fill_ints.h
new file mode 100644
index 0000000000..ec2000755e
--- /dev/null
+++ b/pyscf/lib/pbc/fill_ints.h
@@ -0,0 +1,29 @@
+/* Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ */
+
+#ifndef HAVE_DEFINED_PBC_FILL_INTS_H
+#define HAVE_DEFINED_PBC_FILL_INTS_H
+
+void sort2c_gs1(double *out, double *in, int *shls_slice, int *ao_loc,
+                int comp, int ish, int jsh);
+void sort2c_gs2_igtj(double *out, double *in, int *shls_slice, int *ao_loc,
+                     int comp, int ish, int jsh);
+void sort2c_gs2_ieqj(double *out, double *in, int *shls_slice, int *ao_loc,
+                     int comp, int ish, int jsh);
+void sort2c_ks1(double complex *out, double *bufr, double *bufi,
+                int *shls_slice, int *ao_loc, int nkpts, int comp,
+                int jsh, int msh0, int msh1);
+#endif
diff --git a/pyscf/lib/pbc/fill_ints_screened.c b/pyscf/lib/pbc/fill_ints_screened.c
new file mode 100644
index 0000000000..5d100c7ae3
--- /dev/null
+++ b/pyscf/lib/pbc/fill_ints_screened.c
@@ -0,0 +1,1012 @@
+/* Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <complex.h>
+#include <assert.h>
+#include <string.h>
+#include "config.h"
+#include "cint.h"
+#include "vhf/fblas.h"
+#include "pbc/optimizer.h"
+#include "pbc/fill_ints.h"
+#include "pbc/neighbor_list.h"
+#include "np_helper/np_helper.h"
+
+#define INTBUFMAX       1000
+#define INTBUFMAX10     8000
+#define IMGBLK          80
+#define OF_CMPLX        2
+#define MAX_THREADS     256
+
+int GTOmax_shell_dim(int *ao_loc, int *shls_slice, int ncenter);
+int GTOmax_cache_size(int (*intor)(), int *shls_slice, int ncenter,
+                      int *atm, int natm, int *bas, int nbas, double *env);
+
+/* Split shells [ksh0,ksh1) into consecutive groups spanning at most dkmax AOs;
+ * kshloc[0..n] receives the group boundaries and the group count n is returned. */
+static int shloc_partition(int *kshloc, int *ao_loc, int ksh0, int ksh1, int dkmax)
+{
+        int sh, ngroups = 0;
+        int group_start = ao_loc[ksh0];
+        kshloc[0] = ksh0;
+        for (sh = ksh0 + 1; sh < ksh1; sh++) {
+                assert(ao_loc[sh+1] - ao_loc[sh] < dkmax);
+                if (ao_loc[sh+1] - group_start <= dkmax) {
+                        continue;
+                }
+                kshloc[++ngroups] = sh;
+                group_start = ao_loc[sh];
+        }
+        kshloc[++ngroups] = ksh1;
+        return ngroups;
+}
+
+/* env_loc[ptr+x] = env[ptr+x] + Ls[iL*3+x] for x = 0,1,2: the 3-vector at `ptr` translated by lattice vector iL. */
+static void shift_bas(double *env_loc, double *env, double *Ls, int ptr, int iL)
+{
+        int x;
+        for (x = 0; x < 3; x++) { env_loc[ptr+x] = env[ptr+x] + Ls[iL*3+x]; }
+}
+/* Scatter the integral blocks of shells (ish, jsh, msh0..msh1-1) from `in` into out[comp][naoi][naoj][naok]. */
+static void sort3c_gs1(double *out, double *in, int *shls_slice, int *ao_loc,
+                       int comp, int ish, int jsh, int msh0, int msh1)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int ksh0 = shls_slice[4];
+        const int ksh1 = shls_slice[5];
+        const size_t naoi = ao_loc[ish1] - ao_loc[ish0];
+        const size_t naoj = ao_loc[jsh1] - ao_loc[jsh0];
+        const size_t naok = ao_loc[ksh1] - ao_loc[ksh0];
+        const size_t njk = naoj * naok;    /* stride of the i index in out */
+        const size_t nijk = njk * naoi;    /* stride of the component index in out */
+
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int ip = ao_loc[ish] - ao_loc[ish0];  /* AO offset of shell ish inside the i slice */
+        const int jp = ao_loc[jsh] - ao_loc[jsh0];  /* AO offset of shell jsh inside the j slice */
+        const int dij = di * dj;
+        out += (ip * naoj + jp) * naok;  /* move to element (ip, jp, 0) of component 0 */
+
+        int i, j, k, ksh, ic, dk, dijk;
+        double *pin, *pout;
+
+        for (ksh = msh0; ksh < msh1; ksh++) {
+                dk = ao_loc[ksh+1] - ao_loc[ksh];
+                dijk = dij * dk;
+                for (ic = 0; ic < comp; ic++) {
+                        pout = out + nijk * ic + ao_loc[ksh]-ao_loc[ksh0];
+                        pin = in + dijk * ic;
+                        for (j = 0; j < dj; j++) {
+                                for (i = 0; i < di; i++) {
+                                for (k = 0; k < dk; k++) {
+                                        pout[i*njk+k] = pin[k*dij+i];  /* source has i fastest and k slowest; destination has k fastest */
+                                } }
+                                pout += naok;  /* next j row of the destination */
+                                pin += di;     /* next j column of the source */
+                        }
+                }
+                in += dijk * comp;  /* blocks of successive ksh are stored back to back in the source */
+        }
+}
+/* For shell pair (ish, jsh): sum intor over the lattice images listed in the neighbor list for every third shell, then pass each block of third shells to fsort. */
+static void _nr3c_screened_fill_g(int (*intor)(), void (*fsort)(), double *out, int nkpts_ij,
+                         int nkpts, int comp, int nimgs, int ish, int jsh,
+                         double *buf, double *env_loc, double *Ls,
+                         double *expkL_r, double *expkL_i, int *kptij_idx,  /* these three are not referenced in this routine */
+                         int *shls_slice, int *ao_loc,
+                         CINTOpt *cintopt, PBCOpt *pbcopt,
+                         int *atm, int natm, int *bas, int nbas, double *env,
+                         NeighborList** neighbor_list)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int ksh0 = shls_slice[4];
+        const int ksh1 = shls_slice[5];
+
+        jsh += jsh0;  /* ish/jsh arrive relative to the slice start and become absolute shell ids here */
+        ish += ish0;
+        int iptrxyz = atm[PTR_COORD+bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS];  /* env offset of the coordinates of the atom carrying shell ish */
+        int jptrxyz = atm[PTR_COORD+bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];  /* same for shell jsh */
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int dij = di * dj;
+        int dkmax = INTBUFMAX10 / dij / 2 * MIN(IMGBLK,nimgs);  /* AO budget per block of third shells */
+        int kshloc[ksh1-ksh0+1];
+        int nkshloc = shloc_partition(kshloc, ao_loc, ksh0, ksh1, dkmax);
+
+        int i, m, msh0, msh1, dijm;
+        int ksh, dk, iL, jL, dijkc, ksh_off, jsh_off;
+        int shls[3];
+
+        int nshi = ish1 - ish0;
+        int nshj = jsh1 - jsh0;
+        int nshij = nshi + nshj;
+        int idx_i, idx_j;
+
+        int dijmc = dij * dkmax * comp;
+        double *bufL = buf + dijmc;    /* accumulator for the image sum, placed after the intor output area */
+        double *cache = bufL + dijmc;  /* scratch handed to intor, placed after the accumulator */
+        double *pbuf;
+        int (*fprescreen)();
+        if (pbcopt != NULL) {
+                fprescreen = pbcopt->fprescreen;
+        } else {
+                fprescreen = PBCnoscreen;
+        }
+
+        shls[0] = ish;
+        shls[1] = jsh;
+        jsh_off = jsh - nshi;  /* NOTE(review): presumably the j shells follow the i shells in the shell list -- confirm with the caller */
+        NeighborList *nl0 = *neighbor_list;
+        NeighborPair *np0_ki, *np0_kj;
+        for (m = 0; m < nkshloc; m++) {
+                msh0 = kshloc[m];
+                msh1 = kshloc[m+1];
+                dkmax = ao_loc[msh1] - ao_loc[msh0];  /* from here on dkmax is the actual AO count of block m */
+                dijm = dij * dkmax;
+                dijmc = dijm * comp;
+                for (i = 0; i < dijmc; i++) {  /* clear the accumulator for this block */
+                    bufL[i] = 0;
+                }
+
+                pbuf = bufL;
+                for (ksh = msh0; ksh < msh1; ksh++){
+                    shls[2] = ksh;
+                    ksh_off = ksh - nshij;  /* NOTE(review): presumably the third shells follow the i and j shells -- confirm with the caller */
+                    dk = ao_loc[ksh+1] - ao_loc[ksh];
+                    dijkc = dij*dk * comp;
+                    np0_ki = (nl0->pairs)[ksh_off*nshi + ish];      /* neighbor-list entry for (ksh, ish) */
+                    np0_kj = (nl0->pairs)[ksh_off*nshj + jsh_off];  /* neighbor-list entry for (ksh, jsh) */
+                    if (np0_ki->nimgs > 0 && np0_kj->nimgs > 0) {  /* skip the shell triple when either list is empty */
+                        for (idx_i = 0; idx_i < np0_ki->nimgs; idx_i++){
+                            iL = (np0_ki->Ls_list)[idx_i];
+                            shift_bas(env_loc, env, Ls, iptrxyz, iL);  /* translate the i centre by image iL in the local env copy */
+                            for (idx_j = 0; idx_j < np0_kj->nimgs; idx_j++){
+                                jL = (np0_kj->Ls_list)[idx_j];
+                                shift_bas(env_loc, env, Ls, jptrxyz, jL);  /* translate the j centre by image jL */
+
+                                if ((*fprescreen)(shls, pbcopt, atm, bas, env_loc)) {
+                                    if ((*intor)(buf, NULL, shls, atm, natm, bas, nbas,
+                                        env_loc, cintopt, cache)) {
+                                        for (i = 0; i < dijkc; i++) {  /* add this image pair to the running sum */
+                                            pbuf[i] += buf[i];
+                                        }
+                                    }
+                                }
+                            } 
+
+                        }
+                    }
+                    pbuf += dijkc;  /* blocks of successive ksh are stored back to back */
+                }
+
+                (*fsort)(out, bufL, shls_slice, ao_loc, comp, ish, jsh, msh0, msh1);
+        }
+}
+/* For shell pair (ish, jsh): sum intor over neighbor-list images AND over all AOs of every third shell, then pass the dij*comp result to fsort. */
+static void _nr3c_screened_sum_auxbas_fill_g(int (*intor)(), void (*fsort)(), double *out, int nkpts_ij,
+                         int nkpts, int comp, int nimgs, int ish, int jsh,
+                         double *buf, double *env_loc, double *Ls,
+                         double *expkL_r, double *expkL_i, int *kptij_idx,  /* these three are not referenced in this routine */
+                         int *shls_slice, int *ao_loc,
+                         CINTOpt *cintopt, PBCOpt *pbcopt,
+                         int *atm, int natm, int *bas, int nbas, double *env,
+                         NeighborList** neighbor_list)
+{
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int ksh0 = shls_slice[4];
+        const int ksh1 = shls_slice[5];
+
+        jsh += jsh0;  /* ish/jsh arrive relative to the slice start and become absolute shell ids here */
+        ish += ish0;
+        int iptrxyz = atm[PTR_COORD+bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS];  /* env offset of the coordinates of the atom carrying shell ish */
+        int jptrxyz = atm[PTR_COORD+bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];  /* same for shell jsh */
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int dij = di * dj;
+        int dkmax = INTBUFMAX10 / dij / 2 * MIN(IMGBLK,nimgs);  /* only used to size the buffer offsets and in the assert below */
+        //int kshloc[ksh1-ksh0+1];
+        //int nkshloc = shloc_partition(kshloc, ao_loc, ksh0, ksh1, dkmax);
+
+        int i, k, ic;
+        int ksh, dk, dijk, iL, jL, ksh_off, jsh_off;
+        int shls[3];
+
+        int nshi = ish1 - ish0;
+        int nshj = jsh1 - jsh0;
+        int nshij = nshi + nshj;
+        int idx_i, idx_j;
+
+        int dijmc = dij * dkmax * comp;
+        double *bufL = buf + dijmc;    /* accumulator, placed after the intor output area */
+        double *cache = bufL + dijmc;  /* scratch handed to intor */
+        double *pbuf, *pbufL;
+        int (*fprescreen)();
+        if (pbcopt != NULL) {
+                fprescreen = pbcopt->fprescreen;
+        } else {
+                fprescreen = PBCnoscreen;
+        }
+
+        shls[0] = ish;
+        shls[1] = jsh;
+        jsh_off = jsh - nshi;  /* NOTE(review): presumably the j shells follow the i shells in the shell list -- confirm with the caller */
+        NeighborList *nl0 = *neighbor_list;
+        NeighborPair *np0_ki, *np0_kj;
+
+        int dijc = dij * comp;
+        for (i = 0; i < dijc; i++) {  /* the accumulator holds one dij block per component */
+            bufL[i] = 0;
+        }
+
+        for (ksh = ksh0; ksh < ksh1; ksh++){
+            dk = ao_loc[ksh+1] - ao_loc[ksh];
+            assert(dk < dkmax);
+            dijk = dij * dk;
+            shls[2] = ksh;
+            ksh_off = ksh - nshij;  /* NOTE(review): presumably the third shells follow the i and j shells -- confirm with the caller */
+            np0_ki = (nl0->pairs)[ksh_off*nshi + ish];      /* neighbor-list entry for (ksh, ish) */
+            np0_kj = (nl0->pairs)[ksh_off*nshj + jsh_off];  /* neighbor-list entry for (ksh, jsh) */
+            if (np0_ki->nimgs > 0 && np0_kj->nimgs > 0) {  /* skip the shell triple when either list is empty */
+                for (idx_i = 0; idx_i < np0_ki->nimgs; idx_i++){
+                    iL = (np0_ki->Ls_list)[idx_i];
+                    shift_bas(env_loc, env, Ls, iptrxyz, iL);  /* translate the i centre by image iL in the local env copy */
+                    for (idx_j = 0; idx_j < np0_kj->nimgs; idx_j++){
+                        jL = (np0_kj->Ls_list)[idx_j];
+                        shift_bas(env_loc, env, Ls, jptrxyz, jL);  /* translate the j centre by image jL */
+
+                        if ((*fprescreen)(shls, pbcopt, atm, bas, env_loc)) {
+                            if ((*intor)(buf, NULL, shls, atm, natm, bas, nbas,
+                                env_loc, cintopt, cache)) {
+                                for (ic = 0; ic < comp; ic++) {
+                                    pbufL = bufL + ic * dij;
+                                    pbuf = buf + ic * dijk;
+                                    for (k = 0; k < dk; k++) {  /* fold the third-shell AO index into the dij block */
+                                        for (i = 0; i < dij; i++) {
+                                            pbufL[i] += pbuf[i];
+                                        }
+                                        pbuf += dij;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        (*fsort)(out, bufL, shls_slice, ao_loc, comp, ish, jsh);  /* fsort is called without a third-shell range here */
+}
+
+/*
+ * Fill callback for 3-center integrals.  It forwards every argument
+ * unchanged to _nr3c_screened_fill_g, supplying sort3c_gs1 as that
+ * routine's second argument.
+ * NOTE(review): the "gs1" suffix presumably denotes the variant without
+ * permutation symmetry between the first two shells -- confirm.
+ */
+void PBCnr3c_screened_fill_gs1(int (*intor)(), double *out, int nkpts_ij,
+                      int nkpts, int comp, int nimgs, int ish, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i, int *kptij_idx,
+                      int *shls_slice, int *ao_loc,
+                      CINTOpt *cintopt, PBCOpt *pbcopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        _nr3c_screened_fill_g(intor, sort3c_gs1, out,
+                              nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                              buf, env_loc, Ls,
+                              expkL_r, expkL_i, kptij_idx,
+                              shls_slice, ao_loc,
+                              cintopt, pbcopt,
+                              atm, natm, bas, nbas, env,
+                              neighbor_list);
+}
+
+/*
+ * Scatter the integral blocks of shell pair (ish, jsh) for the shells
+ * msh0 <= ksh < msh1 from `in` into `out`.
+ *
+ * In `out` the (i, j) pair index is addressed as a packed lower triangle
+ * (row r begins at r*(r+1)/2, counted relative to the first row of the
+ * slice) and the third index runs fastest.  The full di x dj block is
+ * written.
+ *
+ * `in` is consumed shell by shell: for each ksh it holds `comp` blocks of
+ * di*dj*dk values with i fastest, then j, then k.
+ */
+static void sort3c_gs2_igtj(double *out, double *in, int *shls_slice, int *ao_loc,
+                            int comp, int ish, int jsh, int msh0, int msh1)
+{
+        const int ibra0 = shls_slice[0];
+        const int ibra1 = shls_slice[1];
+        const int jket0 = shls_slice[2];
+        const int kaux0 = shls_slice[4];
+        const int kaux1 = shls_slice[5];
+        /* length of the fastest (third) axis of out */
+        const size_t naux = ao_loc[kaux1] - ao_loc[kaux0];
+        /* packed-triangle offset of the first row covered by the slice */
+        const size_t tri0 = ((size_t)ao_loc[ibra0]) * (ao_loc[ibra0] + 1) / 2;
+        const size_t npair = ((size_t)ao_loc[ibra1]) * (ao_loc[ibra1] + 1) / 2 - tri0;
+        const size_t comp_stride = npair * naux;
+
+        const int row0 = ao_loc[ish];
+        const int ni = ao_loc[ish+1] - row0;
+        const int nj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int nij = ni * nj;
+        const int col0 = ao_loc[jsh] - ao_loc[jket0];
+        double *base = out + (((size_t)row0)*(row0+1)/2 - tri0 + col0) * naux;
+
+        int ksh, ic, i, j, k;
+
+        for (ksh = msh0; ksh < msh1; ksh++) {
+                const int nk = ao_loc[ksh+1] - ao_loc[ksh];
+                const int nijk = nij * nk;
+                const int koff = ao_loc[ksh] - ao_loc[kaux0];
+                for (ic = 0; ic < comp; ic++) {
+                        const double *src = in + nijk * ic;
+                        double *row = base + comp_stride * ic + koff;
+                        for (i = 0; i < ni; i++) {
+                                for (j = 0; j < nj; j++) {
+                                        const double *s = src + (j * ni + i);
+                                        double *d = row + j * naux;
+                                        for (k = 0; k < nk; k++) {
+                                                d[k] = s[k * nij];
+                                        }
+                                }
+                                /* each packed-triangle row is one pair longer
+                                 * than the previous one */
+                                row += (i + row0 + 1) * naux;
+                        }
+                }
+                in += nijk * comp;
+        }
+}
+
+/*
+ * Scatter one di x dj block per component from `in` (i fastest) into `out`,
+ * whose pair index is addressed as a packed lower triangle: row r begins at
+ * r*(r+1)/2, counted relative to the first row of the slice.  All dj columns
+ * of every row are written.
+ */
+void sort2c_gs2_igtj(double *out, double *in, int *shls_slice, int *ao_loc,
+                     int comp, int ish, int jsh)
+{
+        const int ibra0 = shls_slice[0];
+        const int ibra1 = shls_slice[1];
+        const int jket0 = shls_slice[2];
+        /* packed-triangle offset of the first row covered by the slice */
+        const size_t tri0 = ((size_t)ao_loc[ibra0]) * (ao_loc[ibra0] + 1) / 2;
+        const size_t npair = ((size_t)ao_loc[ibra1]) * (ao_loc[ibra1] + 1) / 2 - tri0;
+
+        const int row0 = ao_loc[ish];
+        const int ni = ao_loc[ish+1] - row0;
+        const int nj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int nij = ni * nj;
+        const int col0 = ao_loc[jsh] - ao_loc[jket0];
+        double *base = out + (((size_t)row0)*(row0+1)/2 - tri0 + col0);
+
+        int ic, i, j;
+
+        for (ic = 0; ic < comp; ic++) {
+                const double *src = in + nij * ic;
+                double *row = base + npair * ic;
+                for (i = 0; i < ni; i++) {
+                        for (j = 0; j < nj; j++) {
+                                row[j] = src[j*ni + i];
+                        }
+                        /* step to the next, one-element-longer, triangle row */
+                        row += (i + row0 + 1);
+                }
+        }
+}
+
+/*
+ * Diagonal-shell counterpart of the packed-triangle scatter: the block is
+ * treated as di x di and only the elements with j <= i are copied from `in`
+ * to `out`, for the shells msh0 <= ksh < msh1.
+ *
+ * `out` addresses the pair index as a packed lower triangle (row r begins at
+ * r*(r+1)/2 relative to the first row of the slice) with the third index
+ * fastest; `in` holds, per ksh, `comp` blocks of di*di*dk values with i
+ * fastest, then j, then k.
+ */
+static void sort3c_gs2_ieqj(double *out, double *in, int *shls_slice, int *ao_loc,
+                            int comp, int ish, int jsh, int msh0, int msh1)
+{
+        const int ibra0 = shls_slice[0];
+        const int ibra1 = shls_slice[1];
+        const int jket0 = shls_slice[2];
+        const int kaux0 = shls_slice[4];
+        const int kaux1 = shls_slice[5];
+        /* length of the fastest (third) axis of out */
+        const size_t naux = ao_loc[kaux1] - ao_loc[kaux0];
+        const size_t tri0 = ((size_t)ao_loc[ibra0]) * (ao_loc[ibra0] + 1) / 2;
+        const size_t npair = ((size_t)ao_loc[ibra1]) * (ao_loc[ibra1] + 1) / 2 - tri0;
+        const size_t comp_stride = npair * naux;
+
+        const int row0 = ao_loc[ish];
+        const int ni = ao_loc[ish+1] - row0;
+        const int nii = ni * ni;
+        const int col0 = ao_loc[jsh] - ao_loc[jket0];
+        double *base = out + (((size_t)row0)*(row0+1)/2 - tri0 + col0) * naux;
+
+        int ksh, ic, i, j, k;
+
+        for (ksh = msh0; ksh < msh1; ksh++) {
+                const int nk = ao_loc[ksh+1] - ao_loc[ksh];
+                const int niik = nii * nk;
+                const int koff = ao_loc[ksh] - ao_loc[kaux0];
+                for (ic = 0; ic < comp; ic++) {
+                        const double *src = in + niik * ic;
+                        double *row = base + comp_stride * ic + koff;
+                        for (i = 0; i < ni; i++) {
+                                /* lower triangle only: columns 0..i */
+                                for (j = 0; j <= i; j++) {
+                                        const double *s = src + (j * ni + i);
+                                        double *d = row + j * naux;
+                                        for (k = 0; k < nk; k++) {
+                                                d[k] = s[k * nii];
+                                        }
+                                }
+                                row += (i + row0 + 1) * naux;
+                        }
+                }
+                in += niik * comp;
+        }
+}
+
+/*
+ * Diagonal-shell scatter for the 2-index case: the block is treated as
+ * di x di and, per component, only the elements with j <= i are copied from
+ * `in` (i fastest) into the packed lower triangle addressed by `out`.
+ */
+void sort2c_gs2_ieqj(double *out, double *in, int *shls_slice, int *ao_loc,
+                     int comp, int ish, int jsh)
+{
+        const int ibra0 = shls_slice[0];
+        const int ibra1 = shls_slice[1];
+        const int jket0 = shls_slice[2];
+        const size_t tri0 = ((size_t)ao_loc[ibra0]) * (ao_loc[ibra0] + 1) / 2;
+        const size_t npair = ((size_t)ao_loc[ibra1]) * (ao_loc[ibra1] + 1) / 2 - tri0;
+
+        const int row0 = ao_loc[ish];
+        const int ni = ao_loc[ish+1] - row0;
+        const int nii = ni * ni;
+        const int col0 = ao_loc[jsh] - ao_loc[jket0];
+        double *base = out + (((size_t)row0)*(row0+1)/2 - tri0 + col0);
+
+        int ic, i, j;
+
+        for (ic = 0; ic < comp; ic++) {
+                const double *src = in + nii * ic;
+                double *row = base + npair * ic;
+                for (i = 0; i < ni; i++) {
+                        /* lower triangle only: columns 0..i */
+                        for (j = 0; j <= i; j++) {
+                                row[j] = src[j*ni + i];
+                        }
+                        row += (i + row0 + 1);
+                }
+        }
+}
+
+/*
+ * Copy one di x dj block per component from `in` (i fastest) into the
+ * rectangular array `out`, which is addressed as naoi rows of naoj entries
+ * per component.  The block lands at row ao_loc[ish]-ao_loc[ish0] and
+ * column ao_loc[jsh]-ao_loc[jsh0].
+ */
+void sort2c_gs1(double *out, double *in, int *shls_slice, int *ao_loc,
+                int comp, int ish, int jsh)
+{
+        const int ibra0 = shls_slice[0];
+        const int ibra1 = shls_slice[1];
+        const int jket0 = shls_slice[2];
+        const int jket1 = shls_slice[3];
+
+        const int ni = ao_loc[ish+1] - ao_loc[ish];
+        const int nj = ao_loc[jsh+1] - ao_loc[jsh];
+        const int nij = ni * nj;
+        const int row0 = ao_loc[ish] - ao_loc[ibra0];
+        const int col0 = ao_loc[jsh] - ao_loc[jket0];
+        const size_t nrow = ao_loc[ibra1] - ao_loc[ibra0];
+        const size_t ncol = ao_loc[jket1] - ao_loc[jket0];
+        const size_t comp_stride = nrow * ncol;
+        double *base = out + (row0 * ncol + col0);
+
+        int ic, i, j;
+
+        for (ic = 0; ic < comp; ic++) {
+                const double *src = in + nij * ic;
+                for (i = 0; i < ni; i++) {
+                        double *dst = base + comp_stride * ic + i * ncol;
+                        for (j = 0; j < nj; j++) {
+                                dst[j] = src[j*ni + i];
+                        }
+                }
+        }
+}
+
+/*
+ * Fill callback that only processes shell pairs on or below the diagonal.
+ *
+ * ip is ish offset by shls_slice[0]; jp is jsh offset by shls_slice[2] and
+ * shifted down by nbas.  Pairs with ip < jp are skipped entirely, ip == jp
+ * is forwarded to _nr3c_screened_fill_g with sort3c_gs2_ieqj, and ip > jp
+ * with sort3c_gs2_igtj.
+ * NOTE(review): the "- nbas" shift presumably reflects second-index shells
+ * being listed nbas entries after the first-index shells -- confirm.
+ */
+void PBCnr3c_screened_fill_gs2(int (*intor)(), double *out, int nkpts_ij,
+                      int nkpts, int comp, int nimgs, int ish, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i, int *kptij_idx,
+                      int *shls_slice, int *ao_loc,
+                      CINTOpt *cintopt, PBCOpt *pbcopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        const int ip = ish + shls_slice[0];
+        const int jp = jsh + shls_slice[2] - nbas;
+
+        if (ip < jp) {
+                /* upper triangle: nothing to do */
+                return;
+        }
+        if (ip == jp) {
+                _nr3c_screened_fill_g(intor, sort3c_gs2_ieqj, out,
+                                      nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                                      buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                                      shls_slice, ao_loc, cintopt, pbcopt,
+                                      atm, natm, bas, nbas, env, neighbor_list);
+                return;
+        }
+        _nr3c_screened_fill_g(intor, sort3c_gs2_igtj, out,
+                              nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                              buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                              shls_slice, ao_loc, cintopt, pbcopt,
+                              atm, natm, bas, nbas, env, neighbor_list);
+}
+
+/*
+ * Fill callback that forwards every argument unchanged to
+ * _nr3c_screened_sum_auxbas_fill_g, supplying sort2c_gs1 as the routine
+ * that writes the result into `out`.
+ */
+void PBCnr3c_screened_sum_auxbas_fill_gs1(int (*intor)(), double *out, int nkpts_ij,
+                      int nkpts, int comp, int nimgs, int ish, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i, int *kptij_idx,
+                      int *shls_slice, int *ao_loc,
+                      CINTOpt *cintopt, PBCOpt *pbcopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        _nr3c_screened_sum_auxbas_fill_g(intor, sort2c_gs1, out,
+                                         nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                                         buf, env_loc, Ls,
+                                         expkL_r, expkL_i, kptij_idx,
+                                         shls_slice, ao_loc,
+                                         cintopt, pbcopt,
+                                         atm, natm, bas, nbas, env,
+                                         neighbor_list);
+}
+
+/*
+ * Fill callback that only processes shell pairs on or below the diagonal.
+ *
+ * ip is ish offset by shls_slice[0]; jp is jsh offset by shls_slice[2] and
+ * shifted down by nbas.  Pairs with ip < jp are skipped, ip == jp is
+ * forwarded to _nr3c_screened_sum_auxbas_fill_g with sort2c_gs2_ieqj, and
+ * ip > jp with sort2c_gs2_igtj.
+ * NOTE(review): the "- nbas" shift presumably reflects second-index shells
+ * being listed nbas entries after the first-index shells -- confirm.
+ */
+void PBCnr3c_screened_sum_auxbas_fill_gs2(int (*intor)(), double *out, int nkpts_ij,
+                      int nkpts, int comp, int nimgs, int ish, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i, int *kptij_idx,
+                      int *shls_slice, int *ao_loc,
+                      CINTOpt *cintopt, PBCOpt *pbcopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        const int ip = ish + shls_slice[0];
+        const int jp = jsh + shls_slice[2] - nbas;
+
+        if (ip < jp) {
+                /* upper triangle: nothing to do */
+                return;
+        }
+        if (ip == jp) {
+                _nr3c_screened_sum_auxbas_fill_g(intor, sort2c_gs2_ieqj, out,
+                                nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                                buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                                shls_slice, ao_loc, cintopt, pbcopt,
+                                atm, natm, bas, nbas, env, neighbor_list);
+                return;
+        }
+        _nr3c_screened_sum_auxbas_fill_g(intor, sort2c_gs2_igtj, out,
+                        nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                        buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                        shls_slice, ao_loc, cintopt, pbcopt,
+                        atm, natm, bas, nbas, env, neighbor_list);
+}
+
+/*
+ * Contract a di x dj integral block (one per component, i fastest) with the
+ * matching sub-block of `dm` and accumulate the per-component scalar into
+ * `grad`: added to the entry of the atom owning shell shls[0], subtracted
+ * from the entry of the atom owning shell shls[2].
+ *
+ * `dm` is addressed with row stride `nao`; the block starts at row
+ * ao_loc[ish] and column ao_loc[jsh] - nao.  `grad` is addressed as
+ * grad[atom*comp + component].
+ * NOTE(review): the "- nao" and "- 2*natm" shifts presumably undo offsets
+ * introduced by a concatenated basis/atom list -- confirm against the caller.
+ */
+static void contract_3c1e_ipik_dm_gs1(double *grad, double* dm, double *eri,
+                                      int *shls, int *ao_loc, int *atm, int natm,
+                                      int *bas, int nbas, int comp, int nao)
+{
+    const int ish = shls[0];
+    const int jsh = shls[1];
+    const int ksh = shls[2];
+
+    const int ni = ao_loc[ish+1] - ao_loc[ish];
+    const int nj = ao_loc[jsh+1] - ao_loc[jsh];
+    const int nij = ni * nj;
+    const size_t row0 = ao_loc[ish];
+    const size_t col0 = ao_loc[jsh] - nao;
+
+    const int iatm = bas[ATOM_OF+ish*BAS_SLOTS];
+    const int katm = bas[ATOM_OF+ksh*BAS_SLOTS] - 2*natm;
+
+    const double *dm_blk = dm + (row0 * nao + col0);
+    double contracted[comp];
+    int ic, i, j;
+
+    /* first pass: one scalar per component */
+    for (ic = 0; ic < comp; ic++) {
+        const double *e = eri + nij * ic;
+        const double *d = dm_blk;
+        double acc = 0;
+        for (i = 0; i < ni; i++) {
+            for (j = 0; j < nj; j++) {
+                acc += e[j*ni+i] * d[j];
+            }
+            d += nao;
+        }
+        contracted[ic] = acc;
+    }
+
+    /* second pass: equal and opposite contributions on the two atoms */
+    for (ic = 0; ic < comp; ic++) {
+        grad[iatm*comp+ic] += contracted[ic];
+        grad[katm*comp+ic] -= contracted[ic];
+    }
+}
+
+static void _nr3c1e_screened_nuc_grad_fill_g(int (*intor)(), void (*fcontract)(),
+            double *grad, double *dm, int nkpts_ij, int nkpts,
+            int comp, int nimgs, int ish, int jsh,
+            double *buf, double *env_loc, double *Ls,
+            double *expkL_r, double *expkL_i, int *kptij_idx,
+            int *shls_slice, int *ao_loc,
+            CINTOpt *cintopt, PBCOpt *pbcopt,
+            int *atm, int natm, int *bas, int nbas, double *env, int nao,
+            NeighborList** neighbor_list)
+{   /*
+     * Handle one (ish, jsh) shell pair.  For every shell ksh in
+     * [shls_slice[4], shls_slice[5]) whose two neighbor-list entries both
+     * report at least one image, the integrals returned by `intor` are
+     * summed over all listed image pairs (iL, jL) and over the functions of
+     * ksh into a dij*comp accumulator, which is then handed to `fcontract`
+     * together with `grad` and `dm`.
+     *
+     * nkpts_ij, nkpts, expkL_r, expkL_i and kptij_idx are accepted but never
+     * referenced in this body.
+     */
+    const int ish0 = shls_slice[0];
+    //const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    //const int jsh1 = shls_slice[3];
+    const int ksh0 = shls_slice[4];
+    const int ksh1 = shls_slice[5];
+
+    ish += ish0;    /* ish/jsh arrive relative to the slice start */
+    jsh += jsh0;
+    int iptrxyz = atm[PTR_COORD+bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS];  /* PTR_COORD slot of the atom owning ish */
+    int jptrxyz = atm[PTR_COORD+bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];  /* same for the atom owning jsh */
+    const int di = ao_loc[ish+1] - ao_loc[ish];
+    const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+    const int dij = di * dj;
+    int dkmax = INTBUFMAX10 / dij / 2 * MIN(IMGBLK,nimgs);  /* bound on dk checked by the assert in the ksh loop */
+    //int kshloc[ksh1-ksh0+1];
+    //int nkshloc = shloc_partition(kshloc, ao_loc, ksh0, ksh1, dkmax);
+
+    int i, k, ic;
+    int ksh, dk, dijk, iL, jL, ksh_off, jsh_off;
+    int shls[3];
+
+    int idx_i, idx_j;
+
+    int dijc = dij * comp;
+    int dijmc = dijc * dkmax;
+    double *bufL = buf + dijmc;     /* accumulator of dijc doubles, placed after the dijmc doubles reserved for intor output */
+    double *cache = bufL + dijc;    /* remainder of buf, passed to intor as its last argument */
+    double *pbuf, *pbufL;
+    int (*fprescreen)();
+    if (pbcopt != NULL) {
+            fprescreen = pbcopt->fprescreen;
+    } else {
+            fprescreen = PBCnoscreen;
+    }
+
+    shls[0] = ish;
+    shls[1] = jsh;
+    jsh_off = jsh - nbas;   /* NOTE(review): presumably jsh is numbered nbas entries after the first-index shells -- confirm against the caller */
+    NeighborList *nl0 = *neighbor_list;
+    NeighborPair *np0_ki, *np0_kj;
+
+    for (ksh = ksh0; ksh < ksh1; ksh++){
+        dk = ao_loc[ksh+1] - ao_loc[ksh];
+        assert(dk < dkmax);     /* has no effect when NDEBUG is defined */
+        dijk = dij * dk;
+        shls[2] = ksh;
+        ksh_off = ksh - nbas*2; /* NOTE(review): presumably ksh is numbered after two runs of nbas shells -- confirm against the caller */
+        np0_ki = (nl0->pairs)[ksh_off*nbas + ish];      /* pairs is indexed as k-row * nbas + column */
+        np0_kj = (nl0->pairs)[ksh_off*nbas + jsh_off];
+        if (np0_ki->nimgs > 0 && np0_kj->nimgs > 0) {
+            for (i = 0; i < dijc; i++) {    /* reset the accumulator for this ksh */
+                bufL[i] = 0;
+            }
+            for (idx_i = 0; idx_i < np0_ki->nimgs; idx_i++){
+                iL = (np0_ki->Ls_list)[idx_i];
+                shift_bas(env_loc, env, Ls, iptrxyz, iL);   /* NOTE(review): presumably rewrites the i-atom coordinates in env_loc for image iL -- confirm */
+                for (idx_j = 0; idx_j < np0_kj->nimgs; idx_j++){
+                    jL = (np0_kj->Ls_list)[idx_j];
+                    shift_bas(env_loc, env, Ls, jptrxyz, jL);
+
+                    if ((*fprescreen)(shls, pbcopt, atm, bas, env_loc)) {
+                        if ((*intor)(buf, NULL, shls, atm, natm, bas, nbas,
+                                     env_loc, cintopt, cache))
+                        {
+                            for (ic = 0; ic < comp; ic++) {
+                                pbufL = bufL + ic * dij;
+                                pbuf = buf + ic * dijk;
+                                for (k = 0; k < dk; k++) {      /* fold the dk slices of this component into one dij block */
+                                    for (i = 0; i < dij; i++) {
+                                        pbufL[i] += pbuf[i];
+                                    }
+                                    pbuf += dij;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            (*fcontract)(grad, dm, bufL, shls, ao_loc, atm, natm, bas, nbas, comp, nao);    /* called once per ksh that has images, even if every intor call returned 0 */
+        }
+    }
+}
+
+/*
+ * Fill callback for the gradient driver: forwards every argument unchanged
+ * to _nr3c1e_screened_nuc_grad_fill_g, with contract_3c1e_ipik_dm_gs1 as the
+ * contraction routine and `out` as the gradient accumulator.
+ */
+void PBCnr3c1e_screened_nuc_grad_fill_gs1(int (*intor)(), double *out, double* dm,
+                      int nkpts_ij, int nkpts, int comp, int nimgs, int ish, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i, int *kptij_idx,
+                      int *shls_slice, int *ao_loc,
+                      CINTOpt *cintopt, PBCOpt *pbcopt,
+                      int *atm, int natm, int *bas, int nbas, double *env, int nao,
+                      NeighborList** neighbor_list)
+{
+        _nr3c1e_screened_nuc_grad_fill_g(intor, contract_3c1e_ipik_dm_gs1,
+                                         out, dm,
+                                         nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                                         buf, env_loc, Ls,
+                                         expkL_r, expkL_i, kptij_idx,
+                                         shls_slice, ao_loc,
+                                         cintopt, pbcopt,
+                                         atm, natm, bas, nbas, env, nao,
+                                         neighbor_list);
+}
+
+/*
+ * OpenMP driver: invokes `fill` once for every (ish, jsh) pair of the first
+ * two shell ranges in shls_slice, with indices relative to the range starts.
+ *
+ * Before the parallel region the complex array expkL (nimgs*nkpts entries)
+ * is split into a real table followed by an imaginary table.  Inside the
+ * region every thread allocates its own copy of `env` and its own scratch
+ * buffer, and releases both when the loop is finished.
+ * neighbor_list must not be NULL (checked with assert only).
+ */
+void PBCnr3c_screened_drv(int (*intor)(), void (*fill)(), double complex *eri,
+                 int nkpts_ij, int nkpts, int comp, int nimgs,
+                 double *Ls, double complex *expkL, int *kptij_idx,
+                 int *shls_slice, int *ao_loc,
+                 CINTOpt *cintopt, PBCOpt *pbcopt,
+                 int *atm, int natm, int *bas, int nbas, double *env, int nenv,
+                 NeighborList** neighbor_list)
+{
+        assert(neighbor_list != NULL);
+        const int nish = shls_slice[1] - shls_slice[0];
+        const int njsh = shls_slice[3] - shls_slice[2];
+        const int npair_sh = nish * njsh;
+
+        /* one allocation: real parts first, imaginary parts right behind */
+        double *expkL_r = malloc(sizeof(double) * nimgs*nkpts * OF_CMPLX);
+        double *expkL_i = expkL_r + nimgs*nkpts;
+        int n;
+        for (n = 0; n < nimgs*nkpts; n++) {
+                const double complex z = expkL[n];
+                expkL_r[n] = creal(z);
+                expkL_i[n] = cimag(z);
+        }
+
+        /* per-thread scratch size, excluding the integral cache */
+        size_t count = (nkpts * OF_CMPLX + nimgs) * INTBUFMAX10 * comp;
+        count += nimgs * nkpts * OF_CMPLX;
+        const int cache_size = GTOmax_cache_size(intor, shls_slice, 3,
+                                                 atm, natm, bas, nbas, env);
+
+#pragma omp parallel
+{
+        int ij;
+        double *buf = malloc(sizeof(double)*(count+cache_size));
+        /* private env copy, handed to the fill callback as env_loc */
+        double *env_loc = malloc(sizeof(double)*nenv);
+        NPdcopy(env_loc, env, nenv);
+#pragma omp for schedule(dynamic)
+        for (ij = 0; ij < npair_sh; ij++) {
+                const int ish = ij / njsh;
+                const int jsh = ij % njsh;
+                (*fill)(intor, eri, nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                        buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                        shls_slice, ao_loc, cintopt, pbcopt,
+                        atm, natm, bas, nbas, env, neighbor_list);
+        }
+        free(env_loc);
+        free(buf);
+}
+        free(expkL_r);
+}
+
+/*
+ * OpenMP driver: invokes `fill` once for every (ish, jsh) pair of the first
+ * two shell ranges in shls_slice, with indices relative to the range starts.
+ *
+ * The expkL argument is not read; the fill callback receives NULL for both
+ * expkL_r and expkL_i.  Every thread allocates its own copy of `env` and its
+ * own scratch buffer and releases both when the loop is finished.
+ * neighbor_list must not be NULL (checked with assert only).
+ */
+void PBCnr3c_screened_sum_auxbas_drv(int (*intor)(), void (*fill)(), double complex *eri,
+                 int nkpts_ij, int nkpts, int comp, int nimgs,
+                 double *Ls, double complex *expkL, int *kptij_idx,
+                 int *shls_slice, int *ao_loc,
+                 CINTOpt *cintopt, PBCOpt *pbcopt,
+                 int *atm, int natm, int *bas, int nbas, double *env, int nenv,
+                 NeighborList** neighbor_list)
+{
+        assert(neighbor_list != NULL);
+        const int nish = shls_slice[1] - shls_slice[0];
+        const int njsh = shls_slice[3] - shls_slice[2];
+        const int npair_sh = nish * njsh;
+        double *expkL_r = NULL;
+        double *expkL_i = NULL;
+
+        /* per-thread scratch size, excluding the integral cache */
+        size_t count = (nkpts * OF_CMPLX + nimgs) * INTBUFMAX10 * comp;
+        count += nimgs * nkpts * OF_CMPLX;
+        const int cache_size = GTOmax_cache_size(intor, shls_slice, 3,
+                                                 atm, natm, bas, nbas, env);
+
+#pragma omp parallel
+{
+        int ij;
+        double *buf = malloc(sizeof(double)*(count+cache_size));
+        /* private env copy, handed to the fill callback as env_loc */
+        double *env_loc = malloc(sizeof(double)*nenv);
+        NPdcopy(env_loc, env, nenv);
+#pragma omp for schedule(dynamic)
+        for (ij = 0; ij < npair_sh; ij++) {
+                const int ish = ij / njsh;
+                const int jsh = ij % njsh;
+                (*fill)(intor, eri, nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                        buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                        shls_slice, ao_loc, cintopt, pbcopt,
+                        atm, natm, bas, nbas, env, neighbor_list);
+        }
+        free(env_loc);
+        free(buf);
+}
+}
+
+void PBCnr3c1e_screened_nuc_grad_drv(int (*intor)(), void (*fill)(), 
+                 double* grad, double* dm,
+                 int nkpts_ij, int nkpts, int comp, int nimgs,
+                 double *Ls, double complex *expkL, int *kptij_idx,
+                 int *shls_slice, int *ao_loc,
+                 CINTOpt *cintopt, PBCOpt *pbcopt,
+                 int *atm, int natm, int *bas, int nbas, double *env, int nenv, int nao,
+                 NeighborList** neighbor_list)
+{       /*
+         * OpenMP driver for the gradient fill: `fill` is invoked once for
+         * every (ish, jsh) pair of the first two shell ranges in shls_slice.
+         * Thread 0 accumulates directly into `grad`; every other thread
+         * accumulates into its own zero-initialised buffer of natm*comp
+         * doubles, and all buffers are passed to NPomp_dsum_reduce_inplace
+         * after the loop.
+         *
+         * expkL is not read; the fill callback receives NULL for both
+         * expkL_r and expkL_i.
+         */
+        assert(neighbor_list != NULL);
+        const int ish0 = shls_slice[0];
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int nish = ish1 - ish0;
+        const int njsh = jsh1 - jsh0;
+        double *expkL_r=NULL, *expkL_i=NULL;
+        //double *expkL_r = malloc(sizeof(double) * nimgs*nkpts * OF_CMPLX);
+        //double *expkL_i = expkL_r + nimgs*nkpts;
+        //int i;
+        //for (i = 0; i < nimgs*nkpts; i++) {
+        //        expkL_r[i] = creal(expkL[i]);
+        //        expkL_i[i] = cimag(expkL[i]);
+        //}
+
+        size_t count;   /* per-thread scratch size, excluding the integral cache */
+        count = (nkpts * OF_CMPLX + nimgs) * INTBUFMAX10 * comp;
+        count+= nimgs * nkpts * OF_CMPLX;
+        const int cache_size = GTOmax_cache_size(intor, shls_slice, 3,
+                                                 atm, natm, bas, nbas, env);
+
+        double *gradbufs[MAX_THREADS];  /* one slot per thread id; NOTE(review): nothing here checks the team size against MAX_THREADS */
+#pragma omp parallel
+{
+        int ish, jsh, ij;
+        double *env_loc = malloc(sizeof(double)*nenv);  /* private env copy, handed to the fill callback as env_loc */
+        NPdcopy(env_loc, env, nenv);
+        double *grad_loc;
+        int thread_id = omp_get_thread_num();
+        if (thread_id == 0) {
+                grad_loc = grad;        /* thread 0 writes straight into the caller's array */
+        } else {
+                grad_loc = calloc(natm*comp, sizeof(double));
+        }
+        gradbufs[thread_id] = grad_loc;
+
+        double *buf = malloc(sizeof(double)*(count+cache_size));
+        #pragma omp for schedule(dynamic)
+        for (ij = 0; ij < nish*njsh; ij++) {
+                ish = ij / njsh;
+                jsh = ij % njsh;
+                (*fill)(intor, grad_loc, dm, nkpts_ij, nkpts, comp, nimgs, ish, jsh,
+                        buf, env_loc, Ls, expkL_r, expkL_i, kptij_idx,
+                        shls_slice, ao_loc, cintopt, pbcopt, atm, natm, bas, nbas, env, nao, neighbor_list);
+        }
+        free(buf);
+        free(env_loc);
+
+        NPomp_dsum_reduce_inplace(gradbufs, natm*comp); /* executed by every thread; NOTE(review): presumably sums all buffers into gradbufs[0] (== grad) and synchronises internally before returning -- confirm in np_helper */
+        if (thread_id != 0) {
+                free(grad_loc);         /* only the calloc'ed buffers are owned here */
+        }
+}
+        //free(expkL_r);
+}
+
+
+/*
+ * Compute the 2-center integrals between shell `jsh` and every shell in
+ * [ish0, shls_slice[1]) and store them in `out` through sort2c_ks1.
+ *
+ * ish0 and jsh arrive relative to shls_slice[0] / shls_slice[2] and are made
+ * absolute here.  The i-shell range is split by shloc_partition into chunks;
+ * for each chunk the real and imaginary accumulators are zeroed, and for
+ * every (ish, jsh) pair the images listed in the neighbor list are visited:
+ * the j-atom entry of env_loc is rewritten by shift_bas for image jL, `intor`
+ * is evaluated into bufL, and two dger_ rank-1 updates add bufL times the
+ * nkpts entries expkL_r[jL + n*nimgs] (resp. expkL_i) to the accumulators.
+ *
+ * buf layout per chunk: [bufk_r | bufk_i | bufL | cache], where bufk_r and
+ * bufk_i hold dmjc*nkpts doubles each and bufL holds dmjc doubles.
+ *
+ * Returns 1 if at least one intor call reported a non-zero block, else 0.
+ */
+static int _nr2c_screened_fill(
+                int (*intor)(), double complex *out,
+                int nkpts, int comp, int nimgs, int jsh, int ish0,
+                double *buf, double *env_loc, double *Ls,
+                double *expkL_r, double *expkL_i,
+                int *shls_slice, int *ao_loc, CINTOpt *cintopt,
+                int *atm, int natm, int *bas, int nbas, double *env,
+                NeighborList** neighbor_list)
+{
+        const int ish1 = shls_slice[1];
+        const int jsh0 = shls_slice[2];
+        const int jsh1 = shls_slice[3];
+        const int nshi = ish1 - shls_slice[0];
+        const int nshj = jsh1 - jsh0;
+
+        /* constants passed by address to the Fortran-style dger_ */
+        const double D1 = 1;
+        const int I1 = 1;
+
+        ish0 += shls_slice[0];
+        jsh += jsh0;
+        /* NOTE(review): presumably the j shells are numbered nshi entries
+         * after the i shells -- confirm against the caller */
+        int jsh_off = jsh - nshi;
+        int jptrxyz = atm[PTR_COORD+bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS];
+        const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+        int dimax = INTBUFMAX10 / dj;
+        int ishloc[ish1-ish0+1];
+        int nishloc = shloc_partition(ishloc, ao_loc, ish0, ish1, dimax);
+
+        int m, msh0, msh1, dijc, dmjc, ish, di;
+        /* Must start at 1: the flag is only ever cleared when an integral
+         * block is produced, yet it is read unconditionally on return.
+         * Leaving it uninitialized made the return value indeterminate
+         * whenever no block was computed. */
+        int empty = 1;
+        int jL, idx_j;
+        int shls[2];
+        double *bufk_r = buf;
+        double *bufk_i, *bufL, *pbufk_r, *pbufk_i, *cache;
+
+        NeighborList *nl0 = *neighbor_list;
+        NeighborPair *np0;
+
+        shls[1] = jsh;
+        for (m = 0; m < nishloc; m++) {
+                msh0 = ishloc[m];
+                msh1 = ishloc[m+1];
+                /* from here on dimax is the AO count of the current chunk */
+                dimax = ao_loc[msh1] - ao_loc[msh0];
+                dmjc = dj * dimax * comp;
+                bufk_i = bufk_r + dmjc * nkpts;
+                bufL   = bufk_i + dmjc * nkpts;
+                cache  = bufL   + dmjc;
+
+                /* bufk_r and bufk_i are contiguous, so one memset clears both */
+                memset(bufk_r, 0, 2*dmjc*nkpts*sizeof(double));
+                pbufk_r = bufk_r;
+                pbufk_i = bufk_i;
+                for (ish = msh0; ish < msh1; ish++) {
+                        shls[0] = ish;
+                        di = ao_loc[ish+1] - ao_loc[ish];
+                        dijc = di * dj * comp;
+                        np0 = (nl0->pairs)[ish*nshj + jsh_off];
+                        if (np0->nimgs > 0) {
+                                for (idx_j = 0; idx_j < np0->nimgs; idx_j++){
+                                        jL = (np0->Ls_list)[idx_j];
+                                        shift_bas(env_loc, env, Ls, jptrxyz, jL);
+                                        if ((*intor)(bufL, NULL, shls, atm, natm, bas, nbas,
+                                                     env_loc, cintopt, cache)) {
+                                                empty = 0;
+                                                /* accumulator(:, n) += bufL * expkL[jL + n*nimgs] */
+                                                dger_(&dijc, &nkpts, &D1, bufL, &I1,
+                                                      expkL_r+jL, &nimgs, pbufk_r, &dmjc);
+                                                dger_(&dijc, &nkpts, &D1, bufL, &I1,
+                                                      expkL_i+jL, &nimgs, pbufk_i, &dmjc);
+                                        }
+                                }
+                        }
+                        /* advance even when the pair has no images so the
+                         * chunk layout stays dense */
+                        pbufk_r += dijc;
+                        pbufk_i += dijc;
+                }
+                sort2c_ks1(out, bufk_r, bufk_i, shls_slice, ao_loc,
+                           nkpts, comp, jsh, msh0, msh1);
+        }
+        return !empty;
+}
+
+/*
+ * Fill callback for 2-center integrals: forwards every argument to
+ * _nr2c_screened_fill with a relative starting i-shell of 0, so the whole
+ * i-shell range is processed for column shell jsh.  The helper's return
+ * value is discarded.
+ */
+void PBCnr2c_screened_fill_ks1(int (*intor)(), double complex *out,
+                      int nkpts, int comp, int nimgs, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i,
+                      int *shls_slice, int *ao_loc, CINTOpt *cintopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        const int first_ish = 0;
+
+        (void)_nr2c_screened_fill(intor, out, nkpts, comp, nimgs, jsh, first_ish,
+                                  buf, env_loc, Ls, expkL_r, expkL_i,
+                                  shls_slice, ao_loc, cintopt,
+                                  atm, natm, bas, nbas, env, neighbor_list);
+}
+
+/*
+ * Fill callback for 2-center integrals: forwards every argument to
+ * _nr2c_screened_fill with the relative starting i-shell set to jsh, so
+ * only i shells from that index onwards are processed.  The helper's return
+ * value is discarded.
+ */
+void PBCnr2c_screened_fill_ks2(int (*intor)(), double complex *out,
+                      int nkpts, int comp, int nimgs, int jsh,
+                      double *buf, double *env_loc, double *Ls,
+                      double *expkL_r, double *expkL_i,
+                      int *shls_slice, int *ao_loc, CINTOpt *cintopt,
+                      int *atm, int natm, int *bas, int nbas, double *env,
+                      NeighborList** neighbor_list)
+{
+        const int first_ish = jsh;
+
+        (void)_nr2c_screened_fill(intor, out, nkpts, comp, nimgs, jsh, first_ish,
+                                  buf, env_loc, Ls, expkL_r, expkL_i,
+                                  shls_slice, ao_loc, cintopt,
+                                  atm, natm, bas, nbas, env, neighbor_list);
+}
+
+/*
+ * OpenMP driver for 2-center integrals: invokes `fill` once for every shell
+ * index jsh in [0, shls_slice[3] - shls_slice[2]).
+ *
+ * Before the parallel region the complex array expkL (nimgs*nkpts entries)
+ * is split into a real table followed by an imaginary table.  Inside the
+ * region every thread allocates its own copy of `env` and its own scratch
+ * buffer, and releases both when the loop is finished.
+ * neighbor_list must not be NULL (checked with assert only).
+ */
+void PBCnr2c_screened_drv(int (*intor)(), void (*fill)(), double complex *out,
+                 int nkpts, int comp, int nimgs,
+                 double *Ls, double complex *expkL,
+                 int *shls_slice, int *ao_loc, CINTOpt *cintopt,
+                 int *atm, int natm, int *bas, int nbas, double *env, int nenv,
+                 NeighborList** neighbor_list)
+{
+        assert(neighbor_list != NULL);
+        const int njsh = shls_slice[3] - shls_slice[2];
+
+        /* one allocation: real parts first, imaginary parts right behind */
+        double *expkL_r = malloc(sizeof(double) * nimgs*nkpts * OF_CMPLX);
+        double *expkL_i = expkL_r + nimgs*nkpts;
+        int n;
+        for (n = 0; n < nimgs*nkpts; n++) {
+                const double complex z = expkL[n];
+                expkL_r[n] = creal(z);
+                expkL_i[n] = cimag(z);
+        }
+        const int cache_size = GTOmax_cache_size(intor, shls_slice, 2,
+                                                 atm, natm, bas, nbas, env);
+
+#pragma omp parallel
+{
+        int jsh;
+        const size_t count = (nkpts+1) * OF_CMPLX;
+        double *buf = malloc(sizeof(double)*(count*INTBUFMAX10*comp+cache_size));
+        /* private env copy, handed to the fill callback as env_loc */
+        double *env_loc = malloc(sizeof(double)*nenv);
+        NPdcopy(env_loc, env, nenv);
+#pragma omp for schedule(dynamic)
+        for (jsh = 0; jsh < njsh; jsh++) {
+                (*fill)(intor, out, nkpts, comp, nimgs, jsh,
+                        buf, env_loc, Ls, expkL_r, expkL_i,
+                        shls_slice, ao_loc, cintopt,
+                        atm, natm, bas, nbas, env,
+                        neighbor_list);
+        }
+        free(env_loc);
+        free(buf);
+}
+        free(expkL_r);
+}
diff --git a/pyscf/lib/pbc/hf_grad.c b/pyscf/lib/pbc/hf_grad.c
new file mode 100644
index 0000000000..7c781fba19
--- /dev/null
+++ b/pyscf/lib/pbc/hf_grad.c
@@ -0,0 +1,95 @@
+/* Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include "config.h"
+#include "vhf/fblas.h"
+#include "np_helper/np_helper.h"
+#include "pbc/neighbor_list.h"
+
+#define MAX_THREADS 256
+
+/* out[shls_atm[ish]*comp + ic] += sum_{i in ish, j in jsh} vhf[ic,i,j] * dm[i,j]
+ * for every shell pair of shls_slice. Pairs whose neighbor-list entry has
+ * nimgs == 0 are skipped; a NULL neighbor_list disables the screening.
+ * Thread 0 accumulates straight into out, which must hold comp*natm doubles. */
+void contract_vhf_dm(double* out, double* vhf, double* dm,
+                     NeighborList** neighbor_list,
+                     int* shls_slice, int* ao_loc, int* shls_atm,
+                     int comp, int natm, int nbas)
+{
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+    const size_t nijsh = (size_t)nish * njsh;
+    const size_t naoi = ao_loc[ish1] - ao_loc[ish0];
+    const size_t naoj = ao_loc[jsh1] - ao_loc[jsh0];
+    const int I1 = 1; // unit stride for ddot_
+    double *out_bufs[MAX_THREADS]; // one accumulator per thread; assumes <= MAX_THREADS threads
+
+#pragma omp parallel
+{
+    size_t ij, ish, jsh, p0, q0;
+    int ni, nj, i, ic, iatm, nimgs=1;
+    NeighborList *nl0 = (neighbor_list != NULL) ? *neighbor_list : NULL;
+    double *pvhf, *pdm;
+    int thread_id = omp_get_thread_num();
+    double *buf;
+    if (thread_id == 0) {
+        buf = out;
+    } else {
+        buf = calloc(comp*natm, sizeof(double));
+    }
+    out_bufs[thread_id] = buf;
+
+    #pragma omp for schedule(dynamic)
+    for (ij = 0; ij < nijsh; ij++) {
+        ish = ij / njsh + ish0;
+        jsh = ij % njsh + jsh0;
+
+        if (nl0 != NULL) {
+            nimgs = ((nl0->pairs)[ish*nbas + jsh])->nimgs;
+        }
+        if (nimgs > 0) { // this shell pair has contribution
+            p0 = ao_loc[ish] - ao_loc[ish0];
+            q0 = ao_loc[jsh] - ao_loc[jsh0];
+            ni = ao_loc[ish+1] - ao_loc[ish];
+            nj = ao_loc[jsh+1] - ao_loc[jsh];
+
+            iatm = shls_atm[ish];
+            pvhf = vhf + (p0 * naoj + q0);
+            pdm = dm + (p0 * naoj + q0);
+            for (ic = 0; ic < comp; ic++) {
+                for (i = 0; i < ni; i++) {
+                    // row stride is comp so the index stays inside the comp*natm buffer
+                    buf[iatm*comp+ic] += ddot_(&nj, pvhf+i*naoj, &I1, pdm+i*naoj, &I1);
+                }
+                pvhf += naoi * naoj; // advance to the next component of vhf
+            }
+        }
+    }
+
+    NPomp_dsum_reduce_inplace(out_bufs, comp*natm);
+    if (thread_id != 0) {
+        free(buf);
+    }
+}
+}
diff --git a/pyscf/lib/pbc/neighbor_list.c b/pyscf/lib/pbc/neighbor_list.c
new file mode 100644
index 0000000000..26fb52fd37
--- /dev/null
+++ b/pyscf/lib/pbc/neighbor_list.c
@@ -0,0 +1,206 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <math.h>
+#include "config.h"
+#include "cint.h"
+#include "pbc/neighbor_list.h"
+
+#define SQUARE(r)       (r[0]*r[0]+r[1]*r[1]+r[2]*r[2])
+
+/* Allocate a NeighborPair that owns a copy of the first nimgs entries of
+ * Ls_list (NULL if nimgs <= 0), with q_cond/center unset; store it in *np. */
+void init_neighbor_pair(NeighborPair** np, int nimgs, int* Ls_list)
+{
+    NeighborPair *pair = (NeighborPair*) malloc(sizeof(NeighborPair));
+    int n;
+    pair->nimgs = nimgs;
+    pair->Ls_list = NULL;
+    pair->q_cond = NULL;
+    pair->center = NULL;
+    if (nimgs > 0) {
+        pair->Ls_list = (int*) malloc(sizeof(int)*nimgs);
+        for (n = 0; n < nimgs; n++) {
+            pair->Ls_list[n] = Ls_list[n];
+        }
+    }
+    *np = pair;
+}
+
+/* Release the NeighborPair stored in *np together with the Ls_list, q_cond
+ * and center arrays it owns, then reset *np to NULL.
+ * A NULL *np is a no-op. */
+void del_neighbor_pair(NeighborPair** np)
+{
+    NeighborPair *pair = *np;
+    if (pair == NULL) {
+        return;
+    }
+
+    // free(NULL) is a no-op, so unset members need no separate check
+    free(pair->Ls_list);
+    free(pair->q_cond);
+    free(pair->center);
+
+    free(pair);
+    *np = NULL;
+}
+
+/* Create in *nl an nish x njsh NeighborList whose pair slots are all NULL. */
+void init_neighbor_list(NeighborList** nl, int nish, int njsh, int nimgs)
+{
+    int i, j;
+    NeighborList *list = (NeighborList*) malloc(sizeof(NeighborList));
+    list->pairs = (NeighborPair**) malloc(sizeof(NeighborPair*)*nish*njsh);
+    list->nish = nish;
+    list->njsh = njsh;
+    list->nimgs = nimgs;
+    for (i = 0; i < nish; i++) {
+        for (j = 0; j < njsh; j++) { list->pairs[i*njsh + j] = NULL; }
+    }
+    *nl = list;
+}
+
+/* Build in *nl the neighbor list between nish "i" shells and njsh "j" shells.
+ * For every pair, the indices iL of the translations Ls[3*iL..3*iL+2] with
+ *     |r_j + Ls[iL] - r_i| < ish_rcut[ish] + jsh_rcut[jsh]
+ * are stored through init_neighbor_pair. Shell centres are looked up in the
+ * respective atm/bas/env arrays. With hermi == 1 only pairs with jsh >= ish
+ * are built; the remaining slots stay NULL. */
+void build_neighbor_list(NeighborList** nl,
+                         int* ish_atm, int* ish_bas, double* ish_env, double* ish_rcut,
+                         int* jsh_atm, int* jsh_bas, double* jsh_env, double* jsh_rcut,
+                         int nish, int njsh, double* Ls, int nimgs, int hermi)
+{
+    init_neighbor_list(nl, nish, njsh, nimgs);
+    NeighborList* list = *nl;
+
+#pragma omp parallel
+{
+    int i, j, img, nfound;
+    double cutoff, dist;
+    double *ri, *rj, *shift;
+    double sep[3];
+
+    // per-thread scratch holding the image indices accepted for one pair
+    int *found = (int*) malloc(sizeof(int)*nimgs);
+#pragma omp for schedule(dynamic)
+    for (i = 0; i < nish; i++) {
+        ri = ish_env + ish_atm[ish_bas[i*BAS_SLOTS+ATOM_OF]*ATM_SLOTS+PTR_COORD];
+        // hermi == 1: start at the diagonal so that jsh < ish is never visited
+        for (j = (hermi == 1) ? i : 0; j < njsh; j++) {
+            rj = jsh_env + jsh_atm[jsh_bas[j*BAS_SLOTS+ATOM_OF]*ATM_SLOTS+PTR_COORD];
+            cutoff = ish_rcut[i] + jsh_rcut[j];
+            nfound = 0;
+            for (img = 0; img < nimgs; img++) {
+                shift = Ls + img*3;
+                sep[0] = rj[0] + shift[0] - ri[0];
+                sep[1] = rj[1] + shift[1] - ri[1];
+                sep[2] = rj[2] + shift[2] - ri[2];
+                dist = sqrt(SQUARE(sep));
+                if (dist < cutoff) {
+                    found[nfound] = img;
+                    nfound++;
+                }
+            }
+            // copies found[0..nfound) into a freshly allocated pair
+            init_neighbor_pair(list->pairs + i*njsh+j, nfound, found);
+        }
+    }
+    free(found);
+}
+}
+
+/* Destroy the NeighborList in *nl: release every pair slot, the slot array
+ * and the list itself, then reset *nl to NULL. A NULL *nl is a no-op. */
+void del_neighbor_list(NeighborList** nl)
+{
+    NeighborList *list = *nl;
+    int i, j;
+    if (list == NULL) {
+        return;
+    }
+    if (list->pairs != NULL) {
+        for (i = 0; i < list->nish; i++) {
+            for (j = 0; j < list->njsh; j++) {
+                del_neighbor_pair(list->pairs + i*list->njsh + j);
+            }
+        }
+        free(list->pairs);
+    }
+    free(list);
+    *nl = NULL;
+}
+
+
+int NLOpt_noscreen(int* shls, NeighborListOpt* opt)
+{
+    return 1; // accept every shell pair; neither argument is read
+}
+
+/* Prescreen callback: return 1 if the pair (shls[0], shls[1]) has at least one
+ * image in the neighbor list attached to opt, 0 otherwise. Neither opt->nl
+ * nor the pair slot is checked for NULL. */
+int NLOpt_screen(int* shls, NeighborListOpt* opt)
+{
+    const NeighborList *list = opt->nl;
+    // pair (ish, jsh) is stored at ish*njsh + jsh
+    const NeighborPair *pair = list->pairs[shls[0]*list->njsh + shls[1]];
+    return (pair->nimgs > 0) ? 1 : 0;
+}
+
+/* Allocate *opt with no neighbor list attached and screening disabled. */
+void NLOpt_init(NeighborListOpt **opt)
+{
+    *opt = malloc(sizeof(NeighborListOpt));
+    (*opt)->nl = NULL;
+    (*opt)->fprescreen = NLOpt_noscreen;
+}
+
+/* Free the optimizer object in *opt and reset *opt to NULL. The attached
+ * neighbor list is not released here. A NULL *opt is a no-op. */
+void NLOpt_del(NeighborListOpt **opt)
+{
+    if (*opt != NULL) {
+        free(*opt);
+        *opt = NULL;
+    }
+}
+
+void NLOpt_set_nl(NeighborListOpt *opt, NeighborList *nl)
+{
+    opt->nl = nl; // the pointer is stored, not copied; NLOpt_del does not free it
+}
+
+void NLOpt_reset(NeighborListOpt *opt)
+{
+    opt->nl = NULL; // detach only; the list itself is not freed here
+    opt->fprescreen = &NLOpt_noscreen; // NLOpt_screen would dereference the NULL nl
+}
+
+void NLOpt_set_optimizer(NeighborListOpt *opt)
+{
+    opt->fprescreen = &NLOpt_screen; // screen with opt->nl, which must be set before the callback runs
+}
+
+void NLOpt_del_optimizer(NeighborListOpt *opt)
+{
+    opt->fprescreen = &NLOpt_noscreen; // screening off: every pair is accepted
+}
+
diff --git a/pyscf/lib/pbc/neighbor_list.h b/pyscf/lib/pbc/neighbor_list.h
new file mode 100644
index 0000000000..3364be1f3d
--- /dev/null
+++ b/pyscf/lib/pbc/neighbor_list.h
@@ -0,0 +1,41 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#ifndef HAVE_DEFINED_NEIGHBOR_LIST_H
+#define HAVE_DEFINED_NEIGHBOR_LIST_H
+typedef struct NeighborPair_struct {
+    int nimgs;       // number of entries in Ls_list
+    int *Ls_list;    // indices into the translation array Ls; NULL when nimgs <= 0
+    double *q_cond;  // NULL after init; freed with the pair (meaning not shown here - TODO confirm)
+    double *center;  // NULL after init; freed with the pair (meaning not shown here - TODO confirm)
+} NeighborPair;
+
+typedef struct NeighborList_struct {
+    int nish;             // number of "i" shells
+    int njsh;             // number of "j" shells; also the row stride of pairs
+    int nimgs;            // number of translations the list was built with
+    NeighborPair **pairs; // nish*njsh pointers, pair (ish, jsh) at ish*njsh + jsh; a slot may be NULL
+} NeighborList;
+
+typedef struct NeighborListOpt_struct {
+    NeighborList *nl; // list consulted by the screening callback; not freed by NLOpt_del
+    int (*fprescreen)(int *shls, struct NeighborListOpt_struct *opt); // nonzero = keep the pair in shls
+} NeighborListOpt;
+
+int NLOpt_noscreen(int* shls, NeighborListOpt* opt);
+#endif
diff --git a/pyscf/lib/pbc/optimizer.c b/pyscf/lib/pbc/optimizer.c
index d30c81c3e8..a37494ca0a 100644
--- a/pyscf/lib/pbc/optimizer.c
+++ b/pyscf/lib/pbc/optimizer.c
@@ -17,6 +17,7 @@
  */
 
 #include <stdlib.h>
+#include <math.h>
 #include "cint.h"
 #include "pbc/optimizer.h"
 
@@ -27,6 +28,7 @@ void PBCinit_optimizer(PBCOpt **opt, int *atm, int natm,
 {
         PBCOpt *opt0 = malloc(sizeof(PBCOpt));
         opt0->rrcut = NULL;
+        opt0->rcut = NULL;
         opt0->fprescreen = &PBCnoscreen;
         *opt = opt0;
 }
@@ -41,11 +43,13 @@ void PBCdel_optimizer(PBCOpt **opt)
         if (opt0->rrcut != NULL) {
                 free(opt0->rrcut);
         }
+        if (opt0->rcut != NULL) { // release the loose-screening cutoffs as well
+                free(opt0->rcut);
+        }
         free(opt0);
         *opt = NULL;
 }
 
-
 int PBCnoscreen(int *shls, PBCOpt *opt, int *atm, int *bas, double *env)
 {
         return 1;
@@ -68,6 +72,23 @@ int PBCrcut_screen(int *shls, PBCOpt *opt, int *atm, int *bas, double *env)
         return (rr < opt->rrcut[ish] || rr < opt->rrcut[jsh]);
 }
 
+/* Loose prescreen: keep the pair (shls[0], shls[1]) when the distance between
+ * the two shell centres is below opt->rcut[ish] + opt->rcut[jsh].
+ * A NULL opt keeps every pair. */
+int PBCrcut_screen_loose(int *shls, PBCOpt *opt, int *atm, int *bas, double *env)
+{
+        if (opt == NULL) {
+                return 1; // no screen
+        }
+        const int ish = shls[0];
+        const int jsh = shls[1];
+        const double *ri = env + atm[bas[ATOM_OF+ish*BAS_SLOTS]*ATM_SLOTS+PTR_COORD];
+        const double *rj = env + atm[bas[ATOM_OF+jsh*BAS_SLOTS]*ATM_SLOTS+PTR_COORD];
+        const double rirj[3] = {ri[0] - rj[0], ri[1] - rj[1], ri[2] - rj[2]};
+        const double reach = opt->rcut[ish] + opt->rcut[jsh];
+        return sqrt(SQUARE(rirj)) < reach;
+}
+
 void PBCset_rcut_cond(PBCOpt *opt, double *rcut,
                       int *atm, int natm, int *bas, int nbas, double *env)
 {
@@ -82,3 +103,18 @@ void PBCset_rcut_cond(PBCOpt *opt, double *rcut,
                 opt->rrcut[i] = rcut[i] * rcut[i];
         }
 }
+
+/* Install the loose cutoff screening on opt: replace opt->rcut with a private
+ * copy of the nbas per-shell cutoffs in rcut and select PBCrcut_screen_loose. */
+void PBCset_rcut_cond_loose(PBCOpt *opt, double *rcut,
+                            int *atm, int natm, int *bas, int nbas, double *env)
+{
+        int ib;
+        double *cutoffs = (double *)malloc(sizeof(double) * nbas);
+        for (ib = 0; ib < nbas; ib++) {
+                cutoffs[ib] = rcut[ib];
+        }
+        free(opt->rcut); // free(NULL) is a no-op
+        opt->rcut = cutoffs;
+        opt->fprescreen = &PBCrcut_screen_loose;
+}
diff --git a/pyscf/lib/pbc/optimizer.h b/pyscf/lib/pbc/optimizer.h
index ff3299715b..62c8be5d32 100644
--- a/pyscf/lib/pbc/optimizer.h
+++ b/pyscf/lib/pbc/optimizer.h
@@ -16,10 +16,11 @@
  * Author: Qiming Sun <osirpt.sun@gmail.com>
  */
 
-#if !defined(HAVE_DEFINED_CVHFOPT_H)
-#define HAVE_DEFINED_CVHFOPT_H
+#if !defined(HAVE_DEFINED_PBCOPT_H)
+#define HAVE_DEFINED_PBCOPT_H
 typedef struct PBCOpt_struct {
     double *rrcut;
+    double *rcut; // per-shell cutoff radii (not squared), read by PBCrcut_screen_loose
     int (*fprescreen)(int *shls, struct PBCOpt_struct *opt,
                       int *atm, int *bas, double *env);
 } PBCOpt;
@@ -27,4 +28,3 @@ typedef struct PBCOpt_struct {
 
 int PBCnoscreen(int *shls, PBCOpt *opt, int *atm, int *bas, double *env);
 int PBCrcut_screen(int *shls, PBCOpt *opt, int *atm, int *bas, double *env);
-
diff --git a/pyscf/lib/pbc/pp.c b/pyscf/lib/pbc/pp.c
new file mode 100644
index 0000000000..4885080544
--- /dev/null
+++ b/pyscf/lib/pbc/pp.c
@@ -0,0 +1,448 @@
+/* Copyright 2021- The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xing Zhang <zhangxing.nju@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <complex.h>
+#include <math.h>
+#include "config.h"
+#include "cint.h"
+#include "gto/gto.h"
+#include "vhf/fblas.h"
+#include "np_helper/np_helper.h"
+#include "pbc/fill_ints.h"
+#include "pbc/neighbor_list.h"
+
+#define HL_TABLE_SLOTS  7
+//#define ATOM_OF         0
+//#define ANG_OF          1
+#define HL_DIM_OF       2
+#define HL_DATA_OF      3
+#define HL_OFFSET0      4
+#define HF_OFFSET1      5
+#define HF_OFFSET2      6
+#define MAX_THREADS     256
+
+
+static void _ppnl_fill_g(void (*fsort)(), double* out, double** ints,
+                         int comp, int ish, int jsh, double* buf,
+                         int *shls_slice, int *ao_loc,
+                         int* hl_table, double* hl_data, int nhl,
+                         NeighborListOpt* nlopt)
+{   // Accumulate the projector contributions to block (ish, jsh) in buf, then hand it to fsort.
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    // ish and jsh come in relative to the slice; make them absolute shell indices
+    ish += ish0;
+    jsh += jsh0;
+
+    const int di = ao_loc[ish+1] - ao_loc[ish];
+    const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+    const int dij = di *dj;
+    const int ioff = ao_loc[ish] - ao_loc[ish0];
+    const int joff = ao_loc[jsh] - ao_loc[jsh0];
+    const int naoi = ao_loc[ish1] - ao_loc[ish0];
+    const int naoj = ao_loc[jsh1] - ao_loc[jsh0];
+
+    int i, j, ij, pi, pj, ksh;
+    int hl_dim, nd;
+    int shls_ki[2], shls_kj[2];
+    int *table, *offset;
+    double *hl;
+    for (ij = 0; ij < dij; ij++) { // zero the di x dj accumulation block
+        buf[ij] = 0;
+    }
+    // Screening callback: nlopt->fprescreen when an optimizer is given, otherwise accept all.
+    int (*fprescreen)();
+    if (nlopt != NULL) {
+        fprescreen = nlopt->fprescreen;
+    } else {
+        fprescreen = NLOpt_noscreen;
+    }
+
+    const char TRANS_N = 'N';
+    const char TRANS_T = 'T';
+    const double D1 = 1.;
+    for (ksh = 0; ksh < nhl; ksh++) {
+        shls_ki[0] = ksh;
+        shls_ki[1] = ish;
+        shls_kj[0] = ksh;
+        shls_kj[1] = jsh;
+        if ((*fprescreen)(shls_ki, nlopt) && (*fprescreen)(shls_kj, nlopt)) { // ksh must pass for both shells
+            table = hl_table + ksh * HL_TABLE_SLOTS;
+            hl_dim = table[HL_DIM_OF];
+            nd = table[ANG_OF] * 2 + 1; // 2l+1 components
+            offset = table + HL_OFFSET0;
+            hl = hl_data + table[HL_DATA_OF];
+            for (i=0; i<hl_dim; i++) {
+                pi = offset[i];
+                for (j=0; j<hl_dim; j++) {
+                    pj = offset[j];
+                    dgemm_(&TRANS_N, &TRANS_T, &di, &dj, &nd, // buf(di x dj) += hl[i,j] * A(di x nd) * B(dj x nd)^T
+                           hl+j+i*hl_dim, ints[i]+pi*naoi+ioff, &naoi,
+                           ints[j]+pj*naoj+joff, &naoj, &D1, buf, &di);
+                }
+            }
+        }
+    }
+    (*fsort)(out, buf, shls_slice, ao_loc, comp, ish, jsh);
+}
+
+
+void ppnl_fill_gs1(double* out, double** ints,
+                   int comp, int ish, int jsh, double* buf,
+                   int *shls_slice, int *ao_loc,
+                   int* hl_table, double* hl_data, int nhl,
+                   NeighborListOpt* nlopt)
+{   // Unconditional variant: fills every (ish, jsh) pair, sorted with sort2c_gs1.
+    _ppnl_fill_g(&sort2c_gs1, out, ints, comp, ish, jsh, buf,
+                 shls_slice, ao_loc, hl_table, hl_data, nhl, nlopt);
+}
+
+
+/* Triangular variant of the fill: only shell pairs with ish >= jsh (absolute
+ * shell indices) are computed; pairs with ish < jsh are left untouched. */
+void ppnl_fill_gs2(double* out, double** ints,
+                   int comp, int ish, int jsh, double* buf,
+                   int *shls_slice, int *ao_loc,
+                   int* hl_table, double* hl_data, int nhl,
+                   NeighborListOpt* nlopt)
+{
+    const int ip = ish + shls_slice[0];
+    const int jp = jsh + shls_slice[2];
+    void (*fsort)() = &sort2c_gs2_igtj;
+    if (ip < jp) { return; }
+    if (ip == jp) { fsort = &sort2c_gs2_ieqj; } // diagonal blocks use their own sorter
+    _ppnl_fill_g(fsort, out, ints, comp, ish, jsh, buf,
+                 shls_slice, ao_loc, hl_table, hl_data, nhl, nlopt);
+}
+
+
+/* Parallel driver: invoke fill once for every (ish, jsh) shell pair of
+ * shls_slice (indices relative to the slice start), passing the three
+ * ppnl_half arrays as ints[0..2] and a per-thread scratch buffer. */
+void contract_ppnl(void (*fill)(), double* out,
+                   double* ppnl_half0, double* ppnl_half1, double* ppnl_half2,
+                   int comp, int* shls_slice, int *ao_loc,
+                   int* hl_table, double* hl_data, int nhl,
+                   NeighborListOpt* nlopt)
+{
+    const int nish = shls_slice[1] - shls_slice[0];
+    const int njsh = shls_slice[3] - shls_slice[2];
+    const size_t npair = (size_t) nish * njsh;
+
+    double *ints[3] = {ppnl_half0, ppnl_half1, ppnl_half2};
+
+    // scratch length: GTOmax_shell_dim of the i range times that of the j
+    // range, times comp
+    const int max_di = GTOmax_shell_dim(ao_loc, shls_slice+0, 1);
+    const int max_dj = GTOmax_shell_dim(ao_loc, shls_slice+2, 1);
+    const size_t scratch_len = max_di*max_dj*comp;
+
+    #pragma omp parallel
+    {
+        size_t n;
+        double *scratch = (double*) malloc(sizeof(double) * scratch_len);
+        #pragma omp for schedule(dynamic)
+        for (n = 0; n < npair; n++) {
+            const int ish = n / njsh;
+            const int jsh = n % njsh;
+            (*fill)(out, ints, comp, ish, jsh, scratch,
+                    shls_slice, ao_loc, hl_table, hl_data, nhl, nlopt);
+        }
+        free(scratch);
+    }
+}
+
+
+void contract_ppnl_ip1(double* out, int comp,
+                       double* ppnl_half0, double* ppnl_half1, double* ppnl_half2,
+                       double* ppnl_half_ip2_0, double* ppnl_half_ip2_1, double* ppnl_half_ip2_2,
+                       int* hl_table, double* hl_data, int nhl, int nao, int* naux,
+                       int* aux_id)
+{   // Fills out (comp x nao x nao, zeroed first) row by row from the ppnl_half and ppnl_half_ip2 arrays.
+    const int One = 1;
+    const char TRANS_N = 'N';
+    //const char TRANS_T = 'T';
+    const double D1 = 1.;
+    const double D0 = 0.;
+
+    size_t nao_pair = (size_t) nao * nao;
+    memset(out, 0, nao_pair*comp*sizeof(double));
+    // n2[i]: stride between two components of ppnl_half_ip2_i
+    size_t n2[3];
+    n2[0] = (size_t) nao * naux[0];
+    n2[1] = (size_t) nao * naux[1];
+    n2[2] = (size_t) nao * naux[2];
+    size_t buf_size = 54 * (size_t) nao + 27; // 2*(3*9*nao) + 3*9: enough for hl_dim <= 3 and nd <= 9
+
+#pragma omp parallel
+{
+    size_t ib, id, i, p, ic;
+    double *pout;
+    double *buf = (double*) malloc(sizeof(double)*buf_size);
+
+    #pragma omp for schedule(dynamic)
+    for (p = 0; p < nao; p++){
+        pout = out + (size_t)p*nao;
+        for (id = 0; id < nhl; id++) {
+            ib = aux_id[id];
+            int *table = hl_table + ib * HL_TABLE_SLOTS;
+            int hl_dim = table[HL_DIM_OF];
+            int ptr = table[HL_DATA_OF];
+            int nd = table[ANG_OF] * 2 + 1;
+            int *offset = table + HL_OFFSET0;
+            double *hl = hl_data + ptr;
+            int lp_dim = nd * nao;
+            int ilp_dim = hl_dim * lp_dim;
+            int il_dim = hl_dim * nd;
+            // carve the scratch buffer: ilp | ilp_ip2 (3*nd slots) | hilp
+            double *ilp = buf;
+            double *ilp_ip2 = ilp + ilp_dim;
+            double *hilp = ilp_ip2 + nd*3;
+            for (ic = 0; ic < comp; ic++) {
+                for (i=0; i<hl_dim; i++) {
+                    int p0 = offset[i];
+                    if (i == 0) {
+                        dcopy_(&lp_dim, ppnl_half0+p0*nao, &One, ilp+i*lp_dim, &One);
+                        dcopy_(&nd, ppnl_half_ip2_0+p+p0*nao+ic*n2[0], &nao, ilp_ip2+i*nd, &One);
+                    }
+                    else if (i == 1) {
+                        dcopy_(&lp_dim, ppnl_half1+p0*nao, &One, ilp+i*lp_dim, &One);
+                        dcopy_(&nd, ppnl_half_ip2_1+p+p0*nao+ic*n2[1], &nao, ilp_ip2+i*nd, &One);
+                    }
+                    else if (i == 2) {
+                        dcopy_(&lp_dim, ppnl_half2+p0*nao, &One, ilp+i*lp_dim, &One);
+                        dcopy_(&nd, ppnl_half_ip2_2+p+p0*nao+ic*n2[2], &nao, ilp_ip2+i*nd, &One);
+                    }
+                }
+                dgemm_(&TRANS_N, &TRANS_N, &lp_dim, &hl_dim, &hl_dim, // hilp = ilp * hl
+                       &D1, ilp, &lp_dim, hl, &hl_dim, &D0, hilp, &lp_dim);
+                dgemm_(&TRANS_N, &TRANS_N, &nao, &One, &il_dim, // out[ic, p, :] += hilp (nao x il_dim) * ilp_ip2
+                       &D1, hilp, &nao, ilp_ip2, &il_dim, &D1, pout+ic*nao_pair, &nao);
+            }
+        }
+    }
+    free(buf);
+}
+}
+
+
+/* Contract one shell-pair block with the density matrix: for each component
+ * ic, s = sum_{i,j} in[ic,j,i] * dm[j*naoi + i] is added to out[iatm*comp+ic]
+ * (iatm = atom of shell ish, read from bas) and subtracted from
+ * out[katm*comp+ic]. */
+static void _contract_vnuc_ip1_dm(double* out, double* in, double* dm, int comp,
+                                  int* shls_slice, int* ao_loc, int* bas,
+                                  int ish, int jsh, int naoi, int katm)
+{
+    const int One = 1;
+    const int di = ao_loc[ish+1] - ao_loc[ish];
+    const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+    double *gi = out + bas[ATOM_OF+ish*BAS_SLOTS]*comp;
+    double *gk = out + katm*comp;
+    int ic, j;
+
+    for (ic = 0; ic < comp; ic++) {
+        double s = 0;
+        for (j = 0; j < dj; j++) {
+            // di contiguous values of the block against the j-th dm row of length naoi
+            s += ddot_(&di, in, &One, dm + (size_t)j*naoi, &One);
+            in += di;
+        }
+
+        gi[ic] += s;
+        gk[ic] -= s;
+    }
+}
+
+
+void ppnl_nuc_grad_fill_gs1(double* out, double* dm, int comp,
+                            double** ints, double** ints_ip2,
+                            int* hl_table, double* hl_data, int nhl, int* naux,
+                            int* shls_slice, int* ao_loc, int* bas, double* buf, int ish, int jsh,
+                            NeighborListOpt* nlopt)
+{   // For one (ish, jsh) pair: build each projector's block in buf and contract it with dm into out.
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    // ish and jsh come in relative to the slice; make them absolute shell indices
+    ish += ish0;
+    jsh += jsh0;
+
+    const int di = ao_loc[ish+1] - ao_loc[ish];
+    const int dj = ao_loc[jsh+1] - ao_loc[jsh];
+    const int dij = di * dj;
+    const size_t dijm = (size_t)dij * comp;
+    const int i0 = ao_loc[ish] - ao_loc[ish0];
+    const int j0 = ao_loc[jsh] - ao_loc[jsh0];
+    const int naoi = ao_loc[ish1] - ao_loc[ish0];
+    const int naoj = ao_loc[jsh1] - ao_loc[jsh0];
+
+    size_t n2[3];
+    n2[0] = (size_t) naoi * naux[0];
+    n2[1] = (size_t) naoi * naux[1];
+    n2[2] = (size_t) naoi * naux[2];
+
+    int (*fprescreen)();
+    if (nlopt != NULL) {
+        fprescreen = nlopt->fprescreen;
+    } else {
+        fprescreen = NLOpt_noscreen;
+    }
+
+    const char TRANS_N = 'N';
+    const char TRANS_T = 'T';
+    const double D1 = 1.;
+
+    int i, j, pi, pj, ksh, ic;
+    int katm, l, hl_dim, nd;
+    int shls_ki[2], shls_kj[2];
+    int *table, *offset;
+    double *hl;
+    for (ksh = 0; ksh < nhl; ksh++) {
+        shls_ki[0] = ksh;
+        shls_ki[1] = ish;
+        shls_kj[0] = ksh;
+        shls_kj[1] = jsh;
+        if ((*fprescreen)(shls_ki, nlopt) && (*fprescreen)(shls_kj, nlopt)) { // ksh must pass for both shells
+            table = hl_table + ksh * HL_TABLE_SLOTS;
+            katm = table[ATOM_OF];
+            l = table[ANG_OF];
+            hl_dim = table[HL_DIM_OF];
+            nd = 2 * l + 1;
+            offset = table + HL_OFFSET0;
+            hl = hl_data + table[HL_DATA_OF];
+            // buf[ic] (di x dj) = sum_{i,j} hl[i,j] * ints_ip2[i][ic] block * (ints[j] block)^T
+            memset(buf, 0, dijm*sizeof(double));
+            for (ic = 0; ic < comp; ic++) {
+                for (i=0; i<hl_dim; i++) {
+                    pi = offset[i];
+                    for (j=0; j<hl_dim; j++) {
+                        pj = offset[j];
+                        dgemm_(&TRANS_N, &TRANS_T, &di, &dj, &nd,
+                               hl+j+i*hl_dim, ints_ip2[i]+ic*n2[i]+pi*naoi+i0, &naoi,
+                               ints[j]+pj*naoj+j0, &naoj, &D1, buf+ic*dij, &di);
+                    }
+                }
+            }
+            _contract_vnuc_ip1_dm(out, buf, dm+j0*naoi+i0, comp,
+                                  shls_slice, ao_loc, bas,
+                                  ish, jsh, naoi, katm);
+        }
+    }
+}
+
+
+void contract_ppnl_nuc_grad(void (*fill)(), double* grad, double* dm, int comp,
+                            double* ppnl_half0, double* ppnl_half1, double* ppnl_half2,
+                            double* ppnl_half_ip2_0, double* ppnl_half_ip2_1, double* ppnl_half_ip2_2,
+                            int* hl_table, double* hl_data, int nhl, int* naux,
+                            int* shls_slice, int* ao_loc, int* bas, int natm,
+                            NeighborListOpt* nlopt)
+{   // Parallel driver: calls fill for every (ish, jsh) pair with a per-thread gradient buffer.
+    const int ish0 = shls_slice[0];
+    const int ish1 = shls_slice[1];
+    const int jsh0 = shls_slice[2];
+    const int jsh1 = shls_slice[3];
+    const int nish = ish1 - ish0;
+    const int njsh = jsh1 - jsh0;
+    const size_t nijsh = (size_t)nish * njsh;
+
+    int di = GTOmax_shell_dim(ao_loc, shls_slice+0, 1);
+    int dj = GTOmax_shell_dim(ao_loc, shls_slice+2, 1);
+    size_t buf_size = di*dj*comp;
+
+    double *ints[3] = {ppnl_half0, ppnl_half1, ppnl_half2};
+    double *ints_ip2[3] = {ppnl_half_ip2_0, ppnl_half_ip2_1, ppnl_half_ip2_2};
+    // One accumulator per thread; thread 0 writes straight into grad. Assumes <= MAX_THREADS threads.
+    double *gradbufs[MAX_THREADS];
+    #pragma omp parallel
+    {
+        int ish, jsh;
+        size_t ij;
+        double *grad_loc;
+        int thread_id = omp_get_thread_num();
+        if (thread_id == 0) {
+            grad_loc = grad;
+        } else {
+            grad_loc = calloc(natm*comp, sizeof(double));
+        }
+        gradbufs[thread_id] = grad_loc;
+        double *buf = (double*) malloc(sizeof(double)*buf_size);
+
+        #pragma omp for schedule(dynamic)
+        for (ij = 0; ij < nijsh; ij++) {
+            ish = ij / njsh;
+            jsh = ij % njsh;
+
+            (*fill)(grad_loc, dm, comp, ints, ints_ip2,
+                    hl_table, hl_data, nhl, naux,
+                    shls_slice, ao_loc, bas, buf, ish, jsh, nlopt);
+        }
+        free(buf);
+        // Combine the per-thread accumulators (presumably into gradbufs[0], i.e. grad - TODO confirm).
+        NPomp_dsum_reduce_inplace(gradbufs, natm*comp);
+        if (thread_id != 0) {
+            free(grad_loc);
+        }
+    }
+}
+
+
+/* Evaluate on the G grid
+ *     out[ig] = -sum_ia vlocG(ia, ig) * (cos(G.R_ia) - i sin(G.R_ia))
+ * with, per atom,
+ *     vlocG = Z * coulG[ig] * exp(-rloc^2 * G2[ig] / 2)   if rloc > 0, ig != G0idx
+ *     vlocG = -2 * pi * Z * rloc^2                        if rloc > 0, ig == G0idx
+ *     vlocG = Z * coulG[ig]                               otherwise (bare Z/r)
+ * Gv and coords hold 3 doubles per G vector / atom. */
+void pp_loc_part1_gs(double complex* out, double* coulG,
+                     double* Gv, double* G2, int G0idx, int ngrid,
+                     double* Z, double* coords, double* rloc,
+                     int natm)
+{
+    int ig;
+#pragma omp parallel for schedule(static)
+    for (ig = 0; ig < ngrid; ig++) {
+        const double *G = Gv + ig*3;
+        double complex acc = 0;
+        int ia;
+        for (ia = 0; ia < natm; ia++) {
+            const double *R = coords + ia*3;
+            const double r0 = rloc[ia];
+            const double RG = (R[0] * G[0]
+                               + R[1] * G[1]
+                               + R[2] * G[2]);
+            // per-atom prefactor for this G vector
+            double vlocG;
+            if (!(r0 > 0)) { // Z/r
+                vlocG = Z[ia] * coulG[ig];
+            } else if (ig == G0idx) {
+                vlocG = -2. * M_PI * Z[ia] * r0*r0;
+            } else {
+                vlocG = Z[ia] * coulG[ig] * exp(-0.5*r0*r0 * G2[ig]);
+            }
+            acc -= (vlocG * cos(RG)) - (vlocG * sin(RG)) * _Complex_I;
+        }
+        out[ig] = acc;
+    }
+}
diff --git a/pyscf/lib/test/test_numint_uniform_grid.py b/pyscf/lib/test/test_numint_uniform_grid.py
index 296dcbd61a..05e5664ab0 100644
--- a/pyscf/lib/test/test_numint_uniform_grid.py
+++ b/pyscf/lib/test/test_numint_uniform_grid.py
@@ -7,7 +7,7 @@
 from pyscf.pbc.dft import gen_grid
 from pyscf.pbc.dft import multigrid
 
-from pyscf.pbc.dft.multigrid import eval_mat, eval_rho
+from pyscf.pbc.dft.multigrid.multigrid import eval_mat, eval_rho
 
 def uncontract(cell):
     pcell, contr_coeff = cell.to_uncontracted_cartesian_basis()
@@ -18,8 +18,8 @@ def setUpModule():
     global bak_EXPDROP, bak_EXTRA_PREC
     global vxc, kpts, nkpts, nao, dm, dm_kpts, grids_orth, grids_north
     global ao_kpts_orth, ao_kpts_north, ao_orth, ao_north, ao_gamma_orth, ao_gamma_north
-    multigrid.EXPDROP, bak_EXPDROP = 1e-14, multigrid.EXPDROP
-    multigrid.EXTRA_PREC, bak_EXTRA_PREC = 1e-3, multigrid.EXTRA_PREC
+    multigrid.multigrid.EXPDROP, bak_EXPDROP = 1e-14, multigrid.multigrid.EXPDROP
+    multigrid.multigrid.EXTRA_PREC, bak_EXTRA_PREC = 1e-3, multigrid.multigrid.EXTRA_PREC
 
     numpy.random.seed(2)
     cell_orth = gto.M(atom='H1 1 1 0; H2 0 0 1',
diff --git a/pyscf/pbc/df/incore.py b/pyscf/pbc/df/incore.py
index 253250a405..76c23f8e3e 100644
--- a/pyscf/pbc/df/incore.py
+++ b/pyscf/pbc/df/incore.py
@@ -30,6 +30,7 @@
 from pyscf.pbc.tools import k2gamma
 from pyscf.pbc.tools import pbc as pbctools
 from pyscf import __config__
+from pyscf.pbc.gto import _pbcintor
 
 RCUT_THRESHOLD = getattr(__config__, 'pbc_scf_rsjk_rcut_threshold', 2.5)
 KECUT_THRESHOLD = getattr(__config__, 'pbc_scf_rsjk_kecut_threshold', 10.0)
@@ -471,3 +472,246 @@ def _conc_locs(ao_loc1, ao_loc2):
     basis accordingly.'''
     comp_loc = np.append(ao_loc1[:-1], ao_loc1[-1] + ao_loc2)
     return np.asarray(comp_loc, dtype=np.int32)
+
+# The following functions use a pre-constructed shell-pair list.
+def aux_e2_sum_auxbas(cell, auxcell_or_auxbasis, intor='int3c2e', aosym='s1', comp=None,
+                      kptij_lst=np.zeros((1,2,3)), shls_slice=None, **kwargs):
+    r'''Evaluate :math:`\sum_{L} (ij|L)` on the fly.
+
+    ``auxcell_or_auxbasis`` is a ``gto.MoleBase`` instance or a basis name (str);
+    ``kwargs`` (including the required ``neighbor_list``) go to wrap_int3c_sum_auxbas.
+
+    Returns:
+        out : (nao_pair,) array when comp == 1 and one k-point pair is given
+    '''
+    auxcell = auxcell_or_auxbasis
+    if not isinstance(auxcell, gto.MoleBase):
+        assert isinstance(auxcell, str)
+        auxcell = make_auxcell(cell, auxcell)
+    evaluate = wrap_int3c_sum_auxbas(cell, auxcell, intor, aosym, comp, kptij_lst, **kwargs)
+    return evaluate(shls_slice)
+
+def wrap_int3c_sum_auxbas(cell, auxcell, intor='int3c2e', aosym='s1', comp=None,
+                          kptij_lst=np.zeros((1,2,3)), cintopt=None, pbcopt=None,
+                          neighbor_list=None):
+    '''Return a closure ``int3c(shls_slice=None, out=None)`` that runs the screened C
+    driver for auxiliary-summed 3-center integrals (Gamma point, kpti == kptj only).'''
+    if neighbor_list is None:
+        raise KeyError('Neighbor list is not initialized.')
+    log = logger.new_logger(cell)
+
+    nkptij = len(kptij_lst)
+    kpti = kptij_lst[:,0]
+    kptj = kptij_lst[:,1]
+    j_only = is_zero(kpti - kptj)
+    if j_only:
+        kpts = kpti
+        nkpts = len(kpts)
+        kptij_idx = np.arange(nkpts, dtype=np.int32)
+    else:
+        raise NotImplementedError  # kpti != kptj is not supported
+
+    intor = cell._add_suffix(intor)
+    intor, comp = gto.moleintor._get_intor_and_comp(intor, comp)
+    # pcell holds [cell, cell]; atm/bas/env end up as [cell, cell, auxcell].
+    pcell = cell.copy()
+    pcell._atm, pcell._bas, pcell._env = \
+            atm, bas, env = gto.conc_env(cell._atm, cell._bas, cell._env,
+                                         cell._atm, cell._bas, cell._env)
+    ao_loc = gto.moleintor.make_loc(bas, intor)
+    aux_loc = auxcell.ao_loc_nr()
+    ao_loc = np.asarray(np.hstack([ao_loc, ao_loc[-1]+aux_loc[1:]]),
+                        dtype=np.int32)
+    atm, bas, env = gto.conc_env(atm, bas, env,
+                                 auxcell._atm, auxcell._bas, auxcell._env)
+
+    Ls = cell.get_lattice_Ls()
+    nimgs = len(Ls)
+    nbas = cell.nbas
+
+    gamma_point_only = is_zero(kpts)
+    if gamma_point_only:
+        assert nkpts == 1
+        kk_type = 'g'
+        expkL = np.ones(1, dtype=np.complex128)
+        out_dtype = np.double
+    else:
+        raise NotImplementedError  # k-points other than Gamma are not supported
+    # The fill function is looked up by name from the k-point type and AO symmetry.
+    fill = 'PBCnr3c_screened_sum_auxbas_fill_%s%s' % (kk_type, aosym[:2])
+    drv = libpbc.PBCnr3c_screened_sum_auxbas_drv
+
+    if cintopt is None:
+        if nbas > 0:
+            env[gto.PTR_EXPCUTOFF] = abs(np.log(cell.precision))  # cutoff derived from cell.precision
+            cintopt = _vhf.make_cintopt(atm, bas, env, intor)
+        else:
+            cintopt = lib.c_null_ptr()
+        if intor[:3] != 'ECP':
+            libpbc.CINTdel_pairdata_optimizer(cintopt)
+    if pbcopt is None:
+        pbcopt = _pbcintor.PBCOpt(pcell).init_rcut_cond(pcell)
+    if isinstance(pbcopt, _pbcintor.PBCOpt):
+        cpbcopt = pbcopt._this
+    else:
+        cpbcopt = lib.c_null_ptr()  # any other pbcopt value disables it
+
+    def int3c(shls_slice=None, out=None):
+        t0 = (logger.process_clock(), logger.perf_counter())
+        if shls_slice is None:
+            shls_slice = (0, nbas, 0, nbas, 0, auxcell.nbas)
+        shls_slice = (shls_slice[0], shls_slice[1],
+                      nbas+shls_slice[2], nbas+shls_slice[3],  # j shells follow the first copy of cell
+                      nbas*2+shls_slice[4], nbas*2+shls_slice[5])  # aux shells follow both copies
+        ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]]
+        nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]]
+
+        if aosym[:2] == 's2':  # packed triangular count of AO pairs
+            assert ni == nj
+            nao_pair = (ao_loc[shls_slice[1]]*(ao_loc[shls_slice[1]]+1)//2 -
+                        ao_loc[shls_slice[0]]*(ao_loc[shls_slice[0]]+1)//2)
+        else:
+            nao_pair = ni * nj
+
+        if out is None:
+            out = np.empty((nkptij,comp,nao_pair), dtype=out_dtype)  # presumably fully written by drv -- TODO confirm
+
+        drv(getattr(libpbc, intor), getattr(libpbc, fill),
+            out.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nkptij), ctypes.c_int(nkpts),
+            ctypes.c_int(comp), ctypes.c_int(nimgs),
+            Ls.ctypes.data_as(ctypes.c_void_p),
+            expkL.ctypes.data_as(ctypes.c_void_p),
+            kptij_idx.ctypes.data_as(ctypes.c_void_p),
+            (ctypes.c_int*6)(*shls_slice),
+            ao_loc.ctypes.data_as(ctypes.c_void_p), cintopt, cpbcopt,
+            atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(cell.natm),
+            bas.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nbas),  # cell.nbas, not the shell count of the concatenated basis
+            env.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(env.size),
+            ctypes.byref(neighbor_list))
+        log.timer_debug1(f'pbc integral {intor}', *t0)
+        # Drop the comp axis and the k-point-pair axis when they have length 1.
+        if comp == 1:
+            out = out[:,0]
+        if nkptij == 1:
+            out = out[0]
+        return out
+
+    return int3c
+
+def int3c1e_nuc_grad(cell, auxcell, dm, intor='int3c1e', aosym='s1', comp=3,
+                     kptij_lst=np.zeros((1,2,3)), shls_slice=None, **kwargs):
+    '''Nuclear gradient contribution of the 2nd local part of the
+    pseudopotential, evaluated on the fly with the density matrix ``dm``.
+    See `pbc.gto.pseudo.pp_int.vpploc_part2_nuc_grad`.
+
+    Only comp == 3 and aosym == 's1' are implemented; ``kwargs`` are passed
+    on to wrap_int3c1e_nuc_grad (``neighbor_list`` is required there).
+
+    Returns:
+        out : (natm,comp) array
+    '''
+    if comp != 3 or aosym != 's1':
+        raise NotImplementedError
+
+    grad_fn = wrap_int3c1e_nuc_grad(cell, auxcell, dm, intor, aosym, comp, kptij_lst, **kwargs)
+    return grad_fn(shls_slice)
+
+def wrap_int3c1e_nuc_grad(cell, auxcell, dm, intor='int3c1e', aosym='s1', comp=3,
+                          kptij_lst=np.zeros((1,2,3)), cintopt=None, pbcopt=None,
+                          neighbor_list=None):
+    '''Return a closure ``int3c(shls_slice=None, out=None)`` that passes ``dm`` to the screened
+    C driver and returns a (natm, comp) nuclear-gradient array (Gamma point, kpti == kptj only).'''
+    if neighbor_list is None:
+        raise KeyError('Neighbor list is not initialized.')
+    log = logger.new_logger(cell)
+
+    nkptij = len(kptij_lst)
+    kpti = kptij_lst[:,0]
+    kptj = kptij_lst[:,1]
+    j_only = is_zero(kpti - kptj)
+    if j_only:
+        kpts = kpti
+        nkpts = len(kpts)
+        kptij_idx = np.arange(nkpts, dtype=np.int32)
+    else:
+        raise NotImplementedError  # kpti != kptj is not supported
+
+    intor = cell._add_suffix(intor)
+    intor, comp = gto.moleintor._get_intor_and_comp(intor, comp)
+    # pcell holds [cell, cell]; atm/bas/env end up as [cell, cell, auxcell].
+    pcell = cell.copy()
+    pcell._atm, pcell._bas, pcell._env = \
+            atm, bas, env = gto.conc_env(cell._atm, cell._bas, cell._env,
+                                         cell._atm, cell._bas, cell._env)
+    ao_loc = gto.moleintor.make_loc(bas, intor)
+    aux_loc = auxcell.ao_loc_nr()
+    ao_loc = np.asarray(np.hstack([ao_loc, ao_loc[-1]+aux_loc[1:]]),
+                        dtype=np.int32)
+    atm, bas, env = gto.conc_env(atm, bas, env,
+                                 auxcell._atm, auxcell._bas, auxcell._env)
+
+    Ls = cell.get_lattice_Ls()
+    nimgs = len(Ls)
+    nbas = cell.nbas
+
+    gamma_point_only = is_zero(kpts)
+    if gamma_point_only:
+        assert nkpts == 1
+        kk_type = 'g'
+        expkL = np.ones(1, dtype=np.complex128)
+        dm = np.asarray(dm, order="C", dtype=np.double)  # C-ordered float64 buffer for the ctypes call
+    else:
+        raise NotImplementedError  # k-points other than Gamma are not supported
+    # The fill function is looked up by name from the k-point type and AO symmetry.
+    fill = 'PBCnr3c1e_screened_nuc_grad_fill_%s%s' % (kk_type, aosym[:2])
+    drv = libpbc.PBCnr3c1e_screened_nuc_grad_drv
+
+    if cintopt is None:
+        if nbas > 0:
+            env[gto.PTR_EXPCUTOFF] = abs(np.log(cell.precision))  # cutoff derived from cell.precision
+            cintopt = _vhf.make_cintopt(atm, bas, env, intor)
+        else:
+            cintopt = lib.c_null_ptr()
+        if intor[:3] != 'ECP':
+            libpbc.CINTdel_pairdata_optimizer(cintopt)
+    if pbcopt is None:
+        pbcopt = _pbcintor.PBCOpt(pcell).init_rcut_cond(pcell)
+    if isinstance(pbcopt, _pbcintor.PBCOpt):
+        cpbcopt = pbcopt._this
+    else:
+        cpbcopt = lib.c_null_ptr()  # any other pbcopt value disables it
+
+    def int3c(shls_slice=None, out=None):
+        t0 = (logger.process_clock(), logger.perf_counter())
+        if shls_slice is None:
+            shls_slice = (0, nbas, 0, nbas, 0, auxcell.nbas)
+        shls_slice = (shls_slice[0], shls_slice[1],
+                      nbas+shls_slice[2], nbas+shls_slice[3],  # j shells follow the first copy of cell
+                      nbas*2+shls_slice[4], nbas*2+shls_slice[5])  # aux shells follow both copies
+
+        if out is None:
+            out = np.zeros((nkptij,cell.natm,comp), dtype=np.double)  # presumably accumulated into by drv -- TODO confirm
+
+        drv(getattr(libpbc, intor), getattr(libpbc, fill),
+            out.ctypes.data_as(ctypes.c_void_p),
+            dm.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(nkptij), ctypes.c_int(nkpts),
+            ctypes.c_int(comp), ctypes.c_int(nimgs),
+            Ls.ctypes.data_as(ctypes.c_void_p),
+            expkL.ctypes.data_as(ctypes.c_void_p),
+            kptij_idx.ctypes.data_as(ctypes.c_void_p),
+            (ctypes.c_int*6)(*shls_slice),
+            ao_loc.ctypes.data_as(ctypes.c_void_p), cintopt, cpbcopt,
+            atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(cell.natm),
+            bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbas),
+            env.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(env.size),
+            ctypes.c_int(cell.nao), ctypes.byref(neighbor_list))
+        log.timer_debug1(f'pbc integral {intor}', *t0)
+        # A single k-point pair is returned without the leading axis.
+        if nkptij == 1:
+            out = out[0]
+        return out
+
+    return int3c
diff --git a/pyscf/pbc/dft/gks.py b/pyscf/pbc/dft/gks.py
index 8d496bbfb1..5536b53daa 100644
--- a/pyscf/pbc/dft/gks.py
+++ b/pyscf/pbc/dft/gks.py
@@ -77,7 +77,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
     ni = ks._numint
     n, exc, vxc = ni.get_vxc(cell, ks.grids, ks.xc, dm, hermi=hermi, kpt=kpt,
                              kpts_band=kpts_band, max_memory=max_memory)
-    logger.debug(ks, 'nelec by numeric integration = %s', n)
+    logger.info(ks, 'nelec by numeric integration = %s', n)
     t0 = logger.timer(ks, 'vxc', *t0)
 
     if not hybrid:
diff --git a/pyscf/pbc/dft/kgks.py b/pyscf/pbc/dft/kgks.py
index f43a8ee04c..fd97e43cd1 100644
--- a/pyscf/pbc/dft/kgks.py
+++ b/pyscf/pbc/dft/kgks.py
@@ -84,7 +84,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
     ni = ks._numint
     n, exc, vxc = ni.get_vxc(cell, ks.grids, ks.xc, dm, hermi=hermi, kpts=kpts,
                              kpts_band=kpts_band, max_memory=max_memory)
-    logger.debug(ks, 'nelec by numeric integration = %s', n)
+    logger.info(ks, 'nelec by numeric integration = %s', n)
     t0 = logger.timer(ks, 'vxc', *t0)
 
     nkpts = len(kpts)
diff --git a/pyscf/pbc/dft/krks.py b/pyscf/pbc/dft/krks.py
index 572a7614af..3cd23636b1 100644
--- a/pyscf/pbc/dft/krks.py
+++ b/pyscf/pbc/dft/krks.py
@@ -69,7 +69,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm, hermi,
                                        kpts, kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -84,7 +84,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         max_memory = ks.max_memory - lib.current_memory()[0]
         n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi,
                                 kpts, kpts_band, max_memory=max_memory)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         if ks.nlc or ni.libxc.is_nlc(ks.xc):
             if ni.libxc.is_nlc(ks.xc):
                 xc = ks.xc
@@ -95,7 +95,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-            logger.debug(ks, 'nelec with nlc grids = %s', n)
+            logger.info(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     nkpts = len(kpts)
diff --git a/pyscf/pbc/dft/krks_ksymm.py b/pyscf/pbc/dft/krks_ksymm.py
index fb15bf6f40..0d9e1401e2 100644
--- a/pyscf/pbc/dft/krks_ksymm.py
+++ b/pyscf/pbc/dft/krks_ksymm.py
@@ -59,7 +59,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm_bz, hermi,
                                        kpts.kpts, kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -72,7 +72,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm_bz,
                                 kpts=kpts.kpts, kpts_band=kpts_band,
                                 max_memory=max_memory)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         if ks.nlc or ni.libxc.is_nlc(ks.xc):
             if ni.libxc.is_nlc(ks.xc):
                 xc = ks.xc
@@ -83,7 +83,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           0, hermi, kpts.kpts, max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-            logger.debug(ks, 'nelec with nlc grids = %s', n)
+            logger.info(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     weight = kpts.weights_ibz
diff --git a/pyscf/pbc/dft/kuks.py b/pyscf/pbc/dft/kuks.py
index a07949ccca..634c99f8ff 100644
--- a/pyscf/pbc/dft/kuks.py
+++ b/pyscf/pbc/dft/kuks.py
@@ -55,7 +55,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi,
                                        kpts, kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -79,7 +79,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           0, hermi, kpts, max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     nkpts = len(kpts)
diff --git a/pyscf/pbc/dft/kuks_ksymm.py b/pyscf/pbc/dft/kuks_ksymm.py
index eb02e674e9..15c2a623b5 100644
--- a/pyscf/pbc/dft/kuks_ksymm.py
+++ b/pyscf/pbc/dft/kuks_ksymm.py
@@ -58,7 +58,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm_bz, hermi,
                                        kpts.kpts, kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -71,7 +71,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = ni.nr_uks(cell, ks.grids, ks.xc, dm_bz,
                                 kpts=kpts.kpts, kpts_band=kpts_band,
                                 max_memory=max_memory)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         if ks.nlc or ni.libxc.is_nlc(ks.xc):
             if ni.libxc.is_nlc(ks.xc):
                 xc = ks.xc
@@ -82,7 +82,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           0, hermi, kpts.kpts, max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-            logger.debug(ks, 'nelec with nlc grids = %s', n)
+            logger.info(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     weight = kpts.weights_ibz
diff --git a/pyscf/pbc/dft/multigrid/__init__.py b/pyscf/pbc/dft/multigrid/__init__.py
new file mode 100644
index 0000000000..707853bf51
--- /dev/null
+++ b/pyscf/pbc/dft/multigrid/__init__.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .multigrid import MultiGridFFTDF
+from .multigrid import (
+    multigrid_fftdf as multigrid_fftdf,
+    _gen_rhf_response as _gen_rhf_response,
+    _gen_uhf_response as _gen_uhf_response,
+    nr_rks as nr_rks_v1,
+    nr_rks_fxc as nr_rks_fxc,
+    nr_rks_fxc_st as nr_rks_fxc_st,
+    nr_uks as nr_uks_v1,
+    nr_uks_fxc as nr_uks_fxc
+)
+
+from .multigrid_pair import MultiGridFFTDF2
+from .multigrid_pair import nr_rks as nr_rks_v2
+from .multigrid_pair import nr_uks as nr_uks_v2
+
+def nr_rks(mydf, xc_code, dm_kpts, hermi=1, kpts=None,
+           kpts_band=None, with_j=False, return_j=False, verbose=None):
+    '''Call nr_rks_v2 or nr_rks_v1 depending on the type of ``mydf``; TypeError otherwise.'''
+    # Test MultiGridFFTDF2 first: presumably it subclasses MultiGridFFTDF -- TODO confirm.
+    if isinstance(mydf, MultiGridFFTDF2):
+        nr_rks_impl = nr_rks_v2
+    elif isinstance(mydf, MultiGridFFTDF):
+        nr_rks_impl = nr_rks_v1
+    else:
+        raise TypeError("Wrong density fitting type for multigrid DFT.")
+    return nr_rks_impl(mydf, xc_code, dm_kpts, hermi=hermi, kpts=kpts, kpts_band=kpts_band,
+                       with_j=with_j, return_j=return_j, verbose=verbose)
+
+def nr_uks(mydf, xc_code, dm_kpts, hermi=1, kpts=None,
+           kpts_band=None, with_j=False, return_j=False, verbose=None):
+    '''Call nr_uks_v2 or nr_uks_v1 depending on the type of ``mydf``; TypeError otherwise.'''
+    # Test MultiGridFFTDF2 first: presumably it subclasses MultiGridFFTDF -- TODO confirm.
+    if isinstance(mydf, MultiGridFFTDF2):
+        nr_uks_impl = nr_uks_v2
+    elif isinstance(mydf, MultiGridFFTDF):
+        nr_uks_impl = nr_uks_v1
+    else:
+        raise TypeError("Wrong density fitting type for multigrid DFT.")
+    return nr_uks_impl(mydf, xc_code, dm_kpts, hermi=hermi, kpts=kpts, kpts_band=kpts_band,
+                       with_j=with_j, return_j=return_j, verbose=verbose)
diff --git a/pyscf/pbc/dft/multigrid.py b/pyscf/pbc/dft/multigrid/multigrid.py
similarity index 95%
rename from pyscf/pbc/dft/multigrid.py
rename to pyscf/pbc/dft/multigrid/multigrid.py
index 80e72e551b..56fb3059cf 100644
--- a/pyscf/pbc/dft/multigrid.py
+++ b/pyscf/pbc/dft/multigrid/multigrid.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2021 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 import numpy
 import scipy.linalg
 
+from pyscf import __config__
 from pyscf import lib
 from pyscf.lib import logger
 from pyscf.gto import ATOM_OF, ANG_OF, NPRIM_OF, PTR_EXP, PTR_COEFF
@@ -29,12 +30,21 @@
 from pyscf.pbc import tools
 from pyscf.pbc import gto
 from pyscf.pbc.gto import pseudo
+from pyscf.pbc.gto.pseudo import pp_int
 from pyscf.pbc.dft import numint, gen_grid
-from pyscf.pbc.df.df_jk import _format_dms, _format_kpts_band, _format_jks
+from pyscf.pbc.df.df_jk import (
+    _format_dms,
+    _format_kpts_band,
+    _format_jks,
+)
 from pyscf.pbc.lib.kpts_helper import gamma_point
-from pyscf.pbc.df import fft
-from pyscf.pbc.df import ft_ao
-from pyscf import __config__
+from pyscf.pbc.df import fft, ft_ao
+from pyscf.pbc.dft.multigrid.utils import (
+    _take_4d,
+    _take_5d,
+    _takebak_4d,
+    _takebak_5d,
+)
 
 #sys.stderr.write('WARN: multigrid is an experimental feature. It is still in '
 #                 'testing\nFeatures and APIs may be changed in the future.\n')
@@ -367,23 +377,31 @@ def get_nuc(mydf, kpts=None):
         vne = vne[0]
     return numpy.asarray(vne)
 
-def get_pp(mydf, kpts=None):
+def get_pp(mydf, kpts=None, max_memory=4000):
     '''Get the periodic pseudotential nuc-el AO matrix, with G=0 removed.
     '''
     from pyscf import gto
     kpts, is_single_kpt = fft._check_kpts(mydf, kpts)
     cell = mydf.cell
     mesh = mydf.mesh
-    SI = cell.get_SI()
     Gv = cell.get_Gv(mesh)
-    vpplocG = pseudo.get_vlocG(cell, Gv)
-    vpplocG = -numpy.einsum('ij,ij->j', SI, vpplocG)
-    # from get_jvloc_G0 function
-    vpplocG[0] = numpy.sum(pseudo.get_alphas(cell))
-    ngrids = len(vpplocG)
+
+    ngrids = len(Gv)
+    vpplocG = numpy.empty((ngrids,), dtype=numpy.complex128)
+
+    mem_avail = max(max_memory, mydf.max_memory-lib.current_memory()[0])
+    blksize = int(mem_avail*1e6/((cell.natm*2)*16))
+    blksize = min(ngrids, max(21**3, blksize))
+    for ig0, ig1 in lib.prange(0, ngrids, blksize):
+        vpplocG_batch = pp_int.get_gth_vlocG_part1(cell, Gv[ig0:ig1])
+        SI = cell.get_SI(Gv[ig0:ig1])
+        vpplocG[ig0:ig1] = -numpy.einsum('ij,ij->j', SI, vpplocG_batch)
 
     hermi = 1
     vpp = _get_j_pass2(mydf, vpplocG, hermi, kpts)[0]
+    vpp2 = pp_int.get_pp_loc_part2(cell, kpts)
+    for k, kpt in enumerate(kpts):
+        vpp[k] += vpp2[k]
 
     # vppnonloc evaluated in reciprocal space
     fakemol = gto.Mole()
@@ -396,51 +414,76 @@ def get_pp(mydf, kpts=None):
     fakemol._bas[0,gto.PTR_EXP  ] = ptr+3
     fakemol._bas[0,gto.PTR_COEFF] = ptr+4
 
-    # buf for SPG_lmi upto l=0..3 and nl=3
-    buf = numpy.empty((48,ngrids), dtype=numpy.complex128)
-
     def vppnl_by_k(kpt):
-        Gk = Gv + kpt
-        G_rad = lib.norm(Gk, axis=1)
-        aokG = ft_ao.ft_ao(cell, Gv, kpt=kpt) * (ngrids/cell.vol)
-        vppnl = 0
+        SPG_lm_aoGs = []
         for ia in range(cell.natm):
             symb = cell.atom_symbol(ia)
             if symb not in cell._pseudo:
+                SPG_lm_aoGs.append(None)
                 continue
             pp = cell._pseudo[symb]
             p1 = 0
             for l, proj in enumerate(pp[5:]):
                 rl, nl, hl = proj
                 if nl > 0:
-                    fakemol._bas[0,gto.ANG_OF] = l
-                    fakemol._env[ptr+3] = .5*rl**2
-                    fakemol._env[ptr+4] = rl**(l+1.5)*numpy.pi**1.25
-                    pYlm_part = fakemol.eval_gto('GTOval', Gk)
+                    p1 = p1+nl*(l*2+1)
+            SPG_lm_aoGs.append(numpy.zeros((p1, cell.nao), dtype=numpy.complex128))
 
-                    p0, p1 = p1, p1+nl*(l*2+1)
-                    # pYlm is real, SI[ia] is complex
-                    pYlm = numpy.ndarray((nl,l*2+1,ngrids), dtype=numpy.complex128, buffer=buf[p0:p1])
-                    for k in range(nl):
-                        qkl = pseudo.pp._qli(G_rad*rl, l, k)
-                        pYlm[k] = pYlm_part.T * qkl
-                    #:SPG_lmi = numpy.einsum('g,nmg->nmg', SI[ia].conj(), pYlm)
-                    #:SPG_lm_aoG = numpy.einsum('nmg,gp->nmp', SPG_lmi, aokG)
-                    #:tmp = numpy.einsum('ij,jmp->imp', hl, SPG_lm_aoG)
-                    #:vppnl += numpy.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp)
-            if p1 > 0:
-                SPG_lmi = buf[:p1]
-                SPG_lmi *= SI[ia].conj()
-                SPG_lm_aoGs = lib.zdot(SPG_lmi, aokG)
+        mem_avail = max(max_memory, mydf.max_memory-lib.current_memory()[0])
+        blksize = int(mem_avail*1e6/((48+cell.nao+13+3)*16))
+        blksize = min(ngrids, max(21**3, blksize))
+        vppnl = 0
+        for ig0, ig1 in lib.prange(0, ngrids, blksize):
+            ng = ig1 - ig0
+            # buf for SPG_lmi upto l=0..3 and nl=3
+            buf = numpy.empty((48,ng), dtype=numpy.complex128)
+            Gk = Gv[ig0:ig1] + kpt
+            G_rad = numpy.linalg.norm(Gk, axis=1)
+            aokG = ft_ao.ft_ao(cell, Gv[ig0:ig1], kpt=kpt) * (ngrids/cell.vol)
+            for ia in range(cell.natm):
+                symb = cell.atom_symbol(ia)
+                if symb not in cell._pseudo:
+                    continue
+                pp = cell._pseudo[symb]
                 p1 = 0
                 for l, proj in enumerate(pp[5:]):
                     rl, nl, hl = proj
                     if nl > 0:
+                        fakemol._bas[0,gto.ANG_OF] = l
+                        fakemol._env[ptr+3] = .5*rl**2
+                        fakemol._env[ptr+4] = rl**(l+1.5)*numpy.pi**1.25
+                        pYlm_part = fakemol.eval_gto('GTOval', Gk)
+
                         p0, p1 = p1, p1+nl*(l*2+1)
-                        hl = numpy.asarray(hl)
-                        SPG_lm_aoG = SPG_lm_aoGs[p0:p1].reshape(nl,l*2+1,-1)
-                        tmp = numpy.einsum('ij,jmp->imp', hl, SPG_lm_aoG)
-                        vppnl += numpy.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp)
+                        # pYlm is real, SI[ia] is complex
+                        pYlm = numpy.ndarray((nl,l*2+1,ng), dtype=numpy.complex128, buffer=buf[p0:p1])
+                        for k in range(nl):
+                            qkl = pseudo.pp._qli(G_rad*rl, l, k)
+                            pYlm[k] = pYlm_part.T * qkl
+                        #:SPG_lmi = numpy.einsum('g,nmg->nmg', SI[ia].conj(), pYlm)
+                        #:SPG_lm_aoG = numpy.einsum('nmg,gp->nmp', SPG_lmi, aokG)
+                        #:tmp = numpy.einsum('ij,jmp->imp', hl, SPG_lm_aoG)
+                        #:vppnl += numpy.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp)
+                if p1 > 0:
+                    SPG_lmi = buf[:p1]
+                    SPG_lmi *= cell.get_SI(Gv[ig0:ig1], atmlst=[ia,]).conj()
+                    SPG_lm_aoGs[ia] += lib.zdot(SPG_lmi, aokG)
+            buf = None
+        for ia in range(cell.natm):
+            symb = cell.atom_symbol(ia)
+            if symb not in cell._pseudo:
+                continue
+            pp = cell._pseudo[symb]
+            p1 = 0
+            for l, proj in enumerate(pp[5:]):
+                rl, nl, hl = proj
+                if nl > 0:
+                    p0, p1 = p1, p1+nl*(l*2+1)
+                    hl = numpy.asarray(hl)
+                    SPG_lm_aoG = SPG_lm_aoGs[ia][p0:p1].reshape(nl,l*2+1,-1)
+                    tmp = numpy.einsum('ij,jmp->imp', hl, SPG_lm_aoG)
+                    vppnl += numpy.einsum('imp,imq->pq', SPG_lm_aoG.conj(), tmp)
+        SPG_lm_aoGs=None
         return vppnl * (1./ngrids**2)
 
     for k, kpt in enumerate(kpts):
@@ -454,7 +497,6 @@ def vppnl_by_k(kpt):
         vpp = vpp[0]
     return numpy.asarray(vpp)
 
-
 def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=numpy.zeros((1,3)), kpts_band=None):
     '''Get the Coulomb (J) AO matrix at sampled k-points.
 
@@ -1859,7 +1901,7 @@ def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None,
     get_rho = get_rho
 
 
-def multigrid(mf):
+def multigrid_fftdf(mf):
     '''Use MultiGridFFTDF to replace the default FFTDF integration method in
     the DFT object.
     '''
@@ -1867,56 +1909,7 @@ def multigrid(mf):
     mf.with_df.__dict__.update(old_df.__dict__)
     return mf
 
+multigrid = multigrid_fftdf  # old name of multigrid_fftdf, kept for backward compatibility
 
 def _pgto_shells(cell):
     return cell._bas[:,NPRIM_OF].sum()
-
-def _take_4d(a, indices):
-    a_shape = a.shape
-    ranges = []
-    for i, s in enumerate(indices):
-        if s is None:
-            idx = numpy.arange(a_shape[i], dtype=numpy.int32)
-        else:
-            idx = numpy.asarray(s, dtype=numpy.int32)
-            idx[idx < 0] += a_shape[i]
-        ranges.append(idx)
-    idx = ranges[0][:,None] * a_shape[1] + ranges[1]
-    idy = ranges[2][:,None] * a_shape[3] + ranges[3]
-    a = a.reshape(a_shape[0]*a_shape[1], a_shape[2]*a_shape[3])
-    out = lib.take_2d(a, idx.ravel(), idy.ravel())
-    return out.reshape([len(s) for s in ranges])
-
-def _takebak_4d(out, a, indices):
-    out_shape = out.shape
-    a_shape = a.shape
-    ranges = []
-    for i, s in enumerate(indices):
-        if s is None:
-            idx = numpy.arange(a_shape[i], dtype=numpy.int32)
-        else:
-            idx = numpy.asarray(s, dtype=numpy.int32)
-            idx[idx < 0] += out_shape[i]
-        assert (len(idx) == a_shape[i])
-        ranges.append(idx)
-    idx = ranges[0][:,None] * out_shape[1] + ranges[1]
-    idy = ranges[2][:,None] * out_shape[3] + ranges[3]
-    nx = idx.size
-    ny = idy.size
-    out = out.reshape(out_shape[0]*out_shape[1], out_shape[2]*out_shape[3])
-    lib.takebak_2d(out, a.reshape(nx,ny), idx.ravel(), idy.ravel())
-    return out
-
-def _take_5d(a, indices):
-    a_shape = a.shape
-    a = a.reshape((a_shape[0]*a_shape[1],) + a_shape[2:])
-    indices = (None,) + indices[2:]
-    return _take_4d(a, indices)
-
-def _takebak_5d(out, a, indices):
-    a_shape = a.shape
-    out_shape = out.shape
-    a = a.reshape((a_shape[0]*a_shape[1],) + a_shape[2:])
-    out = out.reshape((out_shape[0]*out_shape[1],) + out_shape[2:])
-    indices = (None,) + indices[2:]
-    return _takebak_4d(out, a, indices)
diff --git a/pyscf/pbc/dft/multigrid/multigrid_pair.py b/pyscf/pbc/dft/multigrid/multigrid_pair.py
new file mode 100644
index 0000000000..3ef43b688d
--- /dev/null
+++ b/pyscf/pbc/dft/multigrid/multigrid_pair.py
@@ -0,0 +1,1405 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy as np
+from pyscf import __config__
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.gto import moleintor
+from pyscf.pbc import tools
+from pyscf.pbc.lib.kpts_helper import gamma_point
+from pyscf.pbc.df import fft
+from pyscf.pbc.df.df_jk import (
+    _format_dms,
+    _format_kpts_band,
+    _format_jks,
+)
+from pyscf.pbc.dft.multigrid.pp import (
+    _get_vpplocG_part1,
+    _get_pp_without_erf,
+    vpploc_part1_nuc_grad,
+)
+from pyscf.pbc.dft.multigrid.utils import (
+    _take_4d,
+    _take_5d,
+    _takebak_4d,
+    _takebak_5d,
+)
+from pyscf.pbc.dft.multigrid.multigrid import MultiGridFFTDF
+
+NGRIDS = getattr(__config__, 'pbc_dft_multigrid_ngrids', 4)  # default number of grid levels
+KE_RATIO = getattr(__config__, 'pbc_dft_multigrid_ke_ratio', 3.0)  # cutoff ratio between adjacent levels
+REL_CUTOFF = getattr(__config__, 'pbc_dft_multigrid_rel_cutoff', 20.0)  # forwarded to init_gridlevel_info
+GGA_METHOD = getattr(__config__, 'pbc_dft_multigrid_gga_method', 'FFT')
+
+EXTRA_PREC = getattr(__config__, 'pbc_gto_eval_gto_extra_precision', 1e-2)
+RHOG_HIGH_ORDER = getattr(__config__, 'pbc_dft_multigrid_rhog_high_order', False)  # default of _eval_rhoG
+PTR_EXPDROP = 16  # slot of the env array that eval_rho/eval_mat overwrite with the drop threshold
+EXPDROP = getattr(__config__, 'pbc_dft_multigrid_expdrop', 1e-12)  # upper bound of that threshold
+IMAG_TOL = 1e-9  # largest imaginary part tolerated in a real-space potential at non-gamma k-points
+
+libdft = lib.load_library('libdft')  # C library providing the kernels looked up by name in this module
+
+def gradient_gs(f_gs, Gv):
+    r'''Compute the G-space components of :math:`\nabla f(r)`
+    given :math:`f(G)` and :math:`G`,
+    which is equivalent to einsum('np,px->nxp', f_gs, 1j*Gv)
+    '''
+    ng, dim = Gv.shape  # Gv must already expose .shape here, i.e. be an (ng, 3) array
+    assert dim == 3
+    Gv = np.asarray(Gv, order='C', dtype=np.double)
+    f_gs = np.asarray(f_gs.reshape(-1,ng), order='C', dtype=np.complex128)  # n rows of ng coefficients
+    n = f_gs.shape[0]
+    out = np.empty((n,dim,ng), dtype=np.complex128)  # output buffer handed to the C routine
+    # A missing symbol leaves fn as None; the resulting TypeError is re-raised below.
+    fn = getattr(libdft, 'gradient_gs', None)
+    try:
+        fn(out.ctypes.data_as(ctypes.c_void_p),
+           f_gs.ctypes.data_as(ctypes.c_void_p),
+           Gv.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_int(n), ctypes.c_size_t(ng))
+    except Exception as e:
+        raise RuntimeError(f'Error in gradient_gs: {e}')
+    return out
+
+
+class GridLevel_Info(ctypes.Structure):
+    '''
+    Info about the grid levels.
+    '''
+    _fields_ = [("nlevels", ctypes.c_int), # number of grid levels
+                ("rel_cutoff", ctypes.c_double),  # compared by _update_task_list to detect a stale task list
+                ("cutoff", ctypes.POINTER(ctypes.c_double)),  # presumably one cutoff per level — confirm in C
+                ("mesh", ctypes.POINTER(ctypes.c_int))]  # viewed as an (nlevels, 3) int array in this module
+
+class RS_Grid(ctypes.Structure):
+    '''
+    Values on real space multigrid.
+    '''
+    _fields_ = [("nlevels", ctypes.c_int),
+                ("gridlevel_info", ctypes.POINTER(GridLevel_Info)),
+                ("comp", ctypes.c_int),  # presumably the comp value given to init_rs_grid — confirm in C
+                # data is list of 1d arrays, one per grid level (indexed as data[ilevel] by _eval_rhoG)
+                ("data", ctypes.POINTER(ctypes.POINTER(ctypes.c_double)))]
+
+class PGFPair(ctypes.Structure):
+    '''
+    A primitive Gaussian function pair.
+    '''
+    _fields_ = [("ish", ctypes.c_int),  # presumably shell index of the first function — confirm in C
+                ("ipgf", ctypes.c_int),  # presumably primitive index within shell ish — confirm in C
+                ("jsh", ctypes.c_int),  # presumably shell index of the second function — confirm in C
+                ("jpgf", ctypes.c_int),  # presumably primitive index within shell jsh — confirm in C
+                ("iL", ctypes.c_int),  # presumably index into the lattice-image array Ls — confirm in C
+                ("radius", ctypes.c_double)]
+
+
+class Task(ctypes.Structure):
+    '''
+    A single task.
+    '''
+    _fields_ = [("buf_size", ctypes.c_size_t),
+                ("ntasks", ctypes.c_size_t),
+                ("pgfpairs", ctypes.POINTER(ctypes.POINTER(PGFPair))),  # presumably ntasks entries — confirm in C
+                ("radius", ctypes.c_double)]
+
+
+class TaskList(ctypes.Structure):
+    '''
+    A task list.
+    '''
+    _fields_ = [("nlevels", ctypes.c_int),  # read back by _update_task_list and the pass-2 drivers
+                ("hermi", ctypes.c_int),  # read back by _update_task_list to decide on a rebuild
+                ("gridlevel_info", ctypes.POINTER(GridLevel_Info)),  # freed together with the list (free_task_list)
+                ("tasks", ctypes.POINTER(ctypes.POINTER(Task)))]
+
+
+def multi_grids_tasks(cell, ke_cutoff=None, hermi=0,
+                      ngrids=NGRIDS, ke_ratio=KE_RATIO, rel_cutoff=REL_CUTOFF):
+    if ke_cutoff is None:  # default to the cutoff stored on the cell
+        ke_cutoff = cell.ke_cutoff
+    if ke_cutoff is None:
+        raise ValueError("cell.ke_cutoff is not set.")
+    ke1 = ke_cutoff
+    cutoff = [ke1,]  # the finest level uses the full cutoff
+    for i in range(ngrids-1):
+        ke1 /= ke_ratio  # each further level lowers the cutoff by a factor ke_ratio
+        cutoff.append(ke1)
+    cutoff.reverse()  # ascending order: lowest cutoff first
+    a = cell.lattice_vectors()
+    mesh = []
+    for ke in cutoff:
+        mesh.append(tools.cutoff_to_mesh(a, ke))  # one mesh per cutoff, in the same order
+    logger.info(cell, 'ke_cutoff for multigrid tasks:\n%s', cutoff)
+    logger.info(cell, 'meshes for multigrid tasks:\n%s', mesh)
+    gridlevel_info = init_gridlevel_info(cutoff, rel_cutoff, mesh)
+    task_list = build_task_list(cell, gridlevel_info, hermi=hermi)
+    return task_list  # pointer to the C TaskList; free_task_list releases it
+
+
+def _update_task_list(mydf, hermi=0, ngrids=None, ke_ratio=None, rel_cutoff=None):
+    '''
+    Update :attr:`task_list` if necessary.
+    '''
+    cell = mydf.cell
+    if ngrids is None:  # settings left as None are read from mydf
+        ngrids = mydf.ngrids
+    if ke_ratio is None:
+        ke_ratio = mydf.ke_ratio
+    if rel_cutoff is None:
+        rel_cutoff = mydf.rel_cutoff
+
+    need_update = False
+    task_list = getattr(mydf, 'task_list', None)
+    if task_list is None:  # nothing cached yet
+        need_update = True
+    else:
+        hermi_orig = task_list.contents.hermi
+        nlevels = task_list.contents.nlevels
+        rel_cutoff_orig = task_list.contents.gridlevel_info.contents.rel_cutoff
+        #TODO also need to check kinetic energy cutoff change (ke_ratio is not compared either)
+        if (hermi_orig > hermi or
+                nlevels != ngrids or
+                abs(rel_cutoff_orig-rel_cutoff) > 1e-12):
+            need_update = True
+
+    if need_update:
+        if task_list is not None:
+            free_task_list(task_list)  # release the stale C task list before replacing it
+        task_list = multi_grids_tasks(cell, hermi=hermi, ngrids=ngrids,
+                                      ke_ratio=ke_ratio, rel_cutoff=rel_cutoff)
+        mydf.task_list = task_list  # cache on mydf for later calls
+    return task_list
+
+
+def init_gridlevel_info(cutoff, rel_cutoff, mesh):
+    if cutoff[0] < 1e-15:
+        cutoff = cutoff[1:]
+    cutoff = np.asarray(cutoff, order='C', dtype=np.double)
+    mesh = np.asarray(np.asarray(mesh).reshape(-1,3), order='C', dtype=np.int32)
+    nlevels = len(cutoff)
+    gridlevel_info = ctypes.POINTER(GridLevel_Info)()
+    fn = getattr(libdft, "init_gridlevel_info", None)
+    try:
+        fn(ctypes.byref(gridlevel_info),
+           cutoff.ctypes.data_as(ctypes.c_void_p),
+           mesh.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_int(nlevels), ctypes.c_double(rel_cutoff))
+    except Exception as e:
+        raise RuntimeError("Failed to init grid level info. %s" % e)
+    return gridlevel_info
+
+
+def free_gridlevel_info(gridlevel_info):  # counterpart of init_gridlevel_info
+    fn = getattr(libdft, "del_gridlevel_info", None)
+    try:
+        fn(ctypes.byref(gridlevel_info))  # the pointer itself is passed by reference to C
+    except Exception as e:
+        raise RuntimeError("Failed to free grid level info. %s" % e)
+
+
+def init_rs_grid(gridlevel_info, comp):
+    '''
+    Initialize values on real space multigrid
+    '''
+    rs_grid = ctypes.POINTER(RS_Grid)()  # NULL pointer, passed by reference to the C initializer
+    fn = getattr(libdft, "init_rs_grid", None)
+    try:
+        fn(ctypes.byref(rs_grid),
+           ctypes.byref(gridlevel_info),
+           ctypes.c_int(comp))
+    except Exception as e:
+        raise RuntimeError("Failed to initialize real space multigrid data. %s" % e)
+    return rs_grid  # callers in this module release it with free_rs_grid
+
+
+def free_rs_grid(rs_grid):  # counterpart of init_rs_grid
+    fn = getattr(libdft, "del_rs_grid", None)
+    try:
+        fn(ctypes.byref(rs_grid))  # the pointer itself is passed by reference to C
+    except Exception as e:
+        raise RuntimeError("Failed to free real space multigrid data. %s" % e)
+
+
+def build_task_list(cell, gridlevel_info, cell1=None, Ls=None, hermi=0, precision=None):
+    '''
+    Build the task list for multigrid DFT calculations.
+
+    Arguments:
+        cell : :class:`pbc.gto.cell.Cell`
+            The :class:`Cell` instance for the bra basis functions.
+        gridlevel_info : :class:`ctypes.POINTER`
+            The C pointer of the :class:`GridLevel_Info` structure.
+        cell1 : :class:`pbc.gto.cell.Cell`, optional
+            The :class:`Cell` instance for the ket basis functions.
+            If not given, both bra and ket basis functions come from cell.
+        Ls : (*,3) array, optional
+            The cartesian coordinates of the periodic images.
+            Default is calculated by :func:`cell.get_lattice_Ls`.
+        hermi : int, optional
+            If :math:`hermi=1`, the task list is built only for
+            the upper triangle of the matrix. Default is 0.
+        precision : float, optional
+            The integral precision. Default is :attr:`cell.precision`.
+
+    Returns: :class:`ctypes.POINTER`
+        The C pointer of the :class:`TaskList` structure.
+    '''
+    from pyscf.pbc.gto import build_neighbor_list_for_shlpairs, free_neighbor_list
+    if cell1 is None:
+        cell1 = cell
+    if Ls is None:
+        Ls = cell.get_lattice_Ls()
+    if precision is None:
+        precision = cell.precision
+
+    if hermi == 1 and cell1 is not cell:
+        logger.warn(cell,
+                    "Set hermi=0 because cell and cell1 are not the same.")
+        hermi = 0
+
+    ish_atm = np.asarray(cell._atm, order='C', dtype=np.int32)
+    ish_bas = np.asarray(cell._bas, order='C', dtype=np.int32)
+    ish_env = np.asarray(cell._env, order='C', dtype=float)
+    nish = len(ish_bas)
+    ish_rcut, ipgf_rcut = cell.rcut_by_shells(precision=precision,
+                                              return_pgf_radius=True)
+    assert nish == len(ish_rcut)
+    ptr_ipgf_rcut = lib.ndarray_pointer_2d(ipgf_rcut)
+
+    if cell1 is cell:
+        jsh_atm = ish_atm
+        jsh_bas = ish_bas
+        jsh_env = ish_env
+        jsh_rcut = ish_rcut
+        jpgf_rcut = ipgf_rcut
+        ptr_jpgf_rcut = ptr_ipgf_rcut
+    else:
+        jsh_atm = np.asarray(cell1._atm, order='C', dtype=np.int32)
+        jsh_bas = np.asarray(cell1._bas, order='C', dtype=np.int32)
+        jsh_env = np.asarray(cell1._env, order='C', dtype=float)
+        jsh_rcut, jpgf_rcut = cell1.rcut_by_shells(precision=precision,
+                                                   return_pgf_radius=True)
+        ptr_jpgf_rcut = lib.ndarray_pointer_2d(jpgf_rcut)
+    njsh = len(jsh_bas)
+    assert njsh == len(jsh_rcut)
+
+    nl = build_neighbor_list_for_shlpairs(cell, cell1, Ls=Ls,
+                                          ish_rcut=ish_rcut, jsh_rcut=jsh_rcut,
+                                          hermi=hermi)
+
+    task_list = ctypes.POINTER(TaskList)()
+    func = getattr(libdft, "build_task_list", None)
+    try:
+        func(ctypes.byref(task_list),
+             ctypes.byref(nl), ctypes.byref(gridlevel_info),
+             ish_atm.ctypes.data_as(ctypes.c_void_p),
+             ish_bas.ctypes.data_as(ctypes.c_void_p),
+             ish_env.ctypes.data_as(ctypes.c_void_p),
+             ish_rcut.ctypes.data_as(ctypes.c_void_p),
+             ptr_ipgf_rcut,
+             jsh_atm.ctypes.data_as(ctypes.c_void_p),
+             jsh_bas.ctypes.data_as(ctypes.c_void_p),
+             jsh_env.ctypes.data_as(ctypes.c_void_p),
+             jsh_rcut.ctypes.data_as(ctypes.c_void_p),
+             ptr_jpgf_rcut,
+             ctypes.c_int(nish), ctypes.c_int(njsh),
+             Ls.ctypes.data_as(ctypes.c_void_p),
+             ctypes.c_double(precision), ctypes.c_int(hermi))
+    except Exception as e:
+        raise RuntimeError("Failed to build task list. %s" % e)
+    free_neighbor_list(nl)
+    return task_list
+
+
+def free_task_list(task_list):
+    '''
+    Note:
+        This will also free task_list.contents.gridlevel_info.
+    '''
+    if task_list is None:  # nothing to release
+        return
+    func = getattr(libdft, "del_task_list", None)
+    try:
+        func(ctypes.byref(task_list))  # the pointer itself is passed by reference to C
+    except Exception as e:
+        raise RuntimeError("Failed to free task list. %s" % e)
+
+
+def eval_rho(cell, dm, task_list, shls_slice=None, hermi=0, xctype='LDA', kpts=None,
+             dimension=None, cell1=None, shls_slice1=None, Ls=None,
+             a=None, ignore_imag=False):
+    '''
+    Collocate density (opt. gradients) on the real-space grid.
+    The two sets of Gaussian functions can be different.
+
+    Returns:
+        rho: RS_Grid object
+            Densities on real space multigrids.
+    '''
+    cell0 = cell
+    shls_slice0 = shls_slice
+    if cell1 is None:
+        cell1 = cell0
+
+    #TODO mixture of cartesian and spherical bases
+    assert cell0.cart == cell1.cart
+
+    ish_atm = np.asarray(cell0._atm, order='C', dtype=np.int32)
+    ish_bas = np.asarray(cell0._bas, order='C', dtype=np.int32)
+    ish_env = np.asarray(cell0._env, order='C', dtype=np.double)
+    ish_env[PTR_EXPDROP] = min(cell0.precision*EXTRA_PREC, EXPDROP)
+
+    if cell1 is cell0:
+        jsh_atm = ish_atm
+        jsh_bas = ish_bas
+        jsh_env = ish_env
+    else:
+        jsh_atm = np.asarray(cell1._atm, order='C', dtype=np.int32)
+        jsh_bas = np.asarray(cell1._bas, order='C', dtype=np.int32)
+        jsh_env = np.asarray(cell1._env, order='C', dtype=np.double)
+        jsh_env[PTR_EXPDROP] = min(cell1.precision*EXTRA_PREC, EXPDROP)
+
+    if shls_slice0 is None:
+        shls_slice0 = (0, cell0.nbas)
+    i0, i1 = shls_slice0
+    if shls_slice1 is None:
+        shls_slice1 = shls_slice0
+    j0, j1 = shls_slice1
+
+    if hermi == 1:
+        assert cell1 is cell0
+        assert i0 == j0 and i1 == j1
+
+    key0 = 'cart' if cell0.cart else 'sph'
+    ao_loc0 = moleintor.make_loc(ish_bas, key0)
+    naoi = ao_loc0[i1] - ao_loc0[i0]
+    if hermi == 1:
+        ao_loc1 = ao_loc0
+    else:
+        key1 = 'cart' if cell1.cart else 'sph'
+        ao_loc1 = moleintor.make_loc(jsh_bas, key1)
+    naoj = ao_loc1[j1] - ao_loc1[j0]
+
+    dm = np.asarray(dm, order='C')
+    assert dm.shape[-2:] == (naoi, naoj)
+
+    if dimension is None:
+        dimension = cell0.dimension
+    assert dimension == getattr(cell1, "dimension", None)
+
+    if Ls is None and dimension > 0:
+        Ls = np.asarray(cell0.get_lattice_Ls(), order='C')
+    elif Ls is None and dimension == 0:
+        Ls = np.zeros((1,3))
+
+    if dimension == 0 or kpts is None or gamma_point(kpts):
+        nkpts, nimgs = 1, Ls.shape[0]
+        dm = dm.reshape(-1,1,naoi,naoj)
+    else:
+        expkL = np.exp(1j*kpts.reshape(-1,3).dot(Ls.T))
+        nkpts, nimgs = expkL.shape
+        dm = dm.reshape(-1,nkpts,naoi,naoj)
+    n_dm = dm.shape[0]
+
+    #TODO check if cell1 has the same lattice vectors
+    if a is None:
+        a = cell0.lattice_vectors()
+    b = np.linalg.inv(a.T)
+
+    if abs(a-np.diag(a.diagonal())).max() < 1e-12:
+        lattice_type = '_orth'
+    else:
+        lattice_type = '_nonorth'
+    xctype = xctype.upper()
+    if xctype == 'LDA':
+        comp = 1
+    elif xctype == 'GGA':
+        if hermi == 1:
+            raise RuntimeError('hermi=1 is not supported for GGA functional')
+        comp = 4
+    else:
+        raise NotImplementedError('meta-GGA')
+
+    eval_fn = 'make_rho_' + xctype.lower() + lattice_type
+    drv = getattr(libdft, "grid_collocate_drv", None)
+
+    def make_rho_(rs_rho, dm):
+        try:
+            drv(getattr(libdft, eval_fn, None),
+                ctypes.byref(rs_rho),
+                dm.ctypes.data_as(ctypes.c_void_p),
+                ctypes.byref(task_list),
+                ctypes.c_int(comp), ctypes.c_int(hermi),
+                (ctypes.c_int*4)(i0, i1, j0, j1),
+                ao_loc0.ctypes.data_as(ctypes.c_void_p),
+                ao_loc1.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(dimension),
+                Ls.ctypes.data_as(ctypes.c_void_p),
+                a.ctypes.data_as(ctypes.c_void_p),
+                b.ctypes.data_as(ctypes.c_void_p),
+                ish_atm.ctypes.data_as(ctypes.c_void_p),
+                ish_bas.ctypes.data_as(ctypes.c_void_p),
+                ish_env.ctypes.data_as(ctypes.c_void_p),
+                jsh_atm.ctypes.data_as(ctypes.c_void_p),
+                jsh_bas.ctypes.data_as(ctypes.c_void_p),
+                jsh_env.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(cell0.cart))
+        except Exception as e:
+            raise RuntimeError("Failed to compute rho. %s" % e)
+        return rs_rho
+
+    gridlevel_info = task_list.contents.gridlevel_info
+    rho = []
+    for i, dm_i in enumerate(dm):
+        rs_rho = init_rs_grid(gridlevel_info, comp)
+        if dimension == 0 or kpts is None or gamma_point(kpts):
+            make_rho_(rs_rho, dm_i)
+        else:
+            raise NotImplementedError
+        rho.append(rs_rho)
+
+    if n_dm == 1:
+        rho = rho[0]
+    return rho
+
+
+def _eval_rhoG(mydf, dm_kpts, hermi=1, kpts=np.zeros((1,3)), deriv=0,
+               rhog_high_order=RHOG_HIGH_ORDER):
+    assert(deriv < 2)
+    cell = mydf.cell
+
+    dm_kpts = np.asarray(dm_kpts, order='C')
+    dms = _format_dms(dm_kpts, kpts)
+    nset, nkpts, nao = dms.shape[:3]
+
+    task_list = _update_task_list(mydf, hermi=hermi, ngrids=mydf.ngrids,
+                                  ke_ratio=mydf.ke_ratio, rel_cutoff=mydf.rel_cutoff)
+
+    gga_high_order = False
+    if deriv == 0:
+        xctype = 'LDA'
+        rhodim = 1
+    elif deriv == 1:
+        if rhog_high_order:
+            xctype = 'GGA'
+            rhodim = 4
+        else:  # approximate high order derivatives in reciprocal space
+            gga_high_order = True
+            xctype = 'LDA'
+            rhodim = 1
+            deriv = 0
+        assert(hermi == 1 or gamma_point(kpts))
+    elif deriv == 2:  # meta-GGA
+        raise NotImplementedError
+        assert(hermi == 1 or gamma_point(kpts))
+
+    ignore_imag = (hermi == 1)
+
+    rs_rho = eval_rho(cell, dms, task_list, hermi=hermi, xctype=xctype, kpts=kpts,
+                      ignore_imag=ignore_imag)
+
+    nx, ny, nz = mydf.mesh
+    rhoG = np.zeros((nset*rhodim,nx,ny,nz), dtype=np.complex128)
+    nlevels = task_list.contents.nlevels
+    meshes = task_list.contents.gridlevel_info.contents.mesh
+    meshes = np.ctypeslib.as_array(meshes, shape=(nlevels,3))
+    for ilevel in range(nlevels):
+        mesh = meshes[ilevel]
+        ngrids = np.prod(mesh)
+        if nset > 1:
+            rho = []
+            for i in range(nset):
+                rho.append(np.ctypeslib.as_array(rs_rho[i].contents.data[ilevel], shape=(ngrids,)))
+            rho = np.asarray(rho)
+        else:
+            rho = np.ctypeslib.as_array(rs_rho.contents.data[ilevel], shape=(ngrids,))
+
+        weight = 1./nkpts * cell.vol/ngrids
+        rho_freq = tools.fft(rho.reshape(nset*rhodim, -1), mesh)
+        rho = None
+        rho_freq *= weight
+        gx = np.fft.fftfreq(mesh[0], 1./mesh[0]).astype(np.int32)
+        gy = np.fft.fftfreq(mesh[1], 1./mesh[1]).astype(np.int32)
+        gz = np.fft.fftfreq(mesh[2], 1./mesh[2]).astype(np.int32)
+        _takebak_4d(rhoG, rho_freq.reshape((-1,) + tuple(mesh)), (None, gx, gy, gz))
+        rho_freq = None
+
+    if nset > 1:
+        for i in range(nset):
+            free_rs_grid(rs_rho[i])
+    else:
+        free_rs_grid(rs_rho)
+    rs_rho = None
+
+    rhoG = rhoG.reshape(nset,rhodim,-1)
+    if gga_high_order:
+        Gv = cell.get_Gv(mydf.mesh)
+        #:rhoG1 = np.einsum('np,px->nxp', 1j*rhoG[:,0], Gv)
+        rhoG1 = gradient_gs(rhoG[:,0], Gv)
+        rhoG = np.concatenate([rhoG, rhoG1], axis=1)
+        Gv = rhoG1 = None
+    return rhoG
+
+
+def eval_mat(cell, weights, task_list, shls_slice=None, comp=1, hermi=0, deriv=0,
+             xctype='LDA', kpts=None, grid_level=None, dimension=None, mesh=None,
+             cell1=None, shls_slice1=None, Ls=None, a=None):
+
+    cell0 = cell
+    shls_slice0 = shls_slice
+    if cell1 is None:
+        cell1 = cell0
+
+    if mesh is None:
+        mesh = cell0.mesh
+
+    #TODO mixture of cartesian and spherical bases
+    assert cell0.cart == cell1.cart
+
+    ish_atm = np.asarray(cell0._atm, order='C', dtype=np.int32)
+    ish_bas = np.asarray(cell0._bas, order='C', dtype=np.int32)
+    ish_env = np.asarray(cell0._env, order='C', dtype=np.double)
+    ish_env[PTR_EXPDROP] = min(cell0.precision*EXTRA_PREC, EXPDROP)
+
+    if cell1 is cell0:
+        jsh_atm = ish_atm
+        jsh_bas = ish_bas
+        jsh_env = ish_env
+    else:
+        jsh_atm = np.asarray(cell1._atm, order='C', dtype=np.int32)
+        jsh_bas = np.asarray(cell1._bas, order='C', dtype=np.int32)
+        jsh_env = np.asarray(cell1._env, order='C', dtype=np.double)
+        jsh_env[PTR_EXPDROP] = min(cell1.precision*EXTRA_PREC, EXPDROP)
+
+    if shls_slice0 is None:
+        shls_slice0 = (0, cell0.nbas)
+    i0, i1 = shls_slice0
+    if shls_slice1 is None:
+        shls_slice1 = (0, cell1.nbas)
+    j0, j1 = shls_slice1
+
+    if hermi == 1:
+        assert cell1 is cell0
+        assert i0 == j0 and i1 == j1
+
+    key0 = 'cart' if cell0.cart else 'sph'
+    ao_loc0 = moleintor.make_loc(ish_bas, key0)
+    naoi = ao_loc0[i1] - ao_loc0[i0]
+    if hermi == 1:
+        ao_loc1 = ao_loc0
+    else:
+        key1 = 'cart' if cell1.cart else 'sph'
+        ao_loc1 = moleintor.make_loc(jsh_bas, key1)
+    naoj = ao_loc1[j1] - ao_loc1[j0]
+
+    if dimension is None:
+        dimension = cell0.dimension
+    assert dimension == getattr(cell1, "dimension", None)
+
+    if Ls is None and dimension > 0:
+        Ls = np.asarray(cell0.get_lattice_Ls(), order='C')
+    elif Ls is None and dimension == 0:
+        Ls = np.zeros((1,3))
+
+    if dimension == 0 or kpts is None or gamma_point(kpts):
+        nkpts, nimgs = 1, Ls.shape[0]
+    else:
+        expkL = np.exp(1j*kpts.reshape(-1,3).dot(Ls.T))
+        nkpts, nimgs = expkL.shape
+
+    #TODO check if cell1 has the same lattice vectors
+    if a is None:
+        a = cell0.lattice_vectors()
+    b = np.linalg.inv(a.T)
+
+    if abs(a-np.diag(a.diagonal())).max() < 1e-12:
+        lattice_type = '_orth'
+    else:
+        lattice_type = '_nonorth'
+
+    weights = np.asarray(weights, order='C')
+    assert(weights.dtype == np.double)
+    xctype = xctype.upper()
+    n_mat = None
+    if xctype == 'LDA':
+        if weights.ndim == 1:
+            weights = weights.reshape(-1, np.prod(mesh))
+        else:
+            n_mat = weights.shape[0]
+    elif xctype == 'GGA':
+        if weights.ndim == 2:
+            weights = weights.reshape(-1, 4, np.prod(mesh))
+        else:
+            n_mat = weights.shape[0]
+    else:
+        raise NotImplementedError
+
+    eval_fn = 'eval_mat_' + xctype.lower() + lattice_type
+    if deriv > 0:
+        if deriv == 1:
+            assert comp == 3
+            assert hermi == 0
+            eval_fn += '_ip1'
+        else:
+            raise NotImplementedError
+    drv = getattr(libdft, "grid_integrate_drv", None)
+
+    def make_mat(wv):
+        if comp == 1:
+            mat = np.zeros((naoi, naoj))
+        else:
+            mat = np.zeros((comp, naoi, naoj))
+
+        try:
+            drv(getattr(libdft, eval_fn, None),
+                mat.ctypes.data_as(ctypes.c_void_p),
+                wv.ctypes.data_as(ctypes.c_void_p),
+                ctypes.byref(task_list),
+                ctypes.c_int(comp), ctypes.c_int(hermi),
+                ctypes.c_int(grid_level),
+                (ctypes.c_int*4)(i0, i1, j0, j1),
+                ao_loc0.ctypes.data_as(ctypes.c_void_p),
+                ao_loc1.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(dimension),
+                Ls.ctypes.data_as(ctypes.c_void_p),
+                a.ctypes.data_as(ctypes.c_void_p),
+                b.ctypes.data_as(ctypes.c_void_p),
+                ish_atm.ctypes.data_as(ctypes.c_void_p),
+                ish_bas.ctypes.data_as(ctypes.c_void_p),
+                ish_env.ctypes.data_as(ctypes.c_void_p),
+                jsh_atm.ctypes.data_as(ctypes.c_void_p),
+                jsh_bas.ctypes.data_as(ctypes.c_void_p),
+                jsh_env.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(cell0.cart))
+        except Exception as e:
+            raise RuntimeError("Failed to compute rho. %s" % e)
+        return mat
+
+    out = []
+    for wv in weights:
+        if dimension == 0 or kpts is None or gamma_point(kpts):
+            mat = make_mat(wv)
+        else:
+            raise NotImplementedError
+        out.append(mat)
+
+    if n_mat is None:
+        out = out[0]
+    return out
+
+
+def _get_j_pass2(mydf, vG, kpts=np.zeros((1,3)), hermi=1, verbose=None):
+    cell = mydf.cell
+    nkpts = len(kpts)
+    nao = cell.nao_nr()
+    nx, ny, nz = mydf.mesh
+    vG = vG.reshape(-1,nx,ny,nz)
+    nset = vG.shape[0]
+
+    task_list = _update_task_list(mydf, hermi=hermi, ngrids=mydf.ngrids,
+                                  ke_ratio=mydf.ke_ratio, rel_cutoff=mydf.rel_cutoff)
+
+    at_gamma_point = gamma_point(kpts)
+    if at_gamma_point:
+        vj_kpts = np.zeros((nset,nkpts,nao,nao))
+    else:
+        vj_kpts = np.zeros((nset,nkpts,nao,nao), dtype=np.complex128)
+
+    nlevels = task_list.contents.nlevels
+    meshes = task_list.contents.gridlevel_info.contents.mesh
+    meshes = np.ctypeslib.as_array(meshes, shape=(nlevels,3))
+    for ilevel in range(nlevels):
+        mesh = meshes[ilevel]
+        ngrids = np.prod(mesh)
+
+        gx = np.fft.fftfreq(mesh[0], 1./mesh[0]).astype(np.int32)
+        gy = np.fft.fftfreq(mesh[1], 1./mesh[1]).astype(np.int32)
+        gz = np.fft.fftfreq(mesh[2], 1./mesh[2]).astype(np.int32)
+        sub_vG = _take_4d(vG, (None, gx, gy, gz)).reshape(nset,ngrids)
+
+        v_rs = tools.ifft(sub_vG, mesh).reshape(nset,ngrids)
+        vR = np.asarray(v_rs.real, order='C')
+        vI = np.asarray(v_rs.imag, order='C')
+        if at_gamma_point:
+            v_rs = vR
+
+        mat = eval_mat(cell, vR, task_list, comp=1, hermi=hermi,
+                       xctype='LDA', kpts=kpts, grid_level=ilevel, mesh=mesh)
+        vj_kpts += np.asarray(mat).reshape(nset,-1,nao,nao)
+        if not at_gamma_point and abs(vI).max() > IMAG_TOL:
+            raise NotImplementedError
+
+    if nset == 1:
+        vj_kpts = vj_kpts[0]
+    return vj_kpts
+
+
+def _get_j_pass2_ip1(mydf, vG, kpts=np.zeros((1,3)), hermi=0, deriv=1, verbose=None):
+    if deriv == 1:
+        comp = 3  # presumably the x, y, z derivative components — confirm against the C kernel
+        assert hermi == 0
+    else:
+        raise NotImplementedError
+
+    cell = mydf.cell
+    nkpts = len(kpts)
+    nao = cell.nao_nr()
+    nx, ny, nz = mydf.mesh
+    vG = vG.reshape(-1,nx,ny,nz)
+    nset = vG.shape[0]
+
+    task_list = _update_task_list(mydf, hermi=hermi, ngrids=mydf.ngrids,
+                                  ke_ratio=mydf.ke_ratio, rel_cutoff=mydf.rel_cutoff)
+
+    at_gamma_point = gamma_point(kpts)
+    if at_gamma_point:
+        vj_kpts = np.zeros((nset,nkpts,comp,nao,nao))
+    else:
+        vj_kpts = np.zeros((nset,nkpts,comp,nao,nao), dtype=np.complex128)
+
+    nlevels = task_list.contents.nlevels
+    meshes = task_list.contents.gridlevel_info.contents.mesh
+    meshes = np.ctypeslib.as_array(meshes, shape=(nlevels,3))
+    for ilevel in range(nlevels):
+        mesh = meshes[ilevel]
+        ngrids = np.prod(mesh)
+        # Signed integer FFT frequencies of this level select its sub-block of vG.
+        gx = np.fft.fftfreq(mesh[0], 1./mesh[0]).astype(np.int32)
+        gy = np.fft.fftfreq(mesh[1], 1./mesh[1]).astype(np.int32)
+        gz = np.fft.fftfreq(mesh[2], 1./mesh[2]).astype(np.int32)
+        sub_vG = _take_4d(vG, (None, gx, gy, gz)).reshape(nset,ngrids)
+
+        v_rs = tools.ifft(sub_vG, mesh).reshape(nset,ngrids)
+        if at_gamma_point:
+            vR = np.asarray(v_rs.real, order='C', dtype=float)  # only the real part is integrated
+            #vI = None
+        else:
+            raise NotImplementedError
+
+        mat = eval_mat(cell, vR, task_list, comp=comp, hermi=hermi, deriv=deriv,
+                       xctype='LDA', kpts=kpts, grid_level=ilevel, mesh=mesh)
+        mat = np.asarray(mat).reshape(nset,-1,comp,nao,nao)
+        vj_kpts = np.add(vj_kpts, mat, out=vj_kpts)  # accumulate the contribution of this level in place
+
+    if nset == 1:
+        vj_kpts = vj_kpts[0]
+    return vj_kpts
+
+
+def _get_gga_pass2(mydf, vG, kpts=np.zeros((1,3)), hermi=1, verbose=None):
+    cell = mydf.cell
+    nkpts = len(kpts)
+    nao = cell.nao_nr()
+    nx, ny, nz = mydf.mesh
+    vG = vG.reshape(-1,4,nx,ny,nz)  # 4 components per set, as eval_mat expects for xctype='GGA'
+    nset = vG.shape[0]
+
+    task_list = _update_task_list(mydf, hermi=hermi, ngrids=mydf.ngrids,
+                                  ke_ratio=mydf.ke_ratio, rel_cutoff=mydf.rel_cutoff)
+
+    if gamma_point(kpts):
+        veff = np.zeros((nset,nkpts,nao,nao))
+    else:
+        veff = np.zeros((nset,nkpts,nao,nao), dtype=np.complex128)
+
+    nlevels = task_list.contents.nlevels
+    meshes = task_list.contents.gridlevel_info.contents.mesh
+    meshes = np.ctypeslib.as_array(meshes, shape=(nlevels,3))
+    for ilevel in range(nlevels):
+        mesh = meshes[ilevel]
+        ngrids = np.prod(mesh)
+        # Signed integer FFT frequencies of this level select its sub-block of vG.
+        gx = np.fft.fftfreq(mesh[0], 1./mesh[0]).astype(np.int32)
+        gy = np.fft.fftfreq(mesh[1], 1./mesh[1]).astype(np.int32)
+        gz = np.fft.fftfreq(mesh[2], 1./mesh[2]).astype(np.int32)
+        sub_vG = _take_5d(vG, (None, None, gx, gy, gz)).reshape(-1,ngrids)
+        wv = tools.ifft(sub_vG, mesh).real.reshape(nset,4,ngrids)  # the imaginary part is discarded
+        wv = np.asarray(wv, order='C')
+
+        mat = eval_mat(cell, wv, task_list, comp=1, hermi=hermi,
+                       xctype='GGA', kpts=kpts, grid_level=ilevel, mesh=mesh)
+        mat = np.asarray(mat).reshape(nset,-1,nao,nao)
+        veff = np.add(veff, mat, out=veff)  # accumulate the contribution of this level in place
+        if not gamma_point(kpts):
+            raise NotImplementedError
+
+    if nset == 1:
+        veff = veff[0]
+    return veff
+
+
+def _get_gga_pass2_ip1(mydf, vG, kpts=np.zeros((1,3)), hermi=0, deriv=1, verbose=None):
+    '''Contract a GGA-type potential, given in reciprocal space, with the
+    first-derivative (``deriv=1``) AO-pair integrals on every multigrid level.
+
+    Args:
+        mydf : multigrid DF object; ``cell``, ``mesh``, ``ngrids``, ``ke_ratio``
+            and ``rel_cutoff`` are read from it.
+        vG : potential on the finest mesh, reshaped here to (nset,4,nx,ny,nz).
+        kpts : k-points forwarded to ``eval_mat``.
+        hermi : must be 0 when ``deriv`` is 1.
+        deriv : derivative order; only 1 is implemented.
+        verbose : accepted for signature compatibility, not used here.
+
+    Returns:
+        Array of shape (nset,nkpts,3,nao,nao); the leading axis is dropped
+        when nset is 1.
+
+    Raises:
+        NotImplementedError : if ``deriv`` is not 1, or if the real-space
+            potential has an imaginary part above IMAG_TOL away from Gamma.
+    '''
+    if deriv == 1:
+        comp = 3
+        assert hermi == 0
+    else:
+        raise NotImplementedError
+
+    cell = mydf.cell
+    nkpts = len(kpts)
+    nao = cell.nao_nr()
+    nx, ny, nz = mydf.mesh
+    vG = vG.reshape(-1,4,nx,ny,nz)
+    nset = vG.shape[0]
+
+    task_list = _update_task_list(mydf, hermi=hermi, ngrids=mydf.ngrids,
+                                  ke_ratio=mydf.ke_ratio, rel_cutoff=mydf.rel_cutoff)
+
+    at_gamma_point = gamma_point(kpts)
+    if at_gamma_point:
+        vj_kpts = np.zeros((nset,nkpts,comp,nao,nao))
+    else:
+        vj_kpts = np.zeros((nset,nkpts,comp,nao,nao), dtype=np.complex128)
+
+    nlevels = task_list.contents.nlevels
+    meshes = task_list.contents.gridlevel_info.contents.mesh
+    meshes = np.ctypeslib.as_array(meshes, shape=(nlevels,3))
+    for ilevel in range(nlevels):
+        mesh = meshes[ilevel]
+        ngrids = np.prod(mesh)
+
+        # Pick the Fourier components of the full mesh that live on this level.
+        gx = np.fft.fftfreq(mesh[0], 1./mesh[0]).astype(np.int32)
+        gy = np.fft.fftfreq(mesh[1], 1./mesh[1]).astype(np.int32)
+        gz = np.fft.fftfreq(mesh[2], 1./mesh[2]).astype(np.int32)
+        sub_vG = _take_5d(vG, (None, None, gx, gy, gz)).reshape(-1,ngrids)
+
+        v_rs = tools.ifft(sub_vG, mesh).reshape(nset,4,ngrids)
+        # Only the real part is integrated below.  Check for a genuine
+        # imaginary part first so the unsupported case fails before the
+        # integration is carried out rather than after it.
+        if not at_gamma_point and abs(v_rs.imag).max() > IMAG_TOL:
+            raise NotImplementedError
+        vR = np.asarray(v_rs.real, order='C')
+
+        mat = eval_mat(cell, vR, task_list, comp=comp, hermi=hermi, deriv=deriv,
+                       xctype='GGA', kpts=kpts, grid_level=ilevel, mesh=mesh)
+        vj_kpts += np.asarray(mat).reshape(nset,-1,comp,nao,nao)
+
+    if nset == 1:
+        vj_kpts = vj_kpts[0]
+    return vj_kpts
+
+
+def _rks_gga_wv0(rho, vxc, weight):
+    '''Weighted GGA potential components for the spin-restricted case.
+
+    Row 0 of the result is ``weight * vrho``; rows 1-3 are
+    ``2 * weight * vgamma * rho[1:4]``.  ``vrho`` and ``vgamma`` are the
+    first two entries of ``vxc``.  Returns an array of shape (4, ngrid).
+    '''
+    vrho, vgamma = vxc[:2]
+    wv = np.empty((4, vrho.size))
+    np.multiply(weight, vrho, out=wv[0])
+    # All three gradient components are handled in one broadcasted product.
+    grad_part = wv[1:]
+    np.multiply(vgamma, rho[1:4], out=grad_part)
+    np.multiply(weight * 2, grad_part, out=grad_part)
+    return wv
+
+
+def _uks_gga_wv0(rho, vxc, weight):
+    '''Weighted GGA potential components for the spin-unrestricted case.
+
+    For spin s with same-spin density ``rho_s`` and opposite-spin density
+    ``rho_o`` the result holds
+        wv[s,0]   = weight * vrho[:,s]
+        wv[s,1:4] = weight * (2 * rho_s[1:4] * vsigma[:,ss] + rho_o[1:4] * vsigma[:,1])
+    where ``ss`` is column 0 for alpha and column 2 for beta.
+    Returns an array of shape (2, 4, ngrids).
+    '''
+    rhoa, rhob = rho
+    vrho, vsigma = vxc[:2]
+    ngrids = vrho.shape[0]
+    wv = np.empty((2, 4, ngrids))
+    # (spin index, same-spin density, opposite-spin density, same-spin vsigma column)
+    channels = ((0, rhoa, rhob, 0), (1, rhob, rhoa, 2))
+    for s, rho_same, rho_other, col in channels:
+        np.multiply(weight, vrho[:,s], out=wv[s,0])
+        grad_part = wv[s,1:]
+        np.multiply(rho_same[1:4], vsigma[:,col], out=grad_part)
+        np.multiply(2., grad_part, out=grad_part)
+        np.add(grad_part, np.multiply(rho_other[1:4], vsigma[:,1]), out=grad_part)
+        np.multiply(weight, grad_part, out=grad_part)
+    return wv
+
+
+def _rks_gga_wv0_pw(cell, rho, vxc, weight, mesh):
+    '''Spin-restricted GGA potential assembled in reciprocal space.
+
+    ``vrho`` and the three products ``vgamma * rho[1:4]`` are Fourier
+    transformed and combined by the C routine ``get_gga_vrho_gs``; the
+    commented expression below is the reference formula kept from the
+    original implementation.
+
+    Args:
+        cell : Cell object, used for the G-vectors of ``mesh``.
+        rho : real-space density and its gradient; rows 1-3 are used here.
+        vxc : XC derivatives; the first two entries are (vrho, vgamma).
+        weight : scalar weight passed to the C routine as a double.
+        mesh : FFT mesh.
+
+    Returns:
+        complex128 array of length ngrid.
+    '''
+    vrho, vgamma = vxc[:2]
+    ngrid = vrho.size
+    buf = np.empty((3,ngrid))
+    for i in range(1, 4):
+        buf[i-1] = np.multiply(vgamma, rho[i], out=buf[i-1])
+
+    vrho_freq = tools.fft(vrho, mesh).reshape((1,ngrid))
+    buf_freq = tools.fft(buf, mesh).reshape((3,ngrid))
+    Gv = cell.get_Gv(mesh)
+    #out  = vrho_freq - 2j * np.einsum('px,xp->p', Gv, buf_freq)
+    #out *= weight
+
+    out = np.empty((ngrid,), order="C", dtype=np.complex128)
+    # No default for getattr: a missing symbol should surface as an
+    # AttributeError naming it (as in _uks_gga_wv0_pw), not as
+    # "'NoneType' object is not callable".
+    func = getattr(libdft, 'get_gga_vrho_gs')
+    func(out.ctypes.data_as(ctypes.c_void_p),
+         vrho_freq.ctypes.data_as(ctypes.c_void_p),
+         buf_freq.ctypes.data_as(ctypes.c_void_p),
+         Gv.ctypes.data_as(ctypes.c_void_p),
+         ctypes.c_double(weight), ctypes.c_int(ngrid))
+    return out
+
+
+def _uks_gga_wv0_pw(cell, rho, vxc, weight, mesh):
+    '''Spin-unrestricted GGA potential assembled in reciprocal space.
+
+    For each spin the gradient buffer is
+    ``vgamma[:,ss] * rho_same[1:4] + 0.5 * vgamma[:,1] * rho_other[1:4]``
+    (``ss`` = column 0 for alpha, 2 for beta).  The buffers and ``vrho`` are
+    Fourier transformed and combined per spin by the C routine
+    ``get_gga_vrho_gs``.  Returns a complex128 array of shape (2, ngrid).
+    '''
+    rhoa, rhob = rho
+    vrho, vgamma = vxc[:2]
+    ngrid = vrho.shape[0]
+    grad_a, grad_b = rhoa[1:4], rhob[1:4]
+
+    buf = np.empty((2,3,ngrid))
+    # alpha channel
+    np.multiply(vgamma[:,0], grad_a, out=buf[0])
+    cross = np.multiply(vgamma[:,1], grad_b)
+    np.multiply(.5, cross, out=cross)
+    np.add(buf[0], cross, out=buf[0])
+    # beta channel
+    np.multiply(vgamma[:,2], grad_b, out=buf[1])
+    cross = np.multiply(vgamma[:,1], grad_a)
+    np.multiply(.5, cross, out=cross)
+    np.add(buf[1], cross, out=buf[1])
+    cross = None
+
+    vrho_freq = tools.fft(vrho.T, mesh).reshape((2,ngrid))
+    buf_freq = tools.fft(buf.reshape(-1,ngrid), mesh).reshape((2,3,ngrid))
+    Gv = cell.get_Gv(mesh)
+    # Reference expression per spin:
+    #out  = vrho_freq - 2j * np.einsum('px,xp->p', Gv, buf_freq)
+    #out *= weight
+
+    out = np.empty((2,ngrid), order="C", dtype=np.complex128)
+    kernel = getattr(libdft, 'get_gga_vrho_gs')
+    gv_ptr = Gv.ctypes.data_as(ctypes.c_void_p)
+    for out_s, vrho_s, buf_s in zip(out, vrho_freq, buf_freq):
+        kernel(out_s.ctypes.data_as(ctypes.c_void_p),
+               vrho_s.ctypes.data_as(ctypes.c_void_p),
+               buf_s.ctypes.data_as(ctypes.c_void_p),
+               gv_ptr,
+               ctypes.c_double(weight), ctypes.c_int(ngrid))
+    return out
+
+
+def nr_rks(mydf, xc_code, dm_kpts, hermi=1, kpts=None,
+           kpts_band=None, with_j=False, return_j=False, verbose=None):
+    '''
+    Same as multigrid.nr_rks, but considers Hermitian symmetry also for GGA
+
+    Args:
+        mydf : multigrid DF object; ``cell``, ``mesh``, ``kpts``, ``_numint``
+            and ``vpplocG_part1`` are read from it.
+        xc_code : XC functional description; only LDA and GGA are handled.
+        dm_kpts : density matrix (or matrices).
+        hermi : Hermiticity flag forwarded to the density/potential builders.
+        kpts : k-points of ``dm_kpts``; defaults to ``mydf.kpts``.
+        kpts_band : k-points of the output matrices.
+        with_j : add the Coulomb potential to the returned matrix.
+        return_j : also build the Coulomb matrix and attach it as ``vj``.
+        verbose : logger or verbosity level.
+
+    Returns:
+        (nelec, excsum, veff).  ``veff`` is tagged with ``ecoul``, ``exc``,
+        ``vj`` (None unless ``return_j``) and ``vk`` (always None).
+
+    Raises:
+        NotImplementedError : for functionals that are neither LDA nor GGA.
+    '''
+    if kpts is None: kpts = mydf.kpts
+    log = logger.new_logger(mydf, verbose)
+    cell = mydf.cell
+    dm_kpts = np.asarray(dm_kpts, order='C')
+    dms = _format_dms(dm_kpts, kpts)
+    nset, nkpts, nao = dms.shape[:3]
+    kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
+
+    ni = mydf._numint
+    xctype = ni._xc_type(xc_code)
+    if xctype == 'LDA':
+        deriv = 0
+    elif xctype == 'GGA':
+        deriv = 1
+    else:
+        # Reject unsupported functionals here; otherwise `deriv` is unbound
+        # and the density build below dies with a NameError.
+        raise NotImplementedError('xc type %s is not supported' % xctype)
+    rhoG = _eval_rhoG(mydf, dm_kpts, hermi, kpts, deriv)
+
+    mesh = mydf.mesh
+    ngrids = np.prod(mesh)
+
+    coulG = tools.get_coulG(cell, mesh=mesh)
+    #vG = np.einsum('ng,g->ng', rhoG[:,0], coulG)
+    vG = np.empty_like(rhoG[:,0], dtype=np.result_type(rhoG[:,0], coulG))
+    for i, rhoG_i in enumerate(rhoG[:,0]):
+        vG[i] = np.multiply(rhoG_i, coulG, out=vG[i])
+    coulG = None
+
+    # vpplocG_part1 enters with a factor 2 for the energy evaluation and is
+    # reduced to a factor 1 afterwards for the potential.
+    if mydf.vpplocG_part1 is not None:
+        for i in range(nset):
+            #vG[i] += mydf.vpplocG_part1 * 2
+            vG[i] = np.add(vG[i], np.multiply(2., mydf.vpplocG_part1), out=vG[i])
+
+    #ecoul = .5 * np.einsum('ng,ng->n', rhoG[:,0].real, vG.real)
+    #ecoul+= .5 * np.einsum('ng,ng->n', rhoG[:,0].imag, vG.imag)
+    ecoul = np.zeros((rhoG.shape[0],))
+    for i in range(rhoG.shape[0]):
+        ecoul[i] = .5 * np.vdot(rhoG[i,0], vG[i]).real
+
+    ecoul /= cell.vol
+    log.debug('Multigrid Coulomb energy %s', ecoul)
+
+    if mydf.vpplocG_part1 is not None:
+        for i in range(nset):
+            #vG[i] -= mydf.vpplocG_part1
+            vG[i] = np.subtract(vG[i], mydf.vpplocG_part1, out=vG[i])
+
+    weight = cell.vol / ngrids
+    # *(1./weight) because rhoR is scaled by weight in _eval_rhoG.  When
+    # computing rhoR with IFFT, the weight factor is not needed.
+    rhoR = tools.ifft(rhoG.reshape(-1,ngrids), mesh).real * (1./weight)
+    rhoR = rhoR.reshape(nset,-1,ngrids)
+    wv_freq = []
+    nelec = np.zeros(nset)
+    excsum = np.zeros(nset)
+    for i in range(nset):
+        exc, vxc = ni.eval_xc(xc_code, rhoR[i], spin=0, deriv=1)[:2]
+        if xctype == 'LDA':
+            wv = np.multiply(weight, vxc[0])
+            wv_freq.append(tools.fft(wv, mesh))
+            wv = None
+        elif xctype == 'GGA':
+            if GGA_METHOD.upper() == 'FFT':
+                wv_freq.append(_rks_gga_wv0_pw(cell, rhoR[i], vxc, weight, mesh).reshape(1,ngrids))
+            else:
+                wv = _rks_gga_wv0(rhoR[i], vxc, weight)
+                wv_freq.append(tools.fft(wv, mesh))
+                wv = None
+
+        nelec[i]  += np.sum(rhoR[i,0]) * weight
+        excsum[i] += np.sum(np.multiply(rhoR[i,0], exc)) * weight
+        exc = vxc = None
+
+    rhoR = rhoG = None
+
+    if len(wv_freq) == 1:
+        wv_freq = wv_freq[0].reshape(nset,-1,*mesh)
+    else:
+        wv_freq = np.asarray(wv_freq).reshape(nset,-1,*mesh)
+
+    if nset == 1:
+        ecoul = ecoul[0]
+        nelec = nelec[0]
+        excsum = excsum[0]
+    log.debug('Multigrid exc %s  nelec %s', excsum, nelec)
+
+    # NOTE(review): kpts_band was already formatted at the top of the
+    # function; repeating the call rebinds input_band to the formatted
+    # array instead of the caller's argument.  Kept as in the original --
+    # confirm whether this is intended.
+    kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
+    if xctype == 'LDA':
+        if with_j:
+            wv_freq[:,0] += vG.reshape(nset,*mesh)
+        veff = _get_j_pass2(mydf, wv_freq, kpts_band, verbose=log)
+    elif xctype == 'GGA':
+        if with_j:
+            #wv_freq[:,0] += vG.reshape(nset,*mesh)
+            wv_freq[:,0] = np.add(wv_freq[:,0], vG.reshape(nset,*mesh), out=wv_freq[:,0])
+        if GGA_METHOD.upper() == 'FFT':
+            veff = _get_j_pass2(mydf, wv_freq, kpts_band, verbose=log)
+        else:
+            veff = _get_gga_pass2(mydf, wv_freq, kpts_band, hermi=hermi, verbose=log)
+    wv_freq = None
+    veff = _format_jks(veff, dm_kpts, input_band, kpts)
+
+    if return_j:
+        vj = _get_j_pass2(mydf, vG, kpts_band, verbose=log)
+        # Format the Coulomb matrix itself; passing veff here would discard
+        # the vj just computed and return veff under the name vj.
+        vj = _format_jks(vj, dm_kpts, input_band, kpts)
+    else:
+        vj = None
+    vG = None
+
+    veff = lib.tag_array(veff, ecoul=ecoul, exc=excsum, vj=vj, vk=None)
+    return nelec, excsum, veff
+
+def nr_uks(mydf, xc_code, dm_kpts, hermi=1, kpts=None,
+           kpts_band=None, with_j=False, return_j=False, verbose=None):
+    '''
+    Spin-unrestricted counterpart of nr_rks in this module.
+
+    Args:
+        mydf : multigrid DF object; ``cell``, ``mesh``, ``kpts``, ``_numint``
+            and ``vpplocG_part1`` are read from it.
+        xc_code : XC functional description; only LDA and GGA are handled.
+        dm_kpts : alpha and beta density matrices; the number of density
+            sets found by _format_dms is halved to count spin pairs.
+        hermi : Hermiticity flag forwarded to the density/potential builders.
+        kpts : k-points of ``dm_kpts``; defaults to ``mydf.kpts``.
+        kpts_band : k-points of the output matrices.
+        with_j : add the Coulomb potential to both spin channels.
+        return_j : also build the Coulomb matrix and attach it as ``vj``.
+        verbose : logger or verbosity level.
+
+    Returns:
+        (nelec, excsum, veff).  ``veff`` is tagged with ``ecoul``, ``exc``,
+        ``vj`` (None unless ``return_j``) and ``vk`` (always None).
+
+    Raises:
+        NotImplementedError : for functionals that are neither LDA nor GGA.
+    '''
+    if kpts is None: kpts = mydf.kpts
+    log = logger.new_logger(mydf, verbose)
+    cell = mydf.cell
+    dm_kpts = np.asarray(dm_kpts, order='C')
+    dms = _format_dms(dm_kpts, kpts)
+    nset, nkpts, nao = dms.shape[:3]
+    nset //= 2
+    kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
+
+    mesh = mydf.mesh
+    ngrids = np.prod(mesh)
+    ni = mydf._numint
+    xctype = ni._xc_type(xc_code)
+    if xctype == 'LDA':
+        deriv = 0
+    elif xctype == 'GGA':
+        deriv = 1
+    else:
+        # Reject unsupported functionals here; otherwise `deriv` is unbound
+        # and the density build below dies with a NameError.
+        raise NotImplementedError('xc type %s is not supported' % xctype)
+
+    rhoG = _eval_rhoG(mydf, dm_kpts, hermi, kpts, deriv)
+    rhoG = rhoG.reshape(nset,2,-1,ngrids)
+
+    coulG = tools.get_coulG(cell, mesh=mesh)
+    #vG = np.einsum('nsg,g->ng', rhoG[:,:,0], coulG)
+    vG = np.empty((nset,ngrids), dtype=np.result_type(rhoG[:,:,0], coulG))
+    for i, rhoG_i in enumerate(rhoG[:,:,0]):
+        # The Coulomb potential is built from the total (alpha+beta) density.
+        vG[i] = np.multiply(np.add(rhoG_i[0], rhoG_i[1]), coulG, out=vG[i])
+    coulG = None
+
+    # vpplocG_part1 enters with a factor 2 for the energy evaluation and is
+    # reduced to a factor 1 afterwards for the potential.
+    if mydf.vpplocG_part1 is not None:
+        for i in range(nset):
+            #vG[i] += mydf.vpplocG_part1 * 2
+            vG[i] = np.add(vG[i], np.multiply(2., mydf.vpplocG_part1), out=vG[i])
+
+    ecoul = np.zeros(nset)
+    for i in range(nset):
+        ecoul[i] = .5 * np.vdot(np.add(rhoG[i,0,0], rhoG[i,1,0]), vG[i]).real
+
+    ecoul /= cell.vol
+    log.debug('Multigrid Coulomb energy %s', ecoul)
+
+    if mydf.vpplocG_part1 is not None:
+        for i in range(nset):
+            #vG[i] -= mydf.vpplocG_part1
+            vG[i] = np.subtract(vG[i], mydf.vpplocG_part1, out=vG[i])
+
+    weight = cell.vol / ngrids
+    # *(1./weight) because rhoR is scaled by weight in _eval_rhoG.  When
+    # computing rhoR with IFFT, the weight factor is not needed.
+    rhoR = tools.ifft(rhoG.reshape(-1,ngrids), mesh).real * (1./weight)
+    rhoR = rhoR.reshape(nset,2,-1,ngrids)
+    wv_freq = []
+    nelec = np.zeros(nset)
+    excsum = np.zeros(nset)
+    for i in range(nset):
+        exc, vxc = ni.eval_xc(xc_code, rhoR[i], spin=1, deriv=1)[:2]
+        if xctype == 'LDA':
+            wv = np.multiply(weight, vxc[0].T)
+            wv_freq.append(tools.fft(wv, mesh))
+            wv = None
+        elif xctype == 'GGA':
+            if GGA_METHOD.upper() == 'FFT':
+                wv_freq.append(_uks_gga_wv0_pw(cell, rhoR[i], vxc, weight, mesh))
+            else:
+                wv = _uks_gga_wv0(rhoR[i], vxc, weight)
+                wv_freq.append(tools.fft(wv.reshape(-1,*mesh), mesh))
+                wv = None
+
+        nelec[i]  += np.sum(rhoR[i,:,0]).sum() * weight
+        excsum[i] += np.sum(np.multiply(np.add(rhoR[i,0,0],rhoR[i,1,0]), exc)) * weight
+        exc = vxc = None
+
+    rhoR = rhoG = None
+
+    if len(wv_freq) == 1:
+        wv_freq = wv_freq[0].reshape(nset,2,-1,*mesh)
+    else:
+        wv_freq = np.asarray(wv_freq).reshape(nset,2,-1,*mesh)
+
+    if nset == 1:
+        ecoul = ecoul[0]
+        nelec = nelec[0]
+        excsum = excsum[0]
+    log.debug('Multigrid exc %s  nelec %s', excsum, nelec)
+
+    # NOTE(review): kpts_band was already formatted at the top of the
+    # function; repeating the call rebinds input_band to the formatted
+    # array instead of the caller's argument.  Kept as in the original --
+    # confirm whether this is intended.
+    kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
+    if xctype == 'LDA':
+        if with_j:
+            for s in range(2):
+                wv_freq[:,s,0] += vG.reshape(nset,*mesh)
+        veff = _get_j_pass2(mydf, wv_freq, kpts_band, verbose=log)
+    elif xctype == 'GGA':
+        if with_j:
+            #wv_freq[:,:,0] += vG.reshape(nset,*mesh)
+            for s in range(2):
+                wv_freq[:,s,0] = np.add(wv_freq[:,s,0], vG.reshape(nset,*mesh), out=wv_freq[:,s,0])
+        if GGA_METHOD.upper() == 'FFT':
+            veff = _get_j_pass2(mydf, wv_freq, kpts_band, verbose=log)
+        else:
+            veff = _get_gga_pass2(mydf, wv_freq, kpts_band, hermi=hermi, verbose=log)
+    wv_freq = None
+    veff = _format_jks(veff, dm_kpts, input_band, kpts)
+
+    if return_j:
+        vj = _get_j_pass2(mydf, vG, kpts_band, verbose=log)
+        # Format the Coulomb matrix itself; passing veff here would discard
+        # the vj just computed and return veff under the name vj.
+        vj = _format_jks(vj, dm_kpts, input_band, kpts)
+    else:
+        vj = None
+    vG = None
+
+    veff = lib.tag_array(veff, ecoul=ecoul, exc=excsum, vj=vj, vk=None)
+    return nelec, excsum, veff
+
+def get_veff_ip1(mydf, dm_kpts, xc_code=None, kpts=np.zeros((1,3)), kpts_band=None, spin=0):
+    '''Effective-potential (Coulomb + local PP part 1 + XC) matrices
+    contracted with the first-derivative AO-pair integrals.
+
+    The reciprocal-space density is cached on ``mydf.rhoG`` as a side effect.
+
+    Args:
+        mydf : multigrid DF object; ``cell``, ``mesh``, ``_numint`` and
+            ``vpplocG_part1`` are read from it.
+        dm_kpts : density matrix (alpha and beta matrices when ``spin=1``).
+        xc_code : XC functional description; only LDA and GGA are handled.
+        kpts : k-points of ``dm_kpts``.
+        kpts_band : k-points of the output; formatted by _format_kpts_band.
+        spin : 0 for restricted, 1 for unrestricted.
+
+    Returns:
+        Array whose trailing axes are (3,nao,nao); the k-point axis is
+        dropped when there is a single k-point and the set axis when nset is 1.
+
+    Raises:
+        NotImplementedError : for functionals that are neither LDA nor GGA.
+    '''
+    cell = mydf.cell
+    dm_kpts = np.asarray(dm_kpts, order='C')
+    dms = _format_dms(dm_kpts, kpts)
+    nset, nkpts, nao = dms.shape[:3]
+    kpts_band = _format_kpts_band(kpts_band, kpts)
+    if spin == 1:
+        nset //= 2
+
+    mesh = mydf.mesh
+    ngrids = np.prod(mesh)
+    ni = mydf._numint
+    xctype = ni._xc_type(xc_code)
+    if xctype == 'LDA':
+        deriv = 0
+    elif xctype == 'GGA':
+        deriv = 1
+    else:
+        # Reject unsupported functionals here; otherwise `deriv` is unbound
+        # and the density build below dies with a NameError.
+        raise NotImplementedError('xc type %s is not supported' % xctype)
+    # NOTE(review): the density is evaluated with kpts=kpts_band rather than
+    # kpts; the two coincide when kpts_band is None -- confirm for other cases.
+    rhoG = _eval_rhoG(mydf, dm_kpts, hermi=1, kpts=kpts_band, deriv=deriv)
+    if spin == 1:
+        rhoG = rhoG.reshape(nset,2,-1,ngrids)
+    # cache rhoG for core density gradients
+    mydf.rhoG = rhoG
+
+    coulG = tools.get_coulG(cell, mesh=mesh)
+    vG = np.empty((nset,ngrids), dtype=np.result_type(rhoG, coulG))
+    for i in range(nset):
+        if spin == 0:
+            vG[i] = np.multiply(rhoG[i,0], coulG, out=vG[i])
+        elif spin == 1:
+            tmp = np.add(rhoG[i,0,0], rhoG[i,1,0])
+            vG[i] = np.multiply(tmp, coulG, out=vG[i])
+
+    if mydf.vpplocG_part1 is not None:
+        for i in range(nset):
+            vG[i] = np.add(vG[i], mydf.vpplocG_part1, out=vG[i])
+
+    weight = cell.vol / ngrids
+
+    # *(1./weight) because rhoR is scaled by weight in _eval_rhoG.  When
+    # computing rhoR with IFFT, the weight factor is not needed.
+    rhoR = tools.ifft(rhoG.reshape(-1,ngrids), mesh).real * (1./weight)
+    if spin == 0:
+        rhoR = rhoR.reshape(nset,-1,ngrids)
+    elif spin == 1:
+        rhoR = rhoR.reshape(nset,2,-1,ngrids)
+
+    wv_freq = []
+    for i in range(nset):
+        exc, vxc = ni.eval_xc(xc_code, rhoR[i], spin=spin, deriv=1)[:2]
+        if spin == 0:
+            if xctype == 'LDA':
+                wv = np.multiply(weight, vxc[0])
+                wv_freq.append(tools.fft(wv, mesh))
+            elif xctype == 'GGA':
+                if GGA_METHOD.upper() == 'FFT':
+                    wv_freq.append(_rks_gga_wv0_pw(cell, rhoR[i], vxc, weight, mesh).reshape(1,ngrids))
+                else:
+                    wv = _rks_gga_wv0(rhoR[i], vxc, weight)
+                    wv_freq.append(tools.fft(wv, mesh))
+        elif spin == 1:
+            if xctype == 'LDA':
+                wv = np.multiply(weight, vxc[0].T)
+                wv_freq.append(tools.fft(wv, mesh))
+            elif xctype == 'GGA':
+                if GGA_METHOD.upper() == 'FFT':
+                    wv_freq.append(_uks_gga_wv0_pw(cell, rhoR[i], vxc, weight, mesh))
+                else:
+                    wv = _uks_gga_wv0(rhoR[i], vxc, weight)
+                    wv_freq.append(tools.fft(wv.reshape(-1,*mesh), mesh))
+        # Drop the temporaries of this set before the next iteration.
+        wv = exc = vxc = None
+
+    rhoR = rhoG = None
+    if spin == 0:
+        if len(wv_freq) == 1:
+            wv_freq = wv_freq[0].reshape(nset,-1,*mesh)
+        else:
+            wv_freq = np.asarray(wv_freq).reshape(nset,-1,*mesh)
+    elif spin == 1:
+        if len(wv_freq) == 1:
+            wv_freq = wv_freq[0].reshape(nset,2,-1,*mesh)
+        else:
+            wv_freq = np.asarray(wv_freq).reshape(nset,2,-1,*mesh)
+
+    # Add the Coulomb (+ local PP) potential to the density component.
+    for i in range(nset):
+        if spin == 0:
+            wv_freq[i,0] = np.add(wv_freq[i,0], vG[i].reshape(*mesh), out=wv_freq[i,0])
+        elif spin == 1:
+            for s in range(2):
+                wv_freq[i,s,0] = np.add(wv_freq[i,s,0], vG[i].reshape(*mesh), out=wv_freq[i,s,0])
+
+    if xctype == 'LDA' or GGA_METHOD.upper() == 'FFT':
+        vj_kpts = _get_j_pass2_ip1(mydf, wv_freq, kpts_band, hermi=0, deriv=1)
+    else:
+        vj_kpts = _get_gga_pass2_ip1(mydf, wv_freq, kpts_band, hermi=0, deriv=1)
+
+    comp = 3
+    nao = cell.nao
+    if spin == 0:
+        vj_kpts = vj_kpts.reshape(nset,nkpts,comp,nao,nao)
+    elif spin == 1:
+        vj_kpts = vj_kpts.reshape(nset,2,nkpts,comp,nao,nao)
+    # Put the Cartesian component axis in front of the k-point axis.
+    vj_kpts = np.moveaxis(vj_kpts, -3, -4)
+
+    if nkpts == 1:
+        vj_kpts = vj_kpts[...,0,:,:]
+    if nset == 1:
+        vj_kpts = vj_kpts[0]
+    return vj_kpts
+
+
+class MultiGridFFTDF2(MultiGridFFTDF):
+    '''
+    Base class for multigrid DFT (version 2).
+
+    Only Gamma-point calculations on orthorhombic lattices are accepted;
+    the constructor raises NotImplementedError otherwise.
+
+    Attributes:
+        task_list : TaskList instance
+            Task list recording which primitive basis function pairs
+            need to be considered.
+        vpplocG_part1 : array
+            Short-range part of the local pseudopotential represented
+            in the reciprocal space. It is cached to reduce cost.
+        rhoG : array
+            Electronic density represented in the reciprocal space.
+            It is cached in nuclear gradient calculations to reduce cost.
+    '''
+    ngrids = getattr(__config__, 'pbc_dft_multigrid_ngrids', 4)
+    ke_ratio = getattr(__config__, 'pbc_dft_multigrid_ke_ratio', 3.0)
+    rel_cutoff = getattr(__config__, 'pbc_dft_multigrid_rel_cutoff', 20.0)
+    _keys = {'ngrids', 'ke_ratio', 'rel_cutoff',
+             'task_list', 'vpplocG_part1', 'rhoG'}
+
+    def __init__(self, cell, kpts=np.zeros((1,3))):
+        fft.FFTDF.__init__(self, cell, kpts)
+        self.task_list = None
+        self.vpplocG_part1 = None
+        self.rhoG = None
+        if not gamma_point(kpts):
+            raise NotImplementedError('MultiGridFFTDF2 only supports Gamma-point calculations.')
+        a = cell.lattice_vectors()
+        if abs(a-np.diag(a.diagonal())).max() > 1e-12:
+            raise NotImplementedError('MultiGridFFTDF2 only supports orthorhombic lattices.')
+
+    def reset(self, cell=None):
+        '''Drop the cached potential/density, free the task list and reset
+        the underlying FFTDF object.'''
+        self.vpplocG_part1 = None
+        self.rhoG = None
+        # getattr: reset() is also reached from __del__, possibly on an
+        # instance whose __init__ failed before task_list was assigned.
+        if getattr(self, 'task_list', None) is not None:
+            free_task_list(self.task_list)
+            self.task_list = None
+        fft.FFTDF.reset(self, cell=cell)
+
+    def __del__(self):
+        # Releases the task list through reset().
+        self.reset()
+
+    def get_veff_ip1(self, dm, xc_code=None, kpts=None, kpts_band=None, spin=0):
+        '''Method wrapper around the module-level get_veff_ip1; ``kpts``
+        defaults to ``self.kpts`` and falls back to the Gamma point.'''
+        if kpts is None:
+            if self.kpts is None:
+                # The shape must be a tuple: np.zeros(1,3) would treat 3 as
+                # the dtype argument and raise TypeError.
+                kpts = np.zeros((1,3))
+            else:
+                kpts = self.kpts
+        # np.reshape also accepts list/tuple input.
+        kpts = np.reshape(kpts, (-1,3))
+        vj = get_veff_ip1(self, dm, xc_code=xc_code,
+                          kpts=kpts, kpts_band=kpts_band, spin=spin)
+        return vj
+
+    def get_pp(self, kpts=None):
+        '''Compute the GTH pseudopotential matrix, which includes
+        the second part of the local potential and the non-local potential.
+        The first part of the local potential is cached as `vpplocG_part1`,
+        which is the reciprocal space representation, to be added to the electron
+        density for computing the Coulomb matrix.
+        In order to get the full PP matrix, the potential due to `vpplocG_part1`
+        needs to be added.
+        '''
+        self.vpplocG_part1 = _get_vpplocG_part1(self, with_rho_core=True)
+        return _get_pp_without_erf(self, kpts)
+
+    vpploc_part1_nuc_grad = vpploc_part1_nuc_grad
diff --git a/pyscf/pbc/dft/multigrid/pp.py b/pyscf/pbc/dft/multigrid/pp.py
new file mode 100644
index 0000000000..13c0813dac
--- /dev/null
+++ b/pyscf/pbc/dft/multigrid/pp.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy
+from pyscf import __config__
+from pyscf import lib, gto
+from pyscf.lib import logger
+from pyscf.pbc import tools
+from pyscf.pbc.gto import pseudo
+from pyscf.pbc.gto.pseudo import pp_int
+from pyscf.pbc.lib.kpts_helper import gamma_point
+
+PP_WITH_RHO_CORE = getattr(__config__, 'pbc_dft_multigrid_pp_with_rho_core', True)
+
+libpbc = lib.load_library('libpbc')
+libdft = lib.load_library('libdft')
+
+def make_rho_core(cell, mesh=None, precision=None, atm_id=None):
+    '''Evaluate on a real-space mesh the Gaussian charge distribution
+    described by the fake cell from fake_cell_vloc_part1.
+
+    Args:
+        cell : Cell object.
+        mesh : grid dimensions; defaults to ``cell.mesh``.
+        precision : forwarded to fake_cell_vloc_part1.
+        atm_id : atom indices forwarded to fake_cell_vloc_part1.
+
+    Returns:
+        1D float array with prod(mesh) entries.
+
+    Raises:
+        NotImplementedError : for non-orthogonal lattices.
+        RuntimeError : if the C driver call fails.
+    '''
+    if mesh is None:
+        mesh = cell.mesh
+    fakecell, max_radius = fake_cell_vloc_part1(cell, atm_id=atm_id, precision=precision)
+    atm = fakecell._atm
+    bas = fakecell._bas
+    env = fakecell._env
+
+    a = numpy.asarray(cell.lattice_vectors(), order='C', dtype=float)
+    if abs(a - numpy.diag(a.diagonal())).max() < 1e-12:
+        lattice_type = '_orth'
+    else:
+        raise NotImplementedError('make_rho_core only supports orthogonal lattices.')
+    eval_fn = 'make_rho_lda' + lattice_type
+
+    b = numpy.asarray(numpy.linalg.inv(a.T), order='C', dtype=float)
+    mesh = numpy.asarray(mesh, order='C', dtype=numpy.int32)
+    rho_core = numpy.zeros((numpy.prod(mesh),), order='C', dtype=float)
+    drv = getattr(libdft, 'build_core_density', None)
+    try:
+        drv(getattr(libdft, eval_fn),
+            rho_core.ctypes.data_as(ctypes.c_void_p),
+            atm.ctypes.data_as(ctypes.c_void_p),
+            bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(len(bas)),
+            env.ctypes.data_as(ctypes.c_void_p),
+            mesh.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(cell.dimension),
+            a.ctypes.data_as(ctypes.c_void_p),
+            b.ctypes.data_as(ctypes.c_void_p), ctypes.c_double(max_radius))
+    except Exception as e:
+        # Chain explicitly so the original failure stays attached as the cause.
+        raise RuntimeError("Failed to compute rho_core. %s" % e) from e
+    return rho_core
+
+
+def _get_pp_without_erf(mydf, kpts=None):
+    '''Sum of the second local part and the non-local part of the
+    pseudopotential AO matrix, as returned by pp_int.get_pp_loc_part2 and
+    pp_int.get_pp_nl.
+
+    ``kpts`` of None means a single zero k-point.  The result for a single
+    k-point (None or a shape-(3,) input) is returned without the leading
+    k-point axis.
+    '''
+    squeeze = kpts is None or numpy.shape(kpts) == (3,)
+    kpts_lst = numpy.zeros((1,3)) if kpts is None else numpy.reshape(kpts, (-1,3))
+
+    cell = mydf.cell
+    vpp = pp_int.get_pp_loc_part2(cell, kpts_lst)
+    vppnl = pp_int.get_pp_nl(cell, kpts_lst)
+
+    for k in range(len(kpts_lst)):
+        if not gamma_point(kpts_lst[k]):
+            vpp[k] += vppnl[k]
+            continue
+        # At the Gamma point only the real parts are kept.
+        vpp[k] = vpp[k].real + vppnl[k].real
+    vppnl = None
+
+    return numpy.asarray(vpp[0] if squeeze else vpp)
+
+
+def get_pp_loc_part1_gs(cell, Gv):
+    '''First local part of the pseudopotential evaluated directly on the
+    given G-vectors by the C routine ``pp_loc_part1_gs``.
+
+    Args:
+        cell : Cell object; atom charges, coordinates and ``_pseudo`` are read.
+        Gv : (ngrid,3) array of G-vectors; must contain the G=0 vector.
+
+    Returns:
+        complex128 array of length ngrid.
+
+    Raises:
+        RuntimeError : if Gv has no G=0 entry or the C call fails.
+    '''
+    coulG = tools.get_coulG(cell, Gv=Gv)
+    G2 = numpy.einsum('ix,ix->i', Gv, Gv)
+    G0idx = numpy.where(G2==0)[0]
+    if G0idx.size == 0:
+        raise RuntimeError("Failed to get vlocG part1. Gv contains no G=0 vector")
+    # ctypes.c_int needs a plain Python int, not the index array from where().
+    G0idx = int(G0idx[0])
+    ngrid = len(G2)
+    Gv = numpy.asarray(Gv, order='C', dtype=numpy.double)
+    coulG = numpy.asarray(coulG, order='C', dtype=numpy.double)
+    G2 = numpy.asarray(G2, order='C', dtype=numpy.double)
+
+    coords = cell.atom_coords()
+    coords = numpy.asarray(coords, order='C', dtype=numpy.double)
+    Z = numpy.empty([cell.natm,], order='C', dtype=numpy.double)
+    rloc = numpy.empty([cell.natm,], order='C', dtype=numpy.double)
+    for ia in range(cell.natm):
+        Z[ia] = cell.atom_charge(ia)
+        symb = cell.atom_symbol(ia)
+        if symb in cell._pseudo:
+            rloc[ia] = cell._pseudo[symb][1]
+        else:
+            # Sentinel for atoms without a pseudopotential entry; presumably
+            # interpreted by the C routine -- TODO confirm.
+            rloc[ia] = -999
+
+    out = numpy.empty((ngrid,), order='C', dtype=numpy.complex128)
+    fn = getattr(libpbc, "pp_loc_part1_gs", None)
+    try:
+        fn(out.ctypes.data_as(ctypes.c_void_p),
+           coulG.ctypes.data_as(ctypes.c_void_p),
+           Gv.ctypes.data_as(ctypes.c_void_p),
+           G2.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_int(G0idx), ctypes.c_int(ngrid),
+           Z.ctypes.data_as(ctypes.c_void_p),
+           coords.ctypes.data_as(ctypes.c_void_p),
+           rloc.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_int(cell.natm))
+    except Exception as e:
+        raise RuntimeError("Failed to get vlocG part1. %s" % e) from e
+    return out
+
+
+def _get_vpplocG_part1(mydf, with_rho_core=PP_WITH_RHO_CORE):
+    '''First local part of the pseudopotential in reciprocal space on
+    ``mydf.mesh``.
+
+    Args:
+        mydf : DF object providing ``cell`` and ``mesh``.
+        with_rho_core : if True, build the core charge density in real
+            space, transform it and multiply by the Coulomb kernel; if
+            False, evaluate the potential directly on the G-vectors.
+
+    Returns:
+        1D array over the grid points of ``mydf.mesh``.
+    '''
+    cell = mydf.cell
+    mesh = mydf.mesh
+
+    if not with_rho_core:
+        # compute rho_core directly in G-space
+        # this is much slower than the following
+        Gv = cell.get_Gv(mesh)
+        vpplocG_part1 = get_pp_loc_part1_gs(cell, Gv)
+    else:
+        # compute rho_core in real space then transform to G-space
+        weight = cell.vol / numpy.prod(mesh)
+        # Build the density on the same mesh that the FFT and the weight
+        # use; the default of make_rho_core is cell.mesh, which need not
+        # coincide with mydf.mesh.
+        rho_core = make_rho_core(cell, mesh=mesh)
+        rhoG_core = weight * tools.fft(rho_core, mesh)
+        rho_core = None
+        coulG = tools.get_coulG(cell, mesh=mesh)
+        vpplocG_part1 = rhoG_core * coulG
+        rhoG_core = coulG = None
+        # G = 0 contribution
+        charges = cell.atom_charges()
+        # Atoms without a pseudopotential entry get rloc = 0 and hence add
+        # nothing to the G=0 term (fake_cell_vloc_part1 represents them by
+        # a very narrow Gaussian) instead of raising KeyError.
+        rloc = numpy.zeros(cell.natm)
+        for ia in range(cell.natm):
+            symb = cell.atom_symbol(ia)
+            if symb in cell._pseudo:
+                rloc[ia] = cell._pseudo[symb][1]
+        vpplocG_part1[0] += 2. * numpy.pi * numpy.sum(rloc * rloc * charges)
+    return vpplocG_part1
+
+
+def get_vpploc_part1_ip1(mydf, kpts=numpy.zeros((1,3))):
+    '''Contract the cached ``mydf.vpplocG_part1`` with the first-derivative
+    AO-pair integrals via _get_j_pass2_ip1.
+
+    Returns 0 when ``mydf.pp_with_erf`` is set.  At the Gamma point only
+    the real part is returned; the k-point axis is dropped for a single
+    k-point.
+    '''
+    from .multigrid_pair import _get_j_pass2_ip1
+    if mydf.pp_with_erf:
+        return 0
+
+    mesh = mydf.mesh
+    vG = mydf.vpplocG_part1
+    # ndarray.reshape returns a new view; the result has to be bound,
+    # otherwise the statement has no effect.
+    vG = vG.reshape(-1,*mesh)
+
+    vpp_kpts = _get_j_pass2_ip1(mydf, vG, kpts, hermi=0, deriv=1)
+    if gamma_point(kpts):
+        vpp_kpts = vpp_kpts.real
+    if len(kpts) == 1:
+        vpp_kpts = vpp_kpts[0]
+    return vpp_kpts
+
+
+def vpploc_part1_nuc_grad(mydf, dm, kpts=numpy.zeros((1,3)), atm_id=None, precision=None):
+    '''Nuclear gradient contribution obtained by integrating the Coulomb
+    potential of the electron density against the derivative of the
+    Gaussian charges of the fake cell from fake_cell_vloc_part1.
+
+    Args:
+        mydf : multigrid DF object; ``cell``, ``mesh`` and the cached
+            ``rhoG`` are read from it.
+        dm : density matrix, used only when ``mydf.rhoG`` is None.
+        kpts : k-points forwarded to _eval_rhoG.
+        atm_id : atom indices forwarded to fake_cell_vloc_part1.
+        precision : forwarded to fake_cell_vloc_part1.
+
+    Returns:
+        Float array of shape (len(fakecell._atm), 3).
+
+    Raises:
+        NotImplementedError : for non-orthogonal lattices.
+        RuntimeError : if the C driver call fails.
+    '''
+    from .multigrid_pair import _eval_rhoG
+    t0 = (logger.process_clock(), logger.perf_counter())
+    cell = mydf.cell
+    fakecell, max_radius = fake_cell_vloc_part1(cell, atm_id=atm_id, precision=precision)
+    atm = fakecell._atm
+    bas = fakecell._bas
+    env = fakecell._env
+
+    a = numpy.asarray(cell.lattice_vectors(), order='C', dtype=float)
+    if abs(a - numpy.diag(a.diagonal())).max() < 1e-12:
+        lattice_type = '_orth'
+    else:
+        raise NotImplementedError('vpploc_part1_nuc_grad only supports orthogonal lattices.')
+    eval_fn = 'eval_mat_lda' + lattice_type + '_ip1'
+
+    b = numpy.asarray(numpy.linalg.inv(a.T), order='C', dtype=float)
+    mesh = numpy.asarray(mydf.mesh, order='C', dtype=numpy.int32)
+    ngrids = numpy.prod(mesh)
+    comp = 3
+    grad = numpy.zeros((len(atm),comp), order="C", dtype=float)
+    drv = getattr(libdft, 'int_gauss_charge_v_rs', None)
+
+    # NOTE(review): a cached mydf.rhoG is used as is, without checking
+    # that it corresponds to `dm` -- confirm callers keep it in sync.
+    if mydf.rhoG is None:
+        rhoG = _eval_rhoG(mydf, dm, hermi=1, kpts=kpts, deriv=0)
+    else:
+        rhoG = mydf.rhoG
+    # Keep only the density component (index 0 of the second-to-last axis).
+    rhoG = rhoG[...,0,:]
+    rhoG = rhoG.reshape(-1,ngrids)
+    if rhoG.shape[0] == 2: #unrestricted
+        rhoG = rhoG[0] + rhoG[1]
+    else:
+        assert rhoG.shape[0] == 1
+        rhoG = rhoG[0]
+
+    coulG = tools.get_coulG(cell, mesh=mesh)
+    vG = numpy.multiply(rhoG, coulG)
+
+    v_rs = numpy.asarray(tools.ifft(vG, mesh).real, order="C")
+    try:
+        drv(getattr(libdft, eval_fn),
+            grad.ctypes.data_as(ctypes.c_void_p),
+            v_rs.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(comp),
+            atm.ctypes.data_as(ctypes.c_void_p),
+            bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(len(bas)),
+            env.ctypes.data_as(ctypes.c_void_p),
+            mesh.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(cell.dimension),
+            a.ctypes.data_as(ctypes.c_void_p),
+            b.ctypes.data_as(ctypes.c_void_p), ctypes.c_double(max_radius))
+    except Exception as e:
+        raise RuntimeError("Failed to compute nuclear gradients of vpploc part1. %s" % e) from e
+    grad *= -1
+    logger.timer(mydf, 'vpploc_part1_nuc_grad', *t0)
+    return grad
+
+
+def fake_cell_vloc_part1(cell, atm_id=None, precision=None):
+    '''
+    Generate fakecell for the non-local term of the local part of
+    the GTH pseudo-potential. Also stores the atomic radii.
+    Differs from pp_int.fake_cell_vloc(cell, cn=0) in the normalization factors.
+
+    Args:
+        cell : Cell object.
+        atm_id : indices of the atoms to include; all atoms when None.
+        precision : defaults to ``cell.precision``; its square is used as
+            the cutoff precision for the Gaussian radii.
+
+    Returns:
+        (fakecell, max_radius): a shallow copy of ``cell`` with replaced
+        ``_atm``/``_bas``/``_env`` and the largest radius found among the
+        entries of ``cell._pseudo`` (0 if there are none).
+    '''
+    from pyscf.pbc.gto.cell import pgf_rcut
+    if atm_id is None:
+        atm_id = numpy.arange(cell.natm)
+    else:
+        atm_id = numpy.asarray(atm_id)
+    natm = len(atm_id)
+
+    if precision is None:
+        precision = cell.precision
+
+    max_radius = 0
+    # symbol -> [exponent, normalization, cutoff radius]
+    kind = {}
+    # FIXME prec may be too tight
+    prec = precision ** 2
+    for symb in cell._pseudo:
+        charge = numpy.sum(cell._pseudo[symb][0])
+        rloc = cell._pseudo[symb][1]
+        zeta = .5 / rloc**2
+        norm = (zeta / numpy.pi) ** 1.5
+        radius = pgf_rcut(0, zeta, charge*norm, precision=prec)
+        max_radius = max(radius, max_radius)
+        kind[symb] = [zeta, norm, radius]
+
+    # env layout: the first natm*3 slots hold the coordinates of the
+    # selected atoms, followed by one (exponent, norm, radius) triple per
+    # non-ghost atom.
+    fake_env = [cell.atom_coords()[atm_id].ravel()]
+    fake_atm = cell._atm[atm_id].copy().reshape(natm,-1)
+    fake_atm[:,gto.PTR_COORD] = numpy.arange(0, natm*3, 3)
+    ptr = natm * 3
+    fake_bas = []
+    for ia, atm in enumerate(atm_id):
+        if cell.atom_charge(atm) == 0:  # pass ghost atoms
+            continue
+
+        symb = cell.atom_symbol(atm)
+        if symb in kind:
+            fake_env.append(kind[symb])
+        else:
+            # No pseudopotential entry for this symbol: use a very large
+            # exponent and a zero radius.
+            alpha = 1e16
+            norm = (alpha / numpy.pi) ** 1.5
+            radius = 0.0
+            fake_env.append([alpha, norm, radius])
+        # One shell per atom whose exponent and coefficient pointers are
+        # ptr and ptr+1; the radius sits at ptr+2.
+        fake_bas.append([ia, 0, 1, 1, 0, ptr, ptr+1, 0])
+        fake_atm[ia,gto.PTR_RADIUS] = ptr+2
+        ptr += 3
+
+    fakecell = cell.copy(deep=False)
+    fakecell._atm = numpy.asarray(fake_atm, order="C", dtype=numpy.int32)
+    fakecell._bas = numpy.asarray(fake_bas, order="C", dtype=numpy.int32).reshape(-1, gto.BAS_SLOTS)
+    fakecell._env = numpy.asarray(numpy.hstack(fake_env), order="C", dtype=float)
+    return fakecell, max_radius
diff --git a/pyscf/pbc/dft/multigrid/utils.py b/pyscf/pbc/dft/multigrid/utils.py
new file mode 100644
index 0000000000..3ca9f0addb
--- /dev/null
+++ b/pyscf/pbc/dft/multigrid/utils.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Qiming Sun <osirpt.sun@gmail.com>
+#
+
+import numpy
+from pyscf import lib
+
+def _take_4d(a, indices):
+    '''Select a sub-block of the 4D array ``a``.
+
+    ``indices`` holds one index sequence per axis; None selects the whole
+    axis and negative indices are wrapped by the axis length.  Axes (0,1)
+    and (2,3) are flattened pairwise and the selection is done with
+    lib.take_2d.  The result has one axis per entry of ``indices``, with
+    the length of the corresponding index sequence.
+
+    The index sequences passed in are not modified.
+    '''
+    a_shape = a.shape
+    ranges = []
+    for i, s in enumerate(indices):
+        if s is None:
+            idx = numpy.arange(a_shape[i], dtype=numpy.int32)
+        else:
+            # numpy.array copies, so wrapping the negative indices in
+            # place cannot alter an int32 array owned by the caller
+            # (numpy.asarray would return that very array).
+            idx = numpy.array(s, dtype=numpy.int32)
+            idx[idx < 0] += a_shape[i]
+        ranges.append(idx)
+    idx = ranges[0][:,None] * a_shape[1] + ranges[1]
+    idy = ranges[2][:,None] * a_shape[3] + ranges[3]
+    a = a.reshape(a_shape[0]*a_shape[1], a_shape[2]*a_shape[3])
+    out = lib.take_2d(a, idx.ravel(), idy.ravel())
+    return out.reshape([len(s) for s in ranges])
+
+def _takebak_4d(out, a, indices):
+    '''Counterpart of _take_4d: hand the 4D block ``a`` to lib.takebak_2d
+    for placement at the given indices of the 4D array ``out``.
+
+    ``indices`` holds one index sequence per axis; None stands for
+    ``arange(a.shape[i])`` and negative indices are wrapped by the length
+    of the corresponding axis of ``out``.  Each index sequence must have
+    the length of the matching axis of ``a``.
+
+    Returns ``out`` reshaped to 2D (axes (0,1) and (2,3) merged).  The
+    index sequences passed in are not modified.
+    '''
+    out_shape = out.shape
+    a_shape = a.shape
+    ranges = []
+    for i, s in enumerate(indices):
+        if s is None:
+            idx = numpy.arange(a_shape[i], dtype=numpy.int32)
+        else:
+            # numpy.array copies, so wrapping the negative indices in
+            # place cannot alter an int32 array owned by the caller
+            # (numpy.asarray would return that very array).
+            idx = numpy.array(s, dtype=numpy.int32)
+            idx[idx < 0] += out_shape[i]
+        assert (len(idx) == a_shape[i])
+        ranges.append(idx)
+    idx = ranges[0][:,None] * out_shape[1] + ranges[1]
+    idy = ranges[2][:,None] * out_shape[3] + ranges[3]
+    nx = idx.size
+    ny = idy.size
+    out = out.reshape(out_shape[0]*out_shape[1], out_shape[2]*out_shape[3])
+    lib.takebak_2d(out, a.reshape(nx,ny), idx.ravel(), idy.ravel())
+    return out
+
+def _take_5d(a, indices):
+    '''5D front end of _take_4d: the two leading axes of ``a`` are merged
+    and taken in full; only ``indices[2:]`` are used for the selection.'''
+    lead = a.shape[0] * a.shape[1]
+    merged = a.reshape((lead,) + a.shape[2:])
+    return _take_4d(merged, (None,) + indices[2:])
+
+def _takebak_5d(out, a, indices):
+    '''5D front end of _takebak_4d: the two leading axes of both ``a`` and
+    ``out`` are merged and taken in full; only ``indices[2:]`` are used.'''
+    src = a.reshape((a.shape[0]*a.shape[1],) + a.shape[2:])
+    dst = out.reshape((out.shape[0]*out.shape[1],) + out.shape[2:])
+    return _takebak_4d(dst, src, (None,) + indices[2:])
diff --git a/pyscf/pbc/dft/rks.py b/pyscf/pbc/dft/rks.py
index 228bc6e91a..d3dc8d1047 100644
--- a/pyscf/pbc/dft/rks.py
+++ b/pyscf/pbc/dft/rks.py
@@ -73,7 +73,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm, hermi,
                                        kpt.reshape(1,3), kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -87,7 +87,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         max_memory = ks.max_memory - lib.current_memory()[0]
         n, exc, vxc = ni.nr_rks(cell, ks.grids, ks.xc, dm, 0, hermi,
                                 kpt, kpts_band, max_memory=max_memory)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         if ks.nlc or ni.libxc.is_nlc(ks.xc):
             if ni.libxc.is_nlc(ks.xc):
                 xc = ks.xc
@@ -98,7 +98,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-            logger.debug(ks, 'nelec with nlc grids = %s', n)
+            logger.info(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     if not hybrid:
diff --git a/pyscf/pbc/dft/test/test_krks_ksym.py b/pyscf/pbc/dft/test/test_krks_ksym.py
index 6c7bd46c4a..615f1d456f 100644
--- a/pyscf/pbc/dft/test/test_krks_ksym.py
+++ b/pyscf/pbc/dft/test/test_krks_ksym.py
@@ -207,14 +207,14 @@ def test_rsh_mdf(self):
     def test_multigrid(self):
         kmf0 = krks.KRKS(cell, kpts=cell.make_kpts(nk))
         kmf0.xc = 'lda'
-        kmf0 = multigrid.multigrid(kmf0)
+        kmf0 = multigrid.multigrid_fftdf(kmf0)
         kmf0.kernel()
         rho0 = kmf0.get_rho()
 
         kpts = cell.make_kpts(nk,space_group_symmetry=True,time_reversal_symmetry=True)
         kmf = pscf.KRKS(cell, kpts=kpts)
         kmf.xc = 'lda'
-        kmf = multigrid.multigrid(kmf)
+        kmf = multigrid.multigrid_fftdf(kmf)
         kmf.kernel()
         self.assertAlmostEqual(kmf.e_tot, kmf0.e_tot, 7)
         rho = kmf.get_rho()
@@ -231,14 +231,14 @@ def test_multigrid(self):
     def test_multigrid_kuks(self):
         kmf0 = pscf.KUKS(cell, kpts=cell.make_kpts(nk))
         kmf0.xc = 'lda'
-        kmf0 = multigrid.multigrid(kmf0)
+        kmf0 = multigrid.multigrid_fftdf(kmf0)
         kmf0.kernel()
         rho0 = kmf0.get_rho()
 
         kpts = cell.make_kpts(nk,space_group_symmetry=True,time_reversal_symmetry=True)
         kmf = pscf.KUKS(cell, kpts=kpts)
         kmf.xc = 'lda'
-        kmf = multigrid.multigrid(kmf)
+        kmf = multigrid.multigrid_fftdf(kmf)
         kmf.kernel()
         self.assertAlmostEqual(kmf.e_tot, kmf0.e_tot, 7)
         rho = kmf.get_rho()
diff --git a/pyscf/pbc/dft/test/test_multigrid.py b/pyscf/pbc/dft/test/test_multigrid.py
index 2cd11e7732..9db362ded3 100644
--- a/pyscf/pbc/dft/test/test_multigrid.py
+++ b/pyscf/pbc/dft/test/test_multigrid.py
@@ -85,12 +85,24 @@ def test_orth_get_pp(self):
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
 
+        # test small memory
+        mydf = multigrid.MultiGridFFTDF(cell_orth)
+        mydf.max_memory = 10
+        out = mydf.get_pp(max_memory=2)
+        self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
+
     def test_nonorth_get_pp(self):
         ref = df.FFTDF(cell_nonorth).get_pp()
         out = multigrid.MultiGridFFTDF(cell_nonorth).get_pp()
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
 
+        # test small memory
+        mydf = multigrid.MultiGridFFTDF(cell_nonorth)
+        mydf.max_memory = 10
+        out = mydf.get_pp(max_memory=2)
+        self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
+
     def test_orth_get_nuc_kpts(self):
         ref = df.FFTDF(cell_orth).get_nuc(kpts)
         out = multigrid.MultiGridFFTDF(cell_orth).get_nuc(kpts)
@@ -133,7 +145,7 @@ def test_multigrid_kuks(self):
         mf = dft.KUKS(cell_he)
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, numpy.array((dm_he,dm_he)), kpts=kpts)
-        out = multigrid.multigrid(mf).get_veff(cell_he, (dm_he,dm_he), kpts=kpts)
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, (dm_he,dm_he), kpts=kpts)
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 8)
@@ -143,7 +155,7 @@ def test_multigrid_krks(self):
         mf = dft.KRKS(cell_he)
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, dm_he, kpts=kpts)
-        out = multigrid.multigrid(mf).get_veff(cell_he, dm_he, kpts=kpts)
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, dm_he, kpts=kpts)
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 8)
@@ -159,7 +171,7 @@ def test_multigrid_kroks(self):
         dm1 = lib.tag_array(numpy.array([dm1,dm1]), mo_coeff=mo,
                             mo_occ=mo_occ*2)
         ref = mf.get_veff(cell_he, dm1, kpts=kpts)
-        out = multigrid.multigrid(mf).get_veff(cell_he, dm1, kpts=kpts)
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, dm1, kpts=kpts)
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
@@ -169,7 +181,7 @@ def test_multigrid_uks(self):
         mf = dft.UKS(cell_he)
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, numpy.array((dm_he[0],dm_he[0])))
-        out = multigrid.multigrid(mf).get_veff(cell_he, (dm_he[0], dm_he[0]))
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, (dm_he[0], dm_he[0]))
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
@@ -179,7 +191,7 @@ def test_multigrid_rks(self):
         mf = dft.RKS(cell_he)
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, dm_he[0])
-        out = multigrid.multigrid(mf).get_veff(cell_he, dm_he[0])
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, dm_he[0])
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
@@ -195,7 +207,7 @@ def test_multigrid_roks(self):
         dm1 = lib.tag_array(numpy.array([dm1,dm1]), mo_coeff=mo,
                             mo_occ=mo_occ*2)
         ref = mf.get_veff(cell_he, dm1)
-        out = multigrid.multigrid(mf).get_veff(cell_he, dm1)
+        out = multigrid.multigrid_fftdf(mf).get_veff(cell_he, dm1)
         self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
@@ -218,8 +230,8 @@ def test_eval_rhoG_orth_kpts(self):
         numpy.random.seed(9)
         dm = numpy.random.random(dm1.shape) + numpy.random.random(dm1.shape) * 1j
         mydf = multigrid.MultiGridFFTDF(cell_orth)
-        rhoG = multigrid._eval_rhoG(mydf, dm, hermi=0, kpts=kpts, deriv=0,
-                                    rhog_high_order=True)
+        rhoG = multigrid.multigrid._eval_rhoG(mydf, dm, hermi=0, kpts=kpts, deriv=0,
+                                              rhog_high_order=True)
         self.assertTrue(rhoG.dtype == numpy.complex128)
 
         mydf = df.FFTDF(cell_orth)
@@ -232,8 +244,8 @@ def test_eval_rhoG_orth_kpts(self):
 
     def test_eval_rhoG_orth_gga(self):
         mydf = multigrid.MultiGridFFTDF(cell_orth)
-        rhoG = multigrid._eval_rhoG(mydf, dm, hermi=1, kpts=kpts, deriv=1,
-                                    rhog_high_order=True)
+        rhoG = multigrid.multigrid._eval_rhoG(mydf, dm, hermi=1, kpts=kpts, deriv=1,
+                                              rhog_high_order=True)
 
         mydf = df.FFTDF(cell_orth)
         ni = dft.numint.KNumInt()
@@ -245,8 +257,8 @@ def test_eval_rhoG_orth_gga(self):
 
     def test_eval_rhoG_nonorth_gga(self):
         mydf = multigrid.MultiGridFFTDF(cell_nonorth)
-        rhoG = multigrid._eval_rhoG(mydf, dm, hermi=1, kpts=kpts, deriv=1,
-                                    rhog_high_order=True)
+        rhoG = multigrid.multigrid._eval_rhoG(mydf, dm, hermi=1, kpts=kpts, deriv=1,
+                                              rhog_high_order=True)
 
         mydf = df.FFTDF(cell_nonorth)
         ni = dft.numint.KNumInt()
@@ -273,7 +285,7 @@ def test_gen_rhf_response(self):
                                     hermi=1, kpts=kpts)
         vj = mydf.get_jk(dm1, with_k=False, kpts=kpts)[0]
         ref += vj
-        v = multigrid._gen_rhf_response(mf, dm_he, hermi=1)(dm1)
+        v = multigrid.multigrid._gen_rhf_response(mf, dm_he, hermi=1)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 8)
@@ -282,7 +294,7 @@ def test_gen_rhf_response(self):
         ref = dft.numint.nr_rks_fxc(ni, cell_he, mydf.grids, mf.xc, dm_he, dm1,
                                     hermi=1, kpts=kpts)
         ref += vj
-        v = multigrid._gen_rhf_response(mf, dm_he, hermi=1)(dm1)
+        v = multigrid.multigrid._gen_rhf_response(mf, dm_he, hermi=1)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 6)
@@ -356,7 +368,7 @@ def test_nr_rks_fxc_st(self):
         mf.xc = 'b88,'
         ref = dft.numint.nr_rks_fxc_st(ni, cell_he, mydf.grids, mf.xc, dm_he, dm1,
                                        singlet=True, kpts=kpts)
-        v = multigrid._gen_rhf_response(mf, dm_he, singlet=True)(dm1)
+        v = multigrid.multigrid._gen_rhf_response(mf, dm_he, singlet=True)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 5)
@@ -364,7 +376,7 @@ def test_nr_rks_fxc_st(self):
         mf.xc = 'lda,'
         ref = dft.numint.nr_rks_fxc_st(ni, cell_he, mydf.grids, mf.xc, dm_he, dm1,
                                        singlet=False, kpts=kpts)
-        v = multigrid._gen_rhf_response(mf, dm_he, singlet=False)(dm1)
+        v = multigrid.multigrid._gen_rhf_response(mf, dm_he, singlet=False)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 4)
@@ -391,7 +403,7 @@ def test_gen_uhf_response(self):
         ref = dft.numint.nr_uks_fxc(ni, cell_he, mydf.grids, mf.xc, dm_he, dm1, hermi=1)
         vj = mydf.get_jk(dm1, with_k=False)[0]
         ref += vj[0] + vj[1]
-        v = multigrid._gen_uhf_response(mf, dm_he, with_j=True, hermi=1)(dm1)
+        v = multigrid.multigrid._gen_uhf_response(mf, dm_he, with_j=True, hermi=1)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 7)
@@ -399,7 +411,7 @@ def test_gen_uhf_response(self):
         mf.xc = 'b88,'
         ref = dft.numint.nr_uks_fxc(ni, cell_he, mydf.grids, mf.xc, dm_he, dm1, hermi=1)
         ref += vj[0] + vj[1]
-        v = multigrid._gen_uhf_response(mf, dm_he, with_j=True, hermi=1)(dm1)
+        v = multigrid.multigrid._gen_uhf_response(mf, dm_he, with_j=True, hermi=1)(dm1)
         self.assertEqual(ref.dtype, v.dtype)
         self.assertEqual(ref.shape, v.shape)
         self.assertAlmostEqual(abs(v-ref).max(), 0, 7)
@@ -454,11 +466,11 @@ def test_orth_uks_fxc_hermi0(self):
 
     def test_rcut_vs_ke_cut(self):
         xc = 'lda,'
-        with lib.temporary_env(multigrid, TASKS_TYPE='rcut'):
+        with lib.temporary_env(multigrid.multigrid, TASKS_TYPE='rcut'):
             mg_df = multigrid.MultiGridFFTDF(cell_orth)
             n1, exc1, v1 = multigrid.nr_rks(mg_df, xc, dm1, kpts=kpts)
             self.assertEqual(len(mg_df.tasks), 3)
-        with lib.temporary_env(multigrid, TASKS_TYPE='ke_cut'):
+        with lib.temporary_env(multigrid.multigrid, TASKS_TYPE='ke_cut'):
             mg_df = multigrid.MultiGridFFTDF(cell_orth)
             n2, exc2, v2 = multigrid.nr_rks(mg_df, xc, dm1, kpts=kpts)
             self.assertEqual(len(mg_df.tasks), 6)
diff --git a/pyscf/pbc/dft/test/test_multigrid2.py b/pyscf/pbc/dft/test/test_multigrid2.py
new file mode 100644
index 0000000000..f23c687a48
--- /dev/null
+++ b/pyscf/pbc/dft/test/test_multigrid2.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import unittest
+import numpy
+from pyscf.pbc import gto, dft
+from pyscf.pbc.dft import multigrid
+from pyscf.pbc.grad import rks as rks_grad
+from pyscf.pbc.grad import uks as uks_grad
+from pyscf.pbc.grad import krks as krks_grad
+
+def setUpModule():  # unittest module fixture: builds the global `cell` used by the tests in this file
+    global cell
+    cell = gto.Cell()
+    boxlen = 5.0
+    cell.a = numpy.array([[boxlen,0.0,0.0],
+                          [0.0,boxlen,0.0],
+                          [0.0,0.0,boxlen]])  # cubic lattice with edge length boxlen
+    cell.atom = """
+        O          1.84560        1.21649        1.10372
+        H          2.30941        1.30070        1.92953
+        H          0.91429        1.26674        1.28886
+    """
+    cell.basis = 'gth-szv'
+    cell.ke_cutoff = 200
+    cell.pseudo = 'gth-pade'
+    cell.verbose = 0
+    cell.use_loose_rcut = True
+    cell.build()  # finalize the cell after all attributes above are set
+
+def tearDownModule():  # drop the module-level `cell` created in setUpModule
+    global cell
+    del cell
+
+def _fftdf_energy_grad(cell, xc):
+    '''Reference energy and gradient: KRKS at a single zero k-point with its default with_df.'''
+    ref = dft.KRKS(cell, kpts=numpy.zeros((1,3)))
+    ref.xc = xc
+    energy = ref.kernel()
+    gradient = krks_grad.Gradients(ref).kernel()
+    return energy, gradient
+
+def _multigrid2_energy_grad(cell, xc, spin=0):
+    '''Energy and nuclear gradient of an SCF run with MultiGridFFTDF2; spin=0 uses RKS, spin=1 uses UKS.'''
+    if spin == 0:
+        scf_cls, grad_cls = dft.RKS, rks_grad.Gradients
+    elif spin == 1:
+        scf_cls, grad_cls = dft.UKS, uks_grad.Gradients
+    else:
+        raise ValueError('spin must be 0 or 1, got %s' % spin)
+    mf = scf_cls(cell)
+    mf.xc = xc
+    mf.with_df = multigrid.MultiGridFFTDF2(cell)
+    e = mf.kernel()
+    return e, grad_cls(mf).kernel()
+
+class KnownValues(unittest.TestCase):  # MultiGridFFTDF2 RKS/UKS results checked against the KRKS reference
+    def test_orth_lda(self):
+        xc = 'lda, vwn'
+        e0, g0 = _fftdf_energy_grad(cell, xc)  # reference energy and gradient
+        e,  g  = _multigrid2_energy_grad(cell, xc, 0)  # RKS with MultiGridFFTDF2
+        e1, g1 = _multigrid2_energy_grad(cell, xc, 1)  # UKS with MultiGridFFTDF2
+        assert abs(e-e0) < 1e-8
+        assert abs(e1-e0) < 1e-8
+        assert abs(g-g0).max() < 2e-5
+        assert abs(g1-g0).max() < 2e-5
+
+    def test_orth_gga(self):
+        xc = 'pbe, pbe'
+        e0, g0 = _fftdf_energy_grad(cell, xc)  # reference energy and gradient
+        e,  g  = _multigrid2_energy_grad(cell, xc, 0)  # RKS with MultiGridFFTDF2
+        e1, g1 = _multigrid2_energy_grad(cell, xc, 1)  # UKS with MultiGridFFTDF2
+        assert abs(e-e0) < 1e-6  # looser thresholds than the LDA test
+        assert abs(e1-e0) < 1e-6
+        assert abs(g-g0).max() < 1e-4
+        assert abs(g1-g0).max() < 1e-4
+
+if __name__ == '__main__':
+    print("Full Tests for multigrid2")
+    unittest.main()
diff --git a/pyscf/pbc/dft/uks.py b/pyscf/pbc/dft/uks.py
index de72d6452d..20d8d14c71 100644
--- a/pyscf/pbc/dft/uks.py
+++ b/pyscf/pbc/dft/uks.py
@@ -57,7 +57,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi,
                                        kpt.reshape(1,3), kpts_band,
                                        with_j=True, return_j=False)
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
         return vxc
 
@@ -86,7 +86,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                           0, hermi, kpt, max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        logger.info(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
     if not hybrid:
diff --git a/pyscf/pbc/grad/__init__.py b/pyscf/pbc/grad/__init__.py
index 5408a1eb50..e308bca1e1 100644
--- a/pyscf/pbc/grad/__init__.py
+++ b/pyscf/pbc/grad/__init__.py
@@ -19,7 +19,10 @@
 '''
 Analytical nuclear gradients for PBC
 '''
-
+from pyscf.pbc.grad import rhf
+from pyscf.pbc.grad import rks
+from pyscf.pbc.grad import uhf
+from pyscf.pbc.grad import uks
 from pyscf.pbc.grad import krhf
 from pyscf.pbc.grad import kuhf
 from pyscf.pbc.grad import krks
@@ -30,4 +33,4 @@
 from pyscf.pbc.grad.krks import Gradients as KRKS
 from pyscf.pbc.grad.kuks import Gradients as KUKS
 
-grad_nuc = krhf.grad_nuc
+grad_nuc = rhf.grad_nuc
diff --git a/pyscf/pbc/grad/krhf.py b/pyscf/pbc/grad/krhf.py
index 9fd628882f..0dd6a171e4 100644
--- a/pyscf/pbc/grad/krhf.py
+++ b/pyscf/pbc/grad/krhf.py
@@ -211,6 +211,10 @@ def hcore_deriv(atm_id):
 def grad_nuc(cell, atmlst):
     '''
     Derivatives of nuclear repulsion energy wrt nuclear coordinates
+
+    Notes:
+        An optimized version of this function is available in
+        `pbc.gto.ewald_methods.ewald_nuc_grad`
     '''
     chargs = cell.atom_charges()
     ew_eta, ew_cut = cell.get_ewald_params()
@@ -244,12 +248,14 @@ def grad_nuc(cell, atmlst):
     absG2[absG2==0] = 1e200
     ewg_grad = np.zeros([natom,3])
     SI = cell.get_SI(Gv)
-    if cell.low_dim_ft_type is None or cell.dimension == 3:
+    if cell.dimension != 2 or cell.low_dim_ft_type == 'inf_vacuum':
         coulG = 4*np.pi / absG2
         coulG *= weights
         ZSI = np.einsum("i,ij->j", chargs, SI)
         ZexpG2 = coulG * np.exp(-absG2/(4*ew_eta**2))
         ZexpG2_mod = ZexpG2.reshape(len(ZexpG2),1) * Gv
+    else:
+        raise NotImplementedError
     for i, qi in enumerate(chargs):
         Zfac = np.imag(ZSI * SI[i].conj()) * qi
         ewg_grad[i] = - np.sum(Zfac.reshape((len(Zfac),1)) * ZexpG2_mod, axis = 0)
diff --git a/pyscf/pbc/grad/rhf.py b/pyscf/pbc/grad/rhf.py
new file mode 100644
index 0000000000..720451b719
--- /dev/null
+++ b/pyscf/pbc/grad/rhf.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy as np
+from pyscf import __config__
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.grad import rhf as mol_rhf
+from pyscf.grad.rhf import _write
+from pyscf.pbc.gto.pseudo import pp_int
+from pyscf.pbc.lib.kpts_helper import gamma_point
+
+SCREEN_VHF_DM_CONTRA = getattr(__config__, 'pbc_rhf_grad_screen_vhf_dm_contract', True)
+libpbc = lib.load_library('libpbc')
+
+def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None,
+              atmlst=None, kpt=np.zeros(3)):  # electronic part of the nuclear gradient; Gamma point only
+    mf = mf_grad.base
+    mol = mf_grad.mol
+    if mo_energy is None: mo_energy = mf.mo_energy
+    if mo_occ is None:    mo_occ = mf.mo_occ
+    if mo_coeff is None:  mo_coeff = mf.mo_coeff
+    log = logger.Logger(mf_grad.stdout, mf_grad.verbose)
+
+    s1 = mf_grad.get_ovlp(mol, kpt)  # contracted with dme0 below
+    dm0 = mf.make_rdm1(mo_coeff, mo_occ)
+
+    t0 = (logger.process_clock(), logger.perf_counter())
+    log.debug('Computing Gradients of NR-HF Coulomb repulsion')
+    vhf = mf_grad.get_veff(mol, dm0, kpt)
+    log.timer('gradients of 2e part', *t0)
+
+    dme0 = mf_grad.make_rdm1e(mo_energy, mo_coeff, mo_occ)
+
+    if atmlst is None:
+        atmlst = range(mol.natm)  # default: all atoms
+
+    de = 0  # placeholder; overwritten in the Gamma-point branch below
+    if gamma_point(kpt):
+        de  = mf.with_df.vpploc_part1_nuc_grad(dm0, kpts=kpt.reshape(-1,3))
+        de += pp_int.vpploc_part2_nuc_grad(mol, dm0)
+        de += pp_int.vppnl_nuc_grad(mol, dm0)
+        h1ao = -mol.pbc_intor('int1e_ipkin', kpt=kpt)
+        if getattr(mf.with_df, 'vpplocG_part1', None) is None:  # only when with_df has no vpplocG_part1 set
+            h1ao += -mf.with_df.get_vpploc_part1_ip1(kpts=kpt.reshape(-1,3))
+        de += _contract_vhf_dm(mf_grad, np.add(h1ao, vhf), dm0) * 2
+        de += _contract_vhf_dm(mf_grad, s1, dme0) * -2
+        h1ao = s1 = vhf = dm0 = dme0 = None  # drop the intermediates; locals() below sees them as None
+        de = de[atmlst]  # keep only the requested atoms
+    else:
+        raise NotImplementedError  # non-Gamma k-points are not handled here
+
+    for k, ia in enumerate(atmlst):
+        de[k] += mf_grad.extra_force(ia, locals())  # hook receives this function's local variables
+
+    if log.verbose >= logger.DEBUG:
+        log.debug('gradients of electronic part')
+        _write(log, mol, de, atmlst)
+    return de
+
+
+def _contract_vhf_dm(mf_grad, vhf, dm, comp=3, atmlst=None,
+                     screen=SCREEN_VHF_DM_CONTRA):
+    '''Contract vhf with dm inside libpbc.contract_vhf_dm and return the (natm, comp) result, or its rows
+    atmlst when given.  With screen=True a shell-pair neighbor list is built and passed to the C routine.'''
+    from pyscf.gto.mole import ao_loc_nr, ATOM_OF
+    from pyscf.pbc.gto import build_neighbor_list_for_shlpairs, free_neighbor_list
+
+    t0 = (logger.process_clock(), logger.perf_counter())
+    mol = mf_grad.mol
+    natm = mol.natm
+    nbas = mol.nbas
+    shls_slice = np.asarray([0,nbas,0,nbas], order="C", dtype=np.int32)
+    ao_loc = np.asarray(ao_loc_nr(mol), order="C", dtype=np.int32)
+    shls_atm = np.asarray(mol._bas[:,ATOM_OF].copy(), order="C", dtype=np.int32)
+
+    de = np.zeros((natm,comp), order="C")
+    vhf = np.asarray(vhf, order="C")
+    dm = np.asarray(dm, order="C")
+
+    func = libpbc.contract_vhf_dm  # resolve the symbol before allocating the neighbor list
+    if screen:
+        neighbor_list = build_neighbor_list_for_shlpairs(mol)
+    else:
+        neighbor_list = lib.c_null_ptr()
+    try:
+        func(de.ctypes.data_as(ctypes.c_void_p),
+             vhf.ctypes.data_as(ctypes.c_void_p),
+             dm.ctypes.data_as(ctypes.c_void_p),
+             ctypes.byref(neighbor_list),
+             shls_slice.ctypes.data_as(ctypes.c_void_p),
+             ao_loc.ctypes.data_as(ctypes.c_void_p),
+             shls_atm.ctypes.data_as(ctypes.c_void_p),
+             ctypes.c_int(comp), ctypes.c_int(natm),
+             ctypes.c_int(nbas))
+    finally:
+        free_neighbor_list(neighbor_list)  # released even if the C call raises
+
+    if atmlst is not None:
+        de = de[atmlst]
+
+    logger.timer(mf_grad, '_contract_vhf_dm', *t0)
+    return de
+
+
+def get_ovlp(cell, kpt=np.zeros(3)):  # negated 'int1e_ipovlp' PBC integrals of cell at k-point kpt
+    return -cell.pbc_intor('int1e_ipovlp', kpt=kpt)
+
+
+def get_veff(mf_grad, mol, dm, kpt=np.zeros(3)):  # negated with_df.get_veff_ip1 for mf_grad.base
+    scf_obj = mf_grad.base
+    fitting = scf_obj.with_df
+    functional = getattr(scf_obj, 'xc', None)  # None when the SCF object has no xc attribute
+    kpt_array = kpt.reshape(-1,3)
+    return -fitting.get_veff_ip1(dm, xc_code=functional, kpts=kpt_array)
+
+
+def grad_nuc(cell, atmlst=None, ew_eta=None, ew_cut=None):
+    '''Nuclear gradient from ewald_methods.ewald_nuc_grad, optionally restricted to the atoms in atmlst.'''
+    from pyscf.pbc.gto import ewald_methods
+
+    start = (logger.process_clock(), logger.perf_counter())
+    nuc_grad = ewald_methods.ewald_nuc_grad(cell, ew_eta, ew_cut)
+    selected = nuc_grad if atmlst is None else nuc_grad[atmlst]
+
+    # timing is reported against `cell`
+    logger.timer(cell, 'nuclear gradient', *start)
+    return selected
+
+
+class GradientsBase(mol_rhf.GradientsBase):
+    '''Base class for Gamma-point nuclear gradients'''
+    def grad_nuc(self, mol=None, atmlst=None):  # module-level grad_nuc; mol defaults to self.mol
+        if mol is None: mol = self.mol
+        return grad_nuc(mol, atmlst)
+
+    def get_ovlp(self, mol=None, kpt=np.zeros(3)):  # module-level get_ovlp; mol defaults to self.mol
+        if mol is None:
+            mol = self.mol
+        return get_ovlp(mol, kpt)
+
+
+class Gradients(GradientsBase):
+    '''Non-relativistic Gamma-point restricted Hartree-Fock gradients'''
+    def get_veff(self, mol=None, dm=None, kpt=np.zeros(3)):  # defaults: self.mol and self.base.make_rdm1()
+        if mol is None: mol = self.mol
+        if dm is None: dm = self.base.make_rdm1()
+        return get_veff(self, mol, dm, kpt)
+
+    make_rdm1e = mol_rhf.Gradients.make_rdm1e  # reuse the molecular RHF implementation
+    grad_elec = grad_elec  # bind the module-level grad_elec as a method
diff --git a/pyscf/pbc/grad/rks.py b/pyscf/pbc/grad/rks.py
new file mode 100644
index 0000000000..1429050002
--- /dev/null
+++ b/pyscf/pbc/grad/rks.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+from pyscf.pbc.grad import rhf
+
+
+class Gradients(rhf.Gradients):
+    '''Non-relativistic Gamma-point restricted Kohn-Sham DFT gradients'''
+    pass  # nothing overridden: all behaviour is inherited from rhf.Gradients
diff --git a/pyscf/pbc/grad/uhf.py b/pyscf/pbc/grad/uhf.py
new file mode 100644
index 0000000000..fd71aa0920
--- /dev/null
+++ b/pyscf/pbc/grad/uhf.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy as np
+from pyscf import __config__
+from pyscf.lib import logger
+from pyscf.grad import uhf as mol_uhf
+from pyscf.grad.rhf import _write
+from pyscf.pbc.gto.pseudo import pp_int
+from pyscf.pbc.grad import rhf as rhf_grad
+from pyscf.pbc.lib.kpts_helper import gamma_point
+
+def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, kpt=np.zeros(3)):
+    mf = mf_grad.base  # this function: electronic part of the nuclear gradient; Gamma point only
+    mol = mf_grad.mol
+    if mo_energy is None: mo_energy = mf.mo_energy
+    if mo_occ is None:    mo_occ = mf.mo_occ
+    if mo_coeff is None:  mo_coeff = mf.mo_coeff
+    log = logger.Logger(mf_grad.stdout, mf_grad.verbose)
+
+    s1 = mf_grad.get_ovlp(mol, kpt)  # contracted with dme0_sf below
+    dm0 = mf.make_rdm1(mo_coeff, mo_occ)
+
+    t0 = (logger.process_clock(), logger.perf_counter())
+    log.debug('Computing Gradients of NR-HF Coulomb repulsion')
+    vhf = mf_grad.get_veff(mol, dm0, kpt)
+    log.timer('gradients of 2e part', *t0)
+
+    dme0 = mf_grad.make_rdm1e(mo_energy, mo_coeff, mo_occ)
+    dm0_sf = dm0[0] + dm0[1]  # sum of the two spin components
+    dme0_sf = dme0[0] + dme0[1]  # sum of the two spin components
+
+    if atmlst is None:
+        atmlst = range(mol.natm)  # default: all atoms
+
+    de = 0  # placeholder; overwritten in the Gamma-point branch below
+    if gamma_point(kpt):
+        de  = mf.with_df.vpploc_part1_nuc_grad(dm0_sf, kpts=kpt.reshape(-1,3))
+        de += pp_int.vpploc_part2_nuc_grad(mol, dm0_sf)
+        de += pp_int.vppnl_nuc_grad(mol, dm0_sf)
+        h1ao = -mol.pbc_intor('int1e_ipkin', kpt=kpt)
+        if getattr(mf.with_df, 'vpplocG_part1', None) is None:  # only when with_df has no vpplocG_part1 set
+            h1ao += -mf.with_df.get_vpploc_part1_ip1(kpts=kpt.reshape(-1,3))
+        de += rhf_grad._contract_vhf_dm(mf_grad, h1ao, dm0_sf) * 2
+        for s in range(2):  # vhf is contracted per spin component
+            de += rhf_grad._contract_vhf_dm(mf_grad, vhf[s], dm0[s]) * 2
+        de += rhf_grad._contract_vhf_dm(mf_grad, s1, dme0_sf) * -2
+        h1ao = s1 = vhf = dm0 = dme0 = dm0_sf = dme0_sf = None  # drop intermediates; locals() sees None
+        de = de[atmlst]  # keep only the requested atoms
+    else:
+        raise NotImplementedError  # non-Gamma k-points are not handled here
+
+    for k, ia in enumerate(atmlst):
+        de[k] += mf_grad.extra_force(ia, locals())  # hook receives this function's local variables
+
+    if log.verbose >= logger.DEBUG:
+        log.debug('gradients of electronic part')
+        _write(log, mol, de, atmlst)
+    return de
+
+def get_veff(mf_grad, mol, dm, kpt=np.zeros(3)):  # negated with_df.get_veff_ip1 (spin=1) for mf_grad.base
+    scf_obj = mf_grad.base
+    fitting = scf_obj.with_df
+    functional = getattr(scf_obj, 'xc', None)  # None when the SCF object has no xc attribute
+    kpt_array = kpt.reshape(-1,3)
+    return -fitting.get_veff_ip1(dm, xc_code=functional, kpts=kpt_array, spin=1)
+
+class Gradients(rhf_grad.GradientsBase):
+    '''Non-relativistic Gamma-point unrestricted Hartree-Fock gradients'''
+    def get_veff(self, mol=None, dm=None, kpt=np.zeros(3)):  # defaults: self.mol and self.base.make_rdm1()
+        if mol is None: mol = self.mol
+        if dm is None: dm = self.base.make_rdm1()
+        return get_veff(self, mol, dm, kpt)
+
+    make_rdm1e = mol_uhf.Gradients.make_rdm1e  # reuse the molecular UHF implementation
+    grad_elec = grad_elec  # bind the module-level grad_elec as a method
diff --git a/pyscf/pbc/grad/uks.py b/pyscf/pbc/grad/uks.py
new file mode 100644
index 0000000000..4a6ce67c1a
--- /dev/null
+++ b/pyscf/pbc/grad/uks.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+from pyscf.pbc.grad import uhf
+
+
+class Gradients(uhf.Gradients):
+    '''Non-relativistic Gamma-point unrestricted Kohn-Sham DFT gradients'''
+    pass  # nothing overridden: all behaviour is inherited from uhf.Gradients
diff --git a/pyscf/pbc/gto/__init__.py b/pyscf/pbc/gto/__init__.py
index dcaaddebbc..769b76c616 100644
--- a/pyscf/pbc/gto/__init__.py
+++ b/pyscf/pbc/gto/__init__.py
@@ -22,6 +22,7 @@
 from pyscf.pbc.gto.basis import parse, load, parse_ecp, load_ecp
 from pyscf.pbc.gto import pseudo
 from pyscf.pbc.gto.cell import *
+from pyscf.pbc.gto.neighborlist import *
 
 parse_pp = parsepp = pseudo.parse
 load_pp = loadpp = pseudo.load
diff --git a/pyscf/pbc/gto/_pbcintor.py b/pyscf/pbc/gto/_pbcintor.py
index f721eb0304..c5b921b2e0 100644
--- a/pyscf/pbc/gto/_pbcintor.py
+++ b/pyscf/pbc/gto/_pbcintor.py
@@ -33,15 +33,21 @@ def __init__(self, cell):
 
     def init_rcut_cond(self, cell, precision=None):
         if precision is None: precision = cell.precision
-        rcut = numpy.array([cell.bas_rcut(ib, precision)
-                            for ib in range(cell.nbas)])
+        if cell.use_loose_rcut:
+            rcut = cell.rcut_by_shells(precision)
+            fn_set_rcut_cond = getattr(libpbc, 'PBCset_rcut_cond_loose')
+        else:
+            rcut = numpy.array([cell.bas_rcut(ib, precision)
+                                for ib in range(cell.nbas)])
+            fn_set_rcut_cond = getattr(libpbc, 'PBCset_rcut_cond')
+
         natm = ctypes.c_int(cell._atm.shape[0])
         nbas = ctypes.c_int(cell._bas.shape[0])
-        libpbc.PBCset_rcut_cond(self._this,
-                                rcut.ctypes.data_as(ctypes.c_void_p),
-                                cell._atm.ctypes.data_as(ctypes.c_void_p), natm,
-                                cell._bas.ctypes.data_as(ctypes.c_void_p), nbas,
-                                cell._env.ctypes.data_as(ctypes.c_void_p))
+        fn_set_rcut_cond(self._this,
+                         rcut.ctypes.data_as(ctypes.c_void_p),
+                         cell._atm.ctypes.data_as(ctypes.c_void_p), natm,
+                         cell._bas.ctypes.data_as(ctypes.c_void_p), nbas,
+                         cell._env.ctypes.data_as(ctypes.c_void_p))
         return self
 
     def del_rcut_cond(self):
@@ -56,4 +62,5 @@ def __del__(self):
 
 class _CPBCOpt(ctypes.Structure):
     _fields_ = [('rrcut', ctypes.c_void_p),
+                ('rcut', ctypes.c_void_p),  # NOTE(review): field order presumably mirrors the C struct - confirm
                 ('fprescreen', ctypes.c_void_p)]
diff --git a/pyscf/pbc/gto/cell.py b/pyscf/pbc/gto/cell.py
index 87282fbfd4..872fda36e2 100644
--- a/pyscf/pbc/gto/cell.py
+++ b/pyscf/pbc/gto/cell.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2021 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -41,6 +41,9 @@
 WITH_GAMMA = getattr(__config__, 'pbc_gto_cell_make_kpts_with_gamma', True)
 EXP_DELIMITER = getattr(__config__, 'pbc_gto_cell_split_basis_exp_delimiter',
                         [1.0, 0.5, 0.25, 0.1, 0])
+# defined in lib/pbc/cell.h
+RCUT_EPS = 1e-3  # default `eps` (convergence tolerance) of pgf_rcut
+RCUT_MAX_CYCLE = 10  # default `max_cycle` (iteration cap) of pgf_rcut
 
 libpbc = _pbcintor.libpbc
 
@@ -281,6 +284,89 @@ def intor_cross(intor, cell1, cell2, comp=None, hermi=0, kpts=None, kpt=None,
         mat = mat[0]
     return mat
 
+def _intor_cross_screened(
+        intor, cell1, cell2, comp=None, hermi=0, kpts=None, kpt=None,
+        shls_slice=None, **kwargs):
+    '''`intor_cross` with prescreening driven by a shell-pair neighbor list.
+
+    Notes:
+         This function may be subject to change. A prebuilt list can be passed
+         via the `neighbor_list` keyword; otherwise one is built internally.'''
+    from pyscf.pbc.gto.neighborlist import NeighborListOpt
+    intor, comp = moleintor._get_intor_and_comp(cell1._add_suffix(intor), comp)
+    # Normalise kpt/kpts to an (nkpts,3) array; default is a single zero k-point.
+    if kpts is None:
+        if kpt is not None:
+            kpts_lst = np.reshape(kpt, (1,3))
+        else:
+            kpts_lst = np.zeros((1,3))
+    else:
+        kpts_lst = np.reshape(kpts, (-1,3))
+    nkpts = len(kpts_lst)
+    # pcell carries the concatenated atm/bas/env tables of cell1 and cell2.
+    pcell = cell1.copy(deep=False)
+    pcell.precision = min(cell1.precision, cell2.precision)
+    pcell._atm, pcell._bas, pcell._env = \
+            atm, bas, env = conc_env(cell1._atm, cell1._bas, cell1._env,
+                                     cell2._atm, cell2._bas, cell2._env)
+    if shls_slice is None:
+        shls_slice = (0, cell1.nbas, 0, cell2.nbas)
+    i0, i1, j0, j1 = shls_slice[:4]
+    j0 += cell1.nbas  # ket shell ids are offset by cell1.nbas in the merged tables
+    j1 += cell1.nbas
+    ao_loc = moleintor.make_loc(bas, intor)
+    ni = ao_loc[i1] - ao_loc[i0]
+    nj = ao_loc[j1] - ao_loc[j0]
+    out = np.empty((nkpts,comp,ni,nj), dtype=np.complex128)
+
+    if hermi == 0:
+        aosym = 's1'
+    else:
+        aosym = 's2'
+    fill = getattr(libpbc, 'PBCnr2c_screened_fill_k'+aosym)
+    fintor = getattr(moleintor.libcgto, intor)
+    drv = libpbc.PBCnr2c_screened_drv
+    # Lattice translations within the larger rcut and their phases exp(i k.L).
+    rcut = max(cell1.rcut, cell2.rcut)
+    Ls = cell1.get_lattice_Ls(rcut=rcut)
+    expkL = np.asarray(np.exp(1j*np.dot(kpts_lst, Ls.T)), order='C')
+
+    neighbor_list = kwargs.get('neighbor_list', None)
+    if neighbor_list is None:
+        nlopt = NeighborListOpt(cell1)
+        nlopt.build(cell1, cell2, Ls, set_optimizer=False)
+        neighbor_list = nlopt.nl
+
+    cintopt = lib.c_null_ptr()
+
+    drv(fintor, fill, out.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_int(nkpts), ctypes.c_int(comp), ctypes.c_int(len(Ls)),
+        Ls.ctypes.data_as(ctypes.c_void_p),
+        expkL.ctypes.data_as(ctypes.c_void_p),
+        (ctypes.c_int*4)(i0, i1, j0, j1),
+        ao_loc.ctypes.data_as(ctypes.c_void_p), cintopt,
+        atm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(pcell.natm),
+        bas.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(pcell.nbas),
+        env.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(env.size),
+        ctypes.byref(neighbor_list))
+    # Rebinding drops the locally built optimizer; its __del__ frees the C list.
+    nlopt = None
+
+    mat = []
+    for k, kpt in enumerate(kpts_lst):
+        v = out[k]
+        if hermi != 0:
+            for ic in range(comp):
+                lib.hermi_triu(v[ic], hermi=hermi, inplace=True)
+        if comp == 1:
+            v = v[0]
+        if abs(kpt).sum() < 1e-9:  # gamma_point
+            v = v.real
+        mat.append(v)
+
+    if kpts is None or np.shape(kpts) == (3,):  # A single k-point
+        mat = mat[0]
+    return mat
 
 def get_nimgs(cell, precision=None):
     r'''Choose number of basis function images in lattice sums
@@ -339,6 +425,9 @@ def estimate_rcut(cell, precision=None):
         return 0.01
     if precision is None:
         precision = cell.precision
+    if cell.use_loose_rcut:
+        return cell.rcut_by_shells(precision).max()
+
     exps, cs = _extract_pgto_params(cell, 'min')
     ls = cell._bas[:,mole.ANG_OF]
     rcut = _estimate_rcut(exps, ls, cs, precision)
@@ -491,7 +580,24 @@ def get_Gv_weights(cell, mesh=None, **kwargs):
             weights = np.einsum('i,k->ik', wxy, wz).reshape(-1)
 
     Gvbase = (rx, ry, rz)
-    Gv = np.dot(lib.cartesian_prod(Gvbase), b)
+
+    #:Gv = np.dot(lib.cartesian_prod(Gvbase), b)
+    # NOTE mesh can be different from the input mesh
+    mesh = np.asarray((len(rx),len(ry),len(rz)), dtype=np.int32)
+    Gv = np.empty((*mesh,3), order='C', dtype=float)
+    b = np.asarray(b, order='C')
+    rx = np.asarray(rx, order='C')
+    ry = np.asarray(ry, order='C')
+    rz = np.asarray(rz, order='C')
+    fn = libpbc.get_Gv
+    fn(Gv.ctypes.data_as(ctypes.c_void_p),
+       rx.ctypes.data_as(ctypes.c_void_p),
+       ry.ctypes.data_as(ctypes.c_void_p),
+       rz.ctypes.data_as(ctypes.c_void_p),
+       mesh.ctypes.data_as(ctypes.c_void_p),
+       b.ctypes.data_as(ctypes.c_void_p))
+    Gv = Gv.reshape(-1, 3)
+
     # 1/cell.vol == det(b)/(2pi)^3
     weights *= 1/(2*np.pi)**3
     return Gv, Gvbase, weights
@@ -504,7 +610,7 @@ def _non_uniform_Gv_base(n):
     #return np.hstack((0,rs,-rs[::-1])), np.hstack((0,ws,ws[::-1]))
     return np.hstack((rs,-rs[::-1])), np.hstack((ws,ws[::-1]))
 
-def get_SI(cell, Gv=None, mesh=None):
+def get_SI(cell, Gv=None, mesh=None, atmlst=None):
     '''Calculate the structure factor (0D, 1D, 2D, 3D) for all atoms; see MH (3.34).
 
     Args:
@@ -513,11 +619,16 @@ def get_SI(cell, Gv=None, mesh=None):
         Gv : (N,3) array
             G vectors
 
+        atmlst : list of ints, optional
+            Indices of atoms for which the structure factors are computed.
+
     Returns:
         SI : (natm, ngrids) ndarray, dtype=np.complex128
             The structure factor for each atom at each G-vector.
     '''
     coords = cell.atom_coords()
+    if atmlst is not None:
+        coords = coords[np.asarray(atmlst)]
     if Gv is None:
         if mesh is None:
             mesh = cell.mesh
@@ -598,6 +709,10 @@ def ewald(cell, ew_eta=None, ew_cut=None):
     if cell.natm == 0:
         return 0
 
+    if cell.dimension == 3 and cell.use_particle_mesh_ewald:
+        from pyscf.pbc.gto import ewald_methods
+        return ewald_methods.particle_mesh_ewald(cell, ew_eta, ew_cut)
+
     chargs = cell.atom_charges()
 
     if ew_eta is None or ew_cut is None:
@@ -639,7 +754,16 @@ def ewald(cell, ew_eta=None, ew_cut=None):
         # have relatively large error
         coulG = 4*np.pi / absG2
         coulG *= weights
-        ZSI = np.einsum("i,ij->j", chargs, cell.get_SI(Gv))
+
+        #:ZSI = np.einsum('i,ij->j', chargs, cell.get_SI(Gv))
+        ngrids = len(Gv)
+        ZSI = np.empty((ngrids,), dtype=np.complex128)
+        mem_avail = cell.max_memory - lib.current_memory()[0]
+        blksize = int((mem_avail*1e6 - cell.natm*24)/((3+cell.natm*2)*8))
+        blksize = min(ngrids, max(mesh[2], blksize))
+        for ig0, ig1 in lib.prange(0, ngrids, blksize):
+            np.einsum('i,ij->j', chargs, cell.get_SI(Gv[ig0:ig1]), out=ZSI[ig0:ig1])
+
         ZexpG2 = ZSI * np.exp(-absG2/(4*ew_eta**2))
         ewg = .5 * np.einsum('i,i,i', ZSI.conj(), ZexpG2, coulG).real
 
@@ -835,6 +959,59 @@ def _mesh_inf_vaccum(cell):
     # meshz has to be even number due to the symmetry on z+ and z-
     return int(meshz*.5 + .999) * 2
 
+def pgf_rcut(l, alpha, coeff, precision=INTEGRAL_PRECISION,
+             rcut=0, max_cycle=RCUT_MAX_CYCLE, eps=RCUT_EPS):
+    '''Fixed-point estimate of the real-space cutoff radii of primitive
+    Gaussian functions, i.e. the `rcut` satisfying
+    `c*rcut^(l+2)*exp(-alpha*rcut^2) ~ precision`.
+    '''
+    log_ratio = np.log(coeff / precision)
+    power = l + 2
+    r_floor = 2 * np.sqrt(.5 * power / alpha)
+    tol = np.minimum(r_floor/10, eps)
+    radius = np.maximum(rcut, r_floor+tol)
+    for _ in range(max_cycle):
+        previous = radius
+        radius = np.sqrt((power * np.log(previous) + log_ratio) / alpha)
+        if np.all(abs(radius - previous) < tol):
+            return radius
+    warnings.warn(f'cell.pgf_rcut failed to converge in {max_cycle} cycles.')
+    return radius
+
+def rcut_by_shells(cell, precision=None, rcut=0,
+                   return_pgf_radius=False):
+    '''Compute shell radii and, when `return_pgf_radius` is set, also the
+    (nbas, max nprim) primitive gaussian function radii via libpbc.'''
+    # TODO the internal implementation loops over all shells,
+    # which can be optimized to loop over atom types.
+    if precision is None:
+        precision = cell.precision
+    # C-contiguous basis/env tables whose buffers are handed to the C routine.
+    bas = np.asarray(cell._bas, order='C')
+    env = np.asarray(cell._env, order='C')
+    nbas = len(bas)
+    shell_radius = np.empty((nbas,), order='C', dtype=float)
+    if return_pgf_radius:
+        nprim = bas[:,mole.NPRIM_OF].max()
+        # be careful that the unused memory blocks are not initialized
+        pgf_radius = np.empty((nbas,nprim), order='C', dtype=np.double)
+        ptr_pgf_radius = lib.ndarray_pointer_2d(pgf_radius)
+    else:
+        ptr_pgf_radius = lib.c_null_ptr()
+    fn = getattr(libpbc, 'rcut_by_shells', None)
+    try:
+        fn(shell_radius.ctypes.data_as(ctypes.c_void_p),
+           ptr_pgf_radius,
+           bas.ctypes.data_as(ctypes.c_void_p),
+           env.ctypes.data_as(ctypes.c_void_p),
+           ctypes.c_int(nbas), ctypes.c_double(rcut),
+           ctypes.c_double(precision))
+    except Exception as e:
+        raise RuntimeError(f'Failed to get shell radii.\n{e}')
+    if return_pgf_radius:
+        return shell_radius, pgf_radius
+    return shell_radius
+
 
 class Cell(mole.MoleBase):
     '''A Cell object holds the basic information of a crystal.
@@ -864,6 +1041,14 @@ class Cell(mole.MoleBase):
             infinity vacuum (inf_vacuum) or truncated Coulomb potential
             (analytic_2d_1). Unless explicitly specified, analytic_2d_1 is
             used for 2D system and inf_vacuum is assumed for 1D and 0D.
+        use_loose_rcut : bool
+            If set to True, a loose `rcut` determined by shell radius is used,
+            which is usually accurate enough for pure DFT calculations;
+            otherwise, a tight `rcut` determined by overlap integral is used.
+            Default value is False. Has no effect if `rcut` is set manually.
+        use_particle_mesh_ewald : bool
+            If set to True, use particle-mesh Ewald to compute the nuclear repulsion.
+            Default value is False, meaning to use classical Ewald summation.
         space_group_symmetry : bool
             Whether to consider space group symmetry. Default is False.
         symmorphic : bool
@@ -892,6 +1077,7 @@ class Cell(mole.MoleBase):
         'precision', 'exp_to_discard',
         'a', 'ke_cutoff', 'pseudo', 'dimension', 'low_dim_ft_type',
         'space_group_symmetry', 'symmorphic', 'lattice_symmetry', 'mesh', 'rcut',
+        'use_loose_rcut', 'use_particle_mesh_ewald',
     }
 
     def __init__(self, **kwargs):
@@ -906,6 +1092,8 @@ def __init__(self, **kwargs):
         #       density-fitting class.  This determines how the ewald produces
         #       its energy.
         self.low_dim_ft_type = None
+        self.use_loose_rcut = False
+        self.use_particle_mesh_ewald = False
         self.space_group_symmetry = False
         self.symmorphic = False
         self.lattice_symmetry = None
@@ -1082,7 +1270,9 @@ def build_lattice_symmetry(self, check_mesh_symmetry=True):
     def build(self, dump_input=True, parse_arg=mole.ARGPARSE,
               a=None, mesh=None, ke_cutoff=None, precision=None, nimgs=None,
               h=None, dimension=None, rcut= None, low_dim_ft_type=None,
-              space_group_symmetry=None, symmorphic=None, *args, **kwargs):
+              space_group_symmetry=None, symmorphic=None,
+              use_loose_rcut=None, use_particle_mesh_ewald=None,
+              *args, **kwargs):
         '''Setup Mole molecule and Cell and initialize some control parameters.
         Whenever you change the value of the attributes of :class:`Cell`,
         you need call this function to refresh the internal data of Cell.
@@ -1133,6 +1323,10 @@ def build(self, dump_input=True, parse_arg=mole.ARGPARSE,
         if rcut is not None: self.rcut = rcut
         if ke_cutoff is not None: self.ke_cutoff = ke_cutoff
         if low_dim_ft_type is not None: self.low_dim_ft_type = low_dim_ft_type
+        if use_loose_rcut is not None:
+            self.use_loose_rcut = use_loose_rcut
+        if use_particle_mesh_ewald is not None:
+            self.use_particle_mesh_ewald = use_particle_mesh_ewald
         if space_group_symmetry is not None:
             self.space_group_symmetry = space_group_symmetry
         if symmorphic is not None:
@@ -1265,7 +1459,7 @@ def build(self, dump_input=True, parse_arg=mole.ARGPARSE,
             logger.info(self, 'Cell volume = %g', self.vol)
             # Check atoms coordinates
             if self.dimension > 0 and self.natm > 0:
-                scaled_atom_coords = np.linalg.solve(_a.T, self.atom_coords().T).T
+                scaled_atom_coords = self.get_scaled_atom_coords(_a)
                 atom_boundary_max = scaled_atom_coords[:,:self.dimension].max(axis=0)
                 atom_boundary_min = scaled_atom_coords[:,:self.dimension].min(axis=0)
                 if (np.any(atom_boundary_max > 1) or np.any(atom_boundary_min < -1)):
@@ -1367,13 +1561,12 @@ def lattice_vectors(self):
         else:
             return a/self.unit
 
-    def get_scaled_positions(self):
-        ''' Get scaled atom positions.
+    def get_scaled_atom_coords(self, a=None):
+        ''' Get atomic coordinates scaled by the inverse of lattice vectors `a`.
         '''
-        a = self.lattice_vectors()
-        atm_pos = self.atom_coords()
-        scaled_atm_pos = np.dot(atm_pos,np.linalg.inv(a))
-        return scaled_atm_pos
+        if a is None:  # default to this cell's own lattice vectors
+            a = self.lattice_vectors()
+        return np.dot(self.atom_coords(), np.linalg.inv(a))
 
     def reciprocal_vectors(self, norm_to=2*np.pi):
         r'''
@@ -1475,6 +1668,7 @@ def loads_(self, molstr):
         return self
 
     bas_rcut = bas_rcut
+    rcut_by_shells = rcut_by_shells
 
     get_lattice_Ls = pbctools.get_lattice_Ls
 
@@ -1511,6 +1705,10 @@ def pbc_intor(self, intor, comp=None, hermi=0, kpts=None, kpt=None,
             # FIXME: Whether to check _built and call build?  ._bas and .basis
             # may not be consistent. calling .build() may leads to wrong intor env.
             #self.build(False, False)
+        if self.use_loose_rcut:
+            return _intor_cross_screened(
+                            intor, self, self, comp, hermi, kpts, kpt,
+                            shls_slice, **kwargs)
         return intor_cross(intor, self, self, comp, hermi, kpts, kpt,
                            shls_slice, **kwargs)
 
@@ -1551,6 +1749,7 @@ def to_mol(self):
         mol = self.view(mole.Mole)
         delattr(mol, 'a')
         delattr(mol, '_mesh')
+        mol.enuc = None #reset nuclear energy
         if mol.symmetry:
             mol._build_symmetry()
         return mol
diff --git a/pyscf/pbc/gto/ewald_methods.py b/pyscf/pbc/gto/ewald_methods.py
new file mode 100644
index 0000000000..75d028a564
--- /dev/null
+++ b/pyscf/pbc/gto/ewald_methods.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy as np
+import scipy
+from pyscf import __config__
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.gto import mole
+from pyscf.pbc import tools
+
+libpbc = lib.load_library('libpbc')
+# Default `order` of the particle-mesh Ewald routines (passed to bspline as `n`).
+INTERPOLATION_ORDER = getattr(__config__, 'pyscf_pbc_ewald_bspline_order', 10)
+
+def _bspline(u, n=4):
+    '''Cardinal B-spline of order `n` at `u` (truncated-power formula).'''
+    from scipy import special  # bare `import scipy` may not expose submodules
+    M = 0
+    for k in range(n+1):
+        fac1 = ((-1)**k) * special.binom(n, k)
+        M += fac1 * ((np.maximum(u-k, 0)) ** (n-1))
+    return M * (1. / special.factorial(n-1))
+
+def _bspline_grad(u, n=4):
+    r'''Derivative of the order-`n` B-spline, evaluated as
+
+    .. math::
+        \frac{dM_n}{du} = M_{n-1}(u) - M_{n-1}(u-1)
+    '''
+    return _bspline(u, n-1) - _bspline(u-1, n-1)
+
+def bspline(u, ng, n=4, deriv=0):
+    '''B-spline weights on a periodic grid of `ng` points.
+    Returns `(M, b, idx)`: `M` is the (u.size, ng) weight matrix (or the
+    list `[M, dM]` when `deriv=1`), `b` the length-`ng` complex prefactor,
+    and `idx` the list of `n` grid-index arrays touched by each point.
+    '''
+    u = np.asarray(u).ravel()
+    base = np.floor(u)
+    frac = u - base
+    rows = np.arange(u.size)
+    idx = [np.rint((base - i) % ng).astype(int) for i in range(n)]
+    val = [frac + i for i in range(n)]
+    M = np.zeros((u.size, ng))
+    for cols, arg in zip(idx, val):
+        M[rows,cols] += _bspline(arg, n)
+    if deriv > 0:
+        if deriv > 1:
+            raise NotImplementedError
+        dM = np.zeros((u.size, ng))
+        for cols, arg in zip(idx, val):
+            dM[rows,cols] += _bspline_grad(arg, n)
+        M = [M, dM]
+    m = np.arange(ng)
+    b = np.exp(2*np.pi*1j*(n-1)*m/ng)
+    tmp = 0
+    for k in range(n-1):
+        tmp += _bspline(k+1, n) * np.exp(2*np.pi*1j*m*k/ng)
+    b /= tmp
+    if n % 2 > 0 and ng % 2 == 0 :
+        b[ng//2] = 0
+    return M, b, idx
+
+def _get_ewald_direct(cell, ew_eta=None, ew_cut=None):
+    '''Call libpbc `get_ewald_direct` over the lattice images within
+    `ew_cut` and return the scalar it writes (Ewald parameters default to
+    `cell.get_ewald_params()` when either one is missing).
+    '''
+    if ew_eta is None or ew_cut is None:
+        ew_eta, ew_cut = cell.get_ewald_params()
+    charges = np.asarray(cell.atom_charges(), order='C', dtype=float)
+    positions = np.asarray(cell.atom_coords(), order='C')
+    images = np.asarray(cell.get_lattice_Ls(rcut=ew_cut), order='C')
+    result = np.zeros([1])  # single-element output buffer for the C routine
+    libpbc.get_ewald_direct(
+        result.ctypes.data_as(ctypes.c_void_p),
+        charges.ctypes.data_as(ctypes.c_void_p),
+        positions.ctypes.data_as(ctypes.c_void_p),
+        images.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_double(ew_eta), ctypes.c_double(ew_cut),
+        ctypes.c_int(len(charges)), ctypes.c_int(len(images)))
+    return result[0]
+
+def _get_ewald_direct_nuc_grad(cell, ew_eta=None, ew_cut=None):
+    '''Call libpbc `get_ewald_direct_nuc_grad` over the lattice images within
+    `ew_cut` and return the (natm,3) array it fills (Ewald parameters default
+    to `cell.get_ewald_params()` when either one is missing).
+    '''
+    if ew_eta is None or ew_cut is None:
+        ew_eta, ew_cut = cell.get_ewald_params()
+    charges = np.asarray(cell.atom_charges(), order='C', dtype=float)
+    positions = np.asarray(cell.atom_coords(), order='C')
+    images = np.asarray(cell.get_lattice_Ls(rcut=ew_cut), order='C')
+    gradient = np.zeros([len(charges),3], order='C', dtype=float)
+    libpbc.get_ewald_direct_nuc_grad(
+        gradient.ctypes.data_as(ctypes.c_void_p),
+        charges.ctypes.data_as(ctypes.c_void_p),
+        positions.ctypes.data_as(ctypes.c_void_p),
+        images.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_double(ew_eta), ctypes.c_double(ew_cut),
+        ctypes.c_int(len(charges)), ctypes.c_int(len(images)))
+    return gradient
+
+
+# Particle-mesh Ewald energy. FIXME The default interpolation order may be too high
+def particle_mesh_ewald(cell, ew_eta=None, ew_cut=None,
+                        order=INTERPOLATION_ORDER):
+    if cell.dimension != 3:
+        raise NotImplementedError("Particle mesh ewald only works for 3D.")
+
+    chargs = cell.atom_charges()
+    coords = cell.atom_coords()
+    natm = len(coords)
+
+    if ew_eta is None or ew_cut is None:
+        ew_eta, ew_cut = cell.get_ewald_params()
+    log_precision = np.log(cell.precision / (chargs.sum()*16*np.pi**2))
+    ke_cutoff = -2*ew_eta**2*log_precision
+    mesh = cell.cutoff_to_mesh(ke_cutoff)
+    # ewovrl comes from the C routine; ewself is evaluated in closed form here.
+    ewovrl = _get_ewald_direct(cell, ew_eta, ew_cut)
+    ewself  = -.5 * np.dot(chargs,chargs) * 2 * ew_eta / np.sqrt(np.pi)
+    if cell.dimension == 3:  # always true here: non-3D cells were rejected above
+        ewself += -.5 * np.sum(chargs)**2 * np.pi/(ew_eta**2 * cell.vol)
+    # u: coordinates projected on b and scaled by the mesh size along each axis.
+    b = cell.reciprocal_vectors(norm_to=1)
+    u = np.dot(coords, b.T) * mesh[None,:]
+    # Per-axis B-spline weights M, prefactors b and the grid indices touched.
+    Mx, bx, idx = bspline(u[:,0], mesh[0], order)
+    My, by, idy = bspline(u[:,1], mesh[1], order)
+    Mz, bz, idz = bspline(u[:,2], mesh[2], order)
+    # Transpose idx* to (natm, order) and pick the weights at those grid points.
+    idx = np.asarray(idx).T
+    idy = np.asarray(idy).T
+    idz = np.asarray(idz).T
+    Mx_s = Mx[np.arange(natm)[:,None], idx]
+    My_s = My[np.arange(natm)[:,None], idy]
+    Mz_s = Mz[np.arange(natm)[:,None], idz]
+
+    #:Q = np.einsum('i,ix,iy,iz->xyz', chargs, Mx, My, Mz)
+    Q = np.zeros([*mesh])
+    for ia in range(len(chargs)):
+        Q_s = np.einsum('x,y,z->xyz', Mx_s[ia], My_s[ia], Mz_s[ia])
+        Q[np.ix_(idx[ia], idy[ia], idz[ia])] += chargs[ia] * Q_s
+
+    B = np.einsum('x,y,z->xyz', bx*bx.conj(), by*by.conj(), bz*bz.conj())
+
+    Gv, Gvbase, weights = cell.get_Gv_weights(mesh)
+    absG2 = np.einsum('ix,ix->i', Gv, Gv)
+    absG2[absG2==0] = 1e200
+    coulG = 4*np.pi / absG2
+    C = weights * coulG * np.exp(-absG2/(4*ew_eta**2))
+    C = C.reshape(*mesh)
+    # ifft(Q) is scaled by B*C, transformed back with fft and contracted with Q.
+    Q_ifft = tools.ifft(Q, mesh).reshape(*mesh)
+    tmp = tools.fft(B * C * Q_ifft, mesh).real.reshape(*mesh)
+    ewg = 0.5 * np.prod(mesh) * np.einsum('xyz,xyz->', Q, tmp)
+
+    logger.debug(cell, 'Ewald components = %.15g, %.15g, %.15g', ewovrl, ewself, ewg)
+    return ewovrl + ewself + ewg
+
+def particle_mesh_ewald_nuc_grad(cell, ew_eta=None, ew_cut=None,
+                                 order=INTERPOLATION_ORDER):
+    if cell.dimension != 3:
+        raise NotImplementedError("Particle mesh ewald only works for 3D.")
+    # Returns the (natm,3) sum of the direct-space and mesh-based gradients.
+    chargs = cell.atom_charges()
+    coords = cell.atom_coords()
+
+    if ew_eta is None or ew_cut is None:
+        ew_eta, ew_cut = cell.get_ewald_params()
+    log_precision = np.log(cell.precision / (chargs.sum()*16*np.pi**2))
+    ke_cutoff = -2*ew_eta**2*log_precision
+    mesh = cell.cutoff_to_mesh(ke_cutoff)
+
+    grad_dir = _get_ewald_direct_nuc_grad(cell, ew_eta, ew_cut)
+
+    b = cell.reciprocal_vectors(norm_to=1)
+    u = np.dot(coords, b.T) * mesh[None,:]
+    # deriv=1 makes bspline return [weights, first derivatives] for each axis.
+    [Mx, dMx], bx, idx = bspline(u[:,0], mesh[0], order, deriv=1)
+    [My, dMy], by, idy = bspline(u[:,1], mesh[1], order, deriv=1)
+    [Mz, dMz], bz, idz = bspline(u[:,2], mesh[2], order, deriv=1)
+
+    idx = np.asarray(idx).T
+    idy = np.asarray(idy).T
+    idz = np.asarray(idz).T
+    Mx_s = Mx[np.indices(idx.shape)[0], idx]
+    My_s = My[np.indices(idy.shape)[0], idy]
+    Mz_s = Mz[np.indices(idz.shape)[0], idz]
+    dMx_s = dMx[np.indices(idx.shape)[0], idx]
+    dMy_s = dMy[np.indices(idy.shape)[0], idy]
+    dMz_s = dMz[np.indices(idz.shape)[0], idz]
+
+    Q = np.zeros([*mesh])
+    for ia in range(len(chargs)):
+        Q_s = np.einsum('x,y,z->xyz', Mx_s[ia], My_s[ia], Mz_s[ia])
+        Q[np.ix_(idx[ia], idy[ia], idz[ia])] += chargs[ia] * Q_s
+
+    B = np.einsum('x,y,z->xyz', bx*bx.conj(), by*by.conj(), bz*bz.conj())
+
+    Gv, Gvbase, weights = cell.get_Gv_weights(mesh)
+    absG2 = np.einsum('ix,ix->i', Gv, Gv)
+    absG2[absG2==0] = 1e200
+    coulG = 4*np.pi / absG2
+    C = weights * coulG * np.exp(-absG2/(4*ew_eta**2))
+    C = C.reshape(*mesh)
+
+    Q_ifft = tools.ifft(Q, mesh).reshape(*mesh)
+    tmp = tools.fft(B * C * Q_ifft, mesh).real.reshape(*mesh)
+
+    ng = np.prod(mesh)
+    bK = b * mesh[:,None]  # row i is d(u[:,i])/d(coords), from u = coords.b^T * mesh
+    grad_rec = np.zeros_like(grad_dir)
+    for ia in range(len(chargs)):
+        mask = np.ix_(idx[ia], idy[ia], idz[ia])
+        dQ_s = np.einsum('x,y,z->xyz', dMx_s[ia], My_s[ia], Mz_s[ia])
+        dQdr = np.einsum('x,abc->xabc', bK[0], dQ_s)
+        grad_rec[ia] += np.einsum('xabc,abc->x', dQdr, tmp[mask])
+
+        dQ_s = np.einsum('x,y,z->xyz', Mx_s[ia], dMy_s[ia], Mz_s[ia])
+        dQdr = np.einsum('x,abc->xabc', bK[1], dQ_s)
+        grad_rec[ia] += np.einsum('xabc,abc->x', dQdr, tmp[mask])
+
+        dQ_s = np.einsum('x,y,z->xyz', Mx_s[ia], My_s[ia], dMz_s[ia])
+        dQdr = np.einsum('x,abc->xabc', bK[2], dQ_s)
+        grad_rec[ia] += np.einsum('xabc,abc->x', dQdr, tmp[mask])
+
+        grad_rec[ia] *= chargs[ia] * ng
+
+    # reciprocal space summation does not conserve momentum
+    shift = -np.sum(grad_rec, axis=0) / len(grad_rec)
+    logger.debug(cell, f'Shift ewald nuclear gradient by {shift} to keep momentum conservation.')
+    grad_rec += shift[None,:]
+
+    grad = grad_dir + grad_rec
+    return grad
+
+def ewald_nuc_grad(cell, ew_eta=None, ew_cut=None):
+    '''Ewald nuclear gradient: the direct-space term from libpbc plus the
+    reciprocal-space term accumulated over blocks of G-vectors.
+
+    Dispatches to particle_mesh_ewald_nuc_grad for 3D cells that enable it;
+    raises NotImplementedError for 2D cells unless 'inf_vacuum' is selected.
+    '''
+    chargs = np.asarray(cell.atom_charges(), order='C', dtype=float)
+    coords = np.asarray(cell.atom_coords(), order='C')
+    if ew_eta is None or ew_cut is None:
+        ew_eta, ew_cut = cell.get_ewald_params()
+    log_precision = np.log(cell.precision / (chargs.sum()*16*np.pi**2))
+    mesh = cell.cutoff_to_mesh(-2*ew_eta**2*log_precision)
+
+    if cell.dimension == 3 and cell.use_particle_mesh_ewald:
+        return particle_mesh_ewald_nuc_grad(cell, ew_eta=ew_eta, ew_cut=ew_cut)
+
+    direct = _get_ewald_direct_nuc_grad(cell, ew_eta, ew_cut)
+    recip = np.zeros_like(direct, order="C")
+    Gv, _, weights = cell.get_Gv_weights(mesh)
+    kernel = libpbc.ewald_gs_nuc_grad
+    if cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum':
+        raise NotImplementedError
+
+    ngrids = len(Gv)
+    mem_avail = cell.max_memory - lib.current_memory()[0]
+    if mem_avail <= 0:
+        logger.warn(cell, "Not enough memory for computing ewald force.")
+    blksize = min(ngrids, max(mesh[2], int(mem_avail*1e6 / ((2+cell.natm*2)*8))))
+    for start, stop in lib.prange(0, ngrids, blksize):
+        block = np.asarray(Gv[start:stop], order="C")
+        kernel(recip.ctypes.data_as(ctypes.c_void_p),
+               block.ctypes.data_as(ctypes.c_void_p),
+               chargs.ctypes.data_as(ctypes.c_void_p),
+               coords.ctypes.data_as(ctypes.c_void_p),
+               ctypes.c_double(ew_eta), ctypes.c_double(weights),
+               ctypes.c_int(cell.natm), ctypes.c_size_t(stop - start))
+    return direct + recip
diff --git a/pyscf/pbc/gto/neighborlist.py b/pyscf/pbc/gto/neighborlist.py
new file mode 100644
index 0000000000..f4a0527ee2
--- /dev/null
+++ b/pyscf/pbc/gto/neighborlist.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import ctypes
+import numpy as np
+from pyscf import lib
+from pyscf.lib import logger
+
+libpbc = lib.load_library('libpbc')
+
+class _CNeighborPair(ctypes.Structure):
+    _fields_ = [("nimgs", ctypes.c_int),  # number of entries read from Ls_list
+                ("Ls_list", ctypes.POINTER(ctypes.c_int)),  # indices of Ls
+                ("q_cond", ctypes.POINTER(ctypes.c_double)),  # not read in this module
+                ("center", ctypes.POINTER(ctypes.c_double))]  # not read in this module
+
+
+class _CNeighborList(ctypes.Structure):
+    _fields_ = [("nish", ctypes.c_int),
+                ("njsh", ctypes.c_int),
+                ("nimgs", ctypes.c_int),
+                ("pairs", ctypes.POINTER(ctypes.POINTER(_CNeighborPair)))]  # read as pairs[i*njsh+j]
+
+
+class _CNeighborListOpt(ctypes.Structure):
+    _fields_ = [("nl", ctypes.POINTER(_CNeighborList)),  # presumably set by NLOpt_set_nl - confirm in C
+                ('fprescreen', ctypes.c_void_p)]
+
+
+def build_neighbor_list_for_shlpairs(cell, cell1=None, Ls=None,
+                                     ish_rcut=None, jsh_rcut=None, hermi=0,
+                                     precision=None):
+    '''
+    Build the neighbor list of shell pairs for periodic calculations.
+
+    Arguments:
+        cell : :class:`pbc.gto.cell.Cell`
+            The :class:`Cell` instance for the bra basis functions.
+        cell1 : :class:`pbc.gto.cell.Cell`, optional
+            The :class:`Cell` instance for the ket basis functions.
+            If not given, both bra and ket basis functions come from cell.
+        Ls : (*,3) array, optional
+            The cartesian coordinates of the periodic images.
+            Default is calculated by :func:`cell.get_lattice_Ls`.
+        ish_rcut : (nish,) array, optional
+            The cutoff radii of the shells for bra basis functions.
+        jsh_rcut : (njsh,) array, optional
+            The cutoff radii of the shells for ket basis functions.
+        hermi : int, optional
+            If :math:`hermi=1`, the task list is built only for
+            the upper triangle of the matrix. Default is 0.
+        precision : float, optional
+            The integral precision. Default is :attr:`cell.precision`.
+            If both ``ish_rcut`` and ``jsh_rcut`` are given,
+            ``precision`` will be ignored.
+
+    Returns: :class:`ctypes.POINTER`
+        The C pointer of the :class:`NeighborList` structure.
+    '''
+    if cell1 is None:
+        cell1 = cell
+    if Ls is None:
+        Ls = cell.get_lattice_Ls()
+    Ls = np.asarray(Ls, order='C', dtype=float)
+    nimgs = len(Ls)
+    if hermi == 1 and cell1 is not cell:
+        logger.warn(cell, "Set hermi=0 because cell and cell1 are not the same.")
+        hermi = 0
+
+    ish_atm = np.asarray(cell._atm, order='C', dtype=np.int32)
+    ish_bas = np.asarray(cell._bas, order='C', dtype=np.int32)
+    ish_env = np.asarray(cell._env, order='C', dtype=float)
+    nish = len(ish_bas)
+    if ish_rcut is None:
+        ish_rcut = cell.rcut_by_shells(precision=precision)
+    ish_rcut = np.asarray(ish_rcut, order='C', dtype=float)  # passed by raw pointer
+    assert nish == len(ish_rcut)
+
+    if cell1 is cell:
+        jsh_atm = ish_atm
+        jsh_bas = ish_bas
+        jsh_env = ish_env
+        if jsh_rcut is None:
+            jsh_rcut = ish_rcut
+    else:
+        jsh_atm = np.asarray(cell1._atm, order='C', dtype=np.int32)
+        jsh_bas = np.asarray(cell1._bas, order='C', dtype=np.int32)
+        jsh_env = np.asarray(cell1._env, order='C', dtype=float)
+        if jsh_rcut is None:
+            jsh_rcut = cell1.rcut_by_shells(precision=precision)
+    jsh_rcut = np.asarray(jsh_rcut, order='C', dtype=float)  # passed by raw pointer
+    njsh = len(jsh_bas)
+    assert njsh == len(jsh_rcut)
+
+    nl = ctypes.POINTER(_CNeighborList)()
+    func = getattr(libpbc, "build_neighbor_list", None)
+    try:
+        func(ctypes.byref(nl),
+             ish_atm.ctypes.data_as(ctypes.c_void_p),
+             ish_bas.ctypes.data_as(ctypes.c_void_p),
+             ish_env.ctypes.data_as(ctypes.c_void_p),
+             ish_rcut.ctypes.data_as(ctypes.c_void_p),
+             jsh_atm.ctypes.data_as(ctypes.c_void_p),
+             jsh_bas.ctypes.data_as(ctypes.c_void_p),
+             jsh_env.ctypes.data_as(ctypes.c_void_p),
+             jsh_rcut.ctypes.data_as(ctypes.c_void_p),
+             ctypes.c_int(nish), ctypes.c_int(njsh),
+             Ls.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nimgs),
+             ctypes.c_int(hermi))
+    except Exception as e:
+        raise RuntimeError(f"Failed to build neighbor list for shell pairs.\n{e}") from e
+    return nl
+
+def free_neighbor_list(nl):
+    '''Call libpbc `del_neighbor_list` on `nl`; failures become RuntimeError.'''
+    try:
+        getattr(libpbc, "del_neighbor_list", None)(ctypes.byref(nl))
+    except Exception as err:
+        raise RuntimeError(f"Failed to free neighbor list.\n{err}")
+
+def neighbor_list_to_ndarray(cell, cell1, nl):
+    '''
+    Flatten a C neighbor list into two index arrays.
+    Returns:
+        Ls_list: (nLtot,) ndarray
+            indices of Ls
+        Ls_idx: (2 x nish x njsh,) ndarray
+            starting and ending indices in Ls_list
+            ([-1,-1] for a shell pair without images)
+    '''
+    njsh = cell1.nbas
+    pairs = nl.contents.pairs
+    Ls_list = []
+    Ls_idx = []
+    offset = 0
+    for i in range(cell.nbas):
+        for j in range(njsh):
+            pair = pairs[i*njsh+j].contents
+            count = pair.nimgs
+            Ls_list.extend(pair.Ls_list[iL] for iL in range(count))
+            if count > 0:
+                Ls_idx.extend([offset, offset+count])
+            else:
+                Ls_idx.extend([-1,-1])
+            offset += count
+    return np.asarray(Ls_list), np.asarray(Ls_idx)
+
+
+class NeighborListOpt():
+    '''Holds a libpbc `_CNeighborListOpt` handle and the list attached to it.'''
+    def __init__(self, cell):
+        self.cell = cell
+        self.nl = None
+        self._this = ctypes.POINTER(_CNeighborListOpt)()
+        libpbc.NLOpt_init(ctypes.byref(self._this))
+
+    def build(self, cell=None, cell1=None, Ls=None, ish_rcut=None, jsh_rcut=None,
+              hermi=0, precision=None, set_nl=True, set_optimizer=True):
+        '''Build the list once (if absent) and optionally set the optimizer.'''
+        if cell is None:
+            cell = self.cell
+        if (set_nl or set_optimizer) and self.nl is None:
+            self.nl = build_neighbor_list_for_shlpairs(
+                            cell, cell1=cell1, Ls=Ls,
+                            ish_rcut=ish_rcut, jsh_rcut=jsh_rcut,
+                            hermi=hermi, precision=precision)
+            libpbc.NLOpt_set_nl(self._this, self.nl)
+        if set_optimizer:
+            libpbc.NLOpt_set_optimizer(self._this)
+
+    def reset(self, free_nl=True):
+        '''Forget the list (freeing it unless free_nl is False); reset C state.'''
+        if self.nl is not None and free_nl:
+            free_neighbor_list(self.nl)
+        self.nl = None
+        libpbc.NLOpt_reset(self._this)
+
+    def __del__(self):
+        # Guard reset() too: it calls into libpbc just like NLOpt_del does.
+        try:
+            self.reset()
+            libpbc.NLOpt_del(ctypes.byref(self._this))
+        except AttributeError:
+            pass
diff --git a/pyscf/pbc/gto/pseudo/pp_int.py b/pyscf/pbc/gto/pseudo/pp_int.py
index 6114fb7f86..2ff3436dbc 100644
--- a/pyscf/pbc/gto/pseudo/pp_int.py
+++ b/pyscf/pbc/gto/pseudo/pp_int.py
@@ -29,6 +29,17 @@
 from pyscf import lib
 from pyscf import gto
 from pyscf import __config__
+from pyscf.pbc.lib.kpts_helper import gamma_point
+
+EPS_PPL = getattr(__config__, "pbc_gto_pseudo_eps_ppl", 1e-2)  # precision set on the vloc fake cells
+HL_TABLE_SLOTS = 7  # number of int32 columns per row of hl_table
+ATOM_OF        = 0  # column: fakecell._bas[ib,0]
+ANG_OF         = 1  # column: angular momentum of the shell
+HL_DIM_OF      = 2  # column: hl.shape[0]
+HL_DATA_OF     = 3  # column: start of this block inside the flat hl_data
+HL_OFFSET0     = 4  # columns 4..6: running offsets for i = 0, 1, 2
+HL_OFFSET1     = HF_OFFSET1 = 5  # HF_* spelling kept as a backward-compatible alias
+HL_OFFSET2     = HF_OFFSET2 = 6
 
 libpbc = lib.load_library('libpbc')
 
@@ -106,12 +117,293 @@ def get_gth_vlocG_part1(cell, Gv):
 def get_pp_loc_part2(cell, kpts=None):
     '''PRB, 58, 3641 Eq (1), integrals associated to C1, C2, C3, C4
     '''
-    from pyscf.pbc.df.aft import _IntPPBuilder
-    vpploc = _IntPPBuilder(cell, kpts).get_pp_loc_part2()
+    if kpts is None or gamma_point(kpts):
+        vpploc = [get_pp_loc_part2_gamma(cell)]
+    else:
+        from pyscf.pbc.df.aft import _IntPPBuilder
+        vpploc = _IntPPBuilder(cell, kpts).get_pp_loc_part2()
     if kpts is None or numpy.shape(kpts) == (3,):
         vpploc = vpploc[0]
     return vpploc
 
+
+def get_pp_loc_part2_gamma(cell):
+    '''Gamma-point version of get_pp_loc_part2: sums the 3-center integrals of the
+    C1..C4 fake cells; returns 0 when no fake cell has any shell.'''
+    from pyscf.pbc.df import incore
+    from pyscf.pbc.gto import build_neighbor_list_for_shlpairs, free_neighbor_list
+
+    fake_cells = {}
+    for cn in range(1, 5):
+        fake_cell = fake_cell_vloc(cell, cn)
+        fake_cell.precision = EPS_PPL
+        if fake_cell.nbas > 0:
+            fake_cells[cn] = fake_cell
+
+    if not fake_cells:
+        if not any(cell.atom_symbol(ia) in cell._pseudo for ia in range(cell.natm)):
+            lib.logger.warn(cell, 'cell.pseudo was specified but its elements %s '
+                            'were not found in the system.', cell._pseudo.keys())
+        return 0
+
+    intors = ('int3c2e', 'int3c1e', 'int3c1e_r2_origk',
+              'int3c1e_r4_origk', 'int3c1e_r6_origk')
+    kptij_lst = numpy.zeros((1,2,3))
+    Ls = cell.get_lattice_Ls()
+    buf = None
+    for cn, fake_cell in fake_cells.items():
+        neighbor_list = build_neighbor_list_for_shlpairs(fake_cell, cell, Ls)
+        try:
+            v = incore.aux_e2_sum_auxbas(cell, fake_cell, intors[cn], aosym='s2', comp=1,
+                                         kptij_lst=kptij_lst, neighbor_list=neighbor_list)
+        finally:
+            # release the neighbor list even when the integral driver raises
+            free_neighbor_list(neighbor_list)
+        buf = v if buf is None else numpy.add(buf, v, out=buf)
+        v = None
+
+    vpploc = lib.unpack_tril(buf)
+    return vpploc
+
+
+# TODO add k-point sampling
+def vpploc_part2_nuc_grad(cell, dm, kpts=None):
+    '''Nuclear gradients of the 2nd part of the local part of the GTH pseudo
+    potential, contracted with the density matrix.  Only the gamma point is
+    implemented: other kpts raise NotImplementedError.  The result is 0 when
+    none of the C1..C4 fake cells has any shell.'''
+    from pyscf.pbc.df import incore
+    from pyscf.pbc.gto import build_neighbor_list_for_shlpairs, free_neighbor_list
+    if kpts is not None and not gamma_point(kpts):
+        raise NotImplementedError("k-point sampling not available")
+
+    if kpts is None:
+        kpts_lst = numpy.zeros((1,3))
+    else:
+        kpts_lst = numpy.reshape(kpts, (-1,3))
+    kptij_lst = numpy.hstack((kpts_lst,kpts_lst)).reshape(-1,2,3)
+
+    intors = ('int3c2e_ip1', 'int3c1e_ip1', 'int3c1e_ip1_r2_origk',
+              'int3c1e_ip1_r4_origk', 'int3c1e_ip1_r6_origk')
+
+    Ls = cell.get_lattice_Ls()
+    count = 0
+    grad = 0
+    for cn in range(1, 5):
+        fakecell = fake_cell_vloc(cell, cn)
+        fakecell.precision = EPS_PPL
+        if fakecell.nbas > 0:
+            neighbor_list = build_neighbor_list_for_shlpairs(fakecell, cell, Ls)
+            try:
+                buf = incore.int3c1e_nuc_grad(cell, fakecell, dm, intors[cn],
+                                              kptij_lst=kptij_lst, neighbor_list=neighbor_list)
+            finally:
+                # release the neighbor list even when the integral driver raises
+                free_neighbor_list(neighbor_list)
+            grad = buf if count == 0 else numpy.add(grad, buf, out=grad)
+            buf = None
+            count += 1
+    grad *= -2
+    return grad
+
+
+def _prepare_hl_data(fakecell, hl_blocks):  # pack hl_blocks into an int32 index table plus a flat float64 array
+    offset = [0] * 3  # one running offset per i = 0, 1, 2 (so hl_dim must not exceed 3)
+    hl_table = numpy.empty((len(hl_blocks),HL_TABLE_SLOTS), order='C', dtype=numpy.int32)
+    hl_data = []
+    ptr = 0  # start of the current block inside the flat hl_data
+    for ib, hl in enumerate(hl_blocks):
+        hl_table[ib,ATOM_OF] = fakecell._bas[ib,0]
+        hl_table[ib,ANG_OF] = l = fakecell.bas_angular(ib)
+        hl_dim = hl.shape[0]
+        hl_table[ib,HL_DIM_OF], hl_table[ib,HL_DATA_OF] = hl_dim, ptr
+        ptr += hl_dim**2  # assumes hl is square (hl_dim x hl_dim) -- TODO confirm
+        hl_data.extend(list(hl.ravel()))
+        nd = 2 * l + 1
+        for i in range(hl_dim):  # columns for i >= hl_dim keep whatever numpy.empty left there
+            hl_table[ib, i+HL_OFFSET0] = offset[i]
+            offset[i] += nd
+    hl_data = numpy.asarray(hl_data, order='C', dtype=numpy.double)
+    return hl_table, hl_data
+
+
+# TODO add k-point sampling
+def _contract_ppnl(cell, fakecell, hl_blocks, ppnl_half, comp=1, kpts=None):
+    '''Contract ppnl_half with hl_blocks into the non-local PP matrix via libpbc (gamma point only).'''
+    from pyscf.pbc.gto import NeighborListOpt
+    if kpts is None:
+        kpts_lst = numpy.zeros((1,3))
+    else:
+        kpts_lst = numpy.reshape(kpts, (-1,3))
+
+    hl_table, hl_data = _prepare_hl_data(fakecell, hl_blocks)
+
+    opt = NeighborListOpt(fakecell)
+    opt.build(fakecell, cell)
+
+    shls_slice = (0, cell.nbas, 0, cell.nbas)
+    key = 'cart' if cell.cart else 'sph'
+    ao_loc = gto.moleintor.make_loc(cell._bas, key)
+
+    ppnl = []
+    nao = cell.nao_nr()
+    nao_pair = nao * (nao+1) // 2  # length of the packed lower triangle
+    for k, kpt in enumerate(kpts_lst):
+        ppnl_half0 = ppnl_half1 = ppnl_half2 = None  # an empty entry of ppnl_half stays None
+        if len(ppnl_half[0]) > 0:
+            ppnl_half0 = ppnl_half[0][k]
+        if len(ppnl_half[1]) > 0:
+            ppnl_half1 = ppnl_half[1][k]
+        if len(ppnl_half[2]) > 0:
+            ppnl_half2 = ppnl_half[2][k]
+
+        if gamma_point(kpt):
+            if ppnl_half0 is not None:
+                ppnl_half0 = ppnl_half0.real
+            if ppnl_half1 is not None:
+                ppnl_half1 = ppnl_half1.real
+            if ppnl_half2 is not None:
+                ppnl_half2 = ppnl_half2.real
+            buf = numpy.empty([nao_pair], order='C', dtype=numpy.double)
+            fill = getattr(libpbc, 'ppnl_fill_gs2')
+        else:
+            # k-point sampling is not implemented yet, so no buffer is allocated here
+            raise NotImplementedError
+
+        ppnl_half0 = numpy.asarray(ppnl_half0, order='C')  # C-contiguous buffers for ctypes
+        ppnl_half1 = numpy.asarray(ppnl_half1, order='C')
+        ppnl_half2 = numpy.asarray(ppnl_half2, order='C')
+
+        drv = getattr(libpbc, "contract_ppnl", None)
+        try:
+            drv(fill, buf.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half0.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half1.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half2.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(comp), (ctypes.c_int*4)(*shls_slice),
+                ao_loc.ctypes.data_as(ctypes.c_void_p),
+                hl_table.ctypes.data_as(ctypes.c_void_p),
+                hl_data.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(len(hl_blocks)), opt._this)
+        except Exception as e:
+            raise RuntimeError(f"Failed to compute non-local pseudo-potential.\n{e}") from e
+
+        ppnl.append(lib.unpack_tril(buf))
+
+    if kpts is None or numpy.shape(kpts) == (3,):  # a single k-point argument gets a bare matrix
+        ppnl = ppnl[0]
+    return ppnl
+
+
+# TODO add k-point sampling
+def _contract_ppnl_nuc_grad(cell, fakecell, dms, hl_blocks, ppnl_half, ppnl_half_ip2,
+                            comp=3, kpts=None, hl_table=None, hl_data=None):
+    '''Contract dms with the projector integrals into a (natm, comp) gradient via libpbc (gamma point only).'''
+    from pyscf.pbc.gto import NeighborListOpt
+    if kpts is None:
+        kpts_lst = numpy.zeros((1,3))
+    else:
+        kpts_lst = numpy.reshape(kpts, (-1,3))
+
+    if hl_table is None or hl_data is None:  # rebuild both unless the caller supplied the full pair
+        hl_table, hl_data = _prepare_hl_data(fakecell, hl_blocks)
+
+    opt = NeighborListOpt(fakecell)
+    opt.build(fakecell, cell)
+
+    nkpts = len(kpts_lst)
+    nao = cell.nao
+    dms = dms.reshape(nkpts, nao, nao)
+    shls_slice = (0, cell.nbas, 0, cell.nbas)
+    bas = numpy.asarray(cell._bas, order='C', dtype=numpy.int32)
+    ao_loc = gto.moleintor.make_loc(bas, 'cart' if cell.cart else 'sph')
+
+    grad = []
+    for k, kpt in enumerate(kpts_lst):
+        dm = dms[k]
+        naux = [0] * 3  # leading dimension of each ppnl_half entry; 0 when the entry is empty
+        ppnl_half0 = ppnl_half1 = ppnl_half2 = None
+        if len(ppnl_half[0]) > 0:
+            ppnl_half0 = ppnl_half[0][k]
+            naux[0] = ppnl_half0.shape[0]
+        if len(ppnl_half[1]) > 0:
+            ppnl_half1 = ppnl_half[1][k]
+            naux[1] = ppnl_half1.shape[0]
+        if len(ppnl_half[2]) > 0:
+            ppnl_half2 = ppnl_half[2][k]
+            naux[2] = ppnl_half2.shape[0]
+
+        ppnl_half_ip2_0 = ppnl_half_ip2_1 = ppnl_half_ip2_2 = None
+        if len(ppnl_half_ip2[0]) > 0:
+            ppnl_half_ip2_0 = ppnl_half_ip2[0][k]
+            assert naux[0] == ppnl_half_ip2_0.shape[1]
+        if len(ppnl_half_ip2[1]) > 0:
+            ppnl_half_ip2_1 = ppnl_half_ip2[1][k]
+            assert naux[1] == ppnl_half_ip2_1.shape[1]
+        if len(ppnl_half_ip2[2]) > 0:
+            ppnl_half_ip2_2 = ppnl_half_ip2[2][k]
+            assert naux[2] == ppnl_half_ip2_2.shape[1]
+
+        naux = numpy.asarray(naux, dtype=numpy.int32)
+
+        if gamma_point(kpt):
+            dm = dm.real
+            if ppnl_half0 is not None:
+                ppnl_half0 = ppnl_half0.real
+                ppnl_half_ip2_0 = ppnl_half_ip2_0.real
+            if ppnl_half1 is not None:
+                ppnl_half1 = ppnl_half1.real
+                ppnl_half_ip2_1 = ppnl_half_ip2_1.real
+            if ppnl_half2 is not None:
+                ppnl_half2 = ppnl_half2.real
+                ppnl_half_ip2_2 = ppnl_half_ip2_2.real
+            grad_k = numpy.zeros([cell.natm, comp], order='C', dtype=numpy.double)
+            fill = getattr(libpbc, 'ppnl_nuc_grad_fill_gs1')
+        else:
+            # k-point sampling is not implemented yet, so no buffer is allocated here
+            raise NotImplementedError
+
+        dm = numpy.asarray(dm, order='C')  # C-contiguous buffers for ctypes
+        ppnl_half0 = numpy.asarray(ppnl_half0, order='C')
+        ppnl_half1 = numpy.asarray(ppnl_half1, order='C')
+        ppnl_half2 = numpy.asarray(ppnl_half2, order='C')
+        ppnl_half_ip2_0 = numpy.asarray(ppnl_half_ip2_0, order='C')
+        ppnl_half_ip2_1 = numpy.asarray(ppnl_half_ip2_1, order='C')
+        ppnl_half_ip2_2 = numpy.asarray(ppnl_half_ip2_2, order='C')
+
+        drv = getattr(libpbc, "contract_ppnl_nuc_grad", None)
+        try:
+            drv(fill,
+                grad_k.ctypes.data_as(ctypes.c_void_p),
+                dm.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(comp),
+                ppnl_half0.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half1.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half2.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half_ip2_0.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half_ip2_1.ctypes.data_as(ctypes.c_void_p),
+                ppnl_half_ip2_2.ctypes.data_as(ctypes.c_void_p),
+                hl_table.ctypes.data_as(ctypes.c_void_p),
+                hl_data.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(len(hl_blocks)),
+                naux.ctypes.data_as(ctypes.c_void_p),
+                (ctypes.c_int*4)(*shls_slice),
+                ao_loc.ctypes.data_as(ctypes.c_void_p),
+                bas.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(cell.natm), opt._this)
+        except Exception as e:
+            raise RuntimeError(f"Failed to compute non-local pp nuclear gradient.\n{e}") from e
+        grad.append(grad_k)
+
+    grad_tot = 0
+    if nkpts == 1:
+        grad_tot = grad[0]
+    else:
+        for k in range(nkpts):  # sum the per-k-point contributions, then keep the real part
+            grad_tot += grad[k]
+        grad_tot = grad_tot.real
+    return grad_tot
+
+
 def get_pp_nl(cell, kpts=None):
     if kpts is None:
         kpts_lst = numpy.zeros((1,3))
@@ -122,6 +414,10 @@ def get_pp_nl(cell, kpts=None):
     fakecell, hl_blocks = fake_cell_vnl(cell)
     ppnl_half = _int_vnl(cell, fakecell, hl_blocks, kpts_lst)
     nao = cell.nao_nr()
+
+    if gamma_point(kpts_lst):
+        return _contract_ppnl(cell, fakecell, hl_blocks, ppnl_half, kpts=kpts)
+
     buf = numpy.empty((3*9*nao), dtype=numpy.complex128)
 
     # We set this equal to zeros in case hl_blocks loop is skipped
@@ -148,7 +444,32 @@ def get_pp_nl(cell, kpts=None):
     return ppnl
 
 
-def fake_cell_vloc(cell, cn=0):
+def vppnl_nuc_grad(cell, dm, kpts=None):
+    '''
+    Gradient with respect to the nuclear positions of the non-local GTH
+    pseudo potential term, contracted with the density matrix dm.
+    With kpts=None a single zero k-point is used.
+    '''
+    kpts_lst = numpy.zeros((1,3)) if kpts is None else numpy.reshape(kpts, (-1,3))
+
+    fakecell, hl_blocks = fake_cell_vnl(cell)
+    # plain projector integrals first, then the ip-type integrators with comp=3
+    ppnl_half = _int_vnl(cell, fakecell, hl_blocks, kpts_lst)
+    ip_intors = ('int1e_ipovlp', 'int1e_r2_origi_ip2', 'int1e_r4_origi_ip2')
+    ppnl_half_ip2 = _int_vnl(cell, fakecell, hl_blocks, kpts_lst, ip_intors, comp=3)
+    # int1e_ipovlp yields the ip1 integral; negate it to obtain ip2
+    if len(ppnl_half_ip2[0]) > 0:
+        for k in range(len(kpts_lst)):
+            ppnl_half_ip2[0][k] *= -1
+
+    grad = _contract_ppnl_nuc_grad(cell, fakecell, dm, hl_blocks,
+                                   ppnl_half, ppnl_half_ip2, kpts=kpts)
+    # overall factor of -2, applied in place
+    grad *= -2
+    return grad
+
+
+def fake_cell_vloc(cell, cn=0, atm_id=None):
     '''Generate fake cell for V_{loc}.
 
     Each term of V_{loc} (erf, C_1, C_2, C_3, C_4) is a gaussian type
@@ -158,17 +479,23 @@ def fake_cell_vloc(cell, cn=0):
     The kwarg cn indiciates which term to generate for the fake cell.
     If cn = 0, the erf term is generated.  C_1,..,C_4 are generated with cn = 1..4
     '''
-    fake_env = [cell.atom_coords().ravel()]
-    fake_atm = cell._atm.copy()
-    fake_atm[:,gto.PTR_COORD] = numpy.arange(0, cell.natm*3, 3)
-    ptr = cell.natm * 3
+    if atm_id is None:
+        atm_id = numpy.arange(cell.natm)
+    else:
+        atm_id = numpy.asarray(atm_id)
+    natm = len(atm_id)
+
+    fake_env = [cell.atom_coords()[atm_id].ravel()]
+    fake_atm = cell._atm[atm_id].copy().reshape(natm,-1)
+    fake_atm[:,gto.PTR_COORD] = numpy.arange(0, natm*3, 3)
+    ptr = natm * 3
     fake_bas = []
     half_sph_norm = .5/numpy.pi**.5
-    for ia in range(cell.natm):
-        if cell.atom_charge(ia) == 0:  # pass ghost atoms
+    for ia, atm in enumerate(atm_id):
+        if cell.atom_charge(atm) == 0:  # pass ghost atoms
             continue
 
-        symb = cell.atom_symbol(ia)
+        symb = cell.atom_symbol(atm)
         if cn == 0:
             if symb in cell._pseudo:
                 pp = cell._pseudo[symb]
@@ -196,6 +523,7 @@ def fake_cell_vloc(cell, cn=0):
     fakecell._env = numpy.asarray(numpy.hstack(fake_env), dtype=numpy.double)
     return fakecell
 
+
 # sqrt(Gamma(l+1.5)/Gamma(l+2i+1.5))
 _PLI_FAC = 1/numpy.sqrt(numpy.array((
     (1, 3.75 , 59.0625  ),  # l = 0,
@@ -249,12 +577,14 @@ def fake_cell_vnl(cell):
 
     fakecell = cell.copy(deep=False)
     fakecell._atm = numpy.asarray(fake_atm, dtype=numpy.int32)
-    fakecell._bas = numpy.asarray(fake_bas, dtype=numpy.int32)
+    fakecell._bas = numpy.asarray(fake_bas, dtype=numpy.int32).reshape(-1, gto.BAS_SLOTS)
     fakecell._env = numpy.asarray(numpy.hstack(fake_env), dtype=numpy.double)
     return fakecell, hl_blocks
 
-def _int_vnl(cell, fakecell, hl_blocks, kpts):
+def _int_vnl(cell, fakecell, hl_blocks, kpts, intors=None, comp=1):
     '''Vnuc - Vloc'''
+    if intors is None:
+        intors = ['int1e_ovlp', 'int1e_r2_origi', 'int1e_r4_origi']
     rcut = max(cell.rcut, fakecell.rcut)
     Ls = cell.get_lattice_Ls(rcut=rcut)
     nimgs = len(Ls)
@@ -262,6 +592,7 @@ def _int_vnl(cell, fakecell, hl_blocks, kpts):
     nkpts = len(kpts)
 
     fill = getattr(libpbc, 'PBCnr2c_fill_ks1')
+    # TODO add screening
     cintopt = lib.c_null_ptr()
 
     def int_ket(_bas, intor):
@@ -279,8 +610,10 @@ def int_ket(_bas, intor):
         ao_loc = gto.moleintor.make_loc(bas, intor)
         ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]]
         nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]]
-        out = numpy.empty((nkpts,ni,nj), dtype=numpy.complex128)
-        comp = 1
+        if comp == 1:
+            out = numpy.empty((nkpts,ni,nj), dtype=numpy.complex128)
+        else:
+            out = numpy.empty((nkpts,comp,ni,nj), dtype=numpy.complex128)
 
         fintor = getattr(gto.moleintor.libcgto, intor)
 
@@ -297,7 +630,7 @@ def int_ket(_bas, intor):
         return out
 
     hl_dims = numpy.asarray([len(hl) for hl in hl_blocks])
-    out = (int_ket(fakecell._bas[hl_dims>0], 'int1e_ovlp'),
-           int_ket(fakecell._bas[hl_dims>1], 'int1e_r2_origi'),
-           int_ket(fakecell._bas[hl_dims>2], 'int1e_r4_origi'))
+    out = (int_ket(fakecell._bas[hl_dims>0], intors[0]),
+           int_ket(fakecell._bas[hl_dims>1], intors[1]),
+           int_ket(fakecell._bas[hl_dims>2], intors[2]))
     return out
diff --git a/pyscf/pbc/gto/pseudo/test/test_pp.py b/pyscf/pbc/gto/pseudo/test/test_pp.py
index c00057a064..95b343bbf6 100644
--- a/pyscf/pbc/gto/pseudo/test/test_pp.py
+++ b/pyscf/pbc/gto/pseudo/test/test_pp.py
@@ -22,6 +22,7 @@
 from pyscf.pbc.dft import numint
 from pyscf.pbc.gto import pseudo
 from pyscf.pbc.gto.pseudo import pp_int
+from pyscf.data.nist import BOHR
 
 
 def get_pp_loc_part2(cell, kpt=np.zeros(3)):
@@ -244,7 +245,42 @@ def test_pp(self):
         v1 = pseudo.get_pp(cell, k)
         self.assertAlmostEqual(abs(v0-v1).max(), 0, 6)
 
+    def test_pp_nuc_grad(self):  # finite-difference check of the two PP nuclear-gradient routines
+        cell = pbcgto.Cell()
+        cell.atom = 'H 0 0 0; Na 0 0 0.8'
+        cell.a = np.diag([6,6,6])
+        cell.basis='gth-szv'
+        cell.pseudo='gth-pade'
+        cell.ke_cutoff=200
+        cell.build()
+
+        cellp = cell.copy()
+        cellp.atom = 'H 0 0 0; Na 0 0 0.8001'  # Na displaced by +0.0001 along the third axis
+        cellp.build()
 
+        cellm = cell.copy()
+        cellm.atom = 'H 0 0 0; Na 0 0 0.7999'  # Na displaced by -0.0001 along the third axis
+        cellm.build()
+
+        np.random.seed(1)
+        dm = np.random.rand(cell.nao, cell.nao)
+        dm = (dm + dm.T) / 2  # symmetrized random matrix used as density matrix
+
+        # local_part2
+        vp = pp_int.get_pp_loc_part2(cellp)
+        vm = pp_int.get_pp_loc_part2(cellm)
+        v_fd = (vp - vm) / (0.0002 / BOHR)  # central difference; presumably Angstrom -> Bohr step, confirm
+        grad = pp_int.vpploc_part2_nuc_grad(cell, dm)[1,2]  # atom 1 (Na), component 2
+        grad_fd = np.einsum("ij,ij->", v_fd, dm)
+        self.assertAlmostEqual(abs(grad - grad_fd), 0, 7)
+
+        # non-local
+        vp = pp_int.get_pp_nl(cellp)
+        vm = pp_int.get_pp_nl(cellm)
+        v_fd = (vp - vm) / (0.0002 / BOHR)  # same central difference as above
+        grad = pp_int.vppnl_nuc_grad(cell, dm)[1,2]  # atom 1 (Na), component 2
+        grad_fd = np.einsum("ij,ij->", v_fd, dm)
+        self.assertAlmostEqual(abs(grad - grad_fd), 0, 7)
 
 if __name__ == '__main__':
     print("Full Tests for pbc.gto.pseudo")
diff --git a/pyscf/pbc/gto/test/test_cell.py b/pyscf/pbc/gto/test/test_cell.py
index 5dee058140..bd7a0e067f 100644
--- a/pyscf/pbc/gto/test/test_cell.py
+++ b/pyscf/pbc/gto/test/test_cell.py
@@ -25,6 +25,7 @@
 from pyscf.pbc import gto as pgto
 from pyscf.pbc.gto import ecp
 from pyscf.pbc.tools import pbc as pbctools
+from pyscf.pbc.gto import ewald_methods
 
 
 def setUpModule():
@@ -252,6 +253,30 @@ def test_ewald_2d(self):
 #        eref = cell.to_mol().energy_nuc()
 #        self.assertAlmostEqual(cell.ewald(), eref, 2)
 
+    def test_particle_mesh_ewald(self):  # particle-mesh Ewald must agree with the default cell.ewald()
+        cell = pgto.Cell()
+        cell.a = np.diag([10.,]*3)
+        cell.atom = '''
+            O          5.84560        5.21649        5.10372
+            H          6.30941        5.30070        5.92953
+            H          4.91429        5.26674        5.28886
+        '''
+        cell.pseudo = 'gth-pade'
+        cell.verbose = 0
+        cell.build()
+
+        cell1 = cell.copy()
+        cell1.use_particle_mesh_ewald = True  # the only setting that differs from cell
+        cell1.build()
+
+        e0 = cell.ewald()
+        e1 = cell1.ewald()
+        self.assertAlmostEqual(e0, e1, 6)  # energies agree to 6 decimal places
+
+        g0 = ewald_methods.ewald_nuc_grad(cell)
+        g1 = ewald_methods.ewald_nuc_grad(cell1)
+        self.assertAlmostEqual(abs(g1-g0).max(), 0, 6)  # largest gradient deviation
+
     def test_pbc_intor(self):
         numpy.random.seed(12)
         kpts = numpy.random.random((4,3))
diff --git a/pyscf/pbc/scf/hf.py b/pyscf/pbc/scf/hf.py
index 8225d778b6..f6c91336ed 100644
--- a/pyscf/pbc/scf/hf.py
+++ b/pyscf/pbc/scf/hf.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2019 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2024 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -53,23 +53,24 @@ def get_ovlp(cell, kpt=np.zeros(3)):
         # Avoid pbcopt's prescreening in the lattice sum, for better accuracy
         s = cell.pbc_intor('int1e_ovlp', hermi=0, kpts=kpt,
                            pbcopt=lib.c_null_ptr())
-    s = lib.asarray(s)
+    s = np.asarray(s)
     hermi_error = abs(s - np.rollaxis(s.conj(), -1, -2)).max()
     if hermi_error > cell.precision and hermi_error > 1e-12:
         logger.warn(cell, '%.4g error found in overlap integrals. '
                     'cell.precision  or  cell.rcut  can be adjusted to '
                     'improve accuracy.', hermi_error)
 
-    cond = np.max(lib.cond(s))
-    if cond * precision > 1e2:
-        prec = 1e7 / cond
-        rmin = gto.estimate_rcut(cell, prec*1e-5)
-        logger.warn(cell, 'Singularity detected in overlap matrix.  '
-                    'Integral accuracy may be not enough.\n      '
-                    'You can adjust  cell.precision  or  cell.rcut  to '
-                    'improve accuracy.  Recommended settings are\n      '
-                    'cell.precision < %.2g\n      '
-                    'cell.rcut > %.4g', prec, rmin)
+    if cell.verbose >= logger.DEBUG:
+        cond = np.max(lib.cond(s))
+        if cond * precision > 1e2:
+            prec = 1e7 / cond
+            rmin = gto.estimate_rcut(cell, prec*1e-5)
+            logger.warn(cell, 'Singularity detected in overlap matrix.  '
+                        'Integral accuracy may be not enough.\n      '
+                        'You can adjust  cell.precision  or  cell.rcut  to '
+                        'improve accuracy.  Recommended settings are\n      '
+                        'cell.precision < %.2g\n      '
+                        'cell.rcut > %.4g', prec, rmin)
     return s
 
 
@@ -615,11 +616,18 @@ def dump_flags(self, verbose=None):
         return self
 
     def check_sanity(self):
-        mol_hf.SCF.check_sanity(self)
+        lib.StreamObject.check_sanity(self)
         if (isinstance(self.exxdiv, str) and self.exxdiv.lower() != 'ewald' and
             isinstance(self.with_df, df.df.DF)):
             logger.warn(self, 'exxdiv %s is not supported in DF or MDF',
                         self.exxdiv)
+
+        if self.verbose >= logger.DEBUG:
+            s = self.get_ovlp()
+            cond = np.max(lib.cond(s))
+            if cond * 1e-17 > self.conv_tol:
+                logger.warn(self, 'Singularity detected in overlap matrix (condition number = %4.3g). '
+                            'SCF may be inaccurate and hard to converge.', cond)
         return self
 
     def get_hcore(self, cell=None, kpt=None):
@@ -738,7 +746,7 @@ def get_jk_incore(self, cell=None, dm=None, hermi=1, kpt=None, omega=None,
         return self.get_jk(cell, dm, hermi, kpt)
 
     def energy_nuc(self):
-        return self.cell.energy_nuc()
+        return self.cell.enuc
 
     @lib.with_doc(dip_moment.__doc__)
     def dip_moment(self, cell=None, dm=None, unit='Debye', verbose=logger.NOTE,
@@ -758,10 +766,10 @@ def _finalize(self):
             makov_payne_correction(self)
         return self
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
         if cell is None: cell = self.cell
         dm = mol_hf.SCF.get_init_guess(self, cell, key)
-        dm = normalize_dm_(self, dm)
+        dm = normalize_dm_(self, dm, s1e)
         return dm
 
     def init_guess_by_1e(self, cell=None):
@@ -914,12 +922,14 @@ def _format_jks(vj, dm, kpts_band):
         vj = vj[0]
     return vj
 
-def normalize_dm_(mf, dm):
+def normalize_dm_(mf, dm, s1e=None):
     '''
     Scale density matrix to make it produce the correct number of electrons.
     '''
     cell = mf.cell
-    ne = np.einsum('ij,ji->', dm, mf.get_ovlp(cell)).real
+    if s1e is None:
+        s1e = mf.get_ovlp(cell)
+    ne = lib.einsum('ij,ji->', dm, s1e).real
     if abs(ne - cell.nelectron) > 0.01:
         logger.debug(mf, 'Big error detected in the electron number '
                      'of initial guess density matrix (Ne/cell = %g)!\n'
diff --git a/pyscf/pbc/scf/khf.py b/pyscf/pbc/scf/khf.py
index 1ef2d88908..89124e8af4 100644
--- a/pyscf/pbc/scf/khf.py
+++ b/pyscf/pbc/scf/khf.py
@@ -496,7 +496,7 @@ def dump_flags(self, verbose=None):
             self.with_df.dump_flags(verbose)
         return self
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
         raise NotImplementedError
 
     def init_guess_by_1e(self, cell=None):
@@ -524,10 +524,10 @@ def get_jk(self, cell=None, dm_kpts=None, hermi=1, kpts=None, kpts_band=None,
         cpu0 = (logger.process_clock(), logger.perf_counter())
         if self.rsjk:
             vj, vk = self.rsjk.get_jk(dm_kpts, hermi, kpts, kpts_band,
-                                      with_j, with_k, omega, self.exxdiv)
+                                      with_j, with_k, omega=omega, exxdiv=self.exxdiv)
         else:
             vj, vk = self.with_df.get_jk(dm_kpts, hermi, kpts, kpts_band,
-                                         with_j, with_k, omega, self.exxdiv)
+                                         with_j, with_k, omega=omega, exxdiv=self.exxdiv)
         logger.timer(self, 'vj and vk', *cpu0)
         return vj, vk
 
@@ -700,7 +700,9 @@ def check_sanity(self):
                         'found in KRHF method.', cell.nelec, nkpts)
         return KSCF.check_sanity(self)
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
+        if s1e is None:
+            s1e = self.get_ovlp(cell)
         dm = mol_hf.SCF.get_init_guess(self, cell, key)
         nkpts = len(self.kpts)
         if dm.ndim == 2:
@@ -708,7 +710,7 @@ def get_init_guess(self, cell=None, key='minao'):
             dm = np.repeat(dm[None,:,:], nkpts, axis=0)
         dm_kpts = dm
 
-        ne = np.einsum('kij,kji->', dm_kpts, self.get_ovlp(cell)).real
+        ne = lib.einsum('kij,kji->', dm_kpts, s1e).real
         # FIXME: consider the fractional num_electron or not? This maybe
         # relate to the charged system.
         nelectron = float(self.cell.tot_electrons(nkpts))
diff --git a/pyscf/pbc/scf/khf_ksymm.py b/pyscf/pbc/scf/khf_ksymm.py
index baaf5543a6..69e4d5c5d1 100644
--- a/pyscf/pbc/scf/khf_ksymm.py
+++ b/pyscf/pbc/scf/khf_ksymm.py
@@ -343,14 +343,16 @@ class KsymAdaptedKRHF(KsymAdaptedKSCF, khf.KRHF):
     to_ks = khf.KRHF.to_ks
     convert_from_ = khf.KRHF.convert_from_
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
+        if s1e is None:
+            s1e = self.get_ovlp(cell)
         dm_kpts = mol_hf.SCF.get_init_guess(self, cell, key)
         if dm_kpts.ndim == 2:
             dm_kpts = np.asarray([dm_kpts]*self.kpts.nkpts_ibz)
         elif len(dm_kpts) != self.kpts.nkpts_ibz:
             dm_kpts = dm_kpts[self.kpts.ibz2bz]
 
-        ne = np.einsum('k,kij,kji', self.kpts.weights_ibz, dm_kpts, self.get_ovlp(cell)).real
+        ne = lib.einsum('k,kij,kji', self.kpts.weights_ibz, dm_kpts, s1e).real
         nkpts = self.kpts.nkpts
         ne *= nkpts
         nelectron = float(self.cell.tot_electrons(nkpts))
diff --git a/pyscf/pbc/scf/kuhf.py b/pyscf/pbc/scf/kuhf.py
index af56a2ced3..eae04c0713 100644
--- a/pyscf/pbc/scf/kuhf.py
+++ b/pyscf/pbc/scf/kuhf.py
@@ -416,7 +416,9 @@ def dump_flags(self, verbose=None):
                     'alpha = %d beta = %d', *self.nelec)
         return self
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
+        if s1e is None:
+            s1e = self.get_ovlp(cell)
         dm_kpts = mol_hf.SCF.get_init_guess(self, cell, key)
         assert dm_kpts.shape[0] == 2
         nkpts = len(self.kpts)
@@ -424,7 +426,7 @@ def get_init_guess(self, cell=None, key='minao'):
             # dm[spin,nao,nao] at gamma point -> dm_kpts[spin,nkpts,nao,nao]
             dm_kpts = np.repeat(dm_kpts[:,None,:,:], nkpts, axis=1)
 
-        ne = np.einsum('xkij,kji->x', dm_kpts, self.get_ovlp(cell)).real
+        ne = lib.einsum('xkij,kji->x', dm_kpts, s1e).real
         nelec = np.asarray(self.nelec)
         if np.any(abs(ne - nelec) > 0.01*nkpts):
             logger.debug(self, 'Big error detected in the electron number '
diff --git a/pyscf/pbc/scf/kuhf_ksymm.py b/pyscf/pbc/scf/kuhf_ksymm.py
index 310de63289..4e10ed0fdc 100644
--- a/pyscf/pbc/scf/kuhf_ksymm.py
+++ b/pyscf/pbc/scf/kuhf_ksymm.py
@@ -155,7 +155,9 @@ def dump_flags(self, verbose=None):
                     'alpha = %d beta = %d', *self.nelec)
         return self
 
-    def get_init_guess(self, cell=None, key='minao'):
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
+        if s1e is None:
+            s1e = self.get_ovlp(cell)
         dm_kpts = mol_hf.SCF.get_init_guess(self, cell, key)
         assert dm_kpts.shape[0]==2
         if dm_kpts.ndim != 4:
@@ -165,7 +167,7 @@ def get_init_guess(self, cell=None, key='minao'):
         elif dm_kpts.shape[1] != self.kpts.nkpts_ibz:
             dm_kpts = dm_kpts[:,self.kpts.ibz2bz]
 
-        ne = np.einsum('k,xkij,kji->x', self.kpts.weights_ibz, dm_kpts, self.get_ovlp(cell)).real
+        ne = lib.einsum('k,xkij,kji->x', self.kpts.weights_ibz, dm_kpts, s1e).real
         nkpts = self.kpts.nkpts
         ne *= nkpts
         nelec = np.asarray(self.nelec)
diff --git a/pyscf/pbc/scf/test/test_hf.py b/pyscf/pbc/scf/test/test_hf.py
index fe3387468b..3e47561cee 100644
--- a/pyscf/pbc/scf/test/test_hf.py
+++ b/pyscf/pbc/scf/test/test_hf.py
@@ -20,6 +20,7 @@
 import tempfile
 import numpy
 from pyscf import lib
+from pyscf.scf import atom_hf
 from pyscf.pbc import gto as pbcgto
 from pyscf.pbc.scf import hf as pbchf
 import pyscf.pbc.scf as pscf
@@ -511,7 +512,7 @@ def test_init_guess_by_1e(self):
         self.assertEqual(dm.ndim, 3)
         self.assertAlmostEqual(lib.fp(dm), 0.025922864381755062, 6)
 
-    def test_init_guess_by_atom(self):
+    def test_init_guess_by_minao(self):
         with lib.temporary_env(cell, dimension=1):
             dm = mf.get_init_guess(key='minao')
             kdm = kmf.get_init_guess(key='minao')
@@ -521,6 +522,29 @@ def test_init_guess_by_atom(self):
         self.assertEqual(kdm.ndim, 3)
         self.assertAlmostEqual(lib.fp(kdm), -1.714952331211208, 8)
 
+    def test_init_guess_by_atom(self):  # 'atom' initial guess for mf and kmf with cell.dimension set to 1
+        with lib.temporary_env(cell, dimension=1):
+            dm = mf.get_init_guess(key='atom')
+            kdm = kmf.get_init_guess(key='atom')
+
+        self.assertAlmostEqual(lib.fp(dm), 0.18074522075843902, 7)
+
+        self.assertEqual(kdm.ndim, 3)
+        self.assertAlmostEqual(lib.fp(dm), 0.18074522075843902, 7)  # NOTE(review): dm again; kdm intended?
+
+    def test_atom_hf_with_pp(self):  # atom_hf.get_atm_nrhf on a Cell that carries a pseudopotential
+        mol = pbcgto.Cell()
+        mol.build(
+            verbose = 7,
+            output = '/dev/null',  # log output is discarded (POSIX path)
+            atom  = 'O 0 0 0; H 0 0 -1; H 0 0 1',
+            a = [[5, 0, 0], [0, 5, 0], [0, 0, 5]],
+            basis = 'gth-dzvp',
+            pseudo = 'gth-pade')
+        scf_result = atom_hf.get_atm_nrhf(mol)  # indexed by element symbol below
+        self.assertAlmostEqual(scf_result['O'][0], -15.193243796069835, 9)  # entry [0]: presumably the energy
+        self.assertAlmostEqual(scf_result['H'][0], -0.49777509423571864, 9)
+
     def test_jk(self):
         nao = cell.nao
         numpy.random.seed(2)
diff --git a/pyscf/pbc/scf/uhf.py b/pyscf/pbc/scf/uhf.py
index b9d9b1407d..0d247f745e 100644
--- a/pyscf/pbc/scf/uhf.py
+++ b/pyscf/pbc/scf/uhf.py
@@ -221,10 +221,13 @@ def dip_moment(self, cell=None, dm=None, unit='Debye', verbose=logger.NOTE,
             rho = self.get_rho(dm)
         return dip_moment(cell, dm, unit, verbose, rho=rho, kpt=self.kpt, **kwargs)
 
-    def get_init_guess(self, cell=None, key='minao'):
-        if cell is None: cell = self.cell
+    def get_init_guess(self, cell=None, key='minao', s1e=None):
+        if cell is None:
+            cell = self.cell
+        if s1e is None:
+            s1e = self.get_ovlp(cell)
         dm = mol_uhf.UHF.get_init_guess(self, cell, key)
-        ne = np.einsum('xij,ji->x', dm, self.get_ovlp(cell)).real
+        ne = np.einsum('xij,ji->x', dm, s1e).real
         nelec = self.nelec
         if np.any(abs(ne - nelec) > 0.01):
             logger.debug(self, 'Big error detected in the electron number '
diff --git a/pyscf/pbc/symm/geom.py b/pyscf/pbc/symm/geom.py
index 74119a4483..ae698d2347 100644
--- a/pyscf/pbc/symm/geom.py
+++ b/pyscf/pbc/symm/geom.py
@@ -77,7 +77,7 @@ def search_space_group_ops(cell, rotations=None, tol=SYMPREC):
     '''
     if rotations is None: rotations = search_point_group_ops(cell, tol=tol)
     a = cell.lattice_vectors()
-    coords = cell.get_scaled_positions()
+    coords = cell.get_scaled_atom_coords()
     atmgrp = mole.atom_types(cell._atom, magmom=cell.magmom)
     atmgrp_spin_inv = {} #spin up and down inverted
     has_spin = False
diff --git a/pyscf/pbc/symm/pyscf_spglib.py b/pyscf/pbc/symm/pyscf_spglib.py
index 3a0d1442cb..f87117a8dd 100644
--- a/pyscf/pbc/symm/pyscf_spglib.py
+++ b/pyscf/pbc/symm/pyscf_spglib.py
@@ -29,7 +29,7 @@ def cell_to_spgcell(cell):
     Convert PySCF Cell object to spglib cell object
     '''
     a = cell.lattice_vectors()
-    atm_pos = cell.get_scaled_positions()
+    atm_pos = cell.get_scaled_atom_coords()
     atm_num = []
     from pyscf.data import elements
     for symbol in cell.elements:
diff --git a/pyscf/pbc/symm/symmetry.py b/pyscf/pbc/symm/symmetry.py
index c79bc81167..ce29e3afac 100644
--- a/pyscf/pbc/symm/symmetry.py
+++ b/pyscf/pbc/symm/symmetry.py
@@ -219,7 +219,7 @@ def dump_info(self):
 
 def _get_phase(cell, op, kpt_scaled, ignore_phase=False, tol=SYMPREC):
     kpt_scaled = op.a2b(cell).dot_rot(kpt_scaled)
-    coords_scaled = cell.get_scaled_positions().reshape(-1,3)
+    coords_scaled = cell.get_scaled_atom_coords().reshape(-1,3)
     natm = coords_scaled.shape[0]
     phase = np.ones((natm,), dtype=np.complex128)
     atm_map = np.arange(natm)
diff --git a/pyscf/pbc/tools/pbc.py b/pyscf/pbc/tools/pbc.py
index 7ca867fd21..20d45fe692 100644
--- a/pyscf/pbc/tools/pbc.py
+++ b/pyscf/pbc/tools/pbc.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import warnings
+import ctypes
 import numpy as np
 import scipy.linalg
 from pyscf import lib
@@ -57,6 +58,44 @@ def _ifftn_blas(g, mesh):
     return out.reshape(-1, *mesh)
 
 if FFT_ENGINE == 'FFTW':
+    try:
+        libfft = lib.load_library('libfft')
+    except OSError as err:  # keep the loader error as the explicit cause
+        raise RuntimeError("Failed to load libfft") from err
+
+    def _copy_d2z(a):
+        '''Return a new complex128 array of a.shape filled by libfft._copy_d2z.'''
+        zbuf = np.empty(a.shape, dtype=np.complex128)
+        libfft._copy_d2z(zbuf.ctypes.data_as(ctypes.c_void_p),
+                         a.ctypes.data_as(ctypes.c_void_p),
+                         ctypes.c_size_t(a.size))
+        return zbuf
+
+    def _complex_fftn_fftw(f, mesh, func):
+        '''Call libfft.<func> on every f[i]; the results fill a new complex128 array.'''
+        if f.dtype == np.double and f.flags.c_contiguous:
+            f = _copy_d2z(f)  # np.asarray or np.astype is too slow
+        else:
+            f = np.asarray(f, order='C', dtype=np.complex128)
+        mesh = np.asarray(mesh, order='C', dtype=np.int32)
+        mesh_ptr = mesh.ctypes.data_as(ctypes.c_void_p)
+        rank = ctypes.c_int(len(mesh))
+        out = np.empty_like(f)
+        transform = getattr(libfft, func)
+        for src, dst in zip(f, out):
+            transform(src.ctypes.data_as(ctypes.c_void_p),
+                      dst.ctypes.data_as(ctypes.c_void_p),
+                      mesh_ptr, rank)
+        return out
+
+    def _fftn_wrapper(a):
+        '''Run the libfft 'fft' routine on each a[i]; the mesh is a.shape[1:].'''
+        return _complex_fftn_fftw(a, a.shape[1:], 'fft')
+    def _ifftn_wrapper(a):
+        '''Run the libfft 'ifft' routine on each a[i]; the mesh is a.shape[1:].'''
+        return _complex_fftn_fftw(a, a.shape[1:], 'ifft')
+
+elif FFT_ENGINE == 'PYFFTW':
     # pyfftw is slower than np.fft in most cases
     try:
         import pyfftw
@@ -235,8 +274,9 @@ def get_coulG(cell, k=np.zeros(3), exx=False, mf=None, mesh=None, Gv=None,
     else:
         kG = Gv
 
-    equal2boundary = np.zeros(Gv.shape[0], dtype=bool)
+    equal2boundary = None
     if wrap_around and abs(k).sum() > 1e-9:
+        equal2boundary = np.zeros(Gv.shape[0], dtype=bool)
         # Here we 'wrap around' the high frequency k+G vectors into their lower
         # frequency counterparts.  Important if you want the gamma point and k-point
         # answers to agree
@@ -357,7 +397,8 @@ def get_coulG(cell, k=np.zeros(3), exx=False, mf=None, mesh=None, Gv=None,
         if cell.dimension > 0 and exxdiv == 'ewald' and len(G0_idx) > 0:
             coulG[G0_idx] += Nk*cell.vol*madelung(cell, kpts)
 
-    coulG[equal2boundary] = 0
+    if equal2boundary is not None:
+        coulG[equal2boundary] = 0
 
     # Scale the coulG kernel for attenuated Coulomb integrals.
     # * omega is used by RangeSeparatedJKBuilder which requires ewald probe charge
@@ -507,7 +548,7 @@ def get_lattice_Ls(cell, nimgs=None, rcut=None, dimension=None, discard=True):
 
     a = cell.lattice_vectors()
 
-    scaled_atom_coords = np.linalg.solve(a.T, cell.atom_coords().T).T
+    scaled_atom_coords = cell.get_scaled_atom_coords()
     atom_boundary_max = scaled_atom_coords[:,:dimension].max(axis=0)
     atom_boundary_min = scaled_atom_coords[:,:dimension].min(axis=0)
     if (np.any(atom_boundary_max > 1) or np.any(atom_boundary_min < -1)):
@@ -542,11 +583,12 @@ def find_boundary(a):
                              np.arange(-bounds[2], bounds[2]+1)))
     Ls = np.dot(Ts[:,:dimension], a[:dimension])
 
-    ovlp_penalty += 1e-200  # avoid /0
-    Ts_scaled = (Ts[:,:dimension] + 1e-200) / ovlp_penalty
-    ovlp_penalty_fac = 1. / abs(Ts_scaled).min(axis=1)
-    Ls_mask = np.linalg.norm(Ls, axis=1) * (1-ovlp_penalty_fac) < rcut
-    Ls = Ls[Ls_mask]
+    if discard:
+        ovlp_penalty += 1e-200  # avoid /0
+        Ts_scaled = (Ts[:,:dimension] + 1e-200) / ovlp_penalty
+        ovlp_penalty_fac = 1. / abs(Ts_scaled).min(axis=1)
+        Ls_mask = np.linalg.norm(Ls, axis=1) * (1-ovlp_penalty_fac) < rcut
+        Ls = Ls[Ls_mask]
     return np.asarray(Ls, order='C')
 
 
diff --git a/pyscf/scf/atom_hf.py b/pyscf/scf/atom_hf.py
index 58e0a585c3..4430963493 100644
--- a/pyscf/scf/atom_hf.py
+++ b/pyscf/scf/atom_hf.py
@@ -30,6 +30,7 @@ def get_atm_nrhf(mol, atomic_configuration=elements.NRSRHF_CONFIGURATION):
 
     atm_template = mol.copy(deep=False)
     atm_template.charge = 0
+    atm_template.enuc = 0
     atm_template.symmetry = False  # TODO: enable SO3 symmetry here
     atm_template.atom = atm_template._atom = []
     atm_template.cart = False  # AtomSphAverageRHF does not support cartesian basis
@@ -50,7 +51,6 @@ def get_atm_nrhf(mol, atomic_configuration=elements.NRSRHF_CONFIGURATION):
         atm._ecpbas[:,0] = 0
         if element in mol._pseudo:
             atm._pseudo = {element: mol._pseudo.get(element)}
-            raise NotImplementedError
         atm.spin = atm.nelectron % 2
 
         nao = atm.nao
@@ -59,6 +59,19 @@ def get_atm_nrhf(mol, atomic_configuration=elements.NRSRHF_CONFIGURATION):
             mo_occ = mo_energy = numpy.zeros(nao)
             mo_coeff = numpy.zeros((nao,nao))
             atm_scf_result[element] = (0, mo_energy, mo_coeff, mo_occ)
+        elif atm._pseudo:
+            from pyscf.scf import atom_hf_pp
+            atm.a = None
+            if atm.nelectron == 1:
+                atm_hf = atom_hf_pp.AtomHF1ePP(atm)
+            else:
+                atm_hf = atom_hf_pp.AtomSCFPP(atm)
+                atm_hf.atomic_configuration = atomic_configuration
+
+            atm_hf.verbose = mol.verbose
+            atm_hf.run()
+            atm_scf_result[element] = (atm_hf.e_tot, atm_hf.mo_energy,
+                                       atm_hf.mo_coeff, atm_hf.mo_occ)
         else:
             if atm.nelectron == 1:
                 atm_hf = AtomHF1e(atm)
diff --git a/pyscf/scf/atom_hf_pp.py b/pyscf/scf/atom_hf_pp.py
new file mode 100644
index 0000000000..19a2f73930
--- /dev/null
+++ b/pyscf/scf/atom_hf_pp.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Xing Zhang <zhangxing.nju@gmail.com>
+#
+
+import copy
+import numpy
+from scipy.special import erf
+
+from pyscf import lib
+from pyscf import gto, scf
+from pyscf.dft import gen_grid, numint
+from pyscf.pbc import gto as pbcgto
+from pyscf.scf import atom_hf, rohf
+
+def get_pp_loc_part1_rs(mol, coords):
+    '''Evaluate sum_A -Z_A erf(alpha_A r_A)/r_A at coords, r_A = |coords - R_A|.
+
+    alpha_A = 1/(sqrt(2) rloc_A), with rloc_A taken from mol._pseudo; atoms
+    without an entry there use rloc = 1e16.  r_A = 0 is not special-cased.
+    '''
+    atm_coords = mol.atom_coords()
+    out = 0
+    for ia in range(mol.natm):
+        r = numpy.sqrt(numpy.sum((coords - atm_coords[ia])**2, axis=1))
+        Zia = mol.atom_charge(ia)
+        symb = mol.atom_symbol(ia)
+        # only rloc (entry 1 of the pseudopotential record) is needed here
+        rloc = mol._pseudo[symb][1] if symb in mol._pseudo else 1e16
+        alpha = 1.0 / (numpy.sqrt(2) * rloc)
+        out += - Zia / r * erf(alpha * r)
+    return out
+
+def _aux_e2(cell, auxcell, intor, aosym='s1', comp=1):
+    '''3-center integrals with two cell-shell indices and one auxcell-shell index.
+
+    intor (after cell._add_suffix), aosym and comp go to gto.moleintor.getints3c.
+    '''
+    intor = cell._add_suffix(intor)
+    atm, bas, env = gto.conc_env(cell._atm, cell._bas, cell._env,
+                                 cell._atm, cell._bas, cell._env)
+    ao_loc = gto.moleintor.make_loc(bas, intor)
+    aux_loc = auxcell.ao_loc_nr(auxcell.cart or 'ssc' in intor)
+    ao_loc = numpy.asarray(numpy.hstack([ao_loc, ao_loc[-1]+aux_loc[1:]]),
+                           dtype=numpy.int32)
+    atm, bas, env = gto.conc_env(atm, bas, env,
+                                 auxcell._atm, auxcell._bas, auxcell._env)
+    nbas = cell.nbas
+    shls_slice = (0, nbas, nbas, nbas*2, nbas*2, nbas*2+auxcell.nbas)
+    return gto.moleintor.getints3c(intor, atm, bas, env, shls_slice=shls_slice,
+                                   comp=comp, aosym=aosym, ao_loc=ao_loc)
+
+def get_pp_loc_part2(mol):
+    '''Sum the 3-center terms built from fake_cell_vloc(mol, cn), cn = 1..4.'''
+    # indexed by cn; the first entry is never selected by the loop below
+    intors = ('int3c2e', 'int3c1e', 'int3c1e_r2_origk',
+              'int3c1e_r4_origk', 'int3c1e_r6_origk')
+    acc = 0
+    for cn, intor in enumerate(intors[1:], start=1):
+        fakecell = pbcgto.pseudo.pp_int.fake_cell_vloc(mol, cn)
+        if fakecell.nbas > 0:
+            v = _aux_e2(mol, fakecell, intor, aosym='s2', comp=1)
+            acc += numpy.einsum('...i->...', v)
+    if numpy.isscalar(acc):  # no fake cell carried any shell
+        return acc
+    return lib.unpack_tril(acc)
+
+def get_pp_loc(mol):
+    '''Grid quadrature of get_pp_loc_part1_rs plus get_pp_loc_part2(mol).'''
+    # TODO use analytic integral
+    mesh = gen_grid.Grids(mol)
+    mesh.level = 3
+    mesh.build(with_non0tab=True)
+
+    total = 0
+    for ao, _, wts, pts in numint.NumInt().block_loop(mol, mesh):
+        v_rs = get_pp_loc_part1_rs(mol, pts)
+        total += numpy.einsum("g,g,gi,gj->ij", wts, v_rs, ao, ao)
+    total += get_pp_loc_part2(mol)
+    return total
+
+def get_pp_nl(mol):
+    '''Accumulate ilp^T hl ilp over the hl blocks returned by fake_cell_vnl(mol).'''
+    nao = mol.nao
+    fakecell, hl_blocks = pbcgto.pseudo.pp_int.fake_cell_vnl(mol)
+    half_ints = _int_vnl(mol, fakecell, hl_blocks)
+
+    vnl = numpy.zeros((nao, nao), dtype=numpy.double)
+    cursor = [0, 0, 0]  # next unread row of each half_ints entry
+    for ib, hl in enumerate(hl_blocks):
+        nd = 2 * fakecell.bas_angular(ib) + 1
+        nproj = hl.shape[0]
+        ilp = numpy.empty((nproj, nd, nao), dtype=numpy.double)
+        for i in range(nproj):
+            start = cursor[i]
+            cursor[i] = start + nd
+            ilp[i] = half_ints[i][start:cursor[i]]
+        vnl += numpy.einsum('ilp,ij,jlq->pq', ilp, hl, ilp)
+    return vnl
+
+def _int_vnl(cell, fakecell, hl_blocks):  # 2-center integrals between fakecell shells and cell shells
+    intopt = lib.c_null_ptr()  # passed as cintopt to getints2c below
+
+    def int_ket(_bas, intor):  # integrals of type intor for the fakecell shells in _bas
+        if len(_bas) == 0:
+            return []  # no shells selected for this integral type
+        intor = cell._add_suffix(intor)
+        atm, bas, env = gto.conc_env(cell._atm, cell._bas, cell._env,
+                                     fakecell._atm, _bas, fakecell._env)
+        atm = numpy.asarray(atm, dtype=numpy.int32)
+        bas = numpy.asarray(bas, dtype=numpy.int32)
+        env = numpy.asarray(env, dtype=numpy.double)
+        nbas = len(bas)
+        shls_slice = (cell.nbas, nbas, 0, cell.nbas)  # first pair: appended _bas shells; second pair: cell shells
+        ao_loc = gto.moleintor.make_loc(bas, intor)
+        ni = ao_loc[shls_slice[1]] - ao_loc[shls_slice[0]]
+        nj = ao_loc[shls_slice[3]] - ao_loc[shls_slice[2]]
+        out = numpy.empty((ni,nj), dtype=numpy.double)  # buffer handed to getints2c through out=
+        comp = 1
+        out = gto.moleintor.getints2c(intor, atm, bas, env, shls_slice=shls_slice, comp=comp, hermi=0,
+                                      ao_loc=ao_loc, cintopt=intopt, out=out)
+        return out
+
+    hl_dims = numpy.asarray([len(hl) for hl in hl_blocks])  # len(hl) of every block, used as masks below
+    out = (int_ket(fakecell._bas[hl_dims>0], 'int1e_ovlp'),
+           int_ket(fakecell._bas[hl_dims>1], 'int1e_r2_origi'),
+           int_ket(fakecell._bas[hl_dims>2], 'int1e_r4_origi'))
+    return out  # 3-tuple; an entry is [] when its mask selects no shell
+
+class AtomSCFPP(atom_hf.AtomSphAverageRHF):
+    '''AtomSphAverageRHF variant whose get_hcore uses pseudopotential terms.'''
+    def get_hcore(self, mol=None):
+        '''Return int1e_kin + get_pp_nl(mol) + get_pp_loc(mol); mol defaults to self.mol.'''
+        mol = self.mol if mol is None else mol
+        hcore = mol.intor('int1e_kin', hermi=1) + get_pp_nl(mol)
+        hcore += get_pp_loc(mol)
+        return hcore
+
+class AtomHF1ePP(rohf.HF1e, AtomSCFPP):  # HF1e combined with the pseudopotential atom class
+    eig = AtomSCFPP.eig  # pinned explicitly, whatever rohf.HF1e (first base) provides
+    get_hcore = AtomSCFPP.get_hcore  # likewise pin the pseudopotential core Hamiltonian
diff --git a/pyscf/scf/dhf.py b/pyscf/scf/dhf.py
index 32d2d0f7f2..6e29d5a450 100644
--- a/pyscf/scf/dhf.py
+++ b/pyscf/scf/dhf.py
@@ -285,14 +285,14 @@ def fproj(mo):
     return dm
 
 
-def get_init_guess(mol, key='minao'):
+def get_init_guess(mol, key='minao', **kwargs):
     '''Generate density matrix for initial guess
 
     Kwargs:
         key : str
             One of 'minao', 'atom', 'huckel', 'mod_huckel', 'hcore', '1e', 'chkfile'.
     '''
-    return UHF(mol).get_init_guess(mol, key)
+    return UHF(mol).get_init_guess(mol, key, **kwargs)
 
 def time_reversal_matrix(mol, mat):
     ''' T(A_ij) = A[T(i),T(j)]^*
diff --git a/pyscf/scf/diis.py b/pyscf/scf/diis.py
index 321f81cdfe..a442f58b9c 100644
--- a/pyscf/scf/diis.py
+++ b/pyscf/scf/diis.py
@@ -72,13 +72,13 @@ def get_num_vec(self):
 def get_err_vec_orig(s, d, f):
     '''error vector = SDF - FDS'''
     if isinstance(f, numpy.ndarray) and f.ndim == 2:
-        sdf = reduce(numpy.dot, (s,d,f))
+        sdf = reduce(lib.dot, (s,d,f))
         errvec = (sdf.conj().T - sdf).ravel()
 
     elif isinstance(f, numpy.ndarray) and f.ndim == 3 and s.ndim == 3:
         errvec = []
         for i in range(f.shape[0]):
-            sdf = reduce(numpy.dot, (s[i], d[i], f[i]))
+            sdf = reduce(lib.dot, (s[i], d[i], f[i]))
             errvec.append((sdf.conj().T - sdf).ravel())
         errvec = numpy.hstack(errvec)
 
@@ -98,7 +98,7 @@ def get_err_vec_orth(s, d, f, Corth):
         sym_forbid = orbsym[:,None] != orbsym
 
     if isinstance(f, numpy.ndarray) and f.ndim == 2:
-        sdf = reduce(numpy.dot, (Corth.conj().T, s, d, f, Corth))
+        sdf = reduce(lib.dot, (Corth.conj().T, s, d, f, Corth))
         if orbsym is not None:
             sdf[sym_forbid] = 0
         errvec = (sdf.conj().T - sdf).ravel()
@@ -106,7 +106,7 @@ def get_err_vec_orth(s, d, f, Corth):
     elif isinstance(f, numpy.ndarray) and f.ndim == 3 and s.ndim == 3:
         errvec = []
         for i in range(f.shape[0]):
-            sdf = reduce(numpy.dot, (Corth[i].conj().T, s[i], d[i], f[i], Corth[i]))
+            sdf = reduce(lib.dot, (Corth[i].conj().T, s[i], d[i], f[i], Corth[i]))
             if orbsym is not None:
                 sdf[sym_forbid] = 0
             errvec.append((sdf.conj().T - sdf).ravel())
diff --git a/pyscf/scf/hf.py b/pyscf/scf/hf.py
index b6ecb5ace0..7a8c0e8f22 100644
--- a/pyscf/scf/hf.py
+++ b/pyscf/scf/hf.py
@@ -115,8 +115,10 @@ def kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
         logger.info(mf, 'Set gradient conv threshold to %g', conv_tol_grad)
 
     mol = mf.mol
+    s1e = mf.get_ovlp(mol)
+
     if dm0 is None:
-        dm = mf.get_init_guess(mol, mf.init_guess)
+        dm = mf.get_init_guess(mol, mf.init_guess, s1e=s1e)
     else:
         dm = dm0
 
@@ -128,13 +130,6 @@ def kernel(mf, conv_tol=1e-10, conv_tol_grad=None,
     scf_conv = False
     mo_energy = mo_coeff = mo_occ = None
 
-    s1e = mf.get_ovlp(mol)
-    cond = lib.cond(s1e)
-    logger.debug(mf, 'cond(S) = %s', cond)
-    if numpy.max(cond)*1e-17 > conv_tol:
-        logger.warn(mf, 'Singularity detected in overlap matrix (condition number = %4.3g). '
-                    'SCF may be inaccurate and hard to converge.', numpy.max(cond))
-
     # Skip SCF iterations. Compute only the total energy of the initial density
     if mf.max_cycle <= 0:
         fock = mf.get_fock(h1e, s1e, vhf, dm)  # = h1e + vhf, no DIIS
@@ -722,14 +717,14 @@ def fproj(mo):
     return dm
 
 
-def get_init_guess(mol, key='minao'):
+def get_init_guess(mol, key='minao', **kwargs):
     '''Generate density matrix for initial guess
 
     Kwargs:
         key : str
             One of 'minao', 'atom', 'huckel', 'hcore', '1e', 'chkfile'.
     '''
-    return RHF(mol).get_init_guess(mol, key)
+    return RHF(mol).get_init_guess(mol, key, **kwargs)
 
 
 # eigenvalue of d is 1
@@ -752,7 +747,7 @@ def level_shift(s, d, f, factor):
     Returns:
         New Fock matrix, 2D ndarray
     '''
-    dm_vir = s - reduce(numpy.dot, (s, d, s))
+    dm_vir = s - reduce(lib.dot, (s, d, s))
     return f + dm_vir * factor
 
 
@@ -1570,6 +1565,15 @@ def __init__(self, mol):
         self._opt = {None: None}
         self._eri = None # Note: self._eri requires large amount of memory
 
+    def check_sanity(self):
+        '''Log cond(S), warn when it is large relative to conv_tol, then run the parent check.'''
+        cond = lib.cond(self.get_ovlp())
+        logger.debug(self, 'cond(S) = %s', cond)
+        if self.conv_tol < numpy.max(cond)*1e-17:
+            logger.warn(self, 'Singularity detected in overlap matrix (condition number = %4.3g). '
+                        'SCF may be inaccurate and hard to converge.', numpy.max(cond))
+        return super().check_sanity()
+
     def build(self, mol=None):
         if mol is None: mol = self.mol
         if self.verbose >= logger.WARN:
@@ -1704,7 +1708,7 @@ def from_chk(self, chkfile=None, project=None):
         return self.init_guess_by_chkfile(chkfile, project)
     from_chk.__doc__ = init_guess_by_chkfile.__doc__
 
-    def get_init_guess(self, mol=None, key='minao'):
+    def get_init_guess(self, mol=None, key='minao', **kwargs):
         if not isinstance(key, str):
             return key
 
@@ -1742,7 +1746,7 @@ def get_init_guess(self, mol=None, key='minao'):
     energy_tot = energy_tot
 
     def energy_nuc(self):
-        return self.mol.energy_nuc()
+        return self.mol.enuc
 
     # A hook for overloading convergence criteria in SCF iterations. Assigning
     # a function
@@ -2103,8 +2107,8 @@ def check_sanity(self):
                         mol.nelectron)
         return SCF.check_sanity(self)
 
-    def get_init_guess(self, mol=None, key='minao'):
-        dm = SCF.get_init_guess(self, mol, key)
+    def get_init_guess(self, mol=None, key='minao', **kwargs):
+        dm = SCF.get_init_guess(self, mol, key, **kwargs)
         if self.verbose >= logger.DEBUG1:
             s = self.get_ovlp()
             nelec = numpy.einsum('ij,ji', dm, s).real
diff --git a/pyscf/scf/uhf.py b/pyscf/scf/uhf.py
index 4f07335bd6..0afc66d0ba 100644
--- a/pyscf/scf/uhf.py
+++ b/pyscf/scf/uhf.py
@@ -130,8 +130,8 @@ def _break_dm_spin_symm(mol, dm):
             dmb[...,p0:p1,p0:p1] = dma[...,p0:p1,p0:p1]
     return dma, dmb
 
-def get_init_guess(mol, key='minao'):
-    return UHF(mol).get_init_guess(mol, key)
+def get_init_guess(mol, key='minao', **kwargs):
+    return UHF(mol).get_init_guess(mol, key, **kwargs)
 
 def make_rdm1(mo_coeff, mo_occ, **kwargs):
     '''One-particle density matrix in AO representation
@@ -830,8 +830,8 @@ def make_rdm2(self, mo_coeff=None, mo_occ=None, **kwargs):
 
     energy_elec = energy_elec
 
-    def get_init_guess(self, mol=None, key='minao'):
-        dm = hf.SCF.get_init_guess(self, mol, key)
+    def get_init_guess(self, mol=None, key='minao', **kwargs):
+        dm = hf.SCF.get_init_guess(self, mol, key, **kwargs)
         if self.verbose >= logger.DEBUG1:
             s = self.get_ovlp()
             nelec =(numpy.einsum('ij,ji', dm[0], s).real,