From 7de9150598b23f7412d27772e39a530ba1832899 Mon Sep 17 00:00:00 2001
From: Qiming Sun <osirpt.sun@gmail.com>
Date: Fri, 10 Nov 2023 20:34:23 -0800
Subject: [PATCH] 4th, and 5th order XC derivatives (#1930)

* Update conda build

* Arbitrary order to transform_xc; patch xcfun up to 5th order

* Update libxc itrf

* Configure max_deriv_order; arbitrary order for ud2ts and ts2ud

* Add 4th order XC deriv in CI job
---
 .github/workflows/ci_linux/build_pyscf.sh |   2 +-
 conda/conda_build_config.yaml             |   3 +-
 docker/Dockerfile                         |   2 +-
 pyscf/dft/libxc.py                        | 358 +++++++----
 pyscf/dft/numint.py                       |  50 +-
 pyscf/dft/numint2c.py                     |  47 +-
 pyscf/dft/test/test_xc_deriv.py           | 264 ++++++++
 pyscf/dft/test/test_xcfun.py              |  22 +-
 pyscf/dft/xc_deriv.py                     | 268 +++++---
 pyscf/dft/xcfun.py                        | 185 +++---
 pyscf/lib/CMakeLists.txt                  |  17 +-
 pyscf/lib/config.h.in                     |   2 +-
 pyscf/lib/dft/CMakeLists.txt              |   2 +-
 pyscf/lib/dft/libxc_itrf.c                | 734 +++++++++++-----------
 pyscf/lib/dft/xc_deriv.c                  | 174 +++--
 pyscf/lib/dft/xcfun_itrf.c                |  41 +-
 pyscf/lib/libxcfun.patch                  | 112 +++-
 pyscf/lib/misc.py                         |  15 +-
 18 files changed, 1440 insertions(+), 858 deletions(-)

diff --git a/.github/workflows/ci_linux/build_pyscf.sh b/.github/workflows/ci_linux/build_pyscf.sh
index e33ab6f09c..04e2aac97b 100755
--- a/.github/workflows/ci_linux/build_pyscf.sh
+++ b/.github/workflows/ci_linux/build_pyscf.sh
@@ -5,7 +5,7 @@ set -e
 cd ./pyscf/lib
 curl -L "https://github.com/pyscf/pyscf-build-deps/blob/master/pyscf-2.4a-deps.tar.gz?raw=true" | tar xzf -
 mkdir build; cd build
-cmake -DBUILD_LIBXC=OFF -DBUILD_XCFUN=OFF -DBUILD_LIBCINT=OFF ..
+cmake -DBUILD_LIBXC=OFF -DBUILD_XCFUN=OFF -DBUILD_LIBCINT=OFF -DXCFUN_MAX_ORDER=4 ..
 make -j4
 cd ..
 rm -Rf build
diff --git a/conda/conda_build_config.yaml b/conda/conda_build_config.yaml
index 8d657fe804..d7d85008cc 100644
--- a/conda/conda_build_config.yaml
+++ b/conda/conda_build_config.yaml
@@ -1,7 +1,6 @@
 python:
-  - 3.6
-  - 3.7
   - 3.8
   - 3.9
   - 3.10
   - 3.11
+  - 3.12
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 004e470b98..6246ce9ba0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.7-slim
+FROM python:3.10-slim
 
 LABEL maintainer="Qiming Sun <osirpt.sun@gmail.com>"
 
diff --git a/pyscf/dft/libxc.py b/pyscf/dft/libxc.py
index b5b4f781e2..d584278a80 100644
--- a/pyscf/dft/libxc.py
+++ b/pyscf/dft/libxc.py
@@ -30,6 +30,7 @@
 from functools import lru_cache
 from pyscf import lib
 from pyscf.dft.xc.utils import remove_dup, format_xc_code
+from pyscf.dft import xc_deriv
 from pyscf import __config__
 
 _itrf = lib.load_library('libxc_itrf')
@@ -942,6 +943,7 @@ def is_nlc(xc_code):
 def needs_laplacian(xc_code):
     return _itrf.LIBXC_needs_laplacian(xc_code) != 0
 
+@lru_cache(100)
 def max_deriv_order(xc_code):
     hyb, fn_facs = parse_xc(xc_code)
     if fn_facs:
@@ -1405,200 +1407,302 @@ def eval_xc(xc_code, rho, spin=0, relativity=0, deriv=1, omega=None, verbose=Non
 
         see also libxc_itrf.c
     '''  # noqa: E501
-    hyb, fn_facs = parse_xc(xc_code)
-    if omega is not None:
-        hyb = hyb[:2] + (float(omega),)
-    return _eval_xc(hyb, fn_facs, rho, spin, relativity, deriv, verbose)
-
-
-def _eval_xc(hyb, fn_facs, rho, spin=0, relativity=0, deriv=1, verbose=None):
-    assert (deriv <= 3)
-    if spin == 0:
-        nspin = 1
-        rho_u = rho_d = numpy.asarray(rho, order='C')
-    else:
-        nspin = 2
-        rho_u = numpy.asarray(rho[0], order='C')
-        rho_d = numpy.asarray(rho[1], order='C')
-    assert (rho_u.dtype == numpy.double)
-    assert (rho_d.dtype == numpy.double)
-
-    if rho_u.ndim == 1:
-        rho_u = rho_u.reshape(1,-1)
-        rho_d = rho_d.reshape(1,-1)
-    ngrids = rho_u.shape[1]
-
-    fn_ids = [x[0] for x in fn_facs]
-    facs   = [x[1] for x in fn_facs]
-    if hyb[2] != 0:
-        # Current implementation does not support different omegas for
-        # different RSH functionals if there are multiple RSHs
-        omega = [hyb[2]] * len(facs)
-    else:
-        omega = [0] * len(facs)
-    fn_ids_set = set(fn_ids)
-    if fn_ids_set.intersection(PROBLEMATIC_XC):
-        problem_xc = [PROBLEMATIC_XC[k]
-                      for k in fn_ids_set.intersection(PROBLEMATIC_XC)]
-        warnings.warn('Libxc functionals %s may have discrepancy to xcfun '
-                      'library.\n' % problem_xc)
-
-    if any([needs_laplacian(fid) for fid in fn_ids]):
-        raise NotImplementedError('laplacian in meta-GGA method')
-
-    n = len(fn_ids)
-    if (n == 0 or  # xc_code = '' or xc_code = 'HF', an empty functional
-        all((is_lda(x) for x in fn_ids))):
-        if spin == 0:
-            nvar = 1
-            xctype = 'R-LDA'
-        else:
-            nvar = 2
-            xctype = 'U-LDA'
-    elif any((is_meta_gga(x) for x in fn_ids)):
-        if spin == 0:
-            nvar = 4
-            xctype = 'R-MGGA'
-        else:
-            nvar = 9
-            xctype = 'U-MGGA'
-    else:  # GGA
-        if spin == 0:
-            nvar = 2
-            xctype = 'R-GGA'
-        else:
-            nvar = 5
-            xctype = 'U-GGA'
-
-    # Check that the density rho has the appropriate shape
-    # should it be >= or ==, in test test_xcfun.py, test_vs_libxc_rks
-    # the density contain 6 rows independently of the functional
-    if xctype[2:] == 'LDA':
-        for rho_ud in [rho_u, rho_d]:
-            assert rho_ud.shape[0] >= 1
-
-    elif xctype[2:] == 'GGA':
-        for rho_ud in [rho_u, rho_d]:
-            assert rho_ud.shape[0] >= 4
-
-    elif xctype[2:] == 'MGGA':
-        for rho_ud in [rho_u, rho_d]:
-            assert rho_ud.shape[0] >= 6
-    else:
-        raise ValueError("Unknown nvar {}".format(nvar))
-
-    outlen = (math.factorial(nvar+deriv) //
-              (math.factorial(nvar) * math.factorial(deriv)))
-    outbuf = numpy.zeros((outlen,ngrids))
-
-    density_threshold = 0
-    _itrf.LIBXC_eval_xc(ctypes.c_int(n),
-                        (ctypes.c_int*n)(*fn_ids),
-                        (ctypes.c_double*n)(*facs),
-                        (ctypes.c_double*n)(*omega),
-                        ctypes.c_int(nspin),
-                        ctypes.c_int(deriv), ctypes.c_int(ngrids),
-                        rho_u.ctypes.data_as(ctypes.c_void_p),
-                        rho_d.ctypes.data_as(ctypes.c_void_p),
-                        outbuf.ctypes.data_as(ctypes.c_void_p),
-                        ctypes.c_double(density_threshold))
-
+    outbuf = _eval_xc(xc_code, rho, spin, deriv, omega)
     exc = outbuf[0]
     vxc = fxc = kxc = None
-    if xctype == 'R-LDA':
+    xctype = xc_type(xc_code)
+    if xctype == 'LDA' and spin == 0:
         if deriv > 0:
             vxc = [outbuf[1]]
         if deriv > 1:
             fxc = [outbuf[2]]
         if deriv > 2:
             kxc = [outbuf[3]]
-    elif xctype == 'R-GGA':
+    elif xctype == 'GGA' and spin == 0:
         if deriv > 0:
             vxc = [outbuf[1], outbuf[2]]
         if deriv > 1:
             fxc = [outbuf[3], outbuf[4], outbuf[5]]
         if deriv > 2:
             kxc = [outbuf[6], outbuf[7], outbuf[8], outbuf[9]]
-    elif xctype == 'U-LDA':
+    elif xctype == 'LDA' and spin == 1:
         if deriv > 0:
             vxc = [outbuf[1:3].T]
         if deriv > 1:
             fxc = [outbuf[3:6].T]
         if deriv > 2:
             kxc = [outbuf[6:10].T]
-    elif xctype == 'U-GGA':
+    elif xctype == 'GGA' and spin == 1:
         if deriv > 0:
             vxc = [outbuf[1:3].T, outbuf[3:6].T]
         if deriv > 1:
             fxc = [outbuf[6:9].T, outbuf[9:15].T, outbuf[15:21].T]
         if deriv > 2:
             kxc = [outbuf[21:25].T, outbuf[25:34].T, outbuf[34:46].T, outbuf[46:56].T]
-    elif xctype == 'R-MGGA':
+    elif xctype == 'MGGA' and spin == 0:
         if deriv > 0:
-            vxc = [outbuf[1], outbuf[2], None, outbuf[4]]
+            vxc = [outbuf[1], outbuf[2], None, outbuf[3]]
         if deriv > 1:
             fxc = [
                 # v2rho2, v2rhosigma, v2sigma2,
-                outbuf[5], outbuf[6], outbuf[7],
+                outbuf[4], outbuf[5], outbuf[6],
                 # v2lapl2, v2tau2,
                 None, outbuf[9],
                 # v2rholapl, v2rhotau,
-                None, outbuf[11],
+                None, outbuf[7],
                 # v2lapltau, v2sigmalapl, v2sigmatau,
-                None, None, outbuf[14]]
+                None, None, outbuf[8]]
         if deriv > 2:
             # v3lapltau2 might not be strictly 0
             # outbuf[18] = 0
             kxc = [
                 # v3rho3, v3rho2sigma, v3rhosigma2, v3sigma3,
-                outbuf[15], outbuf[16], outbuf[17], outbuf[18],
+                outbuf[10], outbuf[11], outbuf[12], outbuf[13],
                 # v3rho2lapl, v3rho2tau,
-                None, outbuf[20],
+                None, outbuf[14],
                 # v3rhosigmalapl, v3rhosigmatau,
-                None, outbuf[22],
+                None, outbuf[15],
                 # v3rholapl2, v3rholapltau, v3rhotau2,
-                None, None, outbuf[25],
+                None, None, outbuf[16],
                 # v3sigma2lapl, v3sigma2tau,
-                None, outbuf[27],
+                None, outbuf[17],
                 # v3sigmalapl2, v3sigmalapltau, v3sigmatau2,
-                None, None, outbuf[30],
+                None, None, outbuf[18],
                 # v3lapl3, v3lapl2tau, v3lapltau2, v3tau3)
-                None, None, None, outbuf[34]]
-    elif xctype == 'U-MGGA':
+                None, None, None, outbuf[19]]
+    elif xctype == 'MGGA' and spin == 1:
         if deriv > 0:
-            vxc = [outbuf[1:3].T, outbuf[3:6].T, None, outbuf[8:10].T]
+            vxc = [outbuf[1:3].T, outbuf[3:6].T, None, outbuf[6:8].T]
         if deriv > 1:
             # v2lapltau might not be strictly 0
             # outbuf[39:43] = 0
             fxc = [
                 # v2rho2, v2rhosigma, v2sigma2,
-                outbuf[10:13].T, outbuf[13:19].T, outbuf[19:25].T,
+                outbuf[8:11].T, outbuf[11:17].T, outbuf[17:23].T,
                 # v2lapl2, v2tau2,
-                None, outbuf[28:31].T,
+                None, outbuf[33:36].T,
                 # v2rholapl, v2rhotau,
-                None, outbuf[35:39].T,
+                None, outbuf[23:27].T,
                 # v2lapltau, v2sigmalapl, v2sigmatau,
-                None, None, outbuf[49:55].T]
+                None, None, outbuf[27:33].T]
         if deriv > 2:
             # v3lapltau2 might not be strictly 0
             # outbuf[204:216] = 0
             kxc = [
                 # v3rho3, v3rho2sigma, v3rhosigma2, v3sigma3,
-                outbuf[55:59].T, outbuf[59:68].T, outbuf[68:80].T, outbuf[80:90].T,
+                outbuf[36:40].T, outbuf[40:49].T, outbuf[49:61].T, outbuf[61:71].T,
                 # v3rho2lapl, v3rho2tau,
-                None, outbuf[96:102].T,
+                None, outbuf[71:77].T,
                 # v3rhosigmalapl, v3rhosigmatau,
-                None, outbuf[114:126].T,
+                None, outbuf[77:89].T,
                 # v3rholapl2, v3rholapltau, v3rhotau2,
-                None, None, outbuf[140:146].T,
+                None, None, outbuf[89:95].T,
                 # v3sigma2lapl, v3sigma2tau,
-                None, outbuf[158:170].T,
+                None, outbuf[95:107].T,
                 # v3sigmalapl2, v3sigmalapltau, v3sigmatau2,
-                None, None, outbuf[191:200].T,
+                None, None, outbuf[107:116].T,
                 # v3lapl3, v3lapl2tau, v3lapltau2, v3tau3)
-                None, None, None, outbuf[216:220].T]
+                None, None, None, outbuf[116:120].T]
     return exc, vxc, fxc, kxc
 
+_GGA_SORT = {
+    (1, 2): numpy.array([
+        6, 7, 9, 10, 11, 8, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+    ]),
+    (1, 3): numpy.array([
+        21, 22, 25, 26, 27, 23, 28, 29, 30, 34, 35, 36, 37, 38, 39, 24, 31, 32,
+        33, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+    ]),
+    (1, 4): numpy.array([
+        56, 57, 61, 62, 63, 58, 64, 65, 66, 73, 74, 75, 76, 77, 78, 59, 67, 68,
+        69, 79, 80, 81, 82, 83, 84, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 60,
+        70, 71, 72, 85, 86, 87, 88, 89, 90, 101, 102, 103, 104, 105, 106, 107,
+        108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
+        122, 123, 124, 125,
+    ])
+}
+_MGGA_SORT = {
+    (0, 2): numpy.array([
+        0, # v2rho2
+        1, # v2rhosigma
+        3, # v2rhotau
+        2, # v2sigma2
+        4, # v2sigmatau
+        5, # v2tau2
+    ]) + 4,
+    (0, 3): numpy.array([
+        0, # v3rho3
+        1, # v3rho2sigma
+        4, # v3rho2tau
+        2, # v3rhosigma2
+        5, # v3rhosigmatau
+        6, # v3rhotau2
+        3, # v3sigma3
+        7, # v3sigma2tau
+        8, # v3sigmatau2
+        9, # v3tau3
+    ]) + 10,
+    (0, 4): numpy.array([
+        0,  # v4rho4
+        1,  # v4rho3sigma
+        5,  # v4rho3tau
+        2,  # v4rho2sigma2
+        6,  # v4rho2sigmatau
+        7,  # v4rho2tau2
+        3,  # v4rhosigma3
+        8,  # v4rhosigma2tau
+        9,  # v4rhosigmatau2
+        10, # v4rhotau3
+        4,  # v4sigma4
+        11, # v4sigma3tau
+        12, # v4sigma2tau2
+        13, # v4sigmatau3
+        14, # v4tau4
+    ]) + 20,
+    (1, 2): numpy.array([
+        8, 9, 11, 12, 13, 23, 24, 10, 14, 15, 16, 25, 26, 17, 18, 19, 27, 28,
+        20, 21, 29, 30, 22, 31, 32, 33, 34, 35,
+    ]),
+    (1, 3): numpy.array([
+        36, 37, 40, 41, 42, 71, 72, 38, 43, 44, 45, 73, 74, 49, 50, 51, 77, 78,
+        52, 53, 79, 80, 54, 81, 82, 89, 90, 91, 39, 46, 47, 48, 75, 76, 55, 56,
+        57, 83, 84, 58, 59, 85, 86, 60, 87, 88, 92, 93, 94, 61, 62, 63, 95, 96,
+        64, 65, 97, 98, 66, 99, 100, 107, 108, 109, 67, 68, 101, 102, 69, 103,
+        104, 110, 111, 112, 70, 105, 106, 113, 114, 115, 116, 117, 118, 119,
+    ]),
+    (1, 4): numpy.array([
+        120, 121, 125, 126, 127, 190, 191, 122, 128, 129, 130, 192, 193, 137,
+        138, 139, 198, 199, 140, 141, 200, 201, 142, 202, 203, 216, 217, 218,
+        123, 131, 132, 133, 194, 195, 143, 144, 145, 204, 205, 146, 147, 206,
+        207, 148, 208, 209, 219, 220, 221, 155, 156, 157, 225, 226, 158, 159,
+        227, 228, 160, 229, 230, 249, 250, 251, 161, 162, 231, 232, 163, 233,
+        234, 252, 253, 254, 164, 235, 236, 255, 256, 257, 267, 268, 269, 270,
+        124, 134, 135, 136, 196, 197, 149, 150, 151, 210, 211, 152, 153, 212,
+        213, 154, 214, 215, 222, 223, 224, 165, 166, 167, 237, 238, 168, 169,
+        239, 240, 170, 241, 242, 258, 259, 260, 171, 172, 243, 244, 173, 245,
+        246, 261, 262, 263, 174, 247, 248, 264, 265, 266, 271, 272, 273, 274,
+        175, 176, 177, 275, 276, 178, 179, 277, 278, 180, 279, 280, 295, 296,
+        297, 181, 182, 281, 282, 183, 283, 284, 298, 299, 300, 184, 285, 286,
+        301, 302, 303, 313, 314, 315, 316, 185, 186, 287, 288, 187, 289, 290,
+        304, 305, 306, 188, 291, 292, 307, 308, 309, 317, 318, 319, 320, 189,
+        293, 294, 310, 311, 312, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+    ])
+}
+
+def eval_xc1(xc_code, rho, spin=0, deriv=1, omega=None):
+    '''Similar to eval_xc.
+    Returns an array with the order of derivatives following xcfun convention.
+    '''
+    out = _eval_xc(xc_code, rho, spin, deriv=deriv, omega=omega)
+    xctype = xc_type(xc_code)
+    if deriv <= 1:
+        return out
+    elif xctype == 'LDA' or xctype == 'HF':
+        return out
+    elif xctype == 'GGA':
+        if spin == 0:
+            return out
+        else:
+            idx = [numpy.arange(6)] # up to deriv=1
+            for i in range(2, deriv+1):
+                idx.append(_GGA_SORT[(spin, i)])
+    else: # MGGA
+        if spin == 0:
+            idx = [numpy.arange(4)] # up to deriv=1
+        else:
+            idx = [numpy.arange(8)] # up to deriv=1
+        for i in range(2, deriv+1):
+            idx.append(_MGGA_SORT[(spin, i)])
+    return out[numpy.hstack(idx)]
+
+def _eval_xc(xc_code, rho, spin=0, deriv=1, omega=None):
+    assert deriv <= max_deriv_order(xc_code)
+    xctype = xc_type(xc_code)
+    assert xctype in ('HF', 'LDA', 'GGA', 'MGGA')
+
+    rho = numpy.asarray(rho, order='C', dtype=numpy.double)
+    if xctype == 'MGGA' and rho.shape[-2] == 6:
+        rho = numpy.asarray(rho[...,[0,1,2,3,5],:], order='C')
+
+    hyb, fn_facs = parse_xc(xc_code)
+    if omega is not None:
+        hyb = hyb[:2] + (float(omega),)
+
+    fn_ids = [x[0] for x in fn_facs]
+    facs   = [x[1] for x in fn_facs]
+    if hyb[2] != 0:
+        # Current implementation does not support different omegas for
+        # different RSH functionals if there are multiple RSHs
+        omega = [hyb[2]] * len(facs)
+    else:
+        omega = [0] * len(facs)
+    fn_ids_set = set(fn_ids)
+    if fn_ids_set.intersection(PROBLEMATIC_XC):
+        problem_xc = [PROBLEMATIC_XC[k]
+                      for k in fn_ids_set.intersection(PROBLEMATIC_XC)]
+        warnings.warn('Libxc functionals %s may have discrepancy to xcfun '
+                      'library.\n' % problem_xc)
+
+    if any([needs_laplacian(fid) for fid in fn_ids]):
+        raise NotImplementedError('laplacian in meta-GGA method')
+
+    nvar, xlen = xc_deriv._XC_NVAR[xctype, spin]
+    ngrids = rho.shape[-1]
+    rho = rho.reshape(spin+1,nvar,ngrids)
+    outlen = lib.comb(xlen+deriv, deriv)
+    out = numpy.zeros((outlen,ngrids))
+    n = len(fn_ids)
+    if n > 0:
+        density_threshold = 0
+        _itrf.LIBXC_eval_xc(ctypes.c_int(n),
+                            (ctypes.c_int*n)(*fn_ids),
+                            (ctypes.c_double*n)(*facs),
+                            (ctypes.c_double*n)(*omega),
+                            ctypes.c_int(spin), ctypes.c_int(deriv),
+                            ctypes.c_int(nvar), ctypes.c_int(ngrids),
+                            ctypes.c_int(outlen),
+                            rho.ctypes.data_as(ctypes.c_void_p),
+                            out.ctypes.data_as(ctypes.c_void_p),
+                            ctypes.c_double(density_threshold))
+    return out
+
+def eval_xc_eff(xc_code, rho, deriv=1, omega=None):
+    r'''Returns the derivative tensor against the density parameters
+
+    [density_a, (nabla_x)_a, (nabla_y)_a, (nabla_z)_a, tau_a]
+
+    or spin-polarized density parameters
+
+    [[density_a, (nabla_x)_a, (nabla_y)_a, (nabla_z)_a, tau_a],
+     [density_b, (nabla_x)_b, (nabla_y)_b, (nabla_z)_b, tau_b]].
+
+    It differs from the eval_xc method in the derivatives of non-local part.
+    The eval_xc method returns the XC functional derivatives to sigma
+    (|\nabla \rho|^2)
+
+    Args:
+        rho: 2-dimensional or 3-dimensional array
+            Total density or (spin-up, spin-down) densities (and their
+            derivatives if GGA or MGGA functionals) on grids
+
+    Kwargs:
+        deriv: int
+            derivative orders
+        omega: float
+            define the exponent in the attenuated Coulomb for RSH functional
+    '''
+    xctype = xc_type(xc_code)
+    rho = numpy.asarray(rho, order='C', dtype=numpy.double)
+    if xctype == 'MGGA' and rho.shape[-2] == 6:
+        rho = numpy.asarray(rho[...,[0,1,2,3,5],:], order='C')
+
+    spin_polarized = rho.ndim >= 2 and rho.shape[0] == 2
+    if spin_polarized:
+        spin = 1
+    else:
+        spin = 0
+    out = eval_xc1(xc_code, rho, spin, deriv, omega)
+    return xc_deriv.transform_xc(rho, out, xctype, spin, deriv)
 
 def define_xc_(ni, description, xctype='LDA', hyb=0, rsh=(0,0,0)):
     '''Define XC functional.  See also :func:`eval_xc` for the rules of input description.
diff --git a/pyscf/dft/numint.py b/pyscf/dft/numint.py
index 6e61dddda6..2269f94d71 100644
--- a/pyscf/dft/numint.py
+++ b/pyscf/dft/numint.py
@@ -2553,13 +2553,14 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
         ao_deriv = 2 if MGGA_DENSITY_LAPL else 1
     else:
         ao_deriv = 0
+    with_lapl = MGGA_DENSITY_LAPL
 
     if mo_coeff[0].ndim == 1:  # RKS
         nao = mo_coeff.shape[0]
         rho = []
         for ao, mask, weight, coords \
                 in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
-            rho.append(ni.eval_rho2(mol, ao, mo_coeff, mo_occ, mask, xctype))
+            rho.append(ni.eval_rho2(mol, ao, mo_coeff, mo_occ, mask, xctype, with_lapl))
         rho = numpy.hstack(rho)
         if spin == 1:  # RKS with nr_rks_fxc_st
             rho *= .5
@@ -2572,8 +2573,8 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
         rhob = []
         for ao, mask, weight, coords \
                 in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
-            rhoa.append(ni.eval_rho2(mol, ao, mo_coeff[0], mo_occ[0], mask, xctype))
-            rhob.append(ni.eval_rho2(mol, ao, mo_coeff[1], mo_occ[1], mask, xctype))
+            rhoa.append(ni.eval_rho2(mol, ao, mo_coeff[0], mo_occ[0], mask, xctype, with_lapl))
+            rhob.append(ni.eval_rho2(mol, ao, mo_coeff[1], mo_occ[1], mask, xctype, with_lapl))
         rho = (numpy.hstack(rhoa), numpy.hstack(rhob))
     vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
     return rho, vxc, fxc
@@ -2681,42 +2682,25 @@ def eval_xc_eff(self, xc_code, rho, deriv=1, omega=None, xctype=None,
         '''
         if omega is None: omega = self.omega
         if xctype is None: xctype = self._xc_type(xc_code)
-        rhop = numpy.asarray(rho)
 
-        if xctype == 'LDA':
-            spin_polarized = rhop.ndim >= 2
-        else:
-            spin_polarized = rhop.ndim == 3
+        rho = numpy.asarray(rho, order='C', dtype=numpy.double)
+        if xctype == 'MGGA' and rho.shape[-2] == 6:
+            rho = numpy.asarray(rho[...,[0,1,2,3,5],:], order='C')
 
+        spin_polarized = rho.ndim >= 2 and rho.shape[0] == 2
         if spin_polarized:
-            assert rhop.shape[0] == 2
             spin = 1
-            if rhop.shape[1] == 5:  # MGGA
-                ngrids = rhop.shape[2]
-                rhop = numpy.empty((2, 6, ngrids))
-                rhop[0,:4] = rho[0][:4]
-                rhop[1,:4] = rho[1][:4]
-                rhop[:,4] = 0
-                rhop[0,5] = rho[0][4]
-                rhop[1,5] = rho[1][4]
         else:
             spin = 0
-            if rhop.shape[0] == 5:  # MGGA
-                ngrids = rho.shape[1]
-                rhop = numpy.empty((6, ngrids))
-                rhop[:4] = rho[:4]
-                rhop[4] = 0
-                rhop[5] = rho[4]
-
-        exc, vxc, fxc, kxc = self.eval_xc(xc_code, rhop, spin, 0, deriv, omega,
-                                          verbose)
-        if deriv > 2:
-            kxc = xc_deriv.transform_kxc(rhop, fxc, kxc, xctype, spin)
-        if deriv > 1:
-            fxc = xc_deriv.transform_fxc(rhop, vxc, fxc, xctype, spin)
-        if deriv > 0:
-            vxc = xc_deriv.transform_vxc(rhop, vxc, xctype, spin)
-        return exc, vxc, fxc, kxc
+
+        out = self.libxc.eval_xc1(xc_code, rho, spin, deriv, omega)
+        evfk = [out[0]]
+        for order in range(1, deriv+1):
+            evfk.append(xc_deriv.transform_xc(rho, out, xctype, spin, order))
+        if deriv < 3:
+            # The return has at least [e, v, f, k] terms
+            evfk.extend([None] * (3 - deriv))
+        return evfk
 
     def _xc_type(self, xc_code):
         return self.libxc.xc_type(xc_code)
diff --git a/pyscf/dft/numint2c.py b/pyscf/dft/numint2c.py
index 4939542484..f34bc6b892 100644
--- a/pyscf/dft/numint2c.py
+++ b/pyscf/dft/numint2c.py
@@ -271,41 +271,29 @@ def _eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None,
     else:
         raise RuntimeError(f'Unknown collinear scheme {ni.collinear}')
 
-    if xctype == 'MGGA':
-        ngrids = rho.shape[2]
-        rhop = np.empty((2, 6, ngrids))
-        rhop[0,:4] = (t[:4] + s[:4]) * .5
-        rhop[1,:4] = (t[:4] - s[:4]) * .5
-        # Padding for laplacian
-        rhop[:,4] = 0
-        rhop[0,5] = (t[4] + s[4]) * .5
-        rhop[1,5] = (t[4] - s[4]) * .5
-    else:
-        rhop = np.stack([(t + s) * .5, (t - s) * .5])
+    rho = np.stack([(t + s) * .5, (t - s) * .5])
+    if xctype == 'MGGA' and rho.shape[1] == 6:
+        rho = np.asarray(rho[:,[0,1,2,3,5],:], order='C')
 
     spin = 1
-    exc, vxc, fxc, kxc = ni.eval_xc(xc_code, rhop, spin, 0, deriv, omega,
-                                    verbose)
 
-    if deriv > 2:
-        kxc = xc_deriv.transform_kxc(rhop, fxc, kxc, xctype, spin)
-    if deriv > 1:
-        fxc = xc_deriv.transform_fxc(rhop, vxc, fxc, xctype, spin)
-    if deriv > 0:
-        vxc = xc_deriv.transform_vxc(rhop, vxc, xctype, spin)
-    return exc, vxc, fxc, kxc
+    out = ni.libxc.eval_xc1(xc_code, rho, spin, deriv, omega)
+    evfk = [out[0]]
+    for order in range(1, deriv+1):
+        evfk.append(xc_deriv.transform_xc(rho, out, xctype, spin, order))
+    if deriv < 3:
+        # The return has at least [e, v, f, k] terms
+        evfk.extend([None] * (3 - deriv))
+    return evfk
 
 # * Mcfun requires functional derivatives to total-density and spin-density.
 # * Make it a global function than a closure so as to be callable by multiprocessing
 def __mcfun_fn_eval_xc(ni, xc_code, xctype, rho, deriv):
-    exc, vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype)
-    if deriv > 0:
-        vxc = xc_deriv.ud2ts(vxc)
-    if deriv > 1:
-        fxc = xc_deriv.ud2ts(fxc)
-    if deriv > 2:
-        kxc = xc_deriv.ud2ts(kxc)
-    return exc, vxc, fxc, kxc
+    evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype)
+    for order in range(1, deriv+1):
+        if evfk[order] is not None:
+            evfk[order] = xc_deriv.ud2ts(evfk[order])
+    return evfk
 
 def mcfun_eval_xc_adapter(ni, xc_code):
     '''Wrapper to generate the eval_xc function required by mcfun'''
@@ -566,9 +554,9 @@ def cache_xc_kernel(self, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
             ao_deriv = 0
         n2c = mo_coeff.shape[0]
         nao = n2c // 2
+        with_lapl = numint.MGGA_DENSITY_LAPL
 
         if self.collinear[0] in ('m', 'n'):  # mcol or ncol
-            with_lapl = False
             rho = []
             for ao, mask, weight, coords \
                     in self.block_loop(mol, grids, nao, ao_deriv, max_memory):
@@ -587,7 +575,6 @@ def cache_xc_kernel(self, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
             dm_a = dm[:nao,:nao].real.copy('C')
             dm_b = dm[nao:,nao:].real.copy('C')
             ni = self._to_numint1c()
-            with_lapl = True
             hermi = 1
             rhoa = []
             rhob = []
diff --git a/pyscf/dft/test/test_xc_deriv.py b/pyscf/dft/test/test_xc_deriv.py
index b3e293f63c..03655f45e1 100644
--- a/pyscf/dft/test/test_xc_deriv.py
+++ b/pyscf/dft/test/test_xc_deriv.py
@@ -117,6 +117,59 @@ def v5to6(v5):
         v6[:,[0,1,2,3,5]] = v5
     return v6
 
+def eval_xc_eff(xc_code, rho, deriv, mod):
+    xctype = mod.xc_type(xc_code)
+    rhop = np.asarray(rho)
+
+    if xctype == 'LDA':
+        spin_polarized = rhop.ndim >= 2
+    else:
+        spin_polarized = rhop.ndim == 3
+
+    if spin_polarized:
+        assert rhop.shape[0] == 2
+        spin = 1
+        if rhop.ndim == 3 and rhop.shape[1] == 5:  # MGGA
+            ngrids = rhop.shape[2]
+            rhop = np.empty((2, 6, ngrids))
+            rhop[0,:4] = rho[0][:4]
+            rhop[1,:4] = rho[1][:4]
+            rhop[:,4] = 0
+            rhop[0,5] = rho[0][4]
+            rhop[1,5] = rho[1][4]
+    else:
+        spin = 0
+        if rhop.ndim == 2 and rhop.shape[0] == 5:  # MGGA
+            ngrids = rho.shape[1]
+            rhop = np.empty((6, ngrids))
+            rhop[:4] = rho[:4]
+            rhop[4] = 0
+            rhop[5] = rho[4]
+
+    exc, vxc, fxc, kxc = mod.eval_xc(xc_code, rhop, spin, 0, deriv)
+    if deriv > 2:
+        kxc = xc_deriv.transform_kxc(rhop, fxc, kxc, xctype, spin)
+    if deriv > 1:
+        fxc = xc_deriv.transform_fxc(rhop, vxc, fxc, xctype, spin)
+    if deriv > 0:
+        vxc = xc_deriv.transform_vxc(rhop, vxc, xctype, spin)
+    return exc, vxc, fxc, kxc
+
+def setUpModule():
+    global rho
+    rho = np.array(
+        [[[ 0.17283732, 0.17272921, 0.17244017, 0.17181541, 0.17062690],
+          [-0.01025988,-0.02423402,-0.04315779,-0.06753381,-0.09742367],
+          [ 0.00219947, 0.00222727, 0.00226589, 0.00231774, 0.00238570],
+          [ 0.00151577, 0.00153381, 0.00155893, 0.00159277, 0.00163734],
+          [ 0.00323925, 0.00386831, 0.00520072, 0.00774571, 0.01218266]],
+         [[ 0.17443331, 0.17436845, 0.17413969, 0.17359613, 0.17251427],
+          [-0.00341093,-0.01727580,-0.03605226,-0.06023877,-0.08989467],
+          [ 0.00357202, 0.00361952, 0.00368537, 0.00377355, 0.00388873],
+          [ 0.00233614, 0.00236578, 0.00240683, 0.00246173, 0.00253334],
+          [ 0.00473343, 0.00533707, 0.00663345, 0.00912920, 0.01350123],]]
+    )
+
 class KnownValues(unittest.TestCase):
     def test_gga_deriv1(self):
         ng = 7
@@ -267,6 +320,217 @@ def test_ud2ts(self):
         self.assertAlmostEqual(abs(xc_deriv.ud2ts(f_ud) - f_ts).max(), 0, 12)
         self.assertAlmostEqual(abs(xc_deriv.ud2ts(k_ud) - k_ts).max(), 0, 12)
 
+    def test_libxc_lda_deriv3(self):
+        rho1 = rho[:,0].copy()
+        ref = eval_xc_eff('LDA,', rho1, 3, dft.libxc)
+
+        xc1 = dft.libxc.eval_xc_eff('LDA,', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 51.36053114469969, 9)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('LDA,', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -13.323225829690143, 9)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('LDA,', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), -6.912554696220437, 9)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+        rho1 = rho[1,0].copy()
+        ref = eval_xc_eff('LDA,', rho1, 3, dft.libxc)
+
+        xc1 = dft.libxc.eval_xc_eff('LDA,', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 20.21333987261437, 9)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('LDA,', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -5.269784014086463, 9)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('LDA,', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), -2.7477984980958627, 9)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+    def test_libxc_gga_deriv3(self):
+        rho1 = rho[:,:4].copy()
+        ref = eval_xc_eff('PBE', rho1, 3, dft.libxc)
+
+        xc1 = dft.libxc.eval_xc_eff('PBE', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 61.29042037001073, 3)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('PBE', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -13.896034377219816, 4)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('PBE', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), -7.616226587554259, 6)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+        rho1 = rho[1,:4].copy()
+        ni = numint.NumInt()
+        ref = eval_xc_eff('PBE', rho1, 3, dft.libxc)
+
+        xc1 = dft.libxc.eval_xc_eff('PBE', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 26.08081046374974, 3)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('PBE', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -5.559303849017572, 4)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('PBE', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), -3.0715856471099032, 6)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+    def test_libxc_mgga_deriv3(self):
+        rho1 = rho
+        ref = eval_xc_eff('M06', rho1, 3, dft.libxc)
+
+        xc1 = dft.libxc.eval_xc_eff('M06', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 3461867.985594323, 1)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('M06', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -19196.865088253828, 3)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('M06', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), 90.99262909378264, 6)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+        rho1 = rho[1]
+        ni = numint.NumInt()
+        ref = eval_xc_eff('M06', rho1, 3, dft.libxc)
+
+        xc1 = dft.libxc.eval_xc_eff('M06', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 2506574.915698602, 1)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('M06', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -9308.64852580393, 3)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.libxc.eval_xc_eff('M06', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), 19.977512805950784, 7)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+    @unittest.skip(dft.libxc.max_deriv_order('pbe,') <= 3)
+    def test_libxc_gga_deriv4(self):
+        rho1 = rho[:,:4].copy()
+        xc1 = dft.libxc.eval_xc_eff('PBE', rho1, deriv=4)
+        self.assertAlmostEqual(xc1.sum(), -920.135878252819, 4)
+
+        rho1 = rho[1,:4].copy()
+        xc1 = dft.libxc.eval_xc_eff('PBE', rho1, deriv=4)
+        self.assertAlmostEqual(xc1.sum(), -869.6617638095072, 4)
+
+    @unittest.skip(not hasattr(dft, 'xcfun'))
+    def test_xcfun_lda_deriv3(self):
+        rho1 = rho[:,0].copy()
+        ref = eval_xc_eff('LDA,', rho1, 3, dft.xcfun)
+
+        xc1 = dft.xcfun.eval_xc_eff('LDA,', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 51.36053114469969, 9)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('LDA,', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -13.323225829690143, 9)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('LDA,', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), -6.912554696220437, 9)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+        rho1 = rho[1,0].copy()
+        ref = eval_xc_eff('LDA,', rho1, 3, dft.xcfun)
+
+        xc1 = dft.xcfun.eval_xc_eff('LDA,', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 20.21333987261437, 9)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('LDA,', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -5.269784014086463, 9)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('LDA,', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), -2.7477984980958627, 9)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+    @unittest.skip(not hasattr(dft, 'xcfun'))
+    def test_xcfun_gga_deriv3(self):
+        rho1 = rho[:,:4].copy()
+        ref = eval_xc_eff('PBE', rho1, 3, dft.xcfun)
+
+        xc1 = dft.xcfun.eval_xc_eff('PBE', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 61.29042037001073, 9)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('PBE', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -13.896034377219816, 9)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('PBE', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), -7.616226587554259, 9)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+        rho1 = rho[1,:4].copy()
+        ref = eval_xc_eff('PBE', rho1, 3, dft.xcfun)
+
+        xc1 = dft.xcfun.eval_xc_eff('PBE', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 26.08081046374974, 9)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('PBE', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -5.559303849017572, 9)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('PBE', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), -3.0715856471099032, 9)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+    @unittest.skip(not hasattr(dft, 'xcfun'))
+    def test_xcfun_mgga_deriv3(self):
+        rho1 = rho
+        ref = eval_xc_eff('M06', rho1, 3, dft.xcfun)
+
+        xc1 = dft.xcfun.eval_xc_eff('M06', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 3461867.985594323, 5)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('M06', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -19196.865088253828, 5)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('M06', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), 90.99262909378264, 9)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+        rho1 = rho[1]
+        ref = eval_xc_eff('M06', rho1, 3, dft.xcfun)
+
+        xc1 = dft.xcfun.eval_xc_eff('M06', rho1, deriv=3)
+        self.assertAlmostEqual(xc1.sum(), 2506574.915698602, 5)
+        self.assertAlmostEqual(abs(ref[3] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('M06', rho1, deriv=2)
+        self.assertAlmostEqual(xc1.sum(), -9308.64852580393, 5)
+        self.assertAlmostEqual(abs(ref[2] - xc1).max(), 0, 9)
+
+        xc1 = dft.xcfun.eval_xc_eff('M06', rho1, deriv=1)
+        self.assertAlmostEqual(xc1.sum(), 19.977512805950784, 9)
+        self.assertAlmostEqual(abs(ref[1] - xc1).max(), 0, 9)
+
+    @unittest.skip(not (hasattr(dft, 'xcfun') and dft.xcfun.MAX_DERIV_ORDER > 3))
+    def test_xcfun_gga_deriv4(self):
+        rho1 = rho[:,:4].copy()
+        xc1 = dft.xcfun.eval_xc_eff('PBE', rho1, deriv=4)
+        self.assertAlmostEqual(xc1.sum(), -920.135878252819, 9)
+
+        rho1 = rho[1,:4].copy()
+        xc1 = dft.xcfun.eval_xc_eff('PBE', rho1, deriv=4)
+        self.assertAlmostEqual(xc1.sum(), -869.6617638095072, 9)
+
 if __name__ == "__main__":
     print("Test xc_deriv")
     unittest.main()
diff --git a/pyscf/dft/test/test_xcfun.py b/pyscf/dft/test/test_xcfun.py
index 70bf9e7da1..c2c860f221 100644
--- a/pyscf/dft/test/test_xcfun.py
+++ b/pyscf/dft/test/test_xcfun.py
@@ -244,8 +244,15 @@ def test_vs_libxc_rks(self):
         rho = dft.numint.eval_rho(mol, ao, dm, xctype='MGGA', with_lapl=True)
         rhoa = rho[:,:200]
         def check(xc_code, deriv=3, e_place=9, v_place=8, f_place=6, k_place=4):
-            exc0, vxc0, fxc0, kxc0 = dft.libxc.eval_xc(xc_code, rhoa, 0, deriv=deriv)
-            exc1, vxc1, fxc1, kxc1 = dft.xcfun.eval_xc(xc_code, rhoa, 0, deriv=deriv)
+            xctype = dft.libxc.xc_type(xc_code)
+            if xctype == 'LDA':
+                nv = 1
+            elif xctype == 'GGA':
+                nv = 4
+            else:
+                nv = 6
+            exc0, vxc0, fxc0, kxc0 = dft.libxc.eval_xc(xc_code, rhoa[:nv], 0, deriv=deriv)
+            exc1, vxc1, fxc1, kxc1 = dft.xcfun.eval_xc(xc_code, rhoa[:nv], 0, deriv=deriv)
             self.assertAlmostEqual(abs(exc0-exc1).max(), 0, e_place)
             if deriv > 0:
                 for v0, v1 in zip(vxc0, vxc1):
@@ -338,8 +345,15 @@ def test_vs_libxc_uks(self):
         rhoa = rho[:,:200]
         rhob = rhoa + rho[:,200:400]
         def check(xc_code, deriv=3, e_place=9, v_place=8, f_place=6, k_place=4):
-            exc0, vxc0, fxc0, kxc0 = dft.libxc.eval_xc(xc_code, (rhoa, rhob), 1, deriv=deriv)
-            exc1, vxc1, fxc1, kxc1 = dft.xcfun.eval_xc(xc_code, (rhoa, rhob), 1, deriv=deriv)
+            xctype = dft.libxc.xc_type(xc_code)
+            if xctype == 'LDA':
+                nv = 1
+            elif xctype == 'GGA':
+                nv = 4
+            else:
+                nv = 6
+            exc0, vxc0, fxc0, kxc0 = dft.libxc.eval_xc(xc_code, (rhoa[:nv], rhob[:nv]), 1, deriv=deriv)
+            exc1, vxc1, fxc1, kxc1 = dft.xcfun.eval_xc(xc_code, (rhoa[:nv], rhob[:nv]), 1, deriv=deriv)
             self.assertAlmostEqual(abs(exc0-exc1).max(), 0, e_place)
             if deriv > 0:
                 for v0, v1 in zip(vxc0, vxc1):
diff --git a/pyscf/dft/xc_deriv.py b/pyscf/dft/xc_deriv.py
index 408a0c25ca..59d4c4d905 100644
--- a/pyscf/dft/xc_deriv.py
+++ b/pyscf/dft/xc_deriv.py
@@ -19,6 +19,9 @@
 Transform XC functional derivatives between different representations
 '''
 
+import math
+import itertools
+from functools import lru_cache
 import ctypes
 import numpy as np
 from pyscf import lib
@@ -449,100 +452,201 @@ def _stack_fggg(fggg, axis=0, rho=None):
     fggg = _stack_fg(fggg, axis=axis+1, rho=rho)
     return _stack_fg(fggg, axis=axis, rho=rho)
 
-def compress(vp, spin=0):
-    if spin != 0:  # spin polarized
-        shape = vp.shape
-        comp_shape = [shape[i*2]*shape[i*2+1] for i in range(0, vp.ndim-1, 2)]
-        vp = vp.reshape(comp_shape + [shape[-1]])
-
-    order = vp.ndim - 1
-    if order < 2:
-        pass
-    elif order == 2:  # 2nd derivatives
-        vp = vp[np.tril_indices(vp.shape[0])]
-    elif order == 3:  # 2nd derivatives
-        nd = vp.shape[0]
-        vp = np.vstack([vp[i][np.tril_indices(i+1)] for i in range(nd)])
-    else:
-        raise NotImplementedError('High order derivatives')
-    return vp
+@lru_cache(100)
+def _product_uniq_indices(nvars, order):
+    '''
+    Indexing the symmetry unique elements in cartensian product
+    '''
+    # n = 0
+    # for i range(nvars):
+    #    for j in range(i, nvars):
+    #        for k in range(k, nvars):
+    #            ...
+    #            prod[(i,j,k,...)] = n
+    #            n += 1
+    prod = np.ones([nvars] * order, dtype=int)
+    uniq_idx = list(itertools.combinations_with_replacement(range(nvars), order))
+    prod[tuple(np.array(uniq_idx).T)] = np.arange(len(uniq_idx))
+
+    idx = np.where(prod)
+    sidx = np.sort(idx, axis=0)
+    prod[idx] = prod[tuple(sidx)]
+    return prod
+
+def _pair_combinations(lst):
+    n = len(lst)
+    if n <= 2:
+        return [[tuple(lst)]]
+    a = lst[0]
+    results = []
+    for i in range(1, n):
+        pair = (a, lst[i])
+        rests = _pair_combinations(lst[1:i]+lst[i+1:])
+        for rest in rests:
+            rest.append(pair)
+        results.extend(rests)
+    return results
+
+def _unfold_gga(rho, xc_val, spin, order, nvar, xlen, reserve=0):
+    assert nvar >= 4
+    ngrids = rho.shape[-1]
+    n_transform = order - reserve
 
-def decompress(vp, spin=0):
-    order = vp.ndim - 1
-    ngrids = vp.shape[-1]
-    if order < 2:
-        out = vp
-    elif order == 2:  # 2nd derivatives
-        nd = vp.shape[0]
-        out = np.empty((nd, nd, ngrids))
-        idx, idy = np.tril_indices(nd)
-        out[idx,idy] = vp
-        out[idy,idx] = vp
-    elif order == 3:  # 2nd derivatives
-        nd = vp.shape[0]
-        out = np.empty((nd, nd, nd, ngrids))
-        p1 = 0
-        for i in range(nd):
-            idx, idy = np.tril_indices(i+1)
-            p0, p1 = p1, p1+idx.size
-            out[i,idx,idy] = vp[p0:p1]
-            out[i,idy,idx] = vp[p0:p1]
-            out[idx,i,idy] = vp[p0:p1]
-            out[idy,i,idx] = vp[p0:p1]
-            out[idx,idy,i] = vp[p0:p1]
-            out[idy,idx,i] = vp[p0:p1]
+    if spin == 0:
+        nvar2 = nvar
+        drv = libdft.VXCunfold_sigma_spin0
+    else:
+        nvar2 = nvar * 2
+        drv = libdft.VXCunfold_sigma_spin1
+
+    # xc_val[idx] expands unique xc elements to n-order tensors
+    idx = _product_uniq_indices(xlen, order)
+    xc_tensor = np.empty((xlen**reserve * nvar2**n_transform, ngrids))
+    xc_tensor[:idx.size] = xc_val[idx.ravel()]
+    buf = np.empty_like(xc_tensor)
+    for i in range(n_transform):
+        xc_tensor, buf = buf, xc_tensor
+        ncounts = xlen**(order-1-i) * nvar2**i
+        drv(xc_tensor.ctypes, buf.ctypes, rho.ctypes,
+            ctypes.c_int(ncounts), ctypes.c_int(nvar), ctypes.c_int(ngrids))
+    if spin == 0:
+        xc_tensor = xc_tensor.reshape([xlen]*reserve + [nvar]*n_transform + [ngrids])
     else:
-        raise NotImplementedError('High order derivatives')
+        xc_tensor = xc_tensor.reshape([xlen]*reserve + [2,nvar]*n_transform + [ngrids])
+    return xc_tensor
+
+def _diagonal_indices(idx, order):
+    assert order > 0
+    n = len(idx)
+    diag_idx = [np.asarray(idx)]
+    for i in range(1, order):
+        last_dim = diag_idx[-1]
+        diag_idx = [np.repeat(idx, n) for x in diag_idx]
+        diag_idx.append(np.tile(last_dim, n))
+    # repeat(diag_idx, 2)
+    return tuple(x for x in diag_idx for i in range(2))
+
+_XC_NVAR = {
+    ('HF', 0): (1, 1),
+    ('HF', 1): (1, 2),
+    ('LDA', 0): (1, 1),
+    ('LDA', 1): (1, 2),
+    ('GGA', 0): (4, 2),
+    ('GGA', 1): (4, 5),
+    ('MGGA', 0): (5, 3),
+    ('MGGA', 1): (5, 7),
+}
+
+def transform_xc(rho, xc_val, xctype, spin, order):
+    '''General transformation to construct XC derivative tensor'''
+    xc_val = np.asarray(xc_val, order='C')
+    if order == 0:
+        return xc_val[0]
+
+    nvar, xlen = _XC_NVAR[xctype, spin]
+    ngrids = xc_val.shape[-1]
+    offsets = [0] + [count_combinations(xlen, o) for o in range(order+1)]
+    p0, p1 = offsets[order:order+2]
+    if xctype == 'LDA' or xctype == 'HF':
+        xc_out = xc_val[p0:p1]
+        if spin == 0:
+            return xc_out.reshape([1]*order + [ngrids])
+        else:
+            idx = _product_uniq_indices(xlen, order)
+            return xc_out[idx].reshape([2,1]*order + [ngrids])
 
-    if spin != 0:  # spin polarized
-        shape = out.shape
-        nvar = shape[0] // 2
-        decomp_shape = [(2, nvar)] * order + [ngrids]
-        # reshape to (2,n,2,n,...,ngrids)
-        out = out.reshape(np.hstack(decomp_shape))
-    return out
+    rho = np.asarray(rho, order='C')
+    assert rho.size == (spin+1) * nvar * ngrids
+    xc_tensor = _unfold_gga(rho, xc_val[p0:p1], spin, order, nvar, xlen)
+    nabla_idx = np.arange(1, 4)
+    if spin == 0:
+        for n_pairs in range(1, order//2+1):
+            p0, p1 = offsets[order-n_pairs:order-n_pairs+2]
+            xc_sub = _unfold_gga(rho, xc_val[p0:p1], spin, order-n_pairs,
+                                 nvar, xlen, n_pairs)
+            # Just the sigma components
+            xc_sub = xc_sub[(1,)*n_pairs] * 2**n_pairs
+
+            # low_sigmas indicates the index for the extra sigma terms
+            low_sigmas = itertools.combinations(range(order), n_pairs*2)
+            pair_combs = [list(itertools.chain(*p[::-1]))
+                          for p in _pair_combinations(list(range(n_pairs*2)))]
+            diag_idx = _diagonal_indices(nabla_idx, n_pairs)
+
+            for dim_lst in low_sigmas:
+                rest_dims = [i for i in range(xc_tensor.ndim) if i not in dim_lst]
+                for pair_comb in pair_combs:
+                    xc_tensor_1 = xc_tensor.transpose(
+                        [dim_lst[i] for i in pair_comb] + rest_dims)
+                    xc_tensor_1[diag_idx] += xc_sub
+    else:
+        for n_pairs in range(1, order//2+1):
+            p0, p1 = offsets[order-n_pairs:order-n_pairs+2]
+            xc_sub = _unfold_gga(rho, xc_val[p0:p1], spin, order-n_pairs,
+                                 nvar, xlen, n_pairs)
+            # Just the sigma components
+            xc_sub = xc_sub[(slice(2,5),) * n_pairs]
+            for i in range(n_pairs):
+                xc_sub[(slice(None),)*i+(0,)] *= 2
+                xc_sub[(slice(None),)*i+(2,)] *= 2
+            sigma_idx = _product_uniq_indices(2, n_pairs*2)
+            xc_sub = xc_sub.reshape((3**n_pairs,) + xc_sub.shape[n_pairs:])
+            xc_sub = xc_sub[sigma_idx]
+
+            low_sigmas = itertools.combinations(range(order), n_pairs*2)
+            pair_combs = [list(itertools.chain(*p[::-1]))
+                          for p in _pair_combinations(list(range(n_pairs*2)))]
+            diag_idx = _diagonal_indices(nabla_idx, n_pairs)
+
+            for dim_lst in low_sigmas:
+                dim_lst2 = np.array(dim_lst)*2 + np.array([1, 0])[:,None]
+                rest_dims = [i for i in range(xc_tensor.ndim) if i not in dim_lst2]
+                for pair_comb in pair_combs:
+                    leading_dims = list(dim_lst2[:,pair_comb].ravel())
+                    xc_tensor_1 = xc_tensor.transpose(leading_dims + rest_dims)
+                    xc_tensor_1[diag_idx] += xc_sub
+    return xc_tensor
+
+def count_combinations(nvar, order):
+    '''sum(len(combinations_with_replacement(range(nvar), o) for o in range(order))'''
+    return lib.comb(nvar+order, order)
 
 def ud2ts(v_ud):
     '''XC derivatives spin-up/spin-down representations to
     total-density/spin-density representations'''
     v_ud = np.asarray(v_ud, order='C')
-    nvar, ngrids = v_ud.shape[-2:]
-    #:c = np.array([[.5,  .5],    # vrho = (va + vb) / 2
-    #:              [.5, -.5]])   # vs   = (va - vb) / 2
-    if v_ud.ndim == 3:  # vxc
-        #:v_ts = np.einsum('ra,axg->rxg', c, v_ud)
-        drv = libdft.VXCud2ts_deriv1
-    elif v_ud.ndim == 5:  # fxc
-        #:v_ts = np.einsum('ra,axbyg->rxbyg', c, v_ud)
-        #:v_ts = np.einsum('sb,rxbyg->rxsyg', c, v_ts)
-        drv = libdft.VXCud2ts_deriv2
-    elif v_ud.ndim == 7:  # kxc
-        #:v_ts = np.einsum('ra,axbyczg->rxbyczg', c, v_ud)
-        #:v_ts = np.einsum('sb,rxbyczg->rxsyczg', c, v_ts)
-        #:v_ts = np.einsum('tc,rxsyczg->rxsytzg', c, v_ts)
-        drv = libdft.VXCud2ts_deriv3
-    else:
-        raise NotImplementedError(f'Shape {v_ud.shape} not supported')
     v_ts = np.empty_like(v_ud)
-    drv(v_ts.ctypes.data_as(ctypes.c_void_p),
-        v_ud.ctypes.data_as(ctypes.c_void_p),
-        ctypes.c_int(nvar), ctypes.c_int(ngrids))
+    nvar, ngrids = v_ts.shape[-2:]
+    nvar2 = nvar * 2
+    order = v_ud.ndim // 2
+    drv = libdft.VXCud2ts
+    v_ts, v_ud = v_ud, v_ts
+    for i in range(order):
+        v_ts, v_ud = v_ud, v_ts
+        n = nvar2**i
+        nvg = nvar * nvar2**(order-1-i) * ngrids
+        #:c = np.array([[.5,  .5],    # vrho = (va + vb) / 2
+        #:              [.5, -.5]])   # vs   = (va - vb) / 2
+        #:v_ts = np.einsum('ra,...ax...g->...rx...g', c, v_ud)
+        drv(v_ts.ctypes, v_ud.ctypes, ctypes.c_int(n), ctypes.c_int(nvg))
     return v_ts
 
 def ts2ud(v_ts):
     '''XC derivatives total-density/spin-density representations to
     spin-up/spin-down representations'''
-    c = np.array([[1,  1],    # va = vrho + vs
-                  [1, -1]])   # vb = vrho - vs
-    if v_ts.ndim == 3:  # vxc
-        v_ud = np.einsum('ra,axg->rxg', c, v_ts)
-    elif v_ts.ndim == 5:  # fxc
-        v_ud = np.einsum('ra,axbyg->rxbyg', c, v_ts)
-        v_ud = np.einsum('sb,rxbyg->rxsyg', c, v_ud)
-    elif v_ts.ndim == 7:  # kxc
-        v_ud = np.einsum('ra,axbyczg->rxbyczg', c, v_ts)
-        v_ud = np.einsum('sb,rxbyczg->rxsyczg', c, v_ud)
-        v_ud = np.einsum('tc,rxsyczg->rxsytzg', c, v_ud)
-    else:
-        raise NotImplementedError(f'Shape {v_ts.shape} not supported')
+    v_ts = np.asarray(v_ts, order='C')
+    v_ud = np.empty_like(v_ts)
+    nvar, ngrids = v_ts.shape[-2:]
+    nvar2 = nvar * 2
+    order = v_ud.ndim // 2
+    drv = libdft.VXCts2ud
+    v_ts, v_ud = v_ud, v_ts
+    for i in range(order):
+        v_ts, v_ud = v_ud, v_ts
+        n = nvar2**i
+        nvg = nvar * nvar2**(order-1-i) * ngrids
+        #:c = np.array([[1,  1],    # va = vrho + vs
+        #:              [1, -1]])   # vb = vrho - vs
+        #:v_ud = np.einsum('ra,...ax...g->...rx...g', c, v_ts)
+        drv(v_ud.ctypes, v_ts.ctypes, ctypes.c_int(n), ctypes.c_int(nvg))
     return v_ud
diff --git a/pyscf/dft/xcfun.py b/pyscf/dft/xcfun.py
index 20942094bf..8fd498b338 100644
--- a/pyscf/dft/xcfun.py
+++ b/pyscf/dft/xcfun.py
@@ -28,12 +28,14 @@
 import numpy
 from pyscf import lib
 from pyscf.dft.xc.utils import remove_dup, format_xc_code
+from pyscf.dft import xc_deriv
 from pyscf import __config__
 
 _itrf = lib.load_library('libxcfun_itrf')
 
 _itrf.xcfun_splash.restype = ctypes.c_char_p
 _itrf.xcfun_version.restype = ctypes.c_char_p
+_itrf.XCFUN_eval_xc.restype = ctypes.c_int
 
 __version__ = _itrf.xcfun_version().decode("UTF-8")
 __reference__ = _itrf.xcfun_splash().decode("UTF-8")
@@ -246,11 +248,11 @@
                42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 66, 70, 75])
 '''
 HYB_XC = {'PBE0'    , 'PBE1PBE' , 'B3PW91'  , 'B3P86'   , 'B3LYP'   ,
-              'B3PW91G' , 'B3P86G'  , 'B3LYPG'  , 'O3LYP'   , 'CAMB3LYP',
-              'B97XC'   , 'B97_1XC' , 'B97_2XC' , 'M05XC'   , 'TPSSH'   ,
-              'HFLYP'}
+          'B3PW91G' , 'B3P86G'  , 'B3LYPG'  , 'O3LYP'   , 'CAMB3LYP',
+          'B97XC'   , 'B97_1XC' , 'B97_2XC' , 'M05XC'   , 'TPSSH'   ,
+          'HFLYP'}
 RSH_XC = {'CAMB3LYP'}
-MAX_DERIV_ORDER = 3
+MAX_DERIV_ORDER = ctypes.c_int.in_dll(_itrf, 'XCFUN_max_deriv_order').value
 
 VV10_XC = {
     'B97M_V'    : (6.0, 0.01),
@@ -509,18 +511,6 @@ def parse_token(token, suffix, search_xc_alias=False):
                    'M06-2X' : 'M062X',
                    'E-'     : 'E_'} # For scientific notation
 
-
-def eval_xc(xc_code, rho, spin=0, relativity=0, deriv=1, omega=None, verbose=None):
-    r'''Interface to call xcfun library to evaluate XC functional, potential
-    and functional derivatives.
-
-    See also :func:`pyscf.dft.libxc.eval_xc`
-    '''
-    hyb, fn_facs = parse_xc(xc_code)
-    if omega is not None:
-        hyb = hyb[:2] + (float(omega),)
-    return _eval_xc(hyb, fn_facs, rho, spin, relativity, deriv, verbose)
-
 XC_D0 = 0
 XC_D1 = 1
 XC_D2 = 2
@@ -827,93 +817,38 @@ def eval_xc(xc_code, rho, spin=0, relativity=0, deriv=1, omega=None, verbose=Non
 XC_D0000012 = 118
 XC_D0000003 = 119
 
-def _eval_xc(hyb, fn_facs, rho, spin=0, relativity=0, deriv=1, verbose=None):
-    assert (deriv < 4)
-    if spin == 0:
-        rho_u = rho_d = numpy.asarray(rho, order='C')
-    else:
-        rho_u = numpy.asarray(rho[0], order='C')
-        rho_d = numpy.asarray(rho[1], order='C')
-    assert (rho_u.dtype == numpy.double)
-    assert (rho_d.dtype == numpy.double)
-
-    if rho_u.ndim == 1:
-        rho_u = rho_u.reshape(1,-1)
-        rho_d = rho_d.reshape(1,-1)
-    ngrids = rho_u.shape[1]
-
-    fn_ids = [x[0] for x in fn_facs]
-    facs   = [x[1] for x in fn_facs]
-    if hyb[2] != 0:
-        # Current implementation does not support different omegas for
-        # different functionals
-        omega = [hyb[2]] * len(facs)
-    else:
-        omega = [0] * len(facs)
-
-    n = len(fn_ids)
-    if (n == 0 or  # xc_code = '' or xc_code = 'HF', an empty functional
-        all((is_lda(x) for x in fn_ids))):  # LDA
-        if spin == 0:
-            nvar = 1
-            xctype = 'R-LDA'
-        else:
-            nvar = 2
-            xctype = 'U-LDA'
-    elif any((is_meta_gga(x) for x in fn_ids)):
-        if spin == 0:
-            nvar = 3
-            xctype = 'R-MGGA'
-        else:
-            nvar = 7
-            xctype = 'U-MGGA'
-    else:  # GGA
-        if spin == 0:
-            nvar = 2
-            xctype = 'R-GGA'
-        else:
-            nvar = 5
-            xctype = 'U-GGA'
-    outlen = (math.factorial(nvar+deriv) //
-              (math.factorial(nvar) * math.factorial(deriv)))
-    outbuf = numpy.zeros((ngrids,outlen))
+def eval_xc(xc_code, rho, spin=0, relativity=0, deriv=1, omega=None, verbose=None):
+    r'''Interface to call xcfun library to evaluate XC functional, potential
+    and functional derivatives. Return deriviates following libxc convention.
 
-    if n > 0:
-        _itrf.XCFUN_eval_xc(ctypes.c_int(n),
-                            (ctypes.c_int*n)(*fn_ids),
-                            (ctypes.c_double*n)(*facs),
-                            (ctypes.c_double*n)(*omega),
-                            ctypes.c_int(spin),
-                            ctypes.c_int(deriv), ctypes.c_int(rho_u.shape[1]),
-                            rho_u.ctypes.data_as(ctypes.c_void_p),
-                            rho_d.ctypes.data_as(ctypes.c_void_p),
-                            outbuf.ctypes.data_as(ctypes.c_void_p))
-
-    outbuf = lib.transpose(outbuf)
+    See also :func:`pyscf.dft.libxc.eval_xc`
+    '''
+    outbuf = eval_xc1(xc_code, rho, spin, deriv, omega)
     exc = outbuf[0]
     vxc = fxc = kxc = None
-    if xctype == 'R-LDA':
+    xctype = xc_type(xc_code)
+    if xctype == 'LDA' and spin == 0:
         if deriv > 0:
             vxc = [outbuf[1]]
         if deriv > 1:
             fxc = [outbuf[2]]
         if deriv > 2:
             kxc = [outbuf[3]]
-    elif xctype == 'R-GGA':
+    elif xctype == 'GGA' and spin == 0:
         if deriv > 0:
             vxc = [outbuf[1], outbuf[2]]
         if deriv > 1:
             fxc = [outbuf[3], outbuf[4], outbuf[5]]
         if deriv > 2:
             kxc = [outbuf[6], outbuf[7], outbuf[8], outbuf[9]]
-    elif xctype == 'U-LDA':
+    elif xctype == 'LDA' and spin == 1:
         if deriv > 0:
             vxc = [outbuf[1:3].T]
         if deriv > 1:
             fxc = [outbuf[3:6].T]
         if deriv > 2:
             kxc = [outbuf[6:10].T]
-    elif xctype == 'U-GGA':
+    elif xctype == 'GGA' and spin == 1:
         if deriv > 0:
             vxc = [outbuf[1:3].T, outbuf[3:6].T]
         if deriv > 1:
@@ -933,7 +868,7 @@ def _eval_xc(hyb, fn_facs, rho, spin=0, relativity=0, deriv=1, verbose=None):
 # MGGA/MLGGA: Note the MLGGA interface are not implemented. MGGA only needs 3
 # input arguments.  To make the interface compatible with libxc, treat MGGA as
 # MLGGA
-    elif xctype == 'R-MGGA':
+    elif xctype == 'MGGA' and spin == 0:
         if deriv > 0:
             vxc = [outbuf[1], outbuf[2], None, outbuf[3]]
         if deriv > 1:
@@ -962,7 +897,7 @@ def _eval_xc(hyb, fn_facs, rho, spin=0, relativity=0, deriv=1, verbose=None):
                 None, None, outbuf[XC_D012],
                 # v3lapl3, v3lapl2tau, v3lapltau2, v3tau3)
                 None, None, None, outbuf[XC_D003]]
-    elif xctype == 'U-MGGA':
+    elif xctype == 'MGGA' and spin == 1:
         if deriv > 0:
             vxc = (outbuf[1:3].T, outbuf[3:6].T, None, outbuf[6:8].T)
         if deriv > 1:
@@ -1018,6 +953,88 @@ def _eval_xc(hyb, fn_facs, rho, spin=0, relativity=0, deriv=1, verbose=None):
                 outbuf[[XC_D0000030,XC_D0000021,XC_D0000012,XC_D0000003]].T]
     return exc, vxc, fxc, kxc
 
+def eval_xc1(xc_code, rho, spin=0, deriv=1, omega=None):
+    '''Similar to eval_xc.
+    Returns an array with the order of derivatives following xcfun convention.
+    '''
+    assert deriv <= MAX_DERIV_ORDER
+    xctype = xc_type(xc_code)
+    assert xctype in ('HF', 'LDA', 'GGA', 'MGGA')
+
+    rho = numpy.asarray(rho, order='C', dtype=numpy.double)
+    if xctype == 'MGGA' and rho.shape[-2] == 6:
+        rho = numpy.asarray(rho[...,[0,1,2,3,5],:], order='C')
+
+    hyb, fn_facs = parse_xc(xc_code)
+    if omega is not None:
+        hyb = hyb[:2] + (float(omega),)
+
+    fn_ids = [x[0] for x in fn_facs]
+    facs   = [x[1] for x in fn_facs]
+    if hyb[2] != 0:
+        # Current implementation does not support different omegas for
+        # different RSH functionals if there are multiple RSHs
+        omega = [hyb[2]] * len(facs)
+    else:
+        omega = [0] * len(facs)
+
+    nvar, xlen = xc_deriv._XC_NVAR[xctype, spin]
+    ngrids = rho.shape[-1]
+    rho = rho.reshape(spin+1,nvar,ngrids)
+    outlen = lib.comb(xlen+deriv, deriv)
+    out = numpy.zeros((ngrids,outlen))
+    n = len(fn_ids)
+    if n > 0:
+        err = _itrf.XCFUN_eval_xc(ctypes.c_int(n),
+                                  (ctypes.c_int*n)(*fn_ids),
+                                  (ctypes.c_double*n)(*facs),
+                                  (ctypes.c_double*n)(*omega),
+                                  ctypes.c_int(spin), ctypes.c_int(deriv),
+                                  ctypes.c_int(nvar), ctypes.c_int(ngrids),
+                                  ctypes.c_int(outlen),
+                                  rho.ctypes.data_as(ctypes.c_void_p),
+                                  out.ctypes.data_as(ctypes.c_void_p))
+        if err != 0:
+            raise RuntimeError(f'Failed to eval {xc_code} for deriv={deriv}')
+    return lib.transpose(out)
+
+def eval_xc_eff(xc_code, rho, deriv=1, omega=None):
+    r'''Returns the derivative tensor against the density parameters
+
+    [density_a, (nabla_x)_a, (nabla_y)_a, (nabla_z)_a, tau_a]
+
+    or spin-polarized density parameters
+
+    [[density_a, (nabla_x)_a, (nabla_y)_a, (nabla_z)_a, tau_a],
+     [density_b, (nabla_x)_b, (nabla_y)_b, (nabla_z)_b, tau_b]].
+
+    It differs from the eval_xc method in the derivatives of non-local part.
+    The eval_xc method returns the XC functional derivatives to sigma
+    (|\nabla \rho|^2)
+
+    Args:
+        rho: 2-dimensional or 3-dimensional array
+            Total density or (spin-up, spin-down) densities (and their
+            derivatives if GGA or MGGA functionals) on grids
+
+    Kwargs:
+        deriv: int
+            derivative orders
+        omega: float
+            define the exponent in the attenuated Coulomb for RSH functional
+    '''
+    xctype = xc_type(xc_code)
+    rho = numpy.asarray(rho, order='C', dtype=numpy.double)
+    if xctype == 'MGGA' and rho.shape[-2] == 6:
+        rho = numpy.asarray(rho[...,[0,1,2,3,5],:], order='C')
+
+    spin_polarized = rho.ndim >= 2 and rho.shape[0] == 2
+    if spin_polarized:
+        spin = 1
+    else:
+        spin = 0
+    out = eval_xc1(xc_code, rho, spin, deriv, omega)
+    return xc_deriv.transform_xc(rho, out, xctype, spin, deriv)
 
 def define_xc_(ni, description, xctype='LDA', hyb=0, rsh=(0,0,0)):
     '''Define XC functional.  See also :func:`eval_xc` for the rules of input description.
diff --git a/pyscf/lib/CMakeLists.txt b/pyscf/lib/CMakeLists.txt
index 91f6aba135..d04e6444dc 100644
--- a/pyscf/lib/CMakeLists.txt
+++ b/pyscf/lib/CMakeLists.txt
@@ -100,6 +100,10 @@ if(ENABLE_OPENMP)
   endif()
 endif()
 
+if(NOT XCFUN_MAX_ORDER)
+  set(XCFUN_MAX_ORDER 3)
+endif()
+
 #find_package(PythonInterp REQUIRED)
 #find_package(PythonLibs REQUIRED)
 #execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy; print(numpy.get_include())"
@@ -215,17 +219,14 @@ endif() # ENABLE_LIBXC
 
 if(ENABLE_XCFUN AND BUILD_XCFUN)
   ExternalProject_Add(libxcfun
-    #GIT_REPOSITORY https://github.com/sunqm/xcfun.git
-    GIT_REPOSITORY https://github.com/fishjojo/xcfun.git
-    # copy of v2.1.1, downgrade to cmake 3.5
-    GIT_TAG cmake-3.5
-    # Range seperated parameters can be set in the python code. This patch to
-    # RSH omega is not needed anymore.
-    #PATCH_COMMAND patch -p1 < ${PROJECT_SOURCE_DIR}/libxcfun.patch
+    GIT_REPOSITORY https://github.com/dftlibs/xcfun.git
+    GIT_TAG a89b783
+    # This patch adds the xcfun derivative order up to 5
+    PATCH_COMMAND git apply ${PROJECT_SOURCE_DIR}/libxcfun.patch
     PREFIX ${PROJECT_BINARY_DIR}/deps
     INSTALL_DIR ${PROJECT_SOURCE_DIR}/deps
     CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_SHARED_LIBS=1
-            -DXCFUN_MAX_ORDER=3 -DENABLE_TESTALL=0
+            -DXCFUN_MAX_ORDER=${XCFUN_MAX_ORDER} -DENABLE_TESTALL=0
             -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
             -DCMAKE_CXX_CREATE_SHARED_LIBRARY=${CXX_LINK_TEMPLATE}
             -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/pyscf/lib/config.h.in b/pyscf/lib/config.h.in
index 596ac76849..fe7d506188 100644
--- a/pyscf/lib/config.h.in
+++ b/pyscf/lib/config.h.in
@@ -6,4 +6,4 @@
 #endif
 
 #cmakedefine HAVE_FFS
-
+#define XCFUN_MAX_DERIV_ORDER @XCFUN_MAX_ORDER@
diff --git a/pyscf/lib/dft/CMakeLists.txt b/pyscf/lib/dft/CMakeLists.txt
index c7095a8cd0..6b01b7eca0 100644
--- a/pyscf/lib/dft/CMakeLists.txt
+++ b/pyscf/lib/dft/CMakeLists.txt
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_library(dft SHARED 
+add_library(dft SHARED
   CxLebedevGrid.c grid_basis.c nr_numint.c r_numint.c
   numint_uniform_grid.c xc_deriv.c nr_numint_sparse.c
   )
diff --git a/pyscf/lib/dft/libxc_itrf.c b/pyscf/lib/dft/libxc_itrf.c
index 60641e5e43..76d7497980 100644
--- a/pyscf/lib/dft/libxc_itrf.c
+++ b/pyscf/lib/dft/libxc_itrf.c
@@ -73,19 +73,108 @@
     v3sigma(10)     = (uu_uu_uu, uu_uu_ud, uu_uu_dd, uu_ud_ud, uu_ud_dd, uu_dd_dd, ud_ud_ud, ud_ud_dd, ud_dd_dd, dd_dd_dd)
 
  */
+
+#define LDA_NVAR        1
+#define GGA_NVAR        4
+#define MGGA_NVAR       5
+
 /*
  * rho_u/rho_d = (den,grad_x,grad_y,grad_z,laplacian,tau)
  * In spin restricted case (spin == 1), rho_u is assumed to be the
  * spin-free quantities, rho_d is not used.
  */
-static void _eval_xc(xc_func_type *func_x, int spin, int np,
-                     double *rho_u, double *rho_d,
-                     double *ex, double *vxc, double *fxc, double *kxc)
+static void _eval_rho(double *rho, double *rho_u, int spin, int nvar, int np)
 {
         int i;
-        double *rho, *sigma, *lapl, *tau;
+        double *sigma, *tau;
         double *gxu, *gyu, *gzu, *gxd, *gyd, *gzd;
-        double *lapl_u, *lapl_d, *tau_u, *tau_d;
+        double *tau_u, *tau_d;
+        double *rho_d = rho_u + np * nvar;
+
+        switch (nvar) {
+        case LDA_NVAR:
+                if (spin == 1) {
+                        for (i = 0; i < np; i++) {
+                                rho[i*2+0] = rho_u[i];
+                                rho[i*2+1] = rho_d[i];
+                        }
+                } else {
+                        for (i = 0; i < np; i++) {
+                                rho[i] = rho_u[i];
+                        }
+                }
+                break;
+        case GGA_NVAR:
+                if (spin == 1) {
+                        sigma = rho + np * 2;
+                        gxu = rho_u + np;
+                        gyu = rho_u + np * 2;
+                        gzu = rho_u + np * 3;
+                        gxd = rho_d + np;
+                        gyd = rho_d + np * 2;
+                        gzd = rho_d + np * 3;
+                        for (i = 0; i < np; i++) {
+                                rho[i*2+0] = rho_u[i];
+                                rho[i*2+1] = rho_d[i];
+                                sigma[i*3+0] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
+                                sigma[i*3+1] = gxu[i]*gxd[i] + gyu[i]*gyd[i] + gzu[i]*gzd[i];
+                                sigma[i*3+2] = gxd[i]*gxd[i] + gyd[i]*gyd[i] + gzd[i]*gzd[i];
+                        }
+                } else {
+                        sigma = rho + np;
+                        gxu = rho_u + np;
+                        gyu = rho_u + np * 2;
+                        gzu = rho_u + np * 3;
+                        for (i = 0; i < np; i++) {
+                                rho[i] = rho_u[i];
+                                sigma[i] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
+                        }
+                }
+                break;
+        case MGGA_NVAR:
+                if (spin == 1) {
+                        sigma = rho + np * 2;
+                        tau = sigma + np * 3;
+                        gxu = rho_u + np;
+                        gyu = rho_u + np * 2;
+                        gzu = rho_u + np * 3;
+                        gxd = rho_d + np;
+                        gyd = rho_d + np * 2;
+                        gzd = rho_d + np * 3;
+                        tau_u  = rho_u + np * 4;
+                        tau_d  = rho_d + np * 4;
+                        for (i = 0; i < np; i++) {
+                                rho[i*2+0] = rho_u[i];
+                                rho[i*2+1] = rho_d[i];
+                                tau[i*2+0] = tau_u[i];
+                                tau[i*2+1] = tau_d[i];
+                        }
+                        for (i = 0; i < np; i++) {
+                                sigma[i*3+0] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
+                                sigma[i*3+1] = gxu[i]*gxd[i] + gyu[i]*gyd[i] + gzu[i]*gzd[i];
+                                sigma[i*3+2] = gxd[i]*gxd[i] + gyd[i]*gyd[i] + gzd[i]*gzd[i];
+                        }
+                } else {
+                        sigma = rho + np;
+                        tau  = sigma + np;
+                        gxu = rho_u + np;
+                        gyu = rho_u + np * 2;
+                        gzu = rho_u + np * 3;
+                        tau_u = rho_u + np * 4;
+                        for (i = 0; i < np; i++) {
+                                rho[i] = rho_u[i];
+                                sigma[i] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
+                                tau[i] = tau_u[i];
+                        }
+                }
+                break;
+        }
+}
+static void _eval_xc(xc_func_type *func_x, int spin, int deriv, int np,
+                     double *rho, double *exc)
+{
+        double *sigma, *tau;
+        double *lapl = rho;
         double *vrho   = NULL;
         double *vsigma = NULL;
         double *vlapl  = NULL;
@@ -120,6 +209,41 @@ static void _eval_xc(xc_func_type *func_x, int spin, int np,
         double *v3lapl2tau     = NULL;
         double *v3lapltau2     = NULL;
         double *v3tau3         = NULL;
+        double *v4rho4           = NULL;
+        double *v4rho3sigma      = NULL;
+        double *v4rho3lapl       = NULL;
+        double *v4rho3tau        = NULL;
+        double *v4rho2sigma2     = NULL;
+        double *v4rho2sigmalapl  = NULL;
+        double *v4rho2sigmatau   = NULL;
+        double *v4rho2lapl2      = NULL;
+        double *v4rho2lapltau    = NULL;
+        double *v4rho2tau2       = NULL;
+        double *v4rhosigma3      = NULL;
+        double *v4rhosigma2lapl  = NULL;
+        double *v4rhosigma2tau   = NULL;
+        double *v4rhosigmalapl2  = NULL;
+        double *v4rhosigmalapltau= NULL;
+        double *v4rhosigmatau2   = NULL;
+        double *v4rholapl3       = NULL;
+        double *v4rholapl2tau    = NULL;
+        double *v4rholapltau2    = NULL;
+        double *v4rhotau3        = NULL;
+        double *v4sigma4         = NULL;
+        double *v4sigma3lapl     = NULL;
+        double *v4sigma3tau      = NULL;
+        double *v4sigma2lapl2    = NULL;
+        double *v4sigma2lapltau  = NULL;
+        double *v4sigma2tau2     = NULL;
+        double *v4sigmalapl3     = NULL;
+        double *v4sigmalapl2tau  = NULL;
+        double *v4sigmalapltau2  = NULL;
+        double *v4sigmatau3      = NULL;
+        double *v4lapl4          = NULL;
+        double *v4lapl3tau       = NULL;
+        double *v4lapl2tau2      = NULL;
+        double *v4lapltau3       = NULL;
+        double *v4tau4           = NULL;
 
         switch (func_x->info->family) {
         case XC_FAMILY_LDA:
@@ -129,231 +253,207 @@ static void _eval_xc(xc_func_type *func_x, int spin, int np,
                 // ex is the energy density
                 // NOTE libxc library added ex/ec into vrho/vcrho
                 // vrho = rho d ex/d rho + ex, see work_lda.c:L73
-                if (spin == XC_POLARIZED) {
-                        rho = malloc(sizeof(double) * np*2);
-                        for (i = 0; i < np; i++) {
-                                rho[i*2+0] = rho_u[i];
-                                rho[i*2+1] = rho_d[i];
+                if (spin == 1) {
+                        if (deriv > 0) {
+                                vrho = exc + np;
+                        }
+                        if (deriv > 1) {
+                                v2rho2 = vrho + np * 2;
+                        }
+                        if (deriv > 2) {
+                                v3rho3 = v2rho2 + np * 3;
+                        }
+                        if (deriv > 3) {
+                                v4rho4 = v3rho3 + np * 4;
                         }
-                        xc_lda_exc_vxc_fxc_kxc(func_x, np, rho, ex, vxc, fxc, kxc);
-                        free(rho);
                 } else {
-                        rho = rho_u;
-                        xc_lda_exc_vxc_fxc_kxc(func_x, np, rho, ex, vxc, fxc, kxc);
+                        if (deriv > 0) {
+                                vrho = exc + np;
+                        }
+                        if (deriv > 1) {
+                                v2rho2 = vrho + np;
+                        }
+                        if (deriv > 2) {
+                                v3rho3 = v2rho2 + np;
+                        }
+                        if (deriv > 3) {
+                                v4rho4 = v3rho3 + np;
+                        }
                 }
+                xc_lda(func_x, np, rho, exc, vrho, v2rho2, v3rho3, v4rho4);
                 break;
         case XC_FAMILY_GGA:
 #ifdef XC_FAMILY_HYB_GGA
         case XC_FAMILY_HYB_GGA:
 #endif
-                if (spin == XC_POLARIZED) {
-                        rho = malloc(sizeof(double) * np * 5);
+                if (spin == 1) {
                         sigma = rho + np * 2;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
-                        gxd = rho_d + np;
-                        gyd = rho_d + np * 2;
-                        gzd = rho_d + np * 3;
-                        for (i = 0; i < np; i++) {
-                                rho[i*2+0] = rho_u[i];
-                                rho[i*2+1] = rho_d[i];
-                                sigma[i*3+0] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
-                                sigma[i*3+1] = gxu[i]*gxd[i] + gyu[i]*gyd[i] + gzu[i]*gzd[i];
-                                sigma[i*3+2] = gxd[i]*gxd[i] + gyd[i]*gyd[i] + gzd[i]*gzd[i];
-                        }
-                        if (vxc != NULL) {
-                                vrho = vxc;
-                                vsigma = vxc + np * 2;
+                        if (deriv > 0) {
+                                vrho = exc + np;
+                                vsigma = vrho + np * 2;
                         }
-                        if (fxc != NULL) {
-                                v2rho2 = fxc;
-                                v2rhosigma = fxc + np * 3;
+                        if (deriv > 1) {
+                                v2rho2 = vsigma + np * 3;
+                                v2rhosigma = v2rho2 + np * 3;
                                 v2sigma2 = v2rhosigma + np * 6; // np*6
                         }
-                        if (kxc != NULL) {
-                                v3rho3 = kxc;
-                                v3rho2sigma = kxc + np * 4;
+                        if (deriv > 2) {
+                                v3rho3 = v2sigma2 + np * 6;
+                                v3rho2sigma = v3rho3 + np * 4;
                                 v3rhosigma2 = v3rho2sigma + np * 9;
                                 v3sigma3 = v3rhosigma2 + np * 12; // np*10
                         }
-                        xc_gga_exc_vxc_fxc_kxc(func_x, np, rho, sigma, ex,
-                               vrho, vsigma, v2rho2, v2rhosigma, v2sigma2,
-                               v3rho3, v3rho2sigma, v3rhosigma2, v3sigma3);
-                        free(rho);
-                } else {
-                        rho = rho_u;
-                        sigma = malloc(sizeof(double) * np);
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
-                        for (i = 0; i < np; i++) {
-                                sigma[i] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
+                        if (deriv > 3) {
+                                v4rho4       = v3sigma3     + np * 10  ;
+                                v4rho3sigma  = v4rho4       + np * 5   ;
+                                v4rho2sigma2 = v4rho3sigma  + np * 4*3 ;
+                                v4rhosigma3  = v4rho2sigma2 + np * 3*6 ;
+                                v4sigma4     = v4rhosigma3  + np * 2*10;
                         }
-                        if (vxc != NULL) {
-                                vrho = vxc;
-                                vsigma = vxc + np;
+                } else {
+                        sigma = rho + np;
+                        if (deriv > 0) {
+                                vrho = exc + np;
+                                vsigma = vrho + np;
                         }
-                        if (fxc != NULL) {
-                                v2rho2 = fxc;
-                                v2rhosigma = fxc + np;
+                        if (deriv > 1) {
+                                v2rho2 = vsigma + np;
+                                v2rhosigma = v2rho2 + np;
                                 v2sigma2 = v2rhosigma + np;
                         }
-                        if (kxc != NULL) {
-                                v3rho3 = kxc;
+                        if (deriv > 2) {
+                                v3rho3 = v2sigma2 + np;
                                 v3rho2sigma = v3rho3 + np;
                                 v3rhosigma2 = v3rho2sigma + np;
                                 v3sigma3 = v3rhosigma2 + np;
                         }
-                        xc_gga_exc_vxc_fxc_kxc(func_x, np, rho, sigma, ex,
-                               vrho, vsigma, v2rho2, v2rhosigma, v2sigma2,
-                               v3rho3, v3rho2sigma, v3rhosigma2, v3sigma3);
-                        free(sigma);
+                        if (deriv > 3) {
+                                v4rho4       = v3sigma3     + np;
+                                v4rho3sigma  = v4rho4       + np;
+                                v4rho2sigma2 = v4rho3sigma  + np;
+                                v4rhosigma3  = v4rho2sigma2 + np;
+                                v4sigma4     = v4rhosigma3  + np;
+                        }
                 }
+                xc_gga(func_x, np, rho, sigma,
+                       exc, vrho, vsigma,
+                       v2rho2, v2rhosigma, v2sigma2,
+                       v3rho3, v3rho2sigma, v3rhosigma2, v3sigma3,
+                       v4rho4, v4rho3sigma, v4rho2sigma2, v4rhosigma3, v4sigma4);
                 break;
         case XC_FAMILY_MGGA:
 #ifdef XC_FAMILY_HYB_MGGA
         case XC_FAMILY_HYB_MGGA:
 #endif
-                if (spin == XC_POLARIZED) {
-                        rho = malloc(sizeof(double) * np * 9);
+                if (spin == 1) {
                         sigma = rho + np * 2;
-                        lapl = sigma + np * 3;
-                        tau = lapl + np * 2;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
-                        gxd = rho_d + np;
-                        gyd = rho_d + np * 2;
-                        gzd = rho_d + np * 3;
-                        lapl_u = rho_u + np * 4;
-                        tau_u  = rho_u + np * 5;
-                        lapl_d = rho_d + np * 4;
-                        tau_d  = rho_d + np * 5;
-                        for (i = 0; i < np; i++) {
-                                rho[i*2+0] = rho_u[i];
-                                rho[i*2+1] = rho_d[i];
-                                lapl[i*2+0] = lapl_u[i];
-                                lapl[i*2+1] = lapl_d[i];
-                                tau[i*2+0] = tau_u[i];
-                                tau[i*2+1] = tau_d[i];
+                        tau = sigma + np * 3;
+                        if (deriv > 0) {
+                                vrho = exc + np;
+                                vsigma = vrho + np * 2;
+                                vtau = vsigma + np * 3;
                         }
-                        for (i = 0; i < np; i++) {
-                                sigma[i*3+0] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
-                                sigma[i*3+1] = gxu[i]*gxd[i] + gyu[i]*gyd[i] + gzu[i]*gzd[i];
-                                sigma[i*3+2] = gxd[i]*gxd[i] + gyd[i]*gyd[i] + gzd[i]*gzd[i];
-                        }
-                        if (vxc != NULL) {
-                                vrho = vxc;
-                                vsigma = vxc + np * 2;
-                                vlapl = vsigma + np * 3;
-                                vtau = vlapl + np * 2; // np*2
-                        }
-                        if (fxc != NULL) {
-                                v2rho2      = fxc;
+                        if (deriv > 1) {
+                                v2rho2      = vtau        + np * 2;
                                 v2rhosigma  = v2rho2      + np * 3;
                                 v2sigma2    = v2rhosigma  + np * 6;
-                                v2lapl2     = v2sigma2    + np * 6;
-                                v2tau2      = v2lapl2     + np * 3;
-                                v2rholapl   = v2tau2      + np * 3;
-                                v2rhotau    = v2rholapl   + np * 4;
-                                v2lapltau   = v2rhotau    + np * 4;
-                                v2sigmalapl = v2lapltau   + np * 4;
-                                v2sigmatau  = v2sigmalapl + np * 6;
-                        }
-                        if (kxc != NULL) {
-                                v3rho3         = kxc;
+                                v2rhotau    = v2sigma2    + np * 6;
+                                v2sigmatau  = v2rhotau    + np * 4;
+                                v2tau2      = v2sigmatau  + np * 6;
+                        }
+                        if (deriv > 2) {
+                                v3rho3         = v2tau2         + np * 3 ;
                                 v3rho2sigma    = v3rho3         + np * 4 ;
                                 v3rhosigma2    = v3rho2sigma    + np * 9 ;
                                 v3sigma3       = v3rhosigma2    + np * 12;
-                                v3rho2lapl     = v3sigma3       + np * 10;
-                                v3rho2tau      = v3rho2lapl     + np * 6 ;
-                                v3rhosigmalapl = v3rho2tau      + np * 6 ;
-                                v3rhosigmatau  = v3rhosigmalapl + np * 12;
-                                v3rholapl2     = v3rhosigmatau  + np * 12;
-                                v3rholapltau   = v3rholapl2     + np * 6 ;
-                                v3rhotau2      = v3rholapltau   + np * 8 ;
-                                v3sigma2lapl   = v3rhotau2      + np * 6 ;
-                                v3sigma2tau    = v3sigma2lapl   + np * 12;
-                                v3sigmalapl2   = v3sigma2tau    + np * 12;
-                                v3sigmalapltau = v3sigmalapl2   + np * 9 ;
-                                v3sigmatau2    = v3sigmalapltau + np * 12;
-                                v3lapl3        = v3sigmatau2    + np * 9 ;
-                                v3lapl2tau     = v3lapl3        + np * 4 ;
-                                v3lapltau2     = v3lapl2tau     + np * 6 ;
-                                v3tau3         = v3lapltau2     + np * 6 ;
-                        }
-                        xc_mgga_exc_vxc_fxc_kxc(func_x, np, rho, sigma, lapl, tau, ex,
-                                vrho, vsigma, vlapl, vtau,
-                                v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2,
-                                v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2,
-                                v3rho3, v3rho2sigma, v3rho2lapl, v3rho2tau, v3rhosigma2,
-                                v3rhosigmalapl, v3rhosigmatau, v3rholapl2, v3rholapltau,
-                                v3rhotau2, v3sigma3, v3sigma2lapl, v3sigma2tau,
-                                v3sigmalapl2, v3sigmalapltau, v3sigmatau2, v3lapl3,
-                                v3lapl2tau, v3lapltau2, v3tau3);
-                        free(rho);
-                } else {
-                        rho = rho_u;
-                        sigma = malloc(sizeof(double) * np);
-                        lapl = rho_u + np * 4;
-                        tau  = rho_u + np * 5;
-                        gxu = rho_u + np;
-                        gyu = rho_u + np * 2;
-                        gzu = rho_u + np * 3;
-                        for (i = 0; i < np; i++) {
-                                sigma[i] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
+                                v3rho2tau      = v3sigma3       + np * 10;
+                                v3rhosigmatau  = v3rho2tau      + np * 6 ;
+                                v3rhotau2      = v3rhosigmatau  + np * 12;
+                                v3sigma2tau    = v3rhotau2      + np * 6 ;
+                                v3sigmatau2    = v3sigma2tau    + np * 12;
+                                v3tau3         = v3sigmatau2    + np * 9 ;
                         }
-                        if (vxc != NULL) {
-                                vsigma = vxc + np;
-                                vlapl = vsigma + np;
-                                vtau = vlapl + np;
+                        if (deriv > 3) {
+                                v4rho4         = v3tau3         + np * 4    ;
+                                v4rho3sigma    = v4rho4         + np * 5    ;
+                                v4rho2sigma2   = v4rho3sigma    + np * 4*3  ;
+                                v4rhosigma3    = v4rho2sigma2   + np * 3*6  ;
+                                v4sigma4       = v4rhosigma3    + np * 2*10 ;
+                                v4rho3tau      = v4sigma4       + np * 15   ;
+                                v4rho2sigmatau = v4rho3tau      + np * 4*2  ;
+                                v4rho2tau2     = v4rho2sigmatau + np * 3*3*2;
+                                v4rhosigma2tau = v4rho2tau2     + np * 3*3  ;
+                                v4rhosigmatau2 = v4rhosigma2tau + np * 2*6*2;
+                                v4rhotau3      = v4rhosigmatau2 + np * 2*3*3;
+                                v4sigma3tau    = v4rhotau3      + np * 2*4  ;
+                                v4sigma2tau2   = v4sigma3tau    + np * 10*2 ;
+                                v4sigmatau3    = v4sigma2tau2   + np * 6*3  ;
+                                v4tau4         = v4sigmatau3    + np * 3*4  ;
                         }
-                        if (fxc != NULL) {
-                                v2rho2      = fxc;
+                } else {
+                        sigma = rho + np;
+                        tau = sigma + np;
+                        if (deriv > 0) {
+                                vrho = exc + np;
+                                vsigma = vrho + np;
+                                vtau = vsigma + np;
+                        }
+                        if (deriv > 1) {
+                                v2rho2      = vtau        + np;
                                 v2rhosigma  = v2rho2      + np;
                                 v2sigma2    = v2rhosigma  + np;
-                                v2lapl2     = v2sigma2    + np;
-                                v2tau2      = v2lapl2     + np;
-                                v2rholapl   = v2tau2      + np;
-                                v2rhotau    = v2rholapl   + np;
-                                v2lapltau   = v2rhotau    + np;
-                                v2sigmalapl = v2lapltau   + np;
-                                v2sigmatau  = v2sigmalapl + np;
-                        }
-                        if (kxc != NULL) {
-                                v3rho3         = kxc;
+                                v2rhotau    = v2sigma2    + np;
+                                v2sigmatau  = v2rhotau    + np;
+                                v2tau2      = v2sigmatau  + np;
+                        }
+                        if (deriv > 2) {
+                                v3rho3         = v2tau2         + np;
                                 v3rho2sigma    = v3rho3         + np;
                                 v3rhosigma2    = v3rho2sigma    + np;
                                 v3sigma3       = v3rhosigma2    + np;
-                                v3rho2lapl     = v3sigma3       + np;
-                                v3rho2tau      = v3rho2lapl     + np;
-                                v3rhosigmalapl = v3rho2tau      + np;
-                                v3rhosigmatau  = v3rhosigmalapl + np;
-                                v3rholapl2     = v3rhosigmatau  + np;
-                                v3rholapltau   = v3rholapl2     + np;
-                                v3rhotau2      = v3rholapltau   + np;
-                                v3sigma2lapl   = v3rhotau2      + np;
-                                v3sigma2tau    = v3sigma2lapl   + np;
-                                v3sigmalapl2   = v3sigma2tau    + np;
-                                v3sigmalapltau = v3sigmalapl2   + np;
-                                v3sigmatau2    = v3sigmalapltau + np;
-                                v3lapl3        = v3sigmatau2    + np;
-                                v3lapl2tau     = v3lapl3        + np;
-                                v3lapltau2     = v3lapl2tau     + np;
-                                v3tau3         = v3lapltau2     + np;
-                        }
-                        xc_mgga_exc_vxc_fxc_kxc(func_x, np, rho, sigma, lapl, tau, ex,
-                                vxc, vsigma, vlapl, vtau,
-                                v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2,
-                                v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2,
-                                v3rho3, v3rho2sigma, v3rho2lapl, v3rho2tau, v3rhosigma2,
-                                v3rhosigmalapl, v3rhosigmatau, v3rholapl2, v3rholapltau,
-                                v3rhotau2, v3sigma3, v3sigma2lapl, v3sigma2tau,
-                                v3sigmalapl2, v3sigmalapltau, v3sigmatau2, v3lapl3,
-                                v3lapl2tau, v3lapltau2, v3tau3);
-                        free(sigma);
+                                v3rho2tau      = v3sigma3       + np;
+                                v3rhosigmatau  = v3rho2tau      + np;
+                                v3rhotau2      = v3rhosigmatau  + np;
+                                v3sigma2tau    = v3rhotau2      + np;
+                                v3sigmatau2    = v3sigma2tau    + np;
+                                v3tau3         = v3sigmatau2    + np;
+                        }
+                        if (deriv > 3) {
+                                v4rho4         = v3tau3         + np;
+                                v4rho3sigma    = v4rho4         + np;
+                                v4rho2sigma2   = v4rho3sigma    + np;
+                                v4rhosigma3    = v4rho2sigma2   + np;
+                                v4sigma4       = v4rhosigma3    + np;
+                                v4rho3tau      = v4sigma4       + np;
+                                v4rho2sigmatau = v4rho3tau      + np;
+                                v4rho2tau2     = v4rho2sigmatau + np;
+                                v4rhosigma2tau = v4rho2tau2     + np;
+                                v4rhosigmatau2 = v4rhosigma2tau + np;
+                                v4rhotau3      = v4rhosigmatau2 + np;
+                                v4sigma3tau    = v4rhotau3      + np;
+                                v4sigma2tau2   = v4sigma3tau    + np;
+                                v4sigmatau3    = v4sigma2tau2   + np;
+                                v4tau4         = v4sigmatau3    + np;
+                        }
                 }
+                xc_mgga(func_x, np, rho, sigma, lapl, tau,
+                     exc, vrho, vsigma, vlapl, vtau,
+                     v2rho2, v2rhosigma, v2rholapl, v2rhotau, v2sigma2,
+                     v2sigmalapl, v2sigmatau, v2lapl2, v2lapltau, v2tau2,
+                     v3rho3, v3rho2sigma, v3rho2lapl, v3rho2tau, v3rhosigma2,
+                     v3rhosigmalapl, v3rhosigmatau, v3rholapl2, v3rholapltau,
+                     v3rhotau2, v3sigma3, v3sigma2lapl, v3sigma2tau,
+                     v3sigmalapl2, v3sigmalapltau, v3sigmatau2, v3lapl3,
+                     v3lapl2tau, v3lapltau2, v3tau3,
+                     v4rho4, v4rho3sigma, v4rho3lapl, v4rho3tau, v4rho2sigma2,
+                     v4rho2sigmalapl, v4rho2sigmatau, v4rho2lapl2, v4rho2lapltau,
+                     v4rho2tau2, v4rhosigma3, v4rhosigma2lapl, v4rhosigma2tau,
+                     v4rhosigmalapl2, v4rhosigmalapltau, v4rhosigmatau2,
+                     v4rholapl3, v4rholapl2tau, v4rholapltau2, v4rhotau3,
+                     v4sigma4, v4sigma3lapl, v4sigma3tau, v4sigma2lapl2,
+                     v4sigma2lapltau, v4sigma2tau2, v4sigmalapl3, v4sigmalapl2tau,
+                     v4sigmalapltau2, v4sigmatau3, v4lapl4, v4lapl3tau,
+                     v4lapl2tau2, v4lapltau3, v4tau4);
                 break;
         default:
                 fprintf(stderr, "functional %d '%s' is not implemented\n",
@@ -582,77 +682,26 @@ int LIBXC_xc_type(int fn_id)
         return type;
 }
 
-static int xc_output_length(int nvar, int deriv)
-{
-        int i;
-        int len = 1.;
-        for (i = 1; i <= nvar; i++) {
-                len *= deriv + i;
-                len /= i;
-        }
-        return len;
-}
-
-// return value 0 means no functional needs to be evaluated.
-int LIBXC_input_length(int nfn, int *fn_id, double *fac, int spin)
-{
-        int i;
-        int nvar = 0;
-        xc_func_type func;
-        for (i = 0; i < nfn; i++) {
-                if (xc_func_init(&func, fn_id[i], spin) != 0) {
-                        fprintf(stderr, "XC functional %d not found\n",
-                                fn_id[i]);
-                        raise_error -1;
-                }
-                if (spin == XC_POLARIZED) {
-                        switch (func.info->family) {
-                        case XC_FAMILY_LDA:
-#ifdef XC_FAMILY_HYB_LDA
-                        case XC_FAMILY_HYB_LDA:
-#endif
-                                nvar = MAX(nvar, 2);
-                                break;
-                        case XC_FAMILY_GGA:
-#ifdef XC_FAMILY_HYB_GGA
-                        case XC_FAMILY_HYB_GGA:
-#endif
-                                nvar = MAX(nvar, 5);
-                                break;
-                        case XC_FAMILY_MGGA:
-#ifdef XC_FAMILY_HYB_MGGA
-                        case XC_FAMILY_HYB_MGGA:
-#endif
-                                nvar = MAX(nvar, 9);
-                        }
-                } else {
-                        switch (func.info->family) {
-                        case XC_FAMILY_LDA:
-#ifdef XC_FAMILY_HYB_LDA
-                        case XC_FAMILY_HYB_LDA:
-#endif
-                                nvar = MAX(nvar, 1);
-                                break;
-                        case XC_FAMILY_GGA:
-#ifdef XC_FAMILY_HYB_GGA
-                        case XC_FAMILY_HYB_GGA:
-#endif
-                                nvar = MAX(nvar, 2);
-                                break;
-                        case XC_FAMILY_MGGA:
-#ifdef XC_FAMILY_HYB_MGGA
-                        case XC_FAMILY_HYB_MGGA:
-#endif
-                                nvar = MAX(nvar, 4);
-                        }
-                }
-                xc_func_end(&func);
-        }
-        return nvar;
-}
+//static int xc_output_length(int nvar, int deriv)
+//{
+//        int i;
+//        int len = 1;
+//        for (i = 1; i <= nvar; i++) {
+//                len *= deriv + i;
+//                len /= i;
+//        }
+//        return len;
+//}
+// offsets = [xc_output_length(nvar, i) for i in range(deriv+1)
+//            for nvar in [1,2,3,5,7]]
+static int xc_nvar1_offsets[] = {0, 1, 2, 3, 4, 5};
+static int xc_nvar2_offsets[] = {0, 1, 3, 6, 10, 15};
+static int xc_nvar3_offsets[] = {0, 1, 4, 10, 20, 35};
+static int xc_nvar5_offsets[] = {0, 1, 6, 21, 56, 126};
+static int xc_nvar7_offsets[] = {0, 1, 8, 36, 120, 330};
 
 static void axpy(double *dst, double *src, double fac,
-                 int np, int ndst, int nsrc)
+                 int np, int nsrc)
 {
         int i, j;
         for (j = 0; j < nsrc; j++) {
@@ -661,130 +710,106 @@ static void axpy(double *dst, double *src, double fac,
                 }
         }
 }
-
-static void merge_xc(double *dst, double *ebuf, double *vbuf,
-                     double *fbuf, double *kbuf, double fac,
-                     int np, int ndst, int nvar, int spin, int type)
+static int vseg1[] = {2, 3, 2};
+static int fseg1[] = {3, 6, 6, 4, 6, 3};
+static int kseg1[] = {4, 9, 12, 10, 6, 12, 6, 12, 9, 4};
+static int lseg1[] = {5, 12, 18, 20, 15, 8, 18, 9, 24, 18, 8, 20, 18, 12, 5};
+static int *seg1[] = {NULL, vseg1, fseg1, kseg1, lseg1};
+
+static void merge_xc(double *dst, double *ebuf, double fac,
+                     int spin, int deriv, int nvar, int np, int outlen, int type)
 {
-        int seg0 [] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-        // LDA         |  |
-        // GGA         |     |
-        // MGGA        |        |
-        int vseg1[] = {2, 3, 2, 2};
-        // LDA         |  |
-        // GGA         |        |
-        // MGGA        |                          |
-        int fseg1[] = {3, 6, 6, 3, 3, 4, 4, 4, 6, 6};
-        // LDA         |  |
-        // GGA         |        |
-        // MGGA        |                                                        |
-        int kseg1[] = {4, 9,12,10, 6, 6,12,12, 6, 8, 6,12,12, 9,12, 9, 4, 6, 6, 4};
-        int vsegtot, fsegtot, ksegtot;
-        int *vseg, *fseg, *kseg;
-        if (spin == XC_POLARIZED) {
-                vseg = vseg1;
-                fseg = fseg1;
-                kseg = kseg1;
-        } else {
-                vseg = seg0;
-                fseg = seg0;
-                kseg = seg0;
+        int order, nsrc, i;
+        for (i = 0; i < np; i++) {
+                dst[i] += fac * ebuf[i];
         }
 
+        int *offsets0, *offsets1;
+        double *pout, *pin;
         switch (type) {
         case XC_FAMILY_GGA:
 #ifdef XC_FAMILY_HYB_GGA
         case XC_FAMILY_HYB_GGA:
 #endif
-                vsegtot = 2;
-                fsegtot = 3;
-                ksegtot = 4;
+                offsets0 = xc_nvar2_offsets;
                 break;
         case XC_FAMILY_MGGA:
 #ifdef XC_FAMILY_HYB_MGGA
         case XC_FAMILY_HYB_MGGA:
 #endif
-                vsegtot = 4;
-                fsegtot = 10;
-                ksegtot = 20;  // not supported
+                offsets0 = xc_nvar3_offsets;
                 break;
         default: //case XC_FAMILY_LDA:
-                vsegtot = 1;
-                fsegtot = 1;
-                ksegtot = 1;
+                offsets0 = xc_nvar1_offsets;
         }
 
-        int i;
-        size_t offset;
-        axpy(dst, ebuf, fac, np, ndst, 1);
-
-        if (vbuf != NULL) {
-                offset = np;
-                for (i = 0; i < vsegtot; i++) {
-                        axpy(dst+offset, vbuf, fac, np, ndst, vseg[i]);
-                        offset += np * vseg[i];
-                        vbuf += np * vseg[i];
+        if (spin == 0) {
+                switch (nvar) {
+                case LDA_NVAR:
+                        offsets1 = xc_nvar1_offsets;
+                        break;
+                case GGA_NVAR:
+                        offsets1 = xc_nvar2_offsets;
+                        break;
+                default: // MGGA_NVAR
+                        offsets1 = xc_nvar3_offsets;
+                        break;
                 }
-        }
 
-        if (fbuf != NULL) {
-                offset = np * xc_output_length(nvar, 1);
-                for (i = 0; i < fsegtot; i++) {
-                        axpy(dst+offset, fbuf, fac, np, ndst, fseg[i]);
-                        offset += np * fseg[i];
-                        fbuf += np * fseg[i];
+                for (order = 1; order <= deriv; order++) {
+                        pout = dst + offsets1[order] * np;
+                        pin = ebuf + offsets0[order] * np;
+                        nsrc = offsets0[order+1] - offsets0[order];
+                        for (i = 0; i < np * nsrc; i++) {
+                                pout[i] += fac * pin[i];
+                        }
                 }
+                return;
         }
 
-        if (kbuf != NULL) {
-                offset = np * xc_output_length(nvar, 2);
-                for (i = 0; i < ksegtot; i++) {
-                        axpy(dst+offset, kbuf, fac, np, ndst, kseg[i]);
-                        offset += np * kseg[i];
-                        kbuf += np * kseg[i];
+        switch (nvar) {
+        case LDA_NVAR:
+                offsets1 = xc_nvar2_offsets;
+                break;
+        case GGA_NVAR:
+                offsets1 = xc_nvar5_offsets;
+                break;
+        default: // MGGA_NVAR
+                offsets1 = xc_nvar7_offsets;
+                break;
+        }
+
+        int terms;
+        int *pseg1;
+        pin = ebuf + np;
+        for (order = 1; order <= deriv; order++) {
+                pseg1 = seg1[order];
+                pout = dst + offsets1[order] * np;
+                terms = offsets0[order+1] - offsets0[order];
+                for (i = 0; i < terms; i++) {
+                        nsrc = pseg1[i];
+                        axpy(pout, pin, fac, np, nsrc);
+                        pin += np * nsrc;
+                        pout += np * nsrc;
                 }
         }
 }
 
 // omega is the range separation parameter mu in xcfun
 void LIBXC_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
-                   int spin, int deriv, int np,
-                   double *rho_u, double *rho_d, double *output,
-                   double dens_threshold)
+                   int spin, int deriv, int nvar, int np, int outlen,
+                   double *rho_u, double *output, double dens_threshold)
 {
-        assert(deriv <= 3);
-        int nvar = LIBXC_input_length(nfn, fn_id, fac, spin);
-        if (nvar == 0) { // No functional needs to be evaluated.
-                return;
-        }
-
-        int outlen = xc_output_length(nvar, deriv);
-        // output buffer is zeroed in the Python caller
-        //NPdset0(output, np*outlen);
-
-        double *ebuf = malloc(sizeof(double) * np);
-        double *vbuf = NULL;
-        double *fbuf = NULL;
-        double *kbuf = NULL;
-        if (deriv > 0) {
-                vbuf = malloc(sizeof(double) * np*9);
-        }
-        if (deriv > 1) {
-                fbuf = malloc(sizeof(double) * np*45);
-        }
-        if (deriv > 2) {
-                if (spin == XC_POLARIZED) {  // spin-unresctricted MGGA
-                        // FIXME *220 in xcfun
-                        kbuf = malloc(sizeof(double) * np*165);
-                } else {  // spin-resctricted MGGA
-                        kbuf = malloc(sizeof(double) * np*20);
-                }
-        }
+        assert(deriv <= 4);
+        double *ebuf = malloc(sizeof(double) * np * outlen);
+        double *rho = malloc(sizeof(double) * np * 7);
+        _eval_rho(rho, rho_u, spin, nvar, np);
+        int nspin = spin + 1;
 
         int i, j;
         xc_func_type func;
         for (i = 0; i < nfn; i++) {
-                if (xc_func_init(&func, fn_id[i], spin) != 0) {
+                if (xc_func_init(&func, fn_id[i], nspin) != 0) {
                         fprintf(stderr, "XC functional %d not found\n",
                                 fn_id[i]);
                         raise_error;
@@ -832,22 +857,13 @@ void LIBXC_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
 #if defined XC_SET_RELATIVITY
                 xc_lda_x_set_params(&func, relativity);
 #endif
-                _eval_xc(&func, spin, np, rho_u, rho_d, ebuf, vbuf, fbuf, kbuf);
-                merge_xc(output, ebuf, vbuf, fbuf, kbuf, fac[i],
-                         np, outlen, nvar, spin, func.info->family);
+                _eval_xc(&func, spin, deriv, np, rho, ebuf);
+                merge_xc(output, ebuf, fac[i],
+                         spin, deriv, nvar, np, outlen, func.info->family);
                 xc_func_end(&func);
         }
-
         free(ebuf);
-        if (deriv > 0) {
-                free(vbuf);
-        }
-        if (deriv > 1) {
-                free(fbuf);
-        }
-        if (deriv > 2) {
-                free(kbuf);
-        }
+        free(rho);
 }
 
 int LIBXC_max_deriv_order(int xc_id)
diff --git a/pyscf/lib/dft/xc_deriv.c b/pyscf/lib/dft/xc_deriv.c
index 3292d31eb8..ecab4adb89 100644
--- a/pyscf/lib/dft/xc_deriv.c
+++ b/pyscf/lib/dft/xc_deriv.c
@@ -17,6 +17,7 @@
  */
 
 #include <stdlib.h>
+#include <assert.h>
 
 /*
  * tmp = fg[...,[[0,1],[1,2]],...]
@@ -65,109 +66,104 @@ void VXCfg_to_direct_deriv(double *qg, double *fg, double *rho,
         }
 }
 
-void VXCud2ts_deriv1(double *v_ts, double *v_ud, int nvar, int ngrids)
+void VXCud2ts(double *v_ts, double *v_ud, int ncounts, size_t ngrids)
 {
-        size_t Ngrids = ngrids;
-        size_t vg = nvar * Ngrids;
-        double *vu = v_ud;
-        double *vd = v_ud + vg;
-        double *vt = v_ts;
-        double *vs = v_ts + vg;
-        size_t n;
+        double *vu, *vd, *vt, *vs;
+        int i, n;
+        for (n = 0; n < ncounts; n++) {
+                vu = v_ud + 2*n    * ngrids;
+                vd = v_ud +(2*n+1) * ngrids;
+                vt = v_ts + 2*n    * ngrids;
+                vs = v_ts +(2*n+1) * ngrids;
 #pragma GCC ivdep
-        for (n = 0; n < vg; n++) {
-                vt[n] = (vu[n] + vd[n]) * .5;
-                vs[n] = (vu[n] - vd[n]) * .5;
+                for (i = 0; i < ngrids; i++) {
+                        vt[i] = (vu[i] + vd[i]) * .5;
+                        vs[i] = (vu[i] - vd[i]) * .5;
+                }
         }
 }
 
-void VXCud2ts_deriv2(double *v_ts, double *v_ud, int nvar, int ngrids)
+void VXCts2ud(double *v_ud, double *v_ts, int ncounts, size_t ngrids)
+{
+        double *vu, *vd, *vt, *vs;
+        int i, n;
+        for (n = 0; n < ncounts; n++) {
+                vu = v_ud + 2*n    * ngrids;
+                vd = v_ud +(2*n+1) * ngrids;
+                vt = v_ts + 2*n    * ngrids;
+                vs = v_ts +(2*n+1) * ngrids;
+#pragma GCC ivdep
+                for (i = 0; i < ngrids; i++) {
+                        vu[i] = vt[i] + vs[i];
+                        vd[i] = vt[i] - vs[i];
+                }
+        }
+}
+
+#define frho_at(n, a, x, g)     frho[((n*2+a)*Nvg + x*Ngrids + g)]
+#define fsigma_at(x, n, g)      fsigma[(x*Ncg + n*Ngrids + g)]
+// spin0 shape: frho[...,nvar,ngrids], fsigma[:,...,ngrids], rho[nvar,ngrids]
+void VXCunfold_sigma_spin0(double *frho, double *fsigma, double *rho,
+                           int ncounts, int nvar, int ngrids)
 {
         size_t Ngrids = ngrids;
-        size_t vg = nvar * Ngrids;
-        size_t vg2 = vg * 2;
-        size_t vvg = nvar * vg2;
-        double *vuu = v_ud;
-        double *vud = v_ud + vg;
-        double *vdu = vuu + vvg;
-        double *vdd = vud + vvg;
-        double *vtt = v_ts;
-        double *vts = v_ts + vg;
-        double *vst = vtt + vvg;
-        double *vss = vts + vvg;
-        double ut, us, dt, ds;
-        size_t i, n;
-        for (i = 0; i < nvar; i++) {
+        size_t Ncg = ncounts * Ngrids;
+        size_t Nvg = nvar * Ngrids;
+        int g, n;
+        for (n = 0; n < ncounts; n++) {
 #pragma GCC ivdep
-                for (n = 0; n < vg; n++) {
-                        ut = vuu[i*vg2+n] + vud[i*vg2+n];
-                        us = vuu[i*vg2+n] - vud[i*vg2+n];
-                        dt = vdu[i*vg2+n] + vdd[i*vg2+n];
-                        ds = vdu[i*vg2+n] - vdd[i*vg2+n];
-                        vtt[i*vg2+n] = (ut + dt) * .25;
-                        vts[i*vg2+n] = (us + ds) * .25;
-                        vst[i*vg2+n] = (ut - dt) * .25;
-                        vss[i*vg2+n] = (us - ds) * .25;
+                for (g = 0; g < ngrids; g++) {
+                        frho[n*Nvg+g] = fsigma[n*Ngrids+g];
+                        frho[n*Nvg+1*Ngrids+g] = fsigma[Ncg+n*Ngrids+g] * rho[1*Ngrids+g] * 2;
+                        frho[n*Nvg+2*Ngrids+g] = fsigma[Ncg+n*Ngrids+g] * rho[2*Ngrids+g] * 2;
+                        frho[n*Nvg+3*Ngrids+g] = fsigma[Ncg+n*Ngrids+g] * rho[3*Ngrids+g] * 2;
+                }
+        }
+        if (nvar > 4) {
+                // MGGA
+                assert(nvar == 5);
+                for (n = 0; n < ncounts; n++) {
+#pragma GCC ivdep
+                        for (g = 0; g < ngrids; g++) {
+                                frho[n*Nvg+4*Ngrids+g] = fsigma[2*Ncg+n*Ngrids+g];
+                        }
                 }
         }
 }
 
-void VXCud2ts_deriv3(double *v_ts, double *v_ud, int nvar, int ngrids)
+#define frho_at(n, a, x, g)     frho[((n*2+a)*Nvg + x*Ngrids + g)]
+#define fsigma_at(x, n, g)      fsigma[(x*Ncg + n*Ngrids + g)]
+#define rho_at(a, x, g)         rho[(a*Nvg + x*Ngrids + g)]
+// spin1 shape: frho[...,2,nvar,ngrids], fsigma[:,...,ngrids], rho[2,nvar,ngrids]
+void VXCunfold_sigma_spin1(double *frho, double *fsigma, double *rho,
+                           int ncounts, int nvar, int ngrids)
 {
         size_t Ngrids = ngrids;
-        size_t vg = nvar * Ngrids;
-        size_t vg2 = vg * 2;
-        size_t vvg = nvar * vg2;
-        size_t vvg2 = vvg * 2;
-        size_t vvvg = nvar * vvg2;
-        double *vuuu = v_ud;
-        double *vuud = v_ud + vg;
-        double *vudu = vuuu + vvg;
-        double *vudd = vuud + vvg;
-        double *vduu = vuuu + vvvg;
-        double *vdud = vuud + vvvg;
-        double *vddu = vudu + vvvg;
-        double *vddd = vudd + vvvg;
-        double *vttt = v_ts;
-        double *vtts = v_ts + vg;
-        double *vtst = vttt + vvg;
-        double *vtss = vtts + vvg;
-        double *vstt = vttt + vvvg;
-        double *vsts = vtts + vvvg;
-        double *vsst = vtst + vvvg;
-        double *vsss = vtss + vvvg;
-        double uut, uus, udt, uds, dut, dus, ddt, dds;
-        double utt, uts, ust, uss, dtt, dts, dst, dss;
-        size_t i, j, ij, n;
-        for (i = 0; i < nvar; i++) {
-        for (j = 0; j < nvar; j++) {
-                ij = (i * nvar * 2 + j) * vg2;
+        size_t Ncg = ncounts * Ngrids;
+        size_t Nvg = nvar * Ngrids;
+        int g, n;
+        for (n = 0; n < ncounts; n++) {
 #pragma GCC ivdep
-                for (n = 0; n < vg; n++) {
-                        uut = vuuu[ij+n] + vuud[ij+n];
-                        uus = vuuu[ij+n] - vuud[ij+n];
-                        udt = vudu[ij+n] + vudd[ij+n];
-                        uds = vudu[ij+n] - vudd[ij+n];
-                        dut = vduu[ij+n] + vdud[ij+n];
-                        dus = vduu[ij+n] - vdud[ij+n];
-                        ddt = vddu[ij+n] + vddd[ij+n];
-                        dds = vddu[ij+n] - vddd[ij+n];
-                        utt = uut + udt;
-                        uts = uus + uds;
-                        ust = uut - udt;
-                        uss = uus - uds;
-                        dtt = dut + ddt;
-                        dts = dus + dds;
-                        dst = dut - ddt;
-                        dss = dus - dds;
-                        vttt[ij+n] = (utt + dtt) * .125;
-                        vtts[ij+n] = (uts + dts) * .125;
-                        vtst[ij+n] = (ust + dst) * .125;
-                        vtss[ij+n] = (uss + dss) * .125;
-                        vstt[ij+n] = (utt - dtt) * .125;
-                        vsts[ij+n] = (uts - dts) * .125;
-                        vsst[ij+n] = (ust - dst) * .125;
-                        vsss[ij+n] = (uss - dss) * .125;
+                for (g = 0; g < ngrids; g++) {
+                        frho_at(n,0,0,g) = fsigma_at(0,n,g);
+                        frho_at(n,1,0,g) = fsigma_at(1,n,g);
+                        frho_at(n,0,1,g) = fsigma_at(2,n,g) * rho_at(0,1,g) * 2 + fsigma_at(3,n,g) * rho_at(1,1,g);
+                        frho_at(n,1,1,g) = fsigma_at(3,n,g) * rho_at(0,1,g) + 2 * fsigma_at(4,n,g) * rho_at(1,1,g);
+                        frho_at(n,0,2,g) = fsigma_at(2,n,g) * rho_at(0,2,g) * 2 + fsigma_at(3,n,g) * rho_at(1,2,g);
+                        frho_at(n,1,2,g) = fsigma_at(3,n,g) * rho_at(0,2,g) + 2 * fsigma_at(4,n,g) * rho_at(1,2,g);
+                        frho_at(n,0,3,g) = fsigma_at(2,n,g) * rho_at(0,3,g) * 2 + fsigma_at(3,n,g) * rho_at(1,3,g);
+                        frho_at(n,1,3,g) = fsigma_at(3,n,g) * rho_at(0,3,g) + 2 * fsigma_at(4,n,g) * rho_at(1,3,g);
                 }
-        } }
+        }
+        if (nvar > 4) {
+                // MGGA
+                assert(nvar == 5);
+                for (n = 0; n < ncounts; n++) {
+#pragma GCC ivdep
+                        for (g = 0; g < ngrids; g++) {
+                                frho_at(n,0,4,g) = fsigma_at(5,n,g);
+                                frho_at(n,1,4,g) = fsigma_at(6,n,g);
+                        }
+                }
+        }
 }
diff --git a/pyscf/lib/dft/xcfun_itrf.c b/pyscf/lib/dft/xcfun_itrf.c
index 8bfcecffaf..a7825030ea 100644
--- a/pyscf/lib/dft/xcfun_itrf.c
+++ b/pyscf/lib/dft/xcfun_itrf.c
@@ -25,12 +25,17 @@
 #include <XCFun/xcfun.h>
 #include "config.h"
 
+int XCFUN_max_deriv_order = XCFUN_MAX_DERIV_ORDER;
+
 static int eval_xc(xcfun_t* fun, int deriv, xcfun_vars vars,
-                   int np, int ncol, double *rho, double *output)
+                   int np, int ncol, int outlen, double *rho, double *output)
 {
-        xcfun_eval_setup(fun, vars, XC_PARTIAL_DERIVATIVES, deriv);
+        int err = xcfun_eval_setup(fun, vars, XC_PARTIAL_DERIVATIVES, deriv);
+        if (err != 0) {
+                fprintf(stderr, "Failed to initialize xcfun %d\n", err);
+                return err;
+        }
         assert(ncol == xcfun_input_length(fun));
-        int outlen = xcfun_output_length(fun);
 
         //xcfun_eval_vec(fun, np, rho, ncol, output, outlen);
 #pragma omp parallel default(none) \
@@ -42,14 +47,15 @@ static int eval_xc(xcfun_t* fun, int deriv, xcfun_vars vars,
                 xcfun_eval(fun, rho+i*ncol, output+i*outlen);
         }
 }
-        return outlen;
+        return 0;
 }
 
-void XCFUN_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
-                   int spin, int deriv, int np,
-                   double *rho_u, double *rho_d, double *output)
+int XCFUN_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
+                  int spin, int deriv, int nvar, int np, int outlen,
+                  double *rho_u, double *output)
 {
-        int i, outlen;
+        int i, err;
+        double *rho_d = rho_u + np * nvar;
         double *rho;
         double *gxu, *gyu, *gzu, *gxd, *gyd, *gzd, *tau_u, *tau_d;
         const char *name;
@@ -73,13 +79,13 @@ void XCFUN_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
                         gxu = rho_u + np;
                         gyu = rho_u + np * 2;
                         gzu = rho_u + np * 3;
-                        tau_u = rho_u + np * 5;
+                        tau_u = rho_u + np * 4;
                         for (i = 0; i < np; i++) {
                                 rho[i*3+0] = rho_u[i];
                                 rho[i*3+1] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
                                 rho[i*3+2] = tau_u[i];
                         }
-                        outlen = eval_xc(fun, deriv, XC_N_GNN_TAUN, np, 3, rho, output);
+                        err = eval_xc(fun, deriv, XC_N_GNN_TAUN, np, 3, outlen, rho, output);
                         free(rho);
                 } else if (xcfun_is_gga(fun)) {
                         rho = malloc(sizeof(double) * np*2);
@@ -90,11 +96,11 @@ void XCFUN_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
                                 rho[i*2+0] = rho_u[i];
                                 rho[i*2+1] = gxu[i]*gxu[i] + gyu[i]*gyu[i] + gzu[i]*gzu[i];
                         }
-                        outlen = eval_xc(fun, deriv, XC_N_GNN, np, 2, rho, output);
+                        err = eval_xc(fun, deriv, XC_N_GNN, np, 2, outlen, rho, output);
                         free(rho);
                 } else { // LDA
                         rho = rho_u;
-                        outlen = eval_xc(fun, deriv, XC_N, np, 1, rho, output);
+                        err = eval_xc(fun, deriv, XC_N, np, 1, outlen, rho, output);
                 }
 // xcfun computed rho*Exc[rho] for zeroth order deriviative instead of Exc[rho]
                 for (i = 0; i < np; i++) {
@@ -109,8 +115,8 @@ void XCFUN_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
                         gxd = rho_d + np;
                         gyd = rho_d + np * 2;
                         gzd = rho_d + np * 3;
-                        tau_u = rho_u + np * 5;
-                        tau_d = rho_d + np * 5;
+                        tau_u = rho_u + np * 4;
+                        tau_d = rho_d + np * 4;
                         for (i = 0; i < np; i++) {
                                 rho[i*7+0] = rho_u[i];
                                 rho[i*7+1] = rho_d[i];
@@ -120,7 +126,7 @@ void XCFUN_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
                                 rho[i*7+5] = tau_u[i];
                                 rho[i*7+6] = tau_d[i];
                         }
-                        outlen = eval_xc(fun, deriv, XC_A_B_GAA_GAB_GBB_TAUA_TAUB, np, 7, rho, output);
+                        err = eval_xc(fun, deriv, XC_A_B_GAA_GAB_GBB_TAUA_TAUB, np, 7, outlen, rho, output);
                         free(rho);
                 } else if (xcfun_is_gga(fun)) {
                         rho = malloc(sizeof(double) * np*5);
@@ -137,7 +143,7 @@ void XCFUN_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
                                 rho[i*5+3] = gxu[i]*gxd[i] + gyu[i]*gyd[i] + gzu[i]*gzd[i];
                                 rho[i*5+4] = gxd[i]*gxd[i] + gyd[i]*gyd[i] + gzd[i]*gzd[i];
                         }
-                        outlen = eval_xc(fun, deriv, XC_A_B_GAA_GAB_GBB, np, 5, rho, output);
+                        err = eval_xc(fun, deriv, XC_A_B_GAA_GAB_GBB, np, 5, outlen, rho, output);
                         free(rho);
                 } else { // LDA
                         rho = malloc(sizeof(double) * np*2);
@@ -145,7 +151,7 @@ void XCFUN_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
                                 rho[i*2+0] = rho_u[i];
                                 rho[i*2+1] = rho_d[i];
                         }
-                        outlen = eval_xc(fun, deriv, XC_A_B, np, 2, rho, output);
+                        err = eval_xc(fun, deriv, XC_A_B, np, 2, outlen, rho, output);
                         free(rho);
                 }
                 for (i = 0; i < np; i++) {
@@ -153,6 +159,7 @@ void XCFUN_eval_xc(int nfn, int *fn_id, double *fac, double *omega,
                 }
         }
         xcfun_delete(fun);
+        return err;
 }
 
 /*
diff --git a/pyscf/lib/libxcfun.patch b/pyscf/lib/libxcfun.patch
index 7624c1ddfc..04e2a2a245 100644
--- a/pyscf/lib/libxcfun.patch
+++ b/pyscf/lib/libxcfun.patch
@@ -1,13 +1,101 @@
-diff --git a/src/functionals/common_parameters.cpp b/src/functionals/common_parameters.cpp
-index 4142b0d..210b4ab 100644
---- a/src/functionals/common_parameters.cpp
-+++ b/src/functionals/common_parameters.cpp
-@@ -3,7 +3,7 @@
- PARAMETER(XC_RANGESEP_MU) = 
- {
-   "Range separation inverse length [1/a0]",
--  0.4
-+  0.33
- };
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 4281527..8a2f4bb 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -2,7 +2,7 @@
+ # Copyright (c) 2015-2020 by Radovan Bast, Roberto Di Remigio, Jonas Juselius, and contributors.
  
- PARAMETER(XC_EXX) = 
+ # set minimum cmake version
+-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
++cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+
+ # project name
+ project(XCFun LANGUAGES CXX)
+diff --git a/src/XCFunctional.cpp b/src/XCFunctional.cpp
+index 239cef5..6288e40 100644
+--- a/src/XCFunctional.cpp
++++ b/src/XCFunctional.cpp
+@@ -432,7 +432,7 @@ int xcfun_eval_setup(XCFunctional * fun,
+     return xcfun::XC_EVARS;
+   }
+   if ((order < 0 || order > XCFUN_MAX_ORDER) ||
+-      (mode == XC_PARTIAL_DERIVATIVES && order > 4))
++      (mode == XC_PARTIAL_DERIVATIVES && order > 6))
+     return xcfun::XC_EORDER;
+   if (mode == XC_POTENTIAL) {
+     // GGA potential needs full laplacian
+@@ -557,6 +557,74 @@ void xcfun_eval(const XCFunctional * fun, const double input[], double output[])
+ #endif
+ #if XCFUN_MAX_ORDER >= 2
+ #if XCFUN_MAX_ORDER >= 3
++#if XCFUN_MAX_ORDER >= 4
++#if XCFUN_MAX_ORDER >= 5
++      case 5: {
++        typedef ctaylor<ireal_t, 5> ttype;
++        int inlen = xcint_vars[fun->vars].len;
++        ttype in[XC_MAX_INVARS], out = 0;
++        for (int i = 0; i < inlen; i++)
++          in[i] = input[i];
++        int k = 1 + inlen + (inlen * (inlen + 1)) / 2;
++        for (int i = 0; i < inlen; i++) {
++          in[i].set(VAR0, 1);
++          for (int j = i; j < inlen; j++) {
++            in[j].set(VAR1, 1);
++            for (int s = j; s < inlen; s++) {
++              in[s].set(VAR2, 1);
++              for (int s1 = s; s1 < inlen; s1++) {
++                in[s1].set(VAR3, 1);
++                for (int s2 = s1; s2 < inlen; s2++) {
++                  in[s1].set(VAR4, 1);
++                  densvars<ttype> d(fun, in);
++                  out = 0;
++                  for (int n = 0; n < fun->nr_active_functionals; n++)
++                    out += fun->settings[fun->active_functionals[n]->id] *
++                           fun->active_functionals[n]->fp5(d);
++                  output[k++] = out.get(VAR0 | VAR1 | VAR2 | VAR3 | VAR4);
++                  in[s2].set(VAR4, 0);
++                }
++                in[s1].set(VAR3, 0);
++              }
++              in[s].set(VAR2, 0);
++            }
++            in[j].set(VAR1, 0);
++          }
++          in[i] = input[i];
++        }
++      }
++#endif
++      case 4: {
++        typedef ctaylor<ireal_t, 4> ttype;
++        int inlen = xcint_vars[fun->vars].len;
++        ttype in[XC_MAX_INVARS], out = 0;
++        for (int i = 0; i < inlen; i++)
++          in[i] = input[i];
++        int k = 1 + inlen + (inlen * (inlen + 1)) / 2;
++        for (int i = 0; i < inlen; i++) {
++          in[i].set(VAR0, 1);
++          for (int j = i; j < inlen; j++) {
++            in[j].set(VAR1, 1);
++            for (int s = j; s < inlen; s++) {
++              in[s].set(VAR2, 1);
++              for (int s1 = s; s1 < inlen; s1++) {
++                  in[s1].set(VAR3, 1);
++                  densvars<ttype> d(fun, in);
++                  out = 0;
++                  for (int n = 0; n < fun->nr_active_functionals; n++)
++                    out += fun->settings[fun->active_functionals[n]->id] *
++                           fun->active_functionals[n]->fp4(d);
++                  output[k++] = out.get(VAR0 | VAR1 | VAR2 | VAR3);
++                  in[s1].set(VAR3, 0);
++              }
++              in[s].set(VAR2, 0);
++            }
++            in[j].set(VAR1, 0);
++          }
++          in[i] = input[i];
++        }
++      }
++#endif
+       // Do the third order derivatives here, then use the second order code. This is
+       // getting expensive..
+       case 3: {
diff --git a/pyscf/lib/misc.py b/pyscf/lib/misc.py
index 08199fd6cf..63342131ac 100644
--- a/pyscf/lib/misc.py
+++ b/pyscf/lib/misc.py
@@ -341,6 +341,14 @@ def prange_split(n_total, n_sections):
     div_points = numpy.array(section_sizes).cumsum()
     return zip(div_points[:-1], div_points[1:])
 
+izip = zip
+
+if sys.version_info > (3, 8):
+    from math import comb
+else:
+    import math
+    def comb(n, k):
+        return math.factorial(n) // math.factorial(n-k) // math.factorial(k)
 
 def map_with_prefetch(func, *iterables):
     '''
@@ -906,13 +914,6 @@ def overwrite_mro(obj, mro):
     '''A hacky function to overwrite the __mro__ attribute'''
     raise DeprecationWarning
 
-def izip(*args):
-    '''python2 izip == python3 zip'''
-    if sys.version_info < (3,):
-        return itertools.izip(*args)
-    else:
-        return zip(*args)
-
 class ProcessWithReturnValue(Process):
     def __init__(self, group=None, target=None, name=None, args=(),
                  kwargs=None):