diff --git a/pyscf/lib/mcscf/fci_contract.c b/pyscf/lib/mcscf/fci_contract.c
index 4c05a00e9b..fe4af02261 100644
--- a/pyscf/lib/mcscf/fci_contract.c
+++ b/pyscf/lib/mcscf/fci_contract.c
@@ -26,8 +26,8 @@
 #include "vhf/fblas.h"
 #include "np_helper/np_helper.h"
 #include "fci.h"
-// for (16e,16o) ~ 11 MB buffer = 120 * 12870 * 8
-#define STRB_BLKSIZE    112
+// optimized for 512 KB L2 cache, (16e,16o)
+#define STRB_BLKSIZE    160
 
 /*
  * CPU timing of single thread can be estimated:
@@ -291,7 +291,7 @@ static void spread_bufa_t1(double *ci1, double *t1, int nrow_t1,
  * bcount_for_spread_a is different for spin1 and spin0
  */
 static void ctr_rhf2e_kern(double *eri, double *ci0, double *ci1,
-                           double *ci1buf, double *t1buf,
+                           double *ci1buf, double *t1, double *vt1,
                            int bcount_for_spread_a, int ncol_ci1buf,
                            int bcount, int stra_id, int strb_id,
                            int norb, int na, int nb, int nlinka, int nlinkb,
@@ -301,8 +301,6 @@ static void ctr_rhf2e_kern(double *eri, double *ci0, double *ci1,
         const double D0 = 0;
         const double D1 = 1;
         const int nnorb = norb * (norb+1)/2;
-        double *t1 = t1buf;
-        double *vt1 = t1buf + nnorb*bcount;
 
         NPdset0(t1, nnorb*bcount);
         FCIprog_a_t1(ci0, t1, bcount, stra_id, strb_id,
@@ -371,7 +369,11 @@ void FCIcontract_2e_spin0(double *eri, double *ci0, double *ci1,
 {
         int strk, ib;
         size_t blen;
-        double *t1buf = malloc(sizeof(double) * (STRB_BLKSIZE*norb*(norb+1)+2));
+        int nnorb = norb*(norb+1)/2;
+        double *t1buf = malloc(sizeof(double) * (STRB_BLKSIZE*nnorb*2+2));
+        double *tmp;
+        double *t1 = t1buf;
+        double *vt1 = t1buf + nnorb*STRB_BLKSIZE;
         double *ci1buf = malloc(sizeof(double) * (na*STRB_BLKSIZE+2));
         ci1bufs[omp_get_thread_num()] = ci1buf;
         for (ib = 0; ib < na; ib += STRB_BLKSIZE) {
@@ -380,15 +382,16 @@ void FCIcontract_2e_spin0(double *eri, double *ci0, double *ci1,
 #pragma omp for schedule(static, 112)
 /* strk starts from MAX(strk0, ib), because [0:ib,0:ib] have been evaluated */
                 for (strk = ib; strk < na; strk++) {
-                        ctr_rhf2e_kern(eri, ci0, ci1, ci1buf, t1buf,
+                        ctr_rhf2e_kern(eri, ci0, ci1, ci1buf, t1, vt1,
                                        MIN(STRB_BLKSIZE, strk-ib), blen,
                                        MIN(STRB_BLKSIZE, strk+1-ib),
                                        strk, ib, norb, na, na, nlink, nlink,
                                        clink, clink);
+                        // swap buffer for better cache utilization in next task
+                        tmp = t1;
+                        t1 = vt1;
+                        vt1 = tmp;
                 }
-//                NPomp_dsum_reduce_inplace(ci1bufs, blen*na);
-//#pragma omp master
-//                FCIaxpy2d(ci1+ib, ci1buf, na, na, blen);
 #pragma omp barrier
                 _reduce(ci1+ib, ci1bufs, na, na, blen);
 // An explicit barrier to ensure ci1 is updated. Without barrier, there may
@@ -417,7 +420,11 @@ void FCIcontract_2e_spin1(double *eri, double *ci0, double *ci1,
 {
         int strk, ib;
         size_t blen;
-        double *t1buf = malloc(sizeof(double) * (STRB_BLKSIZE*norb*(norb+1)+2));
+        int nnorb = norb*(norb+1)/2;
+        double *t1buf = malloc(sizeof(double) * (STRB_BLKSIZE*nnorb*2+2));
+        double *tmp;
+        double *t1 = t1buf;
+        double *vt1 = t1buf + nnorb*STRB_BLKSIZE;
         double *ci1buf = malloc(sizeof(double) * (na*STRB_BLKSIZE+2));
         ci1bufs[omp_get_thread_num()] = ci1buf;
         for (ib = 0; ib < nb; ib += STRB_BLKSIZE) {
@@ -425,14 +432,15 @@ void FCIcontract_2e_spin1(double *eri, double *ci0, double *ci1,
                 NPdset0(ci1buf, ((size_t)na) * blen);
 #pragma omp for schedule(static)
                 for (strk = 0; strk < na; strk++) {
-                        ctr_rhf2e_kern(eri, ci0, ci1, ci1buf, t1buf,
+                        ctr_rhf2e_kern(eri, ci0, ci1, ci1buf, t1, vt1,
                                        blen, blen, blen, strk, ib,
                                        norb, na, nb, nlinka, nlinkb,
                                        clinka, clinkb);
+                        // swap buffer for better cache utilization in next task
+                        tmp = t1;
+                        t1 = vt1;
+                        vt1 = tmp;
                 }
-//                NPomp_dsum_reduce_inplace(ci1bufs, blen*na);
-//#pragma omp master
-//                FCIaxpy2d(ci1+ib, ci1buf, na, nb, blen);
 #pragma omp barrier
                 _reduce(ci1+ib, ci1bufs, na, nb, blen);
 // An explicit barrier to ensure ci1 is updated. Without barrier, there may
diff --git a/pyscf/lib/mcscf/fci_contract_nosym.c b/pyscf/lib/mcscf/fci_contract_nosym.c
index ddf12add55..bab54ba3ba 100644
--- a/pyscf/lib/mcscf/fci_contract_nosym.c
+++ b/pyscf/lib/mcscf/fci_contract_nosym.c
@@ -27,7 +27,8 @@
 #include "np_helper/np_helper.h"
 #include "fci.h"
 #define CSUMTHR         1e-28
-#define STRB_BLKSIZE    112
+// optimized for 1 MB L2 cache, (16e,16o)
+#define STRB_BLKSIZE    120
 
 double FCI_t1ci_sf(double *ci0, double *t1, int bcount,
                    int stra_id, int strb_id,
@@ -147,7 +148,7 @@ static void spread_b_t1(double *ci1, double *t1,
 }
 
 static void ctr_rhf2e_kern(double *eri, double *ci0, double *ci1,
-                           double *ci1buf, double *t1buf,
+                           double *ci1buf, double *t1, double *vt1,
                            int bcount_for_spread_a, int ncol_ci1buf,
                            int bcount, int stra_id, int strb_id,
                            int norb, int na, int nb, int nlinka, int nlinkb,
@@ -157,8 +158,6 @@ static void ctr_rhf2e_kern(double *eri, double *ci0, double *ci1,
         const double D0 = 0;
         const double D1 = 1;
         const int nnorb = norb * norb;
-        double *t1 = t1buf;
-        double *vt1 = t1buf + nnorb*bcount;
         double csum;
 
         csum = FCI_t1ci_sf(ci0, t1, bcount, stra_id, strb_id,
@@ -203,16 +202,23 @@ void FCIcontract_2es1(double *eri, double *ci0, double *ci1,
 {
         int strk, ib, blen;
         double *t1buf = malloc(sizeof(double) * (STRB_BLKSIZE*norb*norb*2+2));
+        double *tmp;
+        double *t1 = t1buf;
+        double *vt1 = t1buf + norb*norb*STRB_BLKSIZE;
         double *ci1buf = malloc(sizeof(double) * (na*STRB_BLKSIZE+2));
         for (ib = 0; ib < nb; ib += STRB_BLKSIZE) {
                 blen = MIN(STRB_BLKSIZE, nb-ib);
                 NPdset0(ci1buf, ((size_t)na) * blen);
 #pragma omp for schedule(static)
                 for (strk = 0; strk < na; strk++) {
-                        ctr_rhf2e_kern(eri, ci0, ci1, ci1buf, t1buf,
+                        ctr_rhf2e_kern(eri, ci0, ci1, ci1buf, t1, vt1,
                                        blen, blen, blen, strk, ib,
                                        norb, na, nb, nlinka, nlinkb,
                                        clinka, clinkb);
+                        // swap buffer for better cache utilization in next task
+                        tmp = t1;
+                        t1 = vt1;
+                        vt1 = tmp;
                 }
 #pragma omp critical
                 axpy2d(ci1+ib, ci1buf, na, nb, blen);