diff --git a/LiCSBAS_lib/LiCSBAS_inv_lib.py b/LiCSBAS_lib/LiCSBAS_inv_lib.py
index 1c0beeb..b24cc4c 100644
--- a/LiCSBAS_lib/LiCSBAS_inv_lib.py
+++ b/LiCSBAS_lib/LiCSBAS_inv_lib.py
@@ -8,6 +8,8 @@
 =========
 Changelog
 =========
+v1.5 20210305 Yu Morishita, GSI
+ - Add GPU option to invert_nsbas
 v1.4.2 20201118 Yu Morishita, GSI
  - Again bug fix of multiprocessing
 v1.4.1 20201116 Yu Morishita, GSI
@@ -80,7 +82,7 @@ def make_sb_matrix2(ifgdates):


 #%%
-def invert_nsbas(unw, G, dt_cum, gamma, n_core):
+def invert_nsbas(unw, G, dt_cum, gamma, n_core, gpu):
     """
     Calculate incremental displacement differences by NSBAS inversion. Points with full unw data are first solved all at once by simple SB inversion.

@@ -91,6 +93,7 @@ def invert_nsbas(unw, G, dt_cum, gamma, n_core):
       dt_cum : Cumulative years (or days) for each image (n_im)
       gamma  : Gamma value for NSBAS inversion, should be small enough (e.g., 0.0001)
       n_core : Number of cores for parallel processing
+      gpu    : GPU flag (True/False); if True, use cupy for the batched lstsq

     Returns:
       inc : Incremental displacement (n_im-1, n_pt)
@@ -120,20 +123,30 @@ def invert_nsbas(unw, G, dt_cum, gamma, n_core):

     if n_pt_full!=0:
         print('  Solving {0:6}/{1:6}th points with full unw at a time...'.format(n_pt_full, n_pt), flush=True)
-
+
         ## Solve
         unw_tmp = np.concatenate((unw[bool_pt_full, :], np.zeros((n_pt_full, n_im), dtype=np.float32)), axis=1).transpose()
-        result[:, bool_pt_full] = np.linalg.lstsq(Gall, unw_tmp, rcond=None)[0]
+        if gpu:
+            print('  Using GPU')
+            import cupy as cp
+            unw_tmp_cp = cp.asarray(unw_tmp)
+            Gall_cp = cp.asarray(Gall)
+            _sol = cp.linalg.lstsq(Gall_cp, unw_tmp_cp, rcond=None)[0]
+            result[:, bool_pt_full] = cp.asnumpy(_sol)
+            del unw_tmp_cp, Gall_cp, _sol
+        else:
+            result[:, bool_pt_full] = np.linalg.lstsq(Gall, unw_tmp, rcond=None)[0]
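
For reference, the GPU branch above is the standard cupy drop-in pattern: move the operands to the device, call the same lstsq, and copy the solution back to host memory. A minimal standalone sketch of that pattern (the helper name is illustrative; cupy is assumed to be installed):

    import numpy as np

    def lstsq_maybe_gpu(G, d, gpu=False):
        """Solve min ||G x - d|| on the GPU if requested, otherwise with NumPy."""
        if gpu:
            import cupy as cp  # optional dependency, imported only on demand
            sol = cp.linalg.lstsq(cp.asarray(G), cp.asarray(d), rcond=None)[0]
            return cp.asnumpy(sol)  # copy the solution back to host memory
        return np.linalg.lstsq(G, d, rcond=None)[0]

The host-device transfer cost is amortized only when the stacked system is large, which is why the diff restricts the GPU path to the full-data points that are solved in one batch.
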

     ### Solve other points with nan point by point.
+    ## The GPU is not used here; lstsq on many small matrices is slower than on the CPU
     unw_tmp = np.concatenate((unw[~bool_pt_full, :], np.zeros((n_pt-n_pt_full, n_im), dtype=np.float32)), axis=1).transpose()
     mask = (~np.isnan(unw_tmp))
     unw_tmp[np.isnan(unw_tmp)] = 0
     print('  Next, solve {0} points including nan point-by-point...'.format(n_pt-n_pt_full), flush=True)
-
+
     if n_core == 1:
-        result[:, ~bool_pt_full] = censored_lstsq_slow(Gall, unw_tmp, mask) #(n_im+1, n_pt)
+        result[:, ~bool_pt_full] = censored_lstsq_slow(Gall, unw_tmp, mask) #(n_im+1, n_pt)
     else:
         print('  {} parallel processing'.format(n_core), flush=True)
@@ -206,7 +219,7 @@
     if n_core == 1:
         for i in range(n_pt):
-            result[:, i] = wls_nsbas(i) #(n_im+1, n_pt)
+            result[:, i] = wls_nsbas(i) #(n_im+1, n_pt)
     else:
         print('  {} parallel processing'.format(n_core), flush=True)
@@ -227,7 +240,7 @@ def wls_nsbas(i):
     ### Use global values of Gall, unw_tmp, mask
     if np.mod(i, 1000) == 0:
         print('  Running {0:6}/{1:6}th point...'.format(i, unw_tmp.shape[1]), flush=True)
-
+
     ## Weight unw and G
     Gall_w = Gall/np.sqrt(np.float64(var_tmp[:,i][:,np.newaxis]))
@@ -256,7 +269,7 @@
     """
     n_pt, n_im = cum.shape
     result = np.zeros((2, n_pt), dtype=np.float32)*np.nan #[vconst, vel]
-
+
     G = np.stack((np.ones_like(dt_cum), dt_cum), axis=1)
     vconst = np.zeros((n_pt), dtype=np.float32)*np.nan
     vel = np.zeros((n_pt), dtype=np.float32)*np.nan
@@ -266,7 +279,7 @@

     if n_pt_full!=0:
         print('  Solving {0:6}/{1:6}th points with full cum at a time...'.format(n_pt_full, n_pt), flush=True)
-
+
         ## Solve
         result[:, bool_pt_full] = np.linalg.lstsq(G, cum[bool_pt_full, :].transpose(), rcond=None)[0]
@@ -275,9 +288,9 @@
     mask = (~np.isnan(cum_tmp))
     cum_tmp[np.isnan(cum_tmp)] = 0
     print('  Next, solve {0} points including nan point-by-point...'.format(n_pt-n_pt_full), flush=True)
-
-    result[:, ~bool_pt_full] = censored_lstsq_slow(G, cum_tmp, mask) #(n_im+1, n_pt)
-
+
+    result[:, ~bool_pt_full] = censored_lstsq_slow(G, cum_tmp, mask) #(2, n_pt)
+
     vconst = result[0, :]
     vel = result[1, :]
@@ -301,11 +314,11 @@
       dt : Time difference of sin function wrt Jan 1 (day)
     """
     doy0 = (dt.datetime.strptime(imd0, '%Y%m%d')-dt.datetime.strptime(imd0[0:4]+'0101', '%Y%m%d')).days
-
+
     n_pt, n_im = cum.shape
     result = np.zeros((4, n_pt), dtype=np.float32)*np.nan #[vconst, vel, coef_s, coef_c]
-
-
+
+
     sin = np.sin(2*np.pi*dt_cum)
     cos = np.cos(2*np.pi*dt_cum)
     G = np.stack((np.ones_like(dt_cum), dt_cum, sin, cos), axis=1)
@@ -320,7 +333,7 @@

     if n_pt_full!=0:
         print('  Solving {0:6}/{1:6}th points with full cum at a time...'.format(n_pt_full, n_pt), flush=True)
-
+
         ## Solve
         result[:, bool_pt_full] = np.linalg.lstsq(G, cum[bool_pt_full, :].transpose(), rcond=None)[0]
@@ -329,9 +342,9 @@
     mask = (~np.isnan(cum_tmp))
     cum_tmp[np.isnan(cum_tmp)] = 0
     print('  Next, solve {0} points including nan point-by-point...'.format(n_pt-n_pt_full), flush=True)
-
-    result[:, ~bool_pt_full] = censored_lstsq_slow(G, cum_tmp, mask) #(n_im+1, n_pt)
-
+
+    result[:, ~bool_pt_full] = censored_lstsq_slow(G, cum_tmp, mask) #(4, n_pt)
+
     vconst = result[0, :]
     vel = result[1, :]
     coef_s = result[2, :]
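
For context, calc_velsin above fits cum(t) = vconst + vel*t + coef_s*sin(2*pi*t) + coef_c*cos(2*pi*t), with t = dt_cum in years, so the sin/cos pair captures an annual cycle. A small self-contained sketch of the same fit on synthetic data (all names and values illustrative):

    import numpy as np

    dt_cum = np.arange(0, 3, 12/365.25)   # image epochs in years, ~12-day spacing
    rng = np.random.default_rng(1)
    cum = (2.0 + 30.0*dt_cum + 5.0*np.sin(2*np.pi*dt_cum)
           + rng.normal(0, 1, dt_cum.size))   # synthetic displacement (mm)

    G = np.stack((np.ones_like(dt_cum), dt_cum,
                  np.sin(2*np.pi*dt_cum), np.cos(2*np.pi*dt_cum)), axis=1)
    vconst, vel, coef_s, coef_c = np.linalg.lstsq(G, cum, rcond=None)[0]
    amp = np.sqrt(coef_s**2 + coef_c**2)  # amplitude of the annual term
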
@@ -362,21 +375,21 @@ def calc_velstd_withnan(cum, dt_cum):
     global bootcount, bootnum
     n_pt, n_im = cum.shape
     bootnum = 100
-    bootcount = 0
+    bootcount = 0
     vstd = np.zeros((n_pt), dtype=np.float32)
     G = np.stack((np.ones_like(dt_cum), dt_cum), axis=1)
-
+
     data = cum.transpose().copy()
     ixs_day = np.arange(n_im)
     mask = (~np.isnan(data))
     data[np.isnan(data)] = 0
-
+
     velinv = lambda x : censored_lstsq2(G[x, :], data[x, :], mask[x, :])[1]

     with NumpyRNGContext(1):
         bootresult = bootstrap(ixs_day, bootnum, bootfunc=velinv)
-
+
     vstd = np.nanstd(bootresult, axis=0)

     print('')
@@ -401,10 +414,10 @@
         X = np.squeeze(np.linalg.solve(T, rhs)).T # transpose to get r x n
     except: ## In case of singular matrix
         X = np.zeros((B.shape[1]), dtype=np.float32)*np.nan
-
+
     return X
-
+

 #%%
 def calc_stc(cum):
     """
@@ -427,21 +440,21 @@
     _stc = np.ones((length, width, 8), dtype=np.float32)*np.nan
     pixels = [[0, 0], [0, 1], [0, 2], [1, 0], [1, 2], [2, 0], [2, 1], [2, 2]]
     ## Left Top = [0, 0], Right Bottom = [2, 2], Center = [1, 1]
-
+
     for i, pixel in enumerate(pixels):
         ### Spatial difference (surrounding pixel-center)
         d_cum = cum1[:, pixel[0]:length+pixel[0], pixel[1]:width+pixel[1]] - cum1[:, 1:length+1, 1:width+1]
-
+
         ### Temporal difference (double difference)
         dd_cum = d_cum[:-1,:,:]-d_cum[1:,:,:]
-
+
         ### STC (i.e., RMS of DD)
         sumsq_dd_cum = np.nansum(dd_cum**2, axis=0)
         n_dd_cum = np.float32(np.sum(~np.isnan(dd_cum), axis=0)) #number of non-nan
         n_dd_cum[n_dd_cum==0] = np.nan #to avoid division by 0
         _stc[:, :, i] = np.sqrt(sumsq_dd_cum/n_dd_cum)

-    ### Strange but some adjacent pixels can have identical time series,
+    ### Strange but some adjacent pixels can have identical time series,
     ### resulting in 0 of stc. To avoid this, replace 0 with nan.
     _stc[_stc==0] = np.nan
@@ -516,6 +529,6 @@ def censored_lstsq_slow(A, B, M):
             X[:,i] = np.linalg.lstsq(A[m], B[m,i], rcond=None)[0]
         except:
             X[:,i] = np.nan
-
+
     print('')
     return X
diff --git a/batch_LiCSBAS.sh b/batch_LiCSBAS.sh
index c441aa8..34235cd 100755
--- a/batch_LiCSBAS.sh
+++ b/batch_LiCSBAS.sh
@@ -41,6 +41,7 @@
 p05_clip_range_geo=""	# e.g. 130.11/131.12/34.34/34.6 (in deg)
 p11_unw_thre=""	# default: 0.3
 p11_coh_thre=""	# default: 0.05
 p12_loop_thre=""	# default: 1.5 rad
+p13_gpu="n"	# y/n. default: n
 p15_coh_thre=""	# default: 0.05
 p15_n_unw_r_thre=""	# default: 1.5
 p15_vstd_thre=""	# default: 100 mm/yr
@@ -285,6 +286,7 @@ if [ $start_step -le 13 -a $end_step -ge 13 ];then
 	if [ ! -z $p13_n_para ];then p13_op="$p13_op --n_para $p13_n_para"; fi
 	if [ ! -z $p13_n_unw_r_thre ];then p13_op="$p13_op --n_unw_r_thre $p13_n_unw_r_thre"; fi
 	if [ $p13_keep_incfile == "y" ];then p13_op="$p13_op --keep_incfile"; fi
+	if [ $p13_gpu == "y" ];then p13_op="$p13_op --gpu"; fi

 	if [ $check_only == "y" ];then
 		echo "LiCSBAS13_sb_inv.py $p13_op"
diff --git a/bin/LiCSBAS13_sb_inv.py b/bin/LiCSBAS13_sb_inv.py
index 4221731..576d35d 100755
--- a/bin/LiCSBAS13_sb_inv.py
+++ b/bin/LiCSBAS13_sb_inv.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-v1.4.8 20210127 Yu Morishita, GSI
+v1.5 20210305 Yu Morishita, GSI

 This script inverts the SB network of unw to obtain the time series and
 velocity using the NSBAS (López-Quiroz et al., 2009; Doin et al., 2011) approach.
@@ -47,7 +47,7 @@
 =====
 Usage
 =====
-LiCSBAS13_sb_inv.py -d ifgdir [-t tsadir] [--inv_alg LS|WLS] [--mem_size float] [--gamma float] [--n_para int] [--n_unw_r_thre float] [--keep_incfile]
+LiCSBAS13_sb_inv.py -d ifgdir [-t tsadir] [--inv_alg LS|WLS] [--mem_size float] [--gamma float] [--n_para int] [--n_unw_r_thre float] [--keep_incfile] [--gpu]

  -d  Path to the GEOCml* dir containing stack of unw data
  -t  Path to the output TS_GEOCml* dir.
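
The bootstrap in calc_velstd_withnan above relies on censored_lstsq2, which solves one masked normal equation per point in a single batched call rather than looping as censored_lstsq_slow does. A minimal sketch of the same broadcasting trick (function name illustrative; A is the (m, r) design matrix, B the (m, n) data with zeros at gaps, M the (m, n) boolean mask):

    import numpy as np

    def censored_lstsq_batched(A, B, M):
        """Solve min ||A x - b_i|| for every column b_i of B, honoring mask M."""
        # Per-column normal matrices: T[i] = A.T @ diag(M[:, i]) @ A -> (n, r, r)
        T = np.matmul(A.T[None, :, :], M.T[:, :, None] * A[None, :, :])
        # Per-column right-hand sides: rhs[i] = A.T @ (M[:, i]*B[:, i]) -> (n, r, 1)
        rhs = (A.T @ (M * B)).T[:, :, None]
        return np.linalg.solve(T, rhs)[:, :, 0].T  # (r, n)

Unlike the library version, this sketch does not catch singular systems, which censored_lstsq2 maps to nan.
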
@@ -65,10 +65,14 @@
      (Default: 1 and 0.5 for C- and L-band, respectively)
  --keep_incfile  Do not remove inc and resid files (Default: remove them)
+ --gpu           Use GPU (requires the cupy module)

 """
 #%% Change log
 '''
+v1.5 20210305 Yu Morishita, GSI
+ - Add GPU option
+ - Speed up by activating n_para_inv and setting OMP_NUM_THREADS=1
 v1.4.8 20210127 Yu Morishita, GSI
  - Automatically reduce mem_size if available RAM is small
 v1.4.7 20201124 Yu Morishita, GSI
@@ -136,7 +140,7 @@ def main(argv=None):
         argv = sys.argv

     start = time.time()
-    ver="1.4.8"; date=20210127; author="Y. Morishita"
+    ver="1.5"; date=20210305; author="Y. Morishita"
     print("\n{} ver{} {} {}".format(os.path.basename(argv[0]), ver, date, author), flush=True)
     print("{} {}".format(os.path.basename(argv[0]), ' '.join(argv[1:])), flush=True)
@@ -150,12 +154,16 @@
     ifgdir = []
     tsadir = []
     inv_alg = 'LS'
+    gpu = False

     try:
         n_para = len(os.sched_getaffinity(0))
     except:
         n_para = multi.cpu_count()

-    n_para_inv = 1
+
+    os.environ["OMP_NUM_THREADS"] = "1"
+    # np.linalg.lstsq uses all CPU cores but is not much faster than with one;
+    # parallelize with multiprocessing instead.

     memory_size = 4000
     gamma = 0.0001
@@ -173,7 +181,10 @@
     #%% Read options
     try:
         try:
-            opts, args = getopt.getopt(argv[1:], "hd:t:", ["help", "mem_size=", "gamma=", "n_unw_r_thre=", "keep_incfile", "inv_alg=", "n_para="])
+            opts, args = getopt.getopt(argv[1:], "hd:t:",
+                                       ["help", "mem_size=", "gamma=",
+                                        "n_unw_r_thre=", "keep_incfile",
+                                        "inv_alg=", "n_para=", "gpu"])
         except getopt.error as msg:
             raise Usage(msg)
         for o, a in opts:
@@ -196,6 +207,8 @@
                 inv_alg = a
             elif o == '--n_para':
                 n_para = int(a)
+            elif o == '--gpu':
+                gpu = True

         if not ifgdir:
             raise Usage('No data directory given, -d is not optional!')
         elif not os.path.exists(ifgdir):
             raise Usage('No {} dir exists!'.format(ifgdir))
         elif not os.path.exists(os.path.join(ifgdir, 'slc.mli.par')):
             raise Usage('No slc.mli.par file exists in {}!'.format(ifgdir))
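
The "import cupy as cp" just below fails with a bare ImportError when cupy is missing; a slightly more defensive variant of the same optional-dependency check could read as follows (a sketch only, not the script's actual behavior):

    def import_cupy_or_exit():
        """Return the cupy module, exiting with a clear message if unavailable."""
        try:
            import cupy as cp
            return cp
        except ImportError as e:
            raise SystemExit('--gpu was given but the cupy module '
                             'is not installed') from e
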
+        if gpu:
+            print("\nGPU option is activated. The cupy module is required.\n")
+            import cupy as cp

     except Usage as err:
         print("\nERROR:", file=sys.stderr, end='')
@@ -241,6 +257,12 @@

     cumh5file = os.path.join(tsadir,'cum.h5')

+    if n_para > 32:
+        # Empirically, >32 cores gives little extra speedup despite using more resources
+        n_para_inv = 32
+    else:
+        n_para_inv = n_para
+

     #%% Check files
     try:
@@ -531,27 +553,74 @@
         gap_patch = np.zeros((n_im-1, n_pt_all), dtype=np.int8)
         ns_ifg_noloop_patch = np.zeros((n_pt_all), dtype=np.float32)*np.nan
         maxTlen_patch = np.zeros((n_pt_all), dtype=np.float32)*np.nan
-
-        ### Determine n_para
-        n_pt_patch_min = 1000
-        if n_pt_patch_min*n_para > n_pt_unnan:
-            ## Too much n_para
-            n_para_gap = int(np.floor(n_pt_unnan/n_pt_patch_min))
-            if n_para_gap == 0: n_para_gap = 1
-        else:
-            n_para_gap = n_para

         print('\n  Identifying gaps, and counting n_gap and n_ifg_noloop,')
-        print('  with {} parallel processing...'.format(n_para_gap), flush=True)

-        ### Divide unwpatch by n_para for parallel processing
-        p = q.Pool(n_para_gap)
-        _result = np.array(p.map(count_gaps_wrapper, range(n_para_gap)), dtype=object)
-        p.close()

+        if gpu:
+            print('  using GPU...', flush=True)
+            n_loop, _ = Aloop.shape
+            unwpatch_cp = cp.asarray(unwpatch)
+            G_cp = cp.asarray(G)
+            Aloop_cp = cp.asarray(Aloop)
+
+            ns_unw_unnan4inc = cp.array(
+                [(G_cp[:, i]*(~cp.isnan(unwpatch_cp))).sum(
+                    axis=1, dtype=cp.int16) for i in range(n_im-1)])
+            # n_ifg*(n_pt,n_ifg) -> (n_im-1,n_pt)
+            ns_gap_patch[ix_unnan_pt] = cp.asnumpy(
+                (ns_unw_unnan4inc==0).sum(axis=0)) #n_pt
+            gap_patch[:, ix_unnan_pt] = cp.asnumpy(ns_unw_unnan4inc==0)
+
+            del ns_unw_unnan4inc
+            del G_cp
+
+            ### n_ifg_noloop
+            # n_ifg*(n_pt,n_ifg)->(n_loop,n_pt)
+            # Number of ifgs for each loop at each point.
+            # 3 means complete loop, 1 or 2 means broken loop.
+            ns_ifg4loop = cp.array([
+                (cp.abs(Aloop_cp[i, :])*(~cp.isnan(unwpatch_cp))).sum(axis=1)
+                for i in range(n_loop)])
+            bool_loop = (ns_ifg4loop==3)
+            #(n_loop,n_pt) identify complete loops only
+
+            # n_loop*(n_loop,n_pt)*n_pt->(n_ifg,n_pt)
+            # Number of loops for each ifg at each point.
+            ns_loop4ifg = cp.array([(
+                (cp.abs(Aloop_cp[:, i])*bool_loop.T).T*
+                (~cp.isnan(unwpatch_cp[:, i]))
+                ).sum(axis=0) for i in range(n_ifg)])
+
+            ns_ifg_noloop_tmp = (ns_loop4ifg==0).sum(axis=0) #n_pt
+            ns_nan_ifg = cp.isnan(unwpatch_cp).sum(axis=1) #n_pt, nan ifg count
+            ns_ifg_noloop_patch[ix_unnan_pt] = cp.asnumpy(
+                ns_ifg_noloop_tmp - ns_nan_ifg)
+
+            del bool_loop, ns_ifg4loop, ns_loop4ifg
+            del ns_ifg_noloop_tmp, ns_nan_ifg
+            del unwpatch_cp, Aloop_cp

-        ns_gap_patch[ix_unnan_pt] = np.hstack(_result[:, 0]) #n_pt
-        gap_patch[:, ix_unnan_pt] = np.hstack(_result[:, 1]) #n_im-1, n_pt
-        ns_ifg_noloop_patch[ix_unnan_pt] = np.hstack(_result[:, 2])
+        else:
+            ### Determine n_para
+            n_pt_patch_min = 1000
+            if n_pt_patch_min*n_para > n_pt_unnan:
+                ## Too much n_para
+                n_para_gap = int(np.floor(n_pt_unnan/n_pt_patch_min))
+                if n_para_gap == 0: n_para_gap = 1
+            else:
+                n_para_gap = n_para
+
+            print('  with {} parallel processing...'.format(n_para_gap),
+                  flush=True)
+
+            ### Divide unwpatch by n_para for parallel processing
+            p = q.Pool(n_para_gap)
+            _result = np.array(p.map(count_gaps_wrapper, range(n_para_gap)),
+                               dtype=object)
+            p.close()
+
+            ns_gap_patch[ix_unnan_pt] = np.hstack(_result[:, 0]) #n_pt
+            gap_patch[:, ix_unnan_pt] = np.hstack(_result[:, 1]) #n_im-1, n_pt
+            ns_ifg_noloop_patch[ix_unnan_pt] = np.hstack(_result[:, 2])

         ### maxTlen
         _maxTlen = np.zeros((n_pt_unnan), dtype=np.float32) #temporary
@@ -566,9 +635,11 @@

     #%% Time series inversion
     print('\n  Small Baseline inversion by {}...\n'.format(inv_alg), flush=True)
     if inv_alg == 'WLS':
-        inc_tmp, vel_tmp, vconst_tmp = inv_lib.invert_nsbas_wls(unwpatch, varpatch, G, dt_cum, gamma, n_para_inv)
+        inc_tmp, vel_tmp, vconst_tmp = inv_lib.invert_nsbas_wls(
+            unwpatch, varpatch, G, dt_cum, gamma, n_para_inv)
     else:
-        inc_tmp, vel_tmp, vconst_tmp = inv_lib.invert_nsbas(unwpatch, G, dt_cum, gamma, n_para_inv)
+        inc_tmp, vel_tmp, vconst_tmp = inv_lib.invert_nsbas(
+            unwpatch, G, dt_cum, gamma, n_para_inv, gpu)

     ### Set to variables
     inc_patch = np.zeros((n_im-1, n_pt_all), dtype=np.float32)*np.nan
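
For intuition on the gap counting above: an increment i -> i+1 is a gap at a point when no non-nan interferogram spans it, i.e. when column i of the design matrix G meets only nan observations. A CPU-only sketch of the same reduction on synthetic shapes (the random G stands in for the real incidence matrix from make_sb_matrix):

    import numpy as np

    rng = np.random.default_rng(0)
    n_ifg, n_im, n_pt = 6, 5, 4
    G = (rng.random((n_ifg, n_im-1)) > 0.5).astype(np.int8)  # stand-in SB matrix
    unw = rng.random((n_pt, n_ifg)).astype(np.float32)
    unw[rng.random((n_pt, n_ifg)) < 0.3] = np.nan            # knock out some data

    # (n_im-1, n_pt): non-nan ifgs covering each increment at each point
    ns_unw_unnan4inc = np.array(
        [(G[:, i]*(~np.isnan(unw))).sum(axis=1) for i in range(n_im-1)])
    gap = (ns_unw_unnan4inc == 0)  # True where the SB network is disconnected
    n_gap = gap.sum(axis=0)        # number of gaps per point

The cupy version in the diff is line-for-line the same, with np replaced by cp and the results copied back to host arrays with cp.asnumpy.
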