diff --git a/als.py b/als.py
index 36f7f79..ea6b1ce 100644
--- a/als.py
+++ b/als.py
@@ -18,33 +18,30 @@ def __init__(self, f1, f2, omega, string, use_MTTKRP):
         self.string = string
         self.use_MTTKRP = use_MTTKRP
 
-    def mul(self, idx, sk):
+    def MTTKRP_TTTP(self, sk, out):
         if self.use_MTTKRP:
             if self.string=="U":
-                out = ctf.tensor([self.omega.shape[0], self.f1.shape[1]])
                 ctf.MTTKRP(ctf.TTTP(self.omega, [sk, self.f1, self.f2]),[out,self.f1,self.f2],0)
             elif self.string=="V":
-                out = ctf.tensor([self.omega.shape[1], self.f1.shape[1]])
                 ctf.MTTKRP(ctf.TTTP(self.omega, [self.f1, sk, self.f2]),[self.f1,out,self.f2],1)
             elif self.string=="W":
-                out = ctf.tensor([self.omega.shape[2], self.f1.shape[1]])
                 ctf.MTTKRP(ctf.TTTP(self.omega, [self.f1, self.f2, sk]),[self.f1,self.f2,out],2)
             else:
-                return ValueError("Invalid string for MTTKRP_mul")
-            return out.i(idx)
+                print("Invalid string for implicit MTTKRP_TTTP")
         else:
+            idx = "ir"
             if self.string=="U":
-                return self.f1.i("J"+idx[1]) \
-                    *self.f2.i("K"+idx[1]) \
-                    *ctf.TTTP(self.omega, [sk, self.f1, self.f2]).i(idx[0]+"JK")
+                out.i(idx) << self.f1.i("J"+idx[1]) \
+                    *self.f2.i("K"+idx[1]) \
+                    *ctf.TTTP(self.omega, [sk, self.f1, self.f2]).i(idx[0]+"JK")
             if self.string=="V":
-                return self.f1.i("I"+idx[1]) \
-                    *self.f2.i("K"+idx[1]) \
-                    *ctf.TTTP(self.omega, [self.f1, sk, self.f2]).i("I"+idx[0]+"K")
+                out.i(idx) << self.f1.i("I"+idx[1]) \
+                    *self.f2.i("K"+idx[1]) \
+                    *ctf.TTTP(self.omega, [self.f1, sk, self.f2]).i("I"+idx[0]+"K")
             if self.string=="W":
-                return self.f1.i("I"+idx[1]) \
-                    *self.f2.i("J"+idx[1]) \
-                    *ctf.TTTP(self.omega, [self.f1, self.f2, sk]).i("IJ"+idx[0])
+                out.i(idx) << self.f1.i("I"+idx[1]) \
+                    *self.f2.i("J"+idx[1]) \
+                    *ctf.TTTP(self.omega, [self.f1, self.f2, sk]).i("IJ"+idx[0])
 
 
 def CG(A,b,x0,r,regParam,I,is_implicit=False):
@@ -53,7 +50,7 @@ def CG(A,b,x0,r,regParam,I,is_implicit=False):
 
     Ax0 = ctf.tensor((I,r))
     if is_implicit:
-        Ax0.i("ir") << A.mul("ir",x0)
+        A.MTTKRP_TTTP(x0,Ax0)
     else:
         Ax0.i("ir") << A.i("irl")*x0.i("il")
     Ax0 += regParam*x0
@@ -66,7 +63,7 @@ def CG(A,b,x0,r,regParam,I,is_implicit=False):
         t_cg_bmvec.start()
         t0 = time.time()
         if is_implicit:
-            Ask.i("ir") << A.mul("ir",sk)
+            A.MTTKRP_TTTP(sk,Ask)
         else:
             Ask.i("ir") << A.i("irl")*sk.i("il")
         t1 = time.time()
diff --git a/arg_defs.py b/arg_defs.py
index 85e9a30..f695c45 100644
--- a/arg_defs.py
+++ b/arg_defs.py
@@ -133,11 +133,11 @@ def add_general_arguments(parser):
         metavar='int',
         help='whether to use function tensor as test problem (default: 0, i.e. False, use explicit low CP-rank sampled tensor)')
     parser.add_argument(
-        '--use-CCD-TTTP',
+        '--use-MTTKRP',
         type=int,
         default=1,
         metavar='int',
-        help='whether to use TTTP for CCD contractions (default: 1, i.e. Yes)')
+        help='whether to use special MTTKRP kernel (default: 1, i.e. Yes)')
     parser.add_argument(
         '--tensor-file',
         type=str,
diff --git a/ccd.py b/ccd.py
index 05978f0..9f07693 100644
--- a/ccd.py
+++ b/ccd.py
@@ -28,7 +28,7 @@ def get_objective(T,U,V,W,omega,regParam):
     return [objective, RMSE]
 
 
-def run_CCD(T,U,V,W,omega,regParam,num_iter,time_limit,objective_frequency,use_TTTP=True):
+def run_CCD(T,U,V,W,omega,regParam,num_iter,time_limit,objective_frequency,use_MTTKRP=True):
     U_vec_list = []
     V_vec_list = []
    W_vec_list = []
@@ -90,17 +90,19 @@ def run_CCD(T,U,V,W,omega,regParam,num_iter,time_limit,objective_frequency,use_T
                 print('updating U[:,{}]'.format(f))
 
             t0 = time.time()
-            if use_TTTP:
+            if use_MTTKRP:
                 alphas = ctf.tensor(R.shape[0])
-                ctf.einsum('ijk -> i', ctf.TTTP(R, [None, V_vec_list[f], W_vec_list[f]]),out=alphas)
+                #ctf.einsum('ijk -> i', ctf.TTTP(R, [None, V_vec_list[f], W_vec_list[f]]),out=alphas)
+                ctf.MTTKRP(R, [alphas, V_vec_list[f], W_vec_list[f]], 0)
             else:
                 alphas = ctf.einsum('ijk, j, k -> i', R, V_vec_list[f], W_vec_list[f])
             t1 = time.time()
 
-            if use_TTTP:
+            if use_MTTKRP:
                 betas = ctf.tensor(R.shape[0])
-                ctf.einsum('ijk -> i', ctf.TTTP(omega, [None, V_vec_list[f]*V_vec_list[f], W_vec_list[f]*W_vec_list[f]]),out=betas)
+                #ctf.einsum('ijk -> i', ctf.TTTP(omega, [None, V_vec_list[f]*V_vec_list[f], W_vec_list[f]*W_vec_list[f]]),out=betas)
+                ctf.MTTKRP(omega, [betas, V_vec_list[f]*V_vec_list[f], W_vec_list[f]*W_vec_list[f]], 0)
             else:
                 betas = ctf.einsum('ijk, j, j, k, k -> i', omega, V_vec_list[f], V_vec_list[f], W_vec_list[f], W_vec_list[f])
 
@@ -117,15 +119,17 @@ def run_CCD(T,U,V,W,omega,regParam,num_iter,time_limit,objective_frequency,use_T
             # update V[:,f]
             if glob_comm.rank() == 0 and status_prints == True:
                 print('updating V[:,{}]'.format(f))
 
-            if use_TTTP:
+            if use_MTTKRP:
                 alphas = ctf.tensor(R.shape[1])
-                ctf.einsum('ijk -> j', ctf.TTTP(R, [U_vec_list[f], None, W_vec_list[f]]),out=alphas)
+                #ctf.einsum('ijk -> j', ctf.TTTP(R, [U_vec_list[f], None, W_vec_list[f]]),out=alphas)
+                ctf.MTTKRP(R, [U_vec_list[f], alphas, W_vec_list[f]], 1)
             else:
                 alphas = ctf.einsum('ijk, i, k -> j', R, U_vec_list[f], W_vec_list[f])
 
-            if use_TTTP:
+            if use_MTTKRP:
                 betas = ctf.tensor(R.shape[1])
-                ctf.einsum('ijk -> j', ctf.TTTP(omega, [U_vec_list[f]*U_vec_list[f], None, W_vec_list[f]*W_vec_list[f]]),out=betas)
+                #ctf.einsum('ijk -> j', ctf.TTTP(omega, [U_vec_list[f]*U_vec_list[f], None, W_vec_list[f]*W_vec_list[f]]),out=betas)
+                ctf.MTTKRP(omega, [U_vec_list[f]*U_vec_list[f], betas, W_vec_list[f]*W_vec_list[f]], 1)
             else:
                 betas = ctf.einsum('ijk, i, i, k, k -> j', omega, U_vec_list[f], U_vec_list[f], W_vec_list[f], W_vec_list[f])
 
@@ -135,15 +139,17 @@ def run_CCD(T,U,V,W,omega,regParam,num_iter,time_limit,objective_frequency,use_T
             if glob_comm.rank() == 0 and status_prints == True:
                 print('updating W[:,{}]'.format(f))
 
-            if use_TTTP:
+            if use_MTTKRP:
                 alphas = ctf.tensor(R.shape[2])
-                ctf.einsum('ijk -> k', ctf.TTTP(R, [U_vec_list[f], V_vec_list[f], None]),out=alphas)
+                #ctf.einsum('ijk -> k', ctf.TTTP(R, [U_vec_list[f], V_vec_list[f], None]),out=alphas)
+                ctf.MTTKRP(R, [U_vec_list[f], V_vec_list[f], alphas], 2)
             else:
                 alphas = ctf.einsum('ijk, i, j -> k', R, U_vec_list[f], V_vec_list[f])
 
-            if use_TTTP:
+            if use_MTTKRP:
                 betas = ctf.tensor(R.shape[2])
-                ctf.einsum('ijk -> k', ctf.TTTP(omega, [U_vec_list[f]*U_vec_list[f], V_vec_list[f]*V_vec_list[f], None]),out=betas)
+                #ctf.einsum('ijk -> k', ctf.TTTP(omega, [U_vec_list[f]*U_vec_list[f], V_vec_list[f]*V_vec_list[f], None]),out=betas)
+                ctf.MTTKRP(omega, [U_vec_list[f]*U_vec_list[f], V_vec_list[f]*V_vec_list[f], betas], 2)
             else:
                 betas = ctf.einsum('ijk, i, i, j, j -> k', omega, U_vec_list[f], U_vec_list[f], V_vec_list[f], V_vec_list[f])
diff --git a/combined_test.py b/combined_test.py
index 8daeeec..f301e9f 100644
--- a/combined_test.py
+++ b/combined_test.py
@@ -111,7 +111,7 @@ def create_function_tensor(I, J, K, sp_frac, use_sp_rep):
 sample_frac_SGD = args.sample_frac_SGD
 use_func_tsr = args.function_tensor
 tensor_file = args.tensor_file
-use_CCD_TTTP = args.use_CCD_TTTP
+use_MTTKRP = args.use_MTTKRP
 
 
 if use_func_tsr == True:
@@ -134,6 +134,7 @@ def create_function_tensor(I, J, K, sp_frac, use_sp_rep):
 
 print("Dense tensor shape is",T.shape)
 print("Computing tensor completion with CP rank",R)
+print("use_MTTKRP set to",use_MTTKRP)
 
 omega = getOmega(T)
 U = ctf.random.random((I, R))
@@ -152,7 +153,7 @@ def create_function_tensor(I, J, K, sp_frac, use_sp_rep):
     V_copy = ctf.copy(V)
     W_copy = ctf.copy(W)
 
-    getALS_CG(T,U_copy,V_copy,W_copy,reg_ALS,omega,I,J,K,R,block_size_ALS_imp,numiter_ALS_imp,err_thresh,time_limit,True)
+    getALS_CG(T,U_copy,V_copy,W_copy,reg_ALS,omega,I,J,K,R,block_size_ALS_imp,numiter_ALS_imp,err_thresh,time_limit,True,use_MTTKRP)
 
 if numiter_ALS_exp > 0:
     if ctf.comm().rank() == 0:
@@ -162,18 +163,18 @@ def create_function_tensor(I, J, K, sp_frac, use_sp_rep):
     V_copy = ctf.copy(V)
     W_copy = ctf.copy(W)
 
-    getALS_CG(T,U_copy,V_copy,W_copy,reg_ALS,omega,I,J,K,R,block_size_ALS_exp,numiter_ALS_exp,err_thresh,time_limit,False)
+    getALS_CG(T,U_copy,V_copy,W_copy,reg_ALS,omega,I,J,K,R,block_size_ALS_exp,numiter_ALS_exp,err_thresh,time_limit,False,use_MTTKRP)
 
 
 if numiter_CCD > 0:
     if ctf.comm().rank() == 0:
-        print("Performing up to",numiter_CCD,"iterations, or reaching time limit of",time_limit,"seconds of CCD with use of TTTP for contractions set to use_CCD_TTTP =",use_CCD_TTTP)
+        print("Performing up to",numiter_CCD,"iterations, or reaching time limit of",time_limit,"seconds of CCD")
         print("CCD regularization parameter is",reg_CCD)
 
     U_copy = ctf.copy(U)
     V_copy = ctf.copy(V)
     W_copy = ctf.copy(W)
 
-    run_CCD(T,U_copy,V_copy,W_copy,omega,reg_CCD,numiter_CCD,time_limit,objfreq_CCD,use_CCD_TTTP)
+    run_CCD(T,U_copy,V_copy,W_copy,omega,reg_CCD,numiter_CCD,time_limit,objfreq_CCD,use_MTTKRP)
 
 
 if numiter_SGD > 0:
@@ -184,7 +185,7 @@ def create_function_tensor(I, J, K, sp_frac, use_sp_rep):
     V_copy = ctf.copy(V)
     W_copy = ctf.copy(W)
 
-    sparse_SGD(T, U_copy, V_copy, W_copy, reg_SGD, omega, I, J, K, R, learn_rate, sample_frac_SGD, numiter_SGD, err_thresh, time_limit, objfreq_SGD)
+    sparse_SGD(T, U_copy, V_copy, W_copy, reg_SGD, omega, I, J, K, R, learn_rate, sample_frac_SGD, numiter_SGD, err_thresh, time_limit, objfreq_SGD,use_MTTKRP)
diff --git a/sgd.py b/sgd.py
index 1727b85..1bef953 100644
--- a/sgd.py
+++ b/sgd.py
@@ -11,7 +11,7 @@
 
 INDEX_STRING = "ijklmnopq"
 
-def sparse_update(T, factors, Lambda, sizes, rank, stepSize, sample_rate, times):
+def sparse_update(T, factors, Lambda, sizes, rank, stepSize, sample_rate, times, use_MTTKRP):
     starting_time = time.time()
     t_go = ctf.timer("SGD_getOmega")
     t_go.start()
@@ -22,29 +22,30 @@ def sparse_update(T, factors, Lambda, sizes, rank, stepSize, sample_rate, times)
     R = ctf.tensor(copy=T) #ctf.tensor(tuple(sizes), sp = True)
     times[2] += time.time() - starting_time
     for i in range(dimension):
-        tup_list = [factors[i].i(indexes[i] + "r") for i in range(dimension)]
-        #R.i(indexes) << T.i(indexes) - omega.i(indexes) * reduce(lambda x, y: x * y, tup_list)
         starting_time = time.time()
         R.i(indexes) << -1.* ctf.TTTP(omega, factors).i(indexes)
         times[3] += time.time() - starting_time
         starting_time = time.time()
-        #H = ctf.tensor(tuple((sizes[:i] + sizes[i + 1:] + [rank])))
         times[4] += time.time() - starting_time
         starting_time = time.time()
-        #H.i(indexes[:i] + indexes[i + 1:] + "r") <<
-        Hterm = reduce(lambda x, y: x * y, tup_list[:i] + tup_list[i + 1:])
         times[5] += time.time() - starting_time
         starting_time = time.time()
         t_ctr = ctf.timer("SGD_main_contraction")
         t_ctr.start()
-        (1- stepSize * 2 * Lambda * sample_rate)*factors[i].i(indexes[i] + "r") << stepSize * Hterm * R.i(indexes)
+        if use_MTTKRP:
+            new_fi = (1- stepSize * 2 * Lambda * sample_rate)*factors[i]
+            ctf.MTTKRP(R, factors, i)
+            stepSize*factors[i].i("ir") << new_fi.i("ir")
+        else:
+            tup_list = [factors[i].i(indexes[i] + "r") for i in range(dimension)]
+            Hterm = reduce(lambda x, y: x * y, tup_list[:i] + tup_list[i + 1:])
+            (1- stepSize * 2 * Lambda * sample_rate)*factors[i].i(indexes[i] + "r") << stepSize * Hterm * R.i(indexes)
         t_ctr.stop()
         times[6] += time.time() - starting_time
         if i < dimension - 1:
             R = ctf.tensor(copy=T)
-    #return ctf.vecnorm(R) + (sum([ctf.vecnorm(f) for f in factors])) * Lambda
 
-def sparse_SGD(T, U, V, W, Lambda, omega, I, J, K, r, stepSize, sample_rate, num_iter, errThresh, time_limit, work_cycle):
+def sparse_SGD(T, U, V, W, Lambda, omega, I, J, K, r, stepSize, sample_rate, num_iter, errThresh, time_limit, work_cycle, use_MTTKRP):
     times = [0 for i in range(7)]
 
     iteration_count = 0
@@ -58,11 +59,9 @@ def sparse_SGD(T, U, V, W, Lambda, omega, I, J, K, r, stepSize, sample_rate, num
     starting_time = time.time()
     dtime = 0
     R.i("ijk") << T.i("ijk") - ctf.TTTP(omega, [U, V, W]).i("ijk")
-    # R.i("ijk") << T.i("ijk") - U.i("iu") * V.i("ju") * W.i("ku") * omega.i("ijk")
     curr_err_norm = ctf.vecnorm(R) + (ctf.vecnorm(U) + ctf.vecnorm(V) + ctf.vecnorm(W)) * Lambda
     times[0] += time.time() - starting_time
     norm = [curr_err_norm]
-    #work_cycle = 1 #int(1.0 / sample_rate)
     step = stepSize * 0.5
     t_obj_calc = 0.
@@ -73,8 +72,8 @@ def sparse_SGD(T, U, V, W, Lambda, omega, I, J, K, r, stepSize, sample_rate, num
         sampled_T.sample(sample_rate)
         times[1] += time.time() - starting_time
 
-        sparse_update(sampled_T, [U, V, W], Lambda, [I, J, K], r, stepSize * 0.5 + step, sample_rate, times)
-        step *= 0.99
+        sparse_update(sampled_T, [U, V, W], Lambda, [I, J, K], r, stepSize * 0.5 + step, sample_rate, times, use_MTTKRP)
+        #step *= 0.99
         sampled_T.set_zero()
 
         if iteration_count % work_cycle == 0:
@@ -82,7 +81,6 @@ def sparse_SGD(T, U, V, W, Lambda, omega, I, J, K, r, stepSize, sample_rate, num
             t_b_obj = time.time()
             total_count += 1
             R.set_zero()
-            #R.i("ijk") << T.i("ijk") - U.i("iu") * V.i("ju") * W.i("ku") * omega.i("ijk")
             R.i("ijk") << T.i("ijk") - ctf.TTTP(omega, [U, V, W]).i("ijk")
             diff_norm = ctf.vecnorm(R)
             RMSE = diff_norm/(nnz_tot**.5)
@@ -91,11 +89,6 @@ def sparse_SGD(T, U, V, W, Lambda, omega, I, J, K, r, stepSize, sample_rate, num
                 print('Objective after',duration,'seconds (',iteration_count,'iterations) is: {}'.format(next_err_norm))
                 print('RMSE after',duration,'seconds (',iteration_count,'iterations) is: {}'.format(RMSE))
             t_obj_calc += time.time() - t_b_obj
-            #print(curr_err_norm, next_err_norm, diff_norm)
-            #if ctf.comm().rank() == 0:
-            #    x = time.time() - start_time
-            #    print(diff_norm, x, total_count, x/total_count)
-            #    # print(times)
 
             if abs(curr_err_norm - next_err_norm) < errThresh:
                 break
@@ -107,10 +100,6 @@ def sparse_SGD(T, U, V, W, Lambda, omega, I, J, K, r, stepSize, sample_rate, num
     if ctf.comm().rank() == 0:
         print('SGD amortized seconds per sweep: {}'.format(duration/(iteration_count*sample_rate)))
         print("Time/SGD iteration: {}".format(duration/iteration_count))
-    #curr_err_norm = ctf.vecnorm(R) + (ctf.vecnorm(U) + ctf.vecnorm(V) + ctf.vecnorm(W)) * Lambda
-    #R.i("ijk") << T.i("ijk") - ctf.TTTP(omega, [U, V, W]).i("ijk")
-    #if ctf.comm().rank() == 0:
-    #    print('Objective after',duration,'seconds is: {}'.format(curr_err_norm))
     return norm
 
 def getOmega(T):
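
Reviewer note: the sketch below is a minimal, standalone check of the einsum -> ctf.MTTKRP substitutions exercised by this patch, using only ctf calls that already appear in the diff (ctf.MTTKRP, ctf.einsum, ctf.tensor, ctf.random.random, ctf.vecnorm). It is illustrative only, not part of the patch; sizes and names are placeholders, and it assumes ctf.random.random accepts the 1-D shapes used here.

import ctf

I, J, K, r = 6, 7, 8, 3
T = ctf.random.random((I, J, K))   # stand-in for the residual tensor R
V = ctf.random.random((J, r))
W = ctf.random.random((K, r))
v = ctf.random.random((J,))        # single factor columns, as used in ccd.py
w = ctf.random.random((K,))

# ccd.py-style call: mode-0 MTTKRP with vector factors,
# alphas_i = sum_{j,k} T_ijk v_j w_k, replacing the einsum over ctf.TTTP.
alphas = ctf.tensor((I,))
ctf.MTTKRP(T, [alphas, v, w], 0)
ref_vec = ctf.einsum('ijk, j, k -> i', T, v, w)
assert ctf.vecnorm(alphas - ref_vec) <= 1e-8 * ctf.vecnorm(ref_vec)

# als.py-style call: mode-0 MTTKRP with matrix factors,
# out_ir = sum_{j,k} T_ijk V_jr W_kr; MTTKRP_TTTP applies this same kernel
# to ctf.TTTP(self.omega, [sk, self.f1, self.f2]).
out = ctf.tensor((I, r))
ctf.MTTKRP(T, [out, V, W], 0)
ref_mat = ctf.einsum('ijk, jr, kr -> ir', T, V, W)
assert ctf.vecnorm(out - ref_mat) <= 1e-8 * ctf.vecnorm(ref_mat)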