diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 07fd41bcfc..cc4464b496 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1130,36 +1130,47 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
float alpha = 1.0f, beta = 0.0f;
// matrix A: output projection weight
// matrix A's layout: [vProjSize * num_heads, oProjSize]
- DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads +
- m->kProjSize * m->num_q_heads +
- m->vProjSize * m->num_q_heads);
- // matrix B: output gradients
- // matrix B's layout: [oProjSize, num_new_tokens]
- DT const *B =
- output_grad_ptr +
- bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize;
- // matrix C: attn_heads gradients
- // matrix C's layout: [vProjSize * num_heads, num_new_tokens]
+ // DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads +
+ // m->kProjSize * m->num_q_heads +
+ // m->vProjSize * m->num_q_heads);
+ // // matrix B: output gradients
+ // // matrix B's layout: [oProjSize, num_new_tokens]
+ // DT const *B =
+ // output_grad_ptr +
+ // bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize;
+ // // matrix C: attn_heads gradients
+ // // matrix C's layout: [vProjSize * num_heads, num_new_tokens]
+ // DT *C = static_cast<DT *>(m->handle.workSpace);
+ // checkCUDA(cublasGemmEx(m->handle.blas,
+ // CUBLAS_OP_N,
+ // CUBLAS_OP_N,
+ // m_,
+ // n_,
+ // k_,
+ // &alpha,
+ // A,
+ // cublas_data_type,
+ // lda,
+ // B,
+ // cublas_data_type,
+ // ldb,
+ // &beta,
+ // C,
+ // cublas_data_type,
+ // ldc,
+ // compute_type,
+ // CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ // here we copy the gradient from o_proj directly into C
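+ // (assumption: the o_proj GEMM now lives in a separate Linear layer, so
+ // output_grad_ptr already holds the gradient w.r.t. the attention heads)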
DT *C = static_cast<DT *>(m->handle.workSpace);
- checkCUDA(cublasGemmEx(m->handle.blas,
- CUBLAS_OP_N,
- CUBLAS_OP_N,
- m_,
- n_,
- k_,
- &alpha,
- A,
- cublas_data_type,
- lda,
- B,
- cublas_data_type,
- ldb,
- &beta,
- C,
- cublas_data_type,
- ldc,
- compute_type,
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ checkCUDA(cudaMemcpyAsync(C,
+                           output_grad_ptr +
+                               bc->requestsInfo[i].first_token_offset_in_batch *
+                                   m->oProjSize,
+                           m_ * n_ * sizeof(DT),
+                           cudaMemcpyDeviceToDevice,
+                           stream));
if (m->inference_debugging) {
// save result to file for checking
std::string filename =
@@ -1526,25 +1537,33 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m,
int lda = m_;
int ldb = n_;
int ldc = m_;
- checkCUDA(cublasGemmEx(m->handle.blas,
- CUBLAS_OP_N,
- CUBLAS_OP_T,
- m_,
- n_,
- k_,
- &alpha,
- A,
- cublas_data_type,
- lda,
- B,
- cublas_data_type,
- ldb,
- &beta,
- C,
- cublas_data_type,
- ldc,
- compute_type,
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+ // checkCUDA(cublasGemmEx(m->handle.blas,
+ // CUBLAS_OP_N,
+ // CUBLAS_OP_T,
+ // m_,
+ // n_,
+ // k_,
+ // &alpha,
+ // A,
+ // cublas_data_type,
+ // lda,
+ // B,
+ // cublas_data_type,
+ // ldb,
+ // &beta,
+ // C,
+ // cublas_data_type,
+ // ldc,
+ // compute_type,
+ // CUBLAS_GEMM_DEFAULT_TENSOR_OP));
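+ // here we copy the QKV gradients in B directly into the input gradient C
+ // (assumption: the QKV projection now lives in a separate Linear layer, so
+ // no weight GEMM is needed in this op's backward pass)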
+ checkCUDA(cudaMemcpyAsync(C,
+                           B,
+                           n_ * k_ * sizeof(DT),
+                           cudaMemcpyDeviceToDevice,
+                           stream));
if (m->inference_debugging) {
std::string filename =
get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0";