
Commit

Code cleanup
zhihao committed Sep 20, 2024
1 parent 281a8bf commit 1bc1c1e
Showing 8 changed files with 10 additions and 113 deletions.
4 changes: 0 additions & 4 deletions inference/models/llama.cc
@@ -91,7 +91,6 @@ void LLAMA::create_llama_model(FFModel &ff,
token = token_att_norm[0];
att_norm = token_att_norm[1];
}
att_norm->print("att_norm");
Tensor qkv_proj = ff.dense(
att_norm,
llama_config.hidden_size *
@@ -107,7 +106,6 @@ void LLAMA::create_llama_model(FFModel &ff,
0.0f, // no dropout
std::string("layers." + std::to_string(i) + ".self_attn.qkv_proj")
.c_str());
qkv_proj->print("qkv_proj");

Tensor mha;
switch (mode) {
@@ -189,7 +187,6 @@ void LLAMA::create_llama_model(FFModel &ff,
}

Tensor mha_input = mha;
mha_input->print("mha_input");
mha = ff.dense(
mha_input,
llama_config.hidden_size,
@@ -203,7 +200,6 @@ void LLAMA::create_llama_model(FFModel &ff,
0.0f,
std::string("layers." + std::to_string(i) + ".self_attn.o_proj")
.c_str());
mha->print("mha");

// step 2: SILU activaion
Tensor token_ff_norm[2] = {nullptr, nullptr};
7 changes: 0 additions & 7 deletions src/ops/inc_multihead_self_attention.cc
@@ -600,13 +600,6 @@ OpMeta *IncMultiHeadSelfAttention::init_task(
attn->num_kv_heads / attn->tensor_parallelism_degree +
(attn->num_kv_heads % attn->tensor_parallelism_degree != 0);

if (attn->oProjSize != output.domain.hi()[0] - output.domain.lo()[0] + 1) {
printf("attn o_proj size %d does not match output domain %d\n",
attn->oProjSize,
output.domain.hi()[0] - output.domain.lo()[0] + 1);
}
// assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] +
// 1);

Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
MemoryAllocator gpu_mem_allocator(gpu_mem);
14 changes: 3 additions & 11 deletions src/ops/inc_multihead_self_attention.cpp
@@ -923,9 +923,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
BatchConfig const *bc,
int shard_id,
DT const *qkv_ptr,
DT const *weight_ptr,
DT *output_ptr,
DT const *bias_ptr,
hipStream_t stream) {

if (m->offload && m->biasSize > 0) {
@@ -954,7 +952,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
if (bc->num_tokens > bc->num_generation_tokens) {
// phase 4: Compute attention score for prompt tokens;
compute_attention_kernel_prompt(
m, bc, shard_id, bias_ptr, weight_ptr, stream);
m, bc, shard_id, stream);
}

// compute output production and bias together for all tokens
@@ -1482,12 +1480,11 @@ __global__ void store_query_cache(DT const *devQKVProjArray,
}
}

template <typename DT>
// Please refer to the implementation in .cu file.
// This implementation is outdated
void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m,
BatchConfig const *bc,
int shard_id,
DT const *bias_ptr,
DT const *weight_ptr,
hipStream_t stream) {
checkCUDA(hipblasSetStream(m->handle.blas, stream));
checkCUDNN(miopenSetStream(m->handle.dnn, stream));
@@ -1802,9 +1799,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
bc,
shard_id,
input.get_half_ptr(),
m->offload ? static_cast<half *>(m->weight_ptr) : weight.get_half_ptr(),
output.get_half_ptr(),
bias_ptr,
stream);
} else if (input.data_type == DT_FLOAT) {
if (m->offload) {
@@ -1817,10 +1812,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
bc,
shard_id,
input.get_float_ptr(),
m->offload ? static_cast<float *>(m->weight_ptr)
: weight.get_float_ptr(),
output.get_float_ptr(),
bias_ptr,
stream);
} else {
assert(false && "Unspported data type");
47 changes: 6 additions & 41 deletions src/ops/inc_multihead_self_attention.cu
@@ -739,7 +739,7 @@ void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m,
// this kernel is no longer used by the attention operator because
// there's no more weights
// TODO: check if this is needed by the projection layers?
// It is left in case we want to reuse this part in the future
template <typename DT>
void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
GenericTensorAccessorR const weight,
@@ -805,9 +805,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
BatchConfig const *bc,
int shard_id,
DT const *qkv_ptr,
DT const *weight_ptr,
DT *output_ptr,
DT const *bias_ptr,
cudaStream_t stream) {
// phase 0: copy calculated qkv into devQKVProjArray
@@ -825,11 +823,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
compute_qkv_kernel(m,
bc,
shard_id,
// input_ptr,
// weight_ptr,
// nullptr, // does not use weight
static_cast<DT *>(m->devQKVProjArray),
// bias_ptr,
stream);
update_kv_cache_kernel<DT>(m, bc, stream);
@@ -842,7 +836,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
if (bc->num_tokens > bc->num_generation_tokens) {
// phase 4: Compute attention score for prompt tokens;
compute_attention_kernel_prompt(
m, bc, shard_id, bias_ptr, weight_ptr, stream);
m, bc, shard_id, static_cast<DT*>(nullptr), static_cast<DT*>(nullptr), stream);
}
// compute output production and bias together for all tokens
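
With this cleanup the attention kernels receive Q, K and V already produced by the separate qkv_proj dense layer (see the llama.cc diff above), so the operator no longer owns projection weights of its own; that is why the weight_ptr and bias_ptr arguments disappear throughout this file. The sketch below is a self-contained CPU illustration of attention over precomputed Q/K/V, with a single head, causal masking and row-major [num_tokens x head_dim] buffers; the names and layout are illustrative, not FlexFlow's.

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Causal scaled-dot-product attention over precomputed Q, K, V.
// q, k, v and out are [num_tokens x head_dim], row-major; out must be
// pre-sized by the caller.
void attend_precomputed_qkv(std::vector<float> const &q,
                            std::vector<float> const &k,
                            std::vector<float> const &v,
                            std::vector<float> &out,
                            int num_tokens,
                            int head_dim) {
  float const scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
  for (int i = 0; i < num_tokens; i++) {
    std::vector<float> score(i + 1);
    float max_s = -std::numeric_limits<float>::infinity();
    for (int j = 0; j <= i; j++) { // token i attends to tokens 0..i
      float s = 0.0f;
      for (int d = 0; d < head_dim; d++) {
        s += q[i * head_dim + d] * k[j * head_dim + d];
      }
      score[j] = s * scale;
      max_s = std::max(max_s, score[j]);
    }
    float denom = 0.0f;
    for (int j = 0; j <= i; j++) { // softmax over the causal window
      score[j] = std::exp(score[j] - max_s);
      denom += score[j];
    }
    for (int d = 0; d < head_dim; d++) { // weighted sum of V rows
      float acc = 0.0f;
      for (int j = 0; j <= i; j++) {
        acc += (score[j] / denom) * v[j * head_dim + d];
      }
      out[i * head_dim + d] = acc;
    }
  }
}

The real kernels additionally stage Q/K/V into devQKVProjArray, maintain the KV cache and batch over heads, but none of those steps needs a weight tensor, which is the point of the deletions above.
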
@@ -1355,14 +1349,12 @@ void peft_bwd_kernel(
int n_ = num_tokens;
int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize);
// TODO: checkout if the input grad ptr has some relation with
// m->devQKVProjArray so we may potentially skip this transpose and copy
// TODO: check if this transposeAdd can correctly implement gradient
// accumulation
// The original version uses existing result and attention's projection to
// do further calculation in a way different than the usual dense layer,
// they are off by a transpose. So an explicit transpose is needed here.
// The add here is just for gradient accumulation.
transposeAdd(C, B, n_, k_, alpha, beta, stream);
// printf("backward of raw attn grad: %d, %d, with redudant dimension
// %d\n", k_, n_, m_);
if (m->inference_debugging) {
std::string filename =
get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0";
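
The transposeAdd call above does two things that the rewritten comment describes: the raw attention gradient and the expected input-gradient layout are off by a transpose, and the beta-scaled add lets the result accumulate into an existing gradient buffer. A plain-C++ sketch of the assumed semantics follows; the actual kernel's argument order, matrix layout and scaling conventions may differ.

#include <vector>

// Assumed reference semantics (illustrative only): C is k x n and B is n x k,
// both row-major; C = alpha * transpose(B) + beta * C, so beta = 1 accumulates
// into an existing gradient and beta = 0 overwrites it.
void transpose_add_ref(std::vector<float> &C, std::vector<float> const &B,
                       int n, int k, float alpha, float beta) {
  for (int i = 0; i < k; i++) {
    for (int j = 0; j < n; j++) {
      C[i * n + j] = alpha * B[j * k + i] + beta * C[i * n + j];
    }
  }
}
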
@@ -1712,14 +1704,10 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
BatchConfig const *bc,
int shard_id,
GenericTensorAccessorR const &input,
// GenericTensorAccessorR const &weight,
GenericTensorAccessorW const &output
// GenericTensorAccessorR const &bias
) {
// printf("inf_k_warpper start\n");
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
// bool use_bias = *m->qkv_bias || *m->final_bias;
cudaEvent_t t_start, t_end;
if (m->profiling) {
@@ -1728,31 +1716,23 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
cudaEventRecord(t_start, stream);
}
// assert(input.data_type == weight.data_type);
assert(input.data_type == output.data_type);
// if (use_bias) {
// assert(input.data_type == bias.data_type);
// }
if (input.data_type == DT_HALF) {
Kernels::IncMultiHeadAttention::inference_kernel(
m,
bc,
shard_id,
input.get_half_ptr(),
static_cast<half const *>(nullptr), // weight_ptr is no longer used
output.get_half_ptr(),
static_cast<half const *>(nullptr), // bias_ptr is no longer used
stream);
} else if (input.data_type == DT_FLOAT) {
Kernels::IncMultiHeadAttention::inference_kernel(
m,
bc,
shard_id,
input.get_float_ptr(),
static_cast<float const *>(nullptr), // weight_ptr is no longer used
output.get_float_ptr(),
static_cast<float const *>(nullptr), // bias_ptr is no longer used
stream);
} else {
assert(false && "Unspported data type");
@@ -1775,9 +1755,7 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper(
BatchConfig const *bc,
int shard_id,
GenericTensorAccessorW const &input_grad,
// GenericTensorAccessorR const &weight,
GenericTensorAccessorR const &output_grad) {
// GenericTensorAccessorR const &bias) {
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
bool use_bias = *m->qkv_bias || *m->final_bias;
@@ -1789,41 +1767,28 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper(
cudaEventRecord(t_start, stream);
}
// assert(input.data_type == weight.data_type);
assert(input_grad.data_type == output_grad.data_type);
// if (use_bias) {
// assert(input_grad.data_type == bias.data_type);
// }
if (input_grad.data_type == DT_HALF) {
assert(!m->offload);
// half const *bias_ptr =
// use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
Kernels::IncMultiHeadAttention::peft_bwd_kernel(
m,
bc,
shard_id,
input_grad.get_half_ptr(),
// weight.get_half_ptr(),
static_cast<half const *>(nullptr),
output_grad.get_half_ptr(),
// bias_ptr,
static_cast<half const *>(nullptr),
stream);
} else if (input_grad.data_type == DT_FLOAT) {
assert(!m->offload);
// float const *bias_ptr =
// use_bias ? bias.get_float_ptr() : static_cast<float const
// *>(nullptr);
Kernels::IncMultiHeadAttention::peft_bwd_kernel(
m,
bc,
shard_id,
input_grad.get_float_ptr(),
// weight.get_float_ptr(),
static_cast<float const *>(nullptr),
output_grad.get_float_ptr(),
// bias_ptr,
static_cast<float const *>(nullptr),
stream);
} else {
8 changes: 0 additions & 8 deletions src/ops/linear.cc
@@ -779,14 +779,6 @@ void Linear::peft_bwd_task(Task const *task,
if (m->inference_debugging) {
assert(task->index_point.get_dim() == 1);
int shard_id = task->index_point.point_data[0];
printf("%s: in_dim = %d, out_dim = %d, num_infr_tokens = %d, "
"num_peft_tokens = %d, volume = %d\n",
m->op_name,
in_dim,
out_dim,
num_infr_tokens,
num_peft_tokens,
input_grad.domain.get_volume());
Linear::save_inference_tensors_to_file(
m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false);
}
10 changes: 1 addition & 9 deletions src/ops/spec_inc_multihead_self_attention.cu
@@ -463,8 +463,6 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
BeamSearchBatchConfig const *bc,
int shard_id,
DT *output_ptr,
DT const *bias_ptr,
DT const *weight_ptr,
cudaStream_t stream) {
checkCUDA(cublasSetStream(m->handle.blas, stream));
checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
@@ -699,9 +697,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
BeamSearchBatchConfig const *bc,
int shard_id,
DT const *qkv_ptr,
DT const *weight_ptr,
DT *output_ptr,
DT const *bias_ptr,
cudaStream_t stream) {
// phase 0: copy calculated qkv into devQKVProjArray
Expand Down Expand Up @@ -736,7 +732,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
// 3 kernels for pahse 3: matmul1 - softmax - matmal2
if (bc->num_tokens > bc->num_generation_tokens) {
compute_attention_kernel_prompt(
m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream);
m, bc, shard_id, output_ptr, stream);
}
// compute output production and bias together for all tokens
int num_tokens = bc->num_active_tokens();
@@ -780,19 +776,15 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
bc,
shard_id,
input.get_half_ptr(),
static_cast<half const *>(nullptr),
output.get_half_ptr(),
static_cast<half const *>(nullptr),
stream);
} else if (input.data_type == DT_FLOAT) {
Kernels::SpecIncMultiHeadSelfAttention::inference_kernel(
m,
bc,
shard_id,
input.get_float_ptr(),
static_cast<float const *>(nullptr),
output.get_float_ptr(),
static_cast<float const *>(nullptr),
stream);
} else {
assert(false && "Unspported data type");
31 changes: 0 additions & 31 deletions src/runtime/file_loader.cc
@@ -918,17 +918,10 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
for (int i = 0; i < weight->num_dims; i++) {
dims_vec.push_back(weight->dims[i]);
volume *= weight->dims[i];
// std::cout<<l->name<<" dim "<<i<<": "<<weight->dims[i]<<std::endl;
}
assert(data_type_size(weight->data_type) == sizeof(DT));
DT *data = (DT *)malloc(sizeof(DT) * volume);

// printf("loading weight for %s, shapes: ", l->name);
// for(int i = 0; i < weight->num_dims; i++) {
// printf("%d ", weight->dims[i]);
// }
// printf("\n");

std::string weight_filename = removeGuidOperatorName(std::string(l->name));
bool is_attn_proj = false, is_o_proj = false;

@@ -961,29 +954,6 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION ||
l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION ||
l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) {
// if (weight_idx == 0) {
// load_attention_weights_v2(data,
// num_heads,
// num_kv_heads,
// hidden_dim,
// qkv_inner_dim,
// weight_filename,
// weights_folder,
// volume,
// tensor_parallelism_degree);
// } else {
// long long value;
// l->get_int_property("final_bias", value);
// bool final_bias = (bool)value;
// load_attention_bias_v2(data,
// num_heads,
// num_kv_heads,
// hidden_dim,
// qkv_inner_dim,
// final_bias,
// weight_filename,
// weights_folder);
// }
} else if (is_attn_proj) {
if (is_o_proj) {
if (weight_idx == 0) {
@@ -1053,7 +1023,6 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
}

// Copy the weight data from the buffer to the weight's ParallelTensor
printf("using default load for %s\n", l->name);
ParallelTensor weight_pt;
ff->get_parallel_tensor_from_tensor(weight, weight_pt);
weight_pt->set_tensor<DT>(ff, dims_vec, data);
2 changes: 0 additions & 2 deletions src/runtime/request_manager.cc
@@ -2756,7 +2756,6 @@ void RequestManager::start_background_server(FFModel *model) {
// Register callbacks for termination
{
std::set_terminate([]() {
// assert(false && "terminate");
RequestManager::terminate_background_server_at_exit();
std::abort();
});
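
The terminate handler kept above runs the request manager's cleanup before the process aborts on an unhandled exception or any other std::terminate condition. A standalone illustration of the same std::set_terminate pattern, with stand-in function names rather than FlexFlow's:

#include <cstdlib>
#include <exception>
#include <iostream>
#include <stdexcept>

// Stand-in for RequestManager::terminate_background_server_at_exit().
void shutdown_background_server() {
  std::cout << "terminating background server" << std::endl;
}

int main() {
  std::set_terminate([]() {
    shutdown_background_server();
    std::abort(); // a terminate handler must not return
  });
  throw std::runtime_error("unhandled"); // invokes the installed handler
}
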
@@ -3013,7 +3012,6 @@ void RequestManager::trigger_request_completion_future(
/*static*/
void RequestManager::terminate_background_server_at_exit() {
RequestManager *rm = RequestManager::get_request_manager();
// assert(false && "RM terminating bg server due to exit");
rm->terminate_background_server();
}

