Implemented for SpecInfer
yingchen21 committed Aug 7, 2024
1 parent 30b3fae commit 09e4741
Showing 8 changed files with 138 additions and 386 deletions.
4 changes: 1 addition & 3 deletions include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -112,9 +112,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
BeamSearchBatchConfig const *bc,
int shard_id,
GenericTensorAccessorR const &input,
- GenericTensorAccessorR const &weight,
- GenericTensorAccessorW const &output,
- GenericTensorAccessorR const &bias);
+ GenericTensorAccessorW const &output);
Params get_params() const;

public:
5 changes: 1 addition & 4 deletions include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -114,10 +114,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
TreeVerifyBatchConfig const *bc,
int shard_id,
GenericTensorAccessorR const &input,
- GenericTensorAccessorR const &weight,
- GenericTensorAccessorW const &output,
- GenericTensorAccessorR const &bias);
-
+ GenericTensorAccessorW const &output);
Params get_params() const;

public:
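Taken together, the two header edits above give the speculative and tree-verify attention ops the same trimmed entry point. Reconstructed from the diff (the tree-verify variant is identical except that it takes a TreeVerifyBatchConfig), the wrapper declaration changes as follows:

    // Before: the kernel wrapper also took weight and bias accessors.
    void inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta const *m,
                                  BeamSearchBatchConfig const *bc,
                                  int shard_id,
                                  GenericTensorAccessorR const &input,
                                  GenericTensorAccessorR const &weight,
                                  GenericTensorAccessorW const &output,
                                  GenericTensorAccessorR const &bias);

    // After: only input and output remain; the projection weights no
    // longer flow through the attention op's interface.
    void inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta const *m,
                                  BeamSearchBatchConfig const *bc,
                                  int shard_id,
                                  GenericTensorAccessorR const &input,
                                  GenericTensorAccessorW const &output);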
11 changes: 3 additions & 8 deletions src/ops/fused.cu
@@ -449,6 +449,7 @@ __host__ void
assert(fused->op_num_outputs[op] == 1);
IncMultiHeadSelfAttentionMeta *m =
(IncMultiHeadSelfAttentionMeta *)metas->meta[op];
+ // TODO: why is op_num_weights still non-zero?
assert(fused->op_num_weights[op] ==
(1 + (int)(*m->qkv_bias || *m->final_bias)));
GenericTensorAccessorR biases;
@@ -461,9 +462,7 @@ __host__ void
bc,
task->index_point.point_data[0],
my_input_accessor[0],
- // my_weight_accessor[0],
my_output_accessor[0]
- // biases
);
break;
}
@@ -486,9 +485,7 @@ __host__ void
&tree_bc,
task->index_point.point_data[0],
my_input_accessor[0],
- my_weight_accessor[0],
- my_output_accessor[0],
- biases);
+ my_output_accessor[0]);
break;
}
case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
@@ -512,9 +509,7 @@ __host__ void
&beam_bc,
task->index_point.point_data[0],
my_input_accessor[0],
- my_weight_accessor[0],
- my_output_accessor[0],
- biases);
+ my_output_accessor[0]);
break;
}
case OP_LAYERNORM: {
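One detail worth noting in the fused.cu hunks: the surviving assertion still expects the attention op to carry weights — one weight tensor, plus one more if either bias flag is set — which is exactly what the newly added TODO questions now that the wrappers no longer take weight or bias accessors. A tiny self-contained illustration of the expected count (the helper name is hypothetical, not from the commit):

    #include <cassert>

    // Mirrors `op_num_weights[op] == 1 + (int)(*m->qkv_bias || *m->final_bias)`:
    // one weight tensor, plus a single fused bias tensor if any bias is enabled.
    int expected_num_weights(bool qkv_bias, bool final_bias) {
      return 1 + static_cast<int>(qkv_bias || final_bias);
    }

    int main() {
      assert(expected_num_weights(false, false) == 1); // weights only
      assert(expected_num_weights(true, false) == 2);  // plus one bias tensor
      assert(expected_num_weights(true, true) == 2);   // both biases share one tensor
      return 0;
    }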
225 changes: 84 additions & 141 deletions src/ops/spec_inc_multihead_self_attention.cc

Large diffs are not rendered by default.

37 changes: 14 additions & 23 deletions src/ops/spec_inc_multihead_self_attention.cu
@@ -715,14 +715,14 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
stream);
// phase 1: Implement kernel to compute KQV for input tokens
// TODO WARNING: this is commented out only because we are fixing the inc_attn first
- // compute_qkv_kernel(m,
- //                    bc,
- //                    shard_id,
- //                    // input_ptr,
- //                    weight_ptr,
- //                    static_cast<DT *>(m->devQKVProjArray),
- //                    bias_ptr,
- //                    stream);
+ compute_qkv_kernel(m,
+                    bc,
+                    shard_id,
+                    // input_ptr,
+                    // weight_ptr,
+                    static_cast<DT *>(m->devQKVProjArray),
+                    // bias_ptr,
+                    stream);
// phase 2: Update key/val cache
update_kv_cache_kernel<DT>(m, bc, stream);
if (bc->num_generation_tokens > 0) {
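With compute_qkv_kernel re-enabled, the speculative path follows the same shape as the incremental-decoding kernel: project the new tokens' activations into Q/K/V, append the fresh K/V entries to the cache, then run attention for any generation tokens. A stripped-down sketch of that control flow — the stub types and helper names here are placeholders, not the repository's real signatures:

    #include <cuda_runtime.h>

    // Placeholder types standing in for the FlexFlow structures in the diff.
    struct SpecIncMultiHeadSelfAttentionMeta { void *devQKVProjArray; };
    struct BeamSearchBatchConfig { int num_generation_tokens; };

    template <typename DT>
    void compute_qkv_stub(SpecIncMultiHeadSelfAttentionMeta const *,
                          BeamSearchBatchConfig const *, int /*shard_id*/,
                          DT * /*qkv_proj*/, cudaStream_t) { /* QKV projection */ }

    template <typename DT>
    void update_kv_cache_stub(SpecIncMultiHeadSelfAttentionMeta const *,
                              BeamSearchBatchConfig const *, cudaStream_t) {
      /* append new K/V entries to the cache */
    }

    template <typename DT>
    void inference_flow(SpecIncMultiHeadSelfAttentionMeta const *m,
                        BeamSearchBatchConfig const *bc, int shard_id,
                        cudaStream_t stream) {
      // Phase 1: compute Q/K/V for the batch's new tokens (re-enabled above).
      compute_qkv_stub(m, bc, shard_id,
                       static_cast<DT *>(m->devQKVProjArray), stream);
      // Phase 2: fold the new keys/values into the cache.
      update_kv_cache_stub<DT>(m, bc, stream);
      // Phase 3: attend over the cached K/V for generation tokens (elided).
      if (bc->num_generation_tokens > 0) { /* attention kernel launch */ }
    }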
@@ -756,9 +756,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
BeamSearchBatchConfig const *bc,
int shard_id,
GenericTensorAccessorR const &input,
- GenericTensorAccessorR const &weight,
- GenericTensorAccessorW const &output,
- GenericTensorAccessorR const &bias) {
+ GenericTensorAccessorW const &output) {
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
bool use_bias = *m->qkv_bias || *m->final_bias;
@@ -770,35 +768,28 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
cudaEventRecord(t_start, stream);
}
- assert(input.data_type == weight.data_type);
assert(input.data_type == output.data_type);
- if (use_bias) {
-   assert(input.data_type == bias.data_type);
- }
if (input.data_type == DT_HALF) {
- half const *bias_ptr =
-     use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
+ half const *bias_ptr = static_cast<half const *>(nullptr);
Kernels::SpecIncMultiHeadSelfAttention::inference_kernel(
m,
bc,
shard_id,
input.get_half_ptr(),
- weight.get_half_ptr(),
+ static_cast<half const *>(nullptr),
output.get_half_ptr(),
- bias_ptr,
+ static_cast<half const *>(nullptr),
stream);
} else if (input.data_type == DT_FLOAT) {
- float const *bias_ptr =
-     use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
Kernels::SpecIncMultiHeadSelfAttention::inference_kernel(
m,
bc,
shard_id,
input.get_float_ptr(),
- weight.get_float_ptr(),
+ static_cast<float const *>(nullptr),
output.get_float_ptr(),
- bias_ptr,
+ static_cast<float const *>(nullptr),
stream);
} else {
assert(false && "Unsupported data type");
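After these edits, the two dtype branches of the wrapper differ only in their typed accessor calls, with null pointers now passed in the slots where the weight and bias pointers used to go (so use_bias and the half-branch bias_ptr are effectively dead for the moment). A condensed, self-contained sketch of that dispatch shape — simplified, not the repository's actual code:

    #include <cassert>
    #include <cuda_fp16.h>

    enum DataType { DT_HALF, DT_FLOAT };

    // Stand-in for the typed kernel entry point; the launch itself is elided.
    template <typename DT>
    void run_inference(DT const * /*input*/, DT const * /*weight*/,
                       DT * /*output*/, DT const * /*bias*/) {}

    void dispatch(DataType dt, void const *input, void *output) {
      if (dt == DT_HALF) {
        run_inference(static_cast<half const *>(input),
                      static_cast<half const *>(nullptr), // former weight slot
                      static_cast<half *>(output),
                      static_cast<half const *>(nullptr)); // former bias slot
      } else if (dt == DT_FLOAT) {
        run_inference(static_cast<float const *>(input),
                      static_cast<float const *>(nullptr),
                      static_cast<float *>(output),
                      static_cast<float const *>(nullptr));
      } else {
        assert(false && "Unsupported data type");
      }
    }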
(Diffs for the remaining files were not rendered.)
