Commit 2dab7cb: fix

sfc-gh-goliaro committed Oct 22, 2024
1 parent 674eed7

Showing 4 changed files with 11 additions and 4 deletions.
2 changes: 1 addition & 1 deletion src/ops/inc_multihead_self_attention.cc
@@ -105,7 +105,7 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
     bool add_zero_attn,
     DataType data_type,
     Initializer *kernel_initializer,
-    RotaryEmbeddingMeta rotary_embedding_meta,,
+    RotaryEmbeddingMeta rotary_embedding_meta,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
9 changes: 8 additions & 1 deletion src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -329,7 +329,7 @@ void apply_pos_encoding_to_tokens_in_batch(
     DT *output_ptr,
     cudaStream_t stream) {
   // apply rotary embedding if needed
-  if (!*m->apply_rotary_embedding) {
+  if (!m->rotary_embedding_meta->apply_rotary_embedding) {
     return;
   }
   int num_tokens = bc->num_active_tokens();
@@ -338,13 +338,20 @@
   }
   int parallelism = num_tokens * m->local_hidden_size;
   size_t q_array_size = m->qk_dim * num_tokens * m->num_q_heads;
+  bool llama3_rope = (m->rotary_embedding_meta->rope_type == "llama3");
   apply_pos_encoding_to_tokens_in_batch_kernel<<<GET_BLOCKS(parallelism),
                                                  min(CUDA_NUM_THREADS,
                                                      parallelism),
                                                  0,
                                                  stream>>>(
       output_ptr,
       m->token_infos,
+      m->rotary_embedding_meta->rope_theta,
+      llama3_rope,
+      m->rotary_embedding_meta->factor,
+      m->rotary_embedding_meta->low_freq_factor,
+      m->rotary_embedding_meta->high_freq_factor,
+      m->rotary_embedding_meta->original_max_position_embeddings,
       m->qk_dim,
       num_tokens,
       q_array_size,
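For context, the parameters threaded into the kernel here (rope_theta, llama3_rope, factor, low_freq_factor, high_freq_factor, original_max_position_embeddings) match the Llama 3 RoPE frequency-scaling scheme. Below is a minimal host-side sketch of that scaling, assuming the same semantics as the published Llama 3 rope_scaling recipe; the struct fields are taken from this diff's call site, while the default values and the helper function are illustrative, not code from this repository.

// Fields implied by this diff's call site; the defaults shown are Llama 3's
// published values and are an assumption, not taken from this repo.
struct RotaryEmbeddingMeta {
  bool apply_rotary_embedding = true;
  float rope_theta = 500000.0f;
  std::string rope_type = "llama3";
  float factor = 8.0f;
  float low_freq_factor = 1.0f;
  float high_freq_factor = 4.0f;
  int original_max_position_embeddings = 8192;
};

constexpr float kTwoPi = 6.283185307179586f;

// Rescale one rotary inverse frequency (inv_freq = rope_theta^(-2i/qk_dim))
// the way the Llama 3 rope_scaling recipe does: leave high-frequency dims
// untouched, divide low-frequency dims by `factor`, and blend in between.
float llama3_scaled_inv_freq(float inv_freq, RotaryEmbeddingMeta const &rm) {
  float const wavelen = kTwoPi / inv_freq;
  float const low_freq_wavelen =
      rm.original_max_position_embeddings / rm.low_freq_factor;
  float const high_freq_wavelen =
      rm.original_max_position_embeddings / rm.high_freq_factor;
  if (wavelen < high_freq_wavelen) {
    return inv_freq; // high-frequency band: unchanged
  }
  if (wavelen > low_freq_wavelen) {
    return inv_freq / rm.factor; // low-frequency band: fully rescaled
  }
  // Mid band: smooth linear interpolation between the two regimes.
  float const smooth =
      (rm.original_max_position_embeddings / wavelen - rm.low_freq_factor) /
      (rm.high_freq_factor - rm.low_freq_factor);
  return (1.0f - smooth) * inv_freq / rm.factor + smooth * inv_freq;
}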
2 changes: 1 addition & 1 deletion src/ops/spec_inc_multihead_self_attention.cu
@@ -314,7 +314,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
     GenericTensorAccessorR const &bias) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  // bool use_bias = *m->qkv_bias || *m->final_bias;
+  bool use_bias = *m->qkv_bias || *m->final_bias;

   cudaEvent_t t_start, t_end;
   if (m->profiling) {
2 changes: 1 addition & 1 deletion src/ops/tree_inc_multihead_self_attention.cu
@@ -522,7 +522,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
     GenericTensorAccessorR const &bias) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  // bool use_bias = *m->qkv_bias || *m->final_bias;
+  bool use_bias = *m->qkv_bias || *m->final_bias;
   // int device;
   // checkCUDA(cudaGetDevice(&device));
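The last two hunks re-enable the same line in both inference wrappers: use_bias records whether the layer carries a QKV bias or an output-projection bias, so downstream bias handling can be gated on it. A minimal sketch of that gating follows, assuming qkv_bias and final_bias are per-layer boolean flags stored behind pointers in the op metadata (as the dereferences suggest); the surrounding dispatch code is hypothetical, not this repository's implementation.

#include <cstdio>

// Hypothetical metadata mirroring the two flags dereferenced above; in the
// real op these live on the attention meta object, not in this struct.
struct AttentionMeta {
  bool *qkv_bias;   // true if the fused QKV projection carries a bias
  bool *final_bias; // true if the output projection carries a bias
};

// Only touch the bias tensor when at least one bias is actually present.
void launch_attention(AttentionMeta const *m, float const *bias_ptr) {
  bool const use_bias = *m->qkv_bias || *m->final_bias;
  if (use_bias && bias_ptr == nullptr) {
    fprintf(stderr, "layer declares a bias but no bias tensor was bound\n");
    return;
  }
  // ... dispatch the attention kernels, forwarding bias_ptr only when
  //     use_bias is true ...
}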
