init page manager at request manager init and clean the format
Bob-Chen222 committed Nov 4, 2024
1 parent 832f5cb commit 4a7162f
Showing 22 changed files with 514 additions and 343 deletions.
6 changes: 3 additions & 3 deletions include/flexflow/batch_config.h
@@ -114,9 +114,9 @@ class BatchConfig {
int first_token_index_in_request = -1;
int first_token_offset_in_batch = -1;
int num_tokens_in_batch = 0;
int padding = 0; // Padding for memory pointer alignment
-int num_kv_pages; //number of kv pages used
-int kv_last_page_len; //last page length of kv
+int num_kv_pages; // number of kv pages used
+int kv_last_page_len; // last page length of kv
RequestGuid request_guid;
};

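For reference, the two new fields follow the usual paged-KV arithmetic. A minimal sketch of how they could be derived from a request's token count, assuming kPagesize is the per-page token capacity used by the attention kernels (compute_kv_page_info and the constant's value are illustrative, not the actual FlexFlow code):

struct KvPageInfo {
  int num_kv_pages;     // ceil(num_tokens / kPagesize)
  int kv_last_page_len; // tokens resident in the final, possibly partial page
};

constexpr int kPagesize = 64; // assumed value for illustration only

KvPageInfo compute_kv_page_info(int num_tokens) {
  KvPageInfo info;
  info.num_kv_pages = (num_tokens + kPagesize - 1) / kPagesize; // ceiling division
  info.kv_last_page_len = (num_tokens % kPagesize == 0 && num_tokens > 0)
                              ? kPagesize
                              : num_tokens % kPagesize;
  return info;
}

For example, with kPagesize = 64, a request holding 70 tokens occupies num_kv_pages = 2 pages with kv_last_page_len = 6.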
6 changes: 6 additions & 0 deletions include/flexflow/model.h
@@ -1081,6 +1081,10 @@ class FFModel {
CompMode comp_mode = COMP_MODE_TRAINING);
void compile_inference();
void set_transformer_layer_id(int id);
+void set_num_transformer_layers(int num_layers);
+void set_num_kv_heads(int num_heads);
+void set_qkv_dim(int qkv_dim);
+void set_size_dt(int size_dt);
void set_position_offset(int offset);
void graph_optimize(size_t budget,
bool only_data_parallel,
@@ -1142,6 +1146,8 @@ class FFModel {
size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid;
size_t current_transformer_layer_id;
// positional embedding start offset
+int num_transformer_layers;
+int num_kv_heads, qkv_dim, size_dt;
int position_offset;
FFConfig config;
FFIterationConfig iter_config;
25 changes: 15 additions & 10 deletions include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -15,20 +15,25 @@ namespace Kernels {
namespace IncMultiHeadAttention {

// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-__device__ __forceinline__ size_t get_k_entry_offset_verify(int const token_idx,
-                                                            int const page_idx,
-                                                            int const num_heads,
-                                                            int const head_dim) {
-  size_t index = ((page_idx) * kPagesize * 2 + (token_idx % kPagesize)) * head_dim * num_heads;
+__device__ __forceinline__ size_t
+    get_k_entry_offset_verify(int const token_idx,
+                              int const page_idx,
+                              int const num_heads,
+                              int const head_dim) {
+  size_t index = ((page_idx)*kPagesize * 2 + (token_idx % kPagesize)) *
+                 head_dim * num_heads;
return index;
}

// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-__device__ __forceinline__ size_t get_v_entry_offset_verify(int const token_idx,
-                                                            int const page_idx,
-                                                            int const num_heads,
-                                                            int const head_dim) {
-  size_t index = ((page_idx) * kPagesize * 2 + kPagesize + (token_idx % kPagesize)) * head_dim * num_heads;
+__device__ __forceinline__ size_t
+    get_v_entry_offset_verify(int const token_idx,
+                              int const page_idx,
+                              int const num_heads,
+                              int const head_dim) {
+  size_t index =
+      ((page_idx)*kPagesize * 2 + kPagesize + (token_idx % kPagesize)) *
+      head_dim * num_heads;
return index;
}

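The layout comment is worth unpacking: flattening [num_pages, 2, page_size, num_kv_heads, head_dim] means page p's K slots start at p * 2 * kPagesize entries and its V slots sit one page-half (kPagesize slots) later, where one "entry" is num_kv_heads * head_dim elements. A host-side mirror of the two device helpers, runnable as a sanity check (the kPagesize value is assumed for illustration):

#include <cassert>
#include <cstddef>

constexpr int kPagesize = 64; // assumed; stands in for the kernels' constant

size_t k_entry_offset(int token_idx, int page_idx, int num_heads, int head_dim) {
  // K half of page page_idx, slot (token_idx % kPagesize), head 0, element 0
  return (size_t)(page_idx * kPagesize * 2 + (token_idx % kPagesize)) *
         head_dim * num_heads;
}

size_t v_entry_offset(int token_idx, int page_idx, int num_heads, int head_dim) {
  // V half sits kPagesize slots after the K half of the same page
  return (size_t)(page_idx * kPagesize * 2 + kPagesize + (token_idx % kPagesize)) *
         head_dim * num_heads;
}

int main() {
  // For any token, its V entry is exactly kPagesize entries past its K entry.
  size_t k = k_entry_offset(70, 1, /*num_heads=*/8, /*head_dim=*/64);
  size_t v = v_entry_offset(70, 1, 8, 64);
  assert(v - k == (size_t)kPagesize * 8 * 64);
  return 0;
}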
177 changes: 101 additions & 76 deletions include/flexflow/page_manager.h
@@ -1,14 +1,14 @@
#pragma once

#include "flexflow/batch_config.h"
#include "flexflow/config.h"
#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/config.h"
#include "flexflow/utils/file_loader.h"
#include <deque>
#include <future>
#include <mutex>
#include <tokenizers_cpp.h>
#include <deque>

namespace FlexFlow {

@@ -20,118 +20,143 @@ using TokenId = BatchConfig::TokenId;
*/
class LogicalTokenBlock {
public:
using TokenId = BatchConfig::TokenId;

// Constructor
LogicalTokenBlock(int block_number, uint32_t block_size);

// Method to check if the block is empty
bool is_empty() const;

// Method to check if the block is full
bool is_full() const;

// Method to get the number of empty slots
int get_num_empty_slots() const;

// Method to get the number of allocated slots
int get_num_alloc_slots() const;

-// Used to clean up the spec tokens in a block since these spec tokens may not be committed after use
+// Used to clean up the spec tokens in a block since these spec tokens may not
+// be committed after use
void reset_num_spec_tokens();

// Method to append tokens
-void append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed);
+void append_tokens(std::vector<TokenId> const &token_ids_to_append,
+                   bool committed);

-int get_num_tokens() const { return num_tokens; }
-int get_num_commit_tokens() const { return num_commit_tokens; }
-int get_num_spec_tokens() const { return num_spec_tokens; }
+int get_num_tokens() const {
+  return num_tokens;
+}
+int get_num_commit_tokens() const {
+  return num_commit_tokens;
+}
+int get_num_spec_tokens() const {
+  return num_spec_tokens;
+}

std::vector<TokenId> get_token_ids() const;

private:
-int block_number; // the index of the logical token block
-int block_size; // the size of the block
-int num_tokens; // the number of tokens currently stored in the block
-int num_commit_tokens; // the number of tokens inside this block that are already committed
-int num_spec_tokens; // the number of tokens inside this block that are speculative tokens, which is stored temporarily
-std::vector<TokenId> token_ids; //store the token ids in a order that corresponds to the inference sequence
+int block_number;      // the index of the logical token block
+int block_size;        // the size of the block
+int num_tokens;        // the number of tokens currently stored in the block
+int num_commit_tokens; // the number of tokens inside this block that are
+                       // already committed
+int num_spec_tokens;   // the number of tokens inside this block that are
+                       // speculative tokens, which is stored temporarily
+std::vector<TokenId> token_ids; // store the token ids in a order that
+                                // corresponds to the inference sequence
};

/**
* @class PhysicalTokenBlock
-* @brief A class to represent a physical block of tokens similar to physical memory address
-* It keeps track of the location of the tokens stored on GPU memory
+* @brief A class to represent a physical block of tokens similar to physical
+* memory address It keeps track of the location of the tokens stored on GPU
+* memory
*/
class PhysicalTokenBlock {
public:
// Constructor
PhysicalTokenBlock(int block_number, int block_size);

// Method to get the block number
-int get_block_number() const { return block_number; }
-void incr_ref_count() { ref_count++; }
-void decr_ref_count() { ref_count--; }
+int get_block_number() const {
+  return block_number;
+}
+void incr_ref_count() {
+  ref_count++;
+}
+void decr_ref_count() {
+  ref_count--;
+}
int ref_count; // reference count, TODO: move to private

private:
int block_number; // the index of the physical token block
int block_size; // the size of the block
};

/**
* @class BlockAllocator
-* @brief A Block Manager that is reponsible for maintaining a pool of free blocks
+* @brief A Block Manager that is reponsible for maintaining a pool of free
+* blocks
*/
class BlockAllocator {
public:
// Constructor
BlockAllocator(int block_size, int num_total_blocks);

// Allocate a block
PhysicalTokenBlock allocate();

// Free a block
-void free(PhysicalTokenBlock& block);
+void free(PhysicalTokenBlock &block);

// Get the number of free blocks
int get_num_free_blocks() const;

private:
int block_size;
int num_total_blocks;
std::deque<PhysicalTokenBlock> free_blocks;
};

/*
* @class PageManager
* @brief A wrapper class that manages the kv cache allocation status
-* notice that all the layers of model will share the same page manager because the position of kv cache will be the same
+* notice that all the layers of model will share the same page manager because
+* the position of kv cache will be the same
*/
class PageManager {
public:
-// Get the singleton instance of the PageManager as it will be shared in multiple places
+// Get the singleton instance of the PageManager as it will be shared in
+// multiple places
static PageManager *get_page_manager();
+static PageManager *get_page_manager(FFModel *ff, int kv_cache_size);
using BlockTable = std::vector<PhysicalTokenBlock>;
using RequestGuid = BatchConfig::RequestGuid;
PageManager(int block_size, int num_total_blocks);

-int allocate_one_block(const RequestGuid& request_guid);
-void free_request(const RequestGuid& request_guid);
-//used for the case that we want to free the last num_blocks that stores spec tokens(which are the tokens are not yet committed)
-void free_multiple_blocks(const RequestGuid& request_guid, int num_blocks);
-std::vector<int> get_block_table_indices(const RequestGuid& request_guid) const;
+int allocate_one_block(RequestGuid const &request_guid);
+void free_request(RequestGuid const &request_guid);
+// used for the case that we want to free the last num_blocks that stores spec
+// tokens(which are the tokens are not yet committed)
+void free_multiple_blocks(RequestGuid const &request_guid, int num_blocks);
+std::vector<int>
+    get_block_table_indices(RequestGuid const &request_guid) const;

-void free_block_table(BlockTable& block_table);
+void free_block_table(BlockTable &block_table);

private:
+int num_transformer_layers;
+int total_kv_cache_size;
int block_size; // the size of the block
int num_total_blocks; // the total number of blocks
BlockAllocator block_allocator;
std::unordered_map<RequestGuid, BlockTable> block_tables;

int get_num_total_free_blocks() const;
-int get_num_allocated_blocks(const RequestGuid& request_guid) const;
+int get_num_allocated_blocks(RequestGuid const &request_guid) const;
};

}; // namespace FlexFlow
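Taken together with the request_manager.h change below, the intended flow appears to be: the request manager constructs the singleton once at init time through the new get_page_manager(ff, kv_cache_size) overload, and every later caller retrieves the same instance with the zero-argument overload. A sketch under that assumption (init_paging and serve_one_request are illustrative names, not FlexFlow API):

#include "flexflow/page_manager.h"
#include <vector>

using namespace FlexFlow;

void init_paging(FFModel *ff, int kv_cache_size) {
  // First call: sizes the block pool from the model's layer/head/dtype
  // settings plus the kv cache budget.
  PageManager::get_page_manager(ff, kv_cache_size);
}

void serve_one_request(BatchConfig::RequestGuid guid) {
  PageManager *pm = PageManager::get_page_manager(); // same shared instance
  pm->allocate_one_block(guid); // grow this request's block table by one page
  // physical page indices handed to the attention kernels
  std::vector<int> pages = pm->get_block_table_indices(guid);
  pm->free_request(guid); // release every page once the request finishes
}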
8 changes: 4 additions & 4 deletions include/flexflow/request_manager.h
@@ -18,8 +18,8 @@
#include "flexflow/batch_config.h"
#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/utils/file_loader.h"
#include "flexflow/page_manager.h"
#include "flexflow/utils/file_loader.h"
#include <condition_variable>
#include <future>
#include <mutex>
@@ -149,7 +149,8 @@ struct Request {
Status status = PENDING;
std::vector<BatchConfig::TokenId> tokens;

-//page attention, page_last_committed should be -1 because there are no blocks at the beginning
+// page attention, page_last_committed should be -1 because there are no
+// blocks at the beginning
int page_last_committed = -1;
std::vector<LogicalTokenBlock> blocks;

@@ -539,8 +540,7 @@ class RequestManager {
int get_len_last_block(Request &request) const;
int get_idx_last_logical_token(Request &request) const;
int idx_logical_to_physical(Request &request, int idx_logical);
-void _append_block_to_request(
-    Request &request, bool is_commit);
+void _append_block_to_request(Request &request, bool is_commit);
int append_token_to_block(Request &request, TokenId token, bool is_commit);
void reset_block_table(Request &request);
void print_num_tokens(Request &request);
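The helper names above (get_idx_last_logical_token, idx_logical_to_physical, append_token_to_block) outline a two-level addressing scheme: a token's logical index splits into a block number and an in-block slot, and the request's block table maps the block number to a physical page. A standalone illustration of that translation, with an assumed block size (the real implementation lives in request_manager.cc and may differ):

#include <vector>

constexpr int kBlockSize = 64; // assumed logical block size for illustration

int logical_to_physical_slot(std::vector<int> const &block_table,
                             int idx_logical) {
  int block = idx_logical / kBlockSize; // which logical block
  int slot = idx_logical % kBlockSize;  // offset inside that block
  // block_table[block] is the physical page backing this logical block
  return block_table[block] * kBlockSize + slot;
}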
2 changes: 1 addition & 1 deletion inference/incr_decoding/incr_decoding.cc
@@ -209,7 +209,7 @@ void FlexFlow::top_level_task(Task const *task,
int max_tokens_per_prefilling_batch = -1;
int max_sequence_length = 256;
int max_output_length = 512;
-int max_kv_cache_size = -1; //if -1, then use the default value
+int max_kv_cache_size = -1; // if -1, then use the default value
RequestManager::DecodingMode decoding_mode =
RequestManager::INCREMENTAL_DECODING;
int sampling_seed = 0;
5 changes: 5 additions & 0 deletions inference/models/falcon.cc
@@ -63,6 +63,11 @@ void FALCON::create_falcon_model(FFModel &ff,
Tensor mha = nullptr, mlp_output = nullptr;
Tensor res_ln_outputs[2] = {nullptr, nullptr};

+ff.set_num_transformer_layers(falcon_config.n_layer);
+ff.set_num_kv_heads(falcon_config.n_head_kv);
+ff.set_qkv_dim(falcon_config.hidden_size / falcon_config.n_head * 2);
+ff.set_size_dt(data_type_size(input->data_type));
+
for (int i = 0; i < falcon_config.n_layer; i++) {
// set transformer layer id
ff.set_transformer_layer_id(i);
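Note the factor of 2 in set_qkv_dim: hidden_size / n_head is the per-head dimension, and doubling it packs one K and one V vector per kv head, which is what the page manager needs to budget bytes per cached token. A back-of-envelope check with Falcon-7B-like numbers (the per-token formula is an inference from the setters above, not code from this commit):

// hidden_size = 4544, n_head = 71, n_head_kv = 1, fp16, 32 layers
constexpr int head_dim = 4544 / 71;   // 64
constexpr int qkv_dim = head_dim * 2; // 128: one K and one V vector per head
constexpr int size_dt = 2;            // bytes per fp16 element
constexpr int n_layer = 32, n_head_kv = 1;
constexpr int bytes_per_token =
    n_layer * n_head_kv * qkv_dim * size_dt; // 8192 bytes across all layers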
1 change: 1 addition & 0 deletions inference/models/falcon.h
@@ -16,6 +16,7 @@

// #include "file_loader.h"
#include "flexflow/batch_config.h"
#include "flexflow/ffconst_utils.h"
#include "flexflow/inference.h"
#include "flexflow/request_manager.h"
#include <nlohmann/json.hpp>
5 changes: 5 additions & 0 deletions inference/models/llama.cc
@@ -64,6 +64,11 @@ void LLAMA::create_llama_model(FFModel &ff,

Tensor w2 = nullptr;

+ff.set_num_transformer_layers(llama_config.num_hidden_layers);
+ff.set_num_kv_heads(llama_config.num_key_value_heads);
+ff.set_qkv_dim(llama_config.hidden_size / llama_config.num_attention_heads *
+               2);
+ff.set_size_dt(data_type_size(input->data_type));
for (int i = 0; i < llama_config.num_hidden_layers; i++) {
// set transformer layer id
ff.set_transformer_layer_id(i);
1 change: 1 addition & 0 deletions inference/models/llama.h
@@ -16,6 +16,7 @@

// #include "file_loader.h"
#include "flexflow/batch_config.h"
#include "flexflow/ffconst_utils.h"
#include "flexflow/inference.h"
#include "flexflow/request_manager.h"
#include <nlohmann/json.hpp>
4 changes: 4 additions & 0 deletions inference/models/mpt.cc
@@ -64,6 +64,10 @@ void MPT::create_mpt_model(FFModel &ff,
Tensor intermediate_output = nullptr, layernorm_output = nullptr;
Tensor res_ln_outputs[2] = {nullptr, nullptr};

+ff.set_num_transformer_layers(mpt_config.n_layers);
+ff.set_num_kv_heads(mpt_config.n_heads);
+ff.set_qkv_dim(mpt_config.hidden_size / mpt_config.n_heads * 2);
+ff.set_size_dt(data_type_size(input->data_type));
for (int i = 0; i < mpt_config.n_layers; i++) {
ff.set_transformer_layer_id(i);

