init page manager at request manager init and clean the format
Bob-Chen222 committed Nov 4, 2024
1 parent 832f5cb commit 4a7162f
Showing 22 changed files with 514 additions and 343 deletions.
6 changes: 3 additions & 3 deletions include/flexflow/batch_config.h
@@ -114,9 +114,9 @@ class BatchConfig {
int first_token_index_in_request = -1;
int first_token_offset_in_batch = -1;
int num_tokens_in_batch = 0;
int padding = 0; // Padding for memory pointer alignment
-int num_kv_pages; //number of kv pages used
-int kv_last_page_len; //last page length of kv
+int num_kv_pages; // number of kv pages used
+int kv_last_page_len; // last page length of kv
RequestGuid request_guid;
};

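For reference, the two new fields follow the usual paged-KV arithmetic. A minimal sketch of how they could be derived from a request's token count, assuming kPagesize is the per-page token capacity used by the attention kernels (compute_kv_page_info and the constant's value are illustrative, not the actual FlexFlow code):

struct KvPageInfo {
  int num_kv_pages;     // ceil(num_tokens / kPagesize)
  int kv_last_page_len; // tokens resident in the final, possibly partial page
};

constexpr int kPagesize = 64; // assumed value for illustration only

KvPageInfo compute_kv_page_info(int num_tokens) {
  KvPageInfo info;
  info.num_kv_pages = (num_tokens + kPagesize - 1) / kPagesize; // ceiling division
  info.kv_last_page_len = (num_tokens % kPagesize == 0 && num_tokens > 0)
                              ? kPagesize
                              : num_tokens % kPagesize;
  return info;
}

For example, with kPagesize = 64, a request holding 70 tokens occupies num_kv_pages = 2 pages with kv_last_page_len = 6.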
6 changes: 6 additions & 0 deletions include/flexflow/model.h
@@ -1081,6 +1081,10 @@ class FFModel {
CompMode comp_mode = COMP_MODE_TRAINING);
void compile_inference();
void set_transformer_layer_id(int id);
+void set_num_transformer_layers(int num_layers);
+void set_num_kv_heads(int num_heads);
+void set_qkv_dim(int qkv_dim);
+void set_size_dt(int size_dt);
void set_position_offset(int offset);
void graph_optimize(size_t budget,
bool only_data_parallel,
@@ -1142,6 +1146,8 @@ class FFModel {
size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid;
size_t current_transformer_layer_id;
// positional embedding start offset
+int num_transformer_layers;
+int num_kv_heads, qkv_dim, size_dt;
int position_offset;
FFConfig config;
FFIterationConfig iter_config;
25 changes: 15 additions & 10 deletions include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -15,20 +15,25 @@ namespace Kernels {
namespace IncMultiHeadAttention {

// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-__device__ __forceinline__ size_t get_k_entry_offset_verify(int const token_idx,
-                                                            int const page_idx,
-                                                            int const num_heads,
-                                                            int const head_dim) {
-  size_t index = ((page_idx) * kPagesize * 2 + (token_idx % kPagesize)) * head_dim * num_heads;
+__device__ __forceinline__ size_t
+    get_k_entry_offset_verify(int const token_idx,
+                              int const page_idx,
+                              int const num_heads,
+                              int const head_dim) {
+  size_t index = ((page_idx)*kPagesize * 2 + (token_idx % kPagesize)) *
+                 head_dim * num_heads;
return index;
}

// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-__device__ __forceinline__ size_t get_v_entry_offset_verify(int const token_idx,
-                                                            int const page_idx,
-                                                            int const num_heads,
-                                                            int const head_dim) {
-  size_t index = ((page_idx) * kPagesize * 2 + kPagesize + (token_idx % kPagesize)) * head_dim * num_heads;
+__device__ __forceinline__ size_t
+    get_v_entry_offset_verify(int const token_idx,
+                              int const page_idx,
+                              int const num_heads,
+                              int const head_dim) {
+  size_t index =
+      ((page_idx)*kPagesize * 2 + kPagesize + (token_idx % kPagesize)) *
+      head_dim * num_heads;
return index;
}

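The layout comment is worth unpacking: flattening [num_pages, 2, page_size, num_kv_heads, head_dim] means page p's K slots start at p * 2 * kPagesize entries and its V slots sit one page-half (kPagesize slots) later, where one "entry" is num_kv_heads * head_dim elements. A host-side mirror of the two device helpers, runnable as a sanity check (the kPagesize value is assumed for illustration):

#include <cassert>
#include <cstddef>

constexpr int kPagesize = 64; // assumed; stands in for the kernels' constant

size_t k_entry_offset(int token_idx, int page_idx, int num_heads, int head_dim) {
  // K half of page page_idx, slot (token_idx % kPagesize), head 0, element 0
  return (size_t)(page_idx * kPagesize * 2 + (token_idx % kPagesize)) *
         head_dim * num_heads;
}

size_t v_entry_offset(int token_idx, int page_idx, int num_heads, int head_dim) {
  // V half sits kPagesize slots after the K half of the same page
  return (size_t)(page_idx * kPagesize * 2 + kPagesize + (token_idx % kPagesize)) *
         head_dim * num_heads;
}

int main() {
  // For any token, its V entry is exactly kPagesize entries past its K entry.
  size_t k = k_entry_offset(70, 1, /*num_heads=*/8, /*head_dim=*/64);
  size_t v = v_entry_offset(70, 1, 8, 64);
  assert(v - k == (size_t)kPagesize * 8 * 64);
  return 0;
}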
177 changes: 101 additions & 76 deletions include/flexflow/page_manager.h
@@ -1,14 +1,14 @@
#pragma once

#include "flexflow/batch_config.h"
#include "flexflow/config.h"
#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/config.h"
#include "flexflow/utils/file_loader.h"
#include <deque>
#include <future>
#include <mutex>
#include <tokenizers_cpp.h>
#include <deque>

namespace FlexFlow {

@@ -20,118 +20,143 @@ using TokenId = BatchConfig::TokenId;
*/
class LogicalTokenBlock {
public:
using TokenId = BatchConfig::TokenId;

// Constructor
LogicalTokenBlock(int block_number, uint32_t block_size);

// Method to check if the block is empty
bool is_empty() const;

// Method to check if the block is full
bool is_full() const;

// Method to get the number of empty slots
int get_num_empty_slots() const;

// Method to get the number of allocated slots
int get_num_alloc_slots() const;

-// Used to clean up the spec tokens in a block since these spec tokens may not be committed after use
+// Used to clean up the spec tokens in a block since these spec tokens may not
+// be committed after use
void reset_num_spec_tokens();

// Method to append tokens
-void append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed);
+void append_tokens(std::vector<TokenId> const &token_ids_to_append,
+                   bool committed);

-int get_num_tokens() const { return num_tokens; }
-int get_num_commit_tokens() const { return num_commit_tokens; }
-int get_num_spec_tokens() const { return num_spec_tokens; }
+int get_num_tokens() const {
+  return num_tokens;
+}
+int get_num_commit_tokens() const {
+  return num_commit_tokens;
+}
+int get_num_spec_tokens() const {
+  return num_spec_tokens;
+}

std::vector<TokenId> get_token_ids() const;

private:
-int block_number; // the index of the logical token block
-int block_size; // the size of the block
-int num_tokens; // the number of tokens currently stored in the block
-int num_commit_tokens; // the number of tokens inside this block that are already committed
-int num_spec_tokens; // the number of tokens inside this block that are speculative tokens, which is stored temporarily
-std::vector<TokenId> token_ids; //store the token ids in a order that corresponds to the inference sequence
+int block_number;      // the index of the logical token block
+int block_size;        // the size of the block
+int num_tokens;        // the number of tokens currently stored in the block
+int num_commit_tokens; // the number of tokens inside this block that are
+                       // already committed
+int num_spec_tokens;   // the number of tokens inside this block that are
+                       // speculative tokens, which is stored temporarily
+std::vector<TokenId> token_ids; // store the token ids in a order that
+                                // corresponds to the inference sequence
};

/**
* @class PhysicalTokenBlock
-* @brief A class to represent a physical block of tokens similar to physical memory address
-* It keeps track of the location of the tokens stored on GPU memory
+* @brief A class to represent a physical block of tokens similar to physical
+* memory address It keeps track of the location of the tokens stored on GPU
+* memory
*/
class PhysicalTokenBlock {
public:
// Constructor
PhysicalTokenBlock(int block_number, int block_size);

// Method to get the block number
-int get_block_number() const { return block_number; }
-void incr_ref_count() { ref_count++; }
-void decr_ref_count() { ref_count--; }
+int get_block_number() const {
+  return block_number;
+}
+void incr_ref_count() {
+  ref_count++;
+}
+void decr_ref_count() {
+  ref_count--;
+}
int ref_count; // reference count, TODO: move to private

private:
int block_number; // the index of the physical token block
int block_size; // the size of the block
};

/**
* @class BlockAllocator
-* @brief A Block Manager that is reponsible for maintaining a pool of free blocks
+* @brief A Block Manager that is reponsible for maintaining a pool of free
+* blocks
*/
class BlockAllocator {
public:
// Constructor
BlockAllocator(int block_size, int num_total_blocks);

// Allocate a block
PhysicalTokenBlock allocate();

// Free a block
-void free(PhysicalTokenBlock& block);
+void free(PhysicalTokenBlock &block);

// Get the number of free blocks
int get_num_free_blocks() const;

private:
int block_size;
int num_total_blocks;
std::deque<PhysicalTokenBlock> free_blocks;
};

/*
* @class PageManager
* @brief A wrapper class that manages the kv cache allocation status
-* notice that all the layers of model will share the same page manager because the position of kv cache will be the same
+* notice that all the layers of model will share the same page manager because
+* the position of kv cache will be the same
*/
class PageManager {
public:
-// Get the singleton instance of the PageManager as it will be shared in multiple places
+// Get the singleton instance of the PageManager as it will be shared in
+// multiple places
static PageManager *get_page_manager();
+static PageManager *get_page_manager(FFModel *ff, int kv_cache_size);
using BlockTable = std::vector<PhysicalTokenBlock>;
using RequestGuid = BatchConfig::RequestGuid;
PageManager(int block_size, int num_total_blocks);

-int allocate_one_block(const RequestGuid& request_guid);
-void free_request(const RequestGuid& request_guid);
-//used for the case that we want to free the last num_blocks that stores spec tokens(which are the tokens are not yet committed)
-void free_multiple_blocks(const RequestGuid& request_guid, int num_blocks);
-std::vector<int> get_block_table_indices(const RequestGuid& request_guid) const;
+int allocate_one_block(RequestGuid const &request_guid);
+void free_request(RequestGuid const &request_guid);
+// used for the case that we want to free the last num_blocks that stores spec
+// tokens(which are the tokens are not yet committed)
+void free_multiple_blocks(RequestGuid const &request_guid, int num_blocks);
+std::vector<int>
+    get_block_table_indices(RequestGuid const &request_guid) const;

-void free_block_table(BlockTable& block_table);
+void free_block_table(BlockTable &block_table);

private:
+int num_transformer_layers;
+int total_kv_cache_size;
int block_size; // the size of the block
int num_total_blocks; // the total number of blocks
BlockAllocator block_allocator;
std::unordered_map<RequestGuid, BlockTable> block_tables;

int get_num_total_free_blocks() const;
-int get_num_allocated_blocks(const RequestGuid& request_guid) const;
+int get_num_allocated_blocks(RequestGuid const &request_guid) const;
};

}; // namespace FlexFlow
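Taken together with the request_manager.h change below, the intended flow appears to be: the request manager constructs the singleton once at init time through the new get_page_manager(ff, kv_cache_size) overload, and every later caller retrieves the same instance with the zero-argument overload. A sketch under that assumption (init_paging and serve_one_request are illustrative names, not FlexFlow API):

#include "flexflow/page_manager.h"
#include <vector>

using namespace FlexFlow;

void init_paging(FFModel *ff, int kv_cache_size) {
  // First call: sizes the block pool from the model's layer/head/dtype
  // settings plus the kv cache budget.
  PageManager::get_page_manager(ff, kv_cache_size);
}

void serve_one_request(BatchConfig::RequestGuid guid) {
  PageManager *pm = PageManager::get_page_manager(); // same shared instance
  pm->allocate_one_block(guid); // grow this request's block table by one page
  // physical page indices handed to the attention kernels
  std::vector<int> pages = pm->get_block_table_indices(guid);
  pm->free_request(guid); // release every page once the request finishes
}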
8 changes: 4 additions & 4 deletions include/flexflow/request_manager.h
@@ -18,8 +18,8 @@
#include "flexflow/batch_config.h"
#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/utils/file_loader.h"
#include "flexflow/page_manager.h"
#include "flexflow/utils/file_loader.h"
#include <condition_variable>
#include <future>
#include <mutex>
@@ -149,7 +149,8 @@ struct Request {
Status status = PENDING;
std::vector<BatchConfig::TokenId> tokens;

-//page attention, page_last_committed should be -1 because there are no blocks at the beginning
+// page attention, page_last_committed should be -1 because there are no
+// blocks at the beginning
int page_last_committed = -1;
std::vector<LogicalTokenBlock> blocks;

@@ -539,8 +540,7 @@ class RequestManager {
int get_len_last_block(Request &request) const;
int get_idx_last_logical_token(Request &request) const;
int idx_logical_to_physical(Request &request, int idx_logical);
-void _append_block_to_request(
-    Request &request, bool is_commit);
+void _append_block_to_request(Request &request, bool is_commit);
int append_token_to_block(Request &request, TokenId token, bool is_commit);
void reset_block_table(Request &request);
void print_num_tokens(Request &request);
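The helper names above (get_idx_last_logical_token, idx_logical_to_physical, append_token_to_block) outline a two-level addressing scheme: a token's logical index splits into a block number and an in-block slot, and the request's block table maps the block number to a physical page. A standalone illustration of that translation, with an assumed block size (the real implementation lives in request_manager.cc and may differ):

#include <vector>

constexpr int kBlockSize = 64; // assumed logical block size for illustration

int logical_to_physical_slot(std::vector<int> const &block_table,
                             int idx_logical) {
  int block = idx_logical / kBlockSize; // which logical block
  int slot = idx_logical % kBlockSize;  // offset inside that block
  // block_table[block] is the physical page backing this logical block
  return block_table[block] * kBlockSize + slot;
}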
2 changes: 1 addition & 1 deletion inference/incr_decoding/incr_decoding.cc
@@ -209,7 +209,7 @@ void FlexFlow::top_level_task(Task const *task,
int max_tokens_per_prefilling_batch = -1;
int max_sequence_length = 256;
int max_output_length = 512;
-int max_kv_cache_size = -1; //if -1, then use the default value
+int max_kv_cache_size = -1; // if -1, then use the default value
RequestManager::DecodingMode decoding_mode =
RequestManager::INCREMENTAL_DECODING;
int sampling_seed = 0;
5 changes: 5 additions & 0 deletions inference/models/falcon.cc
@@ -63,6 +63,11 @@ void FALCON::create_falcon_model(FFModel &ff,
Tensor mha = nullptr, mlp_output = nullptr;
Tensor res_ln_outputs[2] = {nullptr, nullptr};

+ff.set_num_transformer_layers(falcon_config.n_layer);
+ff.set_num_kv_heads(falcon_config.n_head_kv);
+ff.set_qkv_dim(falcon_config.hidden_size / falcon_config.n_head * 2);
+ff.set_size_dt(data_type_size(input->data_type));
+
for (int i = 0; i < falcon_config.n_layer; i++) {
// set transformer layer id
ff.set_transformer_layer_id(i);
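Note the factor of 2 in set_qkv_dim: hidden_size / n_head is the per-head dimension, and doubling it packs one K and one V vector per kv head, which is what the page manager needs to budget bytes per cached token. A back-of-envelope check with Falcon-7B-like numbers (the per-token formula is an inference from the setters above, not code from this commit):

// hidden_size = 4544, n_head = 71, n_head_kv = 1, fp16, 32 layers
constexpr int head_dim = 4544 / 71;   // 64
constexpr int qkv_dim = head_dim * 2; // 128: one K and one V vector per head
constexpr int size_dt = 2;            // bytes per fp16 element
constexpr int n_layer = 32, n_head_kv = 1;
constexpr int bytes_per_token =
    n_layer * n_head_kv * qkv_dim * size_dt; // 8192 bytes across all layers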
1 change: 1 addition & 0 deletions inference/models/falcon.h
@@ -16,6 +16,7 @@

// #include "file_loader.h"
#include "flexflow/batch_config.h"
#include "flexflow/ffconst_utils.h"
#include "flexflow/inference.h"
#include "flexflow/request_manager.h"
#include <nlohmann/json.hpp>
5 changes: 5 additions & 0 deletions inference/models/llama.cc
@@ -64,6 +64,11 @@ void LLAMA::create_llama_model(FFModel &ff,

Tensor w2 = nullptr;

+ff.set_num_transformer_layers(llama_config.num_hidden_layers);
+ff.set_num_kv_heads(llama_config.num_key_value_heads);
+ff.set_qkv_dim(llama_config.hidden_size / llama_config.num_attention_heads *
+               2);
+ff.set_size_dt(data_type_size(input->data_type));
for (int i = 0; i < llama_config.num_hidden_layers; i++) {
// set transformer layer id
ff.set_transformer_layer_id(i);
1 change: 1 addition & 0 deletions inference/models/llama.h
@@ -16,6 +16,7 @@

// #include "file_loader.h"
#include "flexflow/batch_config.h"
#include "flexflow/ffconst_utils.h"
#include "flexflow/inference.h"
#include "flexflow/request_manager.h"
#include <nlohmann/json.hpp>
4 changes: 4 additions & 0 deletions inference/models/mpt.cc
@@ -64,6 +64,10 @@ void MPT::create_mpt_model(FFModel &ff,
Tensor intermediate_output = nullptr, layernorm_output = nullptr;
Tensor res_ln_outputs[2] = {nullptr, nullptr};

+ff.set_num_transformer_layers(mpt_config.n_layers);
+ff.set_num_kv_heads(mpt_config.n_heads);
+ff.set_qkv_dim(mpt_config.hidden_size / mpt_config.n_heads * 2);
+ff.set_size_dt(data_type_size(input->data_type));
for (int i = 0; i < mpt_config.n_layers; i++) {
ff.set_transformer_layer_id(i);

