Skip to content

Commit

Permalink
ckpt for performance issue
Browse files Browse the repository at this point in the history
  • Loading branch information
Bob-Chen222 committed Nov 5, 2024
1 parent 20cb714 commit 311c450
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 46 deletions.
2 changes: 2 additions & 0 deletions include/flexflow/request_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,8 @@ class RequestManager {
std::vector<int> generated_tokens_per_step;
// To calculate the E2E time of serving
long long server_start_time = 0;
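// Profiling counter: number of times a request was marked unavailable for a
// round because it had zero KV pages when the prefilling batch was prepared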
int num_disabled = 0;
};

ProfileInfo profiling;
Expand Down
4 changes: 2 additions & 2 deletions src/ops/inc_multihead_self_attention.cu
Original file line number Diff line number Diff line change
Expand Up @@ -509,8 +509,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
BatchConfig::max_requests_per_batch() *
max_num_pages * kPagesize;
}else{
key_cache_size = total_kv_cache_size_per_layer / 2;
value_cache_size = total_kv_cache_size_per_layer / 2;
key_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt;
value_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt;
}
break;
}
Expand Down
24 changes: 4 additions & 20 deletions src/runtime/page_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@ void PageManager::free_block_table(BlockTable &block_table) {

void PageManager::free_request(RequestGuid const &request_guid) {
// we only free the blocks that are already used
assert(block_tables.find(request_guid) != block_tables.end());
// assert(block_tables.find(request_guid) != block_tables.end());
printf("free the blocks for request %d\n", request_guid);
BlockTable block_table = block_tables[request_guid];
free_block_table(block_table);
block_tables.erase(request_guid);
Expand All @@ -165,9 +166,9 @@ void PageManager::free_request(RequestGuid const &request_guid) {
// delete the last num_blocks in the request_guid
void PageManager::free_multiple_blocks(RequestGuid const &request_guid,
int num_blocks) {
assert(block_tables.find(request_guid) != block_tables.end());
// assert(block_tables.find(request_guid) != block_tables.end());
auto &block_table = block_tables[request_guid];
assert(num_blocks <= block_table.size());
// assert(num_blocks <= block_table.size());
int num_blocks_allocated = block_table.size();
for (int i = 0; i < num_blocks; i++) {
block_allocator.free(block_table[num_blocks_allocated - i - 1]);
Expand All @@ -179,12 +180,6 @@ void PageManager::free_multiple_blocks(RequestGuid const &request_guid,
return;
}

// int PageManager::get_index_last_block(const RequestGuid& request_guid) const
// {
// const auto& block_table = block_tables.at(request_guid);
// return block_table.back.get_block_number();
// }

std::vector<int> PageManager::get_block_table_indices(
RequestGuid const &request_guid) const {
std::vector<int> indices;
Expand Down Expand Up @@ -220,10 +215,6 @@ PageManager *PageManager::get_page_manager(FFModel *ff,
int qkv_dim = ff->qkv_dim;
int num_transformer_layers = ff->num_transformer_layers;
int pipeline_parallelism_degree = ff->config.pipeline_parallelism_degree;
printf("num_kv_heads: %d, size_dt: %d, qkv_dim: %d, num_transformer_layers: "
"%d, pipeline_parallelism_degree: %d\n",
num_kv_heads, size_dt, qkv_dim, num_transformer_layers,
pipeline_parallelism_degree);
assert(num_kv_heads > 0 && size_dt > 0 && qkv_dim > 0 &&
num_transformer_layers > 0 && pipeline_parallelism_degree > 0); //needs to make sure that the model is initialized
if (page_manager_singleton == nullptr) {
Expand All @@ -250,13 +241,6 @@ size_t PageManager::get_kv_cache_size_per_layer() {
}

// Accessor for the already-constructed PageManager singleton.
// This overload never creates the instance itself; it only asserts that
// page_manager_singleton is non-null and returns it.
PageManager *PageManager::get_page_manager() {
  PageManager *const instance = page_manager_singleton;
  assert(instance != nullptr);
  return instance;
}
Expand Down
48 changes: 24 additions & 24 deletions src/runtime/request_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1184,6 +1184,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
get_num_blocks_allocated(*request);
if (bc.requestsInfo[request_index].num_kv_pages == 0) {
// mark this request as unavailable for one round
profiling.num_disabled++;
bc.request_available[request_index] = false;
}
bc.requestsInfo[request_index].kv_last_page_len =
Expand Down Expand Up @@ -1607,8 +1608,8 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
int idx_to_physical =
append_token_to_block(request, committed_token.token_id, true);
int idx_from_logical = committed_token.from_index;
assert(idx_from_logical >= 0);
assert(idx_from_logical / kPagesize < block_table_before_commit.size());
// assert(idx_from_logical >= 0);
// assert(idx_from_logical / kPagesize < block_table_before_commit.size());
int idx_from_physical =
block_table_before_commit[idx_from_logical / kPagesize] * kPagesize +
committed_token.from_index % kPagesize;
Expand Down Expand Up @@ -1661,10 +1662,10 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
// page attention information
new_bc.requestsInfo[request_index].num_kv_pages =
get_num_blocks_allocated(request);
assert(new_bc.requestsInfo[request_index].num_kv_pages > 0);
// assert(new_bc.requestsInfo[request_index].num_kv_pages > 0);
new_bc.requestsInfo[request_index].kv_last_page_len =
get_len_last_block(request);
assert(new_bc.requestsInfo[request_index].kv_last_page_len > 0);
// assert(new_bc.requestsInfo[request_index].kv_last_page_len > 0);
new_bc.requestsInfo[request_index].request_guid = request.guid;
}

Expand Down Expand Up @@ -1985,9 +1986,6 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
/* --------- Page Attention Related Functions --------- */
// Returns the number of entries in request.blocks.
int RequestManager::get_num_blocks_allocated(Request &request) const {
// Consistency check: the number of blocks tracked on the request must equal
// the number of block-table indices the PageManager reports for request.guid.
// NOTE(review): get_block_table_indices returns a std::vector<int> by value,
// so this check builds a temporary vector whenever asserts are enabled --
// confirm that cost is acceptable on this path.
assert(request.blocks.size() == PageManager::get_page_manager()
->get_block_table_indices(request.guid)
.size());
// The container size is implicitly converted to the int return type.
return request.blocks.size();
}

Expand Down Expand Up @@ -2016,7 +2014,7 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
std::vector<int> block_table_indices =
page_manager->get_block_table_indices(request.guid);
if (request.blocks.size() != block_table_indices.size()) {
assert(request.blocks.size() == block_table_indices.size());
// assert(request.blocks.size() == block_table_indices.size());
}
return block_table_indices[idx_logical / kPagesize] * kPagesize +
idx_logical % kPagesize;
Expand All @@ -2026,9 +2024,9 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
void RequestManager::_append_block_to_request(Request &request,
bool is_commit) {
PageManager *page_manager = PageManager::get_page_manager();
assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
assert(request.blocks.size() ==
page_manager->get_block_table_indices(request.guid).size());
// assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
// assert(request.blocks.size() ==
// page_manager->get_block_table_indices(request.guid).size());
// Append the logical block to the request
// page attention: in this function we need to remember the last logical block
// number that still contains committed tokens
Expand All @@ -2037,14 +2035,14 @@ void RequestManager::_append_block_to_request(Request &request,
page_manager->allocate_one_block(request.guid);
std::vector<int> block_table_indices =
page_manager->get_block_table_indices(request.guid);
assert(request.blocks.size() ==
page_manager->get_block_table_indices(request.guid).size());
// assert(request.blocks.size() ==
// page_manager->get_block_table_indices(request.guid).size());
// update page_id_commit
if (is_commit) {
request.page_last_committed++;
int size_blocks = request.blocks.size();
assert(request.page_last_committed <
static_cast<int>(request.blocks.size()));
// assert(request.page_last_committed <
// static_cast<int>(request.blocks.size()));
}
}

Expand All @@ -2058,14 +2056,14 @@ int RequestManager::append_token_to_block(Request &request,
if (request.blocks.empty() || request.blocks.back().is_full()) {
// Append a new logical block
_append_block_to_request(request, is_commit);
assert(request.blocks.size() ==
page_manager->get_block_table_indices(request.guid).size());
// assert(request.blocks.size() ==
// page_manager->get_block_table_indices(request.guid).size());
// also allocate one physical page
}
// insert token to both logical block and physical block
request.blocks.back().append_tokens({token}, is_commit);
assert(request.blocks.size() ==
page_manager->get_block_table_indices(request.guid).size());
// assert(request.blocks.size() ==
// page_manager->get_block_table_indices(request.guid).size());
int idx_logical = get_idx_last_logical_token(request);
assert(idx_logical >= 0);
int idx_physical = idx_logical_to_physical(request, idx_logical);
Expand All @@ -2077,12 +2075,12 @@ void RequestManager::reset_block_table(Request &request) {
// get the indices of original physical block table for request
PageManager *page_manager = PageManager::get_page_manager();
assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
assert(request.blocks.size() ==
page_manager->get_block_table_indices(request.guid).size());
// assert(request.blocks.size() ==
// page_manager->get_block_table_indices(request.guid).size());
std::vector<int> block_table_indices =
page_manager->get_block_table_indices(request.guid);
// reset the block table according to the request's page_last_commit
assert(block_table_indices.size() > request.page_last_committed);
// assert(block_table_indices.size() > request.page_last_committed);
page_manager->free_multiple_blocks(request.guid,
block_table_indices.size() -
request.page_last_committed - 1);
Expand All @@ -2097,8 +2095,8 @@ void RequestManager::reset_block_table(Request &request) {
std::vector<int> block_table =
page_manager->get_block_table_indices(request.guid);

assert(request.blocks.size() ==
page_manager->get_block_table_indices(request.guid).size());
// assert(request.blocks.size() ==
// page_manager->get_block_table_indices(request.guid).size());
return;
}

Expand Down Expand Up @@ -2884,6 +2882,8 @@ void RequestManager::terminate_background_server() {
generated_tokens_per_step += ")";
str += generated_tokens_per_step;

printf("there are %d requests disabled\n", profiling.num_disabled);

std::string mean_generated_tokens_per_step =
"\n mean_generated_tokens_per_step( ";
double mean_generated_tokens =
Expand Down

0 comments on commit 311c450

Please sign in to comment.