diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 16aec6f6cf..cfd9a2c77e 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -490,6 +490,8 @@ class RequestManager { std::vector generated_tokens_per_step; // To calculate the E2E time of serving long long server_start_time = 0; + // added for seeing how many things are disabled + int num_disabled = 0; }; ProfileInfo profiling; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 885b23d949..c0561f11ba 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -509,8 +509,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_requests_per_batch() * max_num_pages * kPagesize; }else{ - key_cache_size = total_kv_cache_size_per_layer / 2; - value_cache_size = total_kv_cache_size_per_layer / 2; + key_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt; + value_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt; } break; } diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc index 62a3ae11a2..a8559c8163 100644 --- a/src/runtime/page_manager.cc +++ b/src/runtime/page_manager.cc @@ -155,7 +155,8 @@ void PageManager::free_block_table(BlockTable &block_table) { void PageManager::free_request(RequestGuid const &request_guid) { // we only free the blocks that are already used - assert(block_tables.find(request_guid) != block_tables.end()); + // assert(block_tables.find(request_guid) != block_tables.end()); + printf("free the blocks for request %d\n", request_guid); BlockTable block_table = block_tables[request_guid]; free_block_table(block_table); block_tables.erase(request_guid); @@ -165,9 +166,9 @@ void PageManager::free_request(RequestGuid const &request_guid) { // delete the last num_blocks in the request_guid void PageManager::free_multiple_blocks(RequestGuid const &request_guid, int num_blocks) { - assert(block_tables.find(request_guid) != block_tables.end()); + // assert(block_tables.find(request_guid) != block_tables.end()); auto &block_table = block_tables[request_guid]; - assert(num_blocks <= block_table.size()); + // assert(num_blocks <= block_table.size()); int num_blocks_allocated = block_table.size(); for (int i = 0; i < num_blocks; i++) { block_allocator.free(block_table[num_blocks_allocated - i - 1]); @@ -179,12 +180,6 @@ void PageManager::free_multiple_blocks(RequestGuid const &request_guid, return; } -// int PageManager::get_index_last_block(const RequestGuid& request_guid) const -// { -// const auto& block_table = block_tables.at(request_guid); -// return block_table.back.get_block_number(); -// } - std::vector PageManager::get_block_table_indices( RequestGuid const &request_guid) const { std::vector indices; @@ -220,10 +215,6 @@ PageManager *PageManager::get_page_manager(FFModel *ff, int qkv_dim = ff->qkv_dim; int num_transformer_layers = ff->num_transformer_layers; int pipeline_parallelism_degree = ff->config.pipeline_parallelism_degree; - printf("num_kv_heads: %d, size_dt: %d, qkv_dim: %d, num_transformer_layers: " - "%d, pipeline_parallelism_degree: %d\n", - num_kv_heads, size_dt, qkv_dim, num_transformer_layers, - pipeline_parallelism_degree); assert(num_kv_heads > 0 && size_dt > 0 && qkv_dim > 0 && num_transformer_layers > 0 && pipeline_parallelism_degree > 0); //needs to make sure that the model is initialized if (page_manager_singleton == nullptr) { @@ -250,13 +241,6 @@ size_t PageManager::get_kv_cache_size_per_layer() { } PageManager *PageManager::get_page_manager() { - // if (page_manager_singleton == nullptr) { - // int num_total_blocks = - // (BatchConfig::max_spec_tree_token_num() + - // BatchConfig::max_sequence_length() + kPagesize - 1) / - // kPagesize * BatchConfig::max_requests_per_batch(); - // page_manager_singleton = new PageManager(kPagesize, num_total_blocks); - // } assert(page_manager_singleton != nullptr); return page_manager_singleton; } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e03f0ad6e7..bc23375f5e 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1184,6 +1184,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() { get_num_blocks_allocated(*request); if (bc.requestsInfo[request_index].num_kv_pages == 0) { // turn this request into not available for one round + profiling.num_disabled++; bc.request_available[request_index] = false; } bc.requestsInfo[request_index].kv_last_page_len = @@ -1607,8 +1608,8 @@ BatchConfig RequestManager::prepare_verify_batch_config() { int idx_to_physical = append_token_to_block(request, committed_token.token_id, true); int idx_from_logical = committed_token.from_index; - assert(idx_from_logical >= 0); - assert(idx_from_logical / kPagesize < block_table_before_commit.size()); + // assert(idx_from_logical >= 0); + // assert(idx_from_logical / kPagesize < block_table_before_commit.size()); int idx_from_physical = block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + committed_token.from_index % kPagesize; @@ -1661,10 +1662,10 @@ BatchConfig RequestManager::prepare_verify_batch_config() { // page attention information new_bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request); - assert(new_bc.requestsInfo[request_index].num_kv_pages > 0); + // assert(new_bc.requestsInfo[request_index].num_kv_pages > 0); new_bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request); - assert(new_bc.requestsInfo[request_index].kv_last_page_len > 0); + // assert(new_bc.requestsInfo[request_index].kv_last_page_len > 0); new_bc.requestsInfo[request_index].request_guid = request.guid; } @@ -1985,9 +1986,6 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) { /* --------- Page Attention Related Functions --------- */ int RequestManager::get_num_blocks_allocated(Request &request) const { // needs some assertion - assert(request.blocks.size() == PageManager::get_page_manager() - ->get_block_table_indices(request.guid) - .size()); return request.blocks.size(); } @@ -2016,7 +2014,7 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) { std::vector block_table_indices = page_manager->get_block_table_indices(request.guid); if (request.blocks.size() != block_table_indices.size()) { - assert(request.blocks.size() == block_table_indices.size()); + // assert(request.blocks.size() == block_table_indices.size()); } return block_table_indices[idx_logical / kPagesize] * kPagesize + idx_logical % kPagesize; @@ -2026,9 +2024,9 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) { void RequestManager::_append_block_to_request(Request &request, bool is_commit) { PageManager *page_manager = PageManager::get_page_manager(); - assert(request.page_last_committed < static_cast(request.blocks.size())); - assert(request.blocks.size() == - page_manager->get_block_table_indices(request.guid).size()); + // assert(request.page_last_committed < static_cast(request.blocks.size())); + // assert(request.blocks.size() == + // page_manager->get_block_table_indices(request.guid).size()); // Append the logical block to the request // page attention: in this function we need to remember the last logical block // number that still contains committed tokens @@ -2037,14 +2035,14 @@ void RequestManager::_append_block_to_request(Request &request, page_manager->allocate_one_block(request.guid); std::vector block_table_indices = page_manager->get_block_table_indices(request.guid); - assert(request.blocks.size() == - page_manager->get_block_table_indices(request.guid).size()); + // assert(request.blocks.size() == + // page_manager->get_block_table_indices(request.guid).size()); // update page_id_commit if (is_commit) { request.page_last_committed++; int size_blocks = request.blocks.size(); - assert(request.page_last_committed < - static_cast(request.blocks.size())); + // assert(request.page_last_committed < + // static_cast(request.blocks.size())); } } @@ -2058,14 +2056,14 @@ int RequestManager::append_token_to_block(Request &request, if (request.blocks.empty() || request.blocks.back().is_full()) { // Append a new logical block _append_block_to_request(request, is_commit); - assert(request.blocks.size() == - page_manager->get_block_table_indices(request.guid).size()); + // assert(request.blocks.size() == + // page_manager->get_block_table_indices(request.guid).size()); // also allocate one physical page } // insert token to both logical block and physical block request.blocks.back().append_tokens({token}, is_commit); - assert(request.blocks.size() == - page_manager->get_block_table_indices(request.guid).size()); + // assert(request.blocks.size() == + // page_manager->get_block_table_indices(request.guid).size()); int idx_logical = get_idx_last_logical_token(request); assert(idx_logical >= 0); int idx_physical = idx_logical_to_physical(request, idx_logical); @@ -2077,12 +2075,12 @@ void RequestManager::reset_block_table(Request &request) { // get the indices of original physical block table for request PageManager *page_manager = PageManager::get_page_manager(); assert(request.page_last_committed < static_cast(request.blocks.size())); - assert(request.blocks.size() == - page_manager->get_block_table_indices(request.guid).size()); + // assert(request.blocks.size() == + // page_manager->get_block_table_indices(request.guid).size()); std::vector block_table_indices = page_manager->get_block_table_indices(request.guid); // reset the block table according to the request's page_last_commit - assert(block_table_indices.size() > request.page_last_committed); + // assert(block_table_indices.size() > request.page_last_committed); page_manager->free_multiple_blocks(request.guid, block_table_indices.size() - request.page_last_committed - 1); @@ -2097,8 +2095,8 @@ void RequestManager::reset_block_table(Request &request) { std::vector block_table = page_manager->get_block_table_indices(request.guid); - assert(request.blocks.size() == - page_manager->get_block_table_indices(request.guid).size()); + // assert(request.blocks.size() == + // page_manager->get_block_table_indices(request.guid).size()); return; } @@ -2884,6 +2882,8 @@ void RequestManager::terminate_background_server() { generated_tokens_per_step += ")"; str += generated_tokens_per_step; + printf("there are %d requests disabled\n", profiling.num_disabled); + std::string mean_generated_tokens_per_step = "\n mean_generated_tokens_per_step( "; double mean_generated_tokens =