From 5250a3b78ca1238e9ac7c08a0985cefbf0148de6 Mon Sep 17 00:00:00 2001 From: Bob-Chen222 Date: Tue, 5 Nov 2024 20:27:39 -0800 Subject: [PATCH] ckpt for nothing --- src/ops/kernels/inc_multihead_self_attention_kernels.cu | 2 +- src/runtime/request_manager.cc | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu index 1a697fbda8..f6c3da3190 100644 --- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu +++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu @@ -521,7 +521,7 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m, int const max_num_pages = round_up_pages(BatchConfig::max_sequence_length() + BatchConfig::max_spec_tree_token_num()); - update_qkv_in_batch_verify_kernel<<>>( diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d156c62532..44e719e667 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1310,6 +1310,9 @@ BatchConfig RequestManager::prepare_decoding_batch() { int idx_to_physical = append_token_to_block(request, request.tokens.back(), true); bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request); bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request); + bc.requestsInfo[request_index].request_guid = request.guid; + printf("Request %d, token %d, idx_to_physical %d\n", request.guid, request.tokens.back(), idx_to_physical); + printf("Request %d, num_kv_pages %d, kv_last_page_len %d\n", request.guid, bc.requestsInfo[request_index].num_kv_pages, bc.requestsInfo[request_index].kv_last_page_len); bc.num_tokens++; @@ -2521,7 +2524,6 @@ void RequestManager::background_serving_task( } // page attention: initalize the page manager here int kv_cache_size = rm->get_max_kv_cache_size(); - printf("KV cache size: %d\n", kv_cache_size); PageManager::get_page_manager(llm, rm->get_max_kv_cache_size()); if (rm->decoding_mode == INCREMENTAL_DECODING) { // No SSMs: perform incremental decoding @@ -2738,6 +2740,10 @@ void RequestManager::terminate_background_server_at_exit() { void RequestManager::terminate_background_server() { if (is_background_server_serving()) { + printf("profiling llm step times size: %ld\n", + profiling.llm_step_times.size()); + printf("profiling requests per step size: %ld\n", + profiling.requests_per_step.size()); assert(profiling.llm_step_times.size() == profiling.requests_per_step.size()); // Write the last profiling statistics to output file