From a97f3c347a823f76465034822252125fe8271af8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 4 Oct 2024 04:49:04 +0000 Subject: [PATCH 1/8] first commit --- CMakeLists.txt | 10 ++++++++++ src/runtime/request_manager.cc | 1 + 2 files changed, 11 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e24e1e54b..c0c75f8686 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -296,6 +296,16 @@ if(NOT BUILD_LEGION_ONLY) endif() set(FLEXFLOW_CPP_DRV_SRC ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) + # SuffixDecoding + include(FetchContent) + FetchContent_Declare( + suffix_decoding + GIT_REPOSITORY git@github.com:Snowflake-Labs/suffix-tree-decoding.git + GIT_TAG main # or a specific tag/commit hash + ) + FetchContent_MakeAvailable(suffix_decoding) + list(APPEND FLEXFLOW_INCLUDE_DIRS ${suffix_decoding_SOURCE_DIR}/src) + list(APPEND FLEXFLOW_SRC ${suffix_decoding_SOURCE_DIR}/src/suffix_decoding.cc) add_library(substitution_loader SHARED ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 31a32dd3c8..24fdc2e572 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -18,6 +18,7 @@ #include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" +#include "suffix_decoding.h" #include #include #include From 673184d41008e075fcaeb031595fefd30f2f79c9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 7 Oct 2024 03:50:26 +0000 Subject: [PATCH 2/8] update --- src/runtime/request_manager.cc | 71 ++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 24fdc2e572..d49a20a2f6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -3002,6 +3002,77 @@ void RequestManager::serve_spec_infer(FFModel *llm) { } } +/*static*/ +void RequestManager::serve_suffix_decoding(FFModel *llm) { + Context ctx = llm->config.lg_ctx; + Runtime *runtime = llm->config.lg_hlr; + InferenceManager *im = InferenceManager::get_inference_manager(); + { + // Compile the llm + im->compile_model_and_allocate_buffer(llm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[llm]->load_weights(llm); + // init operators + im->init_operators_inference(llm); + } + + std::queue> + batch_pipeline; + // Legion futures for inc_decoding and spec_infer + TreeVerifyBatchConfigFuture last_tree_bcf; + InferenceResultFuture last_tree_irf; + { + // Initialize futures for spec infer + TreeVerifyBatchConfig tree_bc; + InferenceResult tree_ir; + last_tree_bcf = Future::from_value(tree_bc); + last_tree_irf = Future::from_value(tree_ir); + } + batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf)); + + while (!is_background_server_terminated()) { + + if (batch_pipeline.size() >= 4) { + // Block here to avoid launching too many batches + auto const &batch = batch_pipeline.front(); + batch.second.get_void_result(); + } + // deque finished batches + while (batch_pipeline.size() > 1) { + auto const &batch = batch_pipeline.front(); + if (batch.second.is_ready()) { + batch_pipeline.pop(); + } else { + break; + } + } + + runtime->begin_trace(ctx, 12347 /*trace_id*/); + auto const &next_batch = batch_pipeline.back(); + + BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(next_batch.first, next_batch.second, 0, ctx, runtime); + FutureMap fm = 
im->inference(ssm, 0, beam_bcf); + assert(fm.get_future_map_domain().get_volume() == 1); + BeamInferenceResultFuture beam_irf = fm.get_future(0); + beam_bcf = prepare_next_batch_beam(beam_bcf, beam_irf, ctx, runtime); + std::vector beam_bcf_vec(1); + beam_bcf_vec[0] = beam_bcf; + // Token Tree Verification + { + TreeVerifyBatchConfigFuture tree_bcf = prepare_next_batch_verify(beam_bcf, ctx, runtime); + FutureMap fm = im->inference(llm, 0, tree_bcf); + assert(fm.get_future_map_domain().get_volume() == 1); + InferenceResultFuture tree_irf = fm.get_future(0); + batch_pipeline.push(std::make_pair(tree_bcf, tree_irf)); + last_tree_bcf = tree_bcf; + last_tree_irf = tree_irf; + } + runtime->end_trace(ctx, 12347 /*trace_id*/); + } +} + void RequestManager::trigger_request_completion_future( RequestGuid const &guid) { const std::lock_guard lock(request_to_promise_mutex); From 4a3d1bd811b8494b8ac900f19bfc8a3136d44399 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 7 Oct 2024 04:03:52 +0000 Subject: [PATCH 3/8] update --- src/runtime/request_manager.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d49a20a2f6..f13277ddd1 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -3053,7 +3053,7 @@ void RequestManager::serve_suffix_decoding(FFModel *llm) { auto const &next_batch = batch_pipeline.back(); BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(next_batch.first, next_batch.second, 0, ctx, runtime); - FutureMap fm = im->inference(ssm, 0, beam_bcf); + FutureMap fm = im->suffix_decode(llm, 0, beam_bcf); assert(fm.get_future_map_domain().get_volume() == 1); BeamInferenceResultFuture beam_irf = fm.get_future(0); beam_bcf = prepare_next_batch_beam(beam_bcf, beam_irf, ctx, runtime); @@ -3061,7 +3061,7 @@ void RequestManager::serve_suffix_decoding(FFModel *llm) { beam_bcf_vec[0] = beam_bcf; // Token Tree Verification { - TreeVerifyBatchConfigFuture tree_bcf = prepare_next_batch_verify(beam_bcf, ctx, runtime); + TreeVerifyBatchConfigFuture tree_bcf = prepare_next_batch_verify(beam_bcf_vec, ctx, runtime); FutureMap fm = im->inference(llm, 0, tree_bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture tree_irf = fm.get_future(0); From d2caba835bfab2fabcb06648e59808c629d17ef6 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 7 Oct 2024 19:40:17 +0000 Subject: [PATCH 4/8] update --- include/flexflow/model.h | 1 + include/flexflow/request_manager.h | 19 +++ src/mapper/mapper.cc | 1 + src/runtime/model.cc | 21 +++ src/runtime/request_manager.cc | 257 ++++++++++++++++++++++++----- 5 files changed, 259 insertions(+), 40 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 4ad735ef7d..8fcee5e2f6 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -277,6 +277,7 @@ enum TaskIDs { RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, + RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID, RM_BACKGROUND_SERVING_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f0fab957ee..45731efe33 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -19,6 +19,7 @@ #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/utils/file_loader.h" +#include "suffix_decoding.h" #include #include #include @@ -164,6 
+165,7 @@ class RequestManager { void serve_incr_decoding(FFModel *model); void serve_spec_infer(FFModel *model); + void serve_suffix_decoding(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); RequestGuid register_new_request(Request const &request_); RequestGuid register_new_peft_request(Request const &request_); @@ -210,6 +212,15 @@ class RequestManager { Legion::Context ctx, Legion::Runtime *runtime); + TreeVerifyBatchConfig + prepare_next_batch_suffix_decode(TreeVerifyBatchConfig const &old_bc, + InferenceResult const &result); + TreeVerifyBatchConfigFuture prepare_next_batch_suffix_decode( + TreeVerifyBatchConfigFuture const &old_bc, + InferenceResultFuture const &result, + Legion::Context ctx, + Legion::Runtime *runtime); + void store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); void update_beam_metadata(BeamSearchBatchConfig &new_bc, @@ -280,6 +291,12 @@ class RequestManager { Legion::Context ctx, Legion::Runtime *runtime); + static TreeVerifyBatchConfig prepare_next_batch_suffix_decode_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + private: // configuration parameters int max_requests_per_batch; @@ -295,6 +312,8 @@ class RequestManager { // tree width in each speculative step, if not specified 1 std::vector spec_infer_tree_width; + SuffixTree *suffix_tree; + // private fields std::unique_ptr tokenizer_; bool verbose; diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d7b9a5e99d..d321aeb583 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -284,6 +284,7 @@ void FFMapper::select_task_options(const MapperContext ctx, (task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID) || + (task.task_id == RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID) || (task.task_id == RM_BACKGROUND_SERVING_TASK_ID)) { output.initial_proc = all_cpus[0]; return; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 52f1dd2220..4e8aebe2aa 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4822,6 +4822,27 @@ void register_flexflow_internal_tasks(Runtime *runtime, RequestManager::prepare_next_batch_verify_task>(registrar); } } + // RequestManager prepare_next_batch_suffix_decode + { + TaskVariantRegistrar registrar( + RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID, + "RequestManager Prepare Next Batch (Suffix Decode)"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + TreeVerifyBatchConfig, + RequestManager::prepare_next_batch_suffix_decode_task>( + registrar, "RequestManager Prepare Next Batch (Suffix Decode) Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant< + TreeVerifyBatchConfig, + RequestManager::prepare_next_batch_suffix_decode_task>(registrar); + } + } // RequestManager background serving task { TaskVariantRegistrar registrar(RM_BACKGROUND_SERVING_TASK_ID, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index f13277ddd1..90d060c97f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -18,7 +18,6 @@ #include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" -#include "suffix_decoding.h" #include #include 
#include @@ -97,6 +96,124 @@ std::ostream &operator<<(std::ostream &os, Request const &req) { bool RequestManager::inference_finished = false; +DailSqlTrace load_trace_dail_sql(std::string const &trace_filepath) { + std::filesystem::path cwd = std::filesystem::current_path(); + std::ifstream file(trace_filepath); + assert(file.good() && "File does not exist or cannot be opened"); + + nlohmann::json data; + try { + file >> data; + } catch (nlohmann::json::parse_error &e) { + std::cerr << "JSON parse error: " << e.what() << std::endl; + assert(false); + } + std::cout << "finished loading json file: " << trace_filepath << std::endl; + + DailSqlTrace trace; + for (auto const &question : data["questions"]) { + trace.prompts.push_back(question["prompt"]); + trace.responses.push_back(question["response"]); + } + + assert(trace.prompts.size() == trace.responses.size()); + return trace; +} + +std::string replaceSlashes(std::string str) { + size_t pos = 0; + while ((pos = str.find('/', pos)) != std::string::npos) { + str.replace(pos, 1, "--"); + pos += 2; + } + return str; +} + +std::string getHFHome() { + if (char const *env_p = std::getenv("HF_HOME")) { + return std::string(env_p); + } else { + std::filesystem::path home = std::filesystem::path(getenv("HOME")); + return (home / ".cache" / "huggingface").string(); + } +} + +std::string get_tokenizer_base_folder(std::string const &model_name) { + // Replace '/' with '--' in the model name + std::string model_name_without_slashes = model_name; + model_name_without_slashes = replaceSlashes(model_name_without_slashes); + + // Construct the base path + std::string hf_home = getHFHome(); + + std::string base_path = + hf_home + "/hub/models--" + model_name_without_slashes + "/snapshots/"; + + // Find the first subfolder in the snapshots directory + std::string first_folder_name; + for (auto const &entry : fs::directory_iterator(base_path)) { + if (fs::is_directory(entry)) { + first_folder_name = entry.path().filename().string(); + break; + } + } + return base_path + first_folder_name; +} + +std::string get_tokenizer_path(std::string const &model_name) { + std::string base_folder = get_tokenizer_base_folder(model_name); + + if (fs::exists(base_folder + "/tokenizer.model")) { + return base_folder + "/tokenizer.model"; + } else if (fs::exists(base_folder + "/tokenizer.json")) { + return base_folder + "/tokenizer.json"; + } else { + assert(false); + } +} + +int get_bos_token_id(std::string const &model_name) { + std::string base_folder = get_tokenizer_base_folder(model_name); + std::string filename = base_folder + "/config.json"; + if (!fs::exists(filename)) { + assert(false && "config.json not found"); + return -1; + } + // Read the JSON file + std::ifstream file(filename); + if (!file.is_open()) { + assert(false && "Unable to open file"); + return -1; + } + + // Parse JSON + nlohmann::json j; + file >> j; + + // Get the bos_token_id value + if (j.contains("bos_token_id")) { + return j["bos_token_id"].get(); + } else { + assert(false && "bos_token_id not found in JSON"); + return -1; + } + return -1; +} + +auto get_tokenizer(std::string const &model_name) { + std::string tokenizer_path = get_tokenizer_path(model_name); + // if the tokenizer_path ends with the ".json" extension: + if (tokenizer_path.find("tokenizer.json") != std::string::npos) { + auto blob = LoadBytesFromFile(tokenizer_path); + return Tokenizer::FromBlobJSON(blob); + } else if (tokenizer_path.find("tokenizer.model") != std::string::npos) { + auto blob = LoadBytesFromFile(tokenizer_path); + 
return Tokenizer::FromBlobSentencePiece(blob); + } else { + assert(false); + } +} + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -110,6 +227,30 @@ RequestManager::RequestManager() max_tokens_per_batch = -1; max_spec_tree_token_num = -1; max_sequence_length = -1; + + std::string model_name = "meta-llama/Meta-Llama-3-70B"; + DailSqlTrace dail_sql_trace = + load_trace_dail_sql("/usr/suffix-tree-decoding/trace/spider.json"); + int num_prompts = dail_sql_trace.prompts.size(); + int num_responses = dail_sql_trace.responses.size(); + assert(num_prompts == num_responses); + auto tokenizer = get_tokenizer(model_name); + int bos_token_id = get_bos_token_id(model_name); + int train_size = num_prompts / 2; + std::cout << "Number of prompts: " << num_prompts << std::endl; + std::cout << "Train size: " << train_size << std::endl; + std::vector> training_dataset; + for (int i = 0; i < train_size; i++) { + std::string text = dail_sql_trace.prompts[i] + dail_sql_trace.responses[i]; + std::vector encoded = tokenizer->Encode(text); + encoded.insert(encoded.begin(), bos_token_id); + training_dataset.push_back(encoded); + } + suffix_tree = new SuffixTree(50); + for (auto const &text : training_dataset) { + suffix_tree->insert(text, suffix_tree->query_guid); + suffix_tree->query_guid++; + } } void RequestManager::set_max_requests_per_batch(int max_num_requests) { @@ -1721,6 +1862,38 @@ BeamSearchBatchConfig return new_bc; } +TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_suffix_decode( + TreeVerifyBatchConfigFuture const &old_bc, + InferenceResultFuture const &result, + Context ctx, + Runtime *runtime) { + + RequestManager *rm = this; + TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID, + TaskArgument(&rm, sizeof(RequestManager *))); + launcher.add_future(old_bc); + launcher.add_future(result); + return runtime->execute_task(ctx, launcher); +} + +TreeVerifyBatchConfig RequestManager::prepare_next_batch_suffix_decode_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + RequestManager *rm = *((RequestManager **)task->args); + TreeVerifyBatchConfig const &bc = + Future(task->futures[0]).get_result(); + InferenceResult const &result = + Future(task->futures[1]).get_result(); + return rm->prepare_next_batch_suffix_decode(bc, result); +} + +TreeVerifyBatchConfig RequestManager::prepare_next_batch_suffix_decode( + TreeVerifyBatchConfig const &old_bc, InferenceResult const &result) { + const std::lock_guard lock(request_queue_mutex); +} + /***** Verify Phase *****/ TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify( @@ -2802,8 +2975,15 @@ void RequestManager::background_serving_task( "###PEFT DEBUGGING### Updated models' configuration."); if (rm->get_num_ssms() == 0) { - // No SSMs: perform incremental decoding - rm->serve_incr_decoding(llm); + + char const *env_var = std::getenv("FF_SUFFIX_DECODING"); + + if (env_var != nullptr && std::string(env_var) == "1") { + rm->serve_suffix_decoding(llm); + } else { + // No SSMs: perform incremental decoding + rm->serve_incr_decoding(llm); + } } else { // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); @@ -3004,33 +3184,41 @@ void RequestManager::serve_spec_infer(FFModel *llm) { /*static*/ void RequestManager::serve_suffix_decoding(FFModel *llm) { + + // Check if the model object exists + if (llm == nullptr) { + std::cout << "###PEFT DEBUGGING### LLM Model object 
does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### LLM Model object exists." << std::endl; + } + Context ctx = llm->config.lg_ctx; Runtime *runtime = llm->config.lg_hlr; + // Compile the llm InferenceManager *im = InferenceManager::get_inference_manager(); + im->compile_model_and_allocate_buffer(llm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[llm]->load_weights(llm); + // init operators + im->init_operators_inference(llm); + // Legion futures for inc_decoding and spec_infer + TreeVerifyBatchConfigFuture last_bcf; + InferenceResultFuture last_irf; { - // Compile the llm - im->compile_model_and_allocate_buffer(llm); - assert(im->model_weights_loaders.find(llm) != - im->model_weights_loaders.end()); - // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); - // init operators - im->init_operators_inference(llm); + // Initialize futures for incr decoding + TreeVerifyBatchConfig bc; + InferenceResult ir; + last_bcf = Future::from_value(bc); + last_irf = Future::from_value(ir); } std::queue> batch_pipeline; - // Legion futures for inc_decoding and spec_infer - TreeVerifyBatchConfigFuture last_tree_bcf; - InferenceResultFuture last_tree_irf; - { - // Initialize futures for spec infer - TreeVerifyBatchConfig tree_bc; - InferenceResult tree_ir; - last_tree_bcf = Future::from_value(tree_bc); - last_tree_irf = Future::from_value(tree_ir); - } - batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf)); + { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); } while (!is_background_server_terminated()) { @@ -3048,27 +3236,16 @@ void RequestManager::serve_suffix_decoding(FFModel *llm) { break; } } - runtime->begin_trace(ctx, 12347 /*trace_id*/); auto const &next_batch = batch_pipeline.back(); - - BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(next_batch.first, next_batch.second, 0, ctx, runtime); - FutureMap fm = im->suffix_decode(llm, 0, beam_bcf); + TreeVerifyBatchConfigFuture bcf = prepare_next_batch_suffix_decode( + next_batch.first, next_batch.second, ctx, runtime); + FutureMap fm = im->inference(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); - BeamInferenceResultFuture beam_irf = fm.get_future(0); - beam_bcf = prepare_next_batch_beam(beam_bcf, beam_irf, ctx, runtime); - std::vector beam_bcf_vec(1); - beam_bcf_vec[0] = beam_bcf; - // Token Tree Verification - { - TreeVerifyBatchConfigFuture tree_bcf = prepare_next_batch_verify(beam_bcf_vec, ctx, runtime); - FutureMap fm = im->inference(llm, 0, tree_bcf); - assert(fm.get_future_map_domain().get_volume() == 1); - InferenceResultFuture tree_irf = fm.get_future(0); - batch_pipeline.push(std::make_pair(tree_bcf, tree_irf)); - last_tree_bcf = tree_bcf; - last_tree_irf = tree_irf; - } + InferenceResultFuture irf = fm.get_future(0); + batch_pipeline.push(std::make_pair(bcf, irf)); + last_bcf = bcf; + last_irf = irf; runtime->end_trace(ctx, 12347 /*trace_id*/); } } From 0194ef221463705c053519792d4d4131641ea26b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Oct 2024 05:43:25 +0000 Subject: [PATCH 5/8] update --- CMakeLists.txt | 1 + inference/suffix_decoding/CMakeLists.txt | 38 +++ inference/suffix_decoding/Makefile | 37 +++ inference/suffix_decoding/suffix_decoding.cc | 129 ++++++++ inference/suffix_decoding/utils.cc | 311 +++++++++++++++++++ inference/suffix_decoding/utils.h | 65 ++++ 
src/ops/inc_multihead_self_attention.cu | 2 +- 7 files changed, 582 insertions(+), 1 deletion(-) create mode 100644 inference/suffix_decoding/CMakeLists.txt create mode 100644 inference/suffix_decoding/Makefile create mode 100644 inference/suffix_decoding/suffix_decoding.cc create mode 100644 inference/suffix_decoding/utils.cc create mode 100644 inference/suffix_decoding/utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c0c75f8686..d0a1c3a4fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -544,6 +544,7 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_INFERENCE) add_subdirectory(inference/spec_infer) + add_subdirectory(inference/suffix_decoding) add_subdirectory(inference/incr_decoding) add_subdirectory(inference/peft) endif() diff --git a/inference/suffix_decoding/CMakeLists.txt b/inference/suffix_decoding/CMakeLists.txt new file mode 100644 index 0000000000..c865ae04e6 --- /dev/null +++ b/inference/suffix_decoding/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_SpecInfer) +set(project_target suffix_decoding) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + suffix_decoding.cc + utils.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target} ${CPU_SRC}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target} ${CPU_SRC}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/inference/suffix_decoding/Makefile b/inference/suffix_decoding/Makefile new file mode 100644 index 0000000000..0e4b79f51f --- /dev/null +++ b/inference/suffix_decoding/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/suffix_decoding/suffix_decoding.cc b/inference/suffix_decoding/suffix_decoding.cc new file mode 100644 index 0000000000..d2b4aa17c7 --- /dev/null +++ b/inference/suffix_decoding/suffix_decoding.cc @@ -0,0 +1,129 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suffix_decoding/utils.h" + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + + + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + FilePaths file_paths; + ModelMeta model_metadata; + bool use_full_precision = false; + bool verbose = false; + int max_requests_per_batch = 16; + int max_tokens_per_batch = 256; + int max_sequence_length = 1024; + int max_spec_tree_token_num = 23; + int expansion_degree = 3; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + model_metadata.model_names, + use_full_precision, + verbose, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + expansion_degree); + + get_model_meta(file_paths, model_metadata, use_full_precision); + + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + GenerationConfig generationConfig; + InferenceManager *im = InferenceManager::get_inference_manager(); + RequestManager *rm = RequestManager::get_request_manager(); + init_request_manager(rm, + model_metadata, + file_paths, + max_requests_per_batch, + max_tokens_per_batch, + max_spec_tree_token_num, + max_sequence_length, + expansion_degree); + + // Create LLM model + FFModel tree_model(ffconfig, ffconfig.cpu_offload); + init_llm(tree_model, model_metadata, generationConfig, use_full_precision); + + // Create SSM models + int num_ssms = model_metadata.ssm_model_types.size(); + std::vector ssm_models; + FFConfig bm_config = ffconfig; + bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree = + 
bm_config.pipeline_parallelism_degree = 1; + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + FFModel beam_model(bm_config); + ssm_models.push_back(beam_model); + } + init_ssms(rm, ssm_models, num_ssms, model_metadata, generationConfig, use_full_precision); + + rm->start_background_server(&tree_model); + + // Register requests from prompt file + int total_num_requests = 0; + { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + std::vector requests; + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Add inference request + Request inference_req; + inference_req.prompt = text; + inference_req.max_length = 128; + requests.push_back(inference_req); + total_num_requests++; + } + tree_model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/suffix_decoding/utils.cc b/inference/suffix_decoding/utils.cc new file mode 100644 index 0000000000..0f20c1e5ff --- /dev/null +++ b/inference/suffix_decoding/utils.cc @@ -0,0 +1,311 @@ +#include "suffix_decoding/utils.h" + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + ModelNames &model_names, + bool &use_full_precision, + bool &verbose, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &expansion_degree) { + for (int i = 1; i < argc; i++) { + // llm model name + if (!strcmp(argv[i], "-llm-model")) { + model_names.llm_model_name = std::string(argv[++i]); + for (char &c : model_names.llm_model_name) { + c = std::tolower(c); + } + continue; + } + // ssm models names + if (!strcmp(argv[i], "-ssm-model")) { + std::string ssm_model_name = std::string(argv[++i]); + for (char &c : ssm_model_name) { + c = std::tolower(c); + } + model_names.ssm_model_names.push_back(ssm_model_name); + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--expansion-degree")) { + expansion_degree = std::stoi(argv[++i]); + continue; + } + } + if 
(paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void get_model_meta(FilePaths &file_paths, + ModelMeta &model_metadata, + bool use_full_precision) { + if (model_metadata.model_names.llm_model_name.empty() || + model_metadata.model_names.ssm_model_names.size() == 0) { + assert(false && "SpecInfer needs at least one LLM and one SSM for " + "speculative inference"); + } + model_metadata.llm_model_config_path = + join_path({file_paths.cache_folder_path, + "configs", + model_metadata.model_names.llm_model_name, + "config.json"}); + model_metadata.llm_tokenizer_path = + join_path({file_paths.cache_folder_path, + "tokenizers", + model_metadata.model_names.llm_model_name}); + model_metadata.llm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + model_metadata.model_names.llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path); + if (!llm_config_file_handle.good()) { + std::cout << "LLM Model config file " + << model_metadata.llm_model_config_path << " not found." + << std::endl; + assert(false); + } + json llm_model_config = json::parse(llm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + model_metadata.llm_model_type = ModelType::UNKNOWN; + auto architectures = llm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_metadata.llm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_metadata.llm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_metadata.llm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + model_metadata.llm_model_type = ModelType::MPT; + break; + } + } + model_metadata.bos_token_id = + llm_model_config.find("bos_token_id") == llm_model_config.end() + ? -1 + : (int)llm_model_config.at("bos_token_id"); + model_metadata.eos_token_id = + llm_model_config.find("eos_token_id") == llm_model_config.end() + ? -1 + : (int)llm_model_config.at("eos_token_id"); + + for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { + std::string ssm_config_path = join_path({file_paths.cache_folder_path, + "configs", + ssm_model_name, + "config.json"}); + std::string ssm_tokenizer_path = + join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name}); + std::string ssm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + ssm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream ssm_config_file_handle(ssm_config_path); + if (!ssm_config_file_handle.good()) { + std::cout << "SSM Model config file " << ssm_config_path << " not found." 
+ << std::endl; + assert(false); + } + json ssm_model_config = json::parse(ssm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + ModelType ssm_model_type = ModelType::UNKNOWN; + auto architectures = ssm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + ssm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + ssm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM") { + ssm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + ssm_model_type = ModelType::MPT; + break; + } + } + int ssm_bos_id = + ssm_model_config.find("bos_token_id") == ssm_model_config.end() + ? -1 + : (int)ssm_model_config.at("bos_token_id"); + int ssm_eos_id = + ssm_model_config.find("eos_token_id") == ssm_model_config.end() + ? -1 + : (int)ssm_model_config.at("eos_token_id"); + if (ssm_bos_id != model_metadata.bos_token_id || + ssm_eos_id != model_metadata.eos_token_id) { + printf("Warning: bos/eos token id mismatch between LLM and one of the " + "SSMs!\n"); + } + model_metadata.ssm_model_types.push_back(ssm_model_type); + model_metadata.ssm_model_config_paths.push_back(ssm_config_path); + model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); + } + + assert(model_metadata.llm_model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + for (auto mt : model_metadata.ssm_model_types) { + if (mt == ModelType::UNKNOWN) { + assert(false && "One of the SSM model types passed is invalid."); + } + } +} + +void init_request_manager(RequestManager *rm, ModelMeta &model_metadata, + FilePaths &file_paths, int max_requests_per_batch, + int max_tokens_per_batch, int max_spec_tree_token_num, int max_sequence_length, + int expansion_degree) { + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_spec_tree_token_num(max_spec_tree_token_num); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer(model_metadata.llm_model_type, + model_metadata.bos_token_id, + model_metadata.eos_token_id, + model_metadata.llm_tokenizer_path); + rm->register_output_filepath(file_paths.output_file_path); + + // first decoding step: 3 results + if (expansion_degree != -1) { + rm->push_spec_infer_tree_width(1); + rm->push_spec_infer_tree_width(1); + rm->push_spec_infer_tree_width(expansion_degree); + } +} + +void init_llm(FFModel &tree_model, ModelMeta &model_metadata, + GenerationConfig &generationConfig, bool use_full_precision) { + if (model_metadata.llm_model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + generationConfig, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::OPT) { + OPT::create_opt_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::FALCON) { + FALCON::create_falcon_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::MPT) { + MPT::create_mpt_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + 
TREE_VERIFY_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "Invalid LLM model type passed (or no type was passed)."); + } +} + +void init_ssms(RequestManager *rm, std::vector &ssm_models, int num_ssms, + ModelMeta &model_metadata, GenerationConfig &generationConfig, + bool use_full_precision) { + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + FFModel &beam_model = ssm_models[ssm_id]; + if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA) { + LLAMA::create_llama_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + BEAM_SEARCH_MODE, + generationConfig, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) { + OPT::create_opt_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + BEAM_SEARCH_MODE, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) { + FALCON::create_falcon_model( + beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + BEAM_SEARCH_MODE, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) { + MPT::create_mpt_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + BEAM_SEARCH_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "Invalid SSM model type passed."); + } + + rm->register_ssm_model(&beam_model); + } +} \ No newline at end of file diff --git a/inference/suffix_decoding/utils.h b/inference/suffix_decoding/utils.h new file mode 100644 index 0000000000..f4f1f90216 --- /dev/null +++ b/inference/suffix_decoding/utils.h @@ -0,0 +1,65 @@ +#include "flexflow/inference.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include +#include +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +struct ModelNames { + std::string llm_model_name; + std::vector ssm_model_names; +}; + +struct ModelMeta { + ModelNames model_names; + + ModelType llm_model_type; + std::string llm_tokenizer_path; + std::string llm_weights_path; + std::string llm_model_config_path; + + int bos_token_id, eos_token_id; + + std::vector ssm_model_types; + std::vector ssm_model_config_paths; + std::vector ssm_model_weights_paths; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + ModelNames &model_names, + bool &use_full_precision, + bool &verbose, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &expansion_degree); + +void get_model_meta(FilePaths &file_paths, + ModelMeta &model_metadata, + bool use_full_precision); + +void init_request_manager(RequestManager *rm, ModelMeta &model_metadata, + FilePaths &file_paths, int max_requests_per_batch, + int max_tokens_per_batch, int max_spec_tree_token_num, + int max_sequence_length, int expansion_degree); + +void init_llm(FFModel &tree_model, ModelMeta &model_metadata, + GenerationConfig &generationConfig, bool use_full_precision); + +void init_ssms(RequestManager *rm, std::vector &ssm_models, int num_ssms, + ModelMeta &model_metadata, GenerationConfig &generationConfig, + bool use_full_precision); \ No newline at end of file diff --git 
a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 2802dd41b6..454926bcdb 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -126,7 +126,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { From e5d05d7b45f3248d70f25ad1f75cdd24d7fea2f7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Oct 2024 06:52:03 +0000 Subject: [PATCH 6/8] update --- inference/suffix_decoding/suffix_decoding.cc | 57 +++++++++++++----- inference/suffix_decoding/utils.cc | 63 ++++++++++++++++++++ inference/suffix_decoding/utils.h | 7 ++- 3 files changed, 110 insertions(+), 17 deletions(-) diff --git a/inference/suffix_decoding/suffix_decoding.cc b/inference/suffix_decoding/suffix_decoding.cc index d2b4aa17c7..49dd819b4a 100644 --- a/inference/suffix_decoding/suffix_decoding.cc +++ b/inference/suffix_decoding/suffix_decoding.cc @@ -22,6 +22,8 @@ using json = nlohmann::json; Legion::Logger log_app("llama"); +void process_partition(RequestManager *rm, std::string input_filename) { +} void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, @@ -30,13 +32,14 @@ void FlexFlow::top_level_task(Task const *task, FFConfig ffconfig; FilePaths file_paths; ModelMeta model_metadata; + std::string partition_name; bool use_full_precision = false; bool verbose = false; int max_requests_per_batch = 16; int max_tokens_per_batch = 256; int max_sequence_length = 1024; int max_spec_tree_token_num = 23; - int expansion_degree = 3; + int expansion_degree = 1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -45,6 +48,7 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, model_metadata.model_names, + partition_name, use_full_precision, verbose, max_requests_per_batch, @@ -57,6 +61,10 @@ void FlexFlow::top_level_task(Task const *task, assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); + + json trace = load_trace(file_paths.prompt_file_path); + json training_entries = get_training_entries(trace, partition_name); + json eval_entries = get_eval_entries(trace, partition_name); GenerationConfig generationConfig; InferenceManager *im = InferenceManager::get_inference_manager(); @@ -88,30 +96,47 @@ void FlexFlow::top_level_task(Task const *task, rm->start_background_server(&tree_model); - // Register requests from prompt file int total_num_requests = 0; { - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - std::vector requests; - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + for (auto entry: eval_entries) { + std::string prompt = entry["prompt"]; + int response_length = entry["response_length"]; + // 
printf("Prompt[%d]: %s\n", total_num_requests, prompt.c_str()); // Add inference request Request inference_req; - inference_req.prompt = text; - inference_req.max_length = 128; + inference_req.prompt = prompt; + inference_req.max_new_tokens = response_length; requests.push_back(inference_req); total_num_requests++; } tree_model.generate(requests); - } + } + + // Register requests from prompt file + // int total_num_requests = 0; + // { + // using json = nlohmann::json; + // std::ifstream file_handle(file_paths.prompt_file_path); + // assert(file_handle.good() && "Prompt file does not exist."); + // json prompt_json = json::parse(file_handle, + // /*parser_callback_t */ nullptr, + // /*allow_exceptions */ true, + // /*ignore_comments */ true); + + // std::vector requests; + // for (auto &prompt : prompt_json) { + // std::string text = prompt.get(); + // printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // // Add inference request + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_length = 128; + // requests.push_back(inference_req); + // total_num_requests++; + // } + // tree_model.generate(requests); + // } // terminate the request manager by stopping the background thread rm->terminate_background_server(); diff --git a/inference/suffix_decoding/utils.cc b/inference/suffix_decoding/utils.cc index 0f20c1e5ff..f71aac2c26 100644 --- a/inference/suffix_decoding/utils.cc +++ b/inference/suffix_decoding/utils.cc @@ -8,6 +8,7 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, ModelNames &model_names, + std::string &partition_name, bool &use_full_precision, bool &verbose, int &max_requests_per_batch, @@ -37,6 +38,11 @@ void parse_input_args(char **argv, paths.cache_folder_path = std::string(argv[++i]); continue; } + // cache folder + if (!strcmp(argv[i], "-partition-name")) { + partition_name = std::string(argv[++i]); + continue; + } // prompts if (!strcmp(argv[i], "-prompt")) { paths.prompt_file_path = std::string(argv[++i]); @@ -308,4 +314,61 @@ void init_ssms(RequestManager *rm, std::vector &ssm_models, int num_ssm rm->register_ssm_model(&beam_model); } +} + +json load_trace(std::string input_filename) { + std::cout << "Loading input file: " << input_filename << std::endl; + std::ifstream file(input_filename); + if (!file.is_open()) { + std::cerr << "Error opening file: " << input_filename << std::endl; + return nullptr; + } + + try { + json data = json::parse(file); + + // Print metadata + const auto& metadata = data["metadata"]; + std::cout << "Metadata:" << std::endl; + std::cout << "Average entries per partition: " << metadata["avg_entries_per_partition"] << std::endl; + std::cout << "Max prompt length: " << metadata["max_prompt_length"] << std::endl; + std::cout << "Min prompt length: " << metadata["min_prompt_length"] << std::endl; + std::cout << "Avg prompt length: " << metadata["avg_prompt_length"] << std::endl; + std::cout << "Max response length: " << metadata["max_response_length"] << std::endl; + std::cout << "Min response length: " << metadata["min_response_length"] << std::endl; + std::cout << "Avg response length: " << metadata["avg_response_length"] << std::endl; + // Print list of partition names + const auto& partitions = data["partitions"]; + std::cout << "Partitions:" << std::endl; + int counter = 0; + for (const auto& partition : partitions) { + std::cout << counter++ << ". 
" << partition["name"] << std::endl; + } + } + catch (json::parse_error& e) { + std::cerr << "JSON parse error: " << e.what() << std::endl; + return nullptr; + } +} + +json get_training_entries(json data, std::string partition_name) { + const auto& partitions = data["partitions"]; + for (const auto& partition : partitions) { + if (partition["name"] == partition_name) { + return partition["training_entries"]; + } + } + std::cerr << "Partition not found: " << partition_name << std::endl; + return 1; +} + +json get_eval_entries(json data, std::string partition_name) { + const auto& partitions = data["partitions"]; + for (const auto& partition : partitions) { + if (partition["name"] == partition_name) { + return partition["eval_entries"]; + } + } + std::cerr << "Partition not found: " << partition_name << std::endl; + return 1; } \ No newline at end of file diff --git a/inference/suffix_decoding/utils.h b/inference/suffix_decoding/utils.h index f4f1f90216..7b56c06952 100644 --- a/inference/suffix_decoding/utils.h +++ b/inference/suffix_decoding/utils.h @@ -41,6 +41,7 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, ModelNames &model_names, + std::string &partition_name, bool &use_full_precision, bool &verbose, int &max_requests_per_batch, @@ -62,4 +63,8 @@ void init_llm(FFModel &tree_model, ModelMeta &model_metadata, void init_ssms(RequestManager *rm, std::vector &ssm_models, int num_ssms, ModelMeta &model_metadata, GenerationConfig &generationConfig, - bool use_full_precision); \ No newline at end of file + bool use_full_precision); + +json load_trace(std::string filename); +json get_training_entries(json data, std::string partition_name); +json get_eval_entries(json data, std::string partition_name); \ No newline at end of file From 6f4249fd618b535d31df907c821294ef28d8e8c7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Oct 2024 06:59:09 +0000 Subject: [PATCH 7/8] add script --- run_suffix_decoding.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 run_suffix_decoding.sh diff --git a/run_suffix_decoding.sh b/run_suffix_decoding.sh new file mode 100644 index 0000000000..e124f5e4c0 --- /dev/null +++ b/run_suffix_decoding.sh @@ -0,0 +1,20 @@ +#! 
/usr/bin/env bash +set -e +set -x + +# Cd into parent directory of folder holding this script +cd "${BASH_SOURCE[0]%/*}/build" + +# Download models +python ../inference/utils/download_hf_model.py --half-precision-only meta-llama/Meta-Llama-3-8B Felladrin/Llama-160M-Chat-v1 + +./suffix_decoding/suffix_decoding \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ + -ll:fsize 20000 -ll:zsize 30000 \ + -llm-model meta-llama/Meta-Llama-3-8B \ + -ssm-model Felladrin/Llama-160M-Chat-v1 \ + -partition-name "" \ + -prompt ../../suffix_decoding/trace/spider_v2.json \ + -output-file ../inference/output/spider_v2.out + From 08d15178720fe002a9c2516cc6b19d8cf303b6b3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Oct 2024 08:15:33 +0000 Subject: [PATCH 8/8] fixes --- inference/suffix_decoding/utils.cc | 7 +-- run_suffix_decoding.sh | 5 +- src/c/flexflow_c.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cc | 1 + src/runtime/request_manager.cc | 48 ++++++++++---------- 5 files changed, 34 insertions(+), 29 deletions(-) mode change 100644 => 100755 run_suffix_decoding.sh diff --git a/inference/suffix_decoding/utils.cc b/inference/suffix_decoding/utils.cc index f71aac2c26..1c8fd930ff 100644 --- a/inference/suffix_decoding/utils.cc +++ b/inference/suffix_decoding/utils.cc @@ -342,8 +342,9 @@ json load_trace(std::string input_filename) { std::cout << "Partitions:" << std::endl; int counter = 0; for (const auto& partition : partitions) { - std::cout << counter++ << ". " << partition["name"] << std::endl; + std::cout << counter++ << ". " << partition["partition_name"] << std::endl; } + return data; } catch (json::parse_error& e) { std::cerr << "JSON parse error: " << e.what() << std::endl; @@ -354,7 +355,7 @@ json load_trace(std::string input_filename) { json get_training_entries(json data, std::string partition_name) { const auto& partitions = data["partitions"]; for (const auto& partition : partitions) { - if (partition["name"] == partition_name) { + if (partition["partition_name"] == partition_name) { return partition["training_entries"]; } } @@ -365,7 +366,7 @@ json get_training_entries(json data, std::string partition_name) { json get_eval_entries(json data, std::string partition_name) { const auto& partitions = data["partitions"]; for (const auto& partition : partitions) { - if (partition["name"] == partition_name) { + if (partition["partition_name"] == partition_name) { return partition["eval_entries"]; } } diff --git a/run_suffix_decoding.sh b/run_suffix_decoding.sh old mode 100644 new mode 100755 index e124f5e4c0..69e2105254 --- a/run_suffix_decoding.sh +++ b/run_suffix_decoding.sh @@ -7,14 +7,15 @@ cd "${BASH_SOURCE[0]%/*}/build" # Download models python ../inference/utils/download_hf_model.py --half-precision-only meta-llama/Meta-Llama-3-8B Felladrin/Llama-160M-Chat-v1 +export RUST_BACKTRACE=1 -./suffix_decoding/suffix_decoding \ +gdb -ex run -ex bt --args ./inference/suffix_decoding/suffix_decoding \ -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ -tensor-parallelism-degree 4 \ -ll:fsize 20000 -ll:zsize 30000 \ -llm-model meta-llama/Meta-Llama-3-8B \ -ssm-model Felladrin/Llama-160M-Chat-v1 \ -partition-name "" \ - -prompt ../../suffix_decoding/trace/spider_v2.json \ + -prompt ../../suffix-tree-decoding/trace/spider_v2.json \ -output-file ../inference/output/spider_v2.out diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index bfa60a6d54..980ecd356b 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1730,7 +1730,7 @@ void flexflow_model_generate(flexflow_model_t 
handle_, handle, dataset_fp.c_str(), max_lengths[i], - max_new_tokens[i], + max_new_tokens_[i], training_steps[i]); } else { assert(false && "Unknown request type"); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index ae0795ac1e..a28c0869cc 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -163,6 +163,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); + li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index faebaa83cc..ad4dd7c11b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -229,29 +229,29 @@ RequestManager::RequestManager() max_spec_tree_token_num = -1; max_sequence_length = -1; - std::string model_name = "meta-llama/Meta-Llama-3-70B"; - DailSqlTrace dail_sql_trace = - load_trace_dail_sql("/usr/suffix-tree-decoding/trace/spider.json"); - int num_prompts = dail_sql_trace.prompts.size(); - int num_responses = dail_sql_trace.responses.size(); - assert(num_prompts == num_responses); - auto tokenizer = get_tokenizer(model_name); - int bos_token_id = get_bos_token_id(model_name); - int train_size = num_prompts / 2; - std::cout << "Number of prompts: " << num_prompts << std::endl; - std::cout << "Train size: " << train_size << std::endl; - std::vector> training_dataset; - for (int i = 0; i < train_size; i++) { - std::string text = dail_sql_trace.prompts[i] + dail_sql_trace.responses[i]; - std::vector encoded = tokenizer->Encode(text); - encoded.insert(encoded.begin(), bos_token_id); - training_dataset.push_back(encoded); - } - suffix_tree = new SuffixTree(50); - for (auto const &text : training_dataset) { - suffix_tree->insert(text, suffix_tree->query_guid); - suffix_tree->query_guid++; - } + // std::string model_name = "meta-llama/Meta-Llama-3-70B"; + // DailSqlTrace dail_sql_trace = + // load_trace_dail_sql("/usr/suffix-tree-decoding/trace/spider.json"); + // int num_prompts = dail_sql_trace.prompts.size(); + // int num_responses = dail_sql_trace.responses.size(); + // assert(num_prompts == num_responses); + // auto tokenizer = get_tokenizer(model_name); + // int bos_token_id = get_bos_token_id(model_name); + // int train_size = num_prompts / 2; + // std::cout << "Number of prompts: " << num_prompts << std::endl; + // std::cout << "Train size: " << train_size << std::endl; + // std::vector> training_dataset; + // for (int i = 0; i < train_size; i++) { + // std::string text = dail_sql_trace.prompts[i] + dail_sql_trace.responses[i]; + // std::vector encoded = tokenizer->Encode(text); + // encoded.insert(encoded.begin(), bos_token_id); + // training_dataset.push_back(encoded); + // } + // suffix_tree = new SuffixTree(50); + // for (auto const &text : training_dataset) { + // suffix_tree->insert(text, suffix_tree->query_guid); + // suffix_tree->query_guid++; + // } } void RequestManager::set_max_requests_per_batch(int max_num_requests) { @@ -348,6 +348,8 @@ void RequestManager::register_tokenizer(ModelType type, << std::endl; assert(false); } + std::cout << "Loading tokenizer from: " << tokenizer_json_path + << std::endl; 
this->tokenizer_ = Tokenizer::FromBlobJSON( LoadBytesFromFile(tokenizer_json_path.string())); }