Merge branch 'suffix_decoding' into new_tokenizer
goliaro authored Oct 15, 2024
2 parents eeafdc7 + 08d1517 commit 07217e8
Showing 14 changed files with 1,001 additions and 3 deletions.
11 changes: 11 additions & 0 deletions CMakeLists.txt
@@ -296,6 +296,16 @@ if(NOT BUILD_LEGION_ONLY)
endif()

set(FLEXFLOW_CPP_DRV_SRC ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc)
# SuffixDecoding
include(FetchContent)
FetchContent_Declare(
suffix_decoding
GIT_REPOSITORY git@github.com:Snowflake-Labs/suffix-tree-decoding.git
GIT_TAG main # or a specific tag/commit hash
)
FetchContent_MakeAvailable(suffix_decoding)
list(APPEND FLEXFLOW_INCLUDE_DIRS ${suffix_decoding_SOURCE_DIR}/src)
list(APPEND FLEXFLOW_SRC ${suffix_decoding_SOURCE_DIR}/src/suffix_decoding.cc)

add_library(substitution_loader SHARED
${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc)
@@ -534,6 +544,7 @@ if(NOT BUILD_LEGION_ONLY)

if(FF_BUILD_INFERENCE)
add_subdirectory(inference/spec_infer)
add_subdirectory(inference/suffix_decoding)
add_subdirectory(inference/incr_decoding)
add_subdirectory(inference/peft)
endif()
1 change: 1 addition & 0 deletions include/flexflow/model.h
@@ -277,6 +277,7 @@ enum TaskIDs {
RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID,
RM_BACKGROUND_SERVING_TASK_ID,
// Custom tasks
CUSTOM_GPU_TASK_ID_FIRST,
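Note (not part of the diff): the new RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID only becomes callable once it is bound to a handler through Legion task registration. The following is a hypothetical sketch of that wiring, modeled on the preregister_task_variant pattern FlexFlow uses for its other request-manager tasks; the registration strings and placement are assumptions, and the handler it names is the static method declared in request_manager.h below.

// Hypothetical sketch: binding the new task ID to the static handler
// declared in request_manager.h via Legion's standard registration call.
#include "flexflow/model.h"
#include "flexflow/request_manager.h"
#include "legion.h"

using namespace Legion;
using namespace FlexFlow;

static void register_suffix_decode_task() {
  TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID,
                                 "RequestManager Suffix Decode Task");
  registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
  // The handler returns a TreeVerifyBatchConfig, mirroring the existing
  // prepare_next_batch_verify task.
  Runtime::preregister_task_variant<
      TreeVerifyBatchConfig,
      RequestManager::prepare_next_batch_suffix_decode_task>(
      registrar, "RequestManager Suffix Decode Task");
}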
19 changes: 19 additions & 0 deletions include/flexflow/request_manager.h
@@ -19,6 +19,7 @@
#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/utils/file_loader.h"
#include "suffix_decoding.h"
#include <future>
#include <mutex>
#include <tokenizers_cpp.h>
@@ -165,6 +166,7 @@ class RequestManager {

void serve_incr_decoding(FFModel *model);
void serve_spec_infer(FFModel *model);
void serve_suffix_decoding(FFModel *model);
GenerationResult get_generation_result(RequestGuid const &guid);
RequestGuid register_new_request(Request const &request_);
RequestGuid register_new_peft_request(Request const &request_);
@@ -211,6 +213,15 @@ class RequestManager {
Legion::Context ctx,
Legion::Runtime *runtime);

TreeVerifyBatchConfig
prepare_next_batch_suffix_decode(TreeVerifyBatchConfig const &old_bc,
InferenceResult const &result);
TreeVerifyBatchConfigFuture prepare_next_batch_suffix_decode(
TreeVerifyBatchConfigFuture const &old_bc,
InferenceResultFuture const &result,
Legion::Context ctx,
Legion::Runtime *runtime);

void store_beam_metadata(BeamSearchBatchConfig const &old_bc,
BeamInferenceResult const &result);
void update_beam_metadata(BeamSearchBatchConfig &new_bc,
@@ -281,6 +292,12 @@ class RequestManager {
Legion::Context ctx,
Legion::Runtime *runtime);

static TreeVerifyBatchConfig prepare_next_batch_suffix_decode_task(
Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);

private:
// configuration parameters
int max_requests_per_batch;
@@ -296,6 +313,8 @@
// tree width in each speculative step, if not specified 1
std::vector<int> spec_infer_tree_width;

SuffixTree *suffix_tree;

// private fields
std::unique_ptr<Tokenizer> tokenizer_;
bool verbose;
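Note (not part of the diff): request_manager.h pairs a blocking prepare_next_batch_suffix_decode with a future-returning overload, the same split used for the init/beam/verify batches. Below is a minimal sketch of how the future-returning overload could defer to the new Legion task; it assumes TreeVerifyBatchConfigFuture aliases Legion::Future (as FlexFlow's other *Future aliases do) and that the launcher layout mirrors prepare_next_batch_verify, neither of which is shown in this hunk.

// Hypothetical sketch: future-returning overload launching the
// RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE task.
#include "flexflow/request_manager.h"

namespace FlexFlow {

using namespace Legion;

TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_suffix_decode(
    TreeVerifyBatchConfigFuture const &old_bc,
    InferenceResultFuture const &result,
    Context ctx,
    Runtime *runtime) {
  // Pass the RequestManager pointer as the task argument; the previous batch
  // config and the inference result ride along as futures, so the task body
  // runs only once both are ready.
  RequestManager *rm = this;
  TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID,
                        TaskArgument(&rm, sizeof(RequestManager *)));
  launcher.add_future(old_bc);
  launcher.add_future(result);
  return runtime->execute_task(ctx, launcher);
}

} // namespace FlexFlow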
38 changes: 38 additions & 0 deletions inference/suffix_decoding/CMakeLists.txt
@@ -0,0 +1,38 @@
cmake_minimum_required(VERSION 3.10)

project(FlexFlow_SpecInfer)
set(project_target suffix_decoding)


set(CPU_SRC
${FLEXFLOW_CPP_DRV_SRC}
suffix_decoding.cc
utils.cc
../models/llama.cc
../models/opt.cc
../models/falcon.cc
../models/mpt.cc)

if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
cuda_add_executable(${project_target} ${CPU_SRC})
if (FF_GPU_BACKEND STREQUAL "hip_cuda")
target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__)
endif()
elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP)
hip_add_executable(${project_target} ${CPU_SRC})
if (FF_HIP_ARCH STREQUAL "")
message(FATAL_ERROR "FF_HIP_ARCH is empty!")
endif()
set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}")
target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__)
else()
message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported")
endif()

target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR})
target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference)
target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES})

set(BIN_DEST "bin")
install(TARGETS ${project_target} DESTINATION ${BIN_DEST})
37 changes: 37 additions & 0 deletions inference/suffix_decoding/Makefile
@@ -0,0 +1,37 @@
# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Flags for directing the runtime makefile what to include
DEBUG ?= 0 # Include debugging symbols
MAX_DIM ?= 4 # Maximum number of dimensions
OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level
USE_CUDA ?= 1 # Include CUDA support (requires CUDA)
USE_GASNET ?= 0 # Include GASNet support (requires GASNet)
USE_HDF ?= 1 # Include HDF5 support (requires HDF5)
ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended)

# Put the binary file name here
OUTFILE ?= llama_pipeline
# List all the application source files here
ifndef CUDA_HOME
CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1))
endif


ifndef FF_HOME
$(error FF_HOME variable is not defined, aborting build)
endif

include $(FF_HOME)/FlexFlow.mk
154 changes: 154 additions & 0 deletions inference/suffix_decoding/suffix_decoding.cc
@@ -0,0 +1,154 @@
/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "suffix_decoding/utils.h"

using namespace FlexFlow;
using namespace Legion;
using json = nlohmann::json;

Legion::Logger log_app("llama");


void process_partition(RequestManager *rm, std::string input_filename) {
}

void FlexFlow::top_level_task(Task const *task,
std::vector<PhysicalRegion> const &regions,
Context ctx,
Runtime *runtime) {
FFConfig ffconfig;
FilePaths file_paths;
ModelMeta model_metadata;
std::string partition_name;
bool use_full_precision = false;
bool verbose = false;
int max_requests_per_batch = 16;
int max_tokens_per_batch = 256;
int max_sequence_length = 1024;
int max_spec_tree_token_num = 23;
int expansion_degree = 1;

InputArgs const &command_args = HighLevelRuntime::get_input_args();
char **argv = command_args.argv;
int argc = command_args.argc;
parse_input_args(argv,
argc,
file_paths,
model_metadata.model_names,
partition_name,
use_full_precision,
verbose,
max_requests_per_batch,
max_tokens_per_batch,
max_sequence_length,
expansion_degree);

get_model_meta(file_paths, model_metadata, use_full_precision);

assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
ffconfig.pipeline_parallelism_degree ==
ffconfig.numNodes * ffconfig.workersPerNode);

json trace = load_trace(file_paths.prompt_file_path);
json training_entries = get_training_entries(trace, partition_name);
json eval_entries = get_eval_entries(trace, partition_name);

GenerationConfig generationConfig;
InferenceManager *im = InferenceManager::get_inference_manager();
RequestManager *rm = RequestManager::get_request_manager();
init_request_manager(rm,
model_metadata,
file_paths,
max_requests_per_batch,
max_tokens_per_batch,
max_spec_tree_token_num,
max_sequence_length,
expansion_degree);

// Create LLM model
FFModel tree_model(ffconfig, ffconfig.cpu_offload);
init_llm(tree_model, model_metadata, generationConfig, use_full_precision);

// Create SSM models
int num_ssms = model_metadata.ssm_model_types.size();
std::vector<FFModel> ssm_models;
FFConfig bm_config = ffconfig;
bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree =
bm_config.pipeline_parallelism_degree = 1;
for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) {
FFModel beam_model(bm_config);
ssm_models.push_back(beam_model);
}
init_ssms(rm, ssm_models, num_ssms, model_metadata, generationConfig, use_full_precision);

rm->start_background_server(&tree_model);

int total_num_requests = 0;
{
std::vector<Request> requests;
for (auto entry: eval_entries) {
std::string prompt = entry["prompt"];
int response_length = entry["response_length"];
// printf("Prompt[%d]: %s\n", total_num_requests, prompt.c_str());
// Add inference request
Request inference_req;
inference_req.prompt = prompt;
inference_req.max_new_tokens = response_length;
requests.push_back(inference_req);
total_num_requests++;
}
tree_model.generate(requests);
}

// Register requests from prompt file
// int total_num_requests = 0;
// {
// using json = nlohmann::json;
// std::ifstream file_handle(file_paths.prompt_file_path);
// assert(file_handle.good() && "Prompt file does not exist.");
// json prompt_json = json::parse(file_handle,
// /*parser_callback_t */ nullptr,
// /*allow_exceptions */ true,
// /*ignore_comments */ true);

// std::vector<Request> requests;
// for (auto &prompt : prompt_json) {
// std::string text = prompt.get<std::string>();
// printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
// // Add inference request
// Request inference_req;
// inference_req.prompt = text;
// inference_req.max_length = 128;
// requests.push_back(inference_req);
// total_num_requests++;
// }
// tree_model.generate(requests);
// }

// terminate the request manager by stopping the background thread
rm->terminate_background_server();

// Execution fence
{
Future future = runtime->issue_execution_fence(ctx);
future.get_void_result();
}

// float* data
std::cout << "----------inference finished--------------" << std::endl;
}

void FlexFlow::register_custom_tasks() {}
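Note (not part of the diff): the driver above reads eval entries from a trace JSON through helpers in utils.cc that are not shown here. The snippet below is a self-contained illustration of the entry shape the request-building loop assumes, namely a "prompt" string and a "response_length" integer per entry; the trace contents and the local Request struct are stand-ins, not taken from this commit.

// Illustrative only: the eval-entry shape consumed by the request-building
// loop, with a minimal stand-in for FlexFlow's Request.
#include <nlohmann/json.hpp>
#include <iostream>
#include <string>
#include <vector>

using json = nlohmann::json;

struct Request {
  std::string prompt;
  int max_new_tokens = 0;
};

int main() {
  // Two hypothetical eval entries; real traces come from
  // file_paths.prompt_file_path and are split by partition.
  json eval_entries = json::parse(R"([
    {"prompt": "Translate to French: good morning", "response_length": 32},
    {"prompt": "Summarize: suffix decoding reuses earlier output", "response_length": 64}
  ])");

  std::vector<Request> requests;
  for (auto &entry : eval_entries) {
    Request req;
    req.prompt = entry["prompt"].get<std::string>();
    req.max_new_tokens = entry["response_length"].get<int>();
    requests.push_back(req);
  }
  std::cout << "built " << requests.size() << " requests\n";
  return 0;
}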