From 1691100906ddf25191fb0e1444fa75d0675cd44d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 6 Oct 2024 05:10:58 +0000 Subject: [PATCH] bug fix --- include/flexflow/batch_config.h | 4 +- include/flexflow/fftype.h | 1 + include/flexflow/model.h | 1 + .../ops/kernels/lora_linear_kernels.h | 4 + include/flexflow/ops/lora_linear.h | 2 - include/flexflow/ops/lora_linear_params.h | 25 ++++-- include/flexflow/request_manager.h | 2 + .../flexflow/utils/peft_weight_allocator.h | 22 +++-- src/ops/kernels/lora_linear_kernels.cu | 41 +++++---- src/ops/lora_linear.cc | 88 +++++++++++-------- src/ops/lora_linear_params.cc | 9 +- src/runtime/fftype.cc | 2 + src/runtime/peft_weight_allocator.cc | 43 ++++++--- src/runtime/peft_weight_allocator.cu | 8 +- src/runtime/request_manager.cc | 5 +- 15 files changed, 156 insertions(+), 101 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index cb2f8d3a3d..44d829a7f7 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -94,6 +94,7 @@ class BatchConfig { num_tokens_in_batch = 0; max_length = 0; request_guid = 0; + peft_model_id = PEFTModelID::NO_ID; prompt_phase = false; batch_config_request_id = -1; peft_bwd = false; @@ -109,7 +110,8 @@ class BatchConfig { bool prompt_phase = false; RequestGuid request_guid; // PEFT fields - std::unordered_map peft_adapters; + PEFTModelID peft_model_id; + std::string peft_model_config; bool peft_bwd; OptimizerTasks optimizer_tasks; }; diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 3e482b8d67..ebc811c262 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -27,6 +27,7 @@ class PEFTModelID { PEFTModelID(size_t id); bool is_valid_id() const; friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs); friend std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id); diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d1dbe72d7c..e3beafe20c 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -847,6 +847,7 @@ class FFModel { // ======================================== // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); void add_lora_layers(std::vector target_modules); + PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config); // ======================================== // Inference APIs // ======================================== diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index eef3b392b3..00f16af146 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -6,6 +6,7 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/ops/lora_linear.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { @@ -35,6 +36,9 @@ class LoraLinearMeta : public OpMeta { namespace Kernels { namespace LoraLinear { + +bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config); + void init_kernel_wrapper(LoraLinearMeta *m, int seed); void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 8d37be0c64..1c6070afe4 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -20,12 +20,10 @@ class LoraLinear : public Op { 
LoraLinear( FFModel &model, LayerID const &layer_guid, - OperatorType type, ParallelTensor const input, ParallelTensor const output, int max_rank, int max_concurrent_adapters, - // std::unordered_map const &_peft_configs, char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index c5a327459f..525a9209d3 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -124,16 +124,28 @@ class LoraLinearConfig { std::vector const &target_modules_ = {}); // constructor used to support std::unordered_map LoraLinearConfig(); + + // Method to set optimizer template - void setOptimizer(T&& opt) { - optimizer_config = std::make_unique(std::forward(opt)); + void setOptimizer(T&& opt) { + if constexpr (std::is_base_of_v>) { + optimizer_config = std::make_unique>(std::forward(opt)); + } else if constexpr (std::is_same_v, std::remove_reference_t>) { + optimizer_config = std::move(opt); + } else { + static_assert(always_false, "Unsupported optimizer type"); } + } + // Helper template for static_assert + template + static inline constexpr bool always_false = false; + friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); std::string serialize_to_json_string(int indent=-1) const { - json j = { + nlohmann::json j = { {"cache_folder", cache_folder}, {"peft_model_id", peft_model_id}, {"rank", rank}, @@ -144,7 +156,8 @@ class LoraLinearConfig { {"init_lora_weights", init_lora_weights}, {"base_model_name_or_path", base_model_name_or_path}, {"precision", precision}, - {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} + // {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} + {"optimizer_config", optimizer_config ? 
nlohmann::json(optimizer_config->toJson()) : nlohmann::json()} }; return j.dump(indent); // No indentation @@ -156,7 +169,7 @@ class LoraLinearConfig { } // Deserialization method static LoraLinearConfig deserialize_from_json_string(const std::string& json_string) { - json j = json::parse(json_string); + nlohmann::json j = nlohmann::json::parse(json_string); LoraLinearConfig config( j["cache_folder"].get(), j["peft_model_id"].get(), @@ -208,8 +221,6 @@ class LoraLinearConfig { class LoraLinearParams { public: LayerID layer_guid; - // OperatorType type; - // std::unordered_map peft_configs; int max_rank; int max_concurrent_adapters; char name[MAX_OPNAME]; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 542deb336d..628714dcc0 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -149,6 +149,8 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); + void register_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config); LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); void set_max_lora_rank(int max_lora_rank); void set_max_concurrent_adapters(int max_concurrent_adapters); diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 3c9efc0812..9670da8a4f 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -17,12 +17,13 @@ #define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ #include "flexflow/config.h" -#include "lora_linear_params.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/lora_linear_params.h" // #include namespace FlexFlow { -#ifdef DEACODE +#ifdef DEADCODE class PEFTWeightAllocator { public: PEFTWeightAllocator(void *_base_ptr, size_t _total_size) @@ -108,19 +109,21 @@ struct LoraLinearWeight { low_rank_activation(low_rank_activation_), input_activation(input_activation_) {} }; +void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed); + class PEFTMemoryManager { public: - PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) + PEFTMemoryManager(Legion::Memory gpu_mem_, int max_rank_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) : gpu_mem(gpu_mem_), max_concurrent_adapters(max_concurrent_adapters_), - max_lora_size(max_lora_size_), + max_rank(max_rank_), in_dim(in_dim_), out_dim(out_dim_), num_shards(num_shards_), shard_id(shard_id_), max_peft_tokens(max_peft_tokens_), lora_layername_substr(lora_layername_substr_), dt(dt_), base_ptr(nullptr), finetuning_ptr(nullptr), finetuning_model_id(PEFTModelID::NO_ID) { - + max_lora_size = data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim); assert(max_concurrent_adapters > 0 && "PEFT Memory Manager max_concurrent_adapters must be > 0"); assert(max_lora_size > 0 && "PEFT Memory Manager max_lora_size must be > 0"); allocate_inference_memory(); @@ -146,12 +149,13 @@ class PEFTMemoryManager { LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); // Legion memory management apparatus - Memory gpu_mem; + Legion::Memory gpu_mem; 
Realm::RegionInstance peftLegionInst; void *base_ptr, *finetuning_ptr; // Size and shapes int max_concurrent_adapters; - size_t max_lora_size; + int max_rank; + int max_lora_size; int in_dim, out_dim, num_shards, shard_id; int max_peft_tokens; // LRU cache apparatus @@ -162,8 +166,8 @@ class PEFTMemoryManager { std::string lora_layername_substr; DataType dt; PEFTModelID finetuning_model_id; -} +}; -}; // namespace FlexFlow +} // namespace FlexFlow #endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index d5baf49cdc..134af3ca6e 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -24,8 +24,10 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) : OpMeta(handler, li) { +#ifdef DEADCODE allocated_peft_buffer_size1 = 0; allocated_peft_buffer_size2 = 0; +#endif } LoraLinearMeta::~LoraLinearMeta(void) {} @@ -145,6 +147,16 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, } } +bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; + } + } + return false; +} + namespace Internal { @@ -289,17 +301,6 @@ void inference_kernel(LoraLinearMeta *m, } #endif -bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { - for (std::string s : config.target_modules) { - std::string n(m->op_name); - if (n.find(s) != std::string::npos) { - return true; - } - } - return false; -} - - template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -326,7 +327,7 @@ void inference_kernel(LoraLinearMeta *m, if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -444,8 +445,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || !bc->requestsInfo[i].peft_bwd) { continue; } - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -453,7 +453,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; // int max_peft_tokens = bc->requestsInfo[i].max_length; - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); @@ -562,15 +562,14 @@ void peft_bwd_kernel(LoraLinearMeta *m, } if (bc->requestsInfo[i].optimizer_tasks.update_weights) { - LoraOptimizerConfig const *optimizer_config = 
lora_config.optimizer_config; - assert(optimizer_config != nullptr); + assert(lora_config.optimizer_config != nullptr); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; // Get optimizer config - if (optimizer_config->getType() == "SGD") { - LoraSGDOptimizerConfig const *sgd_config = - (LoraSGDOptimizerConfig const *)optimizer_config; + + if (lora_config.optimizer_config->getType() == "SGD") { + LoraSGDOptimizerConfig const *sgd_config = static_cast<LoraSGDOptimizerConfig const *>(lora_config.optimizer_config.get()); // LoRA_A weight is split in tensor parallelism, so no need to apply // all-reduce sgd_update<<<GET_BLOCKS(w1_num_elements), CUDA_NUM_THREADS, 0, stream>>>(w1_num_elements, sgd_config->lr, sgd_config->weight_decay, sgd_config->momentum, sgd_config->nesterov, static_cast<DT const *>(weight.w1_grad_ptr), static_cast<DT *>(weight.w1_v_values_ptr), static_cast<DT *>
(weight.w1_ptr)); - } else if (optimizer_config->getType() == "Adam") { + } else if (lora_config.optimizer_config->getType() == "Adam") { assert(false && "Adam optimizer type not implemented yet"); } else { assert(false && "Unsupported optimizer type"); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index a18f47c4ac..f7ac4ff06e 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -6,6 +6,7 @@ #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" +#include "flexflow/request_manager.h" #include #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -51,13 +52,13 @@ bool check_lora_layer_match(Layer *potential_target, return false; } -void FFmodel::add_lora_layers(std::vector target_modules) { +void FFModel::add_lora_layers(std::vector target_modules) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); assert(target_modules.size() > 0 && "LoRA target module name is empty"); RequestManager *rm = RequestManager::get_request_manager(); int max_lora_rank = rm->get_max_lora_rank(); int max_concurrent_adapters = rm->get_max_concurrent_adapters(); - assert(max_rank > 1 && max_rank <= 32 && "Invalid max LoRA rank"); + assert(max_lora_rank > 1 && max_lora_rank <= 32 && "Invalid max LoRA rank"); assert(max_concurrent_adapters > 0 && "Invalid number of LoRA concurrent adapters"); for (std::string target_module_name : target_modules) { @@ -120,7 +121,7 @@ void FFmodel::add_lora_layers(std::vector target_modules) { true /*create_grad*/); } // pass max_rank and max_concurrent_adapters to OP_LORA layer - peft_layer->add_int_property("max_rank", max_rank); + peft_layer->add_int_property("max_rank", max_lora_rank); peft_layer->add_int_property("max_concurrent_adapters", max_concurrent_adapters); it = layers.insert(it + 1, peft_layer); ++it; @@ -263,7 +264,7 @@ Op *LoraLinear::create_operator_from_layer( long long value; layer->get_int_property("max_rank", value); int max_rank = value; - layer->get_int_property("max_concurrent_adapters", max_concurrent_adapters); + layer->get_int_property("max_concurrent_adapters", value); int max_concurrent_adapters = value; #ifdef DEADCODE std::unordered_map _peft_configs; @@ -276,7 +277,6 @@ Op *LoraLinear::create_operator_from_layer( #endif return new LoraLinear(model, layer->layer_guid, - layer->op_type, inputs[0], inputs[1], max_rank, @@ -290,7 +290,6 @@ LoraLinear::LoraLinear(FFModel &model, ParallelTensor const output) : LoraLinear(model, other.layer_guid, - other.op_type, input, output, other.max_rank, @@ -303,7 +302,6 @@ LoraLinear::LoraLinear(FFModel &model, char const *name) : LoraLinear(model, params.layer_guid, - params.type, inputs.first, inputs.second, params.max_rank, @@ -313,7 +311,6 @@ LoraLinear::LoraLinear(FFModel &model, LoraLinear::LoraLinear( FFModel &model, LayerID const &_layer_guid, - OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, int _max_rank, @@ -321,7 +318,7 @@ LoraLinear::LoraLinear( // std::unordered_map const &_peft_configs, char const *name) : Op(model, - _op_type, + OP_LORA, _output->data_type, name, 2 /*inputs*/, @@ -473,9 +470,8 @@ OpMeta *LoraLinear::init_task(Task const *task, lora_layername.substr(0, found + searchString.length()); // allocate space for lora weights - size_t max_lora_size = data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim); Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); - 
m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, max_lora_size, lora->max_concurrent_adapters, in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); + m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, lora->max_rank, lora->max_concurrent_adapters, BatchConfig::max_sequence_length(), in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); m->peft_memory_manager->allocate_inference_memory(); return m; } @@ -709,8 +705,8 @@ void LoraLinear::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); @@ -761,12 +757,15 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - int rank, num_tokens; - for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { - PEFTModelID peft_model_id = it->first; - LoraLinearWeight weight = m->model_state[peft_model_id].weights; - rank = weight.rank; - num_tokens = input.domain.get_volume() / weight.in_dim; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); fs::path dst_filepath_weights = get_dst_folder("weights", m->decoding_step, shard_id) / layername; std::string filenameA = @@ -775,21 +774,38 @@ void LoraLinear::inference_task(Task const *task, dst_filepath_weights.string() + ".weight_B.original"; if (m->input_type[0] == DT_FLOAT) { save_tensor((float *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((float *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else if (m->input_type[0] == DT_HALF) { save_tensor((half *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((half *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else { assert(false && "Data type not supported"); } + + if (bc->requestsInfo[i].peft_bwd) { + int num_tokens = input.domain.get_volume() / in_dim; + // input activation (intermediate) + filename = dst_filepath.string() + ".low_rank_activation"; + if (output.data_type == DT_FLOAT) { + save_tensor((float *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor((half *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else { + assert(false); + } + } } filename = dst_filepath.string() + ".output_0"; @@ -803,21 +819,7 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - if (bc->num_active_peft_tokens() > 0) { - // input activation (intermediate) - filename = 
dst_filepath.string() + ".low_rank_activation"; - if (output.data_type == DT_FLOAT) { - save_tensor((float *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else if (output.data_type == DT_HALF) { - save_tensor((half *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else { - assert(false); - } - } + m->decoding_step++; } } @@ -905,6 +907,16 @@ void lora_inference_debugging(LoraLinearMeta *m, // weights, weights gradients fs::path dst_filepath_weights = get_dst_folder("weights", m->bwd_step, shard_id) / layername; + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + assert(m->model_state.size() >= 1 && "Model state empty!"); for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { PEFTModelID peft_model_id = it->first; diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 310b6d0973..c7b9fcc711 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -170,11 +170,10 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "trainable: " << llc.trainable << ", "; if (llc.optimizer_config != nullptr) { os << "optimizer_config: "; - if (typeid(*llc.optimizer_config) == typeid(LoraSGDOptimizerConfig)) { - os << *static_cast(llc.optimizer_config); - } else if (typeid(*llc.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - os << *static_cast(llc.optimizer_config); + if (llc.optimizer_config.get()->getType() == "SGD") { + os << *static_cast(llc.optimizer_config.get()); + } else if (llc.optimizer_config.get()->getType() == "Adam") { + os << *static_cast(llc.optimizer_config.get()); } else { os << "Unknown optimizer config type"; } diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 8213726e8a..0af5f45350 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -46,6 +46,8 @@ bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { return lhs.id == rhs.id; } +bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { return !(lhs == rhs); } + std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { if (peft_model_id == PEFTModelID::NO_ID) { os << "NO_ID"; diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index cc40d666ed..287eb7e20a 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -1,6 +1,24 @@ -#include "peft_weight_allocator.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { +// declare legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; void PEFTMemoryManager::allocate_inference_memory() { // allocate chunk of memory for all the PEFT adapters @@ -21,7 +39,7 @@ void PEFTMemoryManager::allocate_inference_memory() { void PEFTMemoryManager::allocate_finetuning_memory() { size_t ft_size 
= max_lora_size*3; // weights, gradients, momentum values - ft_size += max_peft_tokens*(in_dim+rank); // input, low-rank activations + ft_size += max_peft_tokens * (in_dim + max_rank); // input, low-rank activations // allocate chunk of memory for PEFT adapter Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), @@ -144,7 +162,7 @@ void load_peft_from_file(DT *ptr, void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config) { // Load weights - assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr "PEFT Memory Manager weight ptr null"); + assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr && "PEFT Memory Manager weight ptr null"); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; // values below represent total weight sizes before sharding. Lora B is not @@ -235,7 +253,7 @@ LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_ int data_size = data_type_size(dt); LoraLinearWeight result; result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; - result.w1_ptr = result.w0_ptr + w0_num_elements * data_size; + result.w1_ptr = static_cast(result.w0_ptr) + w0_num_elements * data_size; if (cache_miss) { load_peft_model(result, lora_config); } @@ -244,19 +262,20 @@ LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_ LoraLinearWeight PEFTMemoryManager::get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); - bool cache_miss = get_finetuning_slot(model_id); + bool cache_miss; + get_finetuning_slot(model_id, &cache_miss); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; int data_size = data_type_size(dt); LoraLinearWeight result; result.w0_ptr = finetuning_ptr; - result.w1_ptr = result.w0_ptr + w0_num_elements*data_size; - result.w0_grad_ptr = result.w1_ptr + w1_num_elements*data_size; - result.w1_grad_ptr = result.w0_grad_ptr + w0_num_elements*data_size; - result.w0_v_values_ptr = result.w1_grad_ptr + w1_num_elements*data_size; - result.w1_v_values_ptr = result.w0_v_values_ptr + w0_num_elements*data_size; - result.input_activation = result.w1_v_values_ptr + w1_num_elements*data_size; // max_peft_tokens*in_dim - result.low_rank_activation = result.input_activation + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank + result.w1_ptr = static_cast(result.w0_ptr)+ w0_num_elements*data_size; + result.w0_grad_ptr = static_cast(result.w1_ptr) + w1_num_elements*data_size; + result.w1_grad_ptr = static_cast(result.w0_grad_ptr) + w0_num_elements*data_size; + result.w0_v_values_ptr = static_cast(result.w1_grad_ptr) + w1_num_elements*data_size; + result.w1_v_values_ptr = static_cast(result.w0_v_values_ptr) + w0_num_elements*data_size; + result.input_activation = static_cast(result.w1_v_values_ptr) + w1_num_elements*data_size; // max_peft_tokens*in_dim + result.low_rank_activation = static_cast(result.input_activation) + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank if (cache_miss) { load_peft_model(result, lora_config); } diff --git a/src/runtime/peft_weight_allocator.cu b/src/runtime/peft_weight_allocator.cu index cc8d095069..bc9ab443cb 100644 --- a/src/runtime/peft_weight_allocator.cu +++ b/src/runtime/peft_weight_allocator.cu @@ -8,7 +8,7 @@ namespace FlexFlow { template -void init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, 
cudaStream_t stream) { +void lora_init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, cudaStream_t stream) { // Initialize generator std::mt19937 gen(seed); @@ -47,7 +47,7 @@ void init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int ra num = num_float; } } - checkCUDA(cudaMemcpyAsync(static_cast
<DT *>(w1_ptr), + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), lora_b_random_init.data(), w1_num_elements * sizeof(DT), cudaMemcpyHostToDevice, @@ -59,9 +59,9 @@ void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int ou checkCUDA(get_legion_stream(&stream)); if (dt == DT_FLOAT) { - Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); } else if (dt == DT_HALF) { - Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 79fcdfdcfe..2377a4f938 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -260,7 +260,8 @@ void RequestManager::register_peft_config(PEFTModelID const &peft_model_id, // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); - peft_configs[peft_model_id] = peft_config; + // peft_configs[peft_model_id] = std::move(peft_config); + peft_configs.emplace(peft_model_id, std::move(peft_config)); } LoraLinearConfig const &RequestManager::get_peft_config( @@ -284,7 +285,7 @@ int RequestManager::get_max_concurrent_adapters() { return max_concurrent_adapters; } -PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const peft_config) { +PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); if (peft_config.target_modules.size() == 0) {
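
Usage note (not part of the diff): a minimal sketch of how the reworked PEFT registration flow touched by this patch is intended to be used. The target-module name, cache path, adapter name, and the LoraLinearConfig / LoraSGDOptimizerConfig constructor arguments below are illustrative assumptions, not values taken from this patch.

#include "flexflow/model.h"
#include "flexflow/ops/lora_linear_params.h"

using namespace FlexFlow;

void peft_setup_example(FFModel &ff) {
  // 1. While building the model, mark which linear layers receive LoRA
  //    adapters ("down_proj" is a placeholder target-module name).
  ff.add_lora_layers({"down_proj"});

  // 2. At serving time, register a concrete adapter. Cache folder and adapter
  //    name are placeholders; the real constructor takes further optional
  //    arguments (rank, alpha, trainable flag, target modules, ...).
  LoraLinearConfig peft_config("/path/to/peft/cache",
                               "example-user/example-lora-adapter");
  // Optionally attach an optimizer config for finetuning; setOptimizer() now
  // stores it in the config's std::unique_ptr<LoraOptimizerConfig>.
  peft_config.setOptimizer(LoraSGDOptimizerConfig(/*lr=*/0.001));
  // register_peft_adapter() validates the config and (per this patch) hands it
  // to the RequestManager via register_peft_config() under a new PEFTModelID.
  PEFTModelID *peft_model_id = ff.register_peft_adapter(peft_config);
  (void)peft_model_id; // attached later to inference/finetuning requests
}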