Commit: bug fix
goliaro committed Oct 6, 2024
1 parent 53c408c commit 1691100
Showing 15 changed files with 156 additions and 101 deletions.
4 changes: 3 additions & 1 deletion include/flexflow/batch_config.h
@@ -94,6 +94,7 @@ class BatchConfig {
num_tokens_in_batch = 0;
max_length = 0;
request_guid = 0;
peft_model_id = PEFTModelID::NO_ID;
prompt_phase = false;
batch_config_request_id = -1;
peft_bwd = false;
@@ -109,7 +110,8 @@ class BatchConfig {
bool prompt_phase = false;
RequestGuid request_guid;
// PEFT fields
std::unordered_map<PEFTModelID, std::string> peft_adapters;
PEFTModelID peft_model_id;
std::string peft_model_config;
bool peft_bwd;
OptimizerTasks optimizer_tasks;
};
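Note on the new per-request PEFT fields: each request now carries a single `peft_model_id` plus its serialized adapter config in `peft_model_config`, replacing the old `peft_adapters` map. A minimal consumer-side sketch, mirroring the pattern used later in `lora_linear_kernels.cu` (the loop bound `max_requests_per_batch()` is assumed from the surrounding codebase):

```cpp
// Sketch: recover each request's LoRA configuration from the serialized JSON.
for (int i = 0; i < bc->max_requests_per_batch(); i++) {
  if (bc->request_completed[i] ||
      bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) {
    continue; // no adapter attached to this request
  }
  LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(
      bc->requestsInfo[i].peft_model_config);
  // ... look up / apply the adapter identified by peft_model_id ...
}
```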
1 change: 1 addition & 0 deletions include/flexflow/fftype.h
@@ -27,6 +27,7 @@ class PEFTModelID {
PEFTModelID(size_t id);
bool is_valid_id() const;
friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs);
friend bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs);
friend std::ostream &operator<<(std::ostream &os,
PEFTModelID const &peft_model_id);

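The new `operator!=` complements the existing `operator==` on `PEFTModelID`. A plausible definition, consistent with the declarations above (a sketch, not necessarily the exact body in `fftype.cc`):

```cpp
bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) {
  return !(lhs == rhs); // defined in terms of the existing operator==
}
```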
1 change: 1 addition & 0 deletions include/flexflow/model.h
@@ -847,6 +847,7 @@ class FFModel {
// ========================================
// PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config);
void add_lora_layers(std::vector<std::string> target_modules);
PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config);
// ========================================
// Inference APIs
// ========================================
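With the single-adapter `add_lora_layer` commented out, adapter setup is now two steps: `add_lora_layers` wires LoRA into the named target modules at model-build time, and `register_peft_adapter` registers a concrete `LoraLinearConfig` and hands back its `PEFTModelID`. A hedged usage sketch; the `LoraLinearConfig` constructor arguments and the overall call order are assumptions, not taken from this diff:

```cpp
// Sketch: enable LoRA on matching layers, then register one adapter.
FFModel ff(ffconfig);
ff.add_lora_layers({"down_proj"});                 // match layers by substring
LoraLinearConfig peft_config("/path/to/cache",     // hypothetical cache_folder
                             "org/peft-model-id"); // hypothetical peft_model_id
PEFTModelID *adapter = ff.register_peft_adapter(peft_config);
```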
4 changes: 4 additions & 0 deletions include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -6,6 +6,7 @@
#include "flexflow/fftype.h"
#include "flexflow/op_meta.h"
#include "flexflow/ops/lora_linear.h"
#include "flexflow/utils/peft_weight_allocator.h"

namespace FlexFlow {

@@ -35,6 +36,9 @@ class LoraLinearMeta : public OpMeta {

namespace Kernels {
namespace LoraLinear {

bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config);

void init_kernel_wrapper(LoraLinearMeta *m, int seed);
void inference_kernel_wrapper(LoraLinearMeta *m,
BatchConfig const *bc,
2 changes: 0 additions & 2 deletions include/flexflow/ops/lora_linear.h
@@ -20,12 +20,10 @@ class LoraLinear : public Op {
LoraLinear(
FFModel &model,
LayerID const &layer_guid,
OperatorType type,
ParallelTensor const input,
ParallelTensor const output,
int max_rank,
int max_concurrent_adapters,
// std::unordered_map<PEFTModelID, LoraLinearConfig> const &_peft_configs,
char const *name = nullptr);
LoraLinear(FFModel &model,
LoraLinear const &other,
25 changes: 18 additions & 7 deletions include/flexflow/ops/lora_linear_params.h
@@ -124,16 +124,28 @@ class LoraLinearConfig {
std::vector<std::string> const &target_modules_ = {});
// constructor used to support std::unordered_map
LoraLinearConfig();

// Method to set optimizer
template<typename T>
void setOptimizer(T&& opt) {
optimizer_config = std::make_unique<T>(std::forward<T>(opt));
void setOptimizer(T&& opt) {
if constexpr (std::is_base_of_v<LoraOptimizerConfig, std::remove_reference_t<T>>) {
optimizer_config = std::make_unique<std::remove_reference_t<T>>(std::forward<T>(opt));
} else if constexpr (std::is_same_v<std::unique_ptr<LoraOptimizerConfig>, std::remove_reference_t<T>>) {
optimizer_config = std::move(opt);
} else {
static_assert(always_false<T>, "Unsupported optimizer type");
}
}
// Helper template for static_assert
template <typename>
static inline constexpr bool always_false = false;

friend bool operator==(LoraLinearConfig const &lhs,
LoraLinearConfig const &rhs);
friend std::ostream &operator<<(std::ostream &os,
LoraLinearConfig const &llc);
std::string serialize_to_json_string(int indent=-1) const {
json j = {
nlohmann::json j = {
{"cache_folder", cache_folder},
{"peft_model_id", peft_model_id},
{"rank", rank},
@@ -144,7 +156,8 @@
{"init_lora_weights", init_lora_weights},
{"base_model_name_or_path", base_model_name_or_path},
{"precision", precision},
{"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr}
// {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr}
{"optimizer_config", optimizer_config ? nlohmann::json(optimizer_config->toJson()) : nlohmann::json()}
};

return j.dump(indent); // No indentation
@@ -156,7 +169,7 @@ class LoraLinearConfig {
}
// Deserialization method
static LoraLinearConfig deserialize_from_json_string(const std::string& json_string) {
json j = json::parse(json_string);
nlohmann::json j = nlohmann::json::parse(json_string);
LoraLinearConfig config(
j["cache_folder"].get<std::string>(),
j["peft_model_id"].get<std::string>(),
@@ -208,8 +221,6 @@ class LoraLinearConfig {
class LoraLinearParams {
public:
LayerID layer_guid;
// OperatorType type;
// std::unordered_map<PEFTModelID, LoraLinearConfig> peft_configs;
int max_rank;
int max_concurrent_adapters;
char name[MAX_OPNAME];
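The reworked `setOptimizer` accepts either a concrete optimizer config derived from `LoraOptimizerConfig` (wrapped into a `unique_ptr`) or an already-built `std::unique_ptr<LoraOptimizerConfig>` (moved in), and rejects anything else at compile time. A brief usage sketch; the `LoraSGDOptimizerConfig` constructor argument is an assumption:

```cpp
LoraLinearConfig config = LoraLinearConfig::deserialize_from_json_string(json_string);
// Pass a derived optimizer config by value; setOptimizer wraps it in a unique_ptr.
config.setOptimizer(LoraSGDOptimizerConfig(/*lr=*/0.001f));
// The scalar fields round-trip through serialize_to_json_string() /
// deserialize_from_json_string(), so the config can travel inside BatchConfig
// as the peft_model_config string.
std::string serialized = config.serialize_to_json_string();
LoraLinearConfig restored = LoraLinearConfig::deserialize_from_json_string(serialized);
```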
2 changes: 2 additions & 0 deletions include/flexflow/request_manager.h
@@ -149,6 +149,8 @@ class RequestManager {
int eos_token_id,
std::string const &path);
void register_output_filepath(std::string const &);
void register_peft_config(PEFTModelID const &peft_model_id,
LoraLinearConfig const &peft_config);
LoraLinearConfig get_peft_config(PEFTModelID peft_model_id);
void set_max_lora_rank(int max_lora_rank);
void set_max_concurrent_adapters(int max_concurrent_adapters);
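`register_peft_config` / `get_peft_config` imply that the RequestManager now owns the id-to-config mapping that previously travelled inside `BatchConfig` as `peft_adapters`. A standalone sketch of such a registry; class and member names are illustrative, not the actual RequestManager internals, and the map type follows the shape of the removed `peft_adapters` field:

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

// Illustrative registry; configs are stored in serialized form, matching how
// they are shipped around in this commit (peft_model_config strings).
class PeftConfigRegistry {
public:
  void register_peft_config(PEFTModelID const &id, LoraLinearConfig const &cfg) {
    assert(id != PEFTModelID::NO_ID && "cannot register NO_ID");
    configs_[id] = cfg.serialize_to_json_string();
  }
  LoraLinearConfig get_peft_config(PEFTModelID const &id) const {
    auto it = configs_.find(id);
    assert(it != configs_.end() && "unknown PEFT model id");
    return LoraLinearConfig::deserialize_from_json_string(it->second);
  }

private:
  // Same shape as the old BatchConfig::peft_adapters map.
  std::unordered_map<PEFTModelID, std::string> configs_;
};
```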
22 changes: 13 additions & 9 deletions include/flexflow/utils/peft_weight_allocator.h
@@ -17,12 +17,13 @@
#define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_

#include "flexflow/config.h"
#include "lora_linear_params.h"
#include "flexflow/ffconst_utils.h"
#include "flexflow/ops/lora_linear_params.h"
// #include <mutex>

namespace FlexFlow {

#ifdef DEACODE
#ifdef DEADCODE
class PEFTWeightAllocator {
public:
PEFTWeightAllocator(void *_base_ptr, size_t _total_size)
@@ -108,19 +109,21 @@ struct LoraLinearWeight {
low_rank_activation(low_rank_activation_), input_activation(input_activation_) {}
};

void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed);

class PEFTMemoryManager {
public:
PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_)
PEFTMemoryManager(Legion::Memory gpu_mem_, int max_rank_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_)
: gpu_mem(gpu_mem_),
max_concurrent_adapters(max_concurrent_adapters_),
max_lora_size(max_lora_size_),
max_rank(max_rank_),
in_dim(in_dim_), out_dim(out_dim_), num_shards(num_shards_), shard_id(shard_id_),
max_peft_tokens(max_peft_tokens_),
lora_layername_substr(lora_layername_substr_), dt(dt_),
base_ptr(nullptr),
finetuning_ptr(nullptr),
finetuning_model_id(PEFTModelID::NO_ID) {

max_lora_size = data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim);
assert(max_concurrent_adapters > 0 && "PEFT Memory Manager max_concurrent_adapters must be > 0");
assert(max_lora_size > 0 && "PEFT Memory Manager max_lora_size must be > 0");
allocate_inference_memory();
@@ -146,12 +149,13 @@ class PEFTMemoryManager {
LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config);

// Legion memory management apparatus
Memory gpu_mem;
Legion::Memory gpu_mem;
Realm::RegionInstance peftLegionInst;
void *base_ptr, *finetuning_ptr;
// Size and shapes
int max_concurrent_adapters;
size_t max_lora_size;
int max_rank;
int max_lora_size;
int in_dim, out_dim, num_shards, shard_id;
int max_peft_tokens;
// LRU cache apparatus
@@ -162,8 +166,8 @@ class PEFTMemoryManager {
std::string lora_layername_substr;
DataType dt;
PEFTModelID finetuning_model_id;
}
};

}; // namespace FlexFlow
} // namespace FlexFlow

#endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_
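`max_lora_size` is no longer a constructor parameter; it is derived as `data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim)`, i.e. the byte footprint of one adapter's A and B matrices on one shard. A worked example under assumed dimensions (rank 16 on a 4096x4096 linear layer in fp16):

```cpp
// Assumed values for illustration only.
int max_rank = 16, in_dim = 4096, out_dim = 4096;
size_t dtype_size = 2; // bytes per fp16 element
size_t max_lora_size = dtype_size * (max_rank * in_dim + max_rank * out_dim);
// = 2 * (65536 + 65536) = 262144 bytes, i.e. 256 KiB per adapter per shard;
// the inference pool presumably reserves roughly
// max_lora_size * max_concurrent_adapters bytes in allocate_inference_memory().
```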
41 changes: 20 additions & 21 deletions src/ops/kernels/lora_linear_kernels.cu
@@ -24,8 +24,10 @@ namespace FlexFlow {

LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li)
: OpMeta(handler, li) {
#ifdef DEADCODE
allocated_peft_buffer_size1 = 0;
allocated_peft_buffer_size2 = 0;
#endif
}

LoraLinearMeta::~LoraLinearMeta(void) {}
@@ -145,6 +147,16 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m,
}
}

bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) {
for (std::string s : config.target_modules) {
std::string n(m->op_name);
if (n.find(s) != std::string::npos) {
return true;
}
}
return false;
}

namespace Internal {


@@ -289,17 +301,6 @@ void inference_kernel(LoraLinearMeta *m,
}
#endif

bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) {
for (std::string s : config.target_modules) {
std::string n(m->op_name);
if (n.find(s) != std::string::npos) {
return true;
}
}
return false;
}


template <typename DT>
void inference_kernel(LoraLinearMeta *m,
BatchConfig const *bc,
@@ -326,7 +327,7 @@ void inference_kernel(LoraLinearMeta *m,
if (bc->requestsInfo[i].peft_bwd) {
num_peft_requests++;
}
LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]);
LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config);
if (!lora_applies_to_this_layer(m, lora_config)) {
continue;
}
@@ -444,16 +445,15 @@ void peft_bwd_kernel(LoraLinearMeta *m,
if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || !bc->requestsInfo[i].peft_bwd) {
continue;
}
int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]);
LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config);
if (!lora_applies_to_this_layer(m, lora_config)) {
continue;
}
assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch");
m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id);
int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
// int max_peft_tokens = bc->requestsInfo[i].max_length;
int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
// int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config);
DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank);

@@ -562,15 +562,14 @@ void peft_bwd_kernel(LoraLinearMeta *m,
}

if (bc->requestsInfo[i].optimizer_tasks.update_weights) {
LoraOptimizerConfig const *optimizer_config = lora_config.optimizer_config;
assert(optimizer_config != nullptr);
assert(lora_config.optimizer_config != nullptr);
int w0_num_elements = lora_config.rank * in_dim;
int w1_num_elements = lora_config.rank * out_dim;

// Get optimizer config
if (optimizer_config->getType() == "SGD") {
LoraSGDOptimizerConfig const *sgd_config =
(LoraSGDOptimizerConfig const *)optimizer_config;

if (lora_config.optimizer_config->getType() == "SGD") {
LoraSGDOptimizerConfig const *sgd_config = static_cast<LoraSGDOptimizerConfig const *>(lora_config.optimizer_config.get());
// LoRA_A weight is split in tensor parallelism, so no need to apply
// all-reduce
sgd_update<<<GET_BLOCKS(w0_num_elements),
@@ -609,7 +608,7 @@ void peft_bwd_kernel(LoraLinearMeta *m,
static_cast<DT const *>(weight.w1_grad_ptr),
static_cast<DT *>(weight.w1_v_values_ptr),
static_cast<DT *>(weight.w1_ptr));
} else if (optimizer_config->getType() == "Adam") {
} else if (lora_config.optimizer_config->getType() == "Adam") {
assert(false && "Adam optimizer type not implemented yet");
} else {
assert(false && "Unsupported optimizer type");
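`lora_applies_to_this_layer`, now exposed outside the `Internal` namespace, is a plain substring match between the operator's name and each entry in `target_modules`. A standalone demonstration of the same matching rule (layer and module names made up for illustration):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Same rule as lora_applies_to_this_layer: the adapter applies if any
// target-module substring occurs in the operator's name.
static bool applies(std::string const &op_name,
                    std::vector<std::string> const &target_modules) {
  for (std::string const &s : target_modules) {
    if (op_name.find(s) != std::string::npos) {
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<std::string> target_modules = {"down_proj"}; // hypothetical config
  std::cout << applies("layers.11.mlp.down_proj", target_modules) << "\n";      // prints 1
  std::cout << applies("layers.11.self_attn.qkv_proj", target_modules) << "\n"; // prints 0
}
```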
(Diffs for the remaining 6 changed files are not shown.)
