Commit: bug fix
goliaro committed Oct 6, 2024
1 parent 53c408c commit 1691100
Showing 15 changed files with 156 additions and 101 deletions.
4 changes: 3 additions & 1 deletion include/flexflow/batch_config.h
@@ -94,6 +94,7 @@ class BatchConfig {
num_tokens_in_batch = 0;
max_length = 0;
request_guid = 0;
peft_model_id = PEFTModelID::NO_ID;
prompt_phase = false;
batch_config_request_id = -1;
peft_bwd = false;
@@ -109,7 +110,8 @@ class BatchConfig {
bool prompt_phase = false;
RequestGuid request_guid;
// PEFT fields
std::unordered_map<PEFTModelID, std::string> peft_adapters;
PEFTModelID peft_model_id;
std::string peft_model_config;
bool peft_bwd;
OptimizerTasks optimizer_tasks;
};
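Note on the new per-request PEFT fields: each request now carries a single `peft_model_id` plus its serialized adapter config in `peft_model_config`, replacing the old `peft_adapters` map. A minimal consumer-side sketch, mirroring the pattern used later in `lora_linear_kernels.cu` (the loop bound `max_requests_per_batch()` is assumed from the surrounding codebase):

```cpp
// Sketch: recover each request's LoRA configuration from the serialized JSON.
for (int i = 0; i < bc->max_requests_per_batch(); i++) {
  if (bc->request_completed[i] ||
      bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) {
    continue; // no adapter attached to this request
  }
  LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(
      bc->requestsInfo[i].peft_model_config);
  // ... look up / apply the adapter identified by peft_model_id ...
}
```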
1 change: 1 addition & 0 deletions include/flexflow/fftype.h
@@ -27,6 +27,7 @@ class PEFTModelID {
PEFTModelID(size_t id);
bool is_valid_id() const;
friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs);
friend bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs);
friend std::ostream &operator<<(std::ostream &os,
PEFTModelID const &peft_model_id);

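The new `operator!=` complements the existing `operator==` on `PEFTModelID`. A plausible definition, consistent with the declarations above (a sketch, not necessarily the exact body in `fftype.cc`):

```cpp
bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) {
  return !(lhs == rhs); // defined in terms of the existing operator==
}
```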
1 change: 1 addition & 0 deletions include/flexflow/model.h
@@ -847,6 +847,7 @@ class FFModel {
// ========================================
// PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config);
void add_lora_layers(std::vector<std::string> target_modules);
PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config);
// ========================================
// Inference APIs
// ========================================
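With the single-adapter `add_lora_layer` commented out, adapter setup is now two steps: `add_lora_layers` wires LoRA into the named target modules at model-build time, and `register_peft_adapter` registers a concrete `LoraLinearConfig` and hands back its `PEFTModelID`. A hedged usage sketch; the `LoraLinearConfig` constructor arguments and the overall call order are assumptions, not taken from this diff:

```cpp
// Sketch: enable LoRA on matching layers, then register one adapter.
FFModel ff(ffconfig);
ff.add_lora_layers({"down_proj"});                 // match layers by substring
LoraLinearConfig peft_config("/path/to/cache",     // hypothetical cache_folder
                             "org/peft-model-id"); // hypothetical peft_model_id
PEFTModelID *adapter = ff.register_peft_adapter(peft_config);
```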
4 changes: 4 additions & 0 deletions include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -6,6 +6,7 @@
#include "flexflow/fftype.h"
#include "flexflow/op_meta.h"
#include "flexflow/ops/lora_linear.h"
#include "flexflow/utils/peft_weight_allocator.h"

namespace FlexFlow {

@@ -35,6 +36,9 @@ class LoraLinearMeta : public OpMeta {

namespace Kernels {
namespace LoraLinear {

bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config);

void init_kernel_wrapper(LoraLinearMeta *m, int seed);
void inference_kernel_wrapper(LoraLinearMeta *m,
BatchConfig const *bc,
2 changes: 0 additions & 2 deletions include/flexflow/ops/lora_linear.h
@@ -20,12 +20,10 @@ class LoraLinear : public Op {
LoraLinear(
FFModel &model,
LayerID const &layer_guid,
OperatorType type,
ParallelTensor const input,
ParallelTensor const output,
int max_rank,
int max_concurrent_adapters,
// std::unordered_map<PEFTModelID, LoraLinearConfig> const &_peft_configs,
char const *name = nullptr);
LoraLinear(FFModel &model,
LoraLinear const &other,
25 changes: 18 additions & 7 deletions include/flexflow/ops/lora_linear_params.h
@@ -124,16 +124,28 @@ class LoraLinearConfig {
std::vector<std::string> const &target_modules_ = {});
// constructor used to support std::unordered_map
LoraLinearConfig();

// Method to set optimizer
template<typename T>
void setOptimizer(T&& opt) {
optimizer_config = std::make_unique<T>(std::forward<T>(opt));
void setOptimizer(T&& opt) {
if constexpr (std::is_base_of_v<LoraOptimizerConfig, std::remove_reference_t<T>>) {
optimizer_config = std::make_unique<std::remove_reference_t<T>>(std::forward<T>(opt));
} else if constexpr (std::is_same_v<std::unique_ptr<LoraOptimizerConfig>, std::remove_reference_t<T>>) {
optimizer_config = std::move(opt);
} else {
static_assert(always_false<T>, "Unsupported optimizer type");
}
}
// Helper template for static_assert
template <typename>
static inline constexpr bool always_false = false;

friend bool operator==(LoraLinearConfig const &lhs,
LoraLinearConfig const &rhs);
friend std::ostream &operator<<(std::ostream &os,
LoraLinearConfig const &llc);
std::string serialize_to_json_string(int indent=-1) const {
json j = {
nlohmann::json j = {
{"cache_folder", cache_folder},
{"peft_model_id", peft_model_id},
{"rank", rank},
@@ -144,7 +156,8 @@
{"init_lora_weights", init_lora_weights},
{"base_model_name_or_path", base_model_name_or_path},
{"precision", precision},
{"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr}
// {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr}
{"optimizer_config", optimizer_config ? nlohmann::json(optimizer_config->toJson()) : nlohmann::json()}
};

return j.dump(indent); // No indentation
@@ -156,7 +169,7 @@ class LoraLinearConfig {
}
// Deserialization method
static LoraLinearConfig deserialize_from_json_string(const std::string& json_string) {
json j = json::parse(json_string);
nlohmann::json j = nlohmann::json::parse(json_string);
LoraLinearConfig config(
j["cache_folder"].get<std::string>(),
j["peft_model_id"].get<std::string>(),
@@ -208,8 +221,6 @@ class LoraLinearConfig {
class LoraLinearParams {
public:
LayerID layer_guid;
// OperatorType type;
// std::unordered_map<PEFTModelID, LoraLinearConfig> peft_configs;
int max_rank;
int max_concurrent_adapters;
char name[MAX_OPNAME];
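The reworked `setOptimizer` accepts either a concrete optimizer config derived from `LoraOptimizerConfig` (wrapped into a `unique_ptr`) or an already-built `std::unique_ptr<LoraOptimizerConfig>` (moved in), and rejects anything else at compile time. A brief usage sketch; the `LoraSGDOptimizerConfig` constructor argument is an assumption:

```cpp
LoraLinearConfig config = LoraLinearConfig::deserialize_from_json_string(json_string);
// Pass a derived optimizer config by value; setOptimizer wraps it in a unique_ptr.
config.setOptimizer(LoraSGDOptimizerConfig(/*lr=*/0.001f));
// The scalar fields round-trip through serialize_to_json_string() /
// deserialize_from_json_string(), so the config can travel inside BatchConfig
// as the peft_model_config string.
std::string serialized = config.serialize_to_json_string();
LoraLinearConfig restored = LoraLinearConfig::deserialize_from_json_string(serialized);
```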
2 changes: 2 additions & 0 deletions include/flexflow/request_manager.h
@@ -149,6 +149,8 @@ class RequestManager {
int eos_token_id,
std::string const &path);
void register_output_filepath(std::string const &);
void register_peft_config(PEFTModelID const &peft_model_id,
LoraLinearConfig const &peft_config);
LoraLinearConfig get_peft_config(PEFTModelID peft_model_id);
void set_max_lora_rank(int max_lora_rank);
void set_max_concurrent_adapters(int max_concurrent_adapters);
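`register_peft_config` / `get_peft_config` imply that the RequestManager now owns the id-to-config mapping that previously travelled inside `BatchConfig` as `peft_adapters`. A standalone sketch of such a registry; class and member names are illustrative, not the actual RequestManager internals, and the map type follows the shape of the removed `peft_adapters` field:

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

// Illustrative registry; configs are stored in serialized form, matching how
// they are shipped around in this commit (peft_model_config strings).
class PeftConfigRegistry {
public:
  void register_peft_config(PEFTModelID const &id, LoraLinearConfig const &cfg) {
    assert(id != PEFTModelID::NO_ID && "cannot register NO_ID");
    configs_[id] = cfg.serialize_to_json_string();
  }
  LoraLinearConfig get_peft_config(PEFTModelID const &id) const {
    auto it = configs_.find(id);
    assert(it != configs_.end() && "unknown PEFT model id");
    return LoraLinearConfig::deserialize_from_json_string(it->second);
  }

private:
  // Same shape as the old BatchConfig::peft_adapters map.
  std::unordered_map<PEFTModelID, std::string> configs_;
};
```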
22 changes: 13 additions & 9 deletions include/flexflow/utils/peft_weight_allocator.h
@@ -17,12 +17,13 @@
#define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_

#include "flexflow/config.h"
#include "lora_linear_params.h"
#include "flexflow/ffconst_utils.h"
#include "flexflow/ops/lora_linear_params.h"
// #include <mutex>

namespace FlexFlow {

#ifdef DEACODE
#ifdef DEADCODE
class PEFTWeightAllocator {
public:
PEFTWeightAllocator(void *_base_ptr, size_t _total_size)
@@ -108,19 +109,21 @@ struct LoraLinearWeight {
low_rank_activation(low_rank_activation_), input_activation(input_activation_) {}
};

void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed);

class PEFTMemoryManager {
public:
PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_)
PEFTMemoryManager(Legion::Memory gpu_mem_, int max_rank_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_)
: gpu_mem(gpu_mem_),
max_concurrent_adapters(max_concurrent_adapters_),
max_lora_size(max_lora_size_),
max_rank(max_rank_),
in_dim(in_dim_), out_dim(out_dim_), num_shards(num_shards_), shard_id(shard_id_),
max_peft_tokens(max_peft_tokens_),
lora_layername_substr(lora_layername_substr_), dt(dt_),
base_ptr(nullptr),
finetuning_ptr(nullptr),
finetuning_model_id(PEFTModelID::NO_ID) {

max_lora_size = data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim);
assert(max_concurrent_adapters > 0 && "PEFT Memory Manager max_concurrent_adapters must be > 0");
assert(max_lora_size > 0 && "PEFT Memory Manager max_lora_size must be > 0");
allocate_inference_memory();
@@ -146,12 +149,13 @@ class PEFTMemoryManager {
LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config);

// Legion memory management apparatus
Memory gpu_mem;
Legion::Memory gpu_mem;
Realm::RegionInstance peftLegionInst;
void *base_ptr, *finetuning_ptr;
// Size and shapes
int max_concurrent_adapters;
size_t max_lora_size;
int max_rank;
int max_lora_size;
int in_dim, out_dim, num_shards, shard_id;
int max_peft_tokens;
// LRU cache apparatus
@@ -162,8 +166,8 @@ class PEFTMemoryManager {
std::string lora_layername_substr;
DataType dt;
PEFTModelID finetuning_model_id;
}
};

}; // namespace FlexFlow
} // namespace FlexFlow

#endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_
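`max_lora_size` is no longer a constructor parameter; it is derived as `data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim)`, i.e. the byte footprint of one adapter's A and B matrices on one shard. A worked example under assumed dimensions (rank 16 on a 4096x4096 linear layer in fp16):

```cpp
// Assumed values for illustration only.
int max_rank = 16, in_dim = 4096, out_dim = 4096;
size_t dtype_size = 2; // bytes per fp16 element
size_t max_lora_size = dtype_size * (max_rank * in_dim + max_rank * out_dim);
// = 2 * (65536 + 65536) = 262144 bytes, i.e. 256 KiB per adapter per shard;
// the inference pool presumably reserves roughly
// max_lora_size * max_concurrent_adapters bytes in allocate_inference_memory().
```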
41 changes: 20 additions & 21 deletions src/ops/kernels/lora_linear_kernels.cu
@@ -24,8 +24,10 @@ namespace FlexFlow {

LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li)
: OpMeta(handler, li) {
#ifdef DEADCODE
allocated_peft_buffer_size1 = 0;
allocated_peft_buffer_size2 = 0;
#endif
}

LoraLinearMeta::~LoraLinearMeta(void) {}
@@ -145,6 +147,16 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m,
}
}

bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) {
for (std::string s : config.target_modules) {
std::string n(m->op_name);
if (n.find(s) != std::string::npos) {
return true;
}
}
return false;
}

namespace Internal {


@@ -289,17 +301,6 @@ void inference_kernel(LoraLinearMeta *m,
}
#endif

bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) {
for (std::string s : config.target_modules) {
std::string n(m->op_name);
if (n.find(s) != std::string::npos) {
return true;
}
}
return false;
}


template <typename DT>
void inference_kernel(LoraLinearMeta *m,
BatchConfig const *bc,
@@ -326,7 +327,7 @@ void inference_kernel(LoraLinearMeta *m,
if (bc->requestsInfo[i].peft_bwd) {
num_peft_requests++;
}
LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]);
LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config);
if (!lora_applies_to_this_layer(m, lora_config)) {
continue;
}
@@ -444,16 +445,15 @@ void peft_bwd_kernel(LoraLinearMeta *m,
if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || !bc->requestsInfo[i].peft_bwd) {
continue;
}
int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]);
LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config);
if (!lora_applies_to_this_layer(m, lora_config)) {
continue;
}
assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch");
m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id);
int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
// int max_peft_tokens = bc->requestsInfo[i].max_length;
int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
// int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config);
DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank);

@@ -562,15 +562,14 @@ void peft_bwd_kernel(LoraLinearMeta *m,
}

if (bc->requestsInfo[i].optimizer_tasks.update_weights) {
LoraOptimizerConfig const *optimizer_config = lora_config.optimizer_config;
assert(optimizer_config != nullptr);
assert(lora_config.optimizer_config != nullptr);
int w0_num_elements = lora_config.rank * in_dim;
int w1_num_elements = lora_config.rank * out_dim;

// Get optimizer config
if (optimizer_config->getType() == "SGD") {
LoraSGDOptimizerConfig const *sgd_config =
(LoraSGDOptimizerConfig const *)optimizer_config;

if (lora_config.optimizer_config->getType() == "SGD") {
LoraSGDOptimizerConfig const *sgd_config = static_cast<LoraSGDOptimizerConfig const *>(lora_config.optimizer_config.get());
// LoRA_A weight is split in tensor parallelism, so no need to apply
// all-reduce
sgd_update<<<GET_BLOCKS(w0_num_elements),
@@ -609,7 +608,7 @@ void peft_bwd_kernel(LoraLinearMeta *m,
static_cast<DT const *>(weight.w1_grad_ptr),
static_cast<DT *>(weight.w1_v_values_ptr),
static_cast<DT *>(weight.w1_ptr));
} else if (optimizer_config->getType() == "Adam") {
} else if (lora_config.optimizer_config->getType() == "Adam") {
assert(false && "Adam optimizer type not implemented yet");
} else {
assert(false && "Unsupported optimizer type");
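`lora_applies_to_this_layer`, now exposed outside the `Internal` namespace, is a plain substring match between the operator's name and each entry in `target_modules`. A standalone demonstration of the same matching rule (layer and module names made up for illustration):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Same rule as lora_applies_to_this_layer: the adapter applies if any
// target-module substring occurs in the operator's name.
static bool applies(std::string const &op_name,
                    std::vector<std::string> const &target_modules) {
  for (std::string const &s : target_modules) {
    if (op_name.find(s) != std::string::npos) {
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<std::string> target_modules = {"down_proj"}; // hypothetical config
  std::cout << applies("layers.11.mlp.down_proj", target_modules) << "\n";      // prints 1
  std::cout << applies("layers.11.self_attn.qkv_proj", target_modules) << "\n"; // prints 0
}
```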
(Diffs for the remaining 6 changed files are not shown.)
