From 1691100906ddf25191fb0e1444fa75d0675cd44d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 6 Oct 2024 05:10:58 +0000 Subject: [PATCH] bug fix --- include/flexflow/batch_config.h | 4 +- include/flexflow/fftype.h | 1 + include/flexflow/model.h | 1 + .../ops/kernels/lora_linear_kernels.h | 4 + include/flexflow/ops/lora_linear.h | 2 - include/flexflow/ops/lora_linear_params.h | 25 ++++-- include/flexflow/request_manager.h | 2 + .../flexflow/utils/peft_weight_allocator.h | 22 +++-- src/ops/kernels/lora_linear_kernels.cu | 41 +++++---- src/ops/lora_linear.cc | 88 +++++++++++-------- src/ops/lora_linear_params.cc | 9 +- src/runtime/fftype.cc | 2 + src/runtime/peft_weight_allocator.cc | 43 ++++++--- src/runtime/peft_weight_allocator.cu | 8 +- src/runtime/request_manager.cc | 5 +- 15 files changed, 156 insertions(+), 101 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index cb2f8d3a3d..44d829a7f7 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -94,6 +94,7 @@ class BatchConfig { num_tokens_in_batch = 0; max_length = 0; request_guid = 0; + peft_model_id = PEFTModelID::NO_ID; prompt_phase = false; batch_config_request_id = -1; peft_bwd = false; @@ -109,7 +110,8 @@ class BatchConfig { bool prompt_phase = false; RequestGuid request_guid; // PEFT fields - std::unordered_map peft_adapters; + PEFTModelID peft_model_id; + std::string peft_model_config; bool peft_bwd; OptimizerTasks optimizer_tasks; }; diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 3e482b8d67..ebc811c262 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -27,6 +27,7 @@ class PEFTModelID { PEFTModelID(size_t id); bool is_valid_id() const; friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs); friend std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id); diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d1dbe72d7c..e3beafe20c 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -847,6 +847,7 @@ class FFModel { // ======================================== // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); void add_lora_layers(std::vector target_modules); + PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config); // ======================================== // Inference APIs // ======================================== diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index eef3b392b3..00f16af146 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -6,6 +6,7 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/ops/lora_linear.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { @@ -35,6 +36,9 @@ class LoraLinearMeta : public OpMeta { namespace Kernels { namespace LoraLinear { + +bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config); + void init_kernel_wrapper(LoraLinearMeta *m, int seed); void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 8d37be0c64..1c6070afe4 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -20,12 +20,10 @@ class LoraLinear : public Op { 
LoraLinear( FFModel &model, LayerID const &layer_guid, - OperatorType type, ParallelTensor const input, ParallelTensor const output, int max_rank, int max_concurrent_adapters, - // std::unordered_map const &_peft_configs, char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index c5a327459f..525a9209d3 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -124,16 +124,28 @@ class LoraLinearConfig { std::vector const &target_modules_ = {}); // constructor used to support std::unordered_map LoraLinearConfig(); + + // Method to set optimizer template - void setOptimizer(T&& opt) { - optimizer_config = std::make_unique(std::forward(opt)); + void setOptimizer(T&& opt) { + if constexpr (std::is_base_of_v>) { + optimizer_config = std::make_unique>(std::forward(opt)); + } else if constexpr (std::is_same_v, std::remove_reference_t>) { + optimizer_config = std::move(opt); + } else { + static_assert(always_false, "Unsupported optimizer type"); } + } + // Helper template for static_assert + template + static inline constexpr bool always_false = false; + friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); std::string serialize_to_json_string(int indent=-1) const { - json j = { + nlohmann::json j = { {"cache_folder", cache_folder}, {"peft_model_id", peft_model_id}, {"rank", rank}, @@ -144,7 +156,8 @@ class LoraLinearConfig { {"init_lora_weights", init_lora_weights}, {"base_model_name_or_path", base_model_name_or_path}, {"precision", precision}, - {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} + // {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} + {"optimizer_config", optimizer_config ? 
nlohmann::json(optimizer_config->toJson()) : nlohmann::json()} }; return j.dump(indent); // No indentation @@ -156,7 +169,7 @@ class LoraLinearConfig { } // Deserialization method static LoraLinearConfig deserialize_from_json_string(const std::string& json_string) { - json j = json::parse(json_string); + nlohmann::json j = nlohmann::json::parse(json_string); LoraLinearConfig config( j["cache_folder"].get(), j["peft_model_id"].get(), @@ -208,8 +221,6 @@ class LoraLinearConfig { class LoraLinearParams { public: LayerID layer_guid; - // OperatorType type; - // std::unordered_map peft_configs; int max_rank; int max_concurrent_adapters; char name[MAX_OPNAME]; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 542deb336d..628714dcc0 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -149,6 +149,8 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); + void register_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config); LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); void set_max_lora_rank(int max_lora_rank); void set_max_concurrent_adapters(int max_concurrent_adapters); diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 3c9efc0812..9670da8a4f 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -17,12 +17,13 @@ #define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ #include "flexflow/config.h" -#include "lora_linear_params.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/lora_linear_params.h" // #include namespace FlexFlow { -#ifdef DEACODE +#ifdef DEADCODE class PEFTWeightAllocator { public: PEFTWeightAllocator(void *_base_ptr, size_t _total_size) @@ -108,19 +109,21 @@ struct LoraLinearWeight { low_rank_activation(low_rank_activation_), input_activation(input_activation_) {} }; +void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed); + class PEFTMemoryManager { public: - PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) + PEFTMemoryManager(Legion::Memory gpu_mem_, int max_rank_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) : gpu_mem(gpu_mem_), max_concurrent_adapters(max_concurrent_adapters_), - max_lora_size(max_lora_size_), + max_rank(max_rank_), in_dim(in_dim_), out_dim(out_dim_), num_shards(num_shards_), shard_id(shard_id_), max_peft_tokens(max_peft_tokens_), lora_layername_substr(lora_layername_substr_), dt(dt_), base_ptr(nullptr), finetuning_ptr(nullptr), finetuning_model_id(PEFTModelID::NO_ID) { - + max_lora_size = data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim); assert(max_concurrent_adapters > 0 && "PEFT Memory Manager max_concurrent_adapters must be > 0"); assert(max_lora_size > 0 && "PEFT Memory Manager max_lora_size must be > 0"); allocate_inference_memory(); @@ -146,12 +149,13 @@ class PEFTMemoryManager { LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); // Legion memory management apparatus - Memory gpu_mem; + Legion::Memory gpu_mem; 
Realm::RegionInstance peftLegionInst; void *base_ptr, *finetuning_ptr; // Size and shapes int max_concurrent_adapters; - size_t max_lora_size; + int max_rank; + int max_lora_size; int in_dim, out_dim, num_shards, shard_id; int max_peft_tokens; // LRU cache apparatus @@ -162,8 +166,8 @@ class PEFTMemoryManager { std::string lora_layername_substr; DataType dt; PEFTModelID finetuning_model_id; -} +}; -}; // namespace FlexFlow +} // namespace FlexFlow #endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index d5baf49cdc..134af3ca6e 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -24,8 +24,10 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) : OpMeta(handler, li) { +#ifdef DEADCODE allocated_peft_buffer_size1 = 0; allocated_peft_buffer_size2 = 0; +#endif } LoraLinearMeta::~LoraLinearMeta(void) {} @@ -145,6 +147,16 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, } } +bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; + } + } + return false; +} + namespace Internal { @@ -289,17 +301,6 @@ void inference_kernel(LoraLinearMeta *m, } #endif -bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { - for (std::string s : config.target_modules) { - std::string n(m->op_name); - if (n.find(s) != std::string::npos) { - return true; - } - } - return false; -} - - template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -326,7 +327,7 @@ void inference_kernel(LoraLinearMeta *m, if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -444,8 +445,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || !bc->requestsInfo[i].peft_bwd) { continue; } - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -453,7 +453,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; // int max_peft_tokens = bc->requestsInfo[i].max_length; - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); @@ -562,15 +562,14 @@ void peft_bwd_kernel(LoraLinearMeta *m, } if (bc->requestsInfo[i].optimizer_tasks.update_weights) { - LoraOptimizerConfig const *optimizer_config = 
lora_config.optimizer_config; - assert(optimizer_config != nullptr); + assert(lora_config.optimizer_config != nullptr); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; // Get optimizer config - if (optimizer_config->getType() == "SGD") { - LoraSGDOptimizerConfig const *sgd_config = - (LoraSGDOptimizerConfig const *)optimizer_config; + + if (lora_config.optimizer_config->getType() == "SGD") { + LoraSGDOptimizerConfig const *sgd_config = static_cast<LoraSGDOptimizerConfig const *>(lora_config.optimizer_config.get()); // LoRA_A weight is split in tensor parallelism, so no need to apply // all-reduce sgd_update<<<GET_BLOCKS(w1_num_elements), CUDA_NUM_THREADS, 0, stream>>>(w1_num_elements, sgd_config->lr, sgd_config->weight_decay, sgd_config->momentum, sgd_config->nesterov, static_cast<DT const *>(weight.w1_grad_ptr), static_cast<DT *>(weight.w1_v_values_ptr), static_cast<DT *>
(weight.w1_ptr)); - } else if (optimizer_config->getType() == "Adam") { + } else if (lora_config.optimizer_config->getType() == "Adam") { assert(false && "Adam optimizer type not implemented yet"); } else { assert(false && "Unsupported optimizer type"); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index a18f47c4ac..f7ac4ff06e 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -6,6 +6,7 @@ #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" +#include "flexflow/request_manager.h" #include #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -51,13 +52,13 @@ bool check_lora_layer_match(Layer *potential_target, return false; } -void FFmodel::add_lora_layers(std::vector target_modules) { +void FFModel::add_lora_layers(std::vector target_modules) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); assert(target_modules.size() > 0 && "LoRA target module name is empty"); RequestManager *rm = RequestManager::get_request_manager(); int max_lora_rank = rm->get_max_lora_rank(); int max_concurrent_adapters = rm->get_max_concurrent_adapters(); - assert(max_rank > 1 && max_rank <= 32 && "Invalid max LoRA rank"); + assert(max_lora_rank > 1 && max_lora_rank <= 32 && "Invalid max LoRA rank"); assert(max_concurrent_adapters > 0 && "Invalid number of LoRA concurrent adapters"); for (std::string target_module_name : target_modules) { @@ -120,7 +121,7 @@ void FFmodel::add_lora_layers(std::vector target_modules) { true /*create_grad*/); } // pass max_rank and max_concurrent_adapters to OP_LORA layer - peft_layer->add_int_property("max_rank", max_rank); + peft_layer->add_int_property("max_rank", max_lora_rank); peft_layer->add_int_property("max_concurrent_adapters", max_concurrent_adapters); it = layers.insert(it + 1, peft_layer); ++it; @@ -263,7 +264,7 @@ Op *LoraLinear::create_operator_from_layer( long long value; layer->get_int_property("max_rank", value); int max_rank = value; - layer->get_int_property("max_concurrent_adapters", max_concurrent_adapters); + layer->get_int_property("max_concurrent_adapters", value); int max_concurrent_adapters = value; #ifdef DEADCODE std::unordered_map _peft_configs; @@ -276,7 +277,6 @@ Op *LoraLinear::create_operator_from_layer( #endif return new LoraLinear(model, layer->layer_guid, - layer->op_type, inputs[0], inputs[1], max_rank, @@ -290,7 +290,6 @@ LoraLinear::LoraLinear(FFModel &model, ParallelTensor const output) : LoraLinear(model, other.layer_guid, - other.op_type, input, output, other.max_rank, @@ -303,7 +302,6 @@ LoraLinear::LoraLinear(FFModel &model, char const *name) : LoraLinear(model, params.layer_guid, - params.type, inputs.first, inputs.second, params.max_rank, @@ -313,7 +311,6 @@ LoraLinear::LoraLinear(FFModel &model, LoraLinear::LoraLinear( FFModel &model, LayerID const &_layer_guid, - OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, int _max_rank, @@ -321,7 +318,7 @@ LoraLinear::LoraLinear( // std::unordered_map const &_peft_configs, char const *name) : Op(model, - _op_type, + OP_LORA, _output->data_type, name, 2 /*inputs*/, @@ -473,9 +470,8 @@ OpMeta *LoraLinear::init_task(Task const *task, lora_layername.substr(0, found + searchString.length()); // allocate space for lora weights - size_t max_lora_size = data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim); Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); - 
m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, max_lora_size, lora->max_concurrent_adapters, in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); + m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, lora->max_rank, lora->max_concurrent_adapters, BatchConfig::max_sequence_length(), in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); m->peft_memory_manager->allocate_inference_memory(); return m; } @@ -709,8 +705,8 @@ void LoraLinear::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); @@ -761,12 +757,15 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - int rank, num_tokens; - for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { - PEFTModelID peft_model_id = it->first; - LoraLinearWeight weight = m->model_state[peft_model_id].weights; - rank = weight.rank; - num_tokens = input.domain.get_volume() / weight.in_dim; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); fs::path dst_filepath_weights = get_dst_folder("weights", m->decoding_step, shard_id) / layername; std::string filenameA = @@ -775,21 +774,38 @@ void LoraLinear::inference_task(Task const *task, dst_filepath_weights.string() + ".weight_B.original"; if (m->input_type[0] == DT_FLOAT) { save_tensor((float *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((float *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else if (m->input_type[0] == DT_HALF) { save_tensor((half *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((half *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else { assert(false && "Data type not supported"); } + + if (bc->requestsInfo[i].peft_bwd) { + int num_tokens = input.domain.get_volume() / in_dim; + // input activation (intermediate) + filename = dst_filepath.string() + ".low_rank_activation"; + if (output.data_type == DT_FLOAT) { + save_tensor((float *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor((half *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else { + assert(false); + } + } } filename = dst_filepath.string() + ".output_0"; @@ -803,21 +819,7 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - if (bc->num_active_peft_tokens() > 0) { - // input activation (intermediate) - filename = 
dst_filepath.string() + ".low_rank_activation"; - if (output.data_type == DT_FLOAT) { - save_tensor((float *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else if (output.data_type == DT_HALF) { - save_tensor((half *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else { - assert(false); - } - } + m->decoding_step++; } } @@ -905,6 +907,16 @@ void lora_inference_debugging(LoraLinearMeta *m, // weights, weights gradients fs::path dst_filepath_weights = get_dst_folder("weights", m->bwd_step, shard_id) / layername; + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + assert(m->model_state.size() >= 1 && "Model state empty!"); for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { PEFTModelID peft_model_id = it->first; diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 310b6d0973..c7b9fcc711 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -170,11 +170,10 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "trainable: " << llc.trainable << ", "; if (llc.optimizer_config != nullptr) { os << "optimizer_config: "; - if (typeid(*llc.optimizer_config) == typeid(LoraSGDOptimizerConfig)) { - os << *static_cast(llc.optimizer_config); - } else if (typeid(*llc.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - os << *static_cast(llc.optimizer_config); + if (llc.optimizer_config.get()->getType() == "SGD") { + os << *static_cast(llc.optimizer_config.get()); + } else if (llc.optimizer_config.get()->getType() == "Adam") { + os << *static_cast(llc.optimizer_config.get()); } else { os << "Unknown optimizer config type"; } diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 8213726e8a..0af5f45350 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -46,6 +46,8 @@ bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { return lhs.id == rhs.id; } +bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { return !(lhs == rhs); } + std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { if (peft_model_id == PEFTModelID::NO_ID) { os << "NO_ID"; diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index cc40d666ed..287eb7e20a 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -1,6 +1,24 @@ -#include "peft_weight_allocator.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { +// declare legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; void PEFTMemoryManager::allocate_inference_memory() { // allocate chunk of memory for all the PEFT adapters @@ -21,7 +39,7 @@ void PEFTMemoryManager::allocate_inference_memory() { void PEFTMemoryManager::allocate_finetuning_memory() { size_t ft_size 
= max_lora_size*3; // weights, gradients, momentum values - ft_size += max_peft_tokens*(in_dim+rank); // input, low-rank activations + ft_size += max_peft_tokens * (in_dim + max_rank); // input, low-rank activations // allocate chunk of memory for PEFT adapter Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), @@ -144,7 +162,7 @@ void load_peft_from_file(DT *ptr, void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config) { // Load weights - assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr "PEFT Memory Manager weight ptr null"); + assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr && "PEFT Memory Manager weight ptr null"); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; // values below represent total weight sizes before sharding. Lora B is not @@ -235,7 +253,7 @@ LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_ int data_size = data_type_size(dt); LoraLinearWeight result; result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; - result.w1_ptr = result.w0_ptr + w0_num_elements * data_size; + result.w1_ptr = static_cast(result.w0_ptr) + w0_num_elements * data_size; if (cache_miss) { load_peft_model(result, lora_config); } @@ -244,19 +262,20 @@ LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_ LoraLinearWeight PEFTMemoryManager::get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); - bool cache_miss = get_finetuning_slot(model_id); + bool cache_miss; + get_finetuning_slot(model_id, &cache_miss); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; int data_size = data_type_size(dt); LoraLinearWeight result; result.w0_ptr = finetuning_ptr; - result.w1_ptr = result.w0_ptr + w0_num_elements*data_size; - result.w0_grad_ptr = result.w1_ptr + w1_num_elements*data_size; - result.w1_grad_ptr = result.w0_grad_ptr + w0_num_elements*data_size; - result.w0_v_values_ptr = result.w1_grad_ptr + w1_num_elements*data_size; - result.w1_v_values_ptr = result.w0_v_values_ptr + w0_num_elements*data_size; - result.input_activation = result.w1_v_values_ptr + w1_num_elements*data_size; // max_peft_tokens*in_dim - result.low_rank_activation = result.input_activation + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank + result.w1_ptr = static_cast(result.w0_ptr)+ w0_num_elements*data_size; + result.w0_grad_ptr = static_cast(result.w1_ptr) + w1_num_elements*data_size; + result.w1_grad_ptr = static_cast(result.w0_grad_ptr) + w0_num_elements*data_size; + result.w0_v_values_ptr = static_cast(result.w1_grad_ptr) + w1_num_elements*data_size; + result.w1_v_values_ptr = static_cast(result.w0_v_values_ptr) + w0_num_elements*data_size; + result.input_activation = static_cast(result.w1_v_values_ptr) + w1_num_elements*data_size; // max_peft_tokens*in_dim + result.low_rank_activation = static_cast(result.input_activation) + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank if (cache_miss) { load_peft_model(result, lora_config); } diff --git a/src/runtime/peft_weight_allocator.cu b/src/runtime/peft_weight_allocator.cu index cc8d095069..bc9ab443cb 100644 --- a/src/runtime/peft_weight_allocator.cu +++ b/src/runtime/peft_weight_allocator.cu @@ -8,7 +8,7 @@ namespace FlexFlow { template -void init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, 
cudaStream_t stream) { +void lora_init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, cudaStream_t stream) { // Initialize generator std::mt19937 gen(seed); @@ -47,7 +47,7 @@ void init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int ra num = num_float; } } - checkCUDA(cudaMemcpyAsync(static_cast
<DT *>(w1_ptr), + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), lora_b_random_init.data(), w1_num_elements * sizeof(DT), cudaMemcpyHostToDevice, @@ -59,9 +59,9 @@ void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int ou checkCUDA(get_legion_stream(&stream)); if (dt == DT_FLOAT) { - Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); } else if (dt == DT_HALF) { - Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 79fcdfdcfe..2377a4f938 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -260,7 +260,8 @@ void RequestManager::register_peft_config(PEFTModelID const &peft_model_id, // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); - peft_configs[peft_model_id] = peft_config; + // peft_configs[peft_model_id] = std::move(peft_config); + peft_configs.emplace(peft_model_id, std::move(peft_config)); } LoraLinearConfig const &RequestManager::get_peft_config( @@ -284,7 +285,7 @@ int RequestManager::get_max_concurrent_adapters() { return max_concurrent_adapters; } -PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const peft_config) { +PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); if (peft_config.target_modules.size() == 0) {
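
Usage note (not part of the diff): a minimal sketch of how the reworked PEFT registration flow touched by this patch is intended to be used. The target-module name, cache path, adapter name, and the LoraLinearConfig / LoraSGDOptimizerConfig constructor arguments below are illustrative assumptions, not values taken from this patch.

#include "flexflow/model.h"
#include "flexflow/ops/lora_linear_params.h"

using namespace FlexFlow;

void peft_setup_example(FFModel &ff) {
  // 1. While building the model, mark which linear layers receive LoRA
  //    adapters ("down_proj" is a placeholder target-module name).
  ff.add_lora_layers({"down_proj"});

  // 2. At serving time, register a concrete adapter. Cache folder and adapter
  //    name are placeholders; the real constructor takes further optional
  //    arguments (rank, alpha, trainable flag, target modules, ...).
  LoraLinearConfig peft_config("/path/to/peft/cache",
                               "example-user/example-lora-adapter");
  // Optionally attach an optimizer config for finetuning; setOptimizer() now
  // stores it in the config's std::unique_ptr<LoraOptimizerConfig>.
  peft_config.setOptimizer(LoraSGDOptimizerConfig(/*lr=*/0.001));
  // register_peft_adapter() validates the config and (per this patch) hands it
  // to the RequestManager via register_peft_config() under a new PEFTModelID.
  PEFTModelID *peft_model_id = ff.register_peft_adapter(peft_config);
  (void)peft_model_id; // attached later to inference/finetuning requests
}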