From 889224ade3b303b8a83ee4c2ac1d787a9cfe3bd4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 14 Dec 2023 17:15:50 -0500 Subject: [PATCH 01/11] Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. --- inference/incr_decoding/incr_decoding.cc | 32 ++++++++---------------- src/ops/argmax.cc | 5 ---- src/runtime/request_manager.cc | 10 ++------ 3 files changed, 12 insertions(+), 35 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 94ccb1cabf..dcd1b5a5ab 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -138,9 +138,9 @@ void FlexFlow::top_level_task(Task const *task, bool do_sample = false; float temperature = 0.0f; float topp = 0.0f; - int max_requests_per_batch = 2; - int max_tokens_per_batch = 300; - int max_sequence_length = 300; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -272,7 +272,6 @@ void FlexFlow::top_level_task(Task const *task, int total_num_requests = 0; { -#ifdef DEADCODE using json = nlohmann::json; std::ifstream file_handle(file_paths.prompt_file_path); assert(file_handle.good() && "Prompt file does not exist."); @@ -292,26 +291,15 @@ void FlexFlow::top_level_task(Task const *task, inference_req.peft_model_id = peft_model_id; requests.push_back(inference_req); total_num_requests++; - } -#endif - std::vector requests; - for (int i = 0; i < (max_requests_per_batch - 1) * 4; i++) { - Request inference_req; - inference_req.prompt = "b"; - inference_req.max_sequence_length = 40; - requests.push_back(inference_req); + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); + requests.push_back(fine_tuning_req); total_num_requests++; } - // Add a fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 256; - fine_tuning_req.max_training_steps = 256; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair("b", "")); - requests.push_back(fine_tuning_req); - total_num_requests++; - GenerationResult result = model.generate(requests); } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index dd0e2bb822..cabb8b204f 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -392,11 +392,6 @@ InferenceResult GenericTensorAccessorW parent; int batch_size = bc->num_active_infr_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); - // Note that we free activation allocator here since argmax is the - // last operator in forward - if (m->handle.peft_activation_allocator != nullptr) { - m->handle.peft_activation_allocator->free_all(); - } InferenceResult ir; if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1d4a9ee47c..cbb21e03e0 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -246,17 +246,13 @@ RequestManager::RequestGuid request.peft_model_id = request_.peft_model_id; 
request.req_type = Request::REQ_FINETUNING; request.completed_training_steps = 0; - request.max_training_steps = request_.max_training_steps; + request.max_training_steps = 1; // TODO: let user set this for (auto const &sample : request_.dataset_text) { std::vector input_tokens; input_tokens = this->tokenizer_->Encode(sample.first); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { input_tokens.insert(input_tokens.begin(), bos_token_id); } - // FIXME: this is a hack, must undo - while (input_tokens.size() < 256) { - input_tokens.push_back(293); - } std::vector output_tokens = this->tokenizer_->Encode(sample.second); if (input_tokens.size() + output_tokens.size() > @@ -359,7 +355,6 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { - log_req_mgr.print("[Old BC] Num tokens: %d", old_bc.num_tokens); const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { @@ -544,8 +539,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_generation_tokens = num_generation_tokens; // Step 3: add new requests to the next batch if there is space - // FIXME: we reserve one slot for PEFT req now - for (int i = 0; i < BatchConfig::max_requests_per_batch() - 1; i++) { + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { From f01b0560279fef38e8b347b4307e172ce10ae3e0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 14 Dec 2023 17:16:38 -0500 Subject: [PATCH 02/11] backup --- src/ops/inc_multihead_self_attention.cc | 4 + tests/peft/alignment_tests.ipynb | 1308 +++++++++++++++++++++++ tests/peft/qk_prods_alignment.ipynb | 24 + 3 files changed, 1336 insertions(+) create mode 100644 tests/peft/alignment_tests.ipynb create mode 100644 tests/peft/qk_prods_alignment.ipynb diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index ca6eb7c095..d88c7edb81 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -860,6 +860,8 @@ void IncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); + std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; IncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, task->index_point.point_data[0], input, weight, output, biases); @@ -992,6 +994,8 @@ void IncMultiHeadSelfAttention::peft_bwd_task( assert(task->index_point.get_dim() == 1); + std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); + std::cout << "BWD " << op_name_without_uid << std::endl; IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, bc, diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb new file mode 100644 index 0000000000..fc2899b7c4 --- /dev/null +++ b/tests/peft/alignment_tests.ipynb @@ -0,0 +1,1308 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "hf_weight_base_path = 
\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors\"\n", + "ff_weight_base_path = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors\"\n", + "def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2):\n", + " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", + " hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor = ff_tensor[:len_hf_tensor]\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", + " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}\")\n", + " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", + " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #print(np.nonzero(hf_tensor)[0])\n", + " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", + " # print(ff_tensor[36], hf_tensor[36])\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len_hf_tensor)\n", + " print(\"Ok!\")\n", + "def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tensor2_filepath, tolerance=1e-2):\n", + " assert(os.path.exists(hf_tensor_filepath))\n", + " assert(os.path.exists(ff_tensor1_filepath))\n", + " assert(os.path.exists(ff_tensor2_filepath))\n", + " hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor1 = ff_tensor1[:len_hf_tensor]\n", + " ff_tensor2 = ff_tensor2[:len_hf_tensor]\n", + " ff_tensor = ff_tensor1 - ff_tensor2\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", + " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}\")\n", + " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", + " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #print(np.nonzero(hf_tensor)[0])\n", + " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", + " # print(ff_tensor[36], hf_tensor[36])\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len_hf_tensor)\n", + " print(\"Ok!\")\n", + "def compare_hf_tensors(tensor1_fp, tensor2_fp):\n", + " assert(os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", + " hf_tensor1 = torch.load(tensor1_fp)\n", + " hf_tensor2 = torch.load(tensor2_fp)\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " if type(hf_tensor2) == tuple or 
type(hf_tensor2) == list:\n", + " assert(len(hf_tensor2) == 1)\n", + " hf_tensor2 = hf_tensor2[0]\n", + " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", + " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", + " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", + " if not (np.allclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy())):\n", + " print(f\"mismatch between {tensor1_fp} and {tensor2_fp}\")\n", + " print(hf_tensor1)\n", + " print(hf_tensor2)\n", + " print(np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))\n", + " mismatches = np.where(~np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))[0]\n", + " print(mismatches)\n", + " assert(False)\n", + " print(\"Ok!\")\n", + "\n", + "def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp):\n", + " assert(os.path.exists(tensor_sum_fp) and os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", + " hf_tensor_sum = torch.load(tensor_sum_fp)\n", + " hf_tensor1 = torch.load(tensor1_fp)\n", + " hf_tensor2 = torch.load(tensor2_fp)\n", + " if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list:\n", + " assert(len(hf_tensor_sum) == 1)\n", + " hf_tensor_sum = hf_tensor_sum[0]\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " if type(hf_tensor2) == tuple or type(hf_tensor2) == list:\n", + " assert(len(hf_tensor2) == 1)\n", + " hf_tensor2 = hf_tensor2[0]\n", + " assert(torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape)\n", + " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", + " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", + " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", + " hf_tensor_sum = torch.nan_to_num(hf_tensor_sum)\n", + " sum_check_tensor = hf_tensor1 + hf_tensor2\n", + " if not (np.allclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy())):\n", + " print(f\"mismatch between {sum_check_tensor} and {tensor1_fp} + {tensor2_fp}\")\n", + " print(tensor_sum_fp)\n", + " print(sum_check_tensor)\n", + " print(hf_tensor1)\n", + " print(hf_tensor2)\n", + " print(np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))\n", + " mismatches = np.where(~np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))[0]\n", + " print(mismatches)\n", + " assert(False)\n", + " print(\"Ok!\")\n", + "def check_hf_zero_tensor(hf_tensor_fp):\n", + " assert(os.path.exists(hf_tensor_fp))\n", + " hf_tensor1 = torch.load(hf_tensor_fp)\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " assert(torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0)\n", + "def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=\"\"):\n", + " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", + " hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor = ff_tensor[:len_hf_tensor]\n", + "\n", + " print(f\"{txt} - HF tensor:\")\n", + 
" print(hf_tensor)\n", + " print(f\"{txt} - FF tensor: \")\n", + " print(ff_tensor)\n", + "def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_len=-1):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + "\n", + " if (ff_tensor1.shape != ff_tensor2.shape):\n", + " print(ff_tensor1.shape, ff_tensor2.shape)\n", + " assert(ff_tensor1.shape == ff_tensor2.shape)\n", + "\n", + " if max_len > -1:\n", + " ff_tensor1 = ff_tensor1[:max_len]\n", + " ff_tensor2 = ff_tensor2[:max_len]\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")\n", + "def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + " minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0])\n", + " ff_tensor1 = ff_tensor1[:minlen]\n", + " ff_tensor2 = ff_tensor2[:minlen]\n", + " mismatches = []\n", + " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")\n", + "def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + " ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=',')\n", + " \n", + " ff_sum = ff_tensor1 + ff_tensor2\n", + " assert(ff_tensor1.shape == ff_tensor2.shape)\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(f\"Sum Tensor: {ff_tensor_sum}\\nActual sum:{ff_sum}\")\n", + " print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", 
+ "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.input_layernorm.output_0\"\n", + " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_RMSNorm_shard-id_0_output_0\"\n", + " if layer_num > 0:\n", + " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " hf_attn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.o_proj.output_0\"\n", + " ff_attn_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", + " compare_tensors(hf_attn_out, ff_attn_out)\n", + " hf_ffn_norm_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.post_attention_layernorm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " # w1\n", + " hf_gate_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", + " ff_gate_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " compare_tensors(hf_gate_proj_out, ff_gate_proj_out)\n", + " # w3\n", + " hf_up_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\" \n", + " ff_up_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", + " # w2\n", + " hf_down_proj_in = 
f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.input_0\"\n", + " hf_down_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.output_0\"\n", + " ff_down_proj_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", + " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", + " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_weight_base_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_weight_base_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + 
"source": [ + "tot_num_layers = 12\n", + "\n", + "ff_BWD_softmax_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "\n", + "hf_BWD_lm_head_out = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "# compare weights\n", + "hf_lm_head_weight = f\"{hf_weight_base_path}/base_model.model.lm_head.weight\"\n", + "ff_lm_head_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_weight_0\"\n", + "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", + "# # Manually check the matmul\n", + "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", + "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", + "# ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", + "# print(ff_tensor_out.shape)\n", + "# print(ff_weight.shape)\n", + "# print(np.matmul(ff_weight, ff_tensor_out))\n", + "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", + "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + "hf_BWD_norm_out = f\"{hf_weight_base_path}/bwd_step_0_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "ff_BWD_norm_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_weight_0\"\n", + "hf_FWD_norm_weight = f\"{hf_weight_base_path}/base_model.model.model.norm.weight\"\n", + "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", + "hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_SigmoidSiluMulti_shard-id_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 
1.8299303e+01\n", + " 1.3871717e+01 1.8452764e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 1.82993031e+01\n", + " 1.38717194e+01 1.84527588e+00]\n", + "[ True True True ... True True True]\n", + "[2394]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_feed_forward_w2_shard-id_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 1.8299303e+01\n", + " 1.3871717e+01 1.8452764e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 1.82993031e+01\n", + " 1.38717194e+01 1.84527588e+00]\n", + "[ True True True ... True True True]\n", + "[2394]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "(64, 12, 24)\n", + "(64, 12, 24)\n", + "torch.Size([12, 24, 64])\n", + "torch.Size([12, 64, 24])\n", + "3.7760416666666665% mismatch in QK prods softmax out grad\n", + "hf_kproj_grads_post_rotary: (24, 64, 12)\n", + "hf_kproj_grads_before_rotary: (24, 64, 12)\n", + "[[-2.1751599e-01 1.2245592e-01 -2.6237822e-01 ... 1.4371538e+00\n", + " 5.2717543e-01 5.1425427e-01]\n", + " [-7.6055496e+01 4.2463268e+01 -1.2235089e+02 ... 5.3328156e+02\n", + " 2.3810944e+02 1.8990283e+02]\n", + " [ 5.2804117e+00 -4.9826388e+00 4.6240320e+00 ... -5.4525635e+01\n", + " -2.1779711e+01 -3.2857445e+01]\n", + " ...\n", + " [ 1.0541155e+00 -3.1229946e-01 1.4272718e+00 ... -4.6509657e+00\n", + " -2.2930331e+00 2.1488833e-01]\n", + " [ 1.8427576e+00 -5.0031781e-01 2.1591802e+00 ... -8.0996408e+00\n", + " -6.6346103e-01 1.1487092e+00]\n", + " [-3.9699785e-02 1.7903861e-02 -5.9658013e-02 ... 2.4856456e-01\n", + " -5.0553136e-02 -6.9623299e-02]]\n", + "HF Qproj:\n", + "torch.Size([24, 768])\n", + "\t reshaped: (24, 64, 12)\n", + "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n", + " 0.0000000e+00 0.0000000e+00]\n", + " [-2.1439369e-03 3.2949594e-03 -2.9551802e-04 ... 2.4234147e-01\n", + " 4.3675132e-02 -9.2217997e-02]\n", + " [ 2.9682016e+00 -4.1166668e+00 -1.5612273e+00 ... 1.8131609e+01\n", + " -2.7311683e+00 -2.3451160e+01]\n", + " ...\n", + " [ 7.9408998e+00 -1.6016111e+01 7.5070286e+00 ... 6.9805992e+01\n", + " -8.9288340e+00 -5.6585381e+01]\n", + " [ 5.9755993e+00 -1.2562438e+01 9.3722830e+00 ... 5.6924896e+01\n", + " 1.6420145e+00 -2.7360382e+01]\n", + " [ 2.9259295e+00 -8.8997393e+00 5.6537924e+00 ... 4.0085789e+01\n", + " -5.5427680e+00 -3.3319279e+01]]\n", + "FF Qproj:\n", + "(24, 64, 12)\n", + "[[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00\n", + " 0.00000000e+00 0.00000000e+00]\n", + " [-2.14390800e-03 3.29491800e-03 -2.95515000e-04 ... 2.42337957e-01\n", + " 4.36745250e-02 -9.22166630e-02]\n", + " [ 2.96819830e+00 -4.11666203e+00 -1.56122601e+00 ... 1.81315899e+01\n", + " -2.73117018e+00 -2.34511394e+01]\n", + " ...\n", + " [ 7.94090462e+00 -1.60161247e+01 7.50703382e+00 ... 6.98059998e+01\n", + " -8.92883396e+00 -5.65854073e+01]\n", + " [ 5.97561932e+00 -1.25624638e+01 9.37229633e+00 ... 5.69249115e+01\n", + " 1.64204872e+00 -2.73603287e+01]\n", + " [ 2.92593479e+00 -8.89975548e+00 5.65379906e+00 ... 4.00858383e+01\n", + " -5.54277229e+00 -3.33193245e+01]]\n", + "hf_attn_in: torch.Size([1, 24, 768])\n", + "hf_attn_in: (768, 24)\n", + "[[-7.5252225e+06 -1.2484900e+03 5.3961243e+01 ... 
-3.3743629e+01\n", + " -2.8661375e+00 -1.2124748e+00]\n", + " [-9.5513660e+06 1.8450066e+03 3.8372406e+02 ... -1.9933952e+01\n", + " 1.4622488e+01 -2.4410028e+00]\n", + " [ 1.1452265e+07 2.1254619e+03 -4.8265629e+01 ... 4.8204151e+01\n", + " -1.4841021e+01 -1.6505869e+01]\n", + " ...\n", + " [ 2.1089132e+06 2.8605874e+03 1.2375667e+03 ... 2.6102766e+01\n", + " 3.1422745e+01 6.7668297e+01]\n", + " [ 2.1169400e+06 -4.6361523e+02 -1.6561864e+02 ... -5.3914165e+00\n", + " -6.0169220e-02 2.2841328e+01]\n", + " [ 7.3915345e+06 8.9268884e+02 5.4528040e+02 ... 6.2017624e+01\n", + " 1.3753588e+01 5.2149849e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.52522050e+06 -1.24848975e+03 5.39611511e+01 ... -3.37436867e+01\n", + " -2.86611795e+00 -1.21241117e+00]\n", + " [-9.55136800e+06 1.84500635e+03 3.83724091e+02 ... -1.99339561e+01\n", + " 1.46225519e+01 -2.44094014e+00]\n", + " [ 1.14522650e+07 2.12546313e+03 -4.82656937e+01 ... 4.82041969e+01\n", + " -1.48411064e+01 -1.65059376e+01]\n", + " ...\n", + " [ 2.10891300e+06 2.86058789e+03 1.23756726e+03 ... 2.61027851e+01\n", + " 3.14227238e+01 6.76683807e+01]\n", + " [ 2.11693950e+06 -4.63614868e+02 -1.65618515e+02 ... -5.39132690e+00\n", + " -6.02092740e-02 2.28413010e+01]\n", + " [ 7.39153300e+06 8.92689453e+02 5.45280640e+02 ... 6.20176048e+01\n", + " 1.37535381e+01 5.21498528e+01]]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 5\u001b[0m line \u001b[0;36m3\n\u001b[1;32m 300\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mff_attn_in: \u001b[39m\u001b[39m\"\u001b[39m, ff_attn_in\u001b[39m.\u001b[39mshape)\n\u001b[1;32m 301\u001b[0m \u001b[39mprint\u001b[39m(ff_attn_in)\n\u001b[0;32m--> 302\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_attn_in, hf_attn_in, atol\u001b[39m=\u001b[39m\u001b[39m1e-2\u001b[39m))\n\u001b[1;32m 304\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 306\u001b[0m hf_kproj_grads_in \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mhf_weight_base_path\u001b[39m}\u001b[39;00m\u001b[39m/bwd_step_0_layers.\u001b[39m\u001b[39m{\u001b[39;00mlayer_num\u001b[39m}\u001b[39;00m\u001b[39m.self_attn.k_proj.gi_0\u001b[39m\u001b[39m\"\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers-1, -1, -1):\n", + " # HuggingFace filepaths\n", + " hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", + " hf_BWD_loraB_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.go_0\"\n", + " hf_BWD_loraB_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.gi_0\"\n", + " hf_BWD_loraA_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.go_0\"\n", + " hf_BWD_loraA_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.gi_0\"\n", + " hf_loraA_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " hf_loraB_weight = 
f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " hf_BWD_lora_dropout_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.go_0\"\n", + " hf_BWD_lora_dropout_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.gi_0\"\n", + " hf_BWD_w2_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.go_0\"\n", + " hf_BWD_w2_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.gi_0\"\n", + " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", + " hf_BWD_w3_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.go_0\"\n", + " hf_BWD_w3_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.gi_0\"\n", + " hf_BWD_w1_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.go_0\"\n", + " hf_BWD_w1_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.gi_0\"\n", + " hf_BWD_act_fn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.gi_0\"\n", + " hf_BWD_act_fn_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.go_0\"\n", + " hf_BWD_ffn_norm_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.go_0\"\n", + " hf_BWD_ffn_norm_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.gi_0\"\n", + " hf_BWD_attn_out_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.go_0\"\n", + " hf_BWD_attn_q_in = f\"{hf_weight_base_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", + " hf_FWD_w1_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", + " hf_FWD_w3_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\"\n", + " hf_FWD_act_fn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.act_fn.output_0\"\n", + " hf_BWD_attn_oproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + " hf_attn_qproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.q_proj.weight\"\n", + " hf_attn_kproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.k_proj.weight\"\n", + " hf_attn_vproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.v_proj.weight\"\n", + " hf_attn_oproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.o_proj.weight\"\n", + " # hf_BWD_attn_vproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + " # FlexFlow filepaths\n", + " ff_BWD_w2_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", + " ff_BWD_w2_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", + " ff_BWD_w2_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_pre_input_0\"\n", + " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " ff_BWD_ssm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_output_0\"\n", 
+ " ff_BWD_ssm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_0\"\n", + " ff_BWD_ssm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_1\"\n", + " ff_BWD_w3_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " ff_BWD_w3_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_input_0\"\n", + " ff_BWD_lora_A_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " ff_BWD_lora_B_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " ff_lora_A_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " ff_lora_B_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " ff_BWD_w1_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " ff_BWD_w1_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_input_0\"\n", + " ff_BWD_w1_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_pre_input_0\"\n", + " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " ff_BWD_ffn_norm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_0\"\n", + " ff_BWD_ffn_norm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_1\"\n", + " ff_BWD_ffn_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_0\"\n", + " ff_BWD_attn_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", + " ff_BWD_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_input_0\"\n", + " ff_BWD_ssm_cached_w1_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w1_output\"\n", + " ff_BWD_ssm_cached_w3_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w3_output\"\n", + " ff_FWD_w1_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " ff_FWD_w3_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " ff_FWD_act_fnc_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_act_fn_output\"\n", + " ff_BWD_attn_o_proj_in = 
f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + " # ff_BWD_attn_v_proj_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", + " ff_attn_oproj_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0_weight_0\"\n", + " # ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + " # xxx = torch.load(hf_BWD_attn_out_out)\n", + " # xxx.detach().cpu().numpy().tofile(f\"{hf_BWD_attn_out_out}.flexflow\")\n", + " # print(f\"{hf_BWD_attn_out_out}.flexflow\")\n", + " \n", + " # HuggingFace checks\n", + " print(\"\\nHuggingface checks:\")\n", + " if layer_num == tot_num_layers-1:\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", + " # compare_hf_tensors(hf_BWD_w3_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", + " check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", + " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", + "\n", + " # FlexFlow checks\n", + " print(\"\\nFlexFlow checks:\")\n", + " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", + " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", + " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768) # should fail\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in2, ff_BWD_attn_out, max_len=24*768)\n", + "\n", + " # HF-FlexFlow checks\n", + " print(\"\\nHuggingface-FlexFlow checks:\")\n", + " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " #print(torch.load(hf_w2_weight).shape)\n", + " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", + " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", + "\n", + " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", + " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + "\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", + "\n", + " compare_tensors(hf_FWD_w1_out, ff_FWD_w1_out)\n", + " compare_tensors(hf_FWD_w3_out, ff_FWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " # compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n", + " # compare_tensors(hf_BWD_ffn_norm_in, ff_BWD_ffn_norm_in2)\n", + " # compare_tensors(hf_BWD_attn_out_out, ff_BWD_ffn_norm_in2)\n", + " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", + "\n", + " # compare attn weight tensors\n", + " hidden_size = 768\n", + " qProjSize 
= 64\n", + " num_heads = 12\n", + " num_new_tokens = num_tokens = 24\n", + " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", + " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", + " \n", + " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", + " \n", + " assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", + " \n", + " # Compare attn outproj grad in tensors\n", + " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", + " \n", + " ########### Compare value projs grads ######################\n", + " # 1. compare qk prods softmax\n", + " hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.qk_prods_softmax\"\n", + " ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + " \n", + " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", + " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + "\n", + " for head_idx in range(num_heads):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + " \n", + " # 2. compare attn heads grads\n", + " hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + "\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", + " assert(np.allclose(ff_attn_heads_grads, hf_attn_heads_grads, atol=1e-2))\n", + "\n", + " # 3. 
vproj grads\n", + " hf_vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.go_0\"\n", + " ff_vproj_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", + "\n", + " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", + " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", + " assert(np.allclose(hf_vproj_grads, ff_vproj_grads, atol=1e-2))\n", + "\n", + " \n", + " \n", + " \n", + " ##############################\n", + " hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " print(hf_value_states.shape)\n", + " ff_value_states = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_vcache\"\n", + " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", + " print(ff_value_states.shape)\n", + " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", + " \n", + " \n", + " \n", + " ########## Compare key and query projs grads ##################\n", + " ff_devQKVPRojArray = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", + " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", + " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", + " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", + " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", + " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", + "\n", + " # simulate qk_prods_softmax\n", + " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", + " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", + " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", + " ff_value_states = torch.from_numpy(ff_value_states)\n", + " ff_value_states = ff_value_states.permute(1,0,2)\n", + " print(ff_attn_heads_grads.shape)\n", + " print(ff_value_states.shape)\n", + " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", + " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", + " #print(\"Simulated QK prods grads:\")\n", + " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", + "\n", + " # qk prods softmax right before softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " # 
assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", + " assert(pct_mismatch <= 0.05)\n", + "\n", + " # qk prods softmax right after softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.gi_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + " \n", + " # qk prods softmax after mask\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.matmul_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in_masked\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + "\n", + " # Compare query activation\n", + " hf_query_activation = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.query_activation\"\n", + " hf_query_activation = torch.load(hf_query_activation)\n", + " ff_query_activation = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_query_activation\"\n", + " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", + " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", + " \n", + " # Compare FF kproj with intermediate kproj data from HF\n", + " hf_kproj_grads_post_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_post_rotary.go_0\"\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary).squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[0,:,:])\n", + " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj: \", 
ff_kproj.shape)\n", + " # print(ff_kproj[:,:,0])\n", + " assert(np.allclose(ff_kproj, hf_kproj_grads_post_rotary, atol=1e-2))\n", + "\n", + " # Compare HF before and Kproj out gradients\n", + " hf_kproj_grads_before_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_before_rotary.go_0\"\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary).squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " print(hf_kproj_grads_before_rotary[:,:,0])\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " hf_kproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.k_proj.go_0\"\n", + " hf_kproj_grads = torch.load(hf_kproj_grads).squeeze()\n", + " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", + " #print(hf_kproj_grads[:,:64])\n", + " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " #print(reshaped_tensor.shape)\n", + " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " # Compare QProj\n", + " hf_qproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.q_proj.go_0\"\n", + " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", + " print(\"HF Qproj:\")\n", + " print(hf_qproj_grads.shape)\n", + " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " print(reshaped_tensor[:,:,0])\n", + " ff_qproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", + " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", + " print(\"FF Qproj:\")\n", + " print(ff_qproj.shape)\n", + " print(ff_qproj[:,:,0])\n", + " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " hf_attn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.input_layernorm.go_0\"\n", + " hf_attn_in = torch.load(hf_attn_in)\n", + " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " hf_attn_in = hf_attn_in.squeeze().T\n", + " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", + " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " print(hf_attn_in)\n", + "\n", + " ff_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_attn_final_grad_in\"\n", + " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", + " print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " print(ff_attn_in)\n", + " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", + "\n", + " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", + " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", + " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", + " assert(pct_mismatch <= 0.05)\n", + " \n", + " assert(np.allclose(hf_kproj_grads, ff_kProjGrads, atol=1e-2))\n", + " assert(np.allclose(hf_qproj_grads, ff_qProjGrads, atol=1e-2))\n", + " # print(hf_qproj_grads.shape)\n", + " # print(hf_kproj_grads)\n", + " # print()\n", + " # print(ff_qProjGrads)\n", + " # print(ff_kProjGrads.shape)\n", + " \n", + " \n", + "\n", + " 
assert False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 12, 24, 24])\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "layer_num = 11\n", + "hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", + "ff_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + "hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + "\n", + "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", + "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", + "print(hf_qk_prods_softmax.shape)\n", + "#print(ff_qk_prods_softmax.shape)\n", + "#print(hf_qk_prods_softmax[:,:,0])\n", + "#print()\n", + "#print(ff_qk_prods_softmax[:,:,0])\n", + "\n", + "for head_idx in range(12):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + "\n", + "\n", + "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", + "print(hf_value_states.shape)\n", + "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", + "print()\n", + "print(attn_output.shape)\n", + "print(attn_output.transpose(1, 2).contiguous().shape)\n", + "print(\"Hf attn heads\")\n", + "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", + "\n", + "print(\"Attn heads grads:\")\n", + "hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + "print(torch.load(hf_attn_heads_grads).shape)\n", + "print(\"HF value grads:\")\n", + "vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + "print(torch.load(vproj_grads).shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 3, 4])\n", + "torch.Size([4, 3, 2])\n" + ] + } + ], + "source": [ + "a = torch.randn(2,3,4)\n", + "print(a.shape)\n", + "print(a.T.shape)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", + " 0.0000],\n", + " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", + " 39.7619],\n", + " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", + " -160.8711],\n", + " ...,\n", + " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", + " -198.4432],\n", + " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", + " -194.4037],\n", + " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", + " -124.1802]]], device='cuda:0')\n", + "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", + " -1.4912e+05, 3.5769e+06],\n", + " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", + " -2.3540e+01, 3.4587e+02],\n", + " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", + " 5.5099e+01, 5.5910e+01],\n", + " ...,\n", + " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", + " 5.0713e+01, 5.6592e+01],\n", + " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", + " 3.0760e+01, 6.1743e+01],\n", + " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", + " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" + ] + } + ], + "source": [ + "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", + "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", + "a = torch.load(a)\n", + "b = torch.load(b)\n", + "print(a)\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Manual matmul checks\n", + "# ff_w2_grad_out_tensor = np.loadtxt(ff_BWD_w2_out, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w2_weight_tensor = np.loadtxt(ff_w2_weight, delimiter=',').reshape((3072,768), order='F')\n", + "# ff_w2_gradin_tensor = np.matmul(ff_w2_weight_tensor, ff_w2_grad_out_tensor).reshape((3072,128), order='F')\n", + "\n", + "# ff_lora_gradout_tensor = np.loadtxt(ff_BWD_lora_B_out, delimiter=',').reshape((768,128), order='F')\n", + "# ff_lora_A_weight_tensor = np.loadtxt(ff_lora_A_weight, delimiter=',').reshape((3072,16), order='F')\n", + "# ff_lora_B_weight_tensor = np.loadtxt(ff_lora_B_weight, delimiter=',').reshape((16,768), order='F')\n", + "# ff_lora_int_grad_tensor = np.matmul(ff_lora_B_weight_tensor, ff_lora_gradout_tensor)\n", + "# ff_lora_gradint_tensor = np.matmul(ff_lora_A_weight_tensor, ff_lora_int_grad_tensor)\n", + "\n", + "# # ff_w2_gradin_tensor = ff_w2_gradin_tensor + ff_lora_gradint_tensor\n", + "# #print(ff_w2_gradin_tensor[:,:24])\n", + "# print(\"calculated LORA grad in\")\n", + "# print(ff_lora_gradint_tensor[:,:24])\n", + "# # ff_BWD_w2_in_pre_tensor = np.loadtxt(ff_BWD_w2_in_pre, delimiter=',').reshape((3072,128), order='F')\n", + "# ff_BWD_lora_A_in_tensor = np.loadtxt(ff_BWD_lora_A_in, delimiter=',').reshape((3072,128), order='F')\n", + "# print(\"FlexFlow LORA grad in\")\n", + "# print(ff_BWD_lora_A_in_tensor[:,:24])\n", + "# # print(ff_BWD_w2_in_pre_tensor[:,:24])\n", + "# print(\"HF lora grad in\")\n", + "# print(torch.load(hf_BWD_loraA_in).squeeze().T.detach().cpu().numpy())\n", + "# compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + "\n", + "# simulate act_fn_grad\n", + "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')\n", + "# w3_fwd_out_tensor = np.loadtxt(ff_FWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", + "# 
#print(ssm_out_grad_tensor.shape, w3_fwd_out_tensor.shape)\n", + "# act_fn_out_check = np.multiply(ssm_out_grad_tensor, w3_fwd_out_tensor)\n", + "# print(\"simulated act fn out - simulated\")\n", + "# print(act_fn_out_check[:,:24])\n", + "# print(\"simulated act fn out - HF\")\n", + "# print(torch.load(hf_BWD_act_fn_out).detach().cpu().numpy().squeeze().T)\n", + "\n", + "# Simulated w3_grad\n", + "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')[:,:24]\n", + "# act_fnc_out_tensor = np.loadtxt(ff_FWD_act_fnc_out, delimiter=',').reshape((3072,24), order='F')\n", + "# w3_out_gard_check = np.multiply(ssm_out_grad_tensor, act_fnc_out_tensor)\n", + "# print(\"simulated w3 out - FF\")\n", + "# print(w3_out_gard_check)\n", + "# ff_BWD_w3_out_tensor = np.loadtxt(ff_BWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", + "# hf_BWD_w3_out_tensor = torch.load(hf_BWD_w3_out).detach().cpu().numpy().squeeze().T\n", + "# print(\"w3 out, FF\")\n", + "# print(ff_BWD_w3_out_tensor[:,:24])\n", + "# print(\"w3 out, HF\")\n", + "# print(hf_BWD_w3_out_tensor)\n", + "\n", + "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", + "# assert False\n", + "# print()\n", + "# print()\n", + "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", + "# print_tensors(hf_BWD_w3_in, ff_BWD_w3_in, \"w3 in\")\n", + "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out, \"w1 out\")\n", + "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in, \"w1 in\")\n", + "# print_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out, \"ffn norm out\")\n", + "# print_tensors(hf_BWD_ffn_norm_in, ff_BWD_ffn_norm_in2, \"ffn norm in\")\n", + "# print()\n", + "# ff_w1_out_tensor = np.loadtxt(ff_BWD_w1_out, delimiter=',').reshape((3072,128), order='F')\n", + "# ff_w1_in_tensor = np.loadtxt(ff_BWD_w1_in, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w1_in_pre_tensor = np.loadtxt(ff_BWD_w1_in_pre, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w1_only_in_tensor = ff_w1_in_tensor - ff_w1_in_pre_tensor\n", + "# ff_w1_weight_tensor = np.loadtxt(ff_w1_weight, delimiter=',').reshape((768,3072), order='F')\n", + "# ff_w1_in_check_tensor = np.matmul(ff_w1_weight_tensor, ff_w1_out_tensor)\n", + "# print(\"W1 in (simulated):\")\n", + "# print(ff_w1_in_check_tensor[:,:24])\n", + "# print(\"W1 in (FF):\")\n", + "# print(ff_w1_only_in_tensor[:,:24])\n", + "# print(\"W1 in (HF):\")\n", + "# print(torch.load(hf_BWD_w1_in).squeeze().T.detach().cpu().numpy())\n", + "\n", + "# compare_tensors_difference(hf_BWD_w2_in, ff_BWD_w2_in, ff_BWD_lora_A_in)\n", + "# compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + "#compare_hf_tensors(hf_BWD_ffn_norm_in, hf_BWD_attn_out_out)\n", + "# print(\"\\nw1 out:\")\n", + "\n", + "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + "# print(\"\\nW1 in\\n\")\n", + "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", + "# compare_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", + "# print(\"\\nffn_norm\")\n", + "# compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "for layer_num in range(12):\n", + " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", + " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", + " hf_w1_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.gate_proj.weight\"\n", + " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", + " hf_w3_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.up_proj.weight\"\n", + " ff_w3_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", + " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", + " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/qk_prods_alignment.ipynb b/tests/peft/qk_prods_alignment.ipynb new file mode 100644 index 0000000000..c2a3644b3d --- /dev/null +++ b/tests/peft/qk_prods_alignment.ipynb @@ -0,0 +1,24 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 6a5992212dd0479d3651a0f3d4b5689d4fab52dc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 15 Dec 2023 11:46:48 -0500 Subject: [PATCH 03/11] backup --- inference/incr_decoding/incr_decoding.cc | 12 +-- src/ops/inc_multihead_self_attention.cc | 42 +++++++- src/ops/inc_multihead_self_attention.cu | 63 +++++++++++- src/ops/kernels/softmax.cu | 25 ++--- 
src/ops/residual_rms_norm.cc | 68 ++++++++++++ tests/peft/alignment_tests.ipynb | 126 ++--------------------- tests/peft/hf_finetune.py | 4 + 7 files changed, 203 insertions(+), 137 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index dcd1b5a5ab..009cd1af45 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -285,12 +285,12 @@ void FlexFlow::top_level_task(Task const *task, std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); // Add inference request - Request inference_req; - inference_req.prompt = text; - inference_req.max_sequence_length = 128; - inference_req.peft_model_id = peft_model_id; - requests.push_back(inference_req); - total_num_requests++; + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; // Add fine-tuning request Request fine_tuning_req; fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index d88c7edb81..569b35097d 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -878,6 +878,37 @@ void IncMultiHeadSelfAttention::inference_task( } } +template +void load_tensor_from_file(DT *ptr, size_t size, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(size); + size_t loaded_data_size = sizeof(DT) * size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != loaded_data_size) { + std::cout << "load weight data error " << in_get_size << ", " + << loaded_data_size << ", " << sizeof(DT) << std::endl; + assert(false); + } + assert(size == host_array.size()); + + copy_tensor_host_to_dev(ptr, host_array.data(), size); + + // // normal + // long data_index = 0; + // for (auto v : host_array) { + // ptr[data_index++] = v; + // } + in.close(); +} + FutureMap IncMultiHeadSelfAttention::peft_bwd( FFModel const &ff, BatchConfigFuture const &bc, @@ -966,7 +997,7 @@ void IncMultiHeadSelfAttention::peft_bwd_task( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR biases; if (*m->qkv_bias || *m->final_bias) { @@ -996,6 +1027,15 @@ void IncMultiHeadSelfAttention::peft_bwd_task( std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); std::cout << "BWD " << op_name_without_uid << std::endl; + + if (op_name_without_uid == "layers_11_attention") { + load_tensor_from_file( + output_grad.get_float_ptr(), + (output_grad.domain.get_volume()/128)*24, + "/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0.flexflow" + ); + } + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, bc, diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index dec116addd..cf3fedd95a 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -601,6 +601,8 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->hidden_size); } if (*m->apply_rotary_embedding) { + printf("ROTARY EMBEDDING: num_tokens: %i, q_array_size: %i, m->hidden_size: %i\n", + num_tokens, q_array_size, m->hidden_size); /*q&k*/ parallelism = num_tokens * m->hidden_size; apply_rotary_embedding_hf<<op_name); + size_t last_underscore = op_name_without_uid.length() - 1; + for (int i = op_name_without_uid.length() - 1; i > 0; i--) { + if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { + break; + } else if (m->op_name[i] == '_') { + last_underscore = i; + } + } + op_name_without_uid.erase(last_underscore); + + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_bwd-step_" + std::to_string(m->bwd_step) + + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + + "_layer-name_" + op_name_without_uid + "_shard-id_" + + std::to_string(shard_id); + + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -955,6 +977,10 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + std::string filename = base_filepath + "_o_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_, filename.c_str()); } // Step 2: compute gradients w.r.t. 
value { @@ -1006,6 +1032,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + std::string filename = base_filepath + "_v_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); + std::string filename2 = base_filepath + "_qk_prods_softmax"; + std::cout << "FILENAME: " << filename2 << std::endl; + save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { @@ -1054,6 +1087,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + std::string filename4 = base_filepath + "_qk_prods_softmax_grad"; + std::cout << "FILENAME: " << filename4 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename4.c_str()); + std::string filename5 = base_filepath + "_vcache"; + std::cout << "FILENAME: " << filename5 << std::endl; + save_tensor(B, m->vProjSize * m->num_q_heads * num_tokens, filename5.c_str()); } // Step 4: softmax backpropagation { @@ -1080,6 +1119,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, &beta, m->qk_tensor, m->qk_prods)); + + DT *C = static_cast
(m->qk_prods); + std::string filename6 = base_filepath + "_qk_prods_softmax_grad_in"; + std::cout << "FILENAME: " << filename6 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename6.c_str()); + // TODO: fill all elements above diagonal to force causal attention size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; if (entries_above_diagonal > 0) { @@ -1095,6 +1140,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, DT(0.0f)); } + std::string filename7 = base_filepath + "_qk_prods_softmax_grad_in_masked"; + std::cout << "FILENAME: " << filename7 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename7.c_str()); } // Step 5: compute gradients w.r.t. key { @@ -1149,6 +1197,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + std::string filename8 = base_filepath + "_query_activation"; + std::cout << "FILENAME: " << filename8 << std::endl; + save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str()); + std::string filename9 = base_filepath + "_devkproj"; + std::cout << "FILENAME: " << filename9 << std::endl; + save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); } // Step 6: compute gradients w.r.t query { @@ -1166,10 +1220,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] DT *C = static_cast
(m->devQKVProjArray); // after transposition & striding - // after transposition & striding int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; - int k_ = num_tokens; + int k_ = num_tokens; // before transposition and striding int lda = num_tokens; // num_new_tokens int ldb = m->qProjSize * m->num_q_heads; @@ -1200,6 +1253,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + std::string filename3 = base_filepath + "_devQKVPRojArray"; + std::cout << "FILENAME: " << filename3 << std::endl; + save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); } // Step 7: compute gradients w.r.t. input { @@ -1242,6 +1298,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + std::string filename12 = base_filepath + "_attn_final_grad_in"; + std::cout << "FILENAME: " << filename12 << std::endl; + save_tensor(C, num_tokens * m->qSize, filename12.c_str()); } } } diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 0fc827319d..115461c129 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -279,10 +279,11 @@ __global__ void sparse_categorical_crossentropy_loss_peft_backward( int num_tokens, int num_classes) { CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { - input_grad[i] = output_grad[i]; - if (i % num_classes == token_ids[i / num_classes]) { - input_grad[i] -= 1.0f; - } + input_grad[i] = 0.5; + // input_grad[i] = output_grad[i]; + // if (i % num_classes == token_ids[i / num_classes]) { + // input_grad[i] -= 1.0f; + // } } } @@ -334,14 +335,14 @@ void peft_bwd_kernel(SoftmaxMeta const *m, num_bwd_tokens, num_classes); // scale - scale_kernel<<>>(input_grad_ptr + - tokens_previous_requests * num_classes, - num_bwd_tokens * num_classes, - DT(0.0), - scale_factor); + // scale_kernel<<>>(input_grad_ptr + + // tokens_previous_requests * num_classes, + // num_bwd_tokens * num_classes, + // DT(0.0), + // scale_factor); tokens_previous_requests += num_bwd_tokens; } diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 28dd7e2745..c03d1c07a1 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -673,6 +673,37 @@ Legion::FutureMap return runtime->execute_index_space(ctx, launcher); } +template +void load_tensor_from_file(DT *ptr, size_t size, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(size); + size_t loaded_data_size = sizeof(DT) * size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != loaded_data_size) { + std::cout << "load weight data error " << in_get_size << ", " + << loaded_data_size << ", " << sizeof(DT) << std::endl; + assert(false); + } + assert(size == host_array.size()); + + copy_tensor_host_to_dev(ptr, host_array.data(), size); + + // // normal + // long data_index = 0; + // for (auto v : host_array) { + // ptr[data_index++] = v; + // } + in.close(); +} + /* regions[0](I): RMS output_grad regions[1](I/O): Residual input 0 grad @@ -710,6 +741,43 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper( m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + int numdims = residual_input0_grad.domain.get_dim(); + std::cout << "in grad dims: "; + for (int i=0; iinference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb index fc2899b7c4..6a7e2bead8 100644 --- a/tests/peft/alignment_tests.ipynb +++ b/tests/peft/alignment_tests.ipynb @@ -496,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -521,120 +521,13 @@ "Ok!\n", "Ok!\n", "Ok!\n", - "Ok!\n", - "\n", - "Huggingface-FlexFlow checks:\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_SigmoidSiluMulti_shard-id_0_output_0\n", - "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 1.8299303e+01\n", - " 1.3871717e+01 1.8452764e+00]\n", - "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 1.82993031e+01\n", - " 1.38717194e+01 1.84527588e+00]\n", - "[ True True True ... True True True]\n", - "[2394]\n", - "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_feed_forward_w2_shard-id_0_input_0\n", - "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 1.8299303e+01\n", - " 1.3871717e+01 1.8452764e+00]\n", - "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 1.82993031e+01\n", - " 1.38717194e+01 1.84527588e+00]\n", - "[ True True True ... True True True]\n", - "[2394]\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "(64, 12, 24)\n", - "(64, 12, 24)\n", - "torch.Size([12, 24, 64])\n", - "torch.Size([12, 64, 24])\n", - "3.7760416666666665% mismatch in QK prods softmax out grad\n", - "hf_kproj_grads_post_rotary: (24, 64, 12)\n", - "hf_kproj_grads_before_rotary: (24, 64, 12)\n", - "[[-2.1751599e-01 1.2245592e-01 -2.6237822e-01 ... 1.4371538e+00\n", - " 5.2717543e-01 5.1425427e-01]\n", - " [-7.6055496e+01 4.2463268e+01 -1.2235089e+02 ... 5.3328156e+02\n", - " 2.3810944e+02 1.8990283e+02]\n", - " [ 5.2804117e+00 -4.9826388e+00 4.6240320e+00 ... -5.4525635e+01\n", - " -2.1779711e+01 -3.2857445e+01]\n", - " ...\n", - " [ 1.0541155e+00 -3.1229946e-01 1.4272718e+00 ... 
-4.6509657e+00\n", - " -2.2930331e+00 2.1488833e-01]\n", - " [ 1.8427576e+00 -5.0031781e-01 2.1591802e+00 ... -8.0996408e+00\n", - " -6.6346103e-01 1.1487092e+00]\n", - " [-3.9699785e-02 1.7903861e-02 -5.9658013e-02 ... 2.4856456e-01\n", - " -5.0553136e-02 -6.9623299e-02]]\n", - "HF Qproj:\n", - "torch.Size([24, 768])\n", - "\t reshaped: (24, 64, 12)\n", - "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n", - " 0.0000000e+00 0.0000000e+00]\n", - " [-2.1439369e-03 3.2949594e-03 -2.9551802e-04 ... 2.4234147e-01\n", - " 4.3675132e-02 -9.2217997e-02]\n", - " [ 2.9682016e+00 -4.1166668e+00 -1.5612273e+00 ... 1.8131609e+01\n", - " -2.7311683e+00 -2.3451160e+01]\n", - " ...\n", - " [ 7.9408998e+00 -1.6016111e+01 7.5070286e+00 ... 6.9805992e+01\n", - " -8.9288340e+00 -5.6585381e+01]\n", - " [ 5.9755993e+00 -1.2562438e+01 9.3722830e+00 ... 5.6924896e+01\n", - " 1.6420145e+00 -2.7360382e+01]\n", - " [ 2.9259295e+00 -8.8997393e+00 5.6537924e+00 ... 4.0085789e+01\n", - " -5.5427680e+00 -3.3319279e+01]]\n", - "FF Qproj:\n", - "(24, 64, 12)\n", - "[[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00\n", - " 0.00000000e+00 0.00000000e+00]\n", - " [-2.14390800e-03 3.29491800e-03 -2.95515000e-04 ... 2.42337957e-01\n", - " 4.36745250e-02 -9.22166630e-02]\n", - " [ 2.96819830e+00 -4.11666203e+00 -1.56122601e+00 ... 1.81315899e+01\n", - " -2.73117018e+00 -2.34511394e+01]\n", - " ...\n", - " [ 7.94090462e+00 -1.60161247e+01 7.50703382e+00 ... 6.98059998e+01\n", - " -8.92883396e+00 -5.65854073e+01]\n", - " [ 5.97561932e+00 -1.25624638e+01 9.37229633e+00 ... 5.69249115e+01\n", - " 1.64204872e+00 -2.73603287e+01]\n", - " [ 2.92593479e+00 -8.89975548e+00 5.65379906e+00 ... 4.00858383e+01\n", - " -5.54277229e+00 -3.33193245e+01]]\n", - "hf_attn_in: torch.Size([1, 24, 768])\n", - "hf_attn_in: (768, 24)\n", - "[[-7.5252225e+06 -1.2484900e+03 5.3961243e+01 ... -3.3743629e+01\n", - " -2.8661375e+00 -1.2124748e+00]\n", - " [-9.5513660e+06 1.8450066e+03 3.8372406e+02 ... -1.9933952e+01\n", - " 1.4622488e+01 -2.4410028e+00]\n", - " [ 1.1452265e+07 2.1254619e+03 -4.8265629e+01 ... 4.8204151e+01\n", - " -1.4841021e+01 -1.6505869e+01]\n", - " ...\n", - " [ 2.1089132e+06 2.8605874e+03 1.2375667e+03 ... 2.6102766e+01\n", - " 3.1422745e+01 6.7668297e+01]\n", - " [ 2.1169400e+06 -4.6361523e+02 -1.6561864e+02 ... -5.3914165e+00\n", - " -6.0169220e-02 2.2841328e+01]\n", - " [ 7.3915345e+06 8.9268884e+02 5.4528040e+02 ... 6.2017624e+01\n", - " 1.3753588e+01 5.2149849e+01]]\n", - "ff_attn_in: (768, 24)\n", - "[[-7.52522050e+06 -1.24848975e+03 5.39611511e+01 ... -3.37436867e+01\n", - " -2.86611795e+00 -1.21241117e+00]\n", - " [-9.55136800e+06 1.84500635e+03 3.83724091e+02 ... -1.99339561e+01\n", - " 1.46225519e+01 -2.44094014e+00]\n", - " [ 1.14522650e+07 2.12546313e+03 -4.82656937e+01 ... 4.82041969e+01\n", - " -1.48411064e+01 -1.65059376e+01]\n", - " ...\n", - " [ 2.10891300e+06 2.86058789e+03 1.23756726e+03 ... 2.61027851e+01\n", - " 3.14227238e+01 6.76683807e+01]\n", - " [ 2.11693950e+06 -4.63614868e+02 -1.65618515e+02 ... -5.39132690e+00\n", - " -6.02092740e-02 2.28413010e+01]\n", - " [ 7.39153300e+06 8.92689453e+02 5.45280640e+02 ... 
6.20176048e+01\n", - " 1.37535381e+01 5.21498528e+01]]\n" + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_ffn_norm_shard-id_0_input_1 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0_output_0\n", + "Tensor1: [ 0. 0. 0. ... 90.59211731 52.20317078\n", + " -124.1802063 ]\n", + "Tensor2:[-1.18452775e+06 -6.74598750e+05 7.44935375e+05 ... 4.37662773e+01\n", + " 4.78333855e+01 4.72951965e+01]\n", + "[False False False ... False False False]\n", + "[ 0 1 2 ... 18429 18430 18431]\n" ] }, { @@ -644,7 +537,8 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 5\u001b[0m line \u001b[0;36m3\n\u001b[1;32m 300\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mff_attn_in: \u001b[39m\u001b[39m\"\u001b[39m, ff_attn_in\u001b[39m.\u001b[39mshape)\n\u001b[1;32m 301\u001b[0m \u001b[39mprint\u001b[39m(ff_attn_in)\n\u001b[0;32m--> 302\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_attn_in, hf_attn_in, atol\u001b[39m=\u001b[39m\u001b[39m1e-2\u001b[39m))\n\u001b[1;32m 304\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 306\u001b[0m hf_kproj_grads_in \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mhf_weight_base_path\u001b[39m}\u001b[39;00m\u001b[39m/bwd_step_0_layers.\u001b[39m\u001b[39m{\u001b[39;00mlayer_num\u001b[39m}\u001b[39;00m\u001b[39m.self_attn.k_proj.gi_0\u001b[39m\u001b[39m\"\u001b[39m\n", + "Cell \u001b[0;32mIn[8], line 93\u001b[0m\n\u001b[1;32m 91\u001b[0m compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n\u001b[1;32m 92\u001b[0m compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m24\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m768\u001b[39m) \u001b[38;5;66;03m# should fail\u001b[39;00m\n\u001b[0;32m---> 93\u001b[0m \u001b[43mcompare_flexflow_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mff_BWD_ffn_norm_in2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_BWD_attn_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_len\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m24\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m768\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;66;03m# HF-FlexFlow checks\u001b[39;00m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mHuggingface-FlexFlow checks:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[0;32mIn[2], line 159\u001b[0m, in \u001b[0;36mcompare_flexflow_tensors\u001b[0;34m(ff_tensor1_fp, ff_tensor2_fp, tolerance, max_len)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 158\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m--> 159\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mlen\u001b[39m(ff_tensor1))\n\u001b[1;32m 
160\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 7836633b30..016a2386cb 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -72,6 +72,8 @@ def peft_backward_hook(module, grad_input, grad_output): print("\t", go.shape) print(f"\t\tSaving to {dst_filepath}") torch.save(go, dst_filepath) + if dst_filepath == "./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0": + go.detach().cpu().numpy().tofile(f"{dst_filepath}.flexflow") else: print(go) print("Backward GRAD Input:") @@ -81,6 +83,8 @@ def peft_backward_hook(module, grad_input, grad_output): print("\t", gi.shape) print(f"\t\tSaving to {dst_filepath}") torch.save(gi, dst_filepath) + if dst_filepath == "./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0" or dst_filepath == "./hf_peft_tensors/bwd_step_0_norm.gi_0": + gi.detach().cpu().numpy().tofile(f"{dst_filepath}.flexflow") else: print(gi) From 1202548c707ece4cda5201ae69bde97e8ab1e1bf Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 16 Dec 2023 10:42:49 -0500 Subject: [PATCH 04/11] updates --- src/ops/residual_rms_norm.cc | 13 +- src/ops/sigmoid_silu_multi.cu | 2 +- tests/peft/alignment_tests.ipynb | 251 +++++++++++-------------------- tests/peft/hf_finetune.py | 2 + 4 files changed, 100 insertions(+), 168 deletions(-) diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index c03d1c07a1..aa72d7d32a 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -741,21 +741,24 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper( m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + + // get name + std::string op_name_without_uid = ResidualRMSNorm::get_op_name_without_uid(m); + std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl; + // print shape int numdims = residual_input0_grad.domain.get_dim(); std::cout << "in grad dims: "; for (int i=0; ireset_input_grads[0], m->reset_input_grads[1]); } else if (m->input_type[0] == DT_HALF) { diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb index 6a7e2bead8..d43b68e14d 100644 --- a/tests/peft/alignment_tests.ipynb +++ b/tests/peft/alignment_tests.ipynb @@ -217,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -225,151 +225,25 @@ "output_type": "stream", "text": [ "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - 
"Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n" + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.0.self_attn.o_proj.output_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_attention_shard-id_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.02364488 -0.00304312\n", + " -0.01649825]\n", + "FF:[ 0. 0. 0. ... 0.02200473 0.01693928\n", + " -0.02354377]\n", + "[ True True True ... True False True]\n", + "[ 1541 1543 1545 ... 18427 18428 18430]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[8], line 10\u001b[0m\n\u001b[1;32m 8\u001b[0m hf_attn_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.self_attn.o_proj.output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 9\u001b[0m ff_attn_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layer-name_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_attention_shard-id_0_output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 10\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_attn_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_attn_out\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m hf_ffn_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.post_attention_layernorm.output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 12\u001b[0m ff_ffn_norm_out \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layer-name_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_ffn_norm_shard-id_0_output_1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "Cell \u001b[0;32mIn[2], line 27\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " ] } ], @@ -438,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -489,14 +363,12 @@ "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", "hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", "ff_BWD_norm_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_input_1\"\n", - "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n", - "\n", - "\n" + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -511,6 +383,7 @@ "Ok!\n", "Ok!\n", "Ok!\n", + "Ok!\n", "\n", "FlexFlow checks:\n", "Ok!\n", @@ -521,13 +394,67 @@ "Ok!\n", "Ok!\n", "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_ffn_norm_shard-id_0_input_1 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0_output_0\n", - "Tensor1: [ 0. 0. 0. ... 90.59211731 52.20317078\n", - " -124.1802063 ]\n", - "Tensor2:[-1.18452775e+06 -6.74598750e+05 7.44935375e+05 ... 4.37662773e+01\n", - " 4.78333855e+01 4.72951965e+01]\n", - "[False False False ... False False False]\n", - "[ 0 1 2 ... 18429 18430 18431]\n" + "\n", + "Huggingface-FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_SigmoidSiluMulti_shard-id_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 
2.14103374e+01\n", + " 1.20965424e+01 3.64246750e+00]\n", + "[ True True True ... True True True]\n", + "[2394]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_feed_forward_w2_shard-id_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 2.14103374e+01\n", + " 1.20965424e+01 3.64246750e+00]\n", + "[ True True True ... True True True]\n", + "[2394]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "(64, 12, 24)\n", + "(64, 12, 24)\n", + "torch.Size([12, 24, 64])\n", + "torch.Size([12, 64, 24])\n", + "4.383680555555555% mismatch in QK prods softmax out grad\n", + "hf_kproj_grads_post_rotary: (24, 64, 12)\n", + "hf_kproj_grads_before_rotary: (24, 64, 12)\n", + "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", + " 3.2884139e-01 3.6066702e-01]\n", + " [-2.8613457e+01 -5.5871558e+00 2.9384506e+01 ... 3.8781765e+01\n", + " 9.6900581e+01 9.8469597e+01]\n", + " [ 3.3027239e+00 1.8275940e-01 -1.8496730e+00 ... -4.4052174e+01\n", + " -2.0009745e+01 -2.9787930e+01]\n", + " ...\n", + " [-7.6470733e-02 -1.8891659e-01 3.6430117e-01 ... -2.7492592e-01\n", + " 5.7017130e-01 -1.5985624e-01]\n", + " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... -1.0776262e+01\n", + " 6.2166649e-01 8.3755457e-01]\n", + " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", + " -2.6652411e-01 -1.1917179e+00]]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_752415/3838509285.py:163: UserWarning: The use of `x.T` on tensors of dimension other than 2 to reverse their shape is deprecated and it will throw an error in a future release. Consider `x.mT` to transpose batches of matrices or `x.permute(*torch.arange(x.ndim - 1, -1, -1))` to reverse the dimensions of a tensor. 
(Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3571.)\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n" ] }, { @@ -537,8 +464,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[8], line 93\u001b[0m\n\u001b[1;32m 91\u001b[0m compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n\u001b[1;32m 92\u001b[0m compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m24\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m768\u001b[39m) \u001b[38;5;66;03m# should fail\u001b[39;00m\n\u001b[0;32m---> 93\u001b[0m \u001b[43mcompare_flexflow_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mff_BWD_ffn_norm_in2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_BWD_attn_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_len\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m24\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m768\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;66;03m# HF-FlexFlow checks\u001b[39;00m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mHuggingface-FlexFlow checks:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "Cell \u001b[0;32mIn[2], line 159\u001b[0m, in \u001b[0;36mcompare_flexflow_tensors\u001b[0;34m(ff_tensor1_fp, ff_tensor2_fp, tolerance, max_len)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 158\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m--> 159\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mlen\u001b[39m(ff_tensor1))\n\u001b[1;32m 160\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[0;32mIn[19], line 267\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhf_kproj_grads_before_rotary: \u001b[39m\u001b[38;5;124m\"\u001b[39m, hf_kproj_grads_before_rotary\u001b[38;5;241m.\u001b[39mshape)\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28mprint\u001b[39m(hf_kproj_grads_before_rotary[:,:,\u001b[38;5;241m0\u001b[39m])\n\u001b[0;32m--> 267\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(np\u001b[38;5;241m.\u001b[39mallclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-2\u001b[39m))\n\u001b[1;32m 268\u001b[0m hf_kproj_grads \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/bwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.self_attn.k_proj.go_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 269\u001b[0m hf_kproj_grads \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(hf_kproj_grads)\u001b[38;5;241m.\u001b[39msqueeze()\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } @@ -619,6 +545,7 @@ " print(\"\\nHuggingface checks:\")\n", " 
if layer_num == tot_num_layers-1:\n", " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", " # compare_hf_tensors(hf_BWD_w3_out, hf_BWD_w2_out)\n", @@ -635,8 +562,8 @@ " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", - " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768) # should fail\n", - " compare_flexflow_tensors(ff_BWD_ffn_norm_in2, ff_BWD_attn_out, max_len=24*768)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", + " #compare_flexflow_tensors(ff_BWD_ffn_norm_in2, ff_BWD_attn_out, max_len=24*768) # should fail\n", "\n", " # HF-FlexFlow checks\n", " print(\"\\nHuggingface-FlexFlow checks:\")\n", diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 016a2386cb..818e0b9085 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -229,6 +229,8 @@ def main(): torch.save(params, f"./hf_peft_tensors/{name}") if "lm_head" in name or "norm" in name: torch.save(params, f"./hf_peft_tensors/{name}") + if "down_proj" in name or "self_attn" in name: + torch.save(params, f"./hf_peft_tensors/{name}") # Load fine-tuning dataset data = load_dataset("Abirate/english_quotes") From 0de45d136feea49fc89f6b0497aa6a3662bfc68a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 18 Dec 2023 17:43:59 -0500 Subject: [PATCH 05/11] update --- src/ops/inc_multihead_self_attention.cu | 81 +++++- tests/peft/alignment_tests.ipynb | 358 +++++++++++++++++++++--- 2 files changed, 402 insertions(+), 37 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index cf3fedd95a..b1c3db25dc 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -492,6 +492,47 @@ __global__ void } } +template +__global__ void apply_rotary_embedding_bwd(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); + + int complex_part_index = + (q_tensor ? 
0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + template __global__ void fill_entries_above_diagonal(DT *matrix, size_t num_rows, @@ -1200,7 +1241,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, std::string filename8 = base_filepath + "_query_activation"; std::cout << "FILENAME: " << filename8 << std::endl; save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str()); - std::string filename9 = base_filepath + "_devkproj"; + std::string filename9 = base_filepath + "_devkproj_pre"; std::cout << "FILENAME: " << filename9 << std::endl; save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); } @@ -1253,9 +1294,41 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - std::string filename3 = base_filepath + "_devQKVPRojArray"; - std::cout << "FILENAME: " << filename3 << std::endl; - save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + std::string filename3 = base_filepath + "_devQKVPRojArray_pre"; + std::cout << "FILENAME: " << filename3 << std::endl; + save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + } + + // Compute rotary embeddings bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + printf("ROTARY EMBEDDING bwd: num_tokens: %i, m->hidden_size: %i\n", num_tokens, m->hidden_size); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
<DT *>(m->devQKVProjArray);
+      apply_rotary_embedding_bwd<<<GET_BLOCKS(parallelism),
+                                   min(CUDA_NUM_THREADS, parallelism),
+                                   0,
+                                   stream>>>(A,
+                                             m->complex_input,
+                                             m->token_infos,
+                                             m->qProjSize,
+                                             num_tokens,
+                                             m->hidden_size);
+      DT *C = static_cast<DT *>
(m->devQKVProjArray);
+      std::string filename3 = base_filepath + "_devQKVPRojArray";
+      std::cout << "FILENAME: " << filename3 << std::endl;
+      save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str());
+    }
+
+    // matrix C: gradients for key (saved as part of m->devQKVProjArray)
+    // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
+    DT *C = static_cast<DT *>
(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients + std::string filename9 = base_filepath + "_devkproj"; + std::cout << "FILENAME: " << filename9 << std::endl; + save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); } // Step 7: compute gradients w.r.t. input { diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb index d43b68e14d..a9382b9524 100644 --- a/tests/peft/alignment_tests.ipynb +++ b/tests/peft/alignment_tests.ipynb @@ -312,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -368,7 +368,90 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "class LlamaRotaryEmbedding(nn.Module):\n", + " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", + " super().__init__()\n", + "\n", + " self.dim = dim\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.base = base\n", + " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", + " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", + "\n", + " # Build here to make `torch.jit.trace` work.\n", + " self._set_cos_sin_cache(\n", + " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", + " )\n", + "\n", + " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", + " self.max_seq_len_cached = seq_len\n", + " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", + "\n", + " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", + " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", + " emb = torch.cat((freqs, freqs), dim=-1)\n", + " self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", + " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", + "\n", + " def forward(self, x, seq_len=None):\n", + " # x: [bs, num_attention_heads, seq_len, head_size]\n", + " if seq_len > self.max_seq_len_cached:\n", + " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", + "\n", + " return (\n", + " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", + " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", + " )\n", + "def rotate_half(x):\n", + " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", + " x1 = x[..., : x.shape[-1] // 2] # first half\n", + " x2 = x[..., x.shape[-1] // 2 :] # second half\n", + " return torch.cat((x2, -x1), dim=-1)\n", + "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", + " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", + "\n", + " Args:\n", + " q (`torch.Tensor`): The query tensor.\n", + " k (`torch.Tensor`): The key tensor.\n", + " cos (`torch.Tensor`): The cosine part of the rotary embedding.\n", + " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", + " position_ids (`torch.Tensor`):\n", + " The position indices of the tokens corresponding to the query and key tensors. 
For example, this can be\n", + " used to pass offsetted position ids when working with a KV-cache.\n", + " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", + " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", + " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", + " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and\n", + " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", + " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have\n", + " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", + " Returns:\n", + " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", + " \"\"\"\n", + " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", + " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", + " q_embed = (q * cos) + (rotate_half(q) * sin)\n", + " k_embed = (k * cos) + (rotate_half(k) * sin)\n", + " return q_embed, k_embed\n", + "head_dim = 64\n", + "max_position_embeddings = 2048\n", + "rope_theta=10_000\n", + "kv_seq_len = 24\n", + "rotary_emb = LlamaRotaryEmbedding(\n", + " head_dim,\n", + " max_position_embeddings=max_position_embeddings,\n", + " base=rope_theta,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -433,6 +516,19 @@ "torch.Size([12, 64, 24])\n", "4.383680555555555% mismatch in QK prods softmax out grad\n", "hf_kproj_grads_post_rotary: (24, 64, 12)\n", + "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", + " 3.2884139e-01 3.6066702e-01]\n", + " [-7.5168266e+00 4.6582484e+00 1.7284815e+01 ... 3.8785275e+01\n", + " 9.6879341e+01 9.8476219e+01]\n", + " [-8.0723800e-02 1.8924624e+00 -2.6913931e+00 ... -4.4056824e+01\n", + " -2.0001854e+01 -2.9799681e+01]\n", + " ...\n", + " [-1.9819270e-01 1.9175959e-01 1.8926021e-01 ... -2.7737719e-01\n", + " 5.7191163e-01 -1.5962012e-01]\n", + " [-2.5673387e+00 1.7033563e+00 2.2882986e+00 ... -1.0788559e+01\n", + " 6.3817674e-01 8.2335520e-01]\n", + " [-1.7806959e-01 8.9493655e-02 -1.9538833e-01 ... 3.1075442e+00\n", + " -2.6218265e-01 -1.1863230e+00]]\n", "hf_kproj_grads_before_rotary: (24, 64, 12)\n", "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", " 3.2884139e-01 3.6066702e-01]\n", @@ -446,15 +542,98 @@ " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... -1.0776262e+01\n", " 6.2166649e-01 8.3755457e-01]\n", " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", - " -2.6652411e-01 -1.1917179e+00]]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_752415/3838509285.py:163: UserWarning: The use of `x.T` on tensors of dimension other than 2 to reverse their shape is deprecated and it will throw an error in a future release. Consider `x.mT` to transpose batches of matrices or `x.permute(*torch.arange(x.ndim - 1, -1, -1))` to reverse the dimensions of a tensor. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3571.)\n", - " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n" + " -2.6652411e-01 -1.1917179e+00]]\n", + "ff_kproj_pre: (24, 64, 12)\n", + "[[-1.57300810e-02 -4.11607850e-02 3.05930820e-02 ... 
3.86295587e-01\n", + " 3.28840941e-01 3.60667169e-01]\n", + " [-7.51684189e+00 4.65823793e+00 1.72848415e+01 ... 3.87852402e+01\n", + " 9.68793182e+01 9.84762802e+01]\n", + " [-8.07239790e-02 1.89246774e+00 -2.69139457e+00 ... -4.40568542e+01\n", + " -2.00018616e+01 -2.97996941e+01]\n", + " ...\n", + " [-1.98194161e-01 1.91760257e-01 1.89260900e-01 ... -2.77382791e-01\n", + " 5.71911991e-01 -1.59620658e-01]\n", + " [-2.56733608e+00 1.70335352e+00 2.28829479e+00 ... -1.07885523e+01\n", + " 6.38186097e-01 8.23350966e-01]\n", + " [-1.78069487e-01 8.94933720e-02 -1.95387334e-01 ... 3.10753584e+00\n", + " -2.62182117e-01 -1.18632054e+00]]\n", + "3.9116753472222223% mismatch between HF and FF for kproj (before applying ROPE)\n", + "ff_kproj: (24, 64, 12)\n", + "[[-1.57300810e-02 -4.11607850e-02 3.05930820e-02 ... 3.86295587e-01\n", + " 3.28840941e-01 3.60667169e-01]\n", + " [-2.86135025e+01 -5.58717918e+00 2.93845501e+01 ... 3.87817307e+01\n", + " 9.69005585e+01 9.84696579e+01]\n", + " [ 3.30272818e+00 1.82759121e-01 -1.84967291e+00 ... -4.40522003e+01\n", + " -2.00097523e+01 -2.97879410e+01]\n", + " ...\n", + " [-7.64704790e-02 -1.88917309e-01 3.64301860e-01 ... -2.74931490e-01\n", + " 5.70171654e-01 -1.59856781e-01]\n", + " [ 2.57801986e+00 -1.81525516e+00 2.50875449e+00 ... -1.07762566e+01\n", + " 6.21675968e-01 8.37550282e-01]\n", + " [-6.83238800e-02 1.75684214e-01 -3.23107153e-01 ... 3.12022066e+00\n", + " -2.66523540e-01 -1.19171536e+00]]\n", + "3.9008246527777777% mismatch between HF and FF for kproj (after applying ROPE)\n", + "HF Qproj:\n", + "torch.Size([24, 768])\n", + "\t reshaped: (24, 64, 12)\n", + "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n", + " 0.0000000e+00 0.0000000e+00]\n", + " [-2.1439367e-03 3.2949597e-03 -2.9551555e-04 ... 2.4234168e-01\n", + " 4.3675169e-02 -9.2218071e-02]\n", + " [ 2.2399018e+00 -3.3713050e+00 -9.7703063e-01 ... 1.4206999e+01\n", + " -1.9386978e+00 -1.7756876e+01]\n", + " ...\n", + " [ 8.7195921e+00 1.2150297e+01 9.2796574e+00 ... 4.7496593e+01\n", + " -2.7162397e+00 -2.6841351e+01]\n", + " [ 2.8459630e+00 -2.0782030e+01 5.8126745e+00 ... 3.3043846e+01\n", + " -1.4574212e+01 -4.2649174e+01]\n", + " [-7.3419094e-02 -4.3298864e+00 2.0055656e+00 ... -1.4900026e+00\n", + " -9.0601617e-01 2.9582092e-01]]\n", + "FF Qproj:\n", + "(24, 64, 12)\n", + "[[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00\n", + " 0.00000000e+00 0.00000000e+00]\n", + " [-2.14390700e-03 3.29491400e-03 -2.95521000e-04 ... 2.42338002e-01\n", + " 4.36745360e-02 -9.22166560e-02]\n", + " [ 2.23990273e+00 -3.37130690e+00 -9.77032721e-01 ... 1.42070026e+01\n", + " -1.93870103e+00 -1.77568874e+01]\n", + " ...\n", + " [ 8.71960449e+00 1.21503038e+01 9.27967071e+00 ... 4.74966431e+01\n", + " -2.71619344e+00 -2.68413410e+01]\n", + " [ 2.84595203e+00 -2.07820034e+01 5.81268263e+00 ... 3.30439415e+01\n", + " -1.45741787e+01 -4.26492157e+01]\n", + " [-7.34183120e-02 -4.32989836e+00 2.00555873e+00 ... -1.48999298e+00\n", + " -9.06009376e-01 2.95819134e-01]]\n", + "hf_attn_in: torch.Size([1, 24, 768])\n", + "hf_attn_in: (768, 24)\n", + "[[-7.52523500e+06 -1.27625415e+03 -4.39338150e+01 ... -3.34414902e+01\n", + " 2.38160934e+01 3.15938339e+01]\n", + " [-9.55138900e+06 6.71377197e+02 2.06871887e+02 ... -3.86393509e+01\n", + " 2.14816055e+01 -6.58599396e+01]\n", + " [ 1.14522670e+07 2.19898975e+03 -6.89673233e+00 ... 9.51593590e+00\n", + " -1.68612709e+01 6.02474251e+01]\n", + " ...\n", + " [ 2.10891925e+06 3.78648706e+03 1.02701221e+03 ... 
3.59794388e+01\n", + " 5.03902206e+01 4.19777756e+01]\n", + " [ 2.11695300e+06 -2.36283508e+02 -1.08002625e+02 ... 9.36443710e+00\n", + " 3.84094887e+01 -7.51948738e+00]\n", + " [ 7.39155050e+06 1.11731885e+03 3.38369843e+02 ... 3.70399475e+01\n", + " 1.77629051e+01 9.76780853e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.52523500e+06 -1.27625269e+03 -4.39337921e+01 ... -3.34414406e+01\n", + " 2.38161297e+01 3.15938721e+01]\n", + " [-9.55138800e+06 6.71377197e+02 2.06871750e+02 ... -3.86393204e+01\n", + " 2.14817352e+01 -6.58599167e+01]\n", + " [ 1.14522680e+07 2.19898877e+03 -6.89653015e+00 ... 9.51589775e+00\n", + " -1.68612289e+01 6.02473717e+01]\n", + " ...\n", + " [ 2.10891825e+06 3.78648633e+03 1.02701196e+03 ... 3.59794769e+01\n", + " 5.03901863e+01 4.19778595e+01]\n", + " [ 2.11695250e+06 -2.36283737e+02 -1.08002808e+02 ... 9.36445141e+00\n", + " 3.84095154e+01 -7.51950741e+00]\n", + " [ 7.39155000e+06 1.11731885e+03 3.38369934e+02 ... 3.70399170e+01\n", + " 1.77628460e+01 9.76780930e+01]]\n", + "4.817708333333334% mismatch in attention input grads\n" ] }, { @@ -464,7 +643,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[19], line 267\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhf_kproj_grads_before_rotary: \u001b[39m\u001b[38;5;124m\"\u001b[39m, hf_kproj_grads_before_rotary\u001b[38;5;241m.\u001b[39mshape)\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28mprint\u001b[39m(hf_kproj_grads_before_rotary[:,:,\u001b[38;5;241m0\u001b[39m])\n\u001b[0;32m--> 267\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(np\u001b[38;5;241m.\u001b[39mallclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-2\u001b[39m))\n\u001b[1;32m 268\u001b[0m hf_kproj_grads \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/bwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.self_attn.k_proj.go_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 269\u001b[0m hf_kproj_grads \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(hf_kproj_grads)\u001b[38;5;241m.\u001b[39msqueeze()\n", + "Cell \u001b[0;32mIn[45], line 353\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpct_mismatch\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% mismatch in attention input grads\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(pct_mismatch \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.05\u001b[39m)\n\u001b[0;32m--> 353\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } @@ -720,23 +899,62 @@ " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", " assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", " \n", + " ########################################## ROPE and Kproj ##########################################\n", + "\n", " # Compare FF kproj with intermediate 
kproj data from HF\n", " hf_kproj_grads_post_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_post_rotary.go_0\"\n", - " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary).squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", - " # print(hf_kproj_grads_post_rotary[0,:,:])\n", - " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", - " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", - " # print(\"ff_kproj: \", ff_kproj.shape)\n", - " # print(ff_kproj[:,:,0])\n", - " assert(np.allclose(ff_kproj, hf_kproj_grads_post_rotary, atol=1e-2))\n", - "\n", - " # Compare HF before and Kproj out gradients\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", + " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # Check hf ROPE \n", + " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", + " cos = cos.cuda()\n", + " sin = sin.cuda()\n", + " # query_states: torch.Size([1, 12, 24, 64])\n", + " # key_states: torch.Size([1, 12, 24, 64])\n", + " # position_ids: torch.Size([1, 24])\n", + " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " # 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", + " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", + " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", + " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[:,:,0])\n", + " \n", " hf_kproj_grads_before_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_before_rotary.go_0\"\n", - " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary).squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", + " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", " print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", " print(hf_kproj_grads_before_rotary[:,:,0])\n", - " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " # Compare HF rope with manual ROPE\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " # Compare HF Kproj with FF Kproj (before ROPE) \n", + " ff_kproj_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj_pre\"\n", + " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " print(ff_kproj_pre[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " mismatches = 
[(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " \n", + " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " print(\"ff_kproj: \", ff_kproj.shape)\n", + " print(ff_kproj[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " \n", + " \n", + " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", " hf_kproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.k_proj.go_0\"\n", " hf_kproj_grads = torch.load(hf_kproj_grads).squeeze()\n", " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", @@ -745,6 +963,8 @@ " #print(reshaped_tensor.shape)\n", " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", "\n", + " ########################################## Qproj (with ROPE) ##########################################\n", + "\n", " # Compare QProj\n", " hf_qproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.q_proj.go_0\"\n", " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", @@ -780,19 +1000,91 @@ " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", " assert(pct_mismatch <= 0.05)\n", " \n", - " assert(np.allclose(hf_kproj_grads, ff_kProjGrads, atol=1e-2))\n", - " assert(np.allclose(hf_qproj_grads, ff_qProjGrads, atol=1e-2))\n", - " # print(hf_qproj_grads.shape)\n", - " # print(hf_kproj_grads)\n", - " # print()\n", - " # print(ff_qProjGrads)\n", - " # print(ff_kProjGrads.shape)\n", - " \n", - " \n", "\n", " assert False" ] }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([12, 24, 64])\n", + "tensor([[-1.5730e-02, -4.1161e-02, 3.0593e-02, ..., 3.8630e-01,\n", + " 3.2884e-01, 3.6067e-01],\n", + " [-2.8613e+01, -5.5872e+00, 2.9385e+01, ..., 3.8782e+01,\n", + " 9.6901e+01, 9.8470e+01],\n", + " [ 3.3027e+00, 1.8276e-01, -1.8497e+00, ..., -4.4052e+01,\n", + " -2.0010e+01, -2.9788e+01],\n", + " ...,\n", + " [-7.6471e-02, -1.8892e-01, 3.6430e-01, ..., -2.7493e-01,\n", + " 5.7017e-01, -1.5986e-01],\n", + " [ 2.5780e+00, -1.8153e+00, 2.5088e+00, ..., -1.0776e+01,\n", + " 6.2167e-01, 8.3755e-01],\n", + " [-6.8324e-02, 1.7568e-01, -3.2311e-01, ..., 3.1202e+00,\n", + " -2.6652e-01, -1.1917e+00]])\n", + "(24, 64, 12)\n", + "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", + " 3.2884139e-01 3.6066702e-01]\n", + " [-2.8613457e+01 -5.5871558e+00 2.9384506e+01 ... 
3.8781765e+01\n", + " 9.6900581e+01 9.8469597e+01]\n", + " [ 3.3027239e+00 1.8275940e-01 -1.8496730e+00 ... -4.4052174e+01\n", + " -2.0009745e+01 -2.9787930e+01]\n", + " ...\n", + " [-7.6470733e-02 -1.8891659e-01 3.6430117e-01 ... -2.7492592e-01\n", + " 5.7017130e-01 -1.5985624e-01]\n", + " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... -1.0776262e+01\n", + " 6.2166649e-01 8.3755457e-01]\n", + " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", + " -2.6652411e-01 -1.1917179e+00]]\n" + ] + } + ], + "source": [ + "# value states: torch.Size([1, 12, 24, 64])\n", + "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", + "key_states = value_states\n", + "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", + "# query_states: torch.Size([1, 12, 24, 64])\n", + "# key_states: torch.Size([1, 12, 24, 64])\n", + "# position_ids: torch.Size([1, 24])\n", + "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + "query_states = torch.zeros([1, 12, 24, 64])\n", + "position_ids = torch.arange(24).unsqueeze(0)\n", + "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", + "key_states = key_states.squeeze()\n", + "print(key_states.shape)\n", + "print(key_states[0,:,:])\n", + "print(hf_kproj_grads_before_rotary.shape)\n", + "print(hf_kproj_grads_before_rotary[:,:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23]], device='cuda:0')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.arange(24).unsqueeze(0).cuda()" + ] + }, { "cell_type": "code", "execution_count": null, From ab6a33f362f9c404a41ec7e749848959b6a93f4f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 19 Dec 2023 14:27:30 -0500 Subject: [PATCH 06/11] backup --- tests/peft/alignment_tests.ipynb | 346 ++++++++++++++++--------------- 1 file changed, 176 insertions(+), 170 deletions(-) diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb index a9382b9524..e2a8978ea3 100644 --- a/tests/peft/alignment_tests.ipynb +++ b/tests/peft/alignment_tests.ipynb @@ -217,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -225,25 +225,151 @@ "output_type": "stream", "text": [ "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.0.self_attn.o_proj.output_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_attention_shard-id_0_output_0\n", - "HF: [ 0. 0. 0. ... 0.02364488 -0.00304312\n", - " -0.01649825]\n", - "FF:[ 0. 0. 0. ... 0.02200473 0.01693928\n", - " -0.02354377]\n", - "[ True True True ... True False True]\n", - "[ 1541 1543 1545 ... 
18427 18428 18430]\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[8], line 10\u001b[0m\n\u001b[1;32m 8\u001b[0m hf_attn_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.self_attn.o_proj.output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 9\u001b[0m ff_attn_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layer-name_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_attention_shard-id_0_output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 10\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_attn_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_attn_out\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m hf_ffn_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.post_attention_layernorm.output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 12\u001b[0m ff_ffn_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layer-name_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_ffn_norm_shard-id_0_output_1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "Cell \u001b[0;32mIn[2], line 27\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mAssertionError\u001b[0m: 
" + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" ] } ], @@ -312,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -368,7 +494,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -451,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -510,129 +636,9 @@ "Ok!\n", "Ok!\n", "Ok!\n", - "(64, 12, 24)\n", - "(64, 12, 24)\n", - "torch.Size([12, 24, 64])\n", - "torch.Size([12, 64, 24])\n", "4.383680555555555% mismatch in QK prods softmax out grad\n", - "hf_kproj_grads_post_rotary: (24, 64, 12)\n", - "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", - " 3.2884139e-01 3.6066702e-01]\n", - " [-7.5168266e+00 4.6582484e+00 1.7284815e+01 ... 3.8785275e+01\n", - " 9.6879341e+01 9.8476219e+01]\n", - " [-8.0723800e-02 1.8924624e+00 -2.6913931e+00 ... -4.4056824e+01\n", - " -2.0001854e+01 -2.9799681e+01]\n", - " ...\n", - " [-1.9819270e-01 1.9175959e-01 1.8926021e-01 ... -2.7737719e-01\n", - " 5.7191163e-01 -1.5962012e-01]\n", - " [-2.5673387e+00 1.7033563e+00 2.2882986e+00 ... -1.0788559e+01\n", - " 6.3817674e-01 8.2335520e-01]\n", - " [-1.7806959e-01 8.9493655e-02 -1.9538833e-01 ... 3.1075442e+00\n", - " -2.6218265e-01 -1.1863230e+00]]\n", - "hf_kproj_grads_before_rotary: (24, 64, 12)\n", - "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", - " 3.2884139e-01 3.6066702e-01]\n", - " [-2.8613457e+01 -5.5871558e+00 2.9384506e+01 ... 3.8781765e+01\n", - " 9.6900581e+01 9.8469597e+01]\n", - " [ 3.3027239e+00 1.8275940e-01 -1.8496730e+00 ... -4.4052174e+01\n", - " -2.0009745e+01 -2.9787930e+01]\n", - " ...\n", - " [-7.6470733e-02 -1.8891659e-01 3.6430117e-01 ... -2.7492592e-01\n", - " 5.7017130e-01 -1.5985624e-01]\n", - " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... 
-1.0776262e+01\n", - " 6.2166649e-01 8.3755457e-01]\n", - " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", - " -2.6652411e-01 -1.1917179e+00]]\n", - "ff_kproj_pre: (24, 64, 12)\n", - "[[-1.57300810e-02 -4.11607850e-02 3.05930820e-02 ... 3.86295587e-01\n", - " 3.28840941e-01 3.60667169e-01]\n", - " [-7.51684189e+00 4.65823793e+00 1.72848415e+01 ... 3.87852402e+01\n", - " 9.68793182e+01 9.84762802e+01]\n", - " [-8.07239790e-02 1.89246774e+00 -2.69139457e+00 ... -4.40568542e+01\n", - " -2.00018616e+01 -2.97996941e+01]\n", - " ...\n", - " [-1.98194161e-01 1.91760257e-01 1.89260900e-01 ... -2.77382791e-01\n", - " 5.71911991e-01 -1.59620658e-01]\n", - " [-2.56733608e+00 1.70335352e+00 2.28829479e+00 ... -1.07885523e+01\n", - " 6.38186097e-01 8.23350966e-01]\n", - " [-1.78069487e-01 8.94933720e-02 -1.95387334e-01 ... 3.10753584e+00\n", - " -2.62182117e-01 -1.18632054e+00]]\n", "3.9116753472222223% mismatch between HF and FF for kproj (before applying ROPE)\n", - "ff_kproj: (24, 64, 12)\n", - "[[-1.57300810e-02 -4.11607850e-02 3.05930820e-02 ... 3.86295587e-01\n", - " 3.28840941e-01 3.60667169e-01]\n", - " [-2.86135025e+01 -5.58717918e+00 2.93845501e+01 ... 3.87817307e+01\n", - " 9.69005585e+01 9.84696579e+01]\n", - " [ 3.30272818e+00 1.82759121e-01 -1.84967291e+00 ... -4.40522003e+01\n", - " -2.00097523e+01 -2.97879410e+01]\n", - " ...\n", - " [-7.64704790e-02 -1.88917309e-01 3.64301860e-01 ... -2.74931490e-01\n", - " 5.70171654e-01 -1.59856781e-01]\n", - " [ 2.57801986e+00 -1.81525516e+00 2.50875449e+00 ... -1.07762566e+01\n", - " 6.21675968e-01 8.37550282e-01]\n", - " [-6.83238800e-02 1.75684214e-01 -3.23107153e-01 ... 3.12022066e+00\n", - " -2.66523540e-01 -1.19171536e+00]]\n", "3.9008246527777777% mismatch between HF and FF for kproj (after applying ROPE)\n", - "HF Qproj:\n", - "torch.Size([24, 768])\n", - "\t reshaped: (24, 64, 12)\n", - "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n", - " 0.0000000e+00 0.0000000e+00]\n", - " [-2.1439367e-03 3.2949597e-03 -2.9551555e-04 ... 2.4234168e-01\n", - " 4.3675169e-02 -9.2218071e-02]\n", - " [ 2.2399018e+00 -3.3713050e+00 -9.7703063e-01 ... 1.4206999e+01\n", - " -1.9386978e+00 -1.7756876e+01]\n", - " ...\n", - " [ 8.7195921e+00 1.2150297e+01 9.2796574e+00 ... 4.7496593e+01\n", - " -2.7162397e+00 -2.6841351e+01]\n", - " [ 2.8459630e+00 -2.0782030e+01 5.8126745e+00 ... 3.3043846e+01\n", - " -1.4574212e+01 -4.2649174e+01]\n", - " [-7.3419094e-02 -4.3298864e+00 2.0055656e+00 ... -1.4900026e+00\n", - " -9.0601617e-01 2.9582092e-01]]\n", - "FF Qproj:\n", - "(24, 64, 12)\n", - "[[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00\n", - " 0.00000000e+00 0.00000000e+00]\n", - " [-2.14390700e-03 3.29491400e-03 -2.95521000e-04 ... 2.42338002e-01\n", - " 4.36745360e-02 -9.22166560e-02]\n", - " [ 2.23990273e+00 -3.37130690e+00 -9.77032721e-01 ... 1.42070026e+01\n", - " -1.93870103e+00 -1.77568874e+01]\n", - " ...\n", - " [ 8.71960449e+00 1.21503038e+01 9.27967071e+00 ... 4.74966431e+01\n", - " -2.71619344e+00 -2.68413410e+01]\n", - " [ 2.84595203e+00 -2.07820034e+01 5.81268263e+00 ... 3.30439415e+01\n", - " -1.45741787e+01 -4.26492157e+01]\n", - " [-7.34183120e-02 -4.32989836e+00 2.00555873e+00 ... -1.48999298e+00\n", - " -9.06009376e-01 2.95819134e-01]]\n", - "hf_attn_in: torch.Size([1, 24, 768])\n", - "hf_attn_in: (768, 24)\n", - "[[-7.52523500e+06 -1.27625415e+03 -4.39338150e+01 ... -3.34414902e+01\n", - " 2.38160934e+01 3.15938339e+01]\n", - " [-9.55138900e+06 6.71377197e+02 2.06871887e+02 ... 
-3.86393509e+01\n", - " 2.14816055e+01 -6.58599396e+01]\n", - " [ 1.14522670e+07 2.19898975e+03 -6.89673233e+00 ... 9.51593590e+00\n", - " -1.68612709e+01 6.02474251e+01]\n", - " ...\n", - " [ 2.10891925e+06 3.78648706e+03 1.02701221e+03 ... 3.59794388e+01\n", - " 5.03902206e+01 4.19777756e+01]\n", - " [ 2.11695300e+06 -2.36283508e+02 -1.08002625e+02 ... 9.36443710e+00\n", - " 3.84094887e+01 -7.51948738e+00]\n", - " [ 7.39155050e+06 1.11731885e+03 3.38369843e+02 ... 3.70399475e+01\n", - " 1.77629051e+01 9.76780853e+01]]\n", - "ff_attn_in: (768, 24)\n", - "[[-7.52523500e+06 -1.27625269e+03 -4.39337921e+01 ... -3.34414406e+01\n", - " 2.38161297e+01 3.15938721e+01]\n", - " [-9.55138800e+06 6.71377197e+02 2.06871750e+02 ... -3.86393204e+01\n", - " 2.14817352e+01 -6.58599167e+01]\n", - " [ 1.14522680e+07 2.19898877e+03 -6.89653015e+00 ... 9.51589775e+00\n", - " -1.68612289e+01 6.02473717e+01]\n", - " ...\n", - " [ 2.10891825e+06 3.78648633e+03 1.02701196e+03 ... 3.59794769e+01\n", - " 5.03901863e+01 4.19778595e+01]\n", - " [ 2.11695250e+06 -2.36283737e+02 -1.08002808e+02 ... 9.36445141e+00\n", - " 3.84095154e+01 -7.51950741e+00]\n", - " [ 7.39155000e+06 1.11731885e+03 3.38369934e+02 ... 3.70399170e+01\n", - " 1.77628460e+01 9.76780930e+01]]\n", "4.817708333333334% mismatch in attention input grads\n" ] }, @@ -643,7 +649,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[45], line 353\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpct_mismatch\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% mismatch in attention input grads\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(pct_mismatch \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.05\u001b[39m)\n\u001b[0;32m--> 353\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "Cell \u001b[0;32mIn[11], line 353\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpct_mismatch\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% mismatch in attention input grads\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(pct_mismatch \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.05\u001b[39m)\n\u001b[0;32m--> 353\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } @@ -829,10 +835,10 @@ " ##############################\n", " hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", - " print(hf_value_states.shape)\n", + " # print(hf_value_states.shape)\n", " ff_value_states = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_vcache\"\n", " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", - " print(ff_value_states.shape)\n", + " # 
print(ff_value_states.shape)\n", " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", " \n", " \n", @@ -852,8 +858,8 @@ " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", " ff_value_states = torch.from_numpy(ff_value_states)\n", " ff_value_states = ff_value_states.permute(1,0,2)\n", - " print(ff_attn_heads_grads.shape)\n", - " print(ff_value_states.shape)\n", + " # print(ff_attn_heads_grads.shape)\n", + " # print(ff_value_states.shape)\n", " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", " #print(\"Simulated QK prods grads:\")\n", @@ -905,8 +911,8 @@ " hf_kproj_grads_post_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_post_rotary.go_0\"\n", " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", - " print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", " # Check hf ROPE \n", " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", " cos = cos.cuda()\n", @@ -926,15 +932,15 @@ " hf_kproj_grads_before_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_before_rotary.go_0\"\n", " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", - " print(hf_kproj_grads_before_rotary[:,:,0])\n", + " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " # print(hf_kproj_grads_before_rotary[:,:,0])\n", " # Compare HF rope with manual ROPE\n", " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", " # Compare HF Kproj with FF Kproj (before ROPE) \n", " ff_kproj_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj_pre\"\n", " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", - " print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", - " print(ff_kproj_pre[:,:,0])\n", + " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " #print(ff_kproj_pre[:,:,0])\n", " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", @@ -944,8 +950,8 @@ " \n", " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", - " print(\"ff_kproj: \", ff_kproj.shape)\n", - " print(ff_kproj[:,:,0])\n", + " # print(\"ff_kproj: \", ff_kproj.shape)\n", + " #print(ff_kproj[:,:,0])\n", " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", " 
mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", @@ -968,30 +974,30 @@ " # Compare QProj\n", " hf_qproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.q_proj.go_0\"\n", " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", - " print(\"HF Qproj:\")\n", - " print(hf_qproj_grads.shape)\n", + " # print(\"HF Qproj:\")\n", + " # print(hf_qproj_grads.shape)\n", " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", - " print(\"\\t reshaped: \", reshaped_tensor.shape)\n", - " print(reshaped_tensor[:,:,0])\n", + " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " # print(reshaped_tensor[:,:,0])\n", " ff_qproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", - " print(\"FF Qproj:\")\n", - " print(ff_qproj.shape)\n", - " print(ff_qproj[:,:,0])\n", + " # print(\"FF Qproj:\")\n", + " # print(ff_qproj.shape)\n", + " # print(ff_qproj[:,:,0])\n", " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", "\n", " hf_attn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.input_layernorm.go_0\"\n", " hf_attn_in = torch.load(hf_attn_in)\n", - " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", " hf_attn_in = hf_attn_in.squeeze().T\n", " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", - " print(\"hf_attn_in: \", hf_attn_in.shape)\n", - " print(hf_attn_in)\n", + " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " # print(hf_attn_in)\n", "\n", " ff_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_attn_final_grad_in\"\n", " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", - " print(\"ff_attn_in: \", ff_attn_in.shape)\n", - " print(ff_attn_in)\n", + " # print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " # print(ff_attn_in)\n", " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", "\n", " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", @@ -1006,7 +1012,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1066,7 +1072,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [ { From 886d04fde3a3938a187ea4d0897809800848bb40 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 26 Dec 2023 10:58:03 -0500 Subject: [PATCH 07/11] backup --- .../ops/add_bias_residual_layer_norm.h | 2 - inference/models/opt.cc | 17 +++-- src/ops/add_bias_residual_layer_norm.cc | 25 ++---- src/ops/add_bias_residual_layer_norm.cu | 62 ++++++--------- src/ops/inc_multihead_self_attention.cc | 2 +- src/ops/layer_norm.cu | 18 ----- src/ops/linear.cc | 10 ++- src/ops/lora_linear.cc | 4 +- src/ops/residual_layer_norm.cc | 22 +++--- src/ops/residual_layer_norm.cu | 76 +++++++++++-------- src/ops/softmax.cc | 2 +- 11 files changed, 109 insertions(+), 131 deletions(-) diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 5c4a49f998..38bb825a4d 100644 --- 
a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -124,7 +124,6 @@ class AddBiasResidualLayerNorm : public Op { T const *output_grad_ptr, T *input_grad_ptr, T *residual_grad_ptr, - T *attn_bias_grad_ptr, T const *gamma_ptr, ffStream_t stream); static void @@ -132,7 +131,6 @@ class AddBiasResidualLayerNorm : public Op { GenericTensorAccessorR const &output_grad, GenericTensorAccessorW &input_grad, GenericTensorAccessorW const &residual_grad, - GenericTensorAccessorW const &attn_bias_grad, GenericTensorAccessorR const &gamma); public: diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9069aef9e1..fa3bc29041 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -193,7 +193,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor fc1 = ff.dense(final_norm, opt_config.ffn_dim, - AC_MODE_NONE, + AC_MODE_RELU, true, DT_NONE, nullptr, @@ -202,8 +202,8 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers_" + std::to_string(i) + "_fc1").c_str()); - Tensor activation = ff.relu(fc1, false); - fc2 = ff.dense(activation, + //Tensor activation = ff.relu(fc1, false); + fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, true, @@ -216,17 +216,18 @@ void OPT::create_opt_model(FFModel &ff, std::string("layers_" + std::to_string(i) + "_fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer ff.lora_linear( - activation, + fc1, fc2, OP_LORA_MLP_SECOND, std::string("layers_" + std::to_string(i) + "_fc2_lora").c_str()); } // final + Tensor final_residual_ln_output[2] = {nullptr, nullptr}; ff.residual_layer_norm(added, fc2, nullptr, - res_ln_outputs, + final_residual_ln_output, false, axes, opt_config.layer_norm_elementwise_affine, @@ -234,9 +235,8 @@ void OPT::create_opt_model(FFModel &ff, true, DT_NONE, "final_layer_norm"); - Tensor all_final_norm = res_ln_outputs[1]; - Tensor lm_head = ff.dense(all_final_norm, + Tensor lm_head = ff.dense(final_residual_ln_output[1], opt_config.vocab_size, AC_MODE_NONE, false, @@ -255,7 +255,8 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ true); } else { // output = ff.arg_top_k(lm_head, /*k=*/1, false); - output = ff.argmax(lm_head, /*beam_Search*/ false); + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } //------------------- compile the model -------------------------------- diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index be7b357f23..65247939b9 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -931,7 +931,7 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -939,18 +939,10 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); - // attn bias grad - launcher.add_region_requirement( - RegionRequirement(batch_inputs[2]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[2]->region_grad)); - launcher.add_field(field_id++, FID_DATA); if (elementwise_affine) { // gamma launcher.add_region_requirement(RegionRequirement(weights[0]->part, @@ -1001,14 +993,6 @@ void AddBiasResidualLayerNorm::peft_bwd_task( ctx, runtime); - GenericTensorAccessorW attn_bias_grad = - helperGetGenericTensorAccessorRW(m->weight_type[0], - regions[region_idx++], - task->regions[task_region_idx++], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorR gamma; if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); @@ -1019,14 +1003,15 @@ void AddBiasResidualLayerNorm::peft_bwd_task( ctx, runtime); } + std::string op_name_without_uid = AddBiasResidualLayerNorm::get_op_name_without_uid(m); + std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl; AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( - m, output_grad, input_grad, residual_grad, attn_bias_grad, gamma); + m, output_grad, input_grad, residual_grad, gamma); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector weights_accessors; - weights_accessors.push_back(attn_bias_grad); if (m->elementwise_affine) { weights_accessors.push_back(gamma); } diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 097ace3676..08e3bb3edf 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -101,9 +101,9 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) + val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) ? shared[lid] - : 0; + : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -536,8 +536,9 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T const *__restrict__ rstd, T const *__restrict__ gamma, T *dX, - T *dX_residual1, - T *dX_residual2, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, int const N, T *buf) { auto const i1 = blockIdx.x; @@ -549,9 +550,7 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T const *X_i = X + i1 * N; T const *dY_i = dY + i1 * N; T *dX_i = dX + i1 * N; - T *dX_residual1_i = dX_residual1 + i1 * N; - T *dX_residual2_i = - (dX_residual2 != nullptr) ? 
dX_residual2 + i1 * N : nullptr; + T *dX_residual_i = dX_residual + i1 * N; // vectorized reads don't improve perf, so use regular unrolling for (; l + unroll - 1 < N; l += blockDim.x * unroll) { @@ -592,10 +591,15 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, f_grad_input -= (x - mean_val) * rstd_val * stats_x2; f_grad_input -= stats_x1; f_grad_input *= term1; - dX_i[l] += f_grad_input; - dX_residual1_i[l] += f_grad_input; - if (dX_residual2 != nullptr) { - dX_residual2_i[l] += f_grad_input; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; } } } @@ -607,13 +611,14 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, T const *__restrict__ rstd, T const *__restrict__ gamma, T *dX, - T *dX_residual1, - T *dX_residual2, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, int const N) { alignas(sizeof(double)) extern __shared__ char s_data1[]; T *buf = reinterpret_cast(&s_data1); - compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); + compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual, reset_input_grad, reset_residual_grad, N, buf); } /*static*/ @@ -661,7 +666,8 @@ void AddBiasResidualLayerNorm::backward_kernel( gamma_ptr, input_grad_ptr, residual_grad_ptr, - attn_bias_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { @@ -764,29 +770,11 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel( T const *output_grad_ptr, T *input_grad_ptr, T *residual_grad_ptr, - T *attn_bias_grad_ptr, T const *gamma_ptr, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); @@ -799,7 +787,8 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel( gamma_ptr, input_grad_ptr, residual_grad_ptr, - attn_bias_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], N); } @@ -809,7 +798,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( GenericTensorAccessorR const &output_grad, GenericTensorAccessorW &input_grad, GenericTensorAccessorW const &residual_grad, - GenericTensorAccessorW const &attn_bias_grad, GenericTensorAccessorR const &gamma) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -825,7 +813,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( output_grad.get_float_ptr(), input_grad.get_float_ptr(), residual_grad.get_float_ptr(), - attn_bias_grad.get_float_ptr(), m->elementwise_affine ? gamma.get_float_ptr() : nullptr, stream); } else if (m->output_type[0] == DT_HALF) { @@ -833,7 +820,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( output_grad.get_half_ptr(), input_grad.get_half_ptr(), residual_grad.get_half_ptr(), - attn_bias_grad.get_half_ptr(), m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, stream); } else { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 569b35097d..562824d7d5 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -935,7 +935,7 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 6e12c53230..1d4e94d7d5 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -664,24 +664,6 @@ void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index e71be3bbf4..a4e9ba5ce1 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -564,6 +564,7 @@ FutureMap Linear::inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { + printf("\tentering inference for %s\n", name); ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -617,10 +618,14 @@ void Linear::inference_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + printf("\tEntering inference task\n"); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + std::string op_name_without_uid = Linear::get_op_name_without_uid(m); + printf("FWD %s\n", op_name_without_uid.c_str()); + bc->print(); if (bc->num_tokens == 0) { return; } @@ -700,7 +705,7 @@ FutureMap Linear::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); @@ -757,6 +762,9 @@ void Linear::peft_bwd_task(Task const *task, int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + std::string op_name_without_uid = Linear::get_op_name_without_uid(m); + std::cout << "BWD " << op_name_without_uid << std::endl; + int num_infr_tokens = bc->num_active_infr_tokens(); int num_peft_tokens = bc->num_active_peft_tokens(); if (m->inference_debugging) { diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 9ed411397d..e39b444af4 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -589,14 +589,14 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(1, FID_DATA); diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index c142e47e62..ce24415291 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -723,7 +723,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -731,7 +731,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -740,7 +740,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[2] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -768,9 +768,7 @@ void ResidualLayerNorm::peft_bwd_task( } assert(task->regions.size() == regions.size()); ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == - 4 + m->use_two_residuals + - (m->elementwise_affine ? (m->use_bias ? 
3 : 2) : 0)); + assert(regions.size() == 3 + m->use_two_residuals + m->elementwise_affine); int region_idx = 0, task_region_idx = 0; @@ -807,14 +805,16 @@ void ResidualLayerNorm::peft_bwd_task( } GenericTensorAccessorR gamma; if (m->elementwise_affine) { - assert(m->use_bias == (regions.size() == 6)); - gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + gamma = helperGetGenericTensorAccessorRO(m->weight_type[0], regions[region_idx++], task->regions[task_region_idx++], FID_DATA, ctx, runtime); } + std::string op_name_without_uid = ResidualLayerNorm::get_op_name_without_uid(m); + std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl; + ResidualLayerNorm::peft_bwd_kernel_wrapper( m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); @@ -942,12 +942,14 @@ void ResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); + std::string op_name_without_uid = ResidualLayerNorm::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; if (bc->num_tokens == 0) { + bc->print(); return; } - ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == 4 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 4bfac1887f..0b6624c4ab 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -239,36 +239,34 @@ void ResidualLayerNorm::inference_kernel_wrapper( } assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int in_dim = - added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->input_activation = allocator->allocate_instance_untyped( data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + printf("Allocating input_activation (%p) of size: %i*%i*%i=%i for %s...\n", m->input_activation, data_type_size(m->input_type[0]), num_peft_tokens,in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, m->op_name); // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_float_ptr() + tokens_previous_requests * in_dim, + added_output.get_float_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); } else if (m->input_type[0] == DT_HALF) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_half_ptr() + tokens_previous_requests * in_dim, + added_output.get_half_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); @@ -481,6 
+479,9 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T *dX, T *dX_residual1, T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, int const N, T *buf) { auto const i1 = blockIdx.x; @@ -535,10 +536,22 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, f_grad_input -= (x - mean_val) * rstd_val * stats_x2; f_grad_input -= stats_x1; f_grad_input *= term1; - dX_i[l] += f_grad_input; - dX_residual1_i[l] += f_grad_input; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } if (dX_residual2 != nullptr) { - dX_residual2_i[l] += f_grad_input; + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } } } } @@ -552,11 +565,13 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, T *dX, T *dX_residual1, T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, int const N) { alignas(sizeof(double)) extern __shared__ char s_data1[]; T *buf = reinterpret_cast(&s_data1); - - compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); + compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, reset_input_grad, reset_residual_grad1, reset_residual_grad2, N, buf); } /*static*/ @@ -604,6 +619,9 @@ void backward_kernel(ResidualLayerNormMeta const *m, input_grad_ptr, residual1_grad_ptr, residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { @@ -710,28 +728,23 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); + + sleep(10); + printf("Attempting to access %p\n", m->input_activation); + check_device_vs_host_ptr(static_cast(m->input_activation)); + check_device_vs_host_ptr(static_cast(m->mean_ptr)); + check_device_vs_host_ptr(static_cast(m->rstd_ptr)); + check_device_vs_host_ptr(static_cast(gamma_ptr)); + check_device_vs_host_ptr(static_cast(input_grad_ptr)); + check_device_vs_host_ptr(static_cast(residual1_grad_ptr)); + sleep(10); + assert(false); + layer_norm_grad_input_kernel<<>>( output_grad_ptr, static_cast(m->input_activation), @@ -741,6 +754,9 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, input_grad_ptr, residual1_grad_ptr, residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], N); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 23f2eb9edf..8313273c49 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -411,7 +411,7 @@ FutureMap Softmax::peft_bwd(FFModel const &ff, launcher.add_region_requirement( 
RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); From 66c66f2f1baf313ab9a192f4b502c380ca1c1b01 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 1 Jan 2024 10:52:17 -0500 Subject: [PATCH 08/11] backup --- .../flexflow/ops/kernels/softmax_kernels.h | 3 +- include/flexflow/ops/residual_layer_norm.h | 1 + inference/models/opt.cc | 2 + src/ops/add_bias_residual_layer_norm.cc | 13 ++-- src/ops/fused.cc | 18 +++++ src/ops/fused.cu | 70 ++++++++----------- src/ops/inc_multihead_self_attention.cc | 12 ++-- src/ops/kernels/softmax.cu | 13 +++- src/ops/linear.cc | 5 +- src/ops/lora_linear.cc | 2 + src/ops/residual_layer_norm.cc | 25 +++++-- src/ops/residual_layer_norm.cu | 5 +- src/ops/residual_rms_norm.cc | 6 +- src/ops/softmax.cc | 22 ++++-- 14 files changed, 124 insertions(+), 73 deletions(-) diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index db5e9799e9..b3dfe4f430 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -39,7 +39,8 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad); void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 35ddb171d4..d924132452 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -28,6 +28,7 @@ class ResidualLayerNorm : public Op { float _eps, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index fa3bc29041..28ab2aea7d 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -224,6 +224,8 @@ void OPT::create_opt_model(FFModel &ff, // final Tensor final_residual_ln_output[2] = {nullptr, nullptr}; + // ff.residual_rms_norm(added, fc2, final_residual_ln_output, 1e-05, opt_config.hidden_size, + // DT_NONE, "final_layer_norm"); ff.residual_layer_norm(added, fc2, nullptr, diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 65247939b9..a8a9e05e3d 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -618,12 +618,15 @@ void AddBiasResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { - return; - } + AddBiasResidualLayerNormMeta *m = *((AddBiasResidualLayerNormMeta **)task->local_args); + std::string op_name_without_uid = AddBiasResidualLayerNorm::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; + if (bc->num_tokens == 0) { + return; + } assert(regions.size() == 5 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); @@ -945,11 +948,11 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( launcher.add_field(field_id++, FID_DATA); if (elementwise_affine) { // gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, + launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - weights[0]->region)); + weights[1]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index ea1c970cc5..632c331e1f 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -487,6 +487,11 @@ FutureMap FusedOp::inference(FFModel const &ff, // so we transfer the maximum of them // size_t batch_config_size = // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); + printf("FUSED! INFERENCE! %i ops\n", numOperators); + for (int i=0; iop_type << " " << oppp->name << std::endl; + } IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -528,6 +533,19 @@ FutureMap FusedOp::inference(FFModel const &ff, batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } + offset += numOutputs; + // add softmax output grad + if (operators[numOperators-1]->op_type == OP_SOFTMAX) { + printf("operator %i is last SOFTMAX! adding output %i\n", numOperators-1, numOutputs-1); + assert(outputs[numOutputs-1]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[numOutputs-1]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[numOutputs-1]->region_grad)); + launcher.add_field(offset, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 9954a8b43a..25f15d8efd 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -95,8 +95,9 @@ __host__ void assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); + bool softmax_grad_additional_region = (fused->op_op_type[fused->numOperators-1] == OP_SOFTMAX); assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); + fused->numInputs + fused->numWeights + fused->numOutputs + softmax_grad_additional_region); // Domain input_domain[MAX_NUM_INPUTS]; // Domain weight_domain[MAX_NUM_WEIGHTS]; // Domain output_domain[MAX_NUM_OUTPUTS]; @@ -141,6 +142,7 @@ __host__ void ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -625,9 +627,19 @@ __host__ void assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators -1) { // if this is the final operator + printf("op %i is softmax! 
Accessing region %i\n", fused->numOperators -1, roff); + output_accessor[fused->numOutputs] = + helperGetGenericTensorAccessorWO(fused->output_data_types[fused->numOutputs-1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; Kernels::Softmax::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + m, bc, my_input_accessor[0], my_output_accessor[0], output_accessor[fused->numOutputs]); break; } case OP_ALLREDUCE: { @@ -1008,7 +1020,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, bc, - my_output_grad_accessor[0], + my_output_grad_accessor[1], my_input_grad_accessor[0], my_input_grad_accessor[1], my_weight_accessor[0]); @@ -1078,27 +1090,20 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_weights[op] == 2); // weight + bias } } - GenericTensorAccessorR residual2; + GenericTensorAccessorW residual2; if (m->use_two_residuals) { residual2 = my_input_grad_accessor[2]; } - GenericTensorAccessorR gamma, beta; + GenericTensorAccessorR gamma; if (m->elementwise_affine) { gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } } - // TODO: implment me - assert(false); - // ResidualLayerNorm::inference_kernel_wrapper(m, - // my_input_accessor[0], - // my_input_accessor[1], - // residual2, - // my_output_accessor[0], - // my_output_accessor[1], - // gamma, - // beta); + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { @@ -1115,31 +1120,16 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias } } - GenericTensorAccessorR gamma, beta; + GenericTensorAccessorR gamma; if (m->elementwise_affine) { gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_grad_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - // TODO: implement me - assert(false); - // AddBiasResidualLayerNorm::inference_kernel_wrapper( - // m, - // attn_bias_dim, - // residual_volume, - // my_input_accessor[0], - // my_output_accessor[0], - // my_output_accessor[1], - // my_input_accessor[1], - // my_weight_accessor[0], - // gamma, - // beta); + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); break; } case OP_SIGMOID_SILU_MULTI: { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 562824d7d5..2491634a76 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -818,12 +818,16 @@ void IncMultiHeadSelfAttention::inference_task( log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", bc->num_tokens, bc->num_active_requests()); - if (bc->num_tokens == 0) { - return; - } + IncMultiHeadSelfAttentionMeta *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); + std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; + + if (bc->num_tokens == 0) { + return; + } 
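// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] The Softmax changes later in this
// patch make the inference kernel copy the forward softmax probabilities into
// the output-grad region. A minimal sketch of why that helps PEFT
// fine-tuning, assuming the loss is sparse categorical cross-entropy: the
// gradient of that loss w.r.t. the logits is `probs - one_hot(target)`, so if
// the probabilities already sit in the grad buffer, the backward kernel only
// has to subtract 1 at each token's target class (this is what
// sparse_categorical_crossentropy_loss_peft_backward does later in the
// series). The kernel below is an illustrative sketch only; the names `grad`,
// `probs`, and `token_ids` are assumptions for this note, not FlexFlow APIs.
__global__ void sparse_xent_grad_from_softmax(float *grad,
                                              float const *probs,
                                              int const *token_ids,
                                              int num_tokens,
                                              int num_classes) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_tokens * num_classes) {
    grad[i] = probs[i]; // start from p(class | token)
    if (i % num_classes == token_ids[i / num_classes]) {
      grad[i] -= 1.0f; // subtract 1 at the ground-truth class
    }
  }
}
// ---------------------------------------------------------------------------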
assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 : regions.size() == 3)); @@ -860,8 +864,6 @@ void IncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); - std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); - std::cout << "INF " << op_name_without_uid << std::endl; IncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, task->index_point.point_data[0], input, weight, output, biases); diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 115461c129..1624c0458d 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -121,7 +121,8 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -138,6 +139,11 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_float_ptr(), num_classes, stream); + checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); } else if (m->output_type[0] == DT_HALF) { Internal::inference_kernel(m, bc, @@ -145,6 +151,11 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_half_ptr(), num_classes, stream); + checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + cudaMemcpyDeviceToDevice, + stream)); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index a4e9ba5ce1..595b8d24e9 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -564,7 +564,6 @@ FutureMap Linear::inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { - printf("\tentering inference for %s\n", name); ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -618,14 +617,12 @@ void Linear::inference_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - printf("\tEntering inference task\n"); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); std::string op_name_without_uid = Linear::get_op_name_without_uid(m); - printf("FWD %s\n", op_name_without_uid.c_str()); - bc->print(); + printf("INF %s\n", op_name_without_uid.c_str()); if (bc->num_tokens == 0) { return; } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index e39b444af4..fb13dc99cb 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -449,6 +449,8 @@ void LoraLinear::inference_task(Task const *task, Context ctx, Runtime *runtime) { LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_tokens() == 0) { return; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index ce24415291..7697613ae0 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -117,7 
+117,6 @@ void FFModel::residual_layer_norm(const Tensor input, } int num_weights = elementwise_affine ? (use_bias ? 2 : 1) : 0; - Layer *ln = nullptr; Tensor casted_input = (data_type != input->data_type) ? cast(input, data_type, "type cast for residual_layer_norm") @@ -133,7 +132,7 @@ void FFModel::residual_layer_norm(const Tensor input, ? cast(residual2, data_type, "type cast for residual2_layer_norm") : residual2; } - ln = new Layer(this, + Layer *ln = new Layer(this, OP_RESIDUAL_LAYERNORM, data_type, name, @@ -144,9 +143,9 @@ void FFModel::residual_layer_norm(const Tensor input, casted_residual1, casted_residual2); ln->outputs[0] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 0, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 0, true /*create_grad*/); ln->outputs[1] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 1, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 1, true /*create_grad*/); { int numdims = axes.size(); int dims[numdims]; @@ -326,6 +325,18 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, } } +void ResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); +} + void ResidualLayerNorm::init_inference( FFModel const &ff, std::vector const &batch_inputs, @@ -439,11 +450,11 @@ void ResidualLayerNorm::init(FFModel const &ff) { launcher.add_field(field_id++, FID_DATA); // residual2 if (use_two_residuals) { - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[1]->region)); + inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } // added: input + residual(s) @@ -946,7 +957,7 @@ void ResidualLayerNorm::inference_task( std::string op_name_without_uid = ResidualLayerNorm::get_op_name_without_uid(m); std::cout << "INF " << op_name_without_uid << std::endl; if (bc->num_tokens == 0) { - bc->print(); + printf("Zero tokens\n"); return; } diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 0b6624c4ab..2164616b88 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -734,7 +734,6 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); - sleep(10); printf("Attempting to access %p\n", m->input_activation); check_device_vs_host_ptr(static_cast(m->input_activation)); check_device_vs_host_ptr(static_cast(m->mean_ptr)); @@ -742,8 +741,8 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, check_device_vs_host_ptr(static_cast(gamma_ptr)); check_device_vs_host_ptr(static_cast(input_grad_ptr)); check_device_vs_host_ptr(static_cast(residual1_grad_ptr)); - sleep(10); - assert(false); + + return; layer_norm_grad_input_kernel<<>>( output_grad_ptr, diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index aa72d7d32a..ff72b2273a 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -90,9 +90,9 @@ void FFModel::residual_rms_norm(const Tensor input1, casted_input2); rm->outputs[0] = 
create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 0, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 0, true /*create_grad*/); rm->outputs[1] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 1, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 1, true /*create_grad*/); // weights int weight_dims[1] = {dim}; @@ -100,7 +100,7 @@ void FFModel::residual_rms_norm(const Tensor input1, weight_dims, data_type, rm, - true /*create_grad*/, + false /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 8313273c49..700162ade2 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -355,6 +355,13 @@ FutureMap Softmax::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); + // we add the region below in order to copy the output to the grad tensor + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -363,20 +370,26 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3); + assert(task->regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + + std::string op_name_without_uid = Softmax::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; if (bc->num_tokens == 0) { return; } Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - inference_kernel_wrapper(m, bc, input, output); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + inference_kernel_wrapper(m, bc, input, output, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -429,6 +442,7 @@ void Softmax::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + printf("BWD softmax\n"); assert(task->regions.size() == regions.size()); assert(regions.size() == 2); assert(task->regions.size() == 2); From 1f86c29ce21a235fb6eaa0b50ab48d3cefd77313 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 2 Jan 2024 21:55:50 -0500 Subject: [PATCH 09/11] fix --- src/ops/inc_multihead_self_attention.cu | 103 ++++++++++++++---------- 1 file changed, 62 insertions(+), 41 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b1c3db25dc..6bcb6d42ea 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1018,10 +1018,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save result to file for checking - 
std::string filename = base_filepath + "_o_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_, filename.c_str()); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = base_filepath + "_o_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_, filename.c_str()); + } } // Step 2: compute gradients w.r.t. value { @@ -1074,12 +1076,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // save result to file for checking - std::string filename = base_filepath + "_v_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); - std::string filename2 = base_filepath + "_qk_prods_softmax"; - std::cout << "FILENAME: " << filename2 << std::endl; - save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); + if (m->inference_debugging) { + std::string filename = base_filepath + "_v_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); + std::string filename2 = base_filepath + "_qk_prods_softmax"; + std::cout << "FILENAME: " << filename2 << std::endl; + save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); + } } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { @@ -1128,12 +1132,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - std::string filename4 = base_filepath + "_qk_prods_softmax_grad"; - std::cout << "FILENAME: " << filename4 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename4.c_str()); - std::string filename5 = base_filepath + "_vcache"; - std::cout << "FILENAME: " << filename5 << std::endl; - save_tensor(B, m->vProjSize * m->num_q_heads * num_tokens, filename5.c_str()); + if (m->inference_debugging) { + std::string filename4 = base_filepath + "_qk_prods_softmax_grad"; + std::cout << "FILENAME: " << filename4 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename4.c_str()); + std::string filename5 = base_filepath + "_vcache"; + std::cout << "FILENAME: " << filename5 << std::endl; + save_tensor(B, m->vProjSize * m->num_q_heads * num_tokens, filename5.c_str()); + } } // Step 4: softmax backpropagation { @@ -1161,10 +1167,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->qk_tensor, m->qk_prods)); - DT *C = static_cast
<DT *>(m->qk_prods); - std::string filename6 = base_filepath + "_qk_prods_softmax_grad_in"; - std::cout << "FILENAME: " << filename6 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename6.c_str()); + if (m->inference_debugging) { + DT *C = static_cast<DT *>
(m->qk_prods); + std::string filename6 = base_filepath + "_qk_prods_softmax_grad_in"; + std::cout << "FILENAME: " << filename6 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename6.c_str()); + } // TODO: fill all elements above diagonal to force causal attention size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; @@ -1181,9 +1189,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, DT(0.0f)); } - std::string filename7 = base_filepath + "_qk_prods_softmax_grad_in_masked"; - std::cout << "FILENAME: " << filename7 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename7.c_str()); + if (m->inference_debugging) { + DT *C = static_cast<DT *>
(m->qk_prods); + std::string filename7 = base_filepath + "_qk_prods_softmax_grad_in_masked"; + std::cout << "FILENAME: " << filename7 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename7.c_str()); + } } // Step 5: compute gradients w.r.t. key { @@ -1238,12 +1249,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - std::string filename8 = base_filepath + "_query_activation"; - std::cout << "FILENAME: " << filename8 << std::endl; - save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str()); - std::string filename9 = base_filepath + "_devkproj_pre"; - std::cout << "FILENAME: " << filename9 << std::endl; - save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); + if (m->inference_debugging) { + std::string filename8 = base_filepath + "_query_activation"; + std::cout << "FILENAME: " << filename8 << std::endl; + save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str()); + std::string filename9 = base_filepath + "_devkproj_pre"; + std::cout << "FILENAME: " << filename9 << std::endl; + save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); + } } // Step 6: compute gradients w.r.t query { @@ -1294,9 +1307,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - std::string filename3 = base_filepath + "_devQKVPRojArray_pre"; - std::cout << "FILENAME: " << filename3 << std::endl; - save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + if (m->inference_debugging) { + std::string filename3 = base_filepath + "_devQKVPRojArray_pre"; + std::cout << "FILENAME: " << filename3 << std::endl; + save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + } } // Compute rotary embeddings bwd @@ -1318,17 +1333,21 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, num_tokens, m->hidden_size); DT *C = static_cast
(m->devQKVProjArray); - std::string filename3 = base_filepath + "_devQKVPRojArray"; - std::cout << "FILENAME: " << filename3 << std::endl; - save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + if (m->inference_debugging) { + std::string filename3 = base_filepath + "_devQKVPRojArray"; + std::cout << "FILENAME: " << filename3 << std::endl; + save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + } } // matrix C: gradients for key (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] DT *C = static_cast
(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients - std::string filename9 = base_filepath + "_devkproj"; - std::cout << "FILENAME: " << filename9 << std::endl; - save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); + if (m->inference_debugging) { + std::string filename9 = base_filepath + "_devkproj"; + std::cout << "FILENAME: " << filename9 << std::endl; + save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); + } } // Step 7: compute gradients w.r.t. input { @@ -1371,9 +1390,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - std::string filename12 = base_filepath + "_attn_final_grad_in"; - std::cout << "FILENAME: " << filename12 << std::endl; - save_tensor(C, num_tokens * m->qSize, filename12.c_str()); + if (m->inference_debugging) { + std::string filename12 = base_filepath + "_attn_final_grad_in"; + std::cout << "FILENAME: " << filename12 << std::endl; + save_tensor(C, num_tokens * m->qSize, filename12.c_str()); + } } } } From 6a0c899956800ef5bbdc50016a7fc349bc967b7a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 Jan 2024 22:31:25 -0500 Subject: [PATCH 10/11] cleanup --- inference/models/opt.cc | 9 +-- src/ops/add_bias_residual_layer_norm.cc | 11 +-- src/ops/argmax.cc | 5 ++ src/ops/fused.cc | 5 -- src/ops/fused.cu | 1 - src/ops/inc_multihead_self_attention.cc | 50 +------------ src/ops/inc_multihead_self_attention.cu | 96 +------------------------ src/ops/kernels/softmax.cu | 25 ++++--- src/ops/linear.cc | 5 -- src/ops/lora_linear.cc | 2 - src/ops/residual_layer_norm.cc | 6 -- src/ops/residual_layer_norm.cu | 11 --- src/ops/residual_rms_norm.cc | 69 ------------------ src/ops/softmax.cc | 7 +- tests/peft/hf_finetune.py | 6 -- 15 files changed, 28 insertions(+), 280 deletions(-) diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 28ab2aea7d..e0e940b186 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -202,7 +202,6 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers_" + std::to_string(i) + "_fc1").c_str()); - //Tensor activation = ff.relu(fc1, false); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -223,13 +222,10 @@ void OPT::create_opt_model(FFModel &ff, } // final - Tensor final_residual_ln_output[2] = {nullptr, nullptr}; - // ff.residual_rms_norm(added, fc2, final_residual_ln_output, 1e-05, opt_config.hidden_size, - // DT_NONE, "final_layer_norm"); ff.residual_layer_norm(added, fc2, nullptr, - final_residual_ln_output, + res_ln_outputs, false, axes, opt_config.layer_norm_elementwise_affine, @@ -237,8 +233,9 @@ void OPT::create_opt_model(FFModel &ff, true, DT_NONE, "final_layer_norm"); + Tensor all_final_norm = res_ln_outputs[1]; - Tensor lm_head = ff.dense(final_residual_ln_output[1], + Tensor lm_head = ff.dense(all_final_norm, opt_config.vocab_size, AC_MODE_NONE, false, diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index a8a9e05e3d..88a34b7eb5 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -618,16 +618,13 @@ void AddBiasResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - - - AddBiasResidualLayerNormMeta *m = - *((AddBiasResidualLayerNormMeta **)task->local_args); - std::string op_name_without_uid = 
AddBiasResidualLayerNorm::get_op_name_without_uid(m); - std::cout << "INF " << op_name_without_uid << std::endl; if (bc->num_tokens == 0) { return; } + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 5 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); @@ -1006,8 +1003,6 @@ void AddBiasResidualLayerNorm::peft_bwd_task( ctx, runtime); } - std::string op_name_without_uid = AddBiasResidualLayerNorm::get_op_name_without_uid(m); - std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl; AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( m, output_grad, input_grad, residual_grad, gamma); diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index cabb8b204f..dd0e2bb822 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -392,6 +392,11 @@ InferenceResult GenericTensorAccessorW parent; int batch_size = bc->num_active_infr_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + // Note that we free activation allocator here since argmax is the + // last operator in forward + if (m->handle.peft_activation_allocator != nullptr) { + m->handle.peft_activation_allocator->free_all(); + } InferenceResult ir; if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 632c331e1f..e18486289f 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -487,11 +487,6 @@ FutureMap FusedOp::inference(FFModel const &ff, // so we transfer the maximum of them // size_t batch_config_size = // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); - printf("FUSED! INFERENCE! %i ops\n", numOperators); - for (int i=0; iop_type << " " << oppp->name << std::endl; - } IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 25f15d8efd..17586e925f 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -628,7 +628,6 @@ __host__ void assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); if (op == fused->numOperators -1) { // if this is the final operator - printf("op %i is softmax! Accessing region %i\n", fused->numOperators -1, roff); output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO(fused->output_data_types[fused->numOutputs-1], regions[roff], diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 2491634a76..f590fa0440 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -818,17 +818,13 @@ void IncMultiHeadSelfAttention::inference_task( log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", bc->num_tokens, bc->num_active_requests()); - - - IncMultiHeadSelfAttentionMeta *m = - *((IncMultiHeadSelfAttentionMeta **)task->local_args); - std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); - std::cout << "INF " << op_name_without_uid << std::endl; - if (bc->num_tokens == 0) { return; } + IncMultiHeadSelfAttentionMeta *m = + *((IncMultiHeadSelfAttentionMeta **)task->local_args); + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 : regions.size() == 3)); @@ -880,36 +876,6 @@ void IncMultiHeadSelfAttention::inference_task( } } -template -void load_tensor_from_file(DT *ptr, size_t size, std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - std::cout << "Could not open file: " << filepath << std::endl; - } - assert(in.good() && "incorrect weight file path"); - std::vector
host_array(size); - size_t loaded_data_size = sizeof(DT) * size; - in.seekg(0, in.end); - in.seekg(0, in.beg); - in.read((char *)host_array.data(), loaded_data_size); - - size_t in_get_size = in.gcount(); - if (in_get_size != loaded_data_size) { - std::cout << "load weight data error " << in_get_size << ", " - << loaded_data_size << ", " << sizeof(DT) << std::endl; - assert(false); - } - assert(size == host_array.size()); - - copy_tensor_host_to_dev(ptr, host_array.data(), size); - - // // normal - // long data_index = 0; - // for (auto v : host_array) { - // ptr[data_index++] = v; - // } - in.close(); -} FutureMap IncMultiHeadSelfAttention::peft_bwd( FFModel const &ff, @@ -1027,16 +993,6 @@ void IncMultiHeadSelfAttention::peft_bwd_task( assert(task->index_point.get_dim() == 1); - std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); - std::cout << "BWD " << op_name_without_uid << std::endl; - - if (op_name_without_uid == "layers_11_attention") { - load_tensor_from_file( - output_grad.get_float_ptr(), - (output_grad.domain.get_volume()/128)*24, - "/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0.flexflow" - ); - } IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 6bcb6d42ea..3a45ce5da3 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -642,8 +642,6 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->hidden_size); } if (*m->apply_rotary_embedding) { - printf("ROTARY EMBEDDING: num_tokens: %i, q_array_size: %i, m->hidden_size: %i\n", - num_tokens, q_array_size, m->hidden_size); /*q&k*/ parallelism = num_tokens * m->hidden_size; apply_rotary_embedding_hf<<op_name); - size_t last_underscore = op_name_without_uid.length() - 1; - for (int i = op_name_without_uid.length() - 1; i > 0; i--) { - if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { - break; - } else if (m->op_name[i] == '_') { - last_underscore = i; - } - } - op_name_without_uid.erase(last_underscore); - - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_bwd-step_" + std::to_string(m->bwd_step) + - "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + - "_layer-name_" + op_name_without_uid + "_shard-id_" + - std::to_string(shard_id); - - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -1018,12 +996,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - // save result to file for checking - std::string filename = base_filepath + "_o_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_, filename.c_str()); - } } // Step 2: compute gradients w.r.t. 
value { @@ -1075,15 +1047,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save result to file for checking - if (m->inference_debugging) { - std::string filename = base_filepath + "_v_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); - std::string filename2 = base_filepath + "_qk_prods_softmax"; - std::cout << "FILENAME: " << filename2 << std::endl; - save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); - } } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { @@ -1132,14 +1095,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename4 = base_filepath + "_qk_prods_softmax_grad"; - std::cout << "FILENAME: " << filename4 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename4.c_str()); - std::string filename5 = base_filepath + "_vcache"; - std::cout << "FILENAME: " << filename5 << std::endl; - save_tensor(B, m->vProjSize * m->num_q_heads * num_tokens, filename5.c_str()); - } } // Step 4: softmax backpropagation { @@ -1166,14 +1121,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, &beta, m->qk_tensor, m->qk_prods)); - - if (m->inference_debugging) { - DT *C = static_cast
(m->qk_prods); - std::string filename6 = base_filepath + "_qk_prods_softmax_grad_in"; - std::cout << "FILENAME: " << filename6 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename6.c_str()); - } - // TODO: fill all elements above diagonal to force causal attention size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; if (entries_above_diagonal > 0) { @@ -1189,12 +1136,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, DT(0.0f)); } - if (m->inference_debugging) { - DT *C = static_cast
(m->qk_prods); - std::string filename7 = base_filepath + "_qk_prods_softmax_grad_in_masked"; - std::cout << "FILENAME: " << filename7 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename7.c_str()); - } } // Step 5: compute gradients w.r.t. key { @@ -1249,14 +1190,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename8 = base_filepath + "_query_activation"; - std::cout << "FILENAME: " << filename8 << std::endl; - save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str()); - std::string filename9 = base_filepath + "_devkproj_pre"; - std::cout << "FILENAME: " << filename9 << std::endl; - save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); - } } // Step 6: compute gradients w.r.t query { @@ -1276,7 +1209,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // after transposition & striding int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; - int k_ = num_tokens; + int k_ = num_tokens; // before transposition and striding int lda = num_tokens; // num_new_tokens int ldb = m->qProjSize * m->num_q_heads; @@ -1307,19 +1240,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename3 = base_filepath + "_devQKVPRojArray_pre"; - std::cout << "FILENAME: " << filename3 << std::endl; - save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); - } } - // Compute rotary embeddings bwd { if (*m->apply_rotary_embedding) { assert(m->hidden_size == m->qProjSize * m->num_q_heads); assert(m->qProjSize == m->kProjSize); - printf("ROTARY EMBEDDING bwd: num_tokens: %i, m->hidden_size: %i\n", num_tokens, m->hidden_size); /*q&k*/ int parallelism = num_tokens * m->hidden_size; DT *A = static_cast
<DT *>(m->devQKVProjArray); @@ -1332,21 +1258,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->qProjSize, num_tokens, m->hidden_size); - DT *C = static_cast<DT *>
(m->devQKVProjArray); - if (m->inference_debugging) { - std::string filename3 = base_filepath + "_devQKVPRojArray"; - std::cout << "FILENAME: " << filename3 << std::endl; - save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); - } - } - - // matrix C: gradients for key (saved as part of m->devQKVProjArray) - // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast
(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients - if (m->inference_debugging) { - std::string filename9 = base_filepath + "_devkproj"; - std::cout << "FILENAME: " << filename9 << std::endl; - save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); } } // Step 7: compute gradients w.r.t. input @@ -1390,11 +1301,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename12 = base_filepath + "_attn_final_grad_in"; - std::cout << "FILENAME: " << filename12 << std::endl; - save_tensor(C, num_tokens * m->qSize, filename12.c_str()); - } } } } diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 1624c0458d..271a291b09 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -290,11 +290,10 @@ __global__ void sparse_categorical_crossentropy_loss_peft_backward( int num_tokens, int num_classes) { CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { - input_grad[i] = 0.5; - // input_grad[i] = output_grad[i]; - // if (i % num_classes == token_ids[i / num_classes]) { - // input_grad[i] -= 1.0f; - // } + input_grad[i] = output_grad[i]; + if (i % num_classes == token_ids[i / num_classes]) { + input_grad[i] -= 1.0f; + } } } @@ -346,14 +345,14 @@ void peft_bwd_kernel(SoftmaxMeta const *m, num_bwd_tokens, num_classes); // scale - // scale_kernel<<>>(input_grad_ptr + - // tokens_previous_requests * num_classes, - // num_bwd_tokens * num_classes, - // DT(0.0), - // scale_factor); + scale_kernel<<>>(input_grad_ptr + + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); tokens_previous_requests += num_bwd_tokens; } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 595b8d24e9..15789ae2e9 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -621,8 +621,6 @@ void Linear::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - std::string op_name_without_uid = Linear::get_op_name_without_uid(m); - printf("INF %s\n", op_name_without_uid.c_str()); if (bc->num_tokens == 0) { return; } @@ -759,9 +757,6 @@ void Linear::peft_bwd_task(Task const *task, int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; - std::string op_name_without_uid = Linear::get_op_name_without_uid(m); - std::cout << "BWD " << op_name_without_uid << std::endl; - int num_infr_tokens = bc->num_active_infr_tokens(); int num_peft_tokens = bc->num_active_peft_tokens(); if (m->inference_debugging) { diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index fb13dc99cb..e39b444af4 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -449,8 +449,6 @@ void LoraLinear::inference_task(Task const *task, Context ctx, Runtime *runtime) { LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); - std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m); - std::cout << "INF " << op_name_without_uid << std::endl; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_tokens() == 0) { return; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 7697613ae0..d3cf278b35 100644 --- a/src/ops/residual_layer_norm.cc +++ 
b/src/ops/residual_layer_norm.cc @@ -823,9 +823,6 @@ void ResidualLayerNorm::peft_bwd_task( ctx, runtime); } - std::string op_name_without_uid = ResidualLayerNorm::get_op_name_without_uid(m); - std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl; - ResidualLayerNorm::peft_bwd_kernel_wrapper( m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); @@ -954,10 +951,7 @@ void ResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - std::string op_name_without_uid = ResidualLayerNorm::get_op_name_without_uid(m); - std::cout << "INF " << op_name_without_uid << std::endl; if (bc->num_tokens == 0) { - printf("Zero tokens\n"); return; } diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 2164616b88..fe3f695522 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -254,7 +254,6 @@ void ResidualLayerNorm::inference_kernel_wrapper( MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->input_activation = allocator->allocate_instance_untyped( data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); - printf("Allocating input_activation (%p) of size: %i*%i*%i=%i for %s...\n", m->input_activation, data_type_size(m->input_type[0]), num_peft_tokens,in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, m->op_name); // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( @@ -734,16 +733,6 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); - printf("Attempting to access %p\n", m->input_activation); - check_device_vs_host_ptr(static_cast(m->input_activation)); - check_device_vs_host_ptr(static_cast(m->mean_ptr)); - check_device_vs_host_ptr(static_cast(m->rstd_ptr)); - check_device_vs_host_ptr(static_cast(gamma_ptr)); - check_device_vs_host_ptr(static_cast(input_grad_ptr)); - check_device_vs_host_ptr(static_cast(residual1_grad_ptr)); - - return; - layer_norm_grad_input_kernel<<>>( output_grad_ptr, static_cast(m->input_activation), diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index ff72b2273a..9591aedf45 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -673,36 +673,6 @@ Legion::FutureMap return runtime->execute_index_space(ctx, launcher); } -template -void load_tensor_from_file(DT *ptr, size_t size, std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - std::cout << "Could not open file: " << filepath << std::endl; - } - assert(in.good() && "incorrect weight file path"); - std::vector
-  size_t loaded_data_size = sizeof(DT) * size;
-  in.seekg(0, in.end);
-  in.seekg(0, in.beg);
-  in.read((char *)host_array.data(), loaded_data_size);
-
-  size_t in_get_size = in.gcount();
-  if (in_get_size != loaded_data_size) {
-    std::cout << "load weight data error " << in_get_size << ", "
-              << loaded_data_size << ", " << sizeof(DT) << std::endl;
-    assert(false);
-  }
-  assert(size == host_array.size());
-
-  copy_tensor_host_to_dev(ptr, host_array.data(), size);
-
-  // // normal
-  // long data_index = 0;
-  // for (auto v : host_array) {
-  //   ptr[data_index++] = v;
-  // }
-  in.close();
-}
 
 /*
   regions[0](I): RMS output_grad
@@ -742,45 +712,6 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task,
   peft_bwd_kernel_wrapper(
       m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight);
-  // get name
-  std::string op_name_without_uid = ResidualRMSNorm::get_op_name_without_uid(m);
-  std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl;
-  // print shape
-  int numdims = residual_input0_grad.domain.get_dim();
-  std::cout << "in grad dims: ";
-  for (int i=0; i
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index 700162ade2..932b8ade84 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
@@ -373,16 +373,12 @@ void Softmax::inference_task(Task const *task,
   assert(regions.size() == 3);
   assert(task->regions.size() == 3);
   BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
-  SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args);
-
-  std::string op_name_without_uid = Softmax::get_op_name_without_uid(m);
-  std::cout << "INF " << op_name_without_uid << std::endl;
   if (bc->num_tokens == 0) {
     return;
   }
   Domain in_domain = runtime->get_index_space_domain(
       ctx, task->regions[0].region.get_index_space());
-
+  SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args);
   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
       m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
@@ -442,7 +438,6 @@ void Softmax::peft_bwd_task(Task const *task,
                             std::vector<PhysicalRegion> const &regions,
                             Context ctx,
                             Runtime *runtime) {
-  printf("BWD softmax\n");
   assert(task->regions.size() == regions.size());
   assert(regions.size() == 2);
   assert(task->regions.size() == 2);
diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py
index 818e0b9085..7836633b30 100644
--- a/tests/peft/hf_finetune.py
+++ b/tests/peft/hf_finetune.py
@@ -72,8 +72,6 @@ def peft_backward_hook(module, grad_input, grad_output):
             print("\t", go.shape)
             print(f"\t\tSaving to {dst_filepath}")
             torch.save(go, dst_filepath)
-            if dst_filepath == "./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0":
-                go.detach().cpu().numpy().tofile(f"{dst_filepath}.flexflow")
         else:
             print(go)
     print("Backward GRAD Input:")
@@ -83,8 +81,6 @@ def peft_backward_hook(module, grad_input, grad_output):
             print("\t", gi.shape)
             print(f"\t\tSaving to {dst_filepath}")
             torch.save(gi, dst_filepath)
-            if dst_filepath == "./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0" or dst_filepath == "./hf_peft_tensors/bwd_step_0_norm.gi_0":
-                gi.detach().cpu().numpy().tofile(f"{dst_filepath}.flexflow")
         else:
             print(gi)
 
@@ -229,8 +225,6 @@ def main():
             torch.save(params, f"./hf_peft_tensors/{name}")
         if "lm_head" in name or "norm" in name:
            torch.save(params, f"./hf_peft_tensors/{name}")
-        if "down_proj" in name or "self_attn" in name:
-            torch.save(params, f"./hf_peft_tensors/{name}")
 
     # Load fine-tuning dataset
     data = load_dataset("Abirate/english_quotes")

From 0d530b00ef4557360d1dc68fd0a8720a698fb884 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Wed, 10 Jan 2024 03:36:06 +0000
Subject: [PATCH 11/11] linting

---
 src/ops/add_bias_residual_layer_norm.cu | 14 ++++++--
 src/ops/fused.cc                        | 12 ++++---
 src/ops/fused.cu                        | 43 ++++++++++++++-----------
 src/ops/inc_multihead_self_attention.cc |  2 --
 src/ops/residual_layer_norm.cc          | 18 +++++------
 src/ops/residual_layer_norm.cu          | 16 +++++++--
 src/ops/residual_rms_norm.cc            |  3 +-
 src/ops/softmax.cc                      | 11 ++++---
 8 files changed, 74 insertions(+), 45 deletions(-)

diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu
index 08e3bb3edf..ab017ed46c 100644
--- a/src/ops/add_bias_residual_layer_norm.cu
+++ b/src/ops/add_bias_residual_layer_norm.cu
@@ -618,7 +618,17 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY,
   alignas(sizeof(double)) extern __shared__ char s_data1[];
   T *buf = reinterpret_cast<T *>(&s_data1);
-  compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual, reset_input_grad, reset_residual_grad, N, buf);
+  compute_gI(dY,
+             X,
+             mean,
+             rstd,
+             gamma,
+             dX,
+             dX_residual,
+             reset_input_grad,
+             reset_residual_grad,
+             N,
+             buf);
 }
 
 /*static*/
@@ -774,7 +784,7 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel(
     cudaStream_t stream) {
   const int64_t M = m->effective_batch_size;
   const int64_t N = m->effective_num_elements;
-  
+
   int const warp_size = C10_WARP_SIZE;
   int const num_threads = 128;
   const dim3 blocks(M);
diff --git a/src/ops/fused.cc b/src/ops/fused.cc
index e18486289f..8afd61aece 100644
--- a/src/ops/fused.cc
+++ b/src/ops/fused.cc
@@ -530,15 +530,17 @@ FutureMap FusedOp::inference(FFModel const &ff,
   }
   offset += numOutputs;
   // add softmax output grad
-  if (operators[numOperators-1]->op_type == OP_SOFTMAX) {
-    printf("operator %i is last SOFTMAX! adding output %i\n", numOperators-1, numOutputs-1);
-    assert(outputs[numOutputs-1]->region != LogicalRegion::NO_REGION);
+  if (operators[numOperators - 1]->op_type == OP_SOFTMAX) {
+    printf("operator %i is last SOFTMAX! adding output %i\n",
+           numOperators - 1,
+           numOutputs - 1);
+    assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION);
     launcher.add_region_requirement(
-        RegionRequirement(batch_outputs[numOutputs-1]->part_grad,
+        RegionRequirement(batch_outputs[numOutputs - 1]->part_grad,
                           0 /*projection id*/,
                           WRITE_ONLY,
                           EXCLUSIVE,
-                          batch_outputs[numOutputs-1]->region_grad));
+                          batch_outputs[numOutputs - 1]->region_grad));
     launcher.add_field(offset, FID_DATA);
   }
   return runtime->execute_index_space(ctx, launcher);
 }
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 17586e925f..f6bed71f6a 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -95,9 +95,11 @@ __host__ void
   assert(metas->numOperators == fused->numOperators);
   assert(regions.size() == task->regions.size());
-  bool softmax_grad_additional_region = (fused->op_op_type[fused->numOperators-1] == OP_SOFTMAX);
-  assert((int)regions.size() ==
-         fused->numInputs + fused->numWeights + fused->numOutputs + softmax_grad_additional_region);
+  bool softmax_grad_additional_region =
+      (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX);
+  assert((int)regions.size() == fused->numInputs + fused->numWeights +
+                                    fused->numOutputs +
+                                    softmax_grad_additional_region);
   // Domain input_domain[MAX_NUM_INPUTS];
   // Domain weight_domain[MAX_NUM_WEIGHTS];
   // Domain output_domain[MAX_NUM_OUTPUTS];
@@ -627,18 +629,22 @@ __host__ void
       assert(fused->op_num_outputs[op] == 1);
       assert(my_input_accessor[0].domain.get_volume() ==
              my_output_accessor[0].domain.get_volume());
-      if (op == fused->numOperators -1) { // if this is the final operator
-        output_accessor[fused->numOutputs] =
-            helperGetGenericTensorAccessorWO(fused->output_data_types[fused->numOutputs-1],
-                                             regions[roff],
-                                             task->regions[roff],
-                                             FID_DATA,
-                                             ctx,
-                                             runtime);
+      if (op == fused->numOperators - 1) { // if this is the final operator
+        output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO(
+            fused->output_data_types[fused->numOutputs - 1],
+            regions[roff],
+            task->regions[roff],
+            FID_DATA,
+            ctx,
+            runtime);
       }
       SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op];
       Kernels::Softmax::inference_kernel_wrapper(
-          m, bc, my_input_accessor[0], my_output_accessor[0], output_accessor[fused->numOutputs]);
+          m,
+          bc,
+          my_input_accessor[0],
+          my_output_accessor[0],
+          output_accessor[fused->numOutputs]);
       break;
     }
     case OP_ALLREDUCE: {
@@ -1123,12 +1129,13 @@ __host__ void FusedOp::peft_bwd_task(Task const *task,
       if (m->elementwise_affine) {
         gamma = my_weight_accessor[1];
       }
-      
-      AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper(m,
-                                                        my_output_grad_accessor[1],
-                                                        my_input_grad_accessor[0],
-                                                        my_input_grad_accessor[1],
-                                                        gamma);
+
+      AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper(
+          m,
+          my_output_grad_accessor[1],
+          my_input_grad_accessor[0],
+          my_input_grad_accessor[1],
+          gamma);
       break;
     }
     case OP_SIGMOID_SILU_MULTI: {
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index f590fa0440..5d52034575 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -876,7 +876,6 @@ void IncMultiHeadSelfAttention::inference_task(
   }
 }
 
-
 FutureMap IncMultiHeadSelfAttention::peft_bwd(
     FFModel const &ff,
     BatchConfigFuture const &bc,
@@ -993,7 +992,6 @@ void IncMultiHeadSelfAttention::peft_bwd_task(
 
   assert(task->index_point.get_dim() == 1);
 
-
   IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper(
       m,
       bc,
diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc
index d3cf278b35..8563c299ab 100644
--- a/src/ops/residual_layer_norm.cc
+++ b/src/ops/residual_layer_norm.cc
@@ -133,15 +133,15 @@ void FFModel::residual_layer_norm(const Tensor input,
                        : residual2;
   }
   Layer *ln = new Layer(this,
-                         OP_RESIDUAL_LAYERNORM,
-                         data_type,
-                         name,
-                         2 + use_two_residuals /*inputs*/,
-                         num_weights,
-                         2 /*outputs*/,
-                         casted_input,
-                         casted_residual1,
-                         casted_residual2);
+                        OP_RESIDUAL_LAYERNORM,
+                        data_type,
+                        name,
+                        2 + use_two_residuals /*inputs*/,
+                        num_weights,
+                        2 /*outputs*/,
+                        casted_input,
+                        casted_residual1,
+                        casted_residual2);
   ln->outputs[0] = create_tensor_legion_ordering(
       input->num_dims, input->dims, data_type, ln, 0, true /*create_grad*/);
   ln->outputs[1] = create_tensor_legion_ordering(
diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu
index fe3f695522..1f87949234 100644
--- a/src/ops/residual_layer_norm.cu
+++ b/src/ops/residual_layer_norm.cu
@@ -570,7 +570,19 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY,
     int const N) {
   alignas(sizeof(double)) extern __shared__ char s_data1[];
   T *buf = reinterpret_cast<T *>(&s_data1);
-  compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, reset_input_grad, reset_residual_grad1, reset_residual_grad2, N, buf);
+  compute_gI(dY,
+             X,
+             mean,
+             rstd,
+             gamma,
+             dX,
+             dX_residual1,
+             dX_residual2,
+             reset_input_grad,
+             reset_residual_grad1,
+             reset_residual_grad2,
+             N,
+             buf);
 }
 
 /*static*/
@@ -727,7 +739,7 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m,
     cudaStream_t stream) {
   const int64_t M = m->effective_batch_size;
   const int64_t N = m->effective_num_elements;
-  
+
   int const warp_size = C10_WARP_SIZE;
   int const num_threads = 128;
   const dim3 blocks(M);
diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc
index 9591aedf45..c2fbe11544 100644
--- a/src/ops/residual_rms_norm.cc
+++ b/src/ops/residual_rms_norm.cc
@@ -673,7 +673,6 @@ Legion::FutureMap
   return runtime->execute_index_space(ctx, launcher);
 }
 
-
 /*
   regions[0](I): RMS output_grad
   regions[1](I/O): Residual input 0 grad
@@ -711,7 +710,7 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task,
       m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime);
   peft_bwd_kernel_wrapper(
       m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight);
-  
+
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index 932b8ade84..1d062b552b 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
@@ -356,11 +356,12 @@ FutureMap Softmax::inference(FFModel const &ff,
                                                     batch_outputs[0]->region));
   launcher.add_field(1, FID_DATA);
   // we add the region below in order to copy the output to the grad tensor
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[0]->region_grad));
+  launcher.add_region_requirement(
+      RegionRequirement(batch_outputs[0]->part_grad,
+                        0 /*projection id*/,
+                        WRITE_ONLY,
+                        EXCLUSIVE,
+                        batch_outputs[0]->region_grad));
   launcher.add_field(2, FID_DATA);
   return runtime->execute_index_space(ctx, launcher);
 }