From 889224ade3b303b8a83ee4c2ac1d787a9cfe3bd4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 14 Dec 2023 17:15:50 -0500 Subject: [PATCH 01/11] Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. --- inference/incr_decoding/incr_decoding.cc | 32 ++++++++---------------- src/ops/argmax.cc | 5 ---- src/runtime/request_manager.cc | 10 ++------ 3 files changed, 12 insertions(+), 35 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 94ccb1cabf..dcd1b5a5ab 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -138,9 +138,9 @@ void FlexFlow::top_level_task(Task const *task, bool do_sample = false; float temperature = 0.0f; float topp = 0.0f; - int max_requests_per_batch = 2; - int max_tokens_per_batch = 300; - int max_sequence_length = 300; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -272,7 +272,6 @@ void FlexFlow::top_level_task(Task const *task, int total_num_requests = 0; { -#ifdef DEADCODE using json = nlohmann::json; std::ifstream file_handle(file_paths.prompt_file_path); assert(file_handle.good() && "Prompt file does not exist."); @@ -292,26 +291,15 @@ void FlexFlow::top_level_task(Task const *task, inference_req.peft_model_id = peft_model_id; requests.push_back(inference_req); total_num_requests++; - } -#endif - std::vector requests; - for (int i = 0; i < (max_requests_per_batch - 1) * 4; i++) { - Request inference_req; - inference_req.prompt = "b"; - inference_req.max_sequence_length = 40; - requests.push_back(inference_req); + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); + requests.push_back(fine_tuning_req); total_num_requests++; } - // Add a fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 256; - fine_tuning_req.max_training_steps = 256; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair("b", "")); - requests.push_back(fine_tuning_req); - total_num_requests++; - GenerationResult result = model.generate(requests); } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index dd0e2bb822..cabb8b204f 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -392,11 +392,6 @@ InferenceResult GenericTensorAccessorW parent; int batch_size = bc->num_active_infr_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); - // Note that we free activation allocator here since argmax is the - // last operator in forward - if (m->handle.peft_activation_allocator != nullptr) { - m->handle.peft_activation_allocator->free_all(); - } InferenceResult ir; if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1d4a9ee47c..cbb21e03e0 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -246,17 +246,13 @@ RequestManager::RequestGuid request.peft_model_id = request_.peft_model_id; 
request.req_type = Request::REQ_FINETUNING; request.completed_training_steps = 0; - request.max_training_steps = request_.max_training_steps; + request.max_training_steps = 1; // TODO: let user set this for (auto const &sample : request_.dataset_text) { std::vector input_tokens; input_tokens = this->tokenizer_->Encode(sample.first); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { input_tokens.insert(input_tokens.begin(), bos_token_id); } - // FIXME: this is a hack, must undo - while (input_tokens.size() < 256) { - input_tokens.push_back(293); - } std::vector output_tokens = this->tokenizer_->Encode(sample.second); if (input_tokens.size() + output_tokens.size() > @@ -359,7 +355,6 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { - log_req_mgr.print("[Old BC] Num tokens: %d", old_bc.num_tokens); const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { @@ -544,8 +539,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_generation_tokens = num_generation_tokens; // Step 3: add new requests to the next batch if there is space - // FIXME: we reserve one slot for PEFT req now - for (int i = 0; i < BatchConfig::max_requests_per_batch() - 1; i++) { + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { From f01b0560279fef38e8b347b4307e172ce10ae3e0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 14 Dec 2023 17:16:38 -0500 Subject: [PATCH 02/11] backup --- src/ops/inc_multihead_self_attention.cc | 4 + tests/peft/alignment_tests.ipynb | 1308 +++++++++++++++++++++++ tests/peft/qk_prods_alignment.ipynb | 24 + 3 files changed, 1336 insertions(+) create mode 100644 tests/peft/alignment_tests.ipynb create mode 100644 tests/peft/qk_prods_alignment.ipynb diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index ca6eb7c095..d88c7edb81 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -860,6 +860,8 @@ void IncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); + std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; IncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, task->index_point.point_data[0], input, weight, output, biases); @@ -992,6 +994,8 @@ void IncMultiHeadSelfAttention::peft_bwd_task( assert(task->index_point.get_dim() == 1); + std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); + std::cout << "BWD " << op_name_without_uid << std::endl; IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, bc, diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb new file mode 100644 index 0000000000..fc2899b7c4 --- /dev/null +++ b/tests/peft/alignment_tests.ipynb @@ -0,0 +1,1308 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "hf_weight_base_path = 
\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors\"\n", + "ff_weight_base_path = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors\"\n", + "def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2):\n", + " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", + " hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor = ff_tensor[:len_hf_tensor]\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", + " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}\")\n", + " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", + " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #print(np.nonzero(hf_tensor)[0])\n", + " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", + " # print(ff_tensor[36], hf_tensor[36])\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len_hf_tensor)\n", + " print(\"Ok!\")\n", + "def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tensor2_filepath, tolerance=1e-2):\n", + " assert(os.path.exists(hf_tensor_filepath))\n", + " assert(os.path.exists(ff_tensor1_filepath))\n", + " assert(os.path.exists(ff_tensor2_filepath))\n", + " hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor1 = ff_tensor1[:len_hf_tensor]\n", + " ff_tensor2 = ff_tensor2[:len_hf_tensor]\n", + " ff_tensor = ff_tensor1 - ff_tensor2\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", + " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}\")\n", + " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", + " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #print(np.nonzero(hf_tensor)[0])\n", + " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", + " # print(ff_tensor[36], hf_tensor[36])\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len_hf_tensor)\n", + " print(\"Ok!\")\n", + "def compare_hf_tensors(tensor1_fp, tensor2_fp):\n", + " assert(os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", + " hf_tensor1 = torch.load(tensor1_fp)\n", + " hf_tensor2 = torch.load(tensor2_fp)\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " if type(hf_tensor2) == tuple or 
type(hf_tensor2) == list:\n", + " assert(len(hf_tensor2) == 1)\n", + " hf_tensor2 = hf_tensor2[0]\n", + " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", + " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", + " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", + " if not (np.allclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy())):\n", + " print(f\"mismatch between {tensor1_fp} and {tensor2_fp}\")\n", + " print(hf_tensor1)\n", + " print(hf_tensor2)\n", + " print(np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))\n", + " mismatches = np.where(~np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))[0]\n", + " print(mismatches)\n", + " assert(False)\n", + " print(\"Ok!\")\n", + "\n", + "def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp):\n", + " assert(os.path.exists(tensor_sum_fp) and os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", + " hf_tensor_sum = torch.load(tensor_sum_fp)\n", + " hf_tensor1 = torch.load(tensor1_fp)\n", + " hf_tensor2 = torch.load(tensor2_fp)\n", + " if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list:\n", + " assert(len(hf_tensor_sum) == 1)\n", + " hf_tensor_sum = hf_tensor_sum[0]\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " if type(hf_tensor2) == tuple or type(hf_tensor2) == list:\n", + " assert(len(hf_tensor2) == 1)\n", + " hf_tensor2 = hf_tensor2[0]\n", + " assert(torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape)\n", + " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", + " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", + " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", + " hf_tensor_sum = torch.nan_to_num(hf_tensor_sum)\n", + " sum_check_tensor = hf_tensor1 + hf_tensor2\n", + " if not (np.allclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy())):\n", + " print(f\"mismatch between {sum_check_tensor} and {tensor1_fp} + {tensor2_fp}\")\n", + " print(tensor_sum_fp)\n", + " print(sum_check_tensor)\n", + " print(hf_tensor1)\n", + " print(hf_tensor2)\n", + " print(np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))\n", + " mismatches = np.where(~np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))[0]\n", + " print(mismatches)\n", + " assert(False)\n", + " print(\"Ok!\")\n", + "def check_hf_zero_tensor(hf_tensor_fp):\n", + " assert(os.path.exists(hf_tensor_fp))\n", + " hf_tensor1 = torch.load(hf_tensor_fp)\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " assert(torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0)\n", + "def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=\"\"):\n", + " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", + " hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor = ff_tensor[:len_hf_tensor]\n", + "\n", + " print(f\"{txt} - HF tensor:\")\n", + 
" print(hf_tensor)\n", + " print(f\"{txt} - FF tensor: \")\n", + " print(ff_tensor)\n", + "def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_len=-1):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + "\n", + " if (ff_tensor1.shape != ff_tensor2.shape):\n", + " print(ff_tensor1.shape, ff_tensor2.shape)\n", + " assert(ff_tensor1.shape == ff_tensor2.shape)\n", + "\n", + " if max_len > -1:\n", + " ff_tensor1 = ff_tensor1[:max_len]\n", + " ff_tensor2 = ff_tensor2[:max_len]\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")\n", + "def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + " minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0])\n", + " ff_tensor1 = ff_tensor1[:minlen]\n", + " ff_tensor2 = ff_tensor2[:minlen]\n", + " mismatches = []\n", + " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")\n", + "def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + " ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=',')\n", + " \n", + " ff_sum = ff_tensor1 + ff_tensor2\n", + " assert(ff_tensor1.shape == ff_tensor2.shape)\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(f\"Sum Tensor: {ff_tensor_sum}\\nActual sum:{ff_sum}\")\n", + " print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", 
+ "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.input_layernorm.output_0\"\n", + " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_RMSNorm_shard-id_0_output_0\"\n", + " if layer_num > 0:\n", + " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " hf_attn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.o_proj.output_0\"\n", + " ff_attn_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", + " compare_tensors(hf_attn_out, ff_attn_out)\n", + " hf_ffn_norm_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.post_attention_layernorm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " # w1\n", + " hf_gate_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", + " ff_gate_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " compare_tensors(hf_gate_proj_out, ff_gate_proj_out)\n", + " # w3\n", + " hf_up_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\" \n", + " ff_up_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", + " # w2\n", + " hf_down_proj_in = 
f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.input_0\"\n", + " hf_down_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.output_0\"\n", + " ff_down_proj_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", + " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", + " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_weight_base_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_weight_base_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + 
"source": [ + "tot_num_layers = 12\n", + "\n", + "ff_BWD_softmax_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "\n", + "hf_BWD_lm_head_out = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "# compare weights\n", + "hf_lm_head_weight = f\"{hf_weight_base_path}/base_model.model.lm_head.weight\"\n", + "ff_lm_head_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_weight_0\"\n", + "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", + "# # Manually check the matmul\n", + "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", + "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", + "# ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", + "# print(ff_tensor_out.shape)\n", + "# print(ff_weight.shape)\n", + "# print(np.matmul(ff_weight, ff_tensor_out))\n", + "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", + "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + "hf_BWD_norm_out = f\"{hf_weight_base_path}/bwd_step_0_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "ff_BWD_norm_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_weight_0\"\n", + "hf_FWD_norm_weight = f\"{hf_weight_base_path}/base_model.model.model.norm.weight\"\n", + "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", + "hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_SigmoidSiluMulti_shard-id_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 
1.8299303e+01\n", + " 1.3871717e+01 1.8452764e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 1.82993031e+01\n", + " 1.38717194e+01 1.84527588e+00]\n", + "[ True True True ... True True True]\n", + "[2394]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_feed_forward_w2_shard-id_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 1.8299303e+01\n", + " 1.3871717e+01 1.8452764e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 1.82993031e+01\n", + " 1.38717194e+01 1.84527588e+00]\n", + "[ True True True ... True True True]\n", + "[2394]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "(64, 12, 24)\n", + "(64, 12, 24)\n", + "torch.Size([12, 24, 64])\n", + "torch.Size([12, 64, 24])\n", + "3.7760416666666665% mismatch in QK prods softmax out grad\n", + "hf_kproj_grads_post_rotary: (24, 64, 12)\n", + "hf_kproj_grads_before_rotary: (24, 64, 12)\n", + "[[-2.1751599e-01 1.2245592e-01 -2.6237822e-01 ... 1.4371538e+00\n", + " 5.2717543e-01 5.1425427e-01]\n", + " [-7.6055496e+01 4.2463268e+01 -1.2235089e+02 ... 5.3328156e+02\n", + " 2.3810944e+02 1.8990283e+02]\n", + " [ 5.2804117e+00 -4.9826388e+00 4.6240320e+00 ... -5.4525635e+01\n", + " -2.1779711e+01 -3.2857445e+01]\n", + " ...\n", + " [ 1.0541155e+00 -3.1229946e-01 1.4272718e+00 ... -4.6509657e+00\n", + " -2.2930331e+00 2.1488833e-01]\n", + " [ 1.8427576e+00 -5.0031781e-01 2.1591802e+00 ... -8.0996408e+00\n", + " -6.6346103e-01 1.1487092e+00]\n", + " [-3.9699785e-02 1.7903861e-02 -5.9658013e-02 ... 2.4856456e-01\n", + " -5.0553136e-02 -6.9623299e-02]]\n", + "HF Qproj:\n", + "torch.Size([24, 768])\n", + "\t reshaped: (24, 64, 12)\n", + "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n", + " 0.0000000e+00 0.0000000e+00]\n", + " [-2.1439369e-03 3.2949594e-03 -2.9551802e-04 ... 2.4234147e-01\n", + " 4.3675132e-02 -9.2217997e-02]\n", + " [ 2.9682016e+00 -4.1166668e+00 -1.5612273e+00 ... 1.8131609e+01\n", + " -2.7311683e+00 -2.3451160e+01]\n", + " ...\n", + " [ 7.9408998e+00 -1.6016111e+01 7.5070286e+00 ... 6.9805992e+01\n", + " -8.9288340e+00 -5.6585381e+01]\n", + " [ 5.9755993e+00 -1.2562438e+01 9.3722830e+00 ... 5.6924896e+01\n", + " 1.6420145e+00 -2.7360382e+01]\n", + " [ 2.9259295e+00 -8.8997393e+00 5.6537924e+00 ... 4.0085789e+01\n", + " -5.5427680e+00 -3.3319279e+01]]\n", + "FF Qproj:\n", + "(24, 64, 12)\n", + "[[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00\n", + " 0.00000000e+00 0.00000000e+00]\n", + " [-2.14390800e-03 3.29491800e-03 -2.95515000e-04 ... 2.42337957e-01\n", + " 4.36745250e-02 -9.22166630e-02]\n", + " [ 2.96819830e+00 -4.11666203e+00 -1.56122601e+00 ... 1.81315899e+01\n", + " -2.73117018e+00 -2.34511394e+01]\n", + " ...\n", + " [ 7.94090462e+00 -1.60161247e+01 7.50703382e+00 ... 6.98059998e+01\n", + " -8.92883396e+00 -5.65854073e+01]\n", + " [ 5.97561932e+00 -1.25624638e+01 9.37229633e+00 ... 5.69249115e+01\n", + " 1.64204872e+00 -2.73603287e+01]\n", + " [ 2.92593479e+00 -8.89975548e+00 5.65379906e+00 ... 4.00858383e+01\n", + " -5.54277229e+00 -3.33193245e+01]]\n", + "hf_attn_in: torch.Size([1, 24, 768])\n", + "hf_attn_in: (768, 24)\n", + "[[-7.5252225e+06 -1.2484900e+03 5.3961243e+01 ... 
-3.3743629e+01\n", + " -2.8661375e+00 -1.2124748e+00]\n", + " [-9.5513660e+06 1.8450066e+03 3.8372406e+02 ... -1.9933952e+01\n", + " 1.4622488e+01 -2.4410028e+00]\n", + " [ 1.1452265e+07 2.1254619e+03 -4.8265629e+01 ... 4.8204151e+01\n", + " -1.4841021e+01 -1.6505869e+01]\n", + " ...\n", + " [ 2.1089132e+06 2.8605874e+03 1.2375667e+03 ... 2.6102766e+01\n", + " 3.1422745e+01 6.7668297e+01]\n", + " [ 2.1169400e+06 -4.6361523e+02 -1.6561864e+02 ... -5.3914165e+00\n", + " -6.0169220e-02 2.2841328e+01]\n", + " [ 7.3915345e+06 8.9268884e+02 5.4528040e+02 ... 6.2017624e+01\n", + " 1.3753588e+01 5.2149849e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.52522050e+06 -1.24848975e+03 5.39611511e+01 ... -3.37436867e+01\n", + " -2.86611795e+00 -1.21241117e+00]\n", + " [-9.55136800e+06 1.84500635e+03 3.83724091e+02 ... -1.99339561e+01\n", + " 1.46225519e+01 -2.44094014e+00]\n", + " [ 1.14522650e+07 2.12546313e+03 -4.82656937e+01 ... 4.82041969e+01\n", + " -1.48411064e+01 -1.65059376e+01]\n", + " ...\n", + " [ 2.10891300e+06 2.86058789e+03 1.23756726e+03 ... 2.61027851e+01\n", + " 3.14227238e+01 6.76683807e+01]\n", + " [ 2.11693950e+06 -4.63614868e+02 -1.65618515e+02 ... -5.39132690e+00\n", + " -6.02092740e-02 2.28413010e+01]\n", + " [ 7.39153300e+06 8.92689453e+02 5.45280640e+02 ... 6.20176048e+01\n", + " 1.37535381e+01 5.21498528e+01]]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 5\u001b[0m line \u001b[0;36m3\n\u001b[1;32m 300\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mff_attn_in: \u001b[39m\u001b[39m\"\u001b[39m, ff_attn_in\u001b[39m.\u001b[39mshape)\n\u001b[1;32m 301\u001b[0m \u001b[39mprint\u001b[39m(ff_attn_in)\n\u001b[0;32m--> 302\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_attn_in, hf_attn_in, atol\u001b[39m=\u001b[39m\u001b[39m1e-2\u001b[39m))\n\u001b[1;32m 304\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 306\u001b[0m hf_kproj_grads_in \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mhf_weight_base_path\u001b[39m}\u001b[39;00m\u001b[39m/bwd_step_0_layers.\u001b[39m\u001b[39m{\u001b[39;00mlayer_num\u001b[39m}\u001b[39;00m\u001b[39m.self_attn.k_proj.gi_0\u001b[39m\u001b[39m\"\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers-1, -1, -1):\n", + " # HuggingFace filepaths\n", + " hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", + " hf_BWD_loraB_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.go_0\"\n", + " hf_BWD_loraB_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.gi_0\"\n", + " hf_BWD_loraA_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.go_0\"\n", + " hf_BWD_loraA_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.gi_0\"\n", + " hf_loraA_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " hf_loraB_weight = 
f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " hf_BWD_lora_dropout_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.go_0\"\n", + " hf_BWD_lora_dropout_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.gi_0\"\n", + " hf_BWD_w2_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.go_0\"\n", + " hf_BWD_w2_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.gi_0\"\n", + " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", + " hf_BWD_w3_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.go_0\"\n", + " hf_BWD_w3_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.gi_0\"\n", + " hf_BWD_w1_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.go_0\"\n", + " hf_BWD_w1_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.gi_0\"\n", + " hf_BWD_act_fn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.gi_0\"\n", + " hf_BWD_act_fn_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.go_0\"\n", + " hf_BWD_ffn_norm_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.go_0\"\n", + " hf_BWD_ffn_norm_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.gi_0\"\n", + " hf_BWD_attn_out_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.go_0\"\n", + " hf_BWD_attn_q_in = f\"{hf_weight_base_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", + " hf_FWD_w1_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", + " hf_FWD_w3_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\"\n", + " hf_FWD_act_fn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.act_fn.output_0\"\n", + " hf_BWD_attn_oproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + " hf_attn_qproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.q_proj.weight\"\n", + " hf_attn_kproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.k_proj.weight\"\n", + " hf_attn_vproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.v_proj.weight\"\n", + " hf_attn_oproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.o_proj.weight\"\n", + " # hf_BWD_attn_vproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + " # FlexFlow filepaths\n", + " ff_BWD_w2_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", + " ff_BWD_w2_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", + " ff_BWD_w2_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_pre_input_0\"\n", + " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " ff_BWD_ssm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_output_0\"\n", 
+ " ff_BWD_ssm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_0\"\n", + " ff_BWD_ssm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_1\"\n", + " ff_BWD_w3_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " ff_BWD_w3_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_input_0\"\n", + " ff_BWD_lora_A_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " ff_BWD_lora_B_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " ff_lora_A_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " ff_lora_B_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " ff_BWD_w1_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " ff_BWD_w1_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_input_0\"\n", + " ff_BWD_w1_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_pre_input_0\"\n", + " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " ff_BWD_ffn_norm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_0\"\n", + " ff_BWD_ffn_norm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_1\"\n", + " ff_BWD_ffn_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_0\"\n", + " ff_BWD_attn_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", + " ff_BWD_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_input_0\"\n", + " ff_BWD_ssm_cached_w1_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w1_output\"\n", + " ff_BWD_ssm_cached_w3_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w3_output\"\n", + " ff_FWD_w1_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " ff_FWD_w3_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " ff_FWD_act_fnc_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_act_fn_output\"\n", + " ff_BWD_attn_o_proj_in = 
f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + " # ff_BWD_attn_v_proj_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", + " ff_attn_oproj_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0_weight_0\"\n", + " # ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + " # xxx = torch.load(hf_BWD_attn_out_out)\n", + " # xxx.detach().cpu().numpy().tofile(f\"{hf_BWD_attn_out_out}.flexflow\")\n", + " # print(f\"{hf_BWD_attn_out_out}.flexflow\")\n", + " \n", + " # HuggingFace checks\n", + " print(\"\\nHuggingface checks:\")\n", + " if layer_num == tot_num_layers-1:\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", + " # compare_hf_tensors(hf_BWD_w3_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", + " check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", + " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", + "\n", + " # FlexFlow checks\n", + " print(\"\\nFlexFlow checks:\")\n", + " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", + " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", + " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768) # should fail\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in2, ff_BWD_attn_out, max_len=24*768)\n", + "\n", + " # HF-FlexFlow checks\n", + " print(\"\\nHuggingface-FlexFlow checks:\")\n", + " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " #print(torch.load(hf_w2_weight).shape)\n", + " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", + " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", + "\n", + " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", + " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + "\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", + "\n", + " compare_tensors(hf_FWD_w1_out, ff_FWD_w1_out)\n", + " compare_tensors(hf_FWD_w3_out, ff_FWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " # compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n", + " # compare_tensors(hf_BWD_ffn_norm_in, ff_BWD_ffn_norm_in2)\n", + " # compare_tensors(hf_BWD_attn_out_out, ff_BWD_ffn_norm_in2)\n", + " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", + "\n", + " # compare attn weight tensors\n", + " hidden_size = 768\n", + " qProjSize 
= 64\n", + " num_heads = 12\n", + " num_new_tokens = num_tokens = 24\n", + " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", + " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", + " \n", + " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", + " \n", + " assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", + " \n", + " # Compare attn outproj grad in tensors\n", + " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", + " \n", + " ########### Compare value projs grads ######################\n", + " # 1. compare qk prods softmax\n", + " hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.qk_prods_softmax\"\n", + " ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + " \n", + " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", + " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + "\n", + " for head_idx in range(num_heads):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + " \n", + " # 2. compare attn heads grads\n", + " hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + "\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", + " assert(np.allclose(ff_attn_heads_grads, hf_attn_heads_grads, atol=1e-2))\n", + "\n", + " # 3. 
vproj grads\n", + " hf_vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.go_0\"\n", + " ff_vproj_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", + "\n", + " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", + " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", + " assert(np.allclose(hf_vproj_grads, ff_vproj_grads, atol=1e-2))\n", + "\n", + " \n", + " \n", + " \n", + " ##############################\n", + " hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " print(hf_value_states.shape)\n", + " ff_value_states = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_vcache\"\n", + " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", + " print(ff_value_states.shape)\n", + " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", + " \n", + " \n", + " \n", + " ########## Compare key and query projs grads ##################\n", + " ff_devQKVPRojArray = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", + " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", + " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", + " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", + " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", + " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", + "\n", + " # simulate qk_prods_softmax\n", + " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", + " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", + " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", + " ff_value_states = torch.from_numpy(ff_value_states)\n", + " ff_value_states = ff_value_states.permute(1,0,2)\n", + " print(ff_attn_heads_grads.shape)\n", + " print(ff_value_states.shape)\n", + " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", + " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", + " #print(\"Simulated QK prods grads:\")\n", + " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", + "\n", + " # qk prods softmax right before softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " # 
assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", + " assert(pct_mismatch <= 0.05)\n", + "\n", + " # qk prods softmax right after softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.gi_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + " \n", + " # qk prods softmax after mask\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.matmul_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in_masked\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + "\n", + " # Compare query activation\n", + " hf_query_activation = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.query_activation\"\n", + " hf_query_activation = torch.load(hf_query_activation)\n", + " ff_query_activation = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_query_activation\"\n", + " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", + " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", + " \n", + " # Compare FF kproj with intermediate kproj data from HF\n", + " hf_kproj_grads_post_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_post_rotary.go_0\"\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary).squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[0,:,:])\n", + " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj: \", 
ff_kproj.shape)\n", + " # print(ff_kproj[:,:,0])\n", + " assert(np.allclose(ff_kproj, hf_kproj_grads_post_rotary, atol=1e-2))\n", + "\n", + " # Compare HF before and Kproj out gradients\n", + " hf_kproj_grads_before_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_before_rotary.go_0\"\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary).squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " print(hf_kproj_grads_before_rotary[:,:,0])\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " hf_kproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.k_proj.go_0\"\n", + " hf_kproj_grads = torch.load(hf_kproj_grads).squeeze()\n", + " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", + " #print(hf_kproj_grads[:,:64])\n", + " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " #print(reshaped_tensor.shape)\n", + " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " # Compare QProj\n", + " hf_qproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.q_proj.go_0\"\n", + " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", + " print(\"HF Qproj:\")\n", + " print(hf_qproj_grads.shape)\n", + " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " print(reshaped_tensor[:,:,0])\n", + " ff_qproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", + " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", + " print(\"FF Qproj:\")\n", + " print(ff_qproj.shape)\n", + " print(ff_qproj[:,:,0])\n", + " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " hf_attn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.input_layernorm.go_0\"\n", + " hf_attn_in = torch.load(hf_attn_in)\n", + " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " hf_attn_in = hf_attn_in.squeeze().T\n", + " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", + " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " print(hf_attn_in)\n", + "\n", + " ff_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_attn_final_grad_in\"\n", + " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", + " print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " print(ff_attn_in)\n", + " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", + "\n", + " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", + " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", + " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", + " assert(pct_mismatch <= 0.05)\n", + " \n", + " assert(np.allclose(hf_kproj_grads, ff_kProjGrads, atol=1e-2))\n", + " assert(np.allclose(hf_qproj_grads, ff_qProjGrads, atol=1e-2))\n", + " # print(hf_qproj_grads.shape)\n", + " # print(hf_kproj_grads)\n", + " # print()\n", + " # print(ff_qProjGrads)\n", + " # print(ff_kProjGrads.shape)\n", + " \n", + " \n", + "\n", + " 
assert False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 12, 24, 24])\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "layer_num = 11\n", + "hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", + "ff_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + "hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + "\n", + "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", + "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", + "print(hf_qk_prods_softmax.shape)\n", + "#print(ff_qk_prods_softmax.shape)\n", + "#print(hf_qk_prods_softmax[:,:,0])\n", + "#print()\n", + "#print(ff_qk_prods_softmax[:,:,0])\n", + "\n", + "for head_idx in range(12):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + "\n", + "\n", + "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", + "print(hf_value_states.shape)\n", + "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", + "print()\n", + "print(attn_output.shape)\n", + "print(attn_output.transpose(1, 2).contiguous().shape)\n", + "print(\"Hf attn heads\")\n", + "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", + "\n", + "print(\"Attn heads grads:\")\n", + "hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + "print(torch.load(hf_attn_heads_grads).shape)\n", + "print(\"HF value grads:\")\n", + "vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + "print(torch.load(vproj_grads).shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 3, 4])\n", + "torch.Size([4, 3, 2])\n" + ] + } + ], + "source": [ + "a = torch.randn(2,3,4)\n", + "print(a.shape)\n", + "print(a.T.shape)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", + " 0.0000],\n", + " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", + " 39.7619],\n", + " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", + " -160.8711],\n", + " ...,\n", + " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", + " -198.4432],\n", + " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", + " -194.4037],\n", + " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", + " -124.1802]]], device='cuda:0')\n", + "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", + " -1.4912e+05, 3.5769e+06],\n", + " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", + " -2.3540e+01, 3.4587e+02],\n", + " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", + " 5.5099e+01, 5.5910e+01],\n", + " ...,\n", + " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", + " 5.0713e+01, 5.6592e+01],\n", + " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", + " 3.0760e+01, 6.1743e+01],\n", + " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", + " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" + ] + } + ], + "source": [ + "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", + "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", + "a = torch.load(a)\n", + "b = torch.load(b)\n", + "print(a)\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Manual matmul checks\n", + "# ff_w2_grad_out_tensor = np.loadtxt(ff_BWD_w2_out, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w2_weight_tensor = np.loadtxt(ff_w2_weight, delimiter=',').reshape((3072,768), order='F')\n", + "# ff_w2_gradin_tensor = np.matmul(ff_w2_weight_tensor, ff_w2_grad_out_tensor).reshape((3072,128), order='F')\n", + "\n", + "# ff_lora_gradout_tensor = np.loadtxt(ff_BWD_lora_B_out, delimiter=',').reshape((768,128), order='F')\n", + "# ff_lora_A_weight_tensor = np.loadtxt(ff_lora_A_weight, delimiter=',').reshape((3072,16), order='F')\n", + "# ff_lora_B_weight_tensor = np.loadtxt(ff_lora_B_weight, delimiter=',').reshape((16,768), order='F')\n", + "# ff_lora_int_grad_tensor = np.matmul(ff_lora_B_weight_tensor, ff_lora_gradout_tensor)\n", + "# ff_lora_gradint_tensor = np.matmul(ff_lora_A_weight_tensor, ff_lora_int_grad_tensor)\n", + "\n", + "# # ff_w2_gradin_tensor = ff_w2_gradin_tensor + ff_lora_gradint_tensor\n", + "# #print(ff_w2_gradin_tensor[:,:24])\n", + "# print(\"calculated LORA grad in\")\n", + "# print(ff_lora_gradint_tensor[:,:24])\n", + "# # ff_BWD_w2_in_pre_tensor = np.loadtxt(ff_BWD_w2_in_pre, delimiter=',').reshape((3072,128), order='F')\n", + "# ff_BWD_lora_A_in_tensor = np.loadtxt(ff_BWD_lora_A_in, delimiter=',').reshape((3072,128), order='F')\n", + "# print(\"FlexFlow LORA grad in\")\n", + "# print(ff_BWD_lora_A_in_tensor[:,:24])\n", + "# # print(ff_BWD_w2_in_pre_tensor[:,:24])\n", + "# print(\"HF lora grad in\")\n", + "# print(torch.load(hf_BWD_loraA_in).squeeze().T.detach().cpu().numpy())\n", + "# compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + "\n", + "# simulate act_fn_grad\n", + "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')\n", + "# w3_fwd_out_tensor = np.loadtxt(ff_FWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", + "# 
#print(ssm_out_grad_tensor.shape, w3_fwd_out_tensor.shape)\n", + "# act_fn_out_check = np.multiply(ssm_out_grad_tensor, w3_fwd_out_tensor)\n", + "# print(\"simulated act fn out - simulated\")\n", + "# print(act_fn_out_check[:,:24])\n", + "# print(\"simulated act fn out - HF\")\n", + "# print(torch.load(hf_BWD_act_fn_out).detach().cpu().numpy().squeeze().T)\n", + "\n", + "# Simulated w3_grad\n", + "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')[:,:24]\n", + "# act_fnc_out_tensor = np.loadtxt(ff_FWD_act_fnc_out, delimiter=',').reshape((3072,24), order='F')\n", + "# w3_out_gard_check = np.multiply(ssm_out_grad_tensor, act_fnc_out_tensor)\n", + "# print(\"simulated w3 out - FF\")\n", + "# print(w3_out_gard_check)\n", + "# ff_BWD_w3_out_tensor = np.loadtxt(ff_BWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", + "# hf_BWD_w3_out_tensor = torch.load(hf_BWD_w3_out).detach().cpu().numpy().squeeze().T\n", + "# print(\"w3 out, FF\")\n", + "# print(ff_BWD_w3_out_tensor[:,:24])\n", + "# print(\"w3 out, HF\")\n", + "# print(hf_BWD_w3_out_tensor)\n", + "\n", + "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", + "# assert False\n", + "# print()\n", + "# print()\n", + "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", + "# print_tensors(hf_BWD_w3_in, ff_BWD_w3_in, \"w3 in\")\n", + "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out, \"w1 out\")\n", + "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in, \"w1 in\")\n", + "# print_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out, \"ffn norm out\")\n", + "# print_tensors(hf_BWD_ffn_norm_in, ff_BWD_ffn_norm_in2, \"ffn norm in\")\n", + "# print()\n", + "# ff_w1_out_tensor = np.loadtxt(ff_BWD_w1_out, delimiter=',').reshape((3072,128), order='F')\n", + "# ff_w1_in_tensor = np.loadtxt(ff_BWD_w1_in, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w1_in_pre_tensor = np.loadtxt(ff_BWD_w1_in_pre, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w1_only_in_tensor = ff_w1_in_tensor - ff_w1_in_pre_tensor\n", + "# ff_w1_weight_tensor = np.loadtxt(ff_w1_weight, delimiter=',').reshape((768,3072), order='F')\n", + "# ff_w1_in_check_tensor = np.matmul(ff_w1_weight_tensor, ff_w1_out_tensor)\n", + "# print(\"W1 in (simulated):\")\n", + "# print(ff_w1_in_check_tensor[:,:24])\n", + "# print(\"W1 in (FF):\")\n", + "# print(ff_w1_only_in_tensor[:,:24])\n", + "# print(\"W1 in (HF):\")\n", + "# print(torch.load(hf_BWD_w1_in).squeeze().T.detach().cpu().numpy())\n", + "\n", + "# compare_tensors_difference(hf_BWD_w2_in, ff_BWD_w2_in, ff_BWD_lora_A_in)\n", + "# compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + "#compare_hf_tensors(hf_BWD_ffn_norm_in, hf_BWD_attn_out_out)\n", + "# print(\"\\nw1 out:\")\n", + "\n", + "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + "# print(\"\\nW1 in\\n\")\n", + "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", + "# compare_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", + "# print(\"\\nffn_norm\")\n", + "# compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "for layer_num in range(12):\n", + " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", + " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", + " hf_w1_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.gate_proj.weight\"\n", + " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", + " hf_w3_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.up_proj.weight\"\n", + " ff_w3_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", + " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", + " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/qk_prods_alignment.ipynb b/tests/peft/qk_prods_alignment.ipynb new file mode 100644 index 0000000000..c2a3644b3d --- /dev/null +++ b/tests/peft/qk_prods_alignment.ipynb @@ -0,0 +1,24 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 6a5992212dd0479d3651a0f3d4b5689d4fab52dc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 15 Dec 2023 11:46:48 -0500 Subject: [PATCH 03/11] backup --- inference/incr_decoding/incr_decoding.cc | 12 +-- src/ops/inc_multihead_self_attention.cc | 42 +++++++- src/ops/inc_multihead_self_attention.cu | 63 +++++++++++- src/ops/kernels/softmax.cu | 25 ++--- 
src/ops/residual_rms_norm.cc | 68 ++++++++++++ tests/peft/alignment_tests.ipynb | 126 ++--------------------- tests/peft/hf_finetune.py | 4 + 7 files changed, 203 insertions(+), 137 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index dcd1b5a5ab..009cd1af45 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -285,12 +285,12 @@ void FlexFlow::top_level_task(Task const *task, std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); // Add inference request - Request inference_req; - inference_req.prompt = text; - inference_req.max_sequence_length = 128; - inference_req.peft_model_id = peft_model_id; - requests.push_back(inference_req); - total_num_requests++; + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; // Add fine-tuning request Request fine_tuning_req; fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index d88c7edb81..569b35097d 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -878,6 +878,37 @@ void IncMultiHeadSelfAttention::inference_task( } } +template +void load_tensor_from_file(DT *ptr, size_t size, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(size); + size_t loaded_data_size = sizeof(DT) * size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != loaded_data_size) { + std::cout << "load weight data error " << in_get_size << ", " + << loaded_data_size << ", " << sizeof(DT) << std::endl; + assert(false); + } + assert(size == host_array.size()); + + copy_tensor_host_to_dev(ptr, host_array.data(), size); + + // // normal + // long data_index = 0; + // for (auto v : host_array) { + // ptr[data_index++] = v; + // } + in.close(); +} + FutureMap IncMultiHeadSelfAttention::peft_bwd( FFModel const &ff, BatchConfigFuture const &bc, @@ -966,7 +997,7 @@ void IncMultiHeadSelfAttention::peft_bwd_task( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR biases; if (*m->qkv_bias || *m->final_bias) { @@ -996,6 +1027,15 @@ void IncMultiHeadSelfAttention::peft_bwd_task( std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); std::cout << "BWD " << op_name_without_uid << std::endl; + + if (op_name_without_uid == "layers_11_attention") { + load_tensor_from_file( + output_grad.get_float_ptr(), + (output_grad.domain.get_volume()/128)*24, + "/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0.flexflow" + ); + } + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, bc, diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index dec116addd..cf3fedd95a 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -601,6 +601,8 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->hidden_size); } if (*m->apply_rotary_embedding) { + printf("ROTARY EMBEDDING: num_tokens: %i, q_array_size: %i, m->hidden_size: %i\n", + num_tokens, q_array_size, m->hidden_size); /*q&k*/ parallelism = num_tokens * m->hidden_size; apply_rotary_embedding_hf<<op_name); + size_t last_underscore = op_name_without_uid.length() - 1; + for (int i = op_name_without_uid.length() - 1; i > 0; i--) { + if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { + break; + } else if (m->op_name[i] == '_') { + last_underscore = i; + } + } + op_name_without_uid.erase(last_underscore); + + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_bwd-step_" + std::to_string(m->bwd_step) + + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + + "_layer-name_" + op_name_without_uid + "_shard-id_" + + std::to_string(shard_id); + + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -955,6 +977,10 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + std::string filename = base_filepath + "_o_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_, filename.c_str()); } // Step 2: compute gradients w.r.t. 
value { @@ -1006,6 +1032,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + std::string filename = base_filepath + "_v_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); + std::string filename2 = base_filepath + "_qk_prods_softmax"; + std::cout << "FILENAME: " << filename2 << std::endl; + save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { @@ -1054,6 +1087,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + std::string filename4 = base_filepath + "_qk_prods_softmax_grad"; + std::cout << "FILENAME: " << filename4 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename4.c_str()); + std::string filename5 = base_filepath + "_vcache"; + std::cout << "FILENAME: " << filename5 << std::endl; + save_tensor(B, m->vProjSize * m->num_q_heads * num_tokens, filename5.c_str()); } // Step 4: softmax backpropagation { @@ -1080,6 +1119,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, &beta, m->qk_tensor, m->qk_prods)); + + DT *C = static_cast
(m->qk_prods); + std::string filename6 = base_filepath + "_qk_prods_softmax_grad_in"; + std::cout << "FILENAME: " << filename6 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename6.c_str()); + // TODO: fill all elements above diagonal to force causal attention size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; if (entries_above_diagonal > 0) { @@ -1095,6 +1140,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, DT(0.0f)); } + std::string filename7 = base_filepath + "_qk_prods_softmax_grad_in_masked"; + std::cout << "FILENAME: " << filename7 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename7.c_str()); } // Step 5: compute gradients w.r.t. key { @@ -1149,6 +1197,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + std::string filename8 = base_filepath + "_query_activation"; + std::cout << "FILENAME: " << filename8 << std::endl; + save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str()); + std::string filename9 = base_filepath + "_devkproj"; + std::cout << "FILENAME: " << filename9 << std::endl; + save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); } // Step 6: compute gradients w.r.t query { @@ -1166,10 +1220,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] DT *C = static_cast
(m->devQKVProjArray); // after transposition & striding - // after transposition & striding int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; - int k_ = num_tokens; + int k_ = num_tokens; // before transposition and striding int lda = num_tokens; // num_new_tokens int ldb = m->qProjSize * m->num_q_heads; @@ -1200,6 +1253,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + std::string filename3 = base_filepath + "_devQKVPRojArray"; + std::cout << "FILENAME: " << filename3 << std::endl; + save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); } // Step 7: compute gradients w.r.t. input { @@ -1242,6 +1298,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + std::string filename12 = base_filepath + "_attn_final_grad_in"; + std::cout << "FILENAME: " << filename12 << std::endl; + save_tensor(C, num_tokens * m->qSize, filename12.c_str()); } } } diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 0fc827319d..115461c129 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -279,10 +279,11 @@ __global__ void sparse_categorical_crossentropy_loss_peft_backward( int num_tokens, int num_classes) { CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { - input_grad[i] = output_grad[i]; - if (i % num_classes == token_ids[i / num_classes]) { - input_grad[i] -= 1.0f; - } + input_grad[i] = 0.5; + // input_grad[i] = output_grad[i]; + // if (i % num_classes == token_ids[i / num_classes]) { + // input_grad[i] -= 1.0f; + // } } } @@ -334,14 +335,14 @@ void peft_bwd_kernel(SoftmaxMeta const *m, num_bwd_tokens, num_classes); // scale - scale_kernel<<>>(input_grad_ptr + - tokens_previous_requests * num_classes, - num_bwd_tokens * num_classes, - DT(0.0), - scale_factor); + // scale_kernel<<>>(input_grad_ptr + + // tokens_previous_requests * num_classes, + // num_bwd_tokens * num_classes, + // DT(0.0), + // scale_factor); tokens_previous_requests += num_bwd_tokens; } diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 28dd7e2745..c03d1c07a1 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -673,6 +673,37 @@ Legion::FutureMap return runtime->execute_index_space(ctx, launcher); } +template +void load_tensor_from_file(DT *ptr, size_t size, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(size); + size_t loaded_data_size = sizeof(DT) * size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != loaded_data_size) { + std::cout << "load weight data error " << in_get_size << ", " + << loaded_data_size << ", " << sizeof(DT) << std::endl; + assert(false); + } + assert(size == host_array.size()); + + copy_tensor_host_to_dev(ptr, host_array.data(), size); + + // // normal + // long data_index = 0; + // for (auto v : host_array) { + // ptr[data_index++] = v; + // } + in.close(); +} + /* regions[0](I): RMS output_grad regions[1](I/O): Residual input 0 grad @@ -710,6 +741,43 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper( m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + int numdims = residual_input0_grad.domain.get_dim(); + std::cout << "in grad dims: "; + for (int i=0; iinference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb index fc2899b7c4..6a7e2bead8 100644 --- a/tests/peft/alignment_tests.ipynb +++ b/tests/peft/alignment_tests.ipynb @@ -496,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -521,120 +521,13 @@ "Ok!\n", "Ok!\n", "Ok!\n", - "Ok!\n", - "\n", - "Huggingface-FlexFlow checks:\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_SigmoidSiluMulti_shard-id_0_output_0\n", - "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 1.8299303e+01\n", - " 1.3871717e+01 1.8452764e+00]\n", - "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 1.82993031e+01\n", - " 1.38717194e+01 1.84527588e+00]\n", - "[ True True True ... True True True]\n", - "[2394]\n", - "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_feed_forward_w2_shard-id_0_input_0\n", - "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 1.8299303e+01\n", - " 1.3871717e+01 1.8452764e+00]\n", - "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 1.82993031e+01\n", - " 1.38717194e+01 1.84527588e+00]\n", - "[ True True True ... True True True]\n", - "[2394]\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "(64, 12, 24)\n", - "(64, 12, 24)\n", - "torch.Size([12, 24, 64])\n", - "torch.Size([12, 64, 24])\n", - "3.7760416666666665% mismatch in QK prods softmax out grad\n", - "hf_kproj_grads_post_rotary: (24, 64, 12)\n", - "hf_kproj_grads_before_rotary: (24, 64, 12)\n", - "[[-2.1751599e-01 1.2245592e-01 -2.6237822e-01 ... 1.4371538e+00\n", - " 5.2717543e-01 5.1425427e-01]\n", - " [-7.6055496e+01 4.2463268e+01 -1.2235089e+02 ... 5.3328156e+02\n", - " 2.3810944e+02 1.8990283e+02]\n", - " [ 5.2804117e+00 -4.9826388e+00 4.6240320e+00 ... -5.4525635e+01\n", - " -2.1779711e+01 -3.2857445e+01]\n", - " ...\n", - " [ 1.0541155e+00 -3.1229946e-01 1.4272718e+00 ... 
-4.6509657e+00\n", - " -2.2930331e+00 2.1488833e-01]\n", - " [ 1.8427576e+00 -5.0031781e-01 2.1591802e+00 ... -8.0996408e+00\n", - " -6.6346103e-01 1.1487092e+00]\n", - " [-3.9699785e-02 1.7903861e-02 -5.9658013e-02 ... 2.4856456e-01\n", - " -5.0553136e-02 -6.9623299e-02]]\n", - "HF Qproj:\n", - "torch.Size([24, 768])\n", - "\t reshaped: (24, 64, 12)\n", - "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n", - " 0.0000000e+00 0.0000000e+00]\n", - " [-2.1439369e-03 3.2949594e-03 -2.9551802e-04 ... 2.4234147e-01\n", - " 4.3675132e-02 -9.2217997e-02]\n", - " [ 2.9682016e+00 -4.1166668e+00 -1.5612273e+00 ... 1.8131609e+01\n", - " -2.7311683e+00 -2.3451160e+01]\n", - " ...\n", - " [ 7.9408998e+00 -1.6016111e+01 7.5070286e+00 ... 6.9805992e+01\n", - " -8.9288340e+00 -5.6585381e+01]\n", - " [ 5.9755993e+00 -1.2562438e+01 9.3722830e+00 ... 5.6924896e+01\n", - " 1.6420145e+00 -2.7360382e+01]\n", - " [ 2.9259295e+00 -8.8997393e+00 5.6537924e+00 ... 4.0085789e+01\n", - " -5.5427680e+00 -3.3319279e+01]]\n", - "FF Qproj:\n", - "(24, 64, 12)\n", - "[[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00\n", - " 0.00000000e+00 0.00000000e+00]\n", - " [-2.14390800e-03 3.29491800e-03 -2.95515000e-04 ... 2.42337957e-01\n", - " 4.36745250e-02 -9.22166630e-02]\n", - " [ 2.96819830e+00 -4.11666203e+00 -1.56122601e+00 ... 1.81315899e+01\n", - " -2.73117018e+00 -2.34511394e+01]\n", - " ...\n", - " [ 7.94090462e+00 -1.60161247e+01 7.50703382e+00 ... 6.98059998e+01\n", - " -8.92883396e+00 -5.65854073e+01]\n", - " [ 5.97561932e+00 -1.25624638e+01 9.37229633e+00 ... 5.69249115e+01\n", - " 1.64204872e+00 -2.73603287e+01]\n", - " [ 2.92593479e+00 -8.89975548e+00 5.65379906e+00 ... 4.00858383e+01\n", - " -5.54277229e+00 -3.33193245e+01]]\n", - "hf_attn_in: torch.Size([1, 24, 768])\n", - "hf_attn_in: (768, 24)\n", - "[[-7.5252225e+06 -1.2484900e+03 5.3961243e+01 ... -3.3743629e+01\n", - " -2.8661375e+00 -1.2124748e+00]\n", - " [-9.5513660e+06 1.8450066e+03 3.8372406e+02 ... -1.9933952e+01\n", - " 1.4622488e+01 -2.4410028e+00]\n", - " [ 1.1452265e+07 2.1254619e+03 -4.8265629e+01 ... 4.8204151e+01\n", - " -1.4841021e+01 -1.6505869e+01]\n", - " ...\n", - " [ 2.1089132e+06 2.8605874e+03 1.2375667e+03 ... 2.6102766e+01\n", - " 3.1422745e+01 6.7668297e+01]\n", - " [ 2.1169400e+06 -4.6361523e+02 -1.6561864e+02 ... -5.3914165e+00\n", - " -6.0169220e-02 2.2841328e+01]\n", - " [ 7.3915345e+06 8.9268884e+02 5.4528040e+02 ... 6.2017624e+01\n", - " 1.3753588e+01 5.2149849e+01]]\n", - "ff_attn_in: (768, 24)\n", - "[[-7.52522050e+06 -1.24848975e+03 5.39611511e+01 ... -3.37436867e+01\n", - " -2.86611795e+00 -1.21241117e+00]\n", - " [-9.55136800e+06 1.84500635e+03 3.83724091e+02 ... -1.99339561e+01\n", - " 1.46225519e+01 -2.44094014e+00]\n", - " [ 1.14522650e+07 2.12546313e+03 -4.82656937e+01 ... 4.82041969e+01\n", - " -1.48411064e+01 -1.65059376e+01]\n", - " ...\n", - " [ 2.10891300e+06 2.86058789e+03 1.23756726e+03 ... 2.61027851e+01\n", - " 3.14227238e+01 6.76683807e+01]\n", - " [ 2.11693950e+06 -4.63614868e+02 -1.65618515e+02 ... -5.39132690e+00\n", - " -6.02092740e-02 2.28413010e+01]\n", - " [ 7.39153300e+06 8.92689453e+02 5.45280640e+02 ... 
6.20176048e+01\n", - " 1.37535381e+01 5.21498528e+01]]\n" + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_ffn_norm_shard-id_0_input_1 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0_output_0\n", + "Tensor1: [ 0. 0. 0. ... 90.59211731 52.20317078\n", + " -124.1802063 ]\n", + "Tensor2:[-1.18452775e+06 -6.74598750e+05 7.44935375e+05 ... 4.37662773e+01\n", + " 4.78333855e+01 4.72951965e+01]\n", + "[False False False ... False False False]\n", + "[ 0 1 2 ... 18429 18430 18431]\n" ] }, { @@ -644,7 +537,8 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 5\u001b[0m line \u001b[0;36m3\n\u001b[1;32m 300\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mff_attn_in: \u001b[39m\u001b[39m\"\u001b[39m, ff_attn_in\u001b[39m.\u001b[39mshape)\n\u001b[1;32m 301\u001b[0m \u001b[39mprint\u001b[39m(ff_attn_in)\n\u001b[0;32m--> 302\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_attn_in, hf_attn_in, atol\u001b[39m=\u001b[39m\u001b[39m1e-2\u001b[39m))\n\u001b[1;32m 304\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 306\u001b[0m hf_kproj_grads_in \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mhf_weight_base_path\u001b[39m}\u001b[39;00m\u001b[39m/bwd_step_0_layers.\u001b[39m\u001b[39m{\u001b[39;00mlayer_num\u001b[39m}\u001b[39;00m\u001b[39m.self_attn.k_proj.gi_0\u001b[39m\u001b[39m\"\u001b[39m\n", + "Cell \u001b[0;32mIn[8], line 93\u001b[0m\n\u001b[1;32m 91\u001b[0m compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n\u001b[1;32m 92\u001b[0m compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m24\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m768\u001b[39m) \u001b[38;5;66;03m# should fail\u001b[39;00m\n\u001b[0;32m---> 93\u001b[0m \u001b[43mcompare_flexflow_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mff_BWD_ffn_norm_in2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_BWD_attn_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_len\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m24\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m768\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;66;03m# HF-FlexFlow checks\u001b[39;00m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mHuggingface-FlexFlow checks:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[0;32mIn[2], line 159\u001b[0m, in \u001b[0;36mcompare_flexflow_tensors\u001b[0;34m(ff_tensor1_fp, ff_tensor2_fp, tolerance, max_len)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 158\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m--> 159\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mlen\u001b[39m(ff_tensor1))\n\u001b[1;32m 
160\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 7836633b30..016a2386cb 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -72,6 +72,8 @@ def peft_backward_hook(module, grad_input, grad_output): print("\t", go.shape) print(f"\t\tSaving to {dst_filepath}") torch.save(go, dst_filepath) + if dst_filepath == "./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0": + go.detach().cpu().numpy().tofile(f"{dst_filepath}.flexflow") else: print(go) print("Backward GRAD Input:") @@ -81,6 +83,8 @@ def peft_backward_hook(module, grad_input, grad_output): print("\t", gi.shape) print(f"\t\tSaving to {dst_filepath}") torch.save(gi, dst_filepath) + if dst_filepath == "./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0" or dst_filepath == "./hf_peft_tensors/bwd_step_0_norm.gi_0": + gi.detach().cpu().numpy().tofile(f"{dst_filepath}.flexflow") else: print(gi) From 1202548c707ece4cda5201ae69bde97e8ab1e1bf Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 16 Dec 2023 10:42:49 -0500 Subject: [PATCH 04/11] updates --- src/ops/residual_rms_norm.cc | 13 +- src/ops/sigmoid_silu_multi.cu | 2 +- tests/peft/alignment_tests.ipynb | 251 +++++++++++-------------------- tests/peft/hf_finetune.py | 2 + 4 files changed, 100 insertions(+), 168 deletions(-) diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index c03d1c07a1..aa72d7d32a 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -741,21 +741,24 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper( m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + + // get name + std::string op_name_without_uid = ResidualRMSNorm::get_op_name_without_uid(m); + std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl; + // print shape int numdims = residual_input0_grad.domain.get_dim(); std::cout << "in grad dims: "; for (int i=0; ireset_input_grads[0], m->reset_input_grads[1]); } else if (m->input_type[0] == DT_HALF) { diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb index 6a7e2bead8..d43b68e14d 100644 --- a/tests/peft/alignment_tests.ipynb +++ b/tests/peft/alignment_tests.ipynb @@ -217,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -225,151 +225,25 @@ "output_type": "stream", "text": [ "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - 
"Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n" + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.0.self_attn.o_proj.output_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_attention_shard-id_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.02364488 -0.00304312\n", + " -0.01649825]\n", + "FF:[ 0. 0. 0. ... 0.02200473 0.01693928\n", + " -0.02354377]\n", + "[ True True True ... True False True]\n", + "[ 1541 1543 1545 ... 18427 18428 18430]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[8], line 10\u001b[0m\n\u001b[1;32m 8\u001b[0m hf_attn_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.self_attn.o_proj.output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 9\u001b[0m ff_attn_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layer-name_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_attention_shard-id_0_output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 10\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_attn_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_attn_out\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m hf_ffn_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.post_attention_layernorm.output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 12\u001b[0m ff_ffn_norm_out \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layer-name_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_ffn_norm_shard-id_0_output_1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "Cell \u001b[0;32mIn[2], line 27\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " ] } ], @@ -438,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -489,14 +363,12 @@ "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", "hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", "ff_BWD_norm_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_input_1\"\n", - "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n", - "\n", - "\n" + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -511,6 +383,7 @@ "Ok!\n", "Ok!\n", "Ok!\n", + "Ok!\n", "\n", "FlexFlow checks:\n", "Ok!\n", @@ -521,13 +394,67 @@ "Ok!\n", "Ok!\n", "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_ffn_norm_shard-id_0_input_1 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0_output_0\n", - "Tensor1: [ 0. 0. 0. ... 90.59211731 52.20317078\n", - " -124.1802063 ]\n", - "Tensor2:[-1.18452775e+06 -6.74598750e+05 7.44935375e+05 ... 4.37662773e+01\n", - " 4.78333855e+01 4.72951965e+01]\n", - "[False False False ... False False False]\n", - "[ 0 1 2 ... 18429 18430 18431]\n" + "\n", + "Huggingface-FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_SigmoidSiluMulti_shard-id_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 
2.14103374e+01\n", + " 1.20965424e+01 3.64246750e+00]\n", + "[ True True True ... True True True]\n", + "[2394]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_feed_forward_w2_shard-id_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 2.14103374e+01\n", + " 1.20965424e+01 3.64246750e+00]\n", + "[ True True True ... True True True]\n", + "[2394]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "(64, 12, 24)\n", + "(64, 12, 24)\n", + "torch.Size([12, 24, 64])\n", + "torch.Size([12, 64, 24])\n", + "4.383680555555555% mismatch in QK prods softmax out grad\n", + "hf_kproj_grads_post_rotary: (24, 64, 12)\n", + "hf_kproj_grads_before_rotary: (24, 64, 12)\n", + "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", + " 3.2884139e-01 3.6066702e-01]\n", + " [-2.8613457e+01 -5.5871558e+00 2.9384506e+01 ... 3.8781765e+01\n", + " 9.6900581e+01 9.8469597e+01]\n", + " [ 3.3027239e+00 1.8275940e-01 -1.8496730e+00 ... -4.4052174e+01\n", + " -2.0009745e+01 -2.9787930e+01]\n", + " ...\n", + " [-7.6470733e-02 -1.8891659e-01 3.6430117e-01 ... -2.7492592e-01\n", + " 5.7017130e-01 -1.5985624e-01]\n", + " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... -1.0776262e+01\n", + " 6.2166649e-01 8.3755457e-01]\n", + " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", + " -2.6652411e-01 -1.1917179e+00]]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_752415/3838509285.py:163: UserWarning: The use of `x.T` on tensors of dimension other than 2 to reverse their shape is deprecated and it will throw an error in a future release. Consider `x.mT` to transpose batches of matrices or `x.permute(*torch.arange(x.ndim - 1, -1, -1))` to reverse the dimensions of a tensor. 
(Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3571.)\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n" ] }, { @@ -537,8 +464,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[8], line 93\u001b[0m\n\u001b[1;32m 91\u001b[0m compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n\u001b[1;32m 92\u001b[0m compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m24\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m768\u001b[39m) \u001b[38;5;66;03m# should fail\u001b[39;00m\n\u001b[0;32m---> 93\u001b[0m \u001b[43mcompare_flexflow_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mff_BWD_ffn_norm_in2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_BWD_attn_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_len\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m24\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m768\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;66;03m# HF-FlexFlow checks\u001b[39;00m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mHuggingface-FlexFlow checks:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "Cell \u001b[0;32mIn[2], line 159\u001b[0m, in \u001b[0;36mcompare_flexflow_tensors\u001b[0;34m(ff_tensor1_fp, ff_tensor2_fp, tolerance, max_len)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 158\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m--> 159\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mlen\u001b[39m(ff_tensor1))\n\u001b[1;32m 160\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[0;32mIn[19], line 267\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhf_kproj_grads_before_rotary: \u001b[39m\u001b[38;5;124m\"\u001b[39m, hf_kproj_grads_before_rotary\u001b[38;5;241m.\u001b[39mshape)\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28mprint\u001b[39m(hf_kproj_grads_before_rotary[:,:,\u001b[38;5;241m0\u001b[39m])\n\u001b[0;32m--> 267\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(np\u001b[38;5;241m.\u001b[39mallclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-2\u001b[39m))\n\u001b[1;32m 268\u001b[0m hf_kproj_grads \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/bwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.self_attn.k_proj.go_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 269\u001b[0m hf_kproj_grads \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(hf_kproj_grads)\u001b[38;5;241m.\u001b[39msqueeze()\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } @@ -619,6 +545,7 @@ " print(\"\\nHuggingface checks:\")\n", " 
if layer_num == tot_num_layers-1:\n", " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", " # compare_hf_tensors(hf_BWD_w3_out, hf_BWD_w2_out)\n", @@ -635,8 +562,8 @@ " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", - " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768) # should fail\n", - " compare_flexflow_tensors(ff_BWD_ffn_norm_in2, ff_BWD_attn_out, max_len=24*768)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", + " #compare_flexflow_tensors(ff_BWD_ffn_norm_in2, ff_BWD_attn_out, max_len=24*768) # should fail\n", "\n", " # HF-FlexFlow checks\n", " print(\"\\nHuggingface-FlexFlow checks:\")\n", diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 016a2386cb..818e0b9085 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -229,6 +229,8 @@ def main(): torch.save(params, f"./hf_peft_tensors/{name}") if "lm_head" in name or "norm" in name: torch.save(params, f"./hf_peft_tensors/{name}") + if "down_proj" in name or "self_attn" in name: + torch.save(params, f"./hf_peft_tensors/{name}") # Load fine-tuning dataset data = load_dataset("Abirate/english_quotes") From 0de45d136feea49fc89f6b0497aa6a3662bfc68a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 18 Dec 2023 17:43:59 -0500 Subject: [PATCH 05/11] update --- src/ops/inc_multihead_self_attention.cu | 81 +++++- tests/peft/alignment_tests.ipynb | 358 +++++++++++++++++++++--- 2 files changed, 402 insertions(+), 37 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index cf3fedd95a..b1c3db25dc 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -492,6 +492,47 @@ __global__ void } } +template +__global__ void apply_rotary_embedding_bwd(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); + + int complex_part_index = + (q_tensor ? 
0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + template __global__ void fill_entries_above_diagonal(DT *matrix, size_t num_rows, @@ -1200,7 +1241,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, std::string filename8 = base_filepath + "_query_activation"; std::cout << "FILENAME: " << filename8 << std::endl; save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str()); - std::string filename9 = base_filepath + "_devkproj"; + std::string filename9 = base_filepath + "_devkproj_pre"; std::cout << "FILENAME: " << filename9 << std::endl; save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); } @@ -1253,9 +1294,41 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - std::string filename3 = base_filepath + "_devQKVPRojArray"; - std::cout << "FILENAME: " << filename3 << std::endl; - save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + std::string filename3 = base_filepath + "_devQKVPRojArray_pre"; + std::cout << "FILENAME: " << filename3 << std::endl; + save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + } + + // Compute rotary embeddings bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + printf("ROTARY EMBEDDING bwd: num_tokens: %i, m->hidden_size: %i\n", num_tokens, m->hidden_size); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
<DT *>(m->devQKVProjArray);
+      apply_rotary_embedding_bwd<<<GET_BLOCKS(parallelism),
+                                   min(CUDA_NUM_THREADS, parallelism),
+                                   0,
+                                   stream>>>(A,
+                                             m->complex_input,
+                                             m->token_infos,
+                                             m->qProjSize,
+                                             num_tokens,
+                                             m->hidden_size);
+      DT *C = static_cast<DT *>
(m->devQKVProjArray);
+      std::string filename3 = base_filepath + "_devQKVPRojArray";
+      std::cout << "FILENAME: " << filename3 << std::endl;
+      save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str());
+    }
+
+    // matrix C: gradients for key (saved as part of m->devQKVProjArray)
+    // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
+    DT *C = static_cast<DT *>
(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients + std::string filename9 = base_filepath + "_devkproj"; + std::cout << "FILENAME: " << filename9 << std::endl; + save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); } // Step 7: compute gradients w.r.t. input { diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb index d43b68e14d..a9382b9524 100644 --- a/tests/peft/alignment_tests.ipynb +++ b/tests/peft/alignment_tests.ipynb @@ -312,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -368,7 +368,90 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "class LlamaRotaryEmbedding(nn.Module):\n", + " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", + " super().__init__()\n", + "\n", + " self.dim = dim\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.base = base\n", + " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", + " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", + "\n", + " # Build here to make `torch.jit.trace` work.\n", + " self._set_cos_sin_cache(\n", + " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", + " )\n", + "\n", + " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", + " self.max_seq_len_cached = seq_len\n", + " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", + "\n", + " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", + " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", + " emb = torch.cat((freqs, freqs), dim=-1)\n", + " self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", + " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", + "\n", + " def forward(self, x, seq_len=None):\n", + " # x: [bs, num_attention_heads, seq_len, head_size]\n", + " if seq_len > self.max_seq_len_cached:\n", + " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", + "\n", + " return (\n", + " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", + " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", + " )\n", + "def rotate_half(x):\n", + " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", + " x1 = x[..., : x.shape[-1] // 2] # first half\n", + " x2 = x[..., x.shape[-1] // 2 :] # second half\n", + " return torch.cat((x2, -x1), dim=-1)\n", + "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", + " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", + "\n", + " Args:\n", + " q (`torch.Tensor`): The query tensor.\n", + " k (`torch.Tensor`): The key tensor.\n", + " cos (`torch.Tensor`): The cosine part of the rotary embedding.\n", + " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", + " position_ids (`torch.Tensor`):\n", + " The position indices of the tokens corresponding to the query and key tensors. 
For example, this can be\n", + " used to pass offsetted position ids when working with a KV-cache.\n", + " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", + " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", + " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", + " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and\n", + " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", + " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have\n", + " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", + " Returns:\n", + " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", + " \"\"\"\n", + " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", + " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", + " q_embed = (q * cos) + (rotate_half(q) * sin)\n", + " k_embed = (k * cos) + (rotate_half(k) * sin)\n", + " return q_embed, k_embed\n", + "head_dim = 64\n", + "max_position_embeddings = 2048\n", + "rope_theta=10_000\n", + "kv_seq_len = 24\n", + "rotary_emb = LlamaRotaryEmbedding(\n", + " head_dim,\n", + " max_position_embeddings=max_position_embeddings,\n", + " base=rope_theta,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -433,6 +516,19 @@ "torch.Size([12, 64, 24])\n", "4.383680555555555% mismatch in QK prods softmax out grad\n", "hf_kproj_grads_post_rotary: (24, 64, 12)\n", + "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", + " 3.2884139e-01 3.6066702e-01]\n", + " [-7.5168266e+00 4.6582484e+00 1.7284815e+01 ... 3.8785275e+01\n", + " 9.6879341e+01 9.8476219e+01]\n", + " [-8.0723800e-02 1.8924624e+00 -2.6913931e+00 ... -4.4056824e+01\n", + " -2.0001854e+01 -2.9799681e+01]\n", + " ...\n", + " [-1.9819270e-01 1.9175959e-01 1.8926021e-01 ... -2.7737719e-01\n", + " 5.7191163e-01 -1.5962012e-01]\n", + " [-2.5673387e+00 1.7033563e+00 2.2882986e+00 ... -1.0788559e+01\n", + " 6.3817674e-01 8.2335520e-01]\n", + " [-1.7806959e-01 8.9493655e-02 -1.9538833e-01 ... 3.1075442e+00\n", + " -2.6218265e-01 -1.1863230e+00]]\n", "hf_kproj_grads_before_rotary: (24, 64, 12)\n", "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", " 3.2884139e-01 3.6066702e-01]\n", @@ -446,15 +542,98 @@ " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... -1.0776262e+01\n", " 6.2166649e-01 8.3755457e-01]\n", " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", - " -2.6652411e-01 -1.1917179e+00]]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_752415/3838509285.py:163: UserWarning: The use of `x.T` on tensors of dimension other than 2 to reverse their shape is deprecated and it will throw an error in a future release. Consider `x.mT` to transpose batches of matrices or `x.permute(*torch.arange(x.ndim - 1, -1, -1))` to reverse the dimensions of a tensor. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3571.)\n", - " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n" + " -2.6652411e-01 -1.1917179e+00]]\n", + "ff_kproj_pre: (24, 64, 12)\n", + "[[-1.57300810e-02 -4.11607850e-02 3.05930820e-02 ... 
3.86295587e-01\n", + " 3.28840941e-01 3.60667169e-01]\n", + " [-7.51684189e+00 4.65823793e+00 1.72848415e+01 ... 3.87852402e+01\n", + " 9.68793182e+01 9.84762802e+01]\n", + " [-8.07239790e-02 1.89246774e+00 -2.69139457e+00 ... -4.40568542e+01\n", + " -2.00018616e+01 -2.97996941e+01]\n", + " ...\n", + " [-1.98194161e-01 1.91760257e-01 1.89260900e-01 ... -2.77382791e-01\n", + " 5.71911991e-01 -1.59620658e-01]\n", + " [-2.56733608e+00 1.70335352e+00 2.28829479e+00 ... -1.07885523e+01\n", + " 6.38186097e-01 8.23350966e-01]\n", + " [-1.78069487e-01 8.94933720e-02 -1.95387334e-01 ... 3.10753584e+00\n", + " -2.62182117e-01 -1.18632054e+00]]\n", + "3.9116753472222223% mismatch between HF and FF for kproj (before applying ROPE)\n", + "ff_kproj: (24, 64, 12)\n", + "[[-1.57300810e-02 -4.11607850e-02 3.05930820e-02 ... 3.86295587e-01\n", + " 3.28840941e-01 3.60667169e-01]\n", + " [-2.86135025e+01 -5.58717918e+00 2.93845501e+01 ... 3.87817307e+01\n", + " 9.69005585e+01 9.84696579e+01]\n", + " [ 3.30272818e+00 1.82759121e-01 -1.84967291e+00 ... -4.40522003e+01\n", + " -2.00097523e+01 -2.97879410e+01]\n", + " ...\n", + " [-7.64704790e-02 -1.88917309e-01 3.64301860e-01 ... -2.74931490e-01\n", + " 5.70171654e-01 -1.59856781e-01]\n", + " [ 2.57801986e+00 -1.81525516e+00 2.50875449e+00 ... -1.07762566e+01\n", + " 6.21675968e-01 8.37550282e-01]\n", + " [-6.83238800e-02 1.75684214e-01 -3.23107153e-01 ... 3.12022066e+00\n", + " -2.66523540e-01 -1.19171536e+00]]\n", + "3.9008246527777777% mismatch between HF and FF for kproj (after applying ROPE)\n", + "HF Qproj:\n", + "torch.Size([24, 768])\n", + "\t reshaped: (24, 64, 12)\n", + "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n", + " 0.0000000e+00 0.0000000e+00]\n", + " [-2.1439367e-03 3.2949597e-03 -2.9551555e-04 ... 2.4234168e-01\n", + " 4.3675169e-02 -9.2218071e-02]\n", + " [ 2.2399018e+00 -3.3713050e+00 -9.7703063e-01 ... 1.4206999e+01\n", + " -1.9386978e+00 -1.7756876e+01]\n", + " ...\n", + " [ 8.7195921e+00 1.2150297e+01 9.2796574e+00 ... 4.7496593e+01\n", + " -2.7162397e+00 -2.6841351e+01]\n", + " [ 2.8459630e+00 -2.0782030e+01 5.8126745e+00 ... 3.3043846e+01\n", + " -1.4574212e+01 -4.2649174e+01]\n", + " [-7.3419094e-02 -4.3298864e+00 2.0055656e+00 ... -1.4900026e+00\n", + " -9.0601617e-01 2.9582092e-01]]\n", + "FF Qproj:\n", + "(24, 64, 12)\n", + "[[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00\n", + " 0.00000000e+00 0.00000000e+00]\n", + " [-2.14390700e-03 3.29491400e-03 -2.95521000e-04 ... 2.42338002e-01\n", + " 4.36745360e-02 -9.22166560e-02]\n", + " [ 2.23990273e+00 -3.37130690e+00 -9.77032721e-01 ... 1.42070026e+01\n", + " -1.93870103e+00 -1.77568874e+01]\n", + " ...\n", + " [ 8.71960449e+00 1.21503038e+01 9.27967071e+00 ... 4.74966431e+01\n", + " -2.71619344e+00 -2.68413410e+01]\n", + " [ 2.84595203e+00 -2.07820034e+01 5.81268263e+00 ... 3.30439415e+01\n", + " -1.45741787e+01 -4.26492157e+01]\n", + " [-7.34183120e-02 -4.32989836e+00 2.00555873e+00 ... -1.48999298e+00\n", + " -9.06009376e-01 2.95819134e-01]]\n", + "hf_attn_in: torch.Size([1, 24, 768])\n", + "hf_attn_in: (768, 24)\n", + "[[-7.52523500e+06 -1.27625415e+03 -4.39338150e+01 ... -3.34414902e+01\n", + " 2.38160934e+01 3.15938339e+01]\n", + " [-9.55138900e+06 6.71377197e+02 2.06871887e+02 ... -3.86393509e+01\n", + " 2.14816055e+01 -6.58599396e+01]\n", + " [ 1.14522670e+07 2.19898975e+03 -6.89673233e+00 ... 9.51593590e+00\n", + " -1.68612709e+01 6.02474251e+01]\n", + " ...\n", + " [ 2.10891925e+06 3.78648706e+03 1.02701221e+03 ... 
3.59794388e+01\n", + " 5.03902206e+01 4.19777756e+01]\n", + " [ 2.11695300e+06 -2.36283508e+02 -1.08002625e+02 ... 9.36443710e+00\n", + " 3.84094887e+01 -7.51948738e+00]\n", + " [ 7.39155050e+06 1.11731885e+03 3.38369843e+02 ... 3.70399475e+01\n", + " 1.77629051e+01 9.76780853e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.52523500e+06 -1.27625269e+03 -4.39337921e+01 ... -3.34414406e+01\n", + " 2.38161297e+01 3.15938721e+01]\n", + " [-9.55138800e+06 6.71377197e+02 2.06871750e+02 ... -3.86393204e+01\n", + " 2.14817352e+01 -6.58599167e+01]\n", + " [ 1.14522680e+07 2.19898877e+03 -6.89653015e+00 ... 9.51589775e+00\n", + " -1.68612289e+01 6.02473717e+01]\n", + " ...\n", + " [ 2.10891825e+06 3.78648633e+03 1.02701196e+03 ... 3.59794769e+01\n", + " 5.03901863e+01 4.19778595e+01]\n", + " [ 2.11695250e+06 -2.36283737e+02 -1.08002808e+02 ... 9.36445141e+00\n", + " 3.84095154e+01 -7.51950741e+00]\n", + " [ 7.39155000e+06 1.11731885e+03 3.38369934e+02 ... 3.70399170e+01\n", + " 1.77628460e+01 9.76780930e+01]]\n", + "4.817708333333334% mismatch in attention input grads\n" ] }, { @@ -464,7 +643,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[19], line 267\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhf_kproj_grads_before_rotary: \u001b[39m\u001b[38;5;124m\"\u001b[39m, hf_kproj_grads_before_rotary\u001b[38;5;241m.\u001b[39mshape)\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28mprint\u001b[39m(hf_kproj_grads_before_rotary[:,:,\u001b[38;5;241m0\u001b[39m])\n\u001b[0;32m--> 267\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(np\u001b[38;5;241m.\u001b[39mallclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-2\u001b[39m))\n\u001b[1;32m 268\u001b[0m hf_kproj_grads \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/bwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.self_attn.k_proj.go_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 269\u001b[0m hf_kproj_grads \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(hf_kproj_grads)\u001b[38;5;241m.\u001b[39msqueeze()\n", + "Cell \u001b[0;32mIn[45], line 353\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpct_mismatch\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% mismatch in attention input grads\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(pct_mismatch \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.05\u001b[39m)\n\u001b[0;32m--> 353\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } @@ -720,23 +899,62 @@ " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", " assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", " \n", + " ########################################## ROPE and Kproj ##########################################\n", + "\n", " # Compare FF kproj with intermediate 
kproj data from HF\n", " hf_kproj_grads_post_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_post_rotary.go_0\"\n", - " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary).squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", - " # print(hf_kproj_grads_post_rotary[0,:,:])\n", - " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", - " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", - " # print(\"ff_kproj: \", ff_kproj.shape)\n", - " # print(ff_kproj[:,:,0])\n", - " assert(np.allclose(ff_kproj, hf_kproj_grads_post_rotary, atol=1e-2))\n", - "\n", - " # Compare HF before and Kproj out gradients\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", + " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # Check hf ROPE \n", + " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", + " cos = cos.cuda()\n", + " sin = sin.cuda()\n", + " # query_states: torch.Size([1, 12, 24, 64])\n", + " # key_states: torch.Size([1, 12, 24, 64])\n", + " # position_ids: torch.Size([1, 24])\n", + " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " # 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", + " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", + " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", + " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[:,:,0])\n", + " \n", " hf_kproj_grads_before_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_before_rotary.go_0\"\n", - " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary).squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", + " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", " print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", " print(hf_kproj_grads_before_rotary[:,:,0])\n", - " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " # Compare HF rope with manual ROPE\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " # Compare HF Kproj with FF Kproj (before ROPE) \n", + " ff_kproj_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj_pre\"\n", + " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " print(ff_kproj_pre[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " mismatches = 
[(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " \n", + " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " print(\"ff_kproj: \", ff_kproj.shape)\n", + " print(ff_kproj[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " \n", + " \n", + " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", " hf_kproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.k_proj.go_0\"\n", " hf_kproj_grads = torch.load(hf_kproj_grads).squeeze()\n", " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", @@ -745,6 +963,8 @@ " #print(reshaped_tensor.shape)\n", " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", "\n", + " ########################################## Qproj (with ROPE) ##########################################\n", + "\n", " # Compare QProj\n", " hf_qproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.q_proj.go_0\"\n", " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", @@ -780,19 +1000,91 @@ " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", " assert(pct_mismatch <= 0.05)\n", " \n", - " assert(np.allclose(hf_kproj_grads, ff_kProjGrads, atol=1e-2))\n", - " assert(np.allclose(hf_qproj_grads, ff_qProjGrads, atol=1e-2))\n", - " # print(hf_qproj_grads.shape)\n", - " # print(hf_kproj_grads)\n", - " # print()\n", - " # print(ff_qProjGrads)\n", - " # print(ff_kProjGrads.shape)\n", - " \n", - " \n", "\n", " assert False" ] }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([12, 24, 64])\n", + "tensor([[-1.5730e-02, -4.1161e-02, 3.0593e-02, ..., 3.8630e-01,\n", + " 3.2884e-01, 3.6067e-01],\n", + " [-2.8613e+01, -5.5872e+00, 2.9385e+01, ..., 3.8782e+01,\n", + " 9.6901e+01, 9.8470e+01],\n", + " [ 3.3027e+00, 1.8276e-01, -1.8497e+00, ..., -4.4052e+01,\n", + " -2.0010e+01, -2.9788e+01],\n", + " ...,\n", + " [-7.6471e-02, -1.8892e-01, 3.6430e-01, ..., -2.7493e-01,\n", + " 5.7017e-01, -1.5986e-01],\n", + " [ 2.5780e+00, -1.8153e+00, 2.5088e+00, ..., -1.0776e+01,\n", + " 6.2167e-01, 8.3755e-01],\n", + " [-6.8324e-02, 1.7568e-01, -3.2311e-01, ..., 3.1202e+00,\n", + " -2.6652e-01, -1.1917e+00]])\n", + "(24, 64, 12)\n", + "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", + " 3.2884139e-01 3.6066702e-01]\n", + " [-2.8613457e+01 -5.5871558e+00 2.9384506e+01 ... 
3.8781765e+01\n", + " 9.6900581e+01 9.8469597e+01]\n", + " [ 3.3027239e+00 1.8275940e-01 -1.8496730e+00 ... -4.4052174e+01\n", + " -2.0009745e+01 -2.9787930e+01]\n", + " ...\n", + " [-7.6470733e-02 -1.8891659e-01 3.6430117e-01 ... -2.7492592e-01\n", + " 5.7017130e-01 -1.5985624e-01]\n", + " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... -1.0776262e+01\n", + " 6.2166649e-01 8.3755457e-01]\n", + " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", + " -2.6652411e-01 -1.1917179e+00]]\n" + ] + } + ], + "source": [ + "# value states: torch.Size([1, 12, 24, 64])\n", + "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", + "key_states = value_states\n", + "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", + "# query_states: torch.Size([1, 12, 24, 64])\n", + "# key_states: torch.Size([1, 12, 24, 64])\n", + "# position_ids: torch.Size([1, 24])\n", + "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + "query_states = torch.zeros([1, 12, 24, 64])\n", + "position_ids = torch.arange(24).unsqueeze(0)\n", + "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", + "key_states = key_states.squeeze()\n", + "print(key_states.shape)\n", + "print(key_states[0,:,:])\n", + "print(hf_kproj_grads_before_rotary.shape)\n", + "print(hf_kproj_grads_before_rotary[:,:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23]], device='cuda:0')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.arange(24).unsqueeze(0).cuda()" + ] + }, { "cell_type": "code", "execution_count": null, From ab6a33f362f9c404a41ec7e749848959b6a93f4f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 19 Dec 2023 14:27:30 -0500 Subject: [PATCH 06/11] backup --- tests/peft/alignment_tests.ipynb | 346 ++++++++++++++++--------------- 1 file changed, 176 insertions(+), 170 deletions(-) diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb index a9382b9524..e2a8978ea3 100644 --- a/tests/peft/alignment_tests.ipynb +++ b/tests/peft/alignment_tests.ipynb @@ -217,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -225,25 +225,151 @@ "output_type": "stream", "text": [ "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.0.self_attn.o_proj.output_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_attention_shard-id_0_output_0\n", - "HF: [ 0. 0. 0. ... 0.02364488 -0.00304312\n", - " -0.01649825]\n", - "FF:[ 0. 0. 0. ... 0.02200473 0.01693928\n", - " -0.02354377]\n", - "[ True True True ... True False True]\n", - "[ 1541 1543 1545 ... 
18427 18428 18430]\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[8], line 10\u001b[0m\n\u001b[1;32m 8\u001b[0m hf_attn_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.self_attn.o_proj.output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 9\u001b[0m ff_attn_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layer-name_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_attention_shard-id_0_output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 10\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_attn_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_attn_out\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m hf_ffn_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.post_attention_layernorm.output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 12\u001b[0m ff_ffn_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layer-name_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_ffn_norm_shard-id_0_output_1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "Cell \u001b[0;32mIn[2], line 27\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mAssertionError\u001b[0m: 
" + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" ] } ], @@ -312,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -368,7 +494,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -451,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -510,129 +636,9 @@ "Ok!\n", "Ok!\n", "Ok!\n", - "(64, 12, 24)\n", - "(64, 12, 24)\n", - "torch.Size([12, 24, 64])\n", - "torch.Size([12, 64, 24])\n", "4.383680555555555% mismatch in QK prods softmax out grad\n", - "hf_kproj_grads_post_rotary: (24, 64, 12)\n", - "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", - " 3.2884139e-01 3.6066702e-01]\n", - " [-7.5168266e+00 4.6582484e+00 1.7284815e+01 ... 3.8785275e+01\n", - " 9.6879341e+01 9.8476219e+01]\n", - " [-8.0723800e-02 1.8924624e+00 -2.6913931e+00 ... -4.4056824e+01\n", - " -2.0001854e+01 -2.9799681e+01]\n", - " ...\n", - " [-1.9819270e-01 1.9175959e-01 1.8926021e-01 ... -2.7737719e-01\n", - " 5.7191163e-01 -1.5962012e-01]\n", - " [-2.5673387e+00 1.7033563e+00 2.2882986e+00 ... -1.0788559e+01\n", - " 6.3817674e-01 8.2335520e-01]\n", - " [-1.7806959e-01 8.9493655e-02 -1.9538833e-01 ... 3.1075442e+00\n", - " -2.6218265e-01 -1.1863230e+00]]\n", - "hf_kproj_grads_before_rotary: (24, 64, 12)\n", - "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 3.8629669e-01\n", - " 3.2884139e-01 3.6066702e-01]\n", - " [-2.8613457e+01 -5.5871558e+00 2.9384506e+01 ... 3.8781765e+01\n", - " 9.6900581e+01 9.8469597e+01]\n", - " [ 3.3027239e+00 1.8275940e-01 -1.8496730e+00 ... -4.4052174e+01\n", - " -2.0009745e+01 -2.9787930e+01]\n", - " ...\n", - " [-7.6470733e-02 -1.8891659e-01 3.6430117e-01 ... -2.7492592e-01\n", - " 5.7017130e-01 -1.5985624e-01]\n", - " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... 
-1.0776262e+01\n", - " 6.2166649e-01 8.3755457e-01]\n", - " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", - " -2.6652411e-01 -1.1917179e+00]]\n", - "ff_kproj_pre: (24, 64, 12)\n", - "[[-1.57300810e-02 -4.11607850e-02 3.05930820e-02 ... 3.86295587e-01\n", - " 3.28840941e-01 3.60667169e-01]\n", - " [-7.51684189e+00 4.65823793e+00 1.72848415e+01 ... 3.87852402e+01\n", - " 9.68793182e+01 9.84762802e+01]\n", - " [-8.07239790e-02 1.89246774e+00 -2.69139457e+00 ... -4.40568542e+01\n", - " -2.00018616e+01 -2.97996941e+01]\n", - " ...\n", - " [-1.98194161e-01 1.91760257e-01 1.89260900e-01 ... -2.77382791e-01\n", - " 5.71911991e-01 -1.59620658e-01]\n", - " [-2.56733608e+00 1.70335352e+00 2.28829479e+00 ... -1.07885523e+01\n", - " 6.38186097e-01 8.23350966e-01]\n", - " [-1.78069487e-01 8.94933720e-02 -1.95387334e-01 ... 3.10753584e+00\n", - " -2.62182117e-01 -1.18632054e+00]]\n", "3.9116753472222223% mismatch between HF and FF for kproj (before applying ROPE)\n", - "ff_kproj: (24, 64, 12)\n", - "[[-1.57300810e-02 -4.11607850e-02 3.05930820e-02 ... 3.86295587e-01\n", - " 3.28840941e-01 3.60667169e-01]\n", - " [-2.86135025e+01 -5.58717918e+00 2.93845501e+01 ... 3.87817307e+01\n", - " 9.69005585e+01 9.84696579e+01]\n", - " [ 3.30272818e+00 1.82759121e-01 -1.84967291e+00 ... -4.40522003e+01\n", - " -2.00097523e+01 -2.97879410e+01]\n", - " ...\n", - " [-7.64704790e-02 -1.88917309e-01 3.64301860e-01 ... -2.74931490e-01\n", - " 5.70171654e-01 -1.59856781e-01]\n", - " [ 2.57801986e+00 -1.81525516e+00 2.50875449e+00 ... -1.07762566e+01\n", - " 6.21675968e-01 8.37550282e-01]\n", - " [-6.83238800e-02 1.75684214e-01 -3.23107153e-01 ... 3.12022066e+00\n", - " -2.66523540e-01 -1.19171536e+00]]\n", "3.9008246527777777% mismatch between HF and FF for kproj (after applying ROPE)\n", - "HF Qproj:\n", - "torch.Size([24, 768])\n", - "\t reshaped: (24, 64, 12)\n", - "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n", - " 0.0000000e+00 0.0000000e+00]\n", - " [-2.1439367e-03 3.2949597e-03 -2.9551555e-04 ... 2.4234168e-01\n", - " 4.3675169e-02 -9.2218071e-02]\n", - " [ 2.2399018e+00 -3.3713050e+00 -9.7703063e-01 ... 1.4206999e+01\n", - " -1.9386978e+00 -1.7756876e+01]\n", - " ...\n", - " [ 8.7195921e+00 1.2150297e+01 9.2796574e+00 ... 4.7496593e+01\n", - " -2.7162397e+00 -2.6841351e+01]\n", - " [ 2.8459630e+00 -2.0782030e+01 5.8126745e+00 ... 3.3043846e+01\n", - " -1.4574212e+01 -4.2649174e+01]\n", - " [-7.3419094e-02 -4.3298864e+00 2.0055656e+00 ... -1.4900026e+00\n", - " -9.0601617e-01 2.9582092e-01]]\n", - "FF Qproj:\n", - "(24, 64, 12)\n", - "[[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00\n", - " 0.00000000e+00 0.00000000e+00]\n", - " [-2.14390700e-03 3.29491400e-03 -2.95521000e-04 ... 2.42338002e-01\n", - " 4.36745360e-02 -9.22166560e-02]\n", - " [ 2.23990273e+00 -3.37130690e+00 -9.77032721e-01 ... 1.42070026e+01\n", - " -1.93870103e+00 -1.77568874e+01]\n", - " ...\n", - " [ 8.71960449e+00 1.21503038e+01 9.27967071e+00 ... 4.74966431e+01\n", - " -2.71619344e+00 -2.68413410e+01]\n", - " [ 2.84595203e+00 -2.07820034e+01 5.81268263e+00 ... 3.30439415e+01\n", - " -1.45741787e+01 -4.26492157e+01]\n", - " [-7.34183120e-02 -4.32989836e+00 2.00555873e+00 ... -1.48999298e+00\n", - " -9.06009376e-01 2.95819134e-01]]\n", - "hf_attn_in: torch.Size([1, 24, 768])\n", - "hf_attn_in: (768, 24)\n", - "[[-7.52523500e+06 -1.27625415e+03 -4.39338150e+01 ... -3.34414902e+01\n", - " 2.38160934e+01 3.15938339e+01]\n", - " [-9.55138900e+06 6.71377197e+02 2.06871887e+02 ... 
-3.86393509e+01\n", - " 2.14816055e+01 -6.58599396e+01]\n", - " [ 1.14522670e+07 2.19898975e+03 -6.89673233e+00 ... 9.51593590e+00\n", - " -1.68612709e+01 6.02474251e+01]\n", - " ...\n", - " [ 2.10891925e+06 3.78648706e+03 1.02701221e+03 ... 3.59794388e+01\n", - " 5.03902206e+01 4.19777756e+01]\n", - " [ 2.11695300e+06 -2.36283508e+02 -1.08002625e+02 ... 9.36443710e+00\n", - " 3.84094887e+01 -7.51948738e+00]\n", - " [ 7.39155050e+06 1.11731885e+03 3.38369843e+02 ... 3.70399475e+01\n", - " 1.77629051e+01 9.76780853e+01]]\n", - "ff_attn_in: (768, 24)\n", - "[[-7.52523500e+06 -1.27625269e+03 -4.39337921e+01 ... -3.34414406e+01\n", - " 2.38161297e+01 3.15938721e+01]\n", - " [-9.55138800e+06 6.71377197e+02 2.06871750e+02 ... -3.86393204e+01\n", - " 2.14817352e+01 -6.58599167e+01]\n", - " [ 1.14522680e+07 2.19898877e+03 -6.89653015e+00 ... 9.51589775e+00\n", - " -1.68612289e+01 6.02473717e+01]\n", - " ...\n", - " [ 2.10891825e+06 3.78648633e+03 1.02701196e+03 ... 3.59794769e+01\n", - " 5.03901863e+01 4.19778595e+01]\n", - " [ 2.11695250e+06 -2.36283737e+02 -1.08002808e+02 ... 9.36445141e+00\n", - " 3.84095154e+01 -7.51950741e+00]\n", - " [ 7.39155000e+06 1.11731885e+03 3.38369934e+02 ... 3.70399170e+01\n", - " 1.77628460e+01 9.76780930e+01]]\n", "4.817708333333334% mismatch in attention input grads\n" ] }, @@ -643,7 +649,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[45], line 353\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpct_mismatch\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% mismatch in attention input grads\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(pct_mismatch \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.05\u001b[39m)\n\u001b[0;32m--> 353\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "Cell \u001b[0;32mIn[11], line 353\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpct_mismatch\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% mismatch in attention input grads\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(pct_mismatch \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.05\u001b[39m)\n\u001b[0;32m--> 353\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } @@ -829,10 +835,10 @@ " ##############################\n", " hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", - " print(hf_value_states.shape)\n", + " # print(hf_value_states.shape)\n", " ff_value_states = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_vcache\"\n", " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", - " print(ff_value_states.shape)\n", + " # 
print(ff_value_states.shape)\n", " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", " \n", " \n", @@ -852,8 +858,8 @@ " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", " ff_value_states = torch.from_numpy(ff_value_states)\n", " ff_value_states = ff_value_states.permute(1,0,2)\n", - " print(ff_attn_heads_grads.shape)\n", - " print(ff_value_states.shape)\n", + " # print(ff_attn_heads_grads.shape)\n", + " # print(ff_value_states.shape)\n", " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", " #print(\"Simulated QK prods grads:\")\n", @@ -905,8 +911,8 @@ " hf_kproj_grads_post_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_post_rotary.go_0\"\n", " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", - " print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", " # Check hf ROPE \n", " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", " cos = cos.cuda()\n", @@ -926,15 +932,15 @@ " hf_kproj_grads_before_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_before_rotary.go_0\"\n", " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", - " print(hf_kproj_grads_before_rotary[:,:,0])\n", + " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " # print(hf_kproj_grads_before_rotary[:,:,0])\n", " # Compare HF rope with manual ROPE\n", " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", " # Compare HF Kproj with FF Kproj (before ROPE) \n", " ff_kproj_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj_pre\"\n", " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", - " print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", - " print(ff_kproj_pre[:,:,0])\n", + " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " #print(ff_kproj_pre[:,:,0])\n", " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", @@ -944,8 +950,8 @@ " \n", " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", - " print(\"ff_kproj: \", ff_kproj.shape)\n", - " print(ff_kproj[:,:,0])\n", + " # print(\"ff_kproj: \", ff_kproj.shape)\n", + " #print(ff_kproj[:,:,0])\n", " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", " 
mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", @@ -968,30 +974,30 @@ " # Compare QProj\n", " hf_qproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.q_proj.go_0\"\n", " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", - " print(\"HF Qproj:\")\n", - " print(hf_qproj_grads.shape)\n", + " # print(\"HF Qproj:\")\n", + " # print(hf_qproj_grads.shape)\n", " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", - " print(\"\\t reshaped: \", reshaped_tensor.shape)\n", - " print(reshaped_tensor[:,:,0])\n", + " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " # print(reshaped_tensor[:,:,0])\n", " ff_qproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", - " print(\"FF Qproj:\")\n", - " print(ff_qproj.shape)\n", - " print(ff_qproj[:,:,0])\n", + " # print(\"FF Qproj:\")\n", + " # print(ff_qproj.shape)\n", + " # print(ff_qproj[:,:,0])\n", " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", "\n", " hf_attn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.input_layernorm.go_0\"\n", " hf_attn_in = torch.load(hf_attn_in)\n", - " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", " hf_attn_in = hf_attn_in.squeeze().T\n", " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", - " print(\"hf_attn_in: \", hf_attn_in.shape)\n", - " print(hf_attn_in)\n", + " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " # print(hf_attn_in)\n", "\n", " ff_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_attn_final_grad_in\"\n", " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", - " print(\"ff_attn_in: \", ff_attn_in.shape)\n", - " print(ff_attn_in)\n", + " # print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " # print(ff_attn_in)\n", " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", "\n", " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", @@ -1006,7 +1012,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1066,7 +1072,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [ { From 886d04fde3a3938a187ea4d0897809800848bb40 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 26 Dec 2023 10:58:03 -0500 Subject: [PATCH 07/11] backup --- .../ops/add_bias_residual_layer_norm.h | 2 - inference/models/opt.cc | 17 +++-- src/ops/add_bias_residual_layer_norm.cc | 25 ++---- src/ops/add_bias_residual_layer_norm.cu | 62 ++++++--------- src/ops/inc_multihead_self_attention.cc | 2 +- src/ops/layer_norm.cu | 18 ----- src/ops/linear.cc | 10 ++- src/ops/lora_linear.cc | 4 +- src/ops/residual_layer_norm.cc | 22 +++--- src/ops/residual_layer_norm.cu | 76 +++++++++++-------- src/ops/softmax.cc | 2 +- 11 files changed, 109 insertions(+), 131 deletions(-) diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 5c4a49f998..38bb825a4d 100644 --- 
a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -124,7 +124,6 @@ class AddBiasResidualLayerNorm : public Op { T const *output_grad_ptr, T *input_grad_ptr, T *residual_grad_ptr, - T *attn_bias_grad_ptr, T const *gamma_ptr, ffStream_t stream); static void @@ -132,7 +131,6 @@ class AddBiasResidualLayerNorm : public Op { GenericTensorAccessorR const &output_grad, GenericTensorAccessorW &input_grad, GenericTensorAccessorW const &residual_grad, - GenericTensorAccessorW const &attn_bias_grad, GenericTensorAccessorR const &gamma); public: diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9069aef9e1..fa3bc29041 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -193,7 +193,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor fc1 = ff.dense(final_norm, opt_config.ffn_dim, - AC_MODE_NONE, + AC_MODE_RELU, true, DT_NONE, nullptr, @@ -202,8 +202,8 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers_" + std::to_string(i) + "_fc1").c_str()); - Tensor activation = ff.relu(fc1, false); - fc2 = ff.dense(activation, + //Tensor activation = ff.relu(fc1, false); + fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, true, @@ -216,17 +216,18 @@ void OPT::create_opt_model(FFModel &ff, std::string("layers_" + std::to_string(i) + "_fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer ff.lora_linear( - activation, + fc1, fc2, OP_LORA_MLP_SECOND, std::string("layers_" + std::to_string(i) + "_fc2_lora").c_str()); } // final + Tensor final_residual_ln_output[2] = {nullptr, nullptr}; ff.residual_layer_norm(added, fc2, nullptr, - res_ln_outputs, + final_residual_ln_output, false, axes, opt_config.layer_norm_elementwise_affine, @@ -234,9 +235,8 @@ void OPT::create_opt_model(FFModel &ff, true, DT_NONE, "final_layer_norm"); - Tensor all_final_norm = res_ln_outputs[1]; - Tensor lm_head = ff.dense(all_final_norm, + Tensor lm_head = ff.dense(final_residual_ln_output[1], opt_config.vocab_size, AC_MODE_NONE, false, @@ -255,7 +255,8 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ true); } else { // output = ff.arg_top_k(lm_head, /*k=*/1, false); - output = ff.argmax(lm_head, /*beam_Search*/ false); + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } //------------------- compile the model -------------------------------- diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index be7b357f23..65247939b9 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -931,7 +931,7 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -939,18 +939,10 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); - // attn bias grad - launcher.add_region_requirement( - RegionRequirement(batch_inputs[2]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[2]->region_grad)); - launcher.add_field(field_id++, FID_DATA); if (elementwise_affine) { // gamma launcher.add_region_requirement(RegionRequirement(weights[0]->part, @@ -1001,14 +993,6 @@ void AddBiasResidualLayerNorm::peft_bwd_task( ctx, runtime); - GenericTensorAccessorW attn_bias_grad = - helperGetGenericTensorAccessorRW(m->weight_type[0], - regions[region_idx++], - task->regions[task_region_idx++], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorR gamma; if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); @@ -1019,14 +1003,15 @@ void AddBiasResidualLayerNorm::peft_bwd_task( ctx, runtime); } + std::string op_name_without_uid = AddBiasResidualLayerNorm::get_op_name_without_uid(m); + std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl; AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( - m, output_grad, input_grad, residual_grad, attn_bias_grad, gamma); + m, output_grad, input_grad, residual_grad, gamma); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector weights_accessors; - weights_accessors.push_back(attn_bias_grad); if (m->elementwise_affine) { weights_accessors.push_back(gamma); } diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 097ace3676..08e3bb3edf 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -101,9 +101,9 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) + val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) ? shared[lid] - : 0; + : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -536,8 +536,9 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T const *__restrict__ rstd, T const *__restrict__ gamma, T *dX, - T *dX_residual1, - T *dX_residual2, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, int const N, T *buf) { auto const i1 = blockIdx.x; @@ -549,9 +550,7 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T const *X_i = X + i1 * N; T const *dY_i = dY + i1 * N; T *dX_i = dX + i1 * N; - T *dX_residual1_i = dX_residual1 + i1 * N; - T *dX_residual2_i = - (dX_residual2 != nullptr) ? 
dX_residual2 + i1 * N : nullptr; + T *dX_residual_i = dX_residual + i1 * N; // vectorized reads don't improve perf, so use regular unrolling for (; l + unroll - 1 < N; l += blockDim.x * unroll) { @@ -592,10 +591,15 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, f_grad_input -= (x - mean_val) * rstd_val * stats_x2; f_grad_input -= stats_x1; f_grad_input *= term1; - dX_i[l] += f_grad_input; - dX_residual1_i[l] += f_grad_input; - if (dX_residual2 != nullptr) { - dX_residual2_i[l] += f_grad_input; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; } } } @@ -607,13 +611,14 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, T const *__restrict__ rstd, T const *__restrict__ gamma, T *dX, - T *dX_residual1, - T *dX_residual2, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, int const N) { alignas(sizeof(double)) extern __shared__ char s_data1[]; T *buf = reinterpret_cast(&s_data1); - compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); + compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual, reset_input_grad, reset_residual_grad, N, buf); } /*static*/ @@ -661,7 +666,8 @@ void AddBiasResidualLayerNorm::backward_kernel( gamma_ptr, input_grad_ptr, residual_grad_ptr, - attn_bias_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { @@ -764,29 +770,11 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel( T const *output_grad_ptr, T *input_grad_ptr, T *residual_grad_ptr, - T *attn_bias_grad_ptr, T const *gamma_ptr, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); @@ -799,7 +787,8 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel( gamma_ptr, input_grad_ptr, residual_grad_ptr, - attn_bias_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], N); } @@ -809,7 +798,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( GenericTensorAccessorR const &output_grad, GenericTensorAccessorW &input_grad, GenericTensorAccessorW const &residual_grad, - GenericTensorAccessorW const &attn_bias_grad, GenericTensorAccessorR const &gamma) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -825,7 +813,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( output_grad.get_float_ptr(), input_grad.get_float_ptr(), residual_grad.get_float_ptr(), - attn_bias_grad.get_float_ptr(), m->elementwise_affine ? gamma.get_float_ptr() : nullptr, stream); } else if (m->output_type[0] == DT_HALF) { @@ -833,7 +820,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( output_grad.get_half_ptr(), input_grad.get_half_ptr(), residual_grad.get_half_ptr(), - attn_bias_grad.get_half_ptr(), m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, stream); } else { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 569b35097d..562824d7d5 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -935,7 +935,7 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 6e12c53230..1d4e94d7d5 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -664,24 +664,6 @@ void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index e71be3bbf4..a4e9ba5ce1 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -564,6 +564,7 @@ FutureMap Linear::inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { + printf("\tentering inference for %s\n", name); ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -617,10 +618,14 @@ void Linear::inference_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + printf("\tEntering inference task\n"); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + std::string op_name_without_uid = Linear::get_op_name_without_uid(m); + printf("FWD %s\n", op_name_without_uid.c_str()); + bc->print(); if (bc->num_tokens == 0) { return; } @@ -700,7 +705,7 @@ FutureMap Linear::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); @@ -757,6 +762,9 @@ void Linear::peft_bwd_task(Task const *task, int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + std::string op_name_without_uid = Linear::get_op_name_without_uid(m); + std::cout << "BWD " << op_name_without_uid << std::endl; + int num_infr_tokens = bc->num_active_infr_tokens(); int num_peft_tokens = bc->num_active_peft_tokens(); if (m->inference_debugging) { diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 9ed411397d..e39b444af4 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -589,14 +589,14 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(1, FID_DATA); diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index c142e47e62..ce24415291 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -723,7 +723,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -731,7 +731,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -740,7 +740,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[2] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -768,9 +768,7 @@ void ResidualLayerNorm::peft_bwd_task( } assert(task->regions.size() == regions.size()); ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == - 4 + m->use_two_residuals + - (m->elementwise_affine ? (m->use_bias ? 
3 : 2) : 0)); + assert(regions.size() == 3 + m->use_two_residuals + m->elementwise_affine); int region_idx = 0, task_region_idx = 0; @@ -807,14 +805,16 @@ void ResidualLayerNorm::peft_bwd_task( } GenericTensorAccessorR gamma; if (m->elementwise_affine) { - assert(m->use_bias == (regions.size() == 6)); - gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + gamma = helperGetGenericTensorAccessorRO(m->weight_type[0], regions[region_idx++], task->regions[task_region_idx++], FID_DATA, ctx, runtime); } + std::string op_name_without_uid = ResidualLayerNorm::get_op_name_without_uid(m); + std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl; + ResidualLayerNorm::peft_bwd_kernel_wrapper( m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); @@ -942,12 +942,14 @@ void ResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); + std::string op_name_without_uid = ResidualLayerNorm::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; if (bc->num_tokens == 0) { + bc->print(); return; } - ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == 4 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 4bfac1887f..0b6624c4ab 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -239,36 +239,34 @@ void ResidualLayerNorm::inference_kernel_wrapper( } assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int in_dim = - added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->input_activation = allocator->allocate_instance_untyped( data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + printf("Allocating input_activation (%p) of size: %i*%i*%i=%i for %s...\n", m->input_activation, data_type_size(m->input_type[0]), num_peft_tokens,in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, m->op_name); // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_float_ptr() + tokens_previous_requests * in_dim, + added_output.get_float_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); } else if (m->input_type[0] == DT_HALF) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_half_ptr() + tokens_previous_requests * in_dim, + added_output.get_half_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); @@ -481,6 
+479,9 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T *dX, T *dX_residual1, T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, int const N, T *buf) { auto const i1 = blockIdx.x; @@ -535,10 +536,22 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, f_grad_input -= (x - mean_val) * rstd_val * stats_x2; f_grad_input -= stats_x1; f_grad_input *= term1; - dX_i[l] += f_grad_input; - dX_residual1_i[l] += f_grad_input; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } if (dX_residual2 != nullptr) { - dX_residual2_i[l] += f_grad_input; + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } } } } @@ -552,11 +565,13 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, T *dX, T *dX_residual1, T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, int const N) { alignas(sizeof(double)) extern __shared__ char s_data1[]; T *buf = reinterpret_cast(&s_data1); - - compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); + compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, reset_input_grad, reset_residual_grad1, reset_residual_grad2, N, buf); } /*static*/ @@ -604,6 +619,9 @@ void backward_kernel(ResidualLayerNormMeta const *m, input_grad_ptr, residual1_grad_ptr, residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { @@ -710,28 +728,23 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); + + sleep(10); + printf("Attempting to access %p\n", m->input_activation); + check_device_vs_host_ptr(static_cast(m->input_activation)); + check_device_vs_host_ptr(static_cast(m->mean_ptr)); + check_device_vs_host_ptr(static_cast(m->rstd_ptr)); + check_device_vs_host_ptr(static_cast(gamma_ptr)); + check_device_vs_host_ptr(static_cast(input_grad_ptr)); + check_device_vs_host_ptr(static_cast(residual1_grad_ptr)); + sleep(10); + assert(false); + layer_norm_grad_input_kernel<<>>( output_grad_ptr, static_cast(m->input_activation), @@ -741,6 +754,9 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, input_grad_ptr, residual1_grad_ptr, residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], N); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 23f2eb9edf..8313273c49 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -411,7 +411,7 @@ FutureMap Softmax::peft_bwd(FFModel const &ff, launcher.add_region_requirement( 
RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); From 66c66f2f1baf313ab9a192f4b502c380ca1c1b01 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 1 Jan 2024 10:52:17 -0500 Subject: [PATCH 08/11] backup --- .../flexflow/ops/kernels/softmax_kernels.h | 3 +- include/flexflow/ops/residual_layer_norm.h | 1 + inference/models/opt.cc | 2 + src/ops/add_bias_residual_layer_norm.cc | 13 ++-- src/ops/fused.cc | 18 +++++ src/ops/fused.cu | 70 ++++++++----------- src/ops/inc_multihead_self_attention.cc | 12 ++-- src/ops/kernels/softmax.cu | 13 +++- src/ops/linear.cc | 5 +- src/ops/lora_linear.cc | 2 + src/ops/residual_layer_norm.cc | 25 +++++-- src/ops/residual_layer_norm.cu | 5 +- src/ops/residual_rms_norm.cc | 6 +- src/ops/softmax.cc | 22 ++++-- 14 files changed, 124 insertions(+), 73 deletions(-) diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index db5e9799e9..b3dfe4f430 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -39,7 +39,8 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad); void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 35ddb171d4..d924132452 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -28,6 +28,7 @@ class ResidualLayerNorm : public Op { float _eps, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index fa3bc29041..28ab2aea7d 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -224,6 +224,8 @@ void OPT::create_opt_model(FFModel &ff, // final Tensor final_residual_ln_output[2] = {nullptr, nullptr}; + // ff.residual_rms_norm(added, fc2, final_residual_ln_output, 1e-05, opt_config.hidden_size, + // DT_NONE, "final_layer_norm"); ff.residual_layer_norm(added, fc2, nullptr, diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 65247939b9..a8a9e05e3d 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -618,12 +618,15 @@ void AddBiasResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { - return; - } + AddBiasResidualLayerNormMeta *m = *((AddBiasResidualLayerNormMeta **)task->local_args); + std::string op_name_without_uid = AddBiasResidualLayerNorm::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; + if (bc->num_tokens == 0) { + return; + } assert(regions.size() == 5 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); @@ -945,11 +948,11 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( launcher.add_field(field_id++, FID_DATA); if (elementwise_affine) { // gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, + launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - weights[0]->region)); + weights[1]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index ea1c970cc5..632c331e1f 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -487,6 +487,11 @@ FutureMap FusedOp::inference(FFModel const &ff, // so we transfer the maximum of them // size_t batch_config_size = // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); + printf("FUSED! INFERENCE! %i ops\n", numOperators); + for (int i=0; iop_type << " " << oppp->name << std::endl; + } IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -528,6 +533,19 @@ FutureMap FusedOp::inference(FFModel const &ff, batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } + offset += numOutputs; + // add softmax output grad + if (operators[numOperators-1]->op_type == OP_SOFTMAX) { + printf("operator %i is last SOFTMAX! adding output %i\n", numOperators-1, numOutputs-1); + assert(outputs[numOutputs-1]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[numOutputs-1]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[numOutputs-1]->region_grad)); + launcher.add_field(offset, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 9954a8b43a..25f15d8efd 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -95,8 +95,9 @@ __host__ void assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); + bool softmax_grad_additional_region = (fused->op_op_type[fused->numOperators-1] == OP_SOFTMAX); assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); + fused->numInputs + fused->numWeights + fused->numOutputs + softmax_grad_additional_region); // Domain input_domain[MAX_NUM_INPUTS]; // Domain weight_domain[MAX_NUM_WEIGHTS]; // Domain output_domain[MAX_NUM_OUTPUTS]; @@ -141,6 +142,7 @@ __host__ void ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -625,9 +627,19 @@ __host__ void assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators -1) { // if this is the final operator + printf("op %i is softmax! 
Accessing region %i\n", fused->numOperators -1, roff); + output_accessor[fused->numOutputs] = + helperGetGenericTensorAccessorWO(fused->output_data_types[fused->numOutputs-1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; Kernels::Softmax::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + m, bc, my_input_accessor[0], my_output_accessor[0], output_accessor[fused->numOutputs]); break; } case OP_ALLREDUCE: { @@ -1008,7 +1020,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, bc, - my_output_grad_accessor[0], + my_output_grad_accessor[1], my_input_grad_accessor[0], my_input_grad_accessor[1], my_weight_accessor[0]); @@ -1078,27 +1090,20 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_weights[op] == 2); // weight + bias } } - GenericTensorAccessorR residual2; + GenericTensorAccessorW residual2; if (m->use_two_residuals) { residual2 = my_input_grad_accessor[2]; } - GenericTensorAccessorR gamma, beta; + GenericTensorAccessorR gamma; if (m->elementwise_affine) { gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } } - // TODO: implment me - assert(false); - // ResidualLayerNorm::inference_kernel_wrapper(m, - // my_input_accessor[0], - // my_input_accessor[1], - // residual2, - // my_output_accessor[0], - // my_output_accessor[1], - // gamma, - // beta); + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { @@ -1115,31 +1120,16 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias } } - GenericTensorAccessorR gamma, beta; + GenericTensorAccessorR gamma; if (m->elementwise_affine) { gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_grad_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - // TODO: implement me - assert(false); - // AddBiasResidualLayerNorm::inference_kernel_wrapper( - // m, - // attn_bias_dim, - // residual_volume, - // my_input_accessor[0], - // my_output_accessor[0], - // my_output_accessor[1], - // my_input_accessor[1], - // my_weight_accessor[0], - // gamma, - // beta); + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); break; } case OP_SIGMOID_SILU_MULTI: { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 562824d7d5..2491634a76 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -818,12 +818,16 @@ void IncMultiHeadSelfAttention::inference_task( log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", bc->num_tokens, bc->num_active_requests()); - if (bc->num_tokens == 0) { - return; - } + IncMultiHeadSelfAttentionMeta *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); + std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; + + if (bc->num_tokens == 0) { + return; + } 
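// ---------------------------------------------------------------------------
// [Editorial note, not part of the patch] The Softmax changes later in this
// patch make the inference kernel copy the forward softmax probabilities into
// the output-grad region. A minimal sketch of why that helps PEFT
// fine-tuning, assuming the loss is sparse categorical cross-entropy: the
// gradient of that loss w.r.t. the logits is `probs - one_hot(target)`, so if
// the probabilities already sit in the grad buffer, the backward kernel only
// has to subtract 1 at each token's target class (this is what
// sparse_categorical_crossentropy_loss_peft_backward does later in the
// series). The kernel below is an illustrative sketch only; the names `grad`,
// `probs`, and `token_ids` are assumptions for this note, not FlexFlow APIs.
__global__ void sparse_xent_grad_from_softmax(float *grad,
                                              float const *probs,
                                              int const *token_ids,
                                              int num_tokens,
                                              int num_classes) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_tokens * num_classes) {
    grad[i] = probs[i]; // start from p(class | token)
    if (i % num_classes == token_ids[i / num_classes]) {
      grad[i] -= 1.0f; // subtract 1 at the ground-truth class
    }
  }
}
// ---------------------------------------------------------------------------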
assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 : regions.size() == 3)); @@ -860,8 +864,6 @@ void IncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); - std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); - std::cout << "INF " << op_name_without_uid << std::endl; IncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, task->index_point.point_data[0], input, weight, output, biases); diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 115461c129..1624c0458d 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -121,7 +121,8 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -138,6 +139,11 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_float_ptr(), num_classes, stream); + checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); } else if (m->output_type[0] == DT_HALF) { Internal::inference_kernel(m, bc, @@ -145,6 +151,11 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_half_ptr(), num_classes, stream); + checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + cudaMemcpyDeviceToDevice, + stream)); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index a4e9ba5ce1..595b8d24e9 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -564,7 +564,6 @@ FutureMap Linear::inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { - printf("\tentering inference for %s\n", name); ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; @@ -618,14 +617,12 @@ void Linear::inference_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - printf("\tEntering inference task\n"); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); std::string op_name_without_uid = Linear::get_op_name_without_uid(m); - printf("FWD %s\n", op_name_without_uid.c_str()); - bc->print(); + printf("INF %s\n", op_name_without_uid.c_str()); if (bc->num_tokens == 0) { return; } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index e39b444af4..fb13dc99cb 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -449,6 +449,8 @@ void LoraLinear::inference_task(Task const *task, Context ctx, Runtime *runtime) { LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_tokens() == 0) { return; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index ce24415291..7697613ae0 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -117,7 
+117,6 @@ void FFModel::residual_layer_norm(const Tensor input, } int num_weights = elementwise_affine ? (use_bias ? 2 : 1) : 0; - Layer *ln = nullptr; Tensor casted_input = (data_type != input->data_type) ? cast(input, data_type, "type cast for residual_layer_norm") @@ -133,7 +132,7 @@ void FFModel::residual_layer_norm(const Tensor input, ? cast(residual2, data_type, "type cast for residual2_layer_norm") : residual2; } - ln = new Layer(this, + Layer *ln = new Layer(this, OP_RESIDUAL_LAYERNORM, data_type, name, @@ -144,9 +143,9 @@ void FFModel::residual_layer_norm(const Tensor input, casted_residual1, casted_residual2); ln->outputs[0] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 0, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 0, true /*create_grad*/); ln->outputs[1] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 1, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 1, true /*create_grad*/); { int numdims = axes.size(); int dims[numdims]; @@ -326,6 +325,18 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, } } +void ResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); +} + void ResidualLayerNorm::init_inference( FFModel const &ff, std::vector const &batch_inputs, @@ -439,11 +450,11 @@ void ResidualLayerNorm::init(FFModel const &ff) { launcher.add_field(field_id++, FID_DATA); // residual2 if (use_two_residuals) { - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[1]->region)); + inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } // added: input + residual(s) @@ -946,7 +957,7 @@ void ResidualLayerNorm::inference_task( std::string op_name_without_uid = ResidualLayerNorm::get_op_name_without_uid(m); std::cout << "INF " << op_name_without_uid << std::endl; if (bc->num_tokens == 0) { - bc->print(); + printf("Zero tokens\n"); return; } diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 0b6624c4ab..2164616b88 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -734,7 +734,6 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); - sleep(10); printf("Attempting to access %p\n", m->input_activation); check_device_vs_host_ptr(static_cast(m->input_activation)); check_device_vs_host_ptr(static_cast(m->mean_ptr)); @@ -742,8 +741,8 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, check_device_vs_host_ptr(static_cast(gamma_ptr)); check_device_vs_host_ptr(static_cast(input_grad_ptr)); check_device_vs_host_ptr(static_cast(residual1_grad_ptr)); - sleep(10); - assert(false); + + return; layer_norm_grad_input_kernel<<>>( output_grad_ptr, diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index aa72d7d32a..ff72b2273a 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -90,9 +90,9 @@ void FFModel::residual_rms_norm(const Tensor input1, casted_input2); rm->outputs[0] = 
create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 0, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 0, true /*create_grad*/); rm->outputs[1] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 1, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 1, true /*create_grad*/); // weights int weight_dims[1] = {dim}; @@ -100,7 +100,7 @@ void FFModel::residual_rms_norm(const Tensor input1, weight_dims, data_type, rm, - true /*create_grad*/, + false /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 8313273c49..700162ade2 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -355,6 +355,13 @@ FutureMap Softmax::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); + // we add the region below in order to copy the output to the grad tensor + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -363,20 +370,26 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3); + assert(task->regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + + std::string op_name_without_uid = Softmax::get_op_name_without_uid(m); + std::cout << "INF " << op_name_without_uid << std::endl; if (bc->num_tokens == 0) { return; } Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - inference_kernel_wrapper(m, bc, input, output); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + inference_kernel_wrapper(m, bc, input, output, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -429,6 +442,7 @@ void Softmax::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + printf("BWD softmax\n"); assert(task->regions.size() == regions.size()); assert(regions.size() == 2); assert(task->regions.size() == 2); From 1f86c29ce21a235fb6eaa0b50ab48d3cefd77313 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 2 Jan 2024 21:55:50 -0500 Subject: [PATCH 09/11] fix --- src/ops/inc_multihead_self_attention.cu | 103 ++++++++++++++---------- 1 file changed, 62 insertions(+), 41 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b1c3db25dc..6bcb6d42ea 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1018,10 +1018,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save result to file for checking - 
std::string filename = base_filepath + "_o_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_, filename.c_str()); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = base_filepath + "_o_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_, filename.c_str()); + } } // Step 2: compute gradients w.r.t. value { @@ -1074,12 +1076,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // save result to file for checking - std::string filename = base_filepath + "_v_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); - std::string filename2 = base_filepath + "_qk_prods_softmax"; - std::cout << "FILENAME: " << filename2 << std::endl; - save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); + if (m->inference_debugging) { + std::string filename = base_filepath + "_v_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); + std::string filename2 = base_filepath + "_qk_prods_softmax"; + std::cout << "FILENAME: " << filename2 << std::endl; + save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); + } } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { @@ -1128,12 +1132,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - std::string filename4 = base_filepath + "_qk_prods_softmax_grad"; - std::cout << "FILENAME: " << filename4 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename4.c_str()); - std::string filename5 = base_filepath + "_vcache"; - std::cout << "FILENAME: " << filename5 << std::endl; - save_tensor(B, m->vProjSize * m->num_q_heads * num_tokens, filename5.c_str()); + if (m->inference_debugging) { + std::string filename4 = base_filepath + "_qk_prods_softmax_grad"; + std::cout << "FILENAME: " << filename4 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename4.c_str()); + std::string filename5 = base_filepath + "_vcache"; + std::cout << "FILENAME: " << filename5 << std::endl; + save_tensor(B, m->vProjSize * m->num_q_heads * num_tokens, filename5.c_str()); + } } // Step 4: softmax backpropagation { @@ -1161,10 +1167,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->qk_tensor, m->qk_prods)); - DT *C = static_cast
<DT *>(m->qk_prods); - std::string filename6 = base_filepath + "_qk_prods_softmax_grad_in"; - std::cout << "FILENAME: " << filename6 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename6.c_str()); + if (m->inference_debugging) { + DT *C = static_cast<DT *>
(m->qk_prods); + std::string filename6 = base_filepath + "_qk_prods_softmax_grad_in"; + std::cout << "FILENAME: " << filename6 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename6.c_str()); + } // TODO: fill all elements above diagonal to force causal attention size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; @@ -1181,9 +1189,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, DT(0.0f)); } - std::string filename7 = base_filepath + "_qk_prods_softmax_grad_in_masked"; - std::cout << "FILENAME: " << filename7 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename7.c_str()); + if (m->inference_debugging) { + DT *C = static_cast<DT *>
(m->qk_prods); + std::string filename7 = base_filepath + "_qk_prods_softmax_grad_in_masked"; + std::cout << "FILENAME: " << filename7 << std::endl; + save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename7.c_str()); + } } // Step 5: compute gradients w.r.t. key { @@ -1238,12 +1249,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - std::string filename8 = base_filepath + "_query_activation"; - std::cout << "FILENAME: " << filename8 << std::endl; - save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str()); - std::string filename9 = base_filepath + "_devkproj_pre"; - std::cout << "FILENAME: " << filename9 << std::endl; - save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); + if (m->inference_debugging) { + std::string filename8 = base_filepath + "_query_activation"; + std::cout << "FILENAME: " << filename8 << std::endl; + save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str()); + std::string filename9 = base_filepath + "_devkproj_pre"; + std::cout << "FILENAME: " << filename9 << std::endl; + save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); + } } // Step 6: compute gradients w.r.t query { @@ -1294,9 +1307,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - std::string filename3 = base_filepath + "_devQKVPRojArray_pre"; - std::cout << "FILENAME: " << filename3 << std::endl; - save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + if (m->inference_debugging) { + std::string filename3 = base_filepath + "_devQKVPRojArray_pre"; + std::cout << "FILENAME: " << filename3 << std::endl; + save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + } } // Compute rotary embeddings bwd @@ -1318,17 +1333,21 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, num_tokens, m->hidden_size); DT *C = static_cast
(m->devQKVProjArray); - std::string filename3 = base_filepath + "_devQKVPRojArray"; - std::cout << "FILENAME: " << filename3 << std::endl; - save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + if (m->inference_debugging) { + std::string filename3 = base_filepath + "_devQKVPRojArray"; + std::cout << "FILENAME: " << filename3 << std::endl; + save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); + } } // matrix C: gradients for key (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] DT *C = static_cast
(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients - std::string filename9 = base_filepath + "_devkproj"; - std::cout << "FILENAME: " << filename9 << std::endl; - save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); + if (m->inference_debugging) { + std::string filename9 = base_filepath + "_devkproj"; + std::cout << "FILENAME: " << filename9 << std::endl; + save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); + } } // Step 7: compute gradients w.r.t. input { @@ -1371,9 +1390,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - std::string filename12 = base_filepath + "_attn_final_grad_in"; - std::cout << "FILENAME: " << filename12 << std::endl; - save_tensor(C, num_tokens * m->qSize, filename12.c_str()); + if (m->inference_debugging) { + std::string filename12 = base_filepath + "_attn_final_grad_in"; + std::cout << "FILENAME: " << filename12 << std::endl; + save_tensor(C, num_tokens * m->qSize, filename12.c_str()); + } } } } From 6a0c899956800ef5bbdc50016a7fc349bc967b7a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 Jan 2024 22:31:25 -0500 Subject: [PATCH 10/11] cleanup --- inference/models/opt.cc | 9 +-- src/ops/add_bias_residual_layer_norm.cc | 11 +-- src/ops/argmax.cc | 5 ++ src/ops/fused.cc | 5 -- src/ops/fused.cu | 1 - src/ops/inc_multihead_self_attention.cc | 50 +------------ src/ops/inc_multihead_self_attention.cu | 96 +------------------------ src/ops/kernels/softmax.cu | 25 ++++--- src/ops/linear.cc | 5 -- src/ops/lora_linear.cc | 2 - src/ops/residual_layer_norm.cc | 6 -- src/ops/residual_layer_norm.cu | 11 --- src/ops/residual_rms_norm.cc | 69 ------------------ src/ops/softmax.cc | 7 +- tests/peft/hf_finetune.py | 6 -- 15 files changed, 28 insertions(+), 280 deletions(-) diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 28ab2aea7d..e0e940b186 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -202,7 +202,6 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers_" + std::to_string(i) + "_fc1").c_str()); - //Tensor activation = ff.relu(fc1, false); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -223,13 +222,10 @@ void OPT::create_opt_model(FFModel &ff, } // final - Tensor final_residual_ln_output[2] = {nullptr, nullptr}; - // ff.residual_rms_norm(added, fc2, final_residual_ln_output, 1e-05, opt_config.hidden_size, - // DT_NONE, "final_layer_norm"); ff.residual_layer_norm(added, fc2, nullptr, - final_residual_ln_output, + res_ln_outputs, false, axes, opt_config.layer_norm_elementwise_affine, @@ -237,8 +233,9 @@ void OPT::create_opt_model(FFModel &ff, true, DT_NONE, "final_layer_norm"); + Tensor all_final_norm = res_ln_outputs[1]; - Tensor lm_head = ff.dense(final_residual_ln_output[1], + Tensor lm_head = ff.dense(all_final_norm, opt_config.vocab_size, AC_MODE_NONE, false, diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index a8a9e05e3d..88a34b7eb5 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -618,16 +618,13 @@ void AddBiasResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - - - AddBiasResidualLayerNormMeta *m = - *((AddBiasResidualLayerNormMeta **)task->local_args); - std::string op_name_without_uid = 
AddBiasResidualLayerNorm::get_op_name_without_uid(m); - std::cout << "INF " << op_name_without_uid << std::endl; if (bc->num_tokens == 0) { return; } + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 5 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); @@ -1006,8 +1003,6 @@ void AddBiasResidualLayerNorm::peft_bwd_task( ctx, runtime); } - std::string op_name_without_uid = AddBiasResidualLayerNorm::get_op_name_without_uid(m); - std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl; AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( m, output_grad, input_grad, residual_grad, gamma); diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index cabb8b204f..dd0e2bb822 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -392,6 +392,11 @@ InferenceResult GenericTensorAccessorW parent; int batch_size = bc->num_active_infr_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + // Note that we free activation allocator here since argmax is the + // last operator in forward + if (m->handle.peft_activation_allocator != nullptr) { + m->handle.peft_activation_allocator->free_all(); + } InferenceResult ir; if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 632c331e1f..e18486289f 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -487,11 +487,6 @@ FutureMap FusedOp::inference(FFModel const &ff, // so we transfer the maximum of them // size_t batch_config_size = // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); - printf("FUSED! INFERENCE! %i ops\n", numOperators); - for (int i=0; iop_type << " " << oppp->name << std::endl; - } IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 25f15d8efd..17586e925f 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -628,7 +628,6 @@ __host__ void assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); if (op == fused->numOperators -1) { // if this is the final operator - printf("op %i is softmax! Accessing region %i\n", fused->numOperators -1, roff); output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO(fused->output_data_types[fused->numOutputs-1], regions[roff], diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 2491634a76..f590fa0440 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -818,17 +818,13 @@ void IncMultiHeadSelfAttention::inference_task( log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", bc->num_tokens, bc->num_active_requests()); - - - IncMultiHeadSelfAttentionMeta *m = - *((IncMultiHeadSelfAttentionMeta **)task->local_args); - std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); - std::cout << "INF " << op_name_without_uid << std::endl; - if (bc->num_tokens == 0) { return; } + IncMultiHeadSelfAttentionMeta *m = + *((IncMultiHeadSelfAttentionMeta **)task->local_args); + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 : regions.size() == 3)); @@ -880,36 +876,6 @@ void IncMultiHeadSelfAttention::inference_task( } } -template -void load_tensor_from_file(DT *ptr, size_t size, std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - std::cout << "Could not open file: " << filepath << std::endl; - } - assert(in.good() && "incorrect weight file path"); - std::vector
host_array(size); - size_t loaded_data_size = sizeof(DT) * size; - in.seekg(0, in.end); - in.seekg(0, in.beg); - in.read((char *)host_array.data(), loaded_data_size); - - size_t in_get_size = in.gcount(); - if (in_get_size != loaded_data_size) { - std::cout << "load weight data error " << in_get_size << ", " - << loaded_data_size << ", " << sizeof(DT) << std::endl; - assert(false); - } - assert(size == host_array.size()); - - copy_tensor_host_to_dev(ptr, host_array.data(), size); - - // // normal - // long data_index = 0; - // for (auto v : host_array) { - // ptr[data_index++] = v; - // } - in.close(); -} FutureMap IncMultiHeadSelfAttention::peft_bwd( FFModel const &ff, @@ -1027,16 +993,6 @@ void IncMultiHeadSelfAttention::peft_bwd_task( assert(task->index_point.get_dim() == 1); - std::string op_name_without_uid = IncMultiHeadSelfAttention::get_op_name_without_uid(m); - std::cout << "BWD " << op_name_without_uid << std::endl; - - if (op_name_without_uid == "layers_11_attention") { - load_tensor_from_file( - output_grad.get_float_ptr(), - (output_grad.domain.get_volume()/128)*24, - "/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0.flexflow" - ); - } IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 6bcb6d42ea..3a45ce5da3 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -642,8 +642,6 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->hidden_size); } if (*m->apply_rotary_embedding) { - printf("ROTARY EMBEDDING: num_tokens: %i, q_array_size: %i, m->hidden_size: %i\n", - num_tokens, q_array_size, m->hidden_size); /*q&k*/ parallelism = num_tokens * m->hidden_size; apply_rotary_embedding_hf<<op_name); - size_t last_underscore = op_name_without_uid.length() - 1; - for (int i = op_name_without_uid.length() - 1; i > 0; i--) { - if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { - break; - } else if (m->op_name[i] == '_') { - last_underscore = i; - } - } - op_name_without_uid.erase(last_underscore); - - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_bwd-step_" + std::to_string(m->bwd_step) + - "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + - "_layer-name_" + op_name_without_uid + "_shard-id_" + - std::to_string(shard_id); - - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -1018,12 +996,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - // save result to file for checking - std::string filename = base_filepath + "_o_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_, filename.c_str()); - } } // Step 2: compute gradients w.r.t. 
value { @@ -1075,15 +1047,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save result to file for checking - if (m->inference_debugging) { - std::string filename = base_filepath + "_v_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); - std::string filename2 = base_filepath + "_qk_prods_softmax"; - std::cout << "FILENAME: " << filename2 << std::endl; - save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); - } } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { @@ -1132,14 +1095,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename4 = base_filepath + "_qk_prods_softmax_grad"; - std::cout << "FILENAME: " << filename4 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename4.c_str()); - std::string filename5 = base_filepath + "_vcache"; - std::cout << "FILENAME: " << filename5 << std::endl; - save_tensor(B, m->vProjSize * m->num_q_heads * num_tokens, filename5.c_str()); - } } // Step 4: softmax backpropagation { @@ -1166,14 +1121,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, &beta, m->qk_tensor, m->qk_prods)); - - if (m->inference_debugging) { - DT *C = static_cast
(m->qk_prods); - std::string filename6 = base_filepath + "_qk_prods_softmax_grad_in"; - std::cout << "FILENAME: " << filename6 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename6.c_str()); - } - // TODO: fill all elements above diagonal to force causal attention size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; if (entries_above_diagonal > 0) { @@ -1189,12 +1136,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, DT(0.0f)); } - if (m->inference_debugging) { - DT *C = static_cast
(m->qk_prods); - std::string filename7 = base_filepath + "_qk_prods_softmax_grad_in_masked"; - std::cout << "FILENAME: " << filename7 << std::endl; - save_tensor(C, num_tokens * num_tokens * m->num_q_heads, filename7.c_str()); - } } // Step 5: compute gradients w.r.t. key { @@ -1249,14 +1190,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename8 = base_filepath + "_query_activation"; - std::cout << "FILENAME: " << filename8 << std::endl; - save_tensor(B, m->qProjSize * m->num_q_heads *num_tokens, filename8.c_str()); - std::string filename9 = base_filepath + "_devkproj_pre"; - std::cout << "FILENAME: " << filename9 << std::endl; - save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); - } } // Step 6: compute gradients w.r.t query { @@ -1276,7 +1209,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // after transposition & striding int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; - int k_ = num_tokens; + int k_ = num_tokens; // before transposition and striding int lda = num_tokens; // num_new_tokens int ldb = m->qProjSize * m->num_q_heads; @@ -1307,19 +1240,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename3 = base_filepath + "_devQKVPRojArray_pre"; - std::cout << "FILENAME: " << filename3 << std::endl; - save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); - } } - // Compute rotary embeddings bwd { if (*m->apply_rotary_embedding) { assert(m->hidden_size == m->qProjSize * m->num_q_heads); assert(m->qProjSize == m->kProjSize); - printf("ROTARY EMBEDDING bwd: num_tokens: %i, m->hidden_size: %i\n", num_tokens, m->hidden_size); /*q&k*/ int parallelism = num_tokens * m->hidden_size; DT *A = static_cast
<DT *>(m->devQKVProjArray); @@ -1332,21 +1258,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->qProjSize, num_tokens, m->hidden_size); - DT *C = static_cast<DT *>
(m->devQKVProjArray); - if (m->inference_debugging) { - std::string filename3 = base_filepath + "_devQKVPRojArray"; - std::cout << "FILENAME: " << filename3 << std::endl; - save_tensor(C, num_tokens * m->qProjSize * m->num_q_heads * 3, filename3.c_str()); - } - } - - // matrix C: gradients for key (saved as part of m->devQKVProjArray) - // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast
(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients - if (m->inference_debugging) { - std::string filename9 = base_filepath + "_devkproj"; - std::cout << "FILENAME: " << filename9 << std::endl; - save_tensor(C, num_tokens * (m->qProjSize * m->num_q_heads), filename9.c_str()); } } // Step 7: compute gradients w.r.t. input @@ -1390,11 +1301,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename12 = base_filepath + "_attn_final_grad_in"; - std::cout << "FILENAME: " << filename12 << std::endl; - save_tensor(C, num_tokens * m->qSize, filename12.c_str()); - } } } } diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 1624c0458d..271a291b09 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -290,11 +290,10 @@ __global__ void sparse_categorical_crossentropy_loss_peft_backward( int num_tokens, int num_classes) { CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { - input_grad[i] = 0.5; - // input_grad[i] = output_grad[i]; - // if (i % num_classes == token_ids[i / num_classes]) { - // input_grad[i] -= 1.0f; - // } + input_grad[i] = output_grad[i]; + if (i % num_classes == token_ids[i / num_classes]) { + input_grad[i] -= 1.0f; + } } } @@ -346,14 +345,14 @@ void peft_bwd_kernel(SoftmaxMeta const *m, num_bwd_tokens, num_classes); // scale - // scale_kernel<<>>(input_grad_ptr + - // tokens_previous_requests * num_classes, - // num_bwd_tokens * num_classes, - // DT(0.0), - // scale_factor); + scale_kernel<<>>(input_grad_ptr + + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); tokens_previous_requests += num_bwd_tokens; } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 595b8d24e9..15789ae2e9 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -621,8 +621,6 @@ void Linear::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - std::string op_name_without_uid = Linear::get_op_name_without_uid(m); - printf("INF %s\n", op_name_without_uid.c_str()); if (bc->num_tokens == 0) { return; } @@ -759,9 +757,6 @@ void Linear::peft_bwd_task(Task const *task, int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; - std::string op_name_without_uid = Linear::get_op_name_without_uid(m); - std::cout << "BWD " << op_name_without_uid << std::endl; - int num_infr_tokens = bc->num_active_infr_tokens(); int num_peft_tokens = bc->num_active_peft_tokens(); if (m->inference_debugging) { diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index fb13dc99cb..e39b444af4 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -449,8 +449,6 @@ void LoraLinear::inference_task(Task const *task, Context ctx, Runtime *runtime) { LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); - std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m); - std::cout << "INF " << op_name_without_uid << std::endl; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_tokens() == 0) { return; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 7697613ae0..d3cf278b35 100644 --- a/src/ops/residual_layer_norm.cc +++ 
b/src/ops/residual_layer_norm.cc @@ -823,9 +823,6 @@ void ResidualLayerNorm::peft_bwd_task( ctx, runtime); } - std::string op_name_without_uid = ResidualLayerNorm::get_op_name_without_uid(m); - std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl; - ResidualLayerNorm::peft_bwd_kernel_wrapper( m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); @@ -954,10 +951,7 @@ void ResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - std::string op_name_without_uid = ResidualLayerNorm::get_op_name_without_uid(m); - std::cout << "INF " << op_name_without_uid << std::endl; if (bc->num_tokens == 0) { - printf("Zero tokens\n"); return; } diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 2164616b88..fe3f695522 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -254,7 +254,6 @@ void ResidualLayerNorm::inference_kernel_wrapper( MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->input_activation = allocator->allocate_instance_untyped( data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); - printf("Allocating input_activation (%p) of size: %i*%i*%i=%i for %s...\n", m->input_activation, data_type_size(m->input_type[0]), num_peft_tokens,in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, m->op_name); // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( @@ -734,16 +733,6 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); - printf("Attempting to access %p\n", m->input_activation); - check_device_vs_host_ptr(static_cast(m->input_activation)); - check_device_vs_host_ptr(static_cast(m->mean_ptr)); - check_device_vs_host_ptr(static_cast(m->rstd_ptr)); - check_device_vs_host_ptr(static_cast(gamma_ptr)); - check_device_vs_host_ptr(static_cast(input_grad_ptr)); - check_device_vs_host_ptr(static_cast(residual1_grad_ptr)); - - return; - layer_norm_grad_input_kernel<<>>( output_grad_ptr, static_cast(m->input_activation), diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index ff72b2273a..9591aedf45 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -673,36 +673,6 @@ Legion::FutureMap return runtime->execute_index_space(ctx, launcher); } -template -void load_tensor_from_file(DT *ptr, size_t size, std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - std::cout << "Could not open file: " << filepath << std::endl; - } - assert(in.good() && "incorrect weight file path"); - std::vector
-  size_t loaded_data_size = sizeof(DT) * size;
-  in.seekg(0, in.end);
-  in.seekg(0, in.beg);
-  in.read((char *)host_array.data(), loaded_data_size);
-
-  size_t in_get_size = in.gcount();
-  if (in_get_size != loaded_data_size) {
-    std::cout << "load weight data error " << in_get_size << ", "
-              << loaded_data_size << ", " << sizeof(DT) << std::endl;
-    assert(false);
-  }
-  assert(size == host_array.size());
-
-  copy_tensor_host_to_dev(ptr, host_array.data(), size);
-
-  // // normal
-  // long data_index = 0;
-  // for (auto v : host_array) {
-  //   ptr[data_index++] = v;
-  // }
-  in.close();
-}
 
 /*
   regions[0](I): RMS output_grad
@@ -742,45 +712,6 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task,
   peft_bwd_kernel_wrapper(
       m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight);
-  // get name
-  std::string op_name_without_uid = ResidualRMSNorm::get_op_name_without_uid(m);
-  std::cout << "BWD " << op_name_without_uid << " reset_in_grad[0]: " << m->reset_input_grads[0] << " reset_in_grad[1]: " << m->reset_input_grads[1] << std::endl;
-  // print shape
-  int numdims = residual_input0_grad.domain.get_dim();
-  std::cout << "in grad dims: ";
-  for (int i=0; i
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index 700162ade2..932b8ade84 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
@@ -373,16 +373,12 @@ void Softmax::inference_task(Task const *task,
   assert(regions.size() == 3);
   assert(task->regions.size() == 3);
   BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
-  SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args);
-
-  std::string op_name_without_uid = Softmax::get_op_name_without_uid(m);
-  std::cout << "INF " << op_name_without_uid << std::endl;
   if (bc->num_tokens == 0) {
     return;
   }
   Domain in_domain = runtime->get_index_space_domain(
       ctx, task->regions[0].region.get_index_space());
-
+  SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args);
   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
       m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
@@ -442,7 +438,6 @@ void Softmax::peft_bwd_task(Task const *task,
                             std::vector<PhysicalRegion> const &regions,
                             Context ctx,
                             Runtime *runtime) {
-  printf("BWD softmax\n");
   assert(task->regions.size() == regions.size());
   assert(regions.size() == 2);
   assert(task->regions.size() == 2);
diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py
index 818e0b9085..7836633b30 100644
--- a/tests/peft/hf_finetune.py
+++ b/tests/peft/hf_finetune.py
@@ -72,8 +72,6 @@ def peft_backward_hook(module, grad_input, grad_output):
             print("\t", go.shape)
             print(f"\t\tSaving to {dst_filepath}")
             torch.save(go, dst_filepath)
-            if dst_filepath == "./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0":
-                go.detach().cpu().numpy().tofile(f"{dst_filepath}.flexflow")
         else:
             print(go)
     print("Backward GRAD Input:")
@@ -83,8 +81,6 @@ def peft_backward_hook(module, grad_input, grad_output):
             print("\t", gi.shape)
             print(f"\t\tSaving to {dst_filepath}")
             torch.save(gi, dst_filepath)
-            if dst_filepath == "./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0" or dst_filepath == "./hf_peft_tensors/bwd_step_0_norm.gi_0":
-                gi.detach().cpu().numpy().tofile(f"{dst_filepath}.flexflow")
         else:
             print(gi)
 
@@ -229,8 +225,6 @@ def main():
             torch.save(params, f"./hf_peft_tensors/{name}")
         if "lm_head" in name or "norm" in name:
            torch.save(params, f"./hf_peft_tensors/{name}")
-        if "down_proj" in name or "self_attn" in name:
-            torch.save(params, f"./hf_peft_tensors/{name}")
 
     # Load fine-tuning dataset
     data = load_dataset("Abirate/english_quotes")

From 0d530b00ef4557360d1dc68fd0a8720a698fb884 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Wed, 10 Jan 2024 03:36:06 +0000
Subject: [PATCH 11/11] linting

---
 src/ops/add_bias_residual_layer_norm.cu | 14 ++++++--
 src/ops/fused.cc                        | 12 ++++---
 src/ops/fused.cu                        | 43 ++++++++++++++-----------
 src/ops/inc_multihead_self_attention.cc |  2 --
 src/ops/residual_layer_norm.cc          | 18 +++++------
 src/ops/residual_layer_norm.cu          | 16 +++++++--
 src/ops/residual_rms_norm.cc            |  3 +-
 src/ops/softmax.cc                      | 11 ++++---
 8 files changed, 74 insertions(+), 45 deletions(-)

diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu
index 08e3bb3edf..ab017ed46c 100644
--- a/src/ops/add_bias_residual_layer_norm.cu
+++ b/src/ops/add_bias_residual_layer_norm.cu
@@ -618,7 +618,17 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY,
   alignas(sizeof(double)) extern __shared__ char s_data1[];
   T *buf = reinterpret_cast<T *>(&s_data1);
-  compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual, reset_input_grad, reset_residual_grad, N, buf);
+  compute_gI(dY,
+             X,
+             mean,
+             rstd,
+             gamma,
+             dX,
+             dX_residual,
+             reset_input_grad,
+             reset_residual_grad,
+             N,
+             buf);
 }
 
 /*static*/
@@ -774,7 +784,7 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel(
     cudaStream_t stream) {
   const int64_t M = m->effective_batch_size;
   const int64_t N = m->effective_num_elements;
-  
+
   int const warp_size = C10_WARP_SIZE;
   int const num_threads = 128;
   const dim3 blocks(M);
diff --git a/src/ops/fused.cc b/src/ops/fused.cc
index e18486289f..8afd61aece 100644
--- a/src/ops/fused.cc
+++ b/src/ops/fused.cc
@@ -530,15 +530,17 @@ FutureMap FusedOp::inference(FFModel const &ff,
   }
   offset += numOutputs;
   // add softmax output grad
-  if (operators[numOperators-1]->op_type == OP_SOFTMAX) {
-    printf("operator %i is last SOFTMAX! adding output %i\n", numOperators-1, numOutputs-1);
-    assert(outputs[numOutputs-1]->region != LogicalRegion::NO_REGION);
+  if (operators[numOperators - 1]->op_type == OP_SOFTMAX) {
+    printf("operator %i is last SOFTMAX! adding output %i\n",
+           numOperators - 1,
+           numOutputs - 1);
+    assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION);
     launcher.add_region_requirement(
-        RegionRequirement(batch_outputs[numOutputs-1]->part_grad,
+        RegionRequirement(batch_outputs[numOutputs - 1]->part_grad,
                           0 /*projection id*/,
                           WRITE_ONLY,
                           EXCLUSIVE,
-                          batch_outputs[numOutputs-1]->region_grad));
+                          batch_outputs[numOutputs - 1]->region_grad));
     launcher.add_field(offset, FID_DATA);
   }
   return runtime->execute_index_space(ctx, launcher);
 }
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 17586e925f..f6bed71f6a 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -95,9 +95,11 @@ __host__ void
   assert(metas->numOperators == fused->numOperators);
   assert(regions.size() == task->regions.size());
-  bool softmax_grad_additional_region = (fused->op_op_type[fused->numOperators-1] == OP_SOFTMAX);
-  assert((int)regions.size() ==
-         fused->numInputs + fused->numWeights + fused->numOutputs + softmax_grad_additional_region);
+  bool softmax_grad_additional_region =
+      (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX);
+  assert((int)regions.size() == fused->numInputs + fused->numWeights +
+                                    fused->numOutputs +
+                                    softmax_grad_additional_region);
   // Domain input_domain[MAX_NUM_INPUTS];
   // Domain weight_domain[MAX_NUM_WEIGHTS];
   // Domain output_domain[MAX_NUM_OUTPUTS];
@@ -627,18 +629,22 @@ __host__ void
       assert(fused->op_num_outputs[op] == 1);
       assert(my_input_accessor[0].domain.get_volume() ==
              my_output_accessor[0].domain.get_volume());
-      if (op == fused->numOperators -1) { // if this is the final operator
-        output_accessor[fused->numOutputs] =
-            helperGetGenericTensorAccessorWO(fused->output_data_types[fused->numOutputs-1],
-                                             regions[roff],
-                                             task->regions[roff],
-                                             FID_DATA,
-                                             ctx,
-                                             runtime);
+      if (op == fused->numOperators - 1) { // if this is the final operator
+        output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO(
+            fused->output_data_types[fused->numOutputs - 1],
+            regions[roff],
+            task->regions[roff],
+            FID_DATA,
+            ctx,
+            runtime);
       }
       SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op];
       Kernels::Softmax::inference_kernel_wrapper(
-          m, bc, my_input_accessor[0], my_output_accessor[0], output_accessor[fused->numOutputs]);
+          m,
+          bc,
+          my_input_accessor[0],
+          my_output_accessor[0],
+          output_accessor[fused->numOutputs]);
       break;
     }
     case OP_ALLREDUCE: {
@@ -1123,12 +1129,13 @@ __host__ void FusedOp::peft_bwd_task(Task const *task,
       if (m->elementwise_affine) {
         gamma = my_weight_accessor[1];
       }
-      
-      AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper(m,
-                                                        my_output_grad_accessor[1],
-                                                        my_input_grad_accessor[0],
-                                                        my_input_grad_accessor[1],
-                                                        gamma);
+
+      AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper(
+          m,
+          my_output_grad_accessor[1],
+          my_input_grad_accessor[0],
+          my_input_grad_accessor[1],
+          gamma);
       break;
     }
     case OP_SIGMOID_SILU_MULTI: {
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index f590fa0440..5d52034575 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -876,7 +876,6 @@ void IncMultiHeadSelfAttention::inference_task(
   }
 }
 
-
 FutureMap IncMultiHeadSelfAttention::peft_bwd(
     FFModel const &ff,
     BatchConfigFuture const &bc,
@@ -993,7 +992,6 @@ void IncMultiHeadSelfAttention::peft_bwd_task(
 
   assert(task->index_point.get_dim() == 1);
 
-
   IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper(
       m,
       bc,
diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc
index d3cf278b35..8563c299ab 100644
--- a/src/ops/residual_layer_norm.cc
+++ b/src/ops/residual_layer_norm.cc
@@ -133,15 +133,15 @@ void FFModel::residual_layer_norm(const Tensor input,
                        : residual2;
   }
   Layer *ln = new Layer(this,
-                         OP_RESIDUAL_LAYERNORM,
-                         data_type,
-                         name,
-                         2 + use_two_residuals /*inputs*/,
-                         num_weights,
-                         2 /*outputs*/,
-                         casted_input,
-                         casted_residual1,
-                         casted_residual2);
+                        OP_RESIDUAL_LAYERNORM,
+                        data_type,
+                        name,
+                        2 + use_two_residuals /*inputs*/,
+                        num_weights,
+                        2 /*outputs*/,
+                        casted_input,
+                        casted_residual1,
+                        casted_residual2);
   ln->outputs[0] = create_tensor_legion_ordering(
       input->num_dims, input->dims, data_type, ln, 0, true /*create_grad*/);
   ln->outputs[1] = create_tensor_legion_ordering(
diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu
index fe3f695522..1f87949234 100644
--- a/src/ops/residual_layer_norm.cu
+++ b/src/ops/residual_layer_norm.cu
@@ -570,7 +570,19 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY,
     int const N) {
   alignas(sizeof(double)) extern __shared__ char s_data1[];
   T *buf = reinterpret_cast<T *>(&s_data1);
-  compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, reset_input_grad, reset_residual_grad1, reset_residual_grad2, N, buf);
+  compute_gI(dY,
+             X,
+             mean,
+             rstd,
+             gamma,
+             dX,
+             dX_residual1,
+             dX_residual2,
+             reset_input_grad,
+             reset_residual_grad1,
+             reset_residual_grad2,
+             N,
+             buf);
 }
 
 /*static*/
@@ -727,7 +739,7 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m,
     cudaStream_t stream) {
   const int64_t M = m->effective_batch_size;
   const int64_t N = m->effective_num_elements;
-  
+
   int const warp_size = C10_WARP_SIZE;
   int const num_threads = 128;
   const dim3 blocks(M);
diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc
index 9591aedf45..c2fbe11544 100644
--- a/src/ops/residual_rms_norm.cc
+++ b/src/ops/residual_rms_norm.cc
@@ -673,7 +673,6 @@ Legion::FutureMap
   return runtime->execute_index_space(ctx, launcher);
 }
 
-
 /*
   regions[0](I): RMS output_grad
   regions[1](I/O): Residual input 0 grad
@@ -711,7 +710,7 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task,
       m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime);
   peft_bwd_kernel_wrapper(
       m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight);
-  
+
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index 932b8ade84..1d062b552b 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
@@ -356,11 +356,12 @@ FutureMap Softmax::inference(FFModel const &ff,
                                                     batch_outputs[0]->region));
   launcher.add_field(1, FID_DATA);
   // we add the region below in order to copy the output to the grad tensor
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[0]->region_grad));
+  launcher.add_region_requirement(
+      RegionRequirement(batch_outputs[0]->part_grad,
+                        0 /*projection id*/,
+                        WRITE_ONLY,
+                        EXCLUSIVE,
+                        batch_outputs[0]->region_grad));
   launcher.add_field(2, FID_DATA);
   return runtime->execute_index_space(ctx, launcher);
 }