
Commit

Code cleanup
zhihao committed Sep 20, 2024
1 parent 281a8bf commit 1bc1c1e
Showing 8 changed files with 10 additions and 113 deletions.
4 changes: 0 additions & 4 deletions inference/models/llama.cc
@@ -91,7 +91,6 @@ void LLAMA::create_llama_model(FFModel &ff,
token = token_att_norm[0];
att_norm = token_att_norm[1];
}
att_norm->print("att_norm");
Tensor qkv_proj = ff.dense(
att_norm,
llama_config.hidden_size *
@@ -107,7 +106,6 @@ void LLAMA::create_llama_model(FFModel &ff,
0.0f, // no dropout
std::string("layers." + std::to_string(i) + ".self_attn.qkv_proj")
.c_str());
qkv_proj->print("qkv_proj");

Tensor mha;
switch (mode) {
@@ -189,7 +187,6 @@ void LLAMA::create_llama_model(FFModel &ff,
}

Tensor mha_input = mha;
mha_input->print("mha_input");
mha = ff.dense(
mha_input,
llama_config.hidden_size,
@@ -203,7 +200,6 @@ void LLAMA::create_llama_model(FFModel &ff,
0.0f,
std::string("layers." + std::to_string(i) + ".self_attn.o_proj")
.c_str());
mha->print("mha");

// step 2: SILU activaion
Tensor token_ff_norm[2] = {nullptr, nullptr};
7 changes: 0 additions & 7 deletions src/ops/inc_multihead_self_attention.cc
@@ -600,13 +600,6 @@ OpMeta *IncMultiHeadSelfAttention::init_task(
attn->num_kv_heads / attn->tensor_parallelism_degree +
(attn->num_kv_heads % attn->tensor_parallelism_degree != 0);

if (attn->oProjSize != output.domain.hi()[0] - output.domain.lo()[0] + 1) {
printf("attn o_proj size %d does not match output domain %d\n",
attn->oProjSize,
output.domain.hi()[0] - output.domain.lo()[0] + 1);
}
// assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] +
// 1);

Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
MemoryAllocator gpu_mem_allocator(gpu_mem);
14 changes: 3 additions & 11 deletions src/ops/inc_multihead_self_attention.cpp
@@ -923,9 +923,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
BatchConfig const *bc,
int shard_id,
DT const *qkv_ptr,
DT const *weight_ptr,
DT *output_ptr,
DT const *bias_ptr,
hipStream_t stream) {

if (m->offload && m->biasSize > 0) {
@@ -954,7 +952,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
if (bc->num_tokens > bc->num_generation_tokens) {
// phase 4: Compute attention score for prompt tokens;
compute_attention_kernel_prompt(
m, bc, shard_id, bias_ptr, weight_ptr, stream);
m, bc, shard_id, stream);
}

// compute output production and bias together for all tokens
@@ -1482,12 +1480,11 @@ __global__ void store_query_cache(DT const *devQKVProjArray,
}
}

template <typename DT>
// Please refer to the implementation in .cu file.
// This implementation is outdated
void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m,
BatchConfig const *bc,
int shard_id,
DT const *bias_ptr,
DT const *weight_ptr,
hipStream_t stream) {
checkCUDA(hipblasSetStream(m->handle.blas, stream));
checkCUDNN(miopenSetStream(m->handle.dnn, stream));
@@ -1802,9 +1799,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
bc,
shard_id,
input.get_half_ptr(),
m->offload ? static_cast<half *>(m->weight_ptr) : weight.get_half_ptr(),
output.get_half_ptr(),
bias_ptr,
stream);
} else if (input.data_type == DT_FLOAT) {
if (m->offload) {
@@ -1817,10 +1812,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
bc,
shard_id,
input.get_float_ptr(),
m->offload ? static_cast<float *>(m->weight_ptr)
: weight.get_float_ptr(),
output.get_float_ptr(),
bias_ptr,
stream);
} else {
assert(false && "Unspported data type");
47 changes: 6 additions & 41 deletions src/ops/inc_multihead_self_attention.cu
@@ -739,7 +739,7 @@ void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m,
// this kernel is no longer used by the attention operator because
// there's no more weights
// TODO: check if this is needed by the projection layers?
// It is left in case we want to reuse this part in the future
template <typename DT>
void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
GenericTensorAccessorR const weight,
@@ -805,9 +805,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
BatchConfig const *bc,
int shard_id,
DT const *qkv_ptr,
DT const *weight_ptr,
DT *output_ptr,
DT const *bias_ptr,
cudaStream_t stream) {
// phase 0: copy calculated qkv into devQKVProjArray
@@ -825,11 +823,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
compute_qkv_kernel(m,
bc,
shard_id,
// input_ptr,
// weight_ptr,
// nullptr, // does not use weight
static_cast<DT *>(m->devQKVProjArray),
// bias_ptr,
stream);
update_kv_cache_kernel<DT>(m, bc, stream);
@@ -842,7 +836,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
if (bc->num_tokens > bc->num_generation_tokens) {
// phase 4: Compute attention score for prompt tokens;
compute_attention_kernel_prompt(
m, bc, shard_id, bias_ptr, weight_ptr, stream);
m, bc, shard_id, static_cast<DT*>(nullptr), static_cast<DT*>(nullptr), stream);
}
// compute output production and bias together for all tokens
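
With this cleanup the attention kernels receive Q, K and V already produced by the separate qkv_proj dense layer (see the llama.cc diff above), so the operator no longer owns projection weights of its own; that is why the weight_ptr and bias_ptr arguments disappear throughout this file. The sketch below is a self-contained CPU illustration of attention over precomputed Q/K/V, with a single head, causal masking and row-major [num_tokens x head_dim] buffers; the names and layout are illustrative, not FlexFlow's.

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Causal scaled-dot-product attention over precomputed Q, K, V.
// q, k, v and out are [num_tokens x head_dim], row-major; out must be
// pre-sized by the caller.
void attend_precomputed_qkv(std::vector<float> const &q,
                            std::vector<float> const &k,
                            std::vector<float> const &v,
                            std::vector<float> &out,
                            int num_tokens,
                            int head_dim) {
  float const scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
  for (int i = 0; i < num_tokens; i++) {
    std::vector<float> score(i + 1);
    float max_s = -std::numeric_limits<float>::infinity();
    for (int j = 0; j <= i; j++) { // token i attends to tokens 0..i
      float s = 0.0f;
      for (int d = 0; d < head_dim; d++) {
        s += q[i * head_dim + d] * k[j * head_dim + d];
      }
      score[j] = s * scale;
      max_s = std::max(max_s, score[j]);
    }
    float denom = 0.0f;
    for (int j = 0; j <= i; j++) { // softmax over the causal window
      score[j] = std::exp(score[j] - max_s);
      denom += score[j];
    }
    for (int d = 0; d < head_dim; d++) { // weighted sum of V rows
      float acc = 0.0f;
      for (int j = 0; j <= i; j++) {
        acc += (score[j] / denom) * v[j * head_dim + d];
      }
      out[i * head_dim + d] = acc;
    }
  }
}

The real kernels additionally stage Q/K/V into devQKVProjArray, maintain the KV cache and batch over heads, but none of those steps needs a weight tensor, which is the point of the deletions above.
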
@@ -1355,14 +1349,12 @@ void peft_bwd_kernel(
int n_ = num_tokens;
int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize);
// TODO: checkout if the input grad ptr has some relation with
// m->devQKVProjArray so we may potentially skip this transpose and copy
// TODO: check if this transposeAdd can correctly implement gradient
// accumulation
// The original version uses existing result and attention's projection to
// do further calculation in a way different than the usual dense layer,
// they are off by a transpose. So an explicit transpose is needed here.
// The add here is just for gradient accumulation.
transposeAdd(C, B, n_, k_, alpha, beta, stream);
// printf("backward of raw attn grad: %d, %d, with redudant dimension
// %d\n", k_, n_, m_);
if (m->inference_debugging) {
std::string filename =
get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0";
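
The transposeAdd call above does two things that the rewritten comment describes: the raw attention gradient and the expected input-gradient layout are off by a transpose, and the beta-scaled add lets the result accumulate into an existing gradient buffer. A plain-C++ sketch of the assumed semantics follows; the actual kernel's argument order, matrix layout and scaling conventions may differ.

#include <vector>

// Assumed reference semantics (illustrative only): C is k x n and B is n x k,
// both row-major; C = alpha * transpose(B) + beta * C, so beta = 1 accumulates
// into an existing gradient and beta = 0 overwrites it.
void transpose_add_ref(std::vector<float> &C, std::vector<float> const &B,
                       int n, int k, float alpha, float beta) {
  for (int i = 0; i < k; i++) {
    for (int j = 0; j < n; j++) {
      C[i * n + j] = alpha * B[j * k + i] + beta * C[i * n + j];
    }
  }
}
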
@@ -1712,14 +1704,10 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
BatchConfig const *bc,
int shard_id,
GenericTensorAccessorR const &input,
// GenericTensorAccessorR const &weight,
GenericTensorAccessorW const &output
// GenericTensorAccessorR const &bias
) {
// printf("inf_k_warpper start\n");
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
// bool use_bias = *m->qkv_bias || *m->final_bias;
cudaEvent_t t_start, t_end;
if (m->profiling) {
@@ -1728,31 +1716,23 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
cudaEventRecord(t_start, stream);
}
// assert(input.data_type == weight.data_type);
assert(input.data_type == output.data_type);
// if (use_bias) {
// assert(input.data_type == bias.data_type);
// }
if (input.data_type == DT_HALF) {
Kernels::IncMultiHeadAttention::inference_kernel(
m,
bc,
shard_id,
input.get_half_ptr(),
static_cast<half const *>(nullptr), // weight_ptr is no longer used
output.get_half_ptr(),
static_cast<half const *>(nullptr), // bias_ptr is no longer used
stream);
} else if (input.data_type == DT_FLOAT) {
Kernels::IncMultiHeadAttention::inference_kernel(
m,
bc,
shard_id,
input.get_float_ptr(),
static_cast<float const *>(nullptr), // weight_ptr is no longer used
output.get_float_ptr(),
static_cast<float const *>(nullptr), // bias_ptr is no longer used
stream);
} else {
assert(false && "Unspported data type");
@@ -1775,9 +1755,7 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper(
BatchConfig const *bc,
int shard_id,
GenericTensorAccessorW const &input_grad,
// GenericTensorAccessorR const &weight,
GenericTensorAccessorR const &output_grad) {
// GenericTensorAccessorR const &bias) {
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
bool use_bias = *m->qkv_bias || *m->final_bias;
@@ -1789,41 +1767,28 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper(
cudaEventRecord(t_start, stream);
}
// assert(input.data_type == weight.data_type);
assert(input_grad.data_type == output_grad.data_type);
// if (use_bias) {
// assert(input_grad.data_type == bias.data_type);
// }
if (input_grad.data_type == DT_HALF) {
assert(!m->offload);
// half const *bias_ptr =
// use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
Kernels::IncMultiHeadAttention::peft_bwd_kernel(
m,
bc,
shard_id,
input_grad.get_half_ptr(),
// weight.get_half_ptr(),
static_cast<half const *>(nullptr),
output_grad.get_half_ptr(),
// bias_ptr,
static_cast<half const *>(nullptr),
stream);
} else if (input_grad.data_type == DT_FLOAT) {
assert(!m->offload);
// float const *bias_ptr =
// use_bias ? bias.get_float_ptr() : static_cast<float const
// *>(nullptr);
Kernels::IncMultiHeadAttention::peft_bwd_kernel(
m,
bc,
shard_id,
input_grad.get_float_ptr(),
// weight.get_float_ptr(),
static_cast<float const *>(nullptr),
output_grad.get_float_ptr(),
// bias_ptr,
static_cast<float const *>(nullptr),
stream);
} else {
8 changes: 0 additions & 8 deletions src/ops/linear.cc
@@ -779,14 +779,6 @@ void Linear::peft_bwd_task(Task const *task,
if (m->inference_debugging) {
assert(task->index_point.get_dim() == 1);
int shard_id = task->index_point.point_data[0];
printf("%s: in_dim = %d, out_dim = %d, num_infr_tokens = %d, "
"num_peft_tokens = %d, volume = %d\n",
m->op_name,
in_dim,
out_dim,
num_infr_tokens,
num_peft_tokens,
input_grad.domain.get_volume());
Linear::save_inference_tensors_to_file(
m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false);
}
10 changes: 1 addition & 9 deletions src/ops/spec_inc_multihead_self_attention.cu
@@ -463,8 +463,6 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
BeamSearchBatchConfig const *bc,
int shard_id,
DT *output_ptr,
DT const *bias_ptr,
DT const *weight_ptr,
cudaStream_t stream) {
checkCUDA(cublasSetStream(m->handle.blas, stream));
checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
@@ -699,9 +697,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
BeamSearchBatchConfig const *bc,
int shard_id,
DT const *qkv_ptr,
DT const *weight_ptr,
DT *output_ptr,
DT const *bias_ptr,
cudaStream_t stream) {
// phase 0: copy calculated qkv into devQKVProjArray
Expand Down Expand Up @@ -736,7 +732,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
// 3 kernels for pahse 3: matmul1 - softmax - matmal2
if (bc->num_tokens > bc->num_generation_tokens) {
compute_attention_kernel_prompt(
m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream);
m, bc, shard_id, output_ptr, stream);
}
// compute output production and bias together for all tokens
int num_tokens = bc->num_active_tokens();
@@ -780,19 +776,15 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
bc,
shard_id,
input.get_half_ptr(),
static_cast<half const *>(nullptr),
output.get_half_ptr(),
static_cast<half const *>(nullptr),
stream);
} else if (input.data_type == DT_FLOAT) {
Kernels::SpecIncMultiHeadSelfAttention::inference_kernel(
m,
bc,
shard_id,
input.get_float_ptr(),
static_cast<float const *>(nullptr),
output.get_float_ptr(),
static_cast<float const *>(nullptr),
stream);
} else {
assert(false && "Unspported data type");
31 changes: 0 additions & 31 deletions src/runtime/file_loader.cc
@@ -918,17 +918,10 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
for (int i = 0; i < weight->num_dims; i++) {
dims_vec.push_back(weight->dims[i]);
volume *= weight->dims[i];
// std::cout<<l->name<<" dim "<<i<<": "<<weight->dims[i]<<std::endl;
}
assert(data_type_size(weight->data_type) == sizeof(DT));
DT *data = (DT *)malloc(sizeof(DT) * volume);

// printf("loading weight for %s, shapes: ", l->name);
// for(int i = 0; i < weight->num_dims; i++) {
// printf("%d ", weight->dims[i]);
// }
// printf("\n");

std::string weight_filename = removeGuidOperatorName(std::string(l->name));
bool is_attn_proj = false, is_o_proj = false;

@@ -961,29 +954,6 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION ||
l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION ||
l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) {
// if (weight_idx == 0) {
// load_attention_weights_v2(data,
// num_heads,
// num_kv_heads,
// hidden_dim,
// qkv_inner_dim,
// weight_filename,
// weights_folder,
// volume,
// tensor_parallelism_degree);
// } else {
// long long value;
// l->get_int_property("final_bias", value);
// bool final_bias = (bool)value;
// load_attention_bias_v2(data,
// num_heads,
// num_kv_heads,
// hidden_dim,
// qkv_inner_dim,
// final_bias,
// weight_filename,
// weights_folder);
// }
} else if (is_attn_proj) {
if (is_o_proj) {
if (weight_idx == 0) {
@@ -1053,7 +1023,6 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
}

// Copy the weight data from the buffer to the weight's ParallelTensor
printf("using default load for %s\n", l->name);
ParallelTensor weight_pt;
ff->get_parallel_tensor_from_tensor(weight, weight_pt);
weight_pt->set_tensor<DT>(ff, dims_vec, data);
2 changes: 0 additions & 2 deletions src/runtime/request_manager.cc
@@ -2756,7 +2756,6 @@ void RequestManager::start_background_server(FFModel *model) {
// Register callbacks for termination
{
std::set_terminate([]() {
// assert(false && "terminate");
RequestManager::terminate_background_server_at_exit();
std::abort();
});
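
The terminate handler kept above runs the request manager's cleanup before the process aborts on an unhandled exception or any other std::terminate condition. A standalone illustration of the same std::set_terminate pattern, with stand-in function names rather than FlexFlow's:

#include <cstdlib>
#include <exception>
#include <iostream>
#include <stdexcept>

// Stand-in for RequestManager::terminate_background_server_at_exit().
void shutdown_background_server() {
  std::cout << "terminating background server" << std::endl;
}

int main() {
  std::set_terminate([]() {
    shutdown_background_server();
    std::abort(); // a terminate handler must not return
  });
  throw std::runtime_error("unhandled"); // invokes the installed handler
}
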
@@ -3013,7 +3012,6 @@ void RequestManager::trigger_request_completion_future(
/*static*/
void RequestManager::terminate_background_server_at_exit() {
RequestManager *rm = RequestManager::get_request_manager();
// assert(false && "RM terminating bg server due to exit");
rm->terminate_background_server();
}

