diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index a0d01092bf..85279860cf 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -112,9 +112,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
                                        BeamSearchBatchConfig const *bc,
                                        int shard_id,
                                        GenericTensorAccessorR const &input,
-                                       GenericTensorAccessorR const &weight,
-                                       GenericTensorAccessorW const &output,
-                                       GenericTensorAccessorR const &bias);
+                                       GenericTensorAccessorW const &output);
   Params get_params() const;

 public:
diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 168ad5f618..b4eb339201 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -114,10 +114,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
                                        TreeVerifyBatchConfig const *bc,
                                        int shard_id,
                                        GenericTensorAccessorR const &input,
-                                       GenericTensorAccessorR const &weight,
-                                       GenericTensorAccessorW const &output,
-                                       GenericTensorAccessorR const &bias);
-
+                                       GenericTensorAccessorW const &output);
   Params get_params() const;

 public:
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index de57cf59b5..4053eabca4 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -449,6 +449,7 @@ __host__ void
       assert(fused->op_num_outputs[op] == 1);
       IncMultiHeadSelfAttentionMeta *m =
           (IncMultiHeadSelfAttentionMeta *)metas->meta[op];
+      // TODO: why is op_num_weights still non-zero?
       assert(fused->op_num_weights[op] ==
              (1 + (int)(*m->qkv_bias || *m->final_bias)));
       GenericTensorAccessorR biases;
@@ -461,9 +462,7 @@ __host__ void
           bc,
           task->index_point.point_data[0],
           my_input_accessor[0],
-          // my_weight_accessor[0],
           my_output_accessor[0]
-          // biases
           );
       break;
     }
@@ -486,9 +485,7 @@ __host__ void
           &tree_bc,
           task->index_point.point_data[0],
           my_input_accessor[0],
-          my_weight_accessor[0],
-          my_output_accessor[0],
-          biases);
+          my_output_accessor[0]);
       break;
     }
     case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
@@ -512,9 +509,7 @@ __host__ void
           &beam_bc,
           task->index_point.point_data[0],
           my_input_accessor[0],
-          my_weight_accessor[0],
-          my_output_accessor[0],
-          biases);
+          my_output_accessor[0]);
       break;
     }
     case OP_LAYERNORM: {
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 9c6ed0e0b6..30dcce3e7b 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -121,7 +121,7 @@ Tensor
                          data_type,
                          name,
                          1 /*inputs*/,
-                         weight_num /*weights*/,
+                         0 /*weights*/,
                          1 /*outputs*/,
                          casted_input);
   } else {
@@ -130,7 +130,7 @@ Tensor
                          data_type,
                          name,
                          1 /*inputs*/,
-                         weight_num /*weights*/,
+                         0 /*weights*/,
                          1 /*outputs*/,
                          input);
   }
@@ -154,30 +154,30 @@ Tensor
   int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize);
   int weight_size = qParas * num_q_heads + kParas * num_q_heads +
                     vParas * num_q_heads + oParas * num_q_heads;
-  {
-    int dims[1] = {weight_size};
-    li->weights[0] = create_weight_legion_ordering(1,
-                                                   dims,
-                                                   data_type,
-                                                   li,
-                                                   true /*create_grad*/,
-                                                   kernel_initializer,
-                                                   CHOSEN_SYNC_TYPE);
-  }
-  if (qkv_bias || final_bias) {
-    // q, k, v, o
-    int qkv_bias_size =
-        qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-    int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
-                   (final_bias ? oProjSize : 0)};
-    li->weights[1] = create_weight_legion_ordering(1,
-                                                   dims,
-                                                   data_type,
-                                                   li,
-                                                   true /*create_grad*/,
-                                                   kernel_initializer,
-                                                   CHOSEN_SYNC_TYPE);
-  }
+  // {
+  //   int dims[1] = {weight_size};
+  //   li->weights[0] = create_weight_legion_ordering(1,
+  //                                                  dims,
+  //                                                  data_type,
+  //                                                  li,
+  //                                                  true /*create_grad*/,
+  //                                                  kernel_initializer,
+  //                                                  CHOSEN_SYNC_TYPE);
+  // }
+  // if (qkv_bias || final_bias) {
+  //   // q, k, v, o
+  //   int qkv_bias_size =
+  //       qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+  //   int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
+  //                  (final_bias ? oProjSize : 0)};
+  //   li->weights[1] = create_weight_legion_ordering(1,
+  //                                                  dims,
+  //                                                  data_type,
+  //                                                  li,
+  //                                                  true /*create_grad*/,
+  //                                                  kernel_initializer,
+  //                                                  CHOSEN_SYNC_TYPE);
+  // }
   li->data_type = data_type;
   li->add_int_property("embed_dim", embed_dim);
   li->add_int_property("num_q_heads", num_q_heads);
@@ -280,7 +280,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
         _input->data_type,
         name,
         1 /*inputs*/,
-        (_qkv_bias || _final_bias ? 2 : 1) /*weights*/,
+        0 /*weights*/,
         1 /*outputs*/,
         _input),
       num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout),
@@ -323,28 +323,28 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     dims[1].is_replica_dim = false;
     int seed = std::rand();
     Initializer *initializer = new GlorotUniform(seed);
-    weights[0] = model.create_parallel_weight<2>(dims,
-                                                 this->data_type,
-                                                 NULL /*owner_op*/,
-                                                 true /*create_grad*/,
-                                                 initializer,
-                                                 CHOSEN_SYNC_TYPE);
-    if (qkv_bias || final_bias) {
-      ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-      bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
-      bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
-      weights[1] =
-          model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
-                                                       bias_shape.dims,
-                                                       this->data_type,
-                                                       nullptr /*owner_op*/,
-                                                       true /*create_grad*/,
-                                                       initializer,
-                                                       CHOSEN_SYNC_TYPE);
-    }
+    // weights[0] = model.create_parallel_weight<2>(dims,
+    //                                              this->data_type,
+    //                                              NULL /*owner_op*/,
+    //                                              true /*create_grad*/,
+    //                                              initializer,
+    //                                              CHOSEN_SYNC_TYPE);
+    // if (qkv_bias || final_bias) {
+    //   ParallelTensorShape bias_shape = _input->get_shape();
+    //   int qkv_bias_size =
+    //       qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+    //   bias_shape.dims[0].size =
+    //       (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
+    //   bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
+    //   weights[1] =
+    //       model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
+    //                                                    bias_shape.dims,
+    //                                                    this->data_type,
+    //                                                    nullptr /*owner_op*/,
+    //                                                    true /*create_grad*/,
+    //                                                    initializer,
+    //                                                    CHOSEN_SYNC_TYPE);
+    // }
   }

   outputs[0] = model.create_parallel_tensor_legion_ordering(
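For orientation, the parameter accounting that the now-commented-out allocations relied on works out as follows. This is a hedged, self-contained sketch: qParas/kParas/vParas are assumed to be projSize * inputSize per head (the hunk only shows oParas explicitly), and the concrete dimensions are illustrative, not from the patch.

```cpp
#include <iostream>

int main() {
  // Illustrative configuration: 1024-dim embedding, 16 heads, 64-dim projections.
  int num_q_heads = 16;
  int qSize = 1024, kSize = 1024, vSize = 1024;
  int qProjSize = 64, kProjSize = 64, vProjSize = 64, oProjSize = 1024;

  // Assumed per-head parameter counts; oParas mirrors the diff exactly.
  int qParas = qProjSize * qSize;
  int kParas = kProjSize * kSize;
  int vParas = vProjSize * vSize;
  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize);

  int weight_size = qParas * num_q_heads + kParas * num_q_heads +
                    vParas * num_q_heads + oParas * num_q_heads;
  // q, k, v bias terms, as in the commented-out block above.
  int qkv_bias_size =
      qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;

  std::cout << "weight_size = " << weight_size
            << ", qkv_bias_size = " << qkv_bias_size << '\n';
  return 0;
}
```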
@@ -382,7 +382,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
         _input->data_type,
         name,
         1 /*inputs*/,
-        (_qkv_bias || _final_bias ? 2 : 1) /*weights*/,
+        0 /*weights*/,
         1 /*outputs*/,
         _input,
         _weight),
@@ -426,28 +426,28 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     // dims[2].size = qParas + kParas + vParas + oParas;
     int seed = std::rand();
     Initializer *initializer = new GlorotUniform(seed);
-    weights[0] = model.create_parallel_weight<2>(dims,
-                                                 this->data_type,
-                                                 NULL /*owner_op*/,
-                                                 true /*create_grad*/,
-                                                 initializer,
-                                                 CHOSEN_SYNC_TYPE);
-    if (qkv_bias || final_bias) {
-      ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-      bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
-      bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
-      weights[1] =
-          model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
-                                                       bias_shape.dims,
-                                                       this->data_type,
-                                                       nullptr /*owner_op*/,
-                                                       true /*create_grad*/,
-                                                       initializer,
-                                                       CHOSEN_SYNC_TYPE);
-    }
+    // weights[0] = model.create_parallel_weight<2>(dims,
+    //                                              this->data_type,
+    //                                              NULL /*owner_op*/,
+    //                                              true /*create_grad*/,
+    //                                              initializer,
+    //                                              CHOSEN_SYNC_TYPE);
+    // if (qkv_bias || final_bias) {
+    //   ParallelTensorShape bias_shape = _input->get_shape();
+    //   int qkv_bias_size =
+    //       qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+    //   bias_shape.dims[0].size =
+    //       (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
+    //   bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
+    //   weights[1] =
+    //       model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
+    //                                                    bias_shape.dims,
+    //                                                    this->data_type,
+    //                                                    nullptr /*owner_op*/,
+    //                                                    true /*create_grad*/,
+    //                                                    initializer,
+    //                                                    CHOSEN_SYNC_TYPE);
+    // }
   }

   outputs[0] = model.create_parallel_tensor_legion_ordering(
@@ -541,18 +541,12 @@ void SpecIncMultiHeadSelfAttention::init_inference(
                                                     EXCLUSIVE,
                                                     batch_inputs[0]->region));
   launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    weights[0]->region));
-  launcher.add_field(1, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     batch_outputs[0]->region));
-  launcher.add_field(2, FID_DATA);
+  launcher.add_field(1, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
@@ -580,18 +574,12 @@ void SpecIncMultiHeadSelfAttention::init(FFModel const &ff) {
                                                     EXCLUSIVE,
                                                     inputs[0]->region));
   launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    weights[0]->region));
-  launcher.add_field(1, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     outputs[0]->region));
-  launcher.add_field(2, FID_DATA);
+  launcher.add_field(1, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap(ff, fm);
@@ -618,17 +606,10 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task(
                                        FID_DATA,
                                        ctx,
                                        runtime);
-  GenericTensorAccessorR weight =
-      helperGetGenericTensorAccessorRO(attn->weights[0]->data_type,
-                                       regions[1],
-                                       task->regions[1],
-                                       FID_DATA,
-                                       ctx,
-                                       runtime);
   GenericTensorAccessorW output =
       helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type,
-                                       regions[2],
-                                       task->regions[2],
+                                       regions[1],
+                                       task->regions[1],
                                        FID_DATA,
                                        ctx,
                                        runtime);
@@ -649,7 +630,7 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task(
   SpecIncMultiHeadSelfAttentionMeta *m =
       new SpecIncMultiHeadSelfAttentionMeta(handle,
                                             attn,
-                                            weight,
+                                            GenericTensorAccessorR(),
                                             gpu_mem_allocator,
                                             num_samples,
                                             num_q_heads,
@@ -661,8 +642,6 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task(
   m->inference_debugging = attn->inference_debugging;
   std::strcpy(m->op_name, attn->name);
   m->layer_guid = attn->layer_guid;
-  assert(weight.domain.get_volume() * data_type_size(weight.data_type) ==
-         m->weightSize);
   return m;
 }
@@ -700,12 +679,6 @@ FutureMap SpecIncMultiHeadSelfAttention::inference(
                                                     EXCLUSIVE,
                                                     batch_inputs[0]->region));
   launcher.add_field(idx++, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    weights[0]->region));
-  launcher.add_field(idx++, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
@@ -713,21 +686,12 @@ FutureMap SpecIncMultiHeadSelfAttention::inference(
                                                     batch_outputs[0]->region));
   launcher.add_field(idx++, FID_DATA);

-  if (qkv_bias || final_bias) {
-    launcher.add_region_requirement(RegionRequirement(weights[1]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      weights[1]->region));
-    launcher.add_field(idx++, FID_DATA);
-  }
   return runtime->execute_index_space(ctx, launcher);
 }

 /*
   regions[0](I): input
-  regions[3](I): weight
-  regions[4](O): output
+  regions[1](O): output
 */
 void SpecIncMultiHeadSelfAttention::inference_task(
     Task const *task,
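All of the launcher and task edits above follow one pattern: with the weight region gone, the output moves from region index 2 down to 1, and the optional bias region disappears entirely. A tiny self-contained sketch of that index shift (illustrative enums, not FlexFlow API):

```cpp
// Old layout: regions[0]=input, regions[1]=weight, regions[2]=output,
// optional regions[3]=bias. New layout: regions[0]=input, regions[1]=output.
enum OldRegion { OLD_INPUT = 0, OLD_WEIGHT = 1, OLD_OUTPUT = 2, OLD_BIAS = 3 };
enum NewRegion { NEW_INPUT = 0, NEW_OUTPUT = 1 };

static_assert(NEW_INPUT == OLD_INPUT, "input region index is unchanged");
static_assert(NEW_OUTPUT == OLD_OUTPUT - 1,
              "output shifts down once the weight region is dropped");
```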
@@ -744,51 +708,30 @@ void SpecIncMultiHeadSelfAttention::inference_task(

   SpecIncMultiHeadSelfAttentionMeta *m =
       *((SpecIncMultiHeadSelfAttentionMeta **)task->local_args);
-  assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4
-                                           : regions.size() == 3));
+  assert(regions.size() == 2);

   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
       m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO(
-      m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-      m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime);
+      m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
   GenericTensorAccessorR biases;
-  if (*m->qkv_bias || *m->final_bias) {
-    biases = helperGetGenericTensorAccessorRO(m->weight_type[1],
-                                              regions[3],
-                                              task->regions[3],
-                                              FID_DATA,
-                                              ctx,
-                                              runtime);
-    Domain bias_domain = runtime->get_index_space_domain(
-        ctx, task->regions[3].region.get_index_space());
-    assert(bias_domain.get_dim() == 4);
-  }
+
   Domain input_domain = runtime->get_index_space_domain(
       ctx, task->regions[0].region.get_index_space());
-  Domain weight_domain = runtime->get_index_space_domain(
-      ctx, task->regions[1].region.get_index_space());
   Domain output_domain = runtime->get_index_space_domain(
-      ctx, task->regions[2].region.get_index_space());
+      ctx, task->regions[1].region.get_index_space());
   assert(input_domain.get_dim() == 4);
-  assert(weight_domain.get_dim() == 2);
   assert(output_domain.get_dim() == 4);

   assert(task->index_point.get_dim() == 1);
   SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
-      m, &bc, task->index_point.point_data[0], input, weight, output, biases);
+      m, &bc, task->index_point.point_data[0], input, output);

   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
-    std::vector<GenericTensorAccessorR> weights_accessors;
-    weights_accessors.push_back(weight);
-    if (*m->qkv_bias || *m->final_bias) {
-      weights_accessors.push_back(biases);
-    }
     SpecIncMultiHeadSelfAttention::save_inference_tensors_to_file(
-        m, shard_id, &bc, {input}, weights_accessors, {output});
+        m, shard_id, &bc, {input}, {}, {output});
   }
 }
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 1674e1aa26..6144b9bd4c 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -715,14 +715,14 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                          stream);
   // phase 1: Implement kernel to compute KQV for input tokens
   // TODO WARNING: this is commented out only because we are fixing the inc_attn first
-  // compute_qkv_kernel(m,
-  //                    bc,
-  //                    shard_id,
-  //                    // input_ptr,
-  //                    weight_ptr,
-  //                    static_cast<DT *>(m->devQKVProjArray),
-  //                    bias_ptr,
-  //                    stream);
+  compute_qkv_kernel(m,
+                     bc,
+                     shard_id,
+                     // input_ptr,
+                     // weight_ptr,
+                     static_cast<DT *>(m->devQKVProjArray),
+                     // bias_ptr,
+                     stream);
   // phase 2: Update key/val cache
   update_kv_cache_kernel<DT>(m, bc, stream);
   if (bc->num_generation_tokens > 0) {
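The static_cast<DT *>(m->devQKVProjArray) at the re-enabled call site suggests the QKV scratch buffer is stored untyped on the meta struct and given a typed view per template instantiation. A hedged CUDA sketch of that pattern (illustrative names, not the actual FlexFlow kernel):

```cuda
#include <cuda_runtime.h>

// Assumed shape of the meta struct member: an untyped device buffer.
struct MetaSketch {
  void *devQKVProjArray;
};

template <typename DT>
__global__ void touch_qkv(DT *qkv, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    qkv[i] = static_cast<DT>(0); // placeholder for the real QKV computation
  }
}

template <typename DT>
void compute_qkv_sketch(MetaSketch const *m, int n, cudaStream_t stream) {
  // Each DT-specialized instantiation casts the shared void* buffer once.
  DT *qkv = static_cast<DT *>(m->devQKVProjArray);
  touch_qkv<DT><<<(n + 255) / 256, 256, 0, stream>>>(qkv, n);
}
```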
@@ -756,9 +756,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
     BeamSearchBatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
-    GenericTensorAccessorR const &weight,
-    GenericTensorAccessorW const &output,
-    GenericTensorAccessorR const &bias) {
+    GenericTensorAccessorW const &output) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   bool use_bias = *m->qkv_bias || *m->final_bias;
@@ -770,35 +768,28 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
     cudaEventRecord(t_start, stream);
   }

-  assert(input.data_type == weight.data_type);
   assert(input.data_type == output.data_type);
-  if (use_bias) {
-    assert(input.data_type == bias.data_type);
-  }

   if (input.data_type == DT_HALF) {
-    half const *bias_ptr =
-        use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
+    half const *bias_ptr = static_cast<half const *>(nullptr);
     Kernels::SpecIncMultiHeadSelfAttention::inference_kernel(
         m,
         bc,
         shard_id,
         input.get_half_ptr(),
-        weight.get_half_ptr(),
+        static_cast<half const *>(nullptr),
         output.get_half_ptr(),
-        bias_ptr,
+        static_cast<half const *>(nullptr),
         stream);
   } else if (input.data_type == DT_FLOAT) {
-    float const *bias_ptr =
-        use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
     Kernels::SpecIncMultiHeadSelfAttention::inference_kernel(
         m,
         bc,
         shard_id,
         input.get_float_ptr(),
-        weight.get_float_ptr(),
+        static_cast<float const *>(nullptr),
         output.get_float_ptr(),
-        bias_ptr,
+        static_cast<float const *>(nullptr),
         stream);
   } else {
     assert(false && "Unspported data type");
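A note on the static_cast<half const *>(nullptr) arguments above: with a kernel templated on DT, a bare nullptr makes template argument deduction fail for that parameter (std::nullptr_t does not match DT const *), so the wrapper has to pass a typed null. Minimal self-contained illustration:

```cpp
template <typename DT>
void kernel_sketch(DT const *input_ptr, DT const *weight_ptr) {}

int main() {
  float const *input = nullptr;
  // kernel_sketch(input, nullptr);  // error: cannot deduce DT from std::nullptr_t
  kernel_sketch(input, static_cast<float const *>(nullptr)); // OK: DT = float
  return 0;
}
```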
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index 661a9199a6..287c9fc46a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -125,7 +125,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
                          data_type,
                          name,
                          1 /*inputs*/,
-                         weight_num /*weights*/,
+                         0 /*weights*/,
                          1 /*outputs*/,
                          casted_input);
   } else {
@@ -134,7 +134,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
                          data_type,
                          name,
                          1 /*inputs*/,
-                         weight_num /*weights*/,
+                         0 /*weights*/,
                          1 /*outputs*/,
                          input);
   }
@@ -159,37 +159,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
   int one_head_size = qParas + kParas + vParas + oParas;
   int weight_size = qParas * num_q_heads + kParas * num_q_heads +
                     vParas * num_q_heads + oParas * num_q_heads;
-  {
-    // compress the weight size if quantization.
-    if (quantization_type != DT_NONE) {
-      one_head_size = get_quantization_to_byte_size(
-          data_type, quantization_type, one_head_size);
-    }
-
-    int dims[1] = {weight_size};
-    li->weights[0] = create_weight_legion_ordering(
-        1,
-        dims,
-        quantization_type == DT_NONE ? data_type : quantization_type,
-        li,
-        true /*create_grad*/,
-        kernel_initializer,
-        CHOSEN_SYNC_TYPE);
-  }
-  if (qkv_bias || final_bias) {
-    // q, k, v, o
-    int qkv_bias_size =
-        qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-    int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
-                   (final_bias ? oProjSize : 0)};
-    li->weights[1] = create_weight_legion_ordering(1,
-                                                   dims,
-                                                   data_type,
-                                                   li,
-                                                   true /*create_grad*/,
-                                                   kernel_initializer,
-                                                   CHOSEN_SYNC_TYPE);
-  }
+
   li->data_type = data_type;
   li->add_int_property("embed_dim", embed_dim);
   li->add_int_property("num_q_heads", num_q_heads);
@@ -305,7 +275,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
         _input->data_type,
         name,
         1 /*inputs*/,
-        (_qkv_bias || _final_bias ? 2 : 1) /*weights*/,
+        0 /*weights*/,
         1 /*outputs*/,
         _input),
       num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout),
@@ -330,7 +300,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     dims[i] = _input->dims[i];
   }
   dims[0].size = _embed_dim;
-  // Currently require no parallelism along this dim
+  // Parallelism along this dim is no longer restricted
   // assert(dims[0].degree == 1);
   if (allocate_weights) {
     // Create weight tensor
@@ -357,29 +327,6 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     // dims[2].parallel_idx = -1;
     int seed = std::rand();
     Initializer *initializer = new GlorotUniform(seed);
-    weights[0] = model.create_parallel_weight<2>(
-        dims,
-        quantization_type == DT_NONE ? this->data_type : quantization_type,
-        NULL /*owner_op*/,
-        true /*create_grad*/,
-        initializer,
-        CHOSEN_SYNC_TYPE);
-    if (qkv_bias || final_bias) {
-      ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-      bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
-      bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
-      weights[1] =
-          model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
-                                                       bias_shape.dims,
-                                                       this->data_type,
-                                                       nullptr /*owner_op*/,
-                                                       true /*create_grad*/,
-                                                       initializer,
-                                                       CHOSEN_SYNC_TYPE);
-    }
   }

   outputs[0] = model.create_parallel_tensor_legion_ordering(
@@ -420,7 +367,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
         _input->data_type,
         name,
         1 /*inputs*/,
-        (_qkv_bias || _final_bias ? 2 : 1) /*weights*/,
+        0 /*weights*/,
         1 /*outputs*/,
         _input,
         _weight),
@@ -445,7 +392,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     dims[i] = _input->dims[i];
   }
   dims[0].size = _embed_dim;
-  // Currently require no parallelism along this dim
+  // Currently require no parallelism along this dim -- is this consistent with the assert removed in the other constructor?
   assert(dims[0].degree == 1);
   if (allocate_weights) {
     // Create weight tensor
@@ -470,29 +417,6 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     }
     int seed = std::rand();
     Initializer *initializer = new GlorotUniform(seed);
-    weights[0] = model.create_parallel_weight<2>(
-        dims,
-        quantization_type == DT_NONE ? this->data_type : quantization_type,
-        NULL /*owner_op*/,
-        true /*create_grad*/,
-        initializer,
-        CHOSEN_SYNC_TYPE);
-    if (qkv_bias || final_bias) {
-      ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-      bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
-      bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
-      weights[1] =
-          model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
-                                                       bias_shape.dims,
-                                                       this->data_type,
-                                                       nullptr /*owner_op*/,
-                                                       true /*create_grad*/,
-                                                       initializer,
-                                                       CHOSEN_SYNC_TYPE);
-    }
   }

   outputs[0] = model.create_parallel_tensor_legion_ordering(
@@ -592,20 +516,12 @@ void TreeIncMultiHeadSelfAttention::init_inference(
                                                     EXCLUSIVE,
                                                     batch_inputs[0]->region));
   launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(
-      RegionRequirement(weights[0]->part,
-                        0 /*projection id*/,
-                        READ_ONLY,
-                        EXCLUSIVE,
-                        weights[0]->region,
-                        ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0));
-  launcher.add_field(1, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     batch_outputs[0]->region));
-  launcher.add_field(2, FID_DATA);
+  launcher.add_field(1, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
@@ -633,18 +549,12 @@ void TreeIncMultiHeadSelfAttention::init(FFModel const &ff) {
                                                     EXCLUSIVE,
                                                     inputs[0]->region));
   launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    weights[0]->region));
-  launcher.add_field(1, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     outputs[0]->region));
-  launcher.add_field(2, FID_DATA);
+  launcher.add_field(1, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap(ff, fm);
@@ -671,17 +581,10 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
                                        FID_DATA,
                                        ctx,
                                        runtime);
-  GenericTensorAccessorR weight =
-      helperGetGenericTensorAccessorRO(attn->weights[0]->data_type,
-                                       regions[1],
-                                       task->regions[1],
-                                       FID_DATA,
-                                       ctx,
-                                       runtime);
   GenericTensorAccessorW output =
       helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type,
-                                       regions[2],
-                                       task->regions[2],
+                                       regions[1],
+                                       task->regions[1],
                                        FID_DATA,
                                        ctx,
                                        runtime);
@@ -694,8 +597,10 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
   int num_kv_heads =
       attn->num_kv_heads / attn->tensor_parallelism_degree +
       (attn->num_kv_heads % attn->tensor_parallelism_degree != 0);
-
-  assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);
+  if (attn->oProjSize != output.domain.hi()[0] - output.domain.lo()[0] + 1) {
+    std::cout << "attn->oProjSize: " << attn->oProjSize << " does not match output domain dim[0]: " << output.domain.hi()[0] - output.domain.lo()[0] + 1 << std::endl;
+  }
+  assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);

   Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
                        .only_kind(Memory::GPU_FB_MEM)
@@ -711,7 +616,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
   TreeIncMultiHeadSelfAttentionMeta *m =
       new TreeIncMultiHeadSelfAttentionMeta(handle,
                                             attn,
-                                            weight,
+                                            GenericTensorAccessorR(),
                                             gpu_mem_allocator,
                                             num_samples,
                                             num_q_heads,
@@ -726,10 +631,6 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
   std::strcpy(m->op_name, attn->name);
   m->layer_guid = attn->layer_guid;

-  if (attn->quantization_type == DT_NONE) {
-    assert(weight.domain.get_volume() * data_type_size(weight.data_type) ==
-           m->weightSize);
-  }
   return m;
 }
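The init_task change above prints the mismatching values before asserting, which makes shape failures far easier to debug than a bare assert. A self-contained sketch of the same print-then-assert pattern (the two ints stand in for attn->oProjSize and the output domain extent):

```cpp
#include <cassert>
#include <cstdio>

void check_output_extent(int o_proj_size, int output_dim0) {
  if (o_proj_size != output_dim0) {
    std::fprintf(stderr,
                 "attn->oProjSize: %d does not match output domain dim[0]: %d\n",
                 o_proj_size, output_dim0);
  }
  assert(o_proj_size == output_dim0);
}
```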
@@ -767,37 +668,18 @@ FutureMap TreeIncMultiHeadSelfAttention::inference(
                                                     EXCLUSIVE,
                                                     batch_inputs[0]->region));
   launcher.add_field(idx++, FID_DATA);
-  launcher.add_region_requirement(
-      RegionRequirement(weights[0]->part,
-                        0 /*projection id*/,
-                        READ_ONLY,
-                        EXCLUSIVE,
-                        weights[0]->region,
-                        ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0));
-  launcher.add_field(idx++, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     batch_outputs[0]->region));
   launcher.add_field(idx++, FID_DATA);
-  if (qkv_bias || final_bias) {
-    launcher.add_region_requirement(
-        RegionRequirement(weights[1]->part,
-                          0 /*projection id*/,
-                          READ_ONLY,
-                          EXCLUSIVE,
-                          weights[1]->region,
-                          ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0));
-    launcher.add_field(idx++, FID_DATA);
-  }
   return runtime->execute_index_space(ctx, launcher);
 }

 /*
   regions[0](I): input
-  regions[3](I): weight
-  regions[4](O): output
+  regions[1](O): output
 */
 void TreeIncMultiHeadSelfAttention::inference_task(
     Task const *task,
@@ -818,37 +700,19 @@ void TreeIncMultiHeadSelfAttention::inference_task(

   TreeIncMultiHeadSelfAttentionMeta *m =
       *((TreeIncMultiHeadSelfAttentionMeta **)task->local_args);
-  assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4
-                                           : regions.size() == 3));
+  assert(regions.size() == 2);

   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
       m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO(
-      m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-      m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime);
-  GenericTensorAccessorR biases;
-  if (*m->qkv_bias || *m->final_bias) {
-    biases = helperGetGenericTensorAccessorRO(m->weight_type[1],
-                                              regions[3],
-                                              task->regions[3],
-                                              FID_DATA,
-                                              ctx,
-                                              runtime);
-    Domain bias_domain = runtime->get_index_space_domain(
-        ctx, task->regions[3].region.get_index_space());
-    assert(bias_domain.get_dim() == 4);
-  }
+      m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);

   Domain input_domain = runtime->get_index_space_domain(
       ctx, task->regions[0].region.get_index_space());
-  Domain weight_domain = runtime->get_index_space_domain(
-      ctx, task->regions[1].region.get_index_space());
   Domain output_domain = runtime->get_index_space_domain(
-      ctx, task->regions[2].region.get_index_space());
+      ctx, task->regions[1].region.get_index_space());
   assert(input_domain.get_dim() == 4);
-  assert(weight_domain.get_dim() == 2);
   assert(output_domain.get_dim() == 4);

   /* print_tensor<float>(input.get_float_ptr(),
@@ -858,18 +722,13 @@ void TreeIncMultiHeadSelfAttention::inference_task(
   assert(task->index_point.get_dim() == 1);

   TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
-      m, &bc, task->index_point.point_data[0], input, weight, output, biases);
+      m, &bc, task->index_point.point_data[0], input, output);

   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
-    std::vector<GenericTensorAccessorR> weights_accessors;
-    weights_accessors.push_back(weight);
-    if (*m->qkv_bias || *m->final_bias) {
-      weights_accessors.push_back(biases);
-    }
     TreeIncMultiHeadSelfAttention::save_inference_tensors_to_file(
-        m, shard_id, &bc, {input}, weights_accessors, {output});
+        m, shard_id, &bc, {input}, {}, {output});
   }
 }
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 7bdd520df4..c42256d59f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -927,14 +927,14 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,

   // phase 1: Implement kernel to compute KQV for input tokens
   // TODO WARNING: this is commented out only because we are fixing the inc_attn first
-  // compute_qkv_kernel(m,
-  //                    bc,
-  //                    shard_id,
-  //                    // input_ptr,
-  //                    weight_ptr,
-  //                    static_cast<DT *>(m->devQKVProjArray),
-  //                    bias_ptr,
-  //                    stream);
+  compute_qkv_kernel(m,
+                     bc,
+                     shard_id,
+                     // input_ptr,
+                     // weight_ptr,
+                     static_cast<DT *>(m->devQKVProjArray),
+                     // bias_ptr,
+                     stream);

   // phase 2: No need to update key/val cache
   // IncMultiHeadSelfAttention::update_kv_cache_kernel(
@@ -970,9 +970,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
     TreeVerifyBatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
-    GenericTensorAccessorR const &weight,
-    GenericTensorAccessorW const &output,
-    GenericTensorAccessorR const &bias) {
+    GenericTensorAccessorW const &output) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   bool use_bias = *m->qkv_bias || *m->final_bias;
@@ -986,41 +984,26 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(

   // assert(input.data_type == weight.data_type);
   assert(input.data_type == output.data_type);
-  if (use_bias) {
-    assert(input.data_type == bias.data_type);
-  }

   if (input.data_type == DT_HALF) {
-    if (m->offload) {
-      pre_build_weight_kernel<half>(m, weight, input.data_type, stream);
-    }
-
-    half const *bias_ptr =
-        use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
     Kernels::TreeIncMultiHeadAttention::inference_kernel(
         m,
         bc,
         shard_id,
         input.get_half_ptr(),
-        m->offload ? static_cast<half *>(m->weight_ptr) : weight.get_half_ptr(),
+        (half *)nullptr,
         output.get_half_ptr(),
-        bias_ptr,
+        (half *)nullptr,
         stream);
   } else if (input.data_type == DT_FLOAT) {
-    if (m->offload) {
-      pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
-    }
-    float const *bias_ptr =
-        use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
     Kernels::TreeIncMultiHeadAttention::inference_kernel(
         m,
         bc,
         shard_id,
         input.get_float_ptr(),
-        m->offload ? static_cast<float *>(m->weight_ptr)
-                   : weight.get_float_ptr(),
+        (float *)nullptr,
         output.get_float_ptr(),
-        bias_ptr,
+        (float *)nullptr,
         stream);
   } else {
     assert(false && "Unspported data type");
diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index e47d873cb5..0cb12e3b0e 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -347,18 +347,12 @@ void load_attention_weights_to_dense_v2(DT *ptr,
         }
       }
     }
-    // for (int i = 0; i < one_weight_file_size; i++) {
-    //   ptr[base_index + i] = host_array.at(data_index++);
-    // }
     std::cout << "host array going out of scope, releasing" << std::endl;
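The log line kept above documents why the deleted copy loop is unnecessary: host_array releases its storage automatically when it leaves scope. A minimal self-contained illustration of that RAII behavior (the vector is a stand-in for the loaded weight buffer):

```cpp
#include <iostream>
#include <vector>

int main() {
  {
    std::vector<float> host_array(1 << 20); // stand-in for the loaded weights
    // ... copy host_array into its destination buffer here ...
  } // host_array's heap storage is freed here; no manual release is needed
  std::cout << "host array went out of scope, storage released" << std::endl;
  return 0;
}
```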