diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index a0d01092bf..85279860cf 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -112,9 +112,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
                                        BeamSearchBatchConfig const *bc,
                                        int shard_id,
                                        GenericTensorAccessorR const &input,
-                                       GenericTensorAccessorR const &weight,
-                                       GenericTensorAccessorW const &output,
-                                       GenericTensorAccessorR const &bias);
+                                       GenericTensorAccessorW const &output);
   Params get_params() const;

 public:
diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 168ad5f618..b4eb339201 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -114,10 +114,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
                                        TreeVerifyBatchConfig const *bc,
                                        int shard_id,
                                        GenericTensorAccessorR const &input,
-                                       GenericTensorAccessorR const &weight,
-                                       GenericTensorAccessorW const &output,
-                                       GenericTensorAccessorR const &bias);
-
+                                       GenericTensorAccessorW const &output);
   Params get_params() const;

 public:
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index de57cf59b5..4053eabca4 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -449,6 +449,7 @@ __host__ void
       assert(fused->op_num_outputs[op] == 1);
       IncMultiHeadSelfAttentionMeta *m =
           (IncMultiHeadSelfAttentionMeta *)metas->meta[op];
+      // TODO: why is op_num_weights still non-zero?
       assert(fused->op_num_weights[op] ==
              (1 + (int)(*m->qkv_bias || *m->final_bias)));
       GenericTensorAccessorR biases;
@@ -461,9 +462,7 @@ __host__ void
           bc,
           task->index_point.point_data[0],
           my_input_accessor[0],
-          // my_weight_accessor[0],
           my_output_accessor[0]
-          // biases
           );
       break;
     }
@@ -486,9 +485,7 @@ __host__ void
           &tree_bc,
           task->index_point.point_data[0],
           my_input_accessor[0],
-          my_weight_accessor[0],
-          my_output_accessor[0],
-          biases);
+          my_output_accessor[0]);
       break;
     }
     case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
@@ -512,9 +509,7 @@ __host__ void
           &beam_bc,
           task->index_point.point_data[0],
           my_input_accessor[0],
-          my_weight_accessor[0],
-          my_output_accessor[0],
-          biases);
+          my_output_accessor[0]);
       break;
     }
     case OP_LAYERNORM: {
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 9c6ed0e0b6..30dcce3e7b 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -121,7 +121,7 @@ Tensor
                          data_type,
                          name,
                          1 /*inputs*/,
-                         weight_num /*weights*/,
+                         0 /*weights*/,
                          1 /*outputs*/,
                          casted_input);
   } else {
@@ -130,7 +130,7 @@ Tensor
                          data_type,
                          name,
                          1 /*inputs*/,
-                         weight_num /*weights*/,
+                         0 /*weights*/,
                          1 /*outputs*/,
                          input);
   }
@@ -154,30 +154,30 @@ Tensor
   int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize);
   int weight_size = qParas * num_q_heads + kParas * num_q_heads +
                     vParas * num_q_heads + oParas * num_q_heads;
-  {
-    int dims[1] = {weight_size};
-    li->weights[0] = create_weight_legion_ordering(1,
-                                                   dims,
-                                                   data_type,
-                                                   li,
-                                                   true /*create_grad*/,
-                                                   kernel_initializer,
-                                                   CHOSEN_SYNC_TYPE);
-  }
-  if (qkv_bias || final_bias) {
-    // q, k, v, o
-    int qkv_bias_size =
-        qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-    int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
-                   (final_bias ? oProjSize : 0)};
-    li->weights[1] = create_weight_legion_ordering(1,
-                                                   dims,
-                                                   data_type,
-                                                   li,
-                                                   true /*create_grad*/,
-                                                   kernel_initializer,
-                                                   CHOSEN_SYNC_TYPE);
-  }
+  // {
+  //   int dims[1] = {weight_size};
+  //   li->weights[0] = create_weight_legion_ordering(1,
+  //                                                  dims,
+  //                                                  data_type,
+  //                                                  li,
+  //                                                  true /*create_grad*/,
+  //                                                  kernel_initializer,
+  //                                                  CHOSEN_SYNC_TYPE);
+  // }
+  // if (qkv_bias || final_bias) {
+  //   // q, k, v, o
+  //   int qkv_bias_size =
+  //       qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+  //   int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
+  //                  (final_bias ? oProjSize : 0)};
+  //   li->weights[1] = create_weight_legion_ordering(1,
+  //                                                  dims,
+  //                                                  data_type,
+  //                                                  li,
+  //                                                  true /*create_grad*/,
+  //                                                  kernel_initializer,
+  //                                                  CHOSEN_SYNC_TYPE);
+  // }
   li->data_type = data_type;
   li->add_int_property("embed_dim", embed_dim);
   li->add_int_property("num_q_heads", num_q_heads);
@@ -280,7 +280,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
         _input->data_type,
         name,
         1 /*inputs*/,
-        (_qkv_bias || _final_bias ? 2 : 1) /*weights*/,
+        0 /*weights*/,
         1 /*outputs*/,
         _input),
       num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout),
@@ -323,28 +323,28 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     dims[1].is_replica_dim = false;
     int seed = std::rand();
     Initializer *initializer = new GlorotUniform(seed);
-    weights[0] = model.create_parallel_weight<2>(dims,
-                                                 this->data_type,
-                                                 NULL /*owner_op*/,
-                                                 true /*create_grad*/,
-                                                 initializer,
-                                                 CHOSEN_SYNC_TYPE);
-    if (qkv_bias || final_bias) {
-      ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-      bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
-      bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
-      weights[1] =
-          model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
-                                                       bias_shape.dims,
-                                                       this->data_type,
-                                                       nullptr /*owner_op*/,
-                                                       true /*create_grad*/,
-                                                       initializer,
-                                                       CHOSEN_SYNC_TYPE);
-    }
+    // weights[0] = model.create_parallel_weight<2>(dims,
+    //                                              this->data_type,
+    //                                              NULL /*owner_op*/,
+    //                                              true /*create_grad*/,
+    //                                              initializer,
+    //                                              CHOSEN_SYNC_TYPE);
+    // if (qkv_bias || final_bias) {
+    //   ParallelTensorShape bias_shape = _input->get_shape();
+    //   int qkv_bias_size =
+    //       qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+    //   bias_shape.dims[0].size =
+    //       (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
+    //   bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
+    //   weights[1] =
+    //       model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
+    //                                                    bias_shape.dims,
+    //                                                    this->data_type,
+    //                                                    nullptr /*owner_op*/,
+    //                                                    true /*create_grad*/,
+    //                                                    initializer,
+    //                                                    CHOSEN_SYNC_TYPE);
+    // }
   }

   outputs[0] = model.create_parallel_tensor_legion_ordering(
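For orientation, the parameter accounting that the now-commented-out allocations relied on works out as follows. This is a hedged, self-contained sketch: qParas/kParas/vParas are assumed to be projSize * inputSize per head (the hunk only shows oParas explicitly), and the concrete dimensions are illustrative, not from the patch.

```cpp
#include <iostream>

int main() {
  // Illustrative configuration: 1024-dim embedding, 16 heads, 64-dim projections.
  int num_q_heads = 16;
  int qSize = 1024, kSize = 1024, vSize = 1024;
  int qProjSize = 64, kProjSize = 64, vProjSize = 64, oProjSize = 1024;

  // Assumed per-head parameter counts; oParas mirrors the diff exactly.
  int qParas = qProjSize * qSize;
  int kParas = kProjSize * kSize;
  int vParas = vProjSize * vSize;
  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize);

  int weight_size = qParas * num_q_heads + kParas * num_q_heads +
                    vParas * num_q_heads + oParas * num_q_heads;
  // q, k, v bias terms, as in the commented-out block above.
  int qkv_bias_size =
      qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;

  std::cout << "weight_size = " << weight_size
            << ", qkv_bias_size = " << qkv_bias_size << '\n';
  return 0;
}
```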
@@ -382,7 +382,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
         _input->data_type,
         name,
         1 /*inputs*/,
-        (_qkv_bias || _final_bias ? 2 : 1) /*weights*/,
+        0 /*weights*/,
         1 /*outputs*/,
         _input,
         _weight),
@@ -426,28 +426,28 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     // dims[2].size = qParas + kParas + vParas + oParas;
     int seed = std::rand();
     Initializer *initializer = new GlorotUniform(seed);
-    weights[0] = model.create_parallel_weight<2>(dims,
-                                                 this->data_type,
-                                                 NULL /*owner_op*/,
-                                                 true /*create_grad*/,
-                                                 initializer,
-                                                 CHOSEN_SYNC_TYPE);
-    if (qkv_bias || final_bias) {
-      ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-      bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
-      bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
-      weights[1] =
-          model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
-                                                       bias_shape.dims,
-                                                       this->data_type,
-                                                       nullptr /*owner_op*/,
-                                                       true /*create_grad*/,
-                                                       initializer,
-                                                       CHOSEN_SYNC_TYPE);
-    }
+    // weights[0] = model.create_parallel_weight<2>(dims,
+    //                                              this->data_type,
+    //                                              NULL /*owner_op*/,
+    //                                              true /*create_grad*/,
+    //                                              initializer,
+    //                                              CHOSEN_SYNC_TYPE);
+    // if (qkv_bias || final_bias) {
+    //   ParallelTensorShape bias_shape = _input->get_shape();
+    //   int qkv_bias_size =
+    //       qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+    //   bias_shape.dims[0].size =
+    //       (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
+    //   bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
+    //   weights[1] =
+    //       model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
+    //                                                    bias_shape.dims,
+    //                                                    this->data_type,
+    //                                                    nullptr /*owner_op*/,
+    //                                                    true /*create_grad*/,
+    //                                                    initializer,
+    //                                                    CHOSEN_SYNC_TYPE);
+    // }
   }

   outputs[0] = model.create_parallel_tensor_legion_ordering(
@@ -541,18 +541,12 @@ void SpecIncMultiHeadSelfAttention::init_inference(
                                                     EXCLUSIVE,
                                                     batch_inputs[0]->region));
   launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    weights[0]->region));
-  launcher.add_field(1, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     batch_outputs[0]->region));
-  launcher.add_field(2, FID_DATA);
+  launcher.add_field(1, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
@@ -580,18 +574,12 @@ void SpecIncMultiHeadSelfAttention::init(FFModel const &ff) {
                                                     EXCLUSIVE,
                                                     inputs[0]->region));
   launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    weights[0]->region));
-  launcher.add_field(1, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     outputs[0]->region));
-  launcher.add_field(2, FID_DATA);
+  launcher.add_field(1, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap(ff, fm);
@@ -618,17 +606,10 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task(
                                        FID_DATA,
                                        ctx,
                                        runtime);
-  GenericTensorAccessorR weight =
-      helperGetGenericTensorAccessorRO(attn->weights[0]->data_type,
-                                       regions[1],
-                                       task->regions[1],
-                                       FID_DATA,
-                                       ctx,
-                                       runtime);
   GenericTensorAccessorW output =
       helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type,
-                                       regions[2],
-                                       task->regions[2],
+                                       regions[1],
+                                       task->regions[1],
                                        FID_DATA,
                                        ctx,
                                        runtime);
@@ -649,7 +630,7 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task(
   SpecIncMultiHeadSelfAttentionMeta *m =
       new SpecIncMultiHeadSelfAttentionMeta(handle,
                                             attn,
-                                            weight,
+                                            GenericTensorAccessorR(),
                                             gpu_mem_allocator,
                                             num_samples,
                                             num_q_heads,
@@ -661,8 +642,6 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task(
   m->inference_debugging = attn->inference_debugging;
   std::strcpy(m->op_name, attn->name);
   m->layer_guid = attn->layer_guid;
-  assert(weight.domain.get_volume() * data_type_size(weight.data_type) ==
-         m->weightSize);
   return m;
 }
@@ -700,12 +679,6 @@ FutureMap SpecIncMultiHeadSelfAttention::inference(
                                                     EXCLUSIVE,
                                                     batch_inputs[0]->region));
   launcher.add_field(idx++, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    weights[0]->region));
-  launcher.add_field(idx++, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
@@ -713,21 +686,12 @@ FutureMap SpecIncMultiHeadSelfAttention::inference(
                                                     batch_outputs[0]->region));
   launcher.add_field(idx++, FID_DATA);

-  if (qkv_bias || final_bias) {
-    launcher.add_region_requirement(RegionRequirement(weights[1]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      weights[1]->region));
-    launcher.add_field(idx++, FID_DATA);
-  }
   return runtime->execute_index_space(ctx, launcher);
 }

 /*
   regions[0](I): input
-  regions[3](I): weight
-  regions[4](O): output
+  regions[1](O): output
 */
 void SpecIncMultiHeadSelfAttention::inference_task(
     Task const *task,
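All of the launcher and task edits above follow one pattern: with the weight region gone, the output moves from region index 2 down to 1, and the optional bias region disappears entirely. A tiny self-contained sketch of that index shift (illustrative enums, not FlexFlow API):

```cpp
// Old layout: regions[0]=input, regions[1]=weight, regions[2]=output,
// optional regions[3]=bias. New layout: regions[0]=input, regions[1]=output.
enum OldRegion { OLD_INPUT = 0, OLD_WEIGHT = 1, OLD_OUTPUT = 2, OLD_BIAS = 3 };
enum NewRegion { NEW_INPUT = 0, NEW_OUTPUT = 1 };

static_assert(NEW_INPUT == OLD_INPUT, "input region index is unchanged");
static_assert(NEW_OUTPUT == OLD_OUTPUT - 1,
              "output shifts down once the weight region is dropped");
```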
@@ -744,51 +708,30 @@ void SpecIncMultiHeadSelfAttention::inference_task(

   SpecIncMultiHeadSelfAttentionMeta *m =
       *((SpecIncMultiHeadSelfAttentionMeta **)task->local_args);
-  assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4
-                                           : regions.size() == 3));
+  assert(regions.size() == 2);

   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
       m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO(
-      m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-      m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime);
+      m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
   GenericTensorAccessorR biases;
-  if (*m->qkv_bias || *m->final_bias) {
-    biases = helperGetGenericTensorAccessorRO(m->weight_type[1],
-                                              regions[3],
-                                              task->regions[3],
-                                              FID_DATA,
-                                              ctx,
-                                              runtime);
-    Domain bias_domain = runtime->get_index_space_domain(
-        ctx, task->regions[3].region.get_index_space());
-    assert(bias_domain.get_dim() == 4);
-  }
+
   Domain input_domain = runtime->get_index_space_domain(
       ctx, task->regions[0].region.get_index_space());
-  Domain weight_domain = runtime->get_index_space_domain(
-      ctx, task->regions[1].region.get_index_space());
   Domain output_domain = runtime->get_index_space_domain(
-      ctx, task->regions[2].region.get_index_space());
+      ctx, task->regions[1].region.get_index_space());
   assert(input_domain.get_dim() == 4);
-  assert(weight_domain.get_dim() == 2);
   assert(output_domain.get_dim() == 4);

   assert(task->index_point.get_dim() == 1);
   SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
-      m, &bc, task->index_point.point_data[0], input, weight, output, biases);
+      m, &bc, task->index_point.point_data[0], input, output);

   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
-    std::vector<GenericTensorAccessorR> weights_accessors;
-    weights_accessors.push_back(weight);
-    if (*m->qkv_bias || *m->final_bias) {
-      weights_accessors.push_back(biases);
-    }
     SpecIncMultiHeadSelfAttention::save_inference_tensors_to_file(
-        m, shard_id, &bc, {input}, weights_accessors, {output});
+        m, shard_id, &bc, {input}, {}, {output});
   }
 }
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 1674e1aa26..6144b9bd4c 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -715,14 +715,14 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                          stream);
   // phase 1: Implement kernel to compute KQV for input tokens
   // TODO WARNING: this is commented out only because we are fixing the inc_attn first
-  // compute_qkv_kernel(m,
-  //                    bc,
-  //                    shard_id,
-  //                    // input_ptr,
-  //                    weight_ptr,
-  //                    static_cast<DT *>(m->devQKVProjArray),
-  //                    bias_ptr,
-  //                    stream);
+  compute_qkv_kernel(m,
+                     bc,
+                     shard_id,
+                     // input_ptr,
+                     // weight_ptr,
+                     static_cast<DT *>(m->devQKVProjArray),
+                     // bias_ptr,
+                     stream);
   // phase 2: Update key/val cache
   update_kv_cache_kernel<DT>(m, bc, stream);
   if (bc->num_generation_tokens > 0) {
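The static_cast<DT *>(m->devQKVProjArray) at the re-enabled call site suggests the QKV scratch buffer is stored untyped on the meta struct and given a typed view per template instantiation. A hedged CUDA sketch of that pattern (illustrative names, not the actual FlexFlow kernel):

```cuda
#include <cuda_runtime.h>

// Assumed shape of the meta struct member: an untyped device buffer.
struct MetaSketch {
  void *devQKVProjArray;
};

template <typename DT>
__global__ void touch_qkv(DT *qkv, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    qkv[i] = static_cast<DT>(0); // placeholder for the real QKV computation
  }
}

template <typename DT>
void compute_qkv_sketch(MetaSketch const *m, int n, cudaStream_t stream) {
  // Each DT-specialized instantiation casts the shared void* buffer once.
  DT *qkv = static_cast<DT *>(m->devQKVProjArray);
  touch_qkv<DT><<<(n + 255) / 256, 256, 0, stream>>>(qkv, n);
}
```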
@@ -756,9 +756,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
     BeamSearchBatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
-    GenericTensorAccessorR const &weight,
-    GenericTensorAccessorW const &output,
-    GenericTensorAccessorR const &bias) {
+    GenericTensorAccessorW const &output) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   bool use_bias = *m->qkv_bias || *m->final_bias;
@@ -770,35 +768,28 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
     cudaEventRecord(t_start, stream);
   }

-  assert(input.data_type == weight.data_type);
   assert(input.data_type == output.data_type);
-  if (use_bias) {
-    assert(input.data_type == bias.data_type);
-  }

   if (input.data_type == DT_HALF) {
-    half const *bias_ptr =
-        use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
+    half const *bias_ptr = static_cast<half const *>(nullptr);
     Kernels::SpecIncMultiHeadSelfAttention::inference_kernel(
         m,
         bc,
         shard_id,
         input.get_half_ptr(),
-        weight.get_half_ptr(),
+        static_cast<half const *>(nullptr),
         output.get_half_ptr(),
-        bias_ptr,
+        static_cast<half const *>(nullptr),
         stream);
   } else if (input.data_type == DT_FLOAT) {
-    float const *bias_ptr =
-        use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
     Kernels::SpecIncMultiHeadSelfAttention::inference_kernel(
         m,
         bc,
         shard_id,
         input.get_float_ptr(),
-        weight.get_float_ptr(),
+        static_cast<float const *>(nullptr),
         output.get_float_ptr(),
-        bias_ptr,
+        static_cast<float const *>(nullptr),
         stream);
   } else {
     assert(false && "Unspported data type");
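A note on the static_cast<half const *>(nullptr) arguments above: with a kernel templated on DT, a bare nullptr makes template argument deduction fail for that parameter (std::nullptr_t does not match DT const *), so the wrapper has to pass a typed null. Minimal self-contained illustration:

```cpp
template <typename DT>
void kernel_sketch(DT const *input_ptr, DT const *weight_ptr) {}

int main() {
  float const *input = nullptr;
  // kernel_sketch(input, nullptr);  // error: cannot deduce DT from std::nullptr_t
  kernel_sketch(input, static_cast<float const *>(nullptr)); // OK: DT = float
  return 0;
}
```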
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index 661a9199a6..287c9fc46a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -125,7 +125,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
                          data_type,
                          name,
                          1 /*inputs*/,
-                         weight_num /*weights*/,
+                         0 /*weights*/,
                          1 /*outputs*/,
                          casted_input);
   } else {
@@ -134,7 +134,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
                          data_type,
                          name,
                          1 /*inputs*/,
-                         weight_num /*weights*/,
+                         0 /*weights*/,
                          1 /*outputs*/,
                          input);
   }
@@ -159,37 +159,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
   int one_head_size = qParas + kParas + vParas + oParas;
   int weight_size = qParas * num_q_heads + kParas * num_q_heads +
                     vParas * num_q_heads + oParas * num_q_heads;
-  {
-    // compress the weight size if quantization.
-    if (quantization_type != DT_NONE) {
-      one_head_size = get_quantization_to_byte_size(
-          data_type, quantization_type, one_head_size);
-    }
-
-    int dims[1] = {weight_size};
-    li->weights[0] = create_weight_legion_ordering(
-        1,
-        dims,
-        quantization_type == DT_NONE ? data_type : quantization_type,
-        li,
-        true /*create_grad*/,
-        kernel_initializer,
-        CHOSEN_SYNC_TYPE);
-  }
-  if (qkv_bias || final_bias) {
-    // q, k, v, o
-    int qkv_bias_size =
-        qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-    int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
-                   (final_bias ? oProjSize : 0)};
-    li->weights[1] = create_weight_legion_ordering(1,
-                                                   dims,
-                                                   data_type,
-                                                   li,
-                                                   true /*create_grad*/,
-                                                   kernel_initializer,
-                                                   CHOSEN_SYNC_TYPE);
-  }
+
   li->data_type = data_type;
   li->add_int_property("embed_dim", embed_dim);
   li->add_int_property("num_q_heads", num_q_heads);
@@ -305,7 +275,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
         _input->data_type,
         name,
         1 /*inputs*/,
-        (_qkv_bias || _final_bias ? 2 : 1) /*weights*/,
+        0 /*weights*/,
         1 /*outputs*/,
         _input),
       num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout),
@@ -330,7 +300,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     dims[i] = _input->dims[i];
   }
   dims[0].size = _embed_dim;
-  // Currently require no parallelism along this dim
+  // Parallelism along this dim is no longer restricted
   // assert(dims[0].degree == 1);
   if (allocate_weights) {
     // Create weight tensor
@@ -357,29 +327,6 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     // dims[2].parallel_idx = -1;
     int seed = std::rand();
     Initializer *initializer = new GlorotUniform(seed);
-    weights[0] = model.create_parallel_weight<2>(
-        dims,
-        quantization_type == DT_NONE ? this->data_type : quantization_type,
-        NULL /*owner_op*/,
-        true /*create_grad*/,
-        initializer,
-        CHOSEN_SYNC_TYPE);
-    if (qkv_bias || final_bias) {
-      ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-      bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
-      bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
-      weights[1] =
-          model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
-                                                       bias_shape.dims,
-                                                       this->data_type,
-                                                       nullptr /*owner_op*/,
-                                                       true /*create_grad*/,
-                                                       initializer,
-                                                       CHOSEN_SYNC_TYPE);
-    }
   }

   outputs[0] = model.create_parallel_tensor_legion_ordering(
@@ -420,7 +367,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
         _input->data_type,
         name,
         1 /*inputs*/,
-        (_qkv_bias || _final_bias ? 2 : 1) /*weights*/,
+        0 /*weights*/,
         1 /*outputs*/,
         _input,
         _weight),
@@ -445,7 +392,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     dims[i] = _input->dims[i];
   }
   dims[0].size = _embed_dim;
-  // Currently require no parallelism along this dim
+  // Currently require no parallelism along this dim -- is this consistent with the assert removed in the other constructor?
   assert(dims[0].degree == 1);
   if (allocate_weights) {
     // Create weight tensor
@@ -470,29 +417,6 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     }
     int seed = std::rand();
     Initializer *initializer = new GlorotUniform(seed);
-    weights[0] = model.create_parallel_weight<2>(
-        dims,
-        quantization_type == DT_NONE ? this->data_type : quantization_type,
-        NULL /*owner_op*/,
-        true /*create_grad*/,
-        initializer,
-        CHOSEN_SYNC_TYPE);
-    if (qkv_bias || final_bias) {
-      ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-      bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
-      bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
-      weights[1] =
-          model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
-                                                       bias_shape.dims,
-                                                       this->data_type,
-                                                       nullptr /*owner_op*/,
-                                                       true /*create_grad*/,
-                                                       initializer,
-                                                       CHOSEN_SYNC_TYPE);
-    }
   }

   outputs[0] = model.create_parallel_tensor_legion_ordering(
@@ -592,20 +516,12 @@ void TreeIncMultiHeadSelfAttention::init_inference(
                                                     EXCLUSIVE,
                                                     batch_inputs[0]->region));
   launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(
-      RegionRequirement(weights[0]->part,
-                        0 /*projection id*/,
-                        READ_ONLY,
-                        EXCLUSIVE,
-                        weights[0]->region,
-                        ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0));
-  launcher.add_field(1, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     batch_outputs[0]->region));
-  launcher.add_field(2, FID_DATA);
+  launcher.add_field(1, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
@@ -633,18 +549,12 @@ void TreeIncMultiHeadSelfAttention::init(FFModel const &ff) {
                                                     EXCLUSIVE,
                                                     inputs[0]->region));
   launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    weights[0]->region));
-  launcher.add_field(1, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     outputs[0]->region));
-  launcher.add_field(2, FID_DATA);
+  launcher.add_field(1, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap(ff, fm);
@@ -671,17 +581,10 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
                                        FID_DATA,
                                        ctx,
                                        runtime);
-  GenericTensorAccessorR weight =
-      helperGetGenericTensorAccessorRO(attn->weights[0]->data_type,
-                                       regions[1],
-                                       task->regions[1],
-                                       FID_DATA,
-                                       ctx,
-                                       runtime);
   GenericTensorAccessorW output =
       helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type,
-                                       regions[2],
-                                       task->regions[2],
+                                       regions[1],
+                                       task->regions[1],
                                        FID_DATA,
                                        ctx,
                                        runtime);
@@ -694,8 +597,10 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
   int num_kv_heads =
       attn->num_kv_heads / attn->tensor_parallelism_degree +
       (attn->num_kv_heads % attn->tensor_parallelism_degree != 0);
-
-  assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);
+  if (attn->oProjSize != output.domain.hi()[0] - output.domain.lo()[0] + 1) {
+    std::cout << "attn->oProjSize: " << attn->oProjSize << " does not match output domain dim[0]: " << output.domain.hi()[0] - output.domain.lo()[0] + 1 << std::endl;
+  }
+  assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);

   Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
                        .only_kind(Memory::GPU_FB_MEM)
@@ -711,7 +616,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
   TreeIncMultiHeadSelfAttentionMeta *m =
       new TreeIncMultiHeadSelfAttentionMeta(handle,
                                             attn,
-                                            weight,
+                                            GenericTensorAccessorR(),
                                             gpu_mem_allocator,
                                             num_samples,
                                             num_q_heads,
@@ -726,10 +631,6 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
   std::strcpy(m->op_name, attn->name);
   m->layer_guid = attn->layer_guid;

-  if (attn->quantization_type == DT_NONE) {
-    assert(weight.domain.get_volume() * data_type_size(weight.data_type) ==
-           m->weightSize);
-  }
   return m;
 }
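The init_task change above prints the mismatching values before asserting, which makes shape failures far easier to debug than a bare assert. A self-contained sketch of the same print-then-assert pattern (the two ints stand in for attn->oProjSize and the output domain extent):

```cpp
#include <cassert>
#include <cstdio>

void check_output_extent(int o_proj_size, int output_dim0) {
  if (o_proj_size != output_dim0) {
    std::fprintf(stderr,
                 "attn->oProjSize: %d does not match output domain dim[0]: %d\n",
                 o_proj_size, output_dim0);
  }
  assert(o_proj_size == output_dim0);
}
```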
@@ -767,37 +668,18 @@ FutureMap TreeIncMultiHeadSelfAttention::inference(
                                                     EXCLUSIVE,
                                                     batch_inputs[0]->region));
   launcher.add_field(idx++, FID_DATA);
-  launcher.add_region_requirement(
-      RegionRequirement(weights[0]->part,
-                        0 /*projection id*/,
-                        READ_ONLY,
-                        EXCLUSIVE,
-                        weights[0]->region,
-                        ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0));
-  launcher.add_field(idx++, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     batch_outputs[0]->region));
   launcher.add_field(idx++, FID_DATA);
-  if (qkv_bias || final_bias) {
-    launcher.add_region_requirement(
-        RegionRequirement(weights[1]->part,
-                          0 /*projection id*/,
-                          READ_ONLY,
-                          EXCLUSIVE,
-                          weights[1]->region,
-                          ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0));
-    launcher.add_field(idx++, FID_DATA);
-  }
   return runtime->execute_index_space(ctx, launcher);
 }

 /*
   regions[0](I): input
-  regions[3](I): weight
-  regions[4](O): output
+  regions[1](O): output
 */
 void TreeIncMultiHeadSelfAttention::inference_task(
     Task const *task,
@@ -818,37 +700,19 @@ void TreeIncMultiHeadSelfAttention::inference_task(

   TreeIncMultiHeadSelfAttentionMeta *m =
       *((TreeIncMultiHeadSelfAttentionMeta **)task->local_args);
-  assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4
-                                           : regions.size() == 3));
+  assert(regions.size() == 2);

   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
       m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO(
-      m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-      m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime);
-  GenericTensorAccessorR biases;
-  if (*m->qkv_bias || *m->final_bias) {
-    biases = helperGetGenericTensorAccessorRO(m->weight_type[1],
-                                              regions[3],
-                                              task->regions[3],
-                                              FID_DATA,
-                                              ctx,
-                                              runtime);
-    Domain bias_domain = runtime->get_index_space_domain(
-        ctx, task->regions[3].region.get_index_space());
-    assert(bias_domain.get_dim() == 4);
-  }
+      m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);

   Domain input_domain = runtime->get_index_space_domain(
       ctx, task->regions[0].region.get_index_space());
-  Domain weight_domain = runtime->get_index_space_domain(
-      ctx, task->regions[1].region.get_index_space());
   Domain output_domain = runtime->get_index_space_domain(
-      ctx, task->regions[2].region.get_index_space());
+      ctx, task->regions[1].region.get_index_space());
   assert(input_domain.get_dim() == 4);
-  assert(weight_domain.get_dim() == 2);
   assert(output_domain.get_dim() == 4);

   /* print_tensor<float>(input.get_float_ptr(),
@@ -858,18 +722,13 @@ void TreeIncMultiHeadSelfAttention::inference_task(
   assert(task->index_point.get_dim() == 1);

   TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
-      m, &bc, task->index_point.point_data[0], input, weight, output, biases);
+      m, &bc, task->index_point.point_data[0], input, output);

   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
-    std::vector<GenericTensorAccessorR> weights_accessors;
-    weights_accessors.push_back(weight);
-    if (*m->qkv_bias || *m->final_bias) {
-      weights_accessors.push_back(biases);
-    }
     TreeIncMultiHeadSelfAttention::save_inference_tensors_to_file(
-        m, shard_id, &bc, {input}, weights_accessors, {output});
+        m, shard_id, &bc, {input}, {}, {output});
   }
 }
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 7bdd520df4..c42256d59f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -927,14 +927,14 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,

   // phase 1: Implement kernel to compute KQV for input tokens
   // TODO WARNING: this is commented out only because we are fixing the inc_attn first
-  // compute_qkv_kernel(m,
-  //                    bc,
-  //                    shard_id,
-  //                    // input_ptr,
-  //                    weight_ptr,
-  //                    static_cast<DT *>(m->devQKVProjArray),
-  //                    bias_ptr,
-  //                    stream);
+  compute_qkv_kernel(m,
+                     bc,
+                     shard_id,
+                     // input_ptr,
+                     // weight_ptr,
+                     static_cast<DT *>(m->devQKVProjArray),
+                     // bias_ptr,
+                     stream);

   // phase 2: No need to update key/val cache
   // IncMultiHeadSelfAttention::update_kv_cache_kernel(
@@ -970,9 +970,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
     TreeVerifyBatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
-    GenericTensorAccessorR const &weight,
-    GenericTensorAccessorW const &output,
-    GenericTensorAccessorR const &bias) {
+    GenericTensorAccessorW const &output) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   bool use_bias = *m->qkv_bias || *m->final_bias;
@@ -986,41 +984,26 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(

   // assert(input.data_type == weight.data_type);
   assert(input.data_type == output.data_type);
-  if (use_bias) {
-    assert(input.data_type == bias.data_type);
-  }

   if (input.data_type == DT_HALF) {
-    if (m->offload) {
-      pre_build_weight_kernel<half>(m, weight, input.data_type, stream);
-    }
-
-    half const *bias_ptr =
-        use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
     Kernels::TreeIncMultiHeadAttention::inference_kernel(
         m,
         bc,
         shard_id,
         input.get_half_ptr(),
-        m->offload ? static_cast<half *>(m->weight_ptr) : weight.get_half_ptr(),
+        (half *)nullptr,
         output.get_half_ptr(),
-        bias_ptr,
+        (half *)nullptr,
         stream);
   } else if (input.data_type == DT_FLOAT) {
-    if (m->offload) {
-      pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
-    }
-    float const *bias_ptr =
-        use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
     Kernels::TreeIncMultiHeadAttention::inference_kernel(
         m,
         bc,
         shard_id,
         input.get_float_ptr(),
-        m->offload ? static_cast<float *>(m->weight_ptr)
-                   : weight.get_float_ptr(),
+        (float *)nullptr,
         output.get_float_ptr(),
-        bias_ptr,
+        (float *)nullptr,
         stream);
   } else {
     assert(false && "Unspported data type");
diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index e47d873cb5..0cb12e3b0e 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -347,18 +347,12 @@ void load_attention_weights_to_dense_v2(DT *ptr,
         }
       }
     }
-    // for (int i = 0; i < one_weight_file_size; i++) {
-    //   ptr[base_index + i] = host_array.at(data_index++);
-    // }
     std::cout << "host array going out of scope, releasing" << std::endl;
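The log line kept above documents why the deleted copy loop is unnecessary: host_array releases its storage automatically when it leaves scope. A minimal self-contained illustration of that RAII behavior (the vector is a stand-in for the loaded weight buffer):

```cpp
#include <iostream>
#include <vector>

int main() {
  {
    std::vector<float> host_array(1 << 20); // stand-in for the loaded weights
    // ... copy host_array into its destination buffer here ...
  } // host_array's heap storage is freed here; no manual release is needed
  std::cout << "host array went out of scope, storage released" << std::endl;
  return 0;
}
```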