This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

compile without dpcpp
luoyu-intel committed Jun 4, 2024
1 parent e55cb0f commit 379ed27
Showing 17 changed files with 185 additions and 124 deletions.
123 changes: 91 additions & 32 deletions neural_speed/core/ne_layers.c
@@ -1204,7 +1204,7 @@ struct ne_tensor* ne_dup_tensor(struct ne_context* ctx, const struct ne_tensor*
return ne_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL, src->size, src->backend);
}

struct ne_tensor* ne_dup_tensor(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk) {
struct ne_tensor* ne_dup_tensor_bk(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk) {
return ne_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL, src->size, bk);
}

@@ -1451,7 +1451,7 @@ struct ne_tensor* ne_view_tensor(struct ne_context* ctx, const struct ne_tensor*
return result;
}

struct ne_tensor* ne_view_tensor(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk) {
struct ne_tensor* ne_view_tensor_bk(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk) {
struct ne_tensor* result = ne_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data, src->size, bk);

result->nb[0] = src->nb[0];
@@ -1461,6 +1461,7 @@ struct ne_tensor* ne_view_tensor(struct ne_context* ctx, const struct ne_tensor*

return result;
}

////////////////////////////////////////////////////////////////////////////////
#ifdef NS_TP_MODEL
// ne_dump_tensor
@@ -1506,9 +1507,13 @@ struct ne_tensor* ne_debug_op(struct ne_context* ctx, struct ne_tensor* a, ne_de
return result;
}

struct ne_tensor* ne_dup(struct ne_context* ctx, struct ne_tensor* a) { return ne_dup_impl(ctx, a, false); }
struct ne_tensor* ne_dup(struct ne_context* ctx, struct ne_tensor* a) {
return ne_dup_impl(ctx, a, false);
}

struct ne_tensor* ne_dup_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_dup_impl(ctx, a, true); }
struct ne_tensor* ne_dup_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_dup_impl(ctx, a, true);
}

// ne_add

@@ -1522,7 +1527,7 @@ struct ne_tensor* ne_add_impl(struct ne_context* ctx, struct ne_tensor* a, struc
}
enum ne_op op = NE_OP_ADD;
enum ne_backend bk = bestla_backend_support(a, b, op);
struct ne_tensor* result = inplace ? ne_view_tensor(ctx, a, bk) : ne_dup_tensor(ctx, a, bk);
struct ne_tensor* result = inplace ? ne_view_tensor_bk(ctx, a, bk) : ne_dup_tensor_bk(ctx, a, bk);

result->op = NE_OP_ADD;
result->grad = NULL;
@@ -1866,9 +1871,13 @@ struct ne_tensor* ne_sqr_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_sqr(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqr_impl(ctx, a, false); }
struct ne_tensor* ne_sqr(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sqr_impl(ctx, a, false);
}

struct ne_tensor* ne_sqr_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqr_impl(ctx, a, true); }
struct ne_tensor* ne_sqr_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sqr_impl(ctx, a, true);
}

// ne_sqrt

@@ -1888,9 +1897,13 @@ struct ne_tensor* ne_sqrt_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_sqrt(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqrt_impl(ctx, a, false); }
struct ne_tensor* ne_sqrt(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sqrt_impl(ctx, a, false);
}

struct ne_tensor* ne_sqrt_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqrt_impl(ctx, a, true); }
struct ne_tensor* ne_sqrt_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sqrt_impl(ctx, a, true);
}

// ne_log

@@ -1910,9 +1923,13 @@ struct ne_tensor* ne_log_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_log(struct ne_context* ctx, struct ne_tensor* a) { return ne_log_impl(ctx, a, false); }
struct ne_tensor* ne_log(struct ne_context* ctx, struct ne_tensor* a) {
return ne_log_impl(ctx, a, false);
}

struct ne_tensor* ne_log_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_log_impl(ctx, a, true); }
struct ne_tensor* ne_log_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_log_impl(ctx, a, true);
}

// ne_sum

@@ -2017,9 +2034,13 @@ struct ne_tensor* ne_abs_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_abs(struct ne_context* ctx, struct ne_tensor* a) { return ne_abs_impl(ctx, a, false); }
struct ne_tensor* ne_abs(struct ne_context* ctx, struct ne_tensor* a) {
return ne_abs_impl(ctx, a, false);
}

struct ne_tensor* ne_abs_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_abs_impl(ctx, a, true); }
struct ne_tensor* ne_abs_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_abs_impl(ctx, a, true);
}

// ne_sgn

@@ -2039,9 +2060,13 @@ struct ne_tensor* ne_sgn_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_sgn(struct ne_context* ctx, struct ne_tensor* a) { return ne_sgn_impl(ctx, a, false); }
struct ne_tensor* ne_sgn(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sgn_impl(ctx, a, false);
}

struct ne_tensor* ne_sgn_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sgn_impl(ctx, a, true); }
struct ne_tensor* ne_sgn_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sgn_impl(ctx, a, true);
}

// ne_neg

@@ -2061,9 +2086,13 @@ struct ne_tensor* ne_neg_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_neg(struct ne_context* ctx, struct ne_tensor* a) { return ne_neg_impl(ctx, a, false); }
struct ne_tensor* ne_neg(struct ne_context* ctx, struct ne_tensor* a) {
return ne_neg_impl(ctx, a, false);
}

struct ne_tensor* ne_neg_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_neg_impl(ctx, a, true); }
struct ne_tensor* ne_neg_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_neg_impl(ctx, a, true);
}

// ne_step

@@ -2083,9 +2112,13 @@ struct ne_tensor* ne_step_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_step(struct ne_context* ctx, struct ne_tensor* a) { return ne_step_impl(ctx, a, false); }
struct ne_tensor* ne_step(struct ne_context* ctx, struct ne_tensor* a) {
return ne_step_impl(ctx, a, false);
}

struct ne_tensor* ne_step_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_step_impl(ctx, a, true); }
struct ne_tensor* ne_step_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_step_impl(ctx, a, true);
}

// ne_relu

@@ -2105,9 +2138,13 @@ struct ne_tensor* ne_relu_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_relu(struct ne_context* ctx, struct ne_tensor* a) { return ne_relu_impl(ctx, a, false); }
struct ne_tensor* ne_relu(struct ne_context* ctx, struct ne_tensor* a) {
return ne_relu_impl(ctx, a, false);
}

struct ne_tensor* ne_relu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_relu_impl(ctx, a, true); }
struct ne_tensor* ne_relu_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_relu_impl(ctx, a, true);
}

// ne_gelu

@@ -2127,9 +2164,13 @@ struct ne_tensor* ne_gelu_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_gelu(struct ne_context* ctx, struct ne_tensor* a) { return ne_gelu_impl(ctx, a, false); }
struct ne_tensor* ne_gelu(struct ne_context* ctx, struct ne_tensor* a) {
return ne_gelu_impl(ctx, a, false);
}

struct ne_tensor* ne_gelu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_gelu_impl(ctx, a, true); }
struct ne_tensor* ne_gelu_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_gelu_impl(ctx, a, true);
}

// ne_silu

@@ -2149,9 +2190,13 @@ struct ne_tensor* ne_silu_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_silu(struct ne_context* ctx, struct ne_tensor* a) { return ne_silu_impl(ctx, a, false); }
struct ne_tensor* ne_silu(struct ne_context* ctx, struct ne_tensor* a) {
return ne_silu_impl(ctx, a, false);
}

struct ne_tensor* ne_silu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_silu_impl(ctx, a, true); }
struct ne_tensor* ne_silu_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_silu_impl(ctx, a, true);
}

// ne_silu_back

@@ -2709,9 +2754,13 @@ struct ne_tensor* ne_cont_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_cont(struct ne_context* ctx, struct ne_tensor* a) { return ne_cont_impl(ctx, a, false); }
struct ne_tensor* ne_cont(struct ne_context* ctx, struct ne_tensor* a) {
return ne_cont_impl(ctx, a, false);
}

struct ne_tensor* ne_cont_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_cont_impl(ctx, a, true); }
struct ne_tensor* ne_cont_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_cont_impl(ctx, a, true);
}

// ne_reshape

@@ -3247,7 +3296,9 @@ struct ne_tensor* ne_soft_max_impl(struct ne_context* ctx, struct ne_tensor* a,
return result;
}

struct ne_tensor* ne_soft_max(struct ne_context* ctx, struct ne_tensor* a) { return ne_soft_max_impl(ctx, a, false); }
struct ne_tensor* ne_soft_max(struct ne_context* ctx, struct ne_tensor* a) {
return ne_soft_max_impl(ctx, a, false);
}

struct ne_tensor* ne_soft_max_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_soft_max_impl(ctx, a, true);
@@ -4421,6 +4472,7 @@ static void ne_compute_forward_add_f32(const struct ne_compute_params* params, c
}
float* dstptr = dst->backend == NE_BACKEND_CPU ? (float*)dst->data : (float*)wsptr;
if (params->type == NE_TASK_INIT) {
#ifdef NS_SYCL
if (params->ith == 0) {
bool sync = src1->backend != NE_BACKEND_CPU || src0->backend != NE_BACKEND_CPU;
if (sync) {
@@ -4434,15 +4486,22 @@ static void ne_compute_forward_add_f32(const struct ne_compute_params* params, c
bestla_device_sync(params->dev_queue);
}
}
#else
NE_ASSERT(0);
#endif
return;
}

if (params->type == NE_TASK_FINALIZE) {
#ifdef NS_SYCL
if (params->ith == 0) {
if (src1->backend != NE_BACKEND_CPU) {
bestla_device_memcpy_sync(dst->data, dstptr, dst->size, params->dev_queue);
}
}
#else
NE_ASSERT(0);
#endif
return;
}

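Note: the two hunks above wrap the device-synchronization and copy-back paths of ne_compute_forward_add_f32 in NS_SYCL guards, so the file still compiles when the dpcpp/SYCL toolchain is absent. The standalone sketch below is illustrative only (it is not code from this repository) and shows the shape of that guard: with NS_SYCL defined the device queue would be synchronized before the CPU kernel runs; in a CPU-only build the same path collapses to an assertion, because no tensor should sit on a non-CPU backend there.

/* Illustrative sketch of the NS_SYCL guard pattern; not repository code. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define NE_ASSERT(x) assert(x)

static void handle_device_inputs(bool any_input_on_device) {
#ifdef NS_SYCL
  if (any_input_on_device) {
    /* the real code calls bestla_device_sync() / bestla_device_memcpy_sync() here */
    printf("synchronize the SYCL queue before the CPU kernel runs\n");
  }
#else
  /* CPU-only build: reaching this point with device-resident data is a logic error */
  NE_ASSERT(!any_input_on_device);
#endif
}

int main(void) {
  handle_device_inputs(false); /* valid in both build modes */
  return 0;
}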
@@ -7252,7 +7311,7 @@ static void ne_compute_forward_mul_mat_id_q_f32(const struct ne_compute_params*
// char * wdata_src1_end = (char *)params->wdata;
// int64_t wdata_src1_end = 0;

#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
@@ -7414,7 +7473,7 @@ static void ne_compute_forward_mul_mat_id_f32(const struct ne_compute_params* pa
}
int64_t matrix_row_counts[100]; // [n_as]
int64_t matrix_rows[30000]; // [n_as][ne11]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
memset(matrix_rows, -1, 30000 * sizeof(int64_t));
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -7562,7 +7621,7 @@ static void ne_compute_forward_mul_mat_id_f16_f32(const struct ne_compute_params
}
int64_t matrix_row_counts[100]; // [n_as]
int64_t matrix_rows[30000]; // [n_as][ne11]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
memset(matrix_rows, -1, 30000 * sizeof(int64_t));
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -7691,7 +7750,7 @@ static void ne_compute_forward_mul_mat_id_q_f32_bestla(const struct ne_compute_p
// int64_t wdata_src1_end = 0;
int64_t matrix_row_counts[100]; // [n_as]
int64_t matrix_rows[30000]; // [n_as][ne11]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
3 changes: 3 additions & 0 deletions neural_speed/core/ne_layers.h
@@ -141,6 +141,9 @@ NE_API struct ne_tensor* ne_new_f32(struct ne_context* ctx, float value);
NE_API struct ne_tensor* ne_dup_tensor(struct ne_context* ctx, const struct ne_tensor* src);
NE_API struct ne_tensor* ne_view_tensor(struct ne_context* ctx, const struct ne_tensor* src);

NE_API struct ne_tensor* ne_dup_tensor_bk(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk);
NE_API struct ne_tensor* ne_view_tensor_bk(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk);

NE_API struct ne_tensor* ne_set_zero(struct ne_tensor* tensor);
NE_API struct ne_tensor* ne_set_i32(struct ne_tensor* tensor, int32_t value);
NE_API struct ne_tensor* ne_set_f32(struct ne_tensor* tensor, float value);
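Note: these declarations match the renames in ne_layers.c. The backend-taking variants previously reused the names ne_dup_tensor and ne_view_tensor, but C does not permit two functions with the same name and different signatures in one translation unit, so the explicit-backend versions become ne_dup_tensor_bk and ne_view_tensor_bk. A hypothetical caller (assuming only the declarations above and an include path for ne_layers.h) might look like:

#include <stdbool.h>
#include "ne_layers.h" /* assumed include path for the declarations above */

/* ne_dup_tensor() keeps the source tensor's backend; ne_dup_tensor_bk()
 * places the duplicate's metadata on an explicitly chosen backend. */
struct ne_tensor* dup_for_backend(struct ne_context* ctx, const struct ne_tensor* src,
                                  enum ne_backend bk, bool keep_src_backend) {
  if (keep_src_backend) {
    return ne_dup_tensor(ctx, src);      /* backend inherited from src */
  }
  return ne_dup_tensor_bk(ctx, src, bk); /* backend chosen by the caller */
}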
7 changes: 3 additions & 4 deletions neural_speed/models/bloom/bloom.cpp
@@ -147,8 +147,7 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp

// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
struct ne_tensor* Q = ne_permute(
ctx0, ne_cpy(ctx0, Qcur, ne_new_tensor_3d(ctx0, NE_TYPE_F32, n_embd / n_head, n_head, N, NE_SIZE_CALC)), 0, 2,
1, 3);
ctx0, ne_cpy(ctx0, Qcur, d_ne_new_tensor_3d(ctx0, NE_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2, 1, 3);

// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
struct ne_tensor* K = ne_permute(ctx0,
@@ -184,15 +183,15 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
il * n_ctx * ne_element_size(kv_self.v) * n_embd),
n_embd / n_head, n_head, n_past + N),
1, 2, 0, 3),
ne_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd / n_head, n_head, NE_SIZE_CALC));
d_ne_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd / n_head, n_head));
// KQV = transpose(V) * KQ_soft_max
struct ne_tensor* KQV = ne_mul_mat(ctx0, V_trans, KQ_soft_max);

// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N));

// projection
cur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur);
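Note: the model files in this commit replace ne_new_tensor_*(..., NE_SIZE_CALC) calls with d_ne_new_tensor_* helpers. Their definition is not part of this diff; a purely hypothetical reading, shown below for illustration only, is a thin wrapper that fills in the NE_SIZE_CALC argument (and possibly a default backend) so the call sites stay shorter.

/* Purely hypothetical sketch; the real d_ne_new_tensor_* helpers are defined
 * elsewhere in the repository and may do more (e.g. backend selection).
 * This only shows one way the shorter call sites could map back to the old calls. */
#define d_ne_new_tensor_1d(ctx, type, ne0) \
  ne_new_tensor_1d((ctx), (type), (ne0), NE_SIZE_CALC)
#define d_ne_new_tensor_2d(ctx, type, ne0, ne1) \
  ne_new_tensor_2d((ctx), (type), (ne0), (ne1), NE_SIZE_CALC)
#define d_ne_new_tensor_3d(ctx, type, ne0, ne1, ne2) \
  ne_new_tensor_3d((ctx), (type), (ne0), (ne1), (ne2), NE_SIZE_CALC)
#define d_ne_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3) \
  ne_new_tensor_4d((ctx), (type), (ne0), (ne1), (ne2), (ne3), NE_SIZE_CALC)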
2 changes: 1 addition & 1 deletion neural_speed/models/chatglm/chatglm.cpp
@@ -204,7 +204,7 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
if (n_past == 0) {
// build attention mask for context input
ne_tensor* inf =
ne_new_tensor_4d(ctx0, attn_scores->type, 1, qlen - 1, num_attention_heads, batch_size, NE_SIZE_CALC);
d_ne_new_tensor_4d(ctx0, attn_scores->type, 1, qlen - 1, num_attention_heads, batch_size);
ne_set_f32(inf, -INFINITY);

ne_tensor* masked_attn_scores =
4 changes: 2 additions & 2 deletions neural_speed/models/falcon/falcon.cpp
@@ -222,7 +222,7 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N));
} else { // Using MHA (GQA/MQA) managed kv-cache
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
@@ -272,7 +272,7 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
lctx.use_buf(ctx0, 1);

struct ne_tensor* inpFF = layernorm_output;
struct ne_tensor* attn_out = ne_cpy(ctx0, cur, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N, NE_SIZE_CALC));
struct ne_tensor* attn_out = ne_cpy(ctx0, cur, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N));

// FFN (pre_layer_norm output)
{
2 changes: 1 addition & 1 deletion neural_speed/models/gemma/gemma.cpp
@@ -256,7 +256,7 @@ static bool gemma_model_eval_internal(model_context* ctx, const model_input* inp

// cur = KQV_merged.contiguous().view(n_gqa_embd, N)
cur = ne_cpy(ctx0, KQV_merged,
ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_dim * n_head, N * batch_size, NE_SIZE_CALC));
d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_dim * n_head, N * batch_size));
} else {
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
4 changes: 2 additions & 2 deletions neural_speed/models/gptj/gptj.cpp
@@ -345,7 +345,7 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu

// for-loop self-attention
struct ne_tensor* KQV_merged_contiguous =
ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_size * n_head, seq_len_sum, NE_SIZE_CALC);
d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_size * n_head, seq_len_sum);
size_t off_sl = 0;
for (int gi = 0; gi < infer_groups.size(); ++gi) {
const int attn_bs = infer_groups[gi].size();
@@ -453,7 +453,7 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
} else if (attn_n_total == 0 && run_mha_bf16_first) {
// non-reordered kv-cache bf16 mha (first token only)
auto vnele = ne_nelements(Vcur);
struct ne_tensor* Vtmp = ne_new_tensor_1d(ctx0, NE_TYPE_F16, vnele, NE_SIZE_CALC);
struct ne_tensor* Vtmp = d_ne_new_tensor_1d(ctx0, NE_TYPE_F16, vnele);
Vtmp = ne_cpy(ctx0, ne_view_1d(ctx0, Vcur, vnele, 0), Vtmp);
Vtmp = ne_view_4d(ctx0, Vtmp, head_size, n_head, attn_sl, attn_bs, ne_element_size(Vtmp) * head_size,
ne_element_size(Vtmp) * head_size * n_head,