Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
compile without dpcpp
Browse files (browse the repository at this point in the history)
luoyu-intel committed Jun 4, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent e55cb0f commit 379ed27
Showing 17 changed files with 185 additions and 124 deletions.
123 changes: 91 additions & 32 deletions neural_speed/core/ne_layers.c
Original file line number Diff line number Diff line change
@@ -1204,7 +1204,7 @@ struct ne_tensor* ne_dup_tensor(struct ne_context* ctx, const struct ne_tensor*
return ne_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL, src->size, src->backend);
}

struct ne_tensor* ne_dup_tensor(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk) {
struct ne_tensor* ne_dup_tensor_bk(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk) {
return ne_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL, src->size, bk);
}

@@ -1451,7 +1451,7 @@ struct ne_tensor* ne_view_tensor(struct ne_context* ctx, const struct ne_tensor*
return result;
}

struct ne_tensor* ne_view_tensor(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk) {
struct ne_tensor* ne_view_tensor_bk(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk) {
struct ne_tensor* result = ne_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data, src->size, bk);

result->nb[0] = src->nb[0];
@@ -1461,6 +1461,7 @@ struct ne_tensor* ne_view_tensor(struct ne_context* ctx, const struct ne_tensor*

return result;
}

////////////////////////////////////////////////////////////////////////////////
#ifdef NS_TP_MODEL
// ne_dump_tensor
@@ -1506,9 +1507,13 @@ struct ne_tensor* ne_debug_op(struct ne_context* ctx, struct ne_tensor* a, ne_de
return result;
}

struct ne_tensor* ne_dup(struct ne_context* ctx, struct ne_tensor* a) { return ne_dup_impl(ctx, a, false); }
struct ne_tensor* ne_dup(struct ne_context* ctx, struct ne_tensor* a) {
return ne_dup_impl(ctx, a, false);
}

struct ne_tensor* ne_dup_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_dup_impl(ctx, a, true); }
struct ne_tensor* ne_dup_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_dup_impl(ctx, a, true);
}

// ne_add

@@ -1522,7 +1527,7 @@ struct ne_tensor* ne_add_impl(struct ne_context* ctx, struct ne_tensor* a, struc
}
enum ne_op op = NE_OP_ADD;
enum ne_backend bk = bestla_backend_support(a, b, op);
struct ne_tensor* result = inplace ? ne_view_tensor(ctx, a, bk) : ne_dup_tensor(ctx, a, bk);
struct ne_tensor* result = inplace ? ne_view_tensor_bk(ctx, a, bk) : ne_dup_tensor_bk(ctx, a, bk);

result->op = NE_OP_ADD;
result->grad = NULL;
@@ -1866,9 +1871,13 @@ struct ne_tensor* ne_sqr_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_sqr(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqr_impl(ctx, a, false); }
struct ne_tensor* ne_sqr(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sqr_impl(ctx, a, false);
}

struct ne_tensor* ne_sqr_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqr_impl(ctx, a, true); }
struct ne_tensor* ne_sqr_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sqr_impl(ctx, a, true);
}

// ne_sqrt

@@ -1888,9 +1897,13 @@ struct ne_tensor* ne_sqrt_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_sqrt(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqrt_impl(ctx, a, false); }
struct ne_tensor* ne_sqrt(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sqrt_impl(ctx, a, false);
}

struct ne_tensor* ne_sqrt_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqrt_impl(ctx, a, true); }
struct ne_tensor* ne_sqrt_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sqrt_impl(ctx, a, true);
}

// ne_log

@@ -1910,9 +1923,13 @@ struct ne_tensor* ne_log_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_log(struct ne_context* ctx, struct ne_tensor* a) { return ne_log_impl(ctx, a, false); }
struct ne_tensor* ne_log(struct ne_context* ctx, struct ne_tensor* a) {
return ne_log_impl(ctx, a, false);
}

struct ne_tensor* ne_log_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_log_impl(ctx, a, true); }
struct ne_tensor* ne_log_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_log_impl(ctx, a, true);
}

// ne_sum

@@ -2017,9 +2034,13 @@ struct ne_tensor* ne_abs_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_abs(struct ne_context* ctx, struct ne_tensor* a) { return ne_abs_impl(ctx, a, false); }
struct ne_tensor* ne_abs(struct ne_context* ctx, struct ne_tensor* a) {
return ne_abs_impl(ctx, a, false);
}

struct ne_tensor* ne_abs_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_abs_impl(ctx, a, true); }
struct ne_tensor* ne_abs_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_abs_impl(ctx, a, true);
}

// ne_sgn

@@ -2039,9 +2060,13 @@ struct ne_tensor* ne_sgn_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_sgn(struct ne_context* ctx, struct ne_tensor* a) { return ne_sgn_impl(ctx, a, false); }
struct ne_tensor* ne_sgn(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sgn_impl(ctx, a, false);
}

struct ne_tensor* ne_sgn_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sgn_impl(ctx, a, true); }
struct ne_tensor* ne_sgn_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_sgn_impl(ctx, a, true);
}

// ne_neg

@@ -2061,9 +2086,13 @@ struct ne_tensor* ne_neg_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_neg(struct ne_context* ctx, struct ne_tensor* a) { return ne_neg_impl(ctx, a, false); }
struct ne_tensor* ne_neg(struct ne_context* ctx, struct ne_tensor* a) {
return ne_neg_impl(ctx, a, false);
}

struct ne_tensor* ne_neg_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_neg_impl(ctx, a, true); }
struct ne_tensor* ne_neg_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_neg_impl(ctx, a, true);
}

// ne_step

@@ -2083,9 +2112,13 @@ struct ne_tensor* ne_step_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_step(struct ne_context* ctx, struct ne_tensor* a) { return ne_step_impl(ctx, a, false); }
struct ne_tensor* ne_step(struct ne_context* ctx, struct ne_tensor* a) {
return ne_step_impl(ctx, a, false);
}

struct ne_tensor* ne_step_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_step_impl(ctx, a, true); }
struct ne_tensor* ne_step_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_step_impl(ctx, a, true);
}

// ne_relu

@@ -2105,9 +2138,13 @@ struct ne_tensor* ne_relu_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_relu(struct ne_context* ctx, struct ne_tensor* a) { return ne_relu_impl(ctx, a, false); }
struct ne_tensor* ne_relu(struct ne_context* ctx, struct ne_tensor* a) {
return ne_relu_impl(ctx, a, false);
}

struct ne_tensor* ne_relu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_relu_impl(ctx, a, true); }
struct ne_tensor* ne_relu_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_relu_impl(ctx, a, true);
}

// ne_gelu

@@ -2127,9 +2164,13 @@ struct ne_tensor* ne_gelu_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_gelu(struct ne_context* ctx, struct ne_tensor* a) { return ne_gelu_impl(ctx, a, false); }
struct ne_tensor* ne_gelu(struct ne_context* ctx, struct ne_tensor* a) {
return ne_gelu_impl(ctx, a, false);
}

struct ne_tensor* ne_gelu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_gelu_impl(ctx, a, true); }
struct ne_tensor* ne_gelu_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_gelu_impl(ctx, a, true);
}

// ne_silu

@@ -2149,9 +2190,13 @@ struct ne_tensor* ne_silu_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_silu(struct ne_context* ctx, struct ne_tensor* a) { return ne_silu_impl(ctx, a, false); }
struct ne_tensor* ne_silu(struct ne_context* ctx, struct ne_tensor* a) {
return ne_silu_impl(ctx, a, false);
}

struct ne_tensor* ne_silu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_silu_impl(ctx, a, true); }
struct ne_tensor* ne_silu_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_silu_impl(ctx, a, true);
}

// ne_silu_back

@@ -2709,9 +2754,13 @@ struct ne_tensor* ne_cont_impl(struct ne_context* ctx, struct ne_tensor* a, bool
return result;
}

struct ne_tensor* ne_cont(struct ne_context* ctx, struct ne_tensor* a) { return ne_cont_impl(ctx, a, false); }
struct ne_tensor* ne_cont(struct ne_context* ctx, struct ne_tensor* a) {
return ne_cont_impl(ctx, a, false);
}

struct ne_tensor* ne_cont_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_cont_impl(ctx, a, true); }
struct ne_tensor* ne_cont_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_cont_impl(ctx, a, true);
}

// ne_reshape

@@ -3247,7 +3296,9 @@ struct ne_tensor* ne_soft_max_impl(struct ne_context* ctx, struct ne_tensor* a,
return result;
}

struct ne_tensor* ne_soft_max(struct ne_context* ctx, struct ne_tensor* a) { return ne_soft_max_impl(ctx, a, false); }
struct ne_tensor* ne_soft_max(struct ne_context* ctx, struct ne_tensor* a) {
return ne_soft_max_impl(ctx, a, false);
}

struct ne_tensor* ne_soft_max_inplace(struct ne_context* ctx, struct ne_tensor* a) {
return ne_soft_max_impl(ctx, a, true);
@@ -4421,6 +4472,7 @@ static void ne_compute_forward_add_f32(const struct ne_compute_params* params, c
}
float* dstptr = dst->backend == NE_BACKEND_CPU ? (float*)dst->data : (float*)wsptr;
if (params->type == NE_TASK_INIT) {
#ifdef NS_SYCL
if (params->ith == 0) {
bool sync = src1->backend != NE_BACKEND_CPU || src0->backend != NE_BACKEND_CPU;
if (sync) {
@@ -4434,15 +4486,22 @@ static void ne_compute_forward_add_f32(const struct ne_compute_params* params, c
bestla_device_sync(params->dev_queue);
}
}
#else
NE_ASSERT(0);
#endif
return;
}

if (params->type == NE_TASK_FINALIZE) {
#ifdef NS_SYCL
if (params->ith == 0) {
if (src1->backend != NE_BACKEND_CPU) {
bestla_device_memcpy_sync(dst->data, dstptr, dst->size, params->dev_queue);
}
}
#else
NE_ASSERT(0);
#endif
return;
}

@@ -7252,7 +7311,7 @@ static void ne_compute_forward_mul_mat_id_q_f32(const struct ne_compute_params*
// char * wdata_src1_end = (char *)params->wdata;
// int64_t wdata_src1_end = 0;

#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
@@ -7414,7 +7473,7 @@ static void ne_compute_forward_mul_mat_id_f32(const struct ne_compute_params* pa
}
int64_t matrix_row_counts[100]; // [n_as]
int64_t matrix_rows[30000]; // [n_as][ne11]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
memset(matrix_rows, -1, 30000 * sizeof(int64_t));
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -7562,7 +7621,7 @@ static void ne_compute_forward_mul_mat_id_f16_f32(const struct ne_compute_params
}
int64_t matrix_row_counts[100]; // [n_as]
int64_t matrix_rows[30000]; // [n_as][ne11]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
memset(matrix_rows, -1, 30000 * sizeof(int64_t));
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -7691,7 +7750,7 @@ static void ne_compute_forward_mul_mat_id_q_f32_bestla(const struct ne_compute_p
// int64_t wdata_src1_end = 0;
int64_t matrix_row_counts[100]; // [n_as]
int64_t matrix_rows[30000]; // [n_as][ne11]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id) * ne11 + (i1)]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
3 changes: 3 additions & 0 deletions neural_speed/core/ne_layers.h
Original file line number Diff line number Diff line change
@@ -141,6 +141,9 @@ NE_API struct ne_tensor* ne_new_f32(struct ne_context* ctx, float value);
NE_API struct ne_tensor* ne_dup_tensor(struct ne_context* ctx, const struct ne_tensor* src);
NE_API struct ne_tensor* ne_view_tensor(struct ne_context* ctx, const struct ne_tensor* src);

NE_API struct ne_tensor* ne_dup_tensor_bk(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk);
NE_API struct ne_tensor* ne_view_tensor_bk(struct ne_context* ctx, const struct ne_tensor* src, enum ne_backend bk);

NE_API struct ne_tensor* ne_set_zero(struct ne_tensor* tensor);
NE_API struct ne_tensor* ne_set_i32(struct ne_tensor* tensor, int32_t value);
NE_API struct ne_tensor* ne_set_f32(struct ne_tensor* tensor, float value);
7 changes: 3 additions & 4 deletions neural_speed/models/bloom/bloom.cpp
Original file line number Diff line number Diff line change
@@ -147,8 +147,7 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp

// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
struct ne_tensor* Q = ne_permute(
ctx0, ne_cpy(ctx0, Qcur, ne_new_tensor_3d(ctx0, NE_TYPE_F32, n_embd / n_head, n_head, N, NE_SIZE_CALC)), 0, 2,
1, 3);
ctx0, ne_cpy(ctx0, Qcur, d_ne_new_tensor_3d(ctx0, NE_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2, 1, 3);

// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
struct ne_tensor* K = ne_permute(ctx0,
@@ -184,15 +183,15 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
il * n_ctx * ne_element_size(kv_self.v) * n_embd),
n_embd / n_head, n_head, n_past + N),
1, 2, 0, 3),
ne_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd / n_head, n_head, NE_SIZE_CALC));
d_ne_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd / n_head, n_head));
// KQV = transpose(V) * KQ_soft_max
struct ne_tensor* KQV = ne_mul_mat(ctx0, V_trans, KQ_soft_max);

// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N));

// projection
cur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur);
2 changes: 1 addition & 1 deletion neural_speed/models/chatglm/chatglm.cpp
Original file line number Diff line number Diff line change
@@ -204,7 +204,7 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
if (n_past == 0) {
// build attention mask for context input
ne_tensor* inf =
ne_new_tensor_4d(ctx0, attn_scores->type, 1, qlen - 1, num_attention_heads, batch_size, NE_SIZE_CALC);
d_ne_new_tensor_4d(ctx0, attn_scores->type, 1, qlen - 1, num_attention_heads, batch_size);
ne_set_f32(inf, -INFINITY);

ne_tensor* masked_attn_scores =
4 changes: 2 additions & 2 deletions neural_speed/models/falcon/falcon.cpp
Original file line number Diff line number Diff line change
@@ -222,7 +222,7 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N));
} else { // Using MHA (GQA/MQA) managed kv-cache
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
@@ -272,7 +272,7 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
lctx.use_buf(ctx0, 1);

struct ne_tensor* inpFF = layernorm_output;
struct ne_tensor* attn_out = ne_cpy(ctx0, cur, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N, NE_SIZE_CALC));
struct ne_tensor* attn_out = ne_cpy(ctx0, cur, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N));

// FFN (pre_layer_norm output)
{
2 changes: 1 addition & 1 deletion neural_speed/models/gemma/gemma.cpp
Original file line number Diff line number Diff line change
@@ -256,7 +256,7 @@ static bool gemma_model_eval_internal(model_context* ctx, const model_input* inp

// cur = KQV_merged.contiguous().view(n_gqa_embd, N)
cur = ne_cpy(ctx0, KQV_merged,
ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_dim * n_head, N * batch_size, NE_SIZE_CALC));
d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_dim * n_head, N * batch_size));
} else {
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
4 changes: 2 additions & 2 deletions neural_speed/models/gptj/gptj.cpp
Original file line number Diff line number Diff line change
@@ -345,7 +345,7 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu

// for-loop self-attention
struct ne_tensor* KQV_merged_contiguous =
ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_size * n_head, seq_len_sum, NE_SIZE_CALC);
d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_size * n_head, seq_len_sum);
size_t off_sl = 0;
for (int gi = 0; gi < infer_groups.size(); ++gi) {
const int attn_bs = infer_groups[gi].size();
@@ -453,7 +453,7 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
} else if (attn_n_total == 0 && run_mha_bf16_first) {
// non-reordered kv-cache bf16 mha (first token only)
auto vnele = ne_nelements(Vcur);
struct ne_tensor* Vtmp = ne_new_tensor_1d(ctx0, NE_TYPE_F16, vnele, NE_SIZE_CALC);
struct ne_tensor* Vtmp = d_ne_new_tensor_1d(ctx0, NE_TYPE_F16, vnele);
Vtmp = ne_cpy(ctx0, ne_view_1d(ctx0, Vcur, vnele, 0), Vtmp);
Vtmp = ne_view_4d(ctx0, Vtmp, head_size, n_head, attn_sl, attn_bs, ne_element_size(Vtmp) * head_size,
ne_element_size(Vtmp) * head_size * n_head,
2 changes: 1 addition & 1 deletion neural_speed/models/gptneox/gptneox.cpp
Original file line number Diff line number Diff line change
@@ -262,7 +262,7 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size));
} else {
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
2 changes: 1 addition & 1 deletion neural_speed/models/grok/grok.cpp
Original file line number Diff line number Diff line change
@@ -227,7 +227,7 @@ static bool grok_model_eval_internal(model_context* ctx, const model_input* inpu
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size));
} else {
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
4 changes: 2 additions & 2 deletions neural_speed/models/mpt/mpt.cpp
Original file line number Diff line number Diff line change
@@ -163,7 +163,7 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0,
// 2, 1, 3) [64, N, 12]
struct ne_tensor* Q = ne_permute(
ctx0, ne_cpy(ctx0, Qcur, ne_new_tensor_3d(ctx0, NE_TYPE_F32, n_embd / n_head, n_head, N, NE_SIZE_CALC)), 0, 2,
ctx0, ne_cpy(ctx0, Qcur, d_ne_new_tensor_3d(ctx0, NE_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2,
1, 3);

// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1,
@@ -201,7 +201,7 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N));
} else {
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
2 changes: 1 addition & 1 deletion neural_speed/models/opt/opt.cpp
Original file line number Diff line number Diff line change
@@ -224,7 +224,7 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// [n_embd, N]
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N));
}

// attn out projection
2 changes: 1 addition & 1 deletion neural_speed/models/phi/phi.cpp
Original file line number Diff line number Diff line change
@@ -256,7 +256,7 @@ static bool phi2_model_eval_internal(model_context* ctx, const model_input* inpu
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size));
} else {
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
4 changes: 2 additions & 2 deletions neural_speed/models/phi/phi3.cpp
Original file line number Diff line number Diff line change
@@ -117,7 +117,7 @@ static bool phi3_model_eval_internal(model_context* ctx, const model_input* inpu
bestla_reordered_attn_fp32_batch_kv_info(&kv_shape, &kv_cache_info);
}
struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size);
struct ne_tensor* factor = ne_new_tensor_1d(ctx0, NE_TYPE_F32, 48, sizeof(float));
struct ne_tensor* factor = d_ne_new_tensor_1d(ctx0, NE_TYPE_F32, 48);
const float longfactor[48] = {
1.0299999713897705, 1.0499999523162842, 1.0499999523162842, 1.0799999237060547, 1.2299998998641968,
1.2299998998641968, 1.2999999523162842, 1.4499999284744263, 1.5999999046325684, 1.6499998569488525,
@@ -265,7 +265,7 @@ static bool phi3_model_eval_internal(model_context* ctx, const model_input* inpu
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size));
} else {
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
2 changes: 1 addition & 1 deletion neural_speed/models/qwen/qwen.cpp
Original file line number Diff line number Diff line change
@@ -278,7 +278,7 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size));
} else {
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
4 changes: 2 additions & 2 deletions neural_speed/models/stablelm/stablelm.cpp
Original file line number Diff line number Diff line change
@@ -259,7 +259,7 @@ static bool stablelm_model_eval_internal(model_context* ctx, const model_input*
struct ne_tensor* KQV_merged = ne_permute(ctx0, KQV, 0, 2, 1, 3);

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N * batch_size));
} else {
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
@@ -435,4 +435,4 @@ int model_eval(struct model_context* ctx, const model_input* inputs, const int n
}

return 0;
}
}
2 changes: 1 addition & 1 deletion neural_speed/models/starcoder/starcoder.cpp
Original file line number Diff line number Diff line change
@@ -248,7 +248,7 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*

// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
cur = ne_cpy(ctx0, KQV_merged, ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N, NE_SIZE_CALC));
cur = ne_cpy(ctx0, KQV_merged, d_ne_new_tensor_2d(ctx0, NE_TYPE_F32, n_embd, N));
} else {
const auto seq_kv = n_past + N;
const auto k_size = kv_cache_info.k_bytes;
140 changes: 70 additions & 70 deletions neural_speed/models/whisper/whisper.cpp

Large diffs are not rendered by default.

0 comments on commit 379ed27

Please sign in to comment.