Skip to content

Commit

Permalink
revert cpy code
Browse files Browse the repository at this point in the history
  • Loading branch information
luoyu-intel committed Jul 15, 2024
1 parent aacf97a commit de5ad40
Showing 1 changed file with 27 additions and 14 deletions.
41 changes: 27 additions & 14 deletions ggml/src/ggml-sycl/dmmv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,12 @@ static void dequantize_mul_mat_vec_q4_0(const void * __restrict__ vx, const dflo
v.x() = ((vui & 0xF) - 8) * d;
v.y() = ((vui >> 4) - 8) * d;
#ifdef GGML_SYCL_F16
dfloat2 t1{ y[iybs + ir + 0],
y[iybs + ir + QK4_0 / 2] };
dfloat2 t1{ y[iybs + ir * 2 + 0],
y[iybs + ir * 2 + 1] };
tmp += v * t1;
#else
tmp += v.x() * y[iybs + ir + 0];
tmp += v.y() * y[iybs + ir + QK4_0 / 2];
tmp += v.x() * y[iybs + ir * 2 + 0];
tmp += v.y() * y[iybs + ir * 2 + 1];
#endif
}
}
Expand All @@ -157,12 +157,12 @@ static void dequantize_mul_mat_vec_q4_0(const void * __restrict__ vx, const dflo
v.x() = ((vui & 0xF) - 8) * d;
v.y() = ((vui >> 4) - 8) * d;
#ifdef GGML_SYCL_F16
dfloat2 t1{ y[iybs + ir + 0],
y[iybs + ir + QK4_0 / 2] };
dfloat2 t1{ y[iybs + ir * 2 + 0],
y[iybs + ir * 2 + 1] };
tmp += v * t1;
#else
tmp += v.x() * y[iybs + ir + 0];
tmp += v.y() * y[iybs + ir + QK4_0 / 2];
tmp += v.x() * y[iybs + ir * 2 + 0];
tmp += v.y() * y[iybs + ir * 2 + 1];
#endif
}
}
Expand All @@ -180,12 +180,12 @@ static void dequantize_mul_mat_vec_q4_0(const void * __restrict__ vx, const dflo
v.x() = ((vui & 0xF) - 8) * d;
v.y() = ((vui >> 4) - 8) * d;
#ifdef GGML_SYCL_F16
dfloat2 t1{ y[iybs + ir * QK4_0 + iqs + 0],
y[iybs + ir * QK4_0 + iqs + QK4_0 / 2] };
dfloat2 t1{ y[iybs + ir * QK4_0 + iqs * 2 + 0],
y[iybs + ir * QK4_0 + iqs * 2 + 1] };
tmp += v * t1;
#else
tmp += v.x() * y[iybs + ir * QK4_0 + iqs + 0];
tmp += v.y() * y[iybs + ir * QK4_0 + iqs + QK4_0 / 2];
tmp += v.x() * y[iybs + ir * QK4_0 + iqs * 2 + 0];
tmp += v.y() * y[iybs + ir * QK4_0 + iqs * 2 + 1];
#endif
}

Expand Down Expand Up @@ -904,11 +904,24 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
stream->parallel_for(
nrows * ncols / QK4_0,
[=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
const block_q4_0 * x = (const block_q4_0 *) vx;
const block_q4_0 *x = (const block_q4_0 *)vx;
int ib = i;
typedef sycl::vec<uint8_t, QK4_0 / 2> CT;
CT tmp = *(CT *)x[ib].qs;
*(CT*)(vx_tmp + ib * QK4_0 / 2) = tmp;
for (int j = 0; j < QK4_0 / 2; j += 2)
{
const int vui = tmp[j];
const int vui1 = tmp[j + 1];
uint8_t nv = (vui & 0xF) | (vui1 << 4);
*(uint8_t *)(vx_tmp + ib * QK4_0 / 2 + j / 2) = nv;
}
for (int j = 0; j < QK4_0 / 2; j += 2)
{
const int vui = tmp[j];
const int vui1 = tmp[j + 1];
uint8_t nv = (vui >> 4) | (vui1 & 0xf0);
*(uint8_t *)(vx_tmp + ib * QK4_0 / 2 + j / 2 + QK4_0 / 4) = nv;
}
*(sycl::half *)(vx_tmp + ncols * nrows / 2 + ib * 2) = x[ib].d;

});
Expand Down

0 comments on commit de5ad40

Please sign in to comment.