add to v35
zhen8838 committed Aug 25, 2023
1 parent 64158a5 commit afdd14f
Showing 4 changed files with 166 additions and 24 deletions.
8 changes: 4 additions & 4 deletions modules/cpu/src/runtime/cmodel/include/tdma.h
@@ -83,18 +83,18 @@ void binary(tensor<T, ALoc> &a, tensor<T, BLoc> &b, tensor<T, CLoc> &out,
gsl::make_span(out.strides()).template as_span<const size_t>());
}

template <class T, loc_t ALoc, loc_t BLoc, loc_t CLoc>
void unary(tensor<T, ALoc> &a, tensor<T, CLoc> &out, unary_op_t op) {
template <class T, loc_t ALoc, loc_t BLoc>
void unary(tensor<T, ALoc> &a, tensor<T, BLoc> &out, unary_op_t op) {
kernels::unary(
op, a.cdata().data(), out.data().data(),
gsl::make_span(a.strides()).template as_span<const size_t>(),
gsl::make_span(out.dimension()).template as_span<const size_t>(),
gsl::make_span(out.strides()).template as_span<const size_t>());
}

template <typename T, loc_t ALoc, loc_t BLoc>
template <typename T, loc_t ALoc, loc_t BLoc, loc_t CLoc>
void matmul(tensor<T, ALoc> &a, tensor<T, BLoc> &b,
tensor<T, loc_t::local> &c) {
tensor<T, CLoc> &c) {
kernels::matmul(a.cdata().data(), b.cdata().data(), c.data().data(),
a.dimension(), a.strides(), b.dimension(), b.strides(),
c.dimension(), c.strides());
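
The matmul template no longer hard-codes the output location to loc_t::local; the new CLoc parameter is deduced from the output argument, so a result can land in shared memory as well as local memory. A minimal sketch, assuming the tensor/loc_t types from this header (shapes and names are illustrative only):

tensor<float> a({1, 8, 96, 128});               // local lhs
tensor<float> b({1, 8, 128, 96});               // local rhs
tensor<float, loc_t::shared> c({1, 8, 96, 96}); // shared output
matmul(a, b, c);                                // CLoc is deduced as loc_t::shared

This is what lets kernel.h below write attention results straight into the shared V26/V32 tensors.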
127 changes: 122 additions & 5 deletions modules/cpu/src/runtime/cmodel/tests/demo3/kernel.h
@@ -7,28 +7,50 @@ using namespace shared;

static bool w_loaded;
static tensor<float> v2_w({8, 2048, 128});
static tensor<float> v16_w({8, 2048, 128});
static tensor<float> v31_w({8, 2048, 128});
static tensor<float> v35_w({1024, 2048});
static tensor<float> v3_data({384, 128});
static tensor<float> v11_data({384, 128});
static tensor<int64_t> position_ids({1, 384});

void stage1_kernel(tensor<float, loc_t::device> &Hidden_in, /* [1, 384, 8192] */
tensor<float, loc_t::device> &V0_gamma,
tensor<float, loc_t::device> &V0_beta,
tensor<float, loc_t::device> &V2_w, /* [64, 8192, 128] */
tensor<float, loc_t::device> &V3_data, /* [384, 128] */
tensor<float, loc_t::device> &V0_gamma, /* [8192] */
tensor<float, loc_t::device> &V0_beta, /* [8192] */
tensor<float, loc_t::device> &V2_w, /* [64, 8192, 128] */
tensor<float, loc_t::device> &V16_w, /* [64, 8192, 128] */
tensor<float, loc_t::device> &V31_w, /* [64, 8192, 128] */
tensor<float, loc_t::device> &V35_w, /* [8192, 8192] */
tensor<float, loc_t::device> &V3_data, /* [384, 128] */
tensor<float, loc_t::device> &V11_data, /* [384, 128] */
tensor<float, loc_t::device> &Attn_mask, /* [1,1,384,384] */
tensor<int64_t, loc_t::device> &Position_ids /* [1,384] */
) {
thread_context ctx(bid, tid);
tensor<float> v0_gamma({2048});
tensor<float> v0_beta({2048});
tensor<float> v0({1, 48, 2048}); /* [1, 384, 8192] [1, 48@b, 2048@t] */

tensor<float> attn_mask(
{1, 1, 384, 384}); /* [1,1,384,384] [1,1,96@t,96@t] */

if (!w_loaded) {
tdma_load_async(v0_gamma, V0_gamma({tid * 2048}, {2048}), ctx);
tdma_load_async(v0_beta, V0_beta({tid * 2048}, {2048}), ctx);
tdma_load_async(v2_w, V2_w({8 * bid, 2048 * tid, 0}, {8, 2048, 128}),
ctx);
tdma_load_async(v16_w, V16_w({8 * bid, 2048 * tid, 0}, {8, 2048, 128}),
ctx);
tdma_load_async(v31_w, V31_w({8 * bid, 2048 * tid, 0}, {8, 2048, 128}),
ctx);
tdma_load_async(v35_w, V35_w({1024 * bid, 2048 * tid}, {1024, 2048}),
ctx);
tdma_load_async(v3_data, std::move(V3_data), ctx);
tdma_load_async(v11_data, std::move(V11_data), ctx);
tdma_load_async(position_ids, std::move(Position_ids), ctx);
tdma_load_async(attn_mask,
Attn_mask({0, 0, tid * 96, tid * 96}, {1, 1, 96, 96}),
ctx);

tdma_wait(ctx);
}
@@ -49,5 +71,100 @@ void stage1_kernel(tensor<float, loc_t::device> &Hidden_in, /* [1, 384, 8192] */
tensor_block_mma_sync(v1, v2_w, V2, false, ctx);

tensor<float> v3({1, 384, 128}); //
gather(v3_data, position_ids, v3, 1);
gather(v3_data, position_ids, v3, 0);
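// (assumption) v3_data is presumably a per-position cos table for rotary
// embedding; the gather picks one 128-wide row per entry of position_ids.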

auto v4 = unsqueeze(v3); /* 1, 1, 384, 128 */

tensor<float> v5({1, 8, 384, 128}); // [1, 64, 384, 128] [1, 8@b, 384, 128]
binary(V2, v4, v5, binary_op_t::mul);
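// v5 = q * cos (assumption: shared V2 holds the query projection from the
// block mma above).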

auto v6 =
V2({0, 0, 0, 64}, {1, 8, 384, 64}); //[1, 64, 384, 64] [1, 8@b, 384, 64]
auto v7 = v6;
if (tid == 0) {
/* V2 is shared, so we don't need to perform this on each thread. */
unary(v6, v7, unary_op_t::neg);
}
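// (assumption) negating the upper 64 lanes of the head dimension is the
// rotate_half step of rotary embedding, done in place on shared V2.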
tdma_wait(ctx);

// auto v8 = v2({0, 0, 0, 0}, {1, 8, 384, 64}); //[1, 64, 384, 64] [1, 8@b,
// 384, 64]@shared
/* v10 is concat(v7, v8); it actually comes from V2, so no real data movement is needed. */
// [1, 64, 384, 128] [1, 8@b, 384, 128]@shared ->
// [1, 8@b, 96@t, 128]@shared; we can re-split here.
auto v10 = V2({0, 0, tid * 96, 0}, {1, 8, 96, 128});

tensor<float> v11({1, 384, 128}); //[1, 384, 128] [1, 384, 128]
gather(v11_data, position_ids, v11, 0);
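// (assumption) v11_data is presumably the matching per-position sin table.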
auto v12 = unsqueeze(v11); // [1, 1, 384, 128] [1, 1, 384, 128]

tensor<float> v13({1, 8, 96, 128}); // [1, 64, 384, 128] [1, 8@b, 96@t, 128]
binary(v10, v12, v13, binary_op_t::mul);
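// v13 = rotate_half(q) * sin (assumption).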

tensor<float> v14({1, 8, 96, 128}); // [1, 64, 384, 128] [1, 8@b, 96@t, 128]
binary(v5, v13, v14, binary_op_t::add);
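// v14 = q*cos + rotate_half(q)*sin, i.e. the rotary-embedded query (assumption).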

// [1, 1, 384, 8192] [1, 1, 48@b, 2048@t]
tensor<float> v15 = v0({0, bid * 48, tid * 2048}, {1, 48, 2048});

tensor_block_mma_sync(v15, v16_w, V16, false, ctx);
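// Key projection into shared V16 (assumption); it gets the same rotary
// treatment below as the query did.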
tensor<float> v17({1, 8, 384, 128}); // [1, 64, 384, 128] [1, 8@b, 384, 128]
binary(V16, v4, v17, binary_op_t::mul);

// [1, 64, 384, 64] [1, 8@b, 384, 64]@shared
auto v18 = V16({0, 0, 0, 64}, {1, 8, 384, 64});
if (tid == 0) {
unary(v18, v18, unary_op_t::neg);
}
tdma_wait(ctx);

//[1, 64, 384, 128] [1, 8@b, 96@t, 128]@shared
auto v22 = V16({0, 0, tid * 96, 0}, {1, 8, 96, 128});

// [1, 64, 384, 128] [1, 8@b, 96@t, 128]@shared
tensor<float> v23({1, 8, 96, 128});
binary(v22, v12, v23, binary_op_t::mul);

// [1, 64, 384, 128] [1, 8@b, 96@t, 128]
tensor<float> v24({1, 8, 96, 128});
binary(v17, v23, v24, binary_op_t::add);

// [1, 64, 128, 384] [1, 8@b, 128, 96@t]
tensor<float> v25({1, 8, 128, 96});
transpose(v24, v25, dims_t({0, 1, 3, 2}));

// [1, 8@b, 96@t, 128] @ [1, 8@b, 128, 96@t] => [1, 8@b, 96@t, 96@t]
auto v26 = V26({0, 0, tid * 96, tid * 96}, {1, 8, 96, 96});
matmul(v14, v25, v26);
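// (assumption) Q @ K^T for this thread's tile, written straight into shared
// V26 via the generalized matmul output location.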

// [1, 64, 384, 384] [1, 8@b, 96@t, 96@t] @shared
auto v27 = v26;
auto v26_c = tensor<float>({1, 1, 1, 1});
tdma_fill_async(v26_c, 11.313708f);
binary(v26, v26_c, v27, binary_op_t::div);
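// 11.313708 ≈ sqrt(128), so this is presumably the usual 1/sqrt(head_dim)
// scaling of the attention scores.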

// [1, 64, 384, 384] [1, 8@b, 96@t, 96@t] @shared
auto v28 = v27;
binary(v27, attn_mask, v28, binary_op_t::div);

// re-splitting shared data needs a sync first.
tdma_wait(ctx);
auto v28_1 = V26({0, 0, tid * 96, 0}, {1, 8, 96, 384});
tensor<float> v29({1, 8, 96, 384}); //[1, 64, 384, 384] [1, 8@b, 96@t, 384]
softmax(v28_1, v29, 3);
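// (assumption) the re-split above gives each thread complete 384-wide rows,
// which the row-wise softmax over the last axis needs.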

// [1, 1, 384, 8192] [1, 1, 48@b, 2048@t]
auto v30 = unsqueeze(v0);

tensor_block_mma_sync(v30, v31_w, V31, false, ctx);
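// Value projection into shared V31 (assumption).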

// [1, 8@b, 384, 128]@local @ [1, 8@b, 384, 128]@shared
// [1, 64, 384, 128] [1, 8@b, 384, 128]
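// (assumption) attention output: softmax(scores) @ V into shared V32, then
// heads transposed back into V33; guarded to tid 0 here since the outputs
// are shared.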
if (tid == 0) {
matmul(v29, V31, V32);
// V33 [1, 384, 64, 128] [1, 384, 8@b, 128]@shared
transpose(V32, V33, dims_t({0, 2, 1, 3}));
}
tdma_wait(ctx);
// auto v34 =
}
39 changes: 25 additions & 14 deletions modules/cpu/src/runtime/cmodel/tests/demo3/main.cpp
@@ -6,7 +6,8 @@
#define DEFINE_TFUNC(b, t) \
void *f_##b##_##t(void *arg) { \
block##b::thread##t::stage1_kernel(Hidden_in, V0_gamma, V0_beta, V2_w, \
V3_data, Position_ids); \
V16_w, V31_w, V35_w, V3_data, \
V11_data, Attn_mask, Position_ids); \
return arg; \
}

@@ -16,12 +17,17 @@
DEFINE_TFUNC(b, 2) \
DEFINE_TFUNC(b, 3)

tensor<float, loc_t::device> Hidden_in({1, 384, 8192});
tensor<float, loc_t::device> V0_gamma({8192});
tensor<float, loc_t::device> V0_beta({8192});
tensor<float, loc_t::device> V2_w({64, 8192, 128});
tensor<float, loc_t::device> V3_data({384, 128});
tensor<int64_t, loc_t::device> Position_ids({1, 384});
static tensor<float, loc_t::device> Hidden_in({1, 384, 8192});
static tensor<float, loc_t::device> V0_gamma({8192});
static tensor<float, loc_t::device> V0_beta({8192});
static tensor<float, loc_t::device> V2_w({64, 8192, 128});
static tensor<float, loc_t::device> V16_w({64, 8192, 128});
static tensor<float, loc_t::device> V31_w({64, 8192, 128});
static tensor<float, loc_t::device> V35_w({8192, 8192});
static tensor<float, loc_t::device> V3_data({384, 128});
static tensor<float, loc_t::device> V11_data({384, 128});
static tensor<float, loc_t::device> Attn_mask({1, 1, 384, 384});
static tensor<int64_t, loc_t::device> Position_ids({1, 384});

DEFINE_BFUNC(0)
DEFINE_BFUNC(1)
@@ -35,7 +41,7 @@ DEFINE_BFUNC(7)
#define LOAD_FILE(name, i, type) \
{ \
auto src_##name = read_file(std::string(argv[(i)])); \
span_copy(name.data(), gsl::make_span(src_##name).as_span<type>()); \
span_copy(name.data(), gsl::make_span(src_##name).as_span<type>()); \
}

/**
@@ -49,12 +55,17 @@ int main([[maybe_unused]] int argc, char **argv) {
// spdlog::set_level(spdlog::level::debug);
global_hardware_init();

LOAD_FILE(Hidden_in, 1, float);
LOAD_FILE(V0_gamma, 2, float);
LOAD_FILE(V0_beta, 3, float);
LOAD_FILE(V2_w, 4, float);
LOAD_FILE(V3_data, 5, float);
LOAD_FILE(Position_ids, 6, int64_t);
LOAD_FILE(Hidden_in, 0, float)
LOAD_FILE(V0_gamma, 1, float)
LOAD_FILE(V0_beta, 2, float)
LOAD_FILE(V2_w, 3, float)
LOAD_FILE(V16_w, 4, float)
LOAD_FILE(V31_w, 5, float)
LOAD_FILE(V35_w, 6, float)
LOAD_FILE(V3_data, 7, float)
LOAD_FILE(V11_data, 8, float)
LOAD_FILE(Attn_mask, 9, float)
LOAD_FILE(Position_ids, 10, int64_t)

pthread_t t_0_0, t_1_0, t_2_0, t_3_0, t_4_0, t_5_0, t_6_0, t_7_0;
pthread_t t_0_1, t_1_1, t_2_1, t_3_1, t_4_1, t_5_1, t_6_1, t_7_1;
16 changes: 15 additions & 1 deletion modules/cpu/src/runtime/cmodel/tests/demo3/shared_def.h
@@ -1,5 +1,19 @@
#include <tdma.h>

namespace shared {
tensor<float, loc_t::shared> V2({1, 8, 384, 128}); // [1, 64, 384, 128] [1, 8@b, 384, 128]
static tensor<float, loc_t::shared>
V2({1, 8, 384, 128}); // [1, 64, 384, 128] [1, 8@b, 384, 128]
static tensor<float, loc_t::shared>
V16({1, 8, 384, 128}); // [1, 64, 384, 128] [1, 8@b, 384, 128]
static tensor<float, loc_t::shared>
V26({1, 8, 384, 384}); // [1, 64, 384, 384] [1, 8@b, 384, 384]

static tensor<float, loc_t::shared>
V31({1, 8, 384, 128}); // [1, 64, 384, 128] [1, 8@b, 384, 128]

static tensor<float, loc_t::shared>
V32({1, 8, 384, 128}); // [1, 64, 384, 128] [1, 8@b, 384, 128]
static tensor<float, loc_t::shared>
V33({1, 384, 8, 128}); // [1, 384, 64, 128] [1, 384, 8@b, 128]
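
// (assumption) Roles of the shared tensors above: V2/V16/V31 hold the
// query/key/value projections, V26 the attention scores, V32 the attention
// output, and V33 its head-transposed layout.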

} // namespace shared
