Skip to content

Commit

Permalink
Лаба 4 не до конца
Browse files Browse the repository at this point in the history
  • Loading branch information
vatican1 committed Oct 8, 2023
1 parent ab339de commit 2512cc8
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 42 deletions.
59 changes: 56 additions & 3 deletions src/cl/matrix_multiplication.cl
Original file line number Diff line number Diff line change
@@ -1,4 +1,57 @@
#define TILE_SIZE 16
#define THREAD_WORK 16

// Naive matrix multiplication: C = A * B.
//   a: M x K (row-major), b: K x N (row-major), c: M x N (row-major).
// One work-item computes one element of C:
//   axis 0 -> column index in C (i, range [0, N)),
//   axis 1 -> row index in C    (j, range [0, M)).
__kernel void matrix_multiplication(const __global float* a,
                                    const __global float* b,
                                    __global float* c,
                                    unsigned int M,
                                    unsigned int K,
                                    unsigned int N)
{
    const unsigned int i = get_global_id(0); // column in C
    const unsigned int j = get_global_id(1); // row in C

    // The host may round the global size up to a multiple of the
    // work-group size, so out-of-range work-items must do nothing.
    if (i >= N || j >= M)
        return;

    float sum = 0.0f;
    for (unsigned int k = 0; k < K; ++k)
        sum += a[j * K + k] * b[k * N + i];
    c[j * N + i] = sum;
}

// Tiled matrix multiplication using local memory: C = A * B.
//   a: M x K, b: K x N, c: M x N (all row-major).
// Work-groups are TILE_SIZE x TILE_SIZE; each group streams TILE_SIZE-wide
// strips of A and B through local memory to reuse each loaded element
// TILE_SIZE times.
//
// NOTE: no early returns here. Every work-item of a group must reach both
// barriers — returning before a barrier while other work-items of the same
// group execute it is undefined behavior in OpenCL. Out-of-range lanes
// instead load zeros (which do not affect the dot product) and simply skip
// the final store.
__kernel void matrix_multiplication_1(const __global float* a,
                                      const __global float* b,
                                      __global float* c,
                                      unsigned int M,
                                      unsigned int K,
                                      unsigned int N)
{
    const int global_i = get_global_id(0); // column in C, [0, N)
    const int global_j = get_global_id(1); // row in C,    [0, M)
    const int local_i = get_local_id(0);
    const int local_j = get_local_id(1);

    // +1 padding column avoids local-memory bank conflicts on column reads.
    __local float tileA[TILE_SIZE][TILE_SIZE + 1];
    __local float tileB[TILE_SIZE][TILE_SIZE + 1];

    float sum = 0.0f;
    const int numTiles = ((int)K + TILE_SIZE - 1) / TILE_SIZE; // ceil(K / TILE_SIZE)
    for (int tileK = 0; tileK < numTiles; ++tileK)
    {
        const int a_col = tileK * TILE_SIZE + local_i; // column into a
        const int b_row = tileK * TILE_SIZE + local_j; // row into b

        // Zero-fill out-of-range elements so edge tiles stay correct
        // when M, N or K is not a multiple of TILE_SIZE.
        tileA[local_j][local_i] =
                (global_j < (int)M && a_col < (int)K) ? a[global_j * (int)K + a_col] : 0.0f;
        tileB[local_j][local_i] =
                (b_row < (int)K && global_i < (int)N) ? b[b_row * (int)N + global_i] : 0.0f;
        barrier(CLK_LOCAL_MEM_FENCE); // tiles fully loaded

        for (int k = 0; k < TILE_SIZE; ++k)
            sum += tileA[local_j][k] * tileB[k][local_i];
        barrier(CLK_LOCAL_MEM_FENCE); // done reading before next overwrite
    }

    if (global_i < (int)N && global_j < (int)M)
        c[global_j * (int)N + global_i] = sum;
}
33 changes: 30 additions & 3 deletions src/cl/matrix_transpose.cl
Original file line number Diff line number Diff line change
@@ -1,4 +1,31 @@
#define TILE_SIZE 32

// Coalesced matrix transpose through local memory.
//   as:   m x k (row-major) input,
//   as_t: k x m (row-major) output, as_t[c][r] = as[r][c].
// A work-group loads a tile of `as` row-by-row (coalesced), synchronizes,
// then writes the transposed tile row-by-row into `as_t` (also coalesced) —
// the index swap happens inside local memory, not in global memory.
//
// Assumes a square work-group with local size <= TILE_SIZE in each
// dimension (the host launches 16x16). No early returns: every work-item
// must reach the barrier, so edge-of-matrix lanes are masked with `if`
// guards instead — returning before a barrier that other work-items of the
// group execute is undefined behavior in OpenCL.
__kernel void matrix_transpose(__global float* as,
                               __global float* as_t,
                               unsigned int m,
                               unsigned int k)
{
    const int local_i = get_local_id(0);
    const int local_j = get_local_id(1);
    const int group_i = get_group_id(0) * get_local_size(0); // tile origin, columns of as
    const int group_j = get_group_id(1) * get_local_size(1); // tile origin, rows of as

    // +1 padding column avoids local-memory bank conflicts on the
    // transposed (column-wise) reads below.
    __local float tile[TILE_SIZE][TILE_SIZE + 1];

    const int src_col = group_i + local_i; // column in as, [0, k)
    const int src_row = group_j + local_j; // row in as,    [0, m)
    if (src_row < (int)m && src_col < (int)k)
        tile[local_j][local_i] = as[src_row * (int)k + src_col];

    barrier(CLK_LOCAL_MEM_FENCE);

    // Swap local indices so consecutive work-items (local_i) write
    // consecutive addresses of as_t.
    const int dst_col = group_j + local_i; // column in as_t, [0, m)
    const int dst_row = group_i + local_j; // row in as_t,    [0, k)
    if (dst_row < (int)k && dst_col < (int)m)
        as_t[dst_row * (int)m + dst_col] = tile[local_i][local_j];
}
67 changes: 37 additions & 30 deletions src/main_matrix_multiplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ int main(int argc, char **argv)

const std::vector<float> cs_cpu_reference = cs;

/*

gpu::gpu_mem_32f as_gpu, bs_gpu, cs_gpu;
as_gpu.resizeN(M*K);
bs_gpu.resizeN(K*N);
Expand All @@ -67,42 +67,49 @@ int main(int argc, char **argv)
as_gpu.writeN(as.data(), M*K);
bs_gpu.writeN(bs.data(), K*N);

ocl::Kernel matrix_multiplication_kernel(matrix_multiplication, matrix_multiplication_length, "matrix_multiplication");
matrix_multiplication_kernel.compile();
std::vector<std::string> kernel_names = {"matrix_multiplication", "matrix_multiplication_1"};

for(const std::string kernel_name: kernel_names)
{
timer t;
for (int iter = 0; iter < benchmarkingIters; ++iter) {
// TODO
unsigned int work_group_size = 128;
unsigned int global_work_size = ...;
matrix_multiplication_kernel.exec(gpu::WorkSize(work_group_size, global_work_size), as_gpu, bs_gpu, cs_gpu, M, K, N);

t.nextLap();
ocl::Kernel matrix_multiplication_kernel(matrix_multiplication, matrix_multiplication_length, kernel_name);
matrix_multiplication_kernel.compile();

{
timer t;
for (int iter = 0; iter < benchmarkingIters; ++iter) {
// TODO
unsigned int work_group_size = 16;
unsigned int global_work_size = 16;
matrix_multiplication_kernel.exec(gpu::WorkSize(work_group_size, global_work_size, K, M),
as_gpu, bs_gpu, cs_gpu, M, K, N);

t.nextLap();
}
std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
std::cout << "GPU: " << gflops / t.lapAvg() << " GFlops" << std::endl;
}
std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
std::cout << "GPU: " << gflops / t.lapAvg() << " GFlops" << std::endl;
}

cs_gpu.readN(cs.data(), M*N);
*/

// Проверяем корректность результатов
double diff_sum = 0;
for (int i = 0; i < M * N; ++i) {
double a = cs[i];
double b = cs_cpu_reference[i];
if (a != 0.0 || b != 0.0) {
double diff = fabs(a - b) / std::max(fabs(a), fabs(b));
diff_sum += diff;
cs_gpu.readN(cs.data(), M*N);


// Проверяем корректность результатов
double diff_sum = 0;
for (int i = 0; i < M * N; ++i) {
double a = cs[i];
double b = cs_cpu_reference[i];
if (a != 0.0 || b != 0.0) {
double diff = fabs(a - b) / std::max(fabs(a), fabs(b));
diff_sum += diff;
}
}
}

double diff_avg = diff_sum / (M * N);
std::cout << "Average difference: " << diff_avg * 100.0 << "%" << std::endl;
if (diff_avg > 0.01) {
std::cerr << "Too big difference!" << std::endl;
return 1;
double diff_avg = diff_sum / (M * N);
std::cout << "Average difference: " << diff_avg * 100.0 << "%" << std::endl;
if (diff_avg > 0.01) {
std::cerr << "Too big difference!" << std::endl;
return 1;
}
}

return 0;
Expand Down
13 changes: 7 additions & 6 deletions src/main_matrix_transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ int main(int argc, char **argv)
}
std::cout << "Data generated for M=" << M << ", K=" << K << std::endl;

/*

gpu::gpu_mem_32f as_gpu, as_t_gpu;
as_gpu.resizeN(M*K);
as_t_gpu.resizeN(K*M);
Expand All @@ -45,15 +45,16 @@ int main(int argc, char **argv)
{
timer t;
for (int iter = 0; iter < benchmarkingIters; ++iter) {
// TODO
unsigned int work_group_size = 128;
unsigned int global_work_size = ...;
unsigned int work_group_size_width = 16;
unsigned int work_group_size_height = 16;
// Для этой задачи естественнее использовать двухмерный NDRange. Чтобы это сформулировать
// в терминологии библиотеки - нужно вызвать другую вариацию конструктора WorkSize.
// В CLion удобно смотреть какие есть вариант аргументов в конструкторах:
// поставьте каретку редактирования кода внутри скобок конструктора WorkSize -> Ctrl+P -> заметьте что есть 2, 4 и 6 параметров
// - для 1D, 2D и 3D рабочего пространства соответственно
matrix_transpose_kernel.exec(gpu::WorkSize(work_group_size, global_work_size), as_gpu, as_t_gpu, M, K);
auto workSize = gpu::WorkSize(work_group_size_width, work_group_size_height, K, M);
matrix_transpose_kernel.exec(workSize, as_gpu, as_t_gpu, M, K);


t.nextLap();
}
Expand All @@ -74,7 +75,7 @@ int main(int argc, char **argv)
}
}
}
*/


return 0;
}

0 comments on commit 2512cc8

Please sign in to comment.