From b6349788c9678eb63c8d2de6cb89811dbae20c5c Mon Sep 17 00:00:00 2001 From: Andrew Johnson Date: Tue, 19 Sep 2023 17:03:40 +0300 Subject: [PATCH] Fix merge breakage --- .../opencl/kernels/tridiagonalization.hpp | 590 ++++++------------ 1 file changed, 192 insertions(+), 398 deletions(-) diff --git a/stan/math/opencl/kernels/tridiagonalization.hpp b/stan/math/opencl/kernels/tridiagonalization.hpp index 6c72087475f..1a85d9d5cb1 100644 --- a/stan/math/opencl/kernels/tridiagonalization.hpp +++ b/stan/math/opencl/kernels/tridiagonalization.hpp @@ -10,98 +10,97 @@ namespace math { namespace opencl_kernels { // \cond -static constexpr const char* tridiagonalization_householder_kernel_code - = STRINGIFY( - // \endcond - /** - * Calculates householder vector and first element of the vector v. - * Must be run with 1 workgroup of LOCAL_SIZE_ threads. - * @param[in,out] P packed matrix being constructed - * @param[in,out] V matrix V - * @param[out] q_glob q - * @param P_rows Number of rows of the packed matrix - * @param V_rows Number of rows of the matrix V - * @param j Start column of the block to work on - * @param k Index of the householder vector in the block to create - */ - __kernel void tridiagonalization_householder( - __global double* P, __global double* V, __global double* q_glob, - const int P_rows, const int V_rows, const int j, const int k) { - const int lid = get_local_id(0); - const int gid = get_global_id(0); - const int gsize = get_global_size(0); - const int lsize = get_local_size(0); - const int ngroups = get_num_groups(0); - const int wgid = get_group_id(0); +static constexpr const char* tridiagonalization_householder_kernel_code = STRINGIFY( + // \endcond + /** + * Calculates householder vector and first element of the vector v. + * Must be run with 1 workgroup of LOCAL_SIZE_ threads. + * @param[in,out] P packed matrix being constructed + * @param[in,out] V matrix V + * @param[out] q_glob q + * @param P_rows Number of rows of the packed matrix + * @param V_rows Number of rows of the matrix V + * @param j Start column of the block to work on + * @param k Index of the householder vector in the block to create + */ + __kernel void tridiagonalization_householder( + __global double* P, __global double* V, __global double* q_glob, + const int P_rows, const int V_rows, const int j, const int k) { + const int lid = get_local_id(0); + const int gid = get_global_id(0); + const int gsize = get_global_size(0); + const int lsize = get_local_size(0); + const int ngroups = get_num_groups(0); + const int wgid = get_group_id(0); - double q = 0; + double q = 0; - const int P_start = P_rows * (k + j) + k + j; - const int P_span = P_rows * (k + j + 1) - P_start; - for (int i = lid; i < P_span; i += lsize) { - double acc = 0; - // apply previous householder reflections from current block to the - // column we are making the Householder vector from - for (int l = 0; l < j; l++) { - acc += P[P_rows * (k + l) + k + j + i] * V[V_rows * l + j - 1] - + V[V_rows * l + j - 1 + i] * P[P_rows * (k + l) + k + j]; - } - double tmp = P[P_start + i] - acc; - P[P_start + i] = tmp; - if (i != 0) { - q += tmp * tmp; - } - } - // calculate column norm between threads - __local double q_local[LOCAL_SIZE_]; - q_local[lid] = q; - barrier(CLK_LOCAL_MEM_FENCE); - for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; - step /= REDUCTION_STEP_SIZE) { - if (lid < step) { - for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { - q_local[lid] += q_local[lid + step * i]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); + const int P_start = P_rows * (k + j) + k + j; + const int P_span = P_rows * (k + j + 1) - P_start; + for (int i = lid; i < P_span; i += lsize) { + double acc = 0; + // apply previous householder reflections from current block to the + // column we are making the Householder vector from + for (int l = 0; l < j; l++) { + acc += P[P_rows * (k + l) + k + j + i] * V[V_rows * l + j - 1] + + V[V_rows * l + j - 1 + i] * P[P_rows * (k + l) + k + j]; + } + double tmp = P[P_start + i] - acc; + P[P_start + i] = tmp; + if (i != 0) { + q += tmp * tmp; + } + } + // calculate column norm between threads + __local double q_local[LOCAL_SIZE_]; + q_local[lid] = q; + barrier(CLK_LOCAL_MEM_FENCE); + for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; + step /= REDUCTION_STEP_SIZE) { + if (lid < step) { + for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { + q_local[lid] += q_local[lid + step * i]; } + } + barrier(CLK_LOCAL_MEM_FENCE); + } - double alpha; - if (lid == 0) { - q = q_local[0]; - double p1 = P[P_start + 1]; - // make Householder vector - alpha = -copysign(sqrt(q), P[P_start]); - q -= p1 * p1; - p1 -= alpha; - P[P_start + 1] = p1; - q += p1 * p1; - q = sqrt(q); - q_local[0] = q; - q_local[1] = alpha; - *q_glob = q; - } - barrier(CLK_LOCAL_MEM_FENCE); - q = q_local[0]; - alpha = q_local[1]; - if (q != 0) { - double multi = M_SQRT2 / q; - // normalize the Householder vector - for (int i = lid + 1; i < P_span; i += lsize) { - P[P_start + i] *= multi; - } - } - if (gid == 0) { - P[P_rows * (k + j + 1) + k + j] - = P[P_rows * (k + j) + k + j + 1] * q / M_SQRT2 + alpha; - } + double alpha; + if (lid == 0) { + q = q_local[0]; + double p1 = P[P_start + 1]; + // make Householder vector + alpha = -copysign(sqrt(q), P[P_start]); + q -= p1 * p1; + p1 -= alpha; + P[P_start + 1] = p1; + q += p1 * p1; + q = sqrt(q); + q_local[0] = q; + q_local[1] = alpha; + *q_glob = q; + } + barrier(CLK_LOCAL_MEM_FENCE); + q = q_local[0]; + alpha = q_local[1]; + if (q != 0) { + double multi = M_SQRT2 / q; + // normalize the Householder vector + for (int i = lid + 1; i < P_span; i += lsize) { + P[P_start + i] *= multi; } - // \cond - ); + } + if (gid == 0) { + P[P_rows * (k + j + 1) + k + j] + = P[P_rows * (k + j) + k + j + 1] * q / M_SQRT2 + alpha; + } + } + // \cond +); // \endcond // \cond -static const char* tridiagonalization_v_step_1_kernel_code = STRINGIFY( +static constexpr const char* tridiagonalization_v_step_1_kernel_code = STRINGIFY( // \endcond /** * Calculates first part of constructing the vector v: Uu = Pb * u and Vu = @@ -164,7 +163,7 @@ static const char* tridiagonalization_v_step_1_kernel_code = STRINGIFY( // \endcond // \cond -static const char* tridiagonalization_v_step_2_kernel_code = STRINGIFY( +static constexpr const char* tridiagonalization_v_step_2_kernel_code = STRINGIFY( // \endcond /** * Second part in constructing vector v: v = Pb * u + V * Uu + U * Vu. Pb is @@ -186,306 +185,105 @@ static const char* tridiagonalization_v_step_2_kernel_code = STRINGIFY( const __global double* P, __global double* V, const __global double* Uu, const __global double* Vu, const int P_rows, const int V_rows, const int k, const int j) { - const int lid = get_local_id(0); - const int gid = get_global_id(0); - const int gsize = get_global_size(0); - const int lsize = get_local_size(0); - const int ngroups = get_num_groups(0); - const int wgid = get_group_id(0); + const int lid = get_local_id(0); + const int gid = get_global_id(0); + const int gsize = get_global_size(0); + const int lsize = get_local_size(0); + const int ngroups = get_num_groups(0); + const int wgid = get_group_id(0); - int work = P_rows - k - j - 1; - double acc = 0; + int work = P_rows - k - j - 1; + double acc = 0; - const __global double* vec = P + P_rows * (k + j) + k + j + 1; - const __global double* M1 = P + P_rows * (k + j + 1) + k + j + 1; - const __global double* M2 = P + P_rows * k + k + j + 1; - const __global double* M3 = V + j; - int i; - if (gid < work) { - for (i = 0; i <= gid; i++) { - acc += M1[P_rows * i + gid] * vec[i]; - } - for (int i = 0; i < j; i++) { - acc -= M2[P_rows * i + gid] * Vu[i]; - acc -= M3[V_rows * i + gid] * Uu[i]; - } - V[V_rows * j + gid + j] = acc; - } - float work_per_group = (float)work / ngroups; // NOLINT(readability/casting) - int start = work_per_group * wgid; - int end = work_per_group * (wgid + 1); - __local double res_loc[LOCAL_SIZE_]; - for (int i = start; i < end; i += 1) { - acc = 0; - for (int l = i + 1 + lid; l < work; l += LOCAL_SIZE_) { - acc += M1[P_rows * i + l] * vec[l]; - } - res_loc[lid] = acc; - barrier(CLK_LOCAL_MEM_FENCE); - for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; - step /= REDUCTION_STEP_SIZE) { - if (lid < step) { - for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { - res_loc[lid] += res_loc[lid + step * i]; + const __global double* vec = P + P_rows * (k + j) + k + j + 1; + const __global double* M1 = P + P_rows * (k + j + 1) + k + j + 1; + const __global double* M2 = P + P_rows * k + k + j + 1; + const __global double* M3 = V + j; + int i; + if (gid < work) { + for (i = 0; i <= gid; i++) { + acc += M1[P_rows * i + gid] * vec[i]; } + for (int i = 0; i < j; i++) { + acc -= M2[P_rows * i + gid] * Vu[i]; + acc -= M3[V_rows * i + gid] * Uu[i]; + } + V[V_rows * j + gid + j] = acc; } - // calculate column norm between threads - __local double q_local[LOCAL_SIZE_]; - q_local[lid] = q; - barrier(CLK_LOCAL_MEM_FENCE); - for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; - step /= REDUCTION_STEP_SIZE) { - if (lid < step) { - for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { - q_local[lid] += q_local[lid + step * i]; - } + float work_per_group + = (float)work / ngroups; // NOLINT(readability/casting) + int start = work_per_group * wgid; + int end = work_per_group * (wgid + 1); + __local double res_loc[LOCAL_SIZE_]; + for (int i = start; i < end; i += 1) { + acc = 0; + for (int l = i + 1 + lid; l < work; l += LOCAL_SIZE_) { + acc += M1[P_rows * i + l] * vec[l]; } + res_loc[lid] = acc; barrier(CLK_LOCAL_MEM_FENCE); - } - - double alpha; - if (lid == 0) { - q = q_local[0]; - double p1 = P[P_start + 1]; - // make Householder vector - alpha = -copysign(sqrt(q), P[P_start]); - q -= p1 * p1; - p1 -= alpha; - P[P_start + 1] = p1; - q += p1 * p1; - q = sqrt(q); - q_local[0] = q; - q_local[1] = alpha; - *q_glob = q; - } - barrier(CLK_LOCAL_MEM_FENCE); - q = q_local[0]; - alpha = q_local[1]; - if (q != 0) { - double multi = sqrt(2.) / q; - // normalize the Householder vector - for (int i = lid + 1; i < P_span; i += lsize) { - P[P_start + i] *= multi; + for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; + step /= REDUCTION_STEP_SIZE) { + if (lid < step) { + for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { + res_loc[lid] += res_loc[lid + step * i]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); } - } - if (gid == 0) { - P[P_rows * (k + j + 1) + k + j] - = P[P_rows * (k + j) + k + j + 1] * q / sqrt(2.) + alpha; + if (lid == 0) { + V[V_rows * (j + 1) + i + j] = res_loc[lid]; + } + barrier(CLK_LOCAL_MEM_FENCE); } } // \cond - ); // NOLINT(whitespace/parens) - // \endcond - - // \cond - static constexpr const char* tridiagonalization_v_step_1_kernel_code - = STRINGIFY( - // \endcond - /** - * Calculates first part of constructing the vector v: Uu = Pb * u - * and Vu = Vl * u. Pb is a block of packed matrix, Vl is left part - * of matrix V and u is householder vector. Must be run with number - * of work groups equal to size of resulting vectors and 64 threads - * per work group. - * @param P Packed matrix being constructed. - * @param V Matrix V. - * @param[out] Uu First resulting vector. - * @param[out] Vu Second resulting vector. - * @param P_rows Number of rows of the packed matrix - * @param V_rows Number of rows of the matrix V - * @param k Index of the householder vector in the block we use as - * input - */ - __kernel void tridiagonalization_v_step_1( - const __global double* P, const __global double* V, - __global double* Uu, __global double* Vu, const int P_rows, - const int V_rows, const int k) { - const int lid = get_local_id(0); - const int gid = get_global_id(0); - const int gsize = get_global_size(0); - const int lsize = get_local_size(0); - const int ngroups = get_num_groups(0); - const int wgid = get_group_id(0); - - __local double res_loc1[LOCAL_SIZE_]; - __local double res_loc2[LOCAL_SIZE_]; - double acc1 = 0; - double acc2 = 0; - - const __global double* vec - = P + P_rows * (k + ngroups) + k + ngroups + 1; - const __global double* M1 - = P + P_rows * (k + wgid) + k + ngroups + 1; - const __global double* M2 = V + V_rows * wgid + ngroups; - for (int i = lid; i < P_rows - k - ngroups - 1; - i += LOCAL_SIZE_) { // go over column of the matrix in steps - // of 64 - double v = vec[i]; - acc1 += M1[i] * v; - acc2 += M2[i] * v; - } - res_loc1[lid] = acc1; - res_loc2[lid] = acc2; - barrier(CLK_LOCAL_MEM_FENCE); - - for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; - step /= REDUCTION_STEP_SIZE) { - if (lid < step) { - for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { - res_loc1[lid] += res_loc1[lid + step * i]; - res_loc2[lid] += res_loc2[lid + step * i]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (lid == 0) { - Uu[wgid] = res_loc1[0]; - Vu[wgid] = res_loc2[0]; - } - } -<<<<<<< HEAD - // \cond - ); // NOLINT(whitespace/parens) - // \endcond - - // \cond - static constexpr const char* tridiagonalization_v_step_2_kernel_code - = STRINGIFY( - // \endcond - /** - * Second part in constructing vector v: v = Pb * u + V * Uu + U * - * Vu. Pb is a block of packed matrix and U is householder vector. - * Pb is symmetric with only lower triangle having values. That is - * why two columns of V are written that must be added to obtain the - * vector v. Must be run with 64 threads per work group and total - * number of threads equal or greater than size of result vector. - * @param P Packed matrix being constructed. - * @param V Matrix V. - * @param Uu Uu from previous kernel. - * @param Vu Vu from previous kernel. - * @param P_rows Number of rows of the packed matrix - * @param V_rows Number of rows of the matrix V - * @param k Index of the householder vector in the block we use as - * input - * @param j Start column of the block to work on - */ - __kernel void tridiagonalization_v_step_2( - const __global double* P, __global double* V, - const __global double* Uu, const __global double* Vu, - const int P_rows, const int V_rows, const int k, const int j) { - const int lid = get_local_id(0); - const int gid = get_global_id(0); - const int gsize = get_global_size(0); - const int lsize = get_local_size(0); - const int ngroups = get_num_groups(0); - const int wgid = get_group_id(0); - - int work = P_rows - k - j - 1; - double acc = 0; +); +// \endcond - const __global double* vec = P + P_rows * (k + j) + k + j + 1; - const __global double* M1 = P + P_rows * (k + j + 1) + k + j + 1; - const __global double* M2 = P + P_rows * k + k + j + 1; - const __global double* M3 = V + j; - int i; - if (gid < work) { - for (i = 0; i <= gid; i++) { - acc += M1[P_rows * i + gid] * vec[i]; - } - for (int i = 0; i < j; i++) { - acc -= M2[P_rows * i + gid] * Vu[i]; - acc -= M3[V_rows * i + gid] * Uu[i]; - } - V[V_rows * j + gid + j] = acc; - } - float work_per_group - = (float)work / ngroups; // NOLINT(readability/casting) - int start = work_per_group * wgid; - int end = work_per_group * (wgid + 1); - __local double res_loc[LOCAL_SIZE_]; - for (int i = start; i < end; i += 1) { - acc = 0; - for (int l = i + 1 + lid; l < work; l += LOCAL_SIZE_) { - acc += M1[P_rows * i + l] * vec[l]; - } - res_loc[lid] = acc; - barrier(CLK_LOCAL_MEM_FENCE); - for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; - step /= REDUCTION_STEP_SIZE) { - if (lid < step) { - for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { - res_loc[lid] += res_loc[lid + step * i]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (lid == 0) { - V[V_rows * (j + 1) + i + j] = res_loc[lid]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - } - // \cond - ); // NOLINT(whitespace/parens) +// \cond +static constexpr const char* tridiagonalization_v_step_3_kernel_code = STRINGIFY( // \endcond + /** + * Third part in constructing vector v: v-=0.5*(v^T*u)*u, where u is the + * householder vector. + * @param[in,out] P packed matrix being constructed + * @param[in,out] V matrix V + * @param[out] q q + * @param P_rows Number of rows of the packed matrix + * @param V_rows Number of rows of the matrix V + * @param k Index of the householder vector in the block to create + * @param j Start column of the block to work on + */ + __kernel void tridiagonalization_v_step_3( + __global double* P, __global double* V, __global double* q, + const int P_rows, const int V_rows, const int k, const int j) { + const int lid = get_local_id(0); + const int gid = get_global_id(0); + const int gsize = get_global_size(0); + const int lsize = get_local_size(0); + const int ngroups = get_num_groups(0); + const int wgid = get_group_id(0); - // \cond - static constexpr const char* tridiagonalization_v_step_3_kernel_code - = STRINGIFY( - // \endcond - /** - * Third part in constructing vector v: v-=0.5*(v^T*u)*u, where u is - * the householder vector. - * @param[in,out] P packed matrix being constructed - * @param[in,out] V matrix V - * @param[out] q q - * @param P_rows Number of rows of the packed matrix - * @param V_rows Number of rows of the matrix V - * @param k Index of the householder vector in the block to create - * @param j Start column of the block to work on - */ - __kernel void tridiagonalization_v_step_3( - __global double* P, __global double* V, __global double* q, - const int P_rows, const int V_rows, const int k, const int j) { - const int lid = get_local_id(0); - const int gid = get_global_id(0); - const int gsize = get_global_size(0); - const int lsize = get_local_size(0); - const int ngroups = get_num_groups(0); - const int wgid = get_group_id(0); - - __global double* u = P + P_rows * (k + j) + k + j + 1; - __global double* v = V + V_rows * j + j; - double acc = 0; + __global double* u = P + P_rows * (k + j) + k + j + 1; + __global double* v = V + V_rows * j + j; + double acc = 0; - for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) { - double vi = v[i] + v[i + V_rows]; - v[i] = vi; - acc += u[i] * vi; - } - __local double res_loc[LOCAL_SIZE_]; - res_loc[lid] = acc; - barrier(CLK_LOCAL_MEM_FENCE); - for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; - step /= REDUCTION_STEP_SIZE) { - if (lid < step) { - for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { - res_loc[lid] += res_loc[lid + step * i]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - acc = res_loc[0] * 0.5; - for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) { - v[i] -= acc * u[i]; - } - if (gid == 0) { - P[P_rows * (k + j + 1) + k + j] -= *q / sqrt(2.) * u[0]; - } - } - // \cond - ); // NOLINT(whitespace/parens) -======= + for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) { + double vi = v[i] + v[i + V_rows]; + v[i] = vi; + acc += u[i] * vi; + } + __local double res_loc[LOCAL_SIZE_]; + res_loc[lid] = acc; + barrier(CLK_LOCAL_MEM_FENCE); + for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; + step /= REDUCTION_STEP_SIZE) { + if (lid < step) { + for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { + res_loc[lid] += res_loc[lid + step * i]; + } + } barrier(CLK_LOCAL_MEM_FENCE); } acc = res_loc[0] * 0.5; @@ -498,38 +296,34 @@ static const char* tridiagonalization_v_step_2_kernel_code = STRINGIFY( } // \cond ); ->>>>>>> develop - // \endcond +// \endcond - const kernel_cl - tridiagonalization_householder( - "tridiagonalization_householder", - {tridiagonalization_householder_kernel_code}, - {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}}); +const kernel_cl + tridiagonalization_householder("tridiagonalization_householder", + {tridiagonalization_householder_kernel_code}, + {{"REDUCTION_STEP_SIZE", 4}, + {"LOCAL_SIZE_", 1024}}); - const kernel_cl - tridiagonalization_v_step_1( - "tridiagonalization_v_step_1", - {tridiagonalization_v_step_1_kernel_code}, - {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}}); +const kernel_cl + tridiagonalization_v_step_1("tridiagonalization_v_step_1", + {tridiagonalization_v_step_1_kernel_code}, + {{"REDUCTION_STEP_SIZE", 4}, + {"LOCAL_SIZE_", 64}}); - const kernel_cl - tridiagonalization_v_step_2( - "tridiagonalization_v_step_2", - {tridiagonalization_v_step_2_kernel_code}, - {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}}); +const kernel_cl + tridiagonalization_v_step_2("tridiagonalization_v_step_2", + {tridiagonalization_v_step_2_kernel_code}, + {{"REDUCTION_STEP_SIZE", 4}, + {"LOCAL_SIZE_", 64}}); - const kernel_cl - tridiagonalization_v_step_3( - "tridiagonalization_v_step_3", - {tridiagonalization_v_step_3_kernel_code}, - {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}}); +const kernel_cl + tridiagonalization_v_step_3("tridiagonalization_v_step_3", + {tridiagonalization_v_step_3_kernel_code}, + {{"REDUCTION_STEP_SIZE", 4}, + {"LOCAL_SIZE_", 1024}}); - } // namespace opencl_kernels -} // namespace math } // namespace opencl_kernels +} // namespace math +} // namespace stan #endif #endif