From 5be66536ca30785cda922070ea9da81b7ea2ddb2 Mon Sep 17 00:00:00 2001
From: Stan Jenkins <mc.stanislaw@gmail.com>
Date: Tue, 19 Sep 2023 10:04:53 -0400
Subject: [PATCH] [Jenkins] auto-formatting by clang-format version
 10.0.0-4ubuntu1

---
 .../opencl/kernels/tridiagonalization.hpp     | 524 +++++++++---------
 1 file changed, 266 insertions(+), 258 deletions(-)

diff --git a/stan/math/opencl/kernels/tridiagonalization.hpp b/stan/math/opencl/kernels/tridiagonalization.hpp
index 1a85d9d5cb1..67195e631c8 100644
--- a/stan/math/opencl/kernels/tridiagonalization.hpp
+++ b/stan/math/opencl/kernels/tridiagonalization.hpp
@@ -10,292 +10,300 @@ namespace math {
 namespace opencl_kernels {
 
 // \cond
-static constexpr const char* tridiagonalization_householder_kernel_code = STRINGIFY(
-    // \endcond
-    /**
-     * Calculates householder vector and first element of the vector v.
-     * Must be run with 1 workgroup of LOCAL_SIZE_ threads.
-     * @param[in,out] P packed matrix being constructed
-     * @param[in,out] V matrix V
-     * @param[out] q_glob q
-     * @param P_rows Number of rows of the packed matrix
-     * @param V_rows Number of rows of the matrix V
-     * @param j Start column of the block to work on
-     * @param k Index of the householder vector in the block to create
-     */
-    __kernel void tridiagonalization_householder(
-        __global double* P, __global double* V, __global double* q_glob,
-        const int P_rows, const int V_rows, const int j, const int k) {
-      const int lid = get_local_id(0);
-      const int gid = get_global_id(0);
-      const int gsize = get_global_size(0);
-      const int lsize = get_local_size(0);
-      const int ngroups = get_num_groups(0);
-      const int wgid = get_group_id(0);
+static constexpr const char* tridiagonalization_householder_kernel_code
+    = STRINGIFY(
+        // \endcond
+        /**
+         * Calculates householder vector and first element of the vector v.
+         * Must be run with 1 workgroup of LOCAL_SIZE_ threads.
+         * @param[in,out] P packed matrix being constructed
+         * @param[in,out] V matrix V
+         * @param[out] q_glob q
+         * @param P_rows Number of rows of the packed matrix
+         * @param V_rows Number of rows of the matrix V
+         * @param j Start column of the block to work on
+         * @param k Index of the householder vector in the block to create
+         */
+        __kernel void tridiagonalization_householder(
+            __global double* P, __global double* V, __global double* q_glob,
+            const int P_rows, const int V_rows, const int j, const int k) {
+          const int lid = get_local_id(0);
+          const int gid = get_global_id(0);
+          const int gsize = get_global_size(0);
+          const int lsize = get_local_size(0);
+          const int ngroups = get_num_groups(0);
+          const int wgid = get_group_id(0);
 
-      double q = 0;
+          double q = 0;
 
-      const int P_start = P_rows * (k + j) + k + j;
-      const int P_span = P_rows * (k + j + 1) - P_start;
-      for (int i = lid; i < P_span; i += lsize) {
-        double acc = 0;
-        // apply previous householder reflections from current block to the
-        // column we are making the Householder vector from
-        for (int l = 0; l < j; l++) {
-          acc += P[P_rows * (k + l) + k + j + i] * V[V_rows * l + j - 1]
-                 + V[V_rows * l + j - 1 + i] * P[P_rows * (k + l) + k + j];
-        }
-        double tmp = P[P_start + i] - acc;
-        P[P_start + i] = tmp;
-        if (i != 0) {
-          q += tmp * tmp;
-        }
-      }
-      // calculate column norm between threads
-      __local double q_local[LOCAL_SIZE_];
-      q_local[lid] = q;
-      barrier(CLK_LOCAL_MEM_FENCE);
-      for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
-           step /= REDUCTION_STEP_SIZE) {
-        if (lid < step) {
-          for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
-            q_local[lid] += q_local[lid + step * i];
+          const int P_start = P_rows * (k + j) + k + j;
+          const int P_span = P_rows * (k + j + 1) - P_start;
+          for (int i = lid; i < P_span; i += lsize) {
+            double acc = 0;
+            // apply previous householder reflections from current block to the
+            // column we are making the Householder vector from
+            for (int l = 0; l < j; l++) {
+              acc += P[P_rows * (k + l) + k + j + i] * V[V_rows * l + j - 1]
+                     + V[V_rows * l + j - 1 + i] * P[P_rows * (k + l) + k + j];
+            }
+            double tmp = P[P_start + i] - acc;
+            P[P_start + i] = tmp;
+            if (i != 0) {
+              q += tmp * tmp;
+            }
+          }
+          // calculate column norm between threads
+          __local double q_local[LOCAL_SIZE_];
+          q_local[lid] = q;
+          barrier(CLK_LOCAL_MEM_FENCE);
+          for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
+               step /= REDUCTION_STEP_SIZE) {
+            if (lid < step) {
+              for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
+                q_local[lid] += q_local[lid + step * i];
+              }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
           }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-      }
 
-      double alpha;
-      if (lid == 0) {
-        q = q_local[0];
-        double p1 = P[P_start + 1];
-        // make Householder vector
-        alpha = -copysign(sqrt(q), P[P_start]);
-        q -= p1 * p1;
-        p1 -= alpha;
-        P[P_start + 1] = p1;
-        q += p1 * p1;
-        q = sqrt(q);
-        q_local[0] = q;
-        q_local[1] = alpha;
-        *q_glob = q;
-      }
-      barrier(CLK_LOCAL_MEM_FENCE);
-      q = q_local[0];
-      alpha = q_local[1];
-      if (q != 0) {
-        double multi = M_SQRT2 / q;
-        // normalize the Householder vector
-        for (int i = lid + 1; i < P_span; i += lsize) {
-          P[P_start + i] *= multi;
+          double alpha;
+          if (lid == 0) {
+            q = q_local[0];
+            double p1 = P[P_start + 1];
+            // make Householder vector
+            alpha = -copysign(sqrt(q), P[P_start]);
+            q -= p1 * p1;
+            p1 -= alpha;
+            P[P_start + 1] = p1;
+            q += p1 * p1;
+            q = sqrt(q);
+            q_local[0] = q;
+            q_local[1] = alpha;
+            *q_glob = q;
+          }
+          barrier(CLK_LOCAL_MEM_FENCE);
+          q = q_local[0];
+          alpha = q_local[1];
+          if (q != 0) {
+            double multi = M_SQRT2 / q;
+            // normalize the Householder vector
+            for (int i = lid + 1; i < P_span; i += lsize) {
+              P[P_start + i] *= multi;
+            }
+          }
+          if (gid == 0) {
+            P[P_rows * (k + j + 1) + k + j]
+                = P[P_rows * (k + j) + k + j + 1] * q / M_SQRT2 + alpha;
+          }
         }
-      }
-      if (gid == 0) {
-        P[P_rows * (k + j + 1) + k + j]
-            = P[P_rows * (k + j) + k + j + 1] * q / M_SQRT2 + alpha;
-      }
-    }
-    // \cond
-);
+        // \cond
+    );
 // \endcond
 
 // \cond
-static constexpr const char* tridiagonalization_v_step_1_kernel_code = STRINGIFY(
-    // \endcond
-    /**
-     * Calculates first part of constructing the vector v: Uu = Pb * u and Vu =
-     * Vl * u. Pb is a block of packed matrix, Vl is left part of matrix V and u
-     * is householder vector. Must be run with number of work groups equal to
-     * size of resulting vectors and 64 threads per work group.
-     * @param P Packed matrix being constructed.
-     * @param V Matrix V.
-     * @param[out] Uu First resulting vector.
-     * @param[out] Vu Second resulting vector.
-     * @param P_rows Number of rows of the packed matrix
-     * @param V_rows Number of rows of the matrix V
-     * @param k Index of the householder vector in the block we use as input
-     */
-    __kernel void tridiagonalization_v_step_1(
-        const __global double* P, const __global double* V, __global double* Uu,
-        __global double* Vu, const int P_rows, const int V_rows, const int k) {
-      const int lid = get_local_id(0);
-      const int gid = get_global_id(0);
-      const int gsize = get_global_size(0);
-      const int lsize = get_local_size(0);
-      const int ngroups = get_num_groups(0);
-      const int wgid = get_group_id(0);
+static constexpr const char* tridiagonalization_v_step_1_kernel_code
+    = STRINGIFY(
+        // \endcond
+        /**
+         * Calculates first part of constructing the vector v: Uu = Pb * u and
+         * Vu = Vl * u. Pb is a block of packed matrix, Vl is left part of
+         * matrix V and u is householder vector. Must be run with number of work
+         * groups equal to size of resulting vectors and 64 threads per work
+         * group.
+         * @param P Packed matrix being constructed.
+         * @param V Matrix V.
+         * @param[out] Uu First resulting vector.
+         * @param[out] Vu Second resulting vector.
+         * @param P_rows Number of rows of the packed matrix
+         * @param V_rows Number of rows of the matrix V
+         * @param k Index of the householder vector in the block we use as input
+         */
+        __kernel void tridiagonalization_v_step_1(
+            const __global double* P, const __global double* V,
+            __global double* Uu, __global double* Vu, const int P_rows,
+            const int V_rows, const int k) {
+          const int lid = get_local_id(0);
+          const int gid = get_global_id(0);
+          const int gsize = get_global_size(0);
+          const int lsize = get_local_size(0);
+          const int ngroups = get_num_groups(0);
+          const int wgid = get_group_id(0);
 
-      __local double res_loc1[LOCAL_SIZE_];
-      __local double res_loc2[LOCAL_SIZE_];
-      double acc1 = 0;
-      double acc2 = 0;
+          __local double res_loc1[LOCAL_SIZE_];
+          __local double res_loc2[LOCAL_SIZE_];
+          double acc1 = 0;
+          double acc2 = 0;
 
-      const __global double* vec = P + P_rows * (k + ngroups) + k + ngroups + 1;
-      const __global double* M1 = P + P_rows * (k + wgid) + k + ngroups + 1;
-      const __global double* M2 = V + V_rows * wgid + ngroups;
-      for (int i = lid; i < P_rows - k - ngroups - 1;
-           i += LOCAL_SIZE_) {  // go over column of the matrix in steps of 64
-        double v = vec[i];
-        acc1 += M1[i] * v;
-        acc2 += M2[i] * v;
-      }
-      res_loc1[lid] = acc1;
-      res_loc2[lid] = acc2;
-      barrier(CLK_LOCAL_MEM_FENCE);
+          const __global double* vec
+              = P + P_rows * (k + ngroups) + k + ngroups + 1;
+          const __global double* M1 = P + P_rows * (k + wgid) + k + ngroups + 1;
+          const __global double* M2 = V + V_rows * wgid + ngroups;
+          for (int i = lid; i < P_rows - k - ngroups - 1;
+               i
+               += LOCAL_SIZE_) {  // go over column of the matrix in steps of 64
+            double v = vec[i];
+            acc1 += M1[i] * v;
+            acc2 += M2[i] * v;
+          }
+          res_loc1[lid] = acc1;
+          res_loc2[lid] = acc2;
+          barrier(CLK_LOCAL_MEM_FENCE);
 
-      for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
-           step /= REDUCTION_STEP_SIZE) {
-        if (lid < step) {
-          for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
-            res_loc1[lid] += res_loc1[lid + step * i];
-            res_loc2[lid] += res_loc2[lid + step * i];
+          for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
+               step /= REDUCTION_STEP_SIZE) {
+            if (lid < step) {
+              for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
+                res_loc1[lid] += res_loc1[lid + step * i];
+                res_loc2[lid] += res_loc2[lid + step * i];
+              }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+          }
+          if (lid == 0) {
+            Uu[wgid] = res_loc1[0];
+            Vu[wgid] = res_loc2[0];
           }
         }
-        barrier(CLK_LOCAL_MEM_FENCE);
-      }
-      if (lid == 0) {
-        Uu[wgid] = res_loc1[0];
-        Vu[wgid] = res_loc2[0];
-      }
-    }
-    // \cond
-);
+        // \cond
+    );
 // \endcond
 
 // \cond
-static constexpr const char* tridiagonalization_v_step_2_kernel_code = STRINGIFY(
-    // \endcond
-    /**
-     * Second part in constructing vector v: v = Pb * u + V * Uu + U * Vu. Pb is
-     * a block of packed matrix and U is householder vector. Pb is symmetric
-     * with only lower triangle having values. That is why two columns of V are
-     * written that must be added to obtain the vector v. Must be run with 64
-     * threads per work group and total number of threads equal or greater than
-     * size of result vector.
-     * @param P Packed matrix being constructed.
-     * @param V Matrix V.
-     * @param Uu Uu from previous kernel.
-     * @param Vu Vu from previous kernel.
-     * @param P_rows Number of rows of the packed matrix
-     * @param V_rows Number of rows of the matrix V
-     * @param k Index of the householder vector in the block we use as input
-     * @param j Start column of the block to work on
-     */
-    __kernel void tridiagonalization_v_step_2(
-        const __global double* P, __global double* V, const __global double* Uu,
-        const __global double* Vu, const int P_rows, const int V_rows,
-        const int k, const int j) {
-      const int lid = get_local_id(0);
-      const int gid = get_global_id(0);
-      const int gsize = get_global_size(0);
-      const int lsize = get_local_size(0);
-      const int ngroups = get_num_groups(0);
-      const int wgid = get_group_id(0);
+static constexpr const char* tridiagonalization_v_step_2_kernel_code
+    = STRINGIFY(
+        // \endcond
+        /**
+         * Second part in constructing vector v: v = Pb * u + V * Uu + U * Vu.
+         * Pb is a block of packed matrix and U is householder vector. Pb is
+         * symmetric with only lower triangle having values. That is why two
+         * columns of V are written that must be added to obtain the vector v.
+         * Must be run with 64 threads per work group and total number of
+         * threads equal or greater than size of result vector.
+         * @param P Packed matrix being constructed.
+         * @param V Matrix V.
+         * @param Uu Uu from previous kernel.
+         * @param Vu Vu from previous kernel.
+         * @param P_rows Number of rows of the packed matrix
+         * @param V_rows Number of rows of the matrix V
+         * @param k Index of the householder vector in the block we use as input
+         * @param j Start column of the block to work on
+         */
+        __kernel void tridiagonalization_v_step_2(
+            const __global double* P, __global double* V,
+            const __global double* Uu, const __global double* Vu,
+            const int P_rows, const int V_rows, const int k, const int j) {
+          const int lid = get_local_id(0);
+          const int gid = get_global_id(0);
+          const int gsize = get_global_size(0);
+          const int lsize = get_local_size(0);
+          const int ngroups = get_num_groups(0);
+          const int wgid = get_group_id(0);
 
-      int work = P_rows - k - j - 1;
-      double acc = 0;
+          int work = P_rows - k - j - 1;
+          double acc = 0;
 
-      const __global double* vec = P + P_rows * (k + j) + k + j + 1;
-      const __global double* M1 = P + P_rows * (k + j + 1) + k + j + 1;
-      const __global double* M2 = P + P_rows * k + k + j + 1;
-      const __global double* M3 = V + j;
-      int i;
-      if (gid < work) {
-        for (i = 0; i <= gid; i++) {
-          acc += M1[P_rows * i + gid] * vec[i];
-        }
-        for (int i = 0; i < j; i++) {
-          acc -= M2[P_rows * i + gid] * Vu[i];
-          acc -= M3[V_rows * i + gid] * Uu[i];
-        }
-        V[V_rows * j + gid + j] = acc;
-      }
-      float work_per_group
-          = (float)work / ngroups;  // NOLINT(readability/casting)
-      int start = work_per_group * wgid;
-      int end = work_per_group * (wgid + 1);
-      __local double res_loc[LOCAL_SIZE_];
-      for (int i = start; i < end; i += 1) {
-        acc = 0;
-        for (int l = i + 1 + lid; l < work; l += LOCAL_SIZE_) {
-          acc += M1[P_rows * i + l] * vec[l];
-        }
-        res_loc[lid] = acc;
-        barrier(CLK_LOCAL_MEM_FENCE);
-        for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
-             step /= REDUCTION_STEP_SIZE) {
-          if (lid < step) {
-            for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
-              res_loc[lid] += res_loc[lid + step * i];
+          const __global double* vec = P + P_rows * (k + j) + k + j + 1;
+          const __global double* M1 = P + P_rows * (k + j + 1) + k + j + 1;
+          const __global double* M2 = P + P_rows * k + k + j + 1;
+          const __global double* M3 = V + j;
+          int i;
+          if (gid < work) {
+            for (i = 0; i <= gid; i++) {
+              acc += M1[P_rows * i + gid] * vec[i];
             }
+            for (int i = 0; i < j; i++) {
+              acc -= M2[P_rows * i + gid] * Vu[i];
+              acc -= M3[V_rows * i + gid] * Uu[i];
+            }
+            V[V_rows * j + gid + j] = acc;
+          }
+          float work_per_group
+              = (float)work / ngroups;  // NOLINT(readability/casting)
+          int start = work_per_group * wgid;
+          int end = work_per_group * (wgid + 1);
+          __local double res_loc[LOCAL_SIZE_];
+          for (int i = start; i < end; i += 1) {
+            acc = 0;
+            for (int l = i + 1 + lid; l < work; l += LOCAL_SIZE_) {
+              acc += M1[P_rows * i + l] * vec[l];
+            }
+            res_loc[lid] = acc;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
+                 step /= REDUCTION_STEP_SIZE) {
+              if (lid < step) {
+                for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
+                  res_loc[lid] += res_loc[lid + step * i];
+                }
+              }
+              barrier(CLK_LOCAL_MEM_FENCE);
+            }
+            if (lid == 0) {
+              V[V_rows * (j + 1) + i + j] = res_loc[lid];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
           }
-          barrier(CLK_LOCAL_MEM_FENCE);
-        }
-        if (lid == 0) {
-          V[V_rows * (j + 1) + i + j] = res_loc[lid];
         }
-        barrier(CLK_LOCAL_MEM_FENCE);
-      }
-    }
-    // \cond
-);
+        // \cond
+    );
 // \endcond
 
 // \cond
-static constexpr const char* tridiagonalization_v_step_3_kernel_code = STRINGIFY(
-    // \endcond
-    /**
-     * Third part in constructing vector v: v-=0.5*(v^T*u)*u, where u is the
-     * householder vector.
-     * @param[in,out] P packed matrix being constructed
-     * @param[in,out] V matrix V
-     * @param[out] q q
-     * @param P_rows Number of rows of the packed matrix
-     * @param V_rows Number of rows of the matrix V
-     * @param k Index of the householder vector in the block to create
-     * @param j Start column of the block to work on
-     */
-    __kernel void tridiagonalization_v_step_3(
-        __global double* P, __global double* V, __global double* q,
-        const int P_rows, const int V_rows, const int k, const int j) {
-      const int lid = get_local_id(0);
-      const int gid = get_global_id(0);
-      const int gsize = get_global_size(0);
-      const int lsize = get_local_size(0);
-      const int ngroups = get_num_groups(0);
-      const int wgid = get_group_id(0);
+static constexpr const char* tridiagonalization_v_step_3_kernel_code
+    = STRINGIFY(
+        // \endcond
+        /**
+         * Third part in constructing vector v: v-=0.5*(v^T*u)*u, where u is the
+         * householder vector.
+         * @param[in,out] P packed matrix being constructed
+         * @param[in,out] V matrix V
+         * @param[out] q q
+         * @param P_rows Number of rows of the packed matrix
+         * @param V_rows Number of rows of the matrix V
+         * @param k Index of the householder vector in the block to create
+         * @param j Start column of the block to work on
+         */
+        __kernel void tridiagonalization_v_step_3(
+            __global double* P, __global double* V, __global double* q,
+            const int P_rows, const int V_rows, const int k, const int j) {
+          const int lid = get_local_id(0);
+          const int gid = get_global_id(0);
+          const int gsize = get_global_size(0);
+          const int lsize = get_local_size(0);
+          const int ngroups = get_num_groups(0);
+          const int wgid = get_group_id(0);
 
-      __global double* u = P + P_rows * (k + j) + k + j + 1;
-      __global double* v = V + V_rows * j + j;
-      double acc = 0;
+          __global double* u = P + P_rows * (k + j) + k + j + 1;
+          __global double* v = V + V_rows * j + j;
+          double acc = 0;
 
-      for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) {
-        double vi = v[i] + v[i + V_rows];
-        v[i] = vi;
-        acc += u[i] * vi;
-      }
-      __local double res_loc[LOCAL_SIZE_];
-      res_loc[lid] = acc;
-      barrier(CLK_LOCAL_MEM_FENCE);
-      for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
-           step /= REDUCTION_STEP_SIZE) {
-        if (lid < step) {
-          for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
-            res_loc[lid] += res_loc[lid + step * i];
+          for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) {
+            double vi = v[i] + v[i + V_rows];
+            v[i] = vi;
+            acc += u[i] * vi;
+          }
+          __local double res_loc[LOCAL_SIZE_];
+          res_loc[lid] = acc;
+          barrier(CLK_LOCAL_MEM_FENCE);
+          for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
+               step /= REDUCTION_STEP_SIZE) {
+            if (lid < step) {
+              for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
+                res_loc[lid] += res_loc[lid + step * i];
+              }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+          }
+          acc = res_loc[0] * 0.5;
+          for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) {
+            v[i] -= acc * u[i];
+          }
+          if (gid == 0) {
+            P[P_rows * (k + j + 1) + k + j] -= *q / M_SQRT2 * u[0];
           }
         }
-        barrier(CLK_LOCAL_MEM_FENCE);
-      }
-      acc = res_loc[0] * 0.5;
-      for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) {
-        v[i] -= acc * u[i];
-      }
-      if (gid == 0) {
-        P[P_rows * (k + j + 1) + k + j] -= *q / M_SQRT2 * u[0];
-      }
-    }
-    // \cond
-);
+        // \cond
+    );
 // \endcond
 
 const kernel_cl<in_out_buffer, in_out_buffer, out_buffer, int, int, int, int>