diff --git a/codegen/annotations.h b/codegen/annotations.h
index f39ae0b..78e6546 100644
--- a/codegen/annotations.h
+++ b/codegen/annotations.h
@@ -12141,6 +12141,7 @@ cublasStatus_t cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl
  */
 cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, cuDoubleComplex* C, int64_t ldc);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12148,17 +12149,17 @@ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* const Aarray[], int lda, const __half* const Barray[], int ldb, const __half* beta, __half* const Carray[], int ldc, int batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12166,17 +12167,17 @@ cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t trans
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* const Aarray[], int64_t lda, const __half* const Barray[], int64_t ldb, const __half* beta, __half* const Carray[], int64_t ldc, int64_t batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12184,17 +12185,17 @@ cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* const Aarray[], int lda, const float* const Barray[], int ldb, const float* beta, float* const Carray[], int ldc, int batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12202,17 +12203,17 @@ cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t trans
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* const Aarray[], int64_t lda, const float* const Barray[], int64_t ldb, const float* beta, float* const Carray[], int64_t ldc, int64_t batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12220,17 +12221,17 @@ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12238,17 +12239,17 @@ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t trans
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* const Aarray[], int64_t lda, const double* const Barray[], int64_t ldb, const double* beta, double* const Carray[], int64_t ldc, int64_t batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12256,17 +12257,17 @@ cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12274,17 +12275,17 @@ cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t trans
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12292,17 +12293,17 @@ cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12310,17 +12311,17 @@ cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t tra
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12328,17 +12329,17 @@ cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int lda, const cuDoubleComplex* const Barray[], int ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int ldc, int batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12346,14 +12347,13 @@ cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t trans
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int64_t lda, const cuDoubleComplex* const Barray[], int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int64_t ldc, int64_t batchCount);
 /**
@@ -12609,6 +12609,8 @@ cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_
  */
 cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* B, int64_t ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc, long long int strideC, int64_t batchCount);
 /**
+ * @disabled
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12616,22 +12618,22 @@ cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperati
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param Atype SEND_ONLY
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param Btype SEND_ONLY
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param Ctype SEND_ONLY
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  * @param computeType SEND_ONLY
  * @param algo SEND_ONLY
  */
 cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int lda, const void* const Barray[], cudaDataType Btype, int ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int ldc, int batchCount, cublasComputeType_t computeType, cublasGemmAlgo_t algo);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
  * @param transb SEND_ONLY
@@ -12639,17 +12641,16 @@ cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, cublasOperation_t tran
  * @param n SEND_ONLY
  * @param k SEND_ONLY
  * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchCount
  * @param Atype SEND_ONLY
  * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchCount
  * @param Btype SEND_ONLY
  * @param ldb SEND_ONLY
  * @param beta SEND_RECV
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchCount
  * @param Ctype SEND_ONLY
  * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
  * @param computeType SEND_ONLY
  * @param algo SEND_ONLY
  */
@@ -12835,6 +12836,7 @@ cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, cubl
  */
 cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* beta, const cuDoubleComplex* B, int64_t ldb, cuDoubleComplex* C, int64_t ldc);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param side SEND_ONLY
  * @param uplo SEND_ONLY
@@ -12843,14 +12845,14 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, c
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param alpha SEND_RECV
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param B SEND_ONLY
+ * @param B SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* const A[], int lda, float* const B[], int ldb, int batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param side SEND_ONLY
  * @param uplo SEND_ONLY
@@ -12859,14 +12861,14 @@ cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side,
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param alpha SEND_RECV
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param B SEND_ONLY
+ * @param B SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* const A[], int64_t lda, float* const B[], int64_t ldb, int64_t batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param side SEND_ONLY
  * @param uplo SEND_ONLY
@@ -12875,14 +12877,14 @@ cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param alpha SEND_RECV
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param B SEND_ONLY
+ * @param B SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* const A[], int lda, double* const B[], int ldb, int batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param side SEND_ONLY
  * @param uplo SEND_ONLY
@@ -12891,14 +12893,14 @@ cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side,
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param alpha SEND_RECV
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param B SEND_ONLY
+ * @param B SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* const A[], int64_t lda, double* const B[], int64_t ldb, int64_t batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param side SEND_ONLY
  * @param uplo SEND_ONLY
@@ -12907,14 +12909,14 @@ cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param alpha SEND_RECV
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param B SEND_ONLY
+ * @param B SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* const A[], int lda, cuComplex* const B[], int ldb, int batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param side SEND_ONLY
  * @param uplo SEND_ONLY
@@ -12923,14 +12925,14 @@ cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side,
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param alpha SEND_RECV
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param B SEND_ONLY
+ * @param B SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* const A[], int64_t lda, cuComplex* const B[], int64_t ldb, int64_t batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param side SEND_ONLY
  * @param uplo SEND_ONLY
@@ -12939,14 +12941,14 @@ cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param alpha SEND_RECV
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param B SEND_ONLY
+ * @param B SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const B[], int ldb, int batchCount);
 /**
+ * @param batchCount SEND_ONLY
  * @param handle SEND_ONLY
  * @param side SEND_ONLY
  * @param uplo SEND_ONLY
@@ -12955,11 +12957,10 @@ cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side,
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param alpha SEND_RECV
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchCount
  * @param lda SEND_ONLY
- * @param B SEND_ONLY
+ * @param B SEND_ONLY LENGTH:batchCount
  * @param ldb SEND_ONLY
- * @param batchCount SEND_ONLY
  */
 cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int64_t lda, cuDoubleComplex* const B[], int64_t ldb, int64_t batchCount);
 /**
@@ -13067,151 +13068,151 @@ cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
  */
 cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* C, int64_t ldc);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param n SEND_ONLY
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param Ainv SEND_ONLY
+ * @param Ainv SEND_ONLY LENGTH:batchSize
  * @param lda_inv SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float* const A[], int lda, float* const Ainv[], int lda_inv, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param n SEND_ONLY
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param Ainv SEND_ONLY
+ * @param Ainv SEND_ONLY LENGTH:batchSize
  * @param lda_inv SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, const double* const A[], int lda, double* const Ainv[], int lda_inv, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param n SEND_ONLY
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param Ainv SEND_ONLY
+ * @param Ainv SEND_ONLY LENGTH:batchSize
  * @param lda_inv SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, cuComplex* const Ainv[], int lda_inv, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param n SEND_ONLY
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param Ainv SEND_ONLY
+ * @param Ainv SEND_ONLY LENGTH:batchSize
  * @param lda_inv SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const Ainv[], int lda_inv, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param m SEND_ONLY
  * @param n SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param TauArray SEND_ONLY
+ * @param TauArray SEND_ONLY LENGTH:batchSize
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, float* const Aarray[], int lda, float* const TauArray[], int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param m SEND_ONLY
  * @param n SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param TauArray SEND_ONLY
+ * @param TauArray SEND_ONLY LENGTH:batchSize
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, double* const Aarray[], int lda, double* const TauArray[], int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param m SEND_ONLY
  * @param n SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param TauArray SEND_ONLY
+ * @param TauArray SEND_ONLY LENGTH:batchSize
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, cuComplex* const Aarray[], int lda, cuComplex* const TauArray[], int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param m SEND_ONLY
  * @param n SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param TauArray SEND_ONLY
+ * @param TauArray SEND_ONLY LENGTH:batchSize
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const TauArray[], int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param trans SEND_ONLY
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param nrhs SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchSize
  * @param ldc SEND_ONLY
  * @param info SEND_RECV
  * @param devInfoArray SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, float* const Aarray[], int lda, float* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param trans SEND_ONLY
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param nrhs SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchSize
  * @param ldc SEND_ONLY
  * @param info SEND_RECV
  * @param devInfoArray SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, double* const Aarray[], int lda, double* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param trans SEND_ONLY
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param nrhs SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchSize
  * @param ldc SEND_ONLY
  * @param info SEND_RECV
  * @param devInfoArray SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuComplex* const Aarray[], int lda, cuComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param trans SEND_ONLY
  * @param m SEND_ONLY
  * @param n SEND_ONLY
  * @param nrhs SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
- * @param Carray SEND_ONLY
+ * @param Carray SEND_ONLY LENGTH:batchSize
  * @param ldc SEND_ONLY
  * @param info SEND_RECV
  * @param devInfoArray SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize);
 /**
@@ -13327,107 +13328,107 @@ cublasStatus_t cublasCgetrfBatched(cublasHandle_t handle, int n, cuComplex* cons
  */
 cublasStatus_t cublasZgetrfBatched(cublasHandle_t handle, int n, cuDoubleComplex* const A[], int lda, int* P, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param n SEND_ONLY
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
  * @param P SEND_RECV
- * @param C SEND_ONLY
+ * @param C SEND_ONLY LENGTH:batchSize
  * @param ldc SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n, const float* const A[], int lda, const int* P, float* const C[], int ldc, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param n SEND_ONLY
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
  * @param P SEND_RECV
- * @param C SEND_ONLY
+ * @param C SEND_ONLY LENGTH:batchSize
  * @param ldc SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n, const double* const A[], int lda, const int* P, double* const C[], int ldc, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param n SEND_ONLY
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
  * @param P SEND_RECV
- * @param C SEND_ONLY
+ * @param C SEND_ONLY LENGTH:batchSize
  * @param ldc SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, const int* P, cuComplex* const C[], int ldc, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param n SEND_ONLY
- * @param A SEND_ONLY
+ * @param A SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
  * @param P SEND_RECV
- * @param C SEND_ONLY
+ * @param C SEND_ONLY LENGTH:batchSize
  * @param ldc SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, const int* P, cuDoubleComplex* const C[], int ldc, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param trans SEND_ONLY
  * @param n SEND_ONLY
  * @param nrhs SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
  * @param devIpiv SEND_RECV
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchSize
  * @param ldb SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasSgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const float* const Aarray[], int lda, const int* devIpiv, float* const Barray[], int ldb, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param trans SEND_ONLY
  * @param n SEND_ONLY
  * @param nrhs SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
  * @param devIpiv SEND_RECV
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchSize
  * @param ldb SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const double* const Aarray[], int lda, const int* devIpiv, double* const Barray[], int ldb, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param trans SEND_ONLY
  * @param n SEND_ONLY
  * @param nrhs SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
  * @param devIpiv SEND_RECV
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchSize
  * @param ldb SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuComplex* const Aarray[], int lda, const int* devIpiv, cuComplex* const Barray[], int ldb, int* info, int batchSize);
 /**
+ * @param batchSize SEND_ONLY
  * @param handle SEND_ONLY
  * @param trans SEND_ONLY
  * @param n SEND_ONLY
  * @param nrhs SEND_ONLY
- * @param Aarray SEND_ONLY
+ * @param Aarray SEND_ONLY LENGTH:batchSize
  * @param lda SEND_ONLY
  * @param devIpiv SEND_RECV
- * @param Barray SEND_ONLY
+ * @param Barray SEND_ONLY LENGTH:batchSize
  * @param ldb SEND_ONLY
  * @param info SEND_RECV
- * @param batchSize SEND_ONLY
  */
 cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuDoubleComplex* const Aarray[], int lda, const int* devIpiv, cuDoubleComplex* const Barray[], int ldb, int* info, int batchSize);
 /**
@@ -13480,29 +13481,6 @@ cublasStatus_t cublasMigrateComputeType(cublasHandle_t handle, cudaDataType_t da
  * @param algo SEND_ONLY
  */
 cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void* alpha, const void* A, cudaDataType Atype, int lda, const void* B, cudaDataType Btype, int ldb, const void* beta, void* C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo);
-/**
- * @param handle SEND_ONLY
- * @param transa SEND_ONLY
- * @param transb SEND_ONLY
- * @param m SEND_ONLY
- * @param n SEND_ONLY
- * @param k SEND_ONLY
- * @param alpha SEND_RECV
- * @param Aarray SEND_ONLY
- * @param Atype SEND_ONLY
- * @param lda SEND_ONLY
- * @param Barray SEND_ONLY
- * @param Btype SEND_ONLY
- * @param ldb SEND_ONLY
- * @param beta SEND_RECV
- * @param Carray SEND_ONLY
- * @param Ctype SEND_ONLY
- * @param ldc SEND_ONLY
- * @param batchCount SEND_ONLY
- * @param computeType SEND_ONLY
- * @param algo SEND_ONLY
- */
-cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int lda, const void* const Barray[], cudaDataType Btype, int ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int ldc, int batchCount, cudaDataType computeType, cublasGemmAlgo_t algo);
 /**
  * @param handle SEND_ONLY
  * @param transa SEND_ONLY
diff --git a/codegen/codegen.py b/codegen/codegen.py
index c91ac7a..aa10f9f 100644
--- a/codegen/codegen.py
+++ b/codegen/codegen.py
@@ -190,10 +190,9 @@ def client_rpc_write(self, f):
         # array length operations are handled differently than char
         elif isinstance(self.ptr, Array):
             f.write(
-                "        rpc_write(0, {param_name}, sizeof({param_type}[{length}])) < 0 ||\n".format(
+                "        rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format(
                     param_name=self.parameter.name,
-                    param_type=self.ptr.format().replace("[]", ""),
-                    length=self.length.name,
+                    param_type=self.parameter.name,
                 )
             )
         else:
@@ -216,7 +215,7 @@ def server_declaration(self) -> str:
             c = self.ptr.const
             self.ptr.const = False
             # const[] isn't a valid part of a variable declaration
-            s = f"    {self.ptr.format().replace("const[]", "")}* {self.parameter.name} = new {self.ptr.format().replace("const[]", "")}[{self.length.name}];\n"
+            s = f"    {self.ptr.format().replace("const[]", "")}* {self.parameter.name} = nullptr;\n"
             self.ptr.const = c
         else:
             c = self.ptr.ptr_to.const
@@ -230,17 +229,16 @@ def server_rpc_read(self, f):
             return
         elif isinstance(self.length, int):
             f.write(
-                "        rpc_read(conn, {param_name}, {size}) < 0 ||\n".format(
+                "        rpc_read(conn, &{param_name}, {size}) < 0 ||\n".format(
                     param_name=self.parameter.name,
                     size=self.length,
                 )
             )
         elif isinstance(self.ptr, Array):
             f.write(
-                "        rpc_read(conn, {param_name}, sizeof({param_type}[{length}])) < 0 ||\n".format(
+                "        rpc_read(conn, &{param_name}, sizeof({param_type})) < 0 ||\n".format(
                     param_name=self.parameter.name,
                     param_type=self.ptr.format().replace("[]", ""),
-                    length=self.length.name,
                 )
             )
         else:
@@ -256,12 +254,6 @@ def server_rpc_read(self, f):
                 )
             )
 
-    def server_len_rpc_read(self, f):
-        f.write("   if (rpc_read(conn, &{length_param}, sizeof(int)) < 0)\n".format(
-                        length_param=self.length.name,
-                ))
-        f.write("       return -1;\n")
-
     @property
     def server_reference(self) -> str:
         return self.parameter.name
@@ -403,12 +395,20 @@ class OpaqueTypeOperation:
     def client_rpc_write(self, f):
         if not self.send:
             return
-        f.write(
-            "        rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format(
-                param_name=self.parameter.name,
-                param_type=self.type_.format(),
+        elif "const double*" in self.type_.format():
+            f.write(
+                "        rpc_write(0, {param_name}, sizeof({param_type})) < 0 ||\n".format(
+                    param_name=self.parameter.name,
+                    param_type=self.type_.format(),
+                )
+            ) 
+        else:
+            f.write(
+                "        rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format(
+                    param_name=self.parameter.name,
+                    param_type=self.type_.format(),
+                )
             )
-        )
 
     @property
     def server_declaration(self) -> str:
@@ -418,7 +418,10 @@ def server_declaration(self) -> str:
         # but "const cudnnTensorDescriptor_t *xDesc" IS valid. This subtle change carries reprecussions.
         elif "const " in self.type_.format() and not "void" in self.type_.format() and not "*" in self.type_.format():
             return f"   {self.type_.format().replace("const", "")} {self.parameter.name};\n"
-        else: return f"    {self.type_.format()} {self.parameter.name};\n"
+        elif "const double*" in self.type_.format():
+            return f"    double {self.parameter.name};\n"
+        else:
+            return f"    {self.type_.format()} {self.parameter.name};\n"
 
     def server_rpc_read(self, f):
         if not self.send:
@@ -434,6 +437,8 @@ def server_rpc_read(self, f):
     def server_reference(self) -> str:
         if self.recv:
             return f"&{self.parameter.name}"
+        if "const double*" in self.type_.format():
+            return f"&{self.parameter.name}"
         return self.parameter.name
 
     def server_rpc_write(self, f):
@@ -703,7 +708,15 @@ def main():
 
     functions_with_annotations: list[tuple[Function, Function, list[Operation]]] = []
 
+    dupes = {}
+
     for function in functions:
+        # skip any function whose name was already seen, so each function is processed only once
+        if dupes.get(function.name.format()):
+            continue
+
+        dupes[function.name.format()] = True
+
         try:
             annotation = next(
                 f for f in annotations.namespace.functions if f.name == function.name
@@ -915,14 +928,6 @@ def main():
         for function, annotation, operations, disabled in functions_with_annotations:
             if function.name.format() in MANUAL_IMPLEMENTATIONS or disabled: continue
 
-            batched = False
-
-            # not a fan of this, but the batched functions are pretty standard with the flow below.
-            # batched functions are cublas functions that send pointer arrays where batchCount describes...
-            # the number of pointers in the arrays. This is non-trivial to generate.
-            if "Batched" in function.name.format():
-                batched = True
-
             # parse the annotation doxygen
             f.write(
                 "int handle_{name}(void *conn)\n".format(
@@ -933,70 +938,28 @@ def main():
 
             defers = []
 
-            if batched:
-                array_batches = []
-                non_array_batches = []
-
-                for operation in operations:
-                    if isinstance(operation, NullTerminatedOperation):
-                        if error := operation.server_rpc_read(f, len(defers)):
-                            defers.append(error)
-                    if isinstance(operation, ArrayOperation):
-                        array_batches.append(operation)
-                    if not isinstance(operation, ArrayOperation):
-                        non_array_batches.append(operation)
-
-                # print our normal operations the same
-                for operation in operations:
-                    if operation not in array_batches:
-                        f.write(operation.server_declaration)
-
-                # do something with array batches
-                if len(array_batches) > 0 and hasattr(array_batches[0], "server_len_rpc_read"):
-                    array_batches[0].server_len_rpc_read(f)
-
-                    # pop here, because we already accounted for the batchCount integer
-                    non_array_batches.pop(0)
-
-                for op in array_batches:
-                    f.write(op.server_declaration)
-
-                f.write("    int request_id;\n")
-                if function.return_type.format() != "void":
-                    f.write("    {return_type} scuda_intercept_result;\n".format(return_type=function.return_type.format()))
-                else:
-                    f.write("    void* scuda_intercept_result;\n".format(return_type=function.return_type.format()))
+            for operation in operations:
+                f.write(operation.server_declaration)
 
-                f.write("    if (\n")
-                for operation in operations:
-                    operation.server_rpc_read(f)
-                f.write("        false)\n")
-                f.write("        goto ERROR_{index};\n".format(index=len(defers)))
+            f.write("    int request_id;\n")
 
-                f.write("\n")
+            # declare the result with the function's return type; void functions get a void* placeholder
+            if function.return_type.format() != "void":
+                f.write("    {return_type} scuda_intercept_result;\n".format(return_type=function.return_type.format()))
             else:
-                for operation in operations:
-                    f.write(operation.server_declaration)
-
-                f.write("    int request_id;\n")
+                f.write("    void* scuda_intercept_result;\n".format(return_type=function.return_type.format()))
 
-                # we only generate return from non-void types
-                if function.return_type.format() != "void":
-                    f.write("    {return_type} scuda_intercept_result;\n".format(return_type=function.return_type.format()))
+            f.write("    if (\n")
+            for operation in operations:
+                if isinstance(operation, NullTerminatedOperation):
+                    if error := operation.server_rpc_read(f, len(defers)):
+                        defers.append(error)
                 else:
-                    f.write("    void* scuda_intercept_result;\n".format(return_type=function.return_type.format()))
-
-                f.write("    if (\n")
-                for operation in operations:
-                    if isinstance(operation, NullTerminatedOperation):
-                        if error := operation.server_rpc_read(f, len(defers)):
-                            defers.append(error)
-                    else:
-                        operation.server_rpc_read(f)
-                f.write("        false)\n")
-                f.write("        goto ERROR_{index};\n".format(index=len(defers)))
+                    operation.server_rpc_read(f)
+            f.write("        false)\n")
+            f.write("        goto ERROR_{index};\n".format(index=len(defers)))
 
-                f.write("\n")
+            f.write("\n")
 
             f.write(
                 "    request_id = rpc_end_request(conn);\n".format(
diff --git a/codegen/gen_api.h b/codegen/gen_api.h
index 8fb6410..2545cf2 100644
--- a/codegen/gen_api.h
+++ b/codegen/gen_api.h
@@ -1252,118 +1252,160 @@
 #define RPC_cublasCtrmm_v2_64 1251
 #define RPC_cublasZtrmm_v2 1252
 #define RPC_cublasZtrmm_v2_64 1253
-#define RPC_cublasHgemmStridedBatched 1254
-#define RPC_cublasHgemmStridedBatched_64 1255
-#define RPC_cublasSgemmStridedBatched 1256
-#define RPC_cublasSgemmStridedBatched_64 1257
-#define RPC_cublasDgemmStridedBatched 1258
-#define RPC_cublasDgemmStridedBatched_64 1259
-#define RPC_cublasCgemmStridedBatched 1260
-#define RPC_cublasCgemmStridedBatched_64 1261
-#define RPC_cublasCgemm3mStridedBatched 1262
-#define RPC_cublasCgemm3mStridedBatched_64 1263
-#define RPC_cublasZgemmStridedBatched 1264
-#define RPC_cublasZgemmStridedBatched_64 1265
-#define RPC_cublasSgeam 1266
-#define RPC_cublasSgeam_64 1267
-#define RPC_cublasDgeam 1268
-#define RPC_cublasDgeam_64 1269
-#define RPC_cublasCgeam 1270
-#define RPC_cublasCgeam_64 1271
-#define RPC_cublasZgeam 1272
-#define RPC_cublasZgeam_64 1273
-#define RPC_cublasSdgmm 1274
-#define RPC_cublasSdgmm_64 1275
-#define RPC_cublasDdgmm 1276
-#define RPC_cublasDdgmm_64 1277
-#define RPC_cublasCdgmm 1278
-#define RPC_cublasCdgmm_64 1279
-#define RPC_cublasZdgmm 1280
-#define RPC_cublasZdgmm_64 1281
-#define RPC_cublasStpttr 1282
-#define RPC_cublasDtpttr 1283
-#define RPC_cublasCtpttr 1284
-#define RPC_cublasZtpttr 1285
-#define RPC_cublasStrttp 1286
-#define RPC_cublasDtrttp 1287
-#define RPC_cublasCtrttp 1288
-#define RPC_cublasZtrttp 1289
-#define RPC_cublasUint8gemmBias 1290
-#define RPC_cublasMigrateComputeType 1291
-#define RPC_cudnnGetVersion 1292
-#define RPC_cudnnGetMaxDeviceVersion 1293
-#define RPC_cudnnGetCudartVersion 1294
-#define RPC_cudnnGetErrorString 1295
-#define RPC_cudnnGetLastErrorString 1296
-#define RPC_cudnnQueryRuntimeError 1297
-#define RPC_cudnnGetProperty 1298
-#define RPC_cudnnCreate 1299
-#define RPC_cudnnDestroy 1300
-#define RPC_cudnnSetStream 1301
-#define RPC_cudnnGetStream 1302
-#define RPC_cudnnGetCallback 1303
-#define RPC_cudnnGraphVersionCheck 1304
-#define RPC_cudnnBackendCreateDescriptor 1305
-#define RPC_cudnnBackendDestroyDescriptor 1306
-#define RPC_cudnnBackendInitialize 1307
-#define RPC_cudnnBackendFinalize 1308
-#define RPC_cudnnBackendSetAttribute 1309
-#define RPC_cudnnBackendExecute 1310
-#define RPC_cudnnBackendPopulateCudaGraph 1311
-#define RPC_cudnnBackendUpdateCudaGraph 1312
-#define RPC_cudnnCreateTensorDescriptor 1313
-#define RPC_cudnnSetTensor4dDescriptor 1314
-#define RPC_cudnnSetTensor4dDescriptorEx 1315
-#define RPC_cudnnGetTensor4dDescriptor 1316
-#define RPC_cudnnGetTensorSizeInBytes 1317
-#define RPC_cudnnDestroyTensorDescriptor 1318
-#define RPC_cudnnInitTransformDest 1319
-#define RPC_cudnnCreateTensorTransformDescriptor 1320
-#define RPC_cudnnDestroyTensorTransformDescriptor 1321
-#define RPC_cudnnCreateOpTensorDescriptor 1322
-#define RPC_cudnnSetOpTensorDescriptor 1323
-#define RPC_cudnnGetOpTensorDescriptor 1324
-#define RPC_cudnnDestroyOpTensorDescriptor 1325
-#define RPC_cudnnCreateReduceTensorDescriptor 1326
-#define RPC_cudnnSetReduceTensorDescriptor 1327
-#define RPC_cudnnGetReduceTensorDescriptor 1328
-#define RPC_cudnnDestroyReduceTensorDescriptor 1329
-#define RPC_cudnnGetReductionIndicesSize 1330
-#define RPC_cudnnGetReductionWorkspaceSize 1331
-#define RPC_cudnnCreateFilterDescriptor 1332
-#define RPC_cudnnSetFilter4dDescriptor 1333
-#define RPC_cudnnGetFilter4dDescriptor 1334
-#define RPC_cudnnGetFilterSizeInBytes 1335
-#define RPC_cudnnDestroyFilterDescriptor 1336
-#define RPC_cudnnCreatePoolingDescriptor 1337
-#define RPC_cudnnSetPooling2dDescriptor 1338
-#define RPC_cudnnGetPooling2dDescriptor 1339
-#define RPC_cudnnGetPooling2dForwardOutputDim 1340
-#define RPC_cudnnDestroyPoolingDescriptor 1341
-#define RPC_cudnnCreateActivationDescriptor 1342
-#define RPC_cudnnSetActivationDescriptor 1343
-#define RPC_cudnnGetActivationDescriptor 1344
-#define RPC_cudnnSetActivationDescriptorSwishBeta 1345
-#define RPC_cudnnGetActivationDescriptorSwishBeta 1346
-#define RPC_cudnnDestroyActivationDescriptor 1347
-#define RPC_cudnnActivationForward 1348
-#define RPC_cudnnCreateLRNDescriptor 1349
-#define RPC_cudnnSetLRNDescriptor 1350
-#define RPC_cudnnGetLRNDescriptor 1351
-#define RPC_cudnnDestroyLRNDescriptor 1352
-#define RPC_cudnnDeriveBNTensorDescriptor 1353
-#define RPC_cudnnDeriveNormTensorDescriptor 1354
-#define RPC_cudnnCreateSpatialTransformerDescriptor 1355
-#define RPC_cudnnDestroySpatialTransformerDescriptor 1356
-#define RPC_cudnnCreateDropoutDescriptor 1357
-#define RPC_cudnnDestroyDropoutDescriptor 1358
-#define RPC_cudnnDropoutGetStatesSize 1359
-#define RPC_cudnnDropoutGetReserveSpaceSize 1360
-#define RPC_cudnnGetDropoutDescriptor 1361
-#define RPC_cudnnOpsVersionCheck 1362
-#define RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize 1363
-#define RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize 1364
-#define RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize 1365
-#define RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize 1366
-#define RPC_cudnnGetNormalizationBackwardWorkspaceSize 1367
-#define RPC_cudnnGetNormalizationTrainingReserveSpaceSize 1368
+#define RPC_cublasHgemmBatched 1254
+#define RPC_cublasHgemmBatched_64 1255
+#define RPC_cublasSgemmBatched 1256
+#define RPC_cublasSgemmBatched_64 1257
+#define RPC_cublasDgemmBatched 1258
+#define RPC_cublasDgemmBatched_64 1259
+#define RPC_cublasCgemmBatched 1260
+#define RPC_cublasCgemmBatched_64 1261
+#define RPC_cublasCgemm3mBatched 1262
+#define RPC_cublasCgemm3mBatched_64 1263
+#define RPC_cublasZgemmBatched 1264
+#define RPC_cublasZgemmBatched_64 1265
+#define RPC_cublasHgemmStridedBatched 1266
+#define RPC_cublasHgemmStridedBatched_64 1267
+#define RPC_cublasSgemmStridedBatched 1268
+#define RPC_cublasSgemmStridedBatched_64 1269
+#define RPC_cublasDgemmStridedBatched 1270
+#define RPC_cublasDgemmStridedBatched_64 1271
+#define RPC_cublasCgemmStridedBatched 1272
+#define RPC_cublasCgemmStridedBatched_64 1273
+#define RPC_cublasCgemm3mStridedBatched 1274
+#define RPC_cublasCgemm3mStridedBatched_64 1275
+#define RPC_cublasZgemmStridedBatched 1276
+#define RPC_cublasZgemmStridedBatched_64 1277
+#define RPC_cublasGemmBatchedEx 1278
+#define RPC_cublasGemmBatchedEx_64 1279
+#define RPC_cublasSgeam 1280
+#define RPC_cublasSgeam_64 1281
+#define RPC_cublasDgeam 1282
+#define RPC_cublasDgeam_64 1283
+#define RPC_cublasCgeam 1284
+#define RPC_cublasCgeam_64 1285
+#define RPC_cublasZgeam 1286
+#define RPC_cublasZgeam_64 1287
+#define RPC_cublasStrsmBatched 1288
+#define RPC_cublasStrsmBatched_64 1289
+#define RPC_cublasDtrsmBatched 1290
+#define RPC_cublasDtrsmBatched_64 1291
+#define RPC_cublasCtrsmBatched 1292
+#define RPC_cublasCtrsmBatched_64 1293
+#define RPC_cublasZtrsmBatched 1294
+#define RPC_cublasZtrsmBatched_64 1295
+#define RPC_cublasSdgmm 1296
+#define RPC_cublasSdgmm_64 1297
+#define RPC_cublasDdgmm 1298
+#define RPC_cublasDdgmm_64 1299
+#define RPC_cublasCdgmm 1300
+#define RPC_cublasCdgmm_64 1301
+#define RPC_cublasZdgmm 1302
+#define RPC_cublasZdgmm_64 1303
+#define RPC_cublasSmatinvBatched 1304
+#define RPC_cublasDmatinvBatched 1305
+#define RPC_cublasCmatinvBatched 1306
+#define RPC_cublasZmatinvBatched 1307
+#define RPC_cublasSgeqrfBatched 1308
+#define RPC_cublasDgeqrfBatched 1309
+#define RPC_cublasCgeqrfBatched 1310
+#define RPC_cublasZgeqrfBatched 1311
+#define RPC_cublasSgelsBatched 1312
+#define RPC_cublasDgelsBatched 1313
+#define RPC_cublasCgelsBatched 1314
+#define RPC_cublasZgelsBatched 1315
+#define RPC_cublasStpttr 1316
+#define RPC_cublasDtpttr 1317
+#define RPC_cublasCtpttr 1318
+#define RPC_cublasZtpttr 1319
+#define RPC_cublasStrttp 1320
+#define RPC_cublasDtrttp 1321
+#define RPC_cublasCtrttp 1322
+#define RPC_cublasZtrttp 1323
+#define RPC_cublasSgetriBatched 1324
+#define RPC_cublasDgetriBatched 1325
+#define RPC_cublasCgetriBatched 1326
+#define RPC_cublasZgetriBatched 1327
+#define RPC_cublasSgetrsBatched 1328
+#define RPC_cublasDgetrsBatched 1329
+#define RPC_cublasCgetrsBatched 1330
+#define RPC_cublasZgetrsBatched 1331
+#define RPC_cublasUint8gemmBias 1332
+#define RPC_cublasMigrateComputeType 1333
+#define RPC_cudnnGetVersion 1334
+#define RPC_cudnnGetMaxDeviceVersion 1335
+#define RPC_cudnnGetCudartVersion 1336
+#define RPC_cudnnGetErrorString 1337
+#define RPC_cudnnGetLastErrorString 1338
+#define RPC_cudnnQueryRuntimeError 1339
+#define RPC_cudnnGetProperty 1340
+#define RPC_cudnnCreate 1341
+#define RPC_cudnnDestroy 1342
+#define RPC_cudnnSetStream 1343
+#define RPC_cudnnGetStream 1344
+#define RPC_cudnnGetCallback 1345
+#define RPC_cudnnGraphVersionCheck 1346
+#define RPC_cudnnBackendCreateDescriptor 1347
+#define RPC_cudnnBackendDestroyDescriptor 1348
+#define RPC_cudnnBackendInitialize 1349
+#define RPC_cudnnBackendFinalize 1350
+#define RPC_cudnnBackendSetAttribute 1351
+#define RPC_cudnnBackendExecute 1352
+#define RPC_cudnnBackendPopulateCudaGraph 1353
+#define RPC_cudnnBackendUpdateCudaGraph 1354
+#define RPC_cudnnCreateTensorDescriptor 1355
+#define RPC_cudnnSetTensor4dDescriptor 1356
+#define RPC_cudnnSetTensor4dDescriptorEx 1357
+#define RPC_cudnnGetTensor4dDescriptor 1358
+#define RPC_cudnnGetTensorSizeInBytes 1359
+#define RPC_cudnnDestroyTensorDescriptor 1360
+#define RPC_cudnnInitTransformDest 1361
+#define RPC_cudnnCreateTensorTransformDescriptor 1362
+#define RPC_cudnnDestroyTensorTransformDescriptor 1363
+#define RPC_cudnnCreateOpTensorDescriptor 1364
+#define RPC_cudnnSetOpTensorDescriptor 1365
+#define RPC_cudnnGetOpTensorDescriptor 1366
+#define RPC_cudnnDestroyOpTensorDescriptor 1367
+#define RPC_cudnnCreateReduceTensorDescriptor 1368
+#define RPC_cudnnSetReduceTensorDescriptor 1369
+#define RPC_cudnnGetReduceTensorDescriptor 1370
+#define RPC_cudnnDestroyReduceTensorDescriptor 1371
+#define RPC_cudnnGetReductionIndicesSize 1372
+#define RPC_cudnnGetReductionWorkspaceSize 1373
+#define RPC_cudnnCreateFilterDescriptor 1374
+#define RPC_cudnnSetFilter4dDescriptor 1375
+#define RPC_cudnnGetFilter4dDescriptor 1376
+#define RPC_cudnnGetFilterSizeInBytes 1377
+#define RPC_cudnnDestroyFilterDescriptor 1378
+#define RPC_cudnnCreatePoolingDescriptor 1379
+#define RPC_cudnnSetPooling2dDescriptor 1380
+#define RPC_cudnnGetPooling2dDescriptor 1381
+#define RPC_cudnnGetPooling2dForwardOutputDim 1382
+#define RPC_cudnnDestroyPoolingDescriptor 1383
+#define RPC_cudnnCreateActivationDescriptor 1384
+#define RPC_cudnnSetActivationDescriptor 1385
+#define RPC_cudnnGetActivationDescriptor 1386
+#define RPC_cudnnSetActivationDescriptorSwishBeta 1387
+#define RPC_cudnnGetActivationDescriptorSwishBeta 1388
+#define RPC_cudnnDestroyActivationDescriptor 1389
+#define RPC_cudnnActivationForward 1390
+#define RPC_cudnnCreateLRNDescriptor 1391
+#define RPC_cudnnSetLRNDescriptor 1392
+#define RPC_cudnnGetLRNDescriptor 1393
+#define RPC_cudnnDestroyLRNDescriptor 1394
+#define RPC_cudnnDeriveBNTensorDescriptor 1395
+#define RPC_cudnnDeriveNormTensorDescriptor 1396
+#define RPC_cudnnCreateSpatialTransformerDescriptor 1397
+#define RPC_cudnnDestroySpatialTransformerDescriptor 1398
+#define RPC_cudnnCreateDropoutDescriptor 1399
+#define RPC_cudnnDestroyDropoutDescriptor 1400
+#define RPC_cudnnDropoutGetStatesSize 1401
+#define RPC_cudnnDropoutGetReserveSpaceSize 1402
+#define RPC_cudnnGetDropoutDescriptor 1403
+#define RPC_cudnnOpsVersionCheck 1404
+#define RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize 1405
+#define RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize 1406
+#define RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize 1407
+#define RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize 1408
+#define RPC_cudnnGetNormalizationBackwardWorkspaceSize 1409
+#define RPC_cudnnGetNormalizationTrainingReserveSpaceSize 1410
diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp
index b6324f9..4588260 100644
--- a/codegen/gen_client.cpp
+++ b/codegen/gen_client.cpp
@@ -11361,7 +11361,7 @@ cublasStatus_t cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int
     if (rpc_start_request(0, RPC_cublasDnrm2_v2) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, result, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -11377,7 +11377,7 @@ cublasStatus_t cublasDnrm2_v2_64(cublasHandle_t handle, int64_t n, const double*
     if (rpc_start_request(0, RPC_cublasDnrm2_v2_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, result, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -11493,9 +11493,9 @@ cublasStatus_t cublasDdot_v2(cublasHandle_t handle, int n, const double* x, int
     if (rpc_start_request(0, RPC_cublasDdot_v2) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
-        rpc_write(0, &y, sizeof(const double*)) < 0 ||
+        rpc_write(0, y, sizeof(const double*)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_write(0, result, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -11511,9 +11511,9 @@ cublasStatus_t cublasDdot_v2_64(cublasHandle_t handle, int64_t n, const double*
     if (rpc_start_request(0, RPC_cublasDdot_v2_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &y, sizeof(const double*)) < 0 ||
+        rpc_write(0, y, sizeof(const double*)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
         rpc_write(0, result, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -11705,7 +11705,7 @@ cublasStatus_t cublasDscal_v2(cublasHandle_t handle, int n, const double* alpha,
     if (rpc_start_request(0, RPC_cublasDscal_v2) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -11721,7 +11721,7 @@ cublasStatus_t cublasDscal_v2_64(cublasHandle_t handle, int64_t n, const double*
     if (rpc_start_request(0, RPC_cublasDscal_v2_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -11833,7 +11833,7 @@ cublasStatus_t cublasZdscal_v2(cublasHandle_t handle, int n, const double* alpha
     if (rpc_start_request(0, RPC_cublasZdscal_v2) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
         rpc_write(0, x, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -11849,7 +11849,7 @@ cublasStatus_t cublasZdscal_v2_64(cublasHandle_t handle, int64_t n, const double
     if (rpc_start_request(0, RPC_cublasZdscal_v2_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
         rpc_write(0, x, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -11901,8 +11901,8 @@ cublasStatus_t cublasDaxpy_v2(cublasHandle_t handle, int n, const double* alpha,
     if (rpc_start_request(0, RPC_cublasDaxpy_v2) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
@@ -11919,8 +11919,8 @@ cublasStatus_t cublasDaxpy_v2_64(cublasHandle_t handle, int64_t n, const double*
     if (rpc_start_request(0, RPC_cublasDaxpy_v2_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
@@ -12043,7 +12043,7 @@ cublasStatus_t cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int
     if (rpc_start_request(0, RPC_cublasDcopy_v2) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
@@ -12060,7 +12060,7 @@ cublasStatus_t cublasDcopy_v2_64(cublasHandle_t handle, int64_t n, const double*
     if (rpc_start_request(0, RPC_cublasDcopy_v2_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
@@ -12321,7 +12321,7 @@ cublasStatus_t cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, in
     if (rpc_start_request(0, RPC_cublasIdamax_v2) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, result, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -12337,7 +12337,7 @@ cublasStatus_t cublasIdamax_v2_64(cublasHandle_t handle, int64_t n, const double
     if (rpc_start_request(0, RPC_cublasIdamax_v2_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, result, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -12483,7 +12483,7 @@ cublasStatus_t cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, in
     if (rpc_start_request(0, RPC_cublasIdamin_v2) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, result, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -12499,7 +12499,7 @@ cublasStatus_t cublasIdamin_v2_64(cublasHandle_t handle, int64_t n, const double
     if (rpc_start_request(0, RPC_cublasIdamin_v2_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, result, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -12645,7 +12645,7 @@ cublasStatus_t cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int
     if (rpc_start_request(0, RPC_cublasDasum_v2) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, result, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -12661,7 +12661,7 @@ cublasStatus_t cublasDasum_v2_64(cublasHandle_t handle, int64_t n, const double*
     if (rpc_start_request(0, RPC_cublasDasum_v2_64) < 0 ||
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, result, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -12785,8 +12785,8 @@ cublasStatus_t cublasDrot_v2(cublasHandle_t handle, int n, double* x, int incx,
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
-        rpc_write(0, &c, sizeof(const double*)) < 0 ||
-        rpc_write(0, &s, sizeof(const double*)) < 0 ||
+        rpc_write(0, c, sizeof(const double*)) < 0 ||
+        rpc_write(0, s, sizeof(const double*)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_read(0, x, sizeof(double)) < 0 ||
         rpc_read(0, y, sizeof(double)) < 0 ||
@@ -12805,8 +12805,8 @@ cublasStatus_t cublasDrot_v2_64(cublasHandle_t handle, int64_t n, double* x, int
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &c, sizeof(const double*)) < 0 ||
-        rpc_write(0, &s, sizeof(const double*)) < 0 ||
+        rpc_write(0, c, sizeof(const double*)) < 0 ||
+        rpc_write(0, s, sizeof(const double*)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_read(0, x, sizeof(double)) < 0 ||
         rpc_read(0, y, sizeof(double)) < 0 ||
@@ -12905,7 +12905,7 @@ cublasStatus_t cublasZrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, i
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, y, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
-        rpc_write(0, &c, sizeof(const double*)) < 0 ||
+        rpc_write(0, c, sizeof(const double*)) < 0 ||
         rpc_write(0, &s, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 ||
@@ -12925,7 +12925,7 @@ cublasStatus_t cublasZrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleComple
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, y, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &c, sizeof(const double*)) < 0 ||
+        rpc_write(0, c, sizeof(const double*)) < 0 ||
         rpc_write(0, &s, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 ||
@@ -12945,8 +12945,8 @@ cublasStatus_t cublasZdrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x,
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, y, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
-        rpc_write(0, &c, sizeof(const double*)) < 0 ||
-        rpc_write(0, &s, sizeof(const double*)) < 0 ||
+        rpc_write(0, c, sizeof(const double*)) < 0 ||
+        rpc_write(0, s, sizeof(const double*)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 ||
         rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 ||
@@ -12965,8 +12965,8 @@ cublasStatus_t cublasZdrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleCompl
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, y, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &c, sizeof(const double*)) < 0 ||
-        rpc_write(0, &s, sizeof(const double*)) < 0 ||
+        rpc_write(0, c, sizeof(const double*)) < 0 ||
+        rpc_write(0, s, sizeof(const double*)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 ||
         rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 ||
@@ -13099,7 +13099,7 @@ cublasStatus_t cublasDrotm_v2(cublasHandle_t handle, int n, double* x, int incx,
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
-        rpc_write(0, &param, sizeof(const double*)) < 0 ||
+        rpc_write(0, param, sizeof(const double*)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_read(0, x, sizeof(double)) < 0 ||
         rpc_read(0, y, sizeof(double)) < 0 ||
@@ -13118,7 +13118,7 @@ cublasStatus_t cublasDrotm_v2_64(cublasHandle_t handle, int64_t n, double* x, in
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &param, sizeof(const double*)) < 0 ||
+        rpc_write(0, param, sizeof(const double*)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_read(0, x, sizeof(double)) < 0 ||
         rpc_read(0, y, sizeof(double)) < 0 ||
@@ -13155,7 +13155,7 @@ cublasStatus_t cublasDrotmg_v2(cublasHandle_t handle, double* d1, double* d2, do
         rpc_write(0, d1, sizeof(double)) < 0 ||
         rpc_write(0, d2, sizeof(double)) < 0 ||
         rpc_write(0, x1, sizeof(double)) < 0 ||
-        rpc_write(0, &y1, sizeof(const double*)) < 0 ||
+        rpc_write(0, y1, sizeof(const double*)) < 0 ||
         rpc_write(0, param, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_read(0, d1, sizeof(double)) < 0 ||
@@ -13221,12 +13221,12 @@ cublasStatus_t cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, in
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -13244,12 +13244,12 @@ cublasStatus_t cublasDgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans,
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &m, sizeof(int64_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -13411,12 +13411,12 @@ cublasStatus_t cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, in
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &kl, sizeof(int)) < 0 ||
         rpc_write(0, &ku, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -13436,12 +13436,12 @@ cublasStatus_t cublasDgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans,
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
         rpc_write(0, &kl, sizeof(int64_t)) < 0 ||
         rpc_write(0, &ku, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -13600,7 +13600,7 @@ cublasStatus_t cublasDtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
@@ -13620,7 +13620,7 @@ cublasStatus_t cublasDtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
@@ -13763,7 +13763,7 @@ cublasStatus_t cublasDtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &k, sizeof(int)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
@@ -13784,7 +13784,7 @@ cublasStatus_t cublasDtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
         rpc_write(0, &k, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
@@ -13926,7 +13926,7 @@ cublasStatus_t cublasDtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &AP, sizeof(const double*)) < 0 ||
+        rpc_write(0, AP, sizeof(const double*)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -13945,7 +13945,7 @@ cublasStatus_t cublasDtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &AP, sizeof(const double*)) < 0 ||
+        rpc_write(0, AP, sizeof(const double*)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -14080,7 +14080,7 @@ cublasStatus_t cublasDtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
@@ -14100,7 +14100,7 @@ cublasStatus_t cublasDtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
@@ -14238,7 +14238,7 @@ cublasStatus_t cublasDtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &AP, sizeof(const double*)) < 0 ||
+        rpc_write(0, AP, sizeof(const double*)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -14257,7 +14257,7 @@ cublasStatus_t cublasDtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &AP, sizeof(const double*)) < 0 ||
+        rpc_write(0, AP, sizeof(const double*)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -14395,7 +14395,7 @@ cublasStatus_t cublasDtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &k, sizeof(int)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
@@ -14416,7 +14416,7 @@ cublasStatus_t cublasDtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
         rpc_write(0, &k, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
         rpc_write(0, x, sizeof(double)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
@@ -14562,12 +14562,12 @@ cublasStatus_t cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -14584,12 +14584,12 @@ cublasStatus_t cublasDsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -14829,12 +14829,12 @@ cublasStatus_t cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &k, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -14852,12 +14852,12 @@ cublasStatus_t cublasDsbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
         rpc_write(0, &k, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -15008,11 +15008,11 @@ cublasStatus_t cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &AP, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, AP, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -15029,11 +15029,11 @@ cublasStatus_t cublasDspmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &AP, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, AP, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -15176,10 +15176,10 @@ cublasStatus_t cublasDger_v2(cublasHandle_t handle, int m, int n, const double*
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
-        rpc_write(0, &y, sizeof(const double*)) < 0 ||
+        rpc_write(0, y, sizeof(const double*)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_write(0, A, sizeof(double)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
@@ -15197,10 +15197,10 @@ cublasStatus_t cublasDger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, con
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &m, sizeof(int64_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &y, sizeof(const double*)) < 0 ||
+        rpc_write(0, y, sizeof(const double*)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
         rpc_write(0, A, sizeof(double)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
@@ -15424,8 +15424,8 @@ cublasStatus_t cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, A, sizeof(double)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
@@ -15443,8 +15443,8 @@ cublasStatus_t cublasDsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, A, sizeof(double)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
@@ -15576,7 +15576,7 @@ cublasStatus_t cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
         rpc_write(0, &x, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, A, sizeof(cuDoubleComplex)) < 0 ||
@@ -15595,7 +15595,7 @@ cublasStatus_t cublasZher_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
         rpc_write(0, &x, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, A, sizeof(cuDoubleComplex)) < 0 ||
@@ -15650,8 +15650,8 @@ cublasStatus_t cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, AP, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -15668,8 +15668,8 @@ cublasStatus_t cublasDspr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, AP, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -15722,7 +15722,7 @@ cublasStatus_t cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
         rpc_write(0, &x, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, AP, sizeof(cuDoubleComplex)) < 0 ||
@@ -15740,7 +15740,7 @@ cublasStatus_t cublasZhpr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
         rpc_write(0, &x, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, AP, sizeof(cuDoubleComplex)) < 0 ||
@@ -15800,10 +15800,10 @@ cublasStatus_t cublasDsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
-        rpc_write(0, &y, sizeof(const double*)) < 0 ||
+        rpc_write(0, y, sizeof(const double*)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_write(0, A, sizeof(double)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
@@ -15821,10 +15821,10 @@ cublasStatus_t cublasDsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &y, sizeof(const double*)) < 0 ||
+        rpc_write(0, y, sizeof(const double*)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
         rpc_write(0, A, sizeof(double)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
@@ -16050,10 +16050,10 @@ cublasStatus_t cublasDspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
-        rpc_write(0, &y, sizeof(const double*)) < 0 ||
+        rpc_write(0, y, sizeof(const double*)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_write(0, AP, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -16070,10 +16070,10 @@ cublasStatus_t cublasDspr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &y, sizeof(const double*)) < 0 ||
+        rpc_write(0, y, sizeof(const double*)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
         rpc_write(0, AP, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -16173,12 +16173,12 @@ cublasStatus_t cublasSgemvBatched(cublasHandle_t handle, cublasOperation_t trans
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &alpha, sizeof(const float*)) < 0 ||
-        rpc_write(0, Aarray, sizeof(const float* const[batchCount])) < 0 ||
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, xarray, sizeof(const float* const[batchCount])) < 0 ||
+        rpc_write(0, &xarray, sizeof(xarray)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, &beta, sizeof(const float*)) < 0 ||
-        rpc_write(0, yarray, sizeof(float* const[batchCount])) < 0 ||
+        rpc_write(0, &yarray, sizeof(yarray)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_end_response(0, &return_value) < 0)
@@ -16196,12 +16196,12 @@ cublasStatus_t cublasTSTgemvBatched(cublasHandle_t handle, cublasOperation_t tra
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &alpha, sizeof(const float*)) < 0 ||
-        rpc_write(0, Aarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 ||
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, xarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 ||
+        rpc_write(0, &xarray, sizeof(xarray)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, &beta, sizeof(const float*)) < 0 ||
-        rpc_write(0, yarray, sizeof(__nv_bfloat16* const[batchCount])) < 0 ||
+        rpc_write(0, &yarray, sizeof(yarray)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
         rpc_end_response(0, &return_value) < 0)
@@ -16271,14 +16271,14 @@ cublasStatus_t cublasDgemvStridedBatched(cublasHandle_t handle, cublasOperation_
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, &stridex, sizeof(long long int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int)) < 0 ||
         rpc_write(0, &stridey, sizeof(long long int)) < 0 ||
@@ -16298,14 +16298,14 @@ cublasStatus_t cublasDgemvStridedBatched_64(cublasHandle_t handle, cublasOperati
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &m, sizeof(int64_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
         rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, &stridex, sizeof(long long int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, y, sizeof(double)) < 0 ||
         rpc_write(0, &incy, sizeof(int64_t)) < 0 ||
         rpc_write(0, &stridey, sizeof(long long int)) < 0 ||
@@ -16702,12 +16702,12 @@ cublasStatus_t cublasDgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &k, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -16727,12 +16727,12 @@ cublasStatus_t cublasDgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa
         rpc_write(0, &m, sizeof(int64_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
         rpc_write(0, &k, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17045,10 +17045,10 @@ cublasStatus_t cublasDsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &k, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17067,10 +17067,10 @@ cublasStatus_t cublasDsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
         rpc_write(0, &k, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17221,10 +17221,10 @@ cublasStatus_t cublasZherk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &k, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
         rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17243,10 +17243,10 @@ cublasStatus_t cublasZherk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
         rpc_write(0, &k, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
         rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17313,12 +17313,12 @@ cublasStatus_t cublasDsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &k, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17337,12 +17337,12 @@ cublasStatus_t cublasDsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo,
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
         rpc_write(0, &k, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17510,7 +17510,7 @@ cublasStatus_t cublasZher2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17534,7 +17534,7 @@ cublasStatus_t cublasZher2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo,
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
         rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17601,12 +17601,12 @@ cublasStatus_t cublasDsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &k, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17625,12 +17625,12 @@ cublasStatus_t cublasDsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub
         rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
         rpc_write(0, &k, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17798,7 +17798,7 @@ cublasStatus_t cublasZherkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17822,7 +17822,7 @@ cublasStatus_t cublasZherkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
         rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17889,12 +17889,12 @@ cublasStatus_t cublasDsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -17913,12 +17913,12 @@ cublasStatus_t cublasDsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &m, sizeof(int64_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -18177,8 +18177,8 @@ cublasStatus_t cublasDtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_write(0, B, sizeof(double)) < 0 ||
         rpc_write(0, &ldb, sizeof(int)) < 0 ||
@@ -18200,8 +18200,8 @@ cublasStatus_t cublasDtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &m, sizeof(int64_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
         rpc_write(0, B, sizeof(double)) < 0 ||
         rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
@@ -18365,10 +18365,10 @@ cublasStatus_t cublasDtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
@@ -18390,10 +18390,10 @@ cublasStatus_t cublasDtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c
         rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
         rpc_write(0, &m, sizeof(int64_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
@@ -18504,6 +18504,306 @@ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c
     return return_value;
 }
 
+cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* const Aarray[], int lda, const __half* const Barray[], int ldb, const __half* beta, __half* const Carray[], int ldc, int batchCount)
+{ // RPC stub: starts request RPC_cublasHgemmBatched, writes each argument with rpc_write, then waits for the response.
+    cublasStatus_t return_value; // not initialized here; presumably filled in by rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasHgemmBatched) < 0 || // the || chain short-circuits at the first call that returns < 0
+        rpc_write(0, &batchCount, sizeof(int)) < 0 || // batchCount is written first, although it is the last parameter in the signature
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &k, sizeof(int)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const __half*)) < 0 || // writes the pointer value itself, not the __half it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // array parameter is adjusted to a pointer, so sizeof(Aarray) is one pointer: only the pointer value is written, not batchCount elements
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_write(0, &beta, sizeof(const __half*)) < 0 || // pointer value, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // returned whenever any RPC step above reports failure
+    return return_value;
+}
+
+cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* const Aarray[], int64_t lda, const __half* const Barray[], int64_t ldb, const __half* beta, __half* const Carray[], int64_t ldc, int64_t batchCount)
+{ // RPC stub: starts request RPC_cublasHgemmBatched_64, writes each argument with rpc_write, then waits for the response.
+    cublasStatus_t return_value; // not initialized here; presumably filled in by rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasHgemmBatched_64) < 0 || // the || chain short-circuits at the first call that returns < 0
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, although it is the last parameter in the signature
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 || // dimensions and leading dimensions are written as int64_t in this variant
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const __half*)) < 0 || // writes the pointer value itself, not the __half it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // array parameter is adjusted to a pointer, so sizeof(Aarray) is one pointer: only the pointer value is written, not batchCount elements
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &beta, sizeof(const __half*)) < 0 || // pointer value, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // returned whenever any RPC step above reports failure
+    return return_value;
+}
+
+cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* const Aarray[], int lda, const float* const Barray[], int ldb, const float* beta, float* const Carray[], int ldc, int batchCount)
+{ // RPC stub: starts request RPC_cublasSgemmBatched, writes each argument with rpc_write, then waits for the response.
+    cublasStatus_t return_value; // not initialized here; presumably filled in by rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasSgemmBatched) < 0 || // the || chain short-circuits at the first call that returns < 0
+        rpc_write(0, &batchCount, sizeof(int)) < 0 || // batchCount is written first, although it is the last parameter in the signature
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &k, sizeof(int)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const float*)) < 0 || // writes the pointer value itself, not the float it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // array parameter is adjusted to a pointer, so sizeof(Aarray) is one pointer: only the pointer value is written, not batchCount elements
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_write(0, &beta, sizeof(const float*)) < 0 || // pointer value, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // returned whenever any RPC step above reports failure
+    return return_value;
+}
+
+cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* const Aarray[], int64_t lda, const float* const Barray[], int64_t ldb, const float* beta, float* const Carray[], int64_t ldc, int64_t batchCount)
+{ // RPC stub: starts request RPC_cublasSgemmBatched_64, writes each argument with rpc_write, then waits for the response.
+    cublasStatus_t return_value; // not initialized here; presumably filled in by rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasSgemmBatched_64) < 0 || // the || chain short-circuits at the first call that returns < 0
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, although it is the last parameter in the signature
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 || // dimensions and leading dimensions are written as int64_t in this variant
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const float*)) < 0 || // writes the pointer value itself, not the float it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // array parameter is adjusted to a pointer, so sizeof(Aarray) is one pointer: only the pointer value is written, not batchCount elements
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &beta, sizeof(const float*)) < 0 || // pointer value, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // returned whenever any RPC step above reports failure
+    return return_value;
+}
+
+cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int batchCount)
+{ // RPC stub: starts request RPC_cublasDgemmBatched, writes each argument with rpc_write, then waits for the response.
+    cublasStatus_t return_value; // not initialized here; presumably filled in by rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 || // the || chain short-circuits at the first call that returns < 0
+        rpc_write(0, &batchCount, sizeof(int)) < 0 || // batchCount is written first, although it is the last parameter in the signature
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &k, sizeof(int)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 || // NOTE(review): alpha itself is the source buffer (no &), yet the size is that of a pointer type rather than sizeof(double) - confirm intended
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // array parameter is adjusted to a pointer, so sizeof(Aarray) is one pointer: only the pointer value is written, not batchCount elements
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 || // NOTE(review): same form as alpha - reads through beta, sized as a pointer type; confirm intended
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // returned whenever any RPC step above reports failure
+    return return_value;
+}
+
+cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* const Aarray[], int64_t lda, const double* const Barray[], int64_t ldb, const double* beta, double* const Carray[], int64_t ldc, int64_t batchCount)
+{ // RPC stub: starts request RPC_cublasDgemmBatched_64, writes each argument with rpc_write, then waits for the response.
+    cublasStatus_t return_value; // not initialized here; presumably filled in by rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasDgemmBatched_64) < 0 || // the || chain short-circuits at the first call that returns < 0
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, although it is the last parameter in the signature
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 || // dimensions and leading dimensions are written as int64_t in this variant
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 || // NOTE(review): alpha itself is the source buffer (no &), yet the size is that of a pointer type rather than sizeof(double) - confirm intended
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // array parameter is adjusted to a pointer, so sizeof(Aarray) is one pointer: only the pointer value is written, not batchCount elements
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 || // NOTE(review): same form as alpha - reads through beta, sized as a pointer type; confirm intended
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // returned whenever any RPC step above reports failure
+    return return_value;
+}
+
+cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount)
+{ // RPC stub: starts request RPC_cublasCgemmBatched, writes each argument with rpc_write, then waits for the response.
+    cublasStatus_t return_value; // not initialized here; presumably filled in by rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasCgemmBatched) < 0 || // the || chain short-circuits at the first call that returns < 0
+        rpc_write(0, &batchCount, sizeof(int)) < 0 || // batchCount is written first, although it is the last parameter in the signature
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &k, sizeof(int)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || // writes the pointer value itself, not the cuComplex it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // array parameter is adjusted to a pointer, so sizeof(Aarray) is one pointer: only the pointer value is written, not batchCount elements
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || // pointer value, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // returned whenever any RPC step above reports failure
+    return return_value;
+}
+
+cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount)
+{ // RPC stub: starts request RPC_cublasCgemmBatched_64, writes each argument with rpc_write, then waits for the response.
+    cublasStatus_t return_value; // not initialized here; presumably filled in by rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasCgemmBatched_64) < 0 || // the || chain short-circuits at the first call that returns < 0
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, although it is the last parameter in the signature
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 || // dimensions and leading dimensions are written as int64_t in this variant
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || // writes the pointer value itself, not the cuComplex it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // array parameter is adjusted to a pointer, so sizeof(Aarray) is one pointer: only the pointer value is written, not batchCount elements
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || // pointer value, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer value
+        rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // returned whenever any RPC step above reports failure
+    return return_value;
+}
+
+cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount)
+{ // RPC client stub for cublasCgemm3mBatched: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasCgemm3mBatched) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &k, sizeof(int)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || // writes the pointer value of alpha, not the scalar it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // NOTE(review): Aarray is an array parameter, i.e. a pointer; sizeof(Aarray) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || // pointer value of beta, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount)
+{ // RPC client stub for cublasCgemm3mBatched_64: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasCgemm3mBatched_64) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || // writes the pointer value of alpha, not the scalar it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // NOTE(review): Aarray is an array parameter, i.e. a pointer; sizeof(Aarray) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || // pointer value of beta, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int lda, const cuDoubleComplex* const Barray[], int ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int ldc, int batchCount)
+{ // RPC client stub for cublasZgemmBatched: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasZgemmBatched) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &k, sizeof(int)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || // writes the pointer value of alpha, not the scalar it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // NOTE(review): Aarray is an array parameter, i.e. a pointer; sizeof(Aarray) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || // pointer value of beta, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int64_t lda, const cuDoubleComplex* const Barray[], int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int64_t ldc, int64_t batchCount)
+{ // RPC client stub for cublasZgemmBatched_64: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasZgemmBatched_64) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || // writes the pointer value of alpha, not the scalar it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // NOTE(review): Aarray is an array parameter, i.e. a pointer; sizeof(Aarray) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || // pointer value of beta, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
 cublasStatus_t cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, long long int strideA, const __half* B, int ldb, long long int strideB, const __half* beta, __half* C, int ldc, long long int strideC, int batchCount)
 {
     cublasStatus_t return_value;
@@ -18630,14 +18930,14 @@ cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
         rpc_write(0, &k, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int)) < 0 ||
         rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
         rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
@@ -18659,14 +18959,14 @@ cublasStatus_t cublasDgemmStridedBatched_64(cublasHandle_t handle, cublasOperati
         rpc_write(0, &m, sizeof(int64_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
         rpc_write(0, &k, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
         rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
         rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
         rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
@@ -18852,6 +19152,36 @@ cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperati
     return return_value;
 }
 
+cublasStatus_t cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int64_t lda, const void* const Barray[], cudaDataType Btype, int64_t ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int64_t ldc, int64_t batchCount, cublasComputeType_t computeType, cublasGemmAlgo_t algo)
+{ // RPC client stub for cublasGemmBatchedEx_64: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasGemmBatchedEx_64) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, ahead of its position in the parameter list
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const void*)) < 0 || // writes the pointer value of alpha, not the data it points to
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // NOTE(review): Aarray is an array parameter, i.e. a pointer; sizeof(Aarray) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &Atype, sizeof(cudaDataType)) < 0 ||
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &Barray, sizeof(Barray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &Btype, sizeof(cudaDataType)) < 0 ||
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &beta, sizeof(const void*)) < 0 || // pointer value of beta, as for alpha
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &Ctype, sizeof(cudaDataType)) < 0 ||
+        rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &computeType, sizeof(cublasComputeType_t)) < 0 ||
+        rpc_write(0, &algo, sizeof(cublasGemmAlgo_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
 cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc)
 {
     cublasStatus_t return_value;
@@ -18909,11 +19239,11 @@ cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, cubl
         rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
@@ -18933,11 +19263,11 @@ cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, c
         rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_write(0, &m, sizeof(int64_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &alpha, sizeof(const double*)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &beta, sizeof(const double*)) < 0 ||
-        rpc_write(0, &B, sizeof(const double*)) < 0 ||
+        rpc_write(0, beta, sizeof(const double*)) < 0 ||
+        rpc_write(0, B, sizeof(const double*)) < 0 ||
         rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
@@ -19044,6 +19374,190 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, c
     return return_value;
 }
 
+cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* const A[], int lda, float* const B[], int ldb, int batchCount)
+{ // RPC client stub for cublasStrsmBatched: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasStrsmBatched) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const float*)) < 0 || // writes the pointer value of alpha, not the scalar it points to
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &B, sizeof(B)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* const A[], int64_t lda, float* const B[], int64_t ldb, int64_t batchCount)
+{ // RPC client stub for cublasStrsmBatched_64: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasStrsmBatched_64) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const float*)) < 0 || // writes the pointer value of alpha, not the scalar it points to
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &B, sizeof(B)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* const A[], int lda, double* const B[], int ldb, int batchCount)
+{ // RPC client stub for cublasDtrsmBatched: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasDtrsmBatched) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 || // NOTE(review): alpha is passed as the source buffer (no &), so bytes are read from the pointed-to storage, with length sizeof(const double*) rather than sizeof(double) - confirm intended
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &B, sizeof(B)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* const A[], int64_t lda, double* const B[], int64_t ldb, int64_t batchCount)
+{ // RPC client stub for cublasDtrsmBatched_64: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasDtrsmBatched_64) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, alpha, sizeof(const double*)) < 0 || // NOTE(review): alpha is passed as the source buffer (no &), so bytes are read from the pointed-to storage, with length sizeof(const double*) rather than sizeof(double) - confirm intended
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &B, sizeof(B)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* const A[], int lda, cuComplex* const B[], int ldb, int batchCount)
+{ // RPC client stub for cublasCtrsmBatched: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasCtrsmBatched) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || // writes the pointer value of alpha, not the scalar it points to
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &B, sizeof(B)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* const A[], int64_t lda, cuComplex* const B[], int64_t ldb, int64_t batchCount)
+{ // RPC client stub for cublasCtrsmBatched_64: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasCtrsmBatched_64) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || // writes the pointer value of alpha, not the scalar it points to
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &B, sizeof(B)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const B[], int ldb, int batchCount)
+{ // RPC client stub for cublasZtrsmBatched: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasZtrsmBatched) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || // writes the pointer value of alpha, not the scalar it points to
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &B, sizeof(B)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int64_t lda, cuDoubleComplex* const B[], int64_t ldb, int64_t batchCount)
+{ // RPC client stub for cublasZtrsmBatched_64: streams the arguments with rpc_write, waits for the response, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasZtrsmBatched_64) < 0 ||
+        rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || // batchCount is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || // writes the pointer value of alpha, not the scalar it points to
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchCount entries - confirm intended
+        rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+        rpc_write(0, &B, sizeof(B)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
 cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const float* A, int lda, const float* x, int incx, float* C, int ldc)
 {
     cublasStatus_t return_value;
@@ -19094,9 +19608,9 @@ cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
         rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
         rpc_write(0, &m, sizeof(int)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int)) < 0 ||
@@ -19115,9 +19629,9 @@ cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6
         rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
         rpc_write(0, &m, sizeof(int64_t)) < 0 ||
         rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-        rpc_write(0, &x, sizeof(const double*)) < 0 ||
+        rpc_write(0, x, sizeof(const double*)) < 0 ||
         rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
         rpc_write(0, C, sizeof(double)) < 0 ||
         rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
@@ -19212,6 +19726,254 @@ cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6
     return return_value;
 }
 
+cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float* const A[], int lda, float* const Ainv[], int lda_inv, int* info, int batchSize)
+{ // RPC client stub for cublasSmatinvBatched: streams the arguments with rpc_write, waits, reads one int back through info, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasSmatinvBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchSize entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &lda_inv, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // NOTE(review): info is dereferenced on the client and exactly one int is sent, independent of batchSize - confirm callers pass client-readable memory and that one element is enough
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // one int from the response is stored through info
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, const double* const A[], int lda, double* const Ainv[], int lda_inv, int* info, int batchSize)
+{ // RPC client stub for cublasDmatinvBatched: streams the arguments with rpc_write, waits, reads one int back through info, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasDmatinvBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchSize entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &lda_inv, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // NOTE(review): info is dereferenced on the client and exactly one int is sent, independent of batchSize - confirm callers pass client-readable memory and that one element is enough
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // one int from the response is stored through info
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, cuComplex* const Ainv[], int lda_inv, int* info, int batchSize)
+{ // RPC client stub for cublasCmatinvBatched: streams the arguments with rpc_write, waits, reads one int back through info, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasCmatinvBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchSize entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &lda_inv, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // NOTE(review): info is dereferenced on the client and exactly one int is sent, independent of batchSize - confirm callers pass client-readable memory and that one element is enough
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // one int from the response is stored through info
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const Ainv[], int lda_inv, int* info, int batchSize)
+{ // RPC client stub for cublasZmatinvBatched: streams the arguments with rpc_write, waits, reads one int back through info, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasZmatinvBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &A, sizeof(A)) < 0 || // NOTE(review): A is an array parameter, i.e. a pointer; sizeof(A) is one pointer, so only the pointer value is sent, not batchSize entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || // same as A: a single pointer-sized value
+        rpc_write(0, &lda_inv, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // NOTE(review): info is dereferenced on the client and exactly one int is sent, independent of batchSize - confirm callers pass client-readable memory and that one element is enough
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // one int from the response is stored through info
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, float* const Aarray[], int lda, float* const TauArray[], int* info, int batchSize)
+{ // RPC client stub for cublasSgeqrfBatched: streams the arguments with rpc_write, waits, reads one int back through info, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasSgeqrfBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // NOTE(review): Aarray is an array parameter, i.e. a pointer; sizeof(Aarray) is one pointer, so only the pointer value is sent, not batchSize entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, info, sizeof(int)) < 0 || // the current int behind info is sent before the call; info is dereferenced on the client, so it must not be null here
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // one int from the response is stored through info
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, double* const Aarray[], int lda, double* const TauArray[], int* info, int batchSize)
+{ // RPC client stub for cublasDgeqrfBatched: streams the arguments with rpc_write, waits, reads one int back through info, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasDgeqrfBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // NOTE(review): Aarray is an array parameter, i.e. a pointer; sizeof(Aarray) is one pointer, so only the pointer value is sent, not batchSize entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, info, sizeof(int)) < 0 || // the current int behind info is sent before the call; info is dereferenced on the client, so it must not be null here
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // one int from the response is stored through info
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, cuComplex* const Aarray[], int lda, cuComplex* const TauArray[], int* info, int batchSize)
+{ // RPC client stub for cublasCgeqrfBatched: streams the arguments with rpc_write, waits, reads one int back through info, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasCgeqrfBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // NOTE(review): Aarray is an array parameter, i.e. a pointer; sizeof(Aarray) is one pointer, so only the pointer value is sent, not batchSize entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, info, sizeof(int)) < 0 || // the current int behind info is sent before the call; info is dereferenced on the client, so it must not be null here
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // one int from the response is stored through info
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const TauArray[], int* info, int batchSize)
+{ // RPC client stub for cublasZgeqrfBatched: streams the arguments with rpc_write, waits, reads one int back through info, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasZgeqrfBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // NOTE(review): Aarray is an array parameter, i.e. a pointer; sizeof(Aarray) is one pointer, so only the pointer value is sent, not batchSize entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, info, sizeof(int)) < 0 || // the current int behind info is sent before the call; info is dereferenced on the client, so it must not be null here
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // one int from the response is stored through info
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, float* const Aarray[], int lda, float* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize)
+{ // RPC client stub for cublasSgelsBatched: streams the arguments with rpc_write, waits, reads one int each back through info and devInfoArray, and returns return_value (presumably set by rpc_end_response).
+    cublasStatus_t return_value; // passed by address to rpc_end_response below
+    if (rpc_start_request(0, RPC_cublasSgelsBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, although it is the last parameter
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || // NOTE(review): Aarray is an array parameter, i.e. a pointer; sizeof(Aarray) is one pointer, so only the pointer value is sent, not batchSize entries - confirm intended
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Carray, sizeof(Carray)) < 0 || // same as Aarray: a single pointer-sized value
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // the current int behind info is sent before the call; info is dereferenced on the client, so it must not be null here
+        rpc_write(0, devInfoArray, sizeof(int)) < 0 || // NOTE(review): devInfoArray is dereferenced on the client and only one int is sent, independent of batchSize - confirm callers never pass a null or non-client-readable pointer
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // one int from the response is stored through info
+        rpc_read(0, devInfoArray, sizeof(int)) < 0 || // one int from the response is stored through devInfoArray (first element only)
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* step returning < 0 short-circuits the chain to this status
+    return return_value;
+}
+
+cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, double* const Aarray[], int lda, double* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and *devInfoArray and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasDgelsBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(double* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Carray, sizeof(double* const*)) < 0 || // pointer value only, as for Aarray
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // sends the current value of *info
+        rpc_write(0, devInfoArray, sizeof(int)) < 0 || // NOTE(review): reads one int through devInfoArray on the caller side; cuBLAS documents it as an optional device array - confirm this dereference is safe
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // *info is overwritten from the reply
+        rpc_read(0, devInfoArray, sizeof(int)) < 0 || // a single int is stored through devInfoArray, regardless of batchSize
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
+cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuComplex* const Aarray[], int lda, cuComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and *devInfoArray and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasCgelsBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(cuComplex* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Carray, sizeof(cuComplex* const*)) < 0 || // pointer value only, as for Aarray
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // sends the current value of *info
+        rpc_write(0, devInfoArray, sizeof(int)) < 0 || // NOTE(review): reads one int through devInfoArray on the caller side; cuBLAS documents it as an optional device array - confirm this dereference is safe
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // *info is overwritten from the reply
+        rpc_read(0, devInfoArray, sizeof(int)) < 0 || // a single int is stored through devInfoArray, regardless of batchSize
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
+cublasStatus_t cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and *devInfoArray and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasZgelsBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &m, sizeof(int)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(cuDoubleComplex* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &Carray, sizeof(cuDoubleComplex* const*)) < 0 || // pointer value only, as for Aarray
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // sends the current value of *info
+        rpc_write(0, devInfoArray, sizeof(int)) < 0 || // NOTE(review): reads one int through devInfoArray on the caller side; cuBLAS documents it as an optional device array - confirm this dereference is safe
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // *info is overwritten from the reply
+        rpc_read(0, devInfoArray, sizeof(int)) < 0 || // a single int is stored through devInfoArray, regardless of batchSize
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
 cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* AP, float* A, int lda)
 {
     cublasStatus_t return_value;
@@ -19236,7 +19998,7 @@ cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &AP, sizeof(const double*)) < 0 ||
+        rpc_write(0, AP, sizeof(const double*)) < 0 ||
         rpc_write(0, A, sizeof(double)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -19304,7 +20066,7 @@ cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n,
         rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_write(0, &n, sizeof(int)) < 0 ||
-        rpc_write(0, &A, sizeof(const double*)) < 0 ||
+        rpc_write(0, A, sizeof(const double*)) < 0 ||
         rpc_write(0, &lda, sizeof(int)) < 0 ||
         rpc_write(0, AP, sizeof(double)) < 0 ||
         rpc_wait_for_response(0) < 0 ||
@@ -19348,6 +20110,174 @@ cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n,
     return return_value;
 }
 
+cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n, const float* const A[], int lda, const int* P, float* const C[], int ldc, int* info, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasSgetriBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &A, sizeof(const float* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &P, sizeof(const int*)) < 0 || // pointer value of P is sent, not the pivot data
+        rpc_write(0, &C, sizeof(float* const*)) < 0 || // pointer value only, as for A
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // NOTE(review): reads one int through info on the caller side; cuBLAS documents this argument as a device array of batchSize ints - confirm
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // a single int is stored through info, regardless of batchSize
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
+cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n, const double* const A[], int lda, const int* P, double* const C[], int ldc, int* info, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasDgetriBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &A, sizeof(const double* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &P, sizeof(const int*)) < 0 || // pointer value of P is sent, not the pivot data
+        rpc_write(0, &C, sizeof(double* const*)) < 0 || // pointer value only, as for A
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // NOTE(review): reads one int through info on the caller side; cuBLAS documents this argument as a device array of batchSize ints - confirm
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // a single int is stored through info, regardless of batchSize
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
+cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, const int* P, cuComplex* const C[], int ldc, int* info, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasCgetriBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &A, sizeof(const cuComplex* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &P, sizeof(const int*)) < 0 || // pointer value of P is sent, not the pivot data
+        rpc_write(0, &C, sizeof(cuComplex* const*)) < 0 || // pointer value only, as for A
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // NOTE(review): reads one int through info on the caller side; cuBLAS documents this argument as a device array of batchSize ints - confirm
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // a single int is stored through info, regardless of batchSize
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
+cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, const int* P, cuDoubleComplex* const C[], int ldc, int* info, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasZgetriBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &A, sizeof(const cuDoubleComplex* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &P, sizeof(const int*)) < 0 || // pointer value of P is sent, not the pivot data
+        rpc_write(0, &C, sizeof(cuDoubleComplex* const*)) < 0 || // pointer value only, as for A
+        rpc_write(0, &ldc, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // NOTE(review): reads one int through info on the caller side; cuBLAS documents this argument as a device array of batchSize ints - confirm
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // a single int is stored through info, regardless of batchSize
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
+cublasStatus_t cublasSgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const float* const Aarray[], int lda, const int* devIpiv, float* const Barray[], int ldb, int* info, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasSgetrsBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(const float* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || // pointer value of devIpiv is sent, not the pivot data
+        rpc_write(0, &Barray, sizeof(float* const*)) < 0 || // pointer value only, as for Aarray
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // sends the current value of *info
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // *info is overwritten from the reply
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
+cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const double* const Aarray[], int lda, const int* devIpiv, double* const Barray[], int ldb, int* info, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasDgetrsBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(const double* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || // pointer value of devIpiv is sent, not the pivot data
+        rpc_write(0, &Barray, sizeof(double* const*)) < 0 || // pointer value only, as for Aarray
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // sends the current value of *info
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // *info is overwritten from the reply
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
+cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuComplex* const Aarray[], int lda, const int* devIpiv, cuComplex* const Barray[], int ldb, int* info, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasCgetrsBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(const cuComplex* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || // pointer value of devIpiv is sent, not the pivot data
+        rpc_write(0, &Barray, sizeof(cuComplex* const*)) < 0 || // pointer value only, as for Aarray
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // sends the current value of *info
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // *info is overwritten from the reply
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
+cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuDoubleComplex* const Aarray[], int lda, const int* devIpiv, cuDoubleComplex* const Barray[], int ldb, int* info, int batchSize)
+{ // Request-side RPC stub: serializes the arguments, waits for the reply, reads back *info and returns the remote status.
+    cublasStatus_t return_value; // passed to rpc_end_response, which presumably stores the remote status here
+    if (rpc_start_request(0, RPC_cublasZgetrsBatched) < 0 ||
+        rpc_write(0, &batchSize, sizeof(int)) < 0 || // batchSize is written first, ahead of handle
+        rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_write(0, &n, sizeof(int)) < 0 ||
+        rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+        rpc_write(0, &Aarray, sizeof(const cuDoubleComplex* const*)) < 0 || // array parameter decays to a pointer: only the pointer value is sent
+        rpc_write(0, &lda, sizeof(int)) < 0 ||
+        rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || // pointer value of devIpiv is sent, not the pivot data
+        rpc_write(0, &Barray, sizeof(cuDoubleComplex* const*)) < 0 || // pointer value only, as for Aarray
+        rpc_write(0, &ldb, sizeof(int)) < 0 ||
+        rpc_write(0, info, sizeof(int)) < 0 || // sends the current value of *info
+        rpc_wait_for_response(0) < 0 ||
+        rpc_read(0, info, sizeof(int)) < 0 || // *info is overwritten from the reply
+        rpc_end_response(0, &return_value) < 0)
+        return CUBLAS_STATUS_NOT_INITIALIZED; // any rpc_* call returning < 0 short-circuits to this status
+    return return_value;
+}
+
 cublasStatus_t cublasUint8gemmBias(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc, int m, int n, int k, const unsigned char* A, int A_bias, int lda, const unsigned char* B, int B_bias, int ldb, unsigned char* C, int C_bias, int ldc, int C_mult, int C_shift)
 {
     cublasStatus_t return_value;
@@ -21644,6 +22574,18 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCtrmm_v2_64", (void *)cublasCtrmm_v2_64},
     {"cublasZtrmm_v2", (void *)cublasZtrmm_v2},
     {"cublasZtrmm_v2_64", (void *)cublasZtrmm_v2_64},
+    {"cublasHgemmBatched", (void *)cublasHgemmBatched},
+    {"cublasHgemmBatched_64", (void *)cublasHgemmBatched_64},
+    {"cublasSgemmBatched", (void *)cublasSgemmBatched},
+    {"cublasSgemmBatched_64", (void *)cublasSgemmBatched_64},
+    {"cublasDgemmBatched", (void *)cublasDgemmBatched},
+    {"cublasDgemmBatched_64", (void *)cublasDgemmBatched_64},
+    {"cublasCgemmBatched", (void *)cublasCgemmBatched},
+    {"cublasCgemmBatched_64", (void *)cublasCgemmBatched_64},
+    {"cublasCgemm3mBatched", (void *)cublasCgemm3mBatched},
+    {"cublasCgemm3mBatched_64", (void *)cublasCgemm3mBatched_64},
+    {"cublasZgemmBatched", (void *)cublasZgemmBatched},
+    {"cublasZgemmBatched_64", (void *)cublasZgemmBatched_64},
     {"cublasHgemmStridedBatched", (void *)cublasHgemmStridedBatched},
     {"cublasHgemmStridedBatched_64", (void *)cublasHgemmStridedBatched_64},
     {"cublasSgemmStridedBatched", (void *)cublasSgemmStridedBatched},
@@ -21656,6 +22598,7 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCgemm3mStridedBatched_64", (void *)cublasCgemm3mStridedBatched_64},
     {"cublasZgemmStridedBatched", (void *)cublasZgemmStridedBatched},
     {"cublasZgemmStridedBatched_64", (void *)cublasZgemmStridedBatched_64},
+    {"cublasGemmBatchedEx_64", (void *)cublasGemmBatchedEx_64},
     {"cublasSgeam", (void *)cublasSgeam},
     {"cublasSgeam_64", (void *)cublasSgeam_64},
     {"cublasDgeam", (void *)cublasDgeam},
@@ -21664,6 +22607,14 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCgeam_64", (void *)cublasCgeam_64},
     {"cublasZgeam", (void *)cublasZgeam},
     {"cublasZgeam_64", (void *)cublasZgeam_64},
+    {"cublasStrsmBatched", (void *)cublasStrsmBatched},
+    {"cublasStrsmBatched_64", (void *)cublasStrsmBatched_64},
+    {"cublasDtrsmBatched", (void *)cublasDtrsmBatched},
+    {"cublasDtrsmBatched_64", (void *)cublasDtrsmBatched_64},
+    {"cublasCtrsmBatched", (void *)cublasCtrsmBatched},
+    {"cublasCtrsmBatched_64", (void *)cublasCtrsmBatched_64},
+    {"cublasZtrsmBatched", (void *)cublasZtrsmBatched},
+    {"cublasZtrsmBatched_64", (void *)cublasZtrsmBatched_64},
     {"cublasSdgmm", (void *)cublasSdgmm},
     {"cublasSdgmm_64", (void *)cublasSdgmm_64},
     {"cublasDdgmm", (void *)cublasDdgmm},
@@ -21672,6 +22623,18 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCdgmm_64", (void *)cublasCdgmm_64},
     {"cublasZdgmm", (void *)cublasZdgmm},
     {"cublasZdgmm_64", (void *)cublasZdgmm_64},
+    {"cublasSmatinvBatched", (void *)cublasSmatinvBatched},
+    {"cublasDmatinvBatched", (void *)cublasDmatinvBatched},
+    {"cublasCmatinvBatched", (void *)cublasCmatinvBatched},
+    {"cublasZmatinvBatched", (void *)cublasZmatinvBatched},
+    {"cublasSgeqrfBatched", (void *)cublasSgeqrfBatched},
+    {"cublasDgeqrfBatched", (void *)cublasDgeqrfBatched},
+    {"cublasCgeqrfBatched", (void *)cublasCgeqrfBatched},
+    {"cublasZgeqrfBatched", (void *)cublasZgeqrfBatched},
+    {"cublasSgelsBatched", (void *)cublasSgelsBatched},
+    {"cublasDgelsBatched", (void *)cublasDgelsBatched},
+    {"cublasCgelsBatched", (void *)cublasCgelsBatched},
+    {"cublasZgelsBatched", (void *)cublasZgelsBatched},
     {"cublasStpttr", (void *)cublasStpttr},
     {"cublasDtpttr", (void *)cublasDtpttr},
     {"cublasCtpttr", (void *)cublasCtpttr},
@@ -21680,6 +22643,14 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasDtrttp", (void *)cublasDtrttp},
     {"cublasCtrttp", (void *)cublasCtrttp},
     {"cublasZtrttp", (void *)cublasZtrttp},
+    {"cublasSgetriBatched", (void *)cublasSgetriBatched},
+    {"cublasDgetriBatched", (void *)cublasDgetriBatched},
+    {"cublasCgetriBatched", (void *)cublasCgetriBatched},
+    {"cublasZgetriBatched", (void *)cublasZgetriBatched},
+    {"cublasSgetrsBatched", (void *)cublasSgetrsBatched},
+    {"cublasDgetrsBatched", (void *)cublasDgetrsBatched},
+    {"cublasCgetrsBatched", (void *)cublasCgetrsBatched},
+    {"cublasZgetrsBatched", (void *)cublasZgetrsBatched},
     {"cublasUint8gemmBias", (void *)cublasUint8gemmBias},
     {"cudnnGetProperty", (void *)cudnnGetProperty},
     {"cudnnCreate", (void *)cudnnCreate},
diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp
index b243014..78c48fd 100644
--- a/codegen/gen_server.cpp
+++ b/codegen/gen_server.cpp
@@ -24098,7 +24098,7 @@ int handle_cublasDnrm2_v2(void *conn)
 {
     cublasHandle_t handle;
     int n;
-    const double* x;
+    double x;
     int incx;
     double result;
     int request_id;
@@ -24115,7 +24115,7 @@ int handle_cublasDnrm2_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDnrm2_v2(handle, n, x, incx, &result);
+    scuda_intercept_result = cublasDnrm2_v2(handle, n, &x, incx, &result);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &result, sizeof(double)) < 0 ||
@@ -24131,7 +24131,7 @@ int handle_cublasDnrm2_v2_64(void *conn)
 {
     cublasHandle_t handle;
     int64_t n;
-    const double* x;
+    double x;
     int64_t incx;
     double result;
     int request_id;
@@ -24148,7 +24148,7 @@ int handle_cublasDnrm2_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDnrm2_v2_64(handle, n, x, incx, &result);
+    scuda_intercept_result = cublasDnrm2_v2_64(handle, n, &x, incx, &result);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &result, sizeof(double)) < 0 ||
@@ -24370,9 +24370,9 @@ int handle_cublasDdot_v2(void *conn)
 {
     cublasHandle_t handle;
     int n;
-    const double* x;
+    double x;
     int incx;
-    const double* y;
+    double y;
     int incy;
     double result;
     int request_id;
@@ -24391,7 +24391,7 @@ int handle_cublasDdot_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDdot_v2(handle, n, x, incx, y, incy, &result);
+    scuda_intercept_result = cublasDdot_v2(handle, n, &x, incx, &y, incy, &result);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &result, sizeof(double)) < 0 ||
@@ -24407,9 +24407,9 @@ int handle_cublasDdot_v2_64(void *conn)
 {
     cublasHandle_t handle;
     int64_t n;
-    const double* x;
+    double x;
     int64_t incx;
-    const double* y;
+    double y;
     int64_t incy;
     double result;
     int request_id;
@@ -24428,7 +24428,7 @@ int handle_cublasDdot_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDdot_v2_64(handle, n, x, incx, y, incy, &result);
+    scuda_intercept_result = cublasDdot_v2_64(handle, n, &x, incx, &y, incy, &result);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &result, sizeof(double)) < 0 ||
@@ -24806,7 +24806,7 @@ int handle_cublasDscal_v2(void *conn)
 {
     cublasHandle_t handle;
     int n;
-    const double* alpha;
+    double alpha;
     double x;
     int incx;
     int request_id;
@@ -24823,7 +24823,7 @@ int handle_cublasDscal_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDscal_v2(handle, n, alpha, &x, incx);
+    scuda_intercept_result = cublasDscal_v2(handle, n, &alpha, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -24839,7 +24839,7 @@ int handle_cublasDscal_v2_64(void *conn)
 {
     cublasHandle_t handle;
     int64_t n;
-    const double* alpha;
+    double alpha;
     double x;
     int64_t incx;
     int request_id;
@@ -24856,7 +24856,7 @@ int handle_cublasDscal_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDscal_v2_64(handle, n, alpha, &x, incx);
+    scuda_intercept_result = cublasDscal_v2_64(handle, n, &alpha, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -25070,7 +25070,7 @@ int handle_cublasZdscal_v2(void *conn)
 {
     cublasHandle_t handle;
     int n;
-    const double* alpha;
+    double alpha;
     cuDoubleComplex x;
     int incx;
     int request_id;
@@ -25087,7 +25087,7 @@ int handle_cublasZdscal_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZdscal_v2(handle, n, alpha, &x, incx);
+    scuda_intercept_result = cublasZdscal_v2(handle, n, &alpha, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 ||
@@ -25103,7 +25103,7 @@ int handle_cublasZdscal_v2_64(void *conn)
 {
     cublasHandle_t handle;
     int64_t n;
-    const double* alpha;
+    double alpha;
     cuDoubleComplex x;
     int64_t incx;
     int request_id;
@@ -25120,7 +25120,7 @@ int handle_cublasZdscal_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZdscal_v2_64(handle, n, alpha, &x, incx);
+    scuda_intercept_result = cublasZdscal_v2_64(handle, n, &alpha, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 ||
@@ -25210,8 +25210,8 @@ int handle_cublasDaxpy_v2(void *conn)
 {
     cublasHandle_t handle;
     int n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int incx;
     double y;
     int incy;
@@ -25231,7 +25231,7 @@ int handle_cublasDaxpy_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDaxpy_v2(handle, n, alpha, x, incx, &y, incy);
+    scuda_intercept_result = cublasDaxpy_v2(handle, n, &alpha, &x, incx, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -25247,8 +25247,8 @@ int handle_cublasDaxpy_v2_64(void *conn)
 {
     cublasHandle_t handle;
     int64_t n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int64_t incx;
     double y;
     int64_t incy;
@@ -25268,7 +25268,7 @@ int handle_cublasDaxpy_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDaxpy_v2_64(handle, n, alpha, x, incx, &y, incy);
+    scuda_intercept_result = cublasDaxpy_v2_64(handle, n, &alpha, &x, incx, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -25502,7 +25502,7 @@ int handle_cublasDcopy_v2(void *conn)
 {
     cublasHandle_t handle;
     int n;
-    const double* x;
+    double x;
     int incx;
     double y;
     int incy;
@@ -25521,7 +25521,7 @@ int handle_cublasDcopy_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDcopy_v2(handle, n, x, incx, &y, incy);
+    scuda_intercept_result = cublasDcopy_v2(handle, n, &x, incx, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -25537,7 +25537,7 @@ int handle_cublasDcopy_v2_64(void *conn)
 {
     cublasHandle_t handle;
     int64_t n;
-    const double* x;
+    double x;
     int64_t incx;
     double y;
     int64_t incy;
@@ -25556,7 +25556,7 @@ int handle_cublasDcopy_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDcopy_v2_64(handle, n, x, incx, &y, incy);
+    scuda_intercept_result = cublasDcopy_v2_64(handle, n, &x, incx, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -26066,7 +26066,7 @@ int handle_cublasIdamax_v2(void *conn)
 {
     cublasHandle_t handle;
     int n;
-    const double* x;
+    double x;
     int incx;
     int result;
     int request_id;
@@ -26083,7 +26083,7 @@ int handle_cublasIdamax_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasIdamax_v2(handle, n, x, incx, &result);
+    scuda_intercept_result = cublasIdamax_v2(handle, n, &x, incx, &result);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &result, sizeof(int)) < 0 ||
@@ -26099,7 +26099,7 @@ int handle_cublasIdamax_v2_64(void *conn)
 {
     cublasHandle_t handle;
     int64_t n;
-    const double* x;
+    double x;
     int64_t incx;
     int64_t result;
     int request_id;
@@ -26116,7 +26116,7 @@ int handle_cublasIdamax_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasIdamax_v2_64(handle, n, x, incx, &result);
+    scuda_intercept_result = cublasIdamax_v2_64(handle, n, &x, incx, &result);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &result, sizeof(int64_t)) < 0 ||
@@ -26400,7 +26400,7 @@ int handle_cublasIdamin_v2(void *conn)
 {
     cublasHandle_t handle;
     int n;
-    const double* x;
+    double x;
     int incx;
     int result;
     int request_id;
@@ -26417,7 +26417,7 @@ int handle_cublasIdamin_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasIdamin_v2(handle, n, x, incx, &result);
+    scuda_intercept_result = cublasIdamin_v2(handle, n, &x, incx, &result);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &result, sizeof(int)) < 0 ||
@@ -26433,7 +26433,7 @@ int handle_cublasIdamin_v2_64(void *conn)
 {
     cublasHandle_t handle;
     int64_t n;
-    const double* x;
+    double x;
     int64_t incx;
     int64_t result;
     int request_id;
@@ -26450,7 +26450,7 @@ int handle_cublasIdamin_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasIdamin_v2_64(handle, n, x, incx, &result);
+    scuda_intercept_result = cublasIdamin_v2_64(handle, n, &x, incx, &result);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &result, sizeof(int64_t)) < 0 ||
@@ -26734,7 +26734,7 @@ int handle_cublasDasum_v2(void *conn)
 {
     cublasHandle_t handle;
     int n;
-    const double* x;
+    double x;
     int incx;
     double result;
     int request_id;
@@ -26751,7 +26751,7 @@ int handle_cublasDasum_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDasum_v2(handle, n, x, incx, &result);
+    scuda_intercept_result = cublasDasum_v2(handle, n, &x, incx, &result);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &result, sizeof(double)) < 0 ||
@@ -26767,7 +26767,7 @@ int handle_cublasDasum_v2_64(void *conn)
 {
     cublasHandle_t handle;
     int64_t n;
-    const double* x;
+    double x;
     int64_t incx;
     double result;
     int request_id;
@@ -26784,7 +26784,7 @@ int handle_cublasDasum_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDasum_v2_64(handle, n, x, incx, &result);
+    scuda_intercept_result = cublasDasum_v2_64(handle, n, &x, incx, &result);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &result, sizeof(double)) < 0 ||
@@ -27016,8 +27016,8 @@ int handle_cublasDrot_v2(void *conn)
     int incx;
     double y;
     int incy;
-    const double* c;
-    const double* s;
+    double c;
+    double s;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -27035,7 +27035,7 @@ int handle_cublasDrot_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDrot_v2(handle, n, &x, incx, &y, incy, c, s);
+    scuda_intercept_result = cublasDrot_v2(handle, n, &x, incx, &y, incy, &c, &s);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -27056,8 +27056,8 @@ int handle_cublasDrot_v2_64(void *conn)
     int64_t incx;
     double y;
     int64_t incy;
-    const double* c;
-    const double* s;
+    double c;
+    double s;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -27075,7 +27075,7 @@ int handle_cublasDrot_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDrot_v2_64(handle, n, &x, incx, &y, incy, c, s);
+    scuda_intercept_result = cublasDrot_v2_64(handle, n, &x, incx, &y, incy, &c, &s);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -27256,7 +27256,7 @@ int handle_cublasZrot_v2(void *conn)
     int incx;
     cuDoubleComplex y;
     int incy;
-    const double* c;
+    double c;
     const cuDoubleComplex* s;
     int request_id;
     cublasStatus_t scuda_intercept_result;
@@ -27275,7 +27275,7 @@ int handle_cublasZrot_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZrot_v2(handle, n, &x, incx, &y, incy, c, s);
+    scuda_intercept_result = cublasZrot_v2(handle, n, &x, incx, &y, incy, &c, s);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 ||
@@ -27296,7 +27296,7 @@ int handle_cublasZrot_v2_64(void *conn)
     int64_t incx;
     cuDoubleComplex y;
     int64_t incy;
-    const double* c;
+    double c;
     const cuDoubleComplex* s;
     int request_id;
     cublasStatus_t scuda_intercept_result;
@@ -27315,7 +27315,7 @@ int handle_cublasZrot_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZrot_v2_64(handle, n, &x, incx, &y, incy, c, s);
+    scuda_intercept_result = cublasZrot_v2_64(handle, n, &x, incx, &y, incy, &c, s);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 ||
@@ -27336,8 +27336,8 @@ int handle_cublasZdrot_v2(void *conn)
     int incx;
     cuDoubleComplex y;
     int incy;
-    const double* c;
-    const double* s;
+    double c;
+    double s;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -27355,7 +27355,7 @@ int handle_cublasZdrot_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZdrot_v2(handle, n, &x, incx, &y, incy, c, s);
+    scuda_intercept_result = cublasZdrot_v2(handle, n, &x, incx, &y, incy, &c, &s);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 ||
@@ -27376,8 +27376,8 @@ int handle_cublasZdrot_v2_64(void *conn)
     int64_t incx;
     cuDoubleComplex y;
     int64_t incy;
-    const double* c;
-    const double* s;
+    double c;
+    double s;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -27395,7 +27395,7 @@ int handle_cublasZdrot_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZdrot_v2_64(handle, n, &x, incx, &y, incy, c, s);
+    scuda_intercept_result = cublasZdrot_v2_64(handle, n, &x, incx, &y, incy, &c, &s);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 ||
@@ -27636,7 +27636,7 @@ int handle_cublasDrotm_v2(void *conn)
     int incx;
     double y;
     int incy;
-    const double* param;
+    double param;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -27653,7 +27653,7 @@ int handle_cublasDrotm_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDrotm_v2(handle, n, &x, incx, &y, incy, param);
+    scuda_intercept_result = cublasDrotm_v2(handle, n, &x, incx, &y, incy, &param);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -27674,7 +27674,7 @@ int handle_cublasDrotm_v2_64(void *conn)
     int64_t incx;
     double y;
     int64_t incy;
-    const double* param;
+    double param;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -27691,7 +27691,7 @@ int handle_cublasDrotm_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDrotm_v2_64(handle, n, &x, incx, &y, incy, param);
+    scuda_intercept_result = cublasDrotm_v2_64(handle, n, &x, incx, &y, incy, &param);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -27748,7 +27748,7 @@ int handle_cublasDrotmg_v2(void *conn)
     double d1;
     double d2;
     double x1;
-    const double* y1;
+    double y1;
     double param;
     int request_id;
     cublasStatus_t scuda_intercept_result;
@@ -27765,7 +27765,7 @@ int handle_cublasDrotmg_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDrotmg_v2(handle, &d1, &d2, &x1, y1, &param);
+    scuda_intercept_result = cublasDrotmg_v2(handle, &d1, &d2, &x1, &y1, &param);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &d1, sizeof(double)) < 0 ||
@@ -27880,12 +27880,12 @@ int handle_cublasDgemv_v2(void *conn)
     cublasOperation_t trans;
     int m;
     int n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
-    const double* x;
+    double x;
     int incx;
-    const double* beta;
+    double beta;
     double y;
     int incy;
     int request_id;
@@ -27909,7 +27909,7 @@ int handle_cublasDgemv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgemv_v2(handle, trans, m, n, alpha, A, lda, x, incx, beta, &y, incy);
+    scuda_intercept_result = cublasDgemv_v2(handle, trans, m, n, &alpha, &A, lda, &x, incx, &beta, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -27927,12 +27927,12 @@ int handle_cublasDgemv_v2_64(void *conn)
     cublasOperation_t trans;
     int64_t m;
     int64_t n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* x;
+    double x;
     int64_t incx;
-    const double* beta;
+    double beta;
     double y;
     int64_t incy;
     int request_id;
@@ -27956,7 +27956,7 @@ int handle_cublasDgemv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgemv_v2_64(handle, trans, m, n, alpha, A, lda, x, incx, beta, &y, incy);
+    scuda_intercept_result = cublasDgemv_v2_64(handle, trans, m, n, &alpha, &A, lda, &x, incx, &beta, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -28266,12 +28266,12 @@ int handle_cublasDgbmv_v2(void *conn)
     int n;
     int kl;
     int ku;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
-    const double* x;
+    double x;
     int incx;
-    const double* beta;
+    double beta;
     double y;
     int incy;
     int request_id;
@@ -28297,7 +28297,7 @@ int handle_cublasDgbmv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgbmv_v2(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, &y, incy);
+    scuda_intercept_result = cublasDgbmv_v2(handle, trans, m, n, kl, ku, &alpha, &A, lda, &x, incx, &beta, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -28317,12 +28317,12 @@ int handle_cublasDgbmv_v2_64(void *conn)
     int64_t n;
     int64_t kl;
     int64_t ku;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* x;
+    double x;
     int64_t incx;
-    const double* beta;
+    double beta;
     double y;
     int64_t incy;
     int request_id;
@@ -28348,7 +28348,7 @@ int handle_cublasDgbmv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgbmv_v2_64(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, &y, incy);
+    scuda_intercept_result = cublasDgbmv_v2_64(handle, trans, m, n, kl, ku, &alpha, &A, lda, &x, incx, &beta, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -28653,7 +28653,7 @@ int handle_cublasDtrmv_v2(void *conn)
     cublasOperation_t trans;
     cublasDiagType_t diag;
     int n;
-    const double* A;
+    double A;
     int lda;
     double x;
     int incx;
@@ -28675,7 +28675,7 @@ int handle_cublasDtrmv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtrmv_v2(handle, uplo, trans, diag, n, A, lda, &x, incx);
+    scuda_intercept_result = cublasDtrmv_v2(handle, uplo, trans, diag, n, &A, lda, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -28694,7 +28694,7 @@ int handle_cublasDtrmv_v2_64(void *conn)
     cublasOperation_t trans;
     cublasDiagType_t diag;
     int64_t n;
-    const double* A;
+    double A;
     int64_t lda;
     double x;
     int64_t incx;
@@ -28716,7 +28716,7 @@ int handle_cublasDtrmv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtrmv_v2_64(handle, uplo, trans, diag, n, A, lda, &x, incx);
+    scuda_intercept_result = cublasDtrmv_v2_64(handle, uplo, trans, diag, n, &A, lda, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -28986,7 +28986,7 @@ int handle_cublasDtbmv_v2(void *conn)
     cublasDiagType_t diag;
     int n;
     int k;
-    const double* A;
+    double A;
     int lda;
     double x;
     int incx;
@@ -29009,7 +29009,7 @@ int handle_cublasDtbmv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtbmv_v2(handle, uplo, trans, diag, n, k, A, lda, &x, incx);
+    scuda_intercept_result = cublasDtbmv_v2(handle, uplo, trans, diag, n, k, &A, lda, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -29029,7 +29029,7 @@ int handle_cublasDtbmv_v2_64(void *conn)
     cublasDiagType_t diag;
     int64_t n;
     int64_t k;
-    const double* A;
+    double A;
     int64_t lda;
     double x;
     int64_t incx;
@@ -29052,7 +29052,7 @@ int handle_cublasDtbmv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtbmv_v2_64(handle, uplo, trans, diag, n, k, A, lda, &x, incx);
+    scuda_intercept_result = cublasDtbmv_v2_64(handle, uplo, trans, diag, n, k, &A, lda, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -29321,7 +29321,7 @@ int handle_cublasDtpmv_v2(void *conn)
     cublasOperation_t trans;
     cublasDiagType_t diag;
     int n;
-    const double* AP;
+    double AP;
     double x;
     int incx;
     int request_id;
@@ -29341,7 +29341,7 @@ int handle_cublasDtpmv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtpmv_v2(handle, uplo, trans, diag, n, AP, &x, incx);
+    scuda_intercept_result = cublasDtpmv_v2(handle, uplo, trans, diag, n, &AP, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -29360,7 +29360,7 @@ int handle_cublasDtpmv_v2_64(void *conn)
     cublasOperation_t trans;
     cublasDiagType_t diag;
     int64_t n;
-    const double* AP;
+    double AP;
     double x;
     int64_t incx;
     int request_id;
@@ -29380,7 +29380,7 @@ int handle_cublasDtpmv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtpmv_v2_64(handle, uplo, trans, diag, n, AP, &x, incx);
+    scuda_intercept_result = cublasDtpmv_v2_64(handle, uplo, trans, diag, n, &AP, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -29637,7 +29637,7 @@ int handle_cublasDtrsv_v2(void *conn)
     cublasOperation_t trans;
     cublasDiagType_t diag;
     int n;
-    const double* A;
+    double A;
     int lda;
     double x;
     int incx;
@@ -29659,7 +29659,7 @@ int handle_cublasDtrsv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtrsv_v2(handle, uplo, trans, diag, n, A, lda, &x, incx);
+    scuda_intercept_result = cublasDtrsv_v2(handle, uplo, trans, diag, n, &A, lda, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -29678,7 +29678,7 @@ int handle_cublasDtrsv_v2_64(void *conn)
     cublasOperation_t trans;
     cublasDiagType_t diag;
     int64_t n;
-    const double* A;
+    double A;
     int64_t lda;
     double x;
     int64_t incx;
@@ -29700,7 +29700,7 @@ int handle_cublasDtrsv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtrsv_v2_64(handle, uplo, trans, diag, n, A, lda, &x, incx);
+    scuda_intercept_result = cublasDtrsv_v2_64(handle, uplo, trans, diag, n, &A, lda, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -29961,7 +29961,7 @@ int handle_cublasDtpsv_v2(void *conn)
     cublasOperation_t trans;
     cublasDiagType_t diag;
     int n;
-    const double* AP;
+    double AP;
     double x;
     int incx;
     int request_id;
@@ -29981,7 +29981,7 @@ int handle_cublasDtpsv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtpsv_v2(handle, uplo, trans, diag, n, AP, &x, incx);
+    scuda_intercept_result = cublasDtpsv_v2(handle, uplo, trans, diag, n, &AP, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -30000,7 +30000,7 @@ int handle_cublasDtpsv_v2_64(void *conn)
     cublasOperation_t trans;
     cublasDiagType_t diag;
     int64_t n;
-    const double* AP;
+    double AP;
     double x;
     int64_t incx;
     int request_id;
@@ -30020,7 +30020,7 @@ int handle_cublasDtpsv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtpsv_v2_64(handle, uplo, trans, diag, n, AP, &x, incx);
+    scuda_intercept_result = cublasDtpsv_v2_64(handle, uplo, trans, diag, n, &AP, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -30282,7 +30282,7 @@ int handle_cublasDtbsv_v2(void *conn)
     cublasDiagType_t diag;
     int n;
     int k;
-    const double* A;
+    double A;
     int lda;
     double x;
     int incx;
@@ -30305,7 +30305,7 @@ int handle_cublasDtbsv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtbsv_v2(handle, uplo, trans, diag, n, k, A, lda, &x, incx);
+    scuda_intercept_result = cublasDtbsv_v2(handle, uplo, trans, diag, n, k, &A, lda, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -30325,7 +30325,7 @@ int handle_cublasDtbsv_v2_64(void *conn)
     cublasDiagType_t diag;
     int64_t n;
     int64_t k;
-    const double* A;
+    double A;
     int64_t lda;
     double x;
     int64_t incx;
@@ -30348,7 +30348,7 @@ int handle_cublasDtbsv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtbsv_v2_64(handle, uplo, trans, diag, n, k, A, lda, &x, incx);
+    scuda_intercept_result = cublasDtbsv_v2_64(handle, uplo, trans, diag, n, k, &A, lda, &x, incx);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &x, sizeof(double)) < 0 ||
@@ -30627,12 +30627,12 @@ int handle_cublasDsymv_v2(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
-    const double* x;
+    double x;
     int incx;
-    const double* beta;
+    double beta;
     double y;
     int incy;
     int request_id;
@@ -30655,7 +30655,7 @@ int handle_cublasDsymv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsymv_v2(handle, uplo, n, alpha, A, lda, x, incx, beta, &y, incy);
+    scuda_intercept_result = cublasDsymv_v2(handle, uplo, n, &alpha, &A, lda, &x, incx, &beta, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -30672,12 +30672,12 @@ int handle_cublasDsymv_v2_64(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int64_t n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* x;
+    double x;
     int64_t incx;
-    const double* beta;
+    double beta;
     double y;
     int64_t incy;
     int request_id;
@@ -30700,7 +30700,7 @@ int handle_cublasDsymv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsymv_v2_64(handle, uplo, n, alpha, A, lda, x, incx, beta, &y, incy);
+    scuda_intercept_result = cublasDsymv_v2_64(handle, uplo, n, &alpha, &A, lda, &x, incx, &beta, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -31172,12 +31172,12 @@ int handle_cublasDsbmv_v2(void *conn)
     cublasFillMode_t uplo;
     int n;
     int k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
-    const double* x;
+    double x;
     int incx;
-    const double* beta;
+    double beta;
     double y;
     int incy;
     int request_id;
@@ -31201,7 +31201,7 @@ int handle_cublasDsbmv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsbmv_v2(handle, uplo, n, k, alpha, A, lda, x, incx, beta, &y, incy);
+    scuda_intercept_result = cublasDsbmv_v2(handle, uplo, n, k, &alpha, &A, lda, &x, incx, &beta, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -31219,12 +31219,12 @@ int handle_cublasDsbmv_v2_64(void *conn)
     cublasFillMode_t uplo;
     int64_t n;
     int64_t k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* x;
+    double x;
     int64_t incx;
-    const double* beta;
+    double beta;
     double y;
     int64_t incy;
     int request_id;
@@ -31248,7 +31248,7 @@ int handle_cublasDsbmv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsbmv_v2_64(handle, uplo, n, k, alpha, A, lda, x, incx, beta, &y, incy);
+    scuda_intercept_result = cublasDsbmv_v2_64(handle, uplo, n, k, &alpha, &A, lda, &x, incx, &beta, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -31539,11 +31539,11 @@ int handle_cublasDspmv_v2(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int n;
-    const double* alpha;
-    const double* AP;
-    const double* x;
+    double alpha;
+    double AP;
+    double x;
     int incx;
-    const double* beta;
+    double beta;
     double y;
     int incy;
     int request_id;
@@ -31565,7 +31565,7 @@ int handle_cublasDspmv_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDspmv_v2(handle, uplo, n, alpha, AP, x, incx, beta, &y, incy);
+    scuda_intercept_result = cublasDspmv_v2(handle, uplo, n, &alpha, &AP, &x, incx, &beta, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -31582,11 +31582,11 @@ int handle_cublasDspmv_v2_64(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int64_t n;
-    const double* alpha;
-    const double* AP;
-    const double* x;
+    double alpha;
+    double AP;
+    double x;
     int64_t incx;
-    const double* beta;
+    double beta;
     double y;
     int64_t incy;
     int request_id;
@@ -31608,7 +31608,7 @@ int handle_cublasDspmv_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDspmv_v2_64(handle, uplo, n, alpha, AP, x, incx, beta, &y, incy);
+    scuda_intercept_result = cublasDspmv_v2_64(handle, uplo, n, &alpha, &AP, &x, incx, &beta, &y, incy);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -31883,10 +31883,10 @@ int handle_cublasDger_v2(void *conn)
     cublasHandle_t handle;
     int m;
     int n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int incx;
-    const double* y;
+    double y;
     int incy;
     double A;
     int lda;
@@ -31909,7 +31909,7 @@ int handle_cublasDger_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDger_v2(handle, m, n, alpha, x, incx, y, incy, &A, lda);
+    scuda_intercept_result = cublasDger_v2(handle, m, n, &alpha, &x, incx, &y, incy, &A, lda);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &A, sizeof(double)) < 0 ||
@@ -31926,10 +31926,10 @@ int handle_cublasDger_v2_64(void *conn)
     cublasHandle_t handle;
     int64_t m;
     int64_t n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int64_t incx;
-    const double* y;
+    double y;
     int64_t incy;
     double A;
     int64_t lda;
@@ -31952,7 +31952,7 @@ int handle_cublasDger_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDger_v2_64(handle, m, n, alpha, x, incx, y, incy, &A, lda);
+    scuda_intercept_result = cublasDger_v2_64(handle, m, n, &alpha, &x, incx, &y, incy, &A, lda);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &A, sizeof(double)) < 0 ||
@@ -32391,8 +32391,8 @@ int handle_cublasDsyr_v2(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int incx;
     double A;
     int lda;
@@ -32413,7 +32413,7 @@ int handle_cublasDsyr_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsyr_v2(handle, uplo, n, alpha, x, incx, &A, lda);
+    scuda_intercept_result = cublasDsyr_v2(handle, uplo, n, &alpha, &x, incx, &A, lda);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &A, sizeof(double)) < 0 ||
@@ -32430,8 +32430,8 @@ int handle_cublasDsyr_v2_64(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int64_t n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int64_t incx;
     double A;
     int64_t lda;
@@ -32452,7 +32452,7 @@ int handle_cublasDsyr_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsyr_v2_64(handle, uplo, n, alpha, x, incx, &A, lda);
+    scuda_intercept_result = cublasDsyr_v2_64(handle, uplo, n, &alpha, &x, incx, &A, lda);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &A, sizeof(double)) < 0 ||
@@ -32703,7 +32703,7 @@ int handle_cublasZher_v2(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int n;
-    const double* alpha;
+    double alpha;
     const cuDoubleComplex* x;
     int incx;
     cuDoubleComplex A;
@@ -32725,7 +32725,7 @@ int handle_cublasZher_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZher_v2(handle, uplo, n, alpha, x, incx, &A, lda);
+    scuda_intercept_result = cublasZher_v2(handle, uplo, n, &alpha, x, incx, &A, lda);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &A, sizeof(cuDoubleComplex)) < 0 ||
@@ -32742,7 +32742,7 @@ int handle_cublasZher_v2_64(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int64_t n;
-    const double* alpha;
+    double alpha;
     const cuDoubleComplex* x;
     int64_t incx;
     cuDoubleComplex A;
@@ -32764,7 +32764,7 @@ int handle_cublasZher_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZher_v2_64(handle, uplo, n, alpha, x, incx, &A, lda);
+    scuda_intercept_result = cublasZher_v2_64(handle, uplo, n, &alpha, x, incx, &A, lda);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &A, sizeof(cuDoubleComplex)) < 0 ||
@@ -32855,8 +32855,8 @@ int handle_cublasDspr_v2(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int incx;
     double AP;
     int request_id;
@@ -32875,7 +32875,7 @@ int handle_cublasDspr_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDspr_v2(handle, uplo, n, alpha, x, incx, &AP);
+    scuda_intercept_result = cublasDspr_v2(handle, uplo, n, &alpha, &x, incx, &AP);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &AP, sizeof(double)) < 0 ||
@@ -32892,8 +32892,8 @@ int handle_cublasDspr_v2_64(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int64_t n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int64_t incx;
     double AP;
     int request_id;
@@ -32912,7 +32912,7 @@ int handle_cublasDspr_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDspr_v2_64(handle, uplo, n, alpha, x, incx, &AP);
+    scuda_intercept_result = cublasDspr_v2_64(handle, uplo, n, &alpha, &x, incx, &AP);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &AP, sizeof(double)) < 0 ||
@@ -33003,7 +33003,7 @@ int handle_cublasZhpr_v2(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int n;
-    const double* alpha;
+    double alpha;
     const cuDoubleComplex* x;
     int incx;
     cuDoubleComplex AP;
@@ -33023,7 +33023,7 @@ int handle_cublasZhpr_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZhpr_v2(handle, uplo, n, alpha, x, incx, &AP);
+    scuda_intercept_result = cublasZhpr_v2(handle, uplo, n, &alpha, x, incx, &AP);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &AP, sizeof(cuDoubleComplex)) < 0 ||
@@ -33040,7 +33040,7 @@ int handle_cublasZhpr_v2_64(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int64_t n;
-    const double* alpha;
+    double alpha;
     const cuDoubleComplex* x;
     int64_t incx;
     cuDoubleComplex AP;
@@ -33060,7 +33060,7 @@ int handle_cublasZhpr_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZhpr_v2_64(handle, uplo, n, alpha, x, incx, &AP);
+    scuda_intercept_result = cublasZhpr_v2_64(handle, uplo, n, &alpha, x, incx, &AP);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &AP, sizeof(cuDoubleComplex)) < 0 ||
@@ -33163,10 +33163,10 @@ int handle_cublasDsyr2_v2(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int incx;
-    const double* y;
+    double y;
     int incy;
     double A;
     int lda;
@@ -33189,7 +33189,7 @@ int handle_cublasDsyr2_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsyr2_v2(handle, uplo, n, alpha, x, incx, y, incy, &A, lda);
+    scuda_intercept_result = cublasDsyr2_v2(handle, uplo, n, &alpha, &x, incx, &y, incy, &A, lda);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &A, sizeof(double)) < 0 ||
@@ -33206,10 +33206,10 @@ int handle_cublasDsyr2_v2_64(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int64_t n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int64_t incx;
-    const double* y;
+    double y;
     int64_t incy;
     double A;
     int64_t lda;
@@ -33232,7 +33232,7 @@ int handle_cublasDsyr2_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsyr2_v2_64(handle, uplo, n, alpha, x, incx, y, incy, &A, lda);
+    scuda_intercept_result = cublasDsyr2_v2_64(handle, uplo, n, &alpha, &x, incx, &y, incy, &A, lda);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &A, sizeof(double)) < 0 ||
@@ -33675,10 +33675,10 @@ int handle_cublasDspr2_v2(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int incx;
-    const double* y;
+    double y;
     int incy;
     double AP;
     int request_id;
@@ -33699,7 +33699,7 @@ int handle_cublasDspr2_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDspr2_v2(handle, uplo, n, alpha, x, incx, y, incy, &AP);
+    scuda_intercept_result = cublasDspr2_v2(handle, uplo, n, &alpha, &x, incx, &y, incy, &AP);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &AP, sizeof(double)) < 0 ||
@@ -33716,10 +33716,10 @@ int handle_cublasDspr2_v2_64(void *conn)
     cublasHandle_t handle;
     cublasFillMode_t uplo;
     int64_t n;
-    const double* alpha;
-    const double* x;
+    double alpha;
+    double x;
     int64_t incx;
-    const double* y;
+    double y;
     int64_t incy;
     double AP;
     int request_id;
@@ -33740,7 +33740,7 @@ int handle_cublasDspr2_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDspr2_v2_64(handle, uplo, n, alpha, x, incx, y, incy, &AP);
+    scuda_intercept_result = cublasDspr2_v2_64(handle, uplo, n, &alpha, &x, incx, &y, incy, &AP);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &AP, sizeof(double)) < 0 ||
@@ -33924,15 +33924,13 @@ int handle_cublasSgemvBatched(void *conn)
     int m;
     int n;
     const float* alpha;
+    const float* * Aarray = nullptr;
     int lda;
+    const float* * xarray = nullptr;
     int incx;
     const float* beta;
+    float* * yarray = nullptr;
     int incy;
-   if (rpc_read(conn, &batchCount, sizeof(int)) < 0)
-       return -1;
-    const float* * Aarray = new const float* [batchCount];
-    const float* * xarray = new const float* [batchCount];
-    float* * yarray = new float* [batchCount];
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -33942,12 +33940,12 @@ int handle_cublasSgemvBatched(void *conn)
         rpc_read(conn, &m, sizeof(int)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const float*)) < 0 ||
-        rpc_read(conn, Aarray, sizeof(const float* const[batchCount])) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, xarray, sizeof(const float* const[batchCount])) < 0 ||
+        rpc_read(conn, &xarray, sizeof(const float* const)) < 0 ||
         rpc_read(conn, &incx, sizeof(int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const float*)) < 0 ||
-        rpc_read(conn, yarray, sizeof(float* const[batchCount])) < 0 ||
+        rpc_read(conn, &yarray, sizeof(float* const)) < 0 ||
         rpc_read(conn, &incy, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
@@ -33974,15 +33972,13 @@ int handle_cublasTSTgemvBatched(void *conn)
     int m;
     int n;
     const float* alpha;
+    const __nv_bfloat16* * Aarray = nullptr;
     int lda;
+    const __nv_bfloat16* * xarray = nullptr;
     int incx;
     const float* beta;
+    __nv_bfloat16* * yarray = nullptr;
     int incy;
-   if (rpc_read(conn, &batchCount, sizeof(int)) < 0)
-       return -1;
-    const __nv_bfloat16* * Aarray = new const __nv_bfloat16* [batchCount];
-    const __nv_bfloat16* * xarray = new const __nv_bfloat16* [batchCount];
-    __nv_bfloat16* * yarray = new __nv_bfloat16* [batchCount];
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -33992,12 +33988,12 @@ int handle_cublasTSTgemvBatched(void *conn)
         rpc_read(conn, &m, sizeof(int)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const float*)) < 0 ||
-        rpc_read(conn, Aarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const __nv_bfloat16* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, xarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 ||
+        rpc_read(conn, &xarray, sizeof(const __nv_bfloat16* const)) < 0 ||
         rpc_read(conn, &incx, sizeof(int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const float*)) < 0 ||
-        rpc_read(conn, yarray, sizeof(__nv_bfloat16* const[batchCount])) < 0 ||
+        rpc_read(conn, &yarray, sizeof(__nv_bfloat16* const)) < 0 ||
         rpc_read(conn, &incy, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
@@ -34132,14 +34128,14 @@ int handle_cublasDgemvStridedBatched(void *conn)
     cublasOperation_t trans;
     int m;
     int n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
     long long int strideA;
-    const double* x;
+    double x;
     int incx;
     long long int stridex;
-    const double* beta;
+    double beta;
     double y;
     int incy;
     long long int stridey;
@@ -34169,7 +34165,7 @@ int handle_cublasDgemvStridedBatched(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgemvStridedBatched(handle, trans, m, n, alpha, A, lda, strideA, x, incx, stridex, beta, &y, incy, stridey, batchCount);
+    scuda_intercept_result = cublasDgemvStridedBatched(handle, trans, m, n, &alpha, &A, lda, strideA, &x, incx, stridex, &beta, &y, incy, stridey, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -34187,14 +34183,14 @@ int handle_cublasDgemvStridedBatched_64(void *conn)
     cublasOperation_t trans;
     int64_t m;
     int64_t n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
     long long int strideA;
-    const double* x;
+    double x;
     int64_t incx;
     long long int stridex;
-    const double* beta;
+    double beta;
     double y;
     int64_t incy;
     long long int stridey;
@@ -34224,7 +34220,7 @@ int handle_cublasDgemvStridedBatched_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgemvStridedBatched_64(handle, trans, m, n, alpha, A, lda, strideA, x, incx, stridex, beta, &y, incy, stridey, batchCount);
+    scuda_intercept_result = cublasDgemvStridedBatched_64(handle, trans, m, n, &alpha, &A, lda, strideA, &x, incx, stridex, &beta, &y, incy, stridey, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &y, sizeof(double)) < 0 ||
@@ -35009,12 +35005,12 @@ int handle_cublasDgemm_v2(void *conn)
     int m;
     int n;
     int k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
-    const double* B;
+    double B;
     int ldb;
-    const double* beta;
+    double beta;
     double C;
     int ldc;
     int request_id;
@@ -35040,7 +35036,7 @@ int handle_cublasDgemm_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgemm_v2(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasDgemm_v2(handle, transa, transb, m, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -35060,12 +35056,12 @@ int handle_cublasDgemm_v2_64(void *conn)
     int64_t m;
     int64_t n;
     int64_t k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* B;
+    double B;
     int64_t ldb;
-    const double* beta;
+    double beta;
     double C;
     int64_t ldc;
     int request_id;
@@ -35091,7 +35087,7 @@ int handle_cublasDgemm_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgemm_v2_64(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasDgemm_v2_64(handle, transa, transb, m, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -35710,10 +35706,10 @@ int handle_cublasDsyrk_v2(void *conn)
     cublasOperation_t trans;
     int n;
     int k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
-    const double* beta;
+    double beta;
     double C;
     int ldc;
     int request_id;
@@ -35736,7 +35732,7 @@ int handle_cublasDsyrk_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsyrk_v2(handle, uplo, trans, n, k, alpha, A, lda, beta, &C, ldc);
+    scuda_intercept_result = cublasDsyrk_v2(handle, uplo, trans, n, k, &alpha, &A, lda, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -35755,10 +35751,10 @@ int handle_cublasDsyrk_v2_64(void *conn)
     cublasOperation_t trans;
     int64_t n;
     int64_t k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* beta;
+    double beta;
     double C;
     int64_t ldc;
     int request_id;
@@ -35781,7 +35777,7 @@ int handle_cublasDsyrk_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsyrk_v2_64(handle, uplo, trans, n, k, alpha, A, lda, beta, &C, ldc);
+    scuda_intercept_result = cublasDsyrk_v2_64(handle, uplo, trans, n, k, &alpha, &A, lda, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -36070,10 +36066,10 @@ int handle_cublasZherk_v2(void *conn)
     cublasOperation_t trans;
     int n;
     int k;
-    const double* alpha;
+    double alpha;
     const cuDoubleComplex* A;
     int lda;
-    const double* beta;
+    double beta;
     cuDoubleComplex C;
     int ldc;
     int request_id;
@@ -36096,7 +36092,7 @@ int handle_cublasZherk_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZherk_v2(handle, uplo, trans, n, k, alpha, A, lda, beta, &C, ldc);
+    scuda_intercept_result = cublasZherk_v2(handle, uplo, trans, n, k, &alpha, A, lda, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
@@ -36115,10 +36111,10 @@ int handle_cublasZherk_v2_64(void *conn)
     cublasOperation_t trans;
     int64_t n;
     int64_t k;
-    const double* alpha;
+    double alpha;
     const cuDoubleComplex* A;
     int64_t lda;
-    const double* beta;
+    double beta;
     cuDoubleComplex C;
     int64_t ldc;
     int request_id;
@@ -36141,7 +36137,7 @@ int handle_cublasZherk_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZherk_v2_64(handle, uplo, trans, n, k, alpha, A, lda, beta, &C, ldc);
+    scuda_intercept_result = cublasZherk_v2_64(handle, uplo, trans, n, k, &alpha, A, lda, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
@@ -36258,12 +36254,12 @@ int handle_cublasDsyr2k_v2(void *conn)
     cublasOperation_t trans;
     int n;
     int k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
-    const double* B;
+    double B;
     int ldb;
-    const double* beta;
+    double beta;
     double C;
     int ldc;
     int request_id;
@@ -36288,7 +36284,7 @@ int handle_cublasDsyr2k_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsyr2k_v2(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasDsyr2k_v2(handle, uplo, trans, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -36307,12 +36303,12 @@ int handle_cublasDsyr2k_v2_64(void *conn)
     cublasOperation_t trans;
     int64_t n;
     int64_t k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* B;
+    double B;
     int64_t ldb;
-    const double* beta;
+    double beta;
     double C;
     int64_t ldc;
     int request_id;
@@ -36337,7 +36333,7 @@ int handle_cublasDsyr2k_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsyr2k_v2_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasDsyr2k_v2_64(handle, uplo, trans, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -36655,7 +36651,7 @@ int handle_cublasZher2k_v2(void *conn)
     int lda;
     const cuDoubleComplex* B;
     int ldb;
-    const double* beta;
+    double beta;
     cuDoubleComplex C;
     int ldc;
     int request_id;
@@ -36680,7 +36676,7 @@ int handle_cublasZher2k_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZher2k_v2(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasZher2k_v2(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
@@ -36704,7 +36700,7 @@ int handle_cublasZher2k_v2_64(void *conn)
     int64_t lda;
     const cuDoubleComplex* B;
     int64_t ldb;
-    const double* beta;
+    double beta;
     cuDoubleComplex C;
     int64_t ldc;
     int request_id;
@@ -36729,7 +36725,7 @@ int handle_cublasZher2k_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZher2k_v2_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasZher2k_v2_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
@@ -36846,12 +36842,12 @@ int handle_cublasDsyrkx(void *conn)
     cublasOperation_t trans;
     int n;
     int k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
-    const double* B;
+    double B;
     int ldb;
-    const double* beta;
+    double beta;
     double C;
     int ldc;
     int request_id;
@@ -36876,7 +36872,7 @@ int handle_cublasDsyrkx(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsyrkx(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasDsyrkx(handle, uplo, trans, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -36895,12 +36891,12 @@ int handle_cublasDsyrkx_64(void *conn)
     cublasOperation_t trans;
     int64_t n;
     int64_t k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* B;
+    double B;
     int64_t ldb;
-    const double* beta;
+    double beta;
     double C;
     int64_t ldc;
     int request_id;
@@ -36925,7 +36921,7 @@ int handle_cublasDsyrkx_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsyrkx_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasDsyrkx_64(handle, uplo, trans, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -37243,7 +37239,7 @@ int handle_cublasZherkx(void *conn)
     int lda;
     const cuDoubleComplex* B;
     int ldb;
-    const double* beta;
+    double beta;
     cuDoubleComplex C;
     int ldc;
     int request_id;
@@ -37268,7 +37264,7 @@ int handle_cublasZherkx(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZherkx(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasZherkx(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
@@ -37292,7 +37288,7 @@ int handle_cublasZherkx_64(void *conn)
     int64_t lda;
     const cuDoubleComplex* B;
     int64_t ldb;
-    const double* beta;
+    double beta;
     cuDoubleComplex C;
     int64_t ldc;
     int request_id;
@@ -37317,7 +37313,7 @@ int handle_cublasZherkx_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZherkx_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasZherkx_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
@@ -37434,12 +37430,12 @@ int handle_cublasDsymm_v2(void *conn)
     cublasFillMode_t uplo;
     int m;
     int n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
-    const double* B;
+    double B;
     int ldb;
-    const double* beta;
+    double beta;
     double C;
     int ldc;
     int request_id;
@@ -37464,7 +37460,7 @@ int handle_cublasDsymm_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsymm_v2(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasDsymm_v2(handle, side, uplo, m, n, &alpha, &A, lda, &B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -37483,12 +37479,12 @@ int handle_cublasDsymm_v2_64(void *conn)
     cublasFillMode_t uplo;
     int64_t m;
     int64_t n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* B;
+    double B;
     int64_t ldb;
-    const double* beta;
+    double beta;
     double C;
     int64_t ldc;
     int request_id;
@@ -37513,7 +37509,7 @@ int handle_cublasDsymm_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDsymm_v2_64(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, &C, ldc);
+    scuda_intercept_result = cublasDsymm_v2_64(handle, side, uplo, m, n, &alpha, &A, lda, &B, ldb, &beta, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -38020,8 +38016,8 @@ int handle_cublasDtrsm_v2(void *conn)
     cublasDiagType_t diag;
     int m;
     int n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
     double B;
     int ldb;
@@ -38046,7 +38042,7 @@ int handle_cublasDtrsm_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtrsm_v2(handle, side, uplo, trans, diag, m, n, alpha, A, lda, &B, ldb);
+    scuda_intercept_result = cublasDtrsm_v2(handle, side, uplo, trans, diag, m, n, &alpha, &A, lda, &B, ldb);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &B, sizeof(double)) < 0 ||
@@ -38067,8 +38063,8 @@ int handle_cublasDtrsm_v2_64(void *conn)
     cublasDiagType_t diag;
     int64_t m;
     int64_t n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
     double B;
     int64_t ldb;
@@ -38093,7 +38089,7 @@ int handle_cublasDtrsm_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtrsm_v2_64(handle, side, uplo, trans, diag, m, n, alpha, A, lda, &B, ldb);
+    scuda_intercept_result = cublasDtrsm_v2_64(handle, side, uplo, trans, diag, m, n, &alpha, &A, lda, &B, ldb);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &B, sizeof(double)) < 0 ||
@@ -38404,10 +38400,10 @@ int handle_cublasDtrmm_v2(void *conn)
     cublasDiagType_t diag;
     int m;
     int n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int lda;
-    const double* B;
+    double B;
     int ldb;
     double C;
     int ldc;
@@ -38434,7 +38430,7 @@ int handle_cublasDtrmm_v2(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtrmm_v2(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, &C, ldc);
+    scuda_intercept_result = cublasDtrmm_v2(handle, side, uplo, trans, diag, m, n, &alpha, &A, lda, &B, ldb, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -38455,10 +38451,10 @@ int handle_cublasDtrmm_v2_64(void *conn)
     cublasDiagType_t diag;
     int64_t m;
     int64_t n;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* B;
+    double B;
     int64_t ldb;
     double C;
     int64_t ldc;
@@ -38485,7 +38481,7 @@ int handle_cublasDtrmm_v2_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtrmm_v2_64(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, &C, ldc);
+    scuda_intercept_result = cublasDtrmm_v2_64(handle, side, uplo, trans, diag, m, n, &alpha, &A, lda, &B, ldb, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -38701,8 +38697,9 @@ int handle_cublasZtrmm_v2_64(void *conn)
     return -1;
 }
 
-int handle_cublasHgemmStridedBatched(void *conn)
+int handle_cublasHgemmBatched(void *conn)
 {
+    int batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
@@ -38710,20 +38707,17 @@ int handle_cublasHgemmStridedBatched(void *conn)
     int n;
     int k;
     const __half* alpha;
-    const __half* A;
+    const __half* * Aarray = nullptr;
     int lda;
-    long long int strideA;
-    const __half* B;
+    const __half* * Barray = nullptr;
     int ldb;
-    long long int strideB;
     const __half* beta;
-    __half C;
+    __half* * Carray = nullptr;
     int ldc;
-    long long int strideC;
-    int batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -38731,27 +38725,22 @@ int handle_cublasHgemmStridedBatched(void *conn)
         rpc_read(conn, &n, sizeof(int)) < 0 ||
         rpc_read(conn, &k, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const __half*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const __half*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const __half* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const __half*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const __half* const)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const __half*)) < 0 ||
-        rpc_read(conn, &C, sizeof(__half)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(__half* const)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasHgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasHgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(__half)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -38760,8 +38749,9 @@ int handle_cublasHgemmStridedBatched(void *conn)
     return -1;
 }
 
-int handle_cublasHgemmStridedBatched_64(void *conn)
+int handle_cublasHgemmBatched_64(void *conn)
 {
+    int64_t batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
@@ -38769,20 +38759,17 @@ int handle_cublasHgemmStridedBatched_64(void *conn)
     int64_t n;
     int64_t k;
     const __half* alpha;
-    const __half* A;
+    const __half* * Aarray = nullptr;
     int64_t lda;
-    long long int strideA;
-    const __half* B;
+    const __half* * Barray = nullptr;
     int64_t ldb;
-    long long int strideB;
     const __half* beta;
-    __half C;
+    __half* * Carray = nullptr;
     int64_t ldc;
-    long long int strideC;
-    int64_t batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -38790,27 +38777,22 @@ int handle_cublasHgemmStridedBatched_64(void *conn)
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const __half*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const __half*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const __half* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const __half*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const __half* const)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const __half*)) < 0 ||
-        rpc_read(conn, &C, sizeof(__half)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(__half* const)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasHgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasHgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(__half)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -38819,8 +38801,9 @@ int handle_cublasHgemmStridedBatched_64(void *conn)
     return -1;
 }
 
-int handle_cublasSgemmStridedBatched(void *conn)
+int handle_cublasSgemmBatched(void *conn)
 {
+    int batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
@@ -38828,20 +38811,17 @@ int handle_cublasSgemmStridedBatched(void *conn)
     int n;
     int k;
     const float* alpha;
-    const float* A;
+    const float* * Aarray = nullptr;
     int lda;
-    long long int strideA;
-    const float* B;
+    const float* * Barray = nullptr;
     int ldb;
-    long long int strideB;
     const float* beta;
-    float C;
+    float* * Carray = nullptr;
     int ldc;
-    long long int strideC;
-    int batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -38849,27 +38829,22 @@ int handle_cublasSgemmStridedBatched(void *conn)
         rpc_read(conn, &n, sizeof(int)) < 0 ||
         rpc_read(conn, &k, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const float*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const float* const)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const float*)) < 0 ||
-        rpc_read(conn, &C, sizeof(float)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(float* const)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(float)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -38878,8 +38853,9 @@ int handle_cublasSgemmStridedBatched(void *conn)
     return -1;
 }
 
-int handle_cublasSgemmStridedBatched_64(void *conn)
+int handle_cublasSgemmBatched_64(void *conn)
 {
+    int64_t batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
@@ -38887,20 +38863,17 @@ int handle_cublasSgemmStridedBatched_64(void *conn)
     int64_t n;
     int64_t k;
     const float* alpha;
-    const float* A;
+    const float* * Aarray = nullptr;
     int64_t lda;
-    long long int strideA;
-    const float* B;
+    const float* * Barray = nullptr;
     int64_t ldb;
-    long long int strideB;
     const float* beta;
-    float C;
+    float* * Carray = nullptr;
     int64_t ldc;
-    long long int strideC;
-    int64_t batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -38908,27 +38881,22 @@ int handle_cublasSgemmStridedBatched_64(void *conn)
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const float*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const float* const)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const float*)) < 0 ||
-        rpc_read(conn, &C, sizeof(float)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(float* const)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasSgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasSgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(float)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -38937,29 +38905,27 @@ int handle_cublasSgemmStridedBatched_64(void *conn)
     return -1;
 }
 
-int handle_cublasDgemmStridedBatched(void *conn)
+int handle_cublasDgemmBatched(void *conn)
 {
+    int batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
     int m;
     int n;
     int k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    const double* * Aarray = nullptr;
     int lda;
-    long long int strideA;
-    const double* B;
+    const double* * Barray = nullptr;
     int ldb;
-    long long int strideB;
-    const double* beta;
-    double C;
+    double beta;
+    double* * Carray = nullptr;
     int ldc;
-    long long int strideC;
-    int batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -38967,27 +38933,22 @@ int handle_cublasDgemmStridedBatched(void *conn)
         rpc_read(conn, &n, sizeof(int)) < 0 ||
         rpc_read(conn, &k, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const double*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const double* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const double* const)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const double*)) < 0 ||
-        rpc_read(conn, &C, sizeof(double)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(double* const)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasDgemmBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(double)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -38996,29 +38957,27 @@ int handle_cublasDgemmStridedBatched(void *conn)
     return -1;
 }
 
-int handle_cublasDgemmStridedBatched_64(void *conn)
+int handle_cublasDgemmBatched_64(void *conn)
 {
+    int64_t batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
     int64_t m;
     int64_t n;
     int64_t k;
-    const double* alpha;
-    const double* A;
+    double alpha;
+    const double* * Aarray = nullptr;
     int64_t lda;
-    long long int strideA;
-    const double* B;
+    const double* * Barray = nullptr;
     int64_t ldb;
-    long long int strideB;
-    const double* beta;
-    double C;
+    double beta;
+    double* * Carray = nullptr;
     int64_t ldc;
-    long long int strideC;
-    int64_t batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -39026,27 +38985,22 @@ int handle_cublasDgemmStridedBatched_64(void *conn)
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const double*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const double* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const double* const)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const double*)) < 0 ||
-        rpc_read(conn, &C, sizeof(double)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(double* const)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasDgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(double)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -39055,8 +39009,9 @@ int handle_cublasDgemmStridedBatched_64(void *conn)
     return -1;
 }
 
-int handle_cublasCgemmStridedBatched(void *conn)
+int handle_cublasCgemmBatched(void *conn)
 {
+    int batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
@@ -39064,20 +39019,17 @@ int handle_cublasCgemmStridedBatched(void *conn)
     int n;
     int k;
     const cuComplex* alpha;
-    const cuComplex* A;
+    const cuComplex* * Aarray = nullptr;
     int lda;
-    long long int strideA;
-    const cuComplex* B;
+    const cuComplex* * Barray = nullptr;
     int ldb;
-    long long int strideB;
     const cuComplex* beta;
-    cuComplex C;
+    cuComplex* * Carray = nullptr;
     int ldc;
-    long long int strideC;
-    int batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -39085,27 +39037,22 @@ int handle_cublasCgemmStridedBatched(void *conn)
         rpc_read(conn, &n, sizeof(int)) < 0 ||
         rpc_read(conn, &k, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasCgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasCgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -39114,8 +39061,9 @@ int handle_cublasCgemmStridedBatched(void *conn)
     return -1;
 }
 
-int handle_cublasCgemmStridedBatched_64(void *conn)
+int handle_cublasCgemmBatched_64(void *conn)
 {
+    int64_t batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
@@ -39123,20 +39071,17 @@ int handle_cublasCgemmStridedBatched_64(void *conn)
     int64_t n;
     int64_t k;
     const cuComplex* alpha;
-    const cuComplex* A;
+    const cuComplex* * Aarray = nullptr;
     int64_t lda;
-    long long int strideA;
-    const cuComplex* B;
+    const cuComplex* * Barray = nullptr;
     int64_t ldb;
-    long long int strideB;
     const cuComplex* beta;
-    cuComplex C;
+    cuComplex* * Carray = nullptr;
     int64_t ldc;
-    long long int strideC;
-    int64_t batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -39144,27 +39089,22 @@ int handle_cublasCgemmStridedBatched_64(void *conn)
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasCgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasCgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -39173,8 +39113,9 @@ int handle_cublasCgemmStridedBatched_64(void *conn)
     return -1;
 }
 
-int handle_cublasCgemm3mStridedBatched(void *conn)
+int handle_cublasCgemm3mBatched(void *conn)
 {
+    int batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
@@ -39182,20 +39123,17 @@ int handle_cublasCgemm3mStridedBatched(void *conn)
     int n;
     int k;
     const cuComplex* alpha;
-    const cuComplex* A;
+    const cuComplex* * Aarray = nullptr;
     int lda;
-    long long int strideA;
-    const cuComplex* B;
+    const cuComplex* * Barray = nullptr;
     int ldb;
-    long long int strideB;
     const cuComplex* beta;
-    cuComplex C;
+    cuComplex* * Carray = nullptr;
     int ldc;
-    long long int strideC;
-    int batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -39203,27 +39141,22 @@ int handle_cublasCgemm3mStridedBatched(void *conn)
         rpc_read(conn, &n, sizeof(int)) < 0 ||
         rpc_read(conn, &k, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasCgemm3mStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasCgemm3mBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -39232,8 +39165,9 @@ int handle_cublasCgemm3mStridedBatched(void *conn)
     return -1;
 }
 
-int handle_cublasCgemm3mStridedBatched_64(void *conn)
+int handle_cublasCgemm3mBatched_64(void *conn)
 {
+    int64_t batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
@@ -39241,20 +39175,17 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn)
     int64_t n;
     int64_t k;
     const cuComplex* alpha;
-    const cuComplex* A;
+    const cuComplex* * Aarray = nullptr;
     int64_t lda;
-    long long int strideA;
-    const cuComplex* B;
+    const cuComplex* * Barray = nullptr;
     int64_t ldb;
-    long long int strideB;
     const cuComplex* beta;
-    cuComplex C;
+    cuComplex* * Carray = nullptr;
     int64_t ldc;
-    long long int strideC;
-    int64_t batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
         rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -39262,27 +39193,22 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn)
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasCgemm3mStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasCgemm3mBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -39291,8 +39217,9 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn)
     return -1;
 }
 
-int handle_cublasZgemmStridedBatched(void *conn)
+int handle_cublasZgemmBatched(void *conn)
 {
+    int batchCount;
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
@@ -39300,14 +39227,117 @@ int handle_cublasZgemmStridedBatched(void *conn)
     int n;
     int k;
     const cuDoubleComplex* alpha;
-    const cuDoubleComplex* A;
+    const cuDoubleComplex* * Aarray = nullptr;
+    int lda;
+    const cuDoubleComplex* * Barray = nullptr;
+    int ldb;
+    const cuDoubleComplex* beta;
+    cuDoubleComplex* * Carray = nullptr;
+    int ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &k, sizeof(int)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cublasZgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasZgemmBatched_64(void *conn)
+{
+    int64_t batchCount;
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int64_t m;
+    int64_t n;
+    int64_t k;
+    const cuDoubleComplex* alpha;
+    const cuDoubleComplex* * Aarray = nullptr;
+    int64_t lda;
+    const cuDoubleComplex* * Barray = nullptr;
+    int64_t ldb;
+    const cuDoubleComplex* beta;
+    cuDoubleComplex* * Carray = nullptr;
+    int64_t ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cublasZgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasHgemmStridedBatched(void *conn)
+{
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int m;
+    int n;
+    int k;
+    const __half* alpha;
+    const __half* A;
     int lda;
     long long int strideA;
-    const cuDoubleComplex* B;
+    const __half* B;
     int ldb;
     long long int strideB;
-    const cuDoubleComplex* beta;
-    cuDoubleComplex C;
+    const __half* beta;
+    __half C;
     int ldc;
     long long int strideC;
     int batchCount;
@@ -39320,15 +39350,15 @@ int handle_cublasZgemmStridedBatched(void *conn)
         rpc_read(conn, &m, sizeof(int)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
         rpc_read(conn, &k, sizeof(int)) < 0 ||
-        rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const __half*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const __half*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
         rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &B, sizeof(const __half*)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int)) < 0 ||
         rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 ||
-        rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const __half*)) < 0 ||
+        rpc_read(conn, &C, sizeof(__half)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
         rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
         rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
@@ -39338,10 +39368,10 @@ int handle_cublasZgemmStridedBatched(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasHgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_write(conn, &C, sizeof(__half)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -39350,7 +39380,7 @@ int handle_cublasZgemmStridedBatched(void *conn)
     return -1;
 }
 
-int handle_cublasZgemmStridedBatched_64(void *conn)
+int handle_cublasHgemmStridedBatched_64(void *conn)
 {
     cublasHandle_t handle;
     cublasOperation_t transa;
@@ -39358,15 +39388,15 @@ int handle_cublasZgemmStridedBatched_64(void *conn)
     int64_t m;
     int64_t n;
     int64_t k;
-    const cuDoubleComplex* alpha;
-    const cuDoubleComplex* A;
+    const __half* alpha;
+    const __half* A;
     int64_t lda;
     long long int strideA;
-    const cuDoubleComplex* B;
+    const __half* B;
     int64_t ldb;
     long long int strideB;
-    const cuDoubleComplex* beta;
-    cuDoubleComplex C;
+    const __half* beta;
+    __half C;
     int64_t ldc;
     long long int strideC;
     int64_t batchCount;
@@ -39379,15 +39409,15 @@ int handle_cublasZgemmStridedBatched_64(void *conn)
         rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const __half*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const __half*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &B, sizeof(const __half*)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
-        rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 ||
-        rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const __half*)) < 0 ||
+        rpc_read(conn, &C, sizeof(__half)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
         rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
@@ -39397,10 +39427,10 @@ int handle_cublasZgemmStridedBatched_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+    scuda_intercept_result = cublasHgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_write(conn, &C, sizeof(__half)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -39409,21 +39439,26 @@ int handle_cublasZgemmStridedBatched_64(void *conn)
     return -1;
 }
 
-int handle_cublasSgeam(void *conn)
+int handle_cublasSgemmStridedBatched(void *conn)
 {
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
     int m;
     int n;
+    int k;
     const float* alpha;
     const float* A;
     int lda;
-    const float* beta;
+    long long int strideA;
     const float* B;
     int ldb;
+    long long int strideB;
+    const float* beta;
     float C;
     int ldc;
+    long long int strideC;
+    int batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -39432,21 +39467,26 @@ int handle_cublasSgeam(void *conn)
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &m, sizeof(int)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &k, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const float*)) < 0 ||
         rpc_read(conn, &A, sizeof(const float*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &beta, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
         rpc_read(conn, &B, sizeof(const float*)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const float*)) < 0 ||
         rpc_read(conn, &C, sizeof(float)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+    scuda_intercept_result = cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(float)) < 0 ||
@@ -39458,21 +39498,26 @@ int handle_cublasSgeam(void *conn)
     return -1;
 }
 
-int handle_cublasSgeam_64(void *conn)
+int handle_cublasSgemmStridedBatched_64(void *conn)
 {
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
     int64_t m;
     int64_t n;
+    int64_t k;
     const float* alpha;
     const float* A;
     int64_t lda;
-    const float* beta;
+    long long int strideA;
     const float* B;
     int64_t ldb;
+    long long int strideB;
+    const float* beta;
     float C;
     int64_t ldc;
+    long long int strideC;
+    int64_t batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -39481,21 +39526,26 @@ int handle_cublasSgeam_64(void *conn)
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const float*)) < 0 ||
         rpc_read(conn, &A, sizeof(const float*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &beta, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
         rpc_read(conn, &B, sizeof(const float*)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const float*)) < 0 ||
         rpc_read(conn, &C, sizeof(float)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasSgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+    scuda_intercept_result = cublasSgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(float)) < 0 ||
@@ -39507,21 +39557,26 @@ int handle_cublasSgeam_64(void *conn)
     return -1;
 }
 
-int handle_cublasDgeam(void *conn)
+int handle_cublasDgemmStridedBatched(void *conn)
 {
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
     int m;
     int n;
-    const double* alpha;
-    const double* A;
+    int k;
+    double alpha;
+    double A;
     int lda;
-    const double* beta;
-    const double* B;
+    long long int strideA;
+    double B;
     int ldb;
+    long long int strideB;
+    double beta;
     double C;
     int ldc;
+    long long int strideC;
+    int batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -39530,21 +39585,26 @@ int handle_cublasDgeam(void *conn)
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &m, sizeof(int)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &k, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const double*)) < 0 ||
         rpc_read(conn, &A, sizeof(const double*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &beta, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
         rpc_read(conn, &B, sizeof(const double*)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const double*)) < 0 ||
         rpc_read(conn, &C, sizeof(double)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+    scuda_intercept_result = cublasDgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, &A, lda, strideA, &B, ldb, strideB, &beta, &C, ldc, strideC, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -39556,21 +39616,26 @@ int handle_cublasDgeam(void *conn)
     return -1;
 }
 
-int handle_cublasDgeam_64(void *conn)
+int handle_cublasDgemmStridedBatched_64(void *conn)
 {
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
     int64_t m;
     int64_t n;
-    const double* alpha;
-    const double* A;
+    int64_t k;
+    double alpha;
+    double A;
     int64_t lda;
-    const double* beta;
-    const double* B;
+    long long int strideA;
+    double B;
     int64_t ldb;
+    long long int strideB;
+    double beta;
     double C;
     int64_t ldc;
+    long long int strideC;
+    int64_t batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -39579,21 +39644,26 @@ int handle_cublasDgeam_64(void *conn)
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const double*)) < 0 ||
         rpc_read(conn, &A, sizeof(const double*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &beta, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
         rpc_read(conn, &B, sizeof(const double*)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const double*)) < 0 ||
         rpc_read(conn, &C, sizeof(double)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+    scuda_intercept_result = cublasDgemmStridedBatched_64(handle, transa, transb, m, n, k, &alpha, &A, lda, strideA, &B, ldb, strideB, &beta, &C, ldc, strideC, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(double)) < 0 ||
@@ -39605,21 +39675,26 @@ int handle_cublasDgeam_64(void *conn)
     return -1;
 }
 
-int handle_cublasCgeam(void *conn)
+int handle_cublasCgemmStridedBatched(void *conn)
 {
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
     int m;
     int n;
+    int k;
     const cuComplex* alpha;
     const cuComplex* A;
     int lda;
-    const cuComplex* beta;
+    long long int strideA;
     const cuComplex* B;
     int ldb;
+    long long int strideB;
+    const cuComplex* beta;
     cuComplex C;
     int ldc;
+    long long int strideC;
+    int batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -39628,21 +39703,26 @@ int handle_cublasCgeam(void *conn)
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &m, sizeof(int)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &k, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
         rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
         rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
         rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasCgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+    scuda_intercept_result = cublasCgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
@@ -39654,21 +39734,26 @@ int handle_cublasCgeam(void *conn)
     return -1;
 }
 
-int handle_cublasCgeam_64(void *conn)
+int handle_cublasCgemmStridedBatched_64(void *conn)
 {
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
     int64_t m;
     int64_t n;
+    int64_t k;
     const cuComplex* alpha;
     const cuComplex* A;
     int64_t lda;
-    const cuComplex* beta;
+    long long int strideA;
     const cuComplex* B;
     int64_t ldb;
+    long long int strideB;
+    const cuComplex* beta;
     cuComplex C;
     int64_t ldc;
+    long long int strideC;
+    int64_t batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -39677,21 +39762,144 @@ int handle_cublasCgeam_64(void *conn)
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
         rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cublasCgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasCgemm3mStridedBatched(void *conn)
+{
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int m;
+    int n;
+    int k;
+    const cuComplex* alpha;
+    const cuComplex* A;
+    int lda;
+    long long int strideA;
+    const cuComplex* B;
+    int ldb;
+    long long int strideB;
+    const cuComplex* beta;
+    cuComplex C;
+    int ldc;
+    long long int strideC;
+    int batchCount;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &k, sizeof(int)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
         rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cublasCgemm3mStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasCgemm3mStridedBatched_64(void *conn)
+{
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int64_t m;
+    int64_t n;
+    int64_t k;
+    const cuComplex* alpha;
+    const cuComplex* A;
+    int64_t lda;
+    long long int strideA;
+    const cuComplex* B;
+    int64_t ldb;
+    long long int strideB;
+    const cuComplex* beta;
+    cuComplex C;
+    int64_t ldc;
+    long long int strideC;
+    int64_t batchCount;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
         rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
         rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasCgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+    scuda_intercept_result = cublasCgemm3mStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
@@ -39703,21 +39911,26 @@ int handle_cublasCgeam_64(void *conn)
     return -1;
 }
 
-int handle_cublasZgeam(void *conn)
+int handle_cublasZgemmStridedBatched(void *conn)
 {
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
     int m;
     int n;
+    int k;
     const cuDoubleComplex* alpha;
     const cuDoubleComplex* A;
     int lda;
-    const cuDoubleComplex* beta;
+    long long int strideA;
     const cuDoubleComplex* B;
     int ldb;
+    long long int strideB;
+    const cuDoubleComplex* beta;
     cuDoubleComplex C;
     int ldc;
+    long long int strideC;
+    int batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -39726,21 +39939,26 @@ int handle_cublasZgeam(void *conn)
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &m, sizeof(int)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &k, sizeof(int)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
         rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+    scuda_intercept_result = cublasZgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
@@ -39752,21 +39970,26 @@ int handle_cublasZgeam(void *conn)
     return -1;
 }
 
-int handle_cublasZgeam_64(void *conn)
+int handle_cublasZgemmStridedBatched_64(void *conn)
 {
     cublasHandle_t handle;
     cublasOperation_t transa;
     cublasOperation_t transb;
     int64_t m;
     int64_t n;
+    int64_t k;
     const cuDoubleComplex* alpha;
     const cuDoubleComplex* A;
     int64_t lda;
-    const cuDoubleComplex* beta;
+    long long int strideA;
     const cuDoubleComplex* B;
     int64_t ldb;
+    long long int strideB;
+    const cuDoubleComplex* beta;
     cuDoubleComplex C;
     int64_t ldc;
+    long long int strideC;
+    int64_t batchCount;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
@@ -39775,21 +39998,26 @@ int handle_cublasZgeam_64(void *conn)
         rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
         rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 ||
         rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+    scuda_intercept_result = cublasZgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
@@ -39801,29 +40029,97 @@ int handle_cublasZgeam_64(void *conn)
     return -1;
 }
 
-int handle_cublasSdgmm(void *conn)
+int handle_cublasGemmBatchedEx_64(void *conn)
 {
+    int64_t batchCount;
     cublasHandle_t handle;
-    cublasSideMode_t mode;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int64_t m;
+    int64_t n;
+    int64_t k;
+    const void* alpha;
+    const void* * Aarray = nullptr;
+    cudaDataType Atype;
+    int64_t lda;
+    const void* * Barray = nullptr;
+    cudaDataType Btype;
+    int64_t ldb;
+    const void* beta;
+    void* * Carray = nullptr;
+    cudaDataType Ctype;
+    int64_t ldc;
+    cublasComputeType_t computeType;
+    cublasGemmAlgo_t algo;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const void*)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const void* const)) < 0 ||
+        rpc_read(conn, &Atype, sizeof(cudaDataType)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(const void* const)) < 0 ||
+        rpc_read(conn, &Btype, sizeof(cudaDataType)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const void*)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(void* const)) < 0 ||
+        rpc_read(conn, &Ctype, sizeof(cudaDataType)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &computeType, sizeof(cublasComputeType_t)) < 0 ||
+        rpc_read(conn, &algo, sizeof(cublasGemmAlgo_t)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cublasGemmBatchedEx_64(handle, transa, transb, m, n, k, alpha, Aarray, Atype, lda, Barray, Btype, ldb, beta, Carray, Ctype, ldc, batchCount, computeType, algo);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasSgeam(void *conn)
+{
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
     int m;
     int n;
+    const float* alpha;
     const float* A;
     int lda;
-    const float* x;
-    int incx;
+    const float* beta;
+    const float* B;
+    int ldb;
     float C;
     int ldc;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &m, sizeof(int)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const float*)) < 0 ||
         rpc_read(conn, &A, sizeof(const float*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &x, sizeof(const float*)) < 0 ||
-        rpc_read(conn, &incx, sizeof(int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &B, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
         rpc_read(conn, &C, sizeof(float)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int)) < 0 ||
         false)
@@ -39832,7 +40128,7 @@ int handle_cublasSdgmm(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasSdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+    scuda_intercept_result = cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(float)) < 0 ||
@@ -39844,29 +40140,35 @@ int handle_cublasSdgmm(void *conn)
     return -1;
 }
 
-int handle_cublasSdgmm_64(void *conn)
+int handle_cublasSgeam_64(void *conn)
 {
     cublasHandle_t handle;
-    cublasSideMode_t mode;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
     int64_t m;
     int64_t n;
+    const float* alpha;
     const float* A;
     int64_t lda;
-    const float* x;
-    int64_t incx;
+    const float* beta;
+    const float* B;
+    int64_t ldb;
     float C;
     int64_t ldc;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const float*)) < 0 ||
         rpc_read(conn, &A, sizeof(const float*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &x, sizeof(const float*)) < 0 ||
-        rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &B, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
         rpc_read(conn, &C, sizeof(float)) < 0 ||
         rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
         false)
@@ -39875,7 +40177,7 @@ int handle_cublasSdgmm_64(void *conn)
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasSdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+    scuda_intercept_result = cublasSgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
 
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &C, sizeof(float)) < 0 ||
@@ -39887,41 +40189,1629 @@ int handle_cublasSdgmm_64(void *conn)
     return -1;
 }
 
-int handle_cublasDdgmm(void *conn)
+int handle_cublasDgeam(void *conn)
+{
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int m;
+    int n;
+    double alpha;
+    double A;
+    int lda;
+    double beta;
+    double B;
+    int ldb;
+    double C;
+    int ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &B, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &C, sizeof(double)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cublasDgeam(handle, transa, transb, m, n, &alpha, &A, lda, &beta, &B, ldb, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(double)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasDgeam_64(void *conn)
+{
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int64_t m;
+    int64_t n;
+    double alpha;
+    double A;
+    int64_t lda;
+    double beta;
+    double B;
+    int64_t ldb;
+    double C;
+    int64_t ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &B, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &C, sizeof(double)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cublasDgeam_64(handle, transa, transb, m, n, &alpha, &A, lda, &beta, &B, ldb, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(double)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasCgeam(void *conn)
+{
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int m;
+    int n;
+    const cuComplex* alpha;
+    const cuComplex* A;
+    int lda;
+    const cuComplex* beta;
+    const cuComplex* B;
+    int ldb;
+    cuComplex C;
+    int ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cublasCgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasCgeam_64(void *conn)
+{
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int64_t m;
+    int64_t n;
+    const cuComplex* alpha;
+    const cuComplex* A;
+    int64_t lda;
+    const cuComplex* beta;
+    const cuComplex* B;
+    int64_t ldb;
+    cuComplex C;
+    int64_t ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cublasCgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasZgeam(void *conn)
+{
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int m;
+    int n;
+    const cuDoubleComplex* alpha;
+    const cuDoubleComplex* A;
+    int lda;
+    const cuDoubleComplex* beta;
+    const cuDoubleComplex* B;
+    int ldb;
+    cuDoubleComplex C;
+    int ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cublasZgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasZgeam_64(void *conn) // RPC handler for cublasZgeam_64 (int64_t sizes): read the arguments from conn, run the call, reply with C and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    cublasHandle_t handle;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int64_t m;
+    int64_t n;
+    const cuDoubleComplex* alpha; // pointer arguments (alpha, A, beta, B) arrive as raw pointer values and are forwarded unchanged
+    const cuDoubleComplex* A;
+    int64_t lda;
+    const cuDoubleComplex* beta;
+    const cuDoubleComplex* B;
+    int64_t ldb;
+    cuDoubleComplex C; // one element held in a local; received by value and passed to cuBLAS as &C
+    int64_t ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || // NOTE(review): cuBLAS documents C as a matrix with leading dimension ldc; only one element is transferred here - confirm the annotation
+        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasZgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || // reply payload: the contents of the local C after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasStrsmBatched(void *conn) // RPC handler for cublasStrsmBatched: read the arguments from conn, run the call, reply with the status only.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int batchCount; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasSideMode_t side;
+    cublasFillMode_t uplo;
+    cublasOperation_t trans;
+    cublasDiagType_t diag;
+    int m;
+    int n;
+    const float* alpha; // raw pointer value, forwarded unchanged
+    const float* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int lda;
+    float* * B = nullptr; // same as A: address only
+    int ldb;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const float* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &B, sizeof(float* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasStrsmBatched(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 || // no rpc_write in this reply: nothing but the status goes back
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasStrsmBatched_64(void *conn) // RPC handler for cublasStrsmBatched_64 (int64_t sizes): read the arguments from conn, run the call, reply with the status only.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int64_t batchCount; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasSideMode_t side;
+    cublasFillMode_t uplo;
+    cublasOperation_t trans;
+    cublasDiagType_t diag;
+    int64_t m;
+    int64_t n;
+    const float* alpha; // raw pointer value, forwarded unchanged
+    const float* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int64_t lda;
+    float* * B = nullptr; // same as A: address only
+    int64_t ldb;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const float* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &B, sizeof(float* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasStrsmBatched_64(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 || // no rpc_write in this reply: nothing but the status goes back
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasDtrsmBatched(void *conn) // RPC handler for cublasDtrsmBatched: read the arguments from conn, run the call, reply with the status only.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int batchCount; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasSideMode_t side;
+    cublasFillMode_t uplo;
+    cublasOperation_t trans;
+    cublasDiagType_t diag;
+    int m;
+    int n;
+    double alpha; // held by value in a local; cuBLAS receives &alpha
+    const double* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int lda;
+    double* * B = nullptr; // same as A: address only
+    int ldb;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const double*)) < 0 || // NOTE(review): pointer-sized read into a double; the sizes coincide on common 64-bit ABIs, but confirm the client sends the value rather than the pointer
+        rpc_read(conn, &A, sizeof(const double* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &B, sizeof(double* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasDtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 || // no rpc_write in this reply: nothing but the status goes back
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasDtrsmBatched_64(void *conn) // RPC handler for cublasDtrsmBatched_64 (int64_t sizes): read the arguments from conn, run the call, reply with the status only.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int64_t batchCount; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasSideMode_t side;
+    cublasFillMode_t uplo;
+    cublasOperation_t trans;
+    cublasDiagType_t diag;
+    int64_t m;
+    int64_t n;
+    double alpha; // held by value in a local; cuBLAS receives &alpha
+    const double* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int64_t lda;
+    double* * B = nullptr; // same as A: address only
+    int64_t ldb;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const double*)) < 0 || // NOTE(review): pointer-sized read into a double; the sizes coincide on common 64-bit ABIs, but confirm the client sends the value rather than the pointer
+        rpc_read(conn, &A, sizeof(const double* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &B, sizeof(double* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasDtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 || // no rpc_write in this reply: nothing but the status goes back
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasCtrsmBatched(void *conn) // RPC handler for cublasCtrsmBatched: read the arguments from conn, run the call, reply with the status only.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int batchCount; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasSideMode_t side;
+    cublasFillMode_t uplo;
+    cublasOperation_t trans;
+    cublasDiagType_t diag;
+    int m;
+    int n;
+    const cuComplex* alpha; // raw pointer value, forwarded unchanged
+    const cuComplex* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int lda;
+    cuComplex* * B = nullptr; // same as A: address only
+    int ldb;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &B, sizeof(cuComplex* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasCtrsmBatched(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 || // no rpc_write in this reply: nothing but the status goes back
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasCtrsmBatched_64(void *conn) // RPC handler for cublasCtrsmBatched_64 (int64_t sizes): read the arguments from conn, run the call, reply with the status only.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int64_t batchCount; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasSideMode_t side;
+    cublasFillMode_t uplo;
+    cublasOperation_t trans;
+    cublasDiagType_t diag;
+    int64_t m;
+    int64_t n;
+    const cuComplex* alpha; // raw pointer value, forwarded unchanged
+    const cuComplex* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int64_t lda;
+    cuComplex* * B = nullptr; // same as A: address only
+    int64_t ldb;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &B, sizeof(cuComplex* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasCtrsmBatched_64(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 || // no rpc_write in this reply: nothing but the status goes back
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasZtrsmBatched(void *conn) // RPC handler for cublasZtrsmBatched: read the arguments from conn, run the call, reply with the status only.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int batchCount; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasSideMode_t side;
+    cublasFillMode_t uplo;
+    cublasOperation_t trans;
+    cublasDiagType_t diag;
+    int m;
+    int n;
+    const cuDoubleComplex* alpha; // raw pointer value, forwarded unchanged
+    const cuDoubleComplex* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int lda;
+    cuDoubleComplex* * B = nullptr; // same as A: address only
+    int ldb;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &B, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasZtrsmBatched(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 || // no rpc_write in this reply: nothing but the status goes back
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasZtrsmBatched_64(void *conn) // RPC handler for cublasZtrsmBatched_64 (int64_t sizes): read the arguments from conn, run the call, reply with the status only.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int64_t batchCount; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasSideMode_t side;
+    cublasFillMode_t uplo;
+    cublasOperation_t trans;
+    cublasDiagType_t diag;
+    int64_t m;
+    int64_t n;
+    const cuDoubleComplex* alpha; // raw pointer value, forwarded unchanged
+    const cuDoubleComplex* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int64_t lda;
+    cuDoubleComplex* * B = nullptr; // same as A: address only
+    int64_t ldb;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &B, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasZtrsmBatched_64(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+
+    if (rpc_start_response(conn, request_id) < 0 || // no rpc_write in this reply: nothing but the status goes back
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasSdgmm(void *conn) // RPC handler for cublasSdgmm: read the arguments from conn, run the call, reply with C and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    cublasHandle_t handle;
+    cublasSideMode_t mode;
+    int m;
+    int n;
+    const float* A; // A and x arrive as raw pointer values and are forwarded unchanged
+    int lda;
+    const float* x;
+    int incx;
+    float C; // one element held in a local; received by value and passed to cuBLAS as &C
+    int ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &A, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &x, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &incx, sizeof(int)) < 0 ||
+        rpc_read(conn, &C, sizeof(float)) < 0 || // NOTE(review): cuBLAS documents C as a matrix with leading dimension ldc; only one element is transferred here - confirm the annotation
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasSdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(float)) < 0 || // reply payload: the contents of the local C after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasSdgmm_64(void *conn) // RPC handler for cublasSdgmm_64 (int64_t sizes): read the arguments from conn, run the call, reply with C and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    cublasHandle_t handle;
+    cublasSideMode_t mode;
+    int64_t m;
+    int64_t n;
+    const float* A; // A and x arrive as raw pointer values and are forwarded unchanged
+    int64_t lda;
+    const float* x;
+    int64_t incx;
+    float C; // one element held in a local; received by value and passed to cuBLAS as &C
+    int64_t ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &A, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &x, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &C, sizeof(float)) < 0 || // NOTE(review): cuBLAS documents C as a matrix with leading dimension ldc; only one element is transferred here - confirm the annotation
+        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasSdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(float)) < 0 || // reply payload: the contents of the local C after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasDdgmm(void *conn) // RPC handler for cublasDdgmm: read the arguments from conn, run the call, reply with C and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    cublasHandle_t handle;
+    cublasSideMode_t mode;
+    int m;
+    int n;
+    double A; // held by value in a local; cuBLAS receives &A
+    int lda;
+    double x; // held by value in a local; cuBLAS receives &x
+    int incx;
+    double C; // one element held in a local; received by value and passed to cuBLAS as &C
+    int ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &A, sizeof(const double*)) < 0 || // NOTE(review): pointer-sized read into a double; the sizes coincide on common 64-bit ABIs, but confirm whether the client sends a value or a pointer
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &x, sizeof(const double*)) < 0 || // NOTE(review): same pointer-sized read into a double as for A
+        rpc_read(conn, &incx, sizeof(int)) < 0 ||
+        rpc_read(conn, &C, sizeof(double)) < 0 || // NOTE(review): cuBLAS documents A and C as matrices and x as a vector; one element of each is backed here - confirm the annotations
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasDdgmm(handle, mode, m, n, &A, lda, &x, incx, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(double)) < 0 || // reply payload: the contents of the local C after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasDdgmm_64(void *conn) // RPC handler for cublasDdgmm_64 (int64_t sizes): read the arguments from conn, run the call, reply with C and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    cublasHandle_t handle;
+    cublasSideMode_t mode;
+    int64_t m;
+    int64_t n;
+    double A; // held by value in a local; cuBLAS receives &A
+    int64_t lda;
+    double x; // held by value in a local; cuBLAS receives &x
+    int64_t incx;
+    double C; // one element held in a local; received by value and passed to cuBLAS as &C
+    int64_t ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &A, sizeof(const double*)) < 0 || // NOTE(review): pointer-sized read into a double; the sizes coincide on common 64-bit ABIs, but confirm whether the client sends a value or a pointer
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &x, sizeof(const double*)) < 0 || // NOTE(review): same pointer-sized read into a double as for A
+        rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &C, sizeof(double)) < 0 || // NOTE(review): cuBLAS documents A and C as matrices and x as a vector; one element of each is backed here - confirm the annotations
+        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasDdgmm_64(handle, mode, m, n, &A, lda, &x, incx, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(double)) < 0 || // reply payload: the contents of the local C after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasCdgmm(void *conn) // RPC handler for cublasCdgmm: read the arguments from conn, run the call, reply with C and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    cublasHandle_t handle;
+    cublasSideMode_t mode;
+    int m;
+    int n;
+    const cuComplex* A; // A and x arrive as raw pointer values and are forwarded unchanged
+    int lda;
+    const cuComplex* x;
+    int incx;
+    cuComplex C; // one element held in a local; received by value and passed to cuBLAS as &C
+    int ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &x, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &incx, sizeof(int)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuComplex)) < 0 || // NOTE(review): cuBLAS documents C as a matrix with leading dimension ldc; only one element is transferred here - confirm the annotation
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasCdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(cuComplex)) < 0 || // reply payload: the contents of the local C after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasCdgmm_64(void *conn) // RPC handler for cublasCdgmm_64 (int64_t sizes): read the arguments from conn, run the call, reply with C and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    cublasHandle_t handle;
+    cublasSideMode_t mode;
+    int64_t m;
+    int64_t n;
+    const cuComplex* A; // A and x arrive as raw pointer values and are forwarded unchanged
+    int64_t lda;
+    const cuComplex* x;
+    int64_t incx;
+    cuComplex C; // one element held in a local; received by value and passed to cuBLAS as &C
+    int64_t ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &x, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuComplex)) < 0 || // NOTE(review): cuBLAS documents C as a matrix with leading dimension ldc; only one element is transferred here - confirm the annotation
+        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasCdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(cuComplex)) < 0 || // reply payload: the contents of the local C after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasZdgmm(void *conn) // RPC handler for cublasZdgmm: read the arguments from conn, run the call, reply with C and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    cublasHandle_t handle;
+    cublasSideMode_t mode;
+    int m;
+    int n;
+    const cuDoubleComplex* A; // A and x arrive as raw pointer values and are forwarded unchanged
+    int lda;
+    const cuDoubleComplex* x;
+    int incx;
+    cuDoubleComplex C; // one element held in a local; received by value and passed to cuBLAS as &C
+    int ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &x, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &incx, sizeof(int)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || // NOTE(review): cuBLAS documents C as a matrix with leading dimension ldc; only one element is transferred here - confirm the annotation
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasZdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || // reply payload: the contents of the local C after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasZdgmm_64(void *conn) // RPC handler for cublasZdgmm_64 (int64_t sizes): read the arguments from conn, run the call, reply with C and the status; returns 0 once the reply is sent, -1 if any rpc_* step fails.
 {
     cublasHandle_t handle;
     cublasSideMode_t mode;
+    int64_t m;
+    int64_t n;
+    const cuDoubleComplex* A; // A and x arrive as raw pointer values and are forwarded unchanged
+    int64_t lda;
+    const cuDoubleComplex* x;
+    int64_t incx;
+    cuDoubleComplex C; // one element held in a local; received by value and passed to cuBLAS as &C
+    int64_t ldc;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &x, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || // NOTE(review): cuBLAS documents C as a matrix with leading dimension ldc; only one element is transferred here - confirm the annotation
+        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasZdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); // the status is forwarded below, not checked here
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || // reply payload: the contents of the local C after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasSmatinvBatched(void *conn) // RPC handler for cublasSmatinvBatched: read the arguments from conn, run the call, reply with info and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int batchSize; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    int n;
+    const float* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int lda;
+    float* * Ainv = nullptr; // same as A: address only
+    int lda_inv;
+    int info; // a single int held in a local; received by value and passed to cuBLAS as &info
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &A, sizeof(const float* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &Ainv, sizeof(float* const)) < 0 ||
+        rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 || // NOTE(review): cuBLAS docs appear to describe info as one entry per batch matrix; a single int is backed here - confirm the annotation
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasSmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 || // reply payload: the contents of the local info after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasDmatinvBatched(void *conn) // RPC handler for cublasDmatinvBatched: read the arguments from conn, run the call, reply with info and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int batchSize; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    int n;
+    const double* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int lda;
+    double* * Ainv = nullptr; // same as A: address only
+    int lda_inv;
+    int info; // a single int held in a local; received by value and passed to cuBLAS as &info
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &A, sizeof(const double* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &Ainv, sizeof(double* const)) < 0 ||
+        rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 || // NOTE(review): cuBLAS docs appear to describe info as one entry per batch matrix; a single int is backed here - confirm the annotation
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasDmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 || // reply payload: the contents of the local info after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasCmatinvBatched(void *conn) // RPC handler for cublasCmatinvBatched: read the arguments from conn, run the call, reply with info and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int batchSize; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    int n;
+    const cuComplex* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int lda;
+    cuComplex* * Ainv = nullptr; // same as A: address only
+    int lda_inv;
+    int info; // a single int held in a local; received by value and passed to cuBLAS as &info
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &Ainv, sizeof(cuComplex* const)) < 0 ||
+        rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 || // NOTE(review): cuBLAS docs appear to describe info as one entry per batch matrix; a single int is backed here - confirm the annotation
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasCmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 || // reply payload: the contents of the local info after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasZmatinvBatched(void *conn) // RPC handler for cublasZmatinvBatched: read the arguments from conn, run the call, reply with info and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int batchSize; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    int n;
+    const cuDoubleComplex* * A = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int lda;
+    cuDoubleComplex* * Ainv = nullptr; // same as A: address only
+    int lda_inv;
+    int info; // a single int held in a local; received by value and passed to cuBLAS as &info
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &Ainv, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 || // NOTE(review): cuBLAS docs appear to describe info as one entry per batch matrix; a single int is backed here - confirm the annotation
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasZmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 || // reply payload: the contents of the local info after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasSgeqrfBatched(void *conn) // RPC handler for cublasSgeqrfBatched: read the arguments from conn, run the call, reply with info and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int batchSize; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    int m;
+    int n;
+    float* * Aarray = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int lda;
+    float* * TauArray = nullptr; // same as Aarray: address only
+    int info; // a single int held in a local; received by value and passed to cuBLAS as &info
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(float* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &TauArray, sizeof(float* const)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasSgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 || // reply payload: the contents of the local info after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasDgeqrfBatched(void *conn) // RPC handler for cublasDgeqrfBatched: read the arguments from conn, run the call, reply with info and the status.
+{ // Returns 0 once the reply is sent (the cuBLAS status is forwarded, not checked), -1 if any rpc_* step fails.
+    int batchSize; // first value on the wire, although it is the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    int m;
+    int n;
+    double* * Aarray = nullptr; // only the address of the pointer array is received (one pointer-sized read); its entries are not transferred
+    int lda;
+    double* * TauArray = nullptr; // same as Aarray: address only
+    int info; // a single int held in a local; received by value and passed to cuBLAS as &info
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(double* const)) < 0 || // NOTE(review): presumably the sender supplies an address that is already valid on this side - verify against the client stub
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &TauArray, sizeof(double* const)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
+        false) // constant terminator for the generated || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative request id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasDgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 || // reply payload: the contents of the local info after the call
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cublasCgeqrfBatched(void *conn) // RPC handler: reads one cublasCgeqrfBatched request from conn, runs it, replies with info and the status; returns 0 on success, -1 on any rpc_* failure.
+{
+    int batchSize; // first field read from the request, but the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    int m;
+    int n;
+    cuComplex* * Aarray = nullptr; // receives one pointer-sized value; the array it refers to is not transferred here
+    int lda;
+    cuComplex* * TauArray = nullptr; // same treatment as Aarray: only the pointer value is read
+    int info; // read from the request, passed by address to cuBLAS, sent back in the response
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if ( // arguments are read in exactly this order; the first failing read short-circuits the chain
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(cuComplex* const)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &TauArray, sizeof(cuComplex* const)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
+        false) // neutral terminator for the || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasCgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0: // single failure exit; this function holds nothing that needs releasing
+    return -1;
+}
+
+int handle_cublasZgeqrfBatched(void *conn) // RPC handler: reads one cublasZgeqrfBatched request from conn, runs it, replies with info and the status; returns 0 on success, -1 on any rpc_* failure.
+{
+    int batchSize; // first field read from the request, but the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    int m;
+    int n;
+    cuDoubleComplex* * Aarray = nullptr; // receives one pointer-sized value; the array it refers to is not transferred here
+    int lda;
+    cuDoubleComplex* * TauArray = nullptr; // same treatment as Aarray: only the pointer value is read
+    int info; // read from the request, passed by address to cuBLAS, sent back in the response
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if ( // arguments are read in exactly this order; the first failing read short-circuits the chain
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &TauArray, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
+        false) // neutral terminator for the || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasZgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0: // single failure exit; this function holds nothing that needs releasing
+    return -1;
+}
+
+int handle_cublasSgelsBatched(void *conn) // RPC handler: reads one cublasSgelsBatched request from conn, runs it, replies with info, devInfoArray and the status; returns 0 on success, -1 on any rpc_* failure.
+{
+    int batchSize; // first field read from the request, but the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasOperation_t trans;
+    int m;
+    int n;
+    int nrhs;
+    float* * Aarray = nullptr; // receives one pointer-sized value; the array it refers to is not transferred here
+    int lda;
+    float* * Carray = nullptr; // same treatment as Aarray: only the pointer value is read
+    int ldc;
+    int info; // read from the request, passed by address to cuBLAS, sent back in the response
+    int devInfoArray; // NOTE(review): exchanged as a single int despite the array-like name - confirm against the client stub
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if ( // arguments are read in exactly this order; the first failing read short-circuits the chain
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &nrhs, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(float* const)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(float* const)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
+        rpc_read(conn, &devInfoArray, sizeof(int)) < 0 ||
+        false) // neutral terminator for the || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 || // outputs go back in the same relative order they were read
+        rpc_write(conn, &devInfoArray, sizeof(int)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0: // single failure exit; this function holds nothing that needs releasing
+    return -1;
+}
+
+int handle_cublasDgelsBatched(void *conn) // RPC handler: reads one cublasDgelsBatched request from conn, runs it, replies with info, devInfoArray and the status; returns 0 on success, -1 on any rpc_* failure.
+{
+    int batchSize; // first field read from the request, but the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasOperation_t trans;
+    int m;
+    int n;
+    int nrhs;
+    double* * Aarray = nullptr; // receives one pointer-sized value; the array it refers to is not transferred here
+    int lda;
+    double* * Carray = nullptr; // same treatment as Aarray: only the pointer value is read
+    int ldc;
+    int info; // read from the request, passed by address to cuBLAS, sent back in the response
+    int devInfoArray; // NOTE(review): exchanged as a single int despite the array-like name - confirm against the client stub
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if ( // arguments are read in exactly this order; the first failing read short-circuits the chain
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &nrhs, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(double* const)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(double* const)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
+        rpc_read(conn, &devInfoArray, sizeof(int)) < 0 ||
+        false) // neutral terminator for the || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 || // outputs go back in the same relative order they were read
+        rpc_write(conn, &devInfoArray, sizeof(int)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0: // single failure exit; this function holds nothing that needs releasing
+    return -1;
+}
+
+int handle_cublasCgelsBatched(void *conn) // RPC handler: reads one cublasCgelsBatched request from conn, runs it, replies with info, devInfoArray and the status; returns 0 on success, -1 on any rpc_* failure.
+{
+    int batchSize; // first field read from the request, but the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasOperation_t trans;
+    int m;
+    int n;
+    int nrhs;
+    cuComplex* * Aarray = nullptr; // receives one pointer-sized value; the array it refers to is not transferred here
+    int lda;
+    cuComplex* * Carray = nullptr; // same treatment as Aarray: only the pointer value is read
+    int ldc;
+    int info; // read from the request, passed by address to cuBLAS, sent back in the response
+    int devInfoArray; // NOTE(review): exchanged as a single int despite the array-like name - confirm against the client stub
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if ( // arguments are read in exactly this order; the first failing read short-circuits the chain
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &nrhs, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(cuComplex* const)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
+        rpc_read(conn, &devInfoArray, sizeof(int)) < 0 ||
+        false) // neutral terminator for the || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasCgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 || // outputs go back in the same relative order they were read
+        rpc_write(conn, &devInfoArray, sizeof(int)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0: // single failure exit; this function holds nothing that needs releasing
+    return -1;
+}
+
+int handle_cublasZgelsBatched(void *conn) // RPC handler: reads one cublasZgelsBatched request from conn, runs it, replies with info, devInfoArray and the status; returns 0 on success, -1 on any rpc_* failure.
+{
+    int batchSize; // first field read from the request, but the last argument of the cuBLAS call
+    cublasHandle_t handle;
+    cublasOperation_t trans;
     int m;
     int n;
-    const double* A;
+    int nrhs;
+    cuDoubleComplex* * Aarray = nullptr; // receives one pointer-sized value; the array it refers to is not transferred here
+    int lda;
+    cuDoubleComplex* * Carray = nullptr; // same treatment as Aarray: only the pointer value is read
+    int ldc;
+    int info; // read from the request, passed by address to cuBLAS, sent back in the response
+    int devInfoArray; // NOTE(review): exchanged as a single int despite the array-like name - confirm against the client stub
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if ( // arguments are read in exactly this order; the first failing read short-circuits the chain
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &nrhs, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &Carray, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
+        rpc_read(conn, &devInfoArray, sizeof(int)) < 0 ||
+        false) // neutral terminator for the || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasZgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 || // outputs go back in the same relative order they were read
+        rpc_write(conn, &devInfoArray, sizeof(int)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0: // single failure exit; this function holds nothing that needs releasing
+    return -1;
+}
+
+int handle_cublasStpttr(void *conn) // RPC handler: reads one cublasStpttr request from conn, runs it, replies with A and the status; returns 0 on success, -1 on any rpc_* failure.
+{
+    cublasHandle_t handle;
+    cublasFillMode_t uplo;
+    int n;
+    const float* AP; // pointer value taken from the request and forwarded to cuBLAS unchanged
+    float A; // read as one float, passed by address to cuBLAS, sent back as one float
+    int lda;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if ( // arguments are read in exactly this order; the first failing read short-circuits the chain
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &AP, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &A, sizeof(float)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        false) // neutral terminator for the || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasStpttr(handle, uplo, n, AP, &A, lda);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &A, sizeof(float)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0: // single failure exit; this function holds nothing that needs releasing
+    return -1;
+}
+
+int handle_cublasDtpttr(void *conn) // RPC handler: reads one cublasDtpttr request from conn, runs it, replies with A and the status; returns 0 on success, -1 on any rpc_* failure.
+{
+    cublasHandle_t handle;
+    cublasFillMode_t uplo;
+    int n;
+    const double* AP; // pointer value taken from the request and forwarded unchanged, matching the S/C/Z tpttr handlers
+    double A; // read as one double, passed by address to cuBLAS, sent back as one double
+    int lda;
+    int request_id;
+    cublasStatus_t scuda_intercept_result;
+    if ( // arguments are read in exactly this order; the first failing read short-circuits the chain
+        rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &AP, sizeof(const double*)) < 0 || // pointer-sized read into a pointer-typed variable
+        rpc_read(conn, &A, sizeof(double)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        false) // neutral terminator for the || chain
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0) // a negative id is treated as failure
+        goto ERROR_0;
+    scuda_intercept_result = cublasDtpttr(handle, uplo, n, AP, &A, lda); // pass the received pointer itself, not the address of a local
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &A, sizeof(double)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0) // the cuBLAS status is handed to rpc_end_response
+        goto ERROR_0;
+
+    return 0;
+ERROR_0: // single failure exit; this function holds nothing that needs releasing
+    return -1;
+}
+
+int handle_cublasCtpttr(void *conn)
+{
+    cublasHandle_t handle;
+    cublasFillMode_t uplo;
+    int n;
+    const cuComplex* AP;
+    cuComplex A;
     int lda;
-    const double* x;
-    int incx;
-    double C;
-    int ldc;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
-        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &A, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &AP, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(cuComplex)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &x, sizeof(const double*)) < 0 ||
-        rpc_read(conn, &incx, sizeof(int)) < 0 ||
-        rpc_read(conn, &C, sizeof(double)) < 0 ||
-        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+    scuda_intercept_result = cublasCtpttr(handle, uplo, n, AP, &A, lda);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(double)) < 0 ||
+        rpc_write(conn, &A, sizeof(cuComplex)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -39930,41 +41820,33 @@ int handle_cublasDdgmm(void *conn)
     return -1;
 }
 
-int handle_cublasDdgmm_64(void *conn)
+int handle_cublasZtpttr(void *conn)
 {
     cublasHandle_t handle;
-    cublasSideMode_t mode;
-    int64_t m;
-    int64_t n;
-    const double* A;
-    int64_t lda;
-    const double* x;
-    int64_t incx;
-    double C;
-    int64_t ldc;
+    cublasFillMode_t uplo;
+    int n;
+    const cuDoubleComplex* AP;
+    cuDoubleComplex A;
+    int lda;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
-        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &A, sizeof(const double*)) < 0 ||
-        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &x, sizeof(const double*)) < 0 ||
-        rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &C, sizeof(double)) < 0 ||
-        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &AP, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+    scuda_intercept_result = cublasZtpttr(handle, uplo, n, AP, &A, lda);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(double)) < 0 ||
+        rpc_write(conn, &A, sizeof(cuDoubleComplex)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -39973,41 +41855,33 @@ int handle_cublasDdgmm_64(void *conn)
     return -1;
 }
 
-int handle_cublasCdgmm(void *conn)
+int handle_cublasStrttp(void *conn)
 {
     cublasHandle_t handle;
-    cublasSideMode_t mode;
-    int m;
+    cublasFillMode_t uplo;
     int n;
-    const cuComplex* A;
+    const float* A;
     int lda;
-    const cuComplex* x;
-    int incx;
-    cuComplex C;
-    int ldc;
+    float AP;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
-        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const float*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &x, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &incx, sizeof(int)) < 0 ||
-        rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
-        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &AP, sizeof(float)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasCdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+    scuda_intercept_result = cublasStrttp(handle, uplo, n, A, lda, &AP);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_write(conn, &AP, sizeof(float)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40016,41 +41890,33 @@ int handle_cublasCdgmm(void *conn)
     return -1;
 }
 
-int handle_cublasCdgmm_64(void *conn)
+int handle_cublasDtrttp(void *conn)
 {
     cublasHandle_t handle;
-    cublasSideMode_t mode;
-    int64_t m;
-    int64_t n;
-    const cuComplex* A;
-    int64_t lda;
-    const cuComplex* x;
-    int64_t incx;
-    cuComplex C;
-    int64_t ldc;
+    cublasFillMode_t uplo;
+    int n;
+    double A;
+    int lda;
+    double AP;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
-        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &x, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
-        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
+        rpc_read(conn, &A, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &AP, sizeof(double)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasCdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+    scuda_intercept_result = cublasDtrttp(handle, uplo, n, &A, lda, &AP);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+        rpc_write(conn, &AP, sizeof(double)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40059,41 +41925,33 @@ int handle_cublasCdgmm_64(void *conn)
     return -1;
 }
 
-int handle_cublasZdgmm(void *conn)
+int handle_cublasCtrttp(void *conn)
 {
     cublasHandle_t handle;
-    cublasSideMode_t mode;
-    int m;
+    cublasFillMode_t uplo;
     int n;
-    const cuDoubleComplex* A;
+    const cuComplex* A;
     int lda;
-    const cuDoubleComplex* x;
-    int incx;
-    cuDoubleComplex C;
-    int ldc;
+    cuComplex AP;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
-        rpc_read(conn, &m, sizeof(int)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &x, sizeof(const cuDoubleComplex*)) < 0 ||
-        rpc_read(conn, &incx, sizeof(int)) < 0 ||
-        rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
-        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &AP, sizeof(cuComplex)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+    scuda_intercept_result = cublasCtrttp(handle, uplo, n, A, lda, &AP);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_write(conn, &AP, sizeof(cuComplex)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40102,41 +41960,33 @@ int handle_cublasZdgmm(void *conn)
     return -1;
 }
 
-int handle_cublasZdgmm_64(void *conn)
+int handle_cublasZtrttp(void *conn)
 {
     cublasHandle_t handle;
-    cublasSideMode_t mode;
-    int64_t m;
-    int64_t n;
+    cublasFillMode_t uplo;
+    int n;
     const cuDoubleComplex* A;
-    int64_t lda;
-    const cuDoubleComplex* x;
-    int64_t incx;
-    cuDoubleComplex C;
-    int64_t ldc;
+    int lda;
+    cuDoubleComplex AP;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
-        rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &n, sizeof(int)) < 0 ||
         rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
-        rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &x, sizeof(const cuDoubleComplex*)) < 0 ||
-        rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
-        rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
-        rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+        rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &AP, sizeof(cuDoubleComplex)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc);
+    scuda_intercept_result = cublasZtrttp(handle, uplo, n, A, lda, &AP);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_write(conn, &AP, sizeof(cuDoubleComplex)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40145,33 +41995,39 @@ int handle_cublasZdgmm_64(void *conn)
     return -1;
 }
 
-int handle_cublasStpttr(void *conn)
+int handle_cublasSgetriBatched(void *conn)
 {
+    int batchSize;
     cublasHandle_t handle;
-    cublasFillMode_t uplo;
     int n;
-    const float* AP;
-    float A;
+    const float* * A = nullptr;
     int lda;
+    const int* P;
+    float* * C = nullptr;
+    int ldc;
+    int info;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &AP, sizeof(const float*)) < 0 ||
-        rpc_read(conn, &A, sizeof(float)) < 0 ||
+        rpc_read(conn, &A, sizeof(const float* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &P, sizeof(const int*)) < 0 ||
+        rpc_read(conn, &C, sizeof(float* const)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasStpttr(handle, uplo, n, AP, &A, lda);
+    scuda_intercept_result = cublasSgetriBatched(handle, n, A, lda, P, C, ldc, &info, batchSize);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &A, sizeof(float)) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40180,33 +42036,39 @@ int handle_cublasStpttr(void *conn)
     return -1;
 }
 
-int handle_cublasDtpttr(void *conn)
+int handle_cublasDgetriBatched(void *conn)
 {
+    int batchSize;
     cublasHandle_t handle;
-    cublasFillMode_t uplo;
     int n;
-    const double* AP;
-    double A;
+    const double* * A = nullptr;
     int lda;
+    const int* P;
+    double* * C = nullptr;
+    int ldc;
+    int info;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &AP, sizeof(const double*)) < 0 ||
-        rpc_read(conn, &A, sizeof(double)) < 0 ||
+        rpc_read(conn, &A, sizeof(const double* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &P, sizeof(const int*)) < 0 ||
+        rpc_read(conn, &C, sizeof(double* const)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtpttr(handle, uplo, n, AP, &A, lda);
+    scuda_intercept_result = cublasDgetriBatched(handle, n, A, lda, P, C, ldc, &info, batchSize);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &A, sizeof(double)) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40215,33 +42077,39 @@ int handle_cublasDtpttr(void *conn)
     return -1;
 }
 
-int handle_cublasCtpttr(void *conn)
+int handle_cublasCgetriBatched(void *conn)
 {
+    int batchSize;
     cublasHandle_t handle;
-    cublasFillMode_t uplo;
     int n;
-    const cuComplex* AP;
-    cuComplex A;
+    const cuComplex* * A = nullptr;
     int lda;
+    const int* P;
+    cuComplex* * C = nullptr;
+    int ldc;
+    int info;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &AP, sizeof(const cuComplex*)) < 0 ||
-        rpc_read(conn, &A, sizeof(cuComplex)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &P, sizeof(const int*)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuComplex* const)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasCtpttr(handle, uplo, n, AP, &A, lda);
+    scuda_intercept_result = cublasCgetriBatched(handle, n, A, lda, P, C, ldc, &info, batchSize);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &A, sizeof(cuComplex)) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40250,33 +42118,39 @@ int handle_cublasCtpttr(void *conn)
     return -1;
 }
 
-int handle_cublasZtpttr(void *conn)
+int handle_cublasZgetriBatched(void *conn)
 {
+    int batchSize;
     cublasHandle_t handle;
-    cublasFillMode_t uplo;
     int n;
-    const cuDoubleComplex* AP;
-    cuDoubleComplex A;
+    const cuDoubleComplex* * A = nullptr;
     int lda;
+    const int* P;
+    cuDoubleComplex* * C = nullptr;
+    int ldc;
+    int info;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &AP, sizeof(const cuDoubleComplex*)) < 0 ||
-        rpc_read(conn, &A, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
+        rpc_read(conn, &P, sizeof(const int*)) < 0 ||
+        rpc_read(conn, &C, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZtpttr(handle, uplo, n, AP, &A, lda);
+    scuda_intercept_result = cublasZgetriBatched(handle, n, A, lda, P, C, ldc, &info, batchSize);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &A, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40285,33 +42159,43 @@ int handle_cublasZtpttr(void *conn)
     return -1;
 }
 
-int handle_cublasStrttp(void *conn)
+int handle_cublasSgetrsBatched(void *conn)
 {
+    int batchSize;
     cublasHandle_t handle;
-    cublasFillMode_t uplo;
+    cublasOperation_t trans;
     int n;
-    const float* A;
+    int nrhs;
+    const float* * Aarray = nullptr;
     int lda;
-    float AP;
+    const int* devIpiv;
+    float* * Barray = nullptr;
+    int ldb;
+    int info;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &A, sizeof(const float*)) < 0 ||
+        rpc_read(conn, &nrhs, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &AP, sizeof(float)) < 0 ||
+        rpc_read(conn, &devIpiv, sizeof(const int*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(float* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasStrttp(handle, uplo, n, A, lda, &AP);
+    scuda_intercept_result = cublasSgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, &info, batchSize);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &AP, sizeof(float)) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40320,33 +42204,43 @@ int handle_cublasStrttp(void *conn)
     return -1;
 }
 
-int handle_cublasDtrttp(void *conn)
+int handle_cublasDgetrsBatched(void *conn)
 {
+    int batchSize;
     cublasHandle_t handle;
-    cublasFillMode_t uplo;
+    cublasOperation_t trans;
     int n;
-    const double* A;
+    int nrhs;
+    const double* * Aarray = nullptr;
     int lda;
-    double AP;
+    const int* devIpiv;
+    double* * Barray = nullptr;
+    int ldb;
+    int info;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &A, sizeof(const double*)) < 0 ||
+        rpc_read(conn, &nrhs, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const double* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &AP, sizeof(double)) < 0 ||
+        rpc_read(conn, &devIpiv, sizeof(const int*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(double* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasDtrttp(handle, uplo, n, A, lda, &AP);
+    scuda_intercept_result = cublasDgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, &info, batchSize);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &AP, sizeof(double)) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40355,33 +42249,43 @@ int handle_cublasDtrttp(void *conn)
     return -1;
 }
 
-int handle_cublasCtrttp(void *conn)
+int handle_cublasCgetrsBatched(void *conn)
 {
+    int batchSize;
     cublasHandle_t handle;
-    cublasFillMode_t uplo;
+    cublasOperation_t trans;
     int n;
-    const cuComplex* A;
+    int nrhs;
+    const cuComplex* * Aarray = nullptr;
     int lda;
-    cuComplex AP;
+    const int* devIpiv;
+    cuComplex* * Barray = nullptr;
+    int ldb;
+    int info;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 ||
+        rpc_read(conn, &nrhs, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &AP, sizeof(cuComplex)) < 0 ||
+        rpc_read(conn, &devIpiv, sizeof(const int*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(cuComplex* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasCtrttp(handle, uplo, n, A, lda, &AP);
+    scuda_intercept_result = cublasCgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, &info, batchSize);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &AP, sizeof(cuComplex)) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -40390,33 +42294,43 @@ int handle_cublasCtrttp(void *conn)
     return -1;
 }
 
-int handle_cublasZtrttp(void *conn)
+int handle_cublasZgetrsBatched(void *conn)
 {
+    int batchSize;
     cublasHandle_t handle;
-    cublasFillMode_t uplo;
+    cublasOperation_t trans;
     int n;
-    const cuDoubleComplex* A;
+    int nrhs;
+    const cuDoubleComplex* * Aarray = nullptr;
     int lda;
-    cuDoubleComplex AP;
+    const int* devIpiv;
+    cuDoubleComplex* * Barray = nullptr;
+    int ldb;
+    int info;
     int request_id;
     cublasStatus_t scuda_intercept_result;
     if (
+        rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
         rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-        rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+        rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
         rpc_read(conn, &n, sizeof(int)) < 0 ||
-        rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 ||
+        rpc_read(conn, &nrhs, sizeof(int)) < 0 ||
+        rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex* const)) < 0 ||
         rpc_read(conn, &lda, sizeof(int)) < 0 ||
-        rpc_read(conn, &AP, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_read(conn, &devIpiv, sizeof(const int*)) < 0 ||
+        rpc_read(conn, &Barray, sizeof(cuDoubleComplex* const)) < 0 ||
+        rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+        rpc_read(conn, &info, sizeof(int)) < 0 ||
         false)
         goto ERROR_0;
 
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
-    scuda_intercept_result = cublasZtrttp(handle, uplo, n, A, lda, &AP);
+    scuda_intercept_result = cublasZgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, &info, batchSize);
 
     if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &AP, sizeof(cuDoubleComplex)) < 0 ||
+        rpc_write(conn, &info, sizeof(int)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
         goto ERROR_0;
 
@@ -43904,6 +45818,18 @@ static RequestHandler opHandlers[] = {
     handle_cublasCtrmm_v2_64,
     handle_cublasZtrmm_v2,
     handle_cublasZtrmm_v2_64,
+    handle_cublasHgemmBatched,
+    handle_cublasHgemmBatched_64,
+    handle_cublasSgemmBatched,
+    handle_cublasSgemmBatched_64,
+    handle_cublasDgemmBatched,
+    handle_cublasDgemmBatched_64,
+    handle_cublasCgemmBatched,
+    handle_cublasCgemmBatched_64,
+    handle_cublasCgemm3mBatched,
+    handle_cublasCgemm3mBatched_64,
+    handle_cublasZgemmBatched,
+    handle_cublasZgemmBatched_64,
     handle_cublasHgemmStridedBatched,
     handle_cublasHgemmStridedBatched_64,
     handle_cublasSgemmStridedBatched,
@@ -43916,6 +45842,8 @@ static RequestHandler opHandlers[] = {
     handle_cublasCgemm3mStridedBatched_64,
     handle_cublasZgemmStridedBatched,
     handle_cublasZgemmStridedBatched_64,
+    nullptr,
+    handle_cublasGemmBatchedEx_64,
     handle_cublasSgeam,
     handle_cublasSgeam_64,
     handle_cublasDgeam,
@@ -43924,6 +45852,14 @@ static RequestHandler opHandlers[] = {
     handle_cublasCgeam_64,
     handle_cublasZgeam,
     handle_cublasZgeam_64,
+    handle_cublasStrsmBatched,
+    handle_cublasStrsmBatched_64,
+    handle_cublasDtrsmBatched,
+    handle_cublasDtrsmBatched_64,
+    handle_cublasCtrsmBatched,
+    handle_cublasCtrsmBatched_64,
+    handle_cublasZtrsmBatched,
+    handle_cublasZtrsmBatched_64,
     handle_cublasSdgmm,
     handle_cublasSdgmm_64,
     handle_cublasDdgmm,
@@ -43932,6 +45868,18 @@ static RequestHandler opHandlers[] = {
     handle_cublasCdgmm_64,
     handle_cublasZdgmm,
     handle_cublasZdgmm_64,
+    handle_cublasSmatinvBatched,
+    handle_cublasDmatinvBatched,
+    handle_cublasCmatinvBatched,
+    handle_cublasZmatinvBatched,
+    handle_cublasSgeqrfBatched,
+    handle_cublasDgeqrfBatched,
+    handle_cublasCgeqrfBatched,
+    handle_cublasZgeqrfBatched,
+    handle_cublasSgelsBatched,
+    handle_cublasDgelsBatched,
+    handle_cublasCgelsBatched,
+    handle_cublasZgelsBatched,
     handle_cublasStpttr,
     handle_cublasDtpttr,
     handle_cublasCtpttr,
@@ -43940,6 +45888,14 @@ static RequestHandler opHandlers[] = {
     handle_cublasDtrttp,
     handle_cublasCtrttp,
     handle_cublasZtrttp,
+    handle_cublasSgetriBatched,
+    handle_cublasDgetriBatched,
+    handle_cublasCgetriBatched,
+    handle_cublasZgetriBatched,
+    handle_cublasSgetrsBatched,
+    handle_cublasDgetrsBatched,
+    handle_cublasCgetrsBatched,
+    handle_cublasZgetrsBatched,
     handle_cublasUint8gemmBias,
     nullptr,
     nullptr,
diff --git a/codegen/manual_server.cpp b/codegen/manual_server.cpp
index 2b1305c..cd2b0fa 100755
--- a/codegen/manual_server.cpp
+++ b/codegen/manual_server.cpp
@@ -137,7 +137,7 @@ int handle_cudaMemcpyAsync(void *conn)
             std::cerr << "Failed to allocate host memory for device-to-host transfer." << std::endl;
             return -1;
         }
-
+        
         int request_id = rpc_end_request(conn);
         if (request_id < 0)
         {
diff --git a/local.sh b/local.sh
index 56fe3de..ae0b62d 100755
--- a/local.sh
+++ b/local.sh
@@ -27,6 +27,8 @@ build() {
   nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
   nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
 
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
+
   if [ ! -f "$libscuda_path" ]; then
     echo "libscuda.so not found. build may have failed."
     exit 1
diff --git a/test/cublas_batched.cu b/test/cublas_batched.cu
new file mode 100644
index 0000000..d5e3f92
--- /dev/null
+++ b/test/cublas_batched.cu
@@ -0,0 +1,196 @@
+/*
+ * Copyright 2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#include "cublas_utils.h"
+
+using data_type = double;
+
+int main(int argc, char *argv[]) {
+    cublasHandle_t cublasH = nullptr;
+    cudaStream_t stream = nullptr;
+
+    const int m = 2;
+    const int n = 2;
+    const int k = 2;
+    const int lda = 2;
+    const int ldb = 2;
+    const int ldc = 2;
+    const int batch_count = 2;
+
+    /*
+     *   A = | 1.0 | 2.0 | 5.0 | 6.0 |   two 2x2 batch entries shown side by side;
+     *       | 3.0 | 4.0 | 7.0 | 8.0 |   the initializers below list them column-major
+     *
+     *   B = | 5.0 | 6.0 |  9.0 | 10.0 |
+     *       | 7.0 | 8.0 | 11.0 | 12.0 |
+     */
+
+    const std::vector<std::vector<data_type>> A_array = {{1.0, 3.0, 2.0, 4.0},
+                                                         {5.0, 7.0, 6.0, 8.0}};
+    const std::vector<std::vector<data_type>> B_array = {{5.0, 7.0, 6.0, 8.0},
+                                                         {9.0, 11.0, 10.0, 12.0}};
+    std::vector<std::vector<data_type>> C_array(batch_count, std::vector<data_type>(m * n));
+
+    const data_type alpha = 1.0;
+    const data_type beta = 0.0;
+
+    data_type **d_A_array = nullptr;
+    data_type **d_B_array = nullptr;
+    data_type **d_C_array = nullptr;
+
+    std::vector<data_type *> d_A(batch_count, nullptr);
+    std::vector<data_type *> d_B(batch_count, nullptr);
+    std::vector<data_type *> d_C(batch_count, nullptr);
+
+    cublasOperation_t transa = CUBLAS_OP_N;
+    cublasOperation_t transb = CUBLAS_OP_N;
+
+    printf("A[0]\n");
+    print_matrix(m, k, A_array[0].data(), lda);
+    printf("=====\n");
+
+    printf("A[1]\n");
+    print_matrix(m, k, A_array[1].data(), lda);
+    printf("=====\n");
+
+    printf("B[0]\n");
+    print_matrix(k, n, B_array[0].data(), ldb);
+    printf("=====\n");
+
+    printf("B[1]\n");
+    print_matrix(k, n, B_array[1].data(), ldb);
+    printf("=====\n");
+
+    /* step 1: create cublas handle, bind a stream */
+    CUBLAS_CHECK(cublasCreate(&cublasH));
+
+    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    CUBLAS_CHECK(cublasSetStream(cublasH, stream));
+
+    /* step 2: allocate per-batch matrices and the device-side pointer arrays, then upload */
+    for (int i = 0; i < batch_count; i++) {
+        CUDA_CHECK(
+            cudaMalloc(reinterpret_cast<void **>(&d_A[i]), sizeof(data_type) * A_array[i].size()));
+        CUDA_CHECK(
+            cudaMalloc(reinterpret_cast<void **>(&d_B[i]), sizeof(data_type) * B_array[i].size()));
+        CUDA_CHECK(
+            cudaMalloc(reinterpret_cast<void **>(&d_C[i]), sizeof(data_type) * C_array[i].size()));
+    }
+
+    CUDA_CHECK(
+        cudaMalloc(reinterpret_cast<void **>(&d_A_array), sizeof(data_type *) * batch_count));
+    CUDA_CHECK(
+        cudaMalloc(reinterpret_cast<void **>(&d_B_array), sizeof(data_type *) * batch_count));
+    CUDA_CHECK(
+        cudaMalloc(reinterpret_cast<void **>(&d_C_array), sizeof(data_type *) * batch_count));
+
+    for (int i = 0; i < batch_count; i++) {
+        CUDA_CHECK(cudaMemcpyAsync(d_A[i], A_array[i].data(), sizeof(data_type) * A_array[i].size(),
+                                   cudaMemcpyHostToDevice, stream));
+        CUDA_CHECK(cudaMemcpyAsync(d_B[i], B_array[i].data(), sizeof(data_type) * B_array[i].size(),
+                                   cudaMemcpyHostToDevice, stream));
+    }
+
+    CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type *) * batch_count,
+                               cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type *) * batch_count,
+                               cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type *) * batch_count,
+                               cudaMemcpyHostToDevice, stream));
+
+    /* step 3: compute C[i] = alpha * A[i] * B[i] + beta * C[i] for every batch entry */
+    CUBLAS_CHECK(cublasDgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda,
+                                    d_B_array, ldb, &beta, d_C_array, ldc, batch_count));
+
+    /* step 4: wait for the non-blocking stream (cudaMemcpy will not), then copy data to host */
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    for (int i = 0; i < batch_count; i++) {
+        CUDA_CHECK(cudaMemcpy(C_array[i].data(), d_C[i], sizeof(data_type) * C_array[i].size(),
+                              cudaMemcpyDeviceToHost));
+    }
+
+    /*
+     *   C = | 19.0 | 22.0 | 111.0 | 122.0 |
+     *       | 43.0 | 50.0 | 151.0 | 166.0 |
+     */
+
+    printf("C[0]\n");
+    print_matrix(m, n, C_array[0].data(), ldc);
+    printf("=====\n");
+
+    printf("C[1]\n");
+    print_matrix(m, n, C_array[1].data(), ldc);
+    printf("=====\n");
+
+    /* free resources */
+    CUDA_CHECK(cudaFree(d_A_array));
+    CUDA_CHECK(cudaFree(d_B_array));
+    CUDA_CHECK(cudaFree(d_C_array));
+    for (int i = 0; i < batch_count; i++) {
+        CUDA_CHECK(cudaFree(d_A[i]));
+        CUDA_CHECK(cudaFree(d_B[i]));
+        CUDA_CHECK(cudaFree(d_C[i]));
+    }
+
+    CUBLAS_CHECK(cublasDestroy(cublasH));
+
+    CUDA_CHECK(cudaStreamDestroy(stream));
+
+    CUDA_CHECK(cudaDeviceReset());
+
+    return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/test/cublas_utils.h b/test/cublas_utils.h
new file mode 100644
index 0000000..61b64ea
--- /dev/null
+++ b/test/cublas_utils.h
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <cmath>
+#include <functional>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <string>
+
+#include <cuComplex.h>
+#include <cublas_api.h>
+#include <cuda_runtime_api.h>
+#include <library_types.h>
+
+// CUDA runtime API error checking: evaluates err once, prints code and location, then throws
+#define CUDA_CHECK(err) \
+    do { \
+        const cudaError_t cuda_status_ = (err); \
+        if (cuda_status_ != cudaSuccess) { \
+            std::printf("CUDA error %d at %s:%d\n", cuda_status_, __FILE__, __LINE__); \
+            throw std::runtime_error("CUDA error"); \
+        } \
+    } while (0)
+
+// cuBLAS API error checking: evaluates err once, prints status and location, then throws
+#define CUBLAS_CHECK(err) \
+    do { \
+        const cublasStatus_t cublas_status_ = (err); \
+        if (cublas_status_ != CUBLAS_STATUS_SUCCESS) { \
+            std::printf("cublas error %d at %s:%d\n", cublas_status_, __FILE__, __LINE__); \
+            throw std::runtime_error("cublas error"); \
+        } \
+    } while (0)
+
+// memory alignment: rounds A up to a multiple of B; arguments are parenthesized for safe expansion
+#define ALIGN_TO(A, B) ((((A) + (B) - 1) / (B)) * (B))
+
+// alignment used for device memory pitches (unit not stated here; presumably bytes -- confirm)
+static constexpr size_t device_alignment = 32;
+
+// per-element-type helpers; only the four specializations below are defined
+template <typename T> struct traits;
+
+template <> struct traits<float> {
+    // T: element type, S: underlying real scalar type
+    using T = float;
+    using S = T;
+
+    static constexpr T zero = 0.f;
+    static constexpr cudaDataType cuda_data_type = CUDA_R_32F;
+
+    static inline S abs(T val) { return fabs(val); }
+
+    template <typename RNG> static inline T rand(RNG &gen) { return static_cast<S>(gen()); }
+
+    static inline T add(T a, T b) { return a + b; }
+
+    static inline T mul(T v, double f) { return v * f; }
+};
+
+template <> struct traits<double> {
+    // T: element type, S: underlying real scalar type
+    using T = double;
+    using S = T;
+
+    static constexpr T zero = 0.;
+    static constexpr cudaDataType cuda_data_type = CUDA_R_64F;
+
+    static inline S abs(T val) { return fabs(val); }
+
+    template <typename RNG> static inline T rand(RNG &gen) { return static_cast<S>(gen()); }
+
+    static inline T add(T a, T b) { return a + b; }
+
+    static inline T mul(T v, double f) { return v * f; }
+};
+
+template <> struct traits<cuFloatComplex> {
+    // S: real scalar type of each component, T: complex element type
+    using S = float;
+    using T = cuFloatComplex;
+
+    static constexpr T zero = {0.f, 0.f};
+    static constexpr cudaDataType cuda_data_type = CUDA_C_32F;
+
+    static inline S abs(T val) { return cuCabsf(val); }
+
+    template <typename RNG> static inline T rand(RNG &gen) {
+        return make_cuFloatComplex(static_cast<S>(gen()), static_cast<S>(gen()));
+    }
+
+    static inline T add(T a, T b) { return cuCaddf(a, b); }
+    static inline T add(T a, S b) { return cuCaddf(a, make_cuFloatComplex(b, 0.f)); }
+
+    static inline T mul(T v, double f) { return make_cuFloatComplex(v.x * f, v.y * f); }
+};
+
+template <> struct traits<cuDoubleComplex> {
+    // S: real scalar type of each component, T: complex element type
+    using S = double;
+    using T = cuDoubleComplex;
+
+    static constexpr T zero = {0., 0.};
+    static constexpr cudaDataType cuda_data_type = CUDA_C_64F;
+
+    static inline S abs(T val) { return cuCabs(val); }
+
+    template <typename RNG> static inline T rand(RNG &gen) {
+        return make_cuDoubleComplex(static_cast<S>(gen()), static_cast<S>(gen()));
+    }
+
+    static inline T add(T a, T b) { return cuCadd(a, b); }
+    static inline T add(T a, S b) { return cuCadd(a, make_cuDoubleComplex(b, 0.)); }
+
+    static inline T mul(T v, double f) { return make_cuDoubleComplex(v.x * f, v.y * f); }
+};
+
+// Prints m x n column-major A (leading dim lda) row by row; inline specializations are header-safe.
+template <typename T> void print_matrix(const int &m, const int &n, const T *A, const int &lda);
+template <> inline void print_matrix(const int &m, const int &n, const float *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f ", A[j * lda + i]);
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_matrix(const int &m, const int &n, const double *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f ", A[j * lda + i]);
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_matrix(const int &m, const int &n, const cuComplex *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y);
+        }
+        std::printf("\n");
+    }
+}
+
+template <>
+inline void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y);
+        }
+        std::printf("\n");
+    }
+}
+
+// Prints packed A on an n x n grid: cells in the uplo triangle consume A in order; inline = header-safe.
+template <typename T> void print_packed_matrix(cublasFillMode_t uplo, const int &n, const T *A);
+template <> inline void print_packed_matrix(cublasFillMode_t uplo, const int &n, const float *A) {
+    size_t off = 0;
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) ||
+                (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) {
+                std::printf("%6.2f ", A[off++]);
+            } else if (uplo == CUBLAS_FILL_MODE_UPPER) {
+                std::printf("       ");
+            }
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_packed_matrix(cublasFillMode_t uplo, const int &n, const double *A) {
+    size_t off = 0;
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) ||
+                (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) {
+                std::printf("%6.2f ", A[off++]);
+            } else if (uplo == CUBLAS_FILL_MODE_UPPER) {
+                std::printf("       ");
+            }
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_packed_matrix(cublasFillMode_t uplo, const int &n, const cuComplex *A) {
+    size_t off = 0;
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) ||
+                (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) {
+                std::printf("%6.2f + %6.2fj ", A[off].x, A[off].y);
+                off++;
+            } else if (uplo == CUBLAS_FILL_MODE_UPPER) {
+                std::printf("                 ");
+            }
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_packed_matrix(cublasFillMode_t uplo, const int &n, const cuDoubleComplex *A) {
+    size_t off = 0;
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) ||
+                (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) {
+                std::printf("%6.2f + %6.2fj ", A[off].x, A[off].y);
+                off++;
+            } else if (uplo == CUBLAS_FILL_MODE_UPPER) {
+                std::printf("                 ");
+            }
+        }
+        std::printf("\n");
+    }
+}
+
+// Prints the first m elements of A on one line; specializations are inline so the header is multi-TU safe.
+template <typename T> void print_vector(const int &m, const T *A);
+template <> inline void print_vector(const int &m, const float *A) {
+    for (int i = 0; i < m; i++) {
+        std::printf("%0.2f ", A[i]);
+    }
+    std::printf("\n");
+}
+
+template <> inline void print_vector(const int &m, const double *A) {
+    for (int i = 0; i < m; i++) {
+        std::printf("%0.2f ", A[i]);
+    }
+    std::printf("\n");
+}
+
+template <> inline void print_vector(const int &m, const cuComplex *A) {
+    for (int i = 0; i < m; i++) {
+        std::printf("%0.2f + %0.2fj ", A[i].x, A[i].y);
+    }
+    std::printf("\n");
+}
+
+template <> inline void print_vector(const int &m, const cuDoubleComplex *A) {
+    for (int i = 0; i < m; i++) {
+        std::printf("%0.2f + %0.2fj ", A[i].x, A[i].y);
+    }
+    std::printf("\n");
+}
+
+// Stores in *A a malloc'ed m x n row-major matrix (*lda = n) of uniform [-1, 1) values; not freed here.
+template <typename T> void generate_random_matrix(int m, int n, T **A, int *lda) {
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<typename traits<T>::S> dis(-1.0, 1.0);
+    auto rand_gen = std::bind(dis, gen);
+
+    *lda = n;
+    const size_t row_len = static_cast<size_t>(*lda); // widen first: int m * n could overflow
+    const size_t matrix_mem_size = row_len * m * sizeof(T);
+    // suppress gcc 7 size warning
+    if (matrix_mem_size <= PTRDIFF_MAX)
+        *A = static_cast<T *>(malloc(matrix_mem_size));
+    else
+        throw std::runtime_error("Memory allocation size is too large");
+
+    if (*A == nullptr)
+        throw std::runtime_error("Unable to allocate host matrix");
+
+    // fill the matrix row by row with random values
+    for (int i = 0; i < m; ++i) {
+        T *A_row = (*A) + row_len * i;
+        for (int j = 0; j < n; ++j) {
+            A_row[j] = traits<T>::rand(rand_gen);
+        }
+    }
+}
+
+// For the first min(m, n) rows of A (row stride lda), adds the row's absolute-value sum to its diagonal
+template <typename T> void make_diag_dominant_matrix(int m, int n, T *A, int lda) {
+    const int diag_len = std::min(m, n);
+    for (int row = 0; row < diag_len; ++row) {
+        T *const cur = A + lda * row;
+        typename traits<T>::S abs_total = traits<typename traits<T>::S>::zero;
+        for (int col = 0; col < n; ++col)
+            abs_total += traits<T>::abs(cur[col]);
+        cur[row] = traits<T>::add(cur[row], abs_total);
+    }
+}
+
+// Maps a type name such as "CUDA_R_32F" to its cudaDataType value; throws std::runtime_error for
+// unknown names. Declared inline because it is defined in a header (avoids multiple definitions).
+inline cudaDataType get_cuda_library_type(std::string type_string) {
+    if (type_string == "CUDA_R_16F")
+        return CUDA_R_16F;
+    else if (type_string == "CUDA_C_16F")
+        return CUDA_C_16F;
+    else if (type_string == "CUDA_R_32F")
+        return CUDA_R_32F;
+    else if (type_string == "CUDA_C_32F")
+        return CUDA_C_32F;
+    else if (type_string == "CUDA_R_64F")
+        return CUDA_R_64F;
+    else if (type_string == "CUDA_C_64F")
+        return CUDA_C_64F;
+    else if (type_string == "CUDA_R_8I")
+        return CUDA_R_8I;
+    else if (type_string == "CUDA_C_8I")
+        return CUDA_C_8I;
+    else if (type_string == "CUDA_R_8U")
+        return CUDA_R_8U;
+    else if (type_string == "CUDA_C_8U")
+        return CUDA_C_8U;
+    else if (type_string == "CUDA_R_32I")
+        return CUDA_R_32I;
+    else if (type_string == "CUDA_C_32I")
+        return CUDA_C_32I;
+    else if (type_string == "CUDA_R_32U")
+        return CUDA_R_32U;
+    else if (type_string == "CUDA_C_32U")
+        return CUDA_C_32U;
+    else
+        throw std::runtime_error("Unknown CUDA datatype");
+}
\ No newline at end of file