From cce7cdfca92c0052408dafa480e01d69f64660c3 Mon Sep 17 00:00:00 2001 From: Brodey Newman Date: Thu, 5 Dec 2024 06:55:50 +0000 Subject: [PATCH] chore: batched updates --- codegen/annotations.h | 304 ++-- codegen/codegen.py | 133 +- codegen/gen_api.h | 272 +-- codegen/gen_client.cpp | 1389 ++++++++++++--- codegen/gen_server.cpp | 3388 +++++++++++++++++++++++++++++-------- codegen/manual_server.cpp | 2 +- local.sh | 2 + test/cublas_batched.cu | 196 +++ test/cublas_utils.h | 351 ++++ 9 files changed, 4748 insertions(+), 1289 deletions(-) create mode 100644 test/cublas_batched.cu create mode 100644 test/cublas_utils.h diff --git a/codegen/annotations.h b/codegen/annotations.h index f39ae0b..78e6546 100644 --- a/codegen/annotations.h +++ b/codegen/annotations.h @@ -12141,6 +12141,7 @@ cublasStatus_t cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl */ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, cuDoubleComplex* C, int64_t ldc); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12148,17 +12149,17 @@ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* const 
Aarray[], int lda, const __half* const Barray[], int ldb, const __half* beta, __half* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12166,17 +12167,17 @@ cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t trans * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* const Aarray[], int64_t lda, const __half* const Barray[], int64_t ldb, const __half* beta, __half* const Carray[], int64_t ldc, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12184,17 +12185,17 @@ cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* const Aarray[], int lda, const float* const Barray[], int ldb, const float* beta, float* const 
Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12202,17 +12203,17 @@ cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t trans * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* const Aarray[], int64_t lda, const float* const Barray[], int64_t ldb, const float* beta, float* const Carray[], int64_t ldc, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12220,17 +12221,17 @@ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * 
@param transa SEND_ONLY * @param transb SEND_ONLY @@ -12238,17 +12239,17 @@ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t trans * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* const Aarray[], int64_t lda, const double* const Barray[], int64_t ldb, const double* beta, double* const Carray[], int64_t ldc, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12256,17 +12257,17 @@ cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12274,17 +12275,17 @@ 
cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t trans * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12292,17 +12293,17 @@ cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12310,17 +12311,17 @@ cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, 
cublasOperation_t tra * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12328,17 +12329,17 @@ cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int lda, const cuDoubleComplex* const Barray[], int ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12346,14 +12347,13 @@ cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t trans * @param n 
SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int64_t lda, const cuDoubleComplex* const Barray[], int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int64_t ldc, int64_t batchCount); /** @@ -12609,6 +12609,8 @@ cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_ */ cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* B, int64_t ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc, long long int strideC, int64_t batchCount); /** + * @disabled + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12616,22 +12618,22 @@ cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperati * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param Atype SEND_ONLY * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param Btype SEND_ONLY * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param Ctype SEND_ONLY * @param ldc SEND_ONLY - 
* @param batchCount SEND_ONLY * @param computeType SEND_ONLY * @param algo SEND_ONLY */ cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int lda, const void* const Barray[], cudaDataType Btype, int ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int ldc, int batchCount, cublasComputeType_t computeType, cublasGemmAlgo_t algo); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12639,17 +12641,16 @@ cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, cublasOperation_t tran * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param Atype SEND_ONLY * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param Btype SEND_ONLY * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param Ctype SEND_ONLY * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY * @param computeType SEND_ONLY * @param algo SEND_ONLY */ @@ -12835,6 +12836,7 @@ cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, cubl */ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* beta, const cuDoubleComplex* B, int64_t ldb, cuDoubleComplex* C, int64_t ldc); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12843,14 +12845,14 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, c * @param m SEND_ONLY * @param n SEND_ONLY * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param A SEND_ONLY 
LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* const A[], int lda, float* const B[], int ldb, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12859,14 +12861,14 @@ cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, * @param m SEND_ONLY * @param n SEND_ONLY * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* const A[], int64_t lda, float* const B[], int64_t ldb, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12875,14 +12877,14 @@ cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid * @param m SEND_ONLY * @param n SEND_ONLY * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* const A[], int lda, double* const B[], int ldb, int batchCount); /** + * @param batchCount SEND_ONLY * 
@param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12891,14 +12893,14 @@ cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, * @param m SEND_ONLY * @param n SEND_ONLY * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* const A[], int64_t lda, double* const B[], int64_t ldb, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12907,14 +12909,14 @@ cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid * @param m SEND_ONLY * @param n SEND_ONLY * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* const A[], int lda, cuComplex* const B[], int ldb, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12923,14 +12925,14 @@ cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, * @param m SEND_ONLY * @param n SEND_ONLY * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY 
*/ cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* const A[], int64_t lda, cuComplex* const B[], int64_t ldb, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12939,14 +12941,14 @@ cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid * @param m SEND_ONLY * @param n SEND_ONLY * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const B[], int ldb, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12955,11 +12957,10 @@ cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, * @param m SEND_ONLY * @param n SEND_ONLY * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int64_t lda, cuDoubleComplex* const B[], int64_t ldb, int64_t batchCount); /** @@ -13067,151 +13068,151 @@ cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, 
int m, */ cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* C, int64_t ldc); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Ainv SEND_ONLY + * @param Ainv SEND_ONLY LENGTH:batchSize * @param lda_inv SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float* const A[], int lda, float* const Ainv[], int lda_inv, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Ainv SEND_ONLY + * @param Ainv SEND_ONLY LENGTH:batchSize * @param lda_inv SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, const double* const A[], int lda, double* const Ainv[], int lda_inv, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Ainv SEND_ONLY + * @param Ainv SEND_ONLY LENGTH:batchSize * @param lda_inv SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, cuComplex* const Ainv[], int lda_inv, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Ainv SEND_ONLY + * @param Ainv SEND_ONLY LENGTH:batchSize * @param lda_inv SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ 
cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const Ainv[], int lda_inv, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param TauArray SEND_ONLY + * @param TauArray SEND_ONLY LENGTH:batchSize * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, float* const Aarray[], int lda, float* const TauArray[], int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param TauArray SEND_ONLY + * @param TauArray SEND_ONLY LENGTH:batchSize * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, double* const Aarray[], int lda, double* const TauArray[], int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param TauArray SEND_ONLY + * @param TauArray SEND_ONLY LENGTH:batchSize * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, cuComplex* const Aarray[], int lda, cuComplex* const TauArray[], int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param TauArray SEND_ONLY + * @param TauArray SEND_ONLY LENGTH:batchSize * @param info SEND_RECV - * @param batchSize SEND_ONLY */ 
cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const TauArray[], int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV * @param devInfoArray SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, float* const Aarray[], int lda, float* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV * @param devInfoArray SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, double* const Aarray[], int lda, double* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV * @param devInfoArray SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int 
m, int n, int nrhs, cuComplex* const Aarray[], int lda, cuComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV * @param devInfoArray SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize); /** @@ -13327,107 +13328,107 @@ cublasStatus_t cublasCgetrfBatched(cublasHandle_t handle, int n, cuComplex* cons */ cublasStatus_t cublasZgetrfBatched(cublasHandle_t handle, int n, cuDoubleComplex* const A[], int lda, int* P, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY * @param P SEND_RECV - * @param C SEND_ONLY + * @param C SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n, const float* const A[], int lda, const int* P, float* const C[], int ldc, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY * @param P SEND_RECV - * @param C SEND_ONLY + * @param C SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n, const double* const A[], int lda, const 
int* P, double* const C[], int ldc, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY * @param P SEND_RECV - * @param C SEND_ONLY + * @param C SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, const int* P, cuComplex* const C[], int ldc, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY * @param P SEND_RECV - * @param C SEND_ONLY + * @param C SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, const int* P, cuDoubleComplex* const C[], int ldc, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY * @param devIpiv SEND_RECV - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchSize * @param ldb SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasSgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const float* const Aarray[], int lda, const int* devIpiv, float* const Barray[], int ldb, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY * @param devIpiv SEND_RECV - * @param Barray SEND_ONLY + * 
@param Barray SEND_ONLY LENGTH:batchSize * @param ldb SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const double* const Aarray[], int lda, const int* devIpiv, double* const Barray[], int ldb, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY * @param devIpiv SEND_RECV - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchSize * @param ldb SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuComplex* const Aarray[], int lda, const int* devIpiv, cuComplex* const Barray[], int ldb, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY * @param devIpiv SEND_RECV - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchSize * @param ldb SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuDoubleComplex* const Aarray[], int lda, const int* devIpiv, cuDoubleComplex* const Barray[], int ldb, int* info, int batchSize); /** @@ -13480,29 +13481,6 @@ cublasStatus_t cublasMigrateComputeType(cublasHandle_t handle, cudaDataType_t da * @param algo SEND_ONLY */ cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void* alpha, const void* A, cudaDataType Atype, int lda, const void* B, cudaDataType Btype, int ldb, const void* 
beta, void* C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo); -/** - * @param handle SEND_ONLY - * @param transa SEND_ONLY - * @param transb SEND_ONLY - * @param m SEND_ONLY - * @param n SEND_ONLY - * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray SEND_ONLY - * @param Atype SEND_ONLY - * @param lda SEND_ONLY - * @param Barray SEND_ONLY - * @param Btype SEND_ONLY - * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY - * @param Ctype SEND_ONLY - * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY - * @param computeType SEND_ONLY - * @param algo SEND_ONLY - */ -cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int lda, const void* const Barray[], cudaDataType Btype, int ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int ldc, int batchCount, cudaDataType computeType, cublasGemmAlgo_t algo); /** * @param handle SEND_ONLY * @param transa SEND_ONLY diff --git a/codegen/codegen.py b/codegen/codegen.py index c91ac7a..aa10f9f 100644 --- a/codegen/codegen.py +++ b/codegen/codegen.py @@ -190,10 +190,9 @@ def client_rpc_write(self, f): # array length operations are handled differently than char elif isinstance(self.ptr, Array): f.write( - " rpc_write(0, {param_name}, sizeof({param_type}[{length}])) < 0 ||\n".format( + " rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format( param_name=self.parameter.name, - param_type=self.ptr.format().replace("[]", ""), - length=self.length.name, + param_type=self.parameter.name, ) ) else: @@ -216,7 +215,7 @@ def server_declaration(self) -> str: c = self.ptr.const self.ptr.const = False # const[] isn't a valid part of a variable declaration - s = f" {self.ptr.format().replace("const[]", "")}* {self.parameter.name} = new {self.ptr.format().replace("const[]", "")}[{self.length.name}];\n" + s = 
f" {self.ptr.format().replace("const[]", "")}* {self.parameter.name} = nullptr;\n" self.ptr.const = c else: c = self.ptr.ptr_to.const @@ -230,17 +229,16 @@ def server_rpc_read(self, f): return elif isinstance(self.length, int): f.write( - " rpc_read(conn, {param_name}, {size}) < 0 ||\n".format( + " rpc_read(conn, &{param_name}, {size}) < 0 ||\n".format( param_name=self.parameter.name, size=self.length, ) ) elif isinstance(self.ptr, Array): f.write( - " rpc_read(conn, {param_name}, sizeof({param_type}[{length}])) < 0 ||\n".format( + " rpc_read(conn, &{param_name}, sizeof({param_type})) < 0 ||\n".format( param_name=self.parameter.name, param_type=self.ptr.format().replace("[]", ""), - length=self.length.name, ) ) else: @@ -256,12 +254,6 @@ def server_rpc_read(self, f): ) ) - def server_len_rpc_read(self, f): - f.write(" if (rpc_read(conn, &{length_param}, sizeof(int)) < 0)\n".format( - length_param=self.length.name, - )) - f.write(" return -1;\n") - @property def server_reference(self) -> str: return self.parameter.name @@ -403,12 +395,20 @@ class OpaqueTypeOperation: def client_rpc_write(self, f): if not self.send: return - f.write( - " rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format( - param_name=self.parameter.name, - param_type=self.type_.format(), + elif "const double*" in self.type_.format(): + f.write( + " rpc_write(0, {param_name}, sizeof({param_type})) < 0 ||\n".format( + param_name=self.parameter.name, + param_type=self.type_.format(), + ) + ) + else: + f.write( + " rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format( + param_name=self.parameter.name, + param_type=self.type_.format(), + ) ) - ) @property def server_declaration(self) -> str: @@ -418,7 +418,10 @@ def server_declaration(self) -> str: # but "const cudnnTensorDescriptor_t *xDesc" IS valid. This subtle change carries reprecussions. 
elif "const " in self.type_.format() and not "void" in self.type_.format() and not "*" in self.type_.format(): return f" {self.type_.format().replace("const", "")} {self.parameter.name};\n" - else: return f" {self.type_.format()} {self.parameter.name};\n" + elif "const double*" in self.type_.format(): + return f" double {self.parameter.name};\n" + else: + return f" {self.type_.format()} {self.parameter.name};\n" def server_rpc_read(self, f): if not self.send: @@ -434,6 +437,8 @@ def server_rpc_read(self, f): def server_reference(self) -> str: if self.recv: return f"&{self.parameter.name}" + if "const double*" in self.type_.format(): + return f"&{self.parameter.name}" return self.parameter.name def server_rpc_write(self, f): @@ -703,7 +708,15 @@ def main(): functions_with_annotations: list[tuple[Function, Function, list[Operation]]] = [] + dupes = {} + for function in functions: + # ensure duplicate functions can't be written + if dupes.get(function.name.format()): + continue + + dupes[function.name.format()] = True + try: annotation = next( f for f in annotations.namespace.functions if f.name == function.name @@ -915,14 +928,6 @@ def main(): for function, annotation, operations, disabled in functions_with_annotations: if function.name.format() in MANUAL_IMPLEMENTATIONS or disabled: continue - batched = False - - # not a fan of this, but the batched functions are pretty standard with the flow below. - # batched functions are cublas functions that send pointer arrays where batchCount describes... - # the number of pointers in the arrays. This is non-trivial to generate. 
- if "Batched" in function.name.format(): - batched = True - # parse the annotation doxygen f.write( "int handle_{name}(void *conn)\n".format( @@ -933,70 +938,28 @@ def main(): defers = [] - if batched: - array_batches = [] - non_array_batches = [] - - for operation in operations: - if isinstance(operation, NullTerminatedOperation): - if error := operation.server_rpc_read(f, len(defers)): - defers.append(error) - if isinstance(operation, ArrayOperation): - array_batches.append(operation) - if not isinstance(operation, ArrayOperation): - non_array_batches.append(operation) - - # print our normal operations the same - for operation in operations: - if operation not in array_batches: - f.write(operation.server_declaration) - - # do something with array batches - if len(array_batches) > 0 and hasattr(array_batches[0], "server_len_rpc_read"): - array_batches[0].server_len_rpc_read(f) - - # pop here, because we already accounted for the batchCount integer - non_array_batches.pop(0) - - for op in array_batches: - f.write(op.server_declaration) - - f.write(" int request_id;\n") - if function.return_type.format() != "void": - f.write(" {return_type} scuda_intercept_result;\n".format(return_type=function.return_type.format())) - else: - f.write(" void* scuda_intercept_result;\n".format(return_type=function.return_type.format())) + for operation in operations: + f.write(operation.server_declaration) - f.write(" if (\n") - for operation in operations: - operation.server_rpc_read(f) - f.write(" false)\n") - f.write(" goto ERROR_{index};\n".format(index=len(defers))) + f.write(" int request_id;\n") - f.write("\n") + # we only generate return from non-void types + if function.return_type.format() != "void": + f.write(" {return_type} scuda_intercept_result;\n".format(return_type=function.return_type.format())) else: - for operation in operations: - f.write(operation.server_declaration) - - f.write(" int request_id;\n") + f.write(" void* 
scuda_intercept_result;\n".format(return_type=function.return_type.format())) - # we only generate return from non-void types - if function.return_type.format() != "void": - f.write(" {return_type} scuda_intercept_result;\n".format(return_type=function.return_type.format())) + f.write(" if (\n") + for operation in operations: + if isinstance(operation, NullTerminatedOperation): + if error := operation.server_rpc_read(f, len(defers)): + defers.append(error) else: - f.write(" void* scuda_intercept_result;\n".format(return_type=function.return_type.format())) - - f.write(" if (\n") - for operation in operations: - if isinstance(operation, NullTerminatedOperation): - if error := operation.server_rpc_read(f, len(defers)): - defers.append(error) - else: - operation.server_rpc_read(f) - f.write(" false)\n") - f.write(" goto ERROR_{index};\n".format(index=len(defers))) + operation.server_rpc_read(f) + f.write(" false)\n") + f.write(" goto ERROR_{index};\n".format(index=len(defers))) - f.write("\n") + f.write("\n") f.write( " request_id = rpc_end_request(conn);\n".format( diff --git a/codegen/gen_api.h b/codegen/gen_api.h index 8fb6410..2545cf2 100644 --- a/codegen/gen_api.h +++ b/codegen/gen_api.h @@ -1252,118 +1252,160 @@ #define RPC_cublasCtrmm_v2_64 1251 #define RPC_cublasZtrmm_v2 1252 #define RPC_cublasZtrmm_v2_64 1253 -#define RPC_cublasHgemmStridedBatched 1254 -#define RPC_cublasHgemmStridedBatched_64 1255 -#define RPC_cublasSgemmStridedBatched 1256 -#define RPC_cublasSgemmStridedBatched_64 1257 -#define RPC_cublasDgemmStridedBatched 1258 -#define RPC_cublasDgemmStridedBatched_64 1259 -#define RPC_cublasCgemmStridedBatched 1260 -#define RPC_cublasCgemmStridedBatched_64 1261 -#define RPC_cublasCgemm3mStridedBatched 1262 -#define RPC_cublasCgemm3mStridedBatched_64 1263 -#define RPC_cublasZgemmStridedBatched 1264 -#define RPC_cublasZgemmStridedBatched_64 1265 -#define RPC_cublasSgeam 1266 -#define RPC_cublasSgeam_64 1267 -#define RPC_cublasDgeam 1268 -#define 
RPC_cublasDgeam_64 1269 -#define RPC_cublasCgeam 1270 -#define RPC_cublasCgeam_64 1271 -#define RPC_cublasZgeam 1272 -#define RPC_cublasZgeam_64 1273 -#define RPC_cublasSdgmm 1274 -#define RPC_cublasSdgmm_64 1275 -#define RPC_cublasDdgmm 1276 -#define RPC_cublasDdgmm_64 1277 -#define RPC_cublasCdgmm 1278 -#define RPC_cublasCdgmm_64 1279 -#define RPC_cublasZdgmm 1280 -#define RPC_cublasZdgmm_64 1281 -#define RPC_cublasStpttr 1282 -#define RPC_cublasDtpttr 1283 -#define RPC_cublasCtpttr 1284 -#define RPC_cublasZtpttr 1285 -#define RPC_cublasStrttp 1286 -#define RPC_cublasDtrttp 1287 -#define RPC_cublasCtrttp 1288 -#define RPC_cublasZtrttp 1289 -#define RPC_cublasUint8gemmBias 1290 -#define RPC_cublasMigrateComputeType 1291 -#define RPC_cudnnGetVersion 1292 -#define RPC_cudnnGetMaxDeviceVersion 1293 -#define RPC_cudnnGetCudartVersion 1294 -#define RPC_cudnnGetErrorString 1295 -#define RPC_cudnnGetLastErrorString 1296 -#define RPC_cudnnQueryRuntimeError 1297 -#define RPC_cudnnGetProperty 1298 -#define RPC_cudnnCreate 1299 -#define RPC_cudnnDestroy 1300 -#define RPC_cudnnSetStream 1301 -#define RPC_cudnnGetStream 1302 -#define RPC_cudnnGetCallback 1303 -#define RPC_cudnnGraphVersionCheck 1304 -#define RPC_cudnnBackendCreateDescriptor 1305 -#define RPC_cudnnBackendDestroyDescriptor 1306 -#define RPC_cudnnBackendInitialize 1307 -#define RPC_cudnnBackendFinalize 1308 -#define RPC_cudnnBackendSetAttribute 1309 -#define RPC_cudnnBackendExecute 1310 -#define RPC_cudnnBackendPopulateCudaGraph 1311 -#define RPC_cudnnBackendUpdateCudaGraph 1312 -#define RPC_cudnnCreateTensorDescriptor 1313 -#define RPC_cudnnSetTensor4dDescriptor 1314 -#define RPC_cudnnSetTensor4dDescriptorEx 1315 -#define RPC_cudnnGetTensor4dDescriptor 1316 -#define RPC_cudnnGetTensorSizeInBytes 1317 -#define RPC_cudnnDestroyTensorDescriptor 1318 -#define RPC_cudnnInitTransformDest 1319 -#define RPC_cudnnCreateTensorTransformDescriptor 1320 -#define RPC_cudnnDestroyTensorTransformDescriptor 1321 -#define 
RPC_cudnnCreateOpTensorDescriptor 1322 -#define RPC_cudnnSetOpTensorDescriptor 1323 -#define RPC_cudnnGetOpTensorDescriptor 1324 -#define RPC_cudnnDestroyOpTensorDescriptor 1325 -#define RPC_cudnnCreateReduceTensorDescriptor 1326 -#define RPC_cudnnSetReduceTensorDescriptor 1327 -#define RPC_cudnnGetReduceTensorDescriptor 1328 -#define RPC_cudnnDestroyReduceTensorDescriptor 1329 -#define RPC_cudnnGetReductionIndicesSize 1330 -#define RPC_cudnnGetReductionWorkspaceSize 1331 -#define RPC_cudnnCreateFilterDescriptor 1332 -#define RPC_cudnnSetFilter4dDescriptor 1333 -#define RPC_cudnnGetFilter4dDescriptor 1334 -#define RPC_cudnnGetFilterSizeInBytes 1335 -#define RPC_cudnnDestroyFilterDescriptor 1336 -#define RPC_cudnnCreatePoolingDescriptor 1337 -#define RPC_cudnnSetPooling2dDescriptor 1338 -#define RPC_cudnnGetPooling2dDescriptor 1339 -#define RPC_cudnnGetPooling2dForwardOutputDim 1340 -#define RPC_cudnnDestroyPoolingDescriptor 1341 -#define RPC_cudnnCreateActivationDescriptor 1342 -#define RPC_cudnnSetActivationDescriptor 1343 -#define RPC_cudnnGetActivationDescriptor 1344 -#define RPC_cudnnSetActivationDescriptorSwishBeta 1345 -#define RPC_cudnnGetActivationDescriptorSwishBeta 1346 -#define RPC_cudnnDestroyActivationDescriptor 1347 -#define RPC_cudnnActivationForward 1348 -#define RPC_cudnnCreateLRNDescriptor 1349 -#define RPC_cudnnSetLRNDescriptor 1350 -#define RPC_cudnnGetLRNDescriptor 1351 -#define RPC_cudnnDestroyLRNDescriptor 1352 -#define RPC_cudnnDeriveBNTensorDescriptor 1353 -#define RPC_cudnnDeriveNormTensorDescriptor 1354 -#define RPC_cudnnCreateSpatialTransformerDescriptor 1355 -#define RPC_cudnnDestroySpatialTransformerDescriptor 1356 -#define RPC_cudnnCreateDropoutDescriptor 1357 -#define RPC_cudnnDestroyDropoutDescriptor 1358 -#define RPC_cudnnDropoutGetStatesSize 1359 -#define RPC_cudnnDropoutGetReserveSpaceSize 1360 -#define RPC_cudnnGetDropoutDescriptor 1361 -#define RPC_cudnnOpsVersionCheck 1362 -#define 
RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize 1363 -#define RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize 1364 -#define RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize 1365 -#define RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize 1366 -#define RPC_cudnnGetNormalizationBackwardWorkspaceSize 1367 -#define RPC_cudnnGetNormalizationTrainingReserveSpaceSize 1368 +#define RPC_cublasHgemmBatched 1254 +#define RPC_cublasHgemmBatched_64 1255 +#define RPC_cublasSgemmBatched 1256 +#define RPC_cublasSgemmBatched_64 1257 +#define RPC_cublasDgemmBatched 1258 +#define RPC_cublasDgemmBatched_64 1259 +#define RPC_cublasCgemmBatched 1260 +#define RPC_cublasCgemmBatched_64 1261 +#define RPC_cublasCgemm3mBatched 1262 +#define RPC_cublasCgemm3mBatched_64 1263 +#define RPC_cublasZgemmBatched 1264 +#define RPC_cublasZgemmBatched_64 1265 +#define RPC_cublasHgemmStridedBatched 1266 +#define RPC_cublasHgemmStridedBatched_64 1267 +#define RPC_cublasSgemmStridedBatched 1268 +#define RPC_cublasSgemmStridedBatched_64 1269 +#define RPC_cublasDgemmStridedBatched 1270 +#define RPC_cublasDgemmStridedBatched_64 1271 +#define RPC_cublasCgemmStridedBatched 1272 +#define RPC_cublasCgemmStridedBatched_64 1273 +#define RPC_cublasCgemm3mStridedBatched 1274 +#define RPC_cublasCgemm3mStridedBatched_64 1275 +#define RPC_cublasZgemmStridedBatched 1276 +#define RPC_cublasZgemmStridedBatched_64 1277 +#define RPC_cublasGemmBatchedEx 1278 +#define RPC_cublasGemmBatchedEx_64 1279 +#define RPC_cublasSgeam 1280 +#define RPC_cublasSgeam_64 1281 +#define RPC_cublasDgeam 1282 +#define RPC_cublasDgeam_64 1283 +#define RPC_cublasCgeam 1284 +#define RPC_cublasCgeam_64 1285 +#define RPC_cublasZgeam 1286 +#define RPC_cublasZgeam_64 1287 +#define RPC_cublasStrsmBatched 1288 +#define RPC_cublasStrsmBatched_64 1289 +#define RPC_cublasDtrsmBatched 1290 +#define RPC_cublasDtrsmBatched_64 1291 +#define RPC_cublasCtrsmBatched 1292 +#define RPC_cublasCtrsmBatched_64 1293 +#define 
RPC_cublasZtrsmBatched 1294 +#define RPC_cublasZtrsmBatched_64 1295 +#define RPC_cublasSdgmm 1296 +#define RPC_cublasSdgmm_64 1297 +#define RPC_cublasDdgmm 1298 +#define RPC_cublasDdgmm_64 1299 +#define RPC_cublasCdgmm 1300 +#define RPC_cublasCdgmm_64 1301 +#define RPC_cublasZdgmm 1302 +#define RPC_cublasZdgmm_64 1303 +#define RPC_cublasSmatinvBatched 1304 +#define RPC_cublasDmatinvBatched 1305 +#define RPC_cublasCmatinvBatched 1306 +#define RPC_cublasZmatinvBatched 1307 +#define RPC_cublasSgeqrfBatched 1308 +#define RPC_cublasDgeqrfBatched 1309 +#define RPC_cublasCgeqrfBatched 1310 +#define RPC_cublasZgeqrfBatched 1311 +#define RPC_cublasSgelsBatched 1312 +#define RPC_cublasDgelsBatched 1313 +#define RPC_cublasCgelsBatched 1314 +#define RPC_cublasZgelsBatched 1315 +#define RPC_cublasStpttr 1316 +#define RPC_cublasDtpttr 1317 +#define RPC_cublasCtpttr 1318 +#define RPC_cublasZtpttr 1319 +#define RPC_cublasStrttp 1320 +#define RPC_cublasDtrttp 1321 +#define RPC_cublasCtrttp 1322 +#define RPC_cublasZtrttp 1323 +#define RPC_cublasSgetriBatched 1324 +#define RPC_cublasDgetriBatched 1325 +#define RPC_cublasCgetriBatched 1326 +#define RPC_cublasZgetriBatched 1327 +#define RPC_cublasSgetrsBatched 1328 +#define RPC_cublasDgetrsBatched 1329 +#define RPC_cublasCgetrsBatched 1330 +#define RPC_cublasZgetrsBatched 1331 +#define RPC_cublasUint8gemmBias 1332 +#define RPC_cublasMigrateComputeType 1333 +#define RPC_cudnnGetVersion 1334 +#define RPC_cudnnGetMaxDeviceVersion 1335 +#define RPC_cudnnGetCudartVersion 1336 +#define RPC_cudnnGetErrorString 1337 +#define RPC_cudnnGetLastErrorString 1338 +#define RPC_cudnnQueryRuntimeError 1339 +#define RPC_cudnnGetProperty 1340 +#define RPC_cudnnCreate 1341 +#define RPC_cudnnDestroy 1342 +#define RPC_cudnnSetStream 1343 +#define RPC_cudnnGetStream 1344 +#define RPC_cudnnGetCallback 1345 +#define RPC_cudnnGraphVersionCheck 1346 +#define RPC_cudnnBackendCreateDescriptor 1347 +#define RPC_cudnnBackendDestroyDescriptor 1348 +#define 
RPC_cudnnBackendInitialize 1349 +#define RPC_cudnnBackendFinalize 1350 +#define RPC_cudnnBackendSetAttribute 1351 +#define RPC_cudnnBackendExecute 1352 +#define RPC_cudnnBackendPopulateCudaGraph 1353 +#define RPC_cudnnBackendUpdateCudaGraph 1354 +#define RPC_cudnnCreateTensorDescriptor 1355 +#define RPC_cudnnSetTensor4dDescriptor 1356 +#define RPC_cudnnSetTensor4dDescriptorEx 1357 +#define RPC_cudnnGetTensor4dDescriptor 1358 +#define RPC_cudnnGetTensorSizeInBytes 1359 +#define RPC_cudnnDestroyTensorDescriptor 1360 +#define RPC_cudnnInitTransformDest 1361 +#define RPC_cudnnCreateTensorTransformDescriptor 1362 +#define RPC_cudnnDestroyTensorTransformDescriptor 1363 +#define RPC_cudnnCreateOpTensorDescriptor 1364 +#define RPC_cudnnSetOpTensorDescriptor 1365 +#define RPC_cudnnGetOpTensorDescriptor 1366 +#define RPC_cudnnDestroyOpTensorDescriptor 1367 +#define RPC_cudnnCreateReduceTensorDescriptor 1368 +#define RPC_cudnnSetReduceTensorDescriptor 1369 +#define RPC_cudnnGetReduceTensorDescriptor 1370 +#define RPC_cudnnDestroyReduceTensorDescriptor 1371 +#define RPC_cudnnGetReductionIndicesSize 1372 +#define RPC_cudnnGetReductionWorkspaceSize 1373 +#define RPC_cudnnCreateFilterDescriptor 1374 +#define RPC_cudnnSetFilter4dDescriptor 1375 +#define RPC_cudnnGetFilter4dDescriptor 1376 +#define RPC_cudnnGetFilterSizeInBytes 1377 +#define RPC_cudnnDestroyFilterDescriptor 1378 +#define RPC_cudnnCreatePoolingDescriptor 1379 +#define RPC_cudnnSetPooling2dDescriptor 1380 +#define RPC_cudnnGetPooling2dDescriptor 1381 +#define RPC_cudnnGetPooling2dForwardOutputDim 1382 +#define RPC_cudnnDestroyPoolingDescriptor 1383 +#define RPC_cudnnCreateActivationDescriptor 1384 +#define RPC_cudnnSetActivationDescriptor 1385 +#define RPC_cudnnGetActivationDescriptor 1386 +#define RPC_cudnnSetActivationDescriptorSwishBeta 1387 +#define RPC_cudnnGetActivationDescriptorSwishBeta 1388 +#define RPC_cudnnDestroyActivationDescriptor 1389 +#define RPC_cudnnActivationForward 1390 +#define 
RPC_cudnnCreateLRNDescriptor 1391 +#define RPC_cudnnSetLRNDescriptor 1392 +#define RPC_cudnnGetLRNDescriptor 1393 +#define RPC_cudnnDestroyLRNDescriptor 1394 +#define RPC_cudnnDeriveBNTensorDescriptor 1395 +#define RPC_cudnnDeriveNormTensorDescriptor 1396 +#define RPC_cudnnCreateSpatialTransformerDescriptor 1397 +#define RPC_cudnnDestroySpatialTransformerDescriptor 1398 +#define RPC_cudnnCreateDropoutDescriptor 1399 +#define RPC_cudnnDestroyDropoutDescriptor 1400 +#define RPC_cudnnDropoutGetStatesSize 1401 +#define RPC_cudnnDropoutGetReserveSpaceSize 1402 +#define RPC_cudnnGetDropoutDescriptor 1403 +#define RPC_cudnnOpsVersionCheck 1404 +#define RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize 1405 +#define RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize 1406 +#define RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize 1407 +#define RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize 1408 +#define RPC_cudnnGetNormalizationBackwardWorkspaceSize 1409 +#define RPC_cudnnGetNormalizationTrainingReserveSpaceSize 1410 diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp index b6324f9..4588260 100644 --- a/codegen/gen_client.cpp +++ b/codegen/gen_client.cpp @@ -11361,7 +11361,7 @@ cublasStatus_t cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int if (rpc_start_request(0, RPC_cublasDnrm2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, result, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -11377,7 +11377,7 @@ cublasStatus_t cublasDnrm2_v2_64(cublasHandle_t handle, int64_t n, const double* if (rpc_start_request(0, RPC_cublasDnrm2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const 
double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, result, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -11493,9 +11493,9 @@ cublasStatus_t cublasDdot_v2(cublasHandle_t handle, int n, const double* x, int if (rpc_start_request(0, RPC_cublasDdot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || - rpc_write(0, &y, sizeof(const double*)) < 0 || + rpc_write(0, y, sizeof(const double*)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_write(0, result, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -11511,9 +11511,9 @@ cublasStatus_t cublasDdot_v2_64(cublasHandle_t handle, int64_t n, const double* if (rpc_start_request(0, RPC_cublasDdot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || - rpc_write(0, &y, sizeof(const double*)) < 0 || + rpc_write(0, y, sizeof(const double*)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || rpc_write(0, result, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -11705,7 +11705,7 @@ cublasStatus_t cublasDscal_v2(cublasHandle_t handle, int n, const double* alpha, if (rpc_start_request(0, RPC_cublasDscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -11721,7 +11721,7 @@ cublasStatus_t cublasDscal_v2_64(cublasHandle_t handle, int64_t n, const double* if (rpc_start_request(0, RPC_cublasDscal_v2_64) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -11833,7 +11833,7 @@ cublasStatus_t cublasZdscal_v2(cublasHandle_t handle, int n, const double* alpha if (rpc_start_request(0, RPC_cublasZdscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || rpc_write(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -11849,7 +11849,7 @@ cublasStatus_t cublasZdscal_v2_64(cublasHandle_t handle, int64_t n, const double if (rpc_start_request(0, RPC_cublasZdscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || rpc_write(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -11901,8 +11901,8 @@ cublasStatus_t cublasDaxpy_v2(cublasHandle_t handle, int n, const double* alpha, if (rpc_start_request(0, RPC_cublasDaxpy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || @@ -11919,8 +11919,8 @@ cublasStatus_t cublasDaxpy_v2_64(cublasHandle_t handle, int64_t n, const double* if (rpc_start_request(0, RPC_cublasDaxpy_v2_64) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || @@ -12043,7 +12043,7 @@ cublasStatus_t cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int if (rpc_start_request(0, RPC_cublasDcopy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || @@ -12060,7 +12060,7 @@ cublasStatus_t cublasDcopy_v2_64(cublasHandle_t handle, int64_t n, const double* if (rpc_start_request(0, RPC_cublasDcopy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || @@ -12321,7 +12321,7 @@ cublasStatus_t cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, in if (rpc_start_request(0, RPC_cublasIdamax_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, result, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -12337,7 +12337,7 @@ cublasStatus_t cublasIdamax_v2_64(cublasHandle_t handle, int64_t n, const double if (rpc_start_request(0, RPC_cublasIdamax_v2_64) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, result, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -12483,7 +12483,7 @@ cublasStatus_t cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, in if (rpc_start_request(0, RPC_cublasIdamin_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, result, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -12499,7 +12499,7 @@ cublasStatus_t cublasIdamin_v2_64(cublasHandle_t handle, int64_t n, const double if (rpc_start_request(0, RPC_cublasIdamin_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, result, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -12645,7 +12645,7 @@ cublasStatus_t cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int if (rpc_start_request(0, RPC_cublasDasum_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, result, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -12661,7 +12661,7 @@ cublasStatus_t cublasDasum_v2_64(cublasHandle_t handle, int64_t n, const double* if (rpc_start_request(0, RPC_cublasDasum_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, 
x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, result, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -12785,8 +12785,8 @@ cublasStatus_t cublasDrot_v2(cublasHandle_t handle, int n, double* x, int incx, rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || - rpc_write(0, &c, sizeof(const double*)) < 0 || - rpc_write(0, &s, sizeof(const double*)) < 0 || + rpc_write(0, c, sizeof(const double*)) < 0 || + rpc_write(0, s, sizeof(const double*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, x, sizeof(double)) < 0 || rpc_read(0, y, sizeof(double)) < 0 || @@ -12805,8 +12805,8 @@ cublasStatus_t cublasDrot_v2_64(cublasHandle_t handle, int64_t n, double* x, int rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || - rpc_write(0, &c, sizeof(const double*)) < 0 || - rpc_write(0, &s, sizeof(const double*)) < 0 || + rpc_write(0, c, sizeof(const double*)) < 0 || + rpc_write(0, s, sizeof(const double*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, x, sizeof(double)) < 0 || rpc_read(0, y, sizeof(double)) < 0 || @@ -12905,7 +12905,7 @@ cublasStatus_t cublasZrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, i rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || - rpc_write(0, &c, sizeof(const double*)) < 0 || + rpc_write(0, c, sizeof(const double*)) < 0 || rpc_write(0, &s, sizeof(const cuDoubleComplex*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || @@ -12925,7 +12925,7 @@ cublasStatus_t cublasZrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleComple rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || - rpc_write(0, &c, sizeof(const double*)) < 0 || + rpc_write(0, c, sizeof(const 
double*)) < 0 || rpc_write(0, &s, sizeof(const cuDoubleComplex*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || @@ -12945,8 +12945,8 @@ cublasStatus_t cublasZdrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || - rpc_write(0, &c, sizeof(const double*)) < 0 || - rpc_write(0, &s, sizeof(const double*)) < 0 || + rpc_write(0, c, sizeof(const double*)) < 0 || + rpc_write(0, s, sizeof(const double*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || @@ -12965,8 +12965,8 @@ cublasStatus_t cublasZdrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleCompl rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || - rpc_write(0, &c, sizeof(const double*)) < 0 || - rpc_write(0, &s, sizeof(const double*)) < 0 || + rpc_write(0, c, sizeof(const double*)) < 0 || + rpc_write(0, s, sizeof(const double*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || @@ -13099,7 +13099,7 @@ cublasStatus_t cublasDrotm_v2(cublasHandle_t handle, int n, double* x, int incx, rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || - rpc_write(0, &param, sizeof(const double*)) < 0 || + rpc_write(0, param, sizeof(const double*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, x, sizeof(double)) < 0 || rpc_read(0, y, sizeof(double)) < 0 || @@ -13118,7 +13118,7 @@ cublasStatus_t cublasDrotm_v2_64(cublasHandle_t handle, int64_t n, double* x, in rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || - rpc_write(0, &param, sizeof(const double*)) < 0 || + rpc_write(0, 
param, sizeof(const double*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, x, sizeof(double)) < 0 || rpc_read(0, y, sizeof(double)) < 0 || @@ -13155,7 +13155,7 @@ cublasStatus_t cublasDrotmg_v2(cublasHandle_t handle, double* d1, double* d2, do rpc_write(0, d1, sizeof(double)) < 0 || rpc_write(0, d2, sizeof(double)) < 0 || rpc_write(0, x1, sizeof(double)) < 0 || - rpc_write(0, &y1, sizeof(const double*)) < 0 || + rpc_write(0, y1, sizeof(const double*)) < 0 || rpc_write(0, param, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, d1, sizeof(double)) < 0 || @@ -13221,12 +13221,12 @@ cublasStatus_t cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -13244,12 +13244,12 @@ cublasStatus_t cublasDgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || 
rpc_write(0, &incx, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -13411,12 +13411,12 @@ cublasStatus_t cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &kl, sizeof(int)) < 0 || rpc_write(0, &ku, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -13436,12 +13436,12 @@ cublasStatus_t cublasDgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &kl, sizeof(int64_t)) < 0 || rpc_write(0, &ku, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -13600,7 +13600,7 @@ cublasStatus_t cublasDtrmv_v2(cublasHandle_t handle, 
cublasFillMode_t uplo, cubl rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || @@ -13620,7 +13620,7 @@ cublasStatus_t cublasDtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || @@ -13763,7 +13763,7 @@ cublasStatus_t cublasDtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || @@ -13784,7 +13784,7 @@ cublasStatus_t cublasDtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || @@ -13926,7 +13926,7 @@ cublasStatus_t cublasDtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || 
rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &AP, sizeof(const double*)) < 0 || + rpc_write(0, AP, sizeof(const double*)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -13945,7 +13945,7 @@ cublasStatus_t cublasDtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &AP, sizeof(const double*)) < 0 || + rpc_write(0, AP, sizeof(const double*)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -14080,7 +14080,7 @@ cublasStatus_t cublasDtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || @@ -14100,7 +14100,7 @@ cublasStatus_t cublasDtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || @@ -14238,7 +14238,7 @@ cublasStatus_t cublasDtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &AP, sizeof(const double*)) < 0 || + rpc_write(0, AP, sizeof(const double*)) 
< 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -14257,7 +14257,7 @@ cublasStatus_t cublasDtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &AP, sizeof(const double*)) < 0 || + rpc_write(0, AP, sizeof(const double*)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -14395,7 +14395,7 @@ cublasStatus_t cublasDtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || @@ -14416,7 +14416,7 @@ cublasStatus_t cublasDtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, x, sizeof(double)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || @@ -14562,12 +14562,12 @@ cublasStatus_t cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - 
rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -14584,12 +14584,12 @@ cublasStatus_t cublasDsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -14829,12 +14829,12 @@ cublasStatus_t cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || 
rpc_wait_for_response(0) < 0 || @@ -14852,12 +14852,12 @@ cublasStatus_t cublasDsbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -15008,11 +15008,11 @@ cublasStatus_t cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &AP, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, AP, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -15029,11 +15029,11 @@ cublasStatus_t cublasDspmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - 
rpc_write(0, &AP, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, AP, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -15176,10 +15176,10 @@ cublasStatus_t cublasDger_v2(cublasHandle_t handle, int m, int n, const double* rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || - rpc_write(0, &y, sizeof(const double*)) < 0 || + rpc_write(0, y, sizeof(const double*)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_write(0, A, sizeof(double)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || @@ -15197,10 +15197,10 @@ cublasStatus_t cublasDger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, con rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || - rpc_write(0, &y, sizeof(const double*)) < 0 || + rpc_write(0, y, sizeof(const double*)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || rpc_write(0, A, sizeof(double)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || @@ -15424,8 +15424,8 @@ cublasStatus_t cublasDsyr_v2(cublasHandle_t handle, 
cublasFillMode_t uplo, int n rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, A, sizeof(double)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || @@ -15443,8 +15443,8 @@ cublasStatus_t cublasDsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, A, sizeof(double)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || @@ -15576,7 +15576,7 @@ cublasStatus_t cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || rpc_write(0, &x, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, A, sizeof(cuDoubleComplex)) < 0 || @@ -15595,7 +15595,7 @@ cublasStatus_t cublasZher_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || rpc_write(0, &x, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, 
&incx, sizeof(int64_t)) < 0 || rpc_write(0, A, sizeof(cuDoubleComplex)) < 0 || @@ -15650,8 +15650,8 @@ cublasStatus_t cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, AP, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -15668,8 +15668,8 @@ cublasStatus_t cublasDspr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, AP, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -15722,7 +15722,7 @@ cublasStatus_t cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || rpc_write(0, &x, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, AP, sizeof(cuDoubleComplex)) < 0 || @@ -15740,7 +15740,7 @@ cublasStatus_t cublasZhpr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const 
double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || rpc_write(0, &x, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, AP, sizeof(cuDoubleComplex)) < 0 || @@ -15800,10 +15800,10 @@ cublasStatus_t cublasDsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || - rpc_write(0, &y, sizeof(const double*)) < 0 || + rpc_write(0, y, sizeof(const double*)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_write(0, A, sizeof(double)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || @@ -15821,10 +15821,10 @@ cublasStatus_t cublasDsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || - rpc_write(0, &y, sizeof(const double*)) < 0 || + rpc_write(0, y, sizeof(const double*)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || rpc_write(0, A, sizeof(double)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || @@ -16050,10 +16050,10 @@ cublasStatus_t cublasDspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + 
rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || - rpc_write(0, &y, sizeof(const double*)) < 0 || + rpc_write(0, y, sizeof(const double*)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_write(0, AP, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -16070,10 +16070,10 @@ cublasStatus_t cublasDspr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || - rpc_write(0, &y, sizeof(const double*)) < 0 || + rpc_write(0, y, sizeof(const double*)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || rpc_write(0, AP, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -16173,12 +16173,12 @@ cublasStatus_t cublasSgemvBatched(cublasHandle_t handle, cublasOperation_t trans rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const float*)) < 0 || - rpc_write(0, Aarray, sizeof(const float* const[batchCount])) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, xarray, sizeof(const float* const[batchCount])) < 0 || + rpc_write(0, &xarray, sizeof(xarray)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, &beta, sizeof(const float*)) < 0 || - rpc_write(0, yarray, sizeof(float* const[batchCount])) < 0 || + rpc_write(0, &yarray, sizeof(yarray)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) @@ -16196,12 +16196,12 @@ cublasStatus_t cublasTSTgemvBatched(cublasHandle_t handle, cublasOperation_t tra rpc_write(0, &m, 
sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const float*)) < 0 || - rpc_write(0, Aarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, xarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 || + rpc_write(0, &xarray, sizeof(xarray)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, &beta, sizeof(const float*)) < 0 || - rpc_write(0, yarray, sizeof(__nv_bfloat16* const[batchCount])) < 0 || + rpc_write(0, &yarray, sizeof(yarray)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) @@ -16271,14 +16271,14 @@ cublasStatus_t cublasDgemvStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, &stridex, sizeof(long long int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_write(0, &stridey, sizeof(long long int)) < 0 || @@ -16298,14 +16298,14 @@ cublasStatus_t cublasDgemvStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const 
double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, &stridex, sizeof(long long int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, y, sizeof(double)) < 0 || rpc_write(0, &incy, sizeof(int64_t)) < 0 || rpc_write(0, &stridey, sizeof(long long int)) < 0 || @@ -16702,12 +16702,12 @@ cublasStatus_t cublasDgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -16727,12 +16727,12 @@ cublasStatus_t cublasDgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, 
sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17045,10 +17045,10 @@ cublasStatus_t cublasDsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17067,10 +17067,10 @@ cublasStatus_t cublasDsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17221,10 +17221,10 @@ cublasStatus_t cublasZherk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || + rpc_write(0, alpha, 
sizeof(const double*)) < 0 || rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17243,10 +17243,10 @@ cublasStatus_t cublasZherk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17313,12 +17313,12 @@ cublasStatus_t cublasDsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17337,12 +17337,12 @@ cublasStatus_t cublasDsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_write(0, &trans, 
sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17510,7 +17510,7 @@ cublasStatus_t cublasZher2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17534,7 +17534,7 @@ cublasStatus_t cublasZher2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17601,12 +17601,12 @@ cublasStatus_t cublasDsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + 
rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17625,12 +17625,12 @@ cublasStatus_t cublasDsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17798,7 +17798,7 @@ cublasStatus_t cublasZherkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17822,7 +17822,7 @@ cublasStatus_t cublasZherkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || 
rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17889,12 +17889,12 @@ cublasStatus_t cublasDsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -17913,12 +17913,12 @@ cublasStatus_t cublasDsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || @@ -18177,8 +18177,8 @@ cublasStatus_t 
cublasDtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, B, sizeof(double)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || @@ -18200,8 +18200,8 @@ cublasStatus_t cublasDtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, B, sizeof(double)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || @@ -18365,10 +18365,10 @@ cublasStatus_t cublasDtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || @@ -18390,10 +18390,10 @@ cublasStatus_t cublasDtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, 
sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || @@ -18504,6 +18504,306 @@ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c return return_value; } +cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* const Aarray[], int lda, const __half* const Barray[], int ldb, const __half* beta, __half* const Carray[], int ldc, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasHgemmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const __half*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &beta, sizeof(const __half*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, 
int64_t k, const __half* alpha, const __half* const Aarray[], int64_t lda, const __half* const Barray[], int64_t ldb, const __half* beta, __half* const Carray[], int64_t ldc, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasHgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const __half*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const __half*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* const Aarray[], int lda, const float* const Barray[], int ldb, const float* beta, float* const Carray[], int ldc, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgemmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const float*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + 
rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &beta, sizeof(const float*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* const Aarray[], int64_t lda, const float* const Barray[], int64_t ldb, const float* beta, float* const Carray[], int64_t ldc, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const float*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const float*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int 
batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* const Aarray[], int64_t lda, const double* const Barray[], int64_t ldb, const double* beta, double* const Carray[], int64_t ldc, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, beta, sizeof(const 
double*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgemmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) 
< 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgemm3mBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + 
rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgemm3mBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int lda, const cuDoubleComplex* const Barray[], int ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int ldc, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgemmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, 
sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int64_t lda, const cuDoubleComplex* const Barray[], int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int64_t ldc, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + 
rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + cublasStatus_t cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, long long int strideA, const __half* B, int ldb, long long int strideB, const __half* beta, __half* C, int ldc, long long int strideC, int batchCount) { cublasStatus_t return_value; @@ -18630,14 +18930,14 @@ cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_write(0, &strideB, sizeof(long long int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_write(0, &strideC, sizeof(long long int)) < 0 || @@ -18659,14 +18959,14 @@ cublasStatus_t cublasDgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + 
rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_write(0, &strideB, sizeof(long long int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_write(0, &strideC, sizeof(long long int)) < 0 || @@ -18852,6 +19152,36 @@ cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperati return return_value; } +cublasStatus_t cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int64_t lda, const void* const Barray[], cudaDataType Btype, int64_t ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int64_t ldc, int64_t batchCount, cublasComputeType_t computeType, cublasGemmAlgo_t algo) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasGemmBatchedEx_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const void*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &Atype, sizeof(cudaDataType)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &Btype, sizeof(cudaDataType)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const void*)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &Ctype, sizeof(cudaDataType)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &computeType, sizeof(cublasComputeType_t)) < 0 || + 
rpc_write(0, &algo, sizeof(cublasGemmAlgo_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { cublasStatus_t return_value; @@ -18909,11 +19239,11 @@ cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || @@ -18933,11 +19263,11 @@ cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, beta, sizeof(const double*)) < 0 || + rpc_write(0, B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_write(0, C, 
sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || @@ -19044,6 +19374,190 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, c return return_value; } +cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* const A[], int lda, float* const B[], int ldb, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasStrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const float*)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* const A[], int64_t lda, float* const B[], int64_t ldb, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasStrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, 
sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const float*)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* const A[], int lda, double* const B[], int ldb, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDtrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* const A[], int64_t lda, double* const B[], int64_t ldb, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDtrsmBatched_64) < 0 || + rpc_write(0, 
&batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, alpha, sizeof(const double*)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* const A[], int lda, cuComplex* const B[], int ldb, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCtrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, 
cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* const A[], int64_t lda, cuComplex* const B[], int64_t ldb, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCtrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const B[], int ldb, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZtrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + 
rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int64_t lda, cuDoubleComplex* const B[], int64_t ldb, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZtrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const float* A, int lda, const float* x, int incx, float* C, int ldc) { cublasStatus_t return_value; @@ -19094,9 +19608,9 @@ cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &x, 
sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || @@ -19115,9 +19629,9 @@ cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &x, sizeof(const double*)) < 0 || + rpc_write(0, x, sizeof(const double*)) < 0 || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || @@ -19212,6 +19726,254 @@ cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 return return_value; } +cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float* const A[], int lda, float* const Ainv[], int lda_inv, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, const double* const A[], int lda, double* const Ainv[], int lda_inv, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDmatinvBatched) < 0 || + rpc_write(0, &batchSize, 
sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, cuComplex* const Ainv[], int lda_inv, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const Ainv[], int lda_inv, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + 
rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, float* const Aarray[], int lda, float* const TauArray[], int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, double* const Aarray[], int lda, double* const TauArray[], int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, cuComplex* const Aarray[], int lda, cuComplex* const TauArray[], int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || 
+ rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const TauArray[], int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, float* const Aarray[], int lda, float* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, 
&Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, double* const Aarray[], int lda, double* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuComplex* const Aarray[], int lda, cuComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + 
rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* AP, float* A, int lda) { cublasStatus_t return_value; @@ -19236,7 +19998,7 @@ cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || 
rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &AP, sizeof(const double*)) < 0 || + rpc_write(0, AP, sizeof(const double*)) < 0 || rpc_write(0, A, sizeof(double)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -19304,7 +20066,7 @@ cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, AP, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -19348,6 +20110,174 @@ cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, return return_value; } +cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n, const float* const A[], int lda, const int* P, float* const C[], int ldc, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgetriBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &P, sizeof(const int*)) < 0 || + rpc_write(0, &C, sizeof(C)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n, const double* const A[], int lda, const int* P, double* const C[], int ldc, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgetriBatched) < 0 || + rpc_write(0, &batchSize, 
sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &P, sizeof(const int*)) < 0 || + rpc_write(0, &C, sizeof(C)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, const int* P, cuComplex* const C[], int ldc, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgetriBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &P, sizeof(const int*)) < 0 || + rpc_write(0, &C, sizeof(C)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, const int* P, cuDoubleComplex* const C[], int ldc, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgetriBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &P, sizeof(const int*)) < 0 || + rpc_write(0, &C, sizeof(C)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + 
rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const float* const Aarray[], int lda, const int* devIpiv, float* const Barray[], int ldb, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const double* const Aarray[], int lda, const int* devIpiv, double* const Barray[], int ldb, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) 
< 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuComplex* const Aarray[], int lda, const int* devIpiv, cuComplex* const Barray[], int ldb, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuDoubleComplex* const Aarray[], int lda, const int* devIpiv, cuDoubleComplex* const Barray[], int ldb, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || 
+ rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + cublasStatus_t cublasUint8gemmBias(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc, int m, int n, int k, const unsigned char* A, int A_bias, int lda, const unsigned char* B, int B_bias, int ldb, unsigned char* C, int C_bias, int ldc, int C_mult, int C_shift) { cublasStatus_t return_value; @@ -21644,6 +22574,18 @@ std::unordered_map functionMap = { {"cublasCtrmm_v2_64", (void *)cublasCtrmm_v2_64}, {"cublasZtrmm_v2", (void *)cublasZtrmm_v2}, {"cublasZtrmm_v2_64", (void *)cublasZtrmm_v2_64}, + {"cublasHgemmBatched", (void *)cublasHgemmBatched}, + {"cublasHgemmBatched_64", (void *)cublasHgemmBatched_64}, + {"cublasSgemmBatched", (void *)cublasSgemmBatched}, + {"cublasSgemmBatched_64", (void *)cublasSgemmBatched_64}, + {"cublasDgemmBatched", (void *)cublasDgemmBatched}, + {"cublasDgemmBatched_64", (void *)cublasDgemmBatched_64}, + {"cublasCgemmBatched", (void *)cublasCgemmBatched}, + {"cublasCgemmBatched_64", (void *)cublasCgemmBatched_64}, + {"cublasCgemm3mBatched", (void *)cublasCgemm3mBatched}, + {"cublasCgemm3mBatched_64", (void *)cublasCgemm3mBatched_64}, + {"cublasZgemmBatched", (void *)cublasZgemmBatched}, + {"cublasZgemmBatched_64", (void *)cublasZgemmBatched_64}, {"cublasHgemmStridedBatched", (void *)cublasHgemmStridedBatched}, {"cublasHgemmStridedBatched_64", (void *)cublasHgemmStridedBatched_64}, {"cublasSgemmStridedBatched", (void *)cublasSgemmStridedBatched}, @@ -21656,6 +22598,7 @@ std::unordered_map functionMap = { {"cublasCgemm3mStridedBatched_64", (void *)cublasCgemm3mStridedBatched_64}, {"cublasZgemmStridedBatched", (void *)cublasZgemmStridedBatched}, {"cublasZgemmStridedBatched_64", (void *)cublasZgemmStridedBatched_64}, + 
{"cublasGemmBatchedEx_64", (void *)cublasGemmBatchedEx_64}, {"cublasSgeam", (void *)cublasSgeam}, {"cublasSgeam_64", (void *)cublasSgeam_64}, {"cublasDgeam", (void *)cublasDgeam}, @@ -21664,6 +22607,14 @@ std::unordered_map functionMap = { {"cublasCgeam_64", (void *)cublasCgeam_64}, {"cublasZgeam", (void *)cublasZgeam}, {"cublasZgeam_64", (void *)cublasZgeam_64}, + {"cublasStrsmBatched", (void *)cublasStrsmBatched}, + {"cublasStrsmBatched_64", (void *)cublasStrsmBatched_64}, + {"cublasDtrsmBatched", (void *)cublasDtrsmBatched}, + {"cublasDtrsmBatched_64", (void *)cublasDtrsmBatched_64}, + {"cublasCtrsmBatched", (void *)cublasCtrsmBatched}, + {"cublasCtrsmBatched_64", (void *)cublasCtrsmBatched_64}, + {"cublasZtrsmBatched", (void *)cublasZtrsmBatched}, + {"cublasZtrsmBatched_64", (void *)cublasZtrsmBatched_64}, {"cublasSdgmm", (void *)cublasSdgmm}, {"cublasSdgmm_64", (void *)cublasSdgmm_64}, {"cublasDdgmm", (void *)cublasDdgmm}, @@ -21672,6 +22623,18 @@ std::unordered_map functionMap = { {"cublasCdgmm_64", (void *)cublasCdgmm_64}, {"cublasZdgmm", (void *)cublasZdgmm}, {"cublasZdgmm_64", (void *)cublasZdgmm_64}, + {"cublasSmatinvBatched", (void *)cublasSmatinvBatched}, + {"cublasDmatinvBatched", (void *)cublasDmatinvBatched}, + {"cublasCmatinvBatched", (void *)cublasCmatinvBatched}, + {"cublasZmatinvBatched", (void *)cublasZmatinvBatched}, + {"cublasSgeqrfBatched", (void *)cublasSgeqrfBatched}, + {"cublasDgeqrfBatched", (void *)cublasDgeqrfBatched}, + {"cublasCgeqrfBatched", (void *)cublasCgeqrfBatched}, + {"cublasZgeqrfBatched", (void *)cublasZgeqrfBatched}, + {"cublasSgelsBatched", (void *)cublasSgelsBatched}, + {"cublasDgelsBatched", (void *)cublasDgelsBatched}, + {"cublasCgelsBatched", (void *)cublasCgelsBatched}, + {"cublasZgelsBatched", (void *)cublasZgelsBatched}, {"cublasStpttr", (void *)cublasStpttr}, {"cublasDtpttr", (void *)cublasDtpttr}, {"cublasCtpttr", (void *)cublasCtpttr}, @@ -21680,6 +22643,14 @@ std::unordered_map functionMap = { {"cublasDtrttp", 
(void *)cublasDtrttp}, {"cublasCtrttp", (void *)cublasCtrttp}, {"cublasZtrttp", (void *)cublasZtrttp}, + {"cublasSgetriBatched", (void *)cublasSgetriBatched}, + {"cublasDgetriBatched", (void *)cublasDgetriBatched}, + {"cublasCgetriBatched", (void *)cublasCgetriBatched}, + {"cublasZgetriBatched", (void *)cublasZgetriBatched}, + {"cublasSgetrsBatched", (void *)cublasSgetrsBatched}, + {"cublasDgetrsBatched", (void *)cublasDgetrsBatched}, + {"cublasCgetrsBatched", (void *)cublasCgetrsBatched}, + {"cublasZgetrsBatched", (void *)cublasZgetrsBatched}, {"cublasUint8gemmBias", (void *)cublasUint8gemmBias}, {"cudnnGetProperty", (void *)cudnnGetProperty}, {"cudnnCreate", (void *)cudnnCreate}, diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp index b243014..78c48fd 100644 --- a/codegen/gen_server.cpp +++ b/codegen/gen_server.cpp @@ -24098,7 +24098,7 @@ int handle_cublasDnrm2_v2(void *conn) { cublasHandle_t handle; int n; - const double* x; + double x; int incx; double result; int request_id; @@ -24115,7 +24115,7 @@ int handle_cublasDnrm2_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDnrm2_v2(handle, n, x, incx, &result); + scuda_intercept_result = cublasDnrm2_v2(handle, n, &x, incx, &result); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &result, sizeof(double)) < 0 || @@ -24131,7 +24131,7 @@ int handle_cublasDnrm2_v2_64(void *conn) { cublasHandle_t handle; int64_t n; - const double* x; + double x; int64_t incx; double result; int request_id; @@ -24148,7 +24148,7 @@ int handle_cublasDnrm2_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDnrm2_v2_64(handle, n, x, incx, &result); + scuda_intercept_result = cublasDnrm2_v2_64(handle, n, &x, incx, &result); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &result, sizeof(double)) < 0 || @@ -24370,9 +24370,9 @@ int handle_cublasDdot_v2(void *conn) 
{ cublasHandle_t handle; int n; - const double* x; + double x; int incx; - const double* y; + double y; int incy; double result; int request_id; @@ -24391,7 +24391,7 @@ int handle_cublasDdot_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDdot_v2(handle, n, x, incx, y, incy, &result); + scuda_intercept_result = cublasDdot_v2(handle, n, &x, incx, &y, incy, &result); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &result, sizeof(double)) < 0 || @@ -24407,9 +24407,9 @@ int handle_cublasDdot_v2_64(void *conn) { cublasHandle_t handle; int64_t n; - const double* x; + double x; int64_t incx; - const double* y; + double y; int64_t incy; double result; int request_id; @@ -24428,7 +24428,7 @@ int handle_cublasDdot_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDdot_v2_64(handle, n, x, incx, y, incy, &result); + scuda_intercept_result = cublasDdot_v2_64(handle, n, &x, incx, &y, incy, &result); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &result, sizeof(double)) < 0 || @@ -24806,7 +24806,7 @@ int handle_cublasDscal_v2(void *conn) { cublasHandle_t handle; int n; - const double* alpha; + double alpha; double x; int incx; int request_id; @@ -24823,7 +24823,7 @@ int handle_cublasDscal_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDscal_v2(handle, n, alpha, &x, incx); + scuda_intercept_result = cublasDscal_v2(handle, n, &alpha, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -24839,7 +24839,7 @@ int handle_cublasDscal_v2_64(void *conn) { cublasHandle_t handle; int64_t n; - const double* alpha; + double alpha; double x; int64_t incx; int request_id; @@ -24856,7 +24856,7 @@ int handle_cublasDscal_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto 
ERROR_0; - scuda_intercept_result = cublasDscal_v2_64(handle, n, alpha, &x, incx); + scuda_intercept_result = cublasDscal_v2_64(handle, n, &alpha, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -25070,7 +25070,7 @@ int handle_cublasZdscal_v2(void *conn) { cublasHandle_t handle; int n; - const double* alpha; + double alpha; cuDoubleComplex x; int incx; int request_id; @@ -25087,7 +25087,7 @@ int handle_cublasZdscal_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZdscal_v2(handle, n, alpha, &x, incx); + scuda_intercept_result = cublasZdscal_v2(handle, n, &alpha, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 || @@ -25103,7 +25103,7 @@ int handle_cublasZdscal_v2_64(void *conn) { cublasHandle_t handle; int64_t n; - const double* alpha; + double alpha; cuDoubleComplex x; int64_t incx; int request_id; @@ -25120,7 +25120,7 @@ int handle_cublasZdscal_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZdscal_v2_64(handle, n, alpha, &x, incx); + scuda_intercept_result = cublasZdscal_v2_64(handle, n, &alpha, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 || @@ -25210,8 +25210,8 @@ int handle_cublasDaxpy_v2(void *conn) { cublasHandle_t handle; int n; - const double* alpha; - const double* x; + double alpha; + double x; int incx; double y; int incy; @@ -25231,7 +25231,7 @@ int handle_cublasDaxpy_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDaxpy_v2(handle, n, alpha, x, incx, &y, incy); + scuda_intercept_result = cublasDaxpy_v2(handle, n, &alpha, &x, incx, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -25247,8 +25247,8 @@ int 
handle_cublasDaxpy_v2_64(void *conn) { cublasHandle_t handle; int64_t n; - const double* alpha; - const double* x; + double alpha; + double x; int64_t incx; double y; int64_t incy; @@ -25268,7 +25268,7 @@ int handle_cublasDaxpy_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDaxpy_v2_64(handle, n, alpha, x, incx, &y, incy); + scuda_intercept_result = cublasDaxpy_v2_64(handle, n, &alpha, &x, incx, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -25502,7 +25502,7 @@ int handle_cublasDcopy_v2(void *conn) { cublasHandle_t handle; int n; - const double* x; + double x; int incx; double y; int incy; @@ -25521,7 +25521,7 @@ int handle_cublasDcopy_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDcopy_v2(handle, n, x, incx, &y, incy); + scuda_intercept_result = cublasDcopy_v2(handle, n, &x, incx, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -25537,7 +25537,7 @@ int handle_cublasDcopy_v2_64(void *conn) { cublasHandle_t handle; int64_t n; - const double* x; + double x; int64_t incx; double y; int64_t incy; @@ -25556,7 +25556,7 @@ int handle_cublasDcopy_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDcopy_v2_64(handle, n, x, incx, &y, incy); + scuda_intercept_result = cublasDcopy_v2_64(handle, n, &x, incx, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -26066,7 +26066,7 @@ int handle_cublasIdamax_v2(void *conn) { cublasHandle_t handle; int n; - const double* x; + double x; int incx; int result; int request_id; @@ -26083,7 +26083,7 @@ int handle_cublasIdamax_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasIdamax_v2(handle, n, 
x, incx, &result); + scuda_intercept_result = cublasIdamax_v2(handle, n, &x, incx, &result); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &result, sizeof(int)) < 0 || @@ -26099,7 +26099,7 @@ int handle_cublasIdamax_v2_64(void *conn) { cublasHandle_t handle; int64_t n; - const double* x; + double x; int64_t incx; int64_t result; int request_id; @@ -26116,7 +26116,7 @@ int handle_cublasIdamax_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasIdamax_v2_64(handle, n, x, incx, &result); + scuda_intercept_result = cublasIdamax_v2_64(handle, n, &x, incx, &result); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &result, sizeof(int64_t)) < 0 || @@ -26400,7 +26400,7 @@ int handle_cublasIdamin_v2(void *conn) { cublasHandle_t handle; int n; - const double* x; + double x; int incx; int result; int request_id; @@ -26417,7 +26417,7 @@ int handle_cublasIdamin_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasIdamin_v2(handle, n, x, incx, &result); + scuda_intercept_result = cublasIdamin_v2(handle, n, &x, incx, &result); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &result, sizeof(int)) < 0 || @@ -26433,7 +26433,7 @@ int handle_cublasIdamin_v2_64(void *conn) { cublasHandle_t handle; int64_t n; - const double* x; + double x; int64_t incx; int64_t result; int request_id; @@ -26450,7 +26450,7 @@ int handle_cublasIdamin_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasIdamin_v2_64(handle, n, x, incx, &result); + scuda_intercept_result = cublasIdamin_v2_64(handle, n, &x, incx, &result); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &result, sizeof(int64_t)) < 0 || @@ -26734,7 +26734,7 @@ int handle_cublasDasum_v2(void *conn) { cublasHandle_t handle; int n; - const double* x; + double x; int incx; double 
result; int request_id; @@ -26751,7 +26751,7 @@ int handle_cublasDasum_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDasum_v2(handle, n, x, incx, &result); + scuda_intercept_result = cublasDasum_v2(handle, n, &x, incx, &result); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &result, sizeof(double)) < 0 || @@ -26767,7 +26767,7 @@ int handle_cublasDasum_v2_64(void *conn) { cublasHandle_t handle; int64_t n; - const double* x; + double x; int64_t incx; double result; int request_id; @@ -26784,7 +26784,7 @@ int handle_cublasDasum_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDasum_v2_64(handle, n, x, incx, &result); + scuda_intercept_result = cublasDasum_v2_64(handle, n, &x, incx, &result); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &result, sizeof(double)) < 0 || @@ -27016,8 +27016,8 @@ int handle_cublasDrot_v2(void *conn) int incx; double y; int incy; - const double* c; - const double* s; + double c; + double s; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -27035,7 +27035,7 @@ int handle_cublasDrot_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDrot_v2(handle, n, &x, incx, &y, incy, c, s); + scuda_intercept_result = cublasDrot_v2(handle, n, &x, incx, &y, incy, &c, &s); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -27056,8 +27056,8 @@ int handle_cublasDrot_v2_64(void *conn) int64_t incx; double y; int64_t incy; - const double* c; - const double* s; + double c; + double s; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -27075,7 +27075,7 @@ int handle_cublasDrot_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDrot_v2_64(handle, n, &x, incx, &y, incy, c, s); + 
scuda_intercept_result = cublasDrot_v2_64(handle, n, &x, incx, &y, incy, &c, &s); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -27256,7 +27256,7 @@ int handle_cublasZrot_v2(void *conn) int incx; cuDoubleComplex y; int incy; - const double* c; + double c; const cuDoubleComplex* s; int request_id; cublasStatus_t scuda_intercept_result; @@ -27275,7 +27275,7 @@ int handle_cublasZrot_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZrot_v2(handle, n, &x, incx, &y, incy, c, s); + scuda_intercept_result = cublasZrot_v2(handle, n, &x, incx, &y, incy, &c, s); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 || @@ -27296,7 +27296,7 @@ int handle_cublasZrot_v2_64(void *conn) int64_t incx; cuDoubleComplex y; int64_t incy; - const double* c; + double c; const cuDoubleComplex* s; int request_id; cublasStatus_t scuda_intercept_result; @@ -27315,7 +27315,7 @@ int handle_cublasZrot_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZrot_v2_64(handle, n, &x, incx, &y, incy, c, s); + scuda_intercept_result = cublasZrot_v2_64(handle, n, &x, incx, &y, incy, &c, s); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 || @@ -27336,8 +27336,8 @@ int handle_cublasZdrot_v2(void *conn) int incx; cuDoubleComplex y; int incy; - const double* c; - const double* s; + double c; + double s; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -27355,7 +27355,7 @@ int handle_cublasZdrot_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZdrot_v2(handle, n, &x, incx, &y, incy, c, s); + scuda_intercept_result = cublasZdrot_v2(handle, n, &x, incx, &y, incy, &c, &s); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, 
sizeof(cuDoubleComplex)) < 0 || @@ -27376,8 +27376,8 @@ int handle_cublasZdrot_v2_64(void *conn) int64_t incx; cuDoubleComplex y; int64_t incy; - const double* c; - const double* s; + double c; + double s; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -27395,7 +27395,7 @@ int handle_cublasZdrot_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZdrot_v2_64(handle, n, &x, incx, &y, incy, c, s); + scuda_intercept_result = cublasZdrot_v2_64(handle, n, &x, incx, &y, incy, &c, &s); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(cuDoubleComplex)) < 0 || @@ -27636,7 +27636,7 @@ int handle_cublasDrotm_v2(void *conn) int incx; double y; int incy; - const double* param; + double param; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -27653,7 +27653,7 @@ int handle_cublasDrotm_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDrotm_v2(handle, n, &x, incx, &y, incy, param); + scuda_intercept_result = cublasDrotm_v2(handle, n, &x, incx, &y, incy, ¶m); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -27674,7 +27674,7 @@ int handle_cublasDrotm_v2_64(void *conn) int64_t incx; double y; int64_t incy; - const double* param; + double param; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -27691,7 +27691,7 @@ int handle_cublasDrotm_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDrotm_v2_64(handle, n, &x, incx, &y, incy, param); + scuda_intercept_result = cublasDrotm_v2_64(handle, n, &x, incx, &y, incy, ¶m); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -27748,7 +27748,7 @@ int handle_cublasDrotmg_v2(void *conn) double d1; double d2; double x1; - const double* y1; + double y1; double param; int request_id; 
cublasStatus_t scuda_intercept_result; @@ -27765,7 +27765,7 @@ int handle_cublasDrotmg_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDrotmg_v2(handle, &d1, &d2, &x1, y1, ¶m); + scuda_intercept_result = cublasDrotmg_v2(handle, &d1, &d2, &x1, &y1, ¶m); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &d1, sizeof(double)) < 0 || @@ -27880,12 +27880,12 @@ int handle_cublasDgemv_v2(void *conn) cublasOperation_t trans; int m; int n; - const double* alpha; - const double* A; + double alpha; + double A; int lda; - const double* x; + double x; int incx; - const double* beta; + double beta; double y; int incy; int request_id; @@ -27909,7 +27909,7 @@ int handle_cublasDgemv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgemv_v2(handle, trans, m, n, alpha, A, lda, x, incx, beta, &y, incy); + scuda_intercept_result = cublasDgemv_v2(handle, trans, m, n, &alpha, &A, lda, &x, incx, &beta, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -27927,12 +27927,12 @@ int handle_cublasDgemv_v2_64(void *conn) cublasOperation_t trans; int64_t m; int64_t n; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; - const double* x; + double x; int64_t incx; - const double* beta; + double beta; double y; int64_t incy; int request_id; @@ -27956,7 +27956,7 @@ int handle_cublasDgemv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgemv_v2_64(handle, trans, m, n, alpha, A, lda, x, incx, beta, &y, incy); + scuda_intercept_result = cublasDgemv_v2_64(handle, trans, m, n, &alpha, &A, lda, &x, incx, &beta, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -28266,12 +28266,12 @@ int handle_cublasDgbmv_v2(void *conn) int n; int kl; int ku; - 
const double* alpha; - const double* A; + double alpha; + double A; int lda; - const double* x; + double x; int incx; - const double* beta; + double beta; double y; int incy; int request_id; @@ -28297,7 +28297,7 @@ int handle_cublasDgbmv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgbmv_v2(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, &y, incy); + scuda_intercept_result = cublasDgbmv_v2(handle, trans, m, n, kl, ku, &alpha, &A, lda, &x, incx, &beta, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -28317,12 +28317,12 @@ int handle_cublasDgbmv_v2_64(void *conn) int64_t n; int64_t kl; int64_t ku; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; - const double* x; + double x; int64_t incx; - const double* beta; + double beta; double y; int64_t incy; int request_id; @@ -28348,7 +28348,7 @@ int handle_cublasDgbmv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgbmv_v2_64(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, &y, incy); + scuda_intercept_result = cublasDgbmv_v2_64(handle, trans, m, n, kl, ku, &alpha, &A, lda, &x, incx, &beta, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -28653,7 +28653,7 @@ int handle_cublasDtrmv_v2(void *conn) cublasOperation_t trans; cublasDiagType_t diag; int n; - const double* A; + double A; int lda; double x; int incx; @@ -28675,7 +28675,7 @@ int handle_cublasDtrmv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtrmv_v2(handle, uplo, trans, diag, n, A, lda, &x, incx); + scuda_intercept_result = cublasDtrmv_v2(handle, uplo, trans, diag, n, &A, lda, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ 
-28694,7 +28694,7 @@ int handle_cublasDtrmv_v2_64(void *conn) cublasOperation_t trans; cublasDiagType_t diag; int64_t n; - const double* A; + double A; int64_t lda; double x; int64_t incx; @@ -28716,7 +28716,7 @@ int handle_cublasDtrmv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtrmv_v2_64(handle, uplo, trans, diag, n, A, lda, &x, incx); + scuda_intercept_result = cublasDtrmv_v2_64(handle, uplo, trans, diag, n, &A, lda, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -28986,7 +28986,7 @@ int handle_cublasDtbmv_v2(void *conn) cublasDiagType_t diag; int n; int k; - const double* A; + double A; int lda; double x; int incx; @@ -29009,7 +29009,7 @@ int handle_cublasDtbmv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtbmv_v2(handle, uplo, trans, diag, n, k, A, lda, &x, incx); + scuda_intercept_result = cublasDtbmv_v2(handle, uplo, trans, diag, n, k, &A, lda, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -29029,7 +29029,7 @@ int handle_cublasDtbmv_v2_64(void *conn) cublasDiagType_t diag; int64_t n; int64_t k; - const double* A; + double A; int64_t lda; double x; int64_t incx; @@ -29052,7 +29052,7 @@ int handle_cublasDtbmv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtbmv_v2_64(handle, uplo, trans, diag, n, k, A, lda, &x, incx); + scuda_intercept_result = cublasDtbmv_v2_64(handle, uplo, trans, diag, n, k, &A, lda, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -29321,7 +29321,7 @@ int handle_cublasDtpmv_v2(void *conn) cublasOperation_t trans; cublasDiagType_t diag; int n; - const double* AP; + double AP; double x; int incx; int request_id; @@ -29341,7 +29341,7 @@ int 
handle_cublasDtpmv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtpmv_v2(handle, uplo, trans, diag, n, AP, &x, incx); + scuda_intercept_result = cublasDtpmv_v2(handle, uplo, trans, diag, n, &AP, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -29360,7 +29360,7 @@ int handle_cublasDtpmv_v2_64(void *conn) cublasOperation_t trans; cublasDiagType_t diag; int64_t n; - const double* AP; + double AP; double x; int64_t incx; int request_id; @@ -29380,7 +29380,7 @@ int handle_cublasDtpmv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtpmv_v2_64(handle, uplo, trans, diag, n, AP, &x, incx); + scuda_intercept_result = cublasDtpmv_v2_64(handle, uplo, trans, diag, n, &AP, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -29637,7 +29637,7 @@ int handle_cublasDtrsv_v2(void *conn) cublasOperation_t trans; cublasDiagType_t diag; int n; - const double* A; + double A; int lda; double x; int incx; @@ -29659,7 +29659,7 @@ int handle_cublasDtrsv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtrsv_v2(handle, uplo, trans, diag, n, A, lda, &x, incx); + scuda_intercept_result = cublasDtrsv_v2(handle, uplo, trans, diag, n, &A, lda, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -29678,7 +29678,7 @@ int handle_cublasDtrsv_v2_64(void *conn) cublasOperation_t trans; cublasDiagType_t diag; int64_t n; - const double* A; + double A; int64_t lda; double x; int64_t incx; @@ -29700,7 +29700,7 @@ int handle_cublasDtrsv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtrsv_v2_64(handle, uplo, trans, diag, n, A, lda, &x, incx); + 
scuda_intercept_result = cublasDtrsv_v2_64(handle, uplo, trans, diag, n, &A, lda, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -29961,7 +29961,7 @@ int handle_cublasDtpsv_v2(void *conn) cublasOperation_t trans; cublasDiagType_t diag; int n; - const double* AP; + double AP; double x; int incx; int request_id; @@ -29981,7 +29981,7 @@ int handle_cublasDtpsv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtpsv_v2(handle, uplo, trans, diag, n, AP, &x, incx); + scuda_intercept_result = cublasDtpsv_v2(handle, uplo, trans, diag, n, &AP, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -30000,7 +30000,7 @@ int handle_cublasDtpsv_v2_64(void *conn) cublasOperation_t trans; cublasDiagType_t diag; int64_t n; - const double* AP; + double AP; double x; int64_t incx; int request_id; @@ -30020,7 +30020,7 @@ int handle_cublasDtpsv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtpsv_v2_64(handle, uplo, trans, diag, n, AP, &x, incx); + scuda_intercept_result = cublasDtpsv_v2_64(handle, uplo, trans, diag, n, &AP, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -30282,7 +30282,7 @@ int handle_cublasDtbsv_v2(void *conn) cublasDiagType_t diag; int n; int k; - const double* A; + double A; int lda; double x; int incx; @@ -30305,7 +30305,7 @@ int handle_cublasDtbsv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtbsv_v2(handle, uplo, trans, diag, n, k, A, lda, &x, incx); + scuda_intercept_result = cublasDtbsv_v2(handle, uplo, trans, diag, n, k, &A, lda, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -30325,7 +30325,7 @@ int 
handle_cublasDtbsv_v2_64(void *conn) cublasDiagType_t diag; int64_t n; int64_t k; - const double* A; + double A; int64_t lda; double x; int64_t incx; @@ -30348,7 +30348,7 @@ int handle_cublasDtbsv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtbsv_v2_64(handle, uplo, trans, diag, n, k, A, lda, &x, incx); + scuda_intercept_result = cublasDtbsv_v2_64(handle, uplo, trans, diag, n, k, &A, lda, &x, incx); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &x, sizeof(double)) < 0 || @@ -30627,12 +30627,12 @@ int handle_cublasDsymv_v2(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int n; - const double* alpha; - const double* A; + double alpha; + double A; int lda; - const double* x; + double x; int incx; - const double* beta; + double beta; double y; int incy; int request_id; @@ -30655,7 +30655,7 @@ int handle_cublasDsymv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsymv_v2(handle, uplo, n, alpha, A, lda, x, incx, beta, &y, incy); + scuda_intercept_result = cublasDsymv_v2(handle, uplo, n, &alpha, &A, lda, &x, incx, &beta, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -30672,12 +30672,12 @@ int handle_cublasDsymv_v2_64(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int64_t n; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; - const double* x; + double x; int64_t incx; - const double* beta; + double beta; double y; int64_t incy; int request_id; @@ -30700,7 +30700,7 @@ int handle_cublasDsymv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsymv_v2_64(handle, uplo, n, alpha, A, lda, x, incx, beta, &y, incy); + scuda_intercept_result = cublasDsymv_v2_64(handle, uplo, n, &alpha, &A, lda, &x, incx, &beta, &y, incy); if 
(rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -31172,12 +31172,12 @@ int handle_cublasDsbmv_v2(void *conn) cublasFillMode_t uplo; int n; int k; - const double* alpha; - const double* A; + double alpha; + double A; int lda; - const double* x; + double x; int incx; - const double* beta; + double beta; double y; int incy; int request_id; @@ -31201,7 +31201,7 @@ int handle_cublasDsbmv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsbmv_v2(handle, uplo, n, k, alpha, A, lda, x, incx, beta, &y, incy); + scuda_intercept_result = cublasDsbmv_v2(handle, uplo, n, k, &alpha, &A, lda, &x, incx, &beta, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -31219,12 +31219,12 @@ int handle_cublasDsbmv_v2_64(void *conn) cublasFillMode_t uplo; int64_t n; int64_t k; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; - const double* x; + double x; int64_t incx; - const double* beta; + double beta; double y; int64_t incy; int request_id; @@ -31248,7 +31248,7 @@ int handle_cublasDsbmv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsbmv_v2_64(handle, uplo, n, k, alpha, A, lda, x, incx, beta, &y, incy); + scuda_intercept_result = cublasDsbmv_v2_64(handle, uplo, n, k, &alpha, &A, lda, &x, incx, &beta, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -31539,11 +31539,11 @@ int handle_cublasDspmv_v2(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int n; - const double* alpha; - const double* AP; - const double* x; + double alpha; + double AP; + double x; int incx; - const double* beta; + double beta; double y; int incy; int request_id; @@ -31565,7 +31565,7 @@ int handle_cublasDspmv_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto 
ERROR_0; - scuda_intercept_result = cublasDspmv_v2(handle, uplo, n, alpha, AP, x, incx, beta, &y, incy); + scuda_intercept_result = cublasDspmv_v2(handle, uplo, n, &alpha, &AP, &x, incx, &beta, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -31582,11 +31582,11 @@ int handle_cublasDspmv_v2_64(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int64_t n; - const double* alpha; - const double* AP; - const double* x; + double alpha; + double AP; + double x; int64_t incx; - const double* beta; + double beta; double y; int64_t incy; int request_id; @@ -31608,7 +31608,7 @@ int handle_cublasDspmv_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDspmv_v2_64(handle, uplo, n, alpha, AP, x, incx, beta, &y, incy); + scuda_intercept_result = cublasDspmv_v2_64(handle, uplo, n, &alpha, &AP, &x, incx, &beta, &y, incy); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -31883,10 +31883,10 @@ int handle_cublasDger_v2(void *conn) cublasHandle_t handle; int m; int n; - const double* alpha; - const double* x; + double alpha; + double x; int incx; - const double* y; + double y; int incy; double A; int lda; @@ -31909,7 +31909,7 @@ int handle_cublasDger_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDger_v2(handle, m, n, alpha, x, incx, y, incy, &A, lda); + scuda_intercept_result = cublasDger_v2(handle, m, n, &alpha, &x, incx, &y, incy, &A, lda); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &A, sizeof(double)) < 0 || @@ -31926,10 +31926,10 @@ int handle_cublasDger_v2_64(void *conn) cublasHandle_t handle; int64_t m; int64_t n; - const double* alpha; - const double* x; + double alpha; + double x; int64_t incx; - const double* y; + double y; int64_t incy; double A; int64_t lda; @@ -31952,7 +31952,7 @@ int 
handle_cublasDger_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDger_v2_64(handle, m, n, alpha, x, incx, y, incy, &A, lda); + scuda_intercept_result = cublasDger_v2_64(handle, m, n, &alpha, &x, incx, &y, incy, &A, lda); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &A, sizeof(double)) < 0 || @@ -32391,8 +32391,8 @@ int handle_cublasDsyr_v2(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int n; - const double* alpha; - const double* x; + double alpha; + double x; int incx; double A; int lda; @@ -32413,7 +32413,7 @@ int handle_cublasDsyr_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsyr_v2(handle, uplo, n, alpha, x, incx, &A, lda); + scuda_intercept_result = cublasDsyr_v2(handle, uplo, n, &alpha, &x, incx, &A, lda); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &A, sizeof(double)) < 0 || @@ -32430,8 +32430,8 @@ int handle_cublasDsyr_v2_64(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int64_t n; - const double* alpha; - const double* x; + double alpha; + double x; int64_t incx; double A; int64_t lda; @@ -32452,7 +32452,7 @@ int handle_cublasDsyr_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsyr_v2_64(handle, uplo, n, alpha, x, incx, &A, lda); + scuda_intercept_result = cublasDsyr_v2_64(handle, uplo, n, &alpha, &x, incx, &A, lda); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &A, sizeof(double)) < 0 || @@ -32703,7 +32703,7 @@ int handle_cublasZher_v2(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int n; - const double* alpha; + double alpha; const cuDoubleComplex* x; int incx; cuDoubleComplex A; @@ -32725,7 +32725,7 @@ int handle_cublasZher_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = 
cublasZher_v2(handle, uplo, n, alpha, x, incx, &A, lda); + scuda_intercept_result = cublasZher_v2(handle, uplo, n, &alpha, x, incx, &A, lda); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &A, sizeof(cuDoubleComplex)) < 0 || @@ -32742,7 +32742,7 @@ int handle_cublasZher_v2_64(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int64_t n; - const double* alpha; + double alpha; const cuDoubleComplex* x; int64_t incx; cuDoubleComplex A; @@ -32764,7 +32764,7 @@ int handle_cublasZher_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZher_v2_64(handle, uplo, n, alpha, x, incx, &A, lda); + scuda_intercept_result = cublasZher_v2_64(handle, uplo, n, &alpha, x, incx, &A, lda); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &A, sizeof(cuDoubleComplex)) < 0 || @@ -32855,8 +32855,8 @@ int handle_cublasDspr_v2(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int n; - const double* alpha; - const double* x; + double alpha; + double x; int incx; double AP; int request_id; @@ -32875,7 +32875,7 @@ int handle_cublasDspr_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDspr_v2(handle, uplo, n, alpha, x, incx, &AP); + scuda_intercept_result = cublasDspr_v2(handle, uplo, n, &alpha, &x, incx, &AP); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &AP, sizeof(double)) < 0 || @@ -32892,8 +32892,8 @@ int handle_cublasDspr_v2_64(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int64_t n; - const double* alpha; - const double* x; + double alpha; + double x; int64_t incx; double AP; int request_id; @@ -32912,7 +32912,7 @@ int handle_cublasDspr_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDspr_v2_64(handle, uplo, n, alpha, x, incx, &AP); + scuda_intercept_result = cublasDspr_v2_64(handle, uplo, n, &alpha, &x, 
incx, &AP); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &AP, sizeof(double)) < 0 || @@ -33003,7 +33003,7 @@ int handle_cublasZhpr_v2(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int n; - const double* alpha; + double alpha; const cuDoubleComplex* x; int incx; cuDoubleComplex AP; @@ -33023,7 +33023,7 @@ int handle_cublasZhpr_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZhpr_v2(handle, uplo, n, alpha, x, incx, &AP); + scuda_intercept_result = cublasZhpr_v2(handle, uplo, n, &alpha, x, incx, &AP); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &AP, sizeof(cuDoubleComplex)) < 0 || @@ -33040,7 +33040,7 @@ int handle_cublasZhpr_v2_64(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int64_t n; - const double* alpha; + double alpha; const cuDoubleComplex* x; int64_t incx; cuDoubleComplex AP; @@ -33060,7 +33060,7 @@ int handle_cublasZhpr_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZhpr_v2_64(handle, uplo, n, alpha, x, incx, &AP); + scuda_intercept_result = cublasZhpr_v2_64(handle, uplo, n, &alpha, x, incx, &AP); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &AP, sizeof(cuDoubleComplex)) < 0 || @@ -33163,10 +33163,10 @@ int handle_cublasDsyr2_v2(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int n; - const double* alpha; - const double* x; + double alpha; + double x; int incx; - const double* y; + double y; int incy; double A; int lda; @@ -33189,7 +33189,7 @@ int handle_cublasDsyr2_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsyr2_v2(handle, uplo, n, alpha, x, incx, y, incy, &A, lda); + scuda_intercept_result = cublasDsyr2_v2(handle, uplo, n, &alpha, &x, incx, &y, incy, &A, lda); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &A, sizeof(double)) < 0 || 
@@ -33206,10 +33206,10 @@ int handle_cublasDsyr2_v2_64(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int64_t n; - const double* alpha; - const double* x; + double alpha; + double x; int64_t incx; - const double* y; + double y; int64_t incy; double A; int64_t lda; @@ -33232,7 +33232,7 @@ int handle_cublasDsyr2_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsyr2_v2_64(handle, uplo, n, alpha, x, incx, y, incy, &A, lda); + scuda_intercept_result = cublasDsyr2_v2_64(handle, uplo, n, &alpha, &x, incx, &y, incy, &A, lda); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &A, sizeof(double)) < 0 || @@ -33675,10 +33675,10 @@ int handle_cublasDspr2_v2(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int n; - const double* alpha; - const double* x; + double alpha; + double x; int incx; - const double* y; + double y; int incy; double AP; int request_id; @@ -33699,7 +33699,7 @@ int handle_cublasDspr2_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDspr2_v2(handle, uplo, n, alpha, x, incx, y, incy, &AP); + scuda_intercept_result = cublasDspr2_v2(handle, uplo, n, &alpha, &x, incx, &y, incy, &AP); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &AP, sizeof(double)) < 0 || @@ -33716,10 +33716,10 @@ int handle_cublasDspr2_v2_64(void *conn) cublasHandle_t handle; cublasFillMode_t uplo; int64_t n; - const double* alpha; - const double* x; + double alpha; + double x; int64_t incx; - const double* y; + double y; int64_t incy; double AP; int request_id; @@ -33740,7 +33740,7 @@ int handle_cublasDspr2_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDspr2_v2_64(handle, uplo, n, alpha, x, incx, y, incy, &AP); + scuda_intercept_result = cublasDspr2_v2_64(handle, uplo, n, &alpha, &x, incx, &y, incy, &AP); if 
(rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &AP, sizeof(double)) < 0 || @@ -33924,15 +33924,13 @@ int handle_cublasSgemvBatched(void *conn) int m; int n; const float* alpha; + const float* * Aarray = nullptr; int lda; + const float* * xarray = nullptr; int incx; const float* beta; + float* * yarray = nullptr; int incy; - if (rpc_read(conn, &batchCount, sizeof(int)) < 0) - return -1; - const float* * Aarray = new const float* [batchCount]; - const float* * xarray = new const float* [batchCount]; - float* * yarray = new float* [batchCount]; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -33942,12 +33940,12 @@ int handle_cublasSgemvBatched(void *conn) rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const float*)) < 0 || - rpc_read(conn, Aarray, sizeof(const float* const[batchCount])) < 0 || + rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, xarray, sizeof(const float* const[batchCount])) < 0 || + rpc_read(conn, &xarray, sizeof(const float* const)) < 0 || rpc_read(conn, &incx, sizeof(int)) < 0 || rpc_read(conn, &beta, sizeof(const float*)) < 0 || - rpc_read(conn, yarray, sizeof(float* const[batchCount])) < 0 || + rpc_read(conn, &yarray, sizeof(float* const)) < 0 || rpc_read(conn, &incy, sizeof(int)) < 0 || false) goto ERROR_0; @@ -33974,15 +33972,13 @@ int handle_cublasTSTgemvBatched(void *conn) int m; int n; const float* alpha; + const __nv_bfloat16* * Aarray = nullptr; int lda; + const __nv_bfloat16* * xarray = nullptr; int incx; const float* beta; + __nv_bfloat16* * yarray = nullptr; int incy; - if (rpc_read(conn, &batchCount, sizeof(int)) < 0) - return -1; - const __nv_bfloat16* * Aarray = new const __nv_bfloat16* [batchCount]; - const __nv_bfloat16* * xarray = new const __nv_bfloat16* [batchCount]; - __nv_bfloat16* * yarray = new __nv_bfloat16* [batchCount]; int request_id; cublasStatus_t 
scuda_intercept_result; if ( @@ -33992,12 +33988,12 @@ int handle_cublasTSTgemvBatched(void *conn) rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const float*)) < 0 || - rpc_read(conn, Aarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 || + rpc_read(conn, &Aarray, sizeof(const __nv_bfloat16* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, xarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 || + rpc_read(conn, &xarray, sizeof(const __nv_bfloat16* const)) < 0 || rpc_read(conn, &incx, sizeof(int)) < 0 || rpc_read(conn, &beta, sizeof(const float*)) < 0 || - rpc_read(conn, yarray, sizeof(__nv_bfloat16* const[batchCount])) < 0 || + rpc_read(conn, &yarray, sizeof(__nv_bfloat16* const)) < 0 || rpc_read(conn, &incy, sizeof(int)) < 0 || false) goto ERROR_0; @@ -34132,14 +34128,14 @@ int handle_cublasDgemvStridedBatched(void *conn) cublasOperation_t trans; int m; int n; - const double* alpha; - const double* A; + double alpha; + double A; int lda; long long int strideA; - const double* x; + double x; int incx; long long int stridex; - const double* beta; + double beta; double y; int incy; long long int stridey; @@ -34169,7 +34165,7 @@ int handle_cublasDgemvStridedBatched(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgemvStridedBatched(handle, trans, m, n, alpha, A, lda, strideA, x, incx, stridex, beta, &y, incy, stridey, batchCount); + scuda_intercept_result = cublasDgemvStridedBatched(handle, trans, m, n, &alpha, &A, lda, strideA, &x, incx, stridex, &beta, &y, incy, stridey, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -34187,14 +34183,14 @@ int handle_cublasDgemvStridedBatched_64(void *conn) cublasOperation_t trans; int64_t m; int64_t n; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; long long int strideA; - 
const double* x; + double x; int64_t incx; long long int stridex; - const double* beta; + double beta; double y; int64_t incy; long long int stridey; @@ -34224,7 +34220,7 @@ int handle_cublasDgemvStridedBatched_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgemvStridedBatched_64(handle, trans, m, n, alpha, A, lda, strideA, x, incx, stridex, beta, &y, incy, stridey, batchCount); + scuda_intercept_result = cublasDgemvStridedBatched_64(handle, trans, m, n, &alpha, &A, lda, strideA, &x, incx, stridex, &beta, &y, incy, stridey, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &y, sizeof(double)) < 0 || @@ -35009,12 +35005,12 @@ int handle_cublasDgemm_v2(void *conn) int m; int n; int k; - const double* alpha; - const double* A; + double alpha; + double A; int lda; - const double* B; + double B; int ldb; - const double* beta; + double beta; double C; int ldc; int request_id; @@ -35040,7 +35036,7 @@ int handle_cublasDgemm_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgemm_v2(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasDgemm_v2(handle, transa, transb, m, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -35060,12 +35056,12 @@ int handle_cublasDgemm_v2_64(void *conn) int64_t m; int64_t n; int64_t k; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; - const double* B; + double B; int64_t ldb; - const double* beta; + double beta; double C; int64_t ldc; int request_id; @@ -35091,7 +35087,7 @@ int handle_cublasDgemm_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgemm_v2_64(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, &C, ldc); + 
scuda_intercept_result = cublasDgemm_v2_64(handle, transa, transb, m, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -35710,10 +35706,10 @@ int handle_cublasDsyrk_v2(void *conn) cublasOperation_t trans; int n; int k; - const double* alpha; - const double* A; + double alpha; + double A; int lda; - const double* beta; + double beta; double C; int ldc; int request_id; @@ -35736,7 +35732,7 @@ int handle_cublasDsyrk_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsyrk_v2(handle, uplo, trans, n, k, alpha, A, lda, beta, &C, ldc); + scuda_intercept_result = cublasDsyrk_v2(handle, uplo, trans, n, k, &alpha, &A, lda, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -35755,10 +35751,10 @@ int handle_cublasDsyrk_v2_64(void *conn) cublasOperation_t trans; int64_t n; int64_t k; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; - const double* beta; + double beta; double C; int64_t ldc; int request_id; @@ -35781,7 +35777,7 @@ int handle_cublasDsyrk_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsyrk_v2_64(handle, uplo, trans, n, k, alpha, A, lda, beta, &C, ldc); + scuda_intercept_result = cublasDsyrk_v2_64(handle, uplo, trans, n, k, &alpha, &A, lda, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -36070,10 +36066,10 @@ int handle_cublasZherk_v2(void *conn) cublasOperation_t trans; int n; int k; - const double* alpha; + double alpha; const cuDoubleComplex* A; int lda; - const double* beta; + double beta; cuDoubleComplex C; int ldc; int request_id; @@ -36096,7 +36092,7 @@ int handle_cublasZherk_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - 
scuda_intercept_result = cublasZherk_v2(handle, uplo, trans, n, k, alpha, A, lda, beta, &C, ldc); + scuda_intercept_result = cublasZherk_v2(handle, uplo, trans, n, k, &alpha, A, lda, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || @@ -36115,10 +36111,10 @@ int handle_cublasZherk_v2_64(void *conn) cublasOperation_t trans; int64_t n; int64_t k; - const double* alpha; + double alpha; const cuDoubleComplex* A; int64_t lda; - const double* beta; + double beta; cuDoubleComplex C; int64_t ldc; int request_id; @@ -36141,7 +36137,7 @@ int handle_cublasZherk_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZherk_v2_64(handle, uplo, trans, n, k, alpha, A, lda, beta, &C, ldc); + scuda_intercept_result = cublasZherk_v2_64(handle, uplo, trans, n, k, &alpha, A, lda, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || @@ -36258,12 +36254,12 @@ int handle_cublasDsyr2k_v2(void *conn) cublasOperation_t trans; int n; int k; - const double* alpha; - const double* A; + double alpha; + double A; int lda; - const double* B; + double B; int ldb; - const double* beta; + double beta; double C; int ldc; int request_id; @@ -36288,7 +36284,7 @@ int handle_cublasDsyr2k_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsyr2k_v2(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasDsyr2k_v2(handle, uplo, trans, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -36307,12 +36303,12 @@ int handle_cublasDsyr2k_v2_64(void *conn) cublasOperation_t trans; int64_t n; int64_t k; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; - const double* B; + double B; int64_t 
ldb; - const double* beta; + double beta; double C; int64_t ldc; int request_id; @@ -36337,7 +36333,7 @@ int handle_cublasDsyr2k_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsyr2k_v2_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasDsyr2k_v2_64(handle, uplo, trans, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -36655,7 +36651,7 @@ int handle_cublasZher2k_v2(void *conn) int lda; const cuDoubleComplex* B; int ldb; - const double* beta; + double beta; cuDoubleComplex C; int ldc; int request_id; @@ -36680,7 +36676,7 @@ int handle_cublasZher2k_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZher2k_v2(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasZher2k_v2(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || @@ -36704,7 +36700,7 @@ int handle_cublasZher2k_v2_64(void *conn) int64_t lda; const cuDoubleComplex* B; int64_t ldb; - const double* beta; + double beta; cuDoubleComplex C; int64_t ldc; int request_id; @@ -36729,7 +36725,7 @@ int handle_cublasZher2k_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZher2k_v2_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasZher2k_v2_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || @@ -36846,12 +36842,12 @@ int handle_cublasDsyrkx(void *conn) cublasOperation_t trans; int n; int k; - const double* alpha; - const double* A; + double alpha; 
+ double A; int lda; - const double* B; + double B; int ldb; - const double* beta; + double beta; double C; int ldc; int request_id; @@ -36876,7 +36872,7 @@ int handle_cublasDsyrkx(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsyrkx(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasDsyrkx(handle, uplo, trans, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -36895,12 +36891,12 @@ int handle_cublasDsyrkx_64(void *conn) cublasOperation_t trans; int64_t n; int64_t k; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; - const double* B; + double B; int64_t ldb; - const double* beta; + double beta; double C; int64_t ldc; int request_id; @@ -36925,7 +36921,7 @@ int handle_cublasDsyrkx_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsyrkx_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasDsyrkx_64(handle, uplo, trans, n, k, &alpha, &A, lda, &B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -37243,7 +37239,7 @@ int handle_cublasZherkx(void *conn) int lda; const cuDoubleComplex* B; int ldb; - const double* beta; + double beta; cuDoubleComplex C; int ldc; int request_id; @@ -37268,7 +37264,7 @@ int handle_cublasZherkx(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZherkx(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasZherkx(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || @@ -37292,7 +37288,7 @@ int 
handle_cublasZherkx_64(void *conn) int64_t lda; const cuDoubleComplex* B; int64_t ldb; - const double* beta; + double beta; cuDoubleComplex C; int64_t ldc; int request_id; @@ -37317,7 +37313,7 @@ int handle_cublasZherkx_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZherkx_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasZherkx_64(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || @@ -37434,12 +37430,12 @@ int handle_cublasDsymm_v2(void *conn) cublasFillMode_t uplo; int m; int n; - const double* alpha; - const double* A; + double alpha; + double A; int lda; - const double* B; + double B; int ldb; - const double* beta; + double beta; double C; int ldc; int request_id; @@ -37464,7 +37460,7 @@ int handle_cublasDsymm_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsymm_v2(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasDsymm_v2(handle, side, uplo, m, n, &alpha, &A, lda, &B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -37483,12 +37479,12 @@ int handle_cublasDsymm_v2_64(void *conn) cublasFillMode_t uplo; int64_t m; int64_t n; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; - const double* B; + double B; int64_t ldb; - const double* beta; + double beta; double C; int64_t ldc; int request_id; @@ -37513,7 +37509,7 @@ int handle_cublasDsymm_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDsymm_v2_64(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, &C, ldc); + scuda_intercept_result = cublasDsymm_v2_64(handle, side, uplo, m, n, &alpha, &A, 
lda, &B, ldb, &beta, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -38020,8 +38016,8 @@ int handle_cublasDtrsm_v2(void *conn) cublasDiagType_t diag; int m; int n; - const double* alpha; - const double* A; + double alpha; + double A; int lda; double B; int ldb; @@ -38046,7 +38042,7 @@ int handle_cublasDtrsm_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtrsm_v2(handle, side, uplo, trans, diag, m, n, alpha, A, lda, &B, ldb); + scuda_intercept_result = cublasDtrsm_v2(handle, side, uplo, trans, diag, m, n, &alpha, &A, lda, &B, ldb); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &B, sizeof(double)) < 0 || @@ -38067,8 +38063,8 @@ int handle_cublasDtrsm_v2_64(void *conn) cublasDiagType_t diag; int64_t m; int64_t n; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; double B; int64_t ldb; @@ -38093,7 +38089,7 @@ int handle_cublasDtrsm_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtrsm_v2_64(handle, side, uplo, trans, diag, m, n, alpha, A, lda, &B, ldb); + scuda_intercept_result = cublasDtrsm_v2_64(handle, side, uplo, trans, diag, m, n, &alpha, &A, lda, &B, ldb); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &B, sizeof(double)) < 0 || @@ -38404,10 +38400,10 @@ int handle_cublasDtrmm_v2(void *conn) cublasDiagType_t diag; int m; int n; - const double* alpha; - const double* A; + double alpha; + double A; int lda; - const double* B; + double B; int ldb; double C; int ldc; @@ -38434,7 +38430,7 @@ int handle_cublasDtrmm_v2(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtrmm_v2(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, &C, ldc); + scuda_intercept_result = cublasDtrmm_v2(handle, side, uplo, trans, diag, m, n, &alpha, &A, 
lda, &B, ldb, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -38455,10 +38451,10 @@ int handle_cublasDtrmm_v2_64(void *conn) cublasDiagType_t diag; int64_t m; int64_t n; - const double* alpha; - const double* A; + double alpha; + double A; int64_t lda; - const double* B; + double B; int64_t ldb; double C; int64_t ldc; @@ -38485,7 +38481,7 @@ int handle_cublasDtrmm_v2_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtrmm_v2_64(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, &C, ldc); + scuda_intercept_result = cublasDtrmm_v2_64(handle, side, uplo, trans, diag, m, n, &alpha, &A, lda, &B, ldb, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -38701,8 +38697,9 @@ int handle_cublasZtrmm_v2_64(void *conn) return -1; } -int handle_cublasHgemmStridedBatched(void *conn) +int handle_cublasHgemmBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -38710,20 +38707,17 @@ int handle_cublasHgemmStridedBatched(void *conn) int n; int k; const __half* alpha; - const __half* A; + const __half* * Aarray = nullptr; int lda; - long long int strideA; - const __half* B; + const __half* * Barray = nullptr; int ldb; - long long int strideB; const __half* beta; - __half C; + __half* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -38731,27 +38725,22 @@ int handle_cublasHgemmStridedBatched(void *conn) rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const __half*)) < 0 || - 
rpc_read(conn, &A, sizeof(const __half*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const __half* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const __half*)) < 0 || + rpc_read(conn, &Barray, sizeof(const __half* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const __half*)) < 0 || - rpc_read(conn, &C, sizeof(__half)) < 0 || + rpc_read(conn, &Carray, sizeof(__half* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasHgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasHgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(__half)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -38760,8 +38749,9 @@ int handle_cublasHgemmStridedBatched(void *conn) return -1; } -int handle_cublasHgemmStridedBatched_64(void *conn) +int handle_cublasHgemmBatched_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -38769,20 +38759,17 @@ int handle_cublasHgemmStridedBatched_64(void *conn) int64_t n; int64_t k; const __half* alpha; - const __half* A; + const __half* * Aarray = nullptr; int64_t lda; - long long int strideA; - const __half* B; + const __half* * Barray = nullptr; int64_t ldb; - long long int strideB; const __half* beta; - __half C; + __half* * Carray = nullptr; int64_t ldc; - long long int strideC; - int64_t batchCount; int 
request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -38790,27 +38777,22 @@ int handle_cublasHgemmStridedBatched_64(void *conn) rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const __half*)) < 0 || - rpc_read(conn, &A, sizeof(const __half*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const __half* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const __half*)) < 0 || + rpc_read(conn, &Barray, sizeof(const __half* const)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const __half*)) < 0 || - rpc_read(conn, &C, sizeof(__half)) < 0 || + rpc_read(conn, &Carray, sizeof(__half* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasHgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasHgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(__half)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -38819,8 +38801,9 @@ int handle_cublasHgemmStridedBatched_64(void *conn) return -1; } -int handle_cublasSgemmStridedBatched(void *conn) +int handle_cublasSgemmBatched(void *conn) { + int 
batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -38828,20 +38811,17 @@ int handle_cublasSgemmStridedBatched(void *conn) int n; int k; const float* alpha; - const float* A; + const float* * Aarray = nullptr; int lda; - long long int strideA; - const float* B; + const float* * Barray = nullptr; int ldb; - long long int strideB; const float* beta; - float C; + float* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -38849,27 +38829,22 @@ int handle_cublasSgemmStridedBatched(void *conn) rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const float*)) < 0 || - rpc_read(conn, &A, sizeof(const float*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const float*)) < 0 || + rpc_read(conn, &Barray, sizeof(const float* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const float*)) < 0 || - rpc_read(conn, &C, sizeof(float)) < 0 || + rpc_read(conn, &Carray, sizeof(float* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasSgemmBatched(handle, transa, 
transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(float)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -38878,8 +38853,9 @@ int handle_cublasSgemmStridedBatched(void *conn) return -1; } -int handle_cublasSgemmStridedBatched_64(void *conn) +int handle_cublasSgemmBatched_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -38887,20 +38863,17 @@ int handle_cublasSgemmStridedBatched_64(void *conn) int64_t n; int64_t k; const float* alpha; - const float* A; + const float* * Aarray = nullptr; int64_t lda; - long long int strideA; - const float* B; + const float* * Barray = nullptr; int64_t ldb; - long long int strideB; const float* beta; - float C; + float* * Carray = nullptr; int64_t ldc; - long long int strideC; - int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -38908,27 +38881,22 @@ int handle_cublasSgemmStridedBatched_64(void *conn) rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const float*)) < 0 || - rpc_read(conn, &A, sizeof(const float*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const float*)) < 0 || + rpc_read(conn, &Barray, sizeof(const float* const)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const float*)) < 0 || - rpc_read(conn, &C, sizeof(float)) < 0 || + rpc_read(conn, &Carray, 
sizeof(float* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasSgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(float)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -38937,29 +38905,27 @@ int handle_cublasSgemmStridedBatched_64(void *conn) return -1; } -int handle_cublasDgemmStridedBatched(void *conn) +int handle_cublasDgemmBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; int k; - const double* alpha; - const double* A; + double alpha; + const double* * Aarray = nullptr; int lda; - long long int strideA; - const double* B; + const double* * Barray = nullptr; int ldb; - long long int strideB; - const double* beta; - double C; + double beta; + double* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -38967,27 +38933,22 @@ int handle_cublasDgemmStridedBatched(void *conn) rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const double*)) < 0 || - rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const double* const)) < 0 || 
rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const double*)) < 0 || + rpc_read(conn, &Barray, sizeof(const double* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const double*)) < 0 || - rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &Carray, sizeof(double* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasDgemmBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(double)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -38996,29 +38957,27 @@ int handle_cublasDgemmStridedBatched(void *conn) return -1; } -int handle_cublasDgemmStridedBatched_64(void *conn) +int handle_cublasDgemmBatched_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; int64_t k; - const double* alpha; - const double* A; + double alpha; + const double* * Aarray = nullptr; int64_t lda; - long long int strideA; - const double* B; + const double* * Barray = nullptr; int64_t ldb; - long long int strideB; - const double* beta; - double C; + double beta; + double* * Carray = nullptr; int64_t ldc; - long long int strideC; - int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, 
sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -39026,27 +38985,22 @@ int handle_cublasDgemmStridedBatched_64(void *conn) rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const double*)) < 0 || - rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const double* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const double*)) < 0 || + rpc_read(conn, &Barray, sizeof(const double* const)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const double*)) < 0 || - rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &Carray, sizeof(double* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasDgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(double)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39055,8 +39009,9 @@ int handle_cublasDgemmStridedBatched_64(void *conn) return -1; } -int handle_cublasCgemmStridedBatched(void *conn) +int handle_cublasCgemmBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -39064,20 +39019,17 @@ int 
handle_cublasCgemmStridedBatched(void *conn) int n; int k; const cuComplex* alpha; - const cuComplex* A; + const cuComplex* * Aarray = nullptr; int lda; - long long int strideA; - const cuComplex* B; + const cuComplex* * Barray = nullptr; int ldb; - long long int strideB; const cuComplex* beta; - cuComplex C; + cuComplex* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -39085,27 +39037,22 @@ int handle_cublasCgemmStridedBatched(void *conn) rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasCgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, 
beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39114,8 +39061,9 @@ int handle_cublasCgemmStridedBatched(void *conn) return -1; } -int handle_cublasCgemmStridedBatched_64(void *conn) +int handle_cublasCgemmBatched_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -39123,20 +39071,17 @@ int handle_cublasCgemmStridedBatched_64(void *conn) int64_t n; int64_t k; const cuComplex* alpha; - const cuComplex* A; + const cuComplex* * Aarray = nullptr; int64_t lda; - long long int strideA; - const cuComplex* B; + const cuComplex* * Barray = nullptr; int64_t ldb; - long long int strideB; const cuComplex* beta; - cuComplex C; + cuComplex* * Carray = nullptr; int64_t ldc; - long long int strideC; - int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -39144,27 +39089,22 @@ int handle_cublasCgemmStridedBatched_64(void *conn) rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, 
&Carray, sizeof(cuComplex* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasCgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39173,8 +39113,9 @@ int handle_cublasCgemmStridedBatched_64(void *conn) return -1; } -int handle_cublasCgemm3mStridedBatched(void *conn) +int handle_cublasCgemm3mBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -39182,20 +39123,17 @@ int handle_cublasCgemm3mStridedBatched(void *conn) int n; int k; const cuComplex* alpha; - const cuComplex* A; + const cuComplex* * Aarray = nullptr; int lda; - long long int strideA; - const cuComplex* B; + const cuComplex* * Barray = nullptr; int ldb; - long long int strideB; const cuComplex* beta; - cuComplex C; + cuComplex* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -39203,27 +39141,22 @@ int handle_cublasCgemm3mStridedBatched(void *conn) rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &A, 
sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgemm3mStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasCgemm3mBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39232,8 +39165,9 @@ int handle_cublasCgemm3mStridedBatched(void *conn) return -1; } -int handle_cublasCgemm3mStridedBatched_64(void *conn) +int handle_cublasCgemm3mBatched_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -39241,20 +39175,17 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn) int64_t n; int64_t k; const cuComplex* alpha; - const cuComplex* A; + const cuComplex* * Aarray = nullptr; int64_t lda; - long long int strideA; - const cuComplex* B; + const cuComplex* * Barray = nullptr; int64_t ldb; - long long int strideB; const cuComplex* beta; - cuComplex C; + cuComplex* * Carray = nullptr; int64_t ldc; - long long 
int strideC; - int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -39262,27 +39193,22 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn) rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgemm3mStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasCgemm3mBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39291,8 +39217,9 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn) return -1; } -int 
handle_cublasZgemmStridedBatched(void *conn) +int handle_cublasZgemmBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -39300,14 +39227,117 @@ int handle_cublasZgemmStridedBatched(void *conn) int n; int k; const cuDoubleComplex* alpha; - const cuDoubleComplex* A; + const cuDoubleComplex* * Aarray = nullptr; + int lda; + const cuDoubleComplex* * Barray = nullptr; + int ldb; + const cuDoubleComplex* beta; + cuDoubleComplex* * Carray = nullptr; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &Carray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgemmBatched_64(void *conn) +{ + int64_t batchCount; + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + 
int64_t k; + const cuDoubleComplex* alpha; + const cuDoubleComplex* * Aarray = nullptr; + int64_t lda; + const cuDoubleComplex* * Barray = nullptr; + int64_t ldb; + const cuDoubleComplex* beta; + cuDoubleComplex* * Carray = nullptr; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &Carray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasHgemmStridedBatched(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + int k; + const __half* alpha; + const __half* A; int lda; long long int strideA; - const cuDoubleComplex* B; + const __half* B; int ldb; long long int strideB; - const cuDoubleComplex* beta; - cuDoubleComplex C; + const __half* beta; + __half C; int ldc; long long int 
strideC; int batchCount; @@ -39320,15 +39350,15 @@ int handle_cublasZgemmStridedBatched(void *conn) rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &alpha, sizeof(const __half*)) < 0 || + rpc_read(conn, &A, sizeof(const __half*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &B, sizeof(const __half*)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || rpc_read(conn, &strideB, sizeof(long long int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &beta, sizeof(const __half*)) < 0 || + rpc_read(conn, &C, sizeof(__half)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || rpc_read(conn, &strideC, sizeof(long long int)) < 0 || rpc_read(conn, &batchCount, sizeof(int)) < 0 || @@ -39338,10 +39368,10 @@ int handle_cublasZgemmStridedBatched(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasHgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &C, sizeof(__half)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39350,7 +39380,7 @@ int handle_cublasZgemmStridedBatched(void *conn) return -1; } -int handle_cublasZgemmStridedBatched_64(void *conn) +int handle_cublasHgemmStridedBatched_64(void *conn) { 
cublasHandle_t handle; cublasOperation_t transa; @@ -39358,15 +39388,15 @@ int handle_cublasZgemmStridedBatched_64(void *conn) int64_t m; int64_t n; int64_t k; - const cuDoubleComplex* alpha; - const cuDoubleComplex* A; + const __half* alpha; + const __half* A; int64_t lda; long long int strideA; - const cuDoubleComplex* B; + const __half* B; int64_t ldb; long long int strideB; - const cuDoubleComplex* beta; - cuDoubleComplex C; + const __half* beta; + __half C; int64_t ldc; long long int strideC; int64_t batchCount; @@ -39379,15 +39409,15 @@ int handle_cublasZgemmStridedBatched_64(void *conn) rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &alpha, sizeof(const __half*)) < 0 || + rpc_read(conn, &A, sizeof(const __half*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &B, sizeof(const __half*)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || rpc_read(conn, &strideB, sizeof(long long int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &beta, sizeof(const __half*)) < 0 || + rpc_read(conn, &C, sizeof(__half)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || rpc_read(conn, &strideC, sizeof(long long int)) < 0 || rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || @@ -39397,10 +39427,10 @@ int handle_cublasZgemmStridedBatched_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = 
cublasHgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &C, sizeof(__half)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39409,21 +39439,26 @@ int handle_cublasZgemmStridedBatched_64(void *conn) return -1; } -int handle_cublasSgeam(void *conn) +int handle_cublasSgemmStridedBatched(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; + int k; const float* alpha; const float* A; int lda; - const float* beta; + long long int strideA; const float* B; int ldb; + long long int strideB; + const float* beta; float C; int ldc; + long long int strideC; + int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39432,21 +39467,26 @@ int handle_cublasSgeam(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const float*)) < 0 || rpc_read(conn, &A, sizeof(const float*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &beta, sizeof(const float*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const float*)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta, sizeof(const float*)) < 0 || rpc_read(conn, &C, sizeof(float)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + 
scuda_intercept_result = cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(float)) < 0 || @@ -39458,21 +39498,26 @@ int handle_cublasSgeam(void *conn) return -1; } -int handle_cublasSgeam_64(void *conn) +int handle_cublasSgemmStridedBatched_64(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; + int64_t k; const float* alpha; const float* A; int64_t lda; - const float* beta; + long long int strideA; const float* B; int64_t ldb; + long long int strideB; + const float* beta; float C; int64_t ldc; + long long int strideC; + int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39481,21 +39526,26 @@ int handle_cublasSgeam_64(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const float*)) < 0 || rpc_read(conn, &A, sizeof(const float*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta, sizeof(const float*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const float*)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta, sizeof(const float*)) < 0 || rpc_read(conn, &C, sizeof(float)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = 
cublasSgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(float)) < 0 || @@ -39507,21 +39557,26 @@ int handle_cublasSgeam_64(void *conn) return -1; } -int handle_cublasDgeam(void *conn) +int handle_cublasDgemmStridedBatched(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; - const double* alpha; - const double* A; + int k; + double alpha; + double A; int lda; - const double* beta; - const double* B; + long long int strideA; + double B; int ldb; + long long int strideB; + double beta; double C; int ldc; + long long int strideC; + int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39530,21 +39585,26 @@ int handle_cublasDgeam(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const double*)) < 0 || rpc_read(conn, &A, sizeof(const double*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &beta, sizeof(const double*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const double*)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta, sizeof(const double*)) < 0 || rpc_read(conn, &C, sizeof(double)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasDgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, 
&A, lda, strideA, &B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -39556,21 +39616,26 @@ int handle_cublasDgeam(void *conn) return -1; } -int handle_cublasDgeam_64(void *conn) +int handle_cublasDgemmStridedBatched_64(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; - const double* alpha; - const double* A; + int64_t k; + double alpha; + double A; int64_t lda; - const double* beta; - const double* B; + long long int strideA; + double B; int64_t ldb; + long long int strideB; + double beta; double C; int64_t ldc; + long long int strideC; + int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39579,21 +39644,26 @@ int handle_cublasDgeam_64(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const double*)) < 0 || rpc_read(conn, &A, sizeof(const double*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta, sizeof(const double*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const double*)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta, sizeof(const double*)) < 0 || rpc_read(conn, &C, sizeof(double)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasDgemmStridedBatched_64(handle, transa, transb, m, n, k, &alpha, 
&A, lda, strideA, &B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -39605,21 +39675,26 @@ int handle_cublasDgeam_64(void *conn) return -1; } -int handle_cublasCgeam(void *conn) +int handle_cublasCgemmStridedBatched(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; + int k; const cuComplex* alpha; const cuComplex* A; int lda; - const cuComplex* beta; + long long int strideA; const cuComplex* B; int ldb; + long long int strideB; + const cuComplex* beta; cuComplex C; int ldc; + long long int strideC; + int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39628,21 +39703,26 @@ int handle_cublasCgeam(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &C, sizeof(cuComplex)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasCgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); 
if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuComplex)) < 0 || @@ -39654,21 +39734,26 @@ int handle_cublasCgeam(void *conn) return -1; } -int handle_cublasCgeam_64(void *conn) +int handle_cublasCgemmStridedBatched_64(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; + int64_t k; const cuComplex* alpha; const cuComplex* A; int64_t lda; - const cuComplex* beta; + long long int strideA; const cuComplex* B; int64_t ldb; + long long int strideB; + const cuComplex* beta; cuComplex C; int64_t ldc; + long long int strideC; + int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39677,21 +39762,144 @@ int handle_cublasCgeam_64(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) 
< 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgemm3mStridedBatched(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + int k; + const cuComplex* alpha; + const cuComplex* A; + int lda; + long long int strideA; + const cuComplex* B; + int ldb; + long long int strideB; + const cuComplex* beta; + cuComplex C; + int ldc; + long long int strideC; + int batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgemm3mStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgemm3mStridedBatched_64(void *conn) +{ + 
cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + const cuComplex* alpha; + const cuComplex* A; + int64_t lda; + long long int strideA; + const cuComplex* B; + int64_t ldb; + long long int strideB; + const cuComplex* beta; + cuComplex C; + int64_t ldc; + long long int strideC; + int64_t batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &C, sizeof(cuComplex)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasCgemm3mStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuComplex)) < 0 || @@ -39703,21 +39911,26 @@ int handle_cublasCgeam_64(void *conn) return -1; } -int handle_cublasZgeam(void *conn) +int handle_cublasZgemmStridedBatched(void 
*conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; + int k; const cuDoubleComplex* alpha; const cuDoubleComplex* A; int lda; - const cuDoubleComplex* beta; + long long int strideA; const cuDoubleComplex* B; int ldb; + long long int strideB; + const cuDoubleComplex* beta; cuDoubleComplex C; int ldc; + long long int strideC; + int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39726,21 +39939,26 @@ int handle_cublasZgeam(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasZgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || @@ -39752,21 +39970,26 @@ int handle_cublasZgeam(void *conn) return -1; } -int handle_cublasZgeam_64(void *conn) +int 
handle_cublasZgemmStridedBatched_64(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; + int64_t k; const cuDoubleComplex* alpha; const cuDoubleComplex* A; int64_t lda; - const cuDoubleComplex* beta; + long long int strideA; const cuDoubleComplex* B; int64_t ldb; + long long int strideB; + const cuDoubleComplex* beta; cuDoubleComplex C; int64_t ldc; + long long int strideC; + int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39775,21 +39998,26 @@ int handle_cublasZgeam_64(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasZgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || @@ -39801,29 +40029,97 @@ int 
handle_cublasZgeam_64(void *conn) return -1; } -int handle_cublasSdgmm(void *conn) +int handle_cublasGemmBatchedEx_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; - cublasSideMode_t mode; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + const void* alpha; + const void* * Aarray = nullptr; + cudaDataType Atype; + int64_t lda; + const void* * Barray = nullptr; + cudaDataType Btype; + int64_t ldb; + const void* beta; + void* * Carray = nullptr; + cudaDataType Ctype; + int64_t ldc; + cublasComputeType_t computeType; + cublasGemmAlgo_t algo; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const void*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const void* const)) < 0 || + rpc_read(conn, &Atype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &Barray, sizeof(const void* const)) < 0 || + rpc_read(conn, &Btype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta, sizeof(const void*)) < 0 || + rpc_read(conn, &Carray, sizeof(void* const)) < 0 || + rpc_read(conn, &Ctype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &computeType, sizeof(cublasComputeType_t)) < 0 || + rpc_read(conn, &algo, sizeof(cublasGemmAlgo_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasGemmBatchedEx_64(handle, transa, transb, m, n, k, alpha, Aarray, Atype, lda, Barray, Btype, ldb, beta, Carray, Ctype, 
ldc, batchCount, computeType, algo); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgeam(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; int m; int n; + const float* alpha; const float* A; int lda; - const float* x; - int incx; + const float* beta; + const float* B; + int ldb; float C; int ldc; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha, sizeof(const float*)) < 0 || rpc_read(conn, &A, sizeof(const float*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x, sizeof(const float*)) < 0 || - rpc_read(conn, &incx, sizeof(int)) < 0 || + rpc_read(conn, &beta, sizeof(const float*)) < 0 || + rpc_read(conn, &B, sizeof(const float*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || rpc_read(conn, &C, sizeof(float)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || false) @@ -39832,7 +40128,7 @@ int handle_cublasSdgmm(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(float)) < 0 || @@ -39844,29 +40140,35 @@ int handle_cublasSdgmm(void *conn) return -1; } -int handle_cublasSdgmm_64(void *conn) +int handle_cublasSgeam_64(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; + cublasOperation_t transa; + cublasOperation_t transb; 
int64_t m; int64_t n; + const float* alpha; const float* A; int64_t lda; - const float* x; - int64_t incx; + const float* beta; + const float* B; + int64_t ldb; float C; int64_t ldc; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const float*)) < 0 || rpc_read(conn, &A, sizeof(const float*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x, sizeof(const float*)) < 0 || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta, sizeof(const float*)) < 0 || + rpc_read(conn, &B, sizeof(const float*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || rpc_read(conn, &C, sizeof(float)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) @@ -39875,7 +40177,7 @@ int handle_cublasSdgmm_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasSgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(float)) < 0 || @@ -39887,41 +40189,1629 @@ int handle_cublasSdgmm_64(void *conn) return -1; } -int handle_cublasDdgmm(void *conn) +int handle_cublasDgeam(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + double alpha; + double A; + int lda; + double beta; + double B; + int ldb; + double C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) 
< 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha, sizeof(const double*)) < 0 || + rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &beta, sizeof(const double*)) < 0 || + rpc_read(conn, &B, sizeof(const double*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDgeam(handle, transa, transb, m, n, &alpha, &A, lda, &beta, &B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgeam_64(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + double alpha; + double A; + int64_t lda; + double beta; + double B; + int64_t ldb; + double C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const double*)) < 0 || + rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta, sizeof(const double*)) < 0 || + rpc_read(conn, &B, sizeof(const double*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = 
rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDgeam_64(handle, transa, transb, m, n, &alpha, &A, lda, &beta, &B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgeam(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + const cuComplex* alpha; + const cuComplex* A; + int lda; + const cuComplex* beta; + const cuComplex* B; + int ldb; + cuComplex C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgeam_64(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + const cuComplex* alpha; + const 
cuComplex* A; + int64_t lda; + const cuComplex* beta; + const cuComplex* B; + int64_t ldb; + cuComplex C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgeam(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + const cuDoubleComplex* alpha; + const cuDoubleComplex* A; + int lda; + const cuDoubleComplex* beta; + const cuDoubleComplex* B; + int ldb; + cuDoubleComplex C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + 
rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgeam_64(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + const cuDoubleComplex* alpha; + const cuDoubleComplex* A; + int64_t lda; + const cuDoubleComplex* beta; + const cuDoubleComplex* B; + int64_t ldb; + cuDoubleComplex C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto 
ERROR_0; + scuda_intercept_result = cublasZgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasStrsmBatched(void *conn) +{ + int batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int m; + int n; + const float* alpha; + const float* * A = nullptr; + int lda; + float* * B = nullptr; + int ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha, sizeof(const float*)) < 0 || + rpc_read(conn, &A, sizeof(const float* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &B, sizeof(float* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasStrsmBatched(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasStrsmBatched_64(void *conn) +{ + int64_t batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int64_t m; + int64_t n; + const 
float* alpha; + const float* * A = nullptr; + int64_t lda; + float* * B = nullptr; + int64_t ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const float*)) < 0 || + rpc_read(conn, &A, sizeof(const float* const)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &B, sizeof(float* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasStrsmBatched_64(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDtrsmBatched(void *conn) +{ + int batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int m; + int n; + double alpha; + const double* * A = nullptr; + int lda; + double* * B = nullptr; + int ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + 
rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha, sizeof(const double*)) < 0 || + rpc_read(conn, &A, sizeof(const double* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &B, sizeof(double* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDtrsmBatched_64(void *conn) +{ + int64_t batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int64_t m; + int64_t n; + double alpha; + const double* * A = nullptr; + int64_t lda; + double* * B = nullptr; + int64_t ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const double*)) < 0 || + rpc_read(conn, &A, sizeof(const double* const)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &B, sizeof(double* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount); + + if 
(rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCtrsmBatched(void *conn) +{ + int batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int m; + int n; + const cuComplex* alpha; + const cuComplex* * A = nullptr; + int lda; + cuComplex* * B = nullptr; + int ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &B, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCtrsmBatched(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCtrsmBatched_64(void *conn) +{ + int64_t batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int64_t m; + int64_t n; + const cuComplex* alpha; + const cuComplex* * A = nullptr; + int64_t lda; + cuComplex* * B = nullptr; + int64_t ldb; + int request_id; + cublasStatus_t 
scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &B, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCtrsmBatched_64(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZtrsmBatched(void *conn) +{ + int batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int m; + int n; + const cuDoubleComplex* alpha; + const cuDoubleComplex* * A = nullptr; + int lda; + cuDoubleComplex* * B = nullptr; + int ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) 
< 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &B, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZtrsmBatched(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZtrsmBatched_64(void *conn) +{ + int64_t batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int64_t m; + int64_t n; + const cuDoubleComplex* alpha; + const cuDoubleComplex* * A = nullptr; + int64_t lda; + cuDoubleComplex* * B = nullptr; + int64_t ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &B, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZtrsmBatched_64(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); + + if 
(rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSdgmm(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int m; + int n; + const float* A; + int lda; + const float* x; + int incx; + float C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const float*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &x, sizeof(const float*)) < 0 || + rpc_read(conn, &incx, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(float)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasSdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(float)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSdgmm_64(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int64_t m; + int64_t n; + const float* A; + int64_t lda; + const float* x; + int64_t incx; + float C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &A, sizeof(const float*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &x, sizeof(const float*)) < 0 || + rpc_read(conn, &incx, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(float)) < 0 || + 
rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasSdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(float)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDdgmm(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int m; + int n; + double A; + int lda; + double x; + int incx; + double C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &x, sizeof(const double*)) < 0 || + rpc_read(conn, &incx, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDdgmm(handle, mode, m, n, &A, lda, &x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDdgmm_64(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int64_t m; + int64_t n; + double A; + int64_t lda; + double x; + int64_t incx; + double C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + 
rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &x, sizeof(const double*)) < 0 || + rpc_read(conn, &incx, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDdgmm_64(handle, mode, m, n, &A, lda, &x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCdgmm(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int m; + int n; + const cuComplex* A; + int lda; + const cuComplex* x; + int incx; + cuComplex C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &x, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &incx, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCdgmm_64(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int64_t m; + int64_t 
n; + const cuComplex* A; + int64_t lda; + const cuComplex* x; + int64_t incx; + cuComplex C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &x, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &incx, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZdgmm(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int m; + int n; + const cuDoubleComplex* A; + int lda; + const cuDoubleComplex* x; + int incx; + cuDoubleComplex C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &x, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &incx, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = 
cublasZdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZdgmm_64(void *conn) { cublasHandle_t handle; cublasSideMode_t mode; + int64_t m; + int64_t n; + const cuDoubleComplex* A; + int64_t lda; + const cuDoubleComplex* x; + int64_t incx; + cuDoubleComplex C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &x, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &incx, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSmatinvBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int n; + const float* * A = nullptr; + int lda; + float* * Ainv = nullptr; + int lda_inv; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const float* const)) < 0 || + rpc_read(conn, &lda, 
sizeof(int)) < 0 || + rpc_read(conn, &Ainv, sizeof(float* const)) < 0 || + rpc_read(conn, &lda_inv, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasSmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDmatinvBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int n; + const double* * A = nullptr; + int lda; + double* * Ainv = nullptr; + int lda_inv; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const double* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Ainv, sizeof(double* const)) < 0 || + rpc_read(conn, &lda_inv, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCmatinvBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int n; + const cuComplex* * A = nullptr; + int lda; + cuComplex* * Ainv = nullptr; + int lda_inv; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, 
sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Ainv, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &lda_inv, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZmatinvBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int n; + const cuDoubleComplex* * A = nullptr; + int lda; + cuDoubleComplex* * Ainv = nullptr; + int lda_inv; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Ainv, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda_inv, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgeqrfBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int m; + int n; + float* * Aarray = nullptr; + int lda; + float* * 
TauArray = nullptr; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(float* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &TauArray, sizeof(float* const)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasSgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgeqrfBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int m; + int n; + double* * Aarray = nullptr; + int lda; + double* * TauArray = nullptr; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(double* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &TauArray, sizeof(double* const)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int 
handle_cublasCgeqrfBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int m; + int n; + cuComplex* * Aarray = nullptr; + int lda; + cuComplex* * TauArray = nullptr; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &TauArray, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgeqrfBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int m; + int n; + cuDoubleComplex* * Aarray = nullptr; + int lda; + cuDoubleComplex* * TauArray = nullptr; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &TauArray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize); + + if (rpc_start_response(conn, 
request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgelsBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int m; + int n; + int nrhs; + float* * Aarray = nullptr; + int lda; + float* * Carray = nullptr; + int ldc; + int info; + int devInfoArray; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(float* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(float* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgelsBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int m; + int n; + int nrhs; + double* * Aarray = nullptr; + int lda; + double* * Carray = nullptr; + int ldc; + int info; + int devInfoArray; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, 
sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(double* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(double* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgelsBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int m; + int n; + int nrhs; + cuComplex* * Aarray = nullptr; + int lda; + cuComplex* * Carray = nullptr; + int ldc; + int info; + int devInfoArray; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + 
if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgelsBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; int m; int n; - const double* A; + int nrhs; + cuDoubleComplex* * Aarray = nullptr; + int lda; + cuDoubleComplex* * Carray = nullptr; + int ldc; + int info; + int devInfoArray; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasStpttr(void *conn) +{ + cublasHandle_t handle; + cublasFillMode_t uplo; + int n; + const 
float* AP; + float A; + int lda; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(const float*)) < 0 || + rpc_read(conn, &A, sizeof(float)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasStpttr(handle, uplo, n, AP, &A, lda); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &A, sizeof(float)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDtpttr(void *conn) +{ + cublasHandle_t handle; + cublasFillMode_t uplo; + int n; + double AP; + double A; + int lda; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(const double*)) < 0 || + rpc_read(conn, &A, sizeof(double)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDtpttr(handle, uplo, n, &AP, &A, lda); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &A, sizeof(double)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCtpttr(void *conn) +{ + cublasHandle_t handle; + cublasFillMode_t uplo; + int n; + const cuComplex* AP; + cuComplex A; int lda; - const double* x; - int incx; - double C; - int ldc; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, 
sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &AP, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &A, sizeof(cuComplex)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x, sizeof(const double*)) < 0 || - rpc_read(conn, &incx, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(double)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasCtpttr(handle, uplo, n, AP, &A, lda); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_write(conn, &A, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39930,41 +41820,33 @@ int handle_cublasDdgmm(void *conn) return -1; } -int handle_cublasDdgmm_64(void *conn) +int handle_cublasZtpttr(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int64_t m; - int64_t n; - const double* A; - int64_t lda; - const double* x; - int64_t incx; - double C; - int64_t ldc; + cublasFillMode_t uplo; + int n; + const cuDoubleComplex* AP; + cuDoubleComplex A; + int lda; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int64_t)) < 0 || - rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const double*)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x, sizeof(const double*)) < 0 || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(double)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &uplo, 
sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &A, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasZtpttr(handle, uplo, n, AP, &A, lda); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_write(conn, &A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39973,41 +41855,33 @@ int handle_cublasDdgmm_64(void *conn) return -1; } -int handle_cublasCdgmm(void *conn) +int handle_cublasStrttp(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int m; + cublasFillMode_t uplo; int n; - const cuComplex* A; + const float* A; int lda; - const cuComplex* x; - int incx; - cuComplex C; - int ldc; + float AP; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &A, sizeof(const float*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &incx, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(float)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasStrttp(handle, uplo, n, A, lda, &AP); if 
(rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_write(conn, &AP, sizeof(float)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40016,41 +41890,33 @@ int handle_cublasCdgmm(void *conn) return -1; } -int handle_cublasCdgmm_64(void *conn) +int handle_cublasDtrttp(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int64_t m; - int64_t n; - const cuComplex* A; - int64_t lda; - const cuComplex* x; - int64_t incx; - cuComplex C; - int64_t ldc; + cublasFillMode_t uplo; + int n; + double A; + int lda; + double AP; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int64_t)) < 0 || - rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(double)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasDtrttp(handle, uplo, n, &A, lda, &AP); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_write(conn, &AP, sizeof(double)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40059,41 +41925,33 @@ int handle_cublasCdgmm_64(void *conn) return -1; } -int handle_cublasZdgmm(void *conn) +int 
handle_cublasCtrttp(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int m; + cublasFillMode_t uplo; int n; - const cuDoubleComplex* A; + const cuComplex* A; int lda; - const cuDoubleComplex* x; - int incx; - cuDoubleComplex C; - int ldc; + cuComplex AP; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &incx, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(cuComplex)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasCtrttp(handle, uplo, n, A, lda, &AP); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &AP, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40102,41 +41960,33 @@ int handle_cublasZdgmm(void *conn) return -1; } -int handle_cublasZdgmm_64(void *conn) +int handle_cublasZtrttp(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int64_t m; - int64_t n; + cublasFillMode_t uplo; + int n; const cuDoubleComplex* A; - int64_t lda; - const cuDoubleComplex* x; - int64_t incx; - cuDoubleComplex C; - int64_t ldc; + int lda; + cuDoubleComplex AP; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, 
&mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int64_t)) < 0 || - rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(cuDoubleComplex)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasZtrttp(handle, uplo, n, A, lda, &AP); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40145,33 +41995,39 @@ int handle_cublasZdgmm_64(void *conn) return -1; } -int handle_cublasStpttr(void *conn) +int handle_cublasSgetriBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; int n; - const float* AP; - float A; + const float* * A = nullptr; int lda; + const int* P; + float* * C = nullptr; + int ldc; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(const float*)) < 0 || - rpc_read(conn, &A, sizeof(float)) < 0 || + rpc_read(conn, &A, sizeof(const float* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P, sizeof(const int*)) < 0 || + rpc_read(conn, &C, 
sizeof(float* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasStpttr(handle, uplo, n, AP, &A, lda); + scuda_intercept_result = cublasSgetriBatched(handle, n, A, lda, P, C, ldc, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &A, sizeof(float)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40180,33 +42036,39 @@ int handle_cublasStpttr(void *conn) return -1; } -int handle_cublasDtpttr(void *conn) +int handle_cublasDgetriBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; int n; - const double* AP; - double A; + const double* * A = nullptr; int lda; + const int* P; + double* * C = nullptr; + int ldc; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(const double*)) < 0 || - rpc_read(conn, &A, sizeof(double)) < 0 || + rpc_read(conn, &A, sizeof(const double* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P, sizeof(const int*)) < 0 || + rpc_read(conn, &C, sizeof(double* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtpttr(handle, uplo, n, AP, &A, lda); + scuda_intercept_result = cublasDgetriBatched(handle, n, A, lda, P, C, ldc, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &A, sizeof(double)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || 
rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40215,33 +42077,39 @@ int handle_cublasDtpttr(void *conn) return -1; } -int handle_cublasCtpttr(void *conn) +int handle_cublasCgetriBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; int n; - const cuComplex* AP; - cuComplex A; + const cuComplex* * A = nullptr; int lda; + const int* P; + cuComplex* * C = nullptr; + int ldc; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &A, sizeof(cuComplex)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P, sizeof(const int*)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCtpttr(handle, uplo, n, AP, &A, lda); + scuda_intercept_result = cublasCgetriBatched(handle, n, A, lda, P, C, ldc, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &A, sizeof(cuComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40250,33 +42118,39 @@ int handle_cublasCtpttr(void *conn) return -1; } -int handle_cublasZtpttr(void *conn) +int handle_cublasZgetriBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; int n; - const cuDoubleComplex* AP; - cuDoubleComplex A; + const cuDoubleComplex* * A = nullptr; int lda; + const int* P; + cuDoubleComplex* * C = nullptr; + int ldc; + int info; int request_id; cublasStatus_t 
scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &A, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P, sizeof(const int*)) < 0 || + rpc_read(conn, &C, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZtpttr(handle, uplo, n, AP, &A, lda); + scuda_intercept_result = cublasZgetriBatched(handle, n, A, lda, P, C, ldc, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &A, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40285,33 +42159,43 @@ int handle_cublasZtpttr(void *conn) return -1; } -int handle_cublasStrttp(void *conn) +int handle_cublasSgetrsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; + cublasOperation_t trans; int n; - const float* A; + int nrhs; + const float* * Aarray = nullptr; int lda; - float AP; + const int* devIpiv; + float* * Barray = nullptr; + int ldb; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const float*)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const float* 
const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(float)) < 0 || + rpc_read(conn, &devIpiv, sizeof(const int*)) < 0 || + rpc_read(conn, &Barray, sizeof(float* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasStrttp(handle, uplo, n, A, lda, &AP); + scuda_intercept_result = cublasSgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &AP, sizeof(float)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40320,33 +42204,43 @@ int handle_cublasStrttp(void *conn) return -1; } -int handle_cublasDtrttp(void *conn) +int handle_cublasDgetrsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; + cublasOperation_t trans; int n; - const double* A; + int nrhs; + const double* * Aarray = nullptr; int lda; - double AP; + const int* devIpiv; + double* * Barray = nullptr; + int ldb; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const double* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(double)) < 0 || + rpc_read(conn, &devIpiv, sizeof(const int*)) < 0 || + rpc_read(conn, &Barray, sizeof(double* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id 
= rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtrttp(handle, uplo, n, A, lda, &AP); + scuda_intercept_result = cublasDgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &AP, sizeof(double)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40355,33 +42249,43 @@ int handle_cublasDtrttp(void *conn) return -1; } -int handle_cublasCtrttp(void *conn) +int handle_cublasCgetrsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; + cublasOperation_t trans; int n; - const cuComplex* A; + int nrhs; + const cuComplex* * Aarray = nullptr; int lda; - cuComplex AP; + const int* devIpiv; + cuComplex* * Barray = nullptr; + int ldb; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(cuComplex)) < 0 || + rpc_read(conn, &devIpiv, sizeof(const int*)) < 0 || + rpc_read(conn, &Barray, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCtrttp(handle, uplo, n, A, lda, &AP); + scuda_intercept_result = cublasCgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || 
- rpc_write(conn, &AP, sizeof(cuComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40390,33 +42294,43 @@ int handle_cublasCtrttp(void *conn) return -1; } -int handle_cublasZtrttp(void *conn) +int handle_cublasZgetrsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; + cublasOperation_t trans; int n; - const cuDoubleComplex* A; + int nrhs; + const cuDoubleComplex* * Aarray = nullptr; int lda; - cuDoubleComplex AP; + const int* devIpiv; + cuDoubleComplex* * Barray = nullptr; + int ldb; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &devIpiv, sizeof(const int*)) < 0 || + rpc_read(conn, &Barray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZtrttp(handle, uplo, n, A, lda, &AP); + scuda_intercept_result = cublasZgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &AP, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -43904,6 +45818,18 @@ static RequestHandler opHandlers[] = { 
handle_cublasCtrmm_v2_64, handle_cublasZtrmm_v2, handle_cublasZtrmm_v2_64, + handle_cublasHgemmBatched, + handle_cublasHgemmBatched_64, + handle_cublasSgemmBatched, + handle_cublasSgemmBatched_64, + handle_cublasDgemmBatched, + handle_cublasDgemmBatched_64, + handle_cublasCgemmBatched, + handle_cublasCgemmBatched_64, + handle_cublasCgemm3mBatched, + handle_cublasCgemm3mBatched_64, + handle_cublasZgemmBatched, + handle_cublasZgemmBatched_64, handle_cublasHgemmStridedBatched, handle_cublasHgemmStridedBatched_64, handle_cublasSgemmStridedBatched, @@ -43916,6 +45842,8 @@ static RequestHandler opHandlers[] = { handle_cublasCgemm3mStridedBatched_64, handle_cublasZgemmStridedBatched, handle_cublasZgemmStridedBatched_64, + nullptr, + handle_cublasGemmBatchedEx_64, handle_cublasSgeam, handle_cublasSgeam_64, handle_cublasDgeam, @@ -43924,6 +45852,14 @@ static RequestHandler opHandlers[] = { handle_cublasCgeam_64, handle_cublasZgeam, handle_cublasZgeam_64, + handle_cublasStrsmBatched, + handle_cublasStrsmBatched_64, + handle_cublasDtrsmBatched, + handle_cublasDtrsmBatched_64, + handle_cublasCtrsmBatched, + handle_cublasCtrsmBatched_64, + handle_cublasZtrsmBatched, + handle_cublasZtrsmBatched_64, handle_cublasSdgmm, handle_cublasSdgmm_64, handle_cublasDdgmm, @@ -43932,6 +45868,18 @@ static RequestHandler opHandlers[] = { handle_cublasCdgmm_64, handle_cublasZdgmm, handle_cublasZdgmm_64, + handle_cublasSmatinvBatched, + handle_cublasDmatinvBatched, + handle_cublasCmatinvBatched, + handle_cublasZmatinvBatched, + handle_cublasSgeqrfBatched, + handle_cublasDgeqrfBatched, + handle_cublasCgeqrfBatched, + handle_cublasZgeqrfBatched, + handle_cublasSgelsBatched, + handle_cublasDgelsBatched, + handle_cublasCgelsBatched, + handle_cublasZgelsBatched, handle_cublasStpttr, handle_cublasDtpttr, handle_cublasCtpttr, @@ -43940,6 +45888,14 @@ static RequestHandler opHandlers[] = { handle_cublasDtrttp, handle_cublasCtrttp, handle_cublasZtrttp, + handle_cublasSgetriBatched, + 
handle_cublasDgetriBatched, + handle_cublasCgetriBatched, + handle_cublasZgetriBatched, + handle_cublasSgetrsBatched, + handle_cublasDgetrsBatched, + handle_cublasCgetrsBatched, + handle_cublasZgetrsBatched, handle_cublasUint8gemmBias, nullptr, nullptr, diff --git a/codegen/manual_server.cpp b/codegen/manual_server.cpp index 2b1305c..cd2b0fa 100755 --- a/codegen/manual_server.cpp +++ b/codegen/manual_server.cpp @@ -137,7 +137,7 @@ int handle_cudaMemcpyAsync(void *conn) std::cerr << "Failed to allocate host memory for device-to-host transfer." << std::endl; return -1; } - + int request_id = rpc_end_request(conn); if (request_id < 0) { diff --git a/local.sh b/local.sh index 56fe3de..ae0b62d 100755 --- a/local.sh +++ b/local.sh @@ -27,6 +27,8 @@ build() { nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o + if [ ! -f "$libscuda_path" ]; then echo "libscuda.so not found. build may have failed." exit 1 diff --git a/test/cublas_batched.cu b/test/cublas_batched.cu new file mode 100644 index 0000000..d5e3f92 --- /dev/null +++ b/test/cublas_batched.cu @@ -0,0 +1,196 @@ +/* + * Copyright 2020 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#include +#include +#include + +#include +#include + +#include "cublas_utils.h" + +using data_type = double; + +int main(int argc, char *argv[]) { + cublasHandle_t cublasH = NULL; + cudaStream_t stream = NULL; + + const int m = 2; + const int n = 2; + const int k = 2; + const int lda = 2; + const int ldb = 2; + const int ldc = 2; + const int batch_count = 2; + + /* + * A = | 1.0 | 2.0 | 5.0 | 6.0 | + * | 3.0 | 4.0 | 7.0 | 8.0 | + * + * B = | 5.0 | 6.0 | 9.0 | 10.0 | + * | 7.0 | 8.0 | 11.0 | 12.0 | + */ + + const std::vector> A_array = {{1.0 ,3.0, 2.0, 4.0}, + {5.0, 7.0, 6.0, 8.0}}; + const std::vector> B_array = {{5.0, 7.0, 6.0, 8.0}, + {9.0, 11.0, 10.0, 12.0}}; + std::vector> C_array(batch_count, std::vector(m * n)); + + const data_type alpha = 1.0; + const data_type beta = 0.0; + + data_type **d_A_array = nullptr; + data_type **d_B_array = nullptr; + data_type **d_C_array = nullptr; + + std::vector d_A(batch_count, nullptr); + std::vector d_B(batch_count, nullptr); + std::vector d_C(batch_count, nullptr); + + cublasOperation_t transa = CUBLAS_OP_N; + cublasOperation_t transb = CUBLAS_OP_N; + + printf("A[0]\n"); + print_matrix(m, k, A_array[0].data(), lda); + printf("=====\n"); + + printf("A[1]\n"); + print_matrix(m, k, A_array[1].data(), lda); + printf("=====\n"); + + printf("B[0]\n"); + print_matrix(k, n, B_array[0].data(), ldb); + printf("=====\n"); + + printf("B[1]\n"); + print_matrix(k, n, B_array[1].data(), ldb); + printf("=====\n"); + + /* step 1: create cublas handle, bind a stream */ + CUBLAS_CHECK(cublasCreate(&cublasH)); + + CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUBLAS_CHECK(cublasSetStream(cublasH, stream)); + + /* step 2: copy data to device */ + for (int i = 0; i < batch_count; i++) { + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&d_A[i]), sizeof(data_type) * A_array[i].size())); + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&d_B[i]), sizeof(data_type) * B_array[i].size())); + CUDA_CHECK( + 
cudaMalloc(reinterpret_cast(&d_C[i]), sizeof(data_type) * C_array[i].size())); + } + + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&d_A_array), sizeof(data_type *) * batch_count)); + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&d_B_array), sizeof(data_type *) * batch_count)); + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&d_C_array), sizeof(data_type *) * batch_count)); + + for (int i = 0; i < batch_count; i++) { + CUDA_CHECK(cudaMemcpyAsync(d_A[i], A_array[i].data(), sizeof(data_type) * A_array[i].size(), + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_B[i], B_array[i].data(), sizeof(data_type) * B_array[i].size(), + cudaMemcpyHostToDevice, stream)); + } + + CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type *) * batch_count, + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type *) * batch_count, + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type *) * batch_count, + cudaMemcpyHostToDevice, stream)); + + /* step 3: compute */ + CUBLAS_CHECK(cublasDgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda, + d_B_array, ldb, &beta, d_C_array, ldc, batch_count)); + + /* step 4: copy data to host */ + for (int i = 0; i < batch_count; i++) { + CUDA_CHECK(cudaMemcpy(C_array[i].data(), d_C[i], sizeof(data_type) * C_array[i].size(), + cudaMemcpyDeviceToHost)); + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + /* + * C = | 19.0 | 22.0 | 111.0 | 122.0 | + * | 43.0 | 50.0 | 151.0 | 166.0 | + */ + + printf("C[0]\n"); + print_matrix(m, n, C_array[0].data(), ldc); + printf("=====\n"); + + printf("C[1]\n"); + print_matrix(m, n, C_array[1].data(), ldc); + printf("=====\n"); + + /* free resources */ + CUDA_CHECK(cudaFree(d_A_array)); + CUDA_CHECK(cudaFree(d_B_array)); + CUDA_CHECK(cudaFree(d_C_array)); + for (int i = 0; i < batch_count; i++) { + CUDA_CHECK(cudaFree(d_A[i])); + CUDA_CHECK(cudaFree(d_B[i])); + 
CUDA_CHECK(cudaFree(d_C[i])); + } + + CUBLAS_CHECK(cublasDestroy(cublasH)); + + CUDA_CHECK(cudaStreamDestroy(stream)); + + CUDA_CHECK(cudaDeviceReset()); + + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/test/cublas_utils.h b/test/cublas_utils.h new file mode 100644 index 0000000..61b64ea --- /dev/null +++ b/test/cublas_utils.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// CUDA API error checking +#define CUDA_CHECK(err) \ + do { \ + cudaError_t err_ = (err); \ + if (err_ != cudaSuccess) { \ + std::printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + throw std::runtime_error("CUDA error"); \ + } \ + } while (0) + +// cublas API error checking +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + throw std::runtime_error("cublas error"); \ + } \ + } while (0) + +// memory alignment +#define ALIGN_TO(A, B) (((A + B - 1) / B) * B) + +// device memory pitch alignment +static const size_t device_alignment = 32; + +// type traits +template struct traits; + +template <> struct traits { + // scalar type + typedef float T; + typedef T S; + + static constexpr T zero = 0.f; + static constexpr cudaDataType cuda_data_type = CUDA_R_32F; + + inline static S abs(T val) { return fabs(val); } + + template inline static T rand(RNG &gen) { return (S)gen(); } + + inline static T add(T a, T b) { return a + b; } + + inline static T mul(T v, double f) { return v * f; } +}; + +template <> struct traits { + // scalar type + typedef double T; + typedef T S; + + static constexpr T zero = 0.; + static constexpr cudaDataType cuda_data_type = CUDA_R_64F; + + inline static S abs(T val) { return fabs(val); } + + template inline static T rand(RNG &gen) { return (S)gen(); } + + inline static T add(T a, T b) { return a + b; } + + inline static T mul(T v, double f) { return v * f; } +}; + +template <> struct traits { + // scalar type + typedef float S; + typedef cuFloatComplex T; + + static constexpr T zero = {0.f, 0.f}; + static constexpr cudaDataType cuda_data_type = CUDA_C_32F; + + inline static S abs(T val) { return cuCabsf(val); } + + template inline static T rand(RNG &gen) { + return 
make_cuFloatComplex((S)gen(), (S)gen()); + } + + inline static T add(T a, T b) { return cuCaddf(a, b); } + inline static T add(T a, S b) { return cuCaddf(a, make_cuFloatComplex(b, 0.f)); } + + inline static T mul(T v, double f) { return make_cuFloatComplex(v.x * f, v.y * f); } +}; + +template <> struct traits { + // scalar type + typedef double S; + typedef cuDoubleComplex T; + + static constexpr T zero = {0., 0.}; + static constexpr cudaDataType cuda_data_type = CUDA_C_64F; + + inline static S abs(T val) { return cuCabs(val); } + + template inline static T rand(RNG &gen) { + return make_cuDoubleComplex((S)gen(), (S)gen()); + } + + inline static T add(T a, T b) { return cuCadd(a, b); } + inline static T add(T a, S b) { return cuCadd(a, make_cuDoubleComplex(b, 0.)); } + + inline static T mul(T v, double f) { return make_cuDoubleComplex(v.x * f, v.y * f); } +}; + +template void print_matrix(const int &m, const int &n, const T *A, const int &lda); + +template <> void print_matrix(const int &m, const int &n, const float *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f ", A[j * lda + i]); + } + std::printf("\n"); + } +} + +template <> void print_matrix(const int &m, const int &n, const double *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f ", A[j * lda + i]); + } + std::printf("\n"); + } +} + +template <> void print_matrix(const int &m, const int &n, const cuComplex *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y); + } + std::printf("\n"); + } +} + +template <> +void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y); + } + std::printf("\n"); + } +} + +template void 
print_packed_matrix(cublasFillMode_t uplo, const int &n, const T *A); + +template <> void print_packed_matrix(cublasFillMode_t uplo, const int &n, const float *A) { + size_t off = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) || + (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) { + std::printf("%6.2f ", A[off++]); + } else if (uplo == CUBLAS_FILL_MODE_UPPER) { + std::printf(" "); + } + } + std::printf("\n"); + } +} + +template <> void print_packed_matrix(cublasFillMode_t uplo, const int &n, const double *A) { + size_t off = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) || + (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) { + std::printf("%6.2f ", A[off++]); + } else if (uplo == CUBLAS_FILL_MODE_UPPER) { + std::printf(" "); + } + } + std::printf("\n"); + } +} + +template <> void print_packed_matrix(cublasFillMode_t uplo, const int &n, const cuComplex *A) { + size_t off = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) || + (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) { + std::printf("%6.2f + %6.2fj ", A[off].x, A[off].y); + off++; + } else if (uplo == CUBLAS_FILL_MODE_UPPER) { + std::printf(" "); + } + } + std::printf("\n"); + } +} + +template <> void print_packed_matrix(cublasFillMode_t uplo, const int &n, const cuDoubleComplex *A) { + size_t off = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) || + (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) { + std::printf("%6.2f + %6.2fj ", A[off].x, A[off].y); + off++; + } else if (uplo == CUBLAS_FILL_MODE_UPPER) { + std::printf(" "); + } + } + std::printf("\n"); + } +} + +template void print_vector(const int &m, const T *A); + +template <> void print_vector(const int &m, const float *A) { + for (int i = 0; i < m; i++) { + std::printf("%0.2f ", A[i]); + } + 
std::printf("\n"); +} + +template <> void print_vector(const int &m, const double *A) { + for (int i = 0; i < m; i++) { + std::printf("%0.2f ", A[i]); + } + std::printf("\n"); +} + +template <> void print_vector(const int &m, const cuComplex *A) { + for (int i = 0; i < m; i++) { + std::printf("%0.2f + %0.2fj ", A[i].x, A[i].y); + } + std::printf("\n"); +} + +template <> void print_vector(const int &m, const cuDoubleComplex *A) { + for (int i = 0; i < m; i++) { + std::printf("%0.2f + %0.2fj ", A[i].x, A[i].y); + } + std::printf("\n"); +} + +template void generate_random_matrix(int m, int n, T **A, int *lda) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution::S> dis(-1.0, 1.0); + auto rand_gen = std::bind(dis, gen); + + *lda = n; + + size_t matrix_mem_size = static_cast(*lda * m * sizeof(T)); + // suppress gcc 7 size warning + if (matrix_mem_size <= PTRDIFF_MAX) + *A = (T *)malloc(matrix_mem_size); + else + throw std::runtime_error("Memory allocation size is too large"); + + if (*A == NULL) + throw std::runtime_error("Unable to allocate host matrix"); + + // random matrix and accumulate row sums + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + T *A_row = (*A) + *lda * i; + A_row[j] = traits::rand(rand_gen); + } + } +} + +// Makes matrix A of size mxn and leading dimension lda diagonal dominant +template void make_diag_dominant_matrix(int m, int n, T *A, int lda) { + for (int i = 0; i < std::min(m, n); ++i) { + T *A_row = A + lda * i; + auto row_sum = traits::S>::zero; + for (int j = 0; j < n; ++j) { + row_sum += traits::abs(A_row[j]); + } + A_row[i] = traits::add(A_row[i], row_sum); + } +} + +// Returns cudaDataType value as defined in library_types.h for the string +// containing type name +cudaDataType get_cuda_library_type(std::string type_string) { + if (type_string.compare("CUDA_R_16F") == 0) + return CUDA_R_16F; + else if (type_string.compare("CUDA_C_16F") == 0) + return CUDA_C_16F; + else if 
(type_string.compare("CUDA_R_32F") == 0) + return CUDA_R_32F; + else if (type_string.compare("CUDA_C_32F") == 0) + return CUDA_C_32F; + else if (type_string.compare("CUDA_R_64F") == 0) + return CUDA_R_64F; + else if (type_string.compare("CUDA_C_64F") == 0) + return CUDA_C_64F; + else if (type_string.compare("CUDA_R_8I") == 0) + return CUDA_R_8I; + else if (type_string.compare("CUDA_C_8I") == 0) + return CUDA_C_8I; + else if (type_string.compare("CUDA_R_8U") == 0) + return CUDA_R_8U; + else if (type_string.compare("CUDA_C_8U") == 0) + return CUDA_C_8U; + else if (type_string.compare("CUDA_R_32I") == 0) + return CUDA_R_32I; + else if (type_string.compare("CUDA_C_32I") == 0) + return CUDA_C_32I; + else if (type_string.compare("CUDA_R_32U") == 0) + return CUDA_R_32U; + else if (type_string.compare("CUDA_C_32U") == 0) + return CUDA_C_32U; + else + throw std::runtime_error("Unknown CUDA datatype"); +} \ No newline at end of file